Beispiel #1
0
    def get_fancy_results_dict(self,
                               max_per_query=10,
                               defline_white_space_mask=None):
        """Group the hits found in ``self.output`` by query id.

        Every hit read from the B6 source is re-aligned with ``nw_align``
        (only the query/subject spans reported by the search are used) and a
        deep copy of the entry, decorated with the attributes below, is stored:

        * ``coverage``    -- ``(q_end - (q_start - 1)) * 100.0 / q_len``
        * ``hsp_query``   -- upper-cased aligned query fragment
        * ``hsp_subject`` -- upper-cased aligned subject fragment
        * ``hsp_match``   -- ``'|'`` where both aligned strings agree,
          ``' '`` elsewhere

        Args:
            max_per_query: number of hits kept per query id; further hits for
                the same query are skipped.
            defline_white_space_mask: when truthy, it is forwarded to
                ``remove_white_space_mask_from_B6_entry`` for the current
                entry before the alignment is computed.

        Returns:
            dict mapping ``entry.query_id`` to the list of decorated entry
            copies, in the order they were read.
        """
        b6 = b6lib.B6Source(self.output)

        input_fasta = u.SequenceSource(self.input)
        target_db = u.SequenceSource(self.target)

        # number of hits already accepted for each query id
        query_counts = {}
        fancy_results_dict = {}

        while next(b6):
            raw_query_id = b6.entry.query_id

            # `==` (not `>=`) keeps the historical meaning of max_per_query.
            accepted = query_counts.get(raw_query_id, 0)
            if accepted == max_per_query:
                continue
            query_counts[raw_query_id] = accepted + 1

            # sequences are looked up with the ids exactly as they appear in
            # the search output, i.e. before any mask is removed.
            query_seq = input_fasta.get_seq_by_read_id(raw_query_id).replace('-', '')
            target_seq = target_db.get_seq_by_read_id(b6.entry.subject_id)

            if defline_white_space_mask:
                b6.entry = remove_white_space_mask_from_B6_entry(
                    b6.entry, defline_white_space_mask)

            # parts that were aligned during the search are being aligned to each other to generate
            # hsp_match data to include into results
            query_aligned, target_aligned = nw_align(
                query_seq[int(b6.entry.q_start) - 1:int(b6.entry.q_end)],
                target_seq[int(b6.entry.s_start) - 1:int(b6.entry.s_end)])

            query_aligned = query_aligned.upper()
            target_aligned = target_aligned.upper()

            coverage = (b6.entry.q_end -
                        (b6.entry.q_start - 1)) * 100.0 / b6.entry.q_len

            # NOTE(review): nw_align is expected to return two strings of the
            # same length -- confirm against its definition.
            hsp_match = ''.join(
                '|' if q_char == t_char else ' '
                for q_char, t_char in zip(query_aligned, target_aligned))

            entry = copy.deepcopy(b6.entry)
            entry.coverage = coverage
            entry.hsp_query = query_aligned
            entry.hsp_subject = target_aligned
            entry.hsp_match = hsp_match

            entry = remove_white_space_mask_from_B6_entry(entry)

            # The key is taken from the entry *after* it went through
            # remove_white_space_mask_from_B6_entry. Creating the list earlier
            # under the raw id would raise KeyError here whenever that helper
            # changes query_id (NOTE(review): helper is defined elsewhere --
            # confirm whether it rewrites query_id).
            fancy_results_dict.setdefault(entry.query_id, []).append(entry)

        return fancy_results_dict
Beispiel #2
0
def homopolymer_indel_exists(seq1, seq2):
    """Tell whether the first gap between two aligned sequences is a homopolymer indel.

    Both sequences are first passed through
    ``trim_uninformative_gaps_from_sequences``. If more than one gap character
    remains in total, the gaps are stripped and the sequences are re-aligned
    with ``nw_align``. The first gap found (in ``seq1``, otherwise in
    ``seq2``) is then inspected: the indel counts as a homopolymer indel when
    a window of at least three positions directly before or directly after
    the gap

    * is identical in both sequences,
    * consists of a single repeated character, and
    * starts with the same character the ungapped sequence carries at the
      gap position.

    Args:
        seq1: first aligned sequence (``'-'`` marks a gap).
        seq2: second aligned sequence (``'-'`` marks a gap).

    Returns:
        ``True`` if such a window exists, ``False`` otherwise (including the
        case where neither sequence contains a gap). Earlier revisions fell
        through to ``None`` in the "gap found, but not a homopolymer" case;
        the result is now a consistent boolean.
    """
    seq1, seq2 = trim_uninformative_gaps_from_sequences(seq1, seq2)

    # sometimes alignments look like this:
    #
    #    CCCGAAAAAA--TAT
    #    CCCGAAA---AATAT
    #
    # where the correct alignment should look like this:
    #
    #    CCCGAAAAAATAT
    #    CCCGAAAAA-TAT
    #
    # which would cause this function to return false. in order to fix that
    # problem we perform a Needleman-Wunsch alignment here:
    if seq1.count('-') + seq2.count('-') > 1:
        seq1, seq2 = nw_align(seq1.replace('-', ''), seq2.replace('-', ''))

    gap_index = seq1.find('-')
    if gap_index == -1:
        # the gap (if any) is in seq2. swap the two so that from here on the
        # sequence carrying the gap is always seq1:
        gap_index = seq2.find('-')
        seq1, seq2 = seq2, seq1

    if gap_index == -1:
        return False

    def is_homopolymer(fragment):
        # an empty fragment yields an empty set and is therefore rejected
        return len(set(fragment)) == 1

    def is_homopolymer_indel(window):
        # a plain function taking the (start, end) tuple replaces the
        # tuple-parameter lambda, which is a syntax error on Python 3
        start, end = window
        return (seq1[start:end] == seq2[start:end]
                and is_homopolymer(seq1[start:end])
                and seq2[gap_index] == seq2[start])

    def window_before_gap(sequence):
        # grow the window leftwards for as long as it stays a homopolymer.
        # the explicit lower bound stops at the start of the sequence instead
        # of relying on a negative slice index producing an empty slice.
        i = 3
        while (gap_index - i - 1 >= 0
               and is_homopolymer(sequence[gap_index - i - 1:gap_index])):
            i += 1
        return (gap_index - i, gap_index)

    def window_after_gap(sequence):
        # grow the window rightwards for as long as it stays a homopolymer.
        # without the upper bound the slice stops growing once it reaches the
        # end of the sequence, stays a homopolymer, and the loop never ends.
        i = 4
        while (gap_index + i + 1 <= len(sequence)
               and is_homopolymer(sequence[gap_index + 1:gap_index + i + 1])):
            i += 1
        return (gap_index + 1, gap_index + i)

    # check the positions preceding the gap (needs at least three of them)
    if gap_index >= 3 and is_homopolymer_indel(window_before_gap(seq1)):
        return True

    # check the positions following the gap (needs at least three of them)
    if len(seq1) - gap_index > 3 and is_homopolymer_indel(window_after_gap(seq1)):
        return True

    return False
Beispiel #3
0
    def get_fancy_results_dict(self, max_per_query = 10, defline_white_space_mask = None):
        """Group the hits found in ``self.output`` by query id.

        Every hit read from the B6 source is re-aligned with ``nw_align``
        (only the query/subject spans reported by the search are used) and a
        deep copy of the entry, decorated with the attributes below, is stored:

        * ``coverage``    -- ``(q_end - (q_start - 1)) * 100.0 / q_len``
        * ``hsp_query``   -- upper-cased aligned query fragment
        * ``hsp_subject`` -- upper-cased aligned subject fragment
        * ``hsp_match``   -- ``'|'`` where both aligned strings agree,
          ``' '`` elsewhere

        Args:
            max_per_query: number of hits kept per query id; further hits for
                the same query are skipped.
            defline_white_space_mask: when truthy, it is forwarded to
                ``remove_white_space_mask_from_B6_entry`` for the current
                entry before the alignment is computed.

        Returns:
            dict mapping ``entry.query_id`` to the list of decorated entry
            copies, in the order they were read.
        """
        b6 = b6lib.B6Source(self.output)

        input_fasta = u.SequenceSource(self.input)
        target_db = u.SequenceSource(self.target)

        # number of hits already accepted for each query id
        query_counts = {}
        fancy_results_dict = {}

        # b6.next() is kept as an explicit method call; the iteration protocol
        # implemented by B6Source is not visible from this block.
        while b6.next():
            raw_query_id = b6.entry.query_id

            # dict.get replaces dict.has_key, which no longer exists on
            # Python 3. `==` (not `>=`) keeps the historical meaning of
            # max_per_query.
            accepted = query_counts.get(raw_query_id, 0)
            if accepted == max_per_query:
                continue
            query_counts[raw_query_id] = accepted + 1

            # sequences are looked up with the ids exactly as they appear in
            # the search output, i.e. before any mask is removed.
            query_seq = input_fasta.get_seq_by_read_id(raw_query_id).replace('-', '')
            target_seq = target_db.get_seq_by_read_id(b6.entry.subject_id)

            if defline_white_space_mask:
                b6.entry = remove_white_space_mask_from_B6_entry(b6.entry, defline_white_space_mask)

            # parts that were aligned during the search are being aligned to each other to generate
            # hsp_match data to include into results
            query_aligned, target_aligned = nw_align(
                query_seq[int(b6.entry.q_start) - 1:int(b6.entry.q_end)],
                target_seq[int(b6.entry.s_start) - 1:int(b6.entry.s_end)])

            query_aligned = query_aligned.upper()
            target_aligned = target_aligned.upper()

            coverage = (b6.entry.q_end - (b6.entry.q_start - 1)) * 100.0 / b6.entry.q_len

            # NOTE(review): nw_align is expected to return two strings of the
            # same length -- confirm against its definition.
            hsp_match = ''.join('|' if q_char == t_char else ' '
                                for q_char, t_char in zip(query_aligned, target_aligned))

            entry = copy.deepcopy(b6.entry)
            entry.coverage = coverage
            entry.hsp_query = query_aligned
            entry.hsp_subject = target_aligned
            entry.hsp_match = hsp_match

            entry = remove_white_space_mask_from_B6_entry(entry)

            # The key is taken from the entry *after* it went through
            # remove_white_space_mask_from_B6_entry. Creating the list earlier
            # under the raw id would raise KeyError here whenever that helper
            # changes query_id (NOTE(review): helper is defined elsewhere --
            # confirm whether it rewrites query_id).
            fancy_results_dict.setdefault(entry.query_id, []).append(entry)

        return fancy_results_dict