Example #1
0
    def get_fancy_results_dict(self,
                               max_per_query=10,
                               defline_white_space_mask=None):
        """Group search hits by query and annotate each with alignment details.

        Walks the B6 entries read from ``self.output``. For every hit that is
        kept, the query and subject segments reported by the search are
        re-aligned with ``nw_align`` and a deep copy of the entry is extended
        with ``coverage``, ``hsp_query``, ``hsp_subject`` and ``hsp_match``.

        Args:
            max_per_query: number of hits kept per query id; once that many
                hits were kept for a query, later hits for it are skipped.
            defline_white_space_mask: when truthy, handed to
                ``remove_white_space_mask_from_B6_entry`` to unmask the
                current entry before the segments are aligned.

        Returns:
            A dict mapping a query id to the list of annotated entry copies,
            in the order they were read.
        """
        b6 = b6lib.B6Source(self.output)

        input_fasta = u.SequenceSource(self.input)
        target_db = u.SequenceSource(self.target)

        hits_kept = {}
        fancy_results_dict = {}

        while b6.next():
            query_id = b6.entry.query_id

            # Equality (rather than >=) is deliberate: it keeps the historical
            # behavior for unusual limits (e.g. a negative value never matches
            # and therefore disables the cap).
            kept_so_far = hits_kept.get(query_id, 0)
            if kept_so_far == max_per_query:
                continue
            hits_kept[query_id] = kept_so_far + 1

            # Sequences are looked up with the ids exactly as they appear in
            # the B6 file, i.e. before any white space mask is removed.
            query_seq = input_fasta.get_seq_by_read_id(query_id).replace('-', '')
            target_seq = target_db.get_seq_by_read_id(b6.entry.subject_id)

            if defline_white_space_mask:
                b6.entry = remove_white_space_mask_from_B6_entry(
                    b6.entry, defline_white_space_mask)

            hit = b6.entry

            # parts that were aligned during the search are being aligned to
            # each other to generate hsp_match data to include into results
            # (B6 coordinates are 1-based and inclusive, hence the "- 1").
            query_aligned, target_aligned = nw_align(
                query_seq[int(hit.q_start) - 1:int(hit.q_end)],
                target_seq[int(hit.s_start) - 1:int(hit.s_end)])
            query_aligned = query_aligned.upper()
            target_aligned = target_aligned.upper()

            # percentage of the query length spanned by the aligned segment
            coverage = (hit.q_end - (hit.q_start - 1)) * 100.0 / hit.q_len

            # NOTE(review): assumes nw_align returns two strings of equal
            # length, as a global alignment should -- confirm.
            hsp_match = ''.join(
                '|' if q_char == t_char else ' '
                for q_char, t_char in zip(query_aligned, target_aligned))

            entry = copy.deepcopy(hit)
            entry.coverage = coverage
            entry.hsp_query = query_aligned
            entry.hsp_subject = target_aligned
            entry.hsp_match = hsp_match

            entry = remove_white_space_mask_from_B6_entry(entry)

            # The helper above may rewrite entry.query_id; creating the list
            # here (instead of under the pre-unmask id) avoids a KeyError
            # when the id changes.
            fancy_results_dict.setdefault(entry.query_id, []).append(entry)

        return fancy_results_dict
Example #2
0
def homopolymer_indel_exists(seq1, seq2):
    """Tell whether two aligned sequences differ by a homopolymer indel.

    Args:
        seq1: first aligned sequence, with '-' marking gaps.
        seq2: second aligned sequence, with '-' marking gaps.

    Returns:
        True when the first gap found (seq1 is searched before seq2) is
        flanked, on either side, by a run of at least three identical bases
        that is the same in both sequences and matches the base facing the
        gap. False when neither sequence contains a gap. None (falsy) in
        every other case; kept as None so existing callers see no change.
    """
    seq1, seq2 = trim_uninformative_gaps_from_sequences(seq1, seq2)

    # sometimes alignments look like this:
    #
    #    CCCGAAAAAA--TAT
    #    CCCGAAA---AATAT
    #
    # where the correct alignment should look like this:
    #
    #    CCCGAAAAAATAT
    #    CCCGAAAAA-TAT
    #
    # which would make this function return false. in order to fix that
    # problem we perform needleman-wunsch alignment here:
    if seq1.count('-') + seq2.count('-') > 1:
        seq1, seq2 = nw_align(seq1.replace('-', ''), seq2.replace('-', ''))

    gap_index = seq1.find('-')
    if gap_index == -1:
        gap_index = seq2.find('-')

        # so the gap (if any) is in seq2. swap seq1 and seq2 so it is certain
        # that the sequence with the gap is seq1:
        seq1, seq2 = seq2, seq1

    if gap_index == -1:
        return False

    def is_homopolymer(fragment):
        # exactly one distinct character; an empty fragment does not qualify
        return len(set(fragment)) == 1

    def is_homopolymer_indel(start, end):
        # the window must be identical in both sequences, be a homopolymer,
        # and consist of the base that faces the gap in the ungapped sequence
        return (seq1[start:end] == seq2[start:end]
                and is_homopolymer(seq1[start:end])
                and seq2[gap_index] == seq2[start])

    def window_before_gap():
        # start from the three bases left of the gap and widen the window
        # for as long as the widened window is still a homopolymer
        width = 3
        while (gap_index - width - 1 >= 0
               and is_homopolymer(seq1[gap_index - width - 1:gap_index])):
            width += 1
        return gap_index - width, gap_index

    def window_after_gap():
        # same idea to the right of the gap. the explicit length bound
        # matters: without it a homopolymer run that reaches the end of the
        # sequence keeps the (clamped) slice unchanged and the loop would
        # never terminate.
        width = 4
        while (gap_index + width + 1 <= len(seq1)
               and is_homopolymer(seq1[gap_index + 1:gap_index + width + 1])):
            width += 1
        return gap_index + 1, gap_index + width

    # check the bases before the gap
    if gap_index >= 3:
        start, end = window_before_gap()
        if is_homopolymer_indel(start, end):
            return True

    # check the bases after the gap
    if len(seq1) - gap_index > 3:
        start, end = window_after_gap()
        if is_homopolymer_indel(start, end):
            return True

    return None
Example #3
0
def homopolymer_indel_exists(seq1, seq2):
    """Tell whether two aligned sequences differ by a homopolymer indel.

    Args:
        seq1: first aligned sequence, with '-' marking gaps.
        seq2: second aligned sequence, with '-' marking gaps.

    Returns:
        True when the first gap found (seq1 is searched before seq2) is
        flanked, on either side, by a run of at least three identical bases
        that is the same in both sequences and matches the base facing the
        gap. False when neither sequence contains a gap. None (falsy) in
        every other case; kept as None so existing callers see no change.
    """
    seq1, seq2 = trim_uninformative_gaps_from_sequences(seq1, seq2)

    # sometimes alignments look like this:
    #
    #    CCCGAAAAAA--TAT
    #    CCCGAAA---AATAT
    #
    # where the correct alignment should look like this:
    #
    #    CCCGAAAAAATAT
    #    CCCGAAAAA-TAT
    #
    # which would make this function return false. in order to fix that
    # problem we perform needleman-wunsch alignment here:
    if seq1.count('-') + seq2.count('-') > 1:
        seq1, seq2 = nw_align(seq1.replace('-', ''), seq2.replace('-', ''))

    gap_index = seq1.find('-')
    if gap_index == -1:
        gap_index = seq2.find('-')

        # so the gap (if any) is in seq2. swap seq1 and seq2 so it is certain
        # that the sequence with the gap is seq1:
        seq1, seq2 = seq2, seq1

    if gap_index == -1:
        return False

    def is_homopolymer(fragment):
        # exactly one distinct character; an empty fragment does not qualify
        return len(set(fragment)) == 1

    def is_homopolymer_indel(start, end):
        # the window must be identical in both sequences, be a homopolymer,
        # and consist of the base that faces the gap in the ungapped sequence
        return (seq1[start:end] == seq2[start:end]
                and is_homopolymer(seq1[start:end])
                and seq2[gap_index] == seq2[start])

    def window_before_gap():
        # start from the three bases left of the gap and widen the window
        # for as long as the widened window is still a homopolymer
        width = 3
        while (gap_index - width - 1 >= 0
               and is_homopolymer(seq1[gap_index - width - 1:gap_index])):
            width += 1
        return gap_index - width, gap_index

    def window_after_gap():
        # same idea to the right of the gap. the explicit length bound
        # matters: without it a homopolymer run that reaches the end of the
        # sequence keeps the (clamped) slice unchanged and the loop would
        # never terminate.
        width = 4
        while (gap_index + width + 1 <= len(seq1)
               and is_homopolymer(seq1[gap_index + 1:gap_index + width + 1])):
            width += 1
        return gap_index + 1, gap_index + width

    # check the bases before the gap
    if gap_index >= 3:
        start, end = window_before_gap()
        if is_homopolymer_indel(start, end):
            return True

    # check the bases after the gap
    if len(seq1) - gap_index > 3:
        start, end = window_after_gap()
        if is_homopolymer_indel(start, end):
            return True

    return None
Example #4
0
    def get_fancy_results_dict(self, max_per_query = 10, defline_white_space_mask = None):
        """Group search hits by query and annotate each with alignment details.

        Walks the B6 entries read from ``self.output``. For every hit that is
        kept, the query and subject segments reported by the search are
        re-aligned with ``nw_align`` and a deep copy of the entry is extended
        with ``coverage``, ``hsp_query``, ``hsp_subject`` and ``hsp_match``.

        Args:
            max_per_query: number of hits kept per query id; once that many
                hits were kept for a query, later hits for it are skipped.
            defline_white_space_mask: when truthy, handed to
                ``remove_white_space_mask_from_B6_entry`` to unmask the
                current entry before the segments are aligned.

        Returns:
            A dict mapping a query id to the list of annotated entry copies,
            in the order they were read.
        """
        b6 = b6lib.B6Source(self.output)

        input_fasta = u.SequenceSource(self.input)
        target_db = u.SequenceSource(self.target)

        hits_kept = {}
        fancy_results_dict = {}

        while b6.next():
            query_id = b6.entry.query_id

            # Equality (rather than >=) is deliberate: it keeps the historical
            # behavior for unusual limits (e.g. a negative value never matches
            # and therefore disables the cap).
            kept_so_far = hits_kept.get(query_id, 0)
            if kept_so_far == max_per_query:
                continue
            hits_kept[query_id] = kept_so_far + 1

            # Sequences are looked up with the ids exactly as they appear in
            # the B6 file, i.e. before any white space mask is removed.
            query_seq = input_fasta.get_seq_by_read_id(query_id).replace('-', '')
            target_seq = target_db.get_seq_by_read_id(b6.entry.subject_id)

            if defline_white_space_mask:
                b6.entry = remove_white_space_mask_from_B6_entry(b6.entry, defline_white_space_mask)

            hit = b6.entry

            # parts that were aligned during the search are being aligned to
            # each other to generate hsp_match data to include into results
            # (B6 coordinates are 1-based and inclusive, hence the "- 1").
            query_aligned, target_aligned = nw_align(
                query_seq[int(hit.q_start) - 1:int(hit.q_end)],
                target_seq[int(hit.s_start) - 1:int(hit.s_end)])
            query_aligned = query_aligned.upper()
            target_aligned = target_aligned.upper()

            # percentage of the query length spanned by the aligned segment
            coverage = (hit.q_end - (hit.q_start - 1)) * 100.0 / hit.q_len

            # NOTE(review): assumes nw_align returns two strings of equal
            # length, as a global alignment should -- confirm.
            hsp_match = ''.join(
                '|' if q_char == t_char else ' '
                for q_char, t_char in zip(query_aligned, target_aligned))

            entry = copy.deepcopy(hit)
            entry.coverage = coverage
            entry.hsp_query = query_aligned
            entry.hsp_subject = target_aligned
            entry.hsp_match = hsp_match

            entry = remove_white_space_mask_from_B6_entry(entry)

            # The helper above may rewrite entry.query_id; creating the list
            # here (instead of under the pre-unmask id) avoids a KeyError
            # when the id changes.
            fancy_results_dict.setdefault(entry.query_id, []).append(entry)

        return fancy_results_dict
Example #5
0
def compare_seqs(knownseq, testseq, header):
    """Return a FASTA record for ``testseq`` if it is close to ``knownseq``.

    The record (``">header\\nsequence"``) is returned when one sequence is a
    substring of the other, or when the compared windows are at most one
    ``distance`` unit apart. Otherwise an empty string is returned.

    Raises:
        ValueError: if either computed window bound is negative.
    """
    contained = knownseq in testseq or testseq in knownseq

    if not contained:
        # No exact containment: align the pair and compare a window instead.
        aligned_known, aligned_test = nw_align(knownseq, testseq)

        # Window bounds are derived from where each aligned sequence ends.
        known_lo, known_hi = find_ends(aligned_known)
        test_lo, test_hi = find_ends(aligned_test)
        lo = min(test_lo, known_lo)
        hi = max(test_hi, known_hi)
        if lo < 0 or hi < 0:
            raise ValueError("Start and end must be greater than 0!")

        # NOTE(review): the window is applied to the unaligned inputs and
        # uses the widest (min/max) bounds -- confirm this is intended.
        if distance(knownseq[lo:hi], testseq[lo:hi]) > 1:
            return ""

    return ">%s\n%s" % (header, testseq)
Example #6
0
def compare_seqs(knownseq, testseq, header):
    """Return ``testseq`` as a FASTA record when it resembles ``knownseq``.

    A record of the form ``">header\\nsequence"`` is produced when either
    sequence contains the other, or when the compared windows differ by a
    ``distance`` of one or less. An empty string is returned otherwise.

    Raises:
        ValueError: if a computed window bound is negative.
    """
    if knownseq not in testseq and testseq not in knownseq:
        # Neither sequence contains the other, so fall back to an alignment.
        known_aln, test_aln = nw_align(knownseq, testseq)

        # Work out the comparison window from the ends of both alignments.
        known_first, known_last = find_ends(known_aln)
        test_first, test_last = find_ends(test_aln)
        window_start = min(test_first, known_first)
        window_end = max(test_last, known_last)
        if window_start < 0 or window_end < 0:
            raise ValueError("Start and end must be greater than 0!")

        # NOTE(review): slices are taken from the unaligned inputs using
        # the widest (min/max) bounds -- confirm this is intended.
        known_window = knownseq[window_start:window_end]
        test_window = testseq[window_start:window_end]
        if not distance(known_window, test_window) <= 1:
            return ""

    return ">%s\n%s" % (header, testseq)
Example #7
0
 def test_nw_align(self):
     """Align 'ACGU' with 'CAGU' and check both gapped strings and the score."""
     alignment, score = nw_align('ACGU', 'CAGU', return_score=True)
     aligned_first, aligned_second = alignment
     self.assertEqual(aligned_first, 'AC-GU')
     self.assertEqual(aligned_second, '-CAGU')
     self.assertEqual(score, 1)
Example #8
0
 def test_nw_align_empty(self):
     """Aligning two empty strings yields two empty strings and a zero score."""
     alignment, score = nw_align('', '', return_score=True)
     aligned_first, aligned_second = alignment
     self.assertEqual(aligned_first, '')
     self.assertEqual(aligned_second, '')
     self.assertEqual(score, 0)
Example #9
0
 def test_nw_align(self):
     """Check the gapped strings and score when aligning 'ACGU' to 'CAGU'."""
     aligned_pair, alignment_score = nw_align("ACGU", "CAGU", return_score=True)
     top_row, bottom_row = aligned_pair
     # Same three checks, in the same order: first row, second row, score.
     for observed, expected in (
         (top_row, "AC-GU"),
         (bottom_row, "-CAGU"),
         (alignment_score, 1),
     ):
         self.assertEqual(observed, expected)
Example #10
0
 def test_nw_align_empty(self):
     """Check that empty inputs align to empty outputs with a score of 0."""
     aligned_pair, alignment_score = nw_align("", "", return_score=True)
     top_row, bottom_row = aligned_pair
     # Same three checks, in the same order: first row, second row, score.
     for observed, expected in (
         (top_row, ""),
         (bottom_row, ""),
         (alignment_score, 0),
     ):
         self.assertEqual(observed, expected)
Example #11
0
 def test_nw_align(self):
     """Verify nw_align output (both rows and the score) for 'ACGU' vs 'CAGU'."""
     rows, total = nw_align('ACGU', 'CAGU', return_score=True)
     row_a, row_b = rows
     self.assertEqual(row_a, 'AC-GU')
     self.assertEqual(row_b, '-CAGU')
     self.assertEqual(total, 1)