Example #1
0
def merge_reads(reads):
    """ Generate a merged sequence for every read pair.

    Read 2 is passed through reverse_and_complement() and aligned against
    read 1. The pair is merged only when the alignment score is at least
    MIN_PAIR_ALIGNMENT_SCORE and the aligned read 1 does not start with '-'.

    :param reads: iterable of reads from FastqReader; each item is
        (pair_name, (read1_name, bases, quality), (read2_name, bases, quality))
    :return: a generator with items (merged_bases is None if merge fails):
    (pair_name,
     (read1_name, bases, quality),
     (read2_name, bases, quality),
     merged_bases)
    """
    for record in reads:
        pair_name, (r1_name, seq1, qual1), (r2_name, seq2, qual2) = record
        if seq1 and seq2:
            flipped_seq2 = reverse_and_complement(seq2)
            fwd_aligned, rev_aligned, score = align_it(seq1,
                                                       flipped_seq2,
                                                       GAP_OPEN_COST,
                                                       GAP_EXTEND_COST,
                                                       USE_TERMINAL_COST)
        else:
            # At least one read is empty: nothing to align, so use a
            # sentinel score and no alignments.
            fwd_aligned = rev_aligned = None
            score = -1

        can_merge = (score >= MIN_PAIR_ALIGNMENT_SCORE and
                     fwd_aligned[0] != '-')
        if not can_merge:
            merged = None
        else:
            fwd_quality = align_quality(fwd_aligned, qual1)
            # Read 2 was flipped before alignment, so its quality string is
            # walked in reverse as well.
            rev_quality = align_quality(rev_aligned, reversed(qual2))
            merged = merge_pairs(fwd_aligned,
                                 rev_aligned,
                                 fwd_quality,
                                 rev_quality,
                                 q_cutoff=Q_CUTOFF)
        yield (pair_name,
               (r1_name, seq1, qual1),
               (r2_name, seq2, qual2),
               merged)
Example #2
0
def merge_reads(reads):
    """ Generator over merged reads.

    Read 2 is passed through reverse_and_complement() and aligned against
    read 1. The pair is merged only when the alignment score is at least
    MIN_PAIR_ALIGNMENT_SCORE and the aligned read 1 does not start with '-'.
    A pair in which either read is empty is never aligned and always yields
    merged_bases of None.

    :param reads: iterable of reads from FastqReader; each item is
        (pair_name, (read1_name, bases, quality), (read2_name, bases, quality))
    :return: a generator with items (merged_bases may be None if merge fails):
    (pair_name,
     (read1_name, bases, quality),
     (read2_name, bases, quality),
     merged_bases)
    """
    for pair_name, (r1_name, seq1, qual1), (r2_name, seq2, qual2) in reads:
        # Start from "merge failed". The alignment results are only used
        # inside the branch that produced them, so an empty read can never
        # reach an unbound alignment or one left over from the previous pair.
        merged = None
        if seq1 and seq2:
            seq2_rev = reverse_and_complement(seq2)
            aligned1, aligned2, score = align_it(seq1,
                                                 seq2_rev,
                                                 GAP_OPEN_COST,
                                                 GAP_EXTEND_COST,
                                                 USE_TERMINAL_COST)
            if score >= MIN_PAIR_ALIGNMENT_SCORE and aligned1[0] != '-':
                aligned_qual1 = align_quality(aligned1, qual1)
                # Read 2 was flipped before alignment, so its quality string
                # is walked in reverse as well.
                aligned_qual2 = align_quality(aligned2, reversed(qual2))
                merged = merge_pairs(aligned1,
                                     aligned2,
                                     aligned_qual1,
                                     aligned_qual2,
                                     q_cutoff=Q_CUTOFF)
        yield (pair_name, (r1_name, seq1, qual1), (r2_name, seq2, qual2),
               merged)
Example #3
0
    def testLowQualityInSecondRead(self):
        # The 'C' in the second read carries quality '*'; the expected merge
        # reports 'N' at that position.
        forward_bases, forward_quals = 'AGT', 'JJJ'
        reverse_bases, reverse_quals = '---GCA', '!!!J*J'

        merged = merge_pairs(forward_bases,
                             reverse_bases,
                             forward_quals,
                             reverse_quals)

        self.assertEqual('AGTGNA', merged)
Example #4
0
    def testGap(self):
        # Three positions are covered by neither read; the expected merge
        # fills them with lower-case 'n'.
        forward_bases, forward_quals = 'AGT', 'JJJ'
        reverse_bases, reverse_quals = '------GCA', '!!!!!!JJJ'

        merged = merge_pairs(forward_bases,
                             reverse_bases,
                             forward_quals,
                             reverse_quals)

        self.assertEqual('AGTnnnGCA', merged)
Example #5
0
    def testDisagreementWithLowQuality(self):
        # The reads disagree at the second base, where their qualities are
        # '!' and '*'; the expected merge has 'N' there.
        forward_bases, forward_quals = 'AGTGCA', 'J!JJJJ'
        reverse_bases, reverse_quals = 'ACTGCA', 'J*JJJJ'

        merged = merge_pairs(forward_bases,
                             reverse_bases,
                             forward_quals,
                             reverse_quals)

        self.assertEqual('ANTGCA', merged)
Example #6
0
    def testReverseDeletion(self):
        # The second read has a '-' between two of its bases; the expected
        # merge keeps that '-'.
        forward_bases, forward_quals = 'CTGCA', 'JJJJJ'
        reverse_bases, reverse_quals = '--GCAT-T', '!!JJJJ!J'

        merged = merge_pairs(forward_bases,
                             reverse_bases,
                             forward_quals,
                             reverse_quals)

        self.assertEqual('CTGCAT-T', merged)
Example #7
0
    def testForwardDeletion(self):
        # The first read has a '-' between two of its bases; the expected
        # merge keeps that '-'.
        forward_bases, forward_quals = 'C-GCA', 'J!JJJ'
        reverse_bases, reverse_quals = '---CATCT', '!!!JJJJJ'

        merged = merge_pairs(forward_bases,
                             reverse_bases,
                             forward_quals,
                             reverse_quals)

        self.assertEqual('C-GCATCT', merged)
Example #8
0
    def testOffset(self):
        # Both reads start with '-' padding; the expected merge keeps the one
        # leading position that neither read covers.
        forward_bases, forward_quals = '-CTGCA', '!JJJJJ'
        reverse_bases, reverse_quals = '---GCATCT', '!!!JJJJJJ'

        merged = merge_pairs(forward_bases,
                             reverse_bases,
                             forward_quals,
                             reverse_quals)

        self.assertEqual('-CTGCATCT', merged)
Example #9
0
    def testDifferentLength(self):
        # The second read is shorter than the first; the expected merge spans
        # the full length of the first read.
        forward_bases, forward_quals = 'ACTGCATCT', 'JJJJJJJJJ'
        reverse_bases, reverse_quals = 'ACTGCA', 'JJJJJJ'

        merged = merge_pairs(forward_bases,
                             reverse_bases,
                             forward_quals,
                             reverse_quals)

        self.assertEqual('ACTGCATCT', merged)
Example #10
0
    def testSimple(self):
        # Identical bases and identical qualities in both reads; the expected
        # merge is that same sequence.
        forward_bases, forward_quals = 'ACTGCA', 'JJJJJJ'
        reverse_bases, reverse_quals = 'ACTGCA', 'JJJJJJ'

        merged = merge_pairs(forward_bases,
                             reverse_bases,
                             forward_quals,
                             reverse_quals)

        self.assertEqual('ACTGCA', merged)
Example #11
0
    def testLowQualityInSecondRead(self):
        # Positional arguments: bases 1, bases 2, quality 1, quality 2.
        # The second read's 'C' has quality '*' and is expected to become 'N'.
        merge_args = ('AGT',
                      '---GCA',
                      'JJJ',
                      '!!!J*J')

        merged_bases = merge_pairs(*merge_args)

        self.assertEqual('AGTGNA', merged_bases)
Example #12
0
    def testOffset(self):
        # Each read begins with '-' padding; the result is expected to keep
        # the single leading position that no read covers.
        bases1 = '-CTGCA'
        quals1 = '!JJJJJ'
        bases2 = '---GCATCT'
        quals2 = '!!!JJJJJJ'
        expected = '-CTGCATCT'

        result = merge_pairs(bases1, bases2, quals1, quals2)
        self.assertEqual(expected, result)
Example #13
0
    def testDisagreementWithLowQuality(self):
        # Positional arguments: bases 1, bases 2, quality 1, quality 2.
        # The second base differs between the reads, with qualities '!' and
        # '*'; 'N' is expected there.
        merge_args = ('AGTGCA',
                      'ACTGCA',
                      'J!JJJJ',
                      'J*JJJJ')

        merged_bases = merge_pairs(*merge_args)

        self.assertEqual('ANTGCA', merged_bases)
Example #14
0
    def testGap(self):
        # Positional arguments: bases 1, bases 2, quality 1, quality 2.
        # Neither read covers the three middle positions; 'nnn' is expected.
        merge_args = ('AGT',
                      '------GCA',
                      'JJJ',
                      '!!!!!!JJJ')

        merged_bases = merge_pairs(*merge_args)

        self.assertEqual('AGTnnnGCA', merged_bases)
Example #15
0
    def testForwardDeletion(self):
        # Read 1 contains a '-' between bases, and the result is expected to
        # contain it too.
        bases1 = 'C-GCA'
        quals1 = 'J!JJJ'
        bases2 = '---CATCT'
        quals2 = '!!!JJJJJ'
        expected = 'C-GCATCT'

        result = merge_pairs(bases1, bases2, quals1, quals2)
        self.assertEqual(expected, result)
Example #16
0
    def testDifferentLength(self):
        # Read 2 stops three bases before read 1 does; the result is expected
        # to be as long as read 1.
        bases1 = 'ACTGCATCT'
        quals1 = 'JJJJJJJJJ'
        bases2 = 'ACTGCA'
        quals2 = 'JJJJJJ'
        expected = 'ACTGCATCT'

        result = merge_pairs(bases1, bases2, quals1, quals2)
        self.assertEqual(expected, result)
Example #17
0
    def testDisagreementWithDifferentQualityFirstHigher(self):
        # Positional arguments: bases 1, bases 2, quality 1, quality 2.
        # The reads disagree at the second base ('G' with quality 'J' versus
        # 'C' with quality 'E'); the first read's 'G' is expected.
        merge_args = ('AGTGCA',
                      'ACTGCA',
                      'JJJJJJ',
                      'JEJJJJ')

        merged_bases = merge_pairs(*merge_args)

        self.assertEqual('AGTGCA', merged_bases)
Example #18
0
    def testReverseDeletion(self):
        # Read 2 contains a '-' between bases, and the result is expected to
        # contain it too.
        bases1 = 'CTGCA'
        quals1 = 'JJJJJ'
        bases2 = '--GCAT-T'
        quals2 = '!!JJJJ!J'
        expected = 'CTGCAT-T'

        result = merge_pairs(bases1, bases2, quals1, quals2)
        self.assertEqual(expected, result)
Example #19
0
    def testReverseDeletion(self):
        # Positional arguments: bases 1, bases 2, quality 1, quality 2.
        # The '-' inside the second read is expected to survive the merge.
        merge_args = ('CTGCA',
                      '--GCAT-T',
                      'JJJJJ',
                      '!!JJJJ!J')

        merged_bases = merge_pairs(*merge_args)

        self.assertEqual('CTGCAT-T', merged_bases)
Example #20
0
    def testOffset(self):
        # Positional arguments: bases 1, bases 2, quality 1, quality 2.
        # Both reads start with '-'; the uncovered first position is expected
        # to stay '-'.
        merge_args = ('-CTGCA',
                      '---GCATCT',
                      '!JJJJJ',
                      '!!!JJJJJJ')

        merged_bases = merge_pairs(*merge_args)

        self.assertEqual('-CTGCATCT', merged_bases)
Example #21
0
    def testForwardDeletion(self):
        # Positional arguments: bases 1, bases 2, quality 1, quality 2.
        # The '-' inside the first read is expected to survive the merge.
        merge_args = ('C-GCA',
                      '---CATCT',
                      'J!JJJ',
                      '!!!JJJJJ')

        merged_bases = merge_pairs(*merge_args)

        self.assertEqual('C-GCATCT', merged_bases)
Example #22
0
    def testSimple(self):
        # Positional arguments: bases 1, bases 2, quality 1, quality 2.
        # Both reads are identical, so the same sequence is expected back.
        merge_args = ('ACTGCA',
                      'ACTGCA',
                      'JJJJJJ',
                      'JJJJJJ')

        merged_bases = merge_pairs(*merge_args)

        self.assertEqual('ACTGCA', merged_bases)
Example #23
0
    def testDifferentLength(self):
        # Positional arguments: bases 1, bases 2, quality 1, quality 2.
        # The second read is three bases shorter; the full first read is
        # expected back.
        merge_args = ('ACTGCATCT',
                      'ACTGCA',
                      'JJJJJJJJJ',
                      'JJJJJJ')

        merged_bases = merge_pairs(*merge_args)

        self.assertEqual('ACTGCATCT', merged_bases)
Example #24
0
    def testSimple(self):
        # Two identical reads with identical qualities; the result is
        # expected to equal either read.
        bases1 = 'ACTGCA'
        quals1 = 'JJJJJJ'
        bases2 = 'ACTGCA'
        quals2 = 'JJJJJJ'
        expected = 'ACTGCA'

        result = merge_pairs(bases1, bases2, quals1, quals2)
        self.assertEqual(expected, result)
Example #25
0
    def testDisagreementWithCloseQualitySecondHigher(self):
        # Positional arguments: bases 1, bases 2, quality 1, quality 2.
        # The reads disagree at the second base ('G' with quality 'F' versus
        # 'C' with quality 'J'); 'N' is expected there.
        merge_args = ('AGTGCA',
                      'ACTGCA',
                      'JFJJJJ',
                      'JJJJJJ')

        merged_bases = merge_pairs(*merge_args)

        self.assertEqual('ANTGCA', merged_bases)
Example #26
0
    def testLowQualityInSecondRead(self):
        # One base of read 2 has quality '*'; the result is expected to show
        # 'N' in its place.
        bases1 = 'AGT'
        quals1 = 'JJJ'
        bases2 = '---GCA'
        quals2 = '!!!J*J'
        expected = 'AGTGNA'

        result = merge_pairs(bases1, bases2, quals1, quals2)
        self.assertEqual(expected, result)
Example #27
0
    def testGap(self):
        # Read 2 starts three positions after read 1 ends; the result is
        # expected to bridge them with 'nnn'.
        bases1 = 'AGT'
        quals1 = 'JJJ'
        bases2 = '------GCA'
        quals2 = '!!!!!!JJJ'
        expected = 'AGTnnnGCA'

        result = merge_pairs(bases1, bases2, quals1, quals2)
        self.assertEqual(expected, result)
Example #28
0
    def testDisagreementWithLowQuality(self):
        # The second base differs between the reads and has quality '!' in
        # read 1 and '*' in read 2; the result is expected to show 'N'.
        bases1 = 'AGTGCA'
        quals1 = 'J!JJJJ'
        bases2 = 'ACTGCA'
        quals2 = 'J*JJJJ'
        expected = 'ANTGCA'

        result = merge_pairs(bases1, bases2, quals1, quals2)
        self.assertEqual(expected, result)
Example #29
0
    def testTwoInsertions(self):
        # Each read brings one insertion, keyed 2 and 5 respectively; the
        # result is expected to contain both 'CCC' and 'TTT'.
        bases1 = 'AGT'
        quals1 = 'JJJ'
        inserts1 = {2: ('CCC', 'JJJ')}
        bases2 = '---GCA'
        quals2 = '!!!JJJ'
        inserts2 = {5: ('TTT', 'JJJ')}
        expected = 'AGCCCTGCTTTA'

        result = merge_pairs(bases1,
                             bases2,
                             quals1,
                             quals2,
                             inserts1,
                             inserts2)
        self.assertEqual(expected, result)
Example #30
0
def merge_reads(quality_cutoff, read_pair):
    """ Combine the two reads of a pair into one merged sequence.

    Pairs whose reads name different regions are skipped, and so are pairs
    with no mapped read at all.
    @param quality_cutoff: minimum quality score for a base to be counted
    @param read_pair: a sequence of two sequences, each with fields from a
    SAM file record; the second one may be empty
    @return: (rname, mseq, merged_inserts, qual1, qual2) or None to skip the pair
    """
    read1, read2 = read_pair
    regions_differ = read2 and read1[2] != read2[2]
    if regions_differ:
        return None

    rname = None
    # One (cigar, seq, qual, zero-based position) tuple per usable read.
    mapped = []
    for fields in read_pair:
        if not fields:
            continue
        # Only the eleven leading SAM fields are used; the rest are ignored.
        (_qname, flag, rname, refpos_str, _mapq, cigar,
         _rnext, _pnext, _tlen, seq, qual) = fields[:11]
        if is_unmapped_read(flag):
            continue
        mapped.append((cigar, seq, qual, int(refpos_str) - 1))

    if not mapped:
        return None

    seq1, qual1, ins1 = apply_cigar(*mapped[0])
    if len(mapped) > 1:
        seq2, qual2, ins2 = apply_cigar(*mapped[1])
    else:
        # Only one usable read: merge it with an empty partner.
        seq2 = qual2 = ''
        ins2 = None

    mseq = merge_pairs(seq1, seq2, qual1, qual2, q_cutoff=quality_cutoff)
    merged_inserts = merge_inserts(ins1, ins2, quality_cutoff)
    return rname, mseq, merged_inserts, qual1, qual2
Example #31
0
    def testTwoInsertions(self):
        # The first read has an insertion keyed 2 and the second has one
        # keyed 5; the expected merge contains both inserted strings.
        forward_bases, forward_quals = 'AGT', 'JJJ'
        reverse_bases, reverse_quals = '---GCA', '!!!JJJ'
        forward_inserts = {2: ('CCC', 'JJJ')}
        reverse_inserts = {5: ('TTT', 'JJJ')}

        merged = merge_pairs(forward_bases,
                             reverse_bases,
                             forward_quals,
                             reverse_quals,
                             forward_inserts,
                             reverse_inserts)

        self.assertEqual('AGCCCTGCTTTA', merged)
Example #32
0
    def testConflictingInsertions(self):
        # Both reads carry an insertion keyed 2 but disagree on its middle
        # base ('C' with quality 'J' versus 'T' with quality 'A'); the result
        # is expected to contain 'CCC'.
        bases1 = 'AGTGCA'
        quals1 = 'JJJJJJ'
        inserts1 = {2: ('CCC', 'JJJ')}
        bases2 = 'AGTGCA'
        quals2 = 'JJJJJJ'
        inserts2 = {2: ('CTC', 'JAJ')}
        expected = 'AGCCCTGCA'

        result = merge_pairs(bases1,
                             bases2,
                             quals1,
                             quals2,
                             inserts1,
                             inserts2)
        self.assertEqual(expected, result)
Example #33
0
    def testConflictingInsertions(self):
        # The reads agree everywhere except inside the insertion keyed 2,
        # whose middle base is 'C' (quality 'J') in the first read and 'T'
        # (quality 'A') in the second; the expected merge uses 'CCC'.
        forward_bases, forward_quals = 'AGTGCA', 'JJJJJJ'
        reverse_bases, reverse_quals = 'AGTGCA', 'JJJJJJ'
        forward_inserts = {2: ('CCC', 'JJJ')}
        reverse_inserts = {2: ('CTC', 'JAJ')}

        merged = merge_pairs(forward_bases,
                             reverse_bases,
                             forward_quals,
                             reverse_quals,
                             forward_inserts,
                             reverse_inserts)

        self.assertEqual('AGCCCTGCA', merged)
Example #34
0
    def testConflictingInsertions(self):
        # Positional arguments: bases 1, bases 2, quality 1, quality 2,
        # insertions 1, insertions 2.
        # The two insertions share key 2 but differ in the middle base; the
        # first read's 'CCC' is expected in the merge.
        merge_args = ('AGTGCA',
                      'AGTGCA',
                      'JJJJJJ',
                      'JJJJJJ',
                      {2: ('CCC', 'JJJ')},
                      {2: ('CTC', 'JAJ')})

        merged_bases = merge_pairs(*merge_args)

        self.assertEqual('AGCCCTGCA', merged_bases)
Example #35
0
    def testTwoInsertions(self):
        # Positional arguments: bases 1, bases 2, quality 1, quality 2,
        # insertions 1, insertions 2.
        # The insertions are keyed 2 and 5; both 'CCC' and 'TTT' are expected
        # in the merge.
        merge_args = ('AGT',
                      '---GCA',
                      'JJJ',
                      '!!!JJJ',
                      {2: ('CCC', 'JJJ')},
                      {5: ('TTT', 'JJJ')})

        merged_bases = merge_pairs(*merge_args)

        self.assertEqual('AGCCCTGCTTTA', merged_bases)
Example #36
0
def merge_reads(quality_cutoff, read_pair):
    """ Merge a pair of reads.

    Also skip reads that don't meet certain criteria: the pair is skipped
    when its two reads name different regions, or when neither read is both
    present and mapped.
    @param quality_cutoff: minimum quality score for a base to be counted
    @param read_pair: a sequence of two sequences, each with fields from a
    SAM file record; either one may be empty or None
    @return: (rname, mseq, merged_inserts, qual1, qual2) or None to skip the pair
    """
    read1, read2 = read_pair
    # Compare regions only when both reads are present. The loop below
    # already tolerates a missing read, so the guard must not subscript one.
    if read1 and read2 and read1[2] != read2[2]:
        # region mismatch, ignore the read pair.
        return None
    filtered_reads = []
    for read in read_pair:
        if not read:
            continue
        (_qname,
         flag,
         rname,
         refpos_str,
         _mapq,
         cigar,
         _rnext,
         _pnext,
         _tlen,
         seq,
         qual) = read[:11]  # ignore optional fields
        if is_unmapped_read(flag):
            continue
        filtered_reads.append(dict(rname=rname,
                                   cigar=cigar,
                                   seq=seq,
                                   qual=qual,
                                   pos=int(refpos_str)))
    if not filtered_reads:
        return None
    first = filtered_reads[0]
    # SAM positions are converted to zero-based before applying the CIGAR.
    seq1, qual1, ins1 = apply_cigar(first['cigar'],
                                    first['seq'],
                                    first['qual'],
                                    first['pos']-1)
    if len(filtered_reads) == 1:
        # Only one usable read: merge it with an empty partner.
        seq2 = qual2 = ''
        ins2 = None
    else:
        second = filtered_reads[1]
        seq2, qual2, ins2 = apply_cigar(second['cigar'],
                                        second['seq'],
                                        second['qual'],
                                        second['pos']-1)
    mseq = merge_pairs(seq1, seq2, qual1, qual2, q_cutoff=quality_cutoff)
    merged_inserts = merge_inserts(ins1, ins2, quality_cutoff)
    # Both reads name the same region at this point, so the first usable
    # read's region name describes the pair.
    return first['rname'], mseq, merged_inserts, qual1, qual2
Example #37
0
def sam_g2p(pssm, remap_csv, nuc_csv, g2p_csv, g2p_summary_csv=None, min_count=1):
    """ Merge paired reads clipped to the 'V3LOOP' region and report each
    distinct merged sequence with its count.

    @param pssm: passed unchanged to _build_row() for every reported sequence
    @param remap_csv: CSV source with columns qname, rname, pos, cigar, seq
    and qual
    @param nuc_csv: CSV source with columns seed, region and query.nuc.pos
    @param g2p_csv: destination for the header line and the result rows
    @param g2p_summary_csv: optional destination for a header line plus one
    line of totals (mapped, valid, X4calls, X4pct, final)
    @param min_count: merged sequences seen fewer times than this are not
    reported individually; their counts are totalled in one extra row
    """
    unpaired = {}  # {qname: (seq, qual, ins)} for reads waiting for a mate
    merged_counts = Counter()  # { merged_nuc_seq: count }
    tracker = RegionTracker('V3LOOP')

    # look up clipping region for each read
    for nuc_row in csv.DictReader(nuc_csv):
        query_pos = nuc_row['query.nuc.pos']
        if query_pos != '':
            # rows with an empty position are deletions in query relative to
            # reference, and are not tracked
            tracker.add_nuc(nuc_row['seed'],
                            nuc_row['region'],
                            int(query_pos) - 1)

    # parse contents of remap CSV output
    for read_row in csv.DictReader(remap_csv):
        clip_from, clip_to = tracker.get_range(read_row['rname'])
        if clip_from is None or read_row['cigar'] == '*':
            # uninteresting region
            continue

        seq2, qual2, ins2 = apply_cigar(read_row['cigar'],
                                        read_row['seq'],
                                        read_row['qual'],
                                        int(read_row['pos']) - 1,
                                        clip_from,
                                        clip_to)

        qname = read_row['qname']
        mate = unpaired.pop(qname, None)
        if mate:
            # Second read of the pair: merge it with the cached first read.
            seq1, qual1, ins1 = mate
            mseq = merge_pairs(seq1, seq2, qual1, qual2, ins1, ins2)
            merged_counts[mseq] += 1
        else:
            # First read of the pair: cache it until its mate shows up.
            unpaired[qname] = (seq2, qual2, ins2)

    # apply g2p algorithm to merged reads
    g2p_columns = ['rank',
                   'count',
                   'g2p',
                   'fpr',
                   'call',
                   'seq',
                   'aligned',
                   'error',
                   'comment']
    g2p_writer = csv.DictWriter(g2p_csv,
                                g2p_columns,
                                lineterminator=os.linesep)
    g2p_writer.writeheader()
    counts = Counter()
    skipped_total = 0
    for merged_seq, seq_count in merged_counts.most_common():
        if seq_count < min_count:
            skipped_total += seq_count
        else:
            # remove in-frame deletions; pat is defined outside this function
            cleaned_seq = re.sub(pat, r'\g<1>\g<3>', merged_seq)
            g2p_writer.writerow(_build_row(cleaned_seq,
                                           seq_count,
                                           counts,
                                           pssm))
    if skipped_total:
        # All rare sequences share one row, ranked after the reported ones.
        counts['mapped'] += skipped_total
        g2p_writer.writerow({'rank': counts['rank'] + 1,
                             'count': skipped_total,
                             'error': 'count < {}'.format(min_count)})

    if g2p_summary_csv is None:
        return

    if counts['valid'] == 0:
        # No valid sequences: leave the percentage and the call blank.
        x4_pct_display = final_call = ''
    else:
        x4_pct = 100.0 * counts['x4'] / counts['valid']
        final_call = 'X4' if x4_pct >= 2.0 else 'R5'
        x4_pct_display = '{:0.2f}'.format(x4_pct)
    summary_writer = csv.writer(g2p_summary_csv, lineterminator=os.linesep)
    summary_writer.writerow(['mapped', 'valid', 'X4calls', 'X4pct', 'final'])
    summary_writer.writerow([counts['mapped'],
                             counts['valid'],
                             counts['x4'],
                             x4_pct_display,
                             final_call])