Esempio n. 1
0
def make_reference(data_folder,
                   adaID,
                   fragments,
                   refname,
                   VERBOSE=0,
                   summary=True):
    '''Make reference sequence trimmed to the necessary parts'''
    from hivwholeseq.reference import load_custom_reference
    seq = load_custom_reference(refname)

    output_filename = get_reference_premap_filename(data_folder, adaID)

    if fragments is None:
        seq_trim = seq
    else:
        # Look for the first fwd and the last rev primers to trim the reference
        # NOTE: this works even if F1 or F6 are missing (e.g. only F2-5 are seq-ed)!
        # If more than one primer is used for the first or last fragment, take the
        # longest reference
        from hivwholeseq.data.primers import primers_PCR, primers_coordinates_HXB2
        if '+' in fragments[0]:
            fragment_subs = [
                fragments[0][:2] + fsub + fragments[0][-1]
                for fsub in fragments[0][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][0][0] for fsub in fragment_subs
            ]
            fragments[0] = fragment_subs[np.argmin(fr_pos_subs)]

        pr_fwd = primers_PCR[fragments[0]][0]

        if '+' in fragments[-1]:
            fragment_subs = [
                fragments[-1][:2] + fsub + fragments[-1][-1]
                for fsub in fragments[-1][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][1][1] for fsub in fragment_subs
            ]
            fragments[-1] = fragment_subs[np.argmax(fr_pos_subs)]

        pr_rev = primers_PCR[fragments[-1]][1]

        smat = np.array(seq)

        # Get all possible primers from ambiguous nucleotides and get the best match
        from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
        pr_fwd_mat = np.array(map(list, eas(pr_fwd)), 'S1')
        n_matches_fwd = [
            (smat[i:i + len(pr_fwd)] == pr_fwd_mat).sum(axis=1).max()
            for i in xrange(len(seq) - len(pr_fwd))
        ]
        pr_fwd_pos = np.argmax(n_matches_fwd)

        pr_rev_mat = np.array(map(list, eas(pr_rev)), 'S1')
        n_matches_rev = [
            (smat[i:i + len(pr_rev)] == pr_rev_mat).sum(axis=1).max()
            for i in xrange(pr_fwd_pos + len(pr_fwd),
                            len(seq) - len(pr_rev))
        ]
        # Here you come from the right, i.e. look in the 3' LTR first
        pr_rev_pos = len(seq) - len(pr_rev) - 1 - np.argmax(
            n_matches_rev[::-1])

        output = [['Reference name:', refname]]
        output.append(['FWD primer:', fragments[0], str(pr_fwd_pos), pr_fwd])
        output.append(['REV primer:', fragments[-1], str(pr_rev_pos), pr_rev])
        output = '\n'.join(map(' '.join, output))
        if VERBOSE:
            print output

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write(output)
                f.write('\n')

        # The reference includes both the first fwd primer and the last rev one
        seq_trim = seq[pr_fwd_pos:pr_rev_pos + len(pr_rev)]
        seq_trim.id = '_'.join(
            [seq_trim.id,
             str(pr_fwd_pos + 1),
             str(pr_rev_pos + len(pr_rev))])
        seq_trim.name = '_'.join([
            seq_trim.name,
            str(pr_fwd_pos + 1),
            str(pr_rev_pos + len(pr_rev))
        ])
        seq_trim.description = ' '.join([
            seq_trim.description, 'from',
            str(pr_fwd_pos + 1), 'to',
            str(pr_rev_pos + len(pr_rev)),
            '(indices from 1, extremes included)'
        ])

    SeqIO.write(seq_trim, output_filename, 'fasta')

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Reference sequence written to: ' + output_filename)
            f.write('\n')
Esempio n. 2
0
def trim_and_divide_reads(data_folder, adaID, n_cycles, fragments,
                          maxreads=-1, VERBOSE=0,
                          minisize=100,
                          include_tests=False, summary=True):
    '''Trim reads and divide them into fragments'''
    if VERBOSE:
        print 'Trim and divide into fragments: adaID '+adaID+', fragments: '+\
                ' '.join(fragments)

    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Fragments used: '+' '.join(fragments)+'\n')

    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')
    smat = np.array(refseq, 'S1')
    len_reference = len(refseq)

    # Get the positions of fragment start/end, w/ and w/o primers
    frags_pos = get_fragment_positions(smat, fragments)
    store_reference_fragmented(data_folder, adaID, refseq,
                               dict(zip(fragments, frags_pos['trim'])))
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Primer positions (for fragments):\n')
            for (fragment, poss_full, poss_trim) in izip(fragments,
                                                         frags_pos['full'],
                                                         frags_pos['trim']):
                f.write(fragment+': fwd '+str(poss_full[0])+' '+str(poss_trim[0])+\
                                 ', rev '+str(poss_trim[1])+' '+str(poss_full[1])+'\n')
    write_fragment_positions(data_folder, adaID, fragments, frags_pos)

    # Get the positions of the unwanted outer primers (in case we DO nested PCR
    # for that fragment)
    # NOTE: the LTRs make no problem, because the rev outer primer of F6
    # is not in the reference anymore if F6 has undergone nested PCR
    # FIXME: this might not work if we have mixed fragments (e.g. F5a+b) AND nesting
    from re import findall
    primers_out = {'fwd': [], 'rev': []}
    for i, fr in enumerate(fragments):
        if (i != 0) and findall(r'F[2-6][a-z]?i', fr):
            primers_out['fwd'].append(fr[:-1]+'o')
        if (i != len(fragments) - 1) and findall(r'F[1-5][a-z]?i', fr):
            primers_out['rev'].append(fr[:-1]+'o')

    # Get all possible unambiguous primers for the unwanted outer primers
    from hivwholeseq.data.primers import primers_PCR
    from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
    primers_out_seq = {'fwd': [np.array(map(list, eas(primers_PCR[fr][0])),
                                        'S1', ndmin=2)
                               for fr in primers_out['fwd']],
                       'rev': [np.array(map(list, eas(primers_PCR[fr][1])),
                                        'S1', ndmin=2)
                               for fr in primers_out['rev']],
                      }
    primers_out_pos = {'fwd': [], 'rev': []}
    if primers_out['fwd']:
        primers_out_pos['fwd'] = map(itemgetter(0),
                                     get_primer_positions(smat,
                                                          primers_out['fwd'], 'fwd'))
    if primers_out['rev']:
        primers_out_pos['rev'] = map(itemgetter(1),
                                     get_primer_positions(smat,
                                                          primers_out['rev'], 'rev'))

    # Input and output files
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        convert_sam_to_bam(input_filename)
    output_filenames = get_divided_filenames(data_folder, adaID, fragments, type='bam')
    with pysam.Samfile(input_filename, 'rb') as bamfile:

        try:
            file_handles = [pysam.Samfile(ofn, 'wb', template=bamfile)
                            for ofn in output_filenames[:len(fragments)]]
    
            fo_am = pysam.Samfile(output_filenames[-4], 'wb', template=bamfile)
            fo_cm = pysam.Samfile(output_filenames[-3], 'wb', template=bamfile)
            fo_um = pysam.Samfile(output_filenames[-2], 'wb', template=bamfile)
            fo_lq = pysam.Samfile(output_filenames[-1], 'wb', template=bamfile)

            # Iterate over the mapped reads and assign fragments
            n_mapped = [0 for fragment in fragments]
            n_unmapped = 0
            n_crossfrag = 0
            n_ambiguous = 0
            n_outer = 0
            n_lowq = 0
            for irp, reads in enumerate(pair_generator(bamfile)):

                if irp == maxreads:
                    if VERBOSE:
                        print 'Maximal number of read pairs reached:', maxreads
                    break

                if VERBOSE >= 2:
                    if not ((irp+1) % 10000):
                        print irp+1

                i_fwd = reads[0].is_reverse

                # If unmapped or unpaired, mini, or insert size mini, or
                # divergent read pair (fully cross-overlapping), discard
                if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
                   reads[1].is_unmapped or (not reads[1].is_proper_pair) or \
                   (reads[0].rlen < 50) or (reads[1].rlen < 50) or \
                   (reads[i_fwd].isize < minisize):
                    if VERBOSE >= 3:
                        print 'Read pair unmapped/unpaired/tiny/divergent:', reads[0].qname
                    n_unmapped += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # If the insert is a misamplification from the outer primers
                # in fragments that underwent nested PCR,
                # trash it (it will have skewed amplification anyway). We cannot
                # find all of those, rather only the ones still carrying the
                # primer itself (some others have lost it while shearing). For
                # those, no matter what happens at the end (reading into adapters,
                # etc.), ONE of the reads in the pair will start exactly with one
                # outer primer: if the rev read with a rev primer, if the fwd
                # with a fwd one. Test all six.
                if (len(primers_out_pos['fwd']) or len(primers_out_pos['rev'])) and \
                   test_outer_primer(reads,
                                     primers_out_pos, primers_out_seq,
                                     len_reference):
                    if VERBOSE >= 3:
                        print 'Read pair from outer primer:', reads[0].qname
                    n_outer += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # FIXME: the following becomes a bit harder when we mix parallel
                # PCRs, e.g. F5a+b, to get more product

                # Assign to a fragment now, so that primer trimming is faster 
                pair_identity = assign_to_fragment(reads, frags_pos['full'],
                                                   VERBOSE=VERBOSE)

                # 1. If no fragments are possible (e.g. one read crosses the
                # fragment boundary, they map to different fragments), dump it
                # into a special bucket
                if pair_identity == 'cross':
                    n_crossfrag += 1
                    fo_cm.write(reads[0])
                    fo_cm.write(reads[1])
                    continue

                # 2. If 2+ fragments are possible (tie), put into a special bucket
                # (essentially excluded, because we want two independent measurements
                # in the overlapping region, but we might want to recover them)
                elif pair_identity == 'ambiguous':
                    n_ambiguous += 1
                    fo_am.write(reads[0])
                    fo_am.write(reads[1])
                    continue

                # 3. If the intersection is a single fragment, good: trim the primers
                # NB: n_frag is the index IN THE POOL. If we sequence only F2-F5, F2 is n_frag = 0
                n_frag = int(pair_identity)
                frag_pos = frags_pos['trim'][n_frag]
                if not np.isscalar(frag_pos[0]):
                    frag_pos = [frag_pos[0]['inner'], frag_pos[1]['inner']]
                trashed_primers = trim_primers(reads, frag_pos,
                                               include_tests=include_tests)
                if trashed_primers or (reads[i_fwd].isize < 100):
                    n_unmapped += 1
                    if VERBOSE >= 3:
                        print 'Read pair is mismapped:', reads[0].qname
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # Quality trimming: if no decently long pair survives, trash
                #trashed_quality = main_block_low_quality(reads, phred_min=20,
                #                                         include_tests=include_tests)
                trashed_quality = trim_low_quality(reads, phred_min=20,
                                                   include_tests=include_tests)
                if trashed_quality or (reads[i_fwd].isize < 100):
                    n_lowq += 1
                    if VERBOSE >= 3:
                        print 'Read pair has low phred quality:', reads[0].qname
                    fo_lq.write(reads[0])
                    fo_lq.write(reads[1])
                    continue

                # Check for cross-overhangs or COH (reading into the adapters)
                #        --------------->
                #    <-----------
                # In that case, trim to perfect overlap.
                if test_coh(reads, VERBOSE=False):
                    trim_coh(reads, trim=0, include_tests=include_tests)

                # Change coordinates into the fragmented reference (primer-trimmed)
                for read in reads:
                    read.pos -= frag_pos[0]
                    read.mpos -= frag_pos[0]

                # Here the tests
                if include_tests:
                    lfr = frags_pos['trim'][n_frag][1] - frags_pos['trim'][n_frag][0]
                    if test_sanity(reads, n_frag, lfr):
                        print 'Tests failed:', reads[0].qname
                        import ipdb; ipdb.set_trace()

                # There we go!
                n_mapped[n_frag] += 1
                file_handles[n_frag].write(reads[0])
                file_handles[n_frag].write(reads[1])

        finally:
            for f in file_handles:
                f.close()
            fo_am.close()
            fo_cm.close()
            fo_um.close()
            fo_lq.close()


    if VERBOSE:
        print 'Trim and divide results: adaID '+adaID
        print 'Total:\t\t', irp
        print 'Mapped:\t\t', sum(n_mapped), n_mapped
        print 'Unmapped/unpaired/tiny:\t', n_unmapped
        print 'Outer primer\t', n_outer
        print 'Crossfrag:\t', n_crossfrag
        print 'Ambiguous:\t', n_ambiguous
        print 'Low-quality:\t', n_lowq

    # Write summary to file
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Trim and divide results: adaID '+adaID+'\n')
            f.write('Total:\t\t'+str(irp + 1)+'\n')
            f.write('Mapped:\t\t'+str(sum(n_mapped))+' '+str(n_mapped)+'\n')
            f.write('Unmapped/unpaired/tiny insert:\t'+str(n_unmapped)+'\n')
            f.write('Outer primer\t'+str(n_outer)+'\n')
            f.write('Crossfrag:\t'+str(n_crossfrag)+'\n')
            f.write('Ambiguous:\t'+str(n_ambiguous)+'\n')
            f.write('Low-quality:\t'+str(n_lowq)+'\n')
Esempio n. 3
0
def get_primer_positions(smat, fragments, type='both'):
    '''Get the primer positions for fwd, rev, or both primers'''
    from hivwholeseq.data.primers import primers_PCR
    from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas

    # j controls the direction: j = 0 --> FWD, j = 1 --> REV
    types = {'fwd': [0], 'rev': [1], 'both': [0, 1]}
    js = types[type]
    primer_poss = [[], []]
    for j in js:
        pr_old_pos = 0
        pr_old = ''
        for ifr, fragment in enumerate(fragments):
            # Sometimes, the PCR is performed with different primer sets in parallel
            # to get more DNA, so we must take the innermost to be conservative (we
            # lose some reads, but the ones we keep are good).
            # NOTE: F6 is not ambiguous, so we do not treat it specially here (see below)
            if '+' in fragment:
                if fragment[-1] == 'o':
                    pcs = pcos
                elif fragment[-1] == 'i':
                    pcs = pcis
                else:
                    raise ValueError('Neither PCR1 nor PCR2??')

                fragment_subs = [fragment[:2]+fsub for fsub in fragment[2:-1].split('+')]
                pco_inn = [pcs[fsub][j][not j] for fsub in fragment_subs]
                if not j:
                    fragment_inn = fragment_subs[np.argmax(pco_inn)]+fragment[-1]
                    fragment_out = fragment_subs[np.argmin(pco_inn)]+fragment[-1]
                else:
                    fragment_inn = fragment_subs[np.argmin(pco_inn)]+fragment[-1]
                    fragment_out = fragment_subs[np.argmax(pco_inn)]+fragment[-1]

                if VERBOSE >= 3:
                    print j, fragment_subs, pco_inn, fragment_inn, fragment_out

                pr_pos_pair = {}
                # Get the left first
                if not j:
                    fragment = fragment_out
                    label = 'outer'
                else:
                    fragment = fragment_inn
                    label = 'inner'

                # Expand ambiguous primers in a list of all possible unambiguous ones,
                # and look for the best approximate match between all primers and a
                # sliding window in the HIV genome
                pr = primers_PCR[fragment][j]
                pr_mat = np.array(map(list, eas(pr)), 'S1')
                n_matches = [(smat[i: i + len(pr)] == pr_mat).sum(axis=1).max()
                             for i in xrange(pr_old_pos + len(pr_old),
                                             len(smat) - len(pr) + 1)]

                pr_pos = pr_old_pos + len(pr_old) + np.argmax(n_matches)
                pr_pos_pair[label] = [pr_pos, pr_pos + len(pr)]

                # Get the right one second
                if not j:
                    fragment = fragment_inn
                    label = 'inner'
                else:
                    fragment = fragment_out
                    label = 'outer'

                pr = primers_PCR[fragment][j]
                pr_mat = np.array(map(list, eas(pr)), 'S1')
                n_matches = [(smat[i: i + len(pr)] == pr_mat).sum(axis=1).max()
                             for i in xrange(pr_pos,
                                             len(smat) - len(pr) + 1)]

                pr_pos = pr_pos + np.argmax(n_matches)
                pr_pos_pair[label] = [pr_pos, pr_pos + len(pr)]

            else:

                # Expand ambiguous primers in a list of all possible unambiguous ones,
                # and look for the best approximate match between all primers and a
                # sliding window in the HIV genome
                pr = primers_PCR[fragment][j]
                pr_mat = np.array(map(list, eas(pr)), 'S1')
                n_matches = [(smat[i: i + len(pr)] == pr_mat).sum(axis=1).max()
                             for i in xrange(pr_old_pos + len(pr_old),
                                             len(smat) - len(pr) + 1)]

                # NOTE: F6 rev lies in the LTR, so we risk reading the other LTR.
                # Treat it as a special case: come from the right!
                if j and ('F6' in fragment):
                    pr_pos = len(smat) - len(pr) - np.argmax(n_matches[::-1])
                else:
                    pr_pos = pr_old_pos + len(pr_old) + np.argmax(n_matches)

                pr_pos_pair = [pr_pos, pr_pos + len(pr)]

            primer_poss[j].append(pr_pos_pair)
            pr_old_pos = pr_pos
            pr_old = pr

    if type != 'both':
        return primer_poss[type == 'rev']
    else:
        return primer_poss
def make_reference(data_folder,
                   adaID,
                   fragments,
                   refname,
                   VERBOSE=0,
                   summary=True):
    '''Make reference sequence trimmed to the necessary parts'''
    from hivwholeseq.reference import load_custom_reference
    seq = load_custom_reference(refname)

    output_filename = get_reference_premap_filename(data_folder, adaID)

    if fragments is None:
        seq_trim = seq
    else:
        # Look for the first fwd and the last rev primers to trim the reference
        # NOTE: this works even if F1 or F6 are missing (e.g. only F2-5 are seq-ed)!
        # If more than one primer is used for the first or last fragment, take the
        # longest reference
        from hivwholeseq.data.primers import primers_PCR, primers_coordinates_HXB2
        if '+' in fragments[0]:
            fragment_subs = [
                fragments[0][:2] + fsub + fragments[0][-1]
                for fsub in fragments[0][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][0][0] for fsub in fragment_subs
            ]
            fragments[0] = fragment_subs[np.argmin(fr_pos_subs)]

        pr_fwd = primers_PCR[fragments[0]][0]

        if '+' in fragments[-1]:
            fragment_subs = [
                fragments[-1][:2] + fsub + fragments[-1][-1]
                for fsub in fragments[-1][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][1][1] for fsub in fragment_subs
            ]
            fragments[-1] = fragment_subs[np.argmax(fr_pos_subs)]

        pr_rev = primers_PCR[fragments[-1]][1]

        smat = np.array(seq)

        # Get all possible primers from ambiguous nucleotides and get the best match
        from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
        pr_fwd_mat = np.array(map(list, eas(pr_fwd)), 'S1')
        n_matches_fwd = [
            (smat[i:i + len(pr_fwd)] == pr_fwd_mat).sum(axis=1).max()
            for i in xrange(len(seq) - len(pr_fwd))
        ]
        pr_fwd_pos = np.argmax(n_matches_fwd)

        pr_rev_mat = np.array(map(list, eas(pr_rev)), 'S1')
        n_matches_rev = [
            (smat[i:i + len(pr_rev)] == pr_rev_mat).sum(axis=1).max()
            for i in xrange(pr_fwd_pos + len(pr_fwd),
                            len(seq) - len(pr_rev))
        ]
        # Here you come from the right, i.e. look in the 3' LTR first
        pr_rev_pos = len(seq) - len(pr_rev) - 1 - np.argmax(
            n_matches_rev[::-1])

        output = [['Reference name:', refname]]
        output.append(['FWD primer:', fragments[0], str(pr_fwd_pos), pr_fwd])
        output.append(['REV primer:', fragments[-1], str(pr_rev_pos), pr_rev])
        output = '\n'.join(map(' '.join, output))
        if VERBOSE:
            print output

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write(output)
                f.write('\n')

        # The reference includes both the first fwd primer and the last rev one
        seq_trim = seq[pr_fwd_pos:pr_rev_pos + len(pr_rev)]
        seq_trim.id = '_'.join(
            [seq_trim.id,
             str(pr_fwd_pos + 1),
             str(pr_rev_pos + len(pr_rev))])
        seq_trim.name = '_'.join([
            seq_trim.name,
            str(pr_fwd_pos + 1),
            str(pr_rev_pos + len(pr_rev))
        ])
        seq_trim.description = ' '.join([
            seq_trim.description, 'from',
            str(pr_fwd_pos + 1), 'to',
            str(pr_rev_pos + len(pr_rev)),
            '(indices from 1, extremes included)'
        ])

    SeqIO.write(seq_trim, output_filename, 'fasta')

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Reference sequence written to: ' + output_filename)
            f.write('\n')