Python eas Examples

Programming Language: Python

Namespace/Package Name: hivwholeseq.utils.sequence

Method/Function: eas

Examples at hotexamples.com: 4

Python eas - 4 examples found. These are the top rated real world Python examples of hivwholeseq.utils.sequence.eas extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: premap_to_reference.py Project: 5l1v3r1/hivwholeseq

def make_reference(data_folder,
                   adaID,
                   fragments,
                   refname,
                   VERBOSE=0,
                   summary=True):
    '''Make reference sequence trimmed to the necessary parts'''
    from hivwholeseq.reference import load_custom_reference
    seq = load_custom_reference(refname)

    output_filename = get_reference_premap_filename(data_folder, adaID)

    if fragments is None:
        seq_trim = seq
    else:
        # Look for the first fwd and the last rev primers to trim the reference
        # NOTE: this works even if F1 or F6 are missing (e.g. only F2-5 are seq-ed)!
        # If more than one primer is used for the first or last fragment, take the
        # longest reference
        from hivwholeseq.data.primers import primers_PCR, primers_coordinates_HXB2
        if '+' in fragments[0]:
            fragment_subs = [
                fragments[0][:2] + fsub + fragments[0][-1]
                for fsub in fragments[0][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][0][0] for fsub in fragment_subs
            ]
            fragments[0] = fragment_subs[np.argmin(fr_pos_subs)]

        pr_fwd = primers_PCR[fragments[0]][0]

        if '+' in fragments[-1]:
            fragment_subs = [
                fragments[-1][:2] + fsub + fragments[-1][-1]
                for fsub in fragments[-1][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][1][1] for fsub in fragment_subs
            ]
            fragments[-1] = fragment_subs[np.argmax(fr_pos_subs)]

        pr_rev = primers_PCR[fragments[-1]][1]

        smat = np.array(seq)

        # Get all possible primers from ambiguous nucleotides and get the best match
        from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
        pr_fwd_mat = np.array(map(list, eas(pr_fwd)), 'S1')
        n_matches_fwd = [
            (smat[i:i + len(pr_fwd)] == pr_fwd_mat).sum(axis=1).max()
            for i in xrange(len(seq) - len(pr_fwd))
        ]
        pr_fwd_pos = np.argmax(n_matches_fwd)

        pr_rev_mat = np.array(map(list, eas(pr_rev)), 'S1')
        n_matches_rev = [
            (smat[i:i + len(pr_rev)] == pr_rev_mat).sum(axis=1).max()
            for i in xrange(pr_fwd_pos + len(pr_fwd),
                            len(seq) - len(pr_rev))
        ]
        # Here you come from the right, i.e. look in the 3' LTR first
        pr_rev_pos = len(seq) - len(pr_rev) - 1 - np.argmax(
            n_matches_rev[::-1])

        output = [['Reference name:', refname]]
        output.append(['FWD primer:', fragments[0], str(pr_fwd_pos), pr_fwd])
        output.append(['REV primer:', fragments[-1], str(pr_rev_pos), pr_rev])
        output = '\n'.join(map(' '.join, output))
        if VERBOSE:
            print output

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write(output)
                f.write('\n')

        # The reference includes both the first fwd primer and the last rev one
        seq_trim = seq[pr_fwd_pos:pr_rev_pos + len(pr_rev)]
        seq_trim.id = '_'.join(
            [seq_trim.id,
             str(pr_fwd_pos + 1),
             str(pr_rev_pos + len(pr_rev))])
        seq_trim.name = '_'.join([
            seq_trim.name,
            str(pr_fwd_pos + 1),
            str(pr_rev_pos + len(pr_rev))
        ])
        seq_trim.description = ' '.join([
            seq_trim.description, 'from',
            str(pr_fwd_pos + 1), 'to',
            str(pr_rev_pos + len(pr_rev)),
            '(indices from 1, extremes included)'
        ])

    SeqIO.write(seq_trim, output_filename, 'fasta')

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Reference sequence written to: ' + output_filename)
            f.write('\n')

Example #2

Show file

File: trim_and_divide.py Project: 5l1v3r1/hivwholeseq

def trim_and_divide_reads(data_folder, adaID, n_cycles, fragments,
                          maxreads=-1, VERBOSE=0,
                          minisize=100,
                          include_tests=False, summary=True):
    '''Trim reads and divide them into fragments'''
    if VERBOSE:
        print 'Trim and divide into fragments: adaID '+adaID+', fragments: '+\
                ' '.join(fragments)

    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Fragments used: '+' '.join(fragments)+'\n')

    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')
    smat = np.array(refseq, 'S1')
    len_reference = len(refseq)

    # Get the positions of fragment start/end, w/ and w/o primers
    frags_pos = get_fragment_positions(smat, fragments)
    store_reference_fragmented(data_folder, adaID, refseq,
                               dict(zip(fragments, frags_pos['trim'])))
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Primer positions (for fragments):\n')
            for (fragment, poss_full, poss_trim) in izip(fragments,
                                                         frags_pos['full'],
                                                         frags_pos['trim']):
                f.write(fragment+': fwd '+str(poss_full[0])+' '+str(poss_trim[0])+\
                                 ', rev '+str(poss_trim[1])+' '+str(poss_full[1])+'\n')
    write_fragment_positions(data_folder, adaID, fragments, frags_pos)

    # Get the positions of the unwanted outer primers (in case we DO nested PCR
    # for that fragment)
    # NOTE: the LTRs make no problem, because the rev outer primer of F6
    # is not in the reference anymore if F6 has undergone nested PCR
    # FIXME: this might not work if we have mixed fragments (e.g. F5a+b) AND nesting
    from re import findall
    primers_out = {'fwd': [], 'rev': []}
    for i, fr in enumerate(fragments):
        if (i != 0) and findall(r'F[2-6][a-z]?i', fr):
            primers_out['fwd'].append(fr[:-1]+'o')
        if (i != len(fragments) - 1) and findall(r'F[1-5][a-z]?i', fr):
            primers_out['rev'].append(fr[:-1]+'o')

    # Get all possible unambiguous primers for the unwanted outer primers
    from hivwholeseq.data.primers import primers_PCR
    from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
    primers_out_seq = {'fwd': [np.array(map(list, eas(primers_PCR[fr][0])),
                                        'S1', ndmin=2)
                               for fr in primers_out['fwd']],
                       'rev': [np.array(map(list, eas(primers_PCR[fr][1])),
                                        'S1', ndmin=2)
                               for fr in primers_out['rev']],
                      }
    primers_out_pos = {'fwd': [], 'rev': []}
    if primers_out['fwd']:
        primers_out_pos['fwd'] = map(itemgetter(0),
                                     get_primer_positions(smat,
                                                          primers_out['fwd'], 'fwd'))
    if primers_out['rev']:
        primers_out_pos['rev'] = map(itemgetter(1),
                                     get_primer_positions(smat,
                                                          primers_out['rev'], 'rev'))

    # Input and output files
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        convert_sam_to_bam(input_filename)
    output_filenames = get_divided_filenames(data_folder, adaID, fragments, type='bam')
    with pysam.Samfile(input_filename, 'rb') as bamfile:

        try:
            file_handles = [pysam.Samfile(ofn, 'wb', template=bamfile)
                            for ofn in output_filenames[:len(fragments)]]
    
            fo_am = pysam.Samfile(output_filenames[-4], 'wb', template=bamfile)
            fo_cm = pysam.Samfile(output_filenames[-3], 'wb', template=bamfile)
            fo_um = pysam.Samfile(output_filenames[-2], 'wb', template=bamfile)
            fo_lq = pysam.Samfile(output_filenames[-1], 'wb', template=bamfile)

            # Iterate over the mapped reads and assign fragments
            n_mapped = [0 for fragment in fragments]
            n_unmapped = 0
            n_crossfrag = 0
            n_ambiguous = 0
            n_outer = 0
            n_lowq = 0
            for irp, reads in enumerate(pair_generator(bamfile)):

                if irp == maxreads:
                    if VERBOSE:
                        print 'Maximal number of read pairs reached:', maxreads
                    break

                if VERBOSE >= 2:
                    if not ((irp+1) % 10000):
                        print irp+1

                i_fwd = reads[0].is_reverse

                # If unmapped or unpaired, mini, or insert size mini, or
                # divergent read pair (fully cross-overlapping), discard
                if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
                   reads[1].is_unmapped or (not reads[1].is_proper_pair) or \
                   (reads[0].rlen < 50) or (reads[1].rlen < 50) or \
                   (reads[i_fwd].isize < minisize):
                    if VERBOSE >= 3:
                        print 'Read pair unmapped/unpaired/tiny/divergent:', reads[0].qname
                    n_unmapped += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # If the insert is a misamplification from the outer primers
                # in fragments that underwent nested PCR,
                # trash it (it will have skewed amplification anyway). We cannot
                # find all of those, rather only the ones still carrying the
                # primer itself (some others have lost it while shearing). For
                # those, no matter what happens at the end (reading into adapters,
                # etc.), ONE of the reads in the pair will start exactly with one
                # outer primer: if the rev read with a rev primer, if the fwd
                # with a fwd one. Test all six.
                if (len(primers_out_pos['fwd']) or len(primers_out_pos['rev'])) and \
                   test_outer_primer(reads,
                                     primers_out_pos, primers_out_seq,
                                     len_reference):
                    if VERBOSE >= 3:
                        print 'Read pair from outer primer:', reads[0].qname
                    n_outer += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # FIXME: the following becomes a bit harder when we mix parallel
                # PCRs, e.g. F5a+b, to get more product

                # Assign to a fragment now, so that primer trimming is faster 
                pair_identity = assign_to_fragment(reads, frags_pos['full'],
                                                   VERBOSE=VERBOSE)

                # 1. If no fragments are possible (e.g. one read crosses the
                # fragment boundary, they map to different fragments), dump it
                # into a special bucket
                if pair_identity == 'cross':
                    n_crossfrag += 1
                    fo_cm.write(reads[0])
                    fo_cm.write(reads[1])
                    continue

                # 2. If 2+ fragments are possible (tie), put into a special bucket
                # (essentially excluded, because we want two independent measurements
                # in the overlapping region, but we might want to recover them)
                elif pair_identity == 'ambiguous':
                    n_ambiguous += 1
                    fo_am.write(reads[0])
                    fo_am.write(reads[1])
                    continue

                # 3. If the intersection is a single fragment, good: trim the primers
                # NB: n_frag is the index IN THE POOL. If we sequence only F2-F5, F2 is n_frag = 0
                n_frag = int(pair_identity)
                frag_pos = frags_pos['trim'][n_frag]
                if not np.isscalar(frag_pos[0]):
                    frag_pos = [frag_pos[0]['inner'], frag_pos[1]['inner']]
                trashed_primers = trim_primers(reads, frag_pos,
                                               include_tests=include_tests)
                if trashed_primers or (reads[i_fwd].isize < 100):
                    n_unmapped += 1
                    if VERBOSE >= 3:
                        print 'Read pair is mismapped:', reads[0].qname
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # Quality trimming: if no decently long pair survives, trash
                #trashed_quality = main_block_low_quality(reads, phred_min=20,
                #                                         include_tests=include_tests)
                trashed_quality = trim_low_quality(reads, phred_min=20,
                                                   include_tests=include_tests)
                if trashed_quality or (reads[i_fwd].isize < 100):
                    n_lowq += 1
                    if VERBOSE >= 3:
                        print 'Read pair has low phred quality:', reads[0].qname
                    fo_lq.write(reads[0])
                    fo_lq.write(reads[1])
                    continue

                # Check for cross-overhangs or COH (reading into the adapters)
                #        --------------->
                #    <-----------
                # In that case, trim to perfect overlap.
                if test_coh(reads, VERBOSE=False):
                    trim_coh(reads, trim=0, include_tests=include_tests)

                # Change coordinates into the fragmented reference (primer-trimmed)
                for read in reads:
                    read.pos -= frag_pos[0]
                    read.mpos -= frag_pos[0]

                # Here the tests
                if include_tests:
                    lfr = frags_pos['trim'][n_frag][1] - frags_pos['trim'][n_frag][0]
                    if test_sanity(reads, n_frag, lfr):
                        print 'Tests failed:', reads[0].qname
                        import ipdb; ipdb.set_trace()

                # There we go!
                n_mapped[n_frag] += 1
                file_handles[n_frag].write(reads[0])
                file_handles[n_frag].write(reads[1])

        finally:
            for f in file_handles:
                f.close()
            fo_am.close()
            fo_cm.close()
            fo_um.close()
            fo_lq.close()


    if VERBOSE:
        print 'Trim and divide results: adaID '+adaID
        print 'Total:\t\t', irp
        print 'Mapped:\t\t', sum(n_mapped), n_mapped
        print 'Unmapped/unpaired/tiny:\t', n_unmapped
        print 'Outer primer\t', n_outer
        print 'Crossfrag:\t', n_crossfrag
        print 'Ambiguous:\t', n_ambiguous
        print 'Low-quality:\t', n_lowq

    # Write summary to file
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Trim and divide results: adaID '+adaID+'\n')
            f.write('Total:\t\t'+str(irp + 1)+'\n')
            f.write('Mapped:\t\t'+str(sum(n_mapped))+' '+str(n_mapped)+'\n')
            f.write('Unmapped/unpaired/tiny insert:\t'+str(n_unmapped)+'\n')
            f.write('Outer primer\t'+str(n_outer)+'\n')
            f.write('Crossfrag:\t'+str(n_crossfrag)+'\n')
            f.write('Ambiguous:\t'+str(n_ambiguous)+'\n')
            f.write('Low-quality:\t'+str(n_lowq)+'\n')

Example #3

Show file

File: trim_and_divide.py Project: 5l1v3r1/hivwholeseq

def get_primer_positions(smat, fragments, type='both'):
    '''Get the primer positions for fwd, rev, or both primers'''
    from hivwholeseq.data.primers import primers_PCR
    from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas

    # j controls the direction: j = 0 --> FWD, j = 1 --> REV
    types = {'fwd': [0], 'rev': [1], 'both': [0, 1]}
    js = types[type]
    primer_poss = [[], []]
    for j in js:
        pr_old_pos = 0
        pr_old = ''
        for ifr, fragment in enumerate(fragments):
            # Sometimes, the PCR is performed with different primer sets in parallel
            # to get more DNA, so we must take the innermost to be conservative (we
            # lose some reads, but the ones we keep are good).
            # NOTE: F6 is not ambiguous, so we do not treat it specially here (see below)
            if '+' in fragment:
                if fragment[-1] == 'o':
                    pcs = pcos
                elif fragment[-1] == 'i':
                    pcs = pcis
                else:
                    raise ValueError('Neither PCR1 nor PCR2??')

                fragment_subs = [fragment[:2]+fsub for fsub in fragment[2:-1].split('+')]
                pco_inn = [pcs[fsub][j][not j] for fsub in fragment_subs]
                if not j:
                    fragment_inn = fragment_subs[np.argmax(pco_inn)]+fragment[-1]
                    fragment_out = fragment_subs[np.argmin(pco_inn)]+fragment[-1]
                else:
                    fragment_inn = fragment_subs[np.argmin(pco_inn)]+fragment[-1]
                    fragment_out = fragment_subs[np.argmax(pco_inn)]+fragment[-1]

                if VERBOSE >= 3:
                    print j, fragment_subs, pco_inn, fragment_inn, fragment_out

                pr_pos_pair = {}
                # Get the left first
                if not j:
                    fragment = fragment_out
                    label = 'outer'
                else:
                    fragment = fragment_inn
                    label = 'inner'

                # Expand ambiguous primers in a list of all possible unambiguous ones,
                # and look for the best approximate match between all primers and a
                # sliding window in the HIV genome
                pr = primers_PCR[fragment][j]
                pr_mat = np.array(map(list, eas(pr)), 'S1')
                n_matches = [(smat[i: i + len(pr)] == pr_mat).sum(axis=1).max()
                             for i in xrange(pr_old_pos + len(pr_old),
                                             len(smat) - len(pr) + 1)]

                pr_pos = pr_old_pos + len(pr_old) + np.argmax(n_matches)
                pr_pos_pair[label] = [pr_pos, pr_pos + len(pr)]

                # Get the right one second
                if not j:
                    fragment = fragment_inn
                    label = 'inner'
                else:
                    fragment = fragment_out
                    label = 'outer'

                pr = primers_PCR[fragment][j]
                pr_mat = np.array(map(list, eas(pr)), 'S1')
                n_matches = [(smat[i: i + len(pr)] == pr_mat).sum(axis=1).max()
                             for i in xrange(pr_pos,
                                             len(smat) - len(pr) + 1)]

                pr_pos = pr_pos + np.argmax(n_matches)
                pr_pos_pair[label] = [pr_pos, pr_pos + len(pr)]

            else:

                # Expand ambiguous primers in a list of all possible unambiguous ones,
                # and look for the best approximate match between all primers and a
                # sliding window in the HIV genome
                pr = primers_PCR[fragment][j]
                pr_mat = np.array(map(list, eas(pr)), 'S1')
                n_matches = [(smat[i: i + len(pr)] == pr_mat).sum(axis=1).max()
                             for i in xrange(pr_old_pos + len(pr_old),
                                             len(smat) - len(pr) + 1)]

                # NOTE: F6 rev lies in the LTR, so we risk reading the other LTR.
                # Treat it as a special case: come from the right!
                if j and ('F6' in fragment):
                    pr_pos = len(smat) - len(pr) - np.argmax(n_matches[::-1])
                else:
                    pr_pos = pr_old_pos + len(pr_old) + np.argmax(n_matches)

                pr_pos_pair = [pr_pos, pr_pos + len(pr)]

            primer_poss[j].append(pr_pos_pair)
            pr_old_pos = pr_pos
            pr_old = pr

    if type != 'both':
        return primer_poss[type == 'rev']
    else:
        return primer_poss

Example #4

Show file

File: premap_to_reference.py Project: iosonofabio/hivwholeseq

def make_reference(data_folder,
                   adaID,
                   fragments,
                   refname,
                   VERBOSE=0,
                   summary=True):
    '''Make reference sequence trimmed to the necessary parts'''
    from hivwholeseq.reference import load_custom_reference
    seq = load_custom_reference(refname)

    output_filename = get_reference_premap_filename(data_folder, adaID)

    if fragments is None:
        seq_trim = seq
    else:
        # Look for the first fwd and the last rev primers to trim the reference
        # NOTE: this works even if F1 or F6 are missing (e.g. only F2-5 are seq-ed)!
        # If more than one primer is used for the first or last fragment, take the
        # longest reference
        from hivwholeseq.data.primers import primers_PCR, primers_coordinates_HXB2
        if '+' in fragments[0]:
            fragment_subs = [
                fragments[0][:2] + fsub + fragments[0][-1]
                for fsub in fragments[0][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][0][0] for fsub in fragment_subs
            ]
            fragments[0] = fragment_subs[np.argmin(fr_pos_subs)]

        pr_fwd = primers_PCR[fragments[0]][0]

        if '+' in fragments[-1]:
            fragment_subs = [
                fragments[-1][:2] + fsub + fragments[-1][-1]
                for fsub in fragments[-1][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][1][1] for fsub in fragment_subs
            ]
            fragments[-1] = fragment_subs[np.argmax(fr_pos_subs)]

        pr_rev = primers_PCR[fragments[-1]][1]

        smat = np.array(seq)

        # Get all possible primers from ambiguous nucleotides and get the best match
        from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
        pr_fwd_mat = np.array(map(list, eas(pr_fwd)), 'S1')
        n_matches_fwd = [
            (smat[i:i + len(pr_fwd)] == pr_fwd_mat).sum(axis=1).max()
            for i in xrange(len(seq) - len(pr_fwd))
        ]
        pr_fwd_pos = np.argmax(n_matches_fwd)

        pr_rev_mat = np.array(map(list, eas(pr_rev)), 'S1')
        n_matches_rev = [
            (smat[i:i + len(pr_rev)] == pr_rev_mat).sum(axis=1).max()
            for i in xrange(pr_fwd_pos + len(pr_fwd),
                            len(seq) - len(pr_rev))
        ]
        # Here you come from the right, i.e. look in the 3' LTR first
        pr_rev_pos = len(seq) - len(pr_rev) - 1 - np.argmax(
            n_matches_rev[::-1])

        output = [['Reference name:', refname]]
        output.append(['FWD primer:', fragments[0], str(pr_fwd_pos), pr_fwd])
        output.append(['REV primer:', fragments[-1], str(pr_rev_pos), pr_rev])
        output = '\n'.join(map(' '.join, output))
        if VERBOSE:
            print output

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write(output)
                f.write('\n')

        # The reference includes both the first fwd primer and the last rev one
        seq_trim = seq[pr_fwd_pos:pr_rev_pos + len(pr_rev)]
        seq_trim.id = '_'.join(
            [seq_trim.id,
             str(pr_fwd_pos + 1),
             str(pr_rev_pos + len(pr_rev))])
        seq_trim.name = '_'.join([
            seq_trim.name,
            str(pr_fwd_pos + 1),
            str(pr_rev_pos + len(pr_rev))
        ])
        seq_trim.description = ' '.join([
            seq_trim.description, 'from',
            str(pr_fwd_pos + 1), 'to',
            str(pr_rev_pos + len(pr_rev)),
            '(indices from 1, extremes included)'
        ])

    SeqIO.write(seq_trim, output_filename, 'fasta')

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Reference sequence written to: ' + output_filename)
            f.write('\n')