def check_division(data_folder, adaID, fragment, seq_run, qual_min=35,
                   reference='HXB2', maxreads=-1, VERBOSE=0, minor_allele=False):
    '''Plot the coverage of the reads that were assigned to one fragment.

    The premap reference of the fragment and the divided BAM file are located
    via the filename helpers, allele counts (and insertions) are collected
    along the reference, and the counts are handed to plot_coverage together
    with a title built from run, adaID, fragment and maxreads.

    qual_min and reference are accepted but not used in this function.
    '''
    # FIXME: old nomenclature for F3a
    ref_fn = get_reference_premap_filename(data_folder, adaID, fragment)
    if (not os.path.isfile(ref_fn)) and (fragment[:2] == 'F3'):
        ref_fn = ref_fn.replace('F3a', 'F3')
    refseq = SeqIO.read(ref_fn, 'fasta')

    # FIXME: old nomenclature for F3a
    bam_fn = get_divided_filename(data_folder, adaID, fragment, type='bam')
    if (not os.path.isfile(bam_fn)) and (fragment[:2] == 'F3'):
        bam_fn = bam_fn.replace('F3a', 'F3')

    # Scan reads (the insertions are collected but not plotted)
    counts, inserts = get_allele_counts_insertions_from_file(
        bam_fn, len(refseq), maxreads=maxreads, VERBOSE=VERBOSE)

    # Plot results, with a title of the form 'run X, adaID Y, ...'
    labels = (('run', seq_run),
              ('adaID', adaID),
              ('fragment', fragment),
              ('maxreads', maxreads))
    title = ', '.join([name + ' ' + str(value) for (name, value) in labels])
    plot_coverage(counts, suptitle=title, minor_allele=minor_allele)
def make_index_and_hash(data_folder, adaID, VERBOSE=0, summary=True):
    '''Make index and hash files for reference or consensus'''
    if VERBOSE:
        print 'Making index and hash files: adaID', adaID

    # 1. Make genome index file for reference
    if os.path.isfile(get_reference_premap_index_filename(data_folder, adaID, ext=True)):
        os.remove(get_reference_premap_index_filename(data_folder, adaID, ext=True))
    stdout = sp.check_output([stampy_bin,
                              '--species="HIV"',
                              '--overwrite',
                              '-G', get_reference_premap_index_filename(data_folder, adaID, ext=False),
                              get_reference_premap_filename(data_folder, adaID),
                              ],
                              stderr=sp.STDOUT)
    if VERBOSE:
        print 'Built index: '+adaID
    
    # 2. Build a hash file for reference
    if os.path.isfile(get_reference_premap_hash_filename(data_folder, adaID, ext=True)):
        os.remove(get_reference_premap_hash_filename(data_folder, adaID, ext=True))
    stdout = sp.check_output([stampy_bin,
                              '--overwrite',
                              '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False),
                              '-H', get_reference_premap_hash_filename(data_folder, adaID, ext=False),
                              ],
                              stderr=sp.STDOUT)
    if VERBOSE:
        print 'Built hash: '+adaID

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Stampy index and hash written.')
            f.write('\n')
def get_reference_filename(data_folder, adaID, fragment, n_iter, ext=True):
    '''Return the reference filename used at mapping iteration n_iter.

    The first iteration maps against the premap reference of the fragment;
    later iterations map against 'consensus_<n_iter-1>_<fragment>' inside the
    'map_iter/' subfolder of the adapter folder. With ext=False the filename
    is returned without its extension.
    '''
    if n_iter == 1:
        premap_fn = get_reference_premap_filename(data_folder, adaID, fragment)
        # Cut the last 6 characters (the '.fasta' extension, presumably)
        return premap_fn if ext else premap_fn[:-6]

    basename = '_'.join(('consensus', str(n_iter-1), fragment))
    path = data_folder+foldername_adapter(adaID)+'map_iter/'+basename
    if ext:
        path += '.fasta'
    return path
Example #4
0
def get_reference_filename(data_folder, adaID, fragment, n_iter, ext=True):
    '''Get the filename of the reference for an intermediate mapping round.

    For n_iter == 1 this is the premap reference of the fragment (minus its
    last 6 characters if ext is false). For any other n_iter it is the
    consensus of the previous round, stored under 'map_iter/' in the adapter
    folder, with '.fasta' appended only if ext is true.
    '''
    first_round = (n_iter == 1)

    if first_round:
        fn = get_reference_premap_filename(data_folder, adaID, fragment)
        suffix_len = 0 if ext else 6
        if suffix_len:
            fn = fn[:-suffix_len]
        return fn

    folder = data_folder + foldername_adapter(adaID) + 'map_iter/'
    fn = folder + '_'.join(['consensus', str(n_iter - 1), fragment])
    return (fn + '.fasta') if ext else fn
def report_coverage(data_folder, adaID, VERBOSE=0, summary=True):
    '''Plot a rough per-position coverage of the premapped reads.

    Every usable read adds 1 to the reference positions spanned by its CIGAR
    blocks of type 0 (match) and 2 (deletion); all other CIGAR types are
    skipped without moving along the reference, so insertions are ignored.
    The figure is saved to the coverage figure file and, if summary is true,
    the read tallies and the figure path are appended to the premap summary.

    VERBOSE is accepted but not used in this function.
    '''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID),
                        'fasta')
    n_pos = len(refseq)
    coverage = np.zeros(n_pos, int)

    # Parse the BAM file, tallying usable and discarded reads
    mapped = 0
    unmapped = 0
    bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for read in bamfile:
            usable = ((not read.is_unmapped) and
                      read.is_proper_pair and
                      len(read.cigar))
            if not usable:
                unmapped += 1
                continue

            # Walk along the CIGAR; deletions are treated as 'covered'
            block_start = read.pos
            for (block_type, block_len) in read.cigar:
                if block_type in (0, 2):
                    block_end = block_start + block_len
                    coverage[block_start:block_end] += 1
                    block_start = block_end
            mapped += 1

    # Save results
    from hivwholeseq.sequencing.filenames import get_coverage_figure_filename
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(1, 1, figsize=(13, 6))
    # +1 keeps zero-coverage positions visible on the log scale
    ax.plot(np.arange(n_pos), coverage + 1, lw=2, c='b')
    ax.set_xlabel('Position')
    ax.set_ylabel('Coverage')
    ax.set_yscale('log')
    ax.set_title('adaID ' + adaID + ', premapped', fontsize=18)
    ax.set_xlim(-20, n_pos + 20)
    plt.tight_layout()

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder
    mkdirs(get_figure_folder(data_folder, adaID))
    fig_filename = get_coverage_figure_filename(data_folder, adaID, 'premapped')
    plt.savefig(fig_filename)
    plt.close(fig)

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\nPremapping results: ' + str(mapped) +
                    ' read pairs mapped, ' + str(unmapped) + ' unmapped\n')
            f.write('\nCoverage plotted: ' + fig_filename + '\n')
def report_coverage(data_folder, adaID, VERBOSE=0, summary=True):
    '''Report the rough coverage of the premapped BAM file on the reference.

    Reads that are unmapped, not in a proper pair or have an empty CIGAR are
    tallied as unmapped. For the others, CIGAR blocks of type 0 and 2 (match
    and deletion) increase the coverage of the positions they span; inserts
    and other block types are ignored. A log-scale coverage plot is saved
    and, if summary is true, two lines are appended to the premap summary.

    VERBOSE is accepted but not used in this function.
    '''
    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')
    ref_len = len(refseq)

    # Prepare data structures
    coverage = np.zeros(ref_len, int)
    tally = {'mapped': 0, 'unmapped': 0}

    # Parse the BAM file
    bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for read in bamfile:
            if read.is_unmapped or (not read.is_proper_pair) or \
               (not len(read.cigar)):
                tally['unmapped'] += 1
                continue

            # Proceed along CIGARs, treating deletions as 'covered'
            pos = read.pos
            covering_blocks = [bl for (bt, bl) in read.cigar if bt in (0, 2)]
            for bl in covering_blocks:
                coverage[pos:pos + bl] += 1
                pos += bl
            tally['mapped'] += 1

    # Save results
    from hivwholeseq.sequencing.filenames import get_coverage_figure_filename
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(1, 1, figsize=(13, 6))
    positions = np.arange(ref_len)
    ax.plot(positions, coverage + 1, lw=2, c='b')
    ax.set_xlabel('Position')
    ax.set_ylabel('Coverage')
    ax.set_yscale('log')
    ax.set_title('adaID ' + adaID + ', premapped', fontsize=18)
    ax.set_xlim(-20, ref_len + 20)
    plt.tight_layout()

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder
    mkdirs(get_figure_folder(data_folder, adaID))
    plt.savefig(get_coverage_figure_filename(data_folder, adaID, 'premapped'))
    plt.close(fig)

    if not summary:
        return

    with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
        f.write('\nPremapping results: ' + str(tally['mapped']) +
                ' read pairs mapped, ' + str(tally['unmapped']) +
                ' unmapped\n')
        f.write('\nCoverage plotted: ' +
                get_coverage_figure_filename(data_folder, adaID, 'premapped') +
                '\n')
Example #7
0
def store_reference_fragmented(data_folder, adaID, refseq, fragment_trim_poss_dict):
    '''Write one FASTA file per fragment, each a slice of the reference.

    fragment_trim_poss_dict maps a fragment name to its (start, end) pair in
    the reference. If the start is not a scalar, both entries are indexed
    with the key 'inner' to get the coordinates. The fragment name is
    appended to id, name and description of each slice before writing.
    '''
    for fragment, poss in fragment_trim_poss_dict.iteritems():
        start, end = poss[0], poss[1]
        if not np.isscalar(start):
            start, end = start['inner'], end['inner']

        record = refseq[start: end]
        tag = '_'+fragment
        record.id = record.id+tag
        record.name = record.name+tag
        record.description = record.description+', fragment '+fragment

        out_fn = get_reference_premap_filename(data_folder, adaID, fragment)
        SeqIO.write(record, out_fn, 'fasta')
Example #8
0
def score_consensus(sample, VERBOSE=0, fragment=None):
    '''Score a consensus based on completeness and quality.

    Parameters:
       sample: object providing sequencing_run.folder, adapter and
               regions_complete (an iterable of fragment specifications).
       VERBOSE: accepted for interface compatibility, not used here.
       fragment: name of the fragment to score. The original code read a
                 name 'fragment' that was never passed in; for backward
                 compatibility, if this is None the module-level global
                 'fragment' is used instead.

    Returns:
       (True, '') if no complete region of the sample contains the fragment,
       (False, 'MISS') if the consensus file is missing,
       (False, 'MISSREF') if the premap reference file is missing,
       (False, 'SHORT') / (False, 'LONG') if the consensus length differs
       from the reference length by more than 200,
       (True, 'OK') otherwise.

    Raises:
       NameError: if fragment is None and no global 'fragment' exists.
    '''
    if fragment is None:
        try:
            fragment = globals()['fragment']
        except KeyError:
            raise NameError("score_consensus: no fragment was given and "
                            "global name 'fragment' is not defined")

    data_folder = sample.sequencing_run.folder
    adaID = sample.adapter

    # A list (not filter) so that emptiness test and indexing are well defined
    frag_specs = [spec for spec in sample.regions_complete if fragment in spec]
    if not frag_specs:
        return (True, '')

    fn = get_consensus_filename(data_folder, adaID, fragment)
    if not os.path.isfile(fn):
        return (False, 'MISS')

    frag_spec = frag_specs[0]
    fn_ref = get_reference_premap_filename(data_folder, adaID, frag_spec)
    # Old nomenclature: the F3a reference may be stored without the 'a'
    if (not os.path.isfile(fn_ref)) and (frag_spec[:3] == 'F3a'):
        frag_spec = frag_spec.replace('a', '')
        fn_ref = get_reference_premap_filename(data_folder, adaID, frag_spec)
    if not os.path.isfile(fn_ref):
        return (False, 'MISSREF')

    ref = SeqIO.read(fn_ref, 'fasta')
    cons = SeqIO.read(fn, 'fasta')
    if len(cons) < len(ref) - 200:
        return (False, 'SHORT')
    elif len(cons) > len(ref) + 200:
        return (False, 'LONG')

    # TODO: a score based on a global alignment of ref and cons was sketched
    # here (align_global with band=200) but never completed.
    return (True, 'OK')
def score_consensus(sample, VERBOSE=0, fragment=None):
    '''Score a consensus based on completeness and quality.

    Parameters:
       sample: object providing sequencing_run.folder, adapter and
               regions_complete (an iterable of fragment specifications).
       VERBOSE: accepted for interface compatibility, not used here.
       fragment: name of the fragment to score. The body used to rely on a
                 name 'fragment' that is not a parameter; to stay backward
                 compatible, None means "use the module-level global
                 'fragment'".

    Returns:
       A (passed, label) tuple: (True, '') if the fragment is not part of
       any complete region, (False, 'MISS') without consensus file,
       (False, 'MISSREF') without premap reference file, (False, 'SHORT') or
       (False, 'LONG') if the consensus is more than 200 shorter/longer than
       the reference, else (True, 'OK').

    Raises:
       NameError: if fragment is None and no global 'fragment' exists.
    '''
    if fragment is None:
        module_globals = globals()
        if 'fragment' not in module_globals:
            raise NameError("score_consensus: no fragment was given and "
                            "global name 'fragment' is not defined")
        fragment = module_globals['fragment']

    data_folder = sample.sequencing_run.folder
    adaID = sample.adapter

    # Build a real list: it is tested for emptiness and indexed below
    matching = [region for region in sample.regions_complete
                if fragment in region]
    if not matching:
        return (True, '')

    fn = get_consensus_filename(data_folder, adaID, fragment)
    if not os.path.isfile(fn):
        return (False, 'MISS')

    frag_spec = matching[0]
    fn_ref = get_reference_premap_filename(data_folder, adaID, frag_spec)
    if not os.path.isfile(fn_ref):
        # Old nomenclature: retry F3a without the 'a' before giving up
        if frag_spec[:3] != 'F3a':
            return (False, 'MISSREF')
        frag_spec = frag_spec.replace('a', '')
        fn_ref = get_reference_premap_filename(data_folder, adaID, frag_spec)
        if not os.path.isfile(fn_ref):
            return (False, 'MISSREF')

    ref = SeqIO.read(fn_ref, 'fasta')
    cons = SeqIO.read(fn, 'fasta')
    if len(cons) < len(ref) - 200:
        return (False, 'SHORT')
    if len(cons) > len(ref) + 200:
        return (False, 'LONG')

    # TODO: a score based on a global alignment of ref and cons was sketched
    # here (align_global with band=200) but never completed.
    return (True, 'OK')
def make_index_and_hash(data_folder, adaID, VERBOSE=0, summary=True):
    '''Make index and hash files for reference or consensus'''
    if VERBOSE:
        print 'Making index and hash files: adaID', adaID

    # 1. Make genome index file for reference
    if os.path.isfile(
            get_reference_premap_index_filename(data_folder, adaID, ext=True)):
        os.remove(
            get_reference_premap_index_filename(data_folder, adaID, ext=True))
    stdout = sp.check_output([
        stampy_bin,
        '--species="HIV"',
        '--overwrite',
        '-G',
        get_reference_premap_index_filename(data_folder, adaID, ext=False),
        get_reference_premap_filename(data_folder, adaID),
    ],
                             stderr=sp.STDOUT)
    if VERBOSE:
        print 'Built index: ' + adaID

    # 2. Build a hash file for reference
    if os.path.isfile(
            get_reference_premap_hash_filename(data_folder, adaID, ext=True)):
        os.remove(
            get_reference_premap_hash_filename(data_folder, adaID, ext=True))
    stdout = sp.check_output([
        stampy_bin,
        '--overwrite',
        '-g',
        get_reference_premap_index_filename(data_folder, adaID, ext=False),
        '-H',
        get_reference_premap_hash_filename(data_folder, adaID, ext=False),
    ],
                             stderr=sp.STDOUT)
    if VERBOSE:
        print 'Built hash: ' + adaID

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Stampy index and hash written.')
            f.write('\n')
Example #11
0
def check_division(data_folder,
                   adaID,
                   fragment,
                   seq_run,
                   qual_min=35,
                   reference='HXB2',
                   maxreads=-1,
                   VERBOSE=0,
                   minor_allele=False):
    '''Check the division into fragments by plotting the coverage.

    Reads the premap reference of the fragment, collects allele counts and
    insertions from the divided BAM file and passes the counts to
    plot_coverage, with a title listing run, adaID, fragment and maxreads.

    qual_min and reference are accepted but not used in this function.
    '''
    def _old_F3_name_if_missing(path):
        # FIXME: old nomenclature for F3a
        if not os.path.isfile(path):
            if fragment[:2] == 'F3':
                return path.replace('F3a', 'F3')
        return path

    ref_fn = _old_F3_name_if_missing(
        get_reference_premap_filename(data_folder, adaID, fragment))
    refseq = SeqIO.read(ref_fn, 'fasta')

    # Scan reads
    input_filename = _old_F3_name_if_missing(
        get_divided_filename(data_folder, adaID, fragment, type='bam'))
    counts, inserts = get_allele_counts_insertions_from_file(
        input_filename,
        len(refseq),
        maxreads=maxreads,
        VERBOSE=VERBOSE)

    # Plot results
    title_parts = []
    for key, value in [('run', seq_run),
                       ('adaID', adaID),
                       ('fragment', fragment),
                       ('maxreads', maxreads)]:
        title_parts.append(' '.join([key, str(value)]))
    plot_coverage(counts,
                  suptitle=', '.join(title_parts),
                  minor_allele=minor_allele)
Example #12
0
def trim_and_divide_reads(data_folder, adaID, n_cycles, fragments,
                          maxreads=-1, VERBOSE=0,
                          minisize=100,
                          include_tests=False, summary=True):
    '''Trim reads and divide them into fragments'''
    if VERBOSE:
        print 'Trim and divide into fragments: adaID '+adaID+', fragments: '+\
                ' '.join(fragments)

    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Fragments used: '+' '.join(fragments)+'\n')

    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')
    smat = np.array(refseq, 'S1')
    len_reference = len(refseq)

    # Get the positions of fragment start/end, w/ and w/o primers
    frags_pos = get_fragment_positions(smat, fragments)
    store_reference_fragmented(data_folder, adaID, refseq,
                               dict(zip(fragments, frags_pos['trim'])))
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Primer positions (for fragments):\n')
            for (fragment, poss_full, poss_trim) in izip(fragments,
                                                         frags_pos['full'],
                                                         frags_pos['trim']):
                f.write(fragment+': fwd '+str(poss_full[0])+' '+str(poss_trim[0])+\
                                 ', rev '+str(poss_trim[1])+' '+str(poss_full[1])+'\n')
    write_fragment_positions(data_folder, adaID, fragments, frags_pos)

    # Get the positions of the unwanted outer primers (in case we DO nested PCR
    # for that fragment)
    # NOTE: the LTRs make no problem, because the rev outer primer of F6
    # is not in the reference anymore if F6 has undergone nested PCR
    # FIXME: this might not work if we have mixed fragments (e.g. F5a+b) AND nesting
    from re import findall
    primers_out = {'fwd': [], 'rev': []}
    for i, fr in enumerate(fragments):
        if (i != 0) and findall(r'F[2-6][a-z]?i', fr):
            primers_out['fwd'].append(fr[:-1]+'o')
        if (i != len(fragments) - 1) and findall(r'F[1-5][a-z]?i', fr):
            primers_out['rev'].append(fr[:-1]+'o')

    # Get all possible unambiguous primers for the unwanted outer primers
    from hivwholeseq.data.primers import primers_PCR
    from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
    primers_out_seq = {'fwd': [np.array(map(list, eas(primers_PCR[fr][0])),
                                        'S1', ndmin=2)
                               for fr in primers_out['fwd']],
                       'rev': [np.array(map(list, eas(primers_PCR[fr][1])),
                                        'S1', ndmin=2)
                               for fr in primers_out['rev']],
                      }
    primers_out_pos = {'fwd': [], 'rev': []}
    if primers_out['fwd']:
        primers_out_pos['fwd'] = map(itemgetter(0),
                                     get_primer_positions(smat,
                                                          primers_out['fwd'], 'fwd'))
    if primers_out['rev']:
        primers_out_pos['rev'] = map(itemgetter(1),
                                     get_primer_positions(smat,
                                                          primers_out['rev'], 'rev'))

    # Input and output files
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        convert_sam_to_bam(input_filename)
    output_filenames = get_divided_filenames(data_folder, adaID, fragments, type='bam')
    with pysam.Samfile(input_filename, 'rb') as bamfile:

        try:
            file_handles = [pysam.Samfile(ofn, 'wb', template=bamfile)
                            for ofn in output_filenames[:len(fragments)]]
    
            fo_am = pysam.Samfile(output_filenames[-4], 'wb', template=bamfile)
            fo_cm = pysam.Samfile(output_filenames[-3], 'wb', template=bamfile)
            fo_um = pysam.Samfile(output_filenames[-2], 'wb', template=bamfile)
            fo_lq = pysam.Samfile(output_filenames[-1], 'wb', template=bamfile)

            # Iterate over the mapped reads and assign fragments
            n_mapped = [0 for fragment in fragments]
            n_unmapped = 0
            n_crossfrag = 0
            n_ambiguous = 0
            n_outer = 0
            n_lowq = 0
            for irp, reads in enumerate(pair_generator(bamfile)):

                if irp == maxreads:
                    if VERBOSE:
                        print 'Maximal number of read pairs reached:', maxreads
                    break

                if VERBOSE >= 2:
                    if not ((irp+1) % 10000):
                        print irp+1

                i_fwd = reads[0].is_reverse

                # If unmapped or unpaired, mini, or insert size mini, or
                # divergent read pair (fully cross-overlapping), discard
                if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
                   reads[1].is_unmapped or (not reads[1].is_proper_pair) or \
                   (reads[0].rlen < 50) or (reads[1].rlen < 50) or \
                   (reads[i_fwd].isize < minisize):
                    if VERBOSE >= 3:
                        print 'Read pair unmapped/unpaired/tiny/divergent:', reads[0].qname
                    n_unmapped += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # If the insert is a misamplification from the outer primers
                # in fragments that underwent nested PCR,
                # trash it (it will have skewed amplification anyway). We cannot
                # find all of those, rather only the ones still carrying the
                # primer itself (some others have lost it while shearing). For
                # those, no matter what happens at the end (reading into adapters,
                # etc.), ONE of the reads in the pair will start exactly with one
                # outer primer: if the rev read with a rev primer, if the fwd
                # with a fwd one. Test all six.
                if (len(primers_out_pos['fwd']) or len(primers_out_pos['rev'])) and \
                   test_outer_primer(reads,
                                     primers_out_pos, primers_out_seq,
                                     len_reference):
                    if VERBOSE >= 3:
                        print 'Read pair from outer primer:', reads[0].qname
                    n_outer += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # FIXME: the following becomes a bit harder when we mix parallel
                # PCRs, e.g. F5a+b, to get more product

                # Assign to a fragment now, so that primer trimming is faster 
                pair_identity = assign_to_fragment(reads, frags_pos['full'],
                                                   VERBOSE=VERBOSE)

                # 1. If no fragments are possible (e.g. one read crosses the
                # fragment boundary, they map to different fragments), dump it
                # into a special bucket
                if pair_identity == 'cross':
                    n_crossfrag += 1
                    fo_cm.write(reads[0])
                    fo_cm.write(reads[1])
                    continue

                # 2. If 2+ fragments are possible (tie), put into a special bucket
                # (essentially excluded, because we want two independent measurements
                # in the overlapping region, but we might want to recover them)
                elif pair_identity == 'ambiguous':
                    n_ambiguous += 1
                    fo_am.write(reads[0])
                    fo_am.write(reads[1])
                    continue

                # 3. If the intersection is a single fragment, good: trim the primers
                # NB: n_frag is the index IN THE POOL. If we sequence only F2-F5, F2 is n_frag = 0
                n_frag = int(pair_identity)
                frag_pos = frags_pos['trim'][n_frag]
                if not np.isscalar(frag_pos[0]):
                    frag_pos = [frag_pos[0]['inner'], frag_pos[1]['inner']]
                trashed_primers = trim_primers(reads, frag_pos,
                                               include_tests=include_tests)
                if trashed_primers or (reads[i_fwd].isize < 100):
                    n_unmapped += 1
                    if VERBOSE >= 3:
                        print 'Read pair is mismapped:', reads[0].qname
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # Quality trimming: if no decently long pair survives, trash
                #trashed_quality = main_block_low_quality(reads, phred_min=20,
                #                                         include_tests=include_tests)
                trashed_quality = trim_low_quality(reads, phred_min=20,
                                                   include_tests=include_tests)
                if trashed_quality or (reads[i_fwd].isize < 100):
                    n_lowq += 1
                    if VERBOSE >= 3:
                        print 'Read pair has low phred quality:', reads[0].qname
                    fo_lq.write(reads[0])
                    fo_lq.write(reads[1])
                    continue

                # Check for cross-overhangs or COH (reading into the adapters)
                #        --------------->
                #    <-----------
                # In that case, trim to perfect overlap.
                if test_coh(reads, VERBOSE=False):
                    trim_coh(reads, trim=0, include_tests=include_tests)

                # Change coordinates into the fragmented reference (primer-trimmed)
                for read in reads:
                    read.pos -= frag_pos[0]
                    read.mpos -= frag_pos[0]

                # Here the tests
                if include_tests:
                    lfr = frags_pos['trim'][n_frag][1] - frags_pos['trim'][n_frag][0]
                    if test_sanity(reads, n_frag, lfr):
                        print 'Tests failed:', reads[0].qname
                        import ipdb; ipdb.set_trace()

                # There we go!
                n_mapped[n_frag] += 1
                file_handles[n_frag].write(reads[0])
                file_handles[n_frag].write(reads[1])

        finally:
            for f in file_handles:
                f.close()
            fo_am.close()
            fo_cm.close()
            fo_um.close()
            fo_lq.close()


    if VERBOSE:
        print 'Trim and divide results: adaID '+adaID
        print 'Total:\t\t', irp
        print 'Mapped:\t\t', sum(n_mapped), n_mapped
        print 'Unmapped/unpaired/tiny:\t', n_unmapped
        print 'Outer primer\t', n_outer
        print 'Crossfrag:\t', n_crossfrag
        print 'Ambiguous:\t', n_ambiguous
        print 'Low-quality:\t', n_lowq

    # Write summary to file
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Trim and divide results: adaID '+adaID+'\n')
            f.write('Total:\t\t'+str(irp + 1)+'\n')
            f.write('Mapped:\t\t'+str(sum(n_mapped))+' '+str(n_mapped)+'\n')
            f.write('Unmapped/unpaired/tiny insert:\t'+str(n_unmapped)+'\n')
            f.write('Outer primer\t'+str(n_outer)+'\n')
            f.write('Crossfrag:\t'+str(n_crossfrag)+'\n')
            f.write('Ambiguous:\t'+str(n_ambiguous)+'\n')
            f.write('Low-quality:\t'+str(n_lowq)+'\n')
Example #13
0
                    f.write('Call: python build_consensus.py'+\
                            ' --run '+seq_run+\
                            ' --adaIDs '+adaID+\
                            ' --fragments '+fragment+\
                            ' --block-length '+str(block_len_initial)+\
                            ' --reads-per-alignment '+str(n_reads_per_ali)+\
                            ' --verbose '+str(VERBOSE))
                    if store_allele_counts:
                        f.write(' --allele-counts')
                    f.write('\n')

            if VERBOSE:
                print seq_run, adaID, fragment
            if fragment == 'genomewide':
                refseq = SeqIO.read(
                    get_reference_premap_filename(data_folder, adaID), 'fasta')
                bamfilename = get_premapped_filename(data_folder,
                                                     adaID,
                                                     type='bam')
                frag_out = fragment
            else:
                fn = get_reference_premap_filename(data_folder, adaID,
                                                   fragment)
                bamfilename = get_divided_filename(data_folder,
                                                   adaID,
                                                   fragment,
                                                   type='bam')

                #FIXME: old nomenclature for F3a is F3
                if not os.path.isfile(fn) and fragment[:3] == 'F3a':
                    fn = get_reference_premap_filename(data_folder, adaID,
def make_reference(data_folder,
                   adaID,
                   fragments,
                   refname,
                   VERBOSE=0,
                   summary=True):
    '''Make reference sequence trimmed to the necessary parts'''
    from hivwholeseq.reference import load_custom_reference
    seq = load_custom_reference(refname)

    output_filename = get_reference_premap_filename(data_folder, adaID)

    if fragments is None:
        seq_trim = seq
    else:
        # Look for the first fwd and the last rev primers to trim the reference
        # NOTE: this works even if F1 or F6 are missing (e.g. only F2-5 are seq-ed)!
        # If more than one primer is used for the first or last fragment, take the
        # longest reference
        from hivwholeseq.data.primers import primers_PCR, primers_coordinates_HXB2
        if '+' in fragments[0]:
            fragment_subs = [
                fragments[0][:2] + fsub + fragments[0][-1]
                for fsub in fragments[0][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][0][0] for fsub in fragment_subs
            ]
            fragments[0] = fragment_subs[np.argmin(fr_pos_subs)]

        pr_fwd = primers_PCR[fragments[0]][0]

        if '+' in fragments[-1]:
            fragment_subs = [
                fragments[-1][:2] + fsub + fragments[-1][-1]
                for fsub in fragments[-1][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][1][1] for fsub in fragment_subs
            ]
            fragments[-1] = fragment_subs[np.argmax(fr_pos_subs)]

        pr_rev = primers_PCR[fragments[-1]][1]

        smat = np.array(seq)

        # Get all possible primers from ambiguous nucleotides and get the best match
        from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
        pr_fwd_mat = np.array(map(list, eas(pr_fwd)), 'S1')
        n_matches_fwd = [
            (smat[i:i + len(pr_fwd)] == pr_fwd_mat).sum(axis=1).max()
            for i in xrange(len(seq) - len(pr_fwd))
        ]
        pr_fwd_pos = np.argmax(n_matches_fwd)

        pr_rev_mat = np.array(map(list, eas(pr_rev)), 'S1')
        n_matches_rev = [
            (smat[i:i + len(pr_rev)] == pr_rev_mat).sum(axis=1).max()
            for i in xrange(pr_fwd_pos + len(pr_fwd),
                            len(seq) - len(pr_rev))
        ]
        # Here you come from the right, i.e. look in the 3' LTR first
        pr_rev_pos = len(seq) - len(pr_rev) - 1 - np.argmax(
            n_matches_rev[::-1])

        output = [['Reference name:', refname]]
        output.append(['FWD primer:', fragments[0], str(pr_fwd_pos), pr_fwd])
        output.append(['REV primer:', fragments[-1], str(pr_rev_pos), pr_rev])
        output = '\n'.join(map(' '.join, output))
        if VERBOSE:
            print output

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write(output)
                f.write('\n')

        # The reference includes both the first fwd primer and the last rev one
        seq_trim = seq[pr_fwd_pos:pr_rev_pos + len(pr_rev)]
        seq_trim.id = '_'.join(
            [seq_trim.id,
             str(pr_fwd_pos + 1),
             str(pr_rev_pos + len(pr_rev))])
        seq_trim.name = '_'.join([
            seq_trim.name,
            str(pr_fwd_pos + 1),
            str(pr_rev_pos + len(pr_rev))
        ])
        seq_trim.description = ' '.join([
            seq_trim.description, 'from',
            str(pr_fwd_pos + 1), 'to',
            str(pr_rev_pos + len(pr_rev)),
            '(indices from 1, extremes included)'
        ])

    SeqIO.write(seq_trim, output_filename, 'fasta')

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Reference sequence written to: ' + output_filename)
            f.write('\n')
def check_premap(data_folder, adaID, fragments, seq_run, samplename,
                 qual_min=30, match_len_min=10,
                 maxreads=-1, VERBOSE=0,
                 savefig=True,
                 title=None):
    '''Check premap to reference: coverage, etc.'''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID), 'fasta')

    # FIXME: do this possibly better than parsing the description!
    try:
        fields = refseq.description.split()
        refseq_start = int(fields[fields.index('(indices') - 3])
    except ValueError:
        refseq_start = 550

    fragpos_filename = get_fragment_positions_filename(data_folder, adaID)
    if os.path.isfile(fragpos_filename):
        # Load the fragment positions, considering mixed fragments (e.g. F5a+b)
        fragtmp = []
        postmp = []
        with open(fragpos_filename, 'r') as f:
            f.readline() #HEADER
            for line in f:
                fields = line[:-1].split('\t')
                fragtmp.append(fields[0])
                if 'inner' not in fields[1]:
                    postmp.append([fields[1], fields[4]])
                else:
                    start = int(fields[1].split(',')[1].split(': ')[1].rstrip('}'))
                    end = int(fields[4].split(',')[1].split(': ')[1].rstrip('}'))
                    postmp.append([start, end])

        postmp = np.array(postmp, int)
        # NOTE: In a lot of old files, it says F3o instead of F3ao
        if 'F3o' in fragtmp:
            fragtmp[fragtmp.index('F3o')] = 'F3ao'
        elif 'F3i' in fragtmp:
            fragtmp[fragtmp.index('F3i')] = 'F3ai'


        frags_pos = np.array([postmp[fragtmp.index(fr)] for fr in fragments], int).T

    else:
        frags_pos = None
    
    frags_pos_out = None

    # Open BAM and scan reads
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        if VERBOSE:
            print 'Premapped BAM file not found'
        return (None, None)

    # Count reads if requested
    n_reads = get_number_reads(input_filename)
    if VERBOSE:
        print 'N. of reads:', n_reads

    # Get counts
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(input_filename,
                                                             len(refseq),
                                                             qual_min=qual_min,
                                                             match_len_min=match_len_min,
                                                             maxreads=maxreads,
                                                             VERBOSE=VERBOSE)

    # Plot results
    if title is None:
        title=', '.join(['run '+seq_run+' '+adaID,
                         'sample '+samplename,
                         'reads '+str(min(maxreads, n_reads))+'/'+str(n_reads),
                        ])
    plot_coverage(counts,
                  offset_x=refseq_start,
                  frags_pos=frags_pos,
                  frags_pos_out=frags_pos_out,
                  title=title)

    if savefig:
        from hivwholeseq.sequencing.adapter_info import foldername_adapter
        plt.savefig(data_folder+foldername_adapter(adaID)+'figures/coverage_premapped_'+samplename+'.png')

    return (counts, inserts)
                with open(sfn, 'w') as f:
                    f.write('Call: python build_consensus.py'+\
                            ' --run '+seq_run+\
                            ' --adaIDs '+adaID+\
                            ' --fragments '+fragment+\
                            ' --block-length '+str(block_len_initial)+\
                            ' --reads-per-alignment '+str(n_reads_per_ali)+\
                            ' --verbose '+str(VERBOSE))
                    if store_allele_counts:
                        f.write(' --allele-counts')
                    f.write('\n')

            if VERBOSE:
                print seq_run, adaID, fragment
            if fragment == 'genomewide':
                refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID), 'fasta')
                bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
                frag_out = fragment
            else:
                fn = get_reference_premap_filename(data_folder, adaID, fragment)
                bamfilename = get_divided_filename(data_folder, adaID, fragment, type='bam')

                #FIXME: old nomenclature for F3a is F3
                if not os.path.isfile(fn) and fragment[:3] == 'F3a':
                    fn = get_reference_premap_filename(data_folder, adaID, 'F3'+fragment[-1])
                if not os.path.isfile(bamfilename) and fragment[:3] == 'F3a':
                    bamfilename = get_divided_filename(data_folder, adaID, 'F3'+fragment[-1], type='bam')

                refseq = SeqIO.read(fn, 'fasta')
                frag_out = fragment[:2]
def make_reference(data_folder,
                   adaID,
                   fragments,
                   refname,
                   VERBOSE=0,
                   summary=True):
    '''Make reference sequence trimmed to the necessary parts'''
    from hivwholeseq.reference import load_custom_reference
    seq = load_custom_reference(refname)

    output_filename = get_reference_premap_filename(data_folder, adaID)

    if fragments is None:
        seq_trim = seq
    else:
        # Look for the first fwd and the last rev primers to trim the reference
        # NOTE: this works even if F1 or F6 are missing (e.g. only F2-5 are seq-ed)!
        # If more than one primer is used for the first or last fragment, take the
        # longest reference
        from hivwholeseq.data.primers import primers_PCR, primers_coordinates_HXB2
        if '+' in fragments[0]:
            fragment_subs = [
                fragments[0][:2] + fsub + fragments[0][-1]
                for fsub in fragments[0][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][0][0] for fsub in fragment_subs
            ]
            fragments[0] = fragment_subs[np.argmin(fr_pos_subs)]

        pr_fwd = primers_PCR[fragments[0]][0]

        if '+' in fragments[-1]:
            fragment_subs = [
                fragments[-1][:2] + fsub + fragments[-1][-1]
                for fsub in fragments[-1][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][1][1] for fsub in fragment_subs
            ]
            fragments[-1] = fragment_subs[np.argmax(fr_pos_subs)]

        pr_rev = primers_PCR[fragments[-1]][1]

        smat = np.array(seq)

        # Get all possible primers from ambiguous nucleotides and get the best match
        from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
        pr_fwd_mat = np.array(map(list, eas(pr_fwd)), 'S1')
        n_matches_fwd = [
            (smat[i:i + len(pr_fwd)] == pr_fwd_mat).sum(axis=1).max()
            for i in xrange(len(seq) - len(pr_fwd))
        ]
        pr_fwd_pos = np.argmax(n_matches_fwd)

        pr_rev_mat = np.array(map(list, eas(pr_rev)), 'S1')
        n_matches_rev = [
            (smat[i:i + len(pr_rev)] == pr_rev_mat).sum(axis=1).max()
            for i in xrange(pr_fwd_pos + len(pr_fwd),
                            len(seq) - len(pr_rev))
        ]
        # Here you come from the right, i.e. look in the 3' LTR first
        pr_rev_pos = len(seq) - len(pr_rev) - 1 - np.argmax(
            n_matches_rev[::-1])

        output = [['Reference name:', refname]]
        output.append(['FWD primer:', fragments[0], str(pr_fwd_pos), pr_fwd])
        output.append(['REV primer:', fragments[-1], str(pr_rev_pos), pr_rev])
        output = '\n'.join(map(' '.join, output))
        if VERBOSE:
            print output

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write(output)
                f.write('\n')

        # The reference includes both the first fwd primer and the last rev one
        seq_trim = seq[pr_fwd_pos:pr_rev_pos + len(pr_rev)]
        seq_trim.id = '_'.join(
            [seq_trim.id,
             str(pr_fwd_pos + 1),
             str(pr_rev_pos + len(pr_rev))])
        seq_trim.name = '_'.join([
            seq_trim.name,
            str(pr_fwd_pos + 1),
            str(pr_rev_pos + len(pr_rev))
        ])
        seq_trim.description = ' '.join([
            seq_trim.description, 'from',
            str(pr_fwd_pos + 1), 'to',
            str(pr_rev_pos + len(pr_rev)),
            '(indices from 1, extremes included)'
        ])

    SeqIO.write(seq_trim, output_filename, 'fasta')

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Reference sequence written to: ' + output_filename)
            f.write('\n')
Example #18
0
def check_premap(data_folder,
                 adaID,
                 fragments,
                 seq_run,
                 samplename,
                 qual_min=30,
                 match_len_min=10,
                 maxreads=-1,
                 VERBOSE=0,
                 savefig=True,
                 title=None):
    '''Check premap to reference: coverage, etc.'''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID),
                        'fasta')

    # FIXME: do this possibly better than parsing the description!
    try:
        fields = refseq.description.split()
        refseq_start = int(fields[fields.index('(indices') - 3])
    except ValueError:
        refseq_start = 550

    fragpos_filename = get_fragment_positions_filename(data_folder, adaID)
    if os.path.isfile(fragpos_filename):
        # Load the fragment positions, considering mixed fragments (e.g. F5a+b)
        fragtmp = []
        postmp = []
        with open(fragpos_filename, 'r') as f:
            f.readline()  #HEADER
            for line in f:
                fields = line[:-1].split('\t')
                fragtmp.append(fields[0])
                if 'inner' not in fields[1]:
                    postmp.append([fields[1], fields[4]])
                else:
                    start = int(
                        fields[1].split(',')[1].split(': ')[1].rstrip('}'))
                    end = int(
                        fields[4].split(',')[1].split(': ')[1].rstrip('}'))
                    postmp.append([start, end])

        postmp = np.array(postmp, int)
        # NOTE: In a lot of old files, it says F3o instead of F3ao
        if 'F3o' in fragtmp:
            fragtmp[fragtmp.index('F3o')] = 'F3ao'
        elif 'F3i' in fragtmp:
            fragtmp[fragtmp.index('F3i')] = 'F3ai'

        frags_pos = np.array([postmp[fragtmp.index(fr)] for fr in fragments],
                             int).T

    else:
        frags_pos = None

    frags_pos_out = None

    # Open BAM and scan reads
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        if VERBOSE:
            print 'Premapped BAM file not found'
        return (None, None)

    # Count reads if requested
    n_reads = get_number_reads(input_filename)
    if VERBOSE:
        print 'N. of reads:', n_reads

    # Get counts
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
        input_filename,
        len(refseq),
        qual_min=qual_min,
        match_len_min=match_len_min,
        maxreads=maxreads,
        VERBOSE=VERBOSE)

    # Plot results
    if title is None:
        title = ', '.join([
            'run ' + seq_run + ' ' + adaID,
            'sample ' + samplename,
            'reads ' + str(min(maxreads, n_reads)) + '/' + str(n_reads),
        ])
    plot_coverage(counts,
                  offset_x=refseq_start,
                  frags_pos=frags_pos,
                  frags_pos_out=frags_pos_out,
                  title=title)

    if savefig:
        from hivwholeseq.sequencing.adapter_info import foldername_adapter
        plt.savefig(data_folder + foldername_adapter(adaID) +
                    'figures/coverage_premapped_' + samplename + '.png')

    return (counts, inserts)