Ejemplo n.º 1
0
def build_local_consensus(seqs, VERBOSE=0, store_allele_counts=False, full_cover=True):
    '''Build a local consensus string from a set of sequences.

    The sequences are aligned with align_muscle and, column by column, the
    most frequent symbol among alpha[:-1] is taken (the last entry of alpha,
    N according to the inline comment, never wins). Ties are broken at random
    via np.random.shuffle. Positions where the consensus is a gap ('-') are
    removed from the returned string.

    The only tricky point is what to do with reads that do not cover the
    whole block, e.g. at the end of a fragment because of low coverage.
    If full_cover is False, the trailing run of gaps of every aligned row is
    overwritten with 'X' before counting, and columns whose total count is
    zero are discarded.

    Args:
      seqs (list of SeqRecords): seqs to build consensus from
      VERBOSE (int): not used inside this function
      store_allele_counts (bool): return also allele counts from the alignment
      full_cover (bool): if True, assume the reads fully cover the region
        (no gaps at edges) and count every aligned symbol as is

    Returns:
      The consensus as a string or, if store_allele_counts is True, the tuple
      (consensus, allele_counts) where allele_counts is restricted to the
      columns kept in the consensus.
    '''

    import numpy as np
    from hivwholeseq.utils.miseq import alpha
    from hivwholeseq.utils.mapping import align_muscle

    msa = np.array(align_muscle(*seqs, sort=True), 'S1', ndmin=2)
    if not full_cover:
        allele_counts = np.zeros((len(alpha), len(msa[0])), int)
        for irow in xrange(len(seqs)):
            # row is a view: masking it also modifies msa in place
            row = msa[irow]
            if row[-1] == '-':
                # Length of the row once its trailing gaps are stripped
                n_kept = len(row.tostring().rstrip('-'))
                row[n_kept:] = 'X'
            for ia, allele in enumerate(alpha):
                allele_counts[ia] += row == allele

        # Drop the columns that no symbol of alpha covers
        covered = allele_counts.sum(axis=0) > 0
        allele_counts = allele_counts[:, covered]
    else:
        allele_counts = np.array([(msa == allele).sum(axis=0) for allele in alpha],
                                 int, ndmin=2)

    picks = []
    for column in allele_counts.T:
        # Candidates are the max count symbols, the last entry of alpha excluded
        top = (column[:-1] == column.max()).nonzero()[0]
        if len(top) == 0:
            # Only the excluded last symbol reaches the maximum: call a gap
            picks.append('-')
        else:
            # Random choice in case of a tie
            if len(top) > 1:
                np.random.shuffle(top)
            picks.append(alpha[top[0]])
    picks = np.array(picks, 'S1')

    keep = picks != '-'
    consensus = ''.join(picks[keep])

    if not store_allele_counts:
        return consensus

    return (consensus, allele_counts[:, keep])
Ejemplo n.º 2
0
def get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=0):
    '''Find the overlap coordinates for the two fragments'''
    from hivwholeseq.utils.mapping import align_muscle

    seq1 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag1), 'fasta')
    seq2 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag2), 'fasta')
    sm1 = np.array(seq1)
    sm2 = np.array(seq2)

    # Find the beginning of s2 in s1
    seed_len = 20
    matches_min = 16
    seed = sm2[:seed_len]
    found = False
    trials = 0
    while (not found) and (trials < 3):
        for pos in xrange(len(seq1) - 700, len(seq1) - seed_len):
            if (sm1[pos: pos + seed_len] == seed).sum() >= matches_min - trials:
                found = True
                start_s2 = pos
                break
        if not found:
            trials += 1

    if not found:
        return None

    if VERBOSE >= 3:
        print 'Beginning of '+frag2+' found in '+frag1

    # In an ideal world, the overlap is a holy place in which no indels happen.
    # We cannot assume that, sadly. However, we can search from the other side
    # and align: find the end of s1 in s2
    found = False
    seed = sm1[-seed_len:]
    trials = 0
    while (not found) and (trials < 3):
        for pos in xrange(700):
            if (sm2[pos: pos + seed_len] == seed).sum() >= matches_min - trials:
                found = True
                end_s1 = pos + seed_len
                break
        if not found:
            trials += 1
    if not found:
        return None

    if VERBOSE >= 3:
        print 'End of '+frag1+' found in '+frag2

    # Align
    ali = align_muscle(seq1[start_s2:], seq2[:end_s1])
    return (start_s2, end_s1, ali)
Ejemplo n.º 3
0
def get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=0):
    """Find the overlap coordinates for the two fragments"""
    from hivwholeseq.utils.mapping import align_muscle

    seq1 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag1), "fasta")
    seq2 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag2), "fasta")
    sm1 = np.array(seq1)
    sm2 = np.array(seq2)

    # Find the beginning of s2 in s1
    seed_len = 20
    matches_min = 16
    seed = sm2[:seed_len]
    found = False
    trials = 0
    while (not found) and (trials < 3):
        for pos in xrange(len(seq1) - 700, len(seq1) - seed_len):
            if (sm1[pos : pos + seed_len] == seed).sum() >= matches_min - trials:
                found = True
                start_s2 = pos
                break
        if not found:
            trials += 1

    if not found:
        return None

    if VERBOSE >= 3:
        print "Beginning of " + frag2 + " found in " + frag1

    # In an ideal world, the overlap is a holy place in which no indels happen.
    # We cannot assume that, sadly. However, we can search from the other side
    # and align: find the end of s1 in s2
    found = False
    seed = sm1[-seed_len:]
    trials = 0
    while (not found) and (trials < 3):
        for pos in xrange(700):
            if (sm2[pos : pos + seed_len] == seed).sum() >= matches_min - trials:
                found = True
                end_s1 = pos + seed_len
                break
        if not found:
            trials += 1
    if not found:
        return None

    if VERBOSE >= 3:
        print "End of " + frag1 + " found in " + frag2

    # Align
    ali = align_muscle(seq1[start_s2:], seq2[:end_s1])
    return (start_s2, end_s1, ali)
Ejemplo n.º 4
0
def build_msa(htseqs, VERBOSE=0):
    '''Align a collection of haplotype sequences.

    Each haplotype is wrapped into a SeqRecord whose id and name are '#'
    followed by its position in htseqs, then all records are handed to
    align_muscle with sort=True.

    Args:
      htseqs (iterable of str): haplotype sequences
      VERBOSE (int): not used inside this function

    Returns:
      whatever align_muscle returns for the records
    '''
    from Bio.Alphabet.IUPAC import ambiguous_dna
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    records = []
    for idx, haplotype in enumerate(htseqs):
        tag = '#' + str(idx)
        records.append(SeqRecord(Seq(haplotype, ambiguous_dna), id=tag, name=tag))

    from hivwholeseq.utils.mapping import align_muscle
    return align_muscle(*records, sort=True)
Ejemplo n.º 5
0
def build_msa(htseqs, VERBOSE=0):
    '''Align a collection of haplotype sequences.

    Each haplotype becomes a SeqRecord labelled '#<index>' (both id and
    name), and the records are aligned together by align_muscle with
    sort=True.

    Args:
      htseqs (iterable of str): haplotype sequences
      VERBOSE (int): not used inside this function

    Returns:
      whatever align_muscle returns for the records
    '''
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.Alphabet.IUPAC import ambiguous_dna

    def as_record(index, sequence):
        # id and name share the same '#<index>' label
        label = '#' + str(index)
        return SeqRecord(Seq(sequence, ambiguous_dna), id=label, name=label)

    records = [as_record(index, sequence) for index, sequence in enumerate(htseqs)]

    from hivwholeseq.utils.mapping import align_muscle
    return align_muscle(*records, sort=True)
Ejemplo n.º 6
0
def build_msa_haplotypes(haploc, VERBOSE=0, label=''):
    '''Align the haplotypes of a counter, labelling them by count and rank.

    The (sequence, count) pairs are taken in the order given by
    haploc.most_common(). Each one becomes a SeqRecord with an empty
    description whose id and name are
    label + 'count_<count>_rank_<rank>', where rank is the position in that
    order. The records are aligned by align_muscle with sort=True.

    Args:
      haploc: object with a most_common() method yielding (sequence, count)
        pairs (presumably a collections.Counter - verify against caller)
      VERBOSE (int): not used inside this function
      label (str): prefix for the record ids and names

    Returns:
      whatever align_muscle returns for the records
    '''
    from Bio.Alphabet.IUPAC import ambiguous_dna
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    records = []
    for rank, (haplotype, count) in enumerate(haploc.most_common()):
        tag = label+'count_'+str(count)+'_rank_'+str(rank)
        records.append(SeqRecord(Seq(haplotype, ambiguous_dna),
                                 id=tag,
                                 name=tag,
                                 description=''))

    from hivwholeseq.utils.mapping import align_muscle
    return align_muscle(*records, sort=True)
def write_consensus_final(seq_run, adaID, fragment, consensus):
    '''Write the final consensus and realign the file of all references.

    The output fragment name is the first two characters of fragment
    (fragments are now called F5 instead of F5ai). The consensus is written
    in FASTA format to the primer-trimmed consensus file; afterwards the
    sequences in the "reference all" file of the full fragment name are
    aligned with align_muscle and written back to that same file.

    Args:
      seq_run (str): key into MiSeq_runs
      adaID (str): adapter ID, looked up in the dataset 'adapters' list
      fragment (str): full fragment name
      consensus (str): consensus sequence to store
    '''
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset['folder']
    samplename = dataset['samples'][dataset['adapters'].index(adaID)]

    frag_out = fragment[:2]
    prefix = samplename+'_seqrun_'+seq_run
    name = prefix+'_adaID_'+adaID+'_'+frag_out+'_consensus'
    record = SeqRecord(Seq(consensus), id=name, name=name)

    consensus_fn = get_consensus_filename(data_folder, adaID, frag_out, trim_primers=True)
    SeqIO.write(record, consensus_fn, 'fasta')

    # Align all consensi via muscle and store them in place:
    # the same path is read and then overwritten
    ref_all_fn = get_reference_all_filename(data_folder, adaID, fragment)
    records = list(SeqIO.parse(ref_all_fn, 'fasta'))
    AlignIO.write(align_muscle(*records), ref_all_fn, 'fasta')
Ejemplo n.º 8
0
def align_consensi_dataset(dataset, adaIDs, fragments, VERBOSE=0):
    '''Align consensi from different samples in a dataset'''

    data_folder = dataset['folder']

    # Collect consensi
    if VERBOSE >= 1:
        print 'Collecting consensi...',
    consensi = defaultdict(dict)
    for adaID in adaIDs:
        samplename = dataset['samples'][dataset['adapters'].index(adaID)]
        fragments_sample = samples[samplename]['fragments']
        for frag in fragments_sample:
            frag_gen = frag[:2]
            if frag_gen not in fragments:
                continue
            con_fn = get_consensus_filename(data_folder, adaID, frag_gen)
            if os.path.isfile(con_fn):
                con = SeqIO.read(con_fn, 'fasta')
                consensi[frag_gen][adaID] = con

        if 'genomewide' in fragments:
            frag_gens = [frag[:2] for frag in fragments_sample]
            con_gw_fn = get_merged_consensus_filename(data_folder, adaID,
                                                      frag_gens)
            if os.path.isfile(con_gw_fn):
                con = SeqIO.read(con_gw_fn, 'fasta')
                consensi['genomewide'][adaID] = con

    if VERBOSE >= 1:
        print 'done.'
        print 'Aligning...',

    # Align
    alis = {}
    for (frag, con_dict) in consensi.iteritems():
        if VERBOSE >= 2:
            print frag,
        ali_frag = align_muscle(*(con_dict.values()))
        alis[frag] = ali_frag

    if VERBOSE >= 1:
        print 'done.'

    return alis
def align_consensi_dataset(dataset, adaIDs, fragments, VERBOSE=0):
    '''Align consensi from different samples in a dataset'''

    data_folder = dataset['folder']

    # Collect consensi
    if VERBOSE >= 1:
        print 'Collecting consensi...',
    consensi = defaultdict(dict)
    for adaID in adaIDs:
        samplename = dataset['samples'][dataset['adapters'].index(adaID)]
        fragments_sample = samples[samplename]['fragments']
        for frag in fragments_sample:
            frag_gen = frag[:2]
            if frag_gen not in fragments:
                continue
            con_fn = get_consensus_filename(data_folder, adaID, frag_gen)
            if os.path.isfile(con_fn):
                con = SeqIO.read(con_fn, 'fasta')
                consensi[frag_gen][adaID] = con

        if 'genomewide' in fragments:
            frag_gens = [frag[:2] for frag in fragments_sample]
            con_gw_fn = get_merged_consensus_filename(data_folder, adaID, frag_gens)
            if os.path.isfile(con_gw_fn):
                con = SeqIO.read(con_gw_fn, 'fasta')
                consensi['genomewide'][adaID] = con
    
    if VERBOSE >= 1:
        print 'done.'
        print 'Aligning...',

    # Align
    alis = {}
    for (frag, con_dict) in consensi.iteritems():
        if VERBOSE >= 2:
            print frag,
        ali_frag = align_muscle(*(con_dict.values()))
        alis[frag] = ali_frag

    if VERBOSE >= 1:
        print 'done.'

    return alis
Ejemplo n.º 10
0
def write_consensus_final(seq_run, adaID, fragment, consensus):
    '''Store the final consensus, then realign the file of all references.

    Fragments are now called F5 instead of F5ai: only the first two
    characters of fragment are used for the record name and for the
    primer-trimmed consensus file. The "reference all" file, looked up with
    the full fragment name, is parsed, aligned with align_muscle and
    overwritten with the alignment.

    Args:
      seq_run (str): key into MiSeq_runs
      adaID (str): adapter ID, looked up in the dataset 'adapters' list
      fragment (str): full fragment name
      consensus (str): consensus sequence to store
    '''
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset['folder']
    samplename = dataset['samples'][dataset['adapters'].index(adaID)]

    frag_out = fragment[:2]
    name = samplename + '_seqrun_' + seq_run
    name = name + '_adaID_' + adaID
    name = name + '_' + frag_out + '_consensus'

    # Write the consensus itself
    outfile = get_consensus_filename(data_folder, adaID, frag_out,
                                     trim_primers=True)
    SeqIO.write(SeqRecord(Seq(consensus), id=name, name=name), outfile, 'fasta')

    # Align all consensi via muscle and store (input and output file coincide)
    fn_all = get_reference_all_filename(data_folder, adaID, fragment)
    seqs = list(SeqIO.parse(fn_all, 'fasta'))
    ali = align_muscle(*seqs)
    AlignIO.write(ali, fn_all, 'fasta')
Ejemplo n.º 11
0
                accept_holes=(fragment == 'genomewide'),
                store_allele_counts=store_allele_counts)
            if store_allele_counts:
                (consensus, allele_counts) = consensus

            # Store to file
            if VERBOSE:
                print 'Store to file'
            name = samplename + '_seqrun_' + seq_run + '_adaID_' + adaID + '_' + frag_out + '_consensus'
            consensusseq = SeqRecord(Seq(consensus, ambiguous_dna),
                                     id=name,
                                     name=name)

            # Align consensus to reference via muscle and trim end gaps in ref
            # (improper primer trimming in trim_and_divide)
            ali = align_muscle(refseq, consensusseq, sort=True)

            if ali[0][-1] == '-':
                start_nongap = len(ali[0]) - len(ali[0].seq.lstrip('-'))
                end_nongap = len(ali[0].seq.rstrip('-'))
                ali = ali[:, start_nongap:end_nongap]

            if VERBOSE >= 2:
                print ali[:, :30]
                print ali[:, -30:]
                print 'Lenghts: ref', len(refseq), 'consensus', len(
                    consensusseq)
                len_ali = ali.get_alignment_length()
                n_diff = sum(ali[0, i] != ali[1, i] for i in xrange(len_ali))
                print 'Differences from ref:', n_diff, '(' + '{:3.1f}'.format(
                    100.0 * n_diff / len_ali) + '%)'
Ejemplo n.º 12
0
                                        block_len_initial=block_len_initial,
                                        reads_per_alignment=n_reads_per_ali,
                                        accept_holes=(fragment == 'genomewide'),
                                        store_allele_counts=store_allele_counts)
            if store_allele_counts:
                (consensus, allele_counts) = consensus

            # Store to file
            if VERBOSE:
                print 'Store to file'
            name = samplename+'_seqrun_'+seq_run+'_adaID_'+adaID+'_'+frag_out+'_consensus'
            consensusseq = SeqRecord(Seq(consensus, ambiguous_dna), id=name, name=name)

            # Align consensus to reference via muscle and trim end gaps in ref
            # (improper primer trimming in trim_and_divide)
            ali = align_muscle(refseq, consensusseq, sort=True)

            if ali[0][-1] == '-':
                start_nongap = len(ali[0]) - len(ali[0].seq.lstrip('-'))
                end_nongap = len(ali[0].seq.rstrip('-'))
                ali = ali[:, start_nongap: end_nongap]

            if VERBOSE >= 2:
                print ali[:, :30]
                print ali[:, -30:]
                print 'Lenghts: ref', len(refseq), 'consensus', len(consensusseq)
                len_ali = ali.get_alignment_length()
                n_diff = sum(ali[0, i] != ali[1, i] for i in xrange(len_ali))
                print 'Differences from ref:', n_diff, '('+'{:3.1f}'.format(100.0 * n_diff / len_ali)+'%)'

            # Ungap consensus
                cons_seq.description = ', '.join([
                    'Patient: ' + patient.code, time, 'region: ' + region,
                    'consensus'
                ])
                cons_seq.cell_count = sample['CD4+ count']
                cons_seq.viral_load = sample['viral load']
                cons_seq.subtype = patient['Subtype']
                seqs.append(cons_seq)
                if use_joint:
                    seqs_all.append(cons_seq)
                if VERBOSE >= 2:
                    print 'OK'

            if VERBOSE >= 2:
                print 'Align',
            ali = align_muscle(*seqs, sort=True)
            if VERBOSE >= 2:
                print 'OK'

            if use_save:
                if VERBOSE >= 2:
                    print 'Save alignment',
                fn_out = patient.get_consensi_alignment_filename(region)
                mkdirs(os.path.dirname(fn_out))
                AlignIO.write(ali, fn_out, 'fasta')
                if VERBOSE >= 2:
                    print 'OK'

            if VERBOSE >= 2:
                print 'Build local tree'
            tree = build_tree_fasttree(ali, VERBOSE=VERBOSE)
                cons_seq.description = ', '.join(['Patient: '+patient.code,
                                                  time,
                                                  'region: '+region,
                                                  'consensus'])
                cons_seq.cell_count = sample['CD4+ count']
                cons_seq.viral_load = sample['viral load']
                cons_seq.subtype = patient['Subtype']
                seqs.append(cons_seq)
                if use_joint:
                    seqs_all.append(cons_seq)
                if VERBOSE >= 2:
                    print 'OK'
                    
            if VERBOSE >= 2:
                print 'Align',
            ali = align_muscle(*seqs, sort=True)
            if VERBOSE >= 2:
                print 'OK'

            if use_save:
                if VERBOSE >= 2:
                    print 'Save alignment',
                fn_out = patient.get_consensi_alignment_filename(region)
                mkdirs(os.path.dirname(fn_out))
                AlignIO.write(ali, fn_out, 'fasta')
                if VERBOSE >= 2:
                    print 'OK'

            if VERBOSE >= 2:
                print 'Build local tree'
            tree = build_tree_fasttree(ali, VERBOSE=VERBOSE)