def build_coordinate_map(refseq, patseq, VERBOSE=0, score_gapopen=-20, **kwargs):
    '''Map reference positions onto patient sequence positions

    The two sequences are aligned globally; every alignment column in which
    both sequences carry a residue contributes one pair to the map.

    Parameters
      refseq: reference sequence (its .name is read when VERBOSE >= 3)
      patseq: patient sequence (its .name is read when VERBOSE >= 3)
      VERBOSE: verbosity level, the trimmed alignment is printed from 3 on
      score_gapopen: gap opening score handed to the alignment function
      **kwargs: passed to alignment function (e.g. alignment penalties)

    Returns
      list of (position in refseq, position in patseq) tuples
    '''
    from seqanpy import align_global

    (_, ali_ref, ali_pat) = align_global(refseq, patseq,
                                         score_gapopen=score_gapopen,
                                         **kwargs)

    # Alignment columns covered by the patient sequence, i.e. without the
    # leading and trailing gaps of its alignment row
    col_first = len(ali_pat) - len(ali_pat.lstrip('-'))
    col_last = len(ali_pat.rstrip('-'))
    window_ref = ali_ref[col_first: col_last]
    window_pat = ali_pat[col_first: col_last]

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([window_ref, window_pat],
                                  name1=refseq.name, name2=patseq.name)

    # Walk the columns and advance each coordinate only where its row has a
    # residue; the reference counter starts at the first covered column
    coord_pairs = []
    i_ref = col_first
    i_pat = 0
    for (char_ref, char_pat) in zip(window_ref, window_pat):
        has_ref = char_ref != '-'
        has_pat = char_pat != '-'
        if has_ref and has_pat:
            coord_pairs.append((i_ref, i_pat))
        if has_ref:
            i_ref += 1
        if has_pat:
            i_pat += 1

    return coord_pairs
def merge_allele_counts(ref_genomewide, acs, VERBOSE=0):
    '''Merge the allele counts of all fragments
    
    Note: we do not require full coverage of all fragments, the missing
          ones will just have zero counts. Sometimes, cherry-picking the data
          fragment by fragment might be a better choice.
    '''
    from hivwholeseq.utils.miseq import alpha, read_types
    from seqanpy import align_overlap

    ac = np.zeros((len(read_types), len(alpha), len(ref_genomewide)), int)

    pos_ref = 1000
    for (fr, ref, acsi) in acs:

        # Find the coordinates
        (score, ali1, ali2) = align_overlap(ref_genomewide[pos_ref - 1000:],
                                            ref,
                                            #score_gapopen=-20,
                                           )
        fr_start = len(ali2) - len(ali2.lstrip('-'))
        fr_end = len(ali2.rstrip('-'))

        if VERBOSE:
            print fr, pos_ref - 1000 + fr_start, pos_ref - 1000 + fr_end

        # Scan the alignment
        pos_ref = pos_ref - 1000 + fr_start
        fr_start_ref = pos_ref
        fr_end_ref = pos_ref + fr_end - fr_start
        pos_fr = 0
        for pos_ali in xrange(fr_start, fr_end):
            # Gap in genomewise, ignore position
            if ali1[pos_ali] == '-':
                pos_fr += 1
                continue

            # Gap in fragment, ignore FIXME: probably we should put deletions
            elif ali2[pos_ali] == '-':
                pos_ref += 1
                continue

            # Add the counts
            # NOTE: all fragments are treated the same, even in case of coverage
            # differences of orders of magnitude. This means, larger coverage
            # always wins. Maybe we want to implement this somewhat differently
            ac[:, :, pos_ref] += acsi[:, :, pos_fr]
            pos_fr += 1
            pos_ref += 1

        if VERBOSE >= 3:
            from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
            cons = alpha[ac.sum(axis=0).argmax(axis=0)]
            pretty_print_pairwise_ali((ali1[fr_start: fr_end],
                                       cons[fr_start: fr_end]),
                                      name1='gw',
                                      name2=fr,
                                      width=100)

    return ac
Exemple #3
0
 def on_click(event):
     '''Align the clicked sequence to seq0 and print the alignment

     The label of the picked artist is read as the integer index into seqs.
     seq0, seqs, align_global and pretty_print_pairwise_ali are not defined
     here and must come from an outer scope.
     '''
     # Read but not used, kept as in the original handler
     unused_mouseevent = event.mouseevent
     picked = int(event.artist.get_label())
     alignment = align_global(seq0, seqs[picked], score_gapopen=-20)
     (_, ali_cons, ali_picked) = alignment
     pretty_print_pairwise_ali((ali_cons, ali_picked),
                               name1='cons0', name2='clicked', width=120)
Exemple #4
0
def join_block_to_consensus(consensus, cons_block, VERBOSE=0, deltamax=60):
    '''Join a new block to an extant consensus'''
    import numpy as np
    from seqanpy import align_ladder

    (score, ali1, ali2) = align_ladder(consensus,
                                       cons_block,
                                       score_gapopen=-10)

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([ali1, ali2],
                                  name1='consensus',
                                  name2='new block')

    # In very rare occasions (coverage holes), the second sequence is actually
    # shorter than the first, then we do not need to glue it in
    if ali2[-1] == '-':
        if VERBOSE >= 2:
            print 'WARNING: the old block is longer than the new one (maybe low coverage)'
        return consensus

    end1 = len(ali1.rstrip('-'))
    start2 = len(ali2) - len(ali2.lstrip('-'))
    scoremax = 3 * (end1 - start2)
    delta = scoremax - score
    if delta > deltamax:
        raise ValueError(
            'Too many mismatches in neighbouring local consensi! (' +
            str(delta) + ', max ' + str(deltamax) + ')')
    consensus = (ali1[:start2] + ali2[start2:]).replace('-', '')
    return consensus
def join_block_to_consensus(consensus, cons_block, VERBOSE=0, deltamax=60):
    '''Join a new block to an extant consensus'''
    import numpy as np
    from seqanpy import align_ladder

    (score, ali1, ali2) = align_ladder(consensus, cons_block, score_gapopen=-10)

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([ali1, ali2], name1='consensus', name2='new block')

    # In very rare occasions (coverage holes), the second sequence is actually
    # shorter than the first, then we do not need to glue it in
    if ali2[-1] == '-':
        if VERBOSE >= 2:
            print 'WARNING: the old block is longer than the new one (maybe low coverage)'
        return consensus

    end1 = len(ali1.rstrip('-'))
    start2 = len(ali2) - len(ali2.lstrip('-'))
    scoremax = 3 * (end1 - start2)
    delta = scoremax - score
    if delta > deltamax:
        raise ValueError('Too many mismatches in neighbouring local consensi! ('+str(delta)+', max '+str(deltamax)+')')
    consensus = (ali1[:start2] + ali2[start2:]).replace('-', '')
    return consensus
def align_fragments(c1, c2, VERBOSE=0, name1=None, name2=None):
    '''Align subsequence fragments

    The two sequences are aligned with align_ladder and the alignment is
    trimmed to the overlap, i.e. from the first residue of c2 to the last
    residue of c1.

    Parameters:
      c1: first sequence
      c2: second sequence
      VERBOSE: verbosity level, the trimmed alignment is printed from 3 on
      name1, name2: labels for the printed alignment. If None, module-level
                    names fr1/fr2 are used when they exist (the labels used
                    to be read from there), else 'seq1'/'seq2'.

    Returns:
      (pos1, pos2): integer arrays with the positions in c1 and in c2 of the
      overlap columns in which neither sequence has a gap
    '''
    import numpy as np
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    (score, a1, a2) = align_ladder(c1, c2, score_gapopen=-20)
    start2 = len(a2) - len(a2.lstrip('-'))
    end1 = len(a1.rstrip('-'))

    a1 = a1[start2: end1]
    a2 = a2[start2: end1]

    if VERBOSE >= 3:
        # fr1/fr2 were never parameters of this function: without matching
        # module globals this branch raised a NameError
        if name1 is None:
            name1 = globals().get('fr1', 'seq1')
        if name2 is None:
            name2 = globals().get('fr2', 'seq2')
        pretty_print_pairwise_ali((a1, a2), width=100,
                                  name1=name1, name2=name2)

    a1 = np.fromstring(a1, 'S1')
    a2 = np.fromstring(a2, 'S1')

    # Running count of residues gives, per column, the index of the residue
    # within the trimmed row
    co1 = (a1 != '-').cumsum() - 1
    co2 = (a2 != '-').cumsum() - 1
    ind = (a1 != '-') & (a2 != '-')

    # The trimmed row of c1 starts at column start2; this offset equals the
    # position in c1 provided a1 has no gaps before the overlap
    pos1 = co1[ind] + start2
    pos2 = co2[ind]

    return (pos1, pos2)
Exemple #7
0
def merge_sequences(seqs, skip_initial=30, accept_gaps=False, VERBOSE=0):
    '''Merge sequences with overlaps
    
    Parameters:
       seqs (list): sequences to merge
       skip_initial (int): trim from the beginning of overlaps because we do not
       really trust those bases
       accept_gaps (bool): accept gaps in the overlaps
    '''
    from itertools import izip
    from seqanpy import align_ladder
    import numpy as np

    seqs = map(''.join, seqs)

    left_trim = 0
    seqs_all = []
    for iov, (seq1, seq2) in enumerate(izip(seqs[:-1], seqs[1:])):
        if VERBOSE >= 1:
            print 'Overlap n', iov+1

        (score, ali1, ali2) = align_ladder(seq1[left_trim:], seq2, score_gapopen=-20)
        start2 = len(ali2) - len(ali2.lstrip('-'))
        end1 = len(ali1.rstrip('-'))

        # Append first sequence until overlap
        seqs_all.append(ali1[:start2 + skip_initial])

        # Check overlap
        ov1 = ali1[start2 + skip_initial: end1 - skip_initial]
        ov2 = ali2[start2 + skip_initial: end1 - skip_initial]

        if VERBOSE >= 2:
            from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
            pretty_print_pairwise_ali((ov1, ov2), width=100,
                                      name1='seq1', name2='seq2')

        if (not accept_gaps) and (('-' in ov1) or ('-' in ov2)):
            raise ValueError('Gaps in the overlap n. '+str(iov+1))

        # Trust the first sequence until half, then the other one
        i_mid = len(ov1) // 2
        seqs_all.append(ov1[:i_mid])
        seqs_all.append(ov2[i_mid:])

        # Set the left trim for the trailing sequence
        left_trim = len(ali2[: end1 - skip_initial].replace('-', ''))

    if VERBOSE >= 1:
        print 'Add last sequence'
    seqs_all.append(seq2[left_trim:])

    return ''.join(seqs_all)
Exemple #8
0
def build_coordinate_map(refseq,
                         patseq,
                         VERBOSE=0,
                         score_gapopen=-20,
                         **kwargs):
    '''Map reference positions onto patient sequence positions

    The two sequences are aligned globally; every alignment column in which
    both sequences carry a residue contributes one pair to the map.

    Parameters
      refseq: reference sequence (its .name is read when VERBOSE >= 3)
      patseq: patient sequence (its .name is read when VERBOSE >= 3)
      VERBOSE: verbosity level, the trimmed alignment is printed from 3 on
      score_gapopen: gap opening score handed to the alignment function
      **kwargs: passed to alignment function (e.g. alignment penalties)

    Returns
      list of (position in refseq, position in patseq) tuples
    '''
    from seqanpy import align_global

    (_, row_ref, row_pat) = align_global(refseq,
                                         patseq,
                                         score_gapopen=score_gapopen,
                                         **kwargs)

    # First and one-past-last alignment column covered by the patient row
    n_leading_gaps = len(row_pat) - len(row_pat.lstrip('-'))
    col_stop = len(row_pat.rstrip('-'))

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        trimmed = [row_ref[n_leading_gaps:col_stop],
                   row_pat[n_leading_gaps:col_stop]]
        pretty_print_pairwise_ali(trimmed,
                                  name1=refseq.name,
                                  name2=patseq.name)

    # Bijective map: advance each coordinate only where its row has a residue
    pairs = []
    coord_ref = n_leading_gaps
    coord_pat = 0
    for column in xrange(n_leading_gaps, col_stop):
        ref_present = row_ref[column] != '-'
        pat_present = row_pat[column] != '-'
        if ref_present and pat_present:
            pairs.append((coord_ref, coord_pat))
        if ref_present:
            coord_ref += 1
        if pat_present:
            coord_pat += 1

    return pairs
Exemple #9
0
def check_reference_overlap(p, VERBOSE=0):
    '''Check whether the reference from the various fragments overlap correctly'''
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    fragments = ['F' + str(i + 1) for i in xrange(6)]
    title = 'Overlaps'
    line = ('{:<' + str(title_len) + '}').format(title + ':')
    stati = []
    for i in xrange(len(fragments) - 1):
        ref1 = p.get_reference(fragments[i])
        ref2 = p.get_reference(fragments[i + 1])
        (score, ali1, ali2) = align_ladder(ref1,
                                           ref2,
                                           score_gapopen=-10,
                                           score_gapext=-1)

        start2 = len(ali2) - len(ali2.lstrip('-'))
        end1 = len(ali1.rstrip('-'))

        if VERBOSE >= 4:
            pretty_print_pairwise_ali((ali1[start2:end1], ali2[start2:end1]),
                                      name1=fragments[i],
                                      name2=fragments[i + 1],
                                      width=100)

        if ali1[start2:end1].count('-') == ali2[start2:end1].count('-'):
            status = 'OK'
        else:
            status = 'GAPS'
            import ipdb
            ipdb.set_trace()

        line = line+fragments[i]+': '+\
            ('{:>'+str(cell_len - len(fragments[i]) - 1)+'}').format(status)+'  '
        stati.append(status)

    print line

    if 'GAPS' in stati:
        raise ValueError('GAPS status found')
def check_reference_overlap(p, VERBOSE=0):
    '''Check whether the reference from the various fragments overlap correctly'''
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    fragments = ['F'+str(i+1) for i in xrange(6)]
    title = 'Overlaps'
    line = ('{:<'+str(title_len)+'}').format(title+':')
    stati = []
    for i in xrange(len(fragments) - 1):
        ref1 = p.get_reference(fragments[i])
        ref2 = p.get_reference(fragments[i+1])
        (score, ali1, ali2) = align_ladder(ref1, ref2,
                                           score_gapopen=-10,
                                           score_gapext=-1)

        start2 = len(ali2) - len(ali2.lstrip('-'))
        end1 = len(ali1.rstrip('-'))

        if VERBOSE >= 4:
            pretty_print_pairwise_ali((ali1[start2: end1], ali2[start2: end1]),
                                      name1=fragments[i],
                                      name2=fragments[i+1],
                                      width=100)
        
        if ali1[start2: end1].count('-') == ali2[start2: end1].count('-'):
            status = 'OK'
        else:
            status = 'GAPS'
            import ipdb; ipdb.set_trace()

        line = line+fragments[i]+': '+\
            ('{:>'+str(cell_len - len(fragments[i]) - 1)+'}').format(status)+'  '
        stati.append(status)

    print line

    if 'GAPS' in stati:
        raise ValueError('GAPS status found') 
Exemple #11
0
def check_protein(fea, seqgw, VERBOSE=0, delta_pos=2.5):
    '''Sanity checks on a protein-coding annotation

    The annotated region is extracted from the genomewide sequence, checked
    for length, ambiguous nucleotides, gaps, premature stops and unknown
    amino acids, and finally compared with the same region of HXB2.

    Parameters:
      fea: the annotation; provides extract() and an id
      seqgw: the genomewide sequence the annotation refers to
      VERBOSE: verbosity level, the alignment to HXB2 is printed from 3 on
      delta_pos: tolerated score shortfall per alignment column

    Raises:
      ValueError: as soon as one of the checks fails
    '''
    region = fea.id
    seq = fea.extract(seqgw).seq

    if len(seq) % 3 != 0:
        raise ValueError('The length of ' + region + ' is not a multiple of 3')

    if 'N' in seq:
        raise ValueError('N nucleotides found in ' + region)

    if '-' in seq:
        raise ValueError('Gaps found in ' + region)

    prot = seq.translate()

    # A stop codon is tolerated only as the very last amino acid
    i_stop = prot.find('*')
    if (i_stop != -1) and (i_stop != len(prot) - 1):
        raise ValueError('Premature stops found in ' + region)

    if 'X' in prot:
        raise ValueError('X amino acids found in ' + region)

    # Compare to HXB2
    from hivwholeseq.reference import load_custom_reference
    ref = load_custom_reference('HXB2', region=region)

    from seqanpy import align_global
    (score, alis, alir) = align_global(seq, ref, score_gapopen=-20)
    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali((alir, alis), name1='HXB2', name2='seq',
                                  width=100)

    # Shortfall with respect to a score of 3 per alignment column
    n_columns = len(alis)
    shortfall = 3 * n_columns - score
    if shortfall > delta_pos * n_columns:
        raise ValueError('The sequence of ' + region +
                         ' looks different from HXB2')
def check_protein(fea, seqgw, VERBOSE=0, delta_pos=2.5):
    '''Sanity checks on a protein-coding annotation

    The annotated region is extracted from the genomewide sequence, checked
    for length, ambiguous nucleotides, gaps, premature stops and unknown
    amino acids, and finally compared with the same region of HXB2.

    Parameters:
      fea: the annotation; provides extract() and an id
      seqgw: the genomewide sequence the annotation refers to
      VERBOSE: verbosity level, the alignment to HXB2 is printed from 3 on
      delta_pos: tolerated score shortfall per alignment column

    Raises:
      ValueError: as soon as one of the checks fails
    '''
    region = fea.id
    seq = fea.extract(seqgw).seq

    if len(seq) % 3 != 0:
        raise ValueError('The length of '+region+' is not a multiple of 3')

    if 'N' in seq:
        raise ValueError('N nucleotides found in '+region)

    if '-' in seq:
        raise ValueError('Gaps found in '+region)

    prot = seq.translate()

    # A stop codon is tolerated only as the very last amino acid
    i_stop = prot.find('*')
    if (i_stop != -1) and (i_stop != len(prot) - 1):
        raise ValueError('Premature stops found in '+region)

    if 'X' in prot:
        raise ValueError('X amino acids found in '+region)

    # Compare to HXB2
    from hivwholeseq.reference import load_custom_reference
    ref = load_custom_reference('HXB2', region=region)

    from seqanpy import align_global
    (score, alis, alir) = align_global(seq, ref, score_gapopen=-20)
    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali((alir, alis), name1='HXB2', name2='seq',
                                  width=100)

    # Shortfall with respect to a score of 3 per alignment column
    n_columns = len(alis)
    shortfall = 3 * n_columns - score
    if shortfall > delta_pos * n_columns:
        raise ValueError('The sequence of '+region+' looks different from HXB2')
def align_to_reference(seq, refstr, VERBOSE=0, codon_align=False,
                       require_full_cover=True, refname=None):
    '''Align sequence to reference, stripping reference gaps

    Parameters:
      seq: sequence record to align; its letters, seq.alphabet, id, name and
           description are used
      refstr: the reference as a string
      VERBOSE: verbosity level, the alignment is printed from 2 on
      codon_align: if True, realign the ungapped pair with
                   align_codon_pairwise (defined elsewhere in the module)
      require_full_cover: if True, the sequence must span the whole reference
                          and an overlap alignment is used; if False a local
                          alignment is used and the uncovered flanks are
                          filled with N
      refname: label of the reference in the printed alignment. If None, a
               module-level name refname is used when it exists (the label
               used to be read from there), else 'ref'.

    Returns:
      SeqRecord with the aligned sequence, restricted to the alignment
      columns in which the reference has no gap

    Raises:
      ValueError: if the sequence has more than 2 characters other than
                  A, C, G, T and '-', or if require_full_cover is set and
                  the sequence does not cover the region
    '''
    import numpy as np
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from seqanpy import align_overlap, align_local
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    seqstr = ''.join(seq).upper()

    n_amb = len(seqstr) - sum(map(seqstr.count, ('A', 'C', 'G', 'T', '-')))
    if n_amb > 2:
        raise ValueError('Too many ambiguous sites')

    def align_dna(seqstr, refstr, require_full_cover=True):
        '''Align the strings and return both rows spanning the reference'''
        if require_full_cover:
            # Trim the alignment to the columns covered by the reference
            (score, alis, alir) = align_overlap(seqstr, refstr)
            start = len(alir) - len(alir.lstrip('-'))
            end = len(alir.rstrip('-'))
            alist = alis[start: end]
            alirt = alir[start: end]
        else:
            # Locate the locally aligned chunk in the reference via its first
            # and last 50 residues, then pad both rows to the full reference
            (score, alis, alir) = align_local(seqstr, refstr)
            reftrim = alir.replace('-', '')
            start = refstr.find(reftrim[:50])
            end = refstr.rfind(reftrim[-50:]) + len(reftrim[-50:])
            alist = ('N' * start) + alis + ('N' * (len(refstr) - end))
            alirt = refstr[:start] + alir + refstr[end:]

        return (alist, alirt)

    (alis, alir) = align_dna(seqstr, refstr, require_full_cover=require_full_cover)

    if codon_align:
        (alis, alir) = align_codon_pairwise(alis.replace('-', ''), alir.replace('-', ''))

    if require_full_cover:
        # If the sequence is shorter than HXB2, skip
        if '-' in (alis[0], alis[-1]):
            raise ValueError('The sequence does not fully cover the region')

        # If the sequence has too much gapping close to the edges, it's also short
        if (alis[:15].count('-') > 5) or (alis[-15:].count('-') > 5):
            raise ValueError('The sequence does not fully cover the region')

    else:
        # Put N instead of gaps at the edges
        first_nongap = len(alis) - len(alis.lstrip('-'))
        last_nongap = len(alis.rstrip('-')) - 1
        alis = (('N' * first_nongap) +
                alis[first_nongap: last_nongap + 1] +
                ('N' * (len(alis) - 1 - last_nongap)))

    if VERBOSE >= 2:
        # refname was never a parameter nor a local of this function: without
        # a matching module global this branch raised a NameError
        if refname is None:
            refname = globals().get('refname', 'ref')
        pretty_print_pairwise_ali((alis, alir), width=100,
                                  name2=refname, name1=seq.name)

    # Strip gaps in HXB2: keep only the columns where the reference has a
    # residue
    alism = np.fromstring(alis, 'S1')
    alirm = np.fromstring(alir, 'S1')
    ind = (alirm != '-')
    seq_aliref = ''.join(alism[ind])

    rec = SeqRecord(Seq(seq_aliref, seq.seq.alphabet),
                    id=seq.id,
                    name=seq.name,
                    description=seq.description)

    return rec
                # NOTE: Take only the most distant read of a pair
                print irp, dpair

                i = dpair.argmax()
                d = dpair[i]
                edge = edgepair[i]
                seq = seqpair[i]

                (score, ali1, ali2) = align_global(seq, consrec[edge[0]: edge[1]])
                scoremax = 3 * len(ali1)
                delta = scoremax - score
                ali = [ali2, ali1]

                print 'Alignment to its own consensus (delta = '+str(delta)+')'
                pretty_print_pairwise_ali(ali,
                                          'cons',
                                          'read'+str(i+1)+' '+str(edge),
                                          len_name=25, width=90)
                print ''

                # Compare to all consensi and find the closest
                alifr = alis[fragment]
                alifrpw = []
                for cons in alifr:
                    alifrpw.append(align_overlap(cons.seq.ungap('-'), seq))
                scores = map(itemgetter(0), alifrpw)
                indmax = np.argmax(scores)

                alimax = alifrpw[indmax][1:]
                start = len(alimax[1]) - len(alimax[1].lstrip('-'))
                end = len(alimax[1].rstrip('-'))
                alimax = [s[start: end] for s in alimax]
def merge_allele_counts(ref_genomewide, acs, VERBOSE=0):
    '''Merge the allele counts of all fragments
    
    Note: we do not require full coverage of all fragments, the missing
          ones will just have zero counts. Sometimes, cherry-picking the data
          fragment by fragment might be a better choice.
    '''
    from hivwholeseq.utils.miseq import alpha, read_types
    from seqanpy import align_overlap

    ac = np.zeros((len(read_types), len(alpha), len(ref_genomewide)), int)

    pos_ref = 1000
    for (fr, ref, acsi) in acs:

        # Find the coordinates
        (score, ali1, ali2) = align_overlap(
            ref_genomewide[pos_ref - 1000:],
            ref,
            #score_gapopen=-20,
        )
        fr_start = len(ali2) - len(ali2.lstrip('-'))
        fr_end = len(ali2.rstrip('-'))

        if VERBOSE:
            print fr, pos_ref - 1000 + fr_start, pos_ref - 1000 + fr_end

        # Scan the alignment
        pos_ref = pos_ref - 1000 + fr_start
        fr_start_ref = pos_ref
        fr_end_ref = pos_ref + fr_end - fr_start
        pos_fr = 0
        for pos_ali in xrange(fr_start, fr_end):
            # Gap in genomewise, ignore position
            if ali1[pos_ali] == '-':
                pos_fr += 1
                continue

            # Gap in fragment, ignore FIXME: probably we should put deletions
            elif ali2[pos_ali] == '-':
                pos_ref += 1
                continue

            # Add the counts
            # NOTE: all fragments are treated the same, even in case of coverage
            # differences of orders of magnitude. This means, larger coverage
            # always wins. Maybe we want to implement this somewhat differently
            ac[:, :, pos_ref] += acsi[:, :, pos_fr]
            pos_fr += 1
            pos_ref += 1

        if VERBOSE >= 3:
            from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
            cons = alpha[ac.sum(axis=0).argmax(axis=0)]
            pretty_print_pairwise_ali(
                (ali1[fr_start:fr_end], cons[fr_start:fr_end]),
                name1='gw',
                name2=fr,
                width=100)

    return ac
def filter_contamination(
    bamfilename,
    bamfilename_out,
    contseqs,
    samplename,
    VERBOSE=0,
    deltascore_max_self=60,
    deltascore_max_other=24,
    maxreads=-1,
    **kwargs
):
    """Fish contaminated reads from mapped reads

    The function checks for a maximal distance to the expected consensus, and only
    if it's more than that it checks all other samples.

    Read pairs without any contaminated read are written to bamfilename_out. Pairs
    with a read attributed to another sample are written to a second file, named
    like bamfilename_out with its last four characters replaced by "_trashed.bam".

    Args:
      bamfilename: name of the input BAM file
      bamfilename_out: name of the output BAM file for the accepted pairs
      contseqs: mapping of sample name to consensus sequence; must contain
                samplename. It is copied, the caller's object is not modified
      samplename: name of the sample the reads are expected to come from
      VERBOSE (int): verbosity level
      deltascore_max_self (int): the maximal delta in alignment score to the
                                 consensus to be considered pure
      deltascore_max_other (int): the maximal delta in alignment score to any other
                                  sample to be considered a contamination
      maxreads (int): stop once this many read pairs have been processed; the
                      default -1 never equals a pair index, so all pairs are read
      **kwargs: passed down to the pairwise alignment function. "score_match" is
                also read here (default 3) to compute the maximal possible score

    Returns:
      (n_good, n_cont): the number of accepted pairs and a dict mapping the name
      of each contaminating sample to the number of pairs trashed because of it
    """
    import pysam
    from collections import defaultdict
    from operator import itemgetter
    from seqanpy import align_overlap

    from hivwholeseq.utils.mapping import pair_generator, get_number_reads

    # NOTE(review): sys, trim_align_overlap and pretty_print_pairwise_ali are used
    # below but not imported here, presumably they exist at module level - confirm

    # Score of a single match, needed to compute the score of a perfect alignment
    if "score_match" in kwargs:
        score_match = kwargs["score_match"]
    else:
        score_match = 3

    bamfilename_trash = bamfilename_out[:-4] + "_trashed.bam"

    # Work on a copy and split off the sample's own consensus: after this,
    # contseqs only holds the potential contamination sources
    contseqs = contseqs.copy()
    consseq = contseqs.pop(samplename)

    if VERBOSE >= 2:
        print "Scanning reads (" + str(get_number_reads(bamfilename) // 2) + ")"

    with pysam.Samfile(bamfilename, "rb") as bamfile:
        with pysam.Samfile(bamfilename_out, "wb", template=bamfile) as bamfileout, pysam.Samfile(
            bamfilename_trash, "wb", template=bamfile
        ) as bamfiletrash:
            n_good = 0
            n_cont = defaultdict(int)

            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                # Progress counter every 100 pairs; the escape sequence moves the
                # cursor one line up so that the previous number is overwritten
                if VERBOSE >= 2:
                    if not ((irp + 1) % 100):
                        if not ((irp + 1) == 100):
                            sys.stdout.write("\x1b[1A")
                        print irp + 1

                # The else clause of this loop runs only if no read of the pair
                # triggered the break in the contamination branch
                for read in reads:

                    # Look for distance to the own consensus, it that's small move on
                    alignments_read = {}
                    deltas_read = {}
                    (score, alis1, alis2) = align_overlap(consseq, read.seq, **kwargs)
                    (alis1, alis2) = trim_align_overlap((alis1, alis2))
                    # Delta: how far the score is from that of a perfect match
                    # along the whole trimmed alignment
                    scoremax = len(alis1) * score_match
                    delta_read = scoremax - score
                    deltas_read[samplename] = delta_read
                    alignments_read[samplename] = (alis1, alis2)
                    if delta_read <= deltascore_max_self:
                        if VERBOSE >= 4:
                            print "Read is very close to its own consensus", scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90, name1="ref", name2="read")
                        continue

                    # Otherwise, move on to all other sequences and find the neighbour
                    for contname, contseq in contseqs.iteritems():
                        (score, ali1, ali2) = align_overlap(contseq, read.seq, **kwargs)
                        (ali1, ali2) = trim_align_overlap((ali1, ali2))
                        scoremax = len(ali1) * score_match
                        delta_read = scoremax - score
                        deltas_read[contname] = delta_read
                        alignments_read[contname] = (ali1, ali2)

                    if VERBOSE >= 5:
                        print samplename
                        for key, d in deltas_read.iteritems():
                            print key, d

                    # Closest consensus among all samples, the own one included
                    (contname, delta_read) = min(deltas_read.iteritems(), key=itemgetter(1))

                    # Again, the correct consensus has precedence
                    if deltas_read[samplename] == delta_read:
                        contname = samplename

                    (ali1, ali2) = alignments_read[contname]

                    # NOTE: in the messages below, scoremax and score still hold the
                    # values of the last alignment computed above, which is not
                    # necessarily the one of the closest consensus

                    # The read may be closest to its own consensus, if not very close
                    if contname == samplename:
                        if VERBOSE >= 4:
                            print "Read is closest to its consensus", scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                    # The read may come from another consensus (contamination)
                    elif delta_read <= deltascore_max_other:
                        # One contaminated read is enough to trash the whole pair
                        n_cont[contname] += 1
                        bamfiletrash.write(reads[0])
                        bamfiletrash.write(reads[1])

                        if VERBOSE >= 2:
                            print "Contaminated read found! Good:", n_good, "cont:", sum(
                                n_cont.itervalues()
                            ), "sources:", n_cont

                        if VERBOSE >= 3:
                            print "Read is contaminated by", contname, scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90, name1="self", name2="read")
                            print ""
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                        if VERBOSE >= 2:
                            print ""

                        # Skip the remaining read of the pair and the else clause
                        break

                    # Finally, the read is not really close to anything: accept
                    else:
                        if VERBOSE >= 4:
                            print "Read is close to nothing really", scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                else:
                    # No read of the pair was flagged: keep the pair
                    n_good += 1
                    bamfileout.write(reads[0])
                    bamfileout.write(reads[1])

    # Return a plain dict rather than the defaultdict
    n_cont = dict(n_cont)

    return (n_good, n_cont)
def merge_fragments(sequences, name='', VERBOSE=0):
    '''Merge references at overlapping pairs

    The fragments F1 ... F6 are joined in order. Each fragment is aligned to
    the unused tail of the previous one; within an overlap, mismatches in the
    first third follow the first sequence, mismatches in the last third the
    second one, and mismatches in the middle third become N.

    Parameters:
      sequences: mapping with the keys 'F1' ... 'F6'; each value is joined
                 into a string
      name: id and name of the returned record, also used in its description
      VERBOSE: verbosity level, the pairwise alignments are printed from 3 on

    Returns:
      SeqRecord with the genomewide consensus
    '''
    import numpy as np
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.Alphabet.IUPAC import ambiguous_dna
    from seqanpy import align_ladder

    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    consensus = []
    seq_old = ''.join(sequences['F1'])
    for i in xrange(5):
        seq_new = ''.join(sequences['F'+str(i+2)])
        (score, ali1, ali2) = align_ladder(seq_old, seq_new, score_gapopen=-10)

        if VERBOSE >= 3:
            pretty_print_pairwise_ali([ali1, ali2], name1='F'+str(i+1), name2='F'+str(i+2))

        # Overlap: the first sequence is better at the start, the second at the end
        end1 = len(ali1.rstrip('-'))
        start2 = len(ali2) - len(ali2.lstrip('-'))
        len_overlap = end1 - start2

        # There might a too short consensus, just join them with N
        if len_overlap < 50:
            consensus.append(seq_old)
            consensus.append('N' * 10)
            if i == 4:
                consensus.append(seq_new)
            else:
                seq_old = seq_new

            continue

        overlap1 = np.fromstring(ali1[start2: end1], 'S1')
        overlap2 = np.fromstring(ali2[start2: end1], 'S1')

        # Start from the first sequence and fix up the mismatching columns
        # depending on which third of the overlap they fall in
        overlap = overlap1.copy()
        ind_overlap_mismatch = (overlap1 != overlap2).nonzero()[0]
        for j in ind_overlap_mismatch:
            if j < len(overlap) // 3:
                continue
            elif j < 2 * len(overlap) // 3:
                overlap[j] = 'N'
            else:
                overlap[j] = overlap2[j]
        overlap = overlap.tostring()

        consensus.append(ali1[:start2])
        consensus.append(overlap)
        if i == 4:
            consensus.append(ali2[end1:])
        else:
            # The tail of the new fragment is what the next one is aligned to
            seq_old = ali2[end1:].replace('-', '')

    consensus = ''.join(consensus)
    # Use the alphabet imported above: the name IUPAC is not imported here
    cons_rec = SeqRecord(Seq(consensus, ambiguous_dna),
                         id=name, name=name,
                         description=name+', genomewide')

    return cons_rec
Exemple #18
0
def merge_fragments(sequences, name='', VERBOSE=0):
    '''Merge references at overlapping pairs

    The fragments F1 ... F6 are joined in order. Each fragment is aligned to
    the unused tail of the previous one; within an overlap, mismatches in the
    first third follow the first sequence, mismatches in the last third the
    second one, and mismatches in the middle third become N.

    Parameters:
      sequences: mapping with the keys 'F1' ... 'F6'; each value is joined
                 into a string
      name: id and name of the returned record, also used in its description
      VERBOSE: verbosity level, the pairwise alignments are printed from 3 on

    Returns:
      SeqRecord with the genomewide consensus
    '''
    import numpy as np
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.Alphabet.IUPAC import ambiguous_dna
    from seqanpy import align_ladder

    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    consensus = []
    seq_old = ''.join(sequences['F1'])
    for i in xrange(5):
        seq_new = ''.join(sequences['F' + str(i + 2)])
        (score, ali1, ali2) = align_ladder(seq_old, seq_new, score_gapopen=-10)

        if VERBOSE >= 3:
            pretty_print_pairwise_ali([ali1, ali2],
                                      name1='F' + str(i + 1),
                                      name2='F' + str(i + 2))

        # Overlap: the first sequence is better at the start, the second at the end
        end1 = len(ali1.rstrip('-'))
        start2 = len(ali2) - len(ali2.lstrip('-'))
        len_overlap = end1 - start2

        # There might a too short consensus, just join them with N
        if len_overlap < 50:
            consensus.append(seq_old)
            consensus.append('N' * 10)
            if i == 4:
                consensus.append(seq_new)
            else:
                seq_old = seq_new

            continue

        overlap1 = np.fromstring(ali1[start2:end1], 'S1')
        overlap2 = np.fromstring(ali2[start2:end1], 'S1')

        # Start from the first sequence and fix up the mismatching columns
        # depending on which third of the overlap they fall in
        overlap = overlap1.copy()
        ind_overlap_mismatch = (overlap1 != overlap2).nonzero()[0]
        for j in ind_overlap_mismatch:
            if j < len(overlap) // 3:
                continue
            elif j < 2 * len(overlap) // 3:
                overlap[j] = 'N'
            else:
                overlap[j] = overlap2[j]
        overlap = overlap.tostring()

        consensus.append(ali1[:start2])
        consensus.append(overlap)
        if i == 4:
            consensus.append(ali2[end1:])
        else:
            # The tail of the new fragment is what the next one is aligned to
            seq_old = ali2[end1:].replace('-', '')

    consensus = ''.join(consensus)
    # Use the alphabet imported above: the name IUPAC is not imported here
    cons_rec = SeqRecord(Seq(consensus, ambiguous_dna),
                         id=name,
                         name=name,
                         description=name + ', genomewide')

    return cons_rec
def align_to_reference(seq,
                       refstr,
                       VERBOSE=0,
                       codon_align=False,
                       require_full_cover=True,
                       refname=None):
    '''Align sequence to reference, stripping reference gaps

    Parameters
      seq: sequence record to align. Its letters are joined and uppercased for
           the alignment; its id, name, description and alphabet are copied
           onto the returned record.
      refstr (str): the reference sequence
      VERBOSE (int): from 2 on, the pairwise alignment is printed
      codon_align (bool): if True, the ungapped pair is realigned with
           align_codon_pairwise
      require_full_cover (bool): if True, use an overlap alignment trimmed to
           the span of the reference and raise if the sequence has gaps at (or
           too many gaps near) the edges. If False, use a local alignment and
           pad the parts of the reference that are not covered with N.
      refname (str or None): label of the reference in the verbose printout.
           If None, a module-level `refname` is used when there is one,
           otherwise 'reference'.

    Returns
      SeqRecord with the aligned sequence, restricted to the alignment columns
      in which the reference has no gap

    Raises
      ValueError: if the sequence has more than 2 ambiguous sites, or if
                  require_full_cover is set and the region is not covered
    '''
    seqstr = ''.join(seq).upper()

    # Validate before importing the alignment machinery, so bad input fails fast
    n_amb = len(seqstr) - sum(map(seqstr.count, ('A', 'C', 'G', 'T', '-')))
    if n_amb > 2:
        raise ValueError('Too many ambiguous sites')

    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from seqanpy import align_overlap, align_local
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    def align_dna(seqstr, refstr, require_full_cover=True):
        if require_full_cover:
            # Keep only the alignment columns spanned by the reference
            (score, alis, alir) = align_overlap(seqstr, refstr)
            start = len(alir) - len(alir.lstrip('-'))
            end = len(alir.rstrip('-'))
            alist = alis[start:end]
            alirt = alir[start:end]
        else:
            # Locate the locally aligned chunk within the reference via its
            # first/last 50 ungapped letters, then pad both sides: N for the
            # sequence, the reference itself for the reference
            (score, alis, alir) = align_local(seqstr, refstr)
            reftrim = alir.replace('-', '')
            start = refstr.find(reftrim[:50])
            end = refstr.rfind(reftrim[-50:]) + len(reftrim[-50:])
            alist = ('N' * start) + alis + ('N' * (len(refstr) - end))
            alirt = refstr[:start] + alir + refstr[end:]

        return (alist, alirt)

    (alis, alir) = align_dna(seqstr,
                             refstr,
                             require_full_cover=require_full_cover)

    if codon_align:
        (alis, alir) = align_codon_pairwise(alis.replace('-', ''),
                                            alir.replace('-', ''))

    if require_full_cover:
        # If the sequence is shorter than the reference, skip. An empty
        # alignment covers nothing, so it is treated the same way.
        if (not alis) or ('-' in (alis[0], alis[-1])):
            raise ValueError('The sequence does not fully cover the region')

        # If the sequence has too much gapping close to the edges, it's also short
        if (alis[:15].count('-') > 5) or (alis[-15:].count('-') > 5):
            raise ValueError('The sequence does not fully cover the region')

    else:
        # Put N instead of gaps at the edges. The trailing pad is derived from
        # the stripped core, so an all-gap sequence keeps its length.
        n_lead = len(alis) - len(alis.lstrip('-'))
        core = alis.strip('-')
        n_trail = len(alis) - n_lead - len(core)
        alis = ('N' * n_lead) + core + ('N' * n_trail)

    if VERBOSE >= 2:
        if refname is None:
            # Backward compatible: honour a module-level refname if present
            refname = globals().get('refname', 'reference')
        pretty_print_pairwise_ali((alis, alir),
                                  width=100,
                                  name2=refname,
                                  name1=seq.name)

    # Strip the columns that are gaps in the reference
    seq_aliref = ''.join(s for (s, r) in zip(alis, alir) if r != '-')

    rec = SeqRecord(Seq(seq_aliref, seq.seq.alphabet),
                    id=seq.id,
                    name=seq.name,
                    description=seq.description)

    return rec
def filter_contamination(bamfilename,
                         bamfilename_out,
                         contseqs,
                         samplename,
                         VERBOSE=0,
                         deltascore_max_self=60,
                         deltascore_max_other=24,
                         maxreads=-1,
                         **kwargs):
    '''Fish contaminated reads from mapped reads

    The function checks for a maximal distance to the expected consensus, and only
    if it's more than that it checks all other samples.
    
    Args:
      deltascore_max_self (int): the maximal delta in alignment score to the 
                                 consensus to be considered pure
      deltascore_max_other (int): the maximal delta in alignment score to any other
                                  sample to be considered a contamination
      **kwargs: passed down to the pairwise alignment function
    '''
    import pysam
    from collections import defaultdict
    from operator import itemgetter
    from seqanpy import align_overlap

    from hivwholeseq.utils.mapping import pair_generator, get_number_reads

    if 'score_match' in kwargs:
        score_match = kwargs['score_match']
    else:
        score_match = 3

    bamfilename_trash = bamfilename_out[:-4] + '_trashed.bam'

    contseqs = contseqs.copy()
    consseq = contseqs.pop(samplename)

    if VERBOSE >= 2:
        print 'Scanning reads (' + str(
            get_number_reads(bamfilename) // 2) + ')'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(bamfilename_out, 'wb', template=bamfile) as bamfileout, \
             pysam.Samfile(bamfilename_trash, 'wb', template=bamfile) as bamfiletrash:
            n_good = 0
            n_cont = defaultdict(int)

            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                if VERBOSE >= 2:
                    if not ((irp + 1) % 100):
                        if not ((irp + 1) == 100):
                            sys.stdout.write('\x1b[1A')
                        print irp + 1

                for read in reads:

                    # Look for distance to the own consensus, it that's small move on
                    alignments_read = {}
                    deltas_read = {}
                    (score, alis1,
                     alis2) = align_overlap(consseq, read.seq, **kwargs)
                    (alis1, alis2) = trim_align_overlap((alis1, alis2))
                    scoremax = len(alis1) * score_match
                    delta_read = scoremax - score
                    deltas_read[samplename] = delta_read
                    alignments_read[samplename] = (alis1, alis2)
                    if delta_read <= deltascore_max_self:
                        if VERBOSE >= 4:
                            print 'Read is very close to its own consensus', scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')
                        continue

                    # Otherwise, move on to all other sequences and find the neighbour
                    for contname, contseq in contseqs.iteritems():
                        (score, ali1,
                         ali2) = align_overlap(contseq, read.seq, **kwargs)
                        (ali1, ali2) = trim_align_overlap((ali1, ali2))
                        scoremax = len(ali1) * score_match
                        delta_read = scoremax - score
                        deltas_read[contname] = delta_read
                        alignments_read[contname] = (ali1, ali2)

                    if VERBOSE >= 5:
                        print samplename
                        for key, d in deltas_read.iteritems():
                            print key, d

                    (contname, delta_read) = min(deltas_read.iteritems(),
                                                 key=itemgetter(1))

                    # Again, the correct consensus has precedence
                    if deltas_read[samplename] == delta_read:
                        contname = samplename

                    (ali1, ali2) = alignments_read[contname]

                    # The read may be closest to its own consensus, if not very close
                    if contname == samplename:
                        if VERBOSE >= 4:
                            print 'Read is closest to its consensus', scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')

                    # The read may come from another consensus (contamination)
                    elif (delta_read <= deltascore_max_other):
                        n_cont[contname] += 1
                        bamfiletrash.write(reads[0])
                        bamfiletrash.write(reads[1])

                        if VERBOSE >= 2:
                            print 'Contaminated read found! Good:', n_good, 'cont:', sum(
                                n_cont.itervalues()), 'sources:', n_cont

                        if VERBOSE >= 3:
                            print 'Read is contaminated by', contname, scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2],
                                                      width=90,
                                                      name1='self',
                                                      name2='read')
                            print ''
                            pretty_print_pairwise_ali([ali1, ali2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')

                        if VERBOSE >= 2:
                            print ''

                        break

                    # Finally, the read is not really close to anything: accept
                    else:
                        if VERBOSE >= 4:
                            print 'Read is close to nothing really', scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')

                else:
                    n_good += 1
                    bamfileout.write(reads[0])
                    bamfileout.write(reads[1])

    n_cont = dict(n_cont)

    return (n_good, n_cont)

        if do_genomewide:
            seqs = [patient.get_reference('F'+str(i)) for i in xrange(1, 7)]
            seq = merge_sequences_fragments(seqs, VERBOSE=VERBOSE)

            seq = SeqRecord(Seq(seq, ambiguous_dna),
                            id=pname+'_genomewide',
                            name=pname+'_genomewide',
                            description='Genomewide reference for patient '+pname)

            ref = patient.get_reference('genomewide')

            if VERBOSE >= 2:
                from seqanpy import align_global
                from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
                (score, ali1, ali2) = align_global(ref, seq, score_gapopen=-20)
                pretty_print_pairwise_ali((ali1, ali2),
                                          name1='Old ref', name2='New ref',
                                          width=100)

            # TODO: resplit sequences to make sure we cover the whole F5a, F3c,
            # etc. THIS CHANGES THE COORDINATES!

            if use_save:
                fn = patient.get_reference_filename('genomewide', 'fasta')
                fn_old = fn.replace('.fasta', '_old.fasta')
                save_protect(fn, fn_old, VERBOSE=VERBOSE)