def get_map_overlap(sample, fr1, fr2):
    '''Get a coordinate map of the overlap between the two fragments

    Parameters:
       sample: object providing get_reference(fragment)
       fr1: first fragment, its reference is the first sequence aligned
       fr2: second fragment, its reference is the second sequence aligned

    Returns:
       integer numpy array built from the list of (pos1, pos2) pairs, one
       pair for every overlap column in which neither reference has a gap.
    '''
    import numpy as np
    from seqanpy import align_ladder

    ref1 = sample.get_reference(fr1)
    ref2 = sample.get_reference(fr2)
    (_, row1, row2) = align_ladder(ref1, ref2, score_gapopen=-20)

    # The overlap runs from the end of the leading gaps of the second row
    # to the start of the trailing gaps of the first row
    ov_start = len(row2) - len(row2.lstrip('-'))
    ov_end = len(row1.rstrip('-'))

    pairs = []
    coord1 = ov_start
    coord2 = 0
    for (base1, base2) in zip(row1[ov_start:ov_end], row2[ov_start:ov_end]):
        has1 = base1 != '-'
        has2 = base2 != '-'
        if has1 and has2:
            pairs.append((coord1, coord2))

        # Each coordinate advances only on a non-gap character of its row
        coord1 += has1
        coord2 += has2

    return np.array(pairs, int)
def join_block_to_consensus(consensus, cons_block, VERBOSE=0, deltamax=60):
    '''Join a new block to an extant consensus

    Parameters:
       consensus (str): consensus built so far
       cons_block (str): new block to glue to the end of the consensus
       VERBOSE (int): at 2 or above warn when the block is skipped, at 3 or
       above print the pairwise alignment
       deltamax (int): maximal tolerated difference between the reference
       score of the overlap and the alignment score

    Returns:
       the extended consensus, or the input consensus unchanged if the
       alignment of the new block ends with a gap.

    Raises:
       ValueError: if the score difference exceeds deltamax
    '''
    from seqanpy import align_ladder

    (score, ali1, ali2) = align_ladder(consensus, cons_block, score_gapopen=-10)

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([ali1, ali2], name1='consensus', name2='new block')

    # In very rare occasions (coverage holes), the second sequence is actually
    # shorter than the first, then we do not need to glue it in.
    # endswith is used instead of ali2[-1] so an empty alignment cannot raise
    if ali2.endswith('-'):
        if VERBOSE >= 2:
            print('WARNING: the old block is longer than the new one (maybe low coverage)')
        return consensus

    end1 = len(ali1.rstrip('-'))
    start2 = len(ali2) - len(ali2.lstrip('-'))

    # Reference score: 3 points per overlap column (presumably the match
    # score of align_ladder - TODO confirm)
    scoremax = 3 * (end1 - start2)
    delta = scoremax - score
    if delta > deltamax:
        raise ValueError('Too many mismatches in neighbouring local consensi! ('+str(delta)+', max '+str(deltamax)+')')

    # Old consensus up to the overlap, new block from there on, gaps dropped
    return (ali1[:start2] + ali2[start2:]).replace('-', '')
def align_fragments(c1, c2, VERBOSE=0):
    '''Align subsequence fragments and map the coordinates of their overlap

    Parameters:
       c1 (str): first sequence
       c2 (str): second sequence
       VERBOSE (int): at 3 or above, print the alignment of the overlap

    Returns:
       (pos1, pos2): two integer arrays of equal length, one entry for every
       overlap column in which neither sequence has a gap. pos2 counts
       non-gap characters of c2 from the start of the overlap; pos1 does the
       same for c1 and is offset by the number of alignment columns that
       precede the overlap.
    '''
    import numpy as np
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    (score, a1, a2) = align_ladder(c1, c2, score_gapopen=-20)
    start2 = len(a2) - len(a2.lstrip('-'))
    end1 = len(a1.rstrip('-'))

    # Restrict both rows to the overlap
    a1 = a1[start2: end1]
    a2 = a2[start2: end1]

    if VERBOSE >= 3:
        # The function has no fragment names to show (fr1/fr2 were undefined
        # here), so use generic labels
        pretty_print_pairwise_ali((a1, a2), width=100,
                                  name1='seq1', name2='seq2')

    a1 = np.fromstring(a1, 'S1')
    a2 = np.fromstring(a2, 'S1')

    # Running count of non-gap characters, zero-based
    co1 = (a1 != '-').cumsum() - 1
    co2 = (a2 != '-').cumsum() - 1

    # Columns where both sequences have a base
    ind = (a1 != '-') & (a2 != '-')

    pos1 = co1[ind] + start2
    pos2 = co2[ind]

    return (pos1, pos2)
Beispiel #4
0
def get_map_overlap(sample, fr1, fr2):
    '''Get a coordinate map of the overlap between the two fragments

    Parameters:
       sample: object providing get_reference(fragment)
       fr1: first fragment, its reference is the first sequence aligned
       fr2: second fragment, its reference is the second sequence aligned

    Returns:
       integer numpy array built from the list of (pos1, pos2) pairs, one
       pair for every overlap column in which neither reference has a gap.
    '''
    import numpy as np
    from seqanpy import align_ladder

    ref1 = sample.get_reference(fr1)
    ref2 = sample.get_reference(fr2)
    (_, row1, row2) = align_ladder(ref1, ref2, score_gapopen=-20)

    # Overlap boundaries: after the leading gaps of the second row, before
    # the trailing gaps of the first row
    first_col = len(row2) - len(row2.lstrip('-'))
    last_col = len(row1.rstrip('-'))

    coords = []
    n1 = first_col
    n2 = 0
    columns = zip(row1[first_col:last_col], row2[first_col:last_col])
    for (char1, char2) in columns:
        gap1 = char1 == '-'
        gap2 = char2 == '-'
        if not (gap1 or gap2):
            coords.append((n1, n2))

        # Advance each counter only when its row contributes a character
        if not gap1:
            n1 += 1
        if not gap2:
            n2 += 1

    return np.array(coords, int)
Beispiel #5
0
def join_block_to_consensus(consensus, cons_block, VERBOSE=0, deltamax=60):
    '''Join a new block to an extant consensus

    Parameters:
       consensus (str): consensus built so far
       cons_block (str): new block to glue to the end of the consensus
       VERBOSE (int): at 2 or above warn when the block is skipped, at 3 or
       above print the pairwise alignment
       deltamax (int): maximal tolerated difference between the reference
       score of the overlap and the alignment score

    Returns:
       the extended consensus, or the input consensus unchanged if the
       alignment of the new block ends with a gap.

    Raises:
       ValueError: if the score difference exceeds deltamax
    '''
    from seqanpy import align_ladder

    (score, ali1, ali2) = align_ladder(consensus,
                                       cons_block,
                                       score_gapopen=-10)

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([ali1, ali2],
                                  name1='consensus',
                                  name2='new block')

    # In very rare occasions (coverage holes), the second sequence is actually
    # shorter than the first, then we do not need to glue it in.
    # endswith is used instead of ali2[-1] so an empty alignment cannot raise
    if ali2.endswith('-'):
        if VERBOSE >= 2:
            print('WARNING: the old block is longer than the new one (maybe low coverage)')
        return consensus

    end1 = len(ali1.rstrip('-'))
    start2 = len(ali2) - len(ali2.lstrip('-'))

    # Reference score: 3 points per overlap column (presumably the match
    # score of align_ladder - TODO confirm)
    scoremax = 3 * (end1 - start2)
    delta = scoremax - score
    if delta > deltamax:
        raise ValueError(
            'Too many mismatches in neighbouring local consensi! (' +
            str(delta) + ', max ' + str(deltamax) + ')')

    # Old consensus up to the overlap, new block from there on, gaps dropped
    return (ali1[:start2] + ali2[start2:]).replace('-', '')
Beispiel #6
0
def merge_sequences(seqs, skip_initial=30, accept_gaps=False, VERBOSE=0):
    '''Merge sequences with overlaps

    Parameters:
       seqs (list): sequences to merge, each a string or an iterable of
       characters
       skip_initial (int): trim from the beginning of overlaps because we do not
       really trust those bases
       accept_gaps (bool): accept gaps in the overlaps
       VERBOSE (int): at 1 or above print progress, at 2 or above also print
       the alignment of each trimmed overlap

    Returns:
       the merged sequence as a string. With no input sequence this is the
       empty string, with a single one it is that sequence.

    Raises:
       ValueError: if a trimmed overlap contains gaps and accept_gaps is False
    '''
    seqs = [''.join(seq) for seq in seqs]

    # With fewer than two sequences there is no overlap to resolve (and the
    # final step below would have no trailing sequence to append)
    if len(seqs) < 2:
        return ''.join(seqs)

    from seqanpy import align_ladder

    left_trim = 0
    seqs_all = []
    for iov, (seq1, seq2) in enumerate(zip(seqs[:-1], seqs[1:])):
        if VERBOSE >= 1:
            print('Overlap n ' + str(iov + 1))

        # The leading sequence is cut where the previous overlap ended
        (score, ali1, ali2) = align_ladder(seq1[left_trim:], seq2, score_gapopen=-20)
        start2 = len(ali2) - len(ali2.lstrip('-'))
        end1 = len(ali1.rstrip('-'))

        # Append first sequence until overlap
        seqs_all.append(ali1[:start2 + skip_initial])

        # Check overlap
        ov_start = start2 + skip_initial
        ov_end = end1 - skip_initial
        ov1 = ali1[ov_start: ov_end]
        ov2 = ali2[ov_start: ov_end]

        if VERBOSE >= 2:
            from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
            pretty_print_pairwise_ali((ov1, ov2), width=100,
                                      name1='seq1', name2='seq2')

        if (not accept_gaps) and (('-' in ov1) or ('-' in ov2)):
            raise ValueError('Gaps in the overlap n. '+str(iov+1))

        # Trust the first sequence until half, then the other one
        i_mid = len(ov1) // 2
        seqs_all.append(ov1[:i_mid])
        seqs_all.append(ov2[i_mid:])

        # Set the left trim for the trailing sequence: the number of its
        # bases already consumed up to the end of the trimmed overlap
        left_trim = len(ali2[:ov_end].replace('-', ''))

    if VERBOSE >= 1:
        print('Add last sequence')
    seqs_all.append(seq2[left_trim:])

    return ''.join(seqs_all)
def merge_read_pair(seq1, seq2):
    '''Merge two reads of a pair, assuming the second starts later

    Parameters:
       seq1 (str): first read
       seq2 (str): second read

    Returns:
       the first alignment row up to the overlap, then the overlap, then the
       second alignment row after the overlap. Overlap columns in which the
       two rows disagree are set to 'N'.
    '''
    from seqanpy import align_ladder

    (score, ali1, ali2) = align_ladder(seq1, seq2, score_gapopen=-20)
    end1 = len(ali1.rstrip('-'))
    start2 = len(ali2) - len(ali2.lstrip('-'))

    # Compare the two rows column by column. Plain string handling is enough
    # here, so the function does not depend on numpy (which it never imported)
    overlap = ''.join([char1 if char1 == char2 else 'N'
                       for (char1, char2) in zip(ali1[start2: end1],
                                                 ali2[start2: end1])])

    seq = ali1[:start2] + overlap + ali2[end1:]
    return seq
def merge_read_pair(seq1, seq2):
    '''Merge two reads of a pair, assuming the second starts later

    Parameters:
       seq1 (str): first read
       seq2 (str): second read

    Returns:
       the first alignment row up to the overlap, then the overlap, then the
       second alignment row after the overlap. Overlap columns in which the
       two rows disagree are set to 'N'.
    '''
    from seqanpy import align_ladder

    (score, ali1, ali2) = align_ladder(seq1, seq2, score_gapopen=-20)
    end1 = len(ali1.rstrip('-'))
    start2 = len(ali2) - len(ali2.lstrip('-'))

    # Compare the two rows column by column. Plain string handling is enough
    # here, so the function does not depend on numpy (which it never imported)
    columns = zip(ali1[start2:end1], ali2[start2:end1])
    overlap = ''.join([char1 if char1 == char2 else 'N'
                       for (char1, char2) in columns])

    seq = ali1[:start2] + overlap + ali2[end1:]
    return seq
Beispiel #9
0
def check_reference_overlap(p, VERBOSE=0):
    '''Check whether the reference from the various fragments overlap correctly

    The references of consecutive fragments (F1 to F6) are aligned pairwise.
    An overlap gets status 'GAPS' if the two aligned references contain a
    different number of gap characters within it, 'OK' otherwise. A single
    summary line with all statuses is printed.

    Parameters:
       p: object providing get_reference(fragment_name)
       VERBOSE (int): at 4 or above, print the alignment of each overlap

    Raises:
       ValueError: if at least one overlap has status 'GAPS'

    NOTE: title_len and cell_len are not defined in this function and are
    expected to come from the enclosing module.
    '''
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    fragments = ['F' + str(i + 1) for i in range(6)]
    title = 'Overlaps'
    line = ('{:<' + str(title_len) + '}').format(title + ':')
    stati = []
    for (frag1, frag2) in zip(fragments[:-1], fragments[1:]):
        ref1 = p.get_reference(frag1)
        ref2 = p.get_reference(frag2)
        (score, ali1, ali2) = align_ladder(ref1,
                                           ref2,
                                           score_gapopen=-10,
                                           score_gapext=-1)

        start2 = len(ali2) - len(ali2.lstrip('-'))
        end1 = len(ali1.rstrip('-'))
        ov1 = ali1[start2:end1]
        ov2 = ali2[start2:end1]

        if VERBOSE >= 4:
            pretty_print_pairwise_ali((ov1, ov2),
                                      name1=frag1,
                                      name2=frag2,
                                      width=100)

        # No debugger breakpoint here: a failed check is reported through the
        # summary line and the exception below, so batch runs do not hang
        if ov1.count('-') == ov2.count('-'):
            status = 'OK'
        else:
            status = 'GAPS'

        # Right-align the status in what is left of the cell after the name
        cell_fmt = '{:>' + str(cell_len - len(frag1) - 1) + '}'
        line = line + frag1 + ': ' + cell_fmt.format(status) + '  '
        stati.append(status)

    print(line)

    if 'GAPS' in stati:
        raise ValueError('GAPS status found')
def check_reference_overlap(p, VERBOSE=0):
    '''Check whether the reference from the various fragments overlap correctly

    The references of consecutive fragments (F1 to F6) are aligned pairwise.
    An overlap gets status 'GAPS' if the two aligned references contain a
    different number of gap characters within it, 'OK' otherwise. A single
    summary line with all statuses is printed.

    Parameters:
       p: object providing get_reference(fragment_name)
       VERBOSE (int): at 4 or above, print the alignment of each overlap

    Raises:
       ValueError: if at least one overlap has status 'GAPS'

    NOTE: title_len and cell_len are not defined in this function and are
    expected to come from the enclosing module.
    '''
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    fragments = ['F'+str(i+1) for i in range(6)]
    title = 'Overlaps'
    line = ('{:<'+str(title_len)+'}').format(title+':')
    stati = []
    for (frag1, frag2) in zip(fragments[:-1], fragments[1:]):
        ref1 = p.get_reference(frag1)
        ref2 = p.get_reference(frag2)
        (score, ali1, ali2) = align_ladder(ref1, ref2,
                                           score_gapopen=-10,
                                           score_gapext=-1)

        start2 = len(ali2) - len(ali2.lstrip('-'))
        end1 = len(ali1.rstrip('-'))
        ov1 = ali1[start2: end1]
        ov2 = ali2[start2: end1]

        if VERBOSE >= 4:
            pretty_print_pairwise_ali((ov1, ov2),
                                      name1=frag1,
                                      name2=frag2,
                                      width=100)

        # No debugger breakpoint here: a failed check is reported through the
        # summary line and the exception below, so batch runs do not hang
        if ov1.count('-') == ov2.count('-'):
            status = 'OK'
        else:
            status = 'GAPS'

        # Right-align the status in what is left of the cell after the name
        cell_fmt = '{:>'+str(cell_len - len(frag1) - 1)+'}'
        line = line+frag1+': '+cell_fmt.format(status)+'  '
        stati.append(status)

    print(line)

    if 'GAPS' in stati:
        raise ValueError('GAPS status found')
def merge_sequences_fragments(seqs, VERBOSE=0):
    '''Merge sequences from consecutive fragments

    Parameters:
       seqs (list): sequences to merge, each a string or an iterable of
       characters
       VERBOSE (int): not used by this function

    Returns:
       the merged sequence as a string. With no input sequence this is the
       empty string, with a single one it is that sequence.
    '''
    seqs = [''.join(seq) for seq in seqs]

    # Nothing to align with fewer than two sequences
    if len(seqs) < 2:
        return ''.join(seqs)

    from seqanpy import align_ladder

    pieces = [seqs[0]]
    for seq2 in seqs[1:]:
        # Only the last piece, i.e. what is left of the previous sequence,
        # is aligned against the next one
        seq1 = pieces[-1]
        (score, ali1, ali2) = align_ladder(seq1, seq2, score_gapopen=-20)
        start2 = len(ali2) - len(ali2.lstrip('-'))
        end1 = len(ali1.rstrip('-'))
        len_overlap = end1 - start2

        # Trust the first sequence in the first half, the second in the second.
        # Floor division keeps the slice index an integer
        i_mid = start2 + len_overlap // 2
        overlap = ali1[start2: i_mid] + ali2[i_mid: end1]

        pieces[-1] = ali1[:start2]
        pieces.append(overlap)
        pieces.append(ali2[end1:])

    return ''.join(pieces)
def merge_fragments(sequences, name='', VERBOSE=0):
    '''Merge references at overlapping pairs

    Parameters:
       sequences (dict): sequences of the six fragments under the keys 'F1'
       to 'F6', each a string or an iterable of characters
       name (str): used as id, name and description prefix of the result
       VERBOSE (int): at 3 or above, print each pairwise alignment

    Returns:
       SeqRecord with the genomewide consensus. Consecutive fragments whose
       overlap is shorter than 50 alignment columns are joined with 10 'N'.
    '''
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.Alphabet.IUPAC import ambiguous_dna
    from seqanpy import align_ladder

    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    consensus = []
    seq_old = ''.join(sequences['F1'])
    for i in range(5):
        seq_new = ''.join(sequences['F'+str(i+2)])
        (score, ali1, ali2) = align_ladder(seq_old, seq_new, score_gapopen=-10)

        if VERBOSE >= 3:
            pretty_print_pairwise_ali([ali1, ali2], name1='F'+str(i+1), name2='F'+str(i+2))

        # Overlap: the first sequence is better at the start, the second at the end
        end1 = len(ali1.rstrip('-'))
        start2 = len(ali2) - len(ali2.lstrip('-'))
        len_overlap = end1 - start2

        # There might a too short consensus, just join them with N
        if len_overlap < 50:
            consensus.append(seq_old)
            consensus.append('N' * 10)
            if i == 4:
                consensus.append(seq_new)
            else:
                seq_old = seq_new

            continue

        # Resolve mismatches by position within the overlap: keep the first
        # sequence in the first third, mask with N in the middle third, take
        # the second sequence in the last third. Plain lists are used, so
        # numpy (never imported in this function) is not needed
        overlap1 = ali1[start2: end1]
        overlap2 = ali2[start2: end1]
        overlap = list(overlap1)
        n_cols = len(overlap)
        for j, (char1, char2) in enumerate(zip(overlap1, overlap2)):
            if char1 == char2:
                continue
            if j < n_cols // 3:
                continue
            elif j < 2 * n_cols // 3:
                overlap[j] = 'N'
            else:
                overlap[j] = char2
        overlap = ''.join(overlap)

        consensus.append(ali1[:start2])
        consensus.append(overlap)
        if i == 4:
            consensus.append(ali2[end1:])
        else:
            # Only the part after the overlap is aligned to the next fragment
            seq_old = ali2[end1:].replace('-', '')

    consensus = ''.join(consensus)
    # ambiguous_dna is the name actually imported above (IUPAC itself is not)
    cons_rec = SeqRecord(Seq(consensus, ambiguous_dna),
                         id=name, name=name,
                         description=name+', genomewide')

    return cons_rec
Beispiel #13
0
    # Global pairwise alignment
    seq1 = 'AAAGGTCTA'
    seq2 = 'AAATCGA'
    output = sap.align_global(seq1, seq2, band=5)
    print output

    # Overlap pairwise alignment
    seq1 = 'AAAGGTCTA'
    seq2 = 'ATCT'
    output = sap.align_overlap(seq1, seq2)
    print output

    # Overlap pairwise alignment cutting flanks
    seq1 = 'AAAGGTCTA'
    seq2 = 'ATCT'
    output = sap.align_overlap(seq1, seq2, cut_flanks=True)
    print output

    # Ladder pairwise alignment
    seq1 = 'AAAGGTCTA'
    seq2 = 'TCTAGGGAAACCC'
    output = sap.align_ladder(seq1, seq2)
    print output

    # Local pairwise alignment
    seq1 = 'AAAGGTCTACCGTAGCCT'
    seq2 = 'AAGTCTAC'
    output = sap.align_local(seq1, seq2)
    print output
Beispiel #14
0
def merge_fragments(sequences, name='', VERBOSE=0):
    '''Merge references at overlapping pairs

    Parameters:
       sequences (dict): sequences of the six fragments under the keys 'F1'
       to 'F6', each a string or an iterable of characters
       name (str): used as id, name and description prefix of the result
       VERBOSE (int): at 3 or above, print each pairwise alignment

    Returns:
       SeqRecord with the genomewide consensus. Consecutive fragments whose
       overlap is shorter than 50 alignment columns are joined with 10 'N'.
    '''
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.Alphabet.IUPAC import ambiguous_dna
    from seqanpy import align_ladder

    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    consensus = []
    seq_old = ''.join(sequences['F1'])
    for i in range(5):
        seq_new = ''.join(sequences['F' + str(i + 2)])
        (score, ali1, ali2) = align_ladder(seq_old, seq_new, score_gapopen=-10)

        if VERBOSE >= 3:
            pretty_print_pairwise_ali([ali1, ali2],
                                      name1='F' + str(i + 1),
                                      name2='F' + str(i + 2))

        # Overlap: the first sequence is better at the start, the second at the end
        end1 = len(ali1.rstrip('-'))
        start2 = len(ali2) - len(ali2.lstrip('-'))
        len_overlap = end1 - start2

        # There might a too short consensus, just join them with N
        if len_overlap < 50:
            consensus.append(seq_old)
            consensus.append('N' * 10)
            if i == 4:
                consensus.append(seq_new)
            else:
                seq_old = seq_new

            continue

        # Resolve mismatches by position within the overlap: keep the first
        # sequence in the first third, mask with N in the middle third, take
        # the second sequence in the last third. Plain lists are used, so
        # numpy (never imported in this function) is not needed
        overlap1 = ali1[start2:end1]
        overlap2 = ali2[start2:end1]
        overlap = list(overlap1)
        n_cols = len(overlap)
        for j, (char1, char2) in enumerate(zip(overlap1, overlap2)):
            if char1 == char2:
                continue
            if j < n_cols // 3:
                continue
            elif j < 2 * n_cols // 3:
                overlap[j] = 'N'
            else:
                overlap[j] = char2
        overlap = ''.join(overlap)

        consensus.append(ali1[:start2])
        consensus.append(overlap)
        if i == 4:
            consensus.append(ali2[end1:])
        else:
            # Only the part after the overlap is aligned to the next fragment
            seq_old = ali2[end1:].replace('-', '')

    consensus = ''.join(consensus)
    # ambiguous_dna is the name actually imported above (IUPAC itself is not)
    cons_rec = SeqRecord(Seq(consensus, ambiguous_dna),
                         id=name,
                         name=name,
                         description=name + ', genomewide')

    return cons_rec
Beispiel #15
0
def overlap_ladder():
    '''Check align_ladder on two short sequences sharing the bases CGT'''
    print('Test align_ladder')
    from seqanpy import align_ladder

    # Only the two aligned rows are checked, the score is ignored
    (_, row1, row2) = align_ladder('ACCGT', 'CGTAA')

    # The first row is padded with gaps at the end, the second at the start
    expected1 = 'ACCGT--'
    expected2 = '--CGTAA'
    assert row1 == expected1
    assert row2 == expected2