def build_coordinate_map(refseq, patseq, VERBOSE=0, score_gapopen=-20, **kwargs):
    '''Pair up the positions of two sequences via a global alignment

    The alignment is restricted to the columns spanned by the second
    sequence, i.e. the gap-only flanks of its aligned version are ignored.

    Parameters
      refseq: first sequence handed to the aligner (the reference)
      patseq: second sequence handed to the aligner
      VERBOSE: from 3 upwards, the trimmed alignment is printed
      score_gapopen: gap opening score passed to the aligner
      **kwargs: passed to alignment function (e.g. alignment penalties)

    Returns
      list of (reference counter, second sequence counter) pairs, one for
      each alignment column in which neither sequence has a gap
    '''
    from seqanpy import align_global
    (score, ali_ref, ali_pat) = align_global(refseq, patseq,
                                             score_gapopen=score_gapopen,
                                             **kwargs)

    # Window of the alignment without the gap-only flanks of the second row
    first_col = len(ali_pat) - len(ali_pat.lstrip('-'))
    last_col = len(ali_pat.rstrip('-'))
    window_ref = ali_ref[first_col: last_col]
    window_pat = ali_pat[first_col: last_col]

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([window_ref, window_pat],
                                  name1=refseq.name, name2=patseq.name)

    # Bijective map: only columns without gaps contribute a pair
    mapbi = []
    # The reference counter starts at the number of skipped leading columns
    pos_ref = first_col
    pos_pat = 0
    for nuc_ref, nuc_pat in zip(window_ref, window_pat):
        in_ref = nuc_ref != '-'
        in_pat = nuc_pat != '-'
        if in_ref and in_pat:
            mapbi.append((pos_ref, pos_pat))
        if in_ref:
            pos_ref += 1
        if in_pat:
            pos_pat += 1

    return mapbi
Ejemplo n.º 2
0
def check_similarity_initial_sample(refseq,
                                    sample_seq,
                                    fragment,
                                    VERBOSE=0,
                                    maxdiff=10):
    '''Check whether the reference looks similar to the initial sample

    The two sequences are aligned globally (band of 50) and the alignment
    columns in which the two rows differ are counted.

    Parameters
      refseq: reference record; its .seq is aligned
      sample_seq: record of the initial sample; its .seq is aligned and its
        .name appears in the messages
      fragment: fragment name, used as a prefix in the messages
      VERBOSE: from 2 upwards the number of differences is printed, from 3
        upwards a passed check is confirmed as well
      maxdiff: largest number of differing columns that is still accepted

    Returns
      True if at most maxdiff columns differ, False otherwise (an error
      message is printed in that case, whatever the verbosity)
    '''
    from seqanpy import align_global
    (score, ali1, ali2) = align_global(str(refseq.seq),
                                       str(sample_seq.seq),
                                       band=50)

    # One row per aligned sequence, one single-character cell per column
    alim = np.zeros((2, len(ali1)), 'S1')
    alim[0] = np.fromstring(ali1, 'S1')
    alim[1] = np.fromstring(ali2, 'S1')
    n_diff = (alim[0] != alim[1]).sum()

    # NOTE: single-argument print calls give the same output with or without
    # the print statement
    if VERBOSE >= 2:
        print(fragment + ': difference between ref and initial consensus: ' +
              str(n_diff))

    # The messages name the sample via sample_seq; the previous code used an
    # undefined variable here and crashed with a NameError instead
    if n_diff > maxdiff:
        print('ERROR: ' + fragment +
              ', reference is not similar to initial consensus (' +
              str(sample_seq.name) + ', ' +
              str(n_diff) + ' differences)')
        return False
    elif VERBOSE >= 3:
        print('OK: reference is similar to initial consensus (' +
              str(sample_seq.name) + ', ' +
              str(n_diff) + ' differences)')

    return True
Ejemplo n.º 3
0
 def on_click(event):
     '''Align the clicked sequence to seq0 and print the alignment

     The label of the picked artist is read as the index of the clicked
     sequence within seqs.
     '''
     mouseevent = event.mouseevent
     i_clicked = int(event.artist.get_label())
     seq_clicked = seqs[i_clicked]
     (_, ali_cons, ali_clicked) = align_global(seq0, seq_clicked,
                                               score_gapopen=-20)
     pretty_print_pairwise_ali((ali_cons, ali_clicked),
                               name1='cons0',
                               name2='clicked',
                               width=120)
Ejemplo n.º 4
0
    def __init__(self, ref1='HXB2', ref2='NL4-3'):
        '''Load two references and align them to each other

        Parameters
          ref1: name of the first reference
          ref2: name of the second reference
        '''
        super(ReferenceTranslator, self).__init__()
        self.ref1 = ref1
        self.ref2 = ref2

        # Sequences come from the GenBank files of the two references
        fn_ref1 = get_custom_reference_filename(self.ref1, format='gb')
        self.refseq1 = SeqIO.read(fn_ref1, format='genbank').seq
        fn_ref2 = get_custom_reference_filename(self.ref2, format='gb')
        self.refseq2 = SeqIO.read(fn_ref2, format='genbank').seq

        from seqanpy import align_global
        (score, ali1, ali2) = align_global(str(self.refseq1),
                                           str(self.refseq2),
                                           band=200)

        # For every alignment column: number of non-gap characters of that
        # row up to and including the column, minus one
        is_nongap1 = np.fromstring(ali1, 'S1') != '-'
        self.count1 = np.cumsum(is_nongap1) - 1
        is_nongap2 = np.fromstring(ali2, 'S1') != '-'
        self.count2 = np.cumsum(is_nongap2) - 1
Ejemplo n.º 5
0
    def __init__(self, ref1='HXB2', ref2='NL4-3'):
        '''Set up the translator from the names of two references

        Both references are read from their GenBank files and aligned
        globally; per-column counters of non-gap characters are stored for
        each of them.

        Parameters
          ref1: name of the first reference
          ref2: name of the second reference
        '''
        super(ReferenceTranslator, self).__init__()
        self.ref1 = ref1
        self.ref2 = ref2

        def read_sequence(refname):
            '''Read the sequence of a reference from its GenBank file'''
            filename = get_custom_reference_filename(refname, format='gb')
            return SeqIO.read(filename, format='genbank').seq

        def count_nongaps(ali):
            '''Running count of non-gap characters along a row, minus one'''
            return np.cumsum(np.fromstring(ali, 'S1') != '-') - 1

        self.refseq1 = read_sequence(self.ref1)
        self.refseq2 = read_sequence(self.ref2)

        from seqanpy import align_global
        (score, ali1, ali2) = align_global(str(self.refseq1),
                                           str(self.refseq2),
                                           band=200)
        self.count1 = count_nongaps(ali1)
        self.count2 = count_nongaps(ali2)
Ejemplo n.º 6
0
def build_coordinate_map(refseq,
                         patseq,
                         VERBOSE=0,
                         score_gapopen=-20,
                         **kwargs):
    '''Pair up the positions of two sequences via a global alignment

    Only the alignment columns spanned by the second sequence are used: the
    gap-only flanks of its aligned version are cut away first.

    Parameters
      refseq: first sequence handed to the aligner (the reference)
      patseq: second sequence handed to the aligner
      VERBOSE: from 3 upwards, the trimmed alignment is printed
      score_gapopen: gap opening score passed to the aligner
      **kwargs: passed to alignment function (e.g. alignment penalties)

    Returns
      list of (reference counter, second sequence counter) pairs, one for
      each alignment column in which neither sequence has a gap
    '''
    from seqanpy import align_global
    (score, ali_ref, ali_pat) = align_global(refseq,
                                             patseq,
                                             score_gapopen=score_gapopen,
                                             **kwargs)

    # Cut away the columns in which the second row has only flanking gaps
    n_skipped = len(ali_pat) - len(ali_pat.lstrip('-'))
    col_end = len(ali_pat.rstrip('-'))
    trimmed_ref = ali_ref[n_skipped:col_end]
    trimmed_pat = ali_pat[n_skipped:col_end]

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([trimmed_ref, trimmed_pat],
                                  name1=refseq.name,
                                  name2=patseq.name)

    # Bijective map: a pair is recorded only where both rows have a character
    mapbi = []
    pos_ref = n_skipped
    pos_pat = 0
    for nuc_ref, nuc_pat in zip(trimmed_ref, trimmed_pat):
        ref_present = nuc_ref != '-'
        pat_present = nuc_pat != '-'
        if ref_present and pat_present:
            mapbi.append((pos_ref, pos_pat))
        if ref_present:
            pos_ref += 1
        if pat_present:
            pos_pat += 1

    return mapbi
Ejemplo n.º 7
0
def check_protein(fea, seqgw, VERBOSE=0, delta_pos=2.5):
    '''Run sanity checks on a protein annotation

    The annotated region is extracted, inspected at the nucleotide and amino
    acid level, and finally aligned to the same region of HXB2.

    Parameters
      fea: annotated feature; its .extract() and .id are used
      seqgw: record from which the feature is extracted
      VERBOSE: from 3 upwards, the alignment to HXB2 is printed
      delta_pos: largest tolerated score deficit per alignment column

    Raises
      ValueError: if the length is not a multiple of 3, if the nucleotides
        include N or gaps, if the translation has a stop anywhere but at its
        last position or includes X, or if the alignment score falls too far
        below 3 per alignment column
    '''
    seq = fea.extract(seqgw).seq

    # Nucleotide level: whole codons only, no N, no gaps
    if len(seq) % 3 != 0:
        raise ValueError('The length of ' + fea.id + ' is not a multiple of 3')
    if 'N' in seq:
        raise ValueError('N nucleotides found in ' + fea.id)
    if '-' in seq:
        raise ValueError('Gaps found in ' + fea.id)

    # Amino acid level: the first stop, if any, must be the last residue
    prot = seq.translate()
    first_stop = prot.find('*')
    if first_stop not in (-1, len(prot) - 1):
        raise ValueError('Premature stops found in ' + fea.id)
    if 'X' in prot:
        raise ValueError('X amino acids found in ' + fea.id)

    # Compare to HXB2
    from hivwholeseq.reference import load_custom_reference
    hxb2 = load_custom_reference('HXB2', region=fea.id)

    from seqanpy import align_global
    (score, ali_seq, ali_hxb2) = align_global(seq, hxb2, score_gapopen=-20)
    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali((ali_hxb2, ali_seq),
                                  name1='HXB2',
                                  name2='seq',
                                  width=100)

    # The score is measured against 3 per alignment column (presumably the
    # score of a perfect match -- TODO confirm against the aligner defaults)
    n_columns = len(ali_seq)
    score_deficit = 3 * n_columns - score
    if score_deficit > delta_pos * n_columns:
        raise ValueError('The sequence of ' + fea.id +
                         ' looks different from HXB2')
Ejemplo n.º 8
0
def align_codon_pairwise(seqstr, refstr, **kwargs):
    '''Align two coding sequences codon by codon

    Both sequences are translated, the translations are aligned globally and
    the protein alignment is spelled back out in nucleotides: a gap becomes
    '---', any other residue becomes the next three nucleotides of its
    sequence.

    Parameters:
       seqstr: first nucleotide sequence, length a multiple of 3
       refstr: second nucleotide sequence, length a multiple of 3
       **kwargs: passed down to SeqAn alignment function

    Returns:
       (aliseq, aliref): the two aligned nucleotide strings

    Raises:
       ValueError: if the length of either sequence is not a multiple of 3
    '''
    from Bio.Seq import translate
    from seqanpy import align_global

    if len(seqstr) % 3:
        raise ValueError('The length of the first sequence is not a multiple of 3')
    if len(refstr) % 3:
        raise ValueError('The length of the second sequence is not a multiple of 3')

    def spell_out_codons(ali_prot, nucs):
        '''Replace every residue of an aligned protein by its codon'''
        pieces = []
        pos = 0
        for aa in ali_prot:
            if aa == '-':
                pieces.append('---')
                continue
            pieces.append(nucs[pos: pos + 3])
            pos += 3
        return ''.join(pieces)

    # The two rows of a global alignment have the same length, so each of
    # them can be spelled out on its own
    (score, ali_seqpr, ali_refpr) = align_global(translate(seqstr),
                                                 translate(refstr),
                                                 **kwargs)
    aliseq = spell_out_codons(ali_seqpr, seqstr)
    aliref = spell_out_codons(ali_refpr, refstr)

    return (aliseq, aliref)
def check_similarity_initial_sample(refseq, sample_seq, fragment, VERBOSE=0, maxdiff=10):
    '''Check whether the reference looks similar to the initial sample

    The two sequences are aligned globally (band of 50) and the alignment
    columns in which the two rows differ are counted.

    Parameters
      refseq: reference record; its .seq is aligned
      sample_seq: record of the initial sample; its .seq is aligned and its
        .name appears in the messages
      fragment: fragment name, used as a prefix in the messages
      VERBOSE: from 2 upwards the number of differences is printed, from 3
        upwards a passed check is confirmed as well
      maxdiff: largest number of differing columns that is still accepted

    Returns
      True if at most maxdiff columns differ, False otherwise (an error
      message is printed in that case, whatever the verbosity)
    '''
    from seqanpy import align_global
    (score, ali1, ali2) = align_global(str(refseq.seq), str(sample_seq.seq), band=50)

    # One row per aligned sequence, one single-character cell per column
    alim = np.zeros((2, len(ali1)), 'S1')
    alim[0] = np.fromstring(ali1, 'S1')
    alim[1] = np.fromstring(ali2, 'S1')
    n_diff = (alim[0] != alim[1]).sum()

    # NOTE: single-argument print calls give the same output with or without
    # the print statement
    if VERBOSE >= 2:
        print(fragment+': difference between ref and initial consensus: '+str(n_diff))

    # The messages name the sample via sample_seq; the previous code used an
    # undefined variable here and crashed with a NameError instead
    if n_diff > maxdiff:
        print('ERROR: '+fragment+', reference is not similar to initial consensus ('+
              str(sample_seq.name)+', '+
              str(n_diff)+' differences)')
        return False
    elif VERBOSE >= 3:
        print('OK: reference is similar to initial consensus ('+
              str(sample_seq.name)+', '+
              str(n_diff)+' differences)')

    return True
def check_protein(fea, seqgw, VERBOSE=0, delta_pos=2.5):
    '''Run sanity checks on a protein annotation

    The annotated region is extracted, inspected at the nucleotide and amino
    acid level, and finally aligned to the same region of HXB2.

    Parameters
      fea: annotated feature; its .extract() and .id are used
      seqgw: record from which the feature is extracted
      VERBOSE: from 3 upwards, the alignment to HXB2 is printed
      delta_pos: largest tolerated score deficit per alignment column

    Raises
      ValueError: if the length is not a multiple of 3, if the nucleotides
        include N or gaps, if the translation has a stop anywhere but at its
        last position or includes X, or if the alignment score falls too far
        below 3 per alignment column
    '''
    seq = fea.extract(seqgw).seq

    # Nucleotide level: whole codons only, no N, no gaps
    if len(seq) % 3 != 0:
        raise ValueError('The length of '+fea.id+' is not a multiple of 3')
    if 'N' in seq:
        raise ValueError('N nucleotides found in '+fea.id)
    if '-' in seq:
        raise ValueError('Gaps found in '+fea.id)

    # Amino acid level: the first stop, if any, must be the last residue
    prot = seq.translate()
    first_stop = prot.find('*')
    if first_stop not in (-1, len(prot) - 1):
        raise ValueError('Premature stops found in '+fea.id)
    if 'X' in prot:
        raise ValueError('X amino acids found in '+fea.id)

    # Compare to HXB2
    from hivwholeseq.reference import load_custom_reference
    hxb2 = load_custom_reference('HXB2', region=fea.id)

    from seqanpy import align_global
    (score, ali_seq, ali_hxb2) = align_global(seq, hxb2, score_gapopen=-20)
    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali((ali_hxb2, ali_seq), name1='HXB2', name2='seq',
                                  width=100)

    # The score is measured against 3 per alignment column (presumably the
    # score of a perfect match -- TODO confirm against the aligner defaults)
    n_columns = len(ali_seq)
    score_deficit = 3 * n_columns - score
    if score_deficit > delta_pos * n_columns:
        raise ValueError('The sequence of '+fea.id+' looks different from HXB2')
Ejemplo n.º 11
0
            indrandom = np.arange(len(ds))
            np.random.shuffle(indrandom)
            ds = ds[indrandom]
            edges = np.array(edges)[indrandom]
            seqs = [seqs[i] for i in indrandom]

            for irp, (dpair, edgepair, seqpair) in enumerate(izip(ds, edges, seqs)):
                # NOTE: Take only the most distant read of a pair
                print irp, dpair

                i = dpair.argmax()
                d = dpair[i]
                edge = edgepair[i]
                seq = seqpair[i]

                (score, ali1, ali2) = align_global(seq, consrec[edge[0]: edge[1]])
                scoremax = 3 * len(ali1)
                delta = scoremax - score
                ali = [ali2, ali1]

                print 'Alignment to its own consensus (delta = '+str(delta)+')'
                pretty_print_pairwise_ali(ali,
                                          'cons',
                                          'read'+str(i+1)+' '+str(edge),
                                          len_name=25, width=90)
                print ''

                # Compare to all consensi and find the closest
                alifr = alis[fragment]
                alifrpw = []
                for cons in alifr:
Ejemplo n.º 12
0
date:       30/01/14
content:    Test of the seqanpy module from Python.
'''
# Modules


# Script
if __name__ == '__main__':

    # The import itself is the first thing under test
    import seqanpy as sap

    # Global pairwise alignment, with a band of 5
    seq1, seq2 = 'AAAGGTCTA', 'AAATCGA'
    output = sap.align_global(seq1, seq2, band=5)
    print(output)

    # Overlap pairwise alignment of a short sequence against a longer one
    seq1, seq2 = 'AAAGGTCTA', 'ATCT'
    output = sap.align_overlap(seq1, seq2)
    print(output)

    # Same overlap alignment, this time with the cut_flanks option switched on
    seq1, seq2 = 'AAAGGTCTA', 'ATCT'
    output = sap.align_overlap(seq1, seq2, cut_flanks=True)
    print(output)

    # Ladder pairwise alignment

        if do_genomewide:
            seqs = [patient.get_reference('F'+str(i)) for i in xrange(1, 7)]
            seq = merge_sequences_fragments(seqs, VERBOSE=VERBOSE)

            seq = SeqRecord(Seq(seq, ambiguous_dna),
                            id=pname+'_genomewide',
                            name=pname+'_genomewide',
                            description='Genomewide reference for patient '+pname)

            ref = patient.get_reference('genomewide')

            if VERBOSE >= 2:
                from seqanpy import align_global
                from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
                (score, ali1, ali2) = align_global(ref, seq, score_gapopen=-20)
                pretty_print_pairwise_ali((ali1, ali2),
                                          name1='Old ref', name2='New ref',
                                          width=100)

            # TODO: resplit sequences to make sure we cover the whole F5a, F3c,
            # etc. THIS CHANGES THE COORDINATES!

            if use_save:
                fn = patient.get_reference_filename('genomewide', 'fasta')
                fn_old = fn.replace('.fasta', '_old.fasta')
                save_protect(fn, fn_old, VERBOSE=VERBOSE)


Ejemplo n.º 14
0
def global_test():
    '''Check align_global on a small example with a known alignment'''
    print('Test align_global')
    import seqanpy
    # The shorter sequence is expected to get a single gap of two columns
    (score, ali1, ali2) = seqanpy.align_global('ACCGT', 'AGT')
    assert ali1 == 'ACCGT'
    assert ali2 == 'A--GT'