def build_coordinate_map(refseq, patseq, VERBOSE=0, score_gapopen=-20, **kwargs):
    '''Build the coordinate map between a reference and a patient sequence

    Parameters:
       refseq: first sequence handed to the global aligner
       patseq: second sequence handed to the global aligner
       VERBOSE: at 3 or above the alignment window is pretty-printed (this
                path reads refseq.name and patseq.name)
       score_gapopen: gap opening score forwarded to the aligner
       **kwargs: passed to alignment function (e.g. alignment penalties)

    Returns:
       list of (pos_ref, pos_pat) tuples, one for every alignment column in
       which neither sequence has a gap. Only the columns between the leading
       and trailing gaps of the aligned patient sequence are considered.
    '''
    from seqanpy import align_global

    (_, ali_ref, ali_pat) = align_global(refseq, patseq,
                                         score_gapopen=score_gapopen,
                                         **kwargs)

    # Alignment columns covered by the patient sequence (flanking gaps excluded)
    first_col = len(ali_pat) - len(ali_pat.lstrip('-'))
    last_col = len(ali_pat.rstrip('-'))
    window_ref = ali_ref[first_col:last_col]
    window_pat = ali_pat[first_col:last_col]

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([window_ref, window_pat],
                                  name1=refseq.name,
                                  name2=patseq.name)

    # Bijective map: the reference counter starts at the first window column,
    # the patient counter at zero
    coordinate_pairs = []
    i_ref = first_col
    i_pat = 0
    for char_ref, char_pat in zip(window_ref, window_pat):
        in_ref = char_ref != '-'
        in_pat = char_pat != '-'
        if in_ref and in_pat:
            coordinate_pairs.append((i_ref, i_pat))
        # Each counter advances only when its own sequence has a residue here
        if in_ref:
            i_ref += 1
        if in_pat:
            i_pat += 1

    return coordinate_pairs
def check_similarity_initial_sample(refseq, sample_seq, fragment, VERBOSE=0, maxdiff=10):
    '''Check whether the reference looks similar to the initial sample

    Parameters:
       refseq: reference record; str(refseq.seq) is aligned
       sample_seq: initial sample record; str(sample_seq.seq) is aligned and
                   sample_seq.name is quoted in the report messages
       fragment: label prepended to the printed messages
       VERBOSE: 2 or above prints the number of differences, 3 or above also
                prints an OK message when the check passes
       maxdiff: largest number of differing alignment columns still accepted

    Returns:
       False if more than maxdiff alignment columns differ, True otherwise.
    '''
    from seqanpy import align_global

    (score, ali1, ali2) = align_global(str(refseq.seq), str(sample_seq.seq), band=50)

    # Count mismatching alignment columns (gap versus residue counts as one).
    # NOTE: assumes both aligned strings have the same length, as expected
    # from a global alignment.
    n_diff = sum(1 for (nuc1, nuc2) in zip(ali1, ali2) if nuc1 != nuc2)

    if VERBOSE >= 2:
        print(fragment + ': difference between ref and initial consensus: ' + str(n_diff))

    # The messages report the name of the sample that was passed in; the
    # previous code referred to an undefined name here and crashed instead.
    if n_diff > maxdiff:
        print('ERROR: ' + fragment + ', reference is not similar to initial consensus (' +
              str(sample_seq.name) + ', ' +
              str(n_diff) + ' differences)')
        return False
    elif VERBOSE >= 3:
        print('OK: reference is similar to initial consensus (' +
              str(sample_seq.name) + ', ' +
              str(n_diff) + ' differences)')

    return True
def on_click(event):
    '''Print the alignment of the clicked sequence against seq0

    The label of the picked artist is read as an integer index into seqs.
    '''
    mouseevent = event.mouseevent  # currently unused
    picked = event.artist

    clicked_seq = seqs[int(picked.get_label())]
    alignment = align_global(seq0, clicked_seq, score_gapopen=-20)
    (_, ali_cons, ali_clicked) = alignment

    pretty_print_pairwise_ali((ali_cons, ali_clicked),
                              name1='cons0',
                              name2='clicked',
                              width=120)
def __init__(self, ref1='HXB2', ref2='NL4-3'):
    '''Load two references and prepare the column counters to translate them

    Parameters:
       ref1: name of the first custom reference
       ref2: name of the second custom reference

    Both references are read from their GenBank files and aligned globally.
    count1 and count2 hold, for every alignment column, the number of
    non-gap characters of the respective reference up to and including that
    column, minus one.
    '''
    super(ReferenceTranslator, self).__init__()
    self.ref1 = ref1
    self.ref2 = ref2

    filename1 = get_custom_reference_filename(self.ref1, format='gb')
    self.refseq1 = SeqIO.read(filename1, format='genbank').seq
    filename2 = get_custom_reference_filename(self.ref2, format='gb')
    self.refseq2 = SeqIO.read(filename2, format='genbank').seq

    from seqanpy import align_global
    alignment = align_global(str(self.refseq1), str(self.refseq2), band=200)
    (_, aligned1, aligned2) = alignment

    # Boolean masks of the alignment columns that are not gaps
    has_residue1 = np.fromstring(aligned1, 'S1') != '-'
    has_residue2 = np.fromstring(aligned2, 'S1') != '-'

    self.count1 = np.cumsum(has_residue1) - 1
    self.count2 = np.cumsum(has_residue2) - 1
def __init__(self, ref1='HXB2', ref2='NL4-3'):
    '''Set up the translator between the coordinates of two references

    Parameters:
       ref1: name of the first custom reference
       ref2: name of the second custom reference

    The GenBank sequences of both references are stored in refseq1 and
    refseq2 and aligned globally. For each alignment column, count1 (count2)
    stores the running count of non-gap characters of the first (second)
    aligned sequence, minus one.
    '''
    super(ReferenceTranslator, self).__init__()
    self.ref1 = ref1
    self.ref2 = ref2

    self.refseq1 = SeqIO.read(
        get_custom_reference_filename(self.ref1, format='gb'),
        format='genbank',
    ).seq
    self.refseq2 = SeqIO.read(
        get_custom_reference_filename(self.ref2, format='gb'),
        format='genbank',
    ).seq

    from seqanpy import align_global
    (_, row1, row2) = align_global(str(self.refseq1),
                                   str(self.refseq2),
                                   band=200)

    # Cumulative number of residues seen so far in each row, shifted by one
    residues_so_far1 = np.cumsum(np.fromstring(row1, 'S1') != '-')
    residues_so_far2 = np.cumsum(np.fromstring(row2, 'S1') != '-')
    self.count1 = residues_so_far1 - 1
    self.count2 = residues_so_far2 - 1
def build_coordinate_map(refseq, patseq, VERBOSE=0, score_gapopen=-20, **kwargs):
    '''Build the coordinate map from a global pairwise alignment

    Parameters:
       refseq: first sequence given to the aligner
       patseq: second sequence given to the aligner
       VERBOSE: from 3 upwards, the trimmed alignment is pretty-printed
                (refseq.name and patseq.name are read in that case)
       score_gapopen: gap opening score given to the aligner
       **kwargs: passed to alignment function (e.g. alignment penalties)

    Returns:
       list of (pos_ref, pos_pat) tuples for the alignment columns where both
       sequences have a residue, restricted to the columns between the
       flanking gaps of the aligned second sequence.
    '''
    from seqanpy import align_global

    aligned = align_global(refseq, patseq, score_gapopen=score_gapopen, **kwargs)
    (_, row_ref, row_pat) = aligned

    # Trim the columns where the second sequence only has flanking gaps
    n_leading_gaps = len(row_pat) - len(row_pat.lstrip('-'))
    end_col = len(row_pat.rstrip('-'))
    trimmed_ref = row_ref[n_leading_gaps:end_col]
    trimmed_pat = row_pat[n_leading_gaps:end_col]

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([trimmed_ref, trimmed_pat],
                                  name1=refseq.name, name2=patseq.name)

    # Bijective map
    pairs = []
    cursor_ref = n_leading_gaps
    cursor_pat = 0
    for (symbol_ref, symbol_pat) in zip(trimmed_ref, trimmed_pat):
        gap_in_ref = symbol_ref == '-'
        gap_in_pat = symbol_pat == '-'
        if not (gap_in_ref or gap_in_pat):
            pairs.append((cursor_ref, cursor_pat))
        # A cursor moves on whenever its own row holds a residue
        if not gap_in_ref:
            cursor_ref += 1
        if not gap_in_pat:
            cursor_pat += 1

    return pairs
def check_protein(fea, seqgw, VERBOSE=0, delta_pos=2.5):
    '''Check a protein annotation

    Parameters:
       fea: feature to check; fea.extract(seqgw).seq is the coding sequence
            and fea.id is used in messages and to load the HXB2 region
       seqgw: sequence the feature is extracted from
       VERBOSE: from 3 upwards, the alignment to HXB2 is pretty-printed
       delta_pos: largest tolerated score deficit per alignment column

    Raises:
       ValueError: if the length is not a multiple of 3, if N nucleotides or
       gaps are present, if a stop codon occurs before the last position, if
       X amino acids are present, or if the alignment score to HXB2 falls
       too far below the maximum.
    '''
    def bail(prefix, suffix=''):
        raise ValueError(prefix + fea.id + suffix)

    seq = fea.extract(seqgw).seq

    # Nucleotide-level checks
    if len(seq) % 3 != 0:
        bail('The length of ', ' is not a multiple of 3')
    for (symbol, prefix) in (('N', 'N nucleotides found in '),
                             ('-', 'Gaps found in ')):
        if symbol in seq:
            bail(prefix)

    # Amino acid-level checks: a stop is tolerated only as the last residue
    prot = seq.translate()
    first_stop = prot.find('*')
    if first_stop not in (-1, len(prot) - 1):
        bail('Premature stops found in ')
    if 'X' in prot:
        bail('X amino acids found in ')

    # Compare to HXB2
    from hivwholeseq.reference import load_custom_reference
    from seqanpy import align_global
    ref = load_custom_reference('HXB2', region=fea.id)
    (score, alis, alir) = align_global(seq, ref, score_gapopen=-20)

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali((alir, alis), name1='HXB2', name2='seq',
                                  width=100)

    # NOTE(review): the factor 3 presumably is the aligner's match score -
    # confirm against the seqanpy defaults
    n_columns = len(alis)
    deficit = 3 * n_columns - score
    if deficit > delta_pos * n_columns:
        bail('The sequence of ', ' looks different from HXB2')
def align_codon_pairwise(seqstr, refstr, **kwargs):
    '''Pairwise alignment via codons

    Both nucleotide strings are translated, the translations are aligned
    globally, and the alignment is written back in nucleotides: every aligned
    amino acid becomes the next three nucleotides of its sequence and every
    gap becomes '---'.

    Parameters:
       seqstr: first nucleotide sequence, length must be a multiple of 3
       refstr: second nucleotide sequence, length must be a multiple of 3
       **kwargs: passed down to SeqAn alignment function

    Returns:
       (aliseq, aliref) tuple with the two aligned nucleotide strings.

    Raises:
       ValueError: if the length of either input is not a multiple of 3.
    '''
    from Bio.Seq import translate
    from seqanpy import align_global

    if len(seqstr) % 3 != 0:
        raise ValueError('The length of the first sequence is not a multiple of 3')
    if len(refstr) % 3 != 0:
        raise ValueError('The length of the second sequence is not a multiple of 3')

    protein_seq = translate(seqstr)
    protein_ref = translate(refstr)
    (_, ali_prot_seq, ali_prot_ref) = align_global(protein_seq, protein_ref, **kwargs)

    gap_codon = '---'
    codons_seq = []
    codons_ref = []
    cursor_seq = 0
    cursor_ref = 0
    for (aa_seq, aa_ref) in zip(ali_prot_seq, ali_prot_ref):
        # Each cursor only advances when its own protein has a residue here
        if aa_seq != '-':
            codons_seq.append(seqstr[cursor_seq:cursor_seq + 3])
            cursor_seq += 3
        else:
            codons_seq.append(gap_codon)

        if aa_ref != '-':
            codons_ref.append(refstr[cursor_ref:cursor_ref + 3])
            cursor_ref += 3
        else:
            codons_ref.append(gap_codon)

    return (''.join(codons_seq), ''.join(codons_ref))
def check_similarity_initial_sample(refseq, sample_seq, fragment, VERBOSE=0, maxdiff=10):
    '''Check whether the reference looks similar to the initial sample

    Parameters:
       refseq: reference record, aligned via str(refseq.seq)
       sample_seq: initial sample record, aligned via str(sample_seq.seq);
                   its name appears in the printed report
       fragment: label used as prefix in the printed messages
       VERBOSE: at 2 or above the number of differences is printed, at 3 or
                above a passing check is reported as well
       maxdiff: maximal number of differing alignment columns tolerated

    Returns:
       True if at most maxdiff alignment columns differ, False otherwise.
    '''
    from seqanpy import align_global

    (score, ali1, ali2) = align_global(str(refseq.seq), str(sample_seq.seq), band=50)

    # Number of alignment columns in which the two rows disagree.
    # NOTE: relies on both rows having equal length, as expected from a
    # global alignment.
    n_diff = 0
    for (nuc_ref, nuc_sample) in zip(ali1, ali2):
        if nuc_ref != nuc_sample:
            n_diff += 1

    if VERBOSE >= 2:
        print(fragment + ': difference between ref and initial consensus: ' + str(n_diff))

    # Report using the sample that was actually passed in: the name used
    # here before was never defined, so both messages raised a NameError.
    summary = str(sample_seq.name) + ', ' + str(n_diff) + ' differences)'
    if n_diff > maxdiff:
        print('ERROR: ' + fragment + ', reference is not similar to initial consensus (' + summary)
        return False
    elif VERBOSE >= 3:
        print('OK: reference is similar to initial consensus (' + summary)

    return True
def check_protein(fea, seqgw, VERBOSE=0, delta_pos=2.5):
    '''Check a protein annotation

    Parameters:
       fea: annotation to check; the coding sequence is
            fea.extract(seqgw).seq, and fea.id names the feature in error
            messages and selects the HXB2 region to compare with
       seqgw: sequence from which the feature is extracted
       VERBOSE: at 3 or above the alignment to HXB2 is pretty-printed
       delta_pos: maximal score deficit per alignment column

    Raises:
       ValueError: for a length that is not a multiple of 3, N nucleotides,
       gaps, a stop codon anywhere but at the last position, X amino acids,
       or an alignment to HXB2 scoring too far below the maximum.
    '''
    def complaint(text_before, text_after=''):
        return ValueError(text_before + fea.id + text_after)

    seq = fea.extract(seqgw).seq

    if len(seq) % 3 != 0:
        raise complaint('The length of ', ' is not a multiple of 3')
    if 'N' in seq:
        raise complaint('N nucleotides found in ')
    if '-' in seq:
        raise complaint('Gaps found in ')

    prot = seq.translate()
    # Only a stop at the very last position is acceptable
    has_stop = '*' in prot
    if has_stop and (prot.find('*') != len(prot) - 1):
        raise complaint('Premature stops found in ')
    if 'X' in prot:
        raise complaint('X amino acids found in ')

    # Compare to HXB2
    from hivwholeseq.reference import load_custom_reference
    ref = load_custom_reference('HXB2', region=fea.id)

    from seqanpy import align_global
    alignment = align_global(seq, ref, score_gapopen=-20)
    (score, alis, alir) = alignment

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali((alir, alis),
                                  name1='HXB2',
                                  name2='seq',
                                  width=100)

    # NOTE(review): 3 per column presumably is the score of a perfect match -
    # confirm against the aligner defaults
    ali_length = len(alis)
    scoremax = 3 * ali_length
    if scoremax - score > delta_pos * ali_length:
        raise complaint('The sequence of ', ' looks different from HXB2')
indrandom = np.arange(len(ds)) np.random.shuffle(indrandom) ds = ds[indrandom] edges = np.array(edges)[indrandom] seqs = [seqs[i] for i in indrandom] for irp, (dpair, edgepair, seqpair) in enumerate(izip(ds, edges, seqs)): # NOTE: Take only the most distant read of a pair print irp, dpair i = dpair.argmax() d = dpair[i] edge = edgepair[i] seq = seqpair[i] (score, ali1, ali2) = align_global(seq, consrec[edge[0]: edge[1]]) scoremax = 3 * len(ali1) delta = scoremax - score ali = [ali2, ali1] print 'Alignment to its own consensus (delta = '+str(delta)+')' pretty_print_pairwise_ali(ali, 'cons', 'read'+str(i+1)+' '+str(edge), len_name=25, width=90) print '' # Compare to all consensi and find the closest alifr = alis[fragment] alifrpw = [] for cons in alifr:
date: 30/01/14 content: Test of the seqanpy module from Python. ''' # Modules # Script if __name__ == '__main__': # Try import import seqanpy as sap # Global pairwise alignment seq1 = 'AAAGGTCTA' seq2 = 'AAATCGA' output = sap.align_global(seq1, seq2, band=5) print output # Overlap pairwise alignment seq1 = 'AAAGGTCTA' seq2 = 'ATCT' output = sap.align_overlap(seq1, seq2) print output # Overlap pairwise alignment cutting flanks seq1 = 'AAAGGTCTA' seq2 = 'ATCT' output = sap.align_overlap(seq1, seq2, cut_flanks=True) print output # Ladder pairwise alignment
if do_genomewide: seqs = [patient.get_reference('F'+str(i)) for i in xrange(1, 7)] seq = merge_sequences_fragments(seqs, VERBOSE=VERBOSE) seq = SeqRecord(Seq(seq, ambiguous_dna), id=pname+'_genomewide', name=pname+'_genomewide', description='Genomewide reference for patient '+pname) ref = patient.get_reference('genomewide') if VERBOSE >= 2: from seqanpy import align_global from hivwholeseq.utils.sequence import pretty_print_pairwise_ali (score, ali1, ali2) = align_global(ref, seq, score_gapopen=-20) pretty_print_pairwise_ali((ali1, ali2), name1='Old ref', name2='New ref', width=100) # TODO: resplit sequences to make sure we cover the whole F5a, F3c, # etc. THIS CHANGES THE COORDINATES! if use_save: fn = patient.get_reference_filename('genomewide', 'fasta') fn_old = fn.replace('.fasta', '_old.fasta') save_protect(fn, fn_old, VERBOSE=VERBOSE)
def global_test():
    '''Check seqanpy.align_global on a small example with an internal gap'''
    print('Test align_global')
    import seqanpy

    (_, aligned_long, aligned_short) = seqanpy.align_global('ACCGT', 'AGT')

    # The longer sequence comes back unchanged, the shorter one gets the gap
    assert aligned_long == 'ACCGT'
    assert aligned_short == 'A--GT'