def _relabel(record, record_id):
    '''Set the id of a merged record and blank its name and description.'''
    record.id = record_id
    record.name = ''
    record.description = ''
    return record


def _concatenate_pair(aln1, aln2, padding_length, partitions):
    '''
    Join two alignments column-wise, matching rows by Seq id.

    Appends the column range(s) of this step to ``partitions`` (in place) and
    returns the merged, sorted alignment.
    '''
    from Bio import Alphabet
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Align import MultipleSeqAlignment

    alphabet = Alphabet._consensus_alphabet([aln1._alphabet, aln2._alphabet])

    aln1_length = aln1.get_alignment_length()
    aln2_length = aln2.get_alignment_length()

    # Stand-ins used when an id is present in only one of the two alignments.
    aln1_gaps = SeqRecord(Seq('-' * aln1_length, alphabet))
    aln2_gaps = SeqRecord(Seq('-' * aln2_length, alphabet))
    padding = SeqRecord(Seq('N' * padding_length, alphabet))

    # Partitions are 1-based, inclusive column ranges. The padding columns are
    # counted as part of the partition that follows them.
    if not partitions:
        partitions.append((1, aln1_length))
    partitions.append(
        (1 + aln1_length, padding_length + aln1_length + aln2_length))

    # Index rows by id; if an id occurs twice in one alignment the last row
    # with that id wins.
    aln1_records = {record.id: record for record in aln1}
    aln2_records = {record.id: record for record in aln2}

    merged_records = []
    for seq_id, aln1_record in aln1_records.items():
        # pop() so that only ids unique to aln2 remain for the second loop.
        aln2_record = aln2_records.pop(seq_id, aln2_gaps)
        merged_records.append(
            _relabel(aln1_record + padding + aln2_record, seq_id))

    for seq_id, aln2_record in aln2_records.items():
        merged_records.append(
            _relabel(aln1_gaps + padding + aln2_record, seq_id))

    result_alignment = MultipleSeqAlignment(merged_records, alphabet)
    result_alignment.sort()
    return result_alignment


def concatenate(alignments, padding_length=0, partitions=None):
    '''
    Concatenate alignments based on the Seq ids; row order does not matter.
    If one alignment contains a Seq id that another one does not, gaps ('-')
    will be introduced in place of the missing Seq.

    Neither ``alignments`` nor ``partitions`` is modified; the function works
    on its own copies.

    Args:
        alignments: (tuple, list) Alignments to be concatenated, joined from
            left to right.

        padding_length: Insert this many 'N' characters between consecutive
            concatenated alignments.

        partitions: Optional list of (start, end) column ranges to extend.
            When empty or None, it is started with (1, length of the first
            alignment).

    Returns:
        For two or more alignments, a tuple ``(alignment, partitions)`` where
        ``alignment`` is the concatenated MultipleSeqAlignment (rows sorted)
        and ``partitions`` is a list of 1-based inclusive (start, end) column
        ranges, one per input alignment.

        For exactly one alignment, that alignment itself is returned (not a
        tuple, and without a type check); this is kept for backward
        compatibility.

    Raises:
        ValueError: If ``alignments`` is not a list or tuple, is empty, or
            (for two or more items) contains something that is not a
            MultipleSeqAlignment.
    '''
    if not isinstance(alignments, (list, tuple)):
        raise ValueError('Argument must be a list or a tuple.')
    if not alignments:
        raise ValueError('At least one alignment is required.')
    if len(alignments) == 1:
        return alignments[0]

    # Imported only once real work is needed, so the cheap argument checks
    # above do not depend on Biopython.
    # NOTE(review): _concatenate_pair relies on Bio.Alphabet, which recent
    # Biopython releases appear to have dropped -- confirm the pinned version.
    from Bio.Align import MultipleSeqAlignment

    for alignment in alignments:
        if not isinstance(alignment, MultipleSeqAlignment):
            raise ValueError(
                'Argument must inherit from Bio.Align.MultipleSeqAlignment.')

    # Work on a private copy so a list passed by the caller is not extended.
    partitions = list(partitions) if partitions else []

    # Left fold: ((a0 + a1) + a2) + ... without touching the input sequence.
    merged = alignments[0]
    for following in alignments[1:]:
        merged = _concatenate_pair(
            merged, following, padding_length, partitions)

    return (merged, partitions)
# NOTE(review): this chunk starts mid-script. The assignment below looks like
# the tail of a loop whose header is outside the visible source; if so it
# belongs at that loop's indentation. It keeps at most the first ten entries
# of ``valuelist`` for ``key``.
dictionary[key] = valuelist[0:10]

from pprint import pprint

# Dump the mapping for inspection; ``with`` closes the file even if pprint
# raises.
with open("global.dict", "w") as fielddict_file:
    pprint(dictionary, fielddict_file)

reference = []

# For every key, concatenate the alignments read from its FASTA files, write
# the result to "<key>.ref" and append "<key>\t<number of files>" to
# reference.list.
# NOTE(review): the nesting below is reconstructed from the data flow
# (``i`` and ``n`` are used by the final write) -- confirm against the
# original layout.
for i, j in dictionary.items():
    n = 0
    # Empty seed alignment with one zero-length row per species; rows are
    # sorted so that they line up with the sorted alignments added below.
    combined_seq = MultipleSeqAlignment([
        SeqRecord(Seq('', generic_dna), id="hg19"),
        SeqRecord(Seq('', generic_dna), id="panTro4"),
        SeqRecord(Seq('', generic_dna), id="gorGor3"),
        SeqRecord(Seq('', generic_dna), id="rheMac3"),
        SeqRecord(Seq('', generic_dna), id="ponAbe2"),
    ])
    combined_seq.sort()

    for ref in j:
        n = n + 1
        seq_records = AlignIO.read(ref, 'fasta')
        seq_records.description = ""
        seq_records.sort()
        # Alignment addition joins rows by position, hence the sorts above.
        combined_seq = combined_seq + seq_records
    combined_seq.description = ""

    with open('%s.ref' % i, 'w') as write_file:
        AlignIO.write(combined_seq, write_file, 'fasta')

    # Append mode, opened per key as before, but now closed deterministically
    # instead of leaking one handle per iteration.
    with open('reference.list', 'a') as referencelist:
        referencelist.write('%s\t%i\n' % (i, n))
class virus_clean(object):
    '''
    Cleaning steps for a collection of aligned virus sequences.

    The methods read attributes that this class never sets: ``viruses``,
    ``sequence_lookup``, ``outgroup``, ``date_format`` and ``verbose``.
    They must be provided elsewhere (presumably by the object this class is
    combined with -- TODO confirm).
    '''

    def __init__(self, n_iqd=5, **kwargs):
        '''
        parameters
        n_iqd -- number of interquartile distances accepted in molecular
                 clock filter
        Additional keyword arguments are accepted and ignored here.
        '''
        self.n_iqd = n_iqd

    def remove_insertions(self):
        '''
        remove all columns from the alignment in which the outgroup is gapped;
        the remaining sequence is upper-cased
        '''
        outgroup_ok = np.array(self.sequence_lookup[self.outgroup['strain']]) != '-'
        for seq in self.viruses:
            seq.seq = Seq("".join(np.array(seq.seq)[outgroup_ok]).upper())

    def clean_gaps(self):
        '''
        remove viruses with gaps -- not part of the standard pipeline
        '''
        # Keep only gap-free sequences (the previous filter was inverted and
        # kept only the gapped ones).
        self.viruses = [v for v in self.viruses if '-' not in v.seq]

    def clean_ambiguous(self):
        '''
        substitute all ambiguous characters with '-',
        ancestral inference will interpret this as missing data
        '''
        # The character class is every upper-case letter except A, C, G and T.
        for v in self.viruses:
            v.seq = Seq(re.sub(r'[BDEFHIJKLMNOPQRSUVWXYZ]', '-', str(v.seq)))

    def unique_date(self):
        '''
        add a unique numerical date to each leaf. uniqueness is achieved by
        adding a small number (1e-7 times the 1-based position in
        self.viruses). records without a ``date`` attribute are left
        untouched; dates that fail to parse get num_date = "undefined"
        '''
        from date_util import numerical_date
        og = self.sequence_lookup[self.outgroup['strain']]
        if hasattr(og, 'date'):
            # NOTE(review): unlike the loop below, no date_format fields are
            # passed for the outgroup -- confirm this is intended.
            try:
                og.num_date = numerical_date(og.date)
            except Exception:
                print("cannot parse date")
                og.num_date = "undefined"
        for ii, v in enumerate(self.viruses):
            if hasattr(v, 'date'):
                try:
                    v.num_date = numerical_date(v.date, self.date_format['fields']) + 1e-7 * (ii + 1)
                except Exception:
                    print("cannot parse date")
                    v.num_date = "undefined"

    def times_from_outgroup(self):
        '''
        return an array of num_date differences to the outgroup, one entry
        per virus with a truthy ``strain``
        '''
        outgroup_date = self.sequence_lookup[self.outgroup['strain']].num_date
        return np.array([x.num_date - outgroup_date for x in self.viruses if x.strain])

    def distance_from_outgroup(self):
        '''
        return an array of hamming distances to the outgroup sequence, one
        entry per virus with a truthy ``strain``
        '''
        from seq_util import hamming_distance
        outgroup_seq = self.sequence_lookup[self.outgroup['strain']].seq
        return np.array([hamming_distance(x.seq, outgroup_seq) for x in self.viruses if x.strain])

    def clean_distances(self):
        '''
        Remove viruses that don't follow a loose clock: regress distance from
        the outgroup on time from the outgroup and keep a virus only if the
        absolute residual is below n_iqd interquartile distances of the
        residuals. The outgroup itself is always kept.
        '''
        times = self.times_from_outgroup()
        distances = self.distance_from_outgroup()
        slope, intercept, r_value, p_value, std_err = stats.linregress(times, distances)
        residuals = slope * times + intercept - distances
        r_iqd = stats.scoreatpercentile(residuals, 75) - stats.scoreatpercentile(residuals, 25)
        if self.verbose:
            print("\tslope: " + str(slope))
            print("\tr: " + str(r_value))
            print("\tresiduals iqd: " + str(r_iqd))
        new_viruses = []
        # NOTE(review): residuals only cover viruses with a truthy ``strain``;
        # pairing them with self.viruses assumes every virus has one.
        for (v, r) in zip(self.viruses, residuals):
            # keep viruses whose residual is within n_iqd interquartile
            # distances, up or down
            if np.abs(r) < self.n_iqd * r_iqd or v.id == self.outgroup["strain"]:
                new_viruses.append(v)
            else:
                if self.verbose > 1:
                    print("\t\tresidual: %s \nremoved  %s" % (r, v.strain))
        self.viruses = MultipleSeqAlignment(new_viruses)

    def clean_generic(self):
        '''
        run the standard cleaning steps in order: assign numerical dates,
        drop columns gapped in the outgroup, mask ambiguous characters, filter
        clock outliers, then sort the remaining viruses by num_date
        '''
        print("Number of viruses before cleaning: %s" % len(self.viruses))
        self.unique_date()
        self.remove_insertions()
        self.clean_ambiguous()
        self.clean_distances()
        self.viruses.sort(key=lambda x: x.num_date)
        print("Number of viruses after outlier filtering: %s" % len(self.viruses))