def estimate_mutation_frequencies(self, region="global"):
    '''
    Estimate mutation frequencies for the samples of one region.

    The nucleotide alignment ``self.seqs.aln`` and every alignment in
    ``self.seqs.translations`` are restricted to samples whose ``num_date``
    lies in ``[self.pivots[0], self.pivots[-1])`` and, unless ``region`` is
    "global", whose ``region`` attribute equals ``region``. The remaining
    samples are handed to ``alignment_frequencies``.

    Currently the global frequencies should be estimated first because they
    define the set of positions at which frequencies in other regions are
    estimated; for a non-global region they are looked up under
    ``('global', prot)``.

    region -- "global" (no region filter) or the value of the ``region``
              attribute to select.

    Results are stored in ``self.frequencies[(region, prot)]``,
    ``self.frequency_confidence[(region, prot)]`` and
    ``self.tip_count[region]``. Returns None.
    '''
    if not hasattr(self.seqs, 'aln'):
        print("Align sequences first")
        return

    def select_records(aln, region=None, lower_tp=None, upper_tp=None):
        # Every filter narrows the result of the previous one; restarting
        # from the full alignment each time would discard earlier filters.
        records = list(aln)
        if region is not None:
            records = [s for s in records
                       if s.attributes['region'] == region]
        if lower_tp is not None:
            records = [s for s in records
                       if s.attributes['num_date'] >= lower_tp]
        if upper_tp is not None:
            records = [s for s in records
                       if s.attributes['num_date'] < upper_tp]
        return records

    lower_tp, upper_tp = self.pivots[0], self.pivots[-1]

    # loop over nucleotide sequences and translations and calculate
    # region specific frequencies of mutations above a certain threshold.
    # list(...) is required because dict.items() is a view on Python 3 and
    # cannot be concatenated to a list.
    alignments = [('nuc', self.seqs.aln)] + list(self.seqs.translations.items())
    for prot, aln in alignments:
        if region == "global":
            records = select_records(aln, lower_tp=lower_tp,
                                     upper_tp=upper_tp)
            include_set = []
        else:
            records = select_records(aln, region=region, lower_tp=lower_tp,
                                     upper_tp=upper_tp)
            # Results are stored under (region, prot), so the global entry
            # lives at ('global', prot).
            # NOTE(review): include_set is not passed on to the estimator
            # in this version; the lookup only enforces that the global
            # frequencies were estimated first.
            include_set = set(
                pos for (pos, mut) in self.frequencies[('global', prot)])

        time_points = [x.attributes['num_date'] for x in records]
        if len(time_points) == 0:
            print('no samples in region', region, prot)
            continue

        # Imported lazily so that it is only needed when there is
        # something to estimate.
        from Bio.Align import MultipleSeqAlignment
        aln_frequencies = alignment_frequencies(
            MultipleSeqAlignment(records), time_points, self.pivots,
            ws=max(2, len(time_points) // 10), **self.kwargs)
        aln_frequencies.mutation_frequencies(min_freq=0.01)
        self.frequencies[(region, prot)] = aln_frequencies.frequencies
        self.frequency_confidence[(region, prot)] = \
            aln_frequencies.calc_confidence()
        self.tip_count[region] = aln_frequencies.counts
def estimate_mutation_frequencies(self, region="global", pivots=24):
    '''
    Estimate mutation frequencies for the samples of one region.

    The nucleotide alignment ``self.seqs.aln`` and every alignment in
    ``self.seqs.translations`` are restricted to samples whose mean
    ``num_date`` lies in ``[self.pivots[0], self.pivots[-1])`` and, unless
    the region matcher is "global", whose ``region`` attribute matches. The
    remaining samples are handed to ``alignment_frequencies``.

    Currently the global frequencies should be estimated first because they
    define the set of positions at which frequencies in other regions are
    estimated; for a non-global region they are looked up under
    ``('global', prot)``.

    region -- either a string, used both as the name under which results
              are stored and as the value to match, or a tuple
              ``(name, match)`` where ``match`` is a string or a list of
              accepted ``region`` attribute values.
    pivots -- passed to ``make_pivots`` together with the mean sample
              dates when ``self.pivots`` does not exist yet.

    Results are stored in ``self.mutation_frequencies[(name, prot)]``,
    ``self.mutation_frequency_confidence[(name, prot)]`` and
    ``self.mutation_frequency_counts[name]``; the latter is set to zeros
    when no sample is left after filtering. Returns None.
    '''
    if not hasattr(self.seqs, 'aln'):
        print("Align sequences first")
        return

    def select_records(aln, region=None, lower_tp=None, upper_tp=None):
        # region has been validated by the caller: None, str or list
        records = list(aln)
        if isinstance(region, str):
            records = [s for s in records
                       if s.attributes['region'] == region]
        elif region is not None:
            records = [s for s in records
                       if s.attributes['region'] in region]
        if lower_tp is not None:
            records = [s for s in records
                       if np.mean(s.attributes['num_date']) >= lower_tp]
        if upper_tp is not None:
            records = [s for s in records
                       if np.mean(s.attributes['num_date']) < upper_tp]
        return records

    if not hasattr(self, 'pivots'):
        tps = np.array([np.mean(x.attributes['num_date'])
                        for x in self.seqs.seqs.values()])
        self.pivots = make_pivots(pivots, tps)
    else:
        print('estimate_mutation_frequencies: using self.pivots')

    if not hasattr(self, 'mutation_frequencies'):
        self.mutation_frequencies = {}
        self.mutation_frequency_confidence = {}
        self.mutation_frequency_counts = {}

    # region is either a plain name or a (name, match) tuple
    if isinstance(region, str):
        region_name = region
        region_match = region
    elif isinstance(region, tuple):
        region_name = region[0]
        region_match = region[1]
    else:
        print("region must be string or tuple")
        return
    # Reject unsupported matchers here instead of failing later while
    # iterating over a missing alignment.
    if not isinstance(region_match, (str, list)):
        print("region must be string or list")
        return

    lower_tp, upper_tp = self.pivots[0], self.pivots[-1]

    # loop over nucleotide sequences and translations and calculate
    # region specific frequencies of mutations above a certain threshold.
    # list(...) is required because dict.items() is a view on Python 3 and
    # cannot be concatenated to a list.
    alignments = [('nuc', self.seqs.aln)] + list(self.seqs.translations.items())
    for prot, aln in alignments:
        if region_match == "global":
            records = select_records(aln, lower_tp=lower_tp,
                                     upper_tp=upper_tp)
            include_set = []
        else:
            records = select_records(aln, region=region_match,
                                     lower_tp=lower_tp, upper_tp=upper_tp)
            # NOTE(review): include_set is not passed on to the estimator
            # in this version; the lookup only enforces that the global
            # frequencies were estimated first.
            include_set = set(
                pos for (pos, mut)
                in self.mutation_frequencies[('global', prot)])

        time_points = [np.mean(x.attributes['num_date']) for x in records]
        if len(time_points) == 0:
            print('no samples in region', region_name, prot)
            self.mutation_frequency_counts[region_name] = np.zeros_like(
                self.pivots)
            continue

        # Imported lazily so that it is only needed when there is
        # something to estimate.
        from Bio.Align import MultipleSeqAlignment
        aln_frequencies = alignment_frequencies(
            MultipleSeqAlignment(records), time_points, self.pivots,
            ws=max(2, len(time_points) // 10), **self.kwargs)
        aln_frequencies.mutation_frequencies(min_freq=0.01)
        self.mutation_frequencies[(region_name, prot)] = \
            aln_frequencies.frequencies
        self.mutation_frequency_confidence[(region_name, prot)] = \
            aln_frequencies.calc_confidence()
        self.mutation_frequency_counts[region_name] = aln_frequencies.counts
def estimate_mutation_frequencies(self, inertia=0.0, min_freq=0.01,
                                  stiffness=20.0, pivots=24,
                                  region="global", include_set=None):
    '''
    Estimate mutation frequencies for the samples of one region.

    The nucleotide alignment ``self.seqs.aln`` and every alignment in
    ``self.seqs.translations`` are restricted to samples whose mean
    ``num_date`` lies in ``[self.pivots[0], self.pivots[-1])`` and, unless
    the region matcher is "global", whose ``region`` attribute matches. The
    remaining samples are handed to ``alignment_frequencies``.
    (region, protein) pairs already present in
    ``self.mutation_frequencies`` are skipped.

    Currently the global frequencies should be estimated first because they
    define the set of positions at which frequencies in other regions are
    estimated: positions stored under ``('global', prot)`` are added to the
    positions that are always estimated.

    inertia, stiffness -- forwarded to ``alignment_frequencies``.
    min_freq    -- forwarded to ``mutation_frequencies`` of the estimator.
    pivots      -- passed to ``make_pivots`` together with the mean sample
                   dates when ``self.pivots`` does not exist yet.
    region      -- either a string, used both as the name under which
                   results are stored and as the value to match, or a
                   tuple ``(name, match)`` where ``match`` is a string or
                   a list of accepted ``region`` attribute values.
    include_set -- optional mapping from protein name to positions whose
                   frequencies are always estimated; None means no extra
                   positions.

    Results are stored in ``self.mutation_frequencies[(name, prot)]``,
    ``self.mutation_frequency_confidence[(name, prot)]`` and
    ``self.mutation_frequency_counts[name]`` and are pickled to
    ``self.output_path + "_mut_freqs.pickle"`` after the loop.
    Returns None.
    '''
    if not hasattr(self.seqs, 'aln'):
        self.log.warn("Align sequences first")
        return

    # None replaces the former shared mutable ``{}`` default.
    if include_set is None:
        include_set = {}

    def select_records(aln, region=None, lower_tp=None, upper_tp=None):
        # region has been validated by the caller: None, str or list
        records = list(aln)
        if isinstance(region, str):
            records = [s for s in records
                       if s.attributes['region'] == region]
        elif region is not None:
            records = [s for s in records
                       if s.attributes['region'] in region]
        if lower_tp is not None:
            records = [s for s in records
                       if np.mean(s.attributes['num_date']) >= lower_tp]
        if upper_tp is not None:
            records = [s for s in records
                       if np.mean(s.attributes['num_date']) < upper_tp]
        return records

    if not hasattr(self, 'pivots'):
        tps = np.array([np.mean(x.attributes['num_date'])
                        for x in self.seqs.seqs.values()])
        self.pivots = make_pivots(pivots, tps)

    if not hasattr(self, 'mutation_frequencies'):
        self.restore_mutation_frequencies()

    # region is either a plain name or a (name, match) tuple
    if isinstance(region, str):
        region_name = region
        region_match = region
    elif isinstance(region, tuple):
        region_name = region[0]
        region_match = region[1]
    else:
        self.log.warn("region must be string or tuple")
        return
    # Reject unsupported matchers here instead of failing later while
    # iterating over a missing alignment.
    if not isinstance(region_match, (str, list)):
        self.log.warn("region must be string or list")
        return

    # Decide on the matcher, not on the raw argument, so that a
    # (name, "global") tuple also selects every region.
    region_filter = None if region_match == 'global' else region_match
    lower_tp, upper_tp = self.pivots[0], self.pivots[-1]

    # loop over different alignment types: nucleotide sequences and
    # translations. list(...) is required because dict.items() is a view
    # on Python 3 and cannot be concatenated to a list.
    alignments = [('nuc', self.seqs.aln)] + list(self.seqs.translations.items())
    for prot, aln in alignments:
        if (region_name, prot) in self.mutation_frequencies:
            self.log.notify(
                "Skipping Frequency Estimation for region \"{}\", protein \"{}\""
                .format(region_name, prot))
            continue
        self.log.notify(
            "Starting Frequency Estimation for region \"{}\", protein \"{}\""
            .format(region_name, prot))

        # determine set of positions that have to have a frequency calculated
        tmp_include_set = list(include_set[prot]) if prot in include_set else []

        records = select_records(aln, region=region_filter,
                                 lower_tp=lower_tp, upper_tp=upper_tp)

        if ('global', prot) in self.mutation_frequencies:
            tmp_include_set.extend(set(
                pos for (pos, mut)
                in self.mutation_frequencies[('global', prot)]))

        time_points = [np.mean(x.attributes['num_date']) for x in records]
        if len(time_points) == 0:
            self.log.notify('no samples in region {} (protein: {})'.format(
                region_name, prot))
            self.mutation_frequency_counts[region_name] = np.zeros_like(
                self.pivots)
            continue

        # Imported lazily so that it is only needed when there is
        # something to estimate.
        from Bio.Align import MultipleSeqAlignment
        # instantiate alignment frequency
        aln_frequencies = alignment_frequencies(
            MultipleSeqAlignment(records), time_points, self.pivots,
            ws=max(2, len(time_points) // 10),
            inertia=inertia, stiffness=stiffness, method='SLSQP')

        if prot == 'nuc':
            # if this is a nucleotide alignment, set all non-canonical
            # states to N. Gaps are kept: the comparison has to be made
            # against the alignment A, not against the literal 'A'.
            A = aln_frequencies.aln
            canonical = ((A == 'A') | (A == 'C') | (A == 'G') |
                         (A == 'T') | (A == '-'))
            A[~canonical] = 'N'

        aln_frequencies.mutation_frequencies(
            min_freq=min_freq, include_set=tmp_include_set,
            ignore_char='N' if prot == 'nuc' else 'X')
        self.mutation_frequencies[(region_name, prot)] = \
            aln_frequencies.frequencies
        self.mutation_frequency_confidence[(region_name, prot)] = \
            aln_frequencies.calc_confidence()
        self.mutation_frequency_counts[region_name] = aln_frequencies.counts

    self.log.notify("Saving mutation frequencies (pickle)")
    with open(self.output_path + "_mut_freqs.pickle", 'wb') as fh:
        # first record: the names of the sequences the estimates are based on
        pickle.dump(set(self.seqs.seqs.keys()), fh,
                    protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump((self.mutation_frequencies,
                     self.mutation_frequency_confidence,
                     self.mutation_frequency_counts),
                    fh, protocol=pickle.HIGHEST_PROTOCOL)