Ejemplo n.º 1
0
	def add_older_new_viruses(self, dt = 3, dtref = None):
		'''
		Append strains dated shortly before the time interval to self.viruses.

		Every entry of self.new_strains whose strain name is not yet present in
		self.viruses is appended when its numerical date lies in
		[time_interval[0]-dt, time_interval[0]). Afterwards the reference strains
		are loaded from the <virus_type>_ref_strains.json file (stored on
		self.reference_viruses) and filtered the same way, using the window
		dtref and leaving out any strain that is named in self.new_strains.

		dt    -- width of the window before time_interval[0] for new strains, in
		         the units returned by numerical_date (presumably years -- confirm)
		dtref -- width of the window for reference strains; dt*0.5 when None

		Handling of the reference strains is best effort: any ordinary error in
		that part is reported on stdout and the method returns normally.
		'''
		from date_util import numerical_date
		from json import load as jload

		# names already present; kept in sync with self.viruses so that a strain
		# is never appended twice, without rebuilding a list for every candidate
		known_strains = set(x['strain'] for x in self.viruses)

		for v in self.new_strains:
			if v['strain'] in known_strains:
				continue
			tmp_date = numerical_date(v['date'])
			if self.time_interval[0]-dt <= tmp_date < self.time_interval[0]:
				self.viruses.append(v)
				known_strains.add(v['strain'])
				print("adding ",v['strain'], v['date'], tmp_date, self.time_interval)
			else:
				print("skipping ",v['strain'], v['date'], tmp_date, self.time_interval)

		new_strain_names = set(v['strain'] for v in self.new_strains)
		try:
			if dtref is None:
				dtref=dt*0.5
			# NOTE(review): absolute, machine-specific path -- this only works on
			# the original author's machine; confirm the intended data directory.
			with open('/Users/yujiazhou/Documents/nextflu/H9_nextflu-master/augur/source-data/'+self.virus_type+'_ref_strains.json', 'r') as infile:
				self.reference_viruses = jload(infile)
			for v in self.reference_viruses:
				if v['strain'] in known_strains:
					continue
				tmp_date = numerical_date(v['date'])
				tmp_strain = v['strain']
				print(tmp_strain)
				if tmp_strain in new_strain_names:
					continue
				if self.time_interval[0]-dtref <= tmp_date < self.time_interval[0]:
					self.viruses.append(v)
					known_strains.add(tmp_strain)
					print("adding ",v['strain'], v['date'], tmp_date, self.time_interval)
				else:
					print("skipping ",v['strain'], v['date'], tmp_date, self.time_interval)
		except Exception as err:
			# Exception (not a bare except) so Ctrl-C / SystemExit still propagate;
			# the underlying error is printed because the cause is not always a
			# missing file.
			print("can't find reference_viruses")
			print(err)
Ejemplo n.º 2
0
	def calc_time_censcored_tree_frequencies(self):
		'''
		Fit clade frequencies separately for every season and store, per node,
		the clamped frequency trajectory and its slope.

		For each season s in self.seasons (s[0] and s[1] are passed to
		numerical_date as start and end) the tree frequencies are re-estimated
		on total_pivots evenly spaced pivots via self.estimate_tree_frequencies
		under the region name "global_fit". Each node then receives
		  n.fit_frequencies[s] -- n.logit_freq["global_fit"] clamped to
		                          [-freq_cutoff, freq_cutoff], or the parent's
		                          entry when the node has no estimate (None)
		  n.freq_slope[s]      -- slope of a linear regression of those values
		                          against the pivots, skipping the first
		                          pivots_fit pivots
		Finally the pivots stored on the seed node are reset to self.pivots.

		Raises whatever linregress raises if the regression fails; the affected
		season is printed first.
		'''
		print("fitting clade frequencies for seasons")
		region = "global_fit"
		freq_cutoff = 25.0
		total_pivots = 12
		pivots_fit = 2
		freq_window = 0.0
		from date_util import numerical_date
		for n in self.tree.preorder_node_iter():
			n.fit_frequencies = {}
			n.freq_slope = {}
		for s in self.seasons:
			time_interval = [numerical_date(s[0]) - freq_window, numerical_date(s[1])]
			pivots = np.linspace(time_interval[0], time_interval[1], total_pivots)
			self.estimate_tree_frequencies(pivots=pivots, threshold = 40, regions=None,
								region_name = region, time_interval=time_interval)
			for n in self.tree.preorder_node_iter():
				if n.logit_freq[region] is not None:
					n.fit_frequencies[s] = np.minimum(freq_cutoff, np.maximum(-freq_cutoff,n.logit_freq[region]))
				else:
					# no estimate for this node: inherit from the parent, which a
					# preorder traversal has presumably already filled in
					n.fit_frequencies[s] = n.parent_node.fit_frequencies[s]
				try:
					# linregress returns (slope, intercept, rvalue, pvalue, stderr)
					slope = linregress(pivots[pivots_fit:], n.fit_frequencies[s][pivots_fit:])[0]
				except Exception:
					# previously this dropped into an interactive ipdb session,
					# which blocks unattended runs; report the season and re-raise
					print("linear regression of clade frequencies failed for season:")
					print(s)
					raise
				n.freq_slope[s] = slope
		# reset pivots in tree to global pivots
		self.tree.seed_node.pivots = self.pivots
Ejemplo n.º 3
0
	def add_older_vaccine_viruses(self, dt = 3, dtref = None):
		'''
		Add vaccine and reference viruses dated shortly before the time
		interval to self.viruses, to provide phylogenetic context.

		Every entry of self.vaccine_strains whose strain name is not yet present
		in self.viruses is appended when its numerical date lies in
		[time_interval[0]-dt, time_interval[0]). Afterwards the reference strains
		are loaded from source-data/<virus_type>_ref_strains.json (stored on
		self.reference_viruses) and filtered the same way, using the window
		dtref and leaving out any strain that is named in self.vaccine_strains.

		dt    -- width of the window before time_interval[0] for vaccine strains,
		         in the units returned by numerical_date (presumably years --
		         confirm)
		dtref -- width of the window for reference strains; dt*0.5 when None

		Handling of the reference strains is best effort: any ordinary error in
		that part is reported on stdout and the method returns normally.
		'''
		from date_util import numerical_date
		from json import load as jload

		# names already present; kept in sync with self.viruses so that a strain
		# is never appended twice, without rebuilding a list for every candidate
		known_strains = set(x['strain'] for x in self.viruses)

		for v in self.vaccine_strains:
			if v['strain'] in known_strains:
				continue
			tmp_date = numerical_date(v['date'])
			if self.time_interval[0]-dt <= tmp_date < self.time_interval[0]:
				self.viruses.append(v)
				known_strains.add(v['strain'])
				print("adding ",v['strain'], v['date'], tmp_date, self.time_interval)
			else:
				print("skipping ",v['strain'], v['date'], tmp_date, self.time_interval)

		vaccine_strain_names = set(v['strain'] for v in self.vaccine_strains)
		try:
			if dtref is None:
				dtref=dt*0.5
			# path is relative to the current working directory
			with open('source-data/'+self.virus_type+'_ref_strains.json', 'r') as infile:
				self.reference_viruses = jload(infile)
			for v in self.reference_viruses:
				if v['strain'] in known_strains:
					continue
				tmp_date = numerical_date(v['date'])
				tmp_strain = v['strain']
				print(tmp_strain)
				if tmp_strain in vaccine_strain_names:
					continue
				if self.time_interval[0]-dtref <= tmp_date < self.time_interval[0]:
					self.viruses.append(v)
					known_strains.add(tmp_strain)
					print("adding ",v['strain'], v['date'], tmp_date, self.time_interval)
				else:
					print("skipping ",v['strain'], v['date'], tmp_date, self.time_interval)
		except Exception as err:
			# Exception (not a bare except) so Ctrl-C / SystemExit still propagate;
			# the underlying error is printed because the cause is not always a
			# missing file.
			print("can't find reference_viruses")
			print(err)
Ejemplo n.º 4
0
	def auto_outgroup_blast(self):
		'''
		Choose an outgroup (and extra reference strains) by BLASTing a random
		sample of the viruses against the standard outgroup database.

		Steps, as implemented below:
		  * write up to nvir randomly sampled sequences from self.viruses to
		    <run_dir>representatives.fasta and run blastx against
		    std_outgroup_file_blast, XML output in <run_dir>outgroup_blast.xml
		  * collect, per hit definition, one tuple per HSP:
		    (query, score, score/align_length, identities/align_length)
		  * rank hits by number of HSPs, then by mean score (descending)
		  * the outgroup is the best-ranked hit that is dated more than 5 (in
		    numerical_date units) before the earliest sample or whose strain
		    name contains 'A/California/07/2009', provided its mean identity
		    exceeds 0.8; otherwise the best-ranked hit overall is used and
		    self.midpoint_rooting is set to True
		  * other hits whose best identity exceeds 0.9+0.02*rank are appended to
		    self.viruses as reference strains

		Sets self.outgroup (its 'strain' gets the suffix 'OG' unless it contains
		'A/California/07/2009') and self.cds.

		Raises ValueError from np.min if no virus date could be parsed, and
		IndexError if BLAST returned no hits at all.
		'''
		from random import sample
		from Bio.Blast.Applications import NcbiblastxCommandline
		from Bio.Blast import NCBIXML

		self.make_run_dir()
		nvir = 10
		max_ref_seqs = 5
		tmp_dates = []
		for v in self.viruses:
			try:
				tmp_dates.append(numerical_date(v["date"]))
			except Exception:
				# unparsable dates are skipped, not fatal
				print("Can't parse date for",v['strain'], v['date'])
		earliest_date = np.min(tmp_dates)
		representatives = [SeqRecord(Seq(v['seq']), id=v['strain']) for v in sample(self.viruses, min(nvir, len(self.viruses)))]
		standard_outgroups = self.load_standard_outgroups()
		SeqIO.write(representatives, self.run_dir+'representatives.fasta', 'fasta')
		blast_out = self.run_dir+"outgroup_blast.xml"
		blast_cline = NcbiblastxCommandline(query=self.run_dir+"representatives.fasta", db=std_outgroup_file_blast, evalue=0.01,
		                                     outfmt=5, out=blast_out)
		blast_cline()
		hits_by_og = defaultdict(list)
		with open(blast_out, 'r') as bfile:
			for rep in NCBIXML.parse(bfile):
				for hit in rep.alignments:
					for aln in hit.hsps:
						hits_by_og[hit.hit_def].append((rep.query, aln.score, aln.score/aln.align_length, 1.0*aln.identities/aln.align_length))
		# list(...) so that sorting and indexing work whether items() returns a
		# list or a view
		by_og = list(hits_by_og.items())
		# sort by number of hits, then mean score
		by_og.sort(key = lambda x:(len(x[1]), np.mean([y[1] for y in x[1]])), reverse=True)
		outgroups_older_than_sample = [(og, hits) for (og, hits) in by_og
							if (numerical_date(standard_outgroups[og]['date'])<earliest_date-5) or
								('A/California/07/2009' in standard_outgroups[og]['strain'])]
		# y[-1] is the identity fraction of an HSP
		if len(outgroups_older_than_sample) and np.mean([y[-1] for y in outgroups_older_than_sample[0][1]])>0.8:
			outgroup = outgroups_older_than_sample[0][0]
		else:
			outgroup = by_og[0][0]
			self.midpoint_rooting = True
			print("will root at midpoint")

		for oi, (ref, hits) in enumerate(by_og):
			# the identity threshold rises with the rank of the hit
			if (np.max([y[-1] for y in hits])>0.9+oi*0.02) and ref!=outgroup:
				self.viruses.append(standard_outgroups[ref])
				print("including reference strain ",ref, [y[-1] for y in hits])
				if oi>max_ref_seqs:
					break
		self.outgroup = standard_outgroups[outgroup]
		if 'A/California/07/2009' not in self.outgroup['strain']:
			self.outgroup['strain']+='OG'
		# coding region: from the start up to the first stop codon of the outgroup
		prot = Seq(self.outgroup['seq']).translate(to_stop=True)
		self.cds = [0,min(len(prot)*3,len(self.outgroup['seq']))]
		print("chosen outgroup",self.outgroup['strain'])
Ejemplo n.º 5
0
	def unique_date(self):
		'''
		Attach a numerical date (attribute num_date) to the outgroup and to
		every virus that has a 'date' attribute.

		Virus dates are made unique by adding 1e-7*(index+1), where index is the
		position of the virus in self.viruses. If a date cannot be parsed,
		num_date is set to the string "undefined" and a message is printed.
		Objects without a 'date' attribute are left untouched.
		'''
		from date_util import numerical_date
		og = self.sequence_lookup[self.outgroup['strain']]
		if hasattr(og, 'date'):
			try:
				# NOTE(review): unlike the per-virus call below, no date format
				# fields are passed here -- confirm this is intended
				og.num_date = numerical_date(og.date)
			except Exception:
				print("cannot parse date")
				og.num_date = "undefined"
		for ii, v in enumerate(self.viruses):
			if not hasattr(v, 'date'):
				continue
			try:
				# the small index-dependent offset keeps equal dates distinct
				v.num_date = numerical_date(v.date, self.date_format['fields']) + 1e-7*(ii+1)
			except Exception:
				print("cannot parse date")
				v.num_date = "undefined"
Ejemplo n.º 6
0
    def add_older_vaccine_viruses(self, dt=3, dtref=None):
        '''
        Add vaccine and reference viruses dated shortly before the time
        interval to self.viruses, to provide phylogenetic context.

        Every entry of self.vaccine_strains whose strain name is not yet
        present in self.viruses is appended when its numerical date lies in
        [time_interval[0]-dt, time_interval[0]). Afterwards the reference
        strains are loaded from source-data/<virus_type>_ref_strains.json
        (stored on self.reference_viruses) and filtered the same way, using
        the window dtref and leaving out any strain that is named in
        self.vaccine_strains.

        dt    -- width of the window before time_interval[0] for vaccine
                 strains, in the units returned by numerical_date
                 (presumably years -- confirm)
        dtref -- width of the window for reference strains; dt*0.5 when None

        Handling of the reference strains is best effort: any ordinary error
        in that part is reported on stdout and the method returns normally.
        '''
        from date_util import numerical_date
        from json import load as jload

        # names already present; kept in sync with self.viruses so that a
        # strain is never appended twice, without rebuilding a list for every
        # candidate
        known_strains = set(x['strain'] for x in self.viruses)

        for v in self.vaccine_strains:
            if v['strain'] in known_strains:
                continue
            tmp_date = numerical_date(v['date'])
            if self.time_interval[0] - dt <= tmp_date < self.time_interval[0]:
                self.viruses.append(v)
                known_strains.add(v['strain'])
                print("adding ", v['strain'], v['date'], tmp_date,
                      self.time_interval)
            else:
                print("skipping ", v['strain'], v['date'], tmp_date,
                      self.time_interval)

        vaccine_strain_names = set(v['strain'] for v in self.vaccine_strains)
        try:
            if dtref is None:
                dtref = dt * 0.5
            # path is relative to the current working directory
            with open('source-data/' + self.virus_type + '_ref_strains.json',
                      'r') as infile:
                self.reference_viruses = jload(infile)
            for v in self.reference_viruses:
                if v['strain'] in known_strains:
                    continue
                tmp_date = numerical_date(v['date'])
                tmp_strain = v['strain']
                print(tmp_strain)
                if tmp_strain in vaccine_strain_names:
                    continue
                if (self.time_interval[0] - dtref <= tmp_date <
                        self.time_interval[0]):
                    self.viruses.append(v)
                    known_strains.add(tmp_strain)
                    print("adding ", v['strain'], v['date'], tmp_date,
                          self.time_interval)
                else:
                    print("skipping ", v['strain'], v['date'],
                          tmp_date, self.time_interval)
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C / SystemExit still
            # propagate; the underlying error is printed because the cause is
            # not always a missing file.
            print("can't find reference_viruses")
            print(err)
Ejemplo n.º 7
0
    def auto_outgroup_blast(self):
        '''
        Choose an outgroup (and extra reference strains) by BLASTing a random
        sample of the viruses against the standard outgroup database.

        Steps, as implemented below:
          * write up to nvir randomly sampled sequences from self.viruses to
            <run_dir>representatives.fasta and run blastx against
            std_outgroup_file_blast, XML output in
            <run_dir>outgroup_blast.xml
          * collect, per hit definition, one tuple per HSP:
            (query, score, score/align_length, identities/align_length)
          * rank hits by number of HSPs, then by mean score (descending)
          * the outgroup is the best-ranked hit that is dated more than 5 (in
            numerical_date units) before the earliest sample or whose strain
            name contains 'A/California/07/2009', provided its mean identity
            exceeds 0.8; otherwise the best-ranked hit overall is used and
            self.midpoint_rooting is set to True
          * other hits whose best identity exceeds 0.9+0.02*rank are appended
            to self.viruses as reference strains

        Sets self.outgroup (its 'strain' gets the suffix 'OG' unless it
        contains 'A/California/07/2009') and self.cds.

        Raises ValueError from np.min if no virus date could be parsed, and
        IndexError if BLAST returned no hits at all.
        '''
        from random import sample
        from Bio.Blast.Applications import NcbiblastxCommandline
        from Bio.Blast import NCBIXML

        self.make_run_dir()
        nvir = 10
        max_ref_seqs = 5
        tmp_dates = []
        for v in self.viruses:
            try:
                tmp_dates.append(numerical_date(v["date"]))
            except Exception:
                # unparsable dates are skipped, not fatal
                print("Can't parse date for", v['strain'], v['date'])
        earliest_date = np.min(tmp_dates)
        representatives = [
            SeqRecord(Seq(v['seq']), id=v['strain'])
            for v in sample(self.viruses, min(nvir, len(self.viruses)))
        ]
        standard_outgroups = self.load_standard_outgroups()
        SeqIO.write(representatives, self.run_dir + 'representatives.fasta',
                    'fasta')
        blast_out = self.run_dir + "outgroup_blast.xml"
        blast_cline = NcbiblastxCommandline(query=self.run_dir +
                                            "representatives.fasta",
                                            db=std_outgroup_file_blast,
                                            evalue=0.01,
                                            outfmt=5,
                                            out=blast_out)
        blast_cline()
        hits_by_og = defaultdict(list)
        with open(blast_out, 'r') as bfile:
            for rep in NCBIXML.parse(bfile):
                for hit in rep.alignments:
                    for aln in hit.hsps:
                        hits_by_og[hit.hit_def].append(
                            (rep.query, aln.score,
                             aln.score / aln.align_length,
                             1.0 * aln.identities / aln.align_length))
        # list(...) so that sorting and indexing work whether items() returns
        # a list or a view
        by_og = list(hits_by_og.items())
        # sort by number of hits, then mean score
        by_og.sort(key=lambda x: (len(x[1]), np.mean([y[1] for y in x[1]])),
                   reverse=True)
        outgroups_older_than_sample = [(og, hits) for (og, hits) in by_og if (
            numerical_date(standard_outgroups[og]['date']) < earliest_date -
            5) or ('A/California/07/2009' in standard_outgroups[og]['strain'])]
        # y[-1] is the identity fraction of an HSP
        if len(outgroups_older_than_sample) and np.mean(
            [y[-1] for y in outgroups_older_than_sample[0][1]]) > 0.8:
            outgroup = outgroups_older_than_sample[0][0]
        else:
            outgroup = by_og[0][0]
            self.midpoint_rooting = True
            print("will root at midpoint")

        for oi, (ref, hits) in enumerate(by_og):
            # the identity threshold rises with the rank of the hit
            if (np.max([y[-1]
                        for y in hits]) > 0.9 + oi * 0.02) and ref != outgroup:
                self.viruses.append(standard_outgroups[ref])
                print("including reference strain ", ref,
                      [y[-1] for y in hits])
                if oi > max_ref_seqs:
                    break
        self.outgroup = standard_outgroups[outgroup]
        if 'A/California/07/2009' not in self.outgroup['strain']:
            self.outgroup['strain'] += 'OG'
        # coding region: from the start up to the first stop codon of the
        # outgroup
        prot = Seq(self.outgroup['seq']).translate(to_stop=True)
        self.cds = [0, min(len(prot) * 3, len(self.outgroup['seq']))]
        print("chosen outgroup", self.outgroup['strain'])