Beispiel #1
0
def write_strains_with_HI_and_sequence(flutype='H9', data_dir='/Users/yujiazhou/Documents/nextflu/H9_nextflu-master/augur/src/data/'):
	"""
	Write the sequences that also have HI titers, plus a per-strain titer count.

	Two files are produced inside ``data_dir``:

	* ``<flutype>_strains_with_HI.fasta`` -- every record of
	  ``<flutype>_gisaid_epiflu_sequence.fasta`` whose normalized name is among
	  the strains found in the titer tables; only the first record per
	  normalized name is kept.
	* ``<flutype>_HI_strains.txt`` -- one ``strain<TAB>count`` line per test
	  strain, sorted by descending count.  When ``fix_name(strain)`` differs
	  from the raw name, a second line with the fixed name and the same count
	  is written as well.

	flutype  -- prefix used for all file names and passed to the table readers
	data_dir -- directory holding the input and output files.  The previous
	            hard-coded paths began with a stray space, which made them
	            resolve relative to a directory literally named " "; the
	            default here is the same location without that space.
	"""
	import os

	HI_titers = read_tables(flutype)
	HI_trevor = read_trevor_table(flutype)
	HI_strains = set(HI_titers.index)
	HI_strains.update(HI_trevor[0])
	from Bio import SeqIO

	def data_path(suffix):
		# os.path.join tolerates data_dir with or without a trailing separator
		return os.path.join(data_dir, flutype + suffix)

	good_strains = set()
	# the output file is opened (and truncated) before the input, as before
	with myopen(data_path("_strains_with_HI.fasta"), 'w') as outfile, \
			myopen(data_path("_gisaid_epiflu_sequence.fasta"), 'r') as infile:
		for seq_rec in SeqIO.parse(infile, 'fasta'):
			# the strain name is the text in front of the first '|' of the header
			tmp_name = seq_rec.description.split('|')[0].strip()
			reduced_name = HI_fix_name(tmp_name)
			if reduced_name in HI_strains and (reduced_name not in good_strains):
				SeqIO.write(seq_rec, outfile, 'fasta')
				good_strains.add(reduced_name)

	titer_count = defaultdict(int)
	measurements = get_all_titers_flat(flutype)
	for _, rec in measurements.iterrows():
		# every row is expected to unpack into exactly four fields
		test, _ref, _src_id, _val = rec
		titer_count[test] += 1

	with myopen(data_path("_HI_strains.txt"), 'w') as HI_strain_outfile:
		for strain, count in sorted(titer_count.items(), key=lambda x: x[1], reverse=True):
			HI_strain_outfile.write(strain + '\t' + str(count) + '\n')
			fixed = fix_name(strain)
			if fixed != strain:
				HI_strain_outfile.write(fixed + '\t' + str(count) + '\n')
Beispiel #2
0
def write_strains_with_HI_and_sequence(flutype='H3N2'):
	"""
	Export the sequences that have HI titers and a titer count per test strain.

	Reads ``data/<flutype>_gisaid_epiflu_sequence.fasta`` and copies to
	``data/<flutype>_strains_with_HI.fasta`` the first record for each
	normalized strain name that occurs in the titer tables.  Afterwards
	``data/<flutype>_HI_strains.txt`` receives ``strain<TAB>count`` lines in
	order of descending count; a strain whose ``fix_name`` form differs from
	the raw name gets an additional line under the fixed name.
	"""
	titer_table = read_tables(flutype)
	trevor_table = read_trevor_table(flutype)
	strains_with_titers = set(titer_table.index).union(trevor_table[0])
	from Bio import SeqIO

	already_written = set()
	fasta_out_path = "data/"+flutype+"_strains_with_HI.fasta"
	fasta_in_path = "data/"+flutype+"_gisaid_epiflu_sequence.fasta"
	# output first, input second: same opening order as always
	with myopen(fasta_out_path, 'w') as fasta_out, myopen(fasta_in_path, 'r') as fasta_in:
		for record in SeqIO.parse(fasta_in, 'fasta'):
			# header text before the first '|' carries the strain name
			strain_name = HI_fix_name(record.description.split('|')[0].strip())
			if strain_name not in strains_with_titers or strain_name in already_written:
				continue
			SeqIO.write(record, fasta_out, 'fasta')
			already_written.add(strain_name)

	counts_by_strain = defaultdict(int)
	flat_titers = get_all_titers_flat(flutype)
	for _, row in flat_titers.iterrows():
		# rows must unpack into four fields; only the first one is counted
		test_strain, _ref, _src_id, _val = row
		counts_by_strain[test_strain] += 1

	with myopen("data/"+flutype+"_HI_strains.txt", 'w') as strain_file:
		ranked = sorted(counts_by_strain.items(), key=lambda item: item[1], reverse=True)
		for strain, n_titers in ranked:
			line_tail = '\t'+str(n_titers)+'\n'
			strain_file.write(strain + line_tail)
			fixed_strain = fix_name(strain)
			if fixed_strain != strain:
				strain_file.write(fixed_strain + line_tail)
Beispiel #3
0
def HI_fix_name(name):
	"""
	Normalize a strain name as it appears in the HI tables.

	Two whitespace-separated spellings are mapped onto their plain strain
	name first; everything else goes through ``fix_name`` unchanged.  The
	result is upper-cased and stripped of leading '*' characters.
	"""
	# table spellings (split on whitespace) -> strain name handed to fix_name
	aliases = {
		("NIB-85", "(A/Almaty/2958/2013)"): "A/Almaty/2958/2013",
		("A/Texas/50/2012", "(6&7)"): "A/Texas/50/2012",
	}
	canonical = aliases.get(tuple(name.split()), name)
	return fix_name(canonical).upper().lstrip('*')
Beispiel #4
0
def HI_fix_name(name):
	"""
	Return the upper-cased ``fix_name`` form of an HI-table strain name,
	without leading '*' characters.

	The two-token spellings "NIB-85 (A/Almaty/2958/2013)" and
	"A/Texas/50/2012 (6&7)" are replaced by the bare strain name before
	``fix_name`` is applied.
	"""
	tokens = name.split()
	if tokens == ["NIB-85", "(A/Almaty/2958/2013)"]:
		to_fix = "A/Almaty/2958/2013"
	elif tokens == ["A/Texas/50/2012","(6&7)"]:
		to_fix = "A/Texas/50/2012"
	else:
		to_fix = name
	fixed = fix_name(to_fix)
	return fixed.upper().lstrip('*')
Beispiel #5
0
	def run(self, steps, viruses_per_month=50, raxml_time_limit=1.0, lam_HI=2.0, lam_pot=0.3, lam_avi=2.0):
		"""
		Run the pipeline stages named in ``steps`` in their fixed order:
		filter (or load), align, clean, tree, ancestral, refine, frequencies,
		export.  Most stages finish by calling ``self.dump()``.

		steps             -- container of stage names, tested with ``in``
		viruses_per_month -- passed to ``self.subsample`` by the 'filter' stage
		raxml_time_limit  -- passed to ``self.infer_tree`` by the 'tree' stage
		lam_HI, lam_pot, lam_avi -- accepted but not used by this variant
		"""
		def announce(prefix):
			# the time stamp is taken when the stage is announced
			print(prefix + time.strftime("%H:%M:%S") + " ---")

		if 'filter' not in steps:
			# without filtering, continue from the previously dumped state
			self.load()
		else:
			announce("--- Virus filtering at ")
			self.filter()
			# first tab-separated column of every line names a strain to prioritize
			forced_strains = []
			if self.force_include is not None and os.path.isfile(self.force_include):
				with open(self.force_include) as infile:
					for line in infile:
						forced_strains.append(fix_name(line.strip().split('\t')[0]).upper())
			self.subsample(viruses_per_month,
				prioritize=forced_strains,
				all_priority=self.force_include_all,
				region_specific=self.max_global)
			self.add_older_vaccine_viruses(dt=6)
			self.dump()

		if 'align' in steps:
			self.align()   	# -> self.viruses is an alignment object

		if 'clean' in steps:
			announce("--- Clean at ")
			self.clean()   # -> every node as a numerical date
			self.dump()

		if 'tree' in steps:
			announce("--- Tree	 infer at ")
			self.infer_tree(raxml_time_limit)  # -> self has a tree
			self.dump()

		if 'ancestral' in steps:
			announce("--- Infer ancestral sequences ")
			self.infer_ancestral()  # -> every node has a sequence
			self.dump()

		if 'refine' in steps:
			announce("--- Tree refine at ")
			self.refine()
			self.dump()

		if 'frequencies' in steps:
			announce("--- Estimating frequencies at ")
			self.determine_variable_positions()
			self.estimate_frequencies(tasks=["mutations", "clades", "tree"])
			# genotype frequencies are only estimated as part of the frequencies stage
			if 'genotype_frequencies' in steps:
				self.estimate_frequencies(tasks=["genotypes"])
			self.dump()

		if 'export' in steps:
			# NOTE: the add_titers() call is disabled in this variant
			self.temporal_regional_statistics()
			# exporting to json, including the H4 specific fields
			exported_fields = [
				'aa_muts','accession','isolate_id', 'lab','db', 'country',
				'avidity_tree','avidity_mut', 'potency_mut', 'potency_tree',
				'mean_potency_mut', 'mean_potency_tree', 'autologous_titers']
			annotation_labels = ['5', '6', '6b', '6c', '7', '84N']
			self.export_to_auspice(tree_fields=exported_fields, annotations=annotation_labels)
			if params.html:
				self.generate_indexHTML()
Beispiel #6
0
	def run(self, steps, viruses_per_month=50, raxml_time_limit = 1.0, lam_HI=2.0, lam_pot=0.3, lam_avi=2.0):
		"""
		Run the pipeline stages named in ``steps`` in their fixed order:
		filter (or load), align, clean, tree, ancestral, refine, frequencies,
		HI, export, HIvalidate.

		steps             -- container of stage names, tested with ``in``
		viruses_per_month -- passed to ``self.subsample`` by the 'filter' stage
		raxml_time_limit  -- passed to ``self.infer_tree`` by the 'tree' stage
		lam_HI, lam_pot, lam_avi -- passed to ``self.map_HI`` by the 'HI' stage

		The 'HI' stage is best effort: a failure is reported and the remaining
		stages still run.  Only ``Exception`` subclasses are caught, so
		Ctrl-C and ``sys.exit`` are no longer swallowed.
		"""
		def announce(prefix):
			# the time stamp is taken when the stage is announced
			print(prefix + time.strftime("%H:%M:%S") + " ---")

		if 'filter' in steps:
			announce("--- Virus filtering at ")
			self.filter()
			# first tab-separated column of every line names a strain to prioritize
			if self.force_include is not None and os.path.isfile(self.force_include):
				with open(self.force_include) as infile:
					forced_strains = [fix_name(line.strip().split('\t')[0]).upper() for line in infile]
			else:
				forced_strains = []
			self.subsample(viruses_per_month,
				prioritize=forced_strains, all_priority=self.force_include_all,
				region_specific = self.max_global)
			self.add_older_vaccine_viruses(dt = 6)
			self.dump()
		else:
			# without filtering, continue from the previously dumped state
			self.load()
		if 'align' in steps:
			self.align()   	# -> self.viruses is an alignment object
		if 'clean' in steps:
			announce("--- Clean at ")
			self.clean()   # -> every node as a numerical date
			self.dump()
		if 'tree' in steps:
			announce("--- Tree	 infer at ")
			self.infer_tree(raxml_time_limit)  # -> self has a tree
			self.dump()
		if 'ancestral' in steps:
			announce("--- Infer ancestral sequences ")
			self.infer_ancestral()  # -> every node has a sequence
			self.dump()
		if 'refine' in steps:
			announce("--- Tree refine at ")
			self.refine()
			self.dump()
		if 'frequencies' in steps:
			announce("--- Estimating frequencies at ")
			self.determine_variable_positions()
			self.estimate_frequencies(tasks = ["mutations", "tree"])
			# genotype frequencies are only estimated as part of the frequencies stage
			if 'genotype_frequencies' in steps:
				self.estimate_frequencies(tasks = ["genotypes"])
			self.dump()
		if 'HI' in steps:
			announce("--- Adding HI titers to the tree ")
			try:
				self.determine_variable_positions()
				self.map_HI(training_fraction=1.0, method = 'nnl1reg',
					lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot, map_to_tree=True)
				self.map_HI(training_fraction=1.0, method = 'nnl1reg', force_redo=True,
					lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot, map_to_tree=False)
				# the state is only dumped when both fits succeeded
				self.dump()
			except Exception as err:
				# keep going, but say what went wrong instead of hiding it
				print("HI modeling failed!")
				print("  cause: %s: %s" % (type(err).__name__, err))
		if 'export' in steps:
			self.add_titers()
			self.temporal_regional_statistics()
			# exporting to json, including the BVic specific fields
			self.export_to_auspice(tree_fields = [
				'ep', 'ne', 'rb', 'aa_muts','accession','isolate_id', 'lab','db', 'country',
				'dHI', 'cHI', 'mean_HI_titers','HI_titers','HI_titers_raw', 'serum', 'HI_info',
				'avidity_tree','avidity_mut', 'potency_mut', 'potency_tree', 'mean_potency_mut', 'mean_potency_tree', 'autologous_titers'],
				annotations = ['1A', '1B', '117V'])
			if params.html:
				self.generate_indexHTML()
			self.export_HI_mutation_effects()

		if 'HIvalidate' in steps:
			announce("--- generating validation figures ")
			self.generate_validation_figures()
Beispiel #7
0
	def run(self, steps, viruses_per_month=50, raxml_time_limit = 1.0, lam_HI=2.0, lam_pot=0.3, lam_avi=2.0):
		"""
		Run the pipeline stages named in ``steps`` in their fixed order:
		filter (or load), align, clean, tree, ancestral, refine, frequencies,
		HI, fitness, export, HIvalidate.

		steps             -- container of stage names, tested with ``in``
		viruses_per_month -- passed to ``self.subsample`` by the 'filter' stage
		raxml_time_limit  -- passed to ``self.infer_tree`` by the 'tree' stage
		lam_HI, lam_pot, lam_avi -- passed to ``self.map_HI`` by the 'HI' stage

		HI modelling and the validation figures are best effort: a failure is
		reported and the run continues.  Only ``Exception`` subclasses are
		caught, so Ctrl-C and ``sys.exit`` are no longer swallowed.
		"""
		def announce(prefix):
			# the time stamp is taken when the stage is announced
			print(prefix + time.strftime("%H:%M:%S") + " ---")

		def report_cause(err):
			# second line after the fixed failure message, naming the actual error
			print("  cause: %s: %s" % (type(err).__name__, err))

		if 'filter' in steps:
			announce("--- Virus filtering at ")
			self.filter()
			# first tab-separated column of every line names a strain to prioritize
			if self.force_include is not None and os.path.isfile(self.force_include):
				with open(self.force_include) as infile:
					forced_strains = [fix_name(line.strip().split('\t')[0]).upper() for line in infile]
			else:
				forced_strains = []
			self.subsample(viruses_per_month,
				prioritize=forced_strains, all_priority=self.force_include_all,
				region_specific = self.max_global)
			self.add_older_vaccine_viruses(dt = 3)
			self.dump()
		else:
			# without filtering, continue from the previously dumped state
			self.load()
		if 'align' in steps:
			self.align()   	# -> self.viruses is an alignment object
		if 'clean' in steps:
			announce("--- Clean at ")
			self.clean()   # -> every node as a numerical date
			self.dump()
		if 'tree' in steps:
			announce("--- Tree	 infer at ")
			self.infer_tree(raxml_time_limit)  # -> self has a tree
			self.dump()
		if 'ancestral' in steps:
			announce("--- Infer ancestral sequences ")
			self.infer_ancestral()  # -> every node has a sequence
			self.dump()
		if 'refine' in steps:
			announce("--- Tree refine at ")
			self.refine()
			self.dump()
		if 'frequencies' in steps:
			announce("--- Estimating frequencies at ")
			self.determine_variable_positions()
			self.estimate_frequencies(tasks = ["mutations", "tree"])
			# genotype frequencies are only estimated as part of the frequencies stage
			if 'genotype_frequencies' in steps:
				self.estimate_frequencies(tasks = ["genotypes"])
			self.dump()

		# one method name shared by the HI fit and the validation figures
		method = 'nnl1reg'
		if 'HI' in steps:
			announce("--- Adding HI titers to the tree ")
			try:
				self.determine_variable_positions()
				self.map_HI(training_fraction=1.0, method = method,
					lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot, map_to_tree=True)
				self.map_HI(training_fraction=1.0, method = method, force_redo=True,
					lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot, map_to_tree=False)
			except Exception as err:
				print("HI modeling failed!")
				report_cause(err)
			#freqs = self.determine_HI_mutation_frequencies(threshold = 0.1)
			#self.frequencies["mutations"]["global"].update(freqs)
			# dumped whether or not the fit succeeded
			self.dump()

		if 'fitness' in steps:
			announce("--- Estimating fitnesses at ")
			self.annotate_fitness()
			self.dump()

		if 'export' in steps:
			self.add_titers()
			self.temporal_regional_statistics()
			# exporting to json, including the H3N2 specific fields
			self.export_to_auspice(tree_fields = [
				'ep', 'ne', 'rb', 'aa_muts','accession','isolate_id', 'lab', 'db', 'country', 'dfreq', 'fitness', 'pred_distance',
				'dHI', 'cHI', 'mHI', 'mean_HI_titers', 'HI_titers', 'HI_titers_raw', 'serum', 'HI_info',
				'avidity_tree', 'avidity_mut', 'potency_mut', 'potency_tree', 'mean_potency_mut', 'mean_potency_tree', 'autologous_titers'],
				annotations = ['3c2.a', '3c3.a', '3c3.b', '171K'])
			if params.html:
				self.generate_indexHTML()
			self.export_HI_mutation_effects()
			#self.export_clade_frequencies()
			#self.export_viruses()

		if 'HIvalidate' in steps:
			from diagnostic_figures import tree_additivity_symmetry, fmts

			announce("--- generating validation figures ")
			# label + str(value) reproduces the spacing of the former two-item print
			n_branch = np.sum([n.dHI>1e-3 for n in self.tree.postorder_node_iter()])
			print("-- number of non-zero branch parameters:  " + str(n_branch))
			n_mutation = np.sum([val>1e-3 for val in self.mutation_effects.values()])
			print("-- number of non-zero mutation parameters:  " + str(n_mutation))
			for model in ['tree', 'mutation']:
				try:
					tree_additivity_symmetry(self, model)
					for fmt in fmts:
						plt.savefig(self.htmlpath()+'HI_symmetry_'+model+fmt)
				except Exception as err:
					print("Can't generate symmetry/additivity figures")
					report_cause(err)
			try:
				# NOTE(review): slope_vs_mutation is not among the names imported
				# above; confirm it is imported at file level, otherwise this
				# always ends in the NameError that is now reported below.
				self.slopes_muts = slope_vs_mutation(self)
			except Exception as err:
				print("Couldn't derive slopes, probably to small time interval")
				report_cause(err)
			self.generate_validation_figures(method)
Beispiel #8
0
	def run(self, steps, viruses_per_month=50, raxml_time_limit = 1.0, lam_HI=2.0, lam_pot=0.3, lam_avi=2.0):
		"""
		Run the pipeline stages named in ``steps`` in their fixed order:
		filter (or load), align, clean, tree, ancestral, refine, frequencies,
		HI, export, HIvalidate.

		steps             -- container of stage names, tested with ``in``
		viruses_per_month -- passed to ``self.subsample`` by the 'filter' stage
		raxml_time_limit  -- passed to ``self.infer_tree`` by the 'tree' stage
		lam_HI, lam_pot, lam_avi -- passed to ``self.map_HI`` by the 'HI' stage

		The 'HI' stage is best effort: a failure is reported and the remaining
		stages still run.  Only ``Exception`` subclasses are caught, so
		Ctrl-C and ``sys.exit`` are no longer swallowed.
		"""
		def announce(prefix):
			# the time stamp is taken when the stage is announced
			print(prefix + time.strftime("%H:%M:%S") + " ---")

		if 'filter' in steps:
			announce("--- Virus filtering at ")
			self.filter()
			# first tab-separated column of every line names a strain to prioritize
			if self.force_include is not None and os.path.isfile(self.force_include):
				with open(self.force_include) as infile:
					forced_strains = [fix_name(line.strip().split('\t')[0]).upper() for line in infile]
			else:
				forced_strains = []
			self.subsample(viruses_per_month,
				prioritize=forced_strains, all_priority=self.force_include_all,
				region_specific = self.max_global)
			self.add_older_vaccine_viruses(dt = 6)
			self.dump()
		else:
			# without filtering, continue from the previously dumped state
			self.load()
		if 'align' in steps:
			self.align()   	# -> self.viruses is an alignment object
		if 'clean' in steps:
			announce("--- Clean at ")
			self.clean()   # -> every node as a numerical date
			self.dump()
		if 'tree' in steps:
			announce("--- Tree	 infer at ")
			self.infer_tree(raxml_time_limit)  # -> self has a tree
			self.dump()
		if 'ancestral' in steps:
			announce("--- Infer ancestral sequences ")
			self.infer_ancestral()  # -> every node has a sequence
			self.dump()
		if 'refine' in steps:
			announce("--- Tree refine at ")
			self.refine()
			self.dump()
		if 'frequencies' in steps:
			announce("--- Estimating frequencies at ")
			self.determine_variable_positions()
			self.estimate_frequencies(tasks = ["mutations", "tree"])
			# genotype frequencies are only estimated as part of the frequencies stage
			if 'genotype_frequencies' in steps:
				self.estimate_frequencies(tasks = ["genotypes"])
			self.dump()
		if 'HI' in steps:
			announce("--- Adding HI titers to the tree ")
			try:
				self.determine_variable_positions()
				self.map_HI(training_fraction=1.0, method = 'nnl1reg',
					lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot, map_to_tree=True)
				self.map_HI(training_fraction=1.0, method = 'nnl1reg', force_redo=True,
					lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot, map_to_tree=False)
				# the state is only dumped when both fits succeeded
				self.dump()
			except Exception as err:
				# keep going, but say what went wrong instead of hiding it
				print("HI modeling failed!")
				print("  cause: %s: %s" % (type(err).__name__, err))
		if 'export' in steps:
			self.add_titers()
			self.temporal_regional_statistics()
			# exporting to json, including the BYam specific fields
			self.export_to_auspice(tree_fields = [
				'ep', 'ne', 'rb', 'aa_muts','accession','isolate_id', 'lab','db', 'country',
				'dHI', 'cHI', 'mean_HI_titers','HI_titers','HI_titers_raw', 'serum', 'HI_info',
				'avidity_tree','avidity_mut', 'potency_mut', 'potency_tree', 'mean_potency_mut', 'mean_potency_tree', 'autologous_titers'],
				annotations = ['2', '3', '3a', '172Q'])
			if params.html:
				self.generate_indexHTML()
			self.export_HI_mutation_effects()

		if 'HIvalidate' in steps:
			announce("--- generating validation figures ")
			self.generate_validation_figures()
Beispiel #9
0
	def run(self, steps, viruses_per_month=50, raxml_time_limit = 1.0, lam_HI=2.0, lam_pot=0.3, lam_avi=2):
		"""
		Run the pipeline stages named in ``steps`` in their fixed order:
		filter (or load), align, clean, tree, ancestral, refine, frequencies,
		HI, fitness, mutations, stability, export, HIvalidate.

		steps             -- container of stage names, tested with ``in``
		viruses_per_month -- passed to ``self.subsample`` by the 'filter' stage
		raxml_time_limit  -- passed to ``self.infer_tree`` by the 'tree' stage
		lam_HI, lam_pot, lam_avi -- passed to ``self.map_HI`` by the 'HI' stage

		HI modelling and the validation figures are best effort: a failure is
		reported and the run continues.  Only ``Exception`` subclasses are
		caught, so Ctrl-C and ``sys.exit`` are no longer swallowed.
		"""
		def announce(prefix):
			# the time stamp is taken when the stage is announced
			print(prefix + time.strftime("%H:%M:%S") + " ---")

		def report_cause(err):
			# second line after the fixed failure message, naming the actual error
			print("  cause: %s: %s" % (type(err).__name__, err))

		if 'filter' in steps:
			announce("--- Virus filtering at ")
			self.filter()
			# first tab-separated column of every line names a strain to prioritize
			if self.force_include is not None and os.path.isfile(self.force_include):
				with open(self.force_include) as infile:
					forced_strains = [fix_name(line.strip().split('\t')[0]).upper() for line in infile]
			else:
				forced_strains = []
			self.subsample(viruses_per_month,
				prioritize=forced_strains, all_priority=self.force_include_all,
				region_specific = self.max_global)
			self.add_older_vaccine_viruses(dt = 3)
			self.dump()
		else:
			# without filtering, continue from the previously dumped state
			self.load()
		if 'align' in steps:
			self.align()   	# -> self.viruses is an alignment object
		if 'clean' in steps:
			announce("--- Clean at ")
			self.clean()   # -> every node as a numerical date
			self.dump()
		if 'tree' in steps:
			announce("--- Tree	 infer at ")
			self.infer_tree(raxml_time_limit)  # -> self has a tree
			self.dump()
		if 'ancestral' in steps:
			announce("--- Infer ancestral sequences ")
			self.infer_ancestral()  # -> every node has a sequence
			self.dump()
		if 'refine' in steps:
			announce("--- Tree refine at ")
			self.refine()
			self.dump()
		if 'frequencies' in steps:
			announce("--- Estimating frequencies at ")
			self.determine_variable_positions()
			self.estimate_frequencies(tasks = ["mutations", "tree"])
			# genotype frequencies are only estimated as part of the frequencies stage
			if 'genotype_frequencies' in steps:
				self.estimate_frequencies(tasks = ["genotypes"])
			self.dump()

		# one method name shared by the HI fit and the validation figures
		method = 'nnl1reg'
		if 'HI' in steps:
			announce("--- Adding HI titers to the tree ")
			try:
				self.determine_variable_positions()
				self.map_HI(training_fraction=1.0, method = method,
					lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot, map_to_tree=True)
				self.map_HI(training_fraction=1.0, method = method, force_redo=True,
					lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot, map_to_tree=False)
			except Exception as err:
				print("HI modeling failed!")
				report_cause(err)
			#freqs = self.determine_HI_mutation_frequencies(threshold = 0.1)
			#self.frequencies["mutations"]["global"].update(freqs)
			# dumped whether or not the fit succeeded
			self.dump()

		if 'fitness' in steps:
			announce("--- Estimating fitnesses at ")
			self.annotate_fitness()
			self.dump()

		if 'mutations' in steps:
			announce("--- Tree mutations at ")
			self.mutations()
			self.dump()

		if 'stability' in steps:
			announce("--- Stability at ")
			self.stability()
			self.dump()

		if 'export' in steps:
			self.add_titers()
			self.temporal_regional_statistics()
			# exporting to json, including the H3N2 specific fields
			self.export_to_auspice(tree_fields = [
				'ep', 'ne', 'rb', 'aa_muts','accession','isolate_id', 'lab','db', 'country', 'dfreq', 'fitness', 'pred_distance',
				'dHI', 'cHI', 'mHI', 'mean_HI_titers', 'HI_titers', 'HI_titers_raw', 'serum', 'HI_info',
				'avidity_tree', 'avidity_mut', 'potency_mut', 'potency_tree', 'mean_potency_mut', 'mean_potency_tree', 'autologous_titers'],
				annotations = ['3c2.a', '3c3.a', '3c3.b'])
			if params.html:
				self.generate_indexHTML()
			self.export_HI_mutation_effects()

		if 'HIvalidate' in steps:
			from diagnostic_figures import tree_additivity_symmetry, fmts

			announce("--- generating validation figures ")
			# label + str(value) reproduces the spacing of the former two-item print
			n_branch = np.sum([n.dHI>1e-3 for n in self.tree.postorder_node_iter()])
			print("-- number of non-zero branch parameters:  " + str(n_branch))
			n_mutation = np.sum([val>1e-3 for val in self.mutation_effects.values()])
			print("-- number of non-zero mutation parameters:  " + str(n_mutation))
			for model in ['tree', 'mutation']:
				try:
					tree_additivity_symmetry(self, model)
					for fmt in fmts:
						plt.savefig(self.htmlpath()+'HI_symmetry_'+model+fmt)
				except Exception as err:
					print("Can't generate symmetry/additivity figures")
					report_cause(err)
			try:
				# NOTE(review): slope_vs_mutation is not among the names imported
				# above; confirm it is imported at file level, otherwise this
				# always ends in the NameError that is now reported below.
				self.slopes_muts = slope_vs_mutation(self)
			except Exception as err:
				print("Couldn't derive slopes, probably to small time interval")
				report_cause(err)
			self.generate_validation_figures(method)