def write_strains_with_HI_and_sequence(
        flutype='H9',
        data_dir=' /Users/yujiazhou/Documents/nextflu/H9_nextflu-master/augur/src/data/'):
    """Write the sequences that have HI data and a per-strain titer count table.

    Three files are touched, all named ``data_dir + flutype + <suffix>``:

    * ``_gisaid_epiflu_sequence.fasta`` is read;
    * ``_strains_with_HI.fasta`` receives every record whose normalised name
      (first ``|``-separated field of the FASTA description, passed through
      ``HI_fix_name``) is a known HI strain.  Only the first record seen for
      each normalised name is written;
    * ``_HI_strains.txt`` receives one ``strain<TAB>count`` line per test
      strain found in ``get_all_titers_flat(flutype)``, ordered by decreasing
      count.  When ``fix_name(strain)`` differs from ``strain`` a second line
      with the fixed name and the same count follows.

    Parameters
    ----------
    flutype : str
        Lineage label used to look up the tables and to build the file names.
    data_dir : str
        Prefix prepended verbatim to every file name, so it must end with a
        path separator.  The default reproduces the previously hard-coded
        location.
        NOTE(review): the default starts with a space, which makes it a
        relative path -- confirm whether that is intended.
    """
    # Kept as a function-scope import, as in the original, so Biopython is
    # only required when this function is actually called.
    from Bio import SeqIO

    HI_titers = read_tables(flutype)
    HI_trevor = read_trevor_table(flutype)
    HI_strains = set(HI_titers.index)
    HI_strains.update(HI_trevor[0])

    prefix = data_dir + flutype

    good_strains = set()
    with myopen(prefix + "_strains_with_HI.fasta", 'w') as outfile, \
            myopen(prefix + "_gisaid_epiflu_sequence.fasta", 'r') as infile:
        for seq_rec in SeqIO.parse(infile, 'fasta'):
            tmp_name = seq_rec.description.split('|')[0].strip()
            reduced_name = HI_fix_name(tmp_name)
            # Skip names without HI data and names already written once.
            if reduced_name in HI_strains and reduced_name not in good_strains:
                SeqIO.write(seq_rec, outfile, 'fasta')
                good_strains.add(reduced_name)

    # Count measurements per test strain; each row must unpack into exactly
    # four values, of which only the first is used.
    titer_count = defaultdict(int)
    measurements = get_all_titers_flat(flutype)
    for _, (test, _ref, _src_id, _val) in measurements.iterrows():
        titer_count[test] += 1

    with myopen(prefix + "_HI_strains.txt", 'w') as HI_strain_outfile:
        by_count = sorted(titer_count.items(), key=lambda x: x[1], reverse=True)
        for strain, count in by_count:
            HI_strain_outfile.write(strain + '\t' + str(count) + '\n')
            fixed = fix_name(strain)
            if fixed != strain:
                HI_strain_outfile.write(fixed + '\t' + str(count) + '\n')
def write_strains_with_HI_and_sequence(flutype='H3N2', data_dir="data/"):
    """Write the sequences that have HI data and a per-strain titer count table.

    Three files are touched, all named ``data_dir + flutype + <suffix>``:

    * ``_gisaid_epiflu_sequence.fasta`` is read;
    * ``_strains_with_HI.fasta`` receives every record whose normalised name
      (first ``|``-separated field of the FASTA description, passed through
      ``HI_fix_name``) is a known HI strain.  Only the first record seen for
      each normalised name is written;
    * ``_HI_strains.txt`` receives one ``strain<TAB>count`` line per test
      strain found in ``get_all_titers_flat(flutype)``, ordered by decreasing
      count.  When ``fix_name(strain)`` differs from ``strain`` a second line
      with the fixed name and the same count follows.

    Parameters
    ----------
    flutype : str
        Lineage label used to look up the tables and to build the file names.
    data_dir : str
        Prefix prepended verbatim to every file name, so it must end with a
        path separator.  The default reproduces the previously hard-coded
        relative ``data/`` directory.
    """
    # Kept as a function-scope import, as in the original, so Biopython is
    # only required when this function is actually called.
    from Bio import SeqIO

    HI_titers = read_tables(flutype)
    HI_trevor = read_trevor_table(flutype)
    HI_strains = set(HI_titers.index)
    HI_strains.update(HI_trevor[0])

    prefix = data_dir + flutype

    good_strains = set()
    with myopen(prefix + "_strains_with_HI.fasta", 'w') as outfile, \
            myopen(prefix + "_gisaid_epiflu_sequence.fasta", 'r') as infile:
        for seq_rec in SeqIO.parse(infile, 'fasta'):
            tmp_name = seq_rec.description.split('|')[0].strip()
            reduced_name = HI_fix_name(tmp_name)
            # Skip names without HI data and names already written once.
            if reduced_name in HI_strains and reduced_name not in good_strains:
                SeqIO.write(seq_rec, outfile, 'fasta')
                good_strains.add(reduced_name)

    # Count measurements per test strain; each row must unpack into exactly
    # four values, of which only the first is used.
    titer_count = defaultdict(int)
    measurements = get_all_titers_flat(flutype)
    for _, (test, _ref, _src_id, _val) in measurements.iterrows():
        titer_count[test] += 1

    with myopen(prefix + "_HI_strains.txt", 'w') as HI_strain_outfile:
        by_count = sorted(titer_count.items(), key=lambda x: x[1], reverse=True)
        for strain, count in by_count:
            HI_strain_outfile.write(strain + '\t' + str(count) + '\n')
            fixed = fix_name(strain)
            if fixed != strain:
                HI_strain_outfile.write(fixed + '\t' + str(count) + '\n')
def HI_fix_name(name):
    """Return the upper-case, ``fix_name``-normalised form of a strain label.

    Two specially formatted two-token labels are first replaced by the plain
    strain name they contain (the comparison is done on the whitespace-split
    tokens, so the amount of whitespace between them does not matter).  Any
    other label is used as given.  The chosen name is passed through
    ``fix_name``, upper-cased, and stripped of leading ``*`` characters.
    """
    # Whitespace-split tokens of a special label -> strain name to use instead.
    aliases = {
        ("NIB-85", "(A/Almaty/2958/2013)"): "A/Almaty/2958/2013",
        ("A/Texas/50/2012", "(6&7)"): "A/Texas/50/2012",
    }
    canonical = aliases.get(tuple(name.split()), name)
    return fix_name(canonical).upper().lstrip('*')
def run(self, steps, viruses_per_month=50, raxml_time_limit=1.0, lam_HI=2.0, lam_pot=0.3, lam_avi=2.0):
    """Run the requested pipeline stages, always in the same fixed order.

    A stage is executed only when its name is contained in ``steps``.  The
    stages checked here are, in order: ``filter``, ``align``, ``clean``,
    ``tree``, ``ancestral``, ``refine``, ``frequencies`` (with the nested,
    optional ``genotype_frequencies``) and ``export``.  When ``filter`` is
    not requested, ``self.load()`` is called instead.

    steps             -- container of stage names (tested with ``in``)
    viruses_per_month -- forwarded to ``self.subsample``
    raxml_time_limit  -- forwarded to ``self.infer_tree``
    lam_HI, lam_pot, lam_avi -- accepted but not used by this method
    """
    def announce(prefix):
        # Progress banner: the given prefix, the current wall-clock time, " ---".
        print(prefix + time.strftime("%H:%M:%S") + " ---")

    if 'filter' in steps:
        announce("--- Virus filtering at ")
        self.filter()

        # Strains to prioritise: first tab-separated field of every line of
        # the force-include file, when such a file is configured and exists.
        forced_strains = []
        if self.force_include is not None and os.path.isfile(self.force_include):
            with open(self.force_include) as infile:
                for line in infile:
                    first_field = line.strip().split('\t')[0]
                    forced_strains.append(fix_name(first_field).upper())

        self.subsample(
            viruses_per_month,
            prioritize=forced_strains,
            all_priority=self.force_include_all,
            region_specific=self.max_global,
        )
        self.add_older_vaccine_viruses(dt=6)
        self.dump()
    else:
        self.load()

    if 'align' in steps:
        self.align()  # afterwards self.viruses is an alignment object

    if 'clean' in steps:
        announce("--- Clean at ")
        self.clean()  # afterwards every node has a numerical date
        self.dump()

    if 'tree' in steps:
        announce("--- Tree infer at ")
        self.infer_tree(raxml_time_limit)  # afterwards self has a tree
        self.dump()

    if 'ancestral' in steps:
        announce("--- Infer ancestral sequences ")
        self.infer_ancestral()  # afterwards every node has a sequence
        self.dump()

    if 'refine' in steps:
        announce("--- Tree refine at ")
        self.refine()
        self.dump()

    if 'frequencies' in steps:
        announce("--- Estimating frequencies at ")
        self.determine_variable_positions()
        self.estimate_frequencies(tasks=["mutations", "clades", "tree"])
        # Genotype frequencies are only estimated as part of this stage.
        if 'genotype_frequencies' in steps:
            self.estimate_frequencies(tasks=["genotypes"])
        self.dump()

    if 'export' in steps:
        # No self.add_titers() call in this variant.
        self.temporal_regional_statistics()

        # Export to json, including the H4 specific fields.
        exported_fields = [
            'aa_muts', 'accession', 'isolate_id', 'lab', 'db', 'country',
            'avidity_tree', 'avidity_mut', 'potency_mut', 'potency_tree',
            'mean_potency_mut', 'mean_potency_tree', 'autologous_titers',
        ]
        annotation_labels = ['5', '6', '6b', '6c', '7', '84N']
        self.export_to_auspice(tree_fields=exported_fields,
                               annotations=annotation_labels)

        if params.html:
            self.generate_indexHTML()
def run(self, steps, viruses_per_month=50, raxml_time_limit = 1.0, lam_HI=2.0, lam_pot=0.3, lam_avi=2.0):
    """Run the requested pipeline stages, always in the same fixed order.

    A stage is executed only when its name is contained in ``steps``.  The
    stages checked here are, in order: ``filter``, ``align``, ``clean``,
    ``tree``, ``ancestral``, ``refine``, ``frequencies`` (with the nested,
    optional ``genotype_frequencies``), ``HI``, ``export`` and
    ``HIvalidate``.  When ``filter`` is not requested, ``self.load()`` is
    called instead.

    The ``HI`` stage is best-effort: an ordinary exception raised while
    fitting or dumping is reported and the remaining stages still run.
    KeyboardInterrupt and SystemExit are not intercepted.

    steps             -- container of stage names (tested with ``in``)
    viruses_per_month -- forwarded to ``self.subsample``
    raxml_time_limit  -- forwarded to ``self.infer_tree``
    lam_HI, lam_pot, lam_avi -- forwarded to ``self.map_HI``
    """
    if 'filter' in steps:
        print("--- Virus filtering at " + time.strftime("%H:%M:%S") + " ---")
        self.filter()
        # Strains to prioritise: first tab-separated field of every line of
        # the force-include file, when such a file is configured and exists.
        if self.force_include is not None and os.path.isfile(self.force_include):
            with open(self.force_include) as infile:
                forced_strains = [fix_name(line.strip().split('\t')[0]).upper()
                                  for line in infile]
        else:
            forced_strains = []
        self.subsample(viruses_per_month,
                       prioritize=forced_strains,
                       all_priority=self.force_include_all,
                       region_specific=self.max_global)
        self.add_older_vaccine_viruses(dt=6)
        self.dump()
    else:
        self.load()

    if 'align' in steps:
        self.align()  # -> self.viruses is an alignment object

    if 'clean' in steps:
        print("--- Clean at " + time.strftime("%H:%M:%S") + " ---")
        self.clean()  # -> every node has a numerical date
        self.dump()

    if 'tree' in steps:
        print("--- Tree infer at " + time.strftime("%H:%M:%S") + " ---")
        self.infer_tree(raxml_time_limit)  # -> self has a tree
        self.dump()

    if 'ancestral' in steps:
        print("--- Infer ancestral sequences " + time.strftime("%H:%M:%S") + " ---")
        self.infer_ancestral()  # -> every node has a sequence
        self.dump()

    if 'refine' in steps:
        print("--- Tree refine at " + time.strftime("%H:%M:%S") + " ---")
        self.refine()
        self.dump()

    if 'frequencies' in steps:
        print("--- Estimating frequencies at " + time.strftime("%H:%M:%S") + " ---")
        self.determine_variable_positions()
        self.estimate_frequencies(tasks=["mutations", "tree"])
        # Genotype frequencies are only estimated as part of this stage.
        if 'genotype_frequencies' in steps:
            self.estimate_frequencies(tasks=["genotypes"])
        self.dump()

    if 'HI' in steps:
        print("--- Adding HI titers to the tree " + time.strftime("%H:%M:%S") + " ---")
        try:
            self.determine_variable_positions()
            # Two fits with identical settings except for the map_to_tree flag;
            # the second one forces recomputation.
            self.map_HI(training_fraction=1.0, method='nnl1reg',
                        lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot,
                        map_to_tree=True)
            self.map_HI(training_fraction=1.0, method='nnl1reg', force_redo=True,
                        lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot,
                        map_to_tree=False)
            self.dump()
        except Exception as err:
            # Deliberately best-effort, but no longer a bare ``except``:
            # Ctrl-C / SystemExit propagate and the cause is shown.
            print("HI modeling failed!")
            print("  " + repr(err))

    if 'export' in steps:
        self.add_titers()
        self.temporal_regional_statistics()
        # Export to json, including the BVic specific fields.
        self.export_to_auspice(
            tree_fields=[
                'ep', 'ne', 'rb', 'aa_muts', 'accession', 'isolate_id',
                'lab', 'db', 'country', 'dHI', 'cHI', 'mean_HI_titers',
                'HI_titers', 'HI_titers_raw', 'serum', 'HI_info',
                'avidity_tree', 'avidity_mut', 'potency_mut', 'potency_tree',
                'mean_potency_mut', 'mean_potency_tree', 'autologous_titers'],
            annotations=['1A', '1B', '117V'])
        if params.html:
            self.generate_indexHTML()
        self.export_HI_mutation_effects()

    if 'HIvalidate' in steps:
        print("--- generating validation figures " + time.strftime("%H:%M:%S") + " ---")
        self.generate_validation_figures()
def run(self, steps, viruses_per_month=50, raxml_time_limit = 1.0, lam_HI=2.0, lam_pot=0.3, lam_avi=2.0):
    """Run the requested pipeline stages, always in the same fixed order.

    A stage is executed only when its name is contained in ``steps``.  The
    stages checked here are, in order: ``filter``, ``align``, ``clean``,
    ``tree``, ``ancestral``, ``refine``, ``frequencies`` (with the nested,
    optional ``genotype_frequencies``), ``HI``, ``fitness``, ``export`` and
    ``HIvalidate``.  When ``filter`` is not requested, ``self.load()`` is
    called instead.

    The HI fit and the individual validation figures are best-effort: an
    ordinary exception is reported and processing continues.
    KeyboardInterrupt and SystemExit are not intercepted.

    steps             -- container of stage names (tested with ``in``)
    viruses_per_month -- forwarded to ``self.subsample``
    raxml_time_limit  -- forwarded to ``self.infer_tree``
    lam_HI, lam_pot, lam_avi -- forwarded to ``self.map_HI``
    """
    if 'filter' in steps:
        print("--- Virus filtering at " + time.strftime("%H:%M:%S") + " ---")
        self.filter()
        # Strains to prioritise: first tab-separated field of every line of
        # the force-include file, when such a file is configured and exists.
        if self.force_include is not None and os.path.isfile(self.force_include):
            with open(self.force_include) as infile:
                forced_strains = [fix_name(line.strip().split('\t')[0]).upper()
                                  for line in infile]
        else:
            forced_strains = []
        self.subsample(viruses_per_month,
                       prioritize=forced_strains,
                       all_priority=self.force_include_all,
                       region_specific=self.max_global)
        self.add_older_vaccine_viruses(dt=3)
        self.dump()
    else:
        self.load()

    if 'align' in steps:
        self.align()  # -> self.viruses is an alignment object

    if 'clean' in steps:
        print("--- Clean at " + time.strftime("%H:%M:%S") + " ---")
        self.clean()  # -> every node has a numerical date
        self.dump()

    if 'tree' in steps:
        print("--- Tree infer at " + time.strftime("%H:%M:%S") + " ---")
        self.infer_tree(raxml_time_limit)  # -> self has a tree
        self.dump()

    if 'ancestral' in steps:
        print("--- Infer ancestral sequences " + time.strftime("%H:%M:%S") + " ---")
        self.infer_ancestral()  # -> every node has a sequence
        self.dump()

    if 'refine' in steps:
        print("--- Tree refine at " + time.strftime("%H:%M:%S") + " ---")
        self.refine()
        self.dump()

    if 'frequencies' in steps:
        print("--- Estimating frequencies at " + time.strftime("%H:%M:%S") + " ---")
        self.determine_variable_positions()
        self.estimate_frequencies(tasks=["mutations", "tree"])
        # Genotype frequencies are only estimated as part of this stage.
        if 'genotype_frequencies' in steps:
            self.estimate_frequencies(tasks=["genotypes"])
        self.dump()

    # Single source for the fitting method: used by both map_HI calls and by
    # the validation figures further down.
    method = 'nnl1reg'
    if 'HI' in steps:
        print("--- Adding HI titers to the tree " + time.strftime("%H:%M:%S") + " ---")
        try:
            self.determine_variable_positions()
            # Two fits with identical settings except for the map_to_tree flag;
            # the second one forces recomputation.
            self.map_HI(training_fraction=1.0, method=method,
                        lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot,
                        map_to_tree=True)
            self.map_HI(training_fraction=1.0, method=method, force_redo=True,
                        lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot,
                        map_to_tree=False)
        except Exception as err:
            # Deliberately best-effort, but no longer a bare ``except``:
            # Ctrl-C / SystemExit propagate and the cause is shown.
            print("HI modeling failed!")
            print("  " + repr(err))
        # State is dumped whether or not the fit succeeded.
        self.dump()

    if 'fitness' in steps:
        print("--- Estimating fitnesses at " + time.strftime("%H:%M:%S") + " ---")
        self.annotate_fitness()
        self.dump()

    if 'export' in steps:
        self.add_titers()
        self.temporal_regional_statistics()
        # Export to json, including the H3N2 specific fields.
        self.export_to_auspice(
            tree_fields=[
                'ep', 'ne', 'rb', 'aa_muts', 'accession', 'isolate_id',
                'lab', 'db', 'country', 'dfreq', 'fitness', 'pred_distance',
                'dHI', 'cHI', 'mHI', 'mean_HI_titers', 'HI_titers',
                'HI_titers_raw', 'serum', 'HI_info',
                'avidity_tree', 'avidity_mut', 'potency_mut', 'potency_tree',
                'mean_potency_mut', 'mean_potency_tree', 'autologous_titers'],
            annotations=['3c2.a', '3c3.a', '3c3.b', '171K'])
        if params.html:
            self.generate_indexHTML()
        self.export_HI_mutation_effects()

    if 'HIvalidate' in steps:
        from diagnostic_figures import tree_additivity_symmetry, fmts

        print("--- generating validation figures " + time.strftime("%H:%M:%S") + " ---")
        n_branch = np.sum([n.dHI > 1e-3 for n in self.tree.postorder_node_iter()])
        n_mut = np.sum([val > 1e-3 for val in self.mutation_effects.values()])
        # "%s %s" reproduces the separator of the former two-item print statement.
        print("%s %s" % ("-- number of non-zero branch parameters: ", n_branch))
        print("%s %s" % ("-- number of non-zero mutation parameters: ", n_mut))

        for model in ['tree', 'mutation']:
            try:
                tree_additivity_symmetry(self, model)
                for fmt in fmts:
                    plt.savefig(self.htmlpath() + 'HI_symmetry_' + model + fmt)
            except Exception as err:
                print("Can't generate symmetry/additivity figures")
                print("  " + repr(err))

        try:
            # NOTE(review): slope_vs_mutation is not imported in this block;
            # it must come from module scope, otherwise the NameError is what
            # gets reported here -- confirm.
            self.slopes_muts = slope_vs_mutation(self)
        except Exception as err:
            print("Couldn't derive slopes, probably to small time interval")
            print("  " + repr(err))

        self.generate_validation_figures(method)
def run(self, steps, viruses_per_month=50, raxml_time_limit = 1.0, lam_HI=2.0, lam_pot=0.3, lam_avi=2.0):
    """Run the requested pipeline stages, always in the same fixed order.

    A stage is executed only when its name is contained in ``steps``.  The
    stages checked here are, in order: ``filter``, ``align``, ``clean``,
    ``tree``, ``ancestral``, ``refine``, ``frequencies`` (with the nested,
    optional ``genotype_frequencies``), ``HI``, ``export`` and
    ``HIvalidate``.  When ``filter`` is not requested, ``self.load()`` is
    called instead.

    The ``HI`` stage is best-effort: an ordinary exception raised while
    fitting or dumping is reported and the remaining stages still run.
    KeyboardInterrupt and SystemExit are not intercepted.

    steps             -- container of stage names (tested with ``in``)
    viruses_per_month -- forwarded to ``self.subsample``
    raxml_time_limit  -- forwarded to ``self.infer_tree``
    lam_HI, lam_pot, lam_avi -- forwarded to ``self.map_HI``
    """
    if 'filter' in steps:
        print("--- Virus filtering at " + time.strftime("%H:%M:%S") + " ---")
        self.filter()
        # Strains to prioritise: first tab-separated field of every line of
        # the force-include file, when such a file is configured and exists.
        if self.force_include is not None and os.path.isfile(self.force_include):
            with open(self.force_include) as infile:
                forced_strains = [fix_name(line.strip().split('\t')[0]).upper()
                                  for line in infile]
        else:
            forced_strains = []
        self.subsample(viruses_per_month,
                       prioritize=forced_strains,
                       all_priority=self.force_include_all,
                       region_specific=self.max_global)
        self.add_older_vaccine_viruses(dt=6)
        self.dump()
    else:
        self.load()

    if 'align' in steps:
        self.align()  # -> self.viruses is an alignment object

    if 'clean' in steps:
        print("--- Clean at " + time.strftime("%H:%M:%S") + " ---")
        self.clean()  # -> every node has a numerical date
        self.dump()

    if 'tree' in steps:
        print("--- Tree infer at " + time.strftime("%H:%M:%S") + " ---")
        self.infer_tree(raxml_time_limit)  # -> self has a tree
        self.dump()

    if 'ancestral' in steps:
        print("--- Infer ancestral sequences " + time.strftime("%H:%M:%S") + " ---")
        self.infer_ancestral()  # -> every node has a sequence
        self.dump()

    if 'refine' in steps:
        print("--- Tree refine at " + time.strftime("%H:%M:%S") + " ---")
        self.refine()
        self.dump()

    if 'frequencies' in steps:
        print("--- Estimating frequencies at " + time.strftime("%H:%M:%S") + " ---")
        self.determine_variable_positions()
        self.estimate_frequencies(tasks=["mutations", "tree"])
        # Genotype frequencies are only estimated as part of this stage.
        if 'genotype_frequencies' in steps:
            self.estimate_frequencies(tasks=["genotypes"])
        self.dump()

    if 'HI' in steps:
        print("--- Adding HI titers to the tree " + time.strftime("%H:%M:%S") + " ---")
        try:
            self.determine_variable_positions()
            # Two fits with identical settings except for the map_to_tree flag;
            # the second one forces recomputation.
            self.map_HI(training_fraction=1.0, method='nnl1reg',
                        lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot,
                        map_to_tree=True)
            self.map_HI(training_fraction=1.0, method='nnl1reg', force_redo=True,
                        lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot,
                        map_to_tree=False)
            self.dump()
        except Exception as err:
            # Deliberately best-effort, but no longer a bare ``except``:
            # Ctrl-C / SystemExit propagate and the cause is shown.
            print("HI modeling failed!")
            print("  " + repr(err))

    if 'export' in steps:
        self.add_titers()
        self.temporal_regional_statistics()
        # Export to json, including the BYam specific fields.
        self.export_to_auspice(
            tree_fields=[
                'ep', 'ne', 'rb', 'aa_muts', 'accession', 'isolate_id',
                'lab', 'db', 'country', 'dHI', 'cHI', 'mean_HI_titers',
                'HI_titers', 'HI_titers_raw', 'serum', 'HI_info',
                'avidity_tree', 'avidity_mut', 'potency_mut', 'potency_tree',
                'mean_potency_mut', 'mean_potency_tree', 'autologous_titers'],
            annotations=['2', '3', '3a', '172Q'])
        if params.html:
            self.generate_indexHTML()
        self.export_HI_mutation_effects()

    if 'HIvalidate' in steps:
        print("--- generating validation figures " + time.strftime("%H:%M:%S") + " ---")
        self.generate_validation_figures()
def run(self, steps, viruses_per_month=50, raxml_time_limit = 1.0, lam_HI=2.0, lam_pot=0.3, lam_avi=2):
    """Run the requested pipeline stages, always in the same fixed order.

    A stage is executed only when its name is contained in ``steps``.  The
    stages checked here are, in order: ``filter``, ``align``, ``clean``,
    ``tree``, ``ancestral``, ``refine``, ``frequencies`` (with the nested,
    optional ``genotype_frequencies``), ``HI``, ``fitness``, ``mutations``,
    ``stability``, ``export`` and ``HIvalidate``.  When ``filter`` is not
    requested, ``self.load()`` is called instead.

    The HI fit and the individual validation figures are best-effort: an
    ordinary exception is reported and processing continues.
    KeyboardInterrupt and SystemExit are not intercepted.

    steps             -- container of stage names (tested with ``in``)
    viruses_per_month -- forwarded to ``self.subsample``
    raxml_time_limit  -- forwarded to ``self.infer_tree``
    lam_HI, lam_pot, lam_avi -- forwarded to ``self.map_HI``
    """
    if 'filter' in steps:
        print("--- Virus filtering at " + time.strftime("%H:%M:%S") + " ---")
        self.filter()
        # Strains to prioritise: first tab-separated field of every line of
        # the force-include file, when such a file is configured and exists.
        if self.force_include is not None and os.path.isfile(self.force_include):
            with open(self.force_include) as infile:
                forced_strains = [fix_name(line.strip().split('\t')[0]).upper()
                                  for line in infile]
        else:
            forced_strains = []
        self.subsample(viruses_per_month,
                       prioritize=forced_strains,
                       all_priority=self.force_include_all,
                       region_specific=self.max_global)
        self.add_older_vaccine_viruses(dt=3)
        self.dump()
    else:
        self.load()

    if 'align' in steps:
        self.align()  # -> self.viruses is an alignment object

    if 'clean' in steps:
        print("--- Clean at " + time.strftime("%H:%M:%S") + " ---")
        self.clean()  # -> every node has a numerical date
        self.dump()

    if 'tree' in steps:
        print("--- Tree infer at " + time.strftime("%H:%M:%S") + " ---")
        self.infer_tree(raxml_time_limit)  # -> self has a tree
        self.dump()

    if 'ancestral' in steps:
        print("--- Infer ancestral sequences " + time.strftime("%H:%M:%S") + " ---")
        self.infer_ancestral()  # -> every node has a sequence
        self.dump()

    if 'refine' in steps:
        print("--- Tree refine at " + time.strftime("%H:%M:%S") + " ---")
        self.refine()
        self.dump()

    if 'frequencies' in steps:
        print("--- Estimating frequencies at " + time.strftime("%H:%M:%S") + " ---")
        self.determine_variable_positions()
        self.estimate_frequencies(tasks=["mutations", "tree"])
        # Genotype frequencies are only estimated as part of this stage.
        if 'genotype_frequencies' in steps:
            self.estimate_frequencies(tasks=["genotypes"])
        self.dump()

    # Single source for the fitting method: used by both map_HI calls and by
    # the validation figures further down.
    method = 'nnl1reg'
    if 'HI' in steps:
        print("--- Adding HI titers to the tree " + time.strftime("%H:%M:%S") + " ---")
        try:
            self.determine_variable_positions()
            # Two fits with identical settings except for the map_to_tree flag;
            # the second one forces recomputation.
            self.map_HI(training_fraction=1.0, method=method,
                        lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot,
                        map_to_tree=True)
            self.map_HI(training_fraction=1.0, method=method, force_redo=True,
                        lam_HI=lam_HI, lam_avi=lam_avi, lam_pot=lam_pot,
                        map_to_tree=False)
        except Exception as err:
            # Deliberately best-effort, but no longer a bare ``except``:
            # Ctrl-C / SystemExit propagate and the cause is shown.
            print("HI modeling failed!")
            print("  " + repr(err))
        # State is dumped whether or not the fit succeeded.
        self.dump()

    if 'fitness' in steps:
        print("--- Estimating fitnesses at " + time.strftime("%H:%M:%S") + " ---")
        self.annotate_fitness()
        self.dump()

    if 'mutations' in steps:
        print("--- Tree mutations at " + time.strftime("%H:%M:%S") + " ---")
        self.mutations()
        self.dump()

    if 'stability' in steps:
        print("--- Stability at " + time.strftime("%H:%M:%S") + " ---")
        self.stability()
        self.dump()

    if 'export' in steps:
        self.add_titers()
        self.temporal_regional_statistics()
        # Export to json, including the H3N2 specific fields.
        self.export_to_auspice(
            tree_fields=[
                'ep', 'ne', 'rb', 'aa_muts', 'accession', 'isolate_id',
                'lab', 'db', 'country', 'dfreq', 'fitness', 'pred_distance',
                'dHI', 'cHI', 'mHI', 'mean_HI_titers', 'HI_titers',
                'HI_titers_raw', 'serum', 'HI_info',
                'avidity_tree', 'avidity_mut', 'potency_mut', 'potency_tree',
                'mean_potency_mut', 'mean_potency_tree', 'autologous_titers'],
            annotations=['3c2.a', '3c3.a', '3c3.b'])
        if params.html:
            self.generate_indexHTML()
        self.export_HI_mutation_effects()

    if 'HIvalidate' in steps:
        from diagnostic_figures import tree_additivity_symmetry, fmts

        print("--- generating validation figures " + time.strftime("%H:%M:%S") + " ---")
        n_branch = np.sum([n.dHI > 1e-3 for n in self.tree.postorder_node_iter()])
        n_mut = np.sum([val > 1e-3 for val in self.mutation_effects.values()])
        # "%s %s" reproduces the separator of the former two-item print statement.
        print("%s %s" % ("-- number of non-zero branch parameters: ", n_branch))
        print("%s %s" % ("-- number of non-zero mutation parameters: ", n_mut))

        for model in ['tree', 'mutation']:
            try:
                tree_additivity_symmetry(self, model)
                for fmt in fmts:
                    plt.savefig(self.htmlpath() + 'HI_symmetry_' + model + fmt)
            except Exception as err:
                print("Can't generate symmetry/additivity figures")
                print("  " + repr(err))

        try:
            # NOTE(review): slope_vs_mutation is not imported in this block;
            # it must come from module scope, otherwise the NameError is what
            # gets reported here -- confirm.
            self.slopes_muts = slope_vs_mutation(self)
        except Exception as err:
            print("Couldn't derive slopes, probably to small time interval")
            print("  " + repr(err))

        self.generate_validation_figures(method)