Example #1
	def export_HI_mutation_effects(self):
		from io_util import write_json, read_json
		# make a tab delimited file with the mutation effects
		table_effects = []
		HI_mutation_effects_fname = self.output_path+self.prefix+self.resolution_prefix+'HI_mutation_effects.tsv'
		with open(HI_mutation_effects_fname, 'w') as ofile:
			for mut, val in self.mutation_effects.iteritems():
				mut_str = '/'.join([x[1] for x in self.mutation_clusters[mut]])
				ofile.write(mut_str+'\t'+str(np.round(val,4))+'\t'+str(self.mutation_counter[mut])+'\n')
				if val>0.001:
					table_effects.append((mut_str,round(val,2)))
		# export mutation effects to JSON
		try:  # effects are added to a larger JSON for different lineages and resolutions
			display_effects = read_json(self.auspice_HI_display_mutations)
		except:  # if the file doesn't exist yet, create an empty dictionary
			display_effects = {}

		# effects for use in the JS are indexed by the first mutation in the cluster
		model_effects = {mut[0]+':'+mut[1]:val for mut, val in
						 self.mutation_effects.iteritems() if val>0.01}
		write_json(model_effects, self.auspice_HI_fname)

		if self.virus_type not in display_effects: display_effects[self.virus_type]={}
		table_effects.sort(key = lambda x:x[1], reverse=True)
		display_effects[self.virus_type][self.resolution] = table_effects
		write_json(display_effects, self.auspice_HI_display_mutations)
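Note: every example on this page imports write_json (and often read_json) from io_util, which is not shown here. A minimal sketch of what such helpers could look like, assuming they are thin wrappers around the standard json module (the real module may differ):

    import json

    def write_json(data, file_name, indent=1):
        # serialize `data` to `file_name`; indent=None produces a compact single-line file
        with open(file_name, 'w') as handle:
            json.dump(data, handle, indent=indent)

    def read_json(file_name):
        # load and return the parsed JSON content of `file_name`
        with open(file_name) as handle:
            return json.load(handle)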
Example #2
    def export(self, path = '', extra_attr = ['aa_muts']):
        from Bio import Seq
        from itertools import izip
        timetree_fname = path+'tree.json'
        sequence_fname = path+'sequences.json'
        tree_json = tree_to_json(self.tree.root, extra_attr=extra_attr)
        write_json(tree_json, timetree_fname, indent=None)
        elems = {}
        elems['root'] = {}
        elems['root']['nuc'] = "".join(self.tree.root.sequence)
        for prot in self.proteins:
            tmp = str(self.proteins[prot].extract(Seq.Seq(elems['root']['nuc'])))
            #elems['root'][prot] = str(Seq.translate(tmp.replace('---', 'NNN'))).replace('X','-')
            elems['root'][prot] = str(Seq.translate(tmp.replace('-', 'N'))).replace('X','-')


        for node in self.tree.find_clades():
            if hasattr(node, "clade") and hasattr(node, "sequence"):
                elems[node.clade] = {}
                elems[node.clade]['nuc'] = {pos:state for pos, (state, ancstate) in
                                enumerate(izip(node.sequence, self.tree.root.sequence)) if state!=ancstate}
        for node in self.tree.find_clades():
            if hasattr(node, "clade") and hasattr(node, "translations"):
                for prot in self.proteins:
                    elems[node.clade][prot] = {pos:state for pos, (state, ancstate) in
                                    enumerate(izip(node.translations[prot], elems['root'][prot])) if state!=ancstate}

        write_json(elems, sequence_fname, indent=None)
Example #3
    def load_viruses(self, aln_fname=None, years_back=3, viruses_per_month=50):
        if config['virus']:
            from H3N2_filter import H3N2_filter as virus_filter
            fasta_fields = config['fasta_fields']
            if 'force_include' in config and os.path.isfile(
                    config['force_include']):
                with open(config['force_include']) as force_include_file:
                    force_include_strains = [
                        line.strip() for line in force_include_file
                    ]
                print "found ", len(
                    force_include_strains), "strains to include"
            else:
                force_include_strains = []
        else:
            from virus_filter import virus_filter as virus_filter
            fasta_fields = {0: 'strain'}
            force_include_strains = []  # avoid a NameError when no forced strains are configured
        if aln_fname is None: aln_fname = config['alignment_file']

        my_filter = virus_filter(aln_fname, fasta_fields)
        my_filter.filter()
        my_filter.subsample(years_back,
                            viruses_per_month,
                            prioritize=force_include_strains,
                            all_priority=True,
                            region_specific=config['max_global'])

        self.viruses = my_filter.virus_subsample
        write_json(self.viruses, self.initial_virus_fname)
Example #4
	def estimate_frequencies(self, tasks = ['mutations','genotypes', 'clades', 'tree']):
		import bernoulli_frequency as freq_est
		plot=False
		freq_est.flu_stiffness = config['frequency_stiffness']
		freq_est.time_interval = config['time_interval']
		freq_est.pivots_per_year = config['pivots_per_year']
		freq_est.relevant_pos_cutoff = 0.1

		if 'mutations' in tasks or 'genotypes' in tasks:
			self.frequencies['mutations'], relevant_pos = freq_est.all_mutations(self.tree, config['aggregate_regions'], 
														threshold = config['min_mutation_count'], plot=plot)
		if 'genotypes' in tasks:
			self.frequencies['genotypes'] = freq_est.all_genotypes(self.tree, config['aggregate_regions'], relevant_pos)
			
		#if 'specieshost' in tasks:
			#self.frequencies['specieshost'] = freq_est.all_genotypes(self.tree, config['aggregate_hosts'], relevant_pos)
			
		if 'clades' in tasks:
			self.frequencies['clades'] = freq_est.all_clades(self.tree, config['clade_designations'], 
															config['aggregate_regions'], plot)
		if any(x in tasks for x in ['mutations','clades', 'genotypes']):
			write_json(self.frequencies, self.frequency_fname)

		if 'tree' in tasks:
			for region_label, regions in config['aggregate_regions']:
				print "--- "+"adding frequencies to tree "+region_label+ " "  + time.strftime("%H:%M:%S") + " ---"
				freq_est.estimate_tree_frequencies(self.tree, threshold = 10, regions=regions, region_name=region_label)
Example #5
    def export_HI_mutation_effects(self):
        from io_util import write_json, read_json
        # make a tab delimited file with the mutation effects
        table_effects = []
        HI_mutation_effects_fname = self.output_path + self.prefix + self.resolution_prefix + 'HI_mutation_effects.tsv'
        with open(HI_mutation_effects_fname, 'w') as ofile:
            for mut, val in self.mutation_effects.iteritems():
                mut_str = '/'.join([x[1] for x in self.mutation_clusters[mut]])
                ofile.write(mut_str + '\t' + str(np.round(val, 4)) + '\t' +
                            str(self.mutation_counter[mut]) + '\n')
                if val > 0.001:
                    table_effects.append((mut_str, round(val, 2)))
        # export mutation effects to JSON
        try:  # effects are added to a larger JSON for different lineages and resolutions
            display_effects = read_json(self.auspice_HI_display_mutations)
        except:  # if the file doesn't exist yet, create an empty dictionary
            display_effects = {}

        # effects for use in the JS are indexed by the first mutation in the cluster
        model_effects = {
            mut[0] + ':' + mut[1]: val
            for mut, val in self.mutation_effects.iteritems() if val > 0.01
        }
        write_json(model_effects, self.auspice_HI_fname)

        if self.virus_type not in display_effects:
            display_effects[self.virus_type] = {}
        table_effects.sort(key=lambda x: x[1], reverse=True)
        display_effects[self.virus_type][self.resolution] = table_effects
        write_json(display_effects, self.auspice_HI_display_mutations)
Example #6
 def export_diversity(self, fname='entropy.json', indent=None):
     '''
     write the alignment entropy of each alignment (nucleotide and translations) to file
     '''
     if not hasattr(self, "entropy"):
         self.diversity_statistics()
     entropy_json = {}
     for feat in self.entropy:
         S = [max(0, round(x, 4)) for x in self.entropy[feat]]
         n = len(S)
         if feat == 'nuc':
             entropy_json[feat] = {
                 'pos': range(0, n),
                 'codon': [x // 3 for x in range(0, n)],
                 'val': S
             }
         else:
             entropy_json[feat] = {
                 'pos': [x for x in self.proteins[feat]][::3],
                 'codon': [(x - self.proteins[feat].start) // 3
                           for x in self.proteins[feat]][::3],
                 'val':
                 S
             }
     write_json(entropy_json, fname, indent=indent)
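For orientation, the entropy export above writes, per feature, three parallel lists: the alignment positions, the codon each position belongs to, and the rounded entropy values. A toy illustration of the resulting structure for a hypothetical six-nucleotide feature (values invented):

    entropy_json = {
        'nuc': {
            'pos':   [0, 1, 2, 3, 4, 5],                   # alignment positions, range(n)
            'codon': [0, 0, 0, 1, 1, 1],                   # position // 3
            'val':   [0.0, 0.12, 0.0, 0.3401, 0.0, 0.05],  # per-site entropy, rounded to 4 digits
        }
    }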
Example #7
    def export(self,
               path='',
               extra_attr=['aa_muts', 'clade'],
               plain_export=10,
               indent=None,
               write_seqs_json=True):
        '''
        export the tree data structure along with the sequence information as
        json files for display in web browsers.
        parameters:
            path    -- path (incl prefix) to which the output files are written.
                       filenames themselves are standardized to *tree.json and *sequences.json
            extra_attr -- attributes of tree nodes that are exported to json
            plain_export -- store sequences as plain strings instead of
                            differences to root if the number of differences exceeds
                            len(seq)/plain_export
        '''
        from Bio import Seq
        from itertools import izip
        timetree_fname = path + '_tree.json'
        sequence_fname = path + '_sequences.json'
        tree_json = tree_to_json(self.tree.root, extra_attr=extra_attr)
        write_json(tree_json, timetree_fname, indent=indent)

        # prepare a json with sequence information to export.
        # first step: add the sequence & translations of the root as string
        elems = {}
        elems['root'] = {}
        elems['root']['nuc'] = "".join(self.tree.root.sequence)
        for prot, seq in self.tree.root.translations.iteritems():
            elems['root'][prot] = seq

        # add sequence for every node in tree. code as difference to root
        # or as full strings.
        for node in self.tree.find_clades():
            if hasattr(node, "clade"):
                elems[node.clade] = {}
                # loop over proteins and nucleotide sequences
                for prot, seq in [('nuc', "".join(node.sequence))
                                  ] + node.translations.items():
                    differences = {
                        pos: state
                        for pos, (state, ancstate) in enumerate(
                            izip(seq, elems['root'][prot]))
                        if state != ancstate
                    }
                    if plain_export * len(differences) <= len(seq):
                        elems[node.clade][prot] = differences
                    else:
                        elems[node.clade][prot] = seq
        if write_seqs_json:
            write_json(elems, sequence_fname, indent=indent)
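As the docstring above explains, each node's sequences are stored either as a sparse dict of differences to the root or as a plain string, depending on the plain_export threshold. A consumer of the resulting sequences JSON can rebuild any node's full sequence; a minimal sketch (the helper name reconstruct_sequence is hypothetical):

    def reconstruct_sequence(elems, clade, prot='nuc'):
        # `elems` is the parsed sequences JSON; entries are either full strings or
        # dicts mapping position -> derived state relative to the root sequence
        entry = elems[clade][prot]
        if isinstance(entry, dict):
            seq = list(elems['root'][prot])
            for pos, state in entry.items():
                seq[int(pos)] = state  # JSON round-trips dict keys as strings
            return "".join(seq)
        return entry  # already stored as a plain string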
Example #8
 def export_diversity(self, fname = 'entropy.json'):
     if not hasattr(self, "entropy"):
         self.diversity_statistics()
     entropy_json = {}
     for feat in self.entropy:
         S = [max(0,round(x,4)) for x in self.entropy[feat]]
         n = len(S)
         if feat=='nuc':
             entropy_json[feat] = {'pos':range(0,n), 'codon':[x//3 for x in range(0,n)], 'val':S}
         else:
             entropy_json[feat] = {'pos':[x for x in self.proteins[feat]][::3],
                                   'codon':[(x-self.proteins[feat].start)//3 for x in self.proteins[feat]][::3], 'val':S}
     write_json(entropy_json, fname, indent=None)
Example #9
    def to_json(self, filename):
        """Export fitness model parameters, data, and accuracy statistics to JSON.
        """
        # Convert predictor parameters to a data frame to easily export as
        # records.
        params_df = pd.DataFrame({
            "predictor": self.predictors,
            "param": self.model_params.tolist(),
            "global_sd": self.global_sds.tolist()
        })

        correlation_null, correlation_raw, correlation_rel = self.get_correlation(
        )
        mcc = get_matthews_correlation_coefficient_for_data_frame(
            self.pred_vs_true_df)

        # Do not try to export titer data if it was provided to the model.
        predictor_kwargs = self.predictor_kwargs.copy()
        if "transform" in predictor_kwargs:
            predictor_kwargs["transform"] = str(predictor_kwargs["transform"])

        if "titers" in predictor_kwargs:
            del predictor_kwargs["titers"]

        data = {
            "params": params_df.to_dict(orient="records"),
            "predictor_kwargs": predictor_kwargs,
            "data": self.pred_vs_true_df.to_dict(orient="records"),
            "accuracy": {
                "clade_error": self.clade_fit(self.model_params),
                "correlation_rel": correlation_rel[0],
                "mcc": mcc
            },
            "delta_time": self.delta_time,
            "step_size": self.timepoint_step_size,
            "end_date": self.end_date
        }

        predictor_arrays = {}
        for key in self.predictor_arrays:
            predictor_arrays[key] = self.predictor_arrays[key].tolist()

        data["predictor_arrays"] = predictor_arrays

        freq_arrays = {}
        for key in self.freq_arrays:
            freq_arrays[key] = self.freq_arrays[key].tolist()

        data["freq_arrays"] = freq_arrays

        write_json(data, filename)
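Because the parameter table and the prediction-vs-truth table are exported as lists of records, the file can be loaded straight back into pandas. A small sketch of a consumer (the filename is illustrative):

    import json
    import pandas as pd

    with open("fitness_model.json") as handle:  # illustrative filename
        model = json.load(handle)

    params_df = pd.DataFrame(model["params"])   # columns: predictor, param, global_sd
    pred_vs_true = pd.DataFrame(model["data"])  # the pred_vs_true records exported above
    print(params_df.sort_values("param", ascending=False))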
Example #10
def main():

    print "--- Tree LBI at " + time.strftime("%H:%M:%S") + " ---"

    tree = json_to_dendropy(read_json('data/tree_refine.json'))

    print "calculate local branching index"
    T2 = get_average_T2(tree, 365)
    tau = T2 * 2**-4
    print "avg pairwise distance:", T2
    print "memory time scale:", tau
    calc_LBI(tree, tau=tau)

    write_json(dendropy_to_json(tree.seed_node), "data/tree_LBI.json")
Example #11
    def to_json(self, filename):
        """Export fitness model parameters, data, and accuracy statistics to JSON.
        """
        # Convert predictor parameters to a data frame to easily export as
        # records.
        params_df = pd.DataFrame({
            "predictor": self.predictors,
            "param": self.model_params.tolist(),
            "global_sd": self.global_sds.tolist()
        })

        correlation_null, correlation_raw, correlation_rel = self.get_correlation()
        mcc = get_matthews_correlation_coefficient_for_data_frame(self.pred_vs_true_df)

        # Do not try to export titer data if it was provided to the model.
        predictor_kwargs = self.predictor_kwargs.copy()
        if "transform" in predictor_kwargs:
            predictor_kwargs["transform"] = str(predictor_kwargs["transform"])

        if "titers" in predictor_kwargs:
            del predictor_kwargs["titers"]

        data = {
            "params": params_df.to_dict(orient="records"),
            "predictor_kwargs": predictor_kwargs,
            "data": self.pred_vs_true_df.to_dict(orient="records"),
            "accuracy": {
                "clade_error": self.clade_fit(self.model_params),
                "correlation_rel": correlation_rel[0],
                "mcc": mcc
            },
            "delta_time": self.delta_time,
            "step_size": self.timepoint_step_size,
            "end_date": self.end_date
        }

        predictor_arrays = {}
        for key in self.predictor_arrays:
            predictor_arrays[key] = self.predictor_arrays[key].tolist()

        data["predictor_arrays"] = predictor_arrays

        freq_arrays = {}
        for key in self.freq_arrays:
            freq_arrays[key] = self.freq_arrays[key].tolist()

        data["freq_arrays"] = freq_arrays

        write_json(data, filename)
Example #12
def main(params):
    import time
    from io_util import read_json
    from io_util import write_json
    from tree_util import json_to_dendropy, dendropy_to_json

    print "--- Start fitness model optimization at " + time.strftime("%H:%M:%S") + " ---"

    tree_fname = "data/tree_refine.json"
    tree = json_to_dendropy(read_json(tree_fname))
    fm = fitness_model(tree, predictors=params["predictors"], verbose=1)
    fm.predict(niter=params["niter"])
    out_fname = "data/tree_fitness.json"
    write_json(dendropy_to_json(tree.seed_node), out_fname)
    return out_fname
Example #13
def main(params):
	import time
	from io_util import read_json
	from io_util import write_json	
	from tree_util import json_to_dendropy, dendropy_to_json
	
	print "--- Start fitness model optimization at " + time.strftime("%H:%M:%S") + " ---"

	tree_fname='data/tree_refine.json'
	tree =  json_to_dendropy(read_json(tree_fname))
	fm = fitness_model(tree, predictors = params['predictors'], verbose=1)
	fm.predict(niter = params['niter'])
	out_fname = "tree_fitness.json"
	write_json(dendropy_to_json(tree.seed_node), out_fname)
	return out_fname
Example #14
def main(in_fname='tree_refine.json', tree=True):

	print "--- Mutational tolerance at " + time.strftime("%H:%M:%S") + " ---"
	viruses = read_json(in_fname)
	if tree:
		viruses = json_to_dendropy(viruses)

	assign_fitness(viruses)

	if tree:
		out_fname = "tree_tolerance.json"
		write_json(dendropy_to_json(viruses.seed_node), out_fname)
	else:
		out_fname = "virus_tolerance.json"
		write_json(viruses, out_fname)
	return out_fname, viruses
Example #15
def main(in_fname='data/tree_refine.json', tree=True):

	print "--- Mutational tolerance at " + time.strftime("%H:%M:%S") + " ---"
	viruses = read_json(in_fname)
	if tree:
		viruses = json_to_dendropy(viruses)

	assign_fitness(viruses)

	if tree:
		out_fname = "data/tree_tolerance.json"
		write_json(dendropy_to_json(viruses.seed_node), out_fname)
	else:
		out_fname = "data/virus_tolerance.json"
		write_json(viruses, out_fname)
	return out_fname, viruses
Example #16
def main(tree_fname = 'data/tree_refine.json'):

	print "--- Testing predictor evaluations ---"
	tree =  json_to_dendropy(read_json(tree_fname))

	print "Calculating epitope distances"
	calc_epitope_distance(tree)

	print "Calculating nonepitope distances"
	calc_nonepitope_distance(tree)

	print "Calculating LBI"
#	calc_LBI(tree)

	print "Writing decorated tree"
	out_fname = "data/tree_predictors.json"
	write_json(dendropy_to_json(tree.seed_node), out_fname)
	return out_fname
Example #17
 def export_diversity(self, fname='entropy.json'):
     if not hasattr(self, "entropy"):
         self.diversity_statistics()
     entropy_json = {}
     for feat in self.entropy:
         S = [max(0, round(x, 4)) for x in self.entropy[feat]]
         n = len(S)
         if feat == 'nuc':
             entropy_json[feat] = {
                 'pos': range(0, n),
                 'codon': [x // 3 for x in range(0, n)],
                 'val': S
             }
         else:
             entropy_json[feat] = {
                 'pos': [x for x in self.proteins[feat]][::3],
                 'codon': [(x - self.proteins[feat].start) // 3
                           for x in self.proteins[feat]][::3],
                 'val':
                 S
             }
     write_json(entropy_json, fname, indent=None)
Example #18
	def load_viruses(self, aln_fname = None, years_back=3, viruses_per_month=50):
		if config['virus']:
			from H9_filter import H9_filter as virus_filter
			fasta_fields = config['fasta_fields']
			if 'force_include' in config and os.path.isfile(config['force_include']):
				with open(config['force_include']) as force_include_file:
					force_include_strains = [line.strip() for line in force_include_file]
				print "found ",len(force_include_strains),"strains to include"
			else:
				force_include_strains = []
		else:
			from virus_filter import virus_filter as virus_filter
			fasta_fields = {0:'strain'}
			force_include_strains = []  # avoid a NameError when no forced strains are configured
		if aln_fname is None: aln_fname = config['alignment_file']

		my_filter = virus_filter(aln_fname, fasta_fields)
		my_filter.filter()
		my_filter.subsample(years_back, viruses_per_month, prioritize = force_include_strains, 
								all_priority = True, region_specific=config['max_global'])

		self.viruses = my_filter.virus_subsample
		write_json(self.viruses, self.initial_virus_fname)
Example #19
    def export_to_auspice(self,
                          tree_fields=[],
                          tree_pop_list=[],
                          annotations=[],
                          seq='aa'):
        from tree_util import dendropy_to_json, all_descendants
        from io_util import write_json, read_json
        print "--- Streamline at " + time.strftime("%H:%M:%S") + " ---"
        # Move sequence data to separate file
        print "Writing sequences"
        elems = {}
        for node in self.tree:
            if hasattr(node, "clade") and hasattr(node, "seq"):
                elems[node.clade] = {}
                elems[node.clade]['nuc'] = {
                    pos: state
                    for pos, (state, ancstate) in enumerate(
                        izip(node.seq, self.tree.seed_node.seq))
                    if state != ancstate
                }
                for anno, aa_seq in node.aa_seq.iteritems():
                    elems[node.clade][anno] = {
                        pos: state
                        for pos, (state, ancstate) in enumerate(
                            izip(aa_seq, self.tree.seed_node.aa_seq[anno]))
                        if state != ancstate
                    }

        elems['root'] = {}
        elems['root']['nuc'] = self.tree.seed_node.seq
        for anno, aa_seq in self.tree.seed_node.aa_seq.iteritems():
            elems['root'][anno] = aa_seq
        write_json(elems, self.auspice_sequences_fname, indent=None)

        print "Writing tree"
        self.tree_json = dendropy_to_json(self.tree.seed_node, tree_fields)
        for node in all_descendants(self.tree_json):
            for attr in tree_pop_list:
                if attr in node:
                    node.pop(attr, None)
            if "freq" in node:
                for reg in node["freq"]:
                    try:
                        node["freq"][reg] = [
                            round(x, 3) for x in node["freq"][reg]
                        ]
                    except:
                        node["freq"][reg] = "undefined"

        if hasattr(self, "clade_designations"):
            # find basal node of clade and assign clade x and y values based on this basal node
            clade_present = {}
            clade_xval = {}
            clade_yval = {}
            if hasattr(self.tree.seed_node, "freq"):
                self.frequencies['clades'] = {
                    reg: {
                        "pivots": list(self.tree.seed_node.pivots)
                    }
                    for reg in self.tree.seed_node.freq
                }

            for clade, gt in self.clade_designations.iteritems():
                if clade in annotations:
                    print "Annotating clade", clade
                    tmp_nodes = sorted(
                        (node for node in self.tree.postorder_node_iter()
                         if not node.is_leaf() and all([
                             node.aa_seq[gene][pos - 1] == aa
                             for gene, pos, aa in gt
                         ])),
                        key=lambda node: node.xvalue)
                    if len(tmp_nodes):
                        clade_present[clade] = True
                        base_node = tmp_nodes[0]
                        clade_xval[clade] = base_node.xvalue
                        clade_yval[clade] = base_node.yvalue
                        if hasattr(base_node, 'freq'):
                            for region in base_node.freq:
                                try:
                                    self.frequencies["clades"][region][
                                        clade.lower()] = [
                                            round(x, 3)
                                            for x in base_node.freq[region]
                                        ]
                                    print "added frequencies", region, clade
                                except:
                                    print base_node.freq[region]
                    else:
                        clade_present[clade] = False
                        print "clade", clade, gt, "not in tree"
            # append clades, coordinates and genotype to meta
            self.tree_json["clade_annotations"] = [
                (clade, clade_xval[clade], clade_yval[clade],
                 "/".join([gene + ':' + str(pos) + aa
                           for gene, pos, aa in gt]))
                for clade, gt in self.clade_designations.iteritems()
                if clade in annotations and clade_present[clade] == True
            ]
        write_json(self.tree_json, self.auspice_tree_fname, indent=None)
        try:
            read_json(self.auspice_tree_fname)
        except:
            print "Read failed, rewriting with indents"
            write_json(self.tree_json, self.auspice_tree_fname, indent=1)

        # Write out frequencies
        if hasattr(self, 'frequencies'):
            if not hasattr(self, 'aa_entropy') and not hasattr(
                    self, 'nuc_entropy'):
                self.determine_variable_positions()

            if hasattr(self, 'aa_entropy'):
                self.frequencies["entropy"] = {}
                self.frequencies["location"] = {}
                for anno, alnS in self.aa_entropy.iteritems():
                    self.frequencies["location"][anno] = [int(self.cds[anno].location.start),\
                              int(self.cds[anno].location.start)]
                    self.frequencies["entropy"][anno] = [[
                        pos, S, muts
                    ] for pos, S, muts in izip(
                        xrange(alnS.shape[0]), alnS,
                        self.variable_aa_identities[anno])]
            elif seq == 'nuc' and hasattr(self, 'nuc_entropy'):
                self.frequencies["entropy"] = [
                    [pos, S, muts] for pos, S, muts in izip(
                        xrange(self.nuc_entropy.shape[0]), self.nuc_entropy,
                        self.variable_nuc_identities)
                ]

            write_json(self.frequencies, self.auspice_frequency_fname)

        # Write out metadata
        print "Writing out metadata"
        meta = {}
        meta["updated"] = time.strftime("X%d %b %Y").replace('X0',
                                                             'X').replace(
                                                                 'X', '')
        try:
            from pygit2 import Repository, discover_repository
            current_working_directory = os.getcwd()
            repository_path = discover_repository(current_working_directory)
            repo = Repository(repository_path)
            commit_id = repo[repo.head.target].id
            meta["commit"] = str(commit_id)
        except ImportError:
            meta["commit"] = "unknown"

        if hasattr(self, "date_region_count"):
            meta["regions"] = self.regions
            meta["virus_stats"] = [
                [str(y) + '-' + str(m)] +
                [self.date_region_count[(y, m)][reg] for reg in self.regions]
                for y, m in sorted(self.date_region_count.keys())
            ]
        write_json(meta, self.auspice_meta_fname, indent=None)
        self.export_accession_numbers()
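For reference, the metadata file assembled in the last step has roughly this shape (illustrative values only; region names and counts are invented):

    meta = {
        "updated": "2 Mar 2016",              # strftime result with the leading zero stripped
        "commit": "unknown",                  # current git commit id, or "unknown" without pygit2
        "regions": ["NA", "EU", "AS", "OC"],  # only present if date_region_count exists
        "virus_stats": [
            ["2015-1", 12, 8, 30, 5],         # per year-month: counts per region, same order as "regions"
            ["2015-2", 10, 11, 25, 7],
        ],
    }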
Example #20
	def export_to_auspice(self, tree_fields = [], tree_pop_list = [], annotations = [], seq='aa'):
		from tree_util import dendropy_to_json, all_descendants
		from io_util import write_json, read_json
		print "--- Streamline at " + time.strftime("%H:%M:%S") + " ---"
		# Move sequence data to separate file
		print "Writing sequences"
		elems = {}
		for node in self.tree:
			if hasattr(node, "clade") and hasattr(node, "seq"):
				elems[node.clade] = {}
				elems[node.clade]['nuc'] = {pos:state for pos, (state, ancstate) in 
								enumerate(izip(node.seq, self.tree.seed_node.seq)) if state!=ancstate}
				for anno, aa_seq in node.aa_seq.iteritems():
					elems[node.clade][anno] = {pos:state for pos, (state, ancstate) in 
								enumerate(izip(aa_seq, self.tree.seed_node.aa_seq[anno])) if state!=ancstate}

		elems['root'] = {}
		elems['root']['nuc'] = self.tree.seed_node.seq
		for anno, aa_seq in self.tree.seed_node.aa_seq.iteritems():
			elems['root'][anno] = aa_seq
		write_json(elems, self.auspice_sequences_fname, indent=None)

		print "Writing tree"
		self.tree_json = dendropy_to_json(self.tree.seed_node, tree_fields)
		for node in all_descendants(self.tree_json):
			for attr in tree_pop_list:
				if attr in node:
					node.pop(attr, None)
			if "freq" in node:
				for reg in node["freq"]:
					try:
						node["freq"][reg] = [round(x,3) for x in node["freq"][reg]]
					except:
						node["freq"][reg] = "undefined"				

		if hasattr(self,"clade_designations"):
			# find basal node of clade and assign clade x and y values based on this basal node
			clade_present = {}
			clade_xval = {}
			clade_yval = {}
			self.frequencies['clades'] = {reg:{"pivots":list(self.tree.seed_node.pivots)} 
											for reg in self.tree.seed_node.freq}

			for clade, gt in self.clade_designations.iteritems():
				if clade in annotations:
					print "Annotating clade", clade
					tmp_nodes = sorted((node for node in self.tree.postorder_node_iter()
						if not node.is_leaf() and all([node.aa_seq[gene][pos-1]==aa for gene, pos, aa in gt])),
						key=lambda node: node.xvalue)
					if len(tmp_nodes):
						clade_present[clade] = True
						base_node = tmp_nodes[0]
						clade_xval[clade] = base_node.xvalue
						clade_yval[clade] = base_node.yvalue
						for region in base_node.freq:
							try:
								self.frequencies["clades"][region][clade.lower()] = [round(x,3) for x in base_node.freq[region]]
								print "added frequencies",region, clade
							except:
								print base_node.freq[region]
					else:
						clade_present[clade] = False
						print "clade",clade, gt, "not in tree"
			# append clades, coordinates and genotype to meta
			self.tree_json["clade_annotations"] = [(clade, clade_xval[clade],clade_yval[clade], 
								"/".join([gene+':'+str(pos)+aa for gene, pos, aa in gt]))
							for clade, gt in self.clade_designations.iteritems() 
							if clade in annotations and clade_present[clade] == True]
		write_json(self.tree_json, self.auspice_tree_fname, indent=None)
		try:
			read_json(self.auspice_tree_fname)
		except:
			print "Read failed, rewriting with indents"	
			write_json(self.tree_json, self.auspice_tree_fname, indent=1)
			
		# Include genotype frequencies
		if hasattr(self, 'frequencies'):
			if not hasattr(self, 'aa_entropy') and not hasattr(self, 'nuc_entropy'):
				self.determine_variable_positions()

			if hasattr(self, 'aa_entropy'):
				self.frequencies["entropy"] = {}
				self.frequencies["location"] = {}
				for anno, alnS in self.aa_entropy.iteritems():
					self.frequencies["location"][anno] = [int(self.cds[anno].location.start),\
															int(self.cds[anno].location.start)]
					self.frequencies["entropy"][anno] = [ [pos, S, muts] for pos,S,muts in 
						izip(xrange(alnS.shape[0]), alnS,self.variable_aa_identities[anno]) ]
			elif seq=='nuc' and hasattr(self, 'nuc_entropy'):
				self.frequencies["entropy"] = [ [pos, S, muts] for pos,S,muts in 
						izip(xrange(self.nuc_entropy.shape[0]), self.nuc_entropy,self.variable_nuc_identities) ]

			write_json(self.frequencies, self.auspice_frequency_fname)
			print("WRITEEN")
		# Write out metadata
		print "Writing out metadata"		
		meta = {}
		meta["updated"] = time.strftime("X%d %b %Y").replace('X0','X').replace('X','')
		try:
			from pygit2 import Repository, discover_repository
			current_working_directory = os.getcwd()
			repository_path = discover_repository(current_working_directory)
			repo = Repository(repository_path)
			commit_id = repo[repo.head.target].id
			meta["commit"] = str(commit_id)
		except ImportError:
			meta["commit"] = "unknown"
		
		if hasattr(self,"date_region_count"):
			meta["regions"] = self.regions
			meta["virus_stats"] = [ [str(y)+'-'+str(m)] + [self.date_region_count[(y,m)][reg] for reg in self.regions]
									for y,m in sorted(self.date_region_count.keys()) ]
		write_json(meta, self.auspice_meta_fname, indent=0)
Example #21
	def refine_tree(self):
		import tree_refine
		tree_refine.main(self.tree, self.viruses, config['outgroup'], config['cds'])
		write_json(dendropy_to_json(self.tree.seed_node), self.intermediate_tree_fname)
Example #22
	def align(self):
		import virus_align
		self.viruses = virus_align.main(self.viruses)
		out_fname = 'virus_align.json'
		write_json(self.viruses, out_fname)
Example #23
	def clean_viruses(self):
		import virus_clean
		self.viruses = virus_clean.main(self.viruses)
		write_json(self.viruses, self.clean_virus_fname)