def export_sequence_json(T, path, prefix):
    '''
    write the sequences of all tree nodes, encoded relative to the root, to file

    For every (gene, alignment file) pair returned by
    get_genes_and_alignments(path, tree=True) the root sequence is stored in
    full as a string under elems['root'][gene]. Each node is stored under
    its `clade` attribute, either as a {position: state} dict listing the
    0-based positions at which it differs from the root, or -- when more
    than `plain_export` of its positions differ -- as the full sequence
    string. The result is written with write_json to
    sequence_json(path, prefix).

    Parameters
    ----------
    T : tree providing find_clades() and root.name; every node needs `name`
        and `clade` attributes (presumably a Bio.Phylo tree -- confirm
        against the caller)
    path : passed to get_genes_and_alignments and sequence_json
    prefix : passed to sequence_json

    Raises
    ------
    KeyError if an alignment lacks a record named like a tree node.
    '''
    from Bio import SeqIO

    # maximal fraction of differing positions for which the sparse
    # {position: state} encoding is still used
    plain_export = 0.99
    indent = None

    elems = {'root': {}}
    for node in T.find_clades():
        elems[node.clade] = {}

    for gene, aln_fname in get_genes_and_alignments(path, tree=True):
        seqs = {record.name: record
                for record in SeqIO.parse(aln_fname, 'fasta')}

        root_str = "".join(seqs[T.root.name])
        elems['root'][gene] = root_str

        for node in T.find_clades():
            nseq = seqs[node.name]
            # NOTE(review): the loop that pre-fills `elems` already requires
            # every node to have `clade`, so this guard never fails here.
            if hasattr(node, "clade"):
                differences = {
                    pos: state
                    for pos, (state, ancstate) in enumerate(zip(nseq, root_str))
                    if state != ancstate
                }
                # Compare against and fall back to THIS node's sequence; the
                # previous code used the leftover loop variable of the parsing
                # loop, i.e. the last record of the alignment.
                if len(differences) <= plain_export * len(nseq):
                    elems[node.clade][gene] = differences
                else:
                    # plain string, like the root entry, so it can be
                    # serialized
                    elems[node.clade][gene] = "".join(nseq)

    fname = sequence_json(path, prefix)
    write_json(elems, fname, indent=indent)
def export_diversity(path, prefix, reference, indent=None):
    '''
    write the alignment entropy of each alignment (nucleotide and translations) to file

    For every (feature, alignment file) pair returned by
    get_genes_and_alignments(path, tree=False) the values returned by
    diversity_statistics are rounded to 4 decimals, clamped to be
    non-negative and stored together with position and codon indices.
    Features that are neither 'nuc' nor present in load_features(reference)
    are skipped. The result is written with write_json to
    diversity_json(path, prefix).

    Parameters
    ----------
    path : passed to get_genes_and_alignments and diversity_json
    prefix : passed to diversity_json
    reference : passed to load_features
    indent : passed to write_json
    '''
    genes = load_features(reference)
    entropy_json = {}
    for feat, aln_fname in get_genes_and_alignments(path, tree=False):
        entropy = diversity_statistics(aln_fname, nuc=feat == 'nuc')
        S = [max(0, round(x, 4)) for x in entropy]
        n = len(S)
        # index sequences are materialized as lists: a bare range object is
        # not JSON-serializable on Python 3 (on Python 2 list() is a no-op copy)
        if feat == 'nuc':
            entropy_json[feat] = {
                'pos': list(range(n)),
                'codon': [x // 3 for x in range(n)],
                'val': S,
            }
        elif feat in genes:
            entropy_json[feat] = {
                # every third item yielded by iterating the feature
                # (presumably the first nucleotide position of each codon --
                # confirm against load_features)
                'pos': list(genes[feat])[::3],
                'codon': list(range(n)),
                'val': S,
            }
    write_json(entropy_json, diversity_json(path, prefix), indent=indent)
seqs[seq.name] = seq muts = {} muts[T.root.name]='' for node in T.get_nonterminals(): pseq = seqs[node.name] for c in node: cseq = seqs[c.name] muts[c.name]=','.join([anc+str(pos+1)+der for pos, (anc, der) in enumerate(zip(pseq, cseq)) if anc!=der]) return muts if __name__ == '__main__': parser = generic_argparse("Assign amino acid mutations to the tree") args = parser.parse_args() path = args.path tree_meta = read_tree_meta_data(path) T = Phylo.read(tree_newick(path), 'newick') for gene, aln_fname in get_genes_and_alignments(path, tree=True): if gene!='nuc': muts = get_amino_acid_mutations(T, aln_fname) for node_name in tree_meta: tree_meta[node_name][gene+'_mutations'] = muts[node_name] write_tree_meta_data(path, tree_meta)
def _read_color_map(fname, cmaps):
    '''
    append the (name, color) pairs found in the tab-separated file `fname`
    to cmaps[trait]; lines that do not consist of exactly three
    tab-separated fields are skipped
    '''
    with open(fname, 'r') as cfile:
        for line in cfile:
            try:
                trait, name, color = line.strip().split('\t')
            except ValueError:
                # wrong number of fields (blank line, comment, ...)
                continue
            cmaps[trait].append((name, color))


def export_metadata_json(T, path, prefix, reference, isvcf=False, indent=1):
    '''
    assemble the meta data dictionary and write it to file

    The dictionary holds the number of terminals, today's date, placeholder
    author info, color options (genotype, date and every trait found in the
    color map files), panels, title, maintainer, the lat/long definitions of
    every region/country/division found on a tree node, filters and the gene
    annotations. It is written with write_json to meta_json(path, prefix).

    Parameters
    ----------
    T : tree providing count_terminals() and find_clades(); every node needs
        an `attr` mapping
    path : passed to the file name helpers and get_genes_and_alignments
    prefix : passed to meta_json
    reference : passed to load_features
    isvcf : if True, add a 'nuc' annotation spanning the first record of
        the fasta file at ref_fasta(path)
    indent : passed to write_json
    '''
    print("Writing out metaprocess")
    mjson = {}

    mjson["virus_count"] = T.count_terminals()
    from datetime import date
    mjson["updated"] = date.today().strftime('%Y-%m-%d')
    mjson["author_info"] = {
        "?": {
            "paper_url": "?",
            "journal": "?",
            "title": "?",
            "n": 1
        }
    }
    mjson["seq_author_map"] = {}

    from collections import defaultdict
    cmaps = defaultdict(list)
    _read_color_map(color_maps(path), cmaps)

    #if drug-resistance colours have been auto-generated, get these too
    import os.path
    drm_fname = drm_color_maps(path)
    if os.path.isfile(drm_fname):
        _read_color_map(drm_fname, cmaps)

    mjson["color_options"] = {
        "gt": {
            "menuItem": "genotype",
            "type": "discrete",
            "legendTitle": "Genotype",
            "key": "genotype"
        },
        "num_date": {
            "menuItem": "date",
            "type": "continuous",
            "legendTitle": "Sampling date",
            "key": "num_date"
        }
    }
    for trait in cmaps:
        mjson["color_options"][trait] = {
            "menuItem": trait,
            "type": "discrete",
            "color_map": cmaps[trait],
            "legendTitle": trait,
            "key": trait
        }

    mjson["panels"] = ["tree", "map", "entropy"]
    mjson["title"] = "NextTB"
    mjson["maintainer"] = "Emma Hodcroft"

    mjson["geo"] = {}
    lat_long_defs = load_lat_long_defs()
    for geo_trait in ['region', "country", 'division']:
        places = mjson["geo"][geo_trait] = {}
        for n in T.find_clades():
            if geo_trait in n.attr:
                place = n.attr[geo_trait]
                # only places with known coordinates are exported
                if place not in places and place in lat_long_defs:
                    places[place] = lat_long_defs[place]

    mjson["commit"] = "unknown"
    mjson["filters"] = ["country", "region", "division"]

    genes = load_features(reference)
    anno = {}
    for feat, aln_fname in get_genes_and_alignments(path, tree=False):
        if feat in genes:
            location = genes[feat].location
            anno[feat] = {
                "start": int(location.start),
                "end": int(location.end),
                "strand": location.strand
            }

    if isvcf:
        #if vcf, there is no 'gene' called 'nuc' that will be read in
        #above, so manually include it here.
        from filenames import ref_fasta
        from Bio import SeqIO
        # builtin next() works on Python 2.6+ and 3; the iterator's .next()
        # method exists on Python 2 only
        refSeq = next(SeqIO.parse(ref_fasta(path), format='fasta'))
        anno['nuc'] = {"start": 1, "end": len(refSeq.seq), "strand": 1}

    mjson["annotations"] = anno
    write_json(mjson, meta_json(path, prefix), indent=indent)