def main(newick: str, output_newick: str, samples: Iterable[Sample]):
    """Copy a Newick tree, swapping workflow-run-ID node names for sample names.

    Args:
        newick: path of the Newick file to read (only the first tree is used).
        output_newick: path the relabelled tree is written to.
        samples: mappings providing "workflow_run_id" and "sample_name" keys.

    Nodes whose name is not a known workflow run ID keep their name.
    """
    name_lookup = {}
    for sample in samples:
        name_lookup[str(sample["workflow_run_id"])] = sample["sample_name"]

    with open(newick) as source, open(output_newick, "w") as sink:
        tree = next(NewickIO.parse(source))
        for clade in tree.find_clades(order="level"):
            current = clade.name
            # Fall back to the existing name when there is no mapping for it.
            clade.name = name_lookup.get(current, current)
        NewickIO.write([tree], sink)
def nexus_text(obj, colour_branches, colours, **kwargs):
    """Build a nexus-format text representation of one or more trees.

    Allows for colouring tip names. Modified from
    http://biopython.org/DIST/docs/api/Bio.Phylo.NexusIO-pysrc.html

    Args:
        obj: a single tree-like object or an iterable of them.
        colour_branches: when truthy, taxon names are emitted unchanged
            (they are expected to carry colouring annotation already);
            otherwise each name is passed through ``colour_taxon``.
        colours: forwarded to ``colour_taxon``.
        **kwargs: forwarded to ``NewickIO.Writer.to_strings``.

    Returns:
        The nexus text as a string.

    NB an apparent Biopython bug adds an extra colon to confidence values
    in the tree strings; it is stripped from the final text here.
    """
    try:
        trees = list(obj)  # assume iterable
    except TypeError:
        trees = [obj]

    newick_strings = NewickIO.Writer(trees).to_strings(
        plain=False, plain_newick=True, **kwargs)
    nexus_trees = []
    for idx, nwk in enumerate(newick_strings):
        nexus_trees.append(TREE_TEMPLATE % {'index': idx + 1, 'tree': nwk})

    # Branch colouring means the names are already annotated; otherwise the
    # annotation has to be added to each tip label here.
    tip_names = [
        str(tip.name) for tip in chain(*(t.get_terminals() for t in trees))]
    if colour_branches:
        tax_labels = tip_names
    else:
        tax_labels = [colour_taxon(name, colours) for name in tip_names]

    text = NEX_TEMPLATE % {
        'count': len(tax_labels),
        'labels': ' '.join(tax_labels),  # taxlabels all on one line
        'trees': '\n'.join(nexus_trees),  # trees on separate lines
    }
    # Corrects for biopython bug. eg ":50.00:" -> "50.00:"
    return re.sub(r':([0-9]{1,3}\.[0-9]{1,3}):', r'\1:', text)
def to_nexus(self, filename):
    """Write the tree to ``filename`` in the Nexus dialect used by BayesTraitsV2.

    Bio.Phylo.NexusIO is deliberately not used because BayesTraitsV2
    requires a different dialect of Nexus format: a "Translate" block maps
    numbers to taxa so that numbers, not long taxon names, appear in the
    tree description.

    ``self.tree`` itself is left untouched; all renaming happens on a copy.
    """
    working_tree = copy.deepcopy(self.tree)

    # Number the terminal clades 1..n in traversal order.
    names_to_ints = {}
    for number, leaf in enumerate(working_tree.get_terminals(), start=1):
        names_to_ints[leaf.name] = number

    # Replace every terminal name by its number.
    for leaf in working_tree.get_terminals():
        leaf.name = str(names_to_ints[leaf.name])

    # Inner nodes carry no name in the output.
    for inner in working_tree.get_nonterminals():
        inner.name = None

    translate_entries = [
        '%d %s' % (number, taxon) for taxon, number in names_to_ints.items()
    ]
    newick_strings = NewickIO.Writer([working_tree]).to_strings(
        plain=False, plain_newick=True)
    nexus_tree = NEX_TEMPLATE % {
        'translate': ',\n'.join(translate_entries),
        'tree': next(newick_strings),
    }

    with open(filename, 'w') as handle:
        handle.write(nexus_tree)
def write_nexus_trees_to_bayestraits(nx, handle, **kwargs):
    """Write the trees of ``nx`` to ``handle`` as Nexus with a translate block.

    Modified from Bio.Phylo.NexusIO.write(): the translate block converts
    leaf names to integers.

    Args:
        nx: object exposing ``trees`` and a ``translate`` mapping.
        handle: writable text handle receiving the Nexus text.
        **kwargs: forwarded to ``NewickIO.Writer.to_strings``.

    Returns:
        The number of trees written.
    """
    trees = BioNexusTrees_to_BioPhylo(nx.trees)
    newick_strings = NewickIO.Writer(trees).to_strings(
        plain=False, plain_newick=True, **kwargs)

    nexus_trees = []
    for number, nwk in enumerate(newick_strings, start=1):
        nexus_trees.append(TREE_TEMPLATE % {"index": number, "tree": nwk})

    translate = []
    for id_name in nx.translate.items():
        translate.append("%d %s" % id_name)

    # Unused in my output format (BayesTraits) + why aren't they unique?
    tax_labels = []
    for nexus_tree in nx.trees:
        tax_labels.extend(nexus_tree.get_taxa())

    text = NEX_TEMPLATE % {
        "count": len(tax_labels),
        "labels": " ".join(tax_labels),
        "trees": "\n".join(nexus_trees),
        "translate": ",\n ".join(translate)
    }
    handle.write(text)
    return len(nexus_trees)
def write(obj, handle, **kwargs):
    """Write a new Nexus file containing the given trees.

    A simple Nexus template plus the NewickIO writer serialize just the
    trees and the minimal supporting info needed for a valid Nexus file.

    Args:
        obj: iterable of trees.
        handle: writable text handle receiving the Nexus text.
        **kwargs: forwarded to ``NewickIO.Writer.to_strings``.

    Returns:
        The number of trees written.
    """
    trees = list(obj)
    newick_strings = NewickIO.Writer(trees).to_strings(
        plain=False, plain_newick=True, **kwargs)

    nexus_trees = []
    for number, nwk in enumerate(newick_strings, start=1):
        nexus_trees.append(TREE_TEMPLATE % {'index': number, 'tree': nwk})

    # Taxon labels are the terminal names of every tree, in order.
    tax_labels = []
    for terminals in [t.get_terminals() for t in trees]:
        for tip in terminals:
            tax_labels.append(str(tip.name))

    text = NEX_TEMPLATE % {
        'count': len(tax_labels),
        'labels': ' '.join(tax_labels),
        'trees': '\n'.join(nexus_trees),
    }
    handle.write(text)
    return len(nexus_trees)
def get_fam(rfid):
    '''Get a family including tree and sequence information from an
    Rfam data dump stored in data/rfam

    inputs:
      rfid: rfam family id.

    outputs:
      ali: a biopython alignment (first alignment of the family FASTA file)
      tree: a biopython tree from a newick file (first tree of the seed tree).
      info: information parsed from the original stockholm file, as
            unpickled from the family metadata file.

    Every file is opened in a ``with`` block so the handles are closed
    deterministically, and the first record of each parser is fetched with
    the builtin ``next()``, which works on both Python 2.6+ and Python 3
    (the ``.next()`` method exists only on Python 2 iterators).
    '''
    meta_path = cfg.dataPath('rfam/family_metas/{0}.pickle'.format(rfid))
    ali_path = cfg.dataPath('rfam/family_alis/{0}.fa'.format(rfid))
    tree_path = cfg.dataPath('rfam/Rfam.seed_tree/{0}.seed_tree'.format(rfid))

    # Pickles are binary data: 'rb' is required on Python 3 and harmless on 2.
    # NOTE(review): pickle.load can execute arbitrary code; only use this
    # on metadata files produced by a trusted pipeline.
    with open(meta_path, 'rb') as fmeta:
        info = pickle.load(fmeta)

    with open(ali_path) as fali:
        ali = next(aio.parse(fali, 'fasta'))

    with open(tree_path) as ftree:
        tree = next(nio.parse(ftree))

    return ali, tree, info
def test_phylotree(self):
    """Run the phylotree workflow and check each of its output files.

    Verifies the set of output keys, the node names of the Newick tree,
    the sample pairs in the SKA distance table, the record IDs in the
    variants FASTA, the NCBI metadata JSON, and that every identifier
    occurs exactly twice in the clustermap SVG text.
    """
    sample_names = []
    for sample in self.common_inputs["samples"]:
        sample_names.append(sample["sample_name"])

    outputs = self.run_miniwdl()["outputs"]
    expected_keys = [
        "phylotree.clustermap_png",
        "phylotree.clustermap_svg",
        "phylotree.ncbi_metadata_json",
        "phylotree.phylotree_newick",
        "phylotree.ska_distances",
        "phylotree.variants",
    ]
    self.assertCountEqual(outputs.keys(), expected_keys)

    # Tree: every named node must be a sample or a reference accession.
    with open(outputs["phylotree.phylotree_newick"]) as f:
        tree = next(NewickIO.parse(f))
    named_nodes = []
    for clade in tree.get_terminals() + tree.get_nonterminals():
        if clade.name:
            named_nodes.append(clade.name)
    self.assertCountEqual(named_nodes, sample_names + self.accession_ids)

    identifiers = sorted(sample_names + self.accession_ids)

    # Distances: one row for each unordered pair of distinct identifiers.
    with open(outputs["phylotree.ska_distances"]) as f:
        pairs = []
        for row in DictReader(f, delimiter="\t"):
            pairs.append(sorted([row["Sample 1"], row["Sample 2"]]))
    expected = []
    for first in identifiers:
        for second in identifiers:
            if first < second:
                expected.append([first, second])
    self.assertCountEqual(pairs, expected)

    # Variants: one FASTA record per identifier.
    with open(outputs["phylotree.variants"]) as f:
        record_ids = [record.id for record in SeqIO.parse(f, "fasta")]
        self.assertCountEqual(identifiers, record_ids)

    expected_metadata = {
        "NC_012532.1": {
            "name": "Zika virus, complete genome",
            "country": "Uganda",
        },
        "NC_035889.1": {
            "name": "Zika virus isolate ZIKV/H. sapiens/Brazil/Natal/2015, complete genome",
            "country": "Brazil: Rio Grande do Norte, Natal",
            "collection_date": "2015",
        },
    }
    with open(outputs["phylotree.ncbi_metadata_json"]) as f:
        self.assertEqual(json.load(f), expected_metadata)

    with open(outputs["phylotree.clustermap_svg"]) as f:
        full_text = "\n".join(f.readlines())
    for name in sample_names + self.accession_ids:
        self.assertEqual(full_text.count(name), 2, name)
def read_fasta_or_newick_and_return_tree(path, nwk_path=None, patt=None):
    """Return a rooted tree built from a FASTA alignment or read from Newick.

    Args:
        path: input object exposing ``.name``; its suffix selects the
            branch (FASTA_EXTENSIONS or NEWICK_EXTENSIONS). It is passed
            as-is to ``AlignIO.read`` / ``NewickIO.parse``.
        nwk_path: optional target passed to ``NewickIO.write`` for a tree
            built from FASTA input; ignored for Newick input.
        patt: forwarded to ``get_count`` for every sequence.

    Returns:
        The tree, rooted at its midpoint if it was not already rooted, or
        ``None`` when the filtered alignment has two sequences or fewer or
        when no tree could be built from it.

    Raises:
        ValueError: if ``path.name`` has neither a FASTA nor a Newick
            extension (previously this surfaced as an UnboundLocalError).

    Side effects:
        For FASTA input the module-level ``NUM_OF_VIRIONS`` is set to the
        summed ``get_count`` of the sequences that pass the filter.
    """
    global NUM_OF_VIRIONS
    file_name = path.name

    if any(file_name.endswith(ext) for ext in FASTA_EXTENSIONS):
        seqs = AlignIO.read(path, FASTA)
        # Keep only sequences whose count exceeds the minimum.
        seqs._records = [rec for rec in seqs if get_count(rec, patt) > MIN_COUNT]
        NUM_OF_VIRIONS = int(sum(get_count(rec, patt) for rec in seqs))
        if len(seqs) <= 2:
            return None
        tree = build_phylogenetic_tree(seqs)
        if tree is None:
            # Nothing to write or root; avoid dereferencing None below.
            return None
        if nwk_path is not None:
            NewickIO.write([tree], nwk_path)
    elif any(file_name.endswith(ext) for ext in NEWICK_EXTENSIONS):
        # Builtin next() works on Python 2.6+ and 3; .next() is Python 2 only.
        tree = next(NewickIO.parse(path))
    else:
        raise ValueError(
            "unsupported file extension for {!r}: expected FASTA or Newick"
            .format(file_name))

    # Root the tree if necessary
    if not tree.rooted:
        tree.root_at_midpoint()
    return tree
def readOneTree(stream):
    """Reads a Newick-formatted tree, permitting lines with comments denoted by leading '#'.

    Args:
        stream: readable text stream providing ``readlines()``.

    Returns:
        The first tree parsed from the non-comment content of the stream.

    Blank (or whitespace-only) lines are ignored; previously they raised an
    IndexError because the first character of an empty string was inspected.
    All remaining lines are stripped and concatenated before parsing.
    """
    pieces = []
    for line in stream.readlines():
        content = line.strip()
        if not content or content.startswith('#'):
            continue
        pieces.append(content)
    trees = NewickIO.parse(StringIO("".join(pieces)))
    return next(trees)
def main():
    """Command-line entry point: relabel the trees of a Newick file.

    Positional arguments ``infile`` and ``outfile`` are optional and default
    to stdin and stdout; ``--cutoff`` (int, default 75) is handed to
    ``NewickCutoff``. The trees are read, relabelled and written to the
    output in Newick format.
    """
    prog = sys.argv[0]
    # Adjacent string literals are concatenated verbatim, so the separating
    # space must be part of one of them (the help text used to read
    # "oneach" / "nodeconfidences").
    description = ('Parse newick tree and perform action on '
                   'each non-root node')
    parser = argparse.ArgumentParser(prog=prog, description=description)
    parser.add_argument('infile', nargs='?', type=argparse.FileType(),
                        help='a Newick treefile')
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'),
                        help='changed Newick outfile')
    parser.add_argument('--cutoff', dest='cutoff', nargs='?', type=int,
                        default=75,
                        help='value at or beneath which no inner node '
                             'confidences are shown any more')
    options = parser.parse_args()

    # Omitted positionals parse to None; fall back to the standard streams.
    infile = options.infile or sys.stdin
    outfile = options.outfile or sys.stdout
    cutoff = options.cutoff

    newick = NewickCutoff(infile, outfile, cutoff)
    trees = newick.readtrees()
    trees = newick.relabeltree(trees)
    NewickIO.write(trees, outfile)
def get_fam(rfid):
    '''Get a family including tree and sequence information from an
    Rfam data dump stored in data/rfam

    inputs:
      rfid: rfam family id.

    outputs:
      ali: a biopython alignment (first alignment of the family FASTA file)
      tree: a biopython tree from a newick file (first tree of the seed tree).
      info: information parsed from the original stockholm file, as
            unpickled from the family metadata file.

    Files are opened in ``with`` blocks so no handle is leaked, and the
    builtin ``next()`` replaces the Python-2-only ``.next()`` method so the
    function runs on both Python 2.6+ and Python 3.
    '''
    meta_path = cfg.dataPath('rfam/family_metas/{0}.pickle'.format(rfid))
    ali_path = cfg.dataPath('rfam/family_alis/{0}.fa'.format(rfid))
    tree_path = cfg.dataPath('rfam/Rfam.seed_tree/{0}.seed_tree'.format(rfid))

    # Binary mode is mandatory for pickle on Python 3 and safe on Python 2.
    # NOTE(review): unpickling can run arbitrary code; the metadata files
    # must come from a trusted source.
    with open(meta_path, 'rb') as fmeta:
        info = pickle.load(fmeta)

    with open(ali_path) as fali:
        ali = next(aio.parse(fali, 'fasta'))

    with open(tree_path) as ftree:
        tree = next(nio.parse(ftree))

    return ali, tree, info
code) if len(mappings) <= 0: log.warning("empty mappings file") return mappings def printtrees(trees): for tree in trees: Phylo.draw_ascii(tree) stdout_handler = logging.StreamHandler(sys.stderr) handlers = [stdout_handler] logging.basicConfig(level=logging.INFO, format='[%(levelname)s - %(message)s]', handlers=handlers) log = logging.getLogger('LOGGER_NAME') mappingfile = sys.argv[2] mappings = readmappings(mappingfile) mappedstrings = {} treefile = sys.argv[1] trees = readtrees(treefile) trees = relabeltree(trees) reportMappings(mappedstrings) #printtees(trees) NewickIO.write(trees, sys.stdout)
n.name = 'n{}'.format(i) tree_ids = [_.name for _ in tree] df = pd.read_csv(params.tab, sep='\t', index_col=0) df = df.loc[df.index.isin(tree_ids), :] cdf = df[['country', 'host']].groupby(['country']).count().to_dict()['host'] for c, n in cdf.items(): print(c, n) c2ids = defaultdict(set) for t in tree: if t.name in df.index: c2ids[df.loc[t.name, 'country']].add(t.name) to_keep = set() for c, ids in c2ids.items(): if not pd.isna(c): if len(ids) <= params.threshold: to_keep |= ids else: to_keep |= set(pd.np.random.choice(list(ids), size=params.threshold, replace=False)) tree = remove_certain_leaves(tree, lambda _: _.name not in to_keep) features = [DATE, DATE_CI] nwk = tree.write(format_root_node=True, features=features, format=3) write(NewickIO.parse(StringIO(nwk)), params.out_tree, 'nexus') with open(params.out_tree, 'r') as f: nexus_str = f.read().replace('&&NHX:', '&') for feature in features: nexus_str = nexus_str.replace(':{}='.format(feature), ',{}='.format(feature)) with open(params.out_tree, 'w') as f: f.write(nexus_str)
def _mapto(raxml_dir, pathr, query_id):
    """Return the reference labels spanned by the first placement in a jplace file.

    Reads ``<raxml_dir>/RAxML_portableTree.<pathr>.jplace``, relabels every
    edge of its tree as ``R<edge number>``, collects the terminals below
    each edge listed for the first placement, and returns the ``AN…`` labels
    of all leaves under the common ancestor of those terminals.

    Args:
        raxml_dir: directory containing the jplace file (string).
        pathr: run name embedded in the jplace file name.
        query_id: not used by this function; kept for interface
            compatibility.

    Returns:
        list of ``AN<number>`` labels, in the order the leaves are reported
        under the common ancestor.

    Raises:
        StopIteration: if a placement refers to an edge number that is not
            present in the relabelled tree.
        KeyError: if a leaf under the common ancestor has no ``AN`` label.
    """
    # Local import: do not rely on NewickIO happening to re-export StringIO.
    from io import StringIO

    classification_file = raxml_dir + '/RAxML_portableTree.' + pathr + '.jplace'
    with open(classification_file) as classification:
        classification_json = json.load(classification)

    # The jplace tree is Newick with an edge number in braces after every
    # branch length, e.g.
    #   "((AN7:1.006{1},AN8:0.874{2}):0.15{3},AN9:0.777{4}):0.398{5});"
    tree_string = classification_json['tree']

    # Two-way lookup between leaf labels (AN<n>) and their edge labels (R<k>).
    # Raw strings keep the regex escapes out of Python's string-escape handling.
    AN_label = {}
    for an, r in re.findall(r'AN(\d+):\d+\.\d+\{(\d+)\}', tree_string):
        AN_label['AN' + an] = 'R' + r
        AN_label['R' + r] = 'AN' + an

    # Replace "[AN<n>]:<length>{<k>}" by "R<k>" so every node, inner nodes
    # included, is named after its edge number.
    newick_string = re.sub(r'(AN\d+)?\:\d+\.\d+{(\d+)}', r'R\g<2>', tree_string)
    mytree = Phylo.read(StringIO(newick_string), 'newick')

    # Each entry of 'p' starts with the edge number of a candidate placement.
    locations_ref = classification_json['placements'][0]['p']
    ter = []
    for maploc in locations_ref:
        rloc = 'R' + str(maploc[0])
        node = next(mytree.find_clades(rloc))
        ter.extend(node.get_terminals())

    comonancestor = mytree.common_ancestor(ter)
    return [AN_label[leaf.name] for leaf in comonancestor.get_terminals()]
if not os.path.isfile(fname): raise IOError("# Error: file {} does not exist".format(fname)) with open(fname, 'r') as inf: # Read a FASTA file? (headers, seqs) = biofile.readFASTA(inf) # Read tree tree_fname = os.path.expanduser(options.tree_in_fname) if not os.path.isfile(tree_fname): raise IOError("# Error: file {} does not exist".format(tree_fname)) tree_string = "" with open(tree_fname, 'r') as inf: lines = inf.readlines() for line in lines: if not line.strip()[0] == '#': tree_string += line.strip() trees = NewickIO.parse(StringIO(tree_string)) tree = next(trees) # Read mapping file map_fname = os.path.expanduser(options.mapping_in_fname) if not os.path.isfile(map_fname): raise IOError("# Error: file {} does not exist".format(map_fname)) with open(map_fname, 'r') as inf: map_table = util.readTable(inf, header=True) # Create mapping mapping_dict = dict(zip(map_table['species'], map_table['updated.species'])) # Update the FASTA headers #new_headers = [] #new_seqs = []
import sys

from Bio import Phylo
from Bio.Phylo import NewickIO

# Usage: <script> INPUT_NEWICK OUTPUT_NEWICK
# Reads every tree from the input, clears comments and branch lengths on
# the inner nodes, keeps only the bifurcating trees and writes them as
# plain Newick strings, one tree per line.
all_trees = list(Phylo.parse(sys.argv[1], "newick"))

print("Removing trees that are not bifurcating.")
bifurcating_trees = []
for tree in all_trees:
    for inner in tree.get_nonterminals():
        inner.comment = None
        inner.branch_length = None
    if tree.is_bifurcating():
        bifurcating_trees.append(tree)
writer = NewickIO.Writer(bifurcating_trees)

print()
print("Saving trees as plain newick files (no branch lengths).")
with open(sys.argv[2], "w") as handle:
    handle.writelines(
        newick_tree + "\n" for newick_tree in writer.to_strings(plain=True)
    )