def distances_to_tree(
    id_filename: str,
    distance_filename: str,
    output_filename: str,
    method: str = "nj",
    fmt: str = "newick",
) -> None:
    r"""Build a tree from an ID list and a distance table, then save it.

    Parameters
    ----------
    id_filename : str
        Path to a text file listing sequence IDs, one per line. The file
        content is split on any whitespace, so blank lines are ignored.
    distance_filename : str
        Path to a file of pairwise distances, one record per line in
        f"{id1}\t{id2}\t{dist}\n" format.
    output_filename : str
        Path the resulting tree is written to.
    method : str
        Tree-building method. One of "nj" (default) or "upgma".
    fmt : str
        Output tree format (default = "newick").

    Returns
    -------
    None
    """
    with open(id_filename) as id_handle:
        sequence_ids = id_handle.read().split()

    # The open handle is passed straight to the matrix builder.
    with open(distance_filename) as dist_handle:
        matrix = build_distance_matrix(sequence_ids, dist_handle)

    write(build_tree(matrix, method), output_filename, fmt)
import logging
import os

from Bio.Phylo import NexusIO, write

# Command-line script: split a NEXUS file holding several trees into one
# Newick file per tree.
#
#   --trees    path to the input NEXUS file
#   --pattern  %-style template for the output paths; it is formatted with the
#              zero-based index of each tree (e.g. "out/tree_%d.nwk")
if '__main__' == __name__:
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--trees', required=True, type=str)
    parser.add_argument('--pattern', required=True, type=str)
    params = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format='%(asctime)s: %(message)s', datefmt="%Y-%m-%d %H:%M:%S")

    n_converted = 0
    for index, tree in enumerate(NexusIO.parse(params.trees)):
        out_path = params.pattern % index
        out_dir = os.path.dirname(out_path)
        # A pattern without a directory part (e.g. "tree_%d.nwk") yields an
        # empty dirname, and os.makedirs('') raises FileNotFoundError; in that
        # case the file goes to the current directory and nothing is created.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        write([tree], out_path, 'newick', plain=True)
        n_converted = index + 1

    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logging.info('Converted %d trees to newick', n_converted)
n.name = 'n{}'.format(i) tree_ids = [_.name for _ in tree] df = pd.read_csv(params.tab, sep='\t', index_col=0) df = df.loc[df.index.isin(tree_ids), :] cdf = df[['country', 'host']].groupby(['country']).count().to_dict()['host'] for c, n in cdf.items(): print(c, n) c2ids = defaultdict(set) for t in tree: if t.name in df.index: c2ids[df.loc[t.name, 'country']].add(t.name) to_keep = set() for c, ids in c2ids.items(): if not pd.isna(c): if len(ids) <= params.threshold: to_keep |= ids else: to_keep |= set(pd.np.random.choice(list(ids), size=params.threshold, replace=False)) tree = remove_certain_leaves(tree, lambda _: _.name not in to_keep) features = [DATE, DATE_CI] nwk = tree.write(format_root_node=True, features=features, format=3) write(NewickIO.parse(StringIO(nwk)), params.out_tree, 'nexus') with open(params.out_tree, 'r') as f: nexus_str = f.read().replace('&&NHX:', '&') for feature in features: nexus_str = nexus_str.replace(':{}='.format(feature), ',{}='.format(feature)) with open(params.out_tree, 'w') as f: f.write(nexus_str)
def newick(args, trees, lang=None):
    """Write ``trees`` as Newick text into the module's static tree folder.

    The target file is ``static/trees/tree-<label>-newick.txt`` below
    ``args.module_dir``. ``<label>`` is ``lang.id`` when ``lang`` is truthy
    and the literal ``glottolog`` otherwise. The file is opened for writing
    with UTF-8 encoding.
    """
    base_dir = args.module_dir

    if lang:
        label = lang.id
    else:
        label = 'glottolog'

    filename = 'tree-%s-newick.txt' % (label,)
    target = base_dir.joinpath('static', 'trees', filename)

    with codecs.open(target, 'w', 'utf8') as out:
        write(trees, out, 'newick')
def newick(args, trees, lang=None):
    """Dump ``trees`` in Newick format to a UTF-8 text file.

    The file lives in the ``static/trees`` directory under
    ``args.module_dir`` and is named ``tree-<suffix>-newick.txt``, where
    ``<suffix>`` is ``lang.id`` for a truthy ``lang`` and ``glottolog`` when
    no language is given.
    """
    module_dir = args.module_dir

    # Default name, overridden by the language id when one is supplied.
    suffix = 'glottolog'
    if lang:
        suffix = lang.id

    out_path = module_dir.joinpath(
        'static',
        'trees',
        'tree-%s-newick.txt' % (suffix,),
    )
    with codecs.open(out_path, 'w', 'utf8') as handle:
        write(trees, handle, 'newick')