def distances_to_tree(
    id_filename: str,
    distance_filename: str,
    output_filename: str,
    method: str = "nj",
    fmt: str = "newick",
) -> None:
    r"""Build a tree from on-disk IDs and pairwise distances and save it.

    The ID file is read in full and split on whitespace; the resulting
    list, together with the open distance file, is handed to
    ``build_distance_matrix``.  The matrix is then passed to
    ``build_tree`` and the result is written out with ``write``.

    Parameters
    ----------
    id_filename : str
        Path of the file listing sequence IDs, one per line.  Because the
        content is split on any whitespace, an ID must not contain spaces
        or tabs.
    distance_filename : str
        Path of the file holding distances, one record per line in
        f"{id1}\t{id2}\t{dist}\n" format.
    output_filename : str
        Path the resulting tree is written to.
    method : str
        Tree-building method forwarded to ``build_tree``.  One of "nj"
        (default) or "upgma".
    fmt : str
        Output format forwarded to ``write`` (default = "newick").

    Returns
    -------
    None

    """
    with open(id_filename, "r") as id_handle:
        sequence_ids = id_handle.read().split()

    # The distance file is consumed while it is still open; only the
    # finished matrix leaves the ``with`` block.
    with open(distance_filename, "r") as distance_handle:
        matrix = build_distance_matrix(sequence_ids, distance_handle)

    write(build_tree(matrix, method), output_filename, fmt)
Example #2
0
import logging
import os
from Bio.Phylo import NexusIO, write


if '__main__' == __name__:
    # Command-line entry point: split a nexus file containing several trees
    # into one newick file per tree.
    #
    #   --trees    path of the input nexus file.
    #   --pattern  %-style format string for the output paths; it is
    #              formatted with the zero-based index of each tree
    #              (e.g. "out/tree_%d.nwk").
    import argparse

    parser = argparse.ArgumentParser()

    parser.add_argument('--trees', required=True, type=str)
    parser.add_argument('--pattern', required=True, type=str)
    params = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format='%(asctime)s: %(message)s', datefmt="%Y-%m-%d %H:%M:%S")

    converted = 0
    for index, tree in enumerate(NexusIO.parse(params.trees)):
        out_path = params.pattern % index
        out_dir = os.path.dirname(out_path)
        # A pattern without a directory component yields '' here, and
        # os.makedirs('') raises FileNotFoundError -- only create the
        # directory when there actually is one.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        write([tree], out_path, 'newick', plain=True)
        converted = index + 1
    # Lazy %-style arguments: the message is formatted by the logging module.
    logging.info('Converted %d trees to newick', converted)
Example #3
0
            n.name = 'n{}'.format(i)
    tree_ids = [_.name for _ in tree]
    df = pd.read_csv(params.tab, sep='\t', index_col=0)
    df = df.loc[df.index.isin(tree_ids), :]
    cdf = df[['country', 'host']].groupby(['country']).count().to_dict()['host']
    for c, n in cdf.items():
        print(c, n)

    c2ids = defaultdict(set)
    for t in tree:
        if t.name in df.index:
            c2ids[df.loc[t.name, 'country']].add(t.name)
    to_keep = set()
    for c, ids in c2ids.items():
        if not pd.isna(c):
            if len(ids) <= params.threshold:
                to_keep |= ids
            else:
                to_keep |= set(pd.np.random.choice(list(ids), size=params.threshold, replace=False))

    tree = remove_certain_leaves(tree, lambda _: _.name not in to_keep)
    features = [DATE, DATE_CI]
    nwk = tree.write(format_root_node=True, features=features, format=3)
    write(NewickIO.parse(StringIO(nwk)), params.out_tree, 'nexus')
    with open(params.out_tree, 'r') as f:
        nexus_str = f.read().replace('&&NHX:', '&')
    for feature in features:
        nexus_str = nexus_str.replace(':{}='.format(feature), ',{}='.format(feature))
    with open(params.out_tree, 'w') as f:
        f.write(nexus_str)
Example #4
0
def newick(args, trees, lang=None):
    """Write `trees` in newick format to a file below the module's static dir.

    The target is ``<args.module_dir>/static/trees/tree-<label>-newick.txt``,
    where ``<label>`` is ``lang.id`` when `lang` is truthy and the literal
    ``glottolog`` otherwise.  The file is opened for writing as UTF-8 and the
    trees are serialized by ``write``.

    :param args: object exposing a ``module_dir`` with a ``joinpath`` method.
    :param trees: the trees handed unchanged to ``write``.
    :param lang: optional object with an ``id`` attribute used in the file name.
    """
    label = lang.id if lang else 'glottolog'
    file_name = 'tree-%s-newick.txt' % (label,)
    target = args.module_dir.joinpath('static', 'trees', file_name)
    with codecs.open(target, 'w', 'utf8') as out:
        write(trees, out, 'newick')
def newick(args, trees, lang=None):
    """Serialize `trees` as newick into the module's ``static/trees`` folder.

    The output file is named ``tree-<label>-newick.txt``; ``<label>`` is
    ``lang.id`` if a truthy `lang` is given, else ``glottolog``.  The file is
    written with UTF-8 encoding through ``codecs.open`` and the actual
    serialization is delegated to ``write``.

    :param args: object whose ``module_dir`` provides ``joinpath``.
    :param trees: trees passed through to ``write``.
    :param lang: optional object whose ``id`` selects the file name.
    """
    if lang:
        label = lang.id
    else:
        label = 'glottolog'
    destination = args.module_dir.joinpath(
        'static', 'trees', 'tree-%s-newick.txt' % (label,))
    with codecs.open(destination, 'w', 'utf8') as handle:
        write(trees, handle, 'newick')