def check_mulrf_scores(sfile, gfile, mulrf):
    """
    Verifies that preprocessing gene family trees leaves RF scores unchanged

    Each gene family tree is scored against the species tree twice: once as
    read (after unrooting) and once after preprocessing. The run aborts on
    the first tree for which the preprocessed score plus the computed score
    shift differs from the plain score. Otherwise the sum of the plain
    scores is written to standard output and the process exits.

    Parameters
    ----------
    sfile : string
            name of file containing species tree
    gfile : string
            name of file containing gene family trees (one newick string
            per line)
    mulrf : string
            name including full path of MulRFScorer binary
    """
    # The species tree is loaded once and reused for every gene tree
    species_tree = treeswift.read_tree(sfile, "newick")
    remove_internal_node_labels(species_tree)
    species_tree.suppress_unifurcations()

    # Prefix handed to the scorer: gene tree file name minus last extension
    prefix = gfile.rsplit('.', 1)[0]

    rf_sum = 0

    with open(gfile, 'r') as handle:
        for g, raw in enumerate(handle, start=1):
            # Strip every whitespace character from the newick string
            newick = "".join(raw.split())

            # Plain MUL-tree (unrooted)
            mul_tree = treeswift.read_tree_newick(newick)
            remove_internal_node_labels(mul_tree)
            unroot(mul_tree)

            # Pre-processed MUL-tree, built from a fresh parse
            pre_tree = treeswift.read_tree(newick, "newick")
            remove_internal_node_labels(pre_tree)
            em, lm, r, c, emx, lmx = preprocess_multree(pre_tree)
            shift = compute_score_shift(em, lm, r, c, emx, lmx)

            # Score both versions against the species tree
            plain_score = score_with_MulRF(mulrf, species_tree, mul_tree,
                                           prefix + "-scored")
            pre_score = score_with_MulRF(mulrf, species_tree, pre_tree,
                                         prefix + "-preprocessed-and-scored")

            # The two scores must agree once the shift is added back
            if pre_score + shift != plain_score:
                sys.exit("Gene tree on line %d failed!\n" % g)

            rf_sum += plain_score

    sys.stdout.write('%d\n' % rf_sum)
    sys.stdout.flush()
    os._exit(0)  # CRITICAL ON BLUE WATERS LOGIN NODE
Esempio n. 2
0
    def __init__(self, args, calculate_distance_matrix=False):
        """
        Load the backbone sequences and tree referenced by ``args``.

        Parameters
        ----------
        args : object
               configuration providing ``backbone_tree_file`` and
               ``backbone_seq_file``; ``sequence_length`` is written back
               onto it as the length of the first FASTA record
        calculate_distance_matrix : bool
               when true, the leaf-to-leaf distance matrix of the backbone
               tree is stored in ``self.distance_matrix`` as a DataFrame
        """
        self.args = args
        print('Loding data...')
        tree_path = args.backbone_tree_file
        seq_path = args.backbone_seq_file
        records = SeqIO.to_dict(SeqIO.parse(seq_path, "fasta"))
        tree = treeswift.read_tree(tree_path, 'newick')

        print('finish data loading!')

        # The first record's length is taken as the length of all sequences
        first_record = list(records.values())[0]
        args.sequence_length = len(first_record)

        if calculate_distance_matrix:
            print('Calculating distance matrix...')
            pairwise = tree.distance_matrix(leaf_labels=True)
            # Fill in the diagonal explicitly before building the DataFrame
            for label in pairwise:
                pairwise[label][label] = 0
            self.distance_matrix = pd.DataFrame.from_dict(pairwise)
            print('Finish distance matrix calculation!')

        # NOTE(review): process_seq is assumed to return (names, sequences,
        # masks) in matching order -- confirm against utils.process_seq
        names, encoded, masks = utils.process_seq(records, args, True, True)
        self.nodes = names
        self.mask = masks
        self.seq = dict(zip(names, encoded))
        self.nongaps = dict(zip(names, masks))
        self.num = len(names)
Esempio n. 3
0
def relabel_multrees_simphy(ifil, ofil):
    """
    Rewrites SimPhy locus or gene trees with simplified leaf labels; every
    leaf label [sid]_[lid]_[gid] is reduced to [sid] (the text before the
    first underscore). Internal node labels and all branch lengths are
    dropped.

    Parameters
    ----------
    ifil : string
           name of input file (one newick string per line)
    ofil : string
           name of output file (one newick string per line)
    """
    with open(ifil, 'r') as src, open(ofil, 'w') as dst:
        for raw in src:
            # Remove all whitespace before handing the string to the parser
            newick = "".join(raw.split())
            tree = treeswift.read_tree(newick, "newick")

            for node in tree.traverse_postorder():
                node.edge_length = None
                # Leaves keep only the species id; internal labels go away
                node.label = (node.label.split('_')[0]
                              if node.is_leaf() else None)

            dst.write(tree.newick() + '\n')
def main(args):
    """
    Collapse short branches of a tree, reorder it, and print the result.

    Parameters
    ----------
    args : object
           must provide ``tree`` (passed to ``treeswift.read_tree`` as
           newick) and ``min_branch_length`` (threshold at or below which a
           branch is collapsed or zeroed)
    """
    threshold = args.min_branch_length
    t = treeswift.read_tree(args.tree, 'newick')
    t.collapse_short_branches(threshold)

    # Leaves aren't shortened by collapse_short_branches(), so shorten
    # them manually by iterating over all leaves
    for leaf in t.traverse_leaves():
        length = leaf.get_edge_length()
        # A leaf without a branch length has nothing to shorten, and
        # comparing None against a number would raise TypeError
        if length is not None and length <= threshold:
            leaf.set_edge_length(0)

    # If we use the default ascending=True, this breaks TreeCluster clustering
    t.order("num_descendants_then_edge_length_then_label", ascending=False)
    t.resolve_polytomies()
    print(t)
Esempio n. 5
0
def prepareTree(options):
    """
    Read the backbone tree and build the structures derived from it.

    When ``options.reestimate_backbone`` is set, ``reestimate_backbone`` is
    run first. The tree is then read from ``options.tree_fp``, passed
    through ``util.index_edges`` and ``util.set_levels``, and rendered with
    ``extended_newick``. Both phases are timed and logged.

    Returns
    -------
    tuple
        ``(tree, name_to_node_map, extended_newick_string)`` where
        ``name_to_node_map`` maps each leaf label to its leaf node.
    """
    if options.reestimate_backbone:  # reestimate backbone branch lengths
        reestimate_backbone(options)

    parse_began = time.time()
    first_read_tree = ts.read_tree(options.tree_fp, schema='newick')
    logging.info("[%s] Tree is parsed in %.3f seconds." %
                 (time.strftime("%H:%M:%S"), (time.time() - parse_began)))

    prep_began = time.time()
    util.index_edges(first_read_tree)
    util.set_levels(first_read_tree)

    # Lookup table from leaf label to the leaf node carrying that label
    name_to_node_map = {
        leaf.label: leaf
        for leaf in first_read_tree.traverse_postorder(internal=False)
    }

    extended_newick_string = extended_newick(first_read_tree)
    logging.info("[%s] Tree preprocessing is completed in %.3f seconds." %
                 (time.strftime("%H:%M:%S"), (time.time() - prep_began)))
    return first_read_tree, name_to_node_map, extended_newick_string
Esempio n. 6
0
def main():
    """
    Rescale the query-to-backbone distances in ``depp.csv``.

    Configuration is the default config merged with command-line overrides.
    For each row of ``<outdir>/depp.csv`` (one query per row, first column
    the query name), the labels listed in
    ``<outdir>/depp_tmp/<query>_leaves.txt`` are intersected with the
    distance columns. If any remain, the whole row is multiplied by
    ``tree diameter / median distance to those columns``; otherwise the row
    is kept as is. The result is written to ``<outdir>/depp_correction.csv``.
    """
    args_base = OmegaConf.create(default_config.default_config)
    args_cli = OmegaConf.from_cli()
    args = OmegaConf.merge(args_base, args_cli)

    dist_path = os.path.join(args.outdir, "depp.csv")
    original_distance = pd.read_csv(dist_path, sep='\t')
    # Second read with dtype=str so query names come back as the literal
    # text in the file (presumably to avoid numeric reinterpretation)
    seq_names = pd.read_csv(dist_path, sep='\t', dtype=str).iloc[:, 0]

    # Every column except the first (the query-name column) is a backbone taxon
    s = list(original_distance.keys())[1:]
    s_set = set(s)

    tree = treeswift.read_tree(args.backbone_tree, 'newick')
    true_max = tree.diameter()

    data = {}
    for i, seq_name in enumerate(seq_names):
        with open(f"{args.outdir}/depp_tmp/{seq_name}_leaves.txt", "r") as f:
            method = set(f.read().split("\n"))
        # discard() rather than remove(): a file without a trailing newline
        # yields no empty token and must not raise KeyError
        method.discard('')
        method &= s_set

        row_values = original_distance.iloc[i].values[1:]
        if method:
            # pandas needs a list-like column indexer; a set (even wrapped in
            # a 0-d numpy array) is rejected by pandas >= 2.0
            columns = sorted(method)
            query_median = np.median(original_distance[columns].iloc[i])
            # Small epsilon guards against a zero median
            ratio = true_max / (query_median + 1e-7)
            b = row_values * ratio
        else:
            b = row_values
        data[seq_name] = dict(zip(s, b))

    data = pd.DataFrame.from_dict(data, orient='index', columns=s)
    data.to_csv(os.path.join(args.outdir, 'depp_correction.csv'), sep='\t')
Esempio n. 7
0
        def read_dismat(f):
            """
            Yield one query per row of a whitespace-separated distance matrix.

            The first line of ``f`` is the header; its first field is
            dropped and the remaining fields are the column tags. Every
            following line gives a query name followed by one distance per
            tag.

            Yields
            ------
            tuple
                ``(query_name, None, obs_dist)`` where ``obs_dist`` maps
                each tag to its distance as a float.
            """
            # Raw strings: "\s" in a plain literal is an invalid escape
            tags = re.split(r"\s+", f.readline().rstrip())[1:]
            for line in f.readlines():
                dists = re.split(r"\s+", line.strip())
                query_name = dists[0]
                obs_dist = dict(zip(tags, map(float, dists[1:])))
                yield (query_name, None, obs_dist)

        queries = read_dismat(f)

    f = open(tree_fp)
    tree_string = f.readline()
    f.close()

    first_read_tree = ts.read_tree(tree_string, schema='newick')
    util.index_edges(first_read_tree)
    extended_newick_string = extended_newick(first_read_tree)
    treecore = Core(first_read_tree)
    treecore.init()
    second_read_tree = ts.read_tree(tree_string, schema='newick')
    util.index_edges(second_read_tree)
    treecore_frag = Core(second_read_tree)

    pool = mp.Pool(num_thread)
    results = pool.starmap(runquery, queries)

    result = join_jplace(results)
    result["tree"] = extended_newick_string
    result["metadata"] = {"invocation": " ".join(sys.argv)}
    result["fields"] = [
Esempio n. 8
0
    if not tree_fp:
        tree_fp = tempfile.NamedTemporaryFile(delete=True, mode='w+t').name
        dist_phy = tempfile.NamedTemporaryFile(delete=True, mode='w+t')
        nldef = tempfile.NamedTemporaryFile(delete=True, mode='w+t')

        dist_phy.write(write_phylip_dist(obs_dist))
        dist_phy.flush()

        s = ["fastme", "-i", dist_phy.name, "-o", tree_fp]
        subprocess.call(s, stdout=nldef, stderr=nldef)

    if treeout_fp:
        copyfile(tree_fp, treeout_fp + "/tree.nwk")
    treestr = open(tree_fp).readline().strip()
    tree = tw.read_tree(treestr, "newick")
    pdc = tree.distance_matrix(leaf_labels=True)

    try:
        errs = dict()
        glob = 0
        glob_fm = 0
        errs_fm = dict()
        for l1 in tree.labels(leaves=True, internal=False):
            tot = 0
            tot_fm = 0
            for l2 in tree.labels(leaves=True, internal=False):
                if not l1 == l2:
                    cont = (pdc[l1][l2] - obs_dist[l1][l2])**2
                    if cont > 0 and obs_dist[l1][l2] > 0:
                        tot += cont
Esempio n. 9
0
def reestimate_backbone(options):
    """
    Re-estimate backbone branch lengths with FastTree.

    The tree at ``options.tree_fp`` is read, unifurcations are suppressed
    and polytomies resolved, and the result is passed to the bundled
    FastTree binary as ``-intree`` with the reference alignment
    ``options.ref_fp`` on standard input. FastTree's output tree is written
    to a new temporary file and ``options.tree_fp`` is repointed to it.

    If the input tree is rooted (at most two children at the root) and
    every non-root branch has a length, the FastTree tree is rerooted on
    the same split, and the two root branches share the new root-edge
    length in the same proportion as in the input tree.

    Parameters
    ----------
    options : object
        reads ``ref_fp``, ``tree_fp`` and ``protein_seqs``; ``tree_fp`` is
        overwritten with the path of the re-estimated tree

    Raises
    ------
    ValueError
        if the platform has no bundled FastTree binary, or if the leaf used
        for rerooting is missing from FastTree's output
    """
    assert options.ref_fp
    start = time.time()
    orig_branch_tree = ts.read_tree(options.tree_fp, schema='newick')
    # A root with more than two children is treated as an unrooted tree
    rooted = len(orig_branch_tree.root.children) <= 2
    orig_branch_tree.suppress_unifurcations()
    if len(orig_branch_tree.root.children) > 3:
        # polytomy at the root
        orig_branch_tree.resolve_polytomies()
    else:
        # root node is ok, resolve the other nodes
        for child in orig_branch_tree.root.children:
            child.resolve_polytomies()

    all_branches_have_length = all(
        n.is_root() or n.edge_length is not None
        for n in orig_branch_tree.traverse_postorder(internal=True, leaves=True)
    )

    if rooted and all_branches_have_length:
        # Remember the root split as one leaf on one side ("theone") and
        # two leaves spanning the other side ("thetwo"), plus the original
        # lengths of the two root branches
        left, right = orig_branch_tree.root.children
        if left.children:
            thetwo = [next(c.traverse_postorder(internal=False)) for c in left.children]
            theone = [next(right.traverse_postorder(internal=False))]
            lengthtwoside = left.edge_length
            lengthoneside = right.edge_length
        else:
            thetwo = [next(c.traverse_postorder(internal=False)) for c in right.children]
            theone = [next(left.traverse_postorder(internal=False))]
            lengthtwoside = right.edge_length
            lengthoneside = left.edge_length

    # Only the path is kept; the tree writer creates the file itself
    orig_branch_resolved_fp = tempfile.NamedTemporaryFile(delete=True, mode='w+t').name
    orig_branch_tree.write_tree_newick(orig_branch_resolved_fp)

    if _platform == "darwin":
        fasttree_exec = pkg_resources.resource_filename('apples', "tools/FastTree-darwin")
    elif _platform == "linux" or _platform == "linux2":
        fasttree_exec = pkg_resources.resource_filename('apples', "tools/FastTree-linux")
    elif _platform == "win32" or _platform == "win64" or _platform == "msys":
        fasttree_exec = pkg_resources.resource_filename('apples', "tools/FastTree.exe")
    else:
        # Unrecognised system
        raise ValueError('Your system {} is not supported yet.'.format(_platform))

    bb_fp = tempfile.NamedTemporaryFile(delete=False, mode='w+t')
    # Only the path is needed; close the handle so it is not leaked and the
    # file can be reopened by name below
    bb_fp.close()
    fasttree_log = tempfile.NamedTemporaryFile(delete=False, mode='w+t').name
    logging.info("FastTree log file is located here: %s" % fasttree_log)

    s = [fasttree_exec, "-nosupport", "-nome", "-noml", "-log", fasttree_log,
         "-intree", orig_branch_resolved_fp]
    if not options.protein_seqs:
        s.append("-nt")
    with open(options.ref_fp, "r") as rf:
        with Popen(s, stdout=PIPE, stdin=rf, stderr=sys.stderr) as p:
            tree_string = p.stdout.read().decode('utf-8')
            if rooted and all_branches_have_length:
                ft = ts.read_tree_newick(tree_string)
                # Locate the single-side leaf in the FastTree output
                theone_inft = None
                for n in ft.traverse_postorder(internal=False):
                    if n.label == theone[0].label:
                        theone_inft = n
                        break
                if theone_inft is None:
                    raise ValueError(
                        "Leaf %s of the input tree is missing from the FastTree output."
                        % theone[0].label)
                # Root at that leaf first so the MRCA of the other side is
                # well defined, then reroot in the middle of the MRCA's edge
                ft.reroot(theone_inft)
                mrca = ft.mrca([n.label for n in thetwo])
                mrca_edge_length = mrca.edge_length
                ft.reroot(mrca, length=mrca_edge_length/2)
                if lengthtwoside+lengthoneside > 0:
                    # Split the root edge in the original proportions
                    for i in range(2):
                        if ft.root.children[i] == mrca:
                            ft.root.children[i].edge_length = mrca_edge_length*lengthtwoside/(lengthtwoside+lengthoneside)
                            ft.root.children[1-i].edge_length = mrca_edge_length*lengthoneside/(lengthtwoside+lengthoneside)
                ft.is_rooted = False
                tree_string = str(ft)
            with open(bb_fp.name, "w") as ntree:
                ntree.write(tree_string.strip())
                ntree.write("\n")
                options.tree_fp = bb_fp.name
    logging.info(
        "[%s] Reestimated branch lengths in %.3f seconds." % (time.strftime("%H:%M:%S"), (time.time() - start)))
Esempio n. 10
0
    bb_fp = tempfile.NamedTemporaryFile(delete=True, mode='w+t')
    fasttree_log = tempfile.NamedTemporaryFile(delete=True, mode='w+t').name

    s = [
        fasttree_exec, "-nosupport", "-nome", "-noml", "-log", fasttree_log,
        "-intree", orig_branch_resolved_fp
    ]
    if not options.protein_seqs:
        s.append("-nt")
    with open(uniq_ref, "r") as rf:
        with Popen(s, stdout=PIPE, stdin=rf, stderr=sys.stderr) as p:
            tree_string = p.stdout.read().decode('utf-8')
            print(tree_string)

    uniqs_tree_nj = ts.read_tree(tree_string, schema="newick")
    leaf_map = dict()
    for n in uniqs_tree_nj.traverse_postorder(internal=False):
        leaf_map[n.label] = n

    for k, v in uniqtags.items():
        seq, mrca = v
        if len(seqdict[seq]) == 1:
            continue

        existing = leaf_map[k]
        o_node_parent = existing.parent
        o_node_parent.remove_child(existing)
        mrca_copy = nodecopy(mrca)
        mrca_copy.edge_length = existing.edge_length
        o_node_parent.add_child(mrca_copy)