Exemple #1
0
    def calcDistance(self):
        """Compare the two selected tree files and display two distances.

        Nothing happens unless both ``self.path1`` and ``self.path2`` are
        non-empty.  The extension of each path (without the leading dot) is
        stored and handed to DendroPy as the schema name.  Both trees are
        read into one taxon namespace, four measures from ``treecompare`` are
        stored on the instance, and the Euclidean and Robinson-Foulds values
        are written to ``self.res1`` and ``self.res2``.
        """
        if self.path1 == '' or self.path2 == '':
            return

        # Extension without the leading dot; used below as the schema argument.
        self.fileEx1 = os.path.splitext(self.path1)[1][1:]
        self.fileEx2 = os.path.splitext(self.path2)[1][1:]

        # Both trees are read into the same taxon namespace.
        shared_taxa = dendropy.TaxonNamespace()
        self.tree1 = dendropy.Tree.get_from_path(
            self.path1, self.fileEx1, taxon_namespace=shared_taxa)
        self.tree2 = dendropy.Tree.get_from_path(
            self.path2, self.fileEx2, taxon_namespace=shared_taxa)

        for loaded in (self.tree1, self.tree2):
            loaded.encode_bipartitions()

        print(treecompare.false_positives_and_negatives(self.tree1, self.tree2))

        # Calculate the distances.
        first, second = self.tree1, self.tree2
        self.symDist = treecompare.symmetric_difference(first, second)
        self.fpnDist = treecompare.false_positives_and_negatives(first, second)
        self.eucDist = treecompare.euclidean_distance(first, second)
        self.rfDist = treecompare.robinson_foulds_distance(first, second)

        # Show the results.
        self.res1.setText(str(self.eucDist))
        self.res2.setText(str(self.rfDist))
Exemple #2
0
def compareDendropyTrees(tr1, tr2):
    """Compare two DendroPy trees on the leaf labels they have in common.

    When the two leaf label sets differ, both trees are pruned in place to
    the shared labels and migrated to a new taxon namespace built from them.

    Returns
    -------
    tuple
        ``(nl, ei1, ei2, fp, fn, rf)``: the number of shared leaf labels, the
        number of internal edges (seed edge excluded) of each tree, the two
        counts returned by ``false_positives_and_negatives(tr1, tr2)``, and
        their sum divided by ``ei1 + ei2``.
    """
    from dendropy.calculate.treecompare import false_positives_and_negatives

    labels1 = {leaf.taxon.label for leaf in tr1.leaf_nodes()}
    labels2 = {leaf.taxon.label for leaf in tr2.leaf_nodes()}
    shared = list(labels1 & labels2)

    # The intersection equals both sets exactly when the sets are equal, so
    # pruning is only needed when they differ.
    if labels1 != labels2:
        common_ns = dendropy.TaxonNamespace(shared)
        for tree in (tr1, tr2):
            tree.retain_taxa_with_labels(shared)
            tree.migrate_taxon_namespace(common_ns)

    for tree in (tr1, tr2):
        tree.update_bipartitions()

    nl = len(shared)
    ei1 = len(tr1.internal_edges(exclude_seed_edge=True))
    ei2 = len(tr2.internal_edges(exclude_seed_edge=True))

    fp, fn = false_positives_and_negatives(tr1, tr2)
    rf = float(fp + fn) / (ei1 + ei2)

    return (nl, ei1, ei2, fp, fn, rf)
Exemple #3
0
def false_positives_and_negatives(reference_tree, test_tree):
    """Deprecated alias for ``treecompare.false_positives_and_negatives``.

    Emits a DendroPy deprecation warning, then delegates; ``test_tree`` is
    forwarded as the ``comparison_tree`` keyword argument.
    """
    moved_notice = "Deprecated since DendroPy 4: The 'dendropy.treecalc.false_positives_and_negatives()' function has moved to 'dendropy.calculate.treecompare.false_positives_and_negatives()'."
    legacy_usage = "from dendropy import treecalc\nd = treecalc.false_positives_and_negatives(...)"
    current_usage = "from dendropy.calculate import treecompare\nd = treecompare.false_positives_and_negatives(...)"
    deprecate.dendropy_deprecation_warning(
        preamble=moved_notice,
        old_construct=legacy_usage,
        new_construct=current_usage,
    )
    counts = treecompare.false_positives_and_negatives(
        reference_tree=reference_tree,
        comparison_tree=test_tree,
    )
    return counts
def compare_trees(tr1, tr2):
    """Compare two DendroPy trees on the leaf labels they have in common.

    When the two leaf label sets differ, both trees are pruned in place to
    the shared labels and migrated to a new taxon namespace built from them.

    Returns
    -------
    tuple
        ``(nl, ei1, ei2, fp, fn, sd, rf)``: the number of shared leaf labels,
        the number of internal edges (seed edge excluded) of each tree, the
        two counts returned by ``false_positives_and_negatives(tr1, tr2)``,
        their sum divided by ``ei1 + ei2``, and their sum divided by
        ``2 * nl - 6``.
    """
    from dendropy.calculate.treecompare import false_positives_and_negatives

    labels1 = {leaf.taxon.label for leaf in tr1.leaf_nodes()}
    labels2 = {leaf.taxon.label for leaf in tr2.leaf_nodes()}
    shared = list(labels1 & labels2)

    # The intersection equals both sets exactly when the sets are equal, so
    # pruning is only needed when they differ.
    if labels1 != labels2:
        common_ns = dendropy.TaxonNamespace(shared)
        for tree in (tr1, tr2):
            tree.retain_taxa_with_labels(shared)
            tree.migrate_taxon_namespace(common_ns)

    for tree in (tr1, tr2):
        tree.update_bipartitions()

    nl = len(shared)
    ei1 = len(tr1.internal_edges(exclude_seed_edge=True))
    ei2 = len(tr2.internal_edges(exclude_seed_edge=True))

    fp, fn = false_positives_and_negatives(tr1, tr2)
    mismatches = float(fp + fn)

    # The normalized symmetric difference equals the normalized RF distance
    # when both trees are fully resolved, i.e., binary.
    sd = mismatches / (ei1 + ei2)
    rf = mismatches / (2 * nl - 6)

    return (nl, ei1, ei2, fp, fn, sd, rf)
Exemple #5
0
def compareRes(tree, taxa, anch, sp, outpath):
    """Return the false positive/negative counts between two Newick files.

    The trees at ``sp`` and ``tree`` are read as unrooted Newick trees into
    one taxon namespace; the tree from ``sp`` is passed as the first argument
    of ``treecompare.false_positives_and_negatives``.  ``taxa``, ``anch`` and
    ``outpath`` are accepted but not used.
    """
    shared_ns = dendropy.TaxonNamespace()
    load_options = {"taxon_namespace": shared_ns, "rooting": "force-unrooted"}
    reference = dendropy.Tree.get_from_path(sp, "newick", **load_options)
    candidate = dendropy.Tree.get_from_path(tree, "newick", **load_options)
    return treecompare.false_positives_and_negatives(reference, candidate)
Exemple #6
0
def false_positives_and_negatives(reference_tree, test_tree):
    """Deprecated alias for ``treecompare.false_positives_and_negatives``.

    Emits a DendroPy deprecation warning, then delegates; ``test_tree`` is
    forwarded as the ``comparison_tree`` keyword argument.
    """
    moved_notice = "Deprecated since DendroPy 4: The 'dendropy.treecalc.false_positives_and_negatives()' function has moved to 'dendropy.calculate.treecompare.false_positives_and_negatives()'."
    legacy_usage = "from dendropy import treecalc\nd = treecalc.false_positives_and_negatives(...)"
    current_usage = "from dendropy.calculate import treecompare\nd = treecompare.false_positives_and_negatives(...)"
    deprecate.dendropy_deprecation_warning(
        preamble=moved_notice,
        old_construct=legacy_usage,
        new_construct=current_usage,
    )
    counts = treecompare.false_positives_and_negatives(
        reference_tree=reference_tree,
        comparison_tree=test_tree,
    )
    return counts
def get_fnrate(reftreepath,outtreepath):
    """Return a normalized count comparing an output tree to a reference tree.

    Both Newick files are read into one taxon namespace.  The result is the
    second value returned by ``treecompare.false_positives_and_negatives``
    (reference tree first) divided by ``len(namespace) - 3``, which is the
    number of internal edges of a fully resolved unrooted tree on that many
    taxa.
    """
    shared_ns = dendropy.TaxonNamespace()
    reference, estimate = (
        dendropy.Tree.get(path=tree_path, schema='newick', taxon_namespace=shared_ns)
        for tree_path in (reftreepath, outtreepath)
    )
    for loaded in (reference, estimate):
        loaded.encode_bipartitions()

    counts = treecompare.false_positives_and_negatives(reference, estimate)
    denominator = float(len(shared_ns) - 3)
    return counts[1] / denominator
Exemple #8
0
    def calcDistance(self):
        """Compare the two selected tree files and display two distances.

        Nothing happens unless both ``self.path1`` and ``self.path2`` are
        non-empty.  The extension of each path (without the leading dot) is
        stored and handed to DendroPy as the schema name.  Both trees are
        read into one taxon namespace, four measures from ``treecompare`` are
        stored on the instance, and the Euclidean and Robinson-Foulds values
        are written to ``self.res1`` and ``self.res2``.
        """
        if self.path1 == '' or self.path2 == '':
            return

        # Extension without the leading dot; used below as the schema argument.
        self.fileEx1 = os.path.splitext(self.path1)[1][1:]
        self.fileEx2 = os.path.splitext(self.path2)[1][1:]

        # Both trees are read into the same taxon namespace.
        shared_taxa = dendropy.TaxonNamespace()
        self.tree1 = dendropy.Tree.get_from_path(
            self.path1, self.fileEx1, taxon_namespace=shared_taxa)
        self.tree2 = dendropy.Tree.get_from_path(
            self.path2, self.fileEx2, taxon_namespace=shared_taxa)

        for loaded in (self.tree1, self.tree2):
            loaded.encode_bipartitions()

        print(treecompare.false_positives_and_negatives(self.tree1, self.tree2))

        # Calculate the distances.
        first, second = self.tree1, self.tree2
        self.symDist = treecompare.symmetric_difference(first, second)
        self.fpnDist = treecompare.false_positives_and_negatives(first, second)
        self.eucDist = treecompare.euclidean_distance(first, second)
        self.rfDist = treecompare.robinson_foulds_distance(first, second)

        # Show the results.
        self.res1.setText(str(self.eucDist))
        self.res2.setText(str(self.rfDist))
Exemple #9
0
    def calculateDistance(self):
        """Compare the two selected tree files and display four measures.

        Nothing happens unless both ``self.path1`` and ``self.path2`` are
        non-empty.  The extension of each path (without the leading dot) is
        stored and handed to DendroPy as the schema name.  Both trees are
        read into one taxon namespace; the Euclidean distance, Robinson-Foulds
        distance, symmetric difference and false positive/negative counts are
        stored on the instance and written to ``self.dist1Value`` through
        ``self.dist4Value`` in that order.
        """
        if self.path1 == '' or self.path2 == '':
            return

        # File extensions without the leading dot; used as schema names.
        self.fileExtension1 = os.path.splitext(self.path1)[1][1:]
        self.fileExtension2 = os.path.splitext(self.path2)[1][1:]

        # Open the tree files in one shared taxon namespace.
        shared_taxa = dendropy.TaxonNamespace()
        self.tree1 = dendropy.Tree.get_from_path(
            self.path1, self.fileExtension1, taxon_namespace=shared_taxa)
        self.tree2 = dendropy.Tree.get_from_path(
            self.path2, self.fileExtension2, taxon_namespace=shared_taxa)

        for loaded in (self.tree1, self.tree2):
            loaded.encode_bipartitions()

        print(treecompare.false_positives_and_negatives(self.tree1, self.tree2))

        # Calculate the distances.
        first, second = self.tree1, self.tree2
        self.symDist = treecompare.symmetric_difference(first, second)
        self.fpnDist = treecompare.false_positives_and_negatives(first, second)
        self.eucDist = treecompare.euclidean_distance(first, second)
        self.rfDist = treecompare.robinson_foulds_distance(first, second)

        # Show the distances.
        self.dist1Value.setText(str(self.eucDist))
        self.dist2Value.setText(str(self.rfDist))
        self.dist3Value.setText(str(self.symDist))
        self.dist4Value.setText(str(self.fpnDist))
Exemple #10
0
def distance(file_path, file_format, file_path2):
    """Print four comparison measures for the trees stored in two files.

    Both files are read with the schema ``file_format`` into one taxon
    namespace.  The symmetric difference, Euclidean distance, false
    positive/negative counts and Robinson-Foulds distance are computed with
    ``treecompare`` and printed; nothing is returned.
    """
    shared_ns = dendropy.TaxonNamespace()
    first, second = (
        dendropy.Tree.get_from_path(path, file_format, taxon_namespace=shared_ns)
        for path in (file_path, file_path2)
    )

    symmetric = treecompare.symmetric_difference(first, second)
    euclidean = treecompare.euclidean_distance(first, second)
    fp_and_fn = treecompare.false_positives_and_negatives(first, second)
    robinson_foulds = treecompare.robinson_foulds_distance(first, second)

    report = (
        ("Symetric difference: ", symmetric),
        ("Robinson Foulds distance: ", robinson_foulds),
        ("False positives and negatives: ", fp_and_fn),
        ("Euclidean distance: ", euclidean),
    )
    for label, value in report:
        print(label, value)
def are_two_trees_incompatible(tree1, tree2):
    """Report whether two trees differ once restricted to their shared leaves.

    Both trees are modified in place: they are pruned to the shared leaf
    labels, migrated to a common taxon namespace, marked as unrooted and have
    their basal bifurcation collapsed before being compared.

    Parameters
    ----------
    tree1 : dendropy tree object
    tree2 : dendropy tree object

    Returns
    -------
    violates : bool
        True, if either count returned by ``false_positives_and_negatives``
        is positive (trees are NOT compatible)
        False, otherwise, or if fewer than four leaves are shared

    """
    shared = list(get_leaf_set(tree1).intersection(get_leaf_set(tree2)))

    # Both trees are migrated onto this namespace below.
    common_ns = dendropy.TaxonNamespace(shared)

    # Fewer than four leaves carry no topological information.
    if len(shared) < 4:
        return False

    # Move each tree onto the shared leaf set and unroot it.
    for tree in (tree1, tree2):
        tree.retain_taxa_with_labels(shared)
        tree.migrate_taxon_namespace(common_ns)
        tree.is_rooted = False
        tree.collapse_basal_bifurcation()
        tree.update_bipartitions()

    # Any mismatch in either direction means the trees are not compatible.
    fp, fn = false_positives_and_negatives(tree1, tree2)
    return bool(fp > 0 or fn > 0)
Exemple #12
0
def _write_restricted_tree(src_path, taxa, tns, prefix, outpath):
    """Restrict the Newick tree at ``src_path`` to ``taxa`` and save it.

    The restricted, derooted copy is written to a new temporary ``.nwk`` file
    inside ``outpath``; the path of that file is returned.
    """
    import os  # local import: only needed to close the mkstemp descriptor

    source = dendropy.Tree.get_from_path(
        src_path, "newick", taxon_namespace=tns, rooting="force-unrooted")
    restricted = source.clone(2)
    restricted.retain_taxa_with_labels(taxa, update_bipartitions=True)
    restricted.deroot()

    fd, tmp_path = tempfile.mkstemp(
        suffix=".nwk", prefix=prefix, dir=outpath, text=None)
    # mkstemp returns an already-open descriptor that the caller owns; the
    # tree is written through the path, so close the descriptor right away
    # instead of leaking it.
    os.close(fd)

    restricted.write(path=tmp_path, schema="newick", suppress_rooting=True)
    return tmp_path


def compareAnchoredRes(tree, taxa, achs, sp, outpath, trueAnch):
    """Compare two trees after removing the anchor taxa from both.

    Parameters
    ----------
    tree : str
        Path of the Newick tree to evaluate; also used as the prefix of its
        temporary file name.
    taxa : iterable
        Taxon labels; the labels in ``achs`` are removed from this set.
    achs : iterable
        Anchor labels excluded from the comparison.
    sp : str
        Path of the Newick tree passed as the first (reference) tree.
    outpath : str
        Directory in which the two restricted trees are written.  The
        temporary files are left in place.
    trueAnch : sequence
        Its first two items are embedded in the reference temp-file prefix.

    Returns
    -------
    The result of ``treecompare.false_positives_and_negatives`` for the two
    restricted trees, re-read into a fresh taxon namespace.
    """
    taxa = set(taxa) - set(achs)
    tns = dendropy.TaxonNamespace()
    anch = trueAnch

    ref_path = _write_restricted_tree(
        sp, taxa, tns,
        "sp.nwk-" + str(anch[0]) + "-" + str(anch[1]), outpath)
    cmp_path = _write_restricted_tree(
        tree, taxa, tns, tree + ".retained", outpath)

    # Re-read both restricted trees into a namespace that holds only the
    # retained taxa.  The schema argument is given explicitly for both files.
    tns = dendropy.TaxonNamespace()
    tree1 = dendropy.Tree.get_from_path(
        ref_path, "newick", taxon_namespace=tns, rooting="force-unrooted")
    tree2 = dendropy.Tree.get_from_path(
        cmp_path, "newick", taxon_namespace=tns, rooting="force-unrooted")
    return treecompare.false_positives_and_negatives(tree1, tree2)
Exemple #13
0
def compare_trees(tr1, tr2):
    """Compare two DendroPy trees on the leaf labels they have in common.

    When the two leaf label sets differ, both trees are pruned in place to
    the shared labels and migrated to a new taxon namespace built from them.

    Returns
    -------
    tuple
        ``(nl, ei1, ei2, fp, fn, sd, rf)``: the number of shared leaf labels,
        the number of internal edges (seed edge excluded) of each tree, the
        two counts returned by ``false_positives_and_negatives(tr1, tr2)``,
        the symmetric difference rate ``(fp + fn) / (ei1 + ei2)`` and the
        Robinson-Foulds error rate ``(fp + fn) / (2 * nl - 6)``.
    """
    # Leaf labels of each tree and the labels present in both.
    labels1 = {leaf.taxon.label for leaf in tr1.leaf_nodes()}
    labels2 = {leaf.taxon.label for leaf in tr2.leaf_nodes()}
    shared = list(labels1 & labels2)

    # Restrict both trees to the shared leaf set; this is only needed when
    # the label sets differ.
    if labels1 != labels2:
        common_ns = dendropy.TaxonNamespace(shared)
        for tree in (tr1, tr2):
            tree.retain_taxa_with_labels(shared)
            tree.migrate_taxon_namespace(common_ns)

    # Refresh the bipartitions of both trees.
    for tree in (tr1, tr2):
        tree.update_bipartitions()

    # Number of leaves and number of internal edges.
    nl = len(shared)
    ei1 = len(tr1.internal_edges(exclude_seed_edge=True))
    ei2 = len(tr2.internal_edges(exclude_seed_edge=True))

    # False positive and false negative counts.
    fp, fn = false_positives_and_negatives(tr1, tr2)
    mismatches = float(fp + fn)

    sd = mismatches / (ei1 + ei2)
    rf = mismatches / (2 * nl - 6)

    return (nl, ei1, ei2, fp, fn, sd, rf)
Exemple #14
0
#!/usr/bin/env python

import sys
import argparse
import dendropy

from dendropy.calculate import treecompare

# Maps a distance name to a callable taking two trees.  The short and long
# spellings ("wrf"/"weighted_robinson_foulds", "rf"/"unweighted_robinson_foulds")
# refer to the same treecompare function.
distance_functions = {
    "euclidean": treecompare.euclidean_distance,
    # Sum of the two counts returned by false_positives_and_negatives.
    "bipartition": lambda t1, t2: sum(
        treecompare.false_positives_and_negatives(t1, t2)),
    "wrf": treecompare.weighted_robinson_foulds_distance,
    "weighted_robinson_foulds": treecompare.weighted_robinson_foulds_distance,
    "rf": treecompare.unweighted_robinson_foulds_distance,
    "unweighted_robinson_foulds":
        treecompare.unweighted_robinson_foulds_distance,
}


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "original",
        type=argparse.FileType('r'),
        help="File with original trees")
    parser.add_argument(
        "reconstructed",
        type=argparse.FileType('r'),
        help="File with reconstructed trees")
    parser.add_argument(
Exemple #15
0
def remove_outliers(treeList, strategy, outpath, e, summary):
    """Remove outlier trees from ``treeList`` in place and return the list.

    Parameters
    ----------
    treeList : list-like of dendropy trees
        Trees to filter.  With fewer than 10 trees the list is returned
        unchanged.
    strategy : str
        ``"consensus3"``: score each tree against the tree produced by
        ``findMRL`` and delete trees scoring above mean + 2 standard
        deviations.
        ``"consensus10"``: same score, delete the ``int(n / 4)`` highest.
        ``"pairwise1"``: score trees with a Mahalanobis distance over the
        pairwise matrix and delete the top 15%.
        ``"pairwise3"``: score trees by the column means of the pairwise
        matrix and delete the ``int(n / 5)`` highest.
        ``"pairwise2"``: same score, delete trees above mean + 1.5 standard
        deviations.
        Any other value leaves the list unchanged.
    outpath, e, summary
        Only forwarded to ``findMRL`` by the consensus strategies.

    Returns
    -------
    The same ``treeList`` object.  For the consensus strategies the reference
    tree is appended to it.
    """
    print("the strategy is: " + strategy)
    if len(treeList) < 10:
        print("number of trees is " + str(len(treeList)) + ". This is not enough for outlier removal!")
        return treeList
    if strategy == "consensus10" or strategy == "consensus3":
        ftmp = findMRL(treeList, e, outpath, summary)
        ref_tree = dendropy.Tree.get(path=ftmp, schema="newick")
        # NOTE(review): the reference tree is appended to, and stays in, the
        # returned list; presumably this moves it onto the list's taxon
        # namespace -- confirm against the caller.
        treeList.append(ref_tree)
        ref_tree.encode_bipartitions()
        d = list()

        for tree in treeList:
            tree.encode_bipartitions()
            res = treecompare.false_positives_and_negatives(ref_tree, tree)
            d.append(res[1])
        if strategy == "consensus3":
            mean = np.mean(d)
            print("the mean distance to consensus tree was: " + str(mean))
            st = np.std(d)
            print("the std of distances to consensus tree was: " + str(st))
            # Walk backwards so deletions do not shift pending indices; the
            # range ends at -1 so that the first tree is examined as well.
            for i in range(len(d) - 1, -1, -1):
                if d[i] > mean + 2.0 * st:
                    print("deleting " + str(i) + "th tree!")
                    print("d[i] to delete: " + str(d[i]))
                    del treeList[i]
        else:
            sortIdx = np.argsort(d, 0)
            print(len(sortIdx))
            print(sortIdx)
            m = int(len(sortIdx) / 4.0)
            print("deleting " + str(m) + " of the trees")
            # Highest-scoring m indices, largest index first so deletions
            # do not shift the remaining ones.
            idx = sorted(sortIdx[len(sortIdx) - m:], reverse=True)
            print(idx)
            print(d)
            for i in idx:
                print("deleting the tree " + str(i) + "the. The distance to consensus tree was: " + str(d[i]))
                del treeList[i]
    elif strategy == "pairwise1" or strategy == "pairwise2" or strategy == "pairwise3":
        n_trees = len(treeList)
        # Encode every tree once up front instead of once per pair.
        for tree in treeList:
            tree.encode_bipartitions()
        D = np.ndarray(shape=(n_trees, n_trees), dtype=float)
        for i in range(n_trees):
            D[i][i] = 0.0
            for j in range(i + 1, n_trees):
                res1 = treecompare.false_positives_and_negatives(treeList[i], treeList[j])
                # The matrix is asymmetric: the two counts go to mirrored cells.
                D[i][j] = res1[1]
                D[j][i] = res1[0]
        if strategy == "pairwise1":
            d = np.mean(D, 1)

            C = np.cov(D)
            # NOTE(review): scipy's mahalanobis() documents its third argument
            # as the inverse covariance matrix, but the covariance itself is
            # passed here -- confirm whether that is intended.
            v = [distance.mahalanobis(D[:, i], d, C) for i in range(n_trees)]
            print(v)
            sortIdx = np.argsort(v, 0)
            m = int(len(sortIdx) * 0.15)
            idx = sorted(sortIdx[len(sortIdx) - m:], reverse=True)
            for i in idx:
                print("deleting the tree " + str(i) + "the. The distance to consensus tree was: " + str(v[i]))
                del treeList[i]
        elif strategy == "pairwise3":
            d = np.mean(D, 0)

            sortIdx = np.argsort(d, 0)
            print(len(sortIdx))
            print(sortIdx)
            m = int(len(sortIdx) / 5.0)
            print("deleting " + str(m) + " of the trees")
            idx = sorted(sortIdx[len(sortIdx) - m:], reverse=True)
            print(idx)
            print(d)
            for i in idx:
                print("deleting the tree " + str(i) + "the. The distance to consensus tree was: " + str(d[i]))
                del treeList[i]
        else:
            d = np.mean(D, 0)
            print(d)
            mean = np.mean(d)
            st = np.std(d)
            # Walk backwards so deletions do not shift pending indices; the
            # range ends at -1 so that the first tree is examined as well.
            for k in range(len(d) - 1, -1, -1):
                if d[k] > mean + 1.5 * st:
                    print("deleting the tree " + str(k) + "the. The distance to consensus tree was: " + str(d[k]))
                    del treeList[k]

    return treeList
Exemple #16
0
def compare_trees(tr1, tr2):
    """
    Compare two trees on their shared leaf set.

    Both trees are modified in place: they are unrooted and, when their leaf
    label sets differ, pruned to the shared labels and migrated to a new
    taxon namespace.

    Parameters
    ----------
    tr1 : dendropy tree object
            First tree (typically the model tree)
    tr2 : dendropy tree object
            Second tree (typically the estimated tree)

    Returns
    -------
    nl : int
         Size of the shared leaf set, i.e., the number of leaves in both trees
    ei1 : int
          Number of internal edges in first tree (after restricting it to the shared leaf set)
    ei2 : int
          Number of internal edges in second tree (after restricting it to the shared leaf set)
    fn : int
         Number of edges in the first tree that are not in the second tree
    fp : int
         Number of edges in the second tree that are not in the first tree
    rf : float
         Normalized Robinson-Foulds (RF) distance between the first and second trees

    NOTE(review): ``fn`` and ``fp`` are the first and second values returned
    by ``false_positives_and_negatives(tr1, tr2)``; confirm that order against
    the installed DendroPy version.

    Example
    -------
    If tree 1 corresponds to "(((A,B,C),D),E);" and tree 2 corresponds to "((((A,B),C),D),E);",
    then the output is "5 1 2 0 1 0.25". In this example,
      + first and second trees share 5 leaves (A, B, C, D, E).
      + first tree has one internal edge "A,B,C|D,E"
      + second tree has two internal edges "A,B|C,D,E" and "A,B,C|D,E"
      + no edge in the first tree is missing from the second tree (FN = 0)
      + one edge "A,B|C,D,E" in the second tree is missing from the first tree (FP = 1)
      + normalized RF distance is (FP+FN)/(2*NL-6) = (1+0)/(2*5-6) = 0.25
    """
    both = (tr1, tr2)

    # Unroot the two trees.
    for tree in both:
        tree.is_rooted = False
        tree.collapse_basal_bifurcation(set_as_unrooted_tree=True)

    # Restrict the two trees to the same leaf set if their label sets differ.
    labels1 = {leaf.taxon.label for leaf in tr1.leaf_nodes()}
    labels2 = {leaf.taxon.label for leaf in tr2.leaf_nodes()}
    shared = list(labels1 & labels2)
    if labels1 != labels2:
        common_ns = dendropy.TaxonNamespace(shared)
        for tree in both:
            tree.retain_taxa_with_labels(shared)
            tree.migrate_taxon_namespace(common_ns)

    # Compare the trees.
    for tree in both:
        tree.update_bipartitions()

    nl = len(shared)
    ei1 = len(tr1.internal_edges(exclude_seed_edge=True))
    ei2 = len(tr2.internal_edges(exclude_seed_edge=True))

    fn, fp = false_positives_and_negatives(tr1, tr2)
    rf = (fn + fp) / (2.0 * nl - 6.0)

    return (nl, ei1, ei2, fn, fp, rf)
Exemple #17
0
    f.write('\nMinimum (non-zero) RF Distance: {}\n'.format(
        rf_pair_matrix[np.nonzero(rf_pair_matrix)].min()))
    f.write('Maximum RF Distance: {}\n'.format(rf_pair_matrix.max()))
    f.write('Mean RF Distance: {}\n'.format(
        rf_pair_matrix[np.nonzero(rf_pair_matrix)].mean()))
    f.write('Std. Dev. of RF Distance: {}\n'.format(
        np.sqrt(rf_pair_matrix[np.nonzero(rf_pair_matrix)].var())))

# Consensus Tree Methods
print('Calculating consensus trees...')
f.write('\n#### Strict Consensus Tree ####\n')
strict_con_tree = tlist.consensus(min_freq=1.0)
f.write('{}\n'.format(strict_con_tree.as_string('newick')))
strict_stats = np.zeros((N, 3))
for i in range(N):
    fp, fn = treecompare.false_positives_and_negatives(strict_con_tree,
                                                       dp_trees[i])
    strict_stats[i] = [fp, fn, fp + fn]
pd_strict_stats = pd.DataFrame(
    data=strict_stats,
    index=combined_iters,
    columns=['False Positive', 'False Negative', 'RF Distance'])
f.write('{}\n'.format(str(pd_strict_stats)))
f.write('\nMinimum RF Distance: {}\n'.format(strict_stats[:, 2].min()))
f.write('Maximum RF Distance: {}\n'.format(strict_stats[:, 2].max()))
f.write('Mean RF Distance: {}\n'.format(strict_stats[:, 2].mean()))
f.write('Std. Dev. of RF Distance: {}\n'.format(
    np.sqrt(strict_stats[:, 2].var())))

f.write('\n#### Majority Rule Consensus Tree ####\n')
maj_con_tree = tlist.consensus(min_freq=0.5)
f.write('{}\n'.format(maj_con_tree.as_string('newick')))
        #result_file.write(method + '\n')
        for i in range(20):
            truth = '../../{}/{}/R{}/rose.tt'.format(data, data, i)
            predicted_tree_file = (data + '/' + method + '/R' + str(i) +
                                   '/out_tree.nwk')
            if (not os.path.isfile(predicted_tree_file)
                    or os.stat(predicted_tree_file).st_size == 0):
                result_file.write(method + ',R' + str(i) + ',err,err\n')
                continue
            true_tree_file = (truth)

            tree1 = Tree.get_from_path(predicted_tree_file,
                                       "newick",
                                       taxon_namespace=tns)
            tree2 = Tree.get_from_path(true_tree_file,
                                       "newick",
                                       taxon_namespace=tns)

            tree1.encode_bipartitions()
            tree2.encode_bipartitions()

            print('R' + str(i),
                  treecompare.false_positives_and_negatives(tree1, tree2))
            result_file.write(method + ',R' + str(i) + ',' + ','.join([
                str(x) for x in treecompare.false_positives_and_negatives(
                    tree1, tree2)
            ]))
            result_file.write('\n')

    result_file.close()
Exemple #19
0
    result_file = open('result_{}_nj.txt'.format(data), 'w')
    for method in distance_methods:
            #result_file.write(method + '\n')
            for i in range(20):
                    truth = '../../{}/{}/R{}/rose.tt'.format(data, data, i)
                    predicted_tree_file = (data + '/' + method + '/R'+ str(i)
                            + '/out_tree.nwk')
                    if (not os.path.isfile(predicted_tree_file) or 
                            os.stat(predicted_tree_file).st_size == 0):
                        result_file.write(method+',R'+str(i)+',err,err\n')
                        continue
                    true_tree_file = (truth)

                    tree1 = Tree.get_from_path(
                            predicted_tree_file,
                            "newick",
                            taxon_namespace=tns)
                    tree2 = Tree.get_from_path(
                            true_tree_file,
                            "newick",
                            taxon_namespace=tns)

                    tree1.encode_bipartitions()
                    tree2.encode_bipartitions()

                    print('R'+str(i),treecompare.false_positives_and_negatives(tree1, tree2))
                    result_file.write(method+',R'+str(i)+','+','.join([str(x) for x in treecompare.false_positives_and_negatives(tree1, tree2)]))
                    result_file.write('\n')

    result_file.close()
Exemple #20
0
def main():
    parser = optparse.OptionParser(usage='ttp-parse-log [options] <log file>')
    parser.add_option('--out',
                      dest='out_path',
                      default=None,
                      help='Path for output')
    parser.add_option(
        '--near',
        dest='near_percent',
        default=20,
        help='Trees within <--near>% of the optimal cost will be captured')
    parser.add_option(
        '--true',
        dest='true_tree_path',
        default=None,
        help='Can provide a true tree to compare multiple optimal trees with')
    parser.add_option(
        '--include_near',
        action='store_true',
        dest='include_near',
        default=False,
        help='Include nearby optimal trees in summary statistics')
    parser.add_option(
        '--separate_trees',
        action='store_true',
        dest='separate_trees',
        default=False,
        help='Create two output files separating trees and statistics')

    options, args = parser.parse_args()
    if len(args) == 0 or len(args) > 1:
        parser.print_help()
        sys.exit(1)

    print('### {} Version {} ###'.format(NAME, VERSION))

    file_path = args[0]
    out_path = options.out_path
    near_percent = options.near_percent
    true_tree_path = options.true_tree_path
    include_near = options.include_near
    separate_trees = options.separate_trees

    print('Logfile: {}'.format(file_path))
    if true_tree_path is not None:
        try:
            true_tree = dp.Tree.get(path=true_tree_path, schema='newick')
        except:
            print('True tree path is not a valid tree file')
            sys.exit(1)
    else:
        true_tree = False

    if out_path:
        f = open(out_path, 'w+')
    else:
        f = sys.stdout

    if separate_trees:
        if not out_path:
            print('Cannot separate trees if using stdout')
            sys.exit(1)
        g = open('{}.trees'.format(out_path), 'w+')
    else:
        g = f

    final = False
    current_iter = 0
    best_cost = float("inf")
    accepted_iters = []
    best_iters = []
    best_trees = []
    near_iters = []
    near_trees = []

    print('Parsing log file...')
    with open(file_path, 'r') as h:
        for line in h:
            if line.strip().startswith('search: cost'):
                current_cost = int(re.search(r'\d+$', line.strip()).group())
                if current_cost < best_cost:
                    best_cost = current_cost
    near_cost = float(best_cost * (1 + (float(near_percent) / 100)))

    with open(file_path, 'r') as h:
        for line in h:
            line = line.strip()
            if final and line.startswith('search: changed') and line.endswith(
                    'no'):
                break
            if line.startswith('search: iter'):
                current_iter = int(re.search(r'\d+$', line).group())
            elif line.startswith('search: final') and not line.startswith(
                    'search: final cost'):
                current_iter = 'Final'
                final = True
            elif line.startswith('search: cost') or line.startswith(
                    'search: final cost'):
                current_cost = int(re.search(r'\d+$', line).group())
                if current_cost == best_cost:
                    best_iters.append(current_iter)
                elif current_cost <= near_cost:
                    near_iters.append(current_iter)
            elif current_iter in best_iters and line.startswith(
                    'tree:') and line.endswith(';'):
                tree_string = re.search('tree: (.*)', line).group(1)
                new_tree = tree_string
                best_trees.append(new_tree)
                if final:
                    break
            elif current_iter in near_iters and line.startswith(
                    'tree:') and line.endswith(';'):
                tree_string = re.search('tree: (.*)', line).group(1)
                new_tree = tree_string
                near_trees.append(new_tree)

    assert len(best_iters) == len(best_trees)

    # Best and Near Trees
    g.write('#### Best Trees ####\n')
    for i in range(len(best_iters)):
        g.write('>{}\n'.format(str(best_iters[i])))
        g.write('{}\n'.format(best_trees[i]))

    g.write('\n#### Near Best Trees ####\n')
    for i in range(len(near_iters)):
        g.write('>{}\n'.format(str(near_iters[i])))
        g.write('{}\n'.format(near_trees[i]))

    if include_near:
        combined_iters = best_iters + near_iters
        combined_trees = best_trees + near_trees
    else:
        combined_iters = best_iters
        combined_trees = best_trees

    N = len(combined_trees)

    f.write('\n#### Summary ####\n')
    f.write('There are {} trees with cost {}\n'.format(len(best_iters),
                                                       best_cost))
    f.write(
        'There are {} more trees within {}% of the best cost ({} < cost <= {})\n'
        .format(len(near_iters), near_percent, best_cost, near_cost))
    test_t = dp.Tree.get_from_string(combined_trees[0], 'newick')
    f.write('Number of taxa is {}\n'.format(len(test_t.leaf_nodes())))

    # Pairwise Robinson Foulds
    print('Calculating pairwise Robinson-Foulds distances...')
    f.write('\n#### Pairwise Robinson-Foulds Distances ####\n')
    dp_trees = [dp.Tree.get_from_string(i, 'newick') for i in combined_trees]
    tlist = dp.TreeList(dp_trees)
    rf_pair_matrix = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            if i != j:
                rf_pair_matrix[i, j] = treecompare.symmetric_difference(
                    dp_trees[i], dp_trees[j])
    pd.set_option('display.max_columns', None)
    pd_pair_matrix = pd.DataFrame(data=rf_pair_matrix,
                                  index=combined_iters,
                                  columns=combined_iters)
    f.write('{}\n'.format(str(pd_pair_matrix)))
    if len(combined_iters) > 1:
        f.write('\nMinimum (non-zero) RF Distance: {}\n'.format(
            rf_pair_matrix[np.nonzero(rf_pair_matrix)].min()))
        f.write('Maximum RF Distance: {}\n'.format(rf_pair_matrix.max()))
        f.write('Mean RF Distance: {}\n'.format(
            rf_pair_matrix[np.nonzero(rf_pair_matrix)].mean()))
        f.write('Std. Dev. of RF Distance: {}\n'.format(
            np.sqrt(rf_pair_matrix[np.nonzero(rf_pair_matrix)].var())))


# Consensus Tree Methods
    print('Calculating consensus trees...')
    f.write('\n#### Strict Consensus Tree ####\n')
    strict_con_tree = tlist.consensus(min_freq=1.0)
    f.write('{}\n'.format(strict_con_tree.as_string('newick')))
    strict_stats = np.zeros((N, 3))
    for i in range(N):
        fp, fn = treecompare.false_positives_and_negatives(
            strict_con_tree, dp_trees[i])
        strict_stats[i] = [fp, fn, fp + fn]
    pd_strict_stats = pd.DataFrame(
        data=strict_stats,
        index=combined_iters,
        columns=['False Positive', 'False Negative', 'RF Distance'])
    f.write('{}\n'.format(str(pd_strict_stats)))
    f.write('\nMinimum RF Distance: {}\n'.format(strict_stats[:, 2].min()))
    f.write('Maximum RF Distance: {}\n'.format(strict_stats[:, 2].max()))
    f.write('Mean RF Distance: {}\n'.format(strict_stats[:, 2].mean()))
    f.write('Std. Dev. of RF Distance: {}\n'.format(
        np.sqrt(strict_stats[:, 2].var())))

    f.write('\n#### Majority Rule Consensus Tree ####\n')
    maj_con_tree = tlist.consensus(min_freq=0.5)
    f.write('{}\n'.format(maj_con_tree.as_string('newick')))
    maj_stats = np.zeros((N, 3))
    for i in range(N):
        fp, fn = treecompare.false_positives_and_negatives(
            maj_con_tree, dp_trees[i])
        maj_stats[i] = [fp, fn, fp + fn]
    pd_maj_stats = pd.DataFrame(
        data=maj_stats,
        index=combined_iters,
        columns=['False Positive', 'False Negative', 'RF Distance'])
    f.write('{}\n'.format(str(pd_maj_stats)))
    f.write('\nMinimum RF Distance: {}\n'.format(maj_stats[:, 2].min()))
    f.write('Maximum RF Distance: {}\n'.format(maj_stats[:, 2].max()))
    f.write('Mean RF Distance: {}\n'.format(maj_stats[:, 2].mean()))
    f.write('Std. Dev. of RF Distance: {}\n'.format(
        np.sqrt(maj_stats[:, 2].var())))

    # Comparison with True Tree
    if true_tree:
        print('Calculating Robinson-Foulds distances to the true tree...')
        f.write('\n#### Comparison to True Tree ####\n')
        true_matrix = np.zeros((N + 2, 3))
        true_plus_all = dp_trees + [strict_con_tree, maj_con_tree, true_tree]
        truelist = dp.TreeList(true_plus_all)
        for i in range(N + 2):
            fp, fn = treecompare.false_positives_and_negatives(
                true_tree, true_plus_all[i])
            true_matrix[i] = [fp, fn, fp + fn]
        combined_consensus_iters = combined_iters + [
            'Strict Consensus', 'Majority Consensus'
        ]
        combined_consensus_trees = combined_trees + [
            strict_con_tree, maj_con_tree
        ]
        pd_true_matrix = pd.DataFrame(
            data=true_matrix,
            index=combined_consensus_iters,
            columns=['False Positive', 'False Negative', 'RF Distance'])
        f.write('{}\n'.format(str(pd_true_matrix)))
        f.write('\nMinimum RF Distance: {}\n'.format(true_matrix[:, 2].min()))
        f.write('Maximum RF Distance: {}\n'.format(true_matrix[:, 2].max()))
        f.write('Mean RF Distance: {}\n'.format(true_matrix[:, 2].mean()))
        f.write('Std. Dev. of RF Distance: {}\n'.format(
            np.sqrt(true_matrix[:, 2].var())))
        f.write('Tree {} is closest to the true tree\n'.format(
            combined_consensus_iters[np.argmin(true_matrix[:, 2])]))
        f.write('{}\n'.format(combined_consensus_trees[np.argmin(
            true_matrix[:, 2])]))

    f.close()
    if out_path and not separate_trees:
        print(
            'Optimal and Near-Optimal Trees and Summary Statistics written to {}'
            .format(out_path))
    elif out_path and separate_trees:
        print('Optimal and Near-Optimal Trees written to {}.trees'.format(
            out_path))
        print('Summary Statistics written to {}'.format(out_path))
        g.close()
Exemple #21
0
                    true_tree_file = (truth)
                    try:
                     
                        tree1 = Tree.get_from_path(
                                predicted_tree_file,
                                "newick",
                                taxon_namespace=tns)
                        tree2 = Tree.get_from_path(
                                true_tree_file,
                                "newick",
                                taxon_namespace=tns)

                        tree1.encode_bipartitions()
                        tree2.encode_bipartitions()
                        print("try")
                        print('R'+str(i),treecompare.false_positives_and_negatives(tree1, tree2))
                        effective_samples = effective_samples+1
                        agg_false_positives = agg_false_negatives+treecompare.false_positives_and_negatives(tree1, tree2)[0]
                        print(treecompare.false_positives_and_negatives(tree1, tree2)[0],treecompare.false_positives_and_negatives(tree1, tree2)[1])
                        agg_false_negatives = agg_false_negatives+treecompare.false_positives_and_negatives(tree1, tree2)[1]
                        result_file.write('R'+str(i)+str(treecompare.false_positives_and_negatives(tree1, tree2)))
                    except Exception as e:
                        print("exception")
                        print(e)
                        result_file.write('R'+str(i)+"(err,err)\n")
            ave_false_positive = agg_false_positives/effective_samples
            ave_false_negative = agg_false_negatives/effective_samples
            result_file.write("total effective{}, averge false positive{},average false negative{}".format(effective_samples,ave_false_positive,ave_false_negative))   
            result_file.write('\n')

    result_file.close()