Example #1
    def add_sampled_network(self):

        # Create a networkx DiGraph to represent the true network
        tree = nx.DiGraph()

        cell_record = self.get_cell_record()
        keep_labels = self.get_node_labels()
        parent_ix_levels = self.get_parent_child_map()

        # Create nodes for the first (founder) level of the tree

        record = cell_record[0]
        prev_level = [
            Node(label, format_char_vec(record[i]))
            for i, label in enumerate(keep_labels[0])
        ]

        for level, mapping in enumerate(parent_ix_levels):
            # Construct edges from this level to the next level
            level_labels = keep_labels[level + 1]
            record = cell_record[level + 1]
            current_level = []
            for child in mapping:
                # Create a child node
                child_node = Node(level_labels[child],
                                  format_char_vec(record[child]))
                parent = prev_level[mapping[child]]
                tree.add_edges_from([(parent, child_node)])
                current_level.append(child_node)
            # Current level finished adding to tree, move on to lower level
            prev_level = current_level

        self.true_network = tree
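
As a reference for the construction pattern, here is a minimal, self-contained sketch of the same level-by-level loop using plain string labels; keep_labels and parent_ix_levels below are made-up stand-ins for the class accessors used above.

# Minimal sketch of the level-by-level construction, with plain string
# labels instead of cassiopeia Node objects (toy data).
import networkx as nx

keep_labels = [["root"], ["c0", "c1"], ["g0", "g1", "g2"]]
# parent_ix_levels[level][child_ix] -> parent index in the previous level
parent_ix_levels = [{0: 0, 1: 0}, {0: 0, 1: 0, 2: 1}]

tree = nx.DiGraph()
prev_level = keep_labels[0]
for level, mapping in enumerate(parent_ix_levels):
    current_level = []
    for child_ix, parent_ix in mapping.items():
        child = keep_labels[level + 1][child_ix]
        tree.add_edge(prev_level[parent_ix], child)
        current_level.append(child)
    prev_level = current_level

print(list(tree.edges()))
# [('root', 'c0'), ('root', 'c1'), ('c0', 'g0'), ('c0', 'g1'), ('c1', 'g2')]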
Example #2
def extend_dummy_branches(G, max_depth):
    """
    Converts the tree to an ultrametric tree by adding dummy nodes and branches,
    extending all true leaves to the max depth.

    :param G:
        Input tree
    :param max_depth:
        Depth to extend leaves to.

    :returns:
        Ultrametric tree with dummy edges/nodes.
    """

    leaves = [n for n in G.nodes if G.out_degree(n) == 0]
    for n in leaves:

        while G.nodes[n]["depth"] < max_depth:

            d = G.nodes[n]["depth"]
            new_node = Node('state-node', n.get_character_vec())
            parents = list(G.predecessors(n))
            for p in parents:
                G.remove_edge(p, n)
                G.add_edge(p, new_node)
            G.add_edge(new_node, n)

            G.nodes[new_node]["depth"] = d
            G.nodes[n]["depth"] = d + 1

    return G
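
The splice inside the while loop can be traced on a plain DiGraph; the string nodes, depths, and dummy labels below are toy stand-ins for the cassiopeia Node objects the real function creates.

# Self-contained sketch of the depth-extension idea on plain string nodes.
import itertools
import networkx as nx

G = nx.DiGraph([("root", "a"), ("root", "b")])
nx.set_node_attributes(G, {"root": 0, "a": 1, "b": 3}, "depth")

fresh = itertools.count()
max_depth = 3
for leaf in [n for n in list(G) if G.out_degree(n) == 0]:
    while G.nodes[leaf]["depth"] < max_depth:
        d = G.nodes[leaf]["depth"]
        dummy = "dummy-%d" % next(fresh)   # stands in for a 'state-node' Node
        for p in list(G.predecessors(leaf)):
            G.remove_edge(p, leaf)
            G.add_edge(p, dummy)
        G.add_edge(dummy, leaf)
        G.nodes[dummy]["depth"] = d
        G.nodes[leaf]["depth"] = d + 1

# every leaf now sits at max_depth
assert all(G.nodes[n]["depth"] == max_depth
           for n in G if G.out_degree(n) == 0)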
Example #3
def cassiopeia_reconstruct(simulation):
    print('Reconstructing Cassiopeia Tree')
    
    priors = None 
    character_matrix = simulation.get_final_cells()

    # Cassiopeia takes a string dataframe
    cm = character_matrix.replace(np.nan, -1)
    cm = cm.astype(int).astype(str).replace('-1','-')
    cm_uniq = cm.drop_duplicates(inplace=False)
    target_nodes = cm_uniq.values.tolist()
    target_nodes = list(map(lambda x, n: Node(n,x), target_nodes, cm_uniq.index))


    t = time.time()
    reconstructed_network_greedy = solve_lineage_instance(target_nodes, 
                                                          method="greedy", 
                                                          prior_probabilities=priors)
    cass_time = time.time()-t
    cass_network = reconstructed_network_greedy[0]
    true_tree = simulation.get_cleaned_tree()
    cass_tree, duplicates = utilities.convert_nx_to_tree(cass_network.network)
    
    our_score = utilities.triplets_correct(true_tree, cass_tree)
    print('Cassiopeia:', our_score)

    return cass_tree, {'our_score':our_score}
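
The chained conversion that turns missing values into '-' is easy to get wrong, so here is a minimal, self-contained pandas sketch of the same chain on toy data (not from the simulation object).

import numpy as np
import pandas as pd

# toy character matrix with one missing value
cm = pd.DataFrame({"r1": [1.0, np.nan], "r2": [0.0, 2.0]},
                  index=["cellA", "cellB"])

# NaN -> -1 -> int -> str -> '-'
cm = cm.replace(np.nan, -1).astype(int).astype(str).replace("-1", "-")
print(cm)   # cellA: 1 0, cellB: - 2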
Example #4
def convert_tree_to_nx(tree):
    """
    Convert a tree, represented as an skbio TreeNode, to a
    networkx DiGraph of Cassiopeia Nodes.
    """
    
    from cassiopeia.TreeSolver.Node import Node 

    network = nx.DiGraph()
    level_nodes = [tree]
    level_nx = [Node(x.name, is_target=False) for x in level_nodes]
    level = 0

    stop = False
    while not stop:
        successor_nodes = []
        successor_nx = []

        for i, tree_node in enumerate(level_nodes):
            node = level_nx[i]
            # level_nodes and level_nx are index-aligned, so `node` is the
            # Cassiopeia counterpart of tree_node

            for child_node in tree_node.children:
                # Create CassiopeiaNode for each child and add to the DiGraph
                # If the child is a leaf, then we need to add a character vector

                if child_node.is_tip():
                    child = Node(child_node.name, 
                                 character_vec = child_node.get_character_matrix().replace(-1,'-').values.reshape(-1).tolist(), 
                                 is_target=True)
                else:
                    child = Node(child_node.name, is_target=False)

                network.add_edges_from([(node, child)])

                successor_nodes.append(child_node)
                successor_nx.append(child)

        # Now the successor level is the current level for the next iteration
        level_nodes = successor_nodes
        level_nx = successor_nx

        if len(level_nodes) == 0:
            stop = True 
    return network 
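
The same breadth-first conversion pattern, sketched on a plain nested-dict tree rather than skbio TreeNode and cassiopeia Node objects.

# Level-order conversion of a nested-dict tree into a networkx DiGraph
# (toy structure; the real function walks TreeNode.children).
import networkx as nx

toy = {"root": {"left": {"leafA": {}, "leafB": {}}, "leafC": {}}}

network = nx.DiGraph()
level = [("root", toy["root"])]
while level:
    nxt = []
    for name, children in level:
        for child_name, grandchildren in children.items():
            network.add_edge(name, child_name)
            nxt.append((child_name, grandchildren))
    level = nxt

print(sorted(network.edges()))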
Example #5
def add_redundant_leaves(G, cm):
    """
    To fairly account for sample purity, add back 'redundant' leaves (i.e.
    leaves that were removed because of non-unique character strings).

    :param G:
        Input graph
    :param cm:
        Character matrix pandas Dataframe

    :return:
        Graph with redundant samples added back on.
    """

    # create lookup value for duplicates
    if 'lookup' not in cm.columns:
        cm["lookup"] = cm.astype('str').apply('|'.join, axis=1)

    net_nodes = np.intersect1d(cm.index, [n.name for n in G])

    uniq = cm.loc[net_nodes]

    if uniq.shape == cm.shape:
        return G

    # find all non-unique character states in cm
    nonuniq = np.setdiff1d(cm.index, uniq.index)

    for n in nonuniq:

        try:
            _leaf = uniq.index[uniq["lookup"] == cm.loc[n]["lookup"]][0]

            # [:-1] drops the trailing 'lookup' entry from the character vector
            new_node = Node(str(n), cm.loc[n].values[:-1], is_target=True)

            # G is keyed by Node objects, so recover the node whose name matches
            _leaf_node = [k for k in G if k.name == _leaf][0]
            parents = list(G.predecessors(_leaf_node))
            for p in parents:
                G.add_edge(p, new_node)
        except IndexError:
            # no unique twin with a matching character string
            continue

    return G
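
The 'lookup' trick above, sketched in isolation: each row is joined into a single string so a removed duplicate can be matched back to the unique leaf carrying the same character state (toy matrix, not real data).

import numpy as np
import pandas as pd

cm = pd.DataFrame([[1, 0], [1, 0], [2, 2]],
                  index=["kept", "dropped_dup", "other"])
cm["lookup"] = cm.astype(str).apply("|".join, axis=1)

uniq = cm.loc[["kept", "other"]]            # leaves that survived dedup
nonuniq = np.setdiff1d(cm.index, uniq.index)
for n in nonuniq:
    twin = uniq.index[uniq["lookup"] == cm.loc[n, "lookup"]][0]
    print(n, "attaches next to", twin)      # dropped_dup attaches next to kept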
Example #6
def assign_samples_to_charstrings(G, cm):
    """
    Preprocessing step for when sample names are not in the tree. Assigns sample names to the
    appropriate character states in the phylogeny.

    :param G:
        Input graph.
    :param cm:
        Character matrix pandas Dataframe.

    :return:
        Networkx Graph object as a tree with samples mapped onto the tree.
    """

    new_nodes = []
    new_edges = []


    if 'lookup' not in cm.columns:
        cm["lookup"] = cm.astype(str).apply(lambda x: "|".join(x), axis=1)

    for n in G:

        if n.get_character_string() in cm['lookup'].values and n.is_target:
            n.is_target = False
            sub_cm = cm.loc[cm["lookup"] == n.get_character_string()]
            _nodes = sub_cm.apply(
                lambda x: Node(x.name, x.values[:-1], is_target=True), axis=1
            )  # make sure to do up to [:-1] b/c you don't want the lookup in your character vec
            if len(_nodes) == 0:
                continue
            for new_node in _nodes:
                new_nodes.append(new_node)
                new_edges.append((n, new_node))

    G.add_nodes_from(new_nodes)
    G.add_edges_from(new_edges)

    return G
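
A note on the apply call above: with axis=1, each row arrives as a Series whose .name is the sample index, which is how a single apply yields one node per sample. A minimal sketch, with a tuple standing in for the Node constructor.

import pandas as pd

cm = pd.DataFrame([[1, 0], [1, 0]], index=["cell1", "cell2"])
cm["lookup"] = cm.astype(str).apply("|".join, axis=1)

# x.name is the row index; x.values[:-1] drops the trailing lookup string
nodes = cm.apply(lambda x: (x.name, list(x.values[:-1])), axis=1)
print(list(nodes))   # [('cell1', [1, 0]), ('cell2', [1, 0])]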
Example #7
def main():
    """
    Takes in a character matrix, an algorithm, and an output file and
    returns a tree in newick format.

    """
    parser = argparse.ArgumentParser()
    parser.add_argument("netfp", type=str, help="character_matrix")
    parser.add_argument("-nj",
                        "--neighbor-joining",
                        action="store_true",
                        default=False)
    parser.add_argument("--neighbor_joining_weighted",
                        action="store_true",
                        default=False)
    parser.add_argument("--ilp", action="store_true", default=False)
    parser.add_argument("--hybrid", action="store_true", default=False)
    parser.add_argument("--cutoff",
                        type=int,
                        default=80,
                        help="Cutoff for ILP during Hybrid algorithm")
    parser.add_argument(
        "--hybrid_lca_mode",
        action="store_true",
        help=
        "Use LCA distances to transition in hybrid mode, instead of number of cells",
    )
    parser.add_argument("--time_limit",
                        type=int,
                        default=-1,
                        help="Time limit for ILP convergence")
    parser.add_argument(
        "--iter_limit",
        type=int,
        default=-1,
        help="Max number of iterations for ILP solver",
    )
    parser.add_argument("--greedy", "-g", action="store_true", default=False)
    parser.add_argument("--camin-sokal",
                        "-cs",
                        action="store_true",
                        default=False)
    parser.add_argument("--verbose",
                        action="store_true",
                        default=False,
                        help="output verbosity")
    parser.add_argument("--mutation_map", type=str, default="")
    parser.add_argument("--num_threads", type=int, default=1)
    parser.add_argument("--no_triplets", action="store_true", default=False)
    parser.add_argument("--max_neighborhood_size", type=str, default=3000)
    parser.add_argument("--out_fp",
                        type=str,
                        default=None,
                        help="optional output file")
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Random seed for ILP solver")

    args = parser.parse_args()

    netfp = args.netfp
    outfp = args.out_fp
    verbose = args.verbose

    lca_mode = args.hybrid_lca_mode
    if lca_mode:
        lca_cutoff = args.cutoff
        cell_cutoff = None
    else:
        cell_cutoff = args.cutoff
        lca_cutoff = None
    time_limit = args.time_limit
    iter_limit = args.iter_limit
    num_threads = args.num_threads
    max_neighborhood_size = args.max_neighborhood_size
    seed = args.seed

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    score_triplets = not args.no_triplets

    prior_probs = None
    if args.mutation_map != "":

        prior_probs = pic.load(open(args.mutation_map, "rb"))

    name = netfp.split("/")[-1]
    stem = ".".join(name.split(".")[:-1])

    true_network = nx.read_gpickle(netfp)

    if isinstance(true_network, Cassiopeia_Tree):
        true_network = true_network.get_network()

    target_nodes = get_leaves_of_tree(true_network)

    target_nodes_uniq = []
    seen_charstrings = []
    for t in target_nodes:
        if t.char_string not in seen_charstrings:
            seen_charstrings.append(t.char_string)
            target_nodes_uniq.append(t)

    if args.greedy:

        if verbose:
            print("Running Greedy Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")

        reconstructed_network_greedy = solve_lineage_instance(
            target_nodes_uniq,
            method="greedy",
            prior_probabilities=prior_probs)

        net = reconstructed_network_greedy[0]

        if outfp is None:
            outfp = name.replace("true", "greedy")
        pic.dump(net, open(outfp, "wb"))

    elif args.hybrid:

        if verbose:
            print("Running Hybrid Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")
            print("Parameters: ILP on sets of " + str(cutoff) + " cells " +
                  str(time_limit) + "s to complete optimization")

        reconstructed_network_hybrid = solve_lineage_instance(
            target_nodes_uniq,
            method="hybrid",
            hybrid_cell_cutoff=cell_cutoff,
            hybrid_lca_cutoff=lca_cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            threads=num_threads,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )

        net = reconstructed_network_hybrid[0]

        if outfp is None:
            outfp = name.replace("true", "hybrid")
        pic.dump(net, open(outfp, "wb"))

    elif args.ilp:

        if verbose:
            print("Running Hybrid Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")
            print("Parameters: ILP on sets of " + str(cutoff) + " cells " +
                  str(time_limit) + "s to complete optimization")

        reconstructed_network_ilp = solve_lineage_instance(
            target_nodes_uniq,
            method="ilp",
            hybrid_subset_cutoff=args.cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )

        net = reconstructed_network_ilp[0]
        # reconstructed_network_ilp = nx.relabel_nodes(reconstructed_network_ilp, string_to_sample)
        if outfp is None:
            outfp = name.replace("true", "ilp")
        pic.dump(net, open(outfp, "wb"))

    elif args.neighbor_joining:

        if verbose:
            print("Running Neighbor-Joining on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        infile = "".join(name.split(".")[:-1]) + "infile.txt"
        fn = "".join(name.split(".")[:-1]) + "phylo.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)

        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(
            script) + " " + fn + " " + infile + " --relaxed"
        p = subprocess.Popen(cmd, shell=True)
        pid, ecode = os.waitpid(p.pid, 0)

        aln = AlignIO.read(infile, "phylip-relaxed")

        aln = unique_alignments(aln)

        t0 = time.time()
        calculator = DistanceCalculator("identity", skip_letters="?")
        constructor = DistanceTreeConstructor(calculator, "nj")

        tree = constructor.build_tree(aln)

        tree.root_at_midpoint()

        nj_net = Phylo.to_networkx(tree)

        # relabel Bio.Phylo clades as Cassiopeia Nodes for writing to file
        rndict = {}
        for n in nj_net:

            if n.name is None:
                rndict[n] = Node("state-node", [])
            else:
                rndict[n] = Node(n.name, [])

        nj_net = nx.relabel_nodes(nj_net, rndict)


        cm = pd.read_csv(fn, sep="\t", index_col=0)

        cm_lookup = dict(
            zip(
                list(
                    cm.apply(lambda x: "|".join([str(k) for k in x.values]),
                             axis=1)),
                cm.index.values,
            ))

        nj_net = fill_in_tree(nj_net, cm)

        nj_net = tree_collapse(nj_net)

        for n in nj_net:
            if n.char_string in cm_lookup.keys():
                n.is_target = True

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)
        if outfp is None:
            outfp = name.replace("true", "nj")
        pic.dump(nj_net, open(outfp, "wb"))
        # Phylo.write(tree, out, 'newick')

        os.system("rm " + infile)
        os.system("rm " + fn)

    elif args.neighbor_joining_weighted:

        if verbose:
            print("Running Neighbor-Joining with Weighted Scoring on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        target_node_charstrings = np.array(
            [t.get_character_vec() for t in target_nodes_uniq])
        dm = compute_distance_mat(target_node_charstrings,
                                  len(target_node_charstrings),
                                  priors=prior_probs)

        ids = [t.name for t in target_nodes_uniq]
        cm_uniq = pd.DataFrame(target_node_charstrings)
        cm_uniq.index = ids
        dm = sp.spatial.distance.squareform(dm)

        dm = DistanceMatrix(dm, ids)

        newick_str = nj(dm, result_constructor=str)

        tree = newick_to_network(newick_str, cm_uniq)

        nj_net = fill_in_tree(tree, cm_uniq)
        nj_net = tree_collapse(nj_net)

        cm_lookup = dict(
            zip(
                list(
                    cm_uniq.apply(
                        lambda x: "|".join([str(k) for k in x.values]),
                        axis=1)),
                cm_uniq.index.values,
            ))

        for n in nj_net:
            if n.char_string in cm_lookup:
                n.is_target = True
            else:
                n.is_target = False

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)
        if outfp is None:
            outfp = name.replace("true", "nj_weighted")
        pic.dump(nj_net, open(outfp, "wb"))

    elif args.camin_sokal:

        if verbose:
            print("Running Camin-Sokal Max Parsimony Algorithm on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        samples_to_cells = {}
        indices = []
        for i, n in zip(range(len(target_nodes_uniq)), target_nodes_uniq):
            samples_to_cells["s" + str(i)] = n.name
            indices.append(n.name)
            n.name = str(i)

        infile = "".join(name.split(".")[:-1]) + "_cs_infile.txt"
        fn = "".join(name.split(".")[:-1]) + "_cs_phylo.txt"
        weights_fn = "".join(name.split(".")[:-1]) + "_cs_weights.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)

        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(script) + " " + fn + " " + infile
        pi = subprocess.Popen(cmd, shell=True)
        pid, ecode = os.waitpid(pi.pid, 0)

        weights = construct_weights(infile, weights_fn)

        os.system("touch outfile")
        os.system("touch outtree")

        outfile = stem + "outfile.txt"
        outtree = stem + "outtree.txt"
        # run phylip mix with camin-sokal
        responses = "." + stem + ".temp.txt"
        FH = open(responses, "w")
        current_dir = os.getcwd()
        FH.write(infile + "\n")
        FH.write("F\n" + outfile + "\n")
        FH.write("P\n")
        FH.write("W\n")
        FH.write("Y\n")
        FH.write(weights_fn + "\n")
        FH.write("F\n" + outtree + "\n")
        FH.close()

        t0 = time.time()
        cmd = "~/software/phylip-3.697/exe/mix"
        cmd += " < " + responses + " > screenout1"
        p = subprocess.Popen(cmd, shell=True)
        pid, ecode = os.waitpid(p.pid, 0)

        consense_outtree = stem + "consenseouttree.txt"
        consense_outfile = stem + "consenseoutfile.txt"

        FH = open(responses, "w")
        FH.write(outtree + "\n")
        FH.write("F\n" + consense_outfile + "\n")
        FH.write("Y\n")
        FH.write("F\n" + consense_outtree + "\n")
        FH.close()

        if verbose:
            print("Computing Consensus Tree, elasped time: " +
                  str(time.time() - t0))

        cmd = "~/software/phylip-3.697/exe/consense"
        cmd += " < " + responses + " > screenout"
        p2 = subprocess.Popen(cmd, shell=True)
        pid, ecode = os.waitpid(p2.pid, 0)

        newick_str = ""
        with open(consense_outtree, "r") as f:
            for l in f:
                l = l.strip()
                newick_str += l

        cm = pd.read_csv(fn, sep="\t", index_col=0, dtype=str)
        cm.index = indices

        cs_net = newick_to_network(newick_str, cm)

        for n in cs_net:
            if n.name in samples_to_cells:
                n.name = samples_to_cells[n.name]

        cs_net = fill_in_tree(cs_net, cm)

        cs_net = tree_collapse2(cs_net)

        cm_lookup = dict(
            zip(
                list(
                    cm.apply(lambda x: "|".join([str(k) for k in x.values]),
                             axis=1)),
                cm.index.values,
            ))

        for n in cs_net:
            if n.char_string in cm_lookup.keys():
                n.is_target = True

        cs_net = Cassiopeia_Tree("camin-sokal", network=cs_net)
        if outfp is None:
            outfp = name.replace("true", "cs")
        pic.dump(cs_net, open(outfp, "wb"))

        os.system("rm " + outfile)
        os.system("rm " + responses)
        os.system("rm " + outtree)
        os.system("rm " + consense_outfile)
        os.system("rm " + infile)
        os.system("rm " + fn)

    else:

        raise Exception(
            "Please choose an algorithm from the list: greedy, hybrid, ilp, nj, or camin-sokal"
        )
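
The Camin-Sokal branch drives PHYLIP's interactive menus by writing the keystrokes to a response file and piping it to stdin. A sketch of that pattern; the binary path and the meaning of the menu keys are assumptions carried over from the code above.

import os
import subprocess

# menu keystrokes for phylip mix, one answer per line (assumed to match
# the responses written in the Camin-Sokal branch above)
with open("responses.txt", "w") as fh:
    fh.write("infile.txt\n")        # data file
    fh.write("F\noutfile.txt\n")    # redirect the report file
    fh.write("P\n")                 # toggle parsimony method (Camin-Sokal)
    fh.write("W\n")                 # toggle use of a weights file
    fh.write("Y\n")                 # accept settings
    fh.write("weights.txt\n")       # weights file
    fh.write("F\nouttree.txt\n")    # redirect the tree file

mix = os.path.expanduser("~/software/phylip-3.697/exe/mix")
with open("responses.txt") as stdin, open("screenout", "w") as stdout:
    subprocess.run([mix], stdin=stdin, stdout=stdout)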
Example #8
def main():
    """
    Takes in a character matrix, an algorithm, and an output file and 
    returns a tree in newick format.

    """

    parser = argparse.ArgumentParser()
    parser.add_argument("char_fp", type=str, help="character_matrix")
    parser.add_argument("out_fp", type=str, help="output file name")
    parser.add_argument("-nj",
                        "--neighbor-joining",
                        action="store_true",
                        default=False)
    parser.add_argument("--neighbor_joining_weighted",
                        action="store_true",
                        default=False)
    parser.add_argument("--ilp", action="store_true", default=False)
    parser.add_argument("--hybrid", action="store_true", default=False)
    parser.add_argument("--cutoff",
                        type=int,
                        default=80,
                        help="Cutoff for ILP during Hybrid algorithm")
    parser.add_argument(
        "--hybrid_lca_mode",
        action="store_true",
        help=
        "Use LCA distances to transition in hybrid mode, instead of number of cells",
    )
    parser.add_argument("--time_limit",
                        type=int,
                        default=1500,
                        help="Time limit for ILP convergence")
    parser.add_argument("--greedy", "-g", action="store_true", default=False)
    parser.add_argument("--camin-sokal",
                        "-cs",
                        action="store_true",
                        default=False)
    parser.add_argument("--verbose",
                        action="store_true",
                        default=False,
                        help="output verbosity")
    parser.add_argument("--mutation_map", type=str, default="")
    parser.add_argument("--num_threads", type=int, default=1)
    parser.add_argument("--max_neighborhood_size", type=int, default=10000)
    parser.add_argument("--weighted_ilp",
                        "-w",
                        action="store_true",
                        default=False)
    parser.add_argument("--greedy_min_allele_rep", type=float, default=1.0)
    parser.add_argument("--fuzzy_greedy", action="store_true", default=False)
    parser.add_argument("--multinomial_greedy",
                        action="store_true",
                        default=False)
    parser.add_argument("--num_neighbors", default=10)
    parser.add_argument("--num_alternative_solutions", default=100, type=int)
    parser.add_argument("--greedy_missing_data_mode",
                        default="lookahead",
                        type=str)
    parser.add_argument("--greedy_lookahead_depth", default=3, type=int)

    args = parser.parse_args()

    char_fp = args.char_fp
    out_fp = args.out_fp
    verbose = args.verbose

    lca_mode = args.hybrid_lca_mode
    if lca_mode:
        lca_cutoff = args.cutoff
        cell_cutoff = None
    else:
        cell_cutoff = args.cutoff
        lca_cutoff = None
    time_limit = args.time_limit
    num_threads = args.num_threads

    n_neighbors = args.num_neighbors
    num_alt_soln = args.num_alternative_solutions

    max_neighborhood_size = args.max_neighborhood_size

    missing_data_mode = args.greedy_missing_data_mode
    lookahead_depth = args.greedy_lookahead_depth
    if missing_data_mode not in ["knn", "lookahead", "avg", "modified_avg"]:
        raise Exception("Greedy missing data mode not recognized")

    stem = "".join(char_fp.split(".")[:-1])

    cm = pd.read_csv(char_fp, sep="\t", index_col=0, dtype=str)

    cm_uniq = cm.drop_duplicates(inplace=False)

    cm_lookup = list(cm.apply(lambda x: "|".join(x.values), axis=1))
    newick = ""

    prior_probs = None
    if args.mutation_map != "":

        prior_probs = read_mutation_map(args.mutation_map)

    weighted_ilp = args.weighted_ilp
    if prior_probs is None and weighted_ilp:
        raise Exception(
            "If you'd like to use weighted ILP reconstructions, you need to provide a mutation map (i.e. prior probabilities)"
        )

    greedy_min_allele_rep = args.greedy_min_allele_rep
    fuzzy = args.fuzzy_greedy
    probabilistic = args.multinomial_greedy

    if args.greedy:

        target_nodes = list(
            cm_uniq.apply(lambda x: Node(x.name, x.values), axis=1))

        if verbose:
            print("Read in " + str(cm.shape[0]) + " Cells")
            print("Running Greedy Algorithm on " + str(len(target_nodes)) +
                  " Unique States")

        reconstructed_network_greedy, potential_graph_sizes = solve_lineage_instance(
            target_nodes,
            method="greedy",
            prior_probabilities=prior_probs,
            greedy_minimum_allele_rep=greedy_min_allele_rep,
            fuzzy=fuzzy,
            probabilistic=probabilistic,
            n_neighbors=n_neighbors,
            missing_data_mode=missing_data_mode,
            lookahead_depth=lookahead_depth,
        )

        net = reconstructed_network_greedy.get_network()

        out_stem = "".join(out_fp.split(".")[:-1])
        pic.dump(reconstructed_network_greedy, open(out_stem + ".pkl", "wb"))

        newick = reconstructed_network_greedy.get_newick()

        with open(out_fp, "w") as f:
            f.write(newick)

        root = [n for n in net if net.in_degree(n) == 0][0]
        # score parsimony
        score = 0
        for e in nx.dfs_edges(net, source=root):
            score += e[0].get_mut_length(e[1])

        print("Parsimony: " + str(score))

    elif args.hybrid:

        target_nodes = list(
            cm_uniq.apply(lambda x: Node(x.name, x.values), axis=1))

        if verbose:
            print("Running Hybrid Algorithm on " + str(len(target_nodes)) +
                  " Cells")
            if lca_mode:
                print(
                    "Parameters: ILP on sets of cells with a maximum LCA distance of "
                    + str(lca_cutoff) + " with " + str(time_limit) +
                    "s to complete optimization")
            else:
                print("Parameters: ILP on sets of " + str(cell_cutoff) +
                      " cells with " + str(time_limit) +
                      "s to complete optimization")

        # string_to_sample = dict(zip(target_nodes, cm_uniq.index))

        # target_nodes = list(map(lambda x, n: x + "_" + n, target_nodes, cm_uniq.index))

        print("running algorithm...")
        reconstructed_network_hybrid, potential_graph_sizes = solve_lineage_instance(
            target_nodes,
            method="hybrid",
            hybrid_cell_cutoff=cell_cutoff,
            hybrid_lca_cutoff=lca_cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            threads=num_threads,
            max_neighborhood_size=max_neighborhood_size,
            weighted_ilp=weighted_ilp,
            greedy_minimum_allele_rep=greedy_min_allele_rep,
            fuzzy=fuzzy,
            probabilistic=probabilistic,
            n_neighbors=n_neighbors,
            maximum_alt_solutions=num_alt_soln,
            missing_data_mode=missing_data_mode,
            lookahead_depth=lookahead_depth,
        )

        net = reconstructed_network_hybrid.get_network()

        if verbose:
            print("Writing the tree to output...")

        out_stem = "".join(out_fp.split(".")[:-1])
        pic.dump(reconstructed_network_hybrid, open(out_stem + ".pkl", "wb"))

        newick = reconstructed_network_hybrid.get_newick()

        with open(out_fp, "w") as f:
            f.write(newick)

        ## plot out diagnostic potential graph sizes
        h = plt.figure(figsize=(10, 10))
        for i in range(len(potential_graph_sizes)):
            try:
                x, y = (
                    [k for k in potential_graph_sizes[i].keys()],
                    [
                        potential_graph_sizes[i][k]
                        for k in potential_graph_sizes[i].keys()
                    ],
                )
                plt.plot(x, y)
            except Exception:
                continue
        # plt.xlim(0, int(cutoff))
        plt.xlabel("LCA Distance")
        plt.ylabel("Size of Potential Graph")
        plt.savefig(out_stem + "_potentialgraphsizes.pdf")

        # score parsimony
        score = 0
        for e in net.edges():
            score += e[0].get_mut_length(e[1])

        print("Parsimony: " + str(score))

    elif args.ilp:

        target_nodes = list(
            cm_uniq.apply(lambda x: Node(x.name, x.values), axis=1))

        if verbose:
            print("Running ILP Algorithm on " + str(len(target_nodes)) +
                  " Unique Cells")
            print("Paramters: ILP allowed " + str(time_limit) +
                  "s to complete optimization")

        reconstructed_network_ilp, potential_graph_sizes = solve_lineage_instance(
            target_nodes,
            method="ilp",
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            max_neighborhood_size=max_neighborhood_size,
            weighted_ilp=weighted_ilp,
            maximum_alt_solutions=num_alt_soln,
        )

        net = reconstructed_network_ilp.get_network()

        root = [n for n in net if net.in_degree(n) == 0][0]

        # score parsimony
        score = 0
        for e in nx.dfs_edges(net, source=root):
            score += e[0].get_mut_length(e[1])

        print("Parsimony: " + str(score))

        newick = reconstructed_network_ilp.get_newick()

        if verbose:
            print("Writing the tree to output...")

        out_stem = "".join(out_fp.split(".")[:-1])
        pic.dump(reconstructed_network_ilp, open(out_stem + ".pkl", "wb"))

        with open(out_fp, "w") as f:
            f.write(newick)

        h = plt.figure(figsize=(10, 10))
        for i in range(len(potential_graph_sizes)):
            try:
                x, y = (
                    [k for k in potential_graph_sizes[i].keys()],
                    [
                        potential_graph_sizes[i][k]
                        for k in potential_graph_sizes[i].keys()
                    ],
                )
                plt.plot(x, y)
            except Exception:
                continue
        # plt.xlim(0, int(cutoff))
        plt.xlabel("LCA Distance")
        plt.ylabel("Size of Potential Graph")
        plt.savefig(out_stem + "_potentialgraphsizes.pdf")

    elif args.neighbor_joining:

        out_stem = "".join(out_fp.split(".")[:-1])

        ret_tree = run_nj_naive(cm_uniq, stem, verbose)

        pic.dump(ret_tree, open(out_stem + ".pkl", "wb"))

        newick = ret_tree.get_newick()

        with open(out_fp, "w") as f:
            f.write(newick)

    elif args.neighbor_joining_weighted:

        out_stem = "".join(out_fp.split(".")[:-1])
        ret_tree = run_nj_weighted(cm_uniq, prior_probs, verbose)

        pic.dump(ret_tree, open(out_stem + ".pkl", "wb"))

        newick = ret_tree.get_newick()

        with open(out_fp, "w") as f:
            f.write(newick)

    elif args.camin_sokal:

        out_stem = "".join(out_fp.split(".")[:-1])

        ret_tree = run_camin_sokal(cm_uniq, stem, verbose)

        pic.dump(ret_tree, open(out_stem + ".pkl", "wb"))

        newick = convert_network_to_newick_format(ret_tree.get_network())
        # newick = ret_tree.get_newick()

        with open(out_fp, "w") as f:
            f.write(newick)

    elif args.max_likelihood:


        if verbose:
            print("Running Maximum Likelihood on " + str(cm.shape[0]) +
                  " Unique Cells")

        infile = stem + "infile.txt"
        fn = stem + "phylo.txt"

        cm.to_csv(fn, sep="\t")

        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(
            script) + " " + fn + " " + infile + " --relaxed"
        p = subprocess.Popen(cmd, shell=True)
        pid, ecode = os.waitpid(p.pid, 0)

        os.system("/home/mattjones/software/FastTreeMP < " + infile + " > " +
                  out_fp)

        tree = next(Phylo.parse(out_fp, "newick"))

        ml_net = Phylo.to_networkx(tree)

        i = 0
        for n in ml_net:
            if n.name is None:
                n.name = "internal" + str(i)
                i += 1

        c2str = map(lambda x: str(x), ml_net.nodes())
        c2strdict = dict(zip(ml_net.nodes(), c2str))
        ml_net = nx.relabel_nodes(ml_net, c2strdict)

        out_stem = "".join(out_fp.split(".")[:-1])

        pic.dump(ml_net, open(out_stem + ".pkl", "wb"))

        os.system("rm " + infile)
        os.system("rm " + fn)

    else:

        raise Exception(
            "Please choose an algorithm from the list: greedy, hybrid, ilp, nj, max-likelihood, or camin-sokal"
        )
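
Both scripts score parsimony by summing per-edge mutation counts along a depth-first walk from the root. A self-contained sketch; _Node and mut_length are stand-ins for the cassiopeia Node API.

import networkx as nx

class _Node:
    def __init__(self, name, chars):
        self.name, self.chars = name, chars
    def mut_length(self, child):
        # number of characters that changed along the edge to child
        return sum(a != b for a, b in zip(self.chars, child.chars))

r = _Node("root", ["0", "0"])
a = _Node("a", ["1", "0"])
b = _Node("b", ["1", "2"])
net = nx.DiGraph([(r, a), (a, b)])

root = [n for n in net if net.in_degree(n) == 0][0]
score = sum(u.mut_length(v) for u, v in nx.dfs_edges(net, source=root))
print("Parsimony:", score)   # 1 mutation on root->a, 1 on a->b => 2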
Example #9
def solve_lineage_instance(
    _target_nodes,
    prior_probabilities=None,
    method="hybrid",
    threads=8,
    hybrid_cell_cutoff=200,
    hybrid_lca_cutoff=None,
    time_limit=1800,
    max_neighborhood_size=10000,
    seed=None,
    num_iter=-1,
    weighted_ilp=False,
    fuzzy=False,
    probabilistic=False,
    plot_diagnostics=True,
    maximum_alt_solutions=100,
    greedy_minimum_allele_rep=1.0,
    n_neighbors=10,
    missing_data_mode="lookahead",
    lookahead_depth=3,
):
    """
	Aggregated lineage solving method, which given a set of target nodes, will find the maximum parsimony tree
	accounting the given target nodes

	:param target_nodes:
		A list of target nodes, where each node is in the form 'Ch1|Ch2|....|Chn'
	:param prior_probabilities:
		A nested dictionary containing prior probabilities for [character][state] mappings
		where characters are in the form of integers, and states are in the form of strings,
		and values are the probability of mutation from the '0' state.
	:param method:
		The method used for solving the problem ['ilp, 'hybrid', 'greedy']
			- ilp: Attempts to solve the problem based on steiner tree on the potential graph
				   (Recommended for instances with several hundred samples at most)
			- greedy: Runs a greedy algorithm to find the maximum parsimony tree based on choosing the most occurring split in a
				   top down fasion (Algorithm scales to any number of samples)
			- hybrid: Runs the greedy algorithm until there are less than hybrid_subset_cutoff samples left in each leaf of the
				   tree, and then returns a series of small instance ilp is then run on these smaller instances, and the
				   resulting graph is created by merging the smaller instances with the greedy top-down tree
	:param threads:
		The number of threads to use in parallel for the hybrid algorithm
	:param hybrid_subset_cutoff:
		The maximum number of nodes allowed before the greedy algorithm terminates for a given leaf node
	:return:
		A reconstructed subgraph representing the nodes
	"""

    if method == "hybrid":
        assert (hybrid_cell_cutoff is None or hybrid_lca_cutoff is None
                ), "You can only use one type of cutoff in Hybrid"

    target_nodes = [
        n.get_character_string() + "_" + n.name for n in _target_nodes
    ]

    node_name_dict = dict(
        zip(
            [n.split("_")[0] for n in target_nodes],
            [n + "_target" for n in target_nodes],
        ))

    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)

    # clip identifier for now, but make sure to add later
    target_nodes = [n.split("_")[0] for n in target_nodes]

    # target_nodes = list(set(target_nodes))
    master_root = root_finder(target_nodes)
    if method == "ilp":

        subgraphs, r, pid, graph_sizes = find_good_gurobi_subgraph(
            master_root,
            target_nodes,
            node_name_dict,
            prior_probabilities,
            time_limit,
            1,
            max_neighborhood_size,
            seed=seed,
            num_iter=num_iter,
            weighted=weighted_ilp,
            n_neighbors=n_neighbors,
        )

        subgraph = subgraphs[0]

        rdict = {}
        target_seen = []

        for n in subgraph:
            spl = n.split("_")
            nn = Node(n, spl[0].split("|"), is_target=False)

            if len(spl) == 2:
                if "target" in n and nn.char_string not in target_seen:
                    nn.is_target = True

            if len(spl) > 2:
                if "target" in n and nn.char_string not in target_seen:
                    nn.is_target = True
                nn.pid = spl[-1]

            if nn.is_target:
                target_seen.append(nn.char_string)

            rdict[n] = nn

        state_tree = nx.relabel_nodes(subgraph, rdict)

        return (
            Cassiopeia_Tree(method="ilp",
                            network=state_tree,
                            name="Cassiopeia_state_tree"),
            graph_sizes,
        )

    if method == "hybrid":

        neighbors, distances = None, None
        if missing_data_mode == "knn":
            print("Computing neighbors for imputing missing values...")
            neighbors, distances = find_neighbors(target_nodes,
                                                  n_neighbors=n_neighbors)

        network, target_sets = greedy_build(
            target_nodes,
            neighbors,
            distances,
            priors=prior_probabilities,
            cell_cutoff=hybrid_cell_cutoff,
            lca_cutoff=hybrid_lca_cutoff,
            fuzzy=fuzzy,
            probabilistic=probabilistic,
            minimum_allele_rep=greedy_minimum_allele_rep,
            missing_data_mode=missing_data_mode,
            lookahead_depth=lookahead_depth,
        )

        print(
            "Using " + str(min(multiprocessing.cpu_count(), threads)) +
            " threads, " + str(multiprocessing.cpu_count()) + " available.",
            flush=True,
        )
        executor = concurrent.futures.ProcessPoolExecutor(
            min(multiprocessing.cpu_count(), threads))
        print("Sending off Target Sets: " + str(len(target_sets)), flush=True)

        # just in case you've hit a target node during the greedy reconstruction, append name at this stage
        # so the composition step doesn't get confused when trying to join to the root.
        network = nx.relabel_nodes(network, node_name_dict)

        futures = [
            executor.submit(
                find_good_gurobi_subgraph,
                root,
                targets,
                node_name_dict,
                prior_probabilities,
                time_limit,
                1,
                max_neighborhood_size,
                seed,
                num_iter,
                weighted_ilp,
                n_neighbors,
            ) for root, targets in target_sets
        ]

        concurrent.futures.wait(futures)

        base_network = network.copy()
        base_rdict = {}
        for n in base_network:
            spl = n.split("_")
            nn = Node(n, spl[0].split("|"), is_target=False)
            if len(spl) > 1:
                nn.pid = spl[1]
            if spl[0] in node_name_dict:
                nn.is_target = True

            base_rdict[n] = nn

        base_network = nx.relabel_nodes(base_network, base_rdict)

        num_solutions = 1  # keep track of number of possible solutions
        potential_graph_sizes = []
        all_res = []
        alt_solutions = {}

        for future in futures:
            results, r, pid, graph_sizes = future.result()
            potential_graph_sizes.append(graph_sizes)

            subproblem_solutions = []
            for res in results:
                new_names = {}
                for n in res:
                    if res.in_degree(n) == 0 or n == r:
                        new_names[n] = n
                    else:
                        new_names[n] = n + "_" + str(pid)
                res = nx.relabel_nodes(res, new_names)
                subproblem_solutions.append(res)

            num_solutions *= len(subproblem_solutions)
            all_res.append(subproblem_solutions)

            rt = [
                n for n in subproblem_solutions[0]
                if subproblem_solutions[0].in_degree(n) == 0
            ][0]
            alt_solutions[base_rdict[rt]] = subproblem_solutions

            network = nx.compose(network, subproblem_solutions[0])

        rdict = {}
        target_seen = []

        for n in network:
            spl = n.split("_")
            nn = Node(n, spl[0].split("|"), is_target=False)

            if len(spl) == 2:
                if "target" in n and nn.char_string not in target_seen:
                    nn.is_target = True

            if len(spl) > 2:
                if "target" in n and nn.char_string not in target_seen:
                    nn.is_target = True
                nn.pid = spl[-1]

            if nn.is_target:
                target_seen.append(nn.char_string)

            rdict[n] = nn

        state_tree = nx.relabel_nodes(network, rdict)

        # create alternative solutions
        pbar = tqdm(total=len(alt_solutions.keys()),
                    desc="Enumerating alternative solutions")
        for r in alt_solutions.keys():
            soln_list = []

            # get original target char strings
            # sub_targets = [n.char_string for n in state_tree.successors(r) if n.is_target]
            for res in alt_solutions[r]:

                rdict = {}
                for n in res:
                    spl = n.split("_")
                    nn = Node(n, spl[0].split("|"), is_target=False)

                    if len(spl) > 2:
                        nn.pid = spl[-1]

                    rdict[n] = nn

                res = nx.relabel_nodes(res, rdict)
                soln_list.append(res)

            alt_solutions[r] = soln_list

            pbar.update(1)  # update progress bar

        # iterate through all possible solutions
        # alt_solutions = []

        # if num_solutions > 1:

        # 	num_considered_solutions = 0
        # 	sol_identifiers = []  # keep track of solutions already sampled

        # 	# we'll sample maximum_alt_solutions from the set of possible solutions
        # 	pbar = tqdm(
        # 		total=maximum_alt_solutions, desc="Enumerating alternative solutions"
        # 	)
        # 	while num_considered_solutions < min(num_solutions, maximum_alt_solutions):

        # 		current_sol = []
        # 		for res_list in all_res:
        # 			current_sol.append(np.random.choice(len(res_list)))

        # 		if tuple(current_sol) not in sol_identifiers:

        # 			new_network = base_network.copy()
        # 			for i in range(len(current_sol)):
        # 				res_list = all_res[i]
        # 				net = res_list[current_sol[i]]
        # 				new_network = nx.compose(new_network, net)

        # 			rdict = {}
        # 			target_seen = []
        # 			for n in new_network:
        # 				spl = n.split("_")
        # 				nn = Node("state-node", spl[0].split("|"), is_target=False)

        # 				if len(spl) == 2:
        # 					if "target" in n and n not in target_seen:
        # 						nn.is_target = True

        # 				if len(spl) > 2:
        # 					if 'target' in n and n not in target_seen:
        # 						nn.is_target = True
        # 					nn.pid = spl[-1]

        # 				if nn.is_target:
        # 					target_seen.append(nn.char_string)

        # 				rdict[n] = nn

        # 			new_network = nx.relabel_nodes(new_network, rdict)

        # 			alt_solutions.append(new_network)

        # 			sol_identifiers.append(tuple(current_sol))
        # 			num_considered_solutions += 1

        # 			pbar.update(1)  # update progress bar

        return (
            Cassiopeia_Tree(
                method="hybrid",
                network=state_tree,
                name="Cassiopeia_state_tree",
                alternative_solutions=alt_solutions,
                base_network=base_network,
            ),
            potential_graph_sizes,
        )

    if method == "greedy":

        neighbors, distances = None, None
        if missing_data_mode == "knn":
            print("Computing neighbors for imputing missing values...")
            neighbors, distances = find_neighbors(target_nodes,
                                                  n_neighbors=n_neighbors)

        graph = greedy_build(
            target_nodes,
            neighbors,
            distances,
            priors=prior_probabilities,
            cell_cutoff=-1,
            lca_cutoff=None,
            fuzzy=fuzzy,
            probabilistic=probabilistic,
            minimum_allele_rep=greedy_minimum_allele_rep,
            missing_data_mode=missing_data_mode,
            lookahead_depth=lookahead_depth,
        )[0]

        rdict = {}
        for n in graph:
            spl = n.split("_")
            nn = Node(n, spl[0].split("|"), is_target=False)
            if len(spl) > 1:
                nn.pid = spl[1]
            if spl[0] in node_name_dict and len(spl) == 1:
                nn.is_target = True
            rdict[n] = nn

        state_tree = nx.relabel_nodes(graph, rdict)

        return (
            Cassiopeia_Tree(method="greedy",
                            network=state_tree,
                            name="Cassiopeia_state_tree"),
            None,
        )

    else:
        raise Exception(
            "Please specify one of the following methods: ilp, hybrid, greedy")
def run_nj_naive(cm_uniq, stem, verbose=True):

    if verbose:
        print("Running Neighbor-Joining on " + str(cm_uniq.shape[0]) +
              " Unique Cells")

    cm_lookup = list(cm_uniq.apply(lambda x: "|".join(x.values), axis=1))

    fn = stem + "phylo.txt"
    infile = stem + "infile.txt"

    cm_uniq.to_csv(fn, sep='\t')

    script = (SCLT_PATH / 'TreeSolver' / 'binarize_multistate_charmat.py')
    cmd = "python3.6 " + str(script) + " " + fn + " " + infile + " --relaxed"
    p = subprocess.Popen(cmd, shell=True)
    pid, ecode = os.waitpid(p.pid, 0)

    aln = AlignIO.read(infile, "phylip-relaxed")

    calculator = DistanceCalculator('identity')
    constructor = DistanceTreeConstructor(calculator, 'nj')
    tree = constructor.build_tree(aln)

    tree.root_at_midpoint()

    nj_net = Phylo.to_networkx(tree)

    # convert labels to characters for writing to file
    rndict = {}
    for n in nj_net:

        if n.name is None or n.name not in cm_uniq.index:
            # unnamed clades, and inner nodes the constructor named, become
            # state-nodes
            rndict[n] = Node('state-node', [])
        else:
            rndict[n] = Node(n.name, cm_uniq.loc[n.name].values)

    nj_net = nx.relabel_nodes(nj_net, rndict)

    # nj_net = fill_in_tree(nj_net, cm_uniq)

    # nj_net = tree_collapse2(nj_net)

    for n in nj_net:
        if nj_net.out_degree(n) == 0 and n.char_string in cm_lookup:
            n.is_target = True
        else:
            n.is_target = False

    state_tree = nj_net
    ret_tree = Cassiopeia_Tree(method='neighbor-joining',
                               network=state_tree,
                               name='Cassiopeia_state_tree')

    os.system("rm " + infile)
    os.system("rm " + fn)

    return ret_tree
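
A minimal sketch of the Biopython neighbor-joining calls used above, run on a small in-memory alignment instead of a PHYLIP file.

from Bio.Align import MultipleSeqAlignment
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# toy alignment standing in for the binarized character matrix
aln = MultipleSeqAlignment([
    SeqRecord(Seq("AACG"), id="s1"),
    SeqRecord(Seq("AATG"), id="s2"),
    SeqRecord(Seq("TTTG"), id="s3"),
])

calculator = DistanceCalculator("identity")
constructor = DistanceTreeConstructor(calculator, "nj")
tree = constructor.build_tree(aln)
tree.root_at_midpoint()
print(tree)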
Example #11
def prune_and_clean_leaves(G):
    """
    Prune off leaves that don't correspond to samples and clean up the leaf names (i.e. only keep
    the sample labels and remove character states or post-processing hashes).

    :param G: 
        Networkx Graph as a tree
    :return: 
        Pruned and cleaned tree as a Networkx object.
    """

    new_nodes = []
    new_edges = []

    def prune_leaves(G):

        nodes_to_remove = []

        # first remove paths to leaves that don't correspond to samples
        _leaves = [n for n in G if G.out_degree(n) == 0]

        for n in _leaves:
            # we detect leaves that are not targets by the `is_target` attribute.
            if not n.is_target:
                nodes_to_remove.append(n)

        return nodes_to_remove

    nodes_to_remove = prune_leaves(G)
    while len(nodes_to_remove) > 0:
        for n in set(nodes_to_remove):
            G.remove_node(n)

        nodes_to_remove = prune_leaves(G)

    for n in G.nodes:
        if n.is_target and G.out_degree(n) != 0:
            # this target is an internal node: demote it, and if it carries a
            # real sample name, hang a new 'redundant' leaf off of it so the
            # sample remains a leaf
            n.is_target = False
            if n.name != 'state-node':
                new_node = Node(n.name,
                                n.get_character_vec(),
                                is_target=True)
                new_nodes.append(new_node)
                new_edges.append((n, new_node))

    G.add_nodes_from(new_nodes)
    G.add_edges_from(new_edges)


    return G
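
The pruning loop above, sketched on a plain DiGraph: keep deleting non-target leaves until every remaining leaf is a target (string nodes and a target set stand in for Node.is_target).

import networkx as nx

G = nx.DiGraph([("root", "a"), ("root", "b"), ("a", "t1"), ("b", "x")])
targets = {"t1"}

while True:
    dead = [n for n in G if G.out_degree(n) == 0 and n not in targets]
    if not dead:
        break
    G.remove_nodes_from(dead)   # removing 'x' exposes 'b' as a new dead leaf

print(sorted(G.nodes()))   # ['a', 'root', 't1'] -- 'b' and 'x' pruned away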