def Neigborjoin(Matrice):
    """Build a neighbor-joining tree, print it, and return it as Newick text.

    The tree is rendered as ASCII art on stdout (followed by a blank line)
    before the Newick string is returned.

    :param Matrice: square distance data accepted by ``DistanceMatrix``.
        Iterating over it must yield the tip labels (presumably a pandas
        DataFrame whose column names are the labels -- TODO confirm).
    :return: the result of ``nj(..., result_constructor=str)``.
    """
    labels = list(Matrice)
    distance_matrix = DistanceMatrix(Matrice, labels)

    # First run: default result object, needed only for the ASCII rendering.
    print(nj(distance_matrix).ascii_art())
    print('\n')

    # Second run: let nj hand back the string form directly.
    return nj(distance_matrix, result_constructor=str)
def construct_tree(X_2d, acc, title):
    """Write a neighbor-joining tree of the rows of ``X_2d`` to ``<title>.nwk``.

    Pairwise distances between the rows are computed with
    ``pairwise_distances``, NaN entries are replaced by 0, and the lower
    triangle is overwritten with the upper triangle so the matrix is exactly
    symmetric before it is handed to ``DistanceMatrix``.

    :param X_2d: data passed unchanged to ``pairwise_distances``.
    :param acc: tip identifiers, one per row of ``X_2d``.
    :param title: output path without extension; ``".nwk"`` is appended.
    :return: None. The Newick string is written to disk.
    """
    data = pairwise_distances(X_2d)
    data[np.isnan(data)] = 0

    # Mirror the upper triangle onto the lower one; the diagonal needs no copy.
    size = data.shape[0]
    for i in range(size):
        for j in range(i + 1, size):
            data[j, i] = data[i, j]

    dm = DistanceMatrix(data, acc)
    # Neighbor joining is run once only: the previous extra call produced a
    # tree object that was never used.
    newick_str = nj(dm, result_constructor=str)
    with open(title + ".nwk", "w") as f:
        f.write(newick_str)
def collect(self, column: List[List[Token]]) -> Dict:
    """Group the rows of ``column`` by nearest-neighbor order.

    A neighbor-joining tree is built from the pairwise distances between the
    rows, using the row indices as tip identifiers. The tree is then passed
    through ``serialize`` / ``deserialize`` to obtain a nested order of row
    indices, e.g. ``[[2, (3, 1)], 0]`` for the tree::

            +--- 0
        ----|
            |     +--- 2
            +-----|
                  |     +--- 3
                  +-----|
                        +--- 1

    Parameters
    ----------
    column: list of list[Token]
        the column to align

    Returns
    -------
    a dict with the single key ``0`` whose value is the first element of the
    order returned by ``deserialize``
    """
    distances = self._compute_pairwise_distance(column)

    # Tips are labelled with row indices rather than the actual values.
    row_ids = list(range(len(column)))
    nj_tree = nj(DistanceMatrix(distances, row_ids))

    _, order = deserialize(serialize(nj_tree))
    return {0: order[0]}
def find_tree(npop: int,
              numerical_label: 'np.ndarray[int]',
              arr: 'np.ndarray[float]',
              ) -> TreeNode:
    """Infer a tree topology from the centers of mass of labelled clusters.

    Each row of ``arr`` is assigned to the cluster given by the matching
    entry of ``numerical_label``. Only the first ``npop + OFFSET`` columns of
    ``arr`` are used. The Euclidean distances between cluster centers feed a
    neighbor-joining tree, which is rooted at its midpoint, printed (ASCII
    art and then its string form), and returned.

    For ``npop == 2`` no data is inspected: a fixed two-tip tree
    ``(0:0.1, 1:0.1);`` is returned.

    :param npop: number of clusters; labels are used as row indices into the
        center table, so they are expected to lie in ``0 .. npop - 1``.
    :param numerical_label: cluster label for each row of ``arr``.
    :param arr: coordinates, one row per sample.
    :return: the midpoint-rooted neighbor-joining tree.
    """
    if npop == 2:
        return read(StringIO('(0:0.1, 1:0.1);'), format='newick', into=TreeNode)

    arr = arr[:, :npop + OFFSET]

    # Center of mass per label; rows of labels that never occur stay zero.
    centers = np.zeros((npop, npop + OFFSET))
    for label in set(numerical_label):
        members = np.where(numerical_label == label)[0]
        centers[label, :] = np.mean(arr[members, :], axis=0)

    ds = np.zeros((npop, npop))
    for row, center_a in enumerate(centers):
        for col, center_b in enumerate(centers):
            ds[row, col] = np.sqrt(np.sum((center_a - center_b) ** 2))

    ids = [str(k) for k in range(npop)]
    new_tree = nj(DistanceMatrix(ds, ids)).root_at_midpoint()
    print(new_tree.ascii_art())
    print(new_tree)
    return new_tree
def drawTree(MS_distDict, Methyl_distDict, filtered_samples, ratio, outgroup):
    """Build a neighbor-joining tree from merged MS and methylation distances.

    :param MS_distDict: nested mapping ``sample -> sample -> distance``.
    :param Methyl_distDict: nested mapping ``sample -> sample -> distance``;
        its values are divided by 100 when merged.
    :param filtered_samples: samples to include; they are processed in
        sorted order.
    :param ratio: weight of the MS distance; the methylation distance is
        weighted by ``1 - ratio``.
    :param outgroup: ``"NA"`` to leave the tree as produced, ``"Midpoint"``
        to root at the midpoint outgroup, otherwise a value passed to
        ``set_outgroup``.
    :return: the ete ``Tree`` built from the Newick output of ``nj``.
    """
    # Sort once instead of once per loop iteration.
    ordered_samples = sorted(filtered_samples)

    # Merge MS and Methyl distance matrices.
    merged_distMatrix = []
    for sample1 in ordered_samples:
        sample1_dist = []
        for sample2 in ordered_samples:
            # We want to scale methyl PD dist properly because PD is
            # calculated from a 0-100 scale while MS dist is 0-1 scale.
            merged_dist = (MS_distDict[sample1][sample2] * ratio) + (
                Methyl_distDict[sample1][sample2] * (1 - ratio)) / 100
            sample1_dist.append(merged_dist)
        merged_distMatrix.append(sample1_dist)

    # Run neighbor-joining on the merged pairwise distances.
    distObj = DistanceMatrix(merged_distMatrix, ordered_samples)
    print(distObj.data)
    skbio_tree = nj(distObj, result_constructor=str)
    # We use skbio to first make a tree from the distance matrix, then
    # convert it to an ete tree.
    ete_tree = Tree(skbio_tree)

    # Compare by value: ``is "NA"`` only worked when the two strings happened
    # to be the same interned object.
    if outgroup == "NA":
        return ete_tree
    if outgroup == "Midpoint":
        tree_midpoint = ete_tree.get_midpoint_outgroup()
        ete_tree.set_outgroup(tree_midpoint)
    else:
        ete_tree.set_outgroup(outgroup)
    return ete_tree
def __init__(self, dist_matrix):
    """Build a neighbor-joining tree and flatten it into plotting lists.

    The square matrix is read element by element through
    ``dist_matrix.get_distance`` and joined with ``nj``. The tree is then
    walked twice in preorder: once to (re)name nodes and collect per-node
    display attributes, once to collect the parent/child edges.

    :param dist_matrix: object exposing ``nr_elements``, ``labels`` and
        ``get_distance(i, j)``.
    """
    self.dist_matrix = dist_matrix
    nr_elements = self.dist_matrix.nr_elements
    self.matrix = [
        [self.dist_matrix.get_distance(i, j) for j in range(nr_elements)]
        for i in range(nr_elements)
    ]
    self.ids = [str(label) for label in self.dist_matrix.labels]
    self.nj_dm = DistanceMatrix(self.matrix, self.ids)
    tree = nj(self.nj_dm)

    # From here on ``ids`` holds the node names in preorder, not the labels.
    self.ids = []
    self.sources = []
    self.targets = []
    self.weights = []
    self.colors = []
    self.node_size = []
    self.virtual_nodes = 0
    self.shown_labels = {}
    self.font_colors = []

    # true #00A693 -- false #CC3333
    # The list of names to draw as "false" is currently empty, so every
    # named node gets the "true" colour.
    false_names = []

    for node in tree.preorder():
        if node.name is None:
            # Unnamed (internal) nodes get a generated name v1, v2, ...
            self.virtual_nodes += 1
            node.name = 'v' + str(self.virtual_nodes)
            self.ids.append(node.name)
            self.colors.append("black")
            self.node_size.append(20)
            self.shown_labels[str(node.name)] = ""
            self.font_colors.append('k')
            continue

        # Keep the last space-separated word of the name; when there are at
        # least two spaces, prefix it with the second-to-last word.
        head_tail = node.name.rsplit(' ', 1)
        if len(head_tail) > 1:
            node.name = head_tail[1]
            head_words = head_tail[0].rsplit(' ', 1)
            if len(head_words) > 1:
                node.name = head_words[1] + head_tail[1]

        colour = "#CC3333" if node.name in false_names else "#00A693"
        self.ids.append(node.name)
        self.colors.append(colour)
        self.node_size.append(800)
        self.shown_labels[str(node.name)] = node.name

    # Second pass, after all nodes carry their final names: edge lists.
    for parent in tree.preorder():
        for child in parent.children:
            self.sources.append(str(parent.name))
            self.targets.append(str(child.name))
            self.weights.append(str(child.length))
def fromSequences(cls, labels, sequences, findParams=None, **kwargs):
    """
    Construct an NJTree instance from some sequences.

    The distance between two sequences is one minus their affinity, as
    computed by C{affinityMatrix}.

    @param cls: Our class.
    @param labels: An iterable producing C{str} labels for the sequences.
    @param sequences: Either a C{str} filename of sequences to consider
        or a C{light.reads.Reads} instance.
    @param findParams: An instance of C{FindParameters}. A default
        instance is created when this is not given.
    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        additional keywords, all of which are optional.
    @return: An C{NJTree} instance.
    """
    if isinstance(sequences, str):
        # A filename was given: read it as FASTA.
        sequences = FastaReads(sequences, readClass=AAReadWithX,
                               upperCase=True)

    instance = cls()
    instance.sequences = list(sequences)
    instance.labels = labels

    params = findParams or FindParameters()
    affinity = np.array(
        affinityMatrix(instance.sequences, findParams=params, **kwargs))
    instance.distance = np.ones(affinity.shape) - affinity
    instance.tree = nj(DistanceMatrix(instance.distance, labels))
    return instance
def get_tree_and_order(self, words: List[List[Token]]) -> Tuple[TreeNode, List]:
    """Create a nearest-neighbor tree and the alignment order derived from it.

    The order is a nested structure such as ``[s1, (s2, s3), s4]`` describing
    how the inputs should be aligned.

    Parameters
    ----------
    words: list
        list of inputs

    Returns
    -------
    tuple of the tree returned by ``deserialize`` and the first element of
    the order it returns
    """
    distances = self._compute_pairwise_distance(words)

    # Tips are labelled with the (stringified) row indices, not the values.
    row_ids = [str(row) for row in range(len(words))]
    nj_tree = nj(DistanceMatrix(distances, row_ids))

    rebuilt_tree, order = deserialize(serialize(nj_tree), words)
    return rebuilt_tree, order[0]
def NJ(names, matrix):
    """Return the neighbor-joining tree of ``matrix`` as a Newick string.

    names: tip identifiers, one per row/column of ``matrix``
    matrix: square distance matrix (the original note says a numpy matrix)
    return: the result of ``nj`` with ``result_constructor=str``
    """
    return nj(DistanceMatrix(matrix, names), result_constructor=str)
def single_file_nj(input_file, output_file):
    """Read a distance matrix file and write its neighbor-joining tree.

    :param input_file: path (or file handle) understood by
        ``DistanceMatrix.read``.
    :param output_file: path of the Newick file to write.
    :return: None.
    """
    dm = DistanceMatrix.read(input_file)
    tree = nj(dm)

    # NOTE(review): ``to_newick`` looks like a legacy scikit-bio API --
    # confirm the installed version still provides it.
    newick = tree.to_newick(with_distances=True)

    # Serialise first, then open: the handle is always closed and a failure
    # while building the string no longer leaves an empty output file behind.
    with open(output_file, 'w') as f:
        f.write(newick)
def tree_from_distmatrix(D):
    """Return the neighbor-joining tree of a labelled distance table.

    :param D: object with ``.index`` (tip labels) and ``.values`` (square
        distance array), e.g. a pandas DataFrame.
    :return: the tree produced by ``skbio.tree.nj``.
    """
    from skbio import DistanceMatrix
    from skbio.tree import nj

    labels = list(D.index)
    return nj(DistanceMatrix(D.values, labels))
def get_tree(core=False, newick=False):
    """Build and print a neighbor-joining tree of the species that have a 16S hit.

    Species names are collected from the ``core`` collection and, unless
    ``core`` is true, also from the ``other`` collection. Only species for
    which ``kv.db['16S'].find_one`` returns a record are kept. The tree is
    printed as ASCII art, followed by one ``name length`` line per tip.

    :param core: restrict the species list to the ``core`` collection.
    :param newick: when true, print the Newick string and return ``None``.
    :return: ``(tree, tips)`` when ``newick`` is false, where ``tips`` holds
        the tip names with spaces replaced by underscores; otherwise ``None``.
    """
    core_collection = kv.get_collection('core')
    all_species = core_collection.distinct('species')
    if not core:
        other_collection = kv.get_collection('other')
        all_species.extend(other_collection.distinct('species'))

    ssu_species = [n for n in all_species
                   if kv.db['16S'].find_one({'species': n})]
    dm = DistanceMatrix(get_distance_matrix(core=core, to_file=False),
                        ssu_species)
    t = tree.nj(dm)

    # Single-argument print(...) calls behave identically as a Python 2
    # statement and as a Python 3 function call.
    print(t.ascii_art())
    tips = []
    for node in t.tips():
        print("%s %s" % (node.name, node.length))
        tips.append(node.name.replace(' ', '_'))

    if newick:
        n = tree.nj(dm, result_constructor=str)
        print(n)
        return None
    return (t, tips)
def reconstruct_neighborjoining(df_mutation_table, path_out_newick):
    """Build a neighbor-joining tree from a mutation table and save it.

    The ``root`` column is dropped and the table transposed, so the remaining
    columns become the tips. Distances are ``scipy.spatial.distance.pdist``
    with its default metric.

    :param df_mutation_table: DataFrame containing a ``root`` column.
    :param path_out_newick: destination of the Newick file.
    :return: the tree object that was written.
    """
    from skbio import DistanceMatrix
    from skbio.tree import nj

    tdf = df_mutation_table.drop(columns='root').transpose()
    condensed = scipy.spatial.distance.pdist(tdf)
    square = scipy.spatial.distance.squareform(condensed)

    tree = nj(DistanceMatrix(square, tdf.index))
    tree.write(path_out_newick, format='newick')
    return tree
def run_nj_get_dist_matrix(self, dist_matrix):
    """Return the tip-to-tip distances of the neighbor-joining tree.

    :param dist_matrix: square distance data accepted by ``DistanceMatrix``.
        No ids are supplied, and the resulting tip labels are cast to
        integers below, so they must be integer-like strings.
    :return: 2-D array of tree path distances, rows and columns sorted by
        integer tip label.
    """
    dm = DistanceMatrix(dist_matrix)

    # Run neighbor joining and read the distances back from the tree.
    nj_tree = nj(dm)
    df = nj_tree.tip_tip_distances().to_data_frame()

    # Sort rows and columns numerically rather than as strings.
    df.index = df.index.astype(int)
    df.sort_index(inplace=True)
    df.columns = df.columns.values.astype(np.int32)
    df = df[sorted(df.columns)]

    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` gives
    # the same array on old and new versions.
    return df.values
def fromDistanceMatrix(cls, labels, distance):
    """
    Construct an NJTree instance, given a distance matrix.

    @param cls: Our class.
    @param labels: An iterable producing C{str} labels corresponding to the
        rows (equivalently, columns) of the distance matrix.
    @param distance: A square matrix of numeric distances.
    @return: An C{NJTree} instance with C{labels}, C{distance} and C{tree}
        set.
    """
    instance = cls()
    instance.labels = labels
    instance.distance = distance
    instance.tree = nj(DistanceMatrix(distance, labels))
    return instance
def drawTree(distDict, alleleDict, sample_list, outgroup, prefix, bootstrap):
    """Run neighbor joining on pairwise cell distances and return an ete tree.

    :param distDict: mapping with a ``"sampleComp"`` entry keyed by sorted
        sample pairs; each value provides ``"dist"`` and ``"num_targets"``.
    :param alleleDict: not used by this function.
    :param sample_list: samples to include; processed in sorted order.
    :param outgroup: ``"NA"`` to leave the tree as produced, ``"Midpoint"``
        to root at the midpoint outgroup, otherwise a value passed to
        ``set_outgroup``.
    :param prefix: path prefix of the statistics and pickle output files.
    :param bootstrap: when exactly ``False`` the statistics file and the
        pickled ``distDict`` are written.
    :return: the ete ``Tree``, or ``None`` when midpoint rooting was
        requested but no midpoint outgroup was found.
    """
    # Sort once; the original re-sorted the list on every use.
    ordered_samples = sorted(sample_list)

    distMatrix = []
    targetMatrix = []
    pairwise_numTargets = []
    sample_numTargets = []
    for sample1 in ordered_samples:
        sample1_dist = []
        sample1_targets = []
        for sample2 in ordered_samples:
            sample_pair = tuple(sorted([sample1, sample2]))
            pair_info = distDict["sampleComp"][sample_pair]
            sample1_dist.append(pair_info["dist"])
            sample1_targets.append(pair_info["num_targets"])
            if sample1 != sample2:
                pairwise_numTargets.append(pair_info["num_targets"])
            else:
                sample_numTargets.append(pair_info["num_targets"])
        distMatrix.append(sample1_dist)
        targetMatrix.append(sample1_targets)

    # Only output statistics for distance and number of targets shared for
    # the original tree (don't output for bootstrap resampling).
    if bootstrap is False:
        with open(prefix + ".buildPhylo.stats.txt", 'w') as statsOutput:
            statsOutput.write("Number of Samples Analyzed:\t" + str(len(sample_list)) + "\n" + ','.join(sample_list) + "\n")
            statsOutput.write("Avg targets shared per pair of cells:\t" + str(float(sum(pairwise_numTargets) / len(pairwise_numTargets))) + "\t[" + str(min(pairwise_numTargets)) + "," + str(max(pairwise_numTargets)) + "]\n")
            statsOutput.write("Avg targets captured per single cell:\t" + str(float(sum(sample_numTargets) / len(sample_numTargets))) + "\t[" + str(min(sample_numTargets)) + "," + str(max(sample_numTargets)) + "]\n")
            # Matrix containing distances.
            for sample, dist_list in zip(ordered_samples, distMatrix):
                statsOutput.write(sample + "," + ",".join(str(round(i, 3)) for i in dist_list) + "\n")
            # Matrix containing number of targets shared between each pair.
            for sample, target_list in zip(ordered_samples, targetMatrix):
                statsOutput.write(sample + "," + ",".join(str(j) for j in target_list) + "\n")
        # We want to save the distance information for each single cell pair
        # that was used to buildPhylo (useful for downstream statistics).
        # NOTE(review): the dump is assumed to belong to the non-bootstrap
        # branch, next to the statistics file -- confirm against the caller.
        with open(prefix + ".buildPhylo.distDict.pkl", "wb") as pklOutput:
            pickle.dump(distDict, pklOutput)

    distObj = DistanceMatrix(distMatrix, ordered_samples)
    skbio_tree = nj(distObj, result_constructor=str)
    # We use skbio to first make a tree from the distance matrix, then
    # convert it to an ete tree.
    ete_tree = Tree(skbio_tree)

    # Compare by value: ``is "NA"`` depended on string interning.
    if outgroup == "NA":
        return ete_tree
    if outgroup == "Midpoint":
        tree_midpoint = ete_tree.get_midpoint_outgroup()
        if tree_midpoint is None:
            # We want to throw out the tree if no midpoint was found.
            print(ete_tree.write(format = 0))
            return None
        ete_tree.set_outgroup(tree_midpoint)
    else:
        ete_tree.set_outgroup(outgroup)
    return ete_tree
def __load_distance_matrix(self, data):
    """Build the neighbor-joining tree of ``data`` and derive state from it.

    Stores the tree's tip-to-tip distances, sorted by integer tip label, in
    ``self.dist_matrix``. The tree is then bifurcated and handed to the
    post-order and genotype builders.

    :param data: square distance data accepted by ``DistanceMatrix``. No ids
        are supplied, and the tip labels are cast to integers below.
    """
    dm = DistanceMatrix(data)
    nj_tree = nj(dm)

    df = nj_tree.tip_tip_distances().to_data_frame()
    # Sort rows and columns numerically rather than as strings.
    df.index = df.index.astype(int)
    df.sort_index(inplace=True)
    df.columns = df.columns.values.astype(np.int32)
    df = df[sorted(df.columns)]
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` gives
    # the same array on old and new versions.
    self.dist_matrix = df.values

    nj_tree.bifurcate()
    self.__post_order(nj_tree)
    self.__build_genotype(nj_tree)
def main_vec():
    """Compute pairwise SNP distances (vectorised) and write an NJ tree.

    Reads the alignment named by the parsed arguments, writes one
    ``id1<TAB>id2<TAB>raw<TAB>normalised`` line per genome pair to
    ``snp_dist.tsv`` and the Newick tree to ``snp_dist.tree``, both inside
    ``args['out_dir']``.

    For each pair, a position counts when neither genome has ``'-'`` there;
    the raw count is the number of such positions that differ and the
    normalised count divides it by the number of such positions.
    """
    args = parse_args()
    genomes = parse_msa(args['msa'], args['max_samples'])

    out_dir = args['out_dir']
    try:
        os.makedirs(out_dir)
    except OSError:
        # An existing directory is fine; anything else (permissions, a file
        # in the way) is reported here instead of failing later on open().
        if not os.path.isdir(out_dir):
            raise

    print("Count SNPs")
    dist_path = os.path.join(out_dir, 'snp_dist.tsv')
    print(" path: %s" % dist_path)

    genome_ids = list(genomes.keys())
    # occurs[k] marks the positions where genome k is not a gap.
    occurs = np.array([genomes[gid][0] != '-' for gid in genome_ids])

    matrix = []
    with open(dist_path, 'w') as dist_file:
        for i, gid in enumerate(genome_ids):
            cooccs = occurs[i] & occurs
            diffs = np.array(
                [genomes[gid][0] != genomes[sid][0] for sid in genome_ids])
            raw_counts = np.sum(diffs & cooccs, axis=1)
            # NOTE(review): pairs with no shared position divide by zero
            # here and yield nan/inf -- confirm whether that can occur.
            norm_counts = raw_counts / np.sum(cooccs, axis=1)
            for j, sid in enumerate(genome_ids):
                dist_file.write('\t'.join(
                    [gid, sid, str(raw_counts[j]), str(norm_counts[j])])+'\n')
            matrix.append(norm_counts)

    print("Build SNP tree")
    tree_path = os.path.join(out_dir, 'snp_dist.tree')
    print(" path: %s" % tree_path)
    dm = DistanceMatrix(matrix, genome_ids)
    tree = nj(dm, result_constructor=str)
    with open(tree_path, 'w') as tree_file:
        tree_file.write(tree)
    print("\nDone!")
def njWithRoot(dis_matrix, muestraPmid):
    """Build a midpoint-rooted neighbor-joining tree as an ete tree.

    No distances are computed here; the given distances are only paired with
    their ids in the format ``nj`` expects.

    :param dis_matrix: square distance array exposing ``tolist()``.
    :param muestraPmid: ids, one per row; converted to strings.
    :return: the rooted tree, re-parsed from its ``format=3`` Newick output
        with ``format=1``.
    """
    ids = list(map(str, muestraPmid))
    dm = DistanceMatrix(dis_matrix.tolist(), ids)
    newick = nj(dm, result_constructor=str)

    # Root the tree at its midpoint outgroup.
    rooted = TreeEte(newick)
    rooted.set_outgroup(rooted.get_midpoint_outgroup())

    # Round-trip through Newick text to obtain the returned tree.
    return TreeEte(rooted.write(format=3), format=1)
def export_tree_for_all(all_patterns, matrixoutput, treeoutput,
                        build_tree=False, max_tree_patterns=10000):
    """Export the pairwise pattern distance matrix and, optionally, an NJ tree.

    Each pattern is a collection of ``(pos, na, ref)`` tuples. A position
    whose ``na`` is ``'.'`` is excluded from comparison: every possible call
    at that position is removed from both patterns of a pair before the
    distance is taken as the size of their symmetric difference.

    With fewer than three patterns no matrix is written and ``treeoutput``
    receives the empty tree ``();``.

    :param all_patterns: iterable of ``(samplename, pattern, count, pcnt)``.
    :param matrixoutput: path of the CSV distance matrix to write.
    :param treeoutput: path of the Newick tree to write.
    :param build_tree: build and write the midpoint-rooted neighbor-joining
        tree. Defaults to ``False``, which matches the previous behaviour
        where tree building was unconditionally skipped.
    :param max_tree_patterns: with ``build_tree`` enabled, skip the tree when
        there are more patterns than this.
    :return: None.
    """
    calls = ['A', 'C', 'G', 'T', 'ins', 'del', '.']
    patterns = []
    for idx, (samplename, pattern, count, pcnt) in enumerate(all_patterns):
        removes = set()
        for pos, na, ref in pattern:
            if na == '.':
                removes |= {(pos, ntmp, ref) for ntmp in calls}
        patterns.append([
            idx,
            set(pattern) - removes,
            removes,
            '{}_{}_{:.1f}%'.format(samplename, idx + 1, pcnt * 100),
            count])

    num_patterns = len(patterns)
    if num_patterns < 3:
        with open(treeoutput, 'w') as fp:
            fp.write('();')
        return

    dist_matrix = np.zeros((num_patterns, num_patterns), dtype=float)
    patternstrs = [ptnstr for _, _, _, ptnstr, _ in patterns]
    for (idx1, ptn1, rm1, _, _), (idx2, ptn2, rm2, _, _) in \
            combinations(patterns, 2):
        distance = len((ptn1 - rm2) ^ (ptn2 - rm1))  # xor
        dist_matrix[idx1, idx2] = distance
        dist_matrix[idx2, idx1] = distance

    # newline='' is what the csv module requires of files it writes to.
    with open(matrixoutput, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['##', *patternstrs])
        writer.writerows(dist_matrix)

    if not build_tree or num_patterns > max_tree_patterns:
        # Tree building is disabled, or there are too many patterns to join.
        return

    tree = nj(DistanceMatrix(dist_matrix, patternstrs))
    with open(treeoutput, 'w') as fp:
        fp.write(str(tree.root_at_midpoint()))
def main():
    """Compute pairwise SNP distances and write a neighbor-joining tree.

    Reads the alignment named by the parsed arguments, writes one
    ``id1<TAB>id2<TAB>raw<TAB>normalised`` line per genome pair to
    ``snp_dist.tsv`` and the Newick tree to ``snp_dist.tree``, both inside
    ``args['out_dir']``.

    For each pair, a position counts when neither genome has ``'-'`` there;
    the raw count is the number of such positions that differ. The
    normalised count is the raw count divided by the number of such
    positions, or 0 when either number is 0.
    """
    args = parse_args()
    genomes = parse_msa(args['msa'], args['max_samples'])

    out_dir = args['out_dir']
    try:
        os.makedirs(out_dir)
    except OSError:
        # An existing directory is fine; anything else (permissions, a file
        # in the way) is reported here instead of failing later on open().
        if not os.path.isdir(out_dir):
            raise

    print("Count SNPs")
    dist_path = os.path.join(out_dir, 'snp_dist.tsv')
    print(" path: %s" % dist_path)

    # Presence masks depend on one genome only, so compute each once.
    present = {gid: genomes[gid] != '-' for gid in genomes}

    matrix = []
    with open(dist_path, 'w') as dist_file:
        for id1 in genomes:
            row = []
            for id2 in genomes:
                is_diff = genomes[id1] != genomes[id2]
                co_occur = present[id1] & present[id2]
                raw_count = (is_diff & co_occur).sum()
                co_sum = co_occur.sum()
                norm_count = 0
                if raw_count != 0 and co_sum != 0:
                    norm_count = float(raw_count) / co_sum
                row.append(norm_count)
                dist_file.write('\t'.join(
                    [id1, id2, str(raw_count), str(norm_count)])+'\n')
            matrix.append(row)

    print("Build SNP tree")
    tree_path = os.path.join(out_dir, 'snp_dist.tree')
    print(" path: %s" % tree_path)
    dm = DistanceMatrix(matrix, list(genomes.keys()))
    tree = nj(dm, result_constructor=str)
    with open(tree_path, 'w') as tree_file:
        tree_file.write(tree)
    print("\nDone!")
def get_guide_tree(seqs, random=False):
    """
    Get a guide tree representing distances between sequences

    :param seqs: Sequences to create a tree for; each must expose ``name``
    :param random: Use ``calc_random_distances`` instead of
        ``calc_distances`` to obtain the distance matrix
    :return: Midpoint-rooted neighbour-joining tree whose internal nodes
        have been passed through ``label_internal_nodes``
    """
    distance_fn = calc_random_distances if random else calc_distances
    distances = distance_fn(seqs)
    ids = [seq.name for seq in seqs]

    # Neighbour-joining tree, mid-point rooted, then label internal nodes.
    tree = nj(DistanceMatrix(distances, ids)).root_at_midpoint()
    label_internal_nodes(tree)
    return tree
def nj_tree(feature_matrix):
    """Build a neighbor-joining tree from Hamming distances between rows.

    The elapsed time of each of the three stages (pairwise distances,
    distance-matrix construction, tree building) is printed.

    :param feature_matrix: object whose ``.values`` is the 2-D feature array
        (e.g. a pandas DataFrame), one row per tip.
    :return: the tree produced by ``skbio.tree.nj``.
    """
    from skbio import DistanceMatrix
    from skbio.tree import nj
    # Import the function itself: a bare ``import sklearn`` does not
    # guarantee that the ``sklearn.metrics`` submodule has been loaded.
    from sklearn.metrics import pairwise_distances
    import time

    t = time.time()
    data = pairwise_distances(feature_matrix.values, metric='hamming')
    print(time.time() - t)

    t = time.time()
    dm = DistanceMatrix(data)
    print('distance matrix', time.time() - t)

    t = time.time()
    tree = nj(dm)
    print('tree build', time.time() - t)
    return tree
def build_tree(dist_matrix, names_list, clust):
    """Cluster a distance matrix into a ``Tree`` with the requested method.

    :param dist_matrix: square distance matrix.
    :param names_list: tip names, one per row of ``dist_matrix``.
    :param clust: ``'nj'`` (scikit-bio neighbor joining) or ``'upgma'``
        (Biopython UPGMA on the condensed matrix).
    :return: the ``Tree`` parsed from the Newick output of the chosen method.
        Any other ``clust`` prints a message and exits the interpreter.
    """
    if clust == 'nj':
        newick = nj(DistanceMatrix(dist_matrix, names_list),
                    result_constructor=str)
        return Tree(newick)

    if clust == 'upgma':
        biopython_dm = _DistanceMatrix(names=names_list,
                                       matrix=condense_matrix(dist_matrix))
        upgma_tree = DistanceTreeConstructor().upgma(biopython_dm)
        # remove InnerNode names
        for clade in upgma_tree.get_nonterminals():
            clade.name = None
        buffer = StringIO()
        Phylo.write(upgma_tree, buffer, "newick")
        return Tree(buffer.getvalue())

    print("Unknown tree clustering method ! Aborting")
    sys.exit()
def build_tree(dist_matrix, names_list, clust):
    """Return a ``Tree`` built from ``dist_matrix`` by ``clust``.

    :param dist_matrix: square distance matrix.
    :param names_list: tip names, one per row of ``dist_matrix``.
    :param clust: clustering method, ``'nj'`` or ``'upgma'``; anything else
        prints a message and exits the interpreter.
    :return: ``Tree`` parsed from the Newick text of the clustering result.
    """
    # Tuple membership compares with ==, exactly like the method checks below.
    if clust not in ('nj', 'upgma'):
        print("Unknown tree clustering method ! Aborting")
        sys.exit()

    if clust == 'nj':
        skbio_dm = DistanceMatrix(dist_matrix, names_list)
        newick_text = nj(skbio_dm, result_constructor=str)
    else:
        condensed = condense_matrix(dist_matrix)
        bio_dm = _DistanceMatrix(names=names_list, matrix=condensed)
        bio_tree = DistanceTreeConstructor().upgma(bio_dm)
        # remove InnerNode names
        for inner in bio_tree.get_nonterminals():
            inner.name = None
        handle = StringIO()
        Phylo.write(bio_tree, handle, "newick")
        newick_text = handle.getvalue()

    return Tree(newick_text)
def run_nj_weighted(cm_uniq, prior_probs=None, verbose=True):
    """Reconstruct a tree by neighbor joining on weighted distances.

    :param cm_uniq: character matrix (DataFrame), one row per unique cell;
        its index supplies the tip ids and its values must be strings
        (they are joined with ``"|"``).
    :param prior_probs: priors forwarded to ``compute_distance_mat``.
    :param verbose: print a progress line.
    :return: a ``Cassiopeia_Tree`` wrapping the filled-in network.
    """
    if verbose:
        print("Running Neighbor-Joining with Weighted Scoring on " +
              str(cm_uniq.shape[0]) + " Unique Cells")

    # Character strings of the observed cells, used to flag target nodes.
    cm_lookup = list(cm_uniq.apply(lambda x: "|".join(x.values), axis=1))

    # ``np.str`` was only an alias of the builtin and was removed in
    # NumPy 1.24; the builtin works on every version.
    dm = compute_distance_mat(cm_uniq.values.astype(str),
                              cm_uniq.shape[0],
                              priors=prior_probs)
    ids = cm_uniq.index
    dm = sp.spatial.distance.squareform(dm)
    dm = DistanceMatrix(dm, ids)
    newick_str = nj(dm, result_constructor=str)

    tree = newick_to_network(newick_str, cm_uniq)
    nj_net = fill_in_tree(tree, cm_uniq)

    for n in nj_net:
        # A target is a leaf whose character string was actually observed.
        if nj_net.out_degree(n) == 0 and n.char_string in cm_lookup:
            n.is_target = True
            n.name = 'state-node'
        else:
            n.is_target = False

    state_tree = nj_net
    ret_tree = Cassiopeia_Tree(method='neighbor-joining',
                               network=state_tree,
                               name='Cassiopeia_state_tree')
    return ret_tree
def create_tree(df, column):
    """Build a midpoint-rooted NJ tree over the non-NaN values of a column.

    Samples whose value in ``column`` is NaN are skipped. The number of
    remaining samples is printed. Distances are Euclidean distances between
    the remaining values.

    :param df: DataFrame indexed by sample.
    :param column: name of the column to read.
    :return: the rooted tree, or ``None`` when three or fewer samples remain.
    """
    valid_samples = []
    values = []
    for sample in df.index.tolist():
        value = df[column][sample]
        if np.isnan(value):
            continue
        values.append(value)
        valid_samples.append(sample)

    size = len(valid_samples)
    print(size)
    if size <= 3:
        return None

    dist_matrix = np.zeros((size, size))
    for i in range(size):
        for j in range(i, size):
            pair_distance = distance.euclidean(values[i], values[j])
            dist_matrix[i][j] = pair_distance
            dist_matrix[j][i] = dist_matrix[i][j]

    return nj(DistanceMatrix(dist_matrix, valid_samples)).root_at_midpoint()
from __future__ import unicode_literals from __future__ import print_function from __future__ import division from __future__ import absolute_import from future import standard_library standard_library.install_aliases() import pandas as pd from skbio import tree, DistanceMatrix import numpy as np import sys m = pd.read_csv(sys.argv[1]) m[m.isnull()] = 0 arr = m.as_matrix() M = arr + arr.T dm = DistanceMatrix(M) tree = tree.nj(dm)
graphList.append(g) print('Done') GL = pd.DataFrame(list(zip(label, graphList)), columns=['organism', 'graph']) #compute GK similarity matrix kernel = gk.WeisfeilerLehman(base_kernel=gk.VertexHistogram, normalize=True) GK = pd.DataFrame(kernel.fit_transform(GL['graph'].values)) GK.columns = GK.index = label # Use 1-K as measure of Distance DM_GK = DistanceMatrix(1 - GK.values) #make GK tree sktree = nj(DM_GK, result_constructor=str) GK_tree = Tree(sktree) GK_tree.name = 'AGORA network similarity tree' # style ts = TreeStyle() ts.show_leaf_name = True ts.mode = "c" ts.arc_start = -180 ts.arc_span = 360 #plot tree #GK_tree.render(file_name='/home/acabbia/Documents/Muscle_Model/GSMM-distance/figures/GK_tree_AGORA.png', tree_style=ts) #GK_tree.show(tree_style=ts) #%% ####
def main():
    """
    Takes in a character matrix, an algorithm, and an output file and
    returns a tree in newick format.

    Exactly one algorithm flag is honoured, checked in this order: greedy,
    hybrid, ilp, neighbor joining, weighted neighbor joining, Camin-Sokal.
    The reconstructed tree is pickled to ``--out_fp``, or to a name derived
    from the input file name when that option is not given.

    Raises ``Exception`` when no algorithm flag is set.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("netfp", type=str, help="character_matrix")
    parser.add_argument("-nj",
                        "--neighbor-joining",
                        action="store_true",
                        default=False)
    parser.add_argument("--neighbor_joining_weighted",
                        action="store_true",
                        default=False)
    parser.add_argument("--ilp", action="store_true", default=False)
    parser.add_argument("--hybrid", action="store_true", default=False)
    parser.add_argument("--cutoff",
                        type=int,
                        default=80,
                        help="Cutoff for ILP during Hybrid algorithm")
    parser.add_argument(
        "--hybrid_lca_mode",
        action="store_true",
        help=
        "Use LCA distances to transition in hybrid mode, instead of number of cells",
    )
    parser.add_argument("--time_limit",
                        type=int,
                        default=-1,
                        help="Time limit for ILP convergence")
    parser.add_argument(
        "--iter_limit",
        type=int,
        default=-1,
        help="Max number of iterations for ILP solver",
    )
    parser.add_argument("--greedy", "-g", action="store_true", default=False)
    parser.add_argument("--camin-sokal",
                        "-cs",
                        action="store_true",
                        default=False)
    parser.add_argument("--verbose",
                        action="store_true",
                        default=False,
                        help="output verbosity")
    parser.add_argument("--mutation_map", type=str, default="")
    parser.add_argument("--num_threads", type=int, default=1)
    parser.add_argument("--no_triplets", action="store_true", default=False)
    # NOTE(review): the type is str but the default is an int -- confirm
    # which one the solver expects.
    parser.add_argument("--max_neighborhood_size", type=str, default=3000)
    parser.add_argument("--out_fp",
                        type=str,
                        default=None,
                        help="optional output file")
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Random seed for ILP solver")

    args = parser.parse_args()

    netfp = args.netfp
    outfp = args.out_fp
    verbose = args.verbose

    # ``cutoff`` was referenced by the hybrid and ILP branches without ever
    # being assigned, which raised NameError; bind it once here.
    cutoff = args.cutoff
    lca_mode = args.hybrid_lca_mode
    if lca_mode:
        lca_cutoff = cutoff
        cell_cutoff = None
    else:
        cell_cutoff = cutoff
        lca_cutoff = None

    time_limit = args.time_limit
    iter_limit = args.iter_limit
    num_threads = args.num_threads
    max_neighborhood_size = args.max_neighborhood_size
    seed = args.seed

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    # Parsed but not consumed by any branch below.
    score_triplets = not args.no_triplets

    prior_probs = None
    if args.mutation_map != "":
        # NOTE: unpickling can execute arbitrary code; only load trusted
        # mutation maps.
        with open(args.mutation_map, "rb") as map_file:
            prior_probs = pic.load(map_file)

    name = netfp.split("/")[-1]
    stem = ".".join(name.split(".")[:-1])

    true_network = nx.read_gpickle(netfp)

    if isinstance(true_network, Cassiopeia_Tree):
        true_network = true_network.get_network()

    target_nodes = get_leaves_of_tree(true_network)

    # Keep one leaf per distinct character string, in first-seen order.
    target_nodes_uniq = []
    seen_charstrings = []
    for t in target_nodes:
        if t.char_string not in seen_charstrings:
            seen_charstrings.append(t.char_string)
            target_nodes_uniq.append(t)

    if args.greedy:

        if verbose:
            print("Running Greedy Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")

        reconstructed_network_greedy = solve_lineage_instance(
            target_nodes_uniq,
            method="greedy",
            prior_probabilities=prior_probs)

        net = reconstructed_network_greedy[0]

        if outfp is None:
            outfp = name.replace("true", "greedy")
        with open(outfp, "wb") as out_file:
            pic.dump(net, out_file)

    elif args.hybrid:

        if verbose:
            print("Running Hybrid Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")
            print("Parameters: ILP on sets of " + str(cutoff) + " cells " +
                  str(time_limit) + "s to complete optimization")

        reconstructed_network_hybrid = solve_lineage_instance(
            target_nodes_uniq,
            method="hybrid",
            hybrid_cell_cutoff=cell_cutoff,
            hybrid_lca_cutoff=lca_cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            threads=num_threads,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )

        net = reconstructed_network_hybrid[0]

        if outfp is None:
            outfp = name.replace("true", "hybrid")
        with open(outfp, "wb") as out_file:
            pic.dump(net, out_file)

    elif args.ilp:

        if verbose:
            # The message used to say "Hybrid" here (copy-paste slip).
            print("Running ILP Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")
            print("Parameters: ILP on sets of " + str(cutoff) + " cells " +
                  str(time_limit) + "s to complete optimization")

        reconstructed_network_ilp = solve_lineage_instance(
            target_nodes_uniq,
            method="ilp",
            hybrid_subset_cutoff=cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )

        net = reconstructed_network_ilp[0]

        if outfp is None:
            outfp = name.replace("true", "ilp")
        with open(outfp, "wb") as out_file:
            pic.dump(net, out_file)

    elif args.neighbor_joining:

        if verbose:
            print("Running Neighbor-Joining on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        infile = "".join(name.split(".")[:-1]) + "infile.txt"
        fn = "".join(name.split(".")[:-1]) + "phylo.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)

        # Binarize the multistate character matrix with the helper script.
        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(
            script) + " " + fn + " " + infile + " --relaxed"
        p = subprocess.Popen(cmd, shell=True)
        pid, ecode = os.waitpid(p.pid, 0)

        aln = AlignIO.read(infile, "phylip-relaxed")

        aln = unique_alignments(aln)

        calculator = DistanceCalculator("identity", skip_letters="?")
        constructor = DistanceTreeConstructor(calculator, "nj")

        tree = constructor.build_tree(aln)

        tree.root_at_midpoint()

        nj_net = Phylo.to_networkx(tree)

        # Replace the Bio.Phylo clades by Node objects; unnamed clades
        # become "state-node".
        rndict = {}
        for n in nj_net:
            if n.name is None:
                rndict[n] = Node("state-node", [])
            else:
                rndict[n] = Node(n.name, [])

        nj_net = nx.relabel_nodes(nj_net, rndict)

        cm = pd.read_csv(fn, sep="\t", index_col=0)

        cm_lookup = dict(
            zip(
                list(
                    cm.apply(lambda x: "|".join([str(k) for k in x.values]),
                             axis=1)),
                cm.index.values,
            ))

        nj_net = fill_in_tree(nj_net, cm)

        nj_net = tree_collapse(nj_net)

        for n in nj_net:
            if n.char_string in cm_lookup.keys():
                n.is_target = True

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)
        if outfp is None:
            outfp = name.replace("true", "nj")
        with open(outfp, "wb") as out_file:
            pic.dump(nj_net, out_file)

        os.system("rm " + infile)
        os.system("rm " + fn)

    elif args.neighbor_joining_weighted:

        if verbose:
            print("Running Neighbor-Joining with Weighted Scoring on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        target_node_charstrings = np.array(
            [t.get_character_vec() for t in target_nodes_uniq])
        dm = compute_distance_mat(target_node_charstrings,
                                  len(target_node_charstrings),
                                  priors=prior_probs)

        ids = [t.name for t in target_nodes_uniq]
        cm_uniq = pd.DataFrame(target_node_charstrings)
        cm_uniq.index = ids
        dm = sp.spatial.distance.squareform(dm)

        dm = DistanceMatrix(dm, ids)

        newick_str = nj(dm, result_constructor=str)

        tree = newick_to_network(newick_str, cm_uniq)

        nj_net = fill_in_tree(tree, cm_uniq)
        nj_net = tree_collapse(nj_net)

        cm_lookup = dict(
            zip(
                list(
                    cm_uniq.apply(
                        lambda x: "|".join([str(k) for k in x.values]),
                        axis=1)),
                cm_uniq.index.values,
            ))

        for n in nj_net:
            if n.char_string in cm_lookup:
                n.is_target = True
            else:
                n.is_target = False

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)
        if outfp is None:
            outfp = name.replace("true", "nj_weighted")
        with open(outfp, "wb") as out_file:
            pic.dump(nj_net, out_file)

    elif args.camin_sokal:

        if verbose:
            print("Running Camin-Sokal Max Parsimony Algorithm on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        # Leaves are temporarily renamed to their position; the original
        # names are kept so they can be restored on the resulting network.
        samples_to_cells = {}
        indices = []
        for i, n in zip(range(len(target_nodes_uniq)), target_nodes_uniq):
            samples_to_cells["s" + str(i)] = n.name
            indices.append(n.name)
            n.name = str(i)

        infile = "".join(name.split(".")[:-1]) + "_cs_infile.txt"
        fn = "".join(name.split(".")[:-1]) + "_cs_phylo.txt"
        weights_fn = "".join(name.split(".")[:-1]) + "_cs_weights.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)

        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(script) + " " + fn + " " + infile
        pi = subprocess.Popen(cmd, shell=True)
        pid, ecode = os.waitpid(pi.pid, 0)

        # Return value is not used below; the call is kept for whatever it
        # writes to ``weights_fn``.
        weights = construct_weights(infile, weights_fn)

        os.system("touch outfile")
        os.system("touch outtree")

        outfile = stem + "outfile.txt"
        outtree = stem + "outtree.txt"
        # run phylip mix with camin-sokal
        responses = "." + stem + ".temp.txt"
        with open(responses, "w") as FH:
            FH.write(infile + "\n")
            FH.write("F\n" + outfile + "\n")
            FH.write("P\n")
            FH.write("W\n")
            FH.write("Y\n")
            FH.write(weights_fn + "\n")
            FH.write("F\n" + outtree + "\n")

        t0 = time.time()
        cmd = "~/software/phylip-3.697/exe/mix"
        cmd += " < " + responses + " > screenout1"
        p = subprocess.Popen(cmd, shell=True)
        pid, ecode = os.waitpid(p.pid, 0)

        consense_outtree = stem + "consenseouttree.txt"
        consense_outfile = stem + "consenseoutfile.txt"

        with open(responses, "w") as FH:
            FH.write(outtree + "\n")
            FH.write("F\n" + consense_outfile + "\n")
            FH.write("Y\n")
            FH.write("F\n" + consense_outtree + "\n")

        if verbose:
            print("Computing Consensus Tree, elasped time: " +
                  str(time.time() - t0))

        cmd = "~/software/phylip-3.697/exe/consense"
        cmd += " < " + responses + " > screenout"
        p2 = subprocess.Popen(cmd, shell=True)
        pid, ecode = os.waitpid(p2.pid, 0)

        # The consensus tree may span several lines; join them stripped.
        with open(consense_outtree, "r") as f:
            newick_str = "".join(line.strip() for line in f)

        cm = pd.read_csv(fn, sep="\t", index_col=0, dtype=str)
        cm.index = indices

        cs_net = newick_to_network(newick_str, cm)

        for n in cs_net:
            if n.name in samples_to_cells:
                n.name = samples_to_cells[n.name]

        cs_net = fill_in_tree(cs_net, cm)

        cs_net = tree_collapse2(cs_net)

        cm_lookup = dict(
            zip(
                list(
                    cm.apply(lambda x: "|".join([str(k) for k in x.values]),
                             axis=1)),
                cm.index.values,
            ))

        for n in cs_net:
            if n.char_string in cm_lookup.keys():
                n.is_target = True

        cs_net = Cassiopeia_Tree("camin-sokal", network=cs_net)
        if outfp is None:
            outfp = name.replace("true", "cs")
        with open(outfp, "wb") as out_file:
            pic.dump(cs_net, out_file)

        os.system("rm " + outfile)
        os.system("rm " + responses)
        os.system("rm " + outtree)
        os.system("rm " + consense_outfile)
        os.system("rm " + infile)
        os.system("rm " + fn)

    else:

        raise Exception(
            "Please choose an algorithm from the list: greedy, hybrid, ilp, nj, or camin-sokal"
        )
#plt.tick_params(axis='both', which='minor', labelsize=6) plt.xticks(rotation=45, horizontalalignment='right', fontweight='light', fontsize=6 ) #fig.text(0.5, 0.04, 'Sequence', ha='center') fig.text(0.04, 0.5, 'Distances', va='center', rotation='vertical', fontsize=6 ) plt.savefig( results_file, dpi = 200, bbox_inches='tight') ################################################################################################################### ########## phylogenetics ########################################################### ################################################################################################################### dm = DistanceMatrix(DIST_proposed, sequences) tree = nj(dm) #print(tree.ascii_art()) newick_str_fos = nj(dm, result_constructor=str) t = PhyloTree(newick_str_fos) f = open(current_dir + "/results/" + path_netwick + "/fos.txt", "w") f.write(newick_str_fos) f.close() #t.show() dm = DistanceMatrix(DIST_proposed_glcm, sequences) tree = nj(dm) #print(tree.ascii_art()) newick_str_glcm = nj(dm, result_constructor=str) t = PhyloTree(newick_str_glcm) f = open(current_dir + "/results/" + path_netwick + "/glcm.txt", "w") f.write(newick_str_glcm)
from ete3 import PhyloTree, TreeStyle from skbio import DistanceMatrix from skbio.tree import nj data = [[0, 8, 4, 6], [8, 0, 8, 8], [4, 8, 0, 6], [6, 8, 6, 0]] ids = list('abcd') dm = DistanceMatrix(data, ids) tree = nj(dm) print(tree.ascii_art()) newick_str = nj(dm, result_constructor=str) print(newick_str) #print(newick_str[:55], "...") t = PhyloTree(newick_str) t.show() alg = """ >Dme_001 MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEAL--YYASQTDDIKDRREEAH >Dme_002 MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH >Cfa_001 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH >Mms_001 MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH >Hsa_001 MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH >Ptr_002 MAEIPDATIQ-FMALTNVSHNIAVQY--EFGDLNEALNSY--YQTDDQKDRREEAH >Mmu_002 MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
def construct_cluster(args, dm):
    """Cluster the entries of a distance matrix with neighbor joining.

    Parameters
    ----------
    args
        Not used by this function.
    dm
        Distance matrix handed unchanged to ``skbio.tree.nj``.

    Returns
    -------
    The value returned by ``skbio.tree.nj(dm)``.
    """
    # The import is resolved at call time rather than at module import time.
    from skbio.tree import nj

    return nj(dm)
for j in range(len(my_data[i])): if (math.isnan(my_data[i][j])): my_data[i][j] = 0 my_data = numpy.array(my_data) data = my_data.T + my_data for i in data: for j in i: print(j, end=' ') print() dm = DistanceMatrix(data, ids) tree = nj(dm, disallow_negative_branch_length=False) print(tree.ascii_art()) tree_file = open(tree_file, 'w+') tree_file.write(tree.ascii_art()) tree_file.close() nws = nj(dm, result_constructor=str) print(nws) nws_file_l = open(nws_file, 'w+') nws_file_l.write(nws) nws_file_l.close() bio_tree = Phylo.read("work/NLP/Trees/output_data.txt", 'newick')
log_choices = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] parser.add_argument( '--log-level', '-l', default="INFO", choices=log_choices, help="Set logging level. Default is info." ) return parser if __name__ == '__main__': parser = get_argument_parser() args = parser.parse_args() level = getattr(logging, args.log_level.upper(), logging.INFO) logging.basicConfig(level=level) sequences = SequenceCollection.read(args.infile, format=args.format) if args.parallel == 0 and len(sequences) > 16: pool_size = multiprocessing.cpu_count() else: pool_size = 1 dmatrix = create_distance_matrix(sequences, d2.distance, pool_size, statistic=d2.d2_neighbourhood_dna) print(dmatrix) phylo_tree = nj(dmatrix) print(phylo_tree.ascii_art()) phylo_tree.write(args.outfile, format=args.target)
trees_file.close()
print("\n%d/%d trees were < 4." % (count, len(matrix_df.columns.tolist())))

# Replace NaNs with 0.0 before computing distances between rows.
matrix_df = matrix_df.fillna(0.0)
sps = matrix_df.index.tolist()
size = len(sps)

# Symmetric matrix of Euclidean distances between the rows of ``matrix_df``:
# only the upper triangle (diagonal included) is computed, then mirrored.
dmat = np.zeros((size, size))
for i in range(size):
    # The row for ``sps[i]`` does not change inside the inner loop.
    row_i = matrix_df.loc[sps[i]]
    for j in range(i, size):
        dmat[i][j] = distance.euclidean(row_i, matrix_df.loc[sps[j]])
        dmat[j][i] = dmat[i][j]
dmat = DistanceMatrix(dmat, sps)

# Neighbor-joining tree rooted at its midpoint, serialised with every
# occurrence of the text 'root' removed.
nw = str(nj(dmat).root_at_midpoint()).replace('root','')
# A context manager closes the file even if the write raises.
with open(os.path.join(trees_path, 'nj_tree_euclidean.txt'), 'w') as njf:
    njf.write(nw)

# Export the annotation rows selected by ``up_proteins`` and by
# ``down_proteins`` to two separate CSV files.
all_annotations = pd.read_csv(
    os.path.join(current_path, 'dataset/all_prostate_cancer_annotations.csv'),
    index_col=0,
    header=0,
)
selected_proteins_annotations = all_annotations.loc[up_proteins]
selected_proteins_annotations.to_csv(
    os.path.join(current_path, 'dataset/selected_prostate_cancer_annotations_up_regulated.csv'),
    header=True,
    index=True,
)
selected_proteins_annotations = all_annotations.loc[down_proteins]
selected_proteins_annotations.to_csv(
    os.path.join(current_path, 'dataset/selected_prostate_cancer_annotations_down_regulated.csv'),
    header=True,
    index=True,
)