def get_tree(self, chrom, start=1, end=None, samples=None, return_format="tree_obj"):
    """Build a neighbour-joining tree for a genomic region.

    Distances are obtained from ``self.get_matrix`` (requested with
    ``return_format="Phylo"``, which yields a ``(names, matrix)`` pair) and
    fed to Biopython's ``DistanceTreeConstructor.nj``.

    :param chrom: chromosome identifier, forwarded to ``get_matrix``.
    :param start: region start, forwarded to ``get_matrix``.
    :param end: region end, forwarded to ``get_matrix``.
    :param samples: optional sample selection, forwarded to ``get_matrix``.
    :param return_format: ``"tree_obj"`` to get the tree object, or
        ``"newick"`` to get the stripped Newick string.
    :raises ValueError: if ``return_format`` is not one of the two
        supported values (previously this silently returned ``None`` after
        doing all the work).
    """
    if return_format not in ("tree_obj", "newick"):
        raise ValueError(
            "return_format must be 'tree_obj' or 'newick', got {!r}".format(
                return_format))

    print("chrom: {} start: {} end: {} samples: {}".format(
        chrom, start, end, samples))
    names, matrix = self.get_matrix(chrom, start=start, end=end,
                                    samples=samples, return_format="Phylo")
    distance_matrix = _DistanceMatrix(names, matrix)
    tree = DistanceTreeConstructor().nj(distance_matrix)  # neighbour joining tree

    if return_format == "tree_obj":
        return tree

    # return_format == "newick": serialise through an in-memory buffer.
    handle = StringIO()
    Phylo.write(tree, handle, "newick")
    return handle.getvalue().strip()
def test_good_construction(self):
    """A well-formed name list and lower-triangular matrix build correctly."""
    expected_repr = (
        "_DistanceMatrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], "
        "matrix=[[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]])"
    )
    dist_matrix = _DistanceMatrix(self.names, self.matrix)

    self.assertTrue(
        isinstance(dist_matrix, TreeConstruction._DistanceMatrix))
    # Spot-check the stored names, one matrix cell and the size.
    self.assertEqual(dist_matrix.names[0], 'Alpha')
    self.assertEqual(dist_matrix.matrix[2][1], 3)
    self.assertEqual(len(dist_matrix), 4)
    self.assertEqual(repr(dist_matrix), expected_repr)
def construct_tree(matrix, nj=True):
    """Build a tree from a distance matrix.

    Can either use neighbor-joining (nj) or UPGMA.

    :param matrix: non-empty list of rows, passed unchanged to
        ``_DistanceMatrix``; taxa are named ``"0"``, ``"1"``, ... by row index.
    :param nj: use neighbor-joining when true, UPGMA otherwise.
    :returns: the constructed tree with all inner-node names blanked, or
        ``None`` (after printing a message) when ``matrix`` is not a
        non-empty list.
    """
    # isinstance is checked first so that objects with ambiguous truthiness
    # (e.g. numpy arrays) are reported as invalid instead of raising.
    if not (isinstance(matrix, list) and matrix):
        print("matrix has invalid value")
        return None

    names = [str(i) for i in range(len(matrix))]
    dm = _DistanceMatrix(names=names, matrix=matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm) if nj else constructor.upgma(dm)

    # this will remove the names from the inner nodes
    # this is critical for seq-gen to read in the tree
    for clade in tree.get_nonterminals():
        clade.name = ''
    return tree
def calcTree(ensemble, distance_matrix):
    """Given a distance matrix for an ensemble, create and return a tree.

    The tree is built with Biopython's neighbor-joining constructor and the
    names of all inner nodes are cleared before it is returned.

    :arg ensemble: an ensemble with labels.
    :type ensemble: prody.ensemble.Ensemble or prody.ensemble.PDBEnsemble
    :arg distance_matrix: a square matrix whose side equals the number of
        ensemble labels. A size mismatch raises :exc:`ValueError`.
    :type distance_matrix: numpy.ndarray
    :raises ImportError: if Biopython's Phylo module cannot be imported.
    :raises ValueError: if the matrix is not ``len(labels) x len(labels)``.
    """
    try:
        from Bio import Phylo
    except ImportError:
        raise ImportError('Phylo module could not be imported. '
                          'Reinstall ProDy or install Biopython '
                          'to solve the problem.')

    names = ensemble.getLabels()
    if (len(names) != distance_matrix.shape[0]
            or len(names) != distance_matrix.shape[1]):
        raise ValueError("The size of matrix and ensemble has a mismatch.")

    # Biopython wants the lower triangle including the diagonal:
    # row k (1-based) keeps its first k entries.
    matrix = [list(row[:k]) for k, row in enumerate(distance_matrix, 1)]

    from Bio.Phylo.TreeConstruction import _DistanceMatrix
    dm = _DistanceMatrix(names, matrix)
    constructor = Phylo.TreeConstruction.DistanceTreeConstructor()
    tree = constructor.nj(dm)

    for node in tree.get_nonterminals():
        node.name = None
    return tree
def distance_matrix(cls, cluster_list):
    """Return a ladderized neighbor-joining tree for a cluster as Newick text.

    Pairwise distances are read from ``Distance`` rows whose two accession
    numbers are both in ``cluster_list``; a pair may be stored in either
    orientation.

    :param cluster_list: sequence of accession-number strings.
    :returns: the Newick string written by ``Phylo.write``.
    :raises ValueError: if no distance is stored for some pair. (The
        original ``raise("...")`` raised a bare string, which surfaced as an
        unrelated TypeError and lost the message.)
    """
    print(cluster_list)
    dists = Distance.objects.filter(rep_accnum1__in=cluster_list,
                                    rep_accnum2__in=cluster_list)
    distance_pairs = {g.rep_accnum1 + '_' + g.rep_accnum2: g.distance
                      for g in dists.all()}

    # Lower-triangular matrix: row i holds distances to items 0..i-1 and a
    # trailing 0 for the diagonal.
    matrix = []
    for i in range(len(cluster_list)):
        row = []
        for j in range(i):
            forward = cluster_list[i] + '_' + cluster_list[j]
            backward = cluster_list[j] + '_' + cluster_list[i]
            if forward in distance_pairs:
                row.append(distance_pairs[forward])
            elif backward in distance_pairs:
                row.append(distance_pairs[backward])
            else:
                raise ValueError("Error, can't find pair! (%s, %s)"
                                 % (cluster_list[i], cluster_list[j]))
        row.append(0)
        matrix.append(row)

    cluster_list = [s.encode('ascii', 'ignore') for s in cluster_list]
    matrix_obj = _DistanceMatrix(names=cluster_list, matrix=matrix)
    tree = DistanceTreeConstructor().nj(matrix_obj)
    tree.ladderize()

    output = StringIO.StringIO()
    Phylo.write(tree, output, 'newick')
    return output.getvalue()
def nj_wordlist(
        wordlist,
        column="Value",
        method=DistanceTreeConstructor.nj):
    """Build a distance tree from presence/absence of values per language.

    `wordlist` is read as a tab-separated table with pandas and grouped by
    its ``Language_ID`` column. Each language is represented by the set of
    values found in `column`; the distance between two languages is the
    size of the symmetric difference of their sets (a Hamming distance on
    presence/absence). `method` is an unbound ``DistanceTreeConstructor``
    method (neighbor joining by default, or UPGMA) that turns the distance
    matrix into a tree.
    """
    table = pandas.read_csv(wordlist, sep="\t")

    languages = []
    value_sets = []
    for language, rows in table.groupby("Language_ID"):
        languages.append(language)
        value_sets.append(set(rows[column]))

    # Lower triangle including the diagonal, one row per language.
    lower_triangle = []
    for i in range(len(value_sets)):
        lower_triangle.append(
            [len(value_sets[i] ^ value_sets[j]) for j in range(i + 1)])

    dm = _DistanceMatrix(languages, lower_triangle)
    return method(DistanceTreeConstructor(), dm)
def test_good_manipulation(self):
    """Exercise item access, assignment, deletion and insertion.

    The steps mutate the same matrix in sequence, so their order matters.
    """
    dist = _DistanceMatrix(self.names, self.matrix)

    # getitem: by index, by (row, col) in either order, and by name
    self.assertEqual(dist[1], [1, 0, 3, 5])
    self.assertEqual(dist[2, 1], 3)
    self.assertEqual(dist[2][1], 3)
    self.assertEqual(dist[1, 2], 3)
    self.assertEqual(dist[1][2], 3)
    self.assertEqual(dist['Alpha'], [0, 1, 2, 4])
    self.assertEqual(dist['Gamma', 'Delta'], 6)

    # setitem: replace a whole row by name
    new_alpha_row = [0, 10, 20, 40]
    dist['Alpha'] = new_alpha_row
    self.assertEqual(dist['Alpha'], [0, 10, 20, 40])

    # delitem by index, then insert the item back at the same position
    del dist[1]
    self.assertEqual(dist.names, ['Alpha', 'Gamma', 'Delta'])
    self.assertEqual(dist.matrix, [[0], [20, 0], [40, 6, 0]])
    dist.insert('Beta', [1, 0, 3, 5], 1)
    self.assertEqual(dist.names, self.names)
    self.assertEqual(dist.matrix, [[0], [1, 0], [20, 3, 0], [40, 5, 6, 0]])

    # delitem by name, then insert without an index (appends at the end)
    del dist['Alpha']
    self.assertEqual(dist.names, ['Beta', 'Gamma', 'Delta'])
    self.assertEqual(dist.matrix, [[0], [3, 0], [5, 6, 0]])
    dist.insert('Alpha', [1, 2, 4, 0])
    self.assertEqual(dist.names, ['Beta', 'Gamma', 'Delta', 'Alpha'])
    self.assertEqual(dist.matrix, [[0], [3, 0], [5, 6, 0], [1, 2, 4, 0]])
def measure_D_net(G,qmod,qcon): D_net_dic = {} D_net_ret = {} D_net = [] for u in G: D_net_dic[u] = {} for u in sorted(G): key1 = "Taxon" + str(u) tmp_row = [] for v in sorted(G): key2 = "Taxon" + str(v) if u < v: continue D_net_dic[u][v] = 1.0 - G.dmc_likelihood(u,v,qmod,qcon) tmp_row.append(D_net_dic[u][v]) print D_net_dic[u][v], D_net.append(tmp_row) print '\n' names = [] for u in G: names.append('Taxon'+str(u)) print names print D_net D_net_final = _DistanceMatrix(names,D_net) #print D_net_final.names constructor = DistanceTreeConstructor() tree_dmc = constructor.upgma(D_net_final) #print tree_dmc Phylo.write(tree_dmc,'ph_dmc.nre','newick') return D_net_final
def dm_to_tree(dm):
    """Convert a square distance table into a Biopython ``_DistanceMatrix``.

    :param dm: table offering ``astype``, ``values`` and ``columns``
        (presumably a pandas DataFrame - TODO confirm against callers).
    :raises Exception: whatever ``_DistanceMatrix`` raises; the column
        labels and their types are printed first to help diagnose it.

    NOTE(review): as visible here the function builds the matrix but
    returns nothing.
    """
    dm = dm.astype(float)
    # Row i keeps columns 0..i: lower triangle including the diagonal.
    distance_triangular = [list(dm.values[i, : i + 1]) for i in range(len(dm))]
    try:
        dm = _DistanceMatrix(names=[str(i) for i in dm.columns],
                             matrix=distance_triangular)
    except Exception:
        # `dm` is still the input table here because the assignment failed.
        print(list(dm.columns))
        print([type(i) for i in dm.columns])
        # Do not swallow the failure after printing the diagnostics.
        raise
def construct_tree(X_2d, acc, title):
    """Write a neighbor-joining tree of the rows of ``X_2d`` to a file.

    Pairwise distances between rows are computed with
    ``pairwise_distances``; NaN distances are replaced by 0. The tree is
    written in Newick format to ``title + ".nwk"``. Nothing is returned.

    :param X_2d: input passed to ``pairwise_distances``.
    :param acc: iterable of labels, one per row.
    :param title: output path without the ``.nwk`` suffix.
    """
    labels = list(acc)

    dist = pairwise_distances(X_2d).astype('float')
    dist[np.isnan(dist)] = 0

    # Lower triangle including the diagonal, as plain Python lists.
    lower_triangle = [
        [dist[i, j] for j in range(0, i + 1)]
        for i in range(dist.shape[0])
    ]

    dm = _DistanceMatrix(labels, matrix=lower_triangle)
    tree = DistanceTreeConstructor().nj(dm)
    Phylo.write(tree, title + ".nwk", 'newick')
def dm_to_tree(dm):
    """Convert a square distance table into a Biopython ``_DistanceMatrix``.

    :param dm: table offering ``astype``, ``values`` and ``columns``
        (presumably a pandas DataFrame - TODO confirm against callers).
    :raises Exception: whatever ``_DistanceMatrix`` raises; diagnostics
        about the labels and the triangular matrix are printed first and
        the original exception is re-raised with its traceback intact.

    NOTE(review): as visible here the function builds the matrix but
    returns nothing.
    """
    dm = dm.astype(float)
    # Row i keeps columns 0..i: lower triangle including the diagonal.
    distance_triangular = [list(dm.values[i, :i + 1]) for i in range(len(dm))]
    try:
        dm = _DistanceMatrix(names=[str(i) for i in dm.columns],
                             matrix=distance_triangular)
    except Exception:
        # `dm` is still the input table here because the assignment failed.
        print(list(dm.columns))
        print([type(i) for i in dm.columns])
        print(type(distance_triangular))
        print(type(distance_triangular[0]))
        print(set([str(type(i)) for j in distance_triangular for i in j]))
        print(distance_triangular)
        # Bare raise keeps the original traceback (``raise e`` resets it
        # on Python 2).
        raise
def lower_tri(full_matrix):
    '''
    Convert a labelled symmetrical matrix into a lower-triangle
    _DistanceMatrix object.

    Each row of full_matrix starts with the row's name followed by its
    values; row n (1-based) contributes its first n values, converted to
    float.
    '''
    names = []
    lower_triangle = []
    for row_number, row in enumerate(full_matrix, start=1):
        kept = row[1:row_number + 1]
        lower_triangle.append([float(value) for value in kept])
        names.append(row[0])

    from Bio.Phylo.TreeConstruction import _DistanceMatrix
    return _DistanceMatrix(names, lower_triangle)
def construct_tree(align, ssr_regions, motifs, weights=(1, 0.1)):
    """
    Construct an upgma tree based on a pairwise Levenshtein distance matrix.

    For each pairwise comparison, the Levenshtein distances are calculated
    for sequences of non-SSR and SSR regions separately, and the weighted
    sum of them is used as the distance to construct an upgma tree. By
    default, weights for non-SSR and SSR regions are 1 and 0.1,
    respectively. In SSR regions, each motif occurrence is replaced by a
    single character, so one repeat difference is one edit distance.

    Parameters
    ----------
    align: Bio.AlignIO.MultipleSeqAlignment
        input sequence alignment
    ssr_regions: list of tuple
        start and end positions of SSR regions in the alignment
    motifs: list
        repeat motifs, one per SSR region
    weights: sequence of two numbers
        weights for non-SSR and SSR regions to calculate pairwise
        distances (default: (1, 0.1))

    Returns
    -------
    The tree returned by ``DistanceTreeConstructor.upgma``; taxa are named
    ``seq0``, ``seq1``, ... in alignment order.
    """
    # The SSR columns do not depend on the sequence, so compute them once.
    ssr_positions = set(
        chain.from_iterable(range(*region) for region in ssr_regions))

    non_ssr_seqs = []
    ssr_seqs = []
    for record in align:
        seq = str(record.seq.upper())
        # Walk the columns in order so the non-SSR residues keep their
        # sequence order (set iteration order is not guaranteed).
        non_ssr_seqs.append(
            "".join(base for pos, base in enumerate(seq)
                    if pos not in ssr_positions))
        # Drop gaps, then collapse each motif occurrence to a single "x".
        ssr_seqs.append(
            "".join(seq[rr[0]:rr[1]].replace("-", "").replace(mot, "x")
                    for rr, mot in zip(ssr_regions, motifs)))

    mat1 = pairwise_dist_Levenstein(non_ssr_seqs)
    mat2 = pairwise_dist_Levenstein(ssr_seqs)
    mat = [
        list(np.array(i) * weights[0] + np.array(j) * weights[1])
        for i, j in zip(mat1, mat2)
    ]

    names = ["seq{}".format(i) for i in range(len(align))]
    dmat = _DistanceMatrix(names, mat)
    constructor = DistanceTreeConstructor()
    return constructor.upgma(dmat)
def create_distance_matrix(strainList, strainDict):
    """Fill a ``_DistanceMatrix`` with pairwise strain distances.

    For every pair of strains (including each strain with itself) the
    distance is ``calc_dist(len(A), len(B), len(A & B))`` where ``A`` and
    ``B`` are the strains' entries in ``strainDict``.

    :param strainList: list of strain names; defines the matrix order.
    :param strainDict: maps each strain name to a collection supporting
        ``len`` and ``&`` (e.g. a set of genes).
    :returns: the populated ``_DistanceMatrix``.
    """
    print("Calculating distance matrix")

    # Zero-filled lower triangle including the diagonal.
    matrix = [[0] * i for i in range(1, len(strainList) + 1)]
    dm = _DistanceMatrix(strainList, matrix)

    for a, strA in enumerate(strainList):
        genA = strainDict[strA]  # invariant for the whole inner loop
        for strB in strainList[a:]:
            genB = strainDict[strB]
            dm[strA, strB] = calc_dist(len(genA), len(genB), len(genA & genB))

    print("Done calculating distance matrix")
    return dm
def create_distance_matrix(strainList, strainDict):
    """Compute all pairwise strain distances into a ``_DistanceMatrix``.

    Each cell is ``calc_dist(len(A), len(B), len(A & B))`` for the two
    strains' entries ``A`` and ``B`` in ``strainDict``; the diagonal is
    computed the same way.

    :param strainList: list of strain names; defines the matrix order.
    :param strainDict: maps each strain name to a collection supporting
        ``len`` and ``&``.
    :returns: the populated ``_DistanceMatrix``.
    """
    print("Calculating distance matrix")

    n_strains = len(strainList)
    # Start from a zero-filled lower triangle (row i has i + 1 cells).
    dm = _DistanceMatrix(strainList,
                         [[0] * (row + 1) for row in range(n_strains)])

    for a in range(n_strains):
        strA = strainList[a]
        genA = strainDict[strA]  # looked up once per row, not per cell
        for b in range(a, n_strains):
            strB = strainList[b]
            genB = strainDict[strB]
            dm[strA, strB] = calc_dist(len(genA), len(genB), len(genA & genB))

    print("Done calculating distance matrix")
    return dm
def build_guide_trees(distance_matrix): # build distance matrix biopython object matrix = [distance_matrix[i, :i + 1].tolist() for i in range(len(distance_matrix))] names = ['S' + str(i) for i in range(len(distance_matrix))] dm = _DistanceMatrix(names, matrix) print('Constructed matrix') constructor = DistanceTreeConstructor() # construct neighbour joining tree t = time.time() tree = constructor.nj(dm) print('Constructed nj tree in {:.4f}s'.format(time.time() - t)) Phylo.write(tree, "njtree.dnd", "newick") remove_inner_nodes_tree("njtree.dnd") """
def test_correct_res(self):
    """The fitted tree is isomorphic to Biopython's NJ tree for the same data."""
    self.tree.set_distance_matrix(
        pd.read_csv("data/wiki_tree.csv", index_col=0))
    self.tree.fit()

    # Reference tree built by Biopython from the hard-coded lower triangle.
    reference_matrix = _DistanceMatrix(
        names=['a', 'b', 'c', 'd', 'e'],
        matrix=[[0],
                [5, 0],
                [9, 10, 0],
                [9, 10, 8, 0],
                [8, 9, 7, 3, 0]])
    lib_tree = DistanceTreeConstructor().nj(reference_matrix)

    expected_graph = Phylo.to_networkx(lib_tree).to_undirected()
    actual_graph = Phylo.to_networkx(self.tree.get_tree()).to_undirected()
    self.assertTrue(is_isomorphic(expected_graph, actual_graph))
def main():
    """Build a neighbor-joining tree from a mash distance table.

    Reads a list of assembly paths (one per line) from ``--input``, obtains
    a tab-separated distance table from ``make_mash_matrix`` and writes the
    resulting tree to ``my_tree_<digits>.tree`` in Newick format.

    The table is expected to have a header line with the names, followed by
    one line per genome whose first field is a label and whose following
    fields are distances; only the lower triangle (including the diagonal)
    is read.
    """
    import shutil

    ### Main Arg Parse ###
    parser = argparse.ArgumentParser(description="Fast tree builder v1")
    # Without --input the script cannot do anything; let argparse report it
    # instead of failing later in open(None).
    parser.add_argument('-i','--input',help="Input list of assembly paths",required=True)
    parser.add_argument('-t','--threads',help="Threads for mash)",default="1")
    args = vars(parser.parse_args())

    with open(args["input"]) as f:  # List of assembly paths
        input_data = f.read().strip().split("\n")

    # The scratch directory is only needed while the table is computed;
    # remove it afterwards so repeated runs do not leak temp directories.
    temp_dir = tempfile.mkdtemp()
    try:
        mash_matrix = make_mash_matrix(input_data, temp_dir, args["threads"])
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)

    data_lines = [line for line in mash_matrix.split("\n") if line.strip() != ""]
    names = []
    matrix = []
    if data_lines:
        names = [obj for obj in data_lines[0].split("\t") if len(obj) > 0]
    for row_number, line in enumerate(data_lines[1:], start=1):
        values = line.split("\t")
        # Field 0 is the row label; row n keeps its first n distances.
        matrix.append([float(values[q]) for q in range(1, row_number + 1)])

    print("building tree")
    dm = _DistanceMatrix(names,matrix)
    constructor = DistanceTreeConstructor(method="nj")
    tree = constructor.nj(dm)

    # Fractional part of the current time, used as a cheap unique suffix.
    unique_time = str(time.time()).split(".")[1]
    Phylo.write([tree],"my_tree_{}.tree".format(unique_time),"newick")
def calcTree(names, distance_matrix, method='nj'):
    """Build and return a tree from labels and a square distance matrix.

    Inner-node names of the resulting tree are cleared.

    :arg names: labels, one per row/column of *distance_matrix*
    :type names: list, :class:`~numpy.ndarray`

    :arg distance_matrix: a square matrix whose side equals ``len(names)``;
        a mismatch raises :exc:`ValueError`
    :type distance_matrix: :class:`~numpy.ndarray`

    :arg method: ``'nj'`` or ``'upgma'`` (surrounding whitespace and case
        are ignored); anything else raises :exc:`ValueError`
    :type method: str
    """
    try:
        from Bio import Phylo
    except ImportError:
        raise ImportError('Phylo module could not be imported. '
                          'Reinstall ProDy or install Biopython '
                          'to solve the problem.')

    n_names = len(names)
    is_square_match = (n_names == distance_matrix.shape[0]
                       and n_names == distance_matrix.shape[1])
    if not is_square_match:
        raise ValueError("Mismatch between the sizes of matrix and names.")

    # Lower triangle including the diagonal: row k keeps k entries.
    lower_triangle = []
    for width, row in enumerate(distance_matrix, start=1):
        lower_triangle.append(list(row[:width]))

    from Bio.Phylo.TreeConstruction import _DistanceMatrix
    if isinstance(names, np.ndarray):
        names = names.tolist()
    dm = _DistanceMatrix(names, lower_triangle)
    constructor = Phylo.TreeConstruction.DistanceTreeConstructor()

    builders = {'nj': constructor.nj, 'upgma': constructor.upgma}
    method = method.strip().lower()
    if method not in builders:
        raise ValueError('Method can be only either "nj" or "upgma".')
    tree = builders[method](dm)

    for node in tree.get_nonterminals():
        node.name = None
    return tree
def bootstrap(ps, jobID, basename='majorityTree', treebuilder='nj',
              bootstraps=1, outgroup=None):
    """Build bootstrap trees and pickle them to disk.

    treebuilder could be nj/upgma, outgroup: a population name or 'midpoint'

    For each round, loci are resampled with replacement from the union of
    every population's ``allpolySites``, a distance matrix is computed with
    ``neiDF`` and a tree is built with the ``DistanceTreeConstructor``
    method named by ``treebuilder``. The list of trees is pickled to
    ``../Data/<basename>_<bootstraps>_<treebuilder>_<npops>_<jobID>.pcl``.

    :param ps: object exposing ``populations`` and ``popnames``.
    :param jobID: integer job number, used in the output file name.
    :raises ValueError: if ``_DistanceMatrix`` rejects the distances. The
        original dropped into ``pdb`` here, which hangs a non-interactive
        job and then fails with a NameError on the unset matrix.
    """
    allLoci = set()
    for pop in ps.populations:
        allLoci.update(pop.allpolySites)
    allLoci = list(
        allLoci
    )  ## sort it? Reduce to independent sites (www.pnas.org/content/93/23/13429, run LD?)
    sites = len(allLoci)

    trees = []
    for _ in range(bootstraps):
        selectedLoci0 = np.random.choice(range(len(allLoci)),
                                         sites,
                                         replace=True)
        selectedLoci = [allLoci[l] for l in selectedLoci0]
        df = pd.DataFrame(
            [pop.bootstrap(selectedLoci) for pop in ps.populations],
            index=ps.popnames)
        dmNei = neiDF(df, [5] * (sites - 1))
        ## annoying conversion, BioPython couldnt be just more compatible with scipy/pdist?
        dmTriangular = [list(dmNei[i, :(i + 1)]) for i in range(len(dmNei))]
        m = _DistanceMatrix(ps.popnames, dmTriangular)

        constructor = DistanceTreeConstructor(
        )  # could've passed treebuilder here too
        tree = getattr(constructor, treebuilder)(m)
        if outgroup == 'midpoint':
            tree.root_at_midpoint()
        elif outgroup is not None:
            tree.root_with_outgroup({'name': outgroup})
        trees.append(tree)

    filename = '../Data/%s_%s_%s_%s_%04d.pcl' % (
        basename, bootstraps, treebuilder, len(ps.populations), jobID)
    with open(filename, "wb") as w:
        pickle.dump(trees, w)
def bootstrap(self, afbased=True, basename='majorityTree', treebuilder='nj',
              bootstraps=1000, outgroup=None, useAllLoci=False):
    """Bootstrap population trees and store their majority consensus.

    treebuilder could be nj/upgma, outgroup: a population name or 'midpoint'

    Each round resamples loci with replacement (or uses all loci when
    ``useAllLoci`` is true), computes ``self.dmNei`` with ``neiDF`` and
    builds a tree with the ``DistanceTreeConstructor`` method named by
    ``treebuilder``. The majority-consensus tree is stored in
    ``self.majorityTree``, written as Newick under ``resultDir`` and drawn
    as ASCII.
    """
    ## allLoci: all loci that are variable in at least one population
    attribute_names = {True: ['allpolySites', 'pwm'],
                       False: ['allpolySitesVCF', 'pwmVCF']}
    site_attr, _pwm = attribute_names[afbased]

    allLoci = set()
    for pop in self.populations:
        allLoci.update(getattr(pop, site_attr))
    allLoci = list(allLoci)  ## sort it? Reduce to independent sites (www.pnas.org/content/93/23/13429, run LD?)
    sites = len(allLoci)

    trees = []
    print ("Bootstrapping, rounds:", end=' ')
    for round_no in range(bootstraps):  ## see also parallelized version
        print(round_no, end=' ')
        if useAllLoci:
            selectedLoci = allLoci
        else:
            picks = np.random.choice(range(len(allLoci)), sites, replace=True)
            selectedLoci = [allLoci[l] for l in picks]

        per_population = [pop.bootstrap(selectedLoci, afbased)
                          for pop in self.populations]
        df = pd.DataFrame(per_population, index=self.popnames)
        self.dmNei = neiDF(df, [5]*(sites-1))

        ## annoying conversion, BioPython couldnt be just more compatible with scipy/pdist?
        dmTriangular = [list(self.dmNei[i, :(i + 1)])
                        for i in range(len(self.dmNei))]
        m = _DistanceMatrix(self.popnames, dmTriangular)
        constructor = DistanceTreeConstructor()  # could've passed treebuilder here too
        build = getattr(constructor, treebuilder)
        tree = build(m)

        if outgroup == 'midpoint':
            tree.root_at_midpoint()
        elif outgroup is not None:
            tree.root_with_outgroup({'name': outgroup})
        trees.append(tree)  ## use nj!

    ## debug info: print(f'selectedLoci: {selectedLoci[:30]}')
    ## see https://biopython.org/wiki/Phylo, turned out to be more suitable than dendropy/sumtrees
    self.majorityTree = Consensus.majority_consensus(trees)  ## also consider strict_consensus and adam_consensus (but they don't have bootstrap support values)
    treefile = '%s/%s_%s_%s_%s.nwk' %(resultDir, basename, bootstraps,treebuilder, len(self.populations))
    Phylo.write(self.majorityTree, treefile, format='newick')
    print(f'wrote {treefile}')
    Phylo.draw_ascii(self.majorityTree)
def build_phylo_tree(file_name_pair_dist): #to return total branch lenght file_pair_dist = file_name_pair_dist #all.reciprocal pair_dist = commands.getoutput("cut -f1,2,3 " + file_pair_dist + " ") #need to delete header of the table pair_dist = pair_dist.split('\n') list_genome = commands.getoutput("awk '{print $1}{print $2}' " + file_pair_dist + " |sort -g|uniq") list_genome = list_genome.split('\n') #print 'Total genomes >> ',len(list_genome) Total_genomes = len(list_genome) #print 'Total pair distance >> ',(len(list_genome)*(len(list_genome)-1))/2 Total_pair_distance = (len(list_genome) * (len(list_genome) - 1)) / 2 check1 = False if len(pair_dist) == ( len(list_genome) * (len(list_genome) - 1)) / 2: #To check the number of pair distance #print 'Pass check 1 : total pair distances are correctly found ' check1 = True matrix = [] for n in range(1, len(list_genome) + 1): matrix.append([0] * n) #print matrix Total_metrix = len(matrix) #print 'Total matrix >>',len(matrix) Ds = _DistanceMatrix(list_genome, matrix) for pair in pair_dist: i = pair.split() #print i Ds[i[0], i[1]] = float(i[2]) #print Ds constructor = DistanceTreeConstructor() tree = constructor.nj(Ds) #print (tree) #for visualization #Phylo.draw(tree, branch_labels=lambda c: c.branch_length) print 'total_branch_length >> ', tree.total_branch_length() return Total_genomes, Total_pair_distance, check1, tree.total_branch_length( )
def test_bad_manipulation(self):
    """Invalid keys and values are rejected with the documented errors."""
    dm = _DistanceMatrix(self.names, self.matrix)

    # getitem
    bad_lookups = [
        (ValueError, 'A'),
        (ValueError, ('Alpha', 'A')),
        (TypeError, (1, 'A')),
        (TypeError, (1, 1.2)),
        (IndexError, 6),
        (IndexError, (10, 10)),
    ]
    for expected_error, key in bad_lookups:
        self.assertRaises(expected_error, dm.__getitem__, key)

    bad_assignments = [
        # setitem: item or index test
        (ValueError, 'A', [1, 3, 4]),
        (ValueError, ('Alpha', 'A'), 4),
        (TypeError, (1, 'A'), 3),
        (TypeError, (1, 1.2), 2),
        (IndexError, 6, [1, 3, 4]),
        (IndexError, (10, 10), 1),
        # setitem: value test
        (ValueError, 0, [1, 2]),
        (TypeError, ('Alpha', 'Beta'), 'a'),
        (TypeError, 'Alpha', ['a', 'b', 'c']),
    ]
    for expected_error, key, value in bad_assignments:
        self.assertRaises(expected_error, dm.__setitem__, key, value)
def test_bad_manipulation(self):
    """Bad keys or values must raise ValueError, TypeError or IndexError."""
    matrix = _DistanceMatrix(self.names, self.matrix)
    lookup = matrix.__getitem__
    assign = matrix.__setitem__

    # Each case: (expected exception, bound method, positional arguments).
    cases = (
        #getitem
        (ValueError, lookup, ('A',)),
        (ValueError, lookup, (('Alpha', 'A'),)),
        (TypeError, lookup, ((1, 'A'),)),
        (TypeError, lookup, ((1, 1.2),)),
        (IndexError, lookup, (6,)),
        (IndexError, lookup, ((10, 10),)),
        #setitem: item or index test
        (ValueError, assign, ('A', [1, 3, 4])),
        (ValueError, assign, (('Alpha', 'A'), 4)),
        (TypeError, assign, ((1, 'A'), 3)),
        (TypeError, assign, ((1, 1.2), 2)),
        (IndexError, assign, (6, [1, 3, 4])),
        (IndexError, assign, ((10, 10), 1)),
        #setitem: value test
        (ValueError, assign, (0, [1, 2])),
        (TypeError, assign, (('Alpha', 'Beta'), 'a')),
        (TypeError, assign, ('Alpha', ['a', 'b', 'c'])),
    )
    for expected_error, method, arguments in cases:
        self.assertRaises(expected_error, method, *arguments)
def distance_matrix(cls, cluster_list):
    """Return a ladderized neighbor-joining tree for a cluster as Newick text.

    Distances come from ``Distance`` rows whose two accession numbers are
    both in ``cluster_list``; each pair may be stored in either order.

    :param cluster_list: sequence of accession-number strings.
    :returns: the Newick string written by ``Phylo.write``.
    :raises ValueError: if a pair has no stored distance. (Previously
        ``raise ("...")`` tried to raise a plain string, which produced an
        unrelated TypeError instead of this message.)
    """
    print(cluster_list)
    dists = Distance.objects.filter(rep_accnum1__in=cluster_list,
                                    rep_accnum2__in=cluster_list)
    distance_pairs = {
        g.rep_accnum1 + '_' + g.rep_accnum2: g.distance
        for g in dists.all()
    }

    def pair_distance(first, second):
        # A pair can be stored as first_second or second_first.
        for key in (first + '_' + second, second + '_' + first):
            if key in distance_pairs:
                return distance_pairs[key]
        raise ValueError("Error, can't find pair! (%s, %s)" % (first, second))

    # Lower triangle: distances to all earlier items, then 0 on the diagonal.
    matrix = []
    for i in range(len(cluster_list)):
        row = [pair_distance(cluster_list[i], cluster_list[j])
               for j in range(i)]
        row.append(0)
        matrix.append(row)

    cluster_list = [s.encode('ascii', 'ignore') for s in cluster_list]
    matrix_obj = _DistanceMatrix(names=cluster_list, matrix=matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(matrix_obj)
    tree.ladderize()

    output = StringIO.StringIO()
    Phylo.write(tree, output, 'newick')
    return output.getvalue()
def get_local_tree(chrom, start, end, vcf_fn, samples=None, outgroup=None,
                   plot=False):
    """Build a neighbor-joining tree for a region of a VCF.

    Pairwise differences are obtained from ``hap.get_pairwise_diff``. The
    tree is optionally rooted with ``outgroup``, ladderized, and stripped
    of inner-node names. With ``plot`` set, a copy of the tree (with the
    outgroup pruned, if one was given) is drawn on a new matplotlib figure.

    :returns: tuple ``(pairwise difference table, tree)``.
    """
    has_outgroup = outgroup is not None

    pwd = hap.get_pairwise_diff(vcf_fn, chrom=chrom, start=start, end=end,
                                samples=samples, chunksize=30000)
    # Lower triangle including the diagonal.
    distance_triangular = []
    for i in range(len(pwd)):
        distance_triangular.append(list(pwd.values[i, :i + 1]))

    dm = _DistanceMatrix(names=list(pwd.columns), matrix=distance_triangular)
    tree = DistanceTreeConstructor().nj(dm)
    if has_outgroup:
        tree.root_with_outgroup(outgroup)
    tree.ladderize()
    for inner in tree.get_nonterminals():
        inner.name = None

    # Plotting copy without the outgroup; the returned tree keeps it.
    tree_no = copy.deepcopy(tree)
    if has_outgroup:
        tree_no.prune(outgroup)

    if plot:
        plt.figure(figsize=(15, 50))
        ax = plt.gca()
        Phylo.draw(tree_no, axes=ax, do_show=False)
        for side in ('left', 'right', 'top'):
            ax.spines[side].set_visible(False)
        ax.set_yticks([])
        ax.set_ylabel('')

    return pwd, tree
def build_tree(dist_matrix, names_list, clust):
    """Build a ``Tree`` from a distance matrix with the chosen clustering.

    :param dist_matrix: full distance matrix.
    :param names_list: labels, one per row of ``dist_matrix``.
    :param clust: ``'nj'`` (built with ``nj`` on a ``DistanceMatrix``) or
        ``'upgma'`` (built with Biopython on the condensed matrix, with
        inner-node names removed).
    :returns: a ``Tree`` parsed from the Newick text of the result.
    :raises SystemExit: with status 1 for any other ``clust`` value. The
        original exited with status 0, which reported success to the shell
        although the run was aborted.
    """
    if clust == 'nj':
        dm = DistanceMatrix(dist_matrix, names_list)
        tree_scikit = nj(dm,result_constructor=str)
        return Tree(tree_scikit)

    if clust == 'upgma':
        dm = _DistanceMatrix(names=names_list, matrix=condense_matrix(dist_matrix))
        constructor = DistanceTreeConstructor()
        tree_biopython = constructor.upgma(dm)
        # remove InnerNode names
        for i in tree_biopython.get_nonterminals():
            i.name = None
        output = StringIO()
        Phylo.write(tree_biopython,output, "newick")
        return Tree(output.getvalue())

    print("Unknown tree clustering method ! Aborting")
    sys.exit(1)
def ParseMatrix(filename):
    """Parse a FASTA-like lower-triangular matrix file.

    Lines starting with ``>`` are names; every other non-empty line is a
    whitespace-separated row of floats. Names are replaced by their
    zero-based index (as a string) and the index-to-name mapping is written
    to ``args.out + '.map'`` as tab-separated lines.

    :param filename: path of the matrix file.
    :returns: a ``_DistanceMatrix`` whose names are the index strings.
    """
    mat_names = []  # FASTA headers
    lt_matrix = []  # lower triangular matrix
    # Plain 'r': the 'U' flag was removed in Python 3.11. Carriage returns
    # are stripped explicitly so Windows line endings still work on
    # Python 2.
    with open(filename, 'r') as MAT:
        for l in MAT:
            l = l.rstrip('\r\n')
            if len(l) == 0:
                continue
            elif l[0] == '>':
                mat_names.append(l[1:])
            else:
                lt_matrix.append([float(i) for i in l.split()])

    #Switch to integer headers and print Mapping file
    mat_names_num = []
    with open(args.out + '.map', 'w') as MAP:
        for index, name in enumerate(mat_names):
            MAP.write(str(index) + '\t' + name + '\n')
            mat_names_num.append(str(index))

    #Fill into Biopython distmat Data Structures
    dist_matrix = _DistanceMatrix(names=mat_names_num, matrix=lt_matrix)
    return dist_matrix
def build_tree(dist_matrix, names_list, clust):
    """Cluster a distance matrix into a ``Tree`` using NJ or UPGMA.

    :param dist_matrix: full distance matrix.
    :param names_list: labels, one per row of ``dist_matrix``.
    :param clust: ``'nj'`` or ``'upgma'``.
    :returns: a ``Tree`` built from the Newick text of the clustering
        result; for UPGMA the inner-node names are removed first.
    :raises SystemExit: with status 1 when ``clust`` is not recognised
        (the original aborted with status 0, i.e. "success").
    """
    tree = None
    if clust == 'nj':
        dm = DistanceMatrix(dist_matrix, names_list)
        tree_scikit = nj(dm, result_constructor=str)
        tree = Tree(tree_scikit)
    elif clust == 'upgma':
        dm = _DistanceMatrix(names=names_list,
                             matrix=condense_matrix(dist_matrix))
        constructor = DistanceTreeConstructor()
        tree_biopython = constructor.upgma(dm)
        # remove InnerNode names
        for inner in tree_biopython.get_nonterminals():
            inner.name = None
        output = StringIO()
        Phylo.write(tree_biopython, output, "newick")
        tree = Tree(output.getvalue())
    else:
        print("Unknown tree clustering method ! Aborting")
        # Non-zero status so callers and shells can see the abort.
        sys.exit(1)
    return tree
def D_F_matrix(D_Seq,D_net,final_tree):
    """Average two distance matrices and write the UPGMA tree of the result.

    Each cell of the combined matrix is
    ``0.5*D_net[a, b] + 0.5*D_Seq[a, b]``. Only names present in both
    matrices are used, in the order they appear in ``D_net``.

    The original tested ``key1 in names_Net`` (always true) instead of
    checking ``names_Seq``, so a name missing from ``D_Seq`` produced
    ragged rows or a lookup error; restricting both axes to the shared
    names fixes that and is identical when every name is shared.

    :param D_Seq: ``_DistanceMatrix`` of sequence distances.
    :param D_net: ``_DistanceMatrix`` of network distances.
    :param final_tree: path the Newick tree is written to.
    :returns: the combined ``_DistanceMatrix``.
    """
    names_Seq = set(D_Seq.names)
    D_F_names = [name for name in D_net.names if name in names_Seq]

    # Lower triangle including the diagonal over the shared names.
    D_F = []
    for i, key1 in enumerate(D_F_names):
        D_F.append([(0.5*D_net[key1,key2] + 0.5*D_Seq[key1,key2])
                    for key2 in D_F_names[:i+1]])
    print(D_F)

    D_F_final = _DistanceMatrix(D_F_names,D_F)
    constructor = DistanceTreeConstructor()
    tree_D_F = constructor.upgma(D_F_final)
    Phylo.write(tree_D_F,final_tree,'newick')
    return D_F_final
def D_F_matrix(D_Seq,D_net,final_tree, alpha):
    """Blend two distance matrices and write the UPGMA tree of the result.

    Each cell of the combined matrix is
    ``(1-alpha) * D_net[a, b] + alpha * D_Seq[a, b]``. Only names present
    in both matrices are used, in the order they appear in ``D_net``.

    The original tested ``key1 in names_Net`` (always true) instead of
    checking ``names_Seq``, so a name missing from ``D_Seq`` produced
    ragged rows or a lookup error; restricting both axes to the shared
    names fixes that and is identical when every name is shared.

    :param D_Seq: ``_DistanceMatrix`` of sequence distances.
    :param D_net: ``_DistanceMatrix`` of network distances.
    :param final_tree: path the Newick tree is written to.
    :param alpha: weight of ``D_Seq``; can be set to any value between 0
        and 1 to choose how much of each matrix is used.
    :returns: the combined ``_DistanceMatrix``.
    """
    names_Seq = set(D_Seq.names)
    D_F_names = [name for name in D_net.names if name in names_Seq]

    # Lower triangle including the diagonal over the shared names.
    D_F = []
    for i, key1 in enumerate(D_F_names):
        temp_row = []
        for key2 in D_F_names[:i+1]:
            new_val = ((1-alpha) * D_net[key1,key2]) + (alpha * D_Seq[key1,key2])
            temp_row.append(new_val)
        D_F.append(temp_row)
    print(D_F)

    D_F_final = _DistanceMatrix(D_F_names,D_F)
    constructor = DistanceTreeConstructor()
    tree_D_F = constructor.upgma(D_F_final)
    Phylo.write(tree_D_F,final_tree,'newick')
    return D_F_final
def _write_ascii_trees(upgma_tree, neighbor_tree):
    """Write both trees as ASCII art to the language-distance report files."""
    with open(r"reports/language_distances/upgma_tree.txt", 'w') as f:
        Phylo.draw_ascii(upgma_tree, f)
    with open(r"reports/language_distances/neighbor_tree.txt", 'w') as f:
        Phylo.draw_ascii(neighbor_tree, f)


def build_tree(languages, lang_dist):
    '''
    Builds a tree and prints it to a specified location.

    Asks on standard input where the UPGMA and neighbor-joining trees
    should go: 1 = console (the UPGMA tree is also drawn graphically),
    2 = text files, 3 = console and text files.

    languages: list of language names; encoded to UTF-8 in place.
    lang_dist: square distance matrix as a list of rows; truncated in
        place to its lower triangle (row i keeps i + 1 entries).
    '''
    print("""
    Where should the Tree be Printed:
    (1) Console
    (2) Text File
    (3) Both
    """)
    # Python 2's input() evaluates whatever the user types as code; read
    # the raw text instead and parse it as an integer.
    try:
        read_line = raw_input
    except NameError:  # Python 3
        read_line = input
    try:
        user_in = int(read_line())
    except ValueError:
        user_in = 0  # falls through to the "not a valid option" message

    #Build and print the tree
    if user_in > 0 and user_in < 4:
        #encode the strings in languages
        for i in range(len(languages)):
            languages[i] = codecs.encode(languages[i], 'utf-8')
        #get the lower triangle matrix format
        for i in range(len(lang_dist)):
            lang_dist[i] = lang_dist[i][:i + 1]

        dist_matrix = _DistanceMatrix(languages, lang_dist)
        tree_constructor = DistanceTreeConstructor()
        upgma_tree = tree_constructor.upgma(dist_matrix)
        neighbor_tree = tree_constructor.nj(dist_matrix)

        if upgma_tree is not None and neighbor_tree is not None:
            #Draw to the console
            if user_in == 1:
                print("upgma tree:\n")
                Phylo.draw_ascii(upgma_tree)
                Phylo.draw(upgma_tree)
                print("\nneighbor joining tree:\n")
                Phylo.draw_ascii(neighbor_tree)
            #draw to the files only
            elif user_in == 2:
                _write_ascii_trees(upgma_tree, neighbor_tree)
            #draw to the files and the console
            elif user_in == 3:
                print("upgma tree:\n")
                Phylo.draw_ascii(upgma_tree)
                print("\nneighbor joining tree:\n")
                Phylo.draw_ascii(neighbor_tree)
                _write_ascii_trees(upgma_tree, neighbor_tree)
        else:
            print("Run a comparison to generate the distance matrix first\n")
    else:
        print("That is not a valid option. Please choose a valid option\n")
from Bio import Phylo
from Bio.Phylo.TreeConstruction import _DistanceMatrix
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from io import StringIO
import re


# hamming distance
def hamming(seq1, seq2):
    """Return the number of positions at which seq1 and seq2 differ.

    Positions are compared pairwise with zip, so any excess tail of the
    longer sequence is ignored.
    """
    # assert len(seq1) == len(seq2), 'unequal reads!'
    return sum(a != b for a, b in zip(seq1, seq2))


# First line: species names. Remaining lines: one character row per line;
# zip(*rows) transposes them so that table[k] is the character string of
# species k.
with open('rosalind_chbp.txt') as f:
    species = f.readline().rstrip().split()
    table = [''.join(i) for i in zip(*f.read().rstrip().split())]

n = len(table)

# For the Phylo.TreeConstruction to work, integers in the distance matrix
# must be Python int and not numpy.int64
dm = [[hamming(table[i], table[j]) for j in range(i+1)] for i in range(n)]

constructor = DistanceTreeConstructor()
tree = constructor.nj(_DistanceMatrix(names=species, matrix=dm))

handle = StringIO()
Phylo.write(tree, handle, format='newick', plain=True)
result = handle.getvalue()
# Strip the automatically generated inner-node labels.
result = re.sub('Inner[0-9]+', '', result)

with open('rosalind_chbp_sub.txt', 'wt') as out:
    out.write(result)
l += 1 dist = {} with open(blasttable) as b: for ln in b.read().splitlines(): s = ln.split("\t") if s[0] not in dist: dist[s[0]] = {} dist[s[0]][s[1]] = 1-float(s[2]) ## Distance matrix dist_score_assembly_line = generate_distance_matrix(pathways, domain_names, annotation, dist, Jaccardw, GKw, DDSw, AIw, scale=1, nbhood=3, outfile=outfile) #-- Plot the tree names = pathways.keys() score = [s for s in open(outfile, 'r').read().split('\n')[1:] if s != ''] matrix = [] for i in range(len(score)): input_i = score[i].split(',')[1:(i+2)] input_i_int = [float(n) for n in input_i] matrix.append(input_i_int) m = _DistanceMatrix(names, matrix) constructor = DistanceTreeConstructor() tree1 = constructor.upgma(m) #Bio.Phylo.draw_ascii(tree1) Bio.Phylo.write(tree1, tree_outfile, 'newick')
if((i-1)==j): row.append(0) else: row.append(np.linalg.norm(scaled_ps[j]-scaled_ps[i-1])) dist_mat.append(row) return dist_mat if __name__ == "__main__": names = [] sequences = [] with open("kmercoded.fasta", "r") as handle: for record in SeqIO.parse(handle, "fasta"): seq = record.seq N = len(seq) c = Counter(seq) if(c['$']==N): continue names.append(record.description) sequences.append(record.seq) pspec = power_spectrum(sequences) scaled_ps = linear_scaling(pspec) #cubic_scaling(pspec) dist_mat = get_euclidean_distance(scaled_ps) constructor = DistanceTreeConstructor() distance_matrix_10 = _DistanceMatrix(names=names[0:10], matrix=dist_arr[0:10]) tree_upgma_10 = constructor.upgma(distance_matrix_10) Phylo.draw(tree_upgma_10) tree_nj_10 = constructor.nj(distance_matrix_10) Phylo.draw(tree_nj_10)
def compute_tree(options, mat, names): """ make upgma hierarchical clustering and write it as png and graphviz dot """ # oops, convert to biopython matrix matrix = [] for i in xrange(len(names)): row = [] for j in xrange(i + 1): # tree constructor writes 0-distances as 1s for some reason # so we hack around here val = float(mat[names[i]][names[j]]) if val == 0.: val = 1e-10 elif val == 1.: val = 1.1 row.append(val) matrix.append(row) dm = _DistanceMatrix(names, matrix) # upgma tree constructor = DistanceTreeConstructor() tree = constructor.upgma(dm) robust_makedirs(os.path.dirname(tree_path(options))) Phylo.write(tree, tree_path(options), "newick") # png tree -- note : doesn't work in toil def f(x): if "Inner" in str(x): return "" else: return x Phylo.draw_graphviz(tree, label_func = f, node_size=1000, node_shape="s", font_size=10) pylab.savefig(tree_path(options).replace("newick", "png")) # graphviz # get networkx graph nxgraph = Phylo.to_networkx(tree) # make undirected nxgraph = nx.Graph(nxgraph) # push names to name labels nxgraph = nx.convert_node_labels_to_integers(nxgraph, label_attribute="label") for node_id in nxgraph.nodes(): node = nxgraph.node[node_id] if "Inner" in str(node["label"]): node["label"] = "\"\"" node["width"] = 0.001 node["height"] = 0.001 else: node["fontsize"] = 18 for edge_id in nxgraph.edges(): edge = nxgraph.edge[edge_id[0]][edge_id[1]] # in graphviz, weight means something else, so make it a label weight = float(edge["weight"]) # undo hack from above if weight > 1: weight = 1. if weight <= 1e-10 or weight == 1.: weight = 0. edge["weight"] = None edge["label"] = "{0:.3g}".format(float(weight) * 100.) edge["fontsize"] = 14 edge["len"] = draw_len(weight) nx.write_dot(nxgraph, tree_path(options).replace("newick", "dot"))
def generate_tree_dendro(G_ref,H,outfile): length = len(H) length = length - 2 ''' tree = dendropy.Tree(stream=StringIO.StringIO(str(H[length])),schema="newick") for i in range(length-1,-1,-1): (u,v) = H[i] temp_tree = dendropy.Tree(stream=StringIO.StringIO(str(H[i])),schema="newick") current_parent = filter(lambda x: x.taxon.label == str(u), [y for y in tree.leaf_nodes()]) temp_tree_nodes = [x for x in temp_tree.nodes()] current_parent[0].add_child(temp_tree_nodes[1]) current_parent[0].add_child(temp_tree_nodes[2]) print(tree.as_ascii_plot()) ''' (u,v) = H[length] counter = 1 #[999]((Taxon1:0.3,Taxon2:0.14):0.5,(Taxon3:0.34, Taxon4:0.5):0.12); string_root = "(" + "Taxon" + str(u) + ":" + str(1) + "," + "Taxon" + str(v) + ":" + str(1) + ")" tree = dendropy.Tree(stream=StringIO.StringIO(string_root),schema="newick") for i in range(length-1,-1,-1): (u,v) = H[i] string_temp = "(" + "Taxon" + str(u) + ":" + str(1) + "," + "Taxon" + str(v) + ":" + str(1) + ")" temp_tree = dendropy.Tree(stream=StringIO.StringIO(string_temp),schema="newick") current_parent = filter(lambda x: x.taxon.label == "Taxon" + str(u), [y for y in tree.leaf_nodes()]) temp_tree_nodes = [x for x in temp_tree.nodes()] current_parent[0].add_child(temp_tree_nodes[1]) current_parent[0].add_child(temp_tree_nodes[2]) current_parent[0].taxon.label = current_parent[0].oid #print tree out = tree.as_string('newick') pdm = treecalc.PatristicDistanceMatrix(tree) T = [t1 for i, t1 in enumerate(tree.taxon_set)] T.sort(key=lambda x: x.label) out = out.replace('[&U]','[50]') ## 50 unit long sequences open(outfile,'w').write(out) # The first T produced by the history print(tree.as_ascii_plot()) print "New tree data structure" D_net_dic = {} D_net_ret = {} D_net = [] for u in G_ref: D_net_dic[u] = {} for u in sorted(G_ref): print "size" key1 = "Taxon" + str(u) tmp_row = [] for v in sorted(G_ref): key2 = "Taxon" + str(v) if u < v: continue D_net_dic[u][v] = 1.0 / (pdm(T[int(u)-1],T[int(v)-1])+1) 
tmp_row.append(D_net_dic[u][v]) print D_net_dic[u][v], D_net.append(tmp_row) print '\n' names = [] for u in G_ref: names.append('Taxon'+str(u)) print names print D_net D_net_final = _DistanceMatrix(names,D_net) return D_net_final
def dmc_delorean_plain(G,G_ref,qmod,qcon,outfile): """ Reconstructs the network using the dmc model and delorean algorithm. """ # Make initial pairwise likelihoods. #target = open(hisfile,'w') global H1 L = {} for u in G: L[u] = {} for u in G: for v in G: if u >= v: continue L[u][v] = G.dmc_likelihood(u,v,qmod,qcon) level_counter = 0 while (G.num_nodes >= 2): # at least two nodes in the graph. # Get largest Luv. L_list = [] L_prob = -10000000000 for u in G: for v in G: if u >= v: continue Luv = L[u][v] if Luv > L_prob: L_list = [(u,v)] L_prob = Luv elif Luv == L_prob: L_list.append((u,v)) # Choose random pair; assign random daddy. pair = random.choice(L_list) (u,v) = (pair[0],pair[1]) if random.random() > 0.5 else (pair[1],pair[0]) # Nodes whose likelihood values need to be computed. altered = (G.neighbors(u) | G.neighbors(v) | set([u])) - set([v]) # Prepare to delete v: add new edges in symmetric difference of v to u. for neighbor in G.neighbors(v): if u == neighbor: continue # Don't add self-edge. elif v == neighbor: continue # Don't add, will remove v anyways. elif G.has_edge(u,neighbor): continue # Edge already exists. else: G.add_edge(u,neighbor) G.remove_node(v) H1.append((u,v)) print "%s\t%s" %(u,v) # Fix up altered Luv values. 
for x in altered: for y in G: if x == y: continue L[min(x,y)][max(x,y)] = G.dmc_likelihood(x,y,qmod,qcon) last_node = G.nodes()[0] H1.append((last_node,last_node)) print "%s\t%s" %(last_node,last_node) length = len(H1) length = length - 2 ''' tree = dendropy.Tree(stream=StringIO.StringIO(str(H[length])),schema="newick") for i in range(length-1,-1,-1): (u,v) = H[i] temp_tree = dendropy.Tree(stream=StringIO.StringIO(str(H[i])),schema="newick") current_parent = filter(lambda x: x.taxon.label == str(u), [y for y in tree.leaf_nodes()]) temp_tree_nodes = [x for x in temp_tree.nodes()] current_parent[0].add_child(temp_tree_nodes[1]) current_parent[0].add_child(temp_tree_nodes[2]) print(tree.as_ascii_plot()) ''' (u,v) = H1[length] counter = 1 #[999]((Taxon1:0.3,Taxon2:0.14):0.5,(Taxon3:0.34, Taxon4:0.5):0.12); string_root = "(" + "Taxon" + str(u) + ":" + str(1) + "," + "Taxon" + str(v) + ":" + str(1) + ")" tree = dendropy.Tree(stream=StringIO.StringIO(string_root),schema="newick") for i in range(length-1,-1,-1): (u,v) = H1[i] string_temp = "(" + "Taxon" + str(u) + ":" + str(1) + "," + "Taxon" + str(v) + ":" + str(1) + ")" temp_tree = dendropy.Tree(stream=StringIO.StringIO(string_temp),schema="newick") current_parent = filter(lambda x: x.taxon.label == "Taxon" + str(u), [y for y in tree.leaf_nodes()]) temp_tree_nodes = [x for x in temp_tree.nodes()] current_parent[0].add_child(temp_tree_nodes[1]) current_parent[0].add_child(temp_tree_nodes[2]) current_parent[0].taxon.label = current_parent[0].oid #print tree out = tree.as_string('newick') pdm = treecalc.PatristicDistanceMatrix(tree) T = [t1 for i, t1 in enumerate(tree.taxon_set)] T.sort(key=lambda x: x.label) #t = Tree(out.replace('[&U]','')) out = out.replace('[&U]','[50]') ## 50 unit long sequences open(outfile,'w').write(out) # The first T produced by the history print(tree.as_ascii_plot()) print "New tree data structure" D_net_dic = {} D_net_ret = {} D_net = [] for u in G_ref: D_net_dic[u] = {} for u in sorted(G_ref): 
print "size" key1 = "Taxon" + str(u) tmp_row = [] for v in sorted(G_ref): key2 = "Taxon" + str(v) if u < v: continue D_net_dic[u][v] = 1.0 / (pdm(T[int(u)-1],T[int(v)-1])+1) tmp_row.append(D_net_dic[u][v]) print D_net_dic[u][v], D_net.append(tmp_row) print '\n' names = [] for u in G_ref: names.append('Taxon'+str(u)) print names print D_net D_net_final = _DistanceMatrix(names,D_net) return D_net_final
def MakePlot(x, org_names, ckm30, ckm50, outgroup, outfile, outfilexml, sum_x):
    """Build an NJ tree from two common-kmer matrices and render it with abundance bubbles.

    x          -- abundance vector; read in strides of len(org_names), so entry
                  i + k*len(org_names) belongs to organism i at level k
                  (k = 0 is the organism itself, k >= 1 the hypothetical nodes).
    org_names  -- organism names; NOTE: modified in place when it contains
                  duplicates (see below).
    ckm30, ckm50 -- square matrices indexed [i, j]; presumably numpy arrays
                  of common-kmer counts -- TODO confirm.
    outgroup   -- name to root the tree on, used only if present in the names.
    outfile    -- image path handed to t.render.
    outfilexml -- path the PhyloXML export is written to.
    sum_x      -- value shown (first 8 characters of its str) in the legend.
    """
    #Make sure names are unique
    # `names` is the SAME list object as org_names, so the renaming below also
    # changes org_names; later lookups through org_names rely on that.
    names = org_names
    for name in names:
        if names.count(name)>1:
            temp_name = name
            i=1
            for dummy in range(0,names.count(name)-1): #Don't change the last one, just to make sure we don't conflict with the outgroup
                names[names.index(temp_name)] = temp_name + "_" + str(i)
                i = i +1

    #Normalize the x vector
    # NOTE(review): relies on map() returning a list (Python 2); x is indexed
    # and sliced further down.
    x = map(lambda y: y/sum(x),x)
    ckm30_norm = np.multiply(ckm30,1/np.diag(ckm30))
    ckm50_norm = np.multiply(ckm50,1/np.diag(ckm50))
    num_rows = ckm30_norm.shape[0]
    num_cols = ckm30_norm.shape[1]
    # Lower-triangular distance matrix: average over both k-mer sizes of
    # 1 minus the symmetrised normalised entry.
    matrix=list()
    for i in range(num_rows):
        matrix.append([.5*(1-.5*ckm30_norm[i,j]-.5*ckm30_norm[j,i])+.5*(1-.5*ckm50_norm[i,j]-.5*ckm50_norm[j,i]) for j in range(i+1)])

    #Make the list of distances (ave of the two ckm matrices)
    ckm_ave_train = .5*ckm30_norm+.5*ckm50_norm
    ckm_ave_train_dist = dict()
    for i in range(len(org_names)):
        ckm_ave_train_dist[org_names[i]] = [.5*ckm_ave_train[i,j]+.5*ckm_ave_train[j,i] for j in range(len(org_names))]

    #Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
    dm = _DistanceMatrix(names, matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    # Round-trip through newick to obtain the Tree object used for plotting.
    t=Tree(tree.format('newick'),format=1)
    #tree.format('newick')
    #Phylo.draw_ascii(tree)

    #Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
    #Function to insert a node at a given distance
    def insert_node(t, name_to_insert, insert_above, dist_along):
        # Splits the branch above `insert_above`: the new node sits
        # `dist_along` below the parent and the detached node is re-attached
        # beneath it with the remaining length.
        insert_at_node = t.search_nodes(name=insert_above)[0]
        parent = (t&insert_above).up
        orig_branch_length = t.get_distance(insert_at_node,parent)
        if orig_branch_length < dist_along:
            raise ValueError("error: dist_along larger than orig_branch_length in PlotPackage.py")
        removed_node = insert_at_node.detach()
        removed_node.dist = orig_branch_length - dist_along
        added_node = parent.add_child(name=name_to_insert, dist=dist_along)
        added_node.add_child(removed_node)

    #Function to insert a node some % along a branch, taking into account the ckm distances and nodes already created in the NJ tree (and what distance their descendants are from everyone else)
    def insert_hyp_node(t, leaf_name, percent, ckm_ave_train_dist, org_names):
        dists = map(lambda y: abs(y-percent), ckm_ave_train_dist[leaf_name])
        nearby_indicies = list()
        #Add all the organisms that are within 0.05 of the given percent
        # for i in range(len(dists)):
        #     if dists[i]<=.05:
        #         nearby_indicies.append(i)
        nearby_names = list()
        #If there are no nearby indicies, add the closest organism to the given percent
        # With the loop above commented out, nearby_indicies is always empty,
        # so only the first branch runs.
        if nearby_indicies==[]:
            nearby_names.append(org_names[dists.index(min(dists))])
        else:
            # NOTE(review): currently unreachable; it indexes org_names with
            # the loop counter rather than nearby_indicies[i] -- confirm
            # before re-enabling the block above.
            for i in range(len(nearby_indicies)):
                nearby_names.append(org_names[i])
        mean_dist = np.mean(map(lambda y: ckm_ave_train_dist[leaf_name][org_names.index(y)],nearby_names))
        nearby_names.append(leaf_name)
        LCA = t.get_common_ancestor(nearby_names)
        LCA_to_leaf_dist = t.get_distance(LCA,leaf_name)
        #divide the dist to the right/left of the LCA node by the number of percentage points in there
        if LCA.name==t.name:
            percent_dist = percent*LCA_to_leaf_dist
            if mean_dist <= percent:
                child_node = (t&leaf_name)
            else:
                child_node = (t&nearby_names[0])#This means "go up from root" in the direction of the nearest guy
            ancestor_node = (t&child_node.name).up
        elif mean_dist <= percent:
            percent_dist = t.get_distance(LCA) + abs(percent-mean_dist)*(LCA_to_leaf_dist)/(1-mean_dist)
            child_node = (t&leaf_name)
            ancestor_node = (t&child_node.name).up
        else:
            percent_dist = t.get_distance(LCA) - abs(percent-mean_dist)*(t.get_distance(LCA))/(mean_dist)
            child_node = (t&leaf_name)
            ancestor_node = (t&child_node.name).up
        # Walk up until the ancestor is no further from the root than the
        # target distance; the new node goes on the branch below it.
        while t.get_distance(t.name, ancestor_node) > percent_dist:
            child_node = ancestor_node
            ancestor_node = (t&child_node.name).up
        insert_node(t, leaf_name+"_"+str(percent), child_node.name, percent_dist-t.get_distance(t.name, ancestor_node))

    #Set outgroup
    if outgroup in names:
        t.set_outgroup(t&outgroup) #I will need to check that this outgroup is actually one of the names...
    else:
        print("WARNING: the chosen outgroup " + outgroup + " is not in the given taxonomy: ")
        print(names)
        print("Proceeding without setting an outgroup. This may cause results to be uninterpretable.")

    #Insert hypothetical nodes
    # Maps inserted node name -> [base organism name, cutoff value, cutoff index].
    hyp_node_names = dict()
    cutoffs = [.9,.8,.7,.6,.5,.4,.3,.2,.1]
    # Each nominal cutoff is passed through a fixed cubic polynomial.
    cutoffs = [-.5141*(val**3)+1.0932*(val**2)+0.3824*val for val in cutoffs]
    for i in range(len(org_names)):
        # Every len(org_names)-th entry of x starting at i; xi[0] is the
        # organism itself, xi[j] (j >= 1) pairs with cutoffs[j-1].
        xi = x[i:len(x):len(org_names)]
        for j in range(1,len(cutoffs)+1):
            if xi[j]>0:
                insert_hyp_node(t, org_names[i], cutoffs[j-1],ckm_ave_train_dist, org_names)
                hyp_node_names[org_names[i]+"_"+str(cutoffs[j-1])] = [org_names[i], cutoffs[j-1], j-1] #in case there are "_" in the file names

    size_factor=250
    font_size=55

    #Now put the bubbles on the nodes
    def layout(node):
        # Layout callback assigned to ts.layout_fn below; draws a circle whose
        # radius is size_factor * sqrt(abundance).
        node_style = NodeStyle()
        node_style["hz_line_width"] = 10
        node_style["vt_line_width"] = 10
        node.set_style(node_style)
        #print(node)
        if node.is_leaf():
            if node.name in org_names:
                #make reconstructed bubble
                size = x[org_names.index(node.name)]
                F = CircleFace(radius=size_factor*math.sqrt(size), color="RoyalBlue", style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F,node, 0, position="branch-right")
                #Denote that this was a training organism
                nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
                faces.add_face_to_node(nameFace, node, 0, position="branch-right")
        elif node.name in hyp_node_names: #Otherwise it's a hypothetical node, just use recon x
            node_base_name = hyp_node_names[node.name][0]
            percent = hyp_node_names[node.name][1]
            if node_base_name in org_names:
                idx = hyp_node_names[node.name][2]
                # Same stride layout as xi above: level idx+1 of this organism.
                size = x[org_names.index(node_base_name)+(idx+1)*len(org_names)]
                F = CircleFace(radius=size_factor*math.sqrt(size), color="RoyalBlue", style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F,node, 0, position="branch-right")
                #This is if I want the names of the hypothetical nodes to be printed as well
                #nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
                #faces.add_face_to_node(nameFace, node, 0, position="branch-right")
            else:
                size=0
        else:
            size=0

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.mode = "r"
    #ts.mode = "c"
    ts.scale = 2*1000
    ts.show_leaf_name = False
    ts.min_leaf_separation = 50
    # Legend: a reference bubble plus two text lines.
    F = CircleFace(radius=.87*size_factor, color="RoyalBlue", style="sphere")
    F.border.width = None
    F.opacity = 0.6
    ts.legend.add_face(F,0)
    ts.legend.add_face(TextFace(" Inferred relative abundance",fsize=1.5*font_size,fgcolor="Blue"),1)
    ts.legend.add_face(TextFace(" Total absolute abundance depicted " + str(sum_x)[0:8], fsize=1.5*font_size,fgcolor="Black"),1)
    ts.legend_position=4
    #t.show(tree_style=ts)
    t.render(outfile, w=550, units="mm", tree_style=ts)

    #Render the XML file
    project = Phyloxml()
    phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
    project.add_phylogeny(phylo)
    # NOTE(review): the file object passed here is never explicitly closed.
    project.export(open(outfilexml,'w'))
# rosalind_ba7b
'''
Limb Length Problem

Find the limb length for a leaf in a tree.

Given: An integer n, followed by an integer j between 0 and n - 1, followed by
a space-separated additive distance matrix D (whose elements are integers).

Return: The limb length of the leaf in Tree(D) corresponding to row j of this
distance matrix (use 0-based indexing).
'''
import numpy as np
from Bio.Phylo.TreeConstruction import _DistanceMatrix
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

# Read n, the leaf index j and the n x n matrix; the context manager closes
# the input file once everything has been read.
with open('rosalind_ba7b.txt') as f:
    n = int(f.readline().rstrip())
    j = int(f.readline().rstrip())
    D = np.fromfile(f, sep=' ', dtype=int).reshape(n, n)

#For the Phylo.TreeConstruction to work, integers must be Python int and not numpy.int64
# Lower-triangular rows (diagonal included).  The loop variables are called
# row/col on purpose: on Python 2 list-comprehension variables leak into the
# enclosing scope, and a comprehension variable named j would overwrite the
# leaf index read above.
dm = [[int(D[row, col]) for col in range(row + 1)] for row in range(n)]
names = [str(idx) for idx in range(n)]

constructor = DistanceTreeConstructor()
tree = constructor.nj(_DistanceMatrix(names, dm))

# Leaves are named after their row index, so leaf j is found by str(j).
print(round(tree.find_any(str(j)).branch_length))
t_ = open('taxas.txt') taxas_ = [] for line in t_: taxas_.append((line.rsplit()[0])) t_.close() dist_ = np.genfromtxt('distance_.log') dist1_ = np.tril(dist_) # Make list of lists dist2_ = [] for x in range(0, np.shape(dist1_)[0]): dist2_.append(list(dist1_[x][0:x + 1])) mm_ = _DistanceMatrix(taxas_, dist2_) root_ = DistanceTreeConstructor() tree = root_.nj(mm_) path_ = os.path.dirname(os.path.realpath(__file__)) path_break = path_.split('/') dir_ = path_break[len(path_break) - 1] Phylo.write(tree, str(dir_) + '.nex', 'newick') ############################################################################################################################ g_ = open('final.nex.tree', 'a') with open(dir_ + '.p1.nex', 'r') as f_: for line in f_: g_.write(line) g_.write('\nBEGIN TREES;\nTREE tree1 =') with open(dir_ + '.nex', 'r') as f_: for line in f_:
def construct_distance_matrix(names, bit_vectors):
    """Wrap the pairwise values computed from ``bit_vectors`` in a _DistanceMatrix.

    The values come from get_similarity_from_bit_vectors(bit_vectors) and are
    labelled with ``names``.
    """
    # NOTE(review): the helper is named "similarity" but its output is used as
    # the distance matrix -- confirm it returns distances in the
    # lower-triangular layout _DistanceMatrix expects.
    pairwise_values = get_similarity_from_bit_vectors(bit_vectors)
    return _DistanceMatrix(names, pairwise_values)
def block_jsd(genome_ffp_vector, output_file):
    """Compute a block-wise JSD distance matrix between genomes and build an NJ tree.

    genome_ffp_vector -- path to a whitespace-separated text file.  The first
        field of each line is a block name; the genome name is taken from its
        second '.'-separated part, up to the first '-'.  The remaining fields
        are the block's vector (parsed as floats).  Blank lines are skipped.
    output_file -- path the tab-separated distance matrix is written to, with
        the genome names as header.

    For every pair of genomes (A, B) each block of A is compared with every
    block of B and the minimum JSD is kept; the same is done from B to A.
    The distance is the mean of the two per-genome averages of those minima.

    The neighbour-joining tree is printed as ASCII and written to
    'nj_ffp_jsd_tree.newick'.  Nothing is returned.
    """
    # genome name -> list of block vectors (lists of floats)
    genomes = collections.defaultdict(list)
    with open(genome_ffp_vector, "r") as vector_file:
        for raw_line in vector_file:
            fields = raw_line.strip().split()
            if not fields:
                continue
            genome_name = fields[0].split(".")[1].split("-")[0]
            genomes[genome_name].append([float(value) for value in fields[1:]])

    # A real list: it fixes the row/column order and is what
    # _DistanceMatrix expects for its names.
    names = list(genomes)
    final_jsd_matrix = np.zeros(shape=(len(names), len(names)))

    def _mean_min_jsd(blocks_from, blocks_to):
        # Average, over the blocks of one genome, of the smallest JSD to any
        # block of the other genome.
        minima = [
            min(JSD(np.array(src), np.array(dst)) for dst in blocks_to)
            for src in blocks_from
        ]
        return sum(minima) / len(minima)

    # Pairwise comparison.  Results are kept in plain lists instead of being
    # routed through string keys that are later split on "-" / "_"; that
    # parsing dropped every genome whose name contains an underscore and left
    # its distances at 0.
    for (idx_a, name_a), (idx_b, name_b) in combinations(enumerate(names), 2):
        blocks_a = genomes[name_a]
        blocks_b = genomes[name_b]
        final_jsd = (_mean_min_jsd(blocks_a, blocks_b) +
                     _mean_min_jsd(blocks_b, blocks_a)) / 2
        final_jsd_matrix[idx_a, idx_b] = final_jsd
        final_jsd_matrix[idx_b, idx_a] = final_jsd

    #Write final results
    np.savetxt(output_file, final_jsd_matrix, fmt="%.18e", delimiter="\t",
               newline="\n", header="\t".join(names), footer="", comments="")

    # Lower-triangular list of lists (diagonal included), one row per genome.
    new_jsd_matrix = [
        row.tolist()[:row_index + 1]
        for row_index, row in enumerate(final_jsd_matrix)
    ]
    jsd_distance_lowertriange = _DistanceMatrix(names, new_jsd_matrix)

    #construct nj phylogenetic tree
    constructor = DistanceTreeConstructor()
    nj_tree = constructor.nj(jsd_distance_lowertriange)
    Phylo.draw_ascii(nj_tree)
    #and write tree in newick format
    Phylo.write(nj_tree, 'nj_ffp_jsd_tree.newick', "newick")
def main():
    """Command-line entry point: pick genomes and build a phylogeny.

    Without --fast the snippy -> snippy-core -> masking -> gubbins -> raxml
    chain is run through shell commands.  With --fast a mash distance matrix
    is turned into a neighbour-joining tree written to "my_tree.tree".

    Relies on names that are not defined in this function (OUTPUT_DIR,
    fasta_extensions, mask_map_script, adjust_size_script, force_max,
    genome_dir and the helper functions called below) -- presumably
    module-level; verify.
    """
    ### Main Arg Parse ###
    parser = argparse.ArgumentParser(
        description="Automated Phylogeny Builder v1")
    parser.add_argument(
        '-d',
        '--indir',
        help="Input Directory: Directory of FASTA files to analyze")
    parser.add_argument('-o', '--out', help="Output Directory", required=True)
    parser.add_argument('-t',
                        '--threads',
                        help="Number of max threads to use (default=1)",
                        default="1")
    parser.add_argument(
        '-b',
        '--mash_db',
        help="Provide prebuilt mash DB, otherwise build from scratch")
    parser.add_argument(
        '-f',
        '--fast',
        help="Fast option for distance based neighbor joining tree",
        action="store_true")
    parser.add_argument(
        '-m',
        '--max_num',
        help="Maximum number of isolates to include (default=50)",
        default="50")
    parser.add_argument(
        '-g',
        '--genomes',
        help=
        "Provide genome directory to build tree with instead of automatically picking, requires -r flag"
    )
    parser.add_argument(
        '-r',
        '--reference',
        help=
        "Required with -g flag; provide reference to use for phylogeny when providing genome directory"
    )
    parser.add_argument('-s',
                        '--snippy',
                        help="existing snippy dir, requires -g and -r")
    parser.add_argument(
        '-p',
        '--proj_name',
        help=
        "project prefix - will be used to label all files associated with project",
        required=True)
    args = vars(parser.parse_args())
    start_time = time.time()

    ### Print Args ###
    print("Running with the following parameters:")
    for arg in args:
        print(arg, ":", args[arg])

    ### Set Output (Create if doesn't exist already) ###
    set_output(args["out"])

    ### Initialize variables ###
    # Several of these (q_dict, sketches_dict, sketches, sketch_info,
    # results_dict, thresholds, start_time) are not read again in this function.
    automatic_selection = True
    threads = args["threads"]
    q_dict = {}
    sketches_dict = {}
    sketches = []
    sketch_info = {}
    results_dict = {}
    thresholds = {}
    error_dict = {}
    temp_dir = tempfile.mkdtemp()
    project_name = args["proj_name"]
    dir_flag = False
    mash_assembly_list = []
    max_num = int(args["max_num"])
    # A reference is only needed for the non-fast (SNP based) pipeline.
    if args["fast"]:
        need_ref = False
    else:
        need_ref = True
    if args["mash_db"]:
        mash_db = args["mash_db"]
    if args["indir"]:
        input_dir = args["indir"]
    query_assemblies = []

    # Argument validation.  error() is presumably expected to terminate the
    # program -- TODO confirm; otherwise execution continues past these checks.
    if args["snippy"]:
        if not args["genomes"]:
            error_dict[
                "snippy dir provided without genome dir, exiting"] = "Input error: "
            error(error_dict)
        if not args["reference"]:
            error_dict[
                "snippy dir provided without reference, exiting"] = "Input error: "
            error(error_dict)
        automatic_selection = False
    if args["genomes"] and args["reference"]:
        # A user-supplied genome directory replaces the -d input directory.
        input_dir = args["genomes"]
        reference = args["reference"]
        dir_flag = True
        automatic_selection = False
    if args["genomes"] and not args["reference"]:
        error_dict[
            "Genome dir provided without reference, exiting"] = "Input error: "
        error(error_dict)
    if args["reference"] and not args["genomes"]:
        error_dict[
            "Reference provided without genome directory, exiting"] = "Input error: "
        error(error_dict)

    # NOTE(review): input_dir is only bound when -d or (-g and -r) was given;
    # otherwise the next loop raises a NameError.
    in_file_counter = 0
    for in_file in os.listdir(input_dir):
        in_file_path = os.path.join(input_dir, in_file)
        query_assemblies.append(in_file_path)
        in_file_counter += 1
    # NOTE(review): raises ZeroDivisionError when the input directory is empty.
    max_num_per_query = (max_num - in_file_counter) / in_file_counter
    query_sketch = mash_sketch_list(threads, query_assemblies, OUTPUT_DIR,
                                    project_name, temp_dir)
    if need_ref:
        ref_path = pick_reference(query_sketch, threads)
    if not args["mash_db"]:
        # No prebuilt DB: sketch every assembly path returned by the API.
        bmgap_data = call_bmgap_api()
        for record in bmgap_data:
            mash_assembly_list.append(bmgap_data[record]["assembly_path"])
        mash_db = mash_sketch_list(threads, mash_assembly_list, OUTPUT_DIR,
                                   project_name, temp_dir)
    if automatic_selection:
        # NOTE(review): force_max is not assigned anywhere in this function.
        final_genomes = pick_genomes(query_sketch, mash_db, args["threads"],
                                     int(max_num_per_query), force_max)
        if need_ref:
            final_genomes["ref"] = ref_path
            print(ref_path)
    else:
        final_genomes = {
            "all_genomes": [],
            "details": {},
            "ref": args["reference"]
        }
        # Collect every file whose name contains one of the fasta extensions,
        # each path at most once.
        for infile in os.listdir(args["genomes"]):
            for ext in fasta_extensions:
                if ext in infile:
                    infile_path = os.path.join(args["genomes"], infile)
                    if infile_path not in final_genomes["all_genomes"]:
                        final_genomes["all_genomes"].append(infile_path)
                        continue
    #pp.pprint(final_genomes)
    if not args["fast"]:
        if not args["snippy"]:
            snippy_dir = run_snippy(final_genomes, args["threads"],
                                    query_assemblies, dir_flag)
        else:
            # Reuse an existing snippy directory and re-run the entries that
            # snippy_check returns.
            snippy_dir = args["snippy"]
            redo_list = snippy_check(snippy_dir)
            for obj in redo_list:
                print(obj)
                for genome in os.listdir(input_dir):
                    print(genome)
                    if obj in genome:
                        print("found")
                        # NOTE(review): genome_dir is not assigned in this
                        # function (input_dir is what was listed) -- confirm.
                        redo_obj = os.path.join(genome_dir, genome)
                        call_snippy(reference, redo_obj)
        # NOTE(review): the commands below are built with str.format and run
        # with shell=True; paths containing spaces or shell metacharacters
        # will be interpreted by the shell.
        call([
            "snippy-core --prefix={}_core --aformat=fasta {}/*".format(
                project_name, snippy_dir)
        ],
             shell=True)
        p2 = Popen(["mv {}_core* {}".format(project_name, snippy_dir)],
                   shell=True)
        p2.wait()
        p3 = Popen([
            "python3 {} {}/{}_core.full.aln -o {}".format(
                mask_map_script, snippy_dir, project_name, OUTPUT_DIR)
        ],
                   shell=True)
        p3.wait()
        masked_aln_file = "{}/{}_core.full_masked.aln".format(
            OUTPUT_DIR, project_name)
        partition_file = "{}/{}_core.full_partition.txt".format(
            OUTPUT_DIR, project_name)
        print("gubbins")
        p4 = Popen([
            "run_gubbins.py -c {} -i 10 -u -p {}/gubbins_masked -v -t raxml {}"
            .format(args["threads"], OUTPUT_DIR, masked_aln_file)
        ],
                   shell=True)
        p4.wait()
        gubbins_phylip_file = "{}/gubbins_masked.filtered_polymorphic_sites.phylip".format(
            OUTPUT_DIR)
        p5 = Popen([
            "python3 {} {} {}".format(adjust_size_script, gubbins_phylip_file,
                                      partition_file)
        ],
                   shell=True)
        p5.wait()
        abs_output = os.path.abspath(OUTPUT_DIR)
        print("raxml")
        p6 = Popen([
            "raxmlHPC-PTHREADS -s {} -w {} -n {}_out --asc-cor=stamatakis -q {} -m GTRGAMMAX -T {} -N autoMRE -p 6420662893125220392 -f a -x 7125452922221827660"
            .format(gubbins_phylip_file, abs_output, project_name,
                    partition_file, args["threads"])
        ],
                   shell=True)
        p6.wait()
    else:
        mash_matrix = make_mash_matrix(threads, final_genomes["all_genomes"],
                                       OUTPUT_DIR, project_name, temp_dir)
        # with open("test_out.txt","w") as f:
        #     f.write(mash_matrix)
        # Parse the tab-separated matrix text: the first non-blank line holds
        # the names; from every later line columns 1 .. i-1 are kept, with i
        # growing by one per row, which yields a lower-triangular matrix
        # (column 0 is skipped -- presumably the row label).
        i = 2
        matrix = []
        names = []
        firstLine = True
        mash_matrix_lines = mash_matrix.split("\n")
        for line in mash_matrix_lines:
            if line.strip() != "":
                if firstLine:
                    print(line)
                    current_names = line.split("\t")
                    for obj in current_names:
                        if len(obj) > 0:
                            names.append(obj)
                    firstLine = False
                else:
                    sub_matrix = []
                    values = line.split("\t")
                    for q in range(1, i):
                        val = float(values[q])
                        sub_matrix.append(val)
                    matrix.append(sub_matrix)
                    i += 1
        #print(names)
        #print(len(names),len(matrix))
        print("building tree")
        dm = _DistanceMatrix(names, matrix)
        constructor = DistanceTreeConstructor(method="nj")
        tree = constructor.nj(dm)
        Phylo.write([tree], "my_tree.tree", "newick")
def main(argv):
    """Plot a neighbour-joining tree with abundance bubbles (PlotNJTree.py entry point).

    argv -- option list handed to getopt (see the usage strings below).

    Reads the x vector from the -i file (one float per line) and
    Taxonomy.txt, FileNames.txt and the 30-mer / 50-mer common-kmer HDF5
    matrices from the -D directory; builds an NJ tree from the averaged,
    symmetrised distances; inserts hypothetical nodes along the
    root-to-leaf paths; then renders the tree to the -o file and exports
    PhyloXML to the -x file.
    """
    input_file=''
    title='Title'
    label_internal_nodes = False
    label_leaves = False
    out_file=''
    width=750
    out_file_xml=''
    plot_rectangular = False
    common_kmer_data_path=''
    taxonomic_names_on_leaves = False
    # NOTE(review): in the getopt spec "h:" makes -h require an argument, and
    # every long option ends in "=" so it requires a value as well, including
    # the ones used as flags below -- confirm this is intended.
    try:
        opts, args = getopt.getopt(argv,"h:i:lnrto:w:x:D:",["Help=","InputCommonKmerXFile=","LabelLeaves=", "LabelInternalNodes=","Rectangular=", "TaxonomicNamesOnLeaves=", "OutFile=","Width=","OutFileXML=","CommonKmerDataPath="])
    except getopt.GetoptError:
        print 'Unknown option, call using: ./PlotNJTree.py -i <InputCommonKmerXFile> -D <CommonKmerDataPath> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -r <RectangularPlotFlag> -t <TaxonomicNamesOnLeavesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print './PlotNJTree.py -i <InputCommonKmerXFile> -D <CommonKmerDataPath> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -r <RectangularPlotFlag> -t <TaxonomicNamesOnLeavesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>'
            sys.exit(2)
        elif opt in ("-i", "--InputCommonKmerXFile"):
            input_file = arg
        elif opt in ("-l", "--LabelLeaves"):
            label_leaves = True
        elif opt in ("-n","--LabelInternalNodes"):
            label_internal_nodes = True
        elif opt in ("-o", "--OutFile"):
            out_file = arg
        elif opt in ("-w", "--Width"):
            width = int(arg)
        elif opt in ("-x", "--OutFileXML"):
            out_file_xml = arg
        elif opt in ("-D", "--CommonKmerDataPath"):
            common_kmer_data_path = arg
        elif opt in ("-r", "--Rectangular"):
            plot_rectangular = True
        elif opt in ("-t", "--TaxonomicNamesOnLeaves"):
            taxonomic_names_on_leaves = True

    #Read in the x vector
    # NOTE(review): x is indexed and sliced below, which relies on map()
    # returning a list (Python 2).
    fid = open(input_file,'r')
    x = map(lambda y: float(y),fid.readlines())
    fid.close()

    #Normalize the x vector
    #x = map(lambda y: y/sum(x),x)

    #Read in the taxonomy
    taxonomy = list()
    fid = open(os.path.join(common_kmer_data_path,"Taxonomy.txt"),'r')
    for line in fid:
        # First whitespace-separated token of the line, minus everything up
        # to and including its first "_".
        taxonomy.append('_'.join(line.split()[0].split("_")[1:])) #Just take the first line of the taxonomy (erasing the taxID)
    fid.close()

    #Read in the basis for the ckm matrices
    x_file_names = list()
    fid = open(os.path.join(common_kmer_data_path,"FileNames.txt"),'r')
    for line in fid:
        x_file_names.append(os.path.basename(line.strip()))
    fid.close()

    #Read in the common kmer matrix
    f=h5py.File(os.path.join(common_kmer_data_path,'CommonKmerMatrix-30mers.h5'),'r')
    ckm30=np.array(f['common_kmers'],dtype=np.float64)
    f.close()
    f=h5py.File(os.path.join(common_kmer_data_path,'CommonKmerMatrix-50mers.h5'),'r')
    ckm50=np.array(f['common_kmers'],dtype=np.float64)
    f.close()
    ckm30_norm = np.multiply(ckm30,1/np.diag(ckm30))
    ckm50_norm = np.multiply(ckm50,1/np.diag(ckm50))
    num_rows = ckm30_norm.shape[0]
    num_cols = ckm30_norm.shape[1]
    names = x_file_names
    # Lower-triangular distance matrix: average over both k-mer sizes of
    # 1 minus the symmetrised normalised entry.
    matrix=list()
    for i in range(num_rows):
        matrix.append([.5*(1-.5*ckm30_norm[i,j]-.5*ckm30_norm[j,i])+.5*(1-.5*ckm50_norm[i,j]-.5*ckm50_norm[j,i]) for j in range(i+1)])

    #Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
    dm = _DistanceMatrix(names, matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    # Round-trip through newick to obtain the Tree object used for plotting.
    t=Tree(tree.format('newick'),format=1)
    #tree.format('newick')
    #Phylo.draw_ascii(tree)

    #Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
    #Function to insert a node at a given distance
    def insert_node(t, name_to_insert, insert_above, dist_along):
        # Splits the branch above `insert_above`: the new node sits
        # `dist_along` below the parent and the detached node is re-attached
        # beneath it with the remaining length.
        insert_at_node = t.search_nodes(name=insert_above)[0]
        parent = (t&insert_above).up
        orig_branch_length = t.get_distance(insert_at_node,parent)
        if orig_branch_length < dist_along:
            raise ValueError("error: dist_along larger than orig_branch_length")
        removed_node = insert_at_node.detach()
        removed_node.dist = orig_branch_length - dist_along
        added_node = parent.add_child(name=name_to_insert, dist=dist_along)
        added_node.add_child(removed_node)

    #Function to insert a node some % along a branch
    def insert_hyp_node(t, leaf_name, percent):
        # Target position: `percent` of the root-to-leaf distance, measured
        # from the root.
        total_dist = t.get_distance(t.name,leaf_name)
        percent_dist = percent*total_dist
        child_node = (t&leaf_name)
        ancestor_node = (t&child_node.name).up
        # Walk up until the ancestor is no further from the root than the
        # target; the new node goes on the branch below it.
        while t.get_distance(t.name, ancestor_node) > percent_dist:
            child_node = ancestor_node
            ancestor_node = (t&child_node.name).up
        insert_node(t, leaf_name+"_"+str(percent), child_node.name, percent_dist-t.get_distance(t.name, ancestor_node))

    #Insert hypothetical nodes
    # Maps inserted node name -> [base file name, cutoff value, cutoff index].
    hyp_node_names = dict()
    cutoffs = [.9,.8,.7,.6,.5,.4,.3,.2,.1]
    cutoffs = map(lambda y: y**1.5,cutoffs)
    for i in range(len(x_file_names)):
        # Every len(x_file_names)-th entry of x starting at i; xi[0] is the
        # organism itself, xi[j] (j >= 1) pairs with cutoffs[j-1].
        xi = x[i:len(x):len(x_file_names)]
        for j in range(1,len(cutoffs)+1):
            if xi[j]>0:
                insert_hyp_node(t, x_file_names[i], cutoffs[j-1])
                hyp_node_names[x_file_names[i]+"_"+str(cutoffs[j-1])] = [x_file_names[i], cutoffs[j-1], j-1] #in case there are "_" in the file names
                #insert_hyp_node(t, x_file_names[i],.5/t.get_distance(t.name,t&x_file_names[i])*cutoffs[j])

    #Now put the bubbles on the nodes
    def layout(node):
        # Layout callback assigned to ts.layout_fn below; draws a circle of
        # radius 500 * sqrt(abundance).
        #print(node)
        if node.is_leaf():
            if node.name in x_file_names:
                #make reconstructed bubble
                size = x[x_file_names.index(node.name)]
                F = CircleFace(radius=500*math.sqrt(size), color="RoyalBlue", style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F,node, 0, position="branch-right")
                if taxonomic_names_on_leaves:
                    nameFace = AttrFace("name", fsize=25, fgcolor='black',text_suffix="_"+taxonomy[x_file_names.index(node.name)])
                    faces.add_face_to_node(nameFace, node, 0, position="branch-right")
                else:
                    nameFace = AttrFace("name", fsize=25, fgcolor='black')
                    faces.add_face_to_node(nameFace, node, 0, position="branch-right")
        elif node.name in hyp_node_names: #Otherwise it's a hypothetical node, just use recon x
            node_base_name = hyp_node_names[node.name][0]
            percent = hyp_node_names[node.name][1]
            if node_base_name in x_file_names:
                idx = hyp_node_names[node.name][2]
                # Same stride layout as xi above: level idx+1 of this organism.
                size = x[x_file_names.index(node_base_name)+(idx+1)*len(x_file_names)]
                F = CircleFace(radius=500*math.sqrt(size), color="RoyalBlue", style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F,node, 0, position="branch-right")
                #print node
                #print size
            else:
                size=0
        else:
            size=0
        #print(size)

    # NOTE(review): label_leaves, label_internal_nodes and width are parsed
    # above; only width is used below.
    ts = TreeStyle()
    ts.layout_fn = layout
    if plot_rectangular:
        ts.mode = "r"
    else:
        ts.mode = "c"
    ts.show_leaf_name = False
    ts.min_leaf_separation = 50

    #Export the tree to a png image
    t.render(out_file, w=width, units="mm", tree_style=ts)

    #Export the xml file
    project = Phyloxml()
    phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
    phylo.phyloxml_phylogeny.set_name(title)
    project.add_phylogeny(phylo)
    # NOTE(review): the file object passed here is never explicitly closed.
    project.export(open(out_file_xml,'w'))