def reroot_tree_with_outgroup(tree_name, outgroups):
    """Re-root the Newick tree stored at *tree_name* and rewrite it in place.

    The file is rewritten twice: first by Bio.Phylo after rooting on the
    outgroup, then by DendroPy after ``deroot()``, with all single-quote
    characters stripped from the final Newick string.

    :param tree_name: path of the Newick tree file; it is overwritten.
    :param outgroups: candidate outgroup taxa, handed to
        ``GubbinsCommon.get_monophyletic_outgroup`` to obtain the taxa
        actually used for rooting.
    """
    clade_outgroups = GubbinsCommon.get_monophyletic_outgroup(tree_name, outgroups)
    outgroups = [{'name': taxon_name} for taxon_name in clade_outgroups]

    tree = Phylo.read(tree_name, 'newick')
    tree.root_with_outgroup(*outgroups)
    Phylo.write(tree, tree_name, 'newick')

    tree = dendropy.Tree.get_from_path(tree_name, 'newick', preserve_underscores=True)
    tree.deroot()
    tree.update_splits()
    output_tree_string = tree.as_string(
        'newick',
        taxon_set=None,
        suppress_leaf_taxon_labels=False,
        suppress_leaf_node_labels=True,
        suppress_internal_taxon_labels=False,
        suppress_internal_node_labels=False,
        suppress_rooting=True,
        suppress_edge_lengths=False,
        unquoted_underscores=True,
        preserve_spaces=False,
        store_tree_weights=False,
        suppress_annotations=True,
        annotations_as_nhx=False,
        suppress_item_comments=True,
        node_label_element_separator=' ',
        node_label_compose_func=None
    )
    # The previous code ended with `output_file.closed`, which only reads an
    # attribute and never closes the file; the context manager guarantees the
    # data is flushed and the handle released.
    with open(tree_name, 'w+') as output_file:
        output_file.write(output_tree_string.replace('\'', ''))
def convert_boottrees(fname_trees):
    """Split a multi-tree Newick file into one Newick file per tree.

    Tree number ``i`` (0-based, in file order) is written to
    ``<fname_trees>.codeml-<i>``.

    :param fname_trees: path of the Newick file holding the trees.
    :return: list of the file names written, in input order.
    """
    written = []
    for index, boot_tree in enumerate(Phylo.parse(fname_trees, "newick")):
        target = "%s.codeml-%d" % (fname_trees, index)
        Phylo.write(boot_tree, target, "newick")
        written.append(target)
    return written
def phylo2newick(self, t):
    """Serialise the Bio.Phylo tree *t* and return it as a Newick string."""
    buffer = StringIO()
    Phylo.write(t, buffer, 'newick')
    newick_text = buffer.getvalue()
    return newick_text
def buildTree(FASTAFile):
    """Build a neighbour-joining tree from a FASTA alignment, in RLR form.

    The alignment is read, an identity distance matrix is computed, a
    neighbour-joining tree is built, rooted at its midpoint and drawn with
    ``Phylo.draw``. The tree is then serialised to plain Newick, turned into
    a nested Python literal and handed to ``NewicktoRLR``.

    :param FASTAFile: path (or handle) of the FASTA alignment.
    :return: whatever ``NewicktoRLR`` returns for the nested structure.
    """
    myAlignment = AlignIO.read(FASTAFile, "fasta")

    # Compute a distance matrix and construct the tree.
    calculator = DistanceCalculator("identity")
    myMatrix = calculator.get_distance(myAlignment)
    constructor = DistanceTreeConstructor()
    njTree = constructor.nj(myMatrix)
    njTree.root_at_midpoint()
    Phylo.draw(njTree)

    # Convert the Bio.Phylo tree to a nested Python structure by writing it
    # to a Newick buffer and evaluating that text as a literal. Tip names are
    # wrapped in double quotes first so they read back as strings.
    for clade in njTree.get_terminals():
        clade.name = "\"" + clade.name + "\""
    buf = cStringIO.StringIO()
    Phylo.write(njTree, buf, 'newick', plain=True)
    tree = buf.getvalue()
    # Drop the auto-generated "Inner<N>" labels and the terminating semicolon
    # so only the parenthesised structure remains.
    tree = re.sub(r'Inner\d*', '', tree)
    tree = tree.replace(";", "")
    tree = literal_eval(tree)

    # RLR tree required for maxParsimony function
    tree = NewicktoRLR(tree)
    return tree
def print_newick_tree(self, root_tee):
    """
    Convert an ElementTree into a ladderized Newick tree.

    Labels matching ``a<digits>`` are renumbered ``a1, a2, ...`` in order of
    first appearance in the ladderized string.

    :param root_tee: ElementTree whose root is passed to
        ``self.export_newick_tree``.
    :return: the normalised Newick string.
    """
    newick = self.export_newick_tree(root_tee.getroot())

    # load into Phylo so we can sort the tree (i.e. ladderize)
    tree = Phylo.read(StringIO(newick), 'newick')
    tree.ladderize()

    # export the tree back to a string
    fout = StringIO()
    Phylo.write(tree, fout, 'newick')
    newick = fout.getvalue()

    # remove the branch lengths
    newick = newick.replace(':0.00000', '').strip()

    # get the order of admix nodes in the tree
    order = list(OrderedDict.fromkeys(re.findall(r'a\d+', newick)))

    # normalise the node numbering in a single pass; replacing labels one at
    # a time with str.replace corrupted labels sharing a prefix (e.g. "a1"
    # inside "a10")
    renumbered = {old: 'n%s' % (i + 1) for i, old in enumerate(order)}
    newick = re.sub(r'a\d+', lambda match: renumbered[match.group(0)], newick)

    # replace n0 with a0 (to preserve the existing cache)
    newick = re.sub(r'n([0-9]+)', r'a\1', newick)

    return newick
def annotate_cOTU_tree(cOTU_tree_string,results_list):
    """Annotate a cOTU tree with the FDR p-value of each tested node.

    For every entry of *results_list*, the entry's ``'s_nodes'`` subtree is
    parsed, the common ancestor of its tips is located in the main tree, and
    that ancestor's ``confidence`` is set to ``float(entry['fdr_p'])``.

    :param cOTU_tree_string: Newick string of the full tree.
    :param results_list: iterable of dicts with ``'s_nodes'`` and ``'fdr_p'``.
    :return: the annotated tree as a Newick string.
    """
    from Bio import Phylo
    from StringIO import StringIO

    annotated = Phylo.read(StringIO(cOTU_tree_string), 'newick', rooted=True)
    for result in results_list:
        subtree_newick = load_de_numericized_newick_tree(
            result['s_nodes'], before="cOTU_", after="")
        subtree = Phylo.read(StringIO(subtree_newick), 'newick', rooted=True)
        # Identify the node by the set of tip names beneath it.
        tip_specs = [{"name": tip.name} for tip in subtree.get_terminals()]
        ancestor = annotated.common_ancestor(tip_specs)
        ancestor.confidence = float(result['fdr_p'])

    sink = StringIO()
    Phylo.write(annotated, sink, 'newick')
    return sink.getvalue()
def main():
    """Read the Eukaryota subtree and node list, prepare the tree, save it.

    Appends ``[id, originaltag, finaltag]`` for every non-empty CSV row to
    the module-level ``nodelist`` and updates ``CURRENT_TIME`` through
    ``print_time`` after each stage.
    """
    global START_TIME
    global CURRENT_TIME
    global nodelist

    def banner(text):
        # Stage headers are printed in green.
        print(colored(text, "green"))

    banner("---------------- read tree ----------------")
    subtree_path = './data/subtree/Eukaryota.tre'
    tree = Phylo.read(subtree_path, 'newick')
    CURRENT_TIME = print_time(CURRENT_TIME)

    banner("---------------- read nodelist ----------------")
    nodelist_path = './data/nodelist/Eukaryota-castor.csv'
    # columns:    0        1           2        3       4          5
    # nodelist - [id, originaltag, finaltag, depth, heights, nr_children]
    with open(nodelist_path, 'r') as csv_file:
        rows = csv.reader(csv_file, delimiter=',')
        next(rows, None)  # skip the header
        for row in rows:
            if row == []:
                continue
            nodelist.append([row[0], row[1], row[2]])
    CURRENT_TIME = print_time(CURRENT_TIME)

    banner("---------------- prepare tree ----------------")
    prepare_tree(tree.clade)

    banner("---------------- Save tree ----------------")
    Phylo.write(tree, './results/Eukaryota_tree-castor.tre', 'newick')
    CURRENT_TIME = print_time(CURRENT_TIME)

    banner("--------------------------------")
    return
def tree(option, opt, value, parser):
    """Convert a tree file between formats while renaming its tips.

    The signature follows the optparse callback convention; only *value* and
    ``parser.values`` are used.

    * ``parser.values.convert == False``: tips are renamed ``seq1, seq2, ...``
      and the tree is written to ``parser.values.outputfile``.
    * ``parser.values.convert == True``: a name dictionary is loaded from
      ``parser.values.dictfile`` and each tip is renamed through it before
      the tree is written.

    Any other value of ``convert`` (e.g. ``None``) writes nothing.

    :param value: pair ``(input_format, output_format)`` for Bio.Phylo.
    :param parser: parser whose ``values`` carry ``inputfile``,
        ``outputfile``, ``convert`` and ``dictfile``.
    """
    inputfile = parser.values.inputfile
    outputfile = parser.values.outputfile
    inputtype = str(value[0])
    outputtype = str(value[1])
    phylo_tree = Phylo.read(inputfile, inputtype)
    records = phylo_tree.get_terminals()

    # Explicit comparisons are kept so that a value that is neither true nor
    # false (such as None) still selects neither branch.
    if parser.values.convert == False:
        # convert sequences to small names
        # NOTE(review): the mapping is built but not saved; the code that
        # wrote it to "<outputfile>.dict" was disabled in the original.
        namedict = dict()
        for x, clade in enumerate(records, 1):
            newname = 'seq%i' % x
            namedict[newname] = clade.name
            clade.name = newname
        Phylo.write(phylo_tree, outputfile, outputtype)
    elif parser.values.convert == True:
        # use dictionary to convert taxon back to large names
        with open(parser.values.dictfile, "r") as dicthandle:
            dict_text = dicthandle.read()
        # NOTE(review): eval() executes arbitrary code from the dict file;
        # only use dictionary files from a trusted source.
        namedict = eval(dict_text)
        for clade in records:
            clade.name = namedict[clade.name]
        Phylo.write(phylo_tree, outputfile, outputtype)
def trim_tree(absenteeList, TreeFile):
    """Prune taxa absent from a sequence file out of a Newick tree.

    The tree is read from *TreeFile*, every taxon in *absenteeList* is
    pruned, progress is printed (with ASCII sketches of the tree), and the
    result is written in Newick format to the module-level
    ``TreeOutFileName``.

    :param absenteeList: taxa to prune from the tree.
    :param TreeFile: path of the Newick tree file.
    """
    # Every print is a single-argument call so the output is the same under
    # Python 2 and Python 3.
    print("\nReading the Tree...")
    # parse the tree using Phylo
    tree = Phylo.read(TreeFile, 'newick')
    print("Here is the starting tree:")
    Phylo.draw_ascii(tree)
    terminals = tree.get_terminals()
    print("\nFound the following {} taxa in the tree:".format(len(terminals)))
    print(terminals)

    # prune away taxa that are not included for this sequence file
    for taxon in absenteeList:
        print("Removing absent")
        tree.prune(taxon)
    print("\nPruned away these species:")
    print(absenteeList)
    print("\nHere is the tree with the missing taxa pruned away:\n")
    Phylo.draw_ascii(tree)

    # Unless you have a clock, PAML requires an unrooted tree, i.e. one with a
    # trifurcation at the first node. The step that collapsed the first
    # internal node of a bifurcating root was disabled in the original, so the
    # pruned tree is written as is.
    print("\nOutputting the following revised tree for the species content of the sequence file")
    print("it should have a trifurcation at the base unless you are using a clock\n")
    Phylo.draw_ascii(tree)
    Phylo.write(tree, TreeOutFileName, "newick")
def writeTree(filename, tree, format_str='newick'):
    """ Write a tree to file using Biopython.

    :arg filename: name for output file
    :type filename: str

    :arg tree: the tree to be written
    :type tree: :class:`~Bio.Phylo.BaseTree.Tree`

    :arg format_str: a string specifying the format for the tree
    :type format_str: str

    :raises ImportError: if Biopython's ``Bio.Phylo`` cannot be imported
    :raises TypeError: if an argument is not of the type listed above
    """
    try:
        from Bio import Phylo
    except ImportError:
        raise ImportError('Phylo module could not be imported. '
                          'Reinstall ProDy or install Biopython '
                          'to solve the problem.')

    if not isinstance(filename, str):
        raise TypeError('filename should be a string')
    if not isinstance(tree, Phylo.BaseTree.Tree):
        raise TypeError('tree should be a Biopython.Phylo Tree object')
    if not isinstance(format_str, str):
        raise TypeError('format_str should be a string')

    Phylo.write(tree, filename, format_str)
def root_tree_with_outgroup(input_file, output_file, outgroup):
    """Root a Newick tree on a named outgroup and write it to *output_file*.

    Rooting is best-effort: if rooting or writing fails, a message naming
    *input_file* is printed and no exception is propagated. Errors while
    reading *input_file* are not caught.

    :param input_file: path of the Newick tree to read.
    :param output_file: path the rooted tree is written to.
    :param outgroup: name of the outgroup taxon.
    """
    input_tree = Phylo.read(input_file, 'newick')
    try:
        input_tree.root_with_outgroup({'name': outgroup})
        Phylo.write(input_tree, output_file, 'newick')
    except Exception:
        # Catch Exception rather than everything so that Ctrl-C and
        # SystemExit are not reported as rooting failures.
        print('Could not root', input_file)
def save_treetime_results(self):
    """Write all result files into ``self._root_dir`` and zip them.

    JSON files for the web interface are produced first, then the
    downloadable files (tree, alignment, metadata, molecular clock, GTR).
    Finally the files are collected into one zip archive, stored under
    their bare file names.
    """
    from Bio import Align

    # files to be displayed in the web interface
    self._tree_to_json()
    self._likelihoods_to_json()

    # files to be downloaded as .zip archive
    tree_path = os.path.join(self._root_dir, out_tree_nwk)
    Phylo.write(self.tree, tree_path, 'newick')
    self._save_alignment()
    self._save_metadata_to_csv()
    self._save_molecular_clock_to_csv()
    self._save_gtr()

    # zip all results to one file (the input config, in_cfg, is deliberately
    # left out, as in the original)
    archive_path = os.path.join(self._root_dir, zipname)
    with zipfile.ZipFile(archive_path, 'w') as out_zip:
        members = (
            out_tree_nwk,
            out_aln_fasta,
            out_metadata_csv,
            out_tree_json,
            out_mol_clock_csv,
            out_likelihoods_json,
            out_gtr,
        )
        for member in members:
            out_zip.write(os.path.join(self._root_dir, member), arcname=member)
def process_fasta(args):
    """Rename alignment records and tree tips, write both, record the paths.

    Reads the tab-separated metadata, drops rows whose date is in a fixed
    list of ambiguous dates, adds a ``numeric_date`` column, renames the
    aligned sequences via ``rename_aln`` and the tips of the rescaled initial
    tree via the returned name map. The renamed files are written under
    ``args.base_path``.

    :param args: namespace providing ``metadata``, ``aln``, ``ph``,
        ``initTree`` and ``base_path``.
    :return: *args* with ``finalAln`` and ``finalTree`` set to the new paths.
    """
    metadata = pd.read_csv(args.metadata, sep='\t')

    # calculate numeric date; rows with ambiguous dates are removed first
    ambiguous_dates = {'2019', '2020', '2020-01', '2020-02', '2020-03',
                       '2020-01-XX', '2020-02-XX', '2020-03-XX', '2020-04-XX'}
    has_exact_date = ~metadata['date'].isin(ambiguous_dates)
    metadata = metadata[has_exact_date]
    parsed_dates = pd.to_datetime(metadata['date'])
    metadata['numeric_date'] = parsed_dates.apply(numeric_date)

    records = list(SeqIO.parse(args.aln, "fasta"))
    aligned, names = rename_aln(records, args.ph, metadata)

    tree = Phylo.read(args.initTree, 'newick')
    tree = rescale_tree(tree)  # TODO WHY DO WE NEED THIS
    for tip in tree.get_terminals():
        tip.name = names[tip.name]

    aln_name = args.aln.split('/')[-1].replace('.fasta', '')
    out_prefix = args.base_path+'/'+aln_name
    renamed_fasta_path = out_prefix+'renamed.fasta'
    renamed_tree_path = out_prefix+'renamed.newick'

    with open(renamed_fasta_path, 'w') as out_fasta:
        SeqIO.write(aligned, out_fasta, 'fasta')
    with open(renamed_tree_path, 'w') as out_tree:
        Phylo.write(tree, out_tree, 'newick')

    args.finalAln = renamed_fasta_path
    args.finalTree = renamed_tree_path
    return args
def build_phylogeny_trees():
    """Align every homologous gene file and build a UPGMA tree from it.

    Each file in ``out/homologous_gene_sequences/`` is aligned with a Clustal
    Omega command line run through ``os.system``. An identity distance matrix
    is computed from the alignment, a UPGMA tree is built, drawn graphically
    and as ASCII, and written in Nexus format to ``out/phylogenetic_trees/``.
    """
    source_dir = "out/homologous_gene_sequences/"
    aligned_dir = "out/aligned_homologous_gene_sequences/"

    for gene_file in os.listdir(source_dir):
        unaligned_path = source_dir + gene_file
        aligned_path = aligned_dir + gene_file

        clustal_omega = ClustalOmegaCommandline(
            infile=unaligned_path, outfile=aligned_path, verbose=True, auto=True)
        os.system(str(clustal_omega))

        alignment = AlignIO.read(aligned_path, 'fasta')

        # Distance Matrix
        distances = DistanceCalculator('identity').get_distance(alignment)
        upgma_tree = DistanceTreeConstructor().upgma(distances)

        Phylo.draw(upgma_tree)
        print('\nPhylogenetic Tree\n', gene_file)
        Phylo.draw_ascii(upgma_tree)

        nexus_path = 'out/phylogenetic_trees/{}_tree.nex'.format(gene_file)
        Phylo.write([upgma_tree], nexus_path, 'nexus')
def main():
    """Run the Monte Carlo parsimony search and report the result.

    Reads the alignment (``args.a``) and initial tree (``args.n``), selects
    the neighbour function (SPR if ``args.spr``, else TBR if ``args.tbr``,
    else NNI), runs ``MonteCarlo``, writes the final tree to ``args.o`` in
    Newick format, then prints and draws both trees and shows a histogram of
    the parsimony scores.
    """
    args = parse_arguments()
    msa = ParsimonyTree.read_msa(args.a)
    i_tree = ParsimonyTree.read_tree(args.n)

    if args.spr:
        nb_f = ParsimonyTree.get_spr_neighbors
    elif args.tbr:
        nb_f = ParsimonyTree.get_tbr_neighbors
    else:
        nb_f = ParsimonyTree.get_nni_neighbors

    mcmc = MonteCarlo(msa, i_tree, nb_f, args.r, args.p)
    f_tree = mcmc.get_tree()
    with open(args.o, "w") as outfile:
        Phylo.write(f_tree, outfile, "newick")

    separator = "\n=========================\n"

    def report(title, shown_tree):
        # Print the heading and score, then draw the tree twice.
        print(separator)
        print(title)
        print("Score:", ParsimonyTree.get_parsimony_score(msa, shown_tree))
        Phylo.draw(shown_tree)
        Phylo.draw_ascii(shown_tree)

    report("Original Tree", i_tree)
    report("Final Tree", f_tree)

    print(separator)
    print("Histogram of Parsimony Scores")
    plt.title("Histogram of Parsimony Scores")
    plt.hist(mcmc.get_scores())
    plt.show()
def phylo_tree_score_otus(input_file, tree, path=RESULT_FOLDER, out_tree='out_tree.txt'):
    """Build a phylogenetic tree from a score matrix file and write it out.

    The input file's first line is the number of rows. Each following row is
    an OTU name, one space, then space-separated float scores; the last
    space-separated field of each row is discarded.

    :param input_file: matrix file name, resolved relative to
        ``RESULT_FOLDER``.
        NOTE(review): *path* is not used for the input - confirm intended.
    :param tree: ``"nj"`` or ``"upgma"``; any other value writes nothing.
    :param path: directory the output tree is written to.
    :param out_tree: output file name (Newick format).
    :raises Exception: "No valid matrix" if the file cannot be parsed;
        "Phylogenetic Tree Generation Error" if tree building or writing
        fails.
    """
    # Build the phylogenetic tree
    score = []
    otus = []
    # The context manager closes the file even when parsing fails.
    with open(os.path.join(RESULT_FOLDER, input_file), "r") as f:
        try:
            line_count = int(f.readline())
            for _ in range(line_count):
                line_read = f.readline().split(" ", 1)
                otus.append(line_read.pop(0))
                pre_score = line_read.pop(0)
                score.append(list(map(float, pre_score.split(" ")[:-1:])))
        except Exception:
            raise Exception("No valid matrix")

    print("Create Phylogenetic Tree...")
    try:
        if tree == "nj":
            print("nj")
            Phylo.write(makeNj(score, otus), os.path.join(path, out_tree), "newick")
        elif tree == "upgma":
            print("upgma")
            Phylo.write(makeUpgma(score, otus), os.path.join(path, out_tree), "newick")
    except Exception:
        raise Exception("Phylogenetic Tree Generation Error")
def phylo2newick(self, t):
    """Serialise the Bio.Phylo tree *t* and return it as a Newick string."""
    buffer = StringIO()
    Phylo.write(t, buffer, "newick")
    newick_text = buffer.getvalue()
    return newick_text
def action(args):
    """Rename the leaves of the first tree in ``args.tree`` and write it out.

    Renaming steps are chained as lazy generators and applied once at the
    end:

    1. if ``args.info`` is given, keep only leaves listed in it and rename
       them ``seqname`` -> ``newname`` (for Newick trees, ``:`` in seqnames
       is matched as ``|``);
    2. if ``args.remove_word`` is given, remove every regex match of it and
       strip surrounding whitespace;
    3. add ``args.add_prefix`` and ``args.add_suffix``;
    4. for Newick trees, replace spaces with underscores.

    Leaves filtered out in step 1 are left untouched but stay in the tree.
    """
    def newname(leaf, newname):
        leaf.name = newname
        return leaf

    # next() works on both Python 2 and 3; the iterator's .next() method
    # exists only on Python 2.
    tree = next(Phylo.parse(args.tree, args.tree_type))
    leafs = (leaf for leaf in tree.get_terminals())

    if args.info:
        info = DictReader(args.info, fieldnames=['seqname', 'newname'])
        info = {i['seqname']: i['newname'] for i in info}

        # for newick trees :s will be replaced by |s
        if args.tree_type == 'newick':
            info = {s.replace(':', '|'): n for s, n in info.items()}

        leafs = (l for l in leafs if l.name in info)
        leafs = (newname(l, info[l.name]) for l in leafs)

    if args.remove_word:
        leafs = (newname(l, re.sub(args.remove_word, '', l.name)) for l in leafs)
        # NOTE(review): the source's indentation was lost; the strip step is
        # assumed to belong to the remove_word branch - confirm.
        leafs = (newname(l, l.name.strip()) for l in leafs)

    leafs = (newname(l, args.add_prefix + l.name) for l in leafs)
    leafs = (newname(l, l.name + args.add_suffix) for l in leafs)

    # do this last
    if args.tree_type == 'newick':
        leafs = (newname(l, l.name.replace(' ', '_')) for l in leafs)

    # execute changes and write tree
    list(leafs)
    Phylo.write(tree, args.out, args.tree_type)
def _calculate_gsi(self):
    """
    Method for calculating Gene Support Indices

    Counts the ``.bcg`` files in the BCG directory, builds a majority
    consensus of the gene trees in ``all_gene.trees`` with a cutoff derived
    from ``config.gsi_threshold`` and that count, prints the tree as ASCII
    and writes it in Newick format to the alignment output directory.

    :return:
    """
    LOGGER.info("Calculating Gene Support Indices (GSIs) from the gene trees..")

    bcg_dir = os.path.join(self._dirpath, self.config.bcg_dir)
    genome_num = sum(1 for entry in os.listdir(bcg_dir) if entry.endswith('.bcg'))

    nwk_file = os.path.join(self._align_output_dir, "all_gene.trees")
    gene_trees = Phylo.parse(nwk_file, 'newick')
    cutoff = (100-self.config.gsi_threshold) * genome_num/100
    consensus = Consensus.majority_consensus(gene_trees, cutoff=cutoff)
    Phylo.draw_ascii(consensus)

    gsi_name = f'UBCG_gsi({self._bcg_num}){self.config.postfixes.align_tree_const}'
    ubcg_gsi_file = os.path.join(self._align_output_dir, gsi_name)
    with open(ubcg_gsi_file, 'w') as handle:
        Phylo.write(consensus, handle, 'newick')
    LOGGER.info("The final tree marked with GSI was written to %s", ubcg_gsi_file)
def reroot_tree_with_outgroup(tree_name, outgroups):
    """Re-root the Newick tree at *tree_name* and rewrite the file in place.

    Bio.Phylo roots the tree on the outgroup returned by
    ``GubbinsCommon.get_monophyletic_outgroup`` and saves it; DendroPy then
    reloads and de-roots it, and the Newick string is written back with all
    single quotes removed.

    :param tree_name: path of the Newick tree file; it is overwritten.
    :param outgroups: candidate outgroup taxa.
    """
    monophyletic = GubbinsCommon.get_monophyletic_outgroup(tree_name, outgroups)
    targets = [{"name": taxon_name} for taxon_name in monophyletic]

    phylo_tree = Phylo.read(tree_name, "newick")
    phylo_tree.root_with_outgroup(*targets)
    Phylo.write(phylo_tree, tree_name, "newick")

    dendro_tree = dendropy.Tree.get_from_path(
        tree_name, "newick", preserve_underscores=True)
    dendro_tree.deroot()
    dendro_tree.update_bipartitions()

    newick_options = dict(
        suppress_leaf_taxon_labels=False,
        suppress_leaf_node_labels=True,
        suppress_internal_taxon_labels=False,
        suppress_internal_node_labels=False,
        suppress_rooting=True,
        suppress_edge_lengths=False,
        unquoted_underscores=True,
        preserve_spaces=False,
        store_tree_weights=False,
        suppress_annotations=True,
        annotations_as_nhx=False,
        suppress_item_comments=True,
        node_label_element_separator=" ",
    )
    newick_text = dendro_tree.as_string(schema="newick", **newick_options)

    with open(tree_name, "w+") as output_file:
        output_file.write(newick_text.replace("'", ""))
def generate_new_files(fname) :
    """Write copies of a FASTA file (and its tree) with short record ids.

    Records are renamed ``flyg1, flyg2, ...`` in file order, with name and
    description cleared, to get gene names that slr can handle (short
    enough). Unless the module-level ``RUN_RAXML`` is set, the matching tree
    file (``fasta`` replaced by ``tree`` in the path) is rewritten with the
    same new ids.

    :param fname: path of the input FASTA file.
    :return: path of the new FASTA file (every ``.`` replaced by ``_.``).
    """
    newfname = fname.replace(".", "_.")

    # generate a fasta file with new ids
    id_map = {}
    renamed_records = []
    for running_id, record in enumerate(SeqIO.parse(fname, 'fasta'), 1):
        short_id = "flyg%s" % running_id
        id_map[record.id] = short_id
        record.id = short_id
        record.name = ""
        record.description = ""
        renamed_records.append(record)
    SeqIO.write(renamed_records, newfname, "fasta")

    if not RUN_RAXML :
        # generate a treefile with new ids
        treefile = fname.replace("fasta", "tree")
        newtreefile = newfname.replace("fasta", "tree")
        tree = Phylo.read(treefile, 'newick')
        for tip in tree.get_terminals():
            tip.name = id_map[tip.name]
        Phylo.write(tree, newtreefile, 'newick')

    return newfname
def build(self, root='midpoint', raxml=True, raxml_time_limit=0.5):
    """Infer a tree from ``self.aln`` with fasttree and, optionally, RAxML.

    Works inside ``self.run_dir`` (created, entered, and removed at the end).
    The final tree file is loaded through ``self.tt_from_file``.

    :param root: passed through to ``self.tt_from_file``.
    :param raxml: if true, refine the fasttree result with RAxML; otherwise
        the fasttree tree is used as is.
    :param raxml_time_limit: hours allowed for the RAxML topology search; a
        value <= 0 skips the topology search.
    """
    from Bio import Phylo, AlignIO
    import subprocess, glob, shutil
    make_dir(self.run_dir)
    os.chdir(self.run_dir)
    # make record names match ids before writing the alignment
    for seq in self.aln: seq.name=seq.id
    AlignIO.write(self.aln, 'temp.fasta', 'fasta')

    # initial tree: "fasttree [-nt] temp.fasta > initial_tree.newick",
    # run through the shell so that the redirection works
    tree_cmd = ["fasttree"]
    if self.nuc: tree_cmd.append("-nt")
    tree_cmd.append("temp.fasta")
    tree_cmd.append(">")
    tree_cmd.append("initial_tree.newick")
    os.system(" ".join(tree_cmd))

    out_fname = "tree_infer.newick"
    if raxml:
        if raxml_time_limit>0:
            # resolve polytomies (at most 11 passes) before handing the
            # tree to RAxML as its starting topology
            tmp_tree = Phylo.read('initial_tree.newick','newick')
            resolve_iter = 0
            resolve_polytomies(tmp_tree)
            while (not tmp_tree.is_bifurcating()) and (resolve_iter<10):
                resolve_iter+=1
                resolve_polytomies(tmp_tree)
            Phylo.write(tmp_tree,'initial_tree.newick', 'newick')
            # NOTE(review): temp.phyx is only written in this branch, yet the
            # branch-length step below reads it for raxml_time_limit <= 0 too
            AlignIO.write(self.aln,"temp.phyx", "phylip-relaxed")
            print( "RAxML tree optimization with time limit", raxml_time_limit, "hours")
            # using exec to be able to kill process
            end_time = time.time() + int(raxml_time_limit*3600)
            process = subprocess.Popen("exec raxml -f d -T " + str(self.nthreads) + " -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True)
            # poll every 10 s until the result file appears or time is up
            while (time.time() < end_time):
                if os.path.isfile('RAxML_result.topology'):
                    break
                time.sleep(10)
            process.terminate()

            # use the final result if present, else the last checkpoint in
            # glob order, else fall back to the initial tree
            checkpoint_files = glob.glob("RAxML_checkpoint*")
            if os.path.isfile('RAxML_result.topology'):
                checkpoint_files.append('RAxML_result.topology')
            if len(checkpoint_files) > 0:
                last_tree_file = checkpoint_files[-1]
                shutil.copy(last_tree_file, 'raxml_tree.newick')
            else:
                shutil.copy("initial_tree.newick", 'raxml_tree.newick')
        else:
            shutil.copy("initial_tree.newick", 'raxml_tree.newick')

        try:
            print("RAxML branch length optimization")
            os.system("raxml -f e -T " + str(self.nthreads) + " -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick")
            # os.system does not raise when raxml fails; a failure shows up
            # here as a missing RAxML_result.branches file
            shutil.copy('RAxML_result.branches', out_fname)
        except:
            print("RAxML branch length optimization failed")
            shutil.copy('raxml_tree.newick', out_fname)
    else:
        shutil.copy('initial_tree.newick', out_fname)
    self.tt_from_file(out_fname, root)
    # NOTE(review): assumes run_dir is a direct child of the starting
    # directory - confirm
    os.chdir('..')
    remove_dir(self.run_dir)
    self.is_timetree=False
def root(tree, clade, filename):
    """
    roots tree in newick format on a single column
    list of outgroup clade names

    Parameters
    ----------
    argv: tree
        newick tree file
    argv: clade
        single column file of outgroup taxa
    argv: filename
        output file name
    """
    # read in tree
    tree = Phylo.read(tree, 'newick')

    # read the outgroup taxon names, one per line; the context manager
    # closes the file, which the original left open
    with open(clade) as clade_file:
        clade = [line.rstrip('\n') for line in clade_file]
    outgroup = [{'name': taxon_name} for taxon_name in clade]

    # Tree.root is the root Clade attribute, not a method, so the original
    # `tree.root(outgroup)` raised TypeError. root_with_outgroup performs the
    # re-rooting.
    tree.root_with_outgroup(*outgroup)
    Phylo.draw_ascii(tree)

    Phylo.write(tree, filename, 'newick')
def tree(from_cluster,to_cluster, grupa):
    """Build bootstrap consensus trees for a range of cluster alignments.

    For each cluster index in ``range(from_cluster, to_cluster)`` the
    alignment ``msa\\msa_rodzina_<i>_s.fasta`` is read, 50 bootstrap NJ trees
    are built, and every bootstrap tree that has an internal node with
    support below 50 is discarded. If any remain, their majority consensus is
    collected. All consensus trees are written in Newick format to
    ``drzewa_wynikowe_<grupa>``.

    Clusters for which tree building fails are skipped silently; errors while
    reading an alignment are not caught.
    """
    consensus_trees = []
    for i in range(from_cluster, to_cluster):
        # backslash escaped explicitly; the resulting path is unchanged
        msa = AlignIO.read('msa\\msa_rodzina_' + str(i)+ '_s.fasta', 'fasta')
        print(i)
        calculator = DistanceCalculator('identity')
        try:
            # result unused; kept because a failure here skips the cluster
            calculator.get_distance(msa)
            constructor = DistanceTreeConstructor(calculator, 'nj')
            trees_list = list(bootstrap_trees(msa, 50, constructor))

            not_included = set()
            for j, target_tree in enumerate(trees_list):
                support_tree = get_support(target_tree, trees_list)
                for node in support_tree.get_nonterminals():
                    if node.confidence < 50:
                        not_included.add(j)

            trees = [t for k, t in enumerate(trees_list) if k not in not_included]
            if len(trees) > 0:
                consensus_trees.append(majority_consensus(trees))
        except Exception:
            # Best-effort: the original `except: ValueError` was a bare
            # except that swallowed everything, including Ctrl-C. Ordinary
            # errors are still skipped.
            pass
    Phylo.write(consensus_trees,"drzewa_wynikowe_" + str(grupa),"newick")
def ML_tree(infile, outfile, file_type):
    """Create a tree with the maximum-likelihood algorithm (PhyML).

    The alignment is converted to PHYLIP, the PhyML binary matching the
    module-level ``user_OS`` is run on it, and the resulting tree is written
    as ``tree.txt`` (Newick) and saved as ``tree.png``, both in the current
    save directory (``static/data/sauvegardes/`` + module-level ``dirName``).

    :param infile: alignment file name inside the save directory.
    :param outfile: base name for the generated ``.phylip`` file.
    :param file_type: alignment format for ``SeqIO.parse``, e.g. ``clustal``
        or ``fasta``.
    """
    save_dir = "static/data/sauvegardes/" + dirName
    phylip_path = save_dir + outfile + ".phylip"

    # convert file to phylip
    records = SeqIO.parse(save_dir + infile, file_type)
    count = SeqIO.write(records, phylip_path, "phylip")
    print("Converted %i records" % count)

    # pick the PhyML executable for the platform
    if user_OS == 'darwin':
        cmd = PhymlCommandline(
            cmd='static/tools/MacOS/PhyML-3.1/PhyML-3.1_macOS-MountainLion',
            input=phylip_path)
    elif user_OS == 'linux':
        cmd = PhymlCommandline(
            cmd='static/tools/Linux/PhyML-3.1/PhyML-3.1_linux64',
            input=phylip_path)
    elif user_OS == 'win32':
        cmd = PhymlCommandline(
            cmd=current_path + '/static/tools/Windows/PhyML-3.1/PhyML-3.1_win32.exe',
            input=phylip_path)

    out_log, err_log = cmd()

    tree = Phylo.read(save_dir + outfile + '.phylip_phyml_tree.txt', 'newick')
    Phylo.draw(tree, do_show=False)
    Phylo.write(tree, save_dir + 'tree.txt', "newick")

    foo = current_path + '/static/data/sauvegardes/' + dirName + 'tree.png'
    plt.savefig(foo)
def TreeAssembly(StartDIR, outfname, delete_name):
    """Assemble one tree from Newick fragments stored in nested directories.

    Starting from a single clade named *StartDIR*, clades are processed
    breadth-first. Each clade's name is treated as a directory:

    * if ``<dir>/UPSTREAM.nwk`` can be read, its top-level clades are grafted
      under the clade and its terminals are queued for expansion;
    * otherwise, if ``<dir>/TERMINAL.nwk`` can be read, its top-level clades
      are grafted, or, when it has none, the clade takes the file's root
      name;
    * otherwise ``missing <dir>`` is printed.

    :param StartDIR: directory name of the root fragment.
    :param outfname: output Newick file.
    :param delete_name: when equal to the string ``"TRUE"``, internal node
        names are blanked before writing.
    """
    init_clade = Phylo.BaseTree.Clade(name=StartDIR)
    tree = Phylo.BaseTree.Tree(init_clade)
    NONTERMINALS = [tree.clade]

    while NONTERMINALS:
        cstate = NONTERMINALS.pop(0)
        WD = cstate.name
        # `except Exception` replaces the original bare excepts: missing or
        # unreadable files still fall through, Ctrl-C is no longer swallowed.
        try:
            downtree = Phylo.read(WD + "/UPSTREAM.nwk", 'newick')
            cstate.clades.extend(downtree.clade.clades)
            NONTERMINALS.extend(downtree.get_terminals())
        except Exception:
            try:
                downtree = Phylo.read(WD + "/TERMINAL.nwk", 'newick')
                if downtree.clade.clades != []:
                    cstate.clades.extend(downtree.clade.clades)
                else:
                    cstate.name = downtree.clade.name
            except Exception:
                print("missing " + WD)

    if delete_name == "TRUE":
        for internal_node in tree.get_nonterminals():
            internal_node.name = ""

    Phylo.write(tree, outfname, 'newick')
def make_patient_RNA_DNA_tree(pcode, min_DNA_frac = 0.001):
    '''
    make a tree for all RNA/DNA sample of a given patient

    For each sequence category the patient's DNA sequences and haplotype
    alignment are pooled, pruned with ``prune_rare_DNA``, aligned, and a tree
    is inferred. The tree is rooted on the first leaf whose name starts with
    ``days`` (sorted by name fields), very short internal branches are
    collapsed, and the ladderized tree is written as
    ``data/<pcode>_RNA_and_DNA_<seq_type>.nwk``.
    '''
    def _leaf_order(leaf):
        # sort ascending on name field 1, then descending on field 3 (with
        # its last character dropped)
        fields = leaf.name.split('_')
        return (int(fields[1]), -int(fields[3][:-1]))

    for seq_type in ['clustered_good', 'good', 'hyper', 'suspicious']:
        seqs = []
        for outprefix in patient_to_prefix_p17[pcode]:
            with myopen('data/'+outprefix+'_DNA_'+seq_type+save_as) as ifile:
                seqs.extend(SeqIO.parse(ifile, 'fasta'))

        p = Patient.load(pcode)
        seqs.extend(p.get_haplotype_alignment(region))

        seqs_pruned = prune_rare_DNA(seqs, min_DNA_frac)
        # make ids unique by appending the position in the pruned list
        for hi, hap in enumerate(seqs_pruned):
            hap.id += '_'+str(hi)
            hap.name = hap.id

        basename = 'data/'+pcode+'_RNA_and_DNA_'+seq_type
        outfname = basename+'.fasta'
        align(ungap(seqs_pruned), outfname)
        tree = infer_tree(outfname, min_DNA_frac=0.0)

        day_leaves = [leaf for leaf in tree.get_terminals() if leaf.name[:4]=='days']
        leaves = sorted(day_leaves, key=_leaf_order)
        tree.root_with_outgroup(leaves[0])
        tree.root.branch_length = 0.01

        for branch in tree.get_nonterminals(order='postorder'):
            if branch.branch_length<0.001:
                tree.collapse(branch)
        tree.ladderize()
        Phylo.write(tree, basename+'.nwk', 'newick')
def tree_reconstruction(phy_file, method, model, phyformat):
    '''Construct tree with given method and model

    The PHYLIP alignment is read, a distance matrix is computed under
    *model*, and a UPGMA or NJ tree is built, ladderized and stripped of its
    ``Inner...`` clade names. The tree is written to
    ``<args.output>/tree.nwk`` and drawn to ``<args.output>/tree.svg``, where
    ``args`` is a module-level object.

    :param phy_file: PHYLIP alignment file.
    :param method: ``'upgma'`` or ``'nj'``.
    :param model: distance model name for ``DistanceCalculator``.
    :param phyformat: suffix appended to ``'phylip-'`` to form the format.
    '''
    alignment = AlignIO.read(phy_file, 'phylip-' + phyformat)
    constructor = DistanceTreeConstructor()
    calculator = DistanceCalculator(model)
    distances = calculator.get_distance(alignment)

    if method == 'upgma':
        tree = constructor.upgma(distances)
    elif method == 'nj':
        tree = constructor.nj(distances)

    tree.ladderize()
    for node in tree.find_clades():
        if 'Inner' in node.name:
            node.name = ''
    Phylo.write(tree, args.output + '/tree.nwk', 'newick')

    # figure styling
    plt.rcParams['font.style'] = 'italic'
    for group, settings in (
        ('font', {'size': 8}),
        ('axes', {'titlesize': 14}),
        ('xtick', {'labelsize': 10}),
        ('ytick', {'labelsize': 10}),
        ('figure', {'titlesize': 18}),
    ):
        plt.rc(group, **settings)

    draw(tree, do_show=False)
    plt.savefig(args.output + "/tree.svg", format='svg', dpi=1200)
def summarise_dist(self, rf_results: RfResults, dir_out):
    """Write an NJ tree and a clustered heatmap of Robinson-Foulds distances.

    Two passes are made, first with the normalised and then with the raw
    distance. Each pass builds a symmetric distance matrix over all tree ids
    in ``rf_results.data``, writes a neighbour-joining tree (Newick) and
    saves a seaborn clustermap (SVG) into *dir_out*.

    :param rf_results: object whose ``data`` maps ``(tid_a, tid_b)`` to
        ``(rf, norm_rf)``.
    :param dir_out: output directory.
    """
    variants = (
        (True, 'rf_normed.tree', 'rf_normed_heatmap.svg',
         'Normalised Robinson-Foulds Distance'),
        (False, 'rf_un_normed.tree', 'rf_un_normed_heatmap.svg',
         '(un)Normalised Robinson-Foulds Distance'),
    )
    for use_norm, tree_name, heatmap_name, plt_title in variants:
        path_out = os.path.join(dir_out, tree_name)
        path_hm = os.path.join(dir_out, heatmap_name)

        # symmetric lookup of the selected metric
        metrics = defaultdict(dict)
        names = set()
        for (tid_a, tid_b), (rf, norm_rf) in rf_results.data.items():
            value = norm_rf if use_norm else rf
            metrics[tid_a][tid_b] = value
            metrics[tid_b][tid_a] = value
            names.add(tid_a)
            names.add(tid_b)

        labels = sorted(names)
        size = len(labels)

        # lower-triangular rows for Bio.Phylo, dense matrix for the heatmap
        mat_vals = []
        mat = np.zeros((size, size))
        for i, tid_a in enumerate(labels):
            cur_row = []
            for j in range(i + 1):
                tid_b = labels[j]
                if tid_a == tid_b:
                    cur_row.append(0.0)
                else:
                    distance = metrics[tid_a][tid_b]
                    cur_row.append(distance)
                    mat[i, j] = distance
            mat_vals.append(cur_row)
        mat = mat + mat.T

        # Newick
        dm = DistanceMatrix(names=labels, matrix=mat_vals)
        tree = DistanceTreeConstructor().nj(dm)
        Phylo.write(tree, path_out, 'newick')

        # Heatmap
        cmap = sns.cubehelix_palette(100, reverse=True)
        sns.set(font_scale=1)
        rf_df = pd.DataFrame(mat, columns=labels, index=labels)
        grid = sns.clustermap(rf_df, annot=True, fmt='.3f', cmap=cmap,
                              figsize=(15, 15))
        grid.fig.suptitle(plt_title)
        plt.savefig(path_hm)
def distance_matrix(cls, cluster_list):
    """Build a ladderized NJ tree for the given clusters, as a Newick string.

    Pairwise distances are loaded from ``Distance`` rows whose two accession
    numbers are both in *cluster_list*, and arranged as a lower-triangular
    matrix in *cluster_list* order (diagonal 0).

    :param cluster_list: accession numbers of the cluster representatives.
    :return: Newick string of the ladderized neighbour-joining tree.
    :raises ValueError: if no distance is stored for a pair in either order.
    """
    # single-argument call: same output on Python 2 and 3
    print(cluster_list)
    dists = Distance.objects.filter(rep_accnum1__in=cluster_list,
                                    rep_accnum2__in=cluster_list)
    distance_pairs = {g.rep_accnum1 + '_' + g.rep_accnum2: g.distance
                      for g in dists.all()}

    matrix = []
    for i in range(0, len(cluster_list)):
        matrix_iteration = []
        for j in range(0, i+1):
            if i == j:
                matrix_iteration.append(0)
            elif cluster_list[i] + '_' + cluster_list[j] in distance_pairs:
                matrix_iteration.append(
                    distance_pairs[cluster_list[i] + '_' + cluster_list[j]])
            elif cluster_list[j] + '_' + cluster_list[i] in distance_pairs:
                matrix_iteration.append(
                    distance_pairs[cluster_list[j] + '_' + cluster_list[i]])
            else:
                # The original `raise("...")` raised a plain string, which
                # itself fails with a TypeError and hides the message.
                raise ValueError("Error, can't find pair!")
        matrix.append(matrix_iteration)

    cluster_list = [s.encode('ascii', 'ignore') for s in cluster_list]
    matrix_obj = _DistanceMatrix(names=cluster_list, matrix=matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(matrix_obj)
    tree.ladderize()

    output = StringIO.StringIO()
    Phylo.write(tree, output, 'newick')
    tree_str = output.getvalue()
    return tree_str
def write_xml(fname, E, C, l):
    """Write the tree described by adjacency matrix *E* as PhyloXML.

    Nodes are named by their row index in *E*; the root is node ``n - 1``. A
    child ``ci`` of node ``i`` is any column with ``E[i, ci] == 1``, and its
    branch length is the L1 distance between ``C[i, l:]`` and ``C[ci, l:]``.

    :param fname: output file path.
    :param E: square 0/1 array of parent -> child edges.
    :param C: 2-D array with one row per node.
    :param l: first column of *C* used for branch lengths.
    """
    n, _ = E.shape
    root = Tree()
    root.name = str(n - 1)
    stack = [root]
    while stack:
        cur = stack.pop()
        i = int(cur.name)
        child_idxs = np.where(E[i, :] == 1)[0]
        for ci in child_idxs:
            child = cur.add_child(name=str(ci))
            child.dist = np.linalg.norm(np.subtract(C[i, l:], C[ci, l:]), ord=1)
            stack.append(child)

    # format=1 gives branch lengths and names for all nodes (leaves and
    # internal); format_root_node=True puts root node name in str
    newick_str = root.write(
        features=['name'], format=1, format_root_node=True
    )
    newick_tree = Phylo.read(StringIO(newick_str), 'newick')

    # Phylo.read() interprets names of internal nodes as confidences for
    # newick strings, so move them back into the name
    for clade in newick_tree.find_clades():
        if clade.confidence is not None:
            clade.name = clade.confidence
            clade.confidence = None

    xmltree = newick_tree.as_phyloxml()  # convert to PhyloXML.Phylogeny type
    # the original passed an anonymous open() handle that was never closed
    with open(fname, 'w') as handle:
        Phylo.write(xmltree, handle, 'phyloxml')
def ete3_evol_prepare(
    tree_in_fn,
    alignment_in_fn,
    tree_out_fn,
    foreground_list,
    min_foreground=2,
    min_background=2,
):
    """
    Read a species tree and alignment (nwk and fasta),
    Read the list of foreground taxa

    If there are enough foreground and background species:
    subset and rename the species tree into a protein tree
    write the alignment separately.

    Nothing is written when there are too few foreground or background
    sequences.
    """
    # Read inputs
    tree_in = Phylo.read(file=tree_in_fn, format="newick")
    # the context manager closes the alignment file, which the original
    # left open
    with open(alignment_in_fn, "r") as alignment_handle:
        alignment_in = AlignIO.read(handle=alignment_handle, format="fasta")

    # Slice and rename leafs in tree
    tree_out = rename_tree(tree=tree_in, alignment=alignment_in)

    # Check that there are enough sequences
    if has_enough_by_background_and_foreground(
        alignment_in, foreground_list, min_foreground, min_background
    ):
        Phylo.write(trees=tree_out, file=tree_out_fn, format="newick")
def export_gain_loss(tree, path):
    '''
    Export the tree and per-branch gene gain/loss patterns under
    ``<path>/geneCluster/``.

    Written files: ``tree_result.newick``, ``dt_geneEvents.cpk`` (pickled
    dict: column index -> number of branches with a gain or loss) and
    ``geneGainLossEvent.json`` (node name -> string of per-column codes).

    For every node with a parent, each column is coded as
    ``2 * int(parent value) + int(node value)``.
    '''
    # write final tree with internal node names as assigned by treetime
    sep = '/'
    output_path = sep.join([path.rstrip(sep), 'geneCluster/'])
    tree_fname = sep.join([output_path, 'tree_result.newick'])
    Phylo.write(tree.tree, tree_fname, 'newick')

    from collections import defaultdict
    gene_gain_loss_dict = defaultdict(str)
    # order does not matter much here
    for node in tree.tree.find_clades(order='preorder'):
        parent = node.up
        if parent is None:
            continue
        codes = [
            str(int(ancestral) * 2 + int(derived))
            for ancestral, derived in zip(parent.genepresence, node.genepresence)
        ]
        gene_gain_loss_dict[node.name] = "".join(codes)

    code_rows = [list(code_string) for code_string in gene_gain_loss_dict.values()]
    gain_loss_array = np.array(code_rows, dtype=int)

    # 1 and 2 are codes for gain/loss events
    is_event = (gain_loss_array == 1) | (gain_loss_array == 2)
    events_array = is_event.sum(axis=0)
    events_dict = dict(enumerate(events_array))
    events_dict_path = sep.join([output_path, 'dt_geneEvents.cpk'])
    write_pickle(events_dict_path, events_dict)

    # export gene loss dict to json for visualization
    gene_loss_fname = sep.join([output_path, 'geneGainLossEvent.json'])
    write_json(gene_gain_loss_dict, gene_loss_fname, indent=1)
def write_clusters(seqfname, tree, clusters, unclustered):
    """Write output files: clusters & unique as FASTA, tree as phyloXML.

    Clusters are written largest first to ``<basename>.<rank>``, and the
    clade of each cluster is coloured and widened in *tree*. Unclustered
    sequences go to ``<basename>.Unique`` and the tree to ``<basename>.xml``.
    All files are created in the current working directory.

    :param seqfname: sequence file; read as Clustal if it ends in ``.aln``,
        otherwise as FASTA.
    :param tree: Bio.Phylo tree containing the cluster clades.
    :param clusters: mapping of clade -> collection of sequence ids.
    :param unclustered: sequence ids not in any cluster (may be empty).
    """
    is_aln = seqfname.endswith('.aln')
    seq_idx = SeqIO.to_dict(SeqIO.parse(seqfname, 'clustal' if is_aln else 'fasta'))

    def write_cluster(cluster, fname):
        """Write the sequences of cluster tips to a FASTA file."""
        records = [seq_idx[seqid] for seqid in sorted(cluster)]
        with open(fname, 'w+') as handle:
            for rec in records:
                write_fasta(rec, handle, do_ungap=is_aln)
        logging.info("Wrote %s (%d sequences)", fname, len(records))

    colors = [BranchColor(*map(lambda x: int(x*255), rgb))
              for rgb in ColorSpiral().get_colors(len(clusters))]
    # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
    for i, item in enumerate(sorted(clusters.items(), reverse=True,
                                    key=lambda kv: len(kv[1]))):
        clade, cluster = item
        write_cluster(cluster, os.path.basename(seqfname) + '.' + str(i))
        clade.color = colors[i]
        clade.width = 2

    if unclustered:
        write_cluster(unclustered, os.path.basename(seqfname) + '.Unique')

    treefname = os.path.basename(seqfname) + '.xml'
    Phylo.write(tree, treefname, 'phyloxml')
    logging.info("Wrote %s", treefname)
def tiny_tree(INPUTfile, OUTPUTnwk, file_format="fasta"):
    """Write a trivial Newick tree for an input holding one or two taxa.

    Names are collected from *INPUTfile*, skipping the name ``root``. One
    name gives a single named clade; two names give an unnamed root with the
    two names as children.

    :param INPUTfile: input path; read through gzip when it ends in ``.gz``.
    :param OUTPUTnwk: output Newick file.
    :param file_format: ``"fasta"`` (record ids) or ``"edit"`` (first
        whitespace-separated field of each line).
    :raises ValueError: if the number of names is not 1 or 2. The original
        printed the same message and then failed on the unbound ``tree``
        variable.
    """
    is_gzipped = (INPUTfile.split(".")[-1] == "gz")
    if is_gzipped:
        handle = gzip.open(INPUTfile, 'rt')
    else:
        handle = open(INPUTfile, 'r')

    names = []
    # try/finally so the handle is closed even if parsing fails
    try:
        if file_format == "fasta":
            for record in SeqIO.parse(handle, "fasta"):
                if record.id != "root":
                    names.append(record.id)
        elif file_format == "edit":
            for line in handle:
                name = line.split()[0]
                if name != "root":
                    names.append(name)
    finally:
        handle.close()

    if len(names) == 1:
        init_clade = Phylo.BaseTree.Clade(name=names[0])
        tree = Phylo.BaseTree.Tree(init_clade)
    elif len(names) == 2:
        init_clade = Phylo.BaseTree.Clade()
        tree = Phylo.BaseTree.Tree(init_clade)
        tree.clade.clades.extend(
            Phylo.BaseTree.Clade(name=name) for name in names)
    else:
        print("tiny_tree() Error : len(names)=")
        print(len(names))
        raise ValueError(
            "tiny_tree() expects 1 or 2 names, got %d" % len(names))

    Phylo.write(tree, OUTPUTnwk, 'newick')
def write(self, phytrees_file):
    """
    Save all trees stored in this PhyTrees object to 'phytrees_file' (in
    newick format). A report file is written next to it, named like
    'phytrees_file' with its extension replaced by ".rep"; it holds the
    number of trees followed by the history entries of the object.

    The path is resolved with get_abspath() before use. Existing files are
    overwritten without warning.

    Arguments :
        phytrees_file  ( string )
            New PhyTrees tree file.

    Raises :
        IOError
            Re-raised untouched (e.g. the path provided doesn't exist).
        Any other exception
            Re-raised after removing whichever of the two output files
            already exists.
    """
    data_filepath = get_abspath(phytrees_file)
    base, _extension = os.path.splitext(data_filepath)
    report_filepath = base + '.rep'

    # Generate a single string with all the report content
    history_lines = []
    for entry in self._report:
        history_lines.append(' '.join(entry))
    str_report = '\n'.join(history_lines)

    # Write all the information in the PhyTrees files
    try:
        Phylo.write(self.data, data_filepath, 'newick')
        with open(report_filepath, 'w') as report_file:
            report_text = 'Num. trees: {:d}\nHistory:\n{:s}'.format(
                len(self), str_report)
            report_file.write(report_text)
    except IOError:
        raise
    except:
        # do not leave partial output behind
        for leftover in (data_filepath, report_filepath):
            if os.path.isfile(leftover):
                os.remove(leftover)
        raise
def action(args):
    """Rename the leaves of the first tree in ``args.tree`` and write it.

    Steps applied to each leaf, in this order:

    1. if ``args.info`` is given (rows of ``seqname,newname``), leaves not
       listed there are left completely untouched and the others take
       their new name from it (for newick trees ``:`` in a seqname is
       looked up as ``|``);
    2. matches of the regular expression ``args.remove_word`` are removed;
    3. surrounding whitespace is stripped;
    4. ``args.add_prefix`` and ``args.add_suffix`` are attached;
    5. for newick trees, spaces become underscores.

    The tree is then written to ``args.out`` in format ``args.tree_type``.
    """
    tree = next(Phylo.parse(args.tree, args.tree_type))
    leaves = tree.get_terminals()
    is_newick = args.tree_type == 'newick'

    info = None
    if args.info:
        rows = DictReader(args.info, fieldnames=['seqname', 'newname'])
        info = dict((row['seqname'], row['newname']) for row in rows)
        # for newick trees :s will be replaced by |s
        if is_newick:
            info = dict((seqname.replace(':', '|'), new)
                        for seqname, new in info.items())

    for leaf in leaves:
        if info is not None:
            if leaf.name not in info:
                # unlisted leaves skip every later step as well
                continue
            leaf.name = info[leaf.name]

        if args.remove_word:
            leaf.name = re.sub(args.remove_word, '', leaf.name)

        leaf.name = leaf.name.strip()
        leaf.name = args.add_prefix + leaf.name
        leaf.name = leaf.name + args.add_suffix

        # do this last
        if is_newick:
            leaf.name = leaf.name.replace(' ', '_')

    Phylo.write(tree, args.out, args.tree_type)
def serialize_trees(self, tree_uri='', format='newick', trees=None, handle=None):
    '''Retrieve trees serialized to any format supported by Biopython.

    Current options include 'newick', 'nexus', 'phyloxml', 'nexml', and
    'cdao'. The extra format 'ascii' draws the first tree as ASCII art.

    Arguments:
        tree_uri: identifier of the tree; when non-empty it is expanded
            with self.uri_from_id() before use.
        format: output format name (see above).
        trees: trees to serialize; when None, the first tree returned by
            self.get_trees(tree_uri) is used.
        handle: optional file-like object to write to.

    Returns:
        The serialized text when no handle was given, otherwise None (the
        output has been written to the handle).

    Raises:
        Exception: if there is no tree to serialize.

    Example:
    >>> treestore.serialize_trees('http://www.example.org/test/')
    '''
    if handle:
        s = handle
    else:
        s = StringIO()

    if tree_uri:
        tree_uri = self.uri_from_id(tree_uri)

    if trees is None:
        # next() with a default: an empty result must reach the explicit
        # "not found" error below instead of leaking StopIteration
        first_tree = next(iter(self.get_trees(tree_uri)), None)
        trees = [] if first_tree is None else [first_tree]

    if not trees:
        raise Exception('Tree to be serialized not found.')

    if format == 'cdao':
        bp.write(trees, s, format, tree_uri=tree_uri)
    elif format == 'ascii':
        bp._utils.draw_ascii(next(iter(trees)), file=s)
    else:
        bp.write(trees, s, format)

    if handle:
        return
    return s.getvalue()
def pruneNewick(gt, protein_id):
    """Prune one tip from a gene tree's newick file, rewriting it in place.

    The file is ``/SplitTrees/<prefix>/<gt>.newick`` where ``<prefix>`` is
    the part of ``gt`` before its first underscore.  ``protein_id`` is the
    name of the tip handed to ``tree.prune``.
    """
    family = gt.partition("_")[0]
    newick_path = "/SplitTrees/" + family + '/' + gt + ".newick"

    tree = Phylo.read(newick_path, 'newick')
    # prune() modifies the tree in place; its return value is not needed
    tree.prune(target=protein_id)
    Phylo.write(tree, newick_path, 'newick')
def measure_D_net(G,qmod,qcon): D_net_dic = {} D_net_ret = {} D_net = [] for u in G: D_net_dic[u] = {} for u in sorted(G): key1 = "Taxon" + str(u) tmp_row = [] for v in sorted(G): key2 = "Taxon" + str(v) if u < v: continue D_net_dic[u][v] = 1.0 - G.dmc_likelihood(u,v,qmod,qcon) tmp_row.append(D_net_dic[u][v]) print D_net_dic[u][v], D_net.append(tmp_row) print '\n' names = [] for u in G: names.append('Taxon'+str(u)) print names print D_net D_net_final = _DistanceMatrix(names,D_net) #print D_net_final.names constructor = DistanceTreeConstructor() tree_dmc = constructor.upgma(D_net_final) #print tree_dmc Phylo.write(tree_dmc,'ph_dmc.nre','newick') return D_net_final
def draw_tree():
    """Build a bootstrap consensus tree from 'outfile_padded.aln' and plot it.

    The clustal alignment is read from the current directory.  The
    consensus is built with ``bootstrap_consensus`` (replicate count 1000,
    ``majority_consensus``) on a ``DistanceTreeConstructor`` using a
    'blosum62' ``DistanceCalculator``.  The root is coloured green and the
    clade returned by ``common_ancestor`` for "PC_00004|DNA" salmon.

    Outputs: 'TreeToCutOff.xml' (phyloXML) and 'phylo-dot.png' (figure).

    The earlier identity distance matrix and the two bootstrap generators
    were never used and have been removed.
    """
    alignment = AlignIO.read('outfile_padded.aln', 'clustal')  # reading the alignment file

    calculator = DistanceCalculator('blosum62')
    constructor = DistanceTreeConstructor(calculator)
    consensus_tree = bootstrap_consensus(alignment, 1000, constructor,
                                         majority_consensus)
    consensus_tree.ladderize()
    consensus_tree.root.color = "green"

    mrca = consensus_tree.common_ancestor({"name": "PC_00004|DNA"})
    mrca.color = "salmon"

    Phylo.write(consensus_tree, 'TreeToCutOff.xml', 'phyloxml')

    # NOTE: matplotlib rc settings (plt.rc('font', size=...), etc.) can be
    # set before drawing to control or hide the branch tip labels.
    Phylo.draw(consensus_tree, show_confidence=True)
    pylab.gcf().set_dpi(300)
    pylab.savefig("phylo-dot.png")
    pylab.clf()
def main():
    """Colour the leaves of a tree by group and save it as phyloXML.

    The tree is read from ``args.input_file`` (format ``args.input_type``)
    and converted to phyloXML.  ``args.zchemat_kolorowania`` selects the
    function supplying the colours and group member lists: 'eba', 'fungi'
    or 'opisto'.  Internal branches that carry a confidence get it typed
    as "bootstrap" and copied into the branch name.  The result is written
    to ``args.output_file``.

    Raises:
        ValueError: for an unknown colouring scheme (previously this
            surfaced later as an unbound-variable error).
    """
    args = parse_args()
    tree = Phylo.read(args.input_file, args.input_type)
    tree = tree.as_phyloxml()

    scheme = args.zchemat_kolorowania
    if scheme == 'eba':
        get_colors_and_groups = get_eukariota_group
    elif scheme == 'fungi':
        get_colors_and_groups = get_fungus_groups
    elif scheme == 'opisto':
        get_colors_and_groups = get_opisto_groups
    else:
        raise ValueError("unknown colouring scheme: %r" % (scheme,))

    for branch in tree.get_nonterminals():
        try:
            branch.confidence.type = "bootstrap"
            branch.name = branch.confidence.value
        except AttributeError:
            # branch without a confidence value: leave it unchanged
            pass

    colors, list_of_groups = get_colors_and_groups()
    for leaf in tree.get_terminals():
        name = leaf.name.strip()
        try:
            # keep what follows the first "." plus two more characters
            index = name.index(".")
            name = name[index + 3:]
        except ValueError:
            # no "." in the name: use its last two "_"-separated fields
            name = "_".join(name.split("_")[-2:])
        for color, members in zip(colors, list_of_groups):
            if name in members:
                leaf.color = color

    Phylo.write(tree, args.output_file, "phyloxml")
def rootTree(f, root,output):
    """Re-root a newick tree on an outgroup and write the result.

    ``f`` is the input tree, ``output`` the destination (both newick).
    ``root`` is an outgroup taxon name; when it contains commas it is
    split into several names and the tree is rooted on their common
    ancestor.
    """
    tree = Phylo.read(f, 'newick')
    outgroup = root
    if ',' in root:
        outgroup = tree.common_ancestor(root.split(','))
    tree.root_with_outgroup(outgroup)
    Phylo.write(tree, output, 'newick')
def main():
    """Label the input tree and write the transmission-edge output files.

    The parsed command line is stored in the module-level ``args``.
    Without ``args.times`` the transmission edges of the first labeled
    tree are written to ``args.OUTPUT_FILE``; with it, a summary of edge
    counts over all labeled trees is written instead.  When
    ``args.labeledtrees`` is set, the labeled trees are also saved to
    ``<OUTPUT_FILE>.tree`` in newick format.
    """
    # read argparse
    global args
    args = parse_arguments()

    # initialize input_tree, hosts, score, solution_count
    input_tree = initialize_tree(args.INPUT_TREE_FILE)
    initialize_leaf_nodes(input_tree)
    initialize_internal_nodes(input_tree)

    # label internal nodes
    np.random.seed(args.seed)
    labeled_trees = get_labeled_trees(input_tree)

    # create transmission edges and counts from labeled trees
    # create output files
    if args.times:
        # summary output
        edge_count = get_transmission_edge_count(labeled_trees)
        write_transmission_edges_summary(edge_count)
    else:
        # old TNet output format
        first_tree = labeled_trees[0]
        transmission_edges = get_transmission_edges(first_tree)
        write_transmission_edges(args.OUTPUT_FILE, first_tree.root.name,
                                 transmission_edges)

    # create optional output files
    if args.labeledtrees:
        Phylo.write(labeled_trees, args.OUTPUT_FILE + '.tree', 'newick')
def labeler(files, etalon_tree, tree_path=".", rebuild=False):
    """
    Constructs labels for given files. (Best phylogeny reconstruction method)

    For every alignment the NJ, UPGMA and RAxML trees are compared with the
    etalon (reference) tree using the symmetric difference; the label is
    the index of the closest method: 0 = NJ, 1 = UPGMA, 2 = RAxML.

    :param files: a sequence of file paths to FASTA alignments (iterated
        more than once and passed to ``len``)
    :param etalon_tree: the path to etalon tree, relative to ``tree_path``
    :param tree_path: a directory, where built trees will be stored
    :param rebuild: set it True, if you need to rebuild trees or build them
        from scratch
    :return: array with one label (0, 1 or 2) per input file
    """
    tree_path = osp.abspath(tree_path)  # raxml needs absolute paths

    def stem(path):
        # file name without directories, cut at its first "."
        return path.split("/")[-1].split(".")[0]

    if rebuild:
        calculator = TreeConstruction.DistanceCalculator('blosum62')
        dist_constructor = TreeConstruction.DistanceTreeConstructor()

        # construct all trees with UPGMA, NJ and raxml
        for aln_file in files:
            aln = AlignIO.read(aln_file, 'fasta')
            name = stem(aln_file)

            # computed once per alignment; upgma() and nj() each work on
            # their own copy of the matrix
            dist_matrix = calculator.get_distance(aln)

            tree = dist_constructor.upgma(dist_matrix)
            Phylo.write(tree,
                        osp.join(tree_path, 'upgma_{}.tre'.format(name)),
                        'newick')

            tree = dist_constructor.nj(dist_matrix)
            Phylo.write(tree,
                        osp.join(tree_path, 'nj_{}.tre'.format(name)),
                        'newick')

            raxml = RaxmlCommandline(sequences=osp.abspath(aln_file),
                                     model='PROTCATWAG',
                                     name='{}.tre'.format(name),
                                     threads=3,
                                     working_dir=tree_path)
            _, stderr = raxml()
            print(stderr)
            print('{} finished'.format(name))

    # get best tree
    tns = dendropy.TaxonNamespace()
    act_tree = dendropy.Tree.get_from_path(osp.join(tree_path, etalon_tree),
                                           "newick", taxon_namespace=tns)
    act_tree.encode_bipartitions()

    symmetric_difference = dendropy.calculate.treecompare.symmetric_difference
    # column order fixes the label meaning: NJ, UPGMA, RAxML
    tree_templates = ("nj_{}.tre", "upgma_{}.tre", "RAxML_bestTree.{}.tre")

    distances = np.zeros(shape=(len(files), 3))
    for i, aln_file in enumerate(files):
        name = stem(aln_file)
        # load all three trees first, then compare them
        built_trees = [
            dendropy.Tree.get_from_path(
                osp.join(tree_path, template.format(name)),
                "newick", taxon_namespace=tns)
            for template in tree_templates
        ]
        for column, built_tree in enumerate(built_trees):
            distances[i, column] = symmetric_difference(built_tree, act_tree)

    return distances.argmin(1)
def taxid2tree(self, taxid_list, out_fmt="newick"):
    """
    Build a tree from the paths of the given taxids and return it as text.

    For every taxid a path is obtained with self.get_path(); the first
    item of each path must be the same root. The nodes of all paths are
    merged into one tree, which is serialized with format out_fmt.

    out_fmt = newick / phyloxml ...
    """
    treeFile = StringIO()

    # get paths for a list of taxid, each joined into a ";"-separated string
    path_list = []
    for taxid in taxid_list:
        path_list.append(";".join([str(item) for item in self.get_path(taxid)]))

    # store node info into nodes
    nodes = {}  # data format {"node_name": Clade_object}
    root = None

    for i, path in enumerate(path_list):
        line = path.strip().split(";")

        if root is None:
            root = line[0]
        else:
            assert root == line[0], "The %d-th line is from a different root"%(i+1)

        # walk from leaf to root so that every node gets a parent
        leaf2root = line[::-1]
        last = len(line) - 1
        for j, child_node in enumerate(leaf2root):
            if child_node in nodes:
                continue
            # the root node's parent is itself
            parent_node = child_node if j == last else leaf2root[j+1]
            clade = Newick.Clade(name=child_node)
            clade.parent = parent_node
            nodes[child_node] = clade

    for node_name, node_clade in nodes.items():
        if node_name == node_clade.parent:
            # the root node is the one that is its own parent
            root_node = node_clade
            print("root node is %s, constructing tree ..."%(str(node_name)))
        else:
            # attach the node to its parent, then drop the helper attribute
            nodes[node_clade.parent].clades.append(node_clade)
            del node_clade.parent

    tree = Newick.Tree(root = root_node)
    bp.write(tree, treeFile, out_fmt)
    return treeFile.getvalue()
def trim_tree(absenteeList, TreeFile, Inclusive):
    """Collapse away species from the phylogenetic tree that are not found
    in this sequence file. Output the tree file.

    Reads the module-level names ``CladeList``, ``Model`` and
    ``TreeOutFileName``; pruned taxa are also removed from ``CladeList``.

    absenteeList -- taxa to prune from the tree
    TreeFile     -- newick tree to read
    Inclusive    -- anything other than 'no' also marks the terminal
                    branches of the foreground clade with "#1"
    """
    print("\nReading the Tree...")
    # parse the tree using Phylo
    tree = Phylo.read(TreeFile, 'newick')
    print("Here is the starting tree:")
    Phylo.draw_ascii(tree)
    terminals = tree.get_terminals()
    print("\nFound the following {} taxa in the tree:".format(len(terminals)))
    print(terminals)

    # prune away taxa that are not included for this sequence file
    for taxon in absenteeList:
        tree.prune(taxon)
        if CladeList != "none" and taxon in CladeList:
            CladeList.remove(taxon)
    print("\nPruned away these species:")
    print(absenteeList)
    print("\nHere is the tree with the missing taxa pruned away:\n")
    Phylo.draw_ascii(tree)

    # unless you have a clock, PAML requires that your tree is unrooted,
    # ie has a trifurcation at first node. So do that here
    ROOT = tree.get_nonterminals()[0]
    if ROOT.is_bifurcating() == True:
        tree.collapse(tree.get_nonterminals()[1])

    # add notations to the tree to identify the 'foreground' branches
    # these are assigned to a monophyletic group of species assigned with
    # the argument -clade. By default add "#1" to the branch leading to the
    # clade. Change -inc from 'no' to make it inclusive, adding #1 to the
    # branch leading to the clade as well as all terminal branches.
    if Model == "2":
        print("\nAssigning the foreground branches in the tree based on the species given in the clade file...")
        print("These species make up the forground clade:")
        for spp in CladeList:
            print(spp)

        several_species = len(CladeList) > 1
        if several_species:
            # mark the common ancestor of the clade of interest as the
            # foreground lineage for the branch sites model
            tree.common_ancestor(CladeList).name = "#1"

        # terminal branches are marked when the clade has a single
        # representative left, or when the marking should be inclusive
        if not several_species or Inclusive != 'no':
            for leaf in tree.get_terminals():
                if leaf.name in CladeList:
                    leaf.name = leaf.name + "#1"

    # if RunMode is not 2 just output the pruned tree as is
    print("\nOutputting the following revised tree for the species content of the sequence file")
    print("it should have a trifurcation at the base unless you are using a clock\n")
    Phylo.draw_ascii(tree)
    Phylo.write(tree, TreeOutFileName, "newick")
def test_consensus(self):
    """A consensus of 100 copies of the test phylogeny yields 'consensus.tre'."""
    # create a list of trees and make a consensus
    phylogenies = [self.phylo for _ in range(100)]
    with open('distribution.tre', 'w') as handle:
        Phylo.write(phylogenies, handle, 'newick')
    ptools.consensus(outdir='.', min_freq=0.5, is_rooted=True,
                     trees_splits_encoded=False)
    consensus_exists = os.path.isfile('consensus.tre')
    self.assertTrue(consensus_exists)
def test_built_tree(self):
    """The built tree is a Tree and its newick text matches the reference.

    The expected text is the first line of './TreeConstruction/nj.tre'.
    """
    tree = self.constructor.build_tree(self.aln)
    self.assertTrue(isinstance(tree, BaseTree.Tree))

    tree_file = StringIO.StringIO()
    Phylo.write(tree, tree_file, 'newick')

    # read the reference first so the handle is closed even when the
    # comparison below fails
    with open('./TreeConstruction/nj.tre') as ref_tree:
        expected = ref_tree.readline()
    self.assertEqual(tree_file.getvalue(), expected)
def test_format_branch_length(self):
    """Custom format string for Newick branch length serialization."""
    single_leaf = Phylo.read(StringIO("A:0.1;"), "newick")
    buf = StringIO()
    Phylo.write(single_leaf, buf, "newick", format_branch_length="%.0e")
    written = buf.getvalue().strip()
    # Py2.5 compat: Windows with Py2.5- represents this as 1e-001;
    # on all other platforms it's 1e-01
    accepted = ["A:1e-01;", "A:1e-001;"]
    self.assertTrue(written in accepted)
def build_nj_tree(self):
    """Return the NJ tree of ``self.distance_matrix()`` as a newick string."""
    distances = self.distance_matrix()
    nj_tree = DistanceTreeConstructor().nj(distances)

    buf = StringIO.StringIO()
    Phylo.write(nj_tree, buf, 'newick')
    newick_text = buf.getvalue()
    buf.close()
    return newick_text
def export(self):
    """Draw the tree to image files, write it as newick and export to auspice.

    The figure is saved once per entry of ``self.formats`` as
    ``<outdir>tree.<fmt>``.  Afterwards every clade name gets the clade's
    mutations appended (``aa_muts`` when present, otherwise ``nuc_muts``;
    commas replaced by underscores, separated from a non-empty name by
    ``-``) and the tree is written to ``<outdir>tree.nwk``.  Finally
    ``self.export_to_auspice`` is called.
    """
    from bio_draw import muttree_draw

    def select_fontsize(n):
        # fewer tips -> larger font
        if n < 10:
            return 12
        if n < 50:
            return 10
        return 8

    def branch_label_func(n):
        # label a branch with at most max_muts mutations, summarising the rest
        max_muts = 15
        muts = n.aa_muts if hasattr(n, 'aa_muts') else n.nuc_muts
        mut_list = muts.split(',')
        if len(mut_list) <= max_muts:
            return ', '.join(mut_list)
        extra = len(mut_list) - max_muts
        return ', '.join(mut_list[:max_muts])+' + '+str(extra)+' others'

    from Bio import Phylo
    import matplotlib.pyplot as plt

    n_viruses = len(self.viruses)
    plt.rcParams.update({'font.size': select_fontsize(n_viruses)})
    plt.ioff()

    from tree_util import to_Biopython
    tmp_tree = to_Biopython(self.tree)
    tmp_tree.ladderize()

    # presumably closes a left-over 'Tree' figure so that it is recreated
    # with the requested size
    plt.figure('Tree')
    plt.close()
    plt.figure('Tree', figsize = (15, 2+n_viruses/5))
    ax = plt.subplot('111')

    muttree_draw(tmp_tree, axes=ax, show_confidence=False, do_show=False,
                 label_func = lambda x: x.name,
                 branch_labels = branch_label_func)
    ax.invert_yaxis()

    # scale bar: half of one x tick interval, drawn below the last tip
    tick_spacing = np.diff(ax.get_xticks())[0]
    lengthbar = tick_spacing/2
    plt.plot([0, lengthbar], [n_viruses, n_viruses], lw=10, c='k')
    plt.text(lengthbar/2, n_viruses+0.1, str(lengthbar),
             horizontalalignment='center', fontsize=16)
    ax.set_axis_off()

    for fmt in self.formats:
        plt.savefig(self.outdir+'tree.'+fmt)

    # encode the mutations in the clade names of the newick output
    for clade in tmp_tree.find_clades():
        if clade.name is None:
            clade.name = ''
        muts = clade.aa_muts if hasattr(clade, 'aa_muts') else clade.nuc_muts
        if len(clade.name) and len(muts):
            clade.name += '-'
        clade.name += '_'.join(muts.split(','))

    Phylo.write(tmp_tree, self.outdir+'tree.nwk', 'newick')
    self.export_to_auspice(tree_fields = ['aa_muts','num_date']+self.fasta_fields.values())
def dump(self, treefile, nodefile):
    """Write the tree as newick and pickle selected node attributes.

    ``treefile`` receives ``self.tree`` in newick format.  ``nodefile``
    (opened with ``myopen``) receives a pickled dict mapping each node
    name to a dict of those attributes listed in ``self.dump_attr`` that
    the node actually has.
    """
    from Bio import Phylo
    Phylo.write(self.tree, treefile, 'newick')

    node_props = {}
    for node in self.tree.find_clades():
        props = {}
        for attr in self.dump_attr:
            if hasattr(node, attr):
                props[attr] = node.__getattribute__(attr)
        node_props[node.name] = props

    with myopen(nodefile, 'w') as nfile:
        pickle.dump(node_props, nfile)
def main() :
    """Fetch the fly gene tree and alignment for one beetle MSA.

    Command line: <orthologues_input.json> <msa_fname.fasta> <fly output
    folder>.  The digits of the MSA file name form the number used in the
    output names ``fly<number>.fasta`` and ``fly<number>.tree`` inside the
    output folder.  The MSA file name is also stored in the module-level
    ``the_name``.

    Exits with status 1 on a usage error, when the MSA file does not
    exist, when the first gene of the MSA is missing from the orthologues,
    or when that gene has fewer than 10 orthologue entries.  Returns 0
    otherwise.
    """
    if len(sys.argv) != 4 :
        print >> sys.stderr, "Usage: %s <orthologues_input.json> <msa_fname.fasta> <fly output folder>" % sys.argv[0]
        sys.exit(1)

    json_input, msa_fname, fly_directory = sys.argv[1:]
    msa_number = "".join(filter(str.isdigit, msa_fname))
    fly_fasta_path = ("%s/fly%s.fasta") % (fly_directory, msa_number)
    fly_tree_path = ("%s/fly%s.tree") % (fly_directory, msa_number)
    minimum_species = 10 # arbitrary

    global the_name
    the_name = msa_fname

    if not os.path.exists(msa_fname) :
        print >> sys.stderr, "Error: %s does not exist!" % msa_fname
        sys.exit(1)

    orthologues = get_homology_information_fromfile(json_input)
    _msa_species, tc_genes = get_msa_species(msa_fname)
    tc_gene = tc_genes[0]

    # skip silently when the gene is unknown or has too few fly species;
    # the equivalent check on the number of beetle species is disabled
    if tc_gene not in orthologues or len(orthologues[tc_gene]) < minimum_species :
        sys.exit(1)

    # the gene tree is fetched through the first species/gene listed
    tmp = orthologues[tc_gene]
    tmp_species = list(tmp)[0]
    tmp_gene = tmp[tmp_species][0]
    tmp_flies, tmp_alignment, tree = get_genetree(tmp_species, tmp_gene)

    AlignIO.write(tmp_alignment, fly_fasta_path, 'fasta')
    Phylo.write(tree, fly_tree_path, 'newick')
    get_rid_of_bootstrap(fly_tree_path)

    return 0
def runBSSC(workingFile, srp_tree_file, debug):
    """Run BSSC and derive the reference sequence and tree files from it.

    BSSC is re-run until ``<workingFile>_BSSC.paup`` exists.  From that
    file the tree on the ``tree true_tree_1`` line and the first sequence
    of the nexus alignment are taken.  Clade names matching
    ``MATCH_TREE_NODE`` are renamed to ``hap_<index - 1>``.

    Files written:
        ``<workingFile>.cons`` -- the first sequence as FASTA record "Ref"
        ``<workingFile>_seqgen.phylip`` -- ancestor sequence plus the tree
        ``srp_tree_file`` -- the tree in newick format

    Args:
        workingFile: path prefix of all BSSC input/output files.
        srp_tree_file: path of the newick tree file to write.
        debug: unused.

    Raises:
        ValueError: if the PAUP output holds no ``tree true_tree_1`` line
            (previously an unbound-variable error).
    """
    bssc = runExtProg(bsscDir + "BSSC_original", pdir=bsscDir, length=3)
    bssc.set_param_at("-f", 1)
    bssc.set_param_at(workingFile + "_BSSC.par", 2)
    bssc.set_param_at(1, 3)

    bssc_paup_result = workingFile + "_BSSC.paup"
    try:
        # make sure the loop below waits for a fresh result
        os.remove(bssc_paup_result)
    except OSError:
        pass

    while not os.path.exists(bssc_paup_result):
        bssc.run()

    tree = None
    with open(bssc_paup_result, "rU") as input_handle:
        for line in input_handle:
            if line.find("tree true_tree_1") > 0:
                line = line.strip()
                # the newick text starts after the "U]" marker
                start = line.index("U]") + 3
                tree = Phylo.read(StringIO(line[start:]), 'newick')
    if tree is None:
        raise ValueError(
            "no 'tree true_tree_1' entry found in %s" % bssc_paup_result)

    for clade in tree.find_clades():
        if clade.name:
            match = re.match(MATCH_TREE_NODE, clade.name)
            if match:
                index = match.group(2)
                clade.name = "hap_" + str(int(index) - 1)

    with open(bssc_paup_result, "rU") as input_handle:
        sequences = AlignIO.read(input_handle, "nexus")
    seq = sequences[0]

    with open(workingFile + ".cons", "w") as ref_handle:
        ref_handle.write(">%s\n%s\n" % ("Ref", seq.seq))

    with open(workingFile + "_seqgen.phylip", "w") as output_handle:
        output_handle.write("1 1200\n")
        output_handle.write("%s %s\n" % ("ancestor", seq.seq))
        output_handle.write("1\n")
        Phylo.write(tree, output_handle, "newick")

    with open(srp_tree_file, "w") as output_handle:
        Phylo.write(tree, output_handle, "newick")
def save_treeanc_results(self):
    """Save tree, alignment and GTR results and bundle them in a zip file.

    All files live in ``self._root_dir``.  The tree is written in newick
    format; the alignment and GTR files are produced by
    ``self._save_alignment()`` and ``self._save_gtr()``.  The archive
    stores the three files under their bare file names.
    """
    from Bio import Align

    # files to be displayed in the web interface
    tree_path = os.path.join(self._root_dir, out_tree_nwk)
    Phylo.write(self.tree, tree_path, 'newick')
    self._save_alignment()
    self._save_gtr()

    zip_path = os.path.join(self._root_dir, zipname)
    with zipfile.ZipFile(zip_path, 'w') as out_zip:
        for member in (out_tree_nwk, out_aln_fasta, out_gtr):
            out_zip.write(os.path.join(self._root_dir, member),
                          arcname=member)
def D_seq_matrix(fasta_file):
    """Return the identity distance matrix of a FASTA alignment.

    As a side effect the UPGMA tree built from the matrix is written to
    'ph_seq.nre' (newick) and the matrix names are printed.
    """
    alignment = AlignIO.read(fasta_file, 'fasta')
    dm = DistanceCalculator('identity').get_distance(alignment)

    tree_seq = DistanceTreeConstructor().upgma(dm)
    Phylo.write(tree_seq, 'ph_seq.nre', 'newick')

    print(dm.names)
    return dm
def test_convert_phyloxml_to_newick_branch_length_only(self):
    """Write phyloxml with bootstrap values to newick format using branch_length_only=True"""
    trees = Phylo.parse(EX_APAF, "phyloxml")
    # mkstemp creates the file atomically (mktemp is deprecated and
    # race-prone); only the name is needed, so close the descriptor
    fd, tmp_filename = tempfile.mkstemp()
    os.close(fd)
    try:
        Phylo.write(trees, tmp_filename, "newick", branch_length_only=True)
    except TypeError:
        self.fail()
    finally:
        # remove the temporary file on failure as well
        os.remove(tmp_filename)