def test_raxml(self):
    """Smoke-test the RAxML wrapper: build, run, and read back the tree."""
    raxml_cmd = RaxmlCommandline(
        raxml_exe, sequences=EX_PHYLIP, model="PROTCATWAG", name="test"
    )
    # The wrapper is expected to add a parsimony seed on its own.
    self.assertIn("-p", str(raxml_cmd))

    # Files RAxML may leave behind; they have to go, or RAxML will
    # complain bitterly during the next run.
    generated = (
        "RAxML_info.test",
        "RAxML_log.test",
        "RAxML_parsimonyTree.test",
        "RAxML_result.test",
        # Present in 7.2.X+ but not 7.0.4:
        "RAxML_bestTree.test",
    )
    try:
        stdout, stderr = raxml_cmd()
        self.assertTrue(len(stdout) > 0)
        self.assertEqual(len(stderr), 0)
        # The resulting tree must contain the four expected leaves.
        result_tree = Phylo.read("RAxML_result.test", "newick")
        self.assertEqual(result_tree.count_terminals(), 4)
    finally:
        for leftover in generated:
            if os.path.isfile(leftover):
                os.remove(leftover)
def SH_raxml(reftree, querytree, phy_file, shout="SH_out.txt"):
    """
    SH test using RAxML

    querytree can be a single tree or a bunch of trees (eg. from
    bootstrapping)

    RAxML is run with algorithm "h" in a "raxml_work" directory created next
    to `phy_file`. The first line of RAxML's stdout matching
    "Significantly...:..." is appended to `shout`, prefixed with the base
    name of `querytree`. When no such line is found, "SH test failed." is
    written to stderr and nothing is appended.

    :param reftree: path to the reference tree; must be an existing file
    :param querytree: path to the tree(s) to test against `reftree`
    :param phy_file: path to the alignment passed to RAxML as `sequences`
    :param shout: file the result line is appended to
    :return: the name of the output file
    """
    assert op.isfile(reftree), "reference tree not found: %s" % reftree

    shout = must_open(shout, "a")
    try:
        raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
        mkdir(raxml_work)
        raxml_cl = RaxmlCommandline(
            cmd=RAXML_BIN("raxmlHPC"),
            sequences=phy_file,
            algorithm="h",
            model="GTRGAMMA",
            name="SH",
            starting_tree=reftree,
            bipartition_filename=querytree,
            working_dir=raxml_work,
        )

        logging.debug("Running SH test in RAxML: %s", raxml_cl)
        o, stderr = raxml_cl()

        # hard coded
        try:
            pval = re.search(r"(Significantly.*:.*)", o).group(0)
        except (AttributeError, TypeError):
            # AttributeError: no matching line; TypeError: stdout is not text.
            sys.stderr.write("SH test failed.\n")
        else:
            # "\\%" is a literal backslash followed by a percent sign.
            pval = pval.strip().replace("\t", " ").replace("%", "\\%")
            shout.write("{0}\t{1}\n".format(op.basename(querytree), pval))
            logging.debug("SH p-value appended to %s", shout.name)
    finally:
        # Close the output even when RAxML itself raises.
        shout.close()
    return shout.name
def _alignment_stem(path):
    """Return the file name of `path` up to (not including) its first dot."""
    return osp.basename(path).split(".")[0]


def labeler(files, etalon_tree, tree_path=".", rebuild=False):
    """
    Constructs labels for given files. (Best phylogeny reconstruction method)

    For every alignment the NJ, UPGMA and RAxML trees stored in `tree_path`
    are compared with the etalon tree by symmetric difference; the label is
    the index of the closest one.

    :param files: an iterable with file paths to alignments
    :param etalon_tree: the path to etalon tree (joined onto `tree_path`)
    :param tree_path: a directory, where built trees will be stored
    :param rebuild: set it True, if you need to rebuild trees or build them
        from scratch
    :return: numpy array with one label per file:
        0 = NJ, 1 = UPGMA, 2 = RAxML
    """
    # `files` is walked twice and measured with len(), so a one-shot
    # iterator has to be materialised first.
    files = list(files)
    tree_path = osp.abspath(tree_path)  # raxml needs absolute paths

    if rebuild:
        calculator = TreeConstruction.DistanceCalculator('blosum62')
        dist_constructor = TreeConstruction.DistanceTreeConstructor()

        # construct all trees with UPGMA, NJ and raxml
        for aln_path in files:
            name = _alignment_stem(aln_path)
            aln = AlignIO.read(aln_path, 'fasta')
            # One distance matrix serves both distance methods.
            dist_matrix = calculator.get_distance(aln)

            tree = dist_constructor.upgma(dist_matrix)
            Phylo.write(tree, osp.join(tree_path, 'upgma_{}.tre'.format(name)), 'newick')
            tree = dist_constructor.nj(dist_matrix)
            Phylo.write(tree, osp.join(tree_path, 'nj_{}.tre'.format(name)), 'newick')

            raxml = RaxmlCommandline(sequences=osp.abspath(aln_path), model='PROTCATWAG',
                                     name='{}.tre'.format(name), threads=3,
                                     working_dir=tree_path)
            _, stderr = raxml()
            print(stderr)
            print('{} finished'.format(name))

    # get best tree
    tns = dendropy.TaxonNamespace()
    act_tree = dendropy.Tree.get_from_path(osp.join(tree_path, etalon_tree), "newick",
                                           taxon_namespace=tns)
    act_tree.encode_bipartitions()

    # Column order defines the label values returned below.
    tree_templates = ("nj_{}.tre", "upgma_{}.tre", "RAxML_bestTree.{}.tre")
    distances = np.zeros(shape=(len(files), len(tree_templates)))
    for row, aln_path in enumerate(files):
        name = _alignment_stem(aln_path)
        for col, template in enumerate(tree_templates):
            built_tree = dendropy.Tree.get_from_path(
                osp.join(tree_path, template.format(name)), "newick",
                taxon_namespace=tns)
            distances[row, col] = dendropy.calculate.treecompare.symmetric_difference(
                built_tree, act_tree)
    return distances.argmin(1)
def build_ml_raxml(alignment, outfile, work_dir=".", **kwargs):
    """
    build maximum likelihood tree of DNA seqs with RAxML

    The alignment is written as relaxed PHYLIP to "<work_dir>/work/aln.phy".
    RAxML is then run with algorithm "a" and 100 replicates in a scratch
    directory "raxml_work" next to that file. The scratch directory is
    removed before returning.

    :param alignment: alignment object accepted by `AlignIO.write`
    :param outfile: destination path for the RAxML_bipartitions tree
    :param work_dir: parent directory for the "work" directory
    :param kwargs: extra options forwarded to `RaxmlCommandline`
    :return: `(outfile, phy_file)` on success, `None` when RAxML produced
        no bipartitions tree
    """
    import shutil

    work_dir = op.join(work_dir, "work")
    mkdir(work_dir)
    phy_file = op.join(work_dir, "aln.phy")
    # Close (and thereby flush) the alignment before RAxML reads it.
    with open(phy_file, "w") as handle:
        AlignIO.write(alignment, handle, "phylip-relaxed")

    raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
    mkdir(raxml_work)
    raxml_cl = RaxmlCommandline(
        cmd=RAXML_BIN("raxmlHPC"),
        sequences=phy_file,
        algorithm="a",
        model="GTRGAMMA",
        parsimony_seed=12345,
        rapid_bootstrap_seed=12345,
        num_replicates=100,
        name="aln",
        working_dir=raxml_work,
        **kwargs
    )

    logging.debug("Building ML tree using RAxML: %s", raxml_cl)
    raxml_cl()

    tree_file = op.join(raxml_work, "RAxML_bipartitions.aln")
    if not op.exists(tree_file):
        sys.stderr.write("***RAxML failed.\n")
        shutil.rmtree(raxml_work, ignore_errors=True)
        return None

    # shutil instead of shell strings: safe for paths containing spaces.
    shutil.copy(tree_file, outfile)
    logging.debug("ML tree printed to %s", outfile)
    shutil.rmtree(raxml_work, ignore_errors=True)
    return outfile, phy_file
def generate_phylo():
    """
    Build a bootstrapped phylogeny from "trimmed_muscleout.aln" with RAxML.

    Runs three RAxML invocations in the current directory:
      1. run "T1" (GTRGAMMA, parsimony seed, BOOTSTRAP replicates);
      2. run "T2", the same plus a bootstrap seed;
      3. run "T3.nwk", algorithm "b" with "RAxML_bestTree.T1" as the starting
         tree and "RAxML_bootstrap.T2" as the bipartition file.

    All existing "RAxML_*" files in the current directory are deleted first.
    On any error the message is printed and the process exits with status 1.
    """
    import sys

    # Presumably clears outputs of a previous run so RAxML starts clean.
    for stale in glob.glob("RAxML_*"):
        os.remove(stale)

    print("Generating phylogenetic tree using the multiple algnment output using RAxML...")
    try:
        # Options shared by all three invocations.
        common = dict(sequences="trimmed_muscleout.aln", threads=NPROC, cmd=raxml_exe)

        #best_likelihood
        print("1/3")
        raxml_cline = RaxmlCommandline(model="GTRGAMMA", name="T1",
                                       parsimony_seed=12345,
                                       num_replicates=BOOTSTRAP, **common)
        raxml_cline()

        #bootstrap search
        print("2/3")
        raxml_cline = RaxmlCommandline(model="GTRGAMMA", name="T2",
                                       parsimony_seed=12345,
                                       num_replicates=BOOTSTRAP,
                                       bootstrap_seed=12345, **common)
        raxml_cline()

        #draw bipartition
        print("3/3")
        raxml_cline = RaxmlCommandline(model="GTRCAT", name="T3.nwk",
                                       algorithm="b",
                                       starting_tree="RAxML_bestTree.T1",
                                       bipartition_filename="RAxML_bootstrap.T2",
                                       **common)
        raxml_cline()
    except Exception as e:
        print(str(e))
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)
def _tail_deletion_start(alns):
    """Return the start column of the best-scoring gap-free region of `alns`."""
    n_cols = len(alns[0])
    regions = []
    start = None
    for col in range(n_cols):
        if all(rec[col] != '-' for rec in alns):
            if start is None:
                start = col
        elif start is not None:
            regions.append((start, col))
            start = None
    if start is not None:
        # End is exclusive, like the regions closed by a gap column above.
        regions.append((start, n_cols))
    if not regions:
        raise ValueError('alignment has no gap-free column; tail deletion is not possible')
    # Score favours long regions that start early in the alignment.
    best_start, _ = max(regions, key=lambda reg: (reg[1] - reg[0]) / (reg[0] + 1))
    return best_start


def create_tree_from_seqs(filename, out, outgroup='', model='GTRGAMMA', tail_deletion=False, n=40,
                          delete_outgroup=True):
    """
    Build a tree from a FASTA alignment with RAxML and return it as an ete3 tree.

    RAxML runs in the current directory. Every "RAxML_*" file whose name
    contains the run name is removed afterwards, except the best tree, which
    is moved to `out`. A stray "tmp.dnd" file is removed as well.

    :param filename: path to the FASTA alignment
    :param out: path the final tree is written to; the RAxML run name is the
        second value returned by `parse_path(out)` (presumably the file name)
    :param outgroup: passed to `outgroup_tree` unless it is None
    :param model: RAxML substitution model
    :param tail_deletion: if True, columns before the best-scoring gap-free
        region are cut off and RAxML runs on a temporary "<filename>.tmp.fasta"
    :param n: number of RAxML replicates
    :param delete_outgroup: forwarded to `outgroup_tree`
    :return: the tree re-read from `out`
    :raises ValueError: with `tail_deletion`, when no column is gap-free
    """
    _, output_filename = parse_path(out)

    raxml_input = filename
    if tail_deletion:
        alns = AlignIO.read(filename, 'fasta')
        start = _tail_deletion_start(alns)
        raxml_input = filename + '.tmp.fasta'
        AlignIO.write(MSA([rec[start:] for rec in alns]), raxml_input, 'fasta')

    try:
        RaxmlCommandline(sequences=raxml_input, model=model, parsimony_seed=2018,
                         num_replicates=n, name=output_filename)()
    finally:
        # Drop the temporary alignment even when RAxML fails.
        if tail_deletion:
            os.remove(raxml_input)

    for produced in glob.glob("RAxML_*"):
        if output_filename in produced:
            if produced.endswith('bestTree.' + output_filename):
                os.rename(produced, out)
            else:
                os.remove(produced)
    if os.path.isfile('tmp.dnd'):
        os.remove('tmp.dnd')

    tree = ete3.Tree(out)
    os.remove(out)
    if outgroup is not None:
        outgroup_tree(tree, outgroup, delete_outgroup)
    tree.write(outfile=out)
    return ete3.Tree(out)
def generate_dist(self):
    """
    Align `self.fasta_seq`, build a RAxML tree and export its distance matrix.

    MAFFT's PHYLIP output is saved to "testing/alignment.phy" and converted
    to FASTA in "testing/align.fasta". RAxML (GTRGAMMA, run name
    "reversatest") is run with `self.cwPath` as working directory, and the
    phylogenetic distance matrix of the resulting tree is written to
    "distance.csv". Returns None.
    """
    mafft_cline = MafftCommandline(input=self.fasta_seq,
                                   maxiterate=1000,
                                   localpair=True,
                                   phylipout=True)
    stdout, _ = mafft_cline()

    #Save alignments into FASTA and PHYLIP format
    phyFile = 'testing/alignment.phy'
    with open(phyFile, 'w') as outPhy:
        outPhy.write(stdout)
    fastaFile = 'testing/align.fasta'
    SeqIO.convert(phyFile, 'phylip', fastaFile, 'fasta')

    #Create phylogenetic tree of the original sequences
    raxml_cline = RaxmlCommandline(sequences=phyFile,
                                   model='GTRGAMMA',
                                   name='reversatest',
                                   working_dir=self.cwPath)
    raxml_cline()

    #Calculate the phylo distances between each branch of the tree
    # NOTE(review): the result is read from "testing/", so self.cwPath is
    # presumably that directory - confirm.
    tree = dendropy.Tree.get_from_path("testing/RAxML_result.reversatest", "newick")
    pdm = tree.phylogenetic_distance_matrix()
    pdm.write_csv('distance.csv')
time.sleep(3) sys.exit() print("Multiple sequence alignment complete") #code block executes or skips trimal if trimal == 'yes': print("Optimization in progress") subprocess.call(['trimal', '-in', out_file, '-out', out_file2]) print("Alignment optimization complete") elif trimal == 'no': out_file2 = out_file else: print("error: invalid input, terminating program") time.sleep(3) sys.exit() print("Beginning phylogenetic tree construction") #code block executes RAxML tree construction raxml_cline = RaxmlCommandline(sequences=out_file2, model="PROTCATWAG", name=f+".nwk") child = subprocess.call(str(raxml_cline), shell=(sys.platform!=platform)) #i variable is used to count the number of files that are iterated over i = i + 1 twoLines() print("Total number of files analyzed: "+str(i)) twoLines() #sys.exit() is not used, this way commands and selections can be copied and pasted for documentation
def roundTwo(self):
    """
    Re-run placement with each query sequence left out of the reference.

    After `self.roundOne()`, for every record of every FASTA file in
    `self.query_name`:
      * unless "testing/alignment minus<seq>.phy" already exists, write
        "testing/alignment.phy" without the record whose id equals <seq>,
        convert it to FASTA and build a RAxML reference tree from it;
      * add the query file to that FASTA alignment with `mafft --add`;
      * unless the .jplace file already exists under "pplacer/", record its
        name in `self.jfileMinus` and run pplacer.

    <seq> is the part of the record id before the first "_"; the cluster id
    is the third "_"-separated field of the query file name.

    :return: tuple `(self.jfile, self.jfileMinus)`
    """
    self.roundOne()
    self.jfileMinus = []
    pattern = re.compile(r'_')
    for j in self.query_name:
        clust_id = pattern.split(j)[2]
        for query in SeqIO.parse(j, 'fasta'):
            seq = pattern.split(query.id)[0]

            #Create special identifier for each round of files
            number = 'minus%s' % seq
            id_jfile = '%s_minus%s' % (clust_id, seq)
            rax_name = 'reversatest%s' % number
            fasta_name = 'testing/align%s.fasta' % number
            phy_name = 'testing/alignment%s.phy' % number

            if not os.path.isfile(phy_name):
                with open('testing/alignment.phy') as openPhy:
                    record = AlignIO.read(openPhy, 'phylip')
                edited = MultipleSeqAlignment([])
                for rec in record:
                    if rec.id != seq:
                        edited.append(rec)

                #write the alignment minus a sequence
                with open(phy_name, 'w') as out:
                    AlignIO.write(edited, out, 'phylip')

                #convert PHYLIP to FASTA format
                SeqIO.convert(phy_name, 'phylip', fasta_name, 'fasta')

                #Create reference tree
                raxml_line = RaxmlCommandline(sequences=phy_name,
                                              model='GTRGAMMA',
                                              name=rax_name,
                                              working_dir=self.cwPath)
                raxml_line()

            #Add query sequences to the previous alignment
            multiali_name = 'testing/multiple_ali%s.fasta' % id_jfile
            # NOTE(review): this checks an "alignment<id>.phy" file that this
            # method never writes; possibly multiali_name was meant - confirm.
            if not os.path.isfile('testing/alignment%s.phy' % id_jfile):
                os.system('mafft --add %s --quiet --reorder %s >%s'
                          % (j, fasta_name, multiali_name))

            jason_name = 'multiple_ali%s.jplace' % id_jfile

            #wrap pplacer
            if not os.path.isfile('pplacer/%s' % jason_name):
                self.jfileMinus.append(jason_name)
                os.system(
                    'pplacer --out-dir pplacer -p -t testing/RAxML_result.%s -s testing/RAxML_info.%s %s'
                    % (rax_name, rax_name, multiali_name))

    print(self.jfile)
    print(self.jfileMinus)
    return self.jfile, self.jfileMinus
# Align a user-supplied sequence file with MUSCLE, then build a tree from
# the alignment with RAxML.
from Bio.Align.Applications import MuscleCommandline
from Bio.Phylo.Applications import RaxmlCommandline

#Muscle Alignment
#Path relative to project
in_file = input('file_name= ')
out_file = "align.fasta"
muscle_exe = "muscle"

#using default fasta output
cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file)
# Building the command object does not run it; execute MUSCLE so that
# out_file exists before RAxML reads it.
cline()

#creating best tree file
raxml_cline = RaxmlCommandline(sequences=out_file, model="PROTCATWAG", name="tree")
# The first positional argument of a command-line call is stdin, so the
# alignment path is given through `sequences` only.
raxml_cline()