def setUp(self):
    """Create a fresh, unconfigured ``Codeml`` object and keep it on ``self.cml``."""
    fresh_wrapper = codeml.Codeml()
    self.cml = fresh_wrapper
#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
"""Configure and run PAML's codeml through Biopython.

Positional command-line arguments, in order:
    1. working directory
    2. alignment (sequence) file
    3. tree file
    4. value for the codeml ``model`` option
    5. output file
"""

__author__ = 'Serafina Nieves'
__email__ = '*****@*****.**'

import sys

from Bio.Phylo.PAML import codeml

# Fail with a readable usage line instead of an IndexError traceback when
# one of the five positional arguments is missing.  Extra arguments are
# ignored, exactly as before.
if len(sys.argv) < 6:
    sys.exit(
        "usage: {} WORKING_DIR ALIGNMENT TREE MODEL OUT_FILE".format(sys.argv[0])
    )

# sys.argv entries are already strings, so no str() conversion is needed.
wdir, seqfile, treefile, mod, outfile = sys.argv[1:6]

cml = codeml.Codeml(
    working_dir=wdir,
    alignment=seqfile,
    tree=treefile,
    out_file=outfile,
)

cml.set_options(
    noisy=9,
    verbose=1,
    runmode=0,
    seqtype=1,
    CodonFreq=2,
    ndata=0,
    clock=0,
    aaDist=0,
    model=mod,  # passed through verbatim from the command line
    NSsites=[0],
    icode=0,
    Mgene=0,
    fix_kappa=0,
    kappa=2,
    fix_omega=0,
    omega=1,
    fix_alpha=1,
    alpha=0.,
    Malpha=0,
    ncatG=8,
    getSE=0,
    RateAncestor=0,
    Small_Diff=.5e-6,
    cleandata=1,
    fix_blength=1,
    method=0,
)

# Echo the effective options before starting the run.
cml.print_options()
cml.run(verbose=True)
logging.info("Starting the run. The run settings can be checked in the control file created...")

for file in os.listdir(input_dir):
    # Only pal2nal codon alignments are processed.
    if not file.endswith(".pal2nal"):
        continue

    # Strip the extension exactly once, from the end of the name.  The
    # previous regex (".pal2nal" with an unescaped dot, unanchored) and the
    # str.replace() call could both rewrite matches in the middle of a name.
    stem = file[:-len(".pal2nal")]
    codeml_output = stem + ".cml.out"
    cog_in_turn = stem

    alignment = os.path.join(input_dir, file)
    tree = "COG0012.mod.nwk"  # Needed as input but not used. So you can use any tree.

    print("Your input files are: ", alignment, "and", tree)
    logging.info("Your input files are: {0} and {1}.".format(alignment, tree))

    # Let's run codeml!
    print("Running codeml for COG:", cog_in_turn)
    logging.info("Running codeml for COG: {0}".format(cog_in_turn))

    cml = codeml.Codeml()
    cml.alignment = alignment
    cml.tree = tree
    cml.out_file = codeml_output
    cml.working_dir = input_dir

    # Setting options (one call; the keyword arguments are independent).
    cml.set_options(
        noisy=1,       # How much rubbish on the screen
        verbose=0,     # How many details on the screen
        runmode=-2,    # Pairwise comparison
        seqtype=1,     # Codon sequences
        CodonFreq=1,   # Equilibrium codon frequencies are calculated from
                       # the average nucleotide frequencies
        clock=0,       # No clock; rates are free to vary from branch to branch
        model=1,       # Compute an omega value for each branch
        NSsites=[0],   # This model fits with the CodonFreq used
        icode=0,       # Universal genetic code
    )
def _write_matching_records(source_fasta, record_id, destination):
    """Write the records of *source_fasta* whose id equals *record_id* to *destination*."""
    # The context manager closes the input handle, which the previous bare
    # open(...) call never did.
    with open(source_fasta) as handle:
        matches = (rec for rec in SeqIO.parse(handle, 'fasta') if rec.id == record_id)
        SeqIO.write(matches, destination, "fasta")


def _concatenate_files(sources, destination):
    """Append the raw bytes of every file in *sources*, in order, to a new *destination*."""
    with open(destination, 'wb') as merged:
        for source in sources:
            with open(source, 'rb') as part:
                shutil.copyfileobj(part, merged)


def alignGene(line):
    """Estimate pairwise dN/dS for one gene pair and append it to ``outfile``.

    *line* is split on single spaces; field 1 and 6 are the CDS ids,
    field 2 and 7 the protein ids and field 4 the gene name.  Sequences are
    read from the module-level ``seq1``/``seq2`` (protein) and
    ``seq1_nucl``/``seq2_nucl`` (nucleotide) FASTA files.  All intermediate
    files live in ``scratch/`` and are removed after a successful run.

    The function is best-effort: on any error it reports the problem on
    stderr and returns ``None`` without raising, so one bad pair does not
    stop a batch.
    """
    import subprocess
    import sys

    try:
        fields = line.split(" ")
        gene1, gene2 = fields[2], fields[7]
        print(gene1)
        print(gene2)
        cds1, cds2 = fields[1], fields[6]
        geneName = fields[4]

        prefix = "scratch/" + geneName

        # Make file with both protein sequences
        _write_matching_records(seq1, gene1, "scratch/" + gene1 + ".fa")
        _write_matching_records(seq2, gene2, "scratch/" + gene2 + ".fa")
        _concatenate_files(
            ["scratch/" + gene1 + ".fa", "scratch/" + gene2 + ".fa"],
            prefix + ".fa",
        )

        # Make file with both nucleotide (CDS) sequences
        _write_matching_records(seq1_nucl, cds1, "scratch/" + cds1 + ".fa")
        _write_matching_records(seq2_nucl, cds2, "scratch/" + cds2 + ".fa")
        _concatenate_files(
            ["scratch/" + cds1 + ".fa", "scratch/" + cds2 + ".fa"],
            prefix + "_nucl.fa",
        )

        # Make tree file
        cline = ClustalwCommandline("clustalw2",
                                    infile=prefix + ".fa",
                                    newtree=prefix + 'tree.tre')
        stdout, stderr = cline()

        # Make alignment file
        cline = ClustalwCommandline("clustalw2",
                                    infile=prefix + ".fa",
                                    output="CLUSTAL",
                                    outfile=prefix + 'alignment.aln')
        stdout, stderr = cline()

        # Run pal2nal.  An argument list (no shell) keeps ids containing
        # shell metacharacters such as "|" from corrupting the command.
        with open(prefix + "alignment_nucl.phy", 'wb') as phylip:
            subprocess.call(
                ["perl", "pal2nal.pl", prefix + "alignment.aln",
                 prefix + "_nucl.fa", "-output", "paml"],
                stdout=phylip,
            )

        cml = codeml.Codeml()
        cml.alignment = prefix + 'alignment_nucl.phy'
        cml.working_dir = "./scratch"
        cml.tree = prefix + 'tree.tre'
        cml.out_file = prefix + 'out.txt'
        cml.set_options(seqtype=1, verbose=1, noisy=0, model=1, runmode=-2,
                        Mgene=0, NSsites=[0], CodonFreq=2, cleandata=1)
        cml.run(verbose=False)

        # Copy every reported dN/dS value into the shared results table.
        with open(cml.out_file) as results, open(outfile, 'a') as out:
            for result_line in results:
                if "dN/dS=" in result_line:
                    tokens = result_line.split()
                    value = tokens[tokens.index("dN/dS=") + 1]
                    out.write(geneName + '\t' + str(value) + '\n')

        # Remove this pair's scratch files.  This mirrors the former
        # "rm scratch/*NAME* scratch/CDS*" shell globs without a shell.
        for entry in os.listdir("scratch"):
            if entry.startswith("."):
                continue  # a leading "*" glob never matched hidden files
            if (geneName in entry or gene1 in entry or gene2 in entry
                    or entry.startswith(cds1) or entry.startswith(cds2)):
                try:
                    os.remove(os.path.join("scratch", entry))
                except OSError:
                    # e.g. a directory: plain "rm" skipped those as well
                    pass
    except Exception as exc:
        # Deliberately non-fatal, but no longer silent, and no longer
        # swallowing KeyboardInterrupt / SystemExit.
        sys.stderr.write("alignGene: skipping %r (%s: %s)\n"
                         % (line, type(exc).__name__, exc))
        return
def __call__(self):
    """Run codeml on ``self.align`` with ``self.tree`` and return ``(self.align, result)``.

    NOTE(review): this reads ``self.align``/``self.tree`` and is presumably a
    method of the ``CodeML`` task class instantiated in
    ``MultiProcess.doCodeML``; the class header is outside this excerpt.

    codeml runs inside ``paml/<alignment file name>/``; its output, control
    file and the ``rst``/``rst1`` files end up in ``tmpcodeml/``.  ``result``
    is ``None`` when the run or the file moves fail.
    """
    from Bio.Phylo.PAML import codeml
    import os

    name = os.path.split(self.align)[-1]
    workdir = 'paml/%s' % name

    # 'tmpcodeml' is created too: the output, control and rst files below all
    # target it, and a missing directory would make every task fail.
    for directory in ('paml', workdir, 'tmpcodeml'):
        try:
            os.mkdir(directory)
        except OSError:
            # Usually "already exists"; any real problem surfaces in run().
            pass

    cml = codeml.Codeml(alignment=self.align,
                        tree=self.tree,
                        out_file="tmpcodeml/%s.out" % name,
                        working_dir=workdir)
    cml.set_options(NSsites="1 2", seqtype=1, model=0, RateAncestor=1)
    cml.ctl_file = "../../tmpcodeml/%s.ctl" % name

    try:
        res = cml.run()
        shutil.move('%s/rst' % workdir, "tmpcodeml/%s.rst" % name)
        shutil.move('%s/rst1' % workdir, "tmpcodeml/%s.rst1" % name)
    except Exception:
        # A failed alignment is reported through the None result, not raised.
        res = None
    return (self.align, res)


class Consumer(multiprocessing.Process):
    """Worker process: runs callables from a task queue until it gets ``None``."""

    def __init__(self, task_queue=None, result_queue=None):
        multiprocessing.Process.__init__(self)
        # Queues are created per instance.  Building them in the signature
        # made one pair at class-definition time, shared by every Consumer
        # constructed without arguments.
        self.task_queue = multiprocessing.Queue() if task_queue is None else task_queue
        self.result_queue = multiprocessing.Queue() if result_queue is None else result_queue

    def run(self):
        """Call each queued task and put its return value on the result queue."""
        while True:
            next_task = self.task_queue.get()
            time.sleep(0.01)
            if next_task is None:
                # Poison pill means we should exit
                break
            answer = next_task()
            self.result_queue.put(answer)
        return


class MultiProcess(object):
    '''
    Class MultiProcess
    An object that can perform multiprocesses
    '''

    def __init__(self, ncpus=1):
        self.ncpus = int(ncpus)

        # Parallelization
        self._parallel = None
        self._paralleltasks = Queue()
        self._parallelresults = Queue()

    def initiateParallel(self):
        """Create and start ``ncpus`` Consumer workers on the shared queues."""
        self._parallel = [Consumer(self._paralleltasks, self._parallelresults)
                          for x in range(self.ncpus)]
        for consumer in self._parallel:
            consumer.start()

    def addPoison(self):
        """Queue one ``None`` sentinel per worker so each of them exits."""
        for consumer in self._parallel:
            self._paralleltasks.put(None)

    def isTerminated(self):
        """Return True once no worker process is alive."""
        return not any(consumer.is_alive() for consumer in self._parallel)

    def killParallel(self):
        """Terminate every worker process."""
        for consumer in self._parallel:
            consumer.terminate()

    def _collectResults(self, redo, dres, done):
        """Drain the result queue once and return the updated success count.

        Failed alignments are logged and written to *redo*; successful ones
        are logged and stored in *dres* under their alignment path.
        """
        while not self._parallelresults.empty():
            result = self._parallelresults.get()
            if not result[1]:
                msg(result[0], 'ERR')
                redo.write('%s\n' % result[0])
            else:
                msg('%s %d' % (result[0], done), 'IMP')
                # Keep the parsed result: the dictionary was previously
                # returned (and dumped to JSON) without ever being filled.
                dres[result[0]] = result[1]
                done += 1
        return done

    def doCodeML(self, indir, tree):
        """Run a ``CodeML`` task for every ``*.phy`` file in *indir*.

        Returns a dict mapping alignment path to the parsed codeml result.
        Alignments that failed are listed in ``codemlfail.txt``.
        """
        done = 0
        dres = {}

        # "with" guarantees the failure list is flushed and closed.
        with open('codemlfail.txt', 'w') as redo:
            self.initiateParallel()

            for f in os.listdir(indir):
                if f[-4:] != '.phy':
                    continue
                align = os.path.join(indir, f)
                obj = CodeML(indir, align, tree)
                self._paralleltasks.put(obj)

            # Poison pill to stop the workers
            self.addPoison()

            while True:
                done = self._collectResults(redo, dres, done)
                if self.isTerminated():
                    break
                time.sleep(0.1)

            # Get the last messages
            done = self._collectResults(redo, dres, done)

            self.killParallel()

        return dres


class Highlighter:
    """Wrap messages in ANSI colour escape codes chosen by message level."""

    def __init__(self):
        self._msgTypes = {'INF': '\033[0m',
                          'IMP': '\033[1;32m',
                          'DEV': '\033[1;34m',
                          'ERR': '\033[1;31m',
                          'WRN': '\033[1;33m'}
        self._reset = '\033[0m'
        self._default = 'INF'

    def ColorMsg(self, msg, msgLevel='INF'):
        """Return *msg* coloured for *msgLevel*; unknown levels use the default."""
        prefix = self._msgTypes.get(msgLevel, self._msgTypes[self._default])
        return prefix + msg + self._reset


def msg(message, msgLevel='INF', sameline=False):
    """Write a coloured message to stderr.

    With *sameline* the cursor returns to the start of the line and no
    newline is added; otherwise the message is timestamped and terminated.
    """
    o = Highlighter()
    if sameline:
        sys.stderr.write('\r')
    else:
        sys.stderr.write(strftime("%H:%M:%S") + ' ')
    sys.stderr.write(o.ColorMsg(message, msgLevel))
    if not sameline:
        sys.stderr.write('\n')


def creturn():
    """Write a bare newline to stderr."""
    sys.stderr.write('\n')


def getOptions():
    '''Retrieve the options passed from the command line'''
    usage = "usage: python parallelPAML.py [options]"
    parser = OptionParser(usage)

    group1 = OptionGroup(parser, "Inputs")
    group1.add_option('-a', '--aligndir', action="store", dest='align',
                      default='OUT',
                      help='Alignment directory')
    group1.add_option('-t', '--tree', action="store", dest='tree',
                      default='TREE.nwk',
                      help='Tree file')
    group1.add_option('-r', '--threads', action="store", dest='threads',
                      default=1,
                      help='Threads [Default: 1]')
    parser.add_option_group(group1)

    # Parse the options
    return parser.parse_args()


# The guard keeps worker processes (which re-import this module under the
# "spawn" start method) from re-running the whole pipeline.
if __name__ == '__main__':
    (options, args) = getOptions()

    dres = MultiProcess(options.threads).doCodeML(options.align, options.tree)

    import json
    with open('codemlresults.out', 'w') as handle:
        json.dump(dres, handle)
# @author Emily Huntsman BC'21
# under the guidance of Professors Jon Snow and Allison Lopatkin
# @version May 20, 2021
from Bio.Phylo.PAML import codeml
import os

# Insert the names of your alignment and tree files below.
cml = codeml.Codeml(
    alignment="IRE_NT.phylip",
    tree="IRE_NT.trees",
    out_file="results.out",
    working_dir=os.getcwd(),  # run in the current directory
)

# Specifications from Professor Lopatkin are reflected in codeml.ctl; they
# can be adjusted according to the PAML manual.
cml.read_ctl_file("codeml.ctl")
cml.print_options()

# Change the command to reflect the path to your paml executable.  It can be
# found by navigating into paml4.8/bin and typing pwd (print working
# directory) in the command line.
results = cml.run(
    command="/Users/annhuntsman/Desktop/PAML_Python/paml4.8/bin/codeml",
    verbose=True,
)
# If prompted in the terminal, respond accordingly (usually pressing enter).

# Omega for selection value
site_model_zero = results['NSsites'][0]
omega = site_model_zero['parameters']['omega']
print("omega: {}".format(omega))
# FPAML3.py: Runs PAML as in FPAML.py, but for the 70% Gapped Sequence ONLY
# Non 'gap' + SEQNAME folders will give errors

##### INITIALIZATION #####
from Bio.Phylo.PAML import codeml  # Utilizing CodeML from BioPython
import glob
import os

cml = codeml.Codeml()  # CodeML object shared by all analyses

# CodeML options applied to every analysis, set in a single call.
cml.set_options(
    verbose=0,
    CodonFreq=2,
    cleandata=0,
    fix_blength=0,
    NSsites=[0, 1, 2, 7, 8],
    fix_omega=0,
    clock=1,
    ncatG=2,
    runmode=0,
    fix_kappa=0,
    fix_alpha=1,
    Small_Diff=5e-7,
    method=1,
    Malpha=0,
    aaDist=0,
    RateAncestor=0,
    icode=0,
    alpha=0.0,
    seqtype=1,
    omega=0.4,
    getSE=0,
)
def _summarize_site_models(results, label):
    """Return ``{"lnL": ..., "site_classes": ...}`` from parsed codeml *results*.

    When several NSsites entries are present the last one iterated wins.
    Raises ``ValueError`` if *results* holds no NSsites entries, instead of
    letting the caller reuse a summary from a previous model.
    """
    ns_sites = results.get("NSsites")
    if not ns_sites:
        raise ValueError(
            "codeml returned no NSsites results for model %s" % label)

    summary = None
    for site in ns_sites:
        entry = ns_sites[site]
        parameters = entry.get("parameters")
        summary = {"lnL": entry.get("lnL"),
                   "site_classes": parameters.get("site classes")}
    return summary


def ma_m1a(alignment, tree, output_dir, working_dir):
    """
    Run PAML (codeml) on the given tree under models MA and M1a, with these
    options:

    model = 2
    NSsites = 2
    fix_omega = 0 (for Ma) and 1 (for M1a)
    fix_blength = 1 -> The supplied tree should have branch lengths, and PAML
                       will use those as a starting point

    Output files are written to *output_dir*; their names are the base names
    of *alignment* and *tree*, each with its last four characters removed,
    followed by ".ma" or ".m1a".

    Returns a dictionary with keys "Ma" and "M1a", each holding the lnL value
    and the site_classes of that model.  Raises ValueError when a run yields
    no NSsites results.
    """
    from Bio.Phylo.PAML import codeml
    import os

    paml_results = dict()  # Store the results of the analysis

    # [:-4] drops the last four characters (e.g. a ".ext" suffix).
    out_stem = (output_dir + "/" + os.path.basename(alignment)[:-4]
                + os.path.basename(tree)[:-4])

    cml = codeml.Codeml()  # Setup PAML
    # Parameters to PAML
    cml.alignment = alignment
    cml.tree = tree
    cml.out_file = out_stem + ".ma"
    cml.working_dir = working_dir
    cml.set_options(seqtype=1, CodonFreq=2, clock=0, model=2, NSsites=[2],
                    fix_kappa=0, kappa=2, fix_omega=0, omega=5, verbose=1,
                    fix_blength=1)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Running codeml for model A in : %s" % os.path.basename(tree))
    results_ma = cml.run()
    # Parse the results for the first run
    paml_results["Ma"] = _summarize_site_models(results_ma, "Ma")

    # Run the second model
    print("Running codeml for model 1A in : %s" % os.path.basename(tree))
    # Parameters for the second model: omega is fixed at 1
    cml.out_file = out_stem + ".m1a"
    cml.set_options(seqtype=1, CodonFreq=2, clock=0, model=2, NSsites=[2],
                    fix_kappa=0, kappa=2, fix_omega=1, omega=1, verbose=1,
                    fix_blength=1)
    results_m1a = cml.run()
    # Parse the results for the second run
    paml_results["M1a"] = _summarize_site_models(results_m1a, "M1a")

    return paml_results
def reconstruct(df, id_col='uid', sequence_col='sequence', working_dir='',
                save_ancestors=False, altall_cutoff=0.2, infer_gaps=True,
                aaRatefile='lg', **kwargs):
    """Write codeml inputs for *df*, run codeml and parse its ``rst`` file.

    Parameters
    ----------
    df :
        Table exposing a ``.phylo`` accessor with ``to_fasta``/``to_newick``.
        A copy is used; the caller's object is not modified here.
    id_col, sequence_col :
        Column names passed to the writers as ids/taxa and sequences.
    working_dir :
        Directory (relative to the current directory) that receives the rate
        matrix, alignment, tree, control file and codeml output.
    save_ancestors, altall_cutoff, infer_gaps :
        Accepted for interface compatibility; not used in this function body.
    aaRatefile :
        Base name of the ``<name>.dat`` rate file shipped in the ``pyasr``
        package's ``dat`` directory.
    **kwargs :
        codeml options overriding the defaults defined below.

    Returns
    -------
    Whatever ``read_codeml_output(rst_path, df)`` returns.
    """
    df = df.copy()

    # Construct default arguments
    default_options = dict(verbose=9, CodonFreq=None, cleandata=0,
                           fix_blength=2, NSsites=None, fix_omega=None,
                           clock=None, ncatG=8, runmode=0, fix_kappa=None,
                           fix_alpha=1, Small_Diff=1.0e-6, method=0,
                           Malpha=None, aaDist=None, RateAncestor=2,
                           icode=None, alpha=None, seqtype=2, omega=None,
                           getSE=None, noisy=3, Mgene=None, kappa=None,
                           model=3, ndata=None)

    # Update default arguments in place.
    default_options.update(**kwargs)

    # ---------------- Prepare model ----------------
    # Copy model from package to project directory.
    path_to_model = pkg_resources.resource_filename(
        'pyasr', os.path.join('dat', '{}.dat'.format(aaRatefile)))
    model_file = '{}.dat'.format(aaRatefile)
    model_path = os.path.join(working_dir, model_file)
    shutil.copyfile(path_to_model, model_path)

    # ----------------------
    proj_path = os.path.join(os.getcwd(), working_dir)
    ali_path = os.path.join(working_dir, 'ali-to-reconstruct.phy')
    tree_path = os.path.join(working_dir, 'tree-to-reconstruct.phy')
    out_path = os.path.join(working_dir, 'results.txt')
    ctl_path = os.path.join(working_dir, 'codeml_options.ctl')
    rst_path = os.path.join(working_dir, 'rst')

    df.phylo.to_fasta(
        filename=ali_path,
        id_col=id_col,
        sequence_col=sequence_col,
    )
    df.phylo.to_newick(
        filename=tree_path,
        taxon_col=id_col,
        node_col=id_col,
        suppress_internal_node_labels=True,
    )
    # NOTE(review): this second call has no filename and its return value is
    # discarded; it looks redundant but is kept in case the accessor has side
    # effects -- confirm and remove.
    df.phylo.to_newick(
        taxon_col=id_col,
        node_col=id_col,
        suppress_internal_node_labels=True,
    )

    # Build and write out control file.
    cml = codeml.Codeml(alignment=ali_path, tree=tree_path,
                        out_file=out_path, working_dir=working_dir)
    cml.set_options(aaRatefile=model_file, **default_options)
    cml.ctl_file = ctl_path
    cml.write_ctl_file()

    # ----------------------
    # Run codeml inside the project directory via ``cwd=`` instead of an
    # os.chdir() pair: if launching codeml raises (e.g. binary not on PATH)
    # this process is no longer left stranded in proj_path.
    subprocess.run(['codeml', 'codeml_options.ctl'], cwd=proj_path)

    # ----------------------
    return read_codeml_output(rst_path, df)
import sys

from Bio.Phylo.PAML import codeml

# folder_path = sys.argv[1]
alignment_file = sys.argv[1]       # full path
tree_file = sys.argv[2]            # full path
m0_out = sys.argv[3]               # full output path
estimated_tree_name = sys.argv[4]
final_out = sys.argv[5]

# Run M0 model to get tree
cmlM0 = codeml.Codeml(
    alignment=alignment_file,
    tree=tree_file,
    out_file=m0_out,
)
# All M0 options in a single call.
cmlM0.set_options(
    seqtype=1,
    model=0,
    NSsites=[0],
    omega=0.5,
    CodonFreq=2,
    ndata=1,
    fix_alpha=1,
    Small_Diff=5e-7,
)

# Run the M0 model
cmlM0.run(command="/Users/kmoney/Documents/paml4.9e/bin/codeml")

# Get tree from m0 results: re-read the output file, then walk
# NSsites -> site model 0 -> "tree".
m0result = codeml.read(m0_out)
NSsites_dict = m0result.get("NSsites")
NSsites0_dict = NSsites_dict.get(0)
estimated_tree = NSsites0_dict.get("tree")
def parse_hogs(hoglist, model, basedir, verbose=True, multisite=False):
    """Parse codeml output for each HOG id and return the merged results.

    For every id in *hoglist* the files are looked up under
    ``basedir/<bucket>/<hog>/<hog>.codeml.<model>.ctl.out/``, where
    ``<bucket>`` is ``int(hog) % 100`` zero-padded to four digits.  The
    control file names the tree file; results are parsed with
    ``parse_multitree_multimodel_results`` when *multisite* is true and with
    ``parse_multitree_results`` otherwise.

    Returns a dict ``{hog: {'trees': ..., 'results': ...}}``.  HOGs whose
    control or results file cannot be read, or that have fewer trees than
    results, are reported and skipped.  An id occurring more than once has
    its later trees/results appended with their keys shifted.

    NOTE(review): the error messages reference ``pamldir``, which is not
    defined in this function -- presumably a module-level global; confirm.
    """
    final_results = {}

    for hog in hoglist:
        if verbose:
            print("Working on", hog, flush=True)

        # Directory layout, e.g. 0000/100/100.codeml.ancrec.ctl.out/
        toppath = "%04d" % (int(hog) % 100)
        hog_dir = "/".join((basedir, toppath, hog))
        fullpath = hog_dir + "/" + hog + ".codeml." + model + ".ctl.out"
        results_file = fullpath + "/" + model + ".out"
        control_file = fullpath + "/" + hog + ".codeml." + model + ".ctl"

        # Species tree is optional: fall back to None when it is missing.
        sptreepath = hog_dir + "/" + hog + ".final_spt.nwk"
        try:
            species_tree = Phylo.read(sptreepath, "newick")
        except FileNotFoundError:
            species_tree = None

        # The control file tells us which tree file codeml was run on.
        cml = codeml.Codeml()
        try:
            cml.read_ctl_file(control_file)
        except OSError:
            print("Couldn't parse file for", hog, "at",
                  pamldir + "/" + fullpath)
            continue
        tree_file = fullpath + "/" + cml.tree

        # Now process trees and results
        parsed_trees = parse_trees(tree_file, species_tree)
        try:
            parsed_results = (
                parse_multitree_multimodel_results(results_file)
                if multisite
                else parse_multitree_results(results_file)
            )
        except FileNotFoundError:
            print("Couldn't parse file for", hog, "at",
                  pamldir + "/" + fullpath)
            continue

        # Check that we have a result for each tree
        n_trees, n_results = len(parsed_trees), len(parsed_results)
        if n_trees < n_results:
            print("Warning, too few trees for number of results for", hog,
                  "in", results_file)
            continue
        if n_trees > n_results:
            # Remove trees that aren't in results
            parsed_trees = {key: parsed_trees[key] for key in parsed_results}

        if hog not in final_results:
            final_results[hog] = {
                'trees': parsed_trees,
                'results': parsed_results
            }
            continue

        # Append: shift the new keys (tree numbers) past the existing ones
        existing = final_results[hog]
        cur_len = len(existing['trees'])
        if cur_len != len(existing['results']):
            print("Warning, something went wrong!!")
        shifted_trees = {
            int(key) + cur_len: parsed_trees[key] for key in parsed_trees
        }
        shifted_results = {
            int(key) + cur_len: parsed_results[key] for key in parsed_results
        }
        existing['trees'].update(shifted_trees)
        existing['results'].update(shifted_results)

    return final_results
def free_ratios_worker(orthogroup, workingdir,
                       codeml_command="/Genomics/kocherlab/berubin/local/src/paml4.9e/bin/codeml"):
    """Run codeml with ``model=1`` and ``NSsites=[0]`` on one orthogroup.

    Parameters
    ----------
    orthogroup :
        Orthogroup identifier, interpolated into the file names below.
    workingdir :
        Directory holding ``og_cds_<orthogroup>.afa`` (alignment) and
        ``og_<orthogroup>.tree``; codeml writes ``og_<orthogroup>.alt`` there
        and runs inside ``og_<orthogroup>_working``.
    codeml_command :
        Path to the codeml executable.  Defaults to the previously
        hard-coded location, so existing two-argument calls are unaffected.

    Returns
    -------
    None.  The options are printed and the parsed run result is discarded.
    """
    cml = codeml.Codeml(
        alignment="%s/og_cds_%s.afa" % (workingdir, orthogroup),
        tree="%s/og_%s.tree" % (workingdir, orthogroup),
        out_file="%s/og_%s.alt" % (workingdir, orthogroup),
        working_dir="%s/og_%s_working" % (workingdir, orthogroup),
    )
    cml.set_options(runmode=0, fix_blength=0, seqtype=1, CodonFreq=2,
                    model=1, icode=0, clock=0, aaDist=0, Mgene=0,
                    fix_kappa=0, kappa=2, fix_omega=0, omega=1, getSE=0,
                    RateAncestor=0, cleandata=0, Small_Diff=.45e-6,
                    verbose=True)
    cml.set_options(NSsites=[0])
    cml.print_options()
    cml.run(command=codeml_command, verbose=True)