def setUp(self):
     """Create a ``codeml.Codeml`` object with no arguments and keep it on ``self.cml``."""
     self.cml = codeml.Codeml()
Esempio n. 2
0
#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-

__author__ = 'Serafina Nieves'
__email__ = '*****@*****.**'

from Bio.Phylo.PAML import codeml
import sys

# Positional command-line arguments, in order: working directory, alignment
# file, tree file, value for codeml's ``model`` option, output file.
wdir, seqfile, treefile, mod, outfile = (str(sys.argv[i]) for i in range(1, 6))

# Every control-file option passed to codeml for this run.
codeml_options = dict(
    noisy=9, verbose=1, runmode=0, seqtype=1, CodonFreq=2,
    ndata=0, clock=0, aaDist=0, model=mod, NSsites=[0], icode=0,
    Mgene=0, fix_kappa=0, kappa=2, fix_omega=0, omega=1,
    fix_alpha=1, alpha=0., Malpha=0, ncatG=8, getSE=0, RateAncestor=0,
    Small_Diff=.5e-6, cleandata=1, fix_blength=1, method=0,
)

cml = codeml.Codeml(
    working_dir=wdir,
    alignment=seqfile,
    tree=treefile,
    out_file=outfile,
)
cml.set_options(**codeml_options)

# Show the effective options, then launch codeml with its output echoed.
cml.print_options()
cml.run(verbose=True)
Esempio n. 3
0
logging.info("Starting the run. The run settings can be checked in the control file created...")
for file in os.listdir(input_dir):
    if not file.endswith(".pal2nal"):
        continue

    # Slice the suffix off instead of using re.sub(".pal2nal", ...): in a regex
    # the unescaped "." matches any character and the substitution is applied
    # to every occurrence, not just the trailing extension.
    cog_in_turn = file[:-len(".pal2nal")]
    codeml_output = cog_in_turn + ".cml.out"
    alignment = os.path.join(input_dir, file)
    tree = "COG0012.mod.nwk"  # Needed as input but not used. So you can use any tree.

    print("Your input files are: ", alignment, "and", tree)
    logging.info("Your input files are: {0} and {1}.".format(alignment, tree))

    # Let's run codeml!
    print("Running codeml for COG:", cog_in_turn)
    logging.info("Running codeml for COG: {0}".format(cog_in_turn))
    cml = codeml.Codeml()
    cml.alignment = alignment
    cml.tree = tree
    cml.out_file = codeml_output
    cml.working_dir = input_dir

    # Setting options
    cml.set_options(
        noisy=1,      # How much rubbish on the screen
        verbose=0,    # How many details on the screen
        runmode=-2,   # I set pairwise comparison
        seqtype=1,    # I am using codons
        CodonFreq=1,  # The equilibrium codon frequencies in the codon substitution model will be calculated from the average nucleotides frequencies
        clock=0,      # No clock and rates are entirely free to vary from branch to branch
        model=1,      # I set to compute an omega value for each branch
        NSsites=[0],  # This model fits with the CodonFreq used
        icode=0,      # I set the universal code
    )
Esempio n. 4
0
def _write_matching_records(fasta_path, record_id, out_path):
    """Write the FASTA records of ``fasta_path`` whose id equals ``record_id`` to ``out_path``."""
    # The handle is closed as soon as SeqIO.write has consumed the generator.
    with open(fasta_path) as handle:
        matches = (rec for rec in SeqIO.parse(handle, 'fasta')
                   if rec.id == record_id)
        SeqIO.write(matches, out_path, "fasta")


def _concatenate_files(part_paths, out_path):
    """Concatenate the files in ``part_paths``, in order, into ``out_path`` (binary copy)."""
    with open(out_path, 'wb') as wfd:
        for part in part_paths:
            with open(part, 'rb') as fd:
                shutil.copyfileobj(fd, wfd)


def alignGene(line):
    """Compute dN/dS for one gene pair with codeml and append it to ``outfile``.

    ``line`` is split on single spaces. Fields 1 and 6 are used as the ids
    looked up in ``seq1_nucl`` / ``seq2_nucl``, fields 2 and 7 as the ids
    looked up in ``seq1`` / ``seq2``, and field 4 as the gene name that labels
    the intermediate files and the output row.

    Steps: extract the two sequences from each pair of FASTA files, build a
    tree and an alignment with clustalw2, convert the alignment with
    pal2nal.pl, run codeml in pairwise mode, then append
    ``<geneName>\\t<dN/dS>`` to ``outfile`` for every output line containing a
    ``dN/dS=`` token. Intermediate files under ``scratch/`` are removed on
    success only.

    Relies on the module-level names ``seq1``, ``seq2``, ``seq1_nucl``,
    ``seq2_nucl`` and ``outfile``.

    Returns:
        None. Processing is best-effort: any error abandons this gene pair,
        writes a one-line warning to stderr, and returns.
    """
    import sys  # local import: only needed for the failure warning

    try:
        fields = line.split(" ")
        gene1 = fields[2]
        gene2 = fields[7]
        print(gene1)
        print(gene2)
        cds1 = fields[1]
        cds2 = fields[6]
        geneName = fields[4]

        # Make file with both protein sequences
        _write_matching_records(seq1, gene1, "scratch/" + gene1 + ".fa")
        _write_matching_records(seq2, gene2, "scratch/" + gene2 + ".fa")
        _concatenate_files(
            ["scratch/" + gene1 + ".fa", "scratch/" + gene2 + ".fa"],
            "scratch/" + geneName + ".fa")

        # Make file with both nucleotide (CDS) sequences
        _write_matching_records(seq1_nucl, cds1, "scratch/" + cds1 + ".fa")
        _write_matching_records(seq2_nucl, cds2, "scratch/" + cds2 + ".fa")
        _concatenate_files(
            ["scratch/" + cds1 + ".fa", "scratch/" + cds2 + ".fa"],
            "scratch/" + geneName + "_nucl.fa")

        # Make tree file
        cline = ClustalwCommandline("clustalw2",
                                    infile="scratch/" + geneName + ".fa",
                                    newtree='scratch/' + geneName + 'tree.tre')
        stdout, stderr = cline()

        # Make alignment file
        cline = ClustalwCommandline("clustalw2",
                                    infile="scratch/" + geneName + ".fa",
                                    output="CLUSTAL",
                                    outfile='scratch/' + geneName +
                                    'alignment.aln')
        stdout, stderr = cline()

        # Run pal2nal
        # NOTE(review): this command and the cleanup below are built by string
        # concatenation and run through the shell; ids containing shell
        # metacharacters would be interpreted by it. Confirm inputs are trusted.
        cmd = "perl pal2nal.pl scratch/" + geneName + "alignment.aln scratch/" + geneName + "_nucl.fa -output paml > scratch/" + geneName + "alignment_nucl.phy"
        os.system(cmd)

        cml = codeml.Codeml()
        cml.alignment = 'scratch/' + geneName + 'alignment_nucl.phy'
        cml.working_dir = "./scratch"
        cml.tree = 'scratch/' + geneName + 'tree.tre'
        cml.out_file = 'scratch/' + geneName + 'out.txt'
        cml.set_options(seqtype=1,
                        verbose=1,
                        noisy=0,
                        model=1,
                        runmode=-2,
                        Mgene=0,
                        NSsites=[0],
                        CodonFreq=2,
                        cleandata=1)
        cml.run(verbose=False)

        with open(cml.out_file) as results:
            with open(outfile, 'a') as out:
                for row in results:
                    if row.find("dN/dS=") > -1:
                        tokens = row.split()
                        # The value is the token right after the "dN/dS=" label.
                        out.write(geneName + '\t' +
                                  str(tokens[tokens.index("dN/dS=") + 1]) + '\n')

        cmd = "rm scratch/*" + geneName + "* scratch/" + cds1 + "* scratch/" + cds2 + "* scratch/*" + gene1 + "* scratch/*" + gene2 + "*"
        os.system(cmd)
    except Exception as err:
        # Stay best-effort, but let KeyboardInterrupt/SystemExit propagate and
        # leave a trace of what was skipped instead of failing silently.
        sys.stderr.write("alignGene: skipping %r: %s\n" % (line, err))
        return
Esempio n. 5
0
def __call__(self):
    """Run codeml on ``self.align`` with ``self.tree``.

    The working directory is ``paml/<alignment file name>``; the codeml output
    file and control file are placed under ``tmpcodeml/``. After a successful
    run the ``rst`` and ``rst1`` files are moved from the working directory to
    ``tmpcodeml/``.

    Returns:
        tuple: ``(self.align, res)`` where ``res`` is whatever ``cml.run()``
        returned, or ``None`` if running codeml or moving its files failed.
    """
    from Bio.Phylo.PAML import codeml
    import os
    import shutil

    name = os.path.split(self.align)[-1]
    workdir = 'paml/%s' % name

    for directory in ('paml', workdir):
        try:
            os.mkdir(directory)
        except OSError:
            # Typically "already exists". Any other problem surfaces when
            # codeml runs below and is reported as a failed result.
            pass

    cml = codeml.Codeml(alignment=self.align, tree=self.tree,
                        out_file="tmpcodeml/%s.out" % name,
                        working_dir=workdir)
    cml.set_options(NSsites="1 2", seqtype=1, model=0, RateAncestor=1)
    cml.ctl_file = "../../tmpcodeml/%s.ctl" % name

    try:
        res = cml.run()
        shutil.move('paml/%s/rst' % name, "tmpcodeml/%s.rst" % name)
        shutil.move('paml/%s/rst1' % name, "tmpcodeml/%s.rst1" % name)
    except Exception:
        # A failed alignment is reported to the caller as res=None; unlike a
        # bare ``except`` this does not swallow KeyboardInterrupt/SystemExit.
        res = None

    return (self.align, res)
 
class Consumer(multiprocessing.Process):
    """Worker process that executes callables taken from a task queue.

    Each item read from ``task_queue`` is called with no arguments and its
    return value is put on ``result_queue``. A ``None`` item is the poison
    pill that makes the worker exit.
    """

    def __init__(self, task_queue=None, result_queue=None):
        """Store the queues, creating a fresh one for each that is omitted.

        The queues are created here rather than as default argument values:
        a default is evaluated once, when the function is defined, so every
        Consumer built without arguments would share the same two queues.
        """
        multiprocessing.Process.__init__(self)
        if task_queue is None:
            task_queue = multiprocessing.Queue()
        if result_queue is None:
            result_queue = multiprocessing.Queue()
        self.task_queue = task_queue
        self.result_queue = result_queue

    def run(self):
        """Process tasks until the poison pill (``None``) is received."""
        while True:
            next_task = self.task_queue.get()
            time.sleep(0.01)
            if next_task is None:
                # Poison pill means we should exit
                break
            answer = next_task()
            self.result_queue.put(answer)
        return
 
class MultiProcess(object):
    '''
    Class MultiProcess
    An object that can perform multiprocesses

    Owns a task queue, a result queue and ``ncpus`` Consumer workers.
    '''

    def __init__(self, ncpus=1):
        # ncpus may arrive as a string (e.g. from the command line).
        self.ncpus = int(ncpus)
        # Parallelization
        self._parallel = None
        self._paralleltasks = Queue()
        self._parallelresults = Queue()

    def initiateParallel(self):
        '''Create and start one Consumer per requested CPU.'''
        self._parallel = [Consumer(self._paralleltasks, self._parallelresults)
                          for x in range(self.ncpus)]
        for consumer in self._parallel:
            consumer.start()

    def addPoison(self):
        '''Queue one poison pill (None) per worker so that each one exits.'''
        for consumer in self._parallel:
            self._paralleltasks.put(None)

    def isTerminated(self):
        '''Return True when no worker is alive any more.'''
        for consumer in self._parallel:
            if consumer.is_alive():
                return False
        return True

    def killParallel(self):
        '''Terminate every worker.'''
        for consumer in self._parallel:
            consumer.terminate()

    def _reportResults(self, redo, count):
        '''Drain the result queue, log each result, and return the new count.

        Failed alignments (falsy second element) are logged as errors and
        written to the ``redo`` file; the others are logged with the counter.
        '''
        while not self._parallelresults.empty():
            result = self._parallelresults.get()
            if not result[1]:
                msg(result[0], 'ERR')
                redo.write('%s\n' % result[0])
            else:
                msg('%s %d' % (result[0], count), 'IMP')
            # NOTE(review): the counter is advanced for every result, failed
            # or not; confirm this is the intended numbering.
            count += 1
        return count

    def doCodeML(self, indir, tree):
        '''Run a CodeML task for every ``.phy`` file found in ``indir``.

        Names of alignments whose run failed are written to
        ``codemlfail.txt``.

        Returns the ``dres`` dictionary.
        NOTE(review): nothing is ever stored in ``dres``, so it is returned
        empty; confirm whether results were meant to be collected in it.
        '''
        i = 0
        dres = {}
        # The context manager guarantees the failure list is flushed and
        # closed even if a worker or the logging raises.
        with open('codemlfail.txt', 'w') as redo:
            self.initiateParallel()
            for f in os.listdir(indir):
                if f[-4:] != '.phy':
                    continue
                align = os.path.join(indir, f)
                obj = CodeML(indir, align, tree)
                self._paralleltasks.put(obj)
            # Poison pill to stop the workers
            self.addPoison()
            while True:
                i = self._reportResults(redo, i)
                if self.isTerminated():
                    break
                time.sleep(0.1)
            # Get the last messages
            i = self._reportResults(redo, i)
            self.killParallel()
        return dres
 
class Highlighter:
    """Wrap messages in ANSI colour escape sequences chosen by message level."""

    def __init__(self):
        # Escape sequence that starts each known message level.
        self._msgTypes = {'INF': '\033[0m',
                          'IMP': '\033[1;32m',
                          'DEV': '\033[1;34m',
                          'ERR': '\033[1;31m',
                          'WRN': '\033[1;33m'}
        self._reset = '\033[0m'
        self._default = 'INF'

    def ColorMsg(self, msg, msgLevel='INF'):
        """Return ``msg`` prefixed with the colour of ``msgLevel`` and followed by a reset.

        An unknown (or unhashable) ``msgLevel`` falls back to the default
        level instead of raising.
        """
        try:
            prefix = self._msgTypes[msgLevel]
        except (KeyError, TypeError):
            # Only the level lookup is guarded; a non-string ``msg`` still
            # raises, as it ultimately did before.
            prefix = self._msgTypes[self._default]
        return prefix + msg + self._reset
 
def msg(message, msgLevel='INF', sameline=False):
    """Write ``message`` to stderr, coloured according to ``msgLevel``.

    With ``sameline`` the output starts with a carriage return and gets no
    trailing newline; otherwise it starts with an ``HH:MM:SS`` timestamp and
    ends with a newline.
    """
    painter = Highlighter()
    err = sys.stderr
    prefix = '\r' if sameline else strftime("%H:%M:%S") + ' '
    err.write(prefix)
    err.write(painter.ColorMsg(message, msgLevel))
    if sameline:
        return
    err.write('\n')
 
def creturn():
    """Write a single newline to standard error."""
    stream = sys.stderr
    stream.write('\n')
 
def getOptions():
    '''Retrieve the options passed from the command line.

    Returns the ``(options, args)`` pair produced by ``OptionParser.parse_args``.
    ``options`` carries ``align`` (alignment directory, default ``'OUT'``),
    ``tree`` (tree file, default ``'TREE.nwk'``) and ``threads`` (int,
    default ``1``).
    '''

    usage = "usage: python parallelPAML.py [options]"
    parser = OptionParser(usage)

    group1 = OptionGroup(parser, "Inputs")
    group1.add_option('-a', '--aligndir', action="store", dest='align',
                      default='OUT',
                      help='Alignment directory')
    group1.add_option('-t', '--tree', action="store", dest='tree',
                      default='TREE.nwk',
                      help='Tree file')
    # type='int' makes a value given on the command line an int, like the
    # default, and lets optparse reject a non-numeric value with a usage error.
    group1.add_option('-r', '--threads', action="store", dest='threads',
                      type='int',
                      default=1,
                      help='Threads [Default: 1]')
    parser.add_option_group(group1)
    # Parse the options
    return parser.parse_args()
(options, args) = getOptions()

dres = MultiProcess(options.threads).doCodeML(options.align, options.tree)

import json
# Use a context manager so the results file is flushed and closed
# deterministically instead of whenever the handle is garbage-collected.
with open('codemlresults.out', 'w') as results_handle:
    json.dump(dres, results_handle)
Esempio n. 6
0
# @author Emily Huntsman BC'21
# under the guidance of Professors Jon Snow and Allison Lopatkin
# @version May 20, 2021

from Bio.Phylo.PAML import codeml
import os

# Insert the names of your alignment and tree files here.
cml = codeml.Codeml(alignment="IRE_NT.phylip",
                    tree="IRE_NT.trees",
                    out_file="results.out",
                    working_dir=os.path.abspath(""))

# The settings specified by Professor Lopatkin live in codeml.ctl; adjust them
# there according to the PAML manual.
cml.read_ctl_file("codeml.ctl")
cml.print_options()

# Change this path to point at your own paml executable. To find it, navigate
# into paml4.8/bin and type pwd (print working directory) in the command line.
results = cml.run(
    command="/Users/annhuntsman/Desktop/PAML_Python/paml4.8/bin/codeml",
    verbose=True,
)

# If prompted in the terminal, respond accordingly (usually pressing enter).

# Omega for selection value, taken from the NSsites model 0 parameters.
print("omega: " + str(results['NSsites'][0]['parameters']['omega']))
Esempio n. 7
0
# FPAML3.py: Runs PAML as in FPAML.py, but for the 70% Gapped Sequence ONLY
# Non 'gap' + SEQNAME folders will give errors

##### INITIALIZATION #####

from Bio.Phylo.PAML import codeml  # Utilizing CodeML from BioPython
import glob, os

cml = codeml.Codeml()  # CodeML object shared by all analyses

# CodeML options applied to every analysis, set in a single call.
cml.set_options(
    verbose=0,
    CodonFreq=2,
    cleandata=0,
    fix_blength=0,
    NSsites=[0, 1, 2, 7, 8],
    fix_omega=0,
    clock=1,
    ncatG=2,
    runmode=0,
    fix_kappa=0,
    fix_alpha=1,
    Small_Diff=5e-7,
    method=1,
    Malpha=0,
    aaDist=0,
    RateAncestor=0,
    icode=0,
    alpha=0.0,
    seqtype=1,
    omega=0.4,
    getSE=0,
)
Esempio n. 8
0
def _store_model_results(results, label, paml_results):
    """Store lnL and site classes from ``results`` under ``paml_results[label]``.

    Every entry of ``results["NSsites"]`` is visited and each one overwrites
    the previous, so the last entry wins; nothing is stored if there is none.
    """
    ns_sites = results.get("NSsites")
    for site in ns_sites:
        entry = ns_sites[site]
        parameters = entry.get("parameters")
        paml_results[label] = {
            "lnL": entry.get("lnL"),
            "site_classes": parameters.get("site classes"),
        }


def ma_m1a(alignment, tree, output_dir, working_dir):
    """
    Run PAML (codeml) on the given tree under models MA and M1a, with these options:
    model = 2
    NSsites = 2
    fix_omega = 0 (for Ma) and 1 (for M1a)
    fix_blength = 1 -> The supplied tree should have branch lengths, and PAML will use those as a starting point

    The codeml output files are written to ``output_dir``; their names are the
    alignment and tree base names, each with its last four characters
    removed, followed by ``.ma`` or ``.m1a``.

    The output of this function is a dictionary containing the lnL value and the site_classes for each model (Ma and M1a)
    """
    from Bio.Phylo.PAML import codeml
    import os

    paml_results = dict()  # Store the results of the analysis

    cml = codeml.Codeml()  # Setup PAML

    # Parameters to PAML
    out_prefix = (output_dir + "/" + os.path.basename(alignment)[:-4] +
                  os.path.basename(tree)[:-4])
    cml.alignment = alignment
    cml.tree = tree
    cml.working_dir = working_dir

    # Options shared by both runs; only fix_omega/omega differ between them.
    shared_options = dict(seqtype=1,
                          CodonFreq=2,
                          clock=0,
                          model=2,
                          NSsites=[2],
                          fix_kappa=0,
                          kappa=2,
                          verbose=1,
                          fix_blength=1)

    # First model: omega is estimated (fix_omega=0), starting from 5.
    cml.out_file = out_prefix + ".ma"
    cml.set_options(fix_omega=0, omega=5, **shared_options)

    # Parenthesised single-argument print works on both Python 2 and 3.
    print("Running codeml for model A in : %s" % os.path.basename(tree))

    results_ma = cml.run()

    # Parse the results for the first run
    _store_model_results(results_ma, "Ma", paml_results)

    # Run the second model
    print("Running codeml for model 1A in : %s" % os.path.basename(tree))

    # Second model: omega is fixed at 1 (fix_omega=1).
    cml.out_file = out_prefix + ".m1a"
    cml.set_options(fix_omega=1, omega=1, **shared_options)

    results_m1a = cml.run()

    # Parse the results for the second run
    _store_model_results(results_m1a, "M1a", paml_results)

    return paml_results
Esempio n. 9
0
def reconstruct(df,
                id_col='uid',
                sequence_col='sequence',
                working_dir='',
                save_ancestors=False,
                altall_cutoff=0.2,
                infer_gaps=True,
                aaRatefile='lg',
                **kwargs):
    """Run codeml on the alignment and tree held in ``df`` and parse its ``rst`` file.

    A copy of ``df`` is written out as an alignment and a Newick tree inside
    ``working_dir``, the rate file ``<aaRatefile>.dat`` shipped in the
    ``pyasr`` package is copied next to them, a codeml control file is
    generated, and the ``codeml`` executable is run in that directory.

    Args:
        df: DataFrame exposing the ``phylo`` accessor (``to_fasta``,
            ``to_newick``).
        id_col: column used for sequence ids, taxon labels and node labels.
        sequence_col: column holding the sequences.
        working_dir: directory, relative to the current one, that receives
            all input and output files.
        save_ancestors, altall_cutoff, infer_gaps: accepted but not used in
            this function body.
        aaRatefile: base name of the ``.dat`` rate file to use.
        **kwargs: codeml options overriding the defaults listed below.

    Returns:
        Whatever ``read_codeml_output(rst_path, df)`` returns.
    """
    df = df.copy()

    # Construct default arguments
    default_options = dict(verbose=9,
                           CodonFreq=None,
                           cleandata=0,
                           fix_blength=2,
                           NSsites=None,
                           fix_omega=None,
                           clock=None,
                           ncatG=8,
                           runmode=0,
                           fix_kappa=None,
                           fix_alpha=1,
                           Small_Diff=1.0e-6,
                           method=0,
                           Malpha=None,
                           aaDist=None,
                           RateAncestor=2,
                           icode=None,
                           alpha=None,
                           seqtype=2,
                           omega=None,
                           getSE=None,
                           noisy=3,
                           Mgene=None,
                           kappa=None,
                           model=3,
                           ndata=None)

    # Update default arguments in place.
    default_options.update(**kwargs)

    # ---------------- Prepare model ----------------
    # copy model from package to project directory.
    path_to_model = pkg_resources.resource_filename(
        'pyasr', os.path.join('dat', '{}.dat'.format(aaRatefile)))

    model_file = '{}.dat'.format(aaRatefile)
    model_path = os.path.join(working_dir, model_file)
    shutil.copyfile(path_to_model, model_path)

    # ----------------------

    curr_path = os.getcwd()
    proj_path = os.path.join(curr_path, working_dir)
    ali_path = os.path.join(working_dir, 'ali-to-reconstruct.phy')
    tree_path = os.path.join(working_dir, 'tree-to-reconstruct.phy')
    out_path = os.path.join(working_dir, 'results.txt')
    ctl_path = os.path.join(working_dir, 'codeml_options.ctl')
    rst_path = os.path.join(working_dir, 'rst')

    df.phylo.to_fasta(
        filename=ali_path,
        id_col=id_col,
        sequence_col=sequence_col,
    )

    df.phylo.to_newick(
        filename=tree_path,
        taxon_col=id_col,
        node_col=id_col,
        suppress_internal_node_labels=True,
    )

    # NOTE(review): this second call has no filename and its return value is
    # discarded; it looks redundant but is kept pending confirmation that
    # to_newick has no side effects.
    df.phylo.to_newick(
        taxon_col=id_col,
        node_col=id_col,
        suppress_internal_node_labels=True,
    )

    # Build and write out control file.
    cml = codeml.Codeml(alignment=ali_path,
                        tree=tree_path,
                        out_file=out_path,
                        working_dir=working_dir)
    cml.set_options(aaRatefile=model_file, **default_options)
    cml.ctl_file = ctl_path
    cml.write_ctl_file()

    # ----------------------

    # Run codeml inside the project directory via ``cwd`` instead of
    # os.chdir(): if launching codeml raises (e.g. executable not found), the
    # calling process is not left stranded in ``proj_path``.
    subprocess.run(['codeml', 'codeml_options.ctl'], cwd=proj_path)

    # ----------------------

    return read_codeml_output(rst_path, df)
import sys
from Bio.Phylo.PAML import codeml

#folder_path = sys.argv[1]
# Positional arguments, in order: alignment file (full path), tree file (full
# path), M0 output file (full path), estimated tree name, final output.
alignment_file, tree_file, m0_out, estimated_tree_name, final_out = (
    sys.argv[i] for i in range(1, 6))

# Configure the M0 model run used to get the tree.
cmlM0 = codeml.Codeml(alignment=alignment_file,
                      tree=tree_file,
                      out_file=m0_out)
cmlM0.set_options(
    seqtype=1,
    model=0,
    NSsites=[0],
    omega=0.5,
    CodonFreq=2,
    ndata=1,
    fix_alpha=1,
    Small_Diff=5e-7,
)

# Run the M0 model
cmlM0.run(command="/Users/kmoney/Documents/paml4.9e/bin/codeml")

# Read the M0 output back and pull the tree out of the NSsites model 0 entry.
m0result = codeml.read(m0_out)
NSsites_dict = m0result.get("NSsites")
NSsites0_dict = NSsites_dict.get(0)
estimated_tree = NSsites0_dict.get("tree")
def parse_hogs(hoglist, model, basedir, verbose=True, multisite=False):
    """Parse the codeml trees and results of every HOG in ``hoglist``.

    For each HOG the run directory is
    ``<basedir>/<int(hog) % 100, zero-padded to 4>/<hog>/<hog>.codeml.<model>.ctl.out``.
    The control file found there names the tree file; trees are parsed with
    ``parse_trees`` and results with ``parse_multitree_multimodel_results``
    (when ``multisite``) or ``parse_multitree_results``.

    HOGs whose control or results file cannot be read, or that have fewer
    trees than results, are reported and skipped. A HOG seen more than once
    has its new trees/results appended under shifted integer keys.

    Returns:
        dict mapping each HOG to ``{'trees': ..., 'results': ...}``.
    """
    final_results = {}
    for hog in hoglist:
        if verbose:
            print("Working on", hog, flush=True)

        toppath = '{:0>4}'.format(int(hog) % 100)
        # 0000/100/100.codeml.ancrec.ctl.out/
        hog_dir = "/".join((basedir, toppath, hog))
        fullpath = hog_dir + "/" + hog + ".codeml." + model + ".ctl.out"
        results_file = "/".join((fullpath, model + ".out"))
        control_file = "/".join((fullpath, hog + ".codeml." + model + ".ctl"))

        # The species tree is optional: a missing file just means None.
        sptreepath = hog_dir + "/" + hog + ".final_spt.nwk"
        try:
            species_tree = Phylo.read(sptreepath, "newick")
        except FileNotFoundError:
            species_tree = None

        cml = codeml.Codeml()
        try:
            cml.read_ctl_file(control_file)
        except OSError:
            # NOTE(review): ``pamldir`` is not defined in this function;
            # presumably a module-level name - confirm.
            print("Couldn't parse file for", hog, "at",
                  pamldir + "/" + fullpath)
            continue

        # The control file names the tree file relative to the run directory.
        tree_file = fullpath + "/" + cml.tree
        parsed_trees = parse_trees(tree_file, species_tree)
        try:
            if not multisite:
                parsed_results = parse_multitree_results(results_file)
            else:
                parsed_results = parse_multitree_multimodel_results(
                    results_file)
        except FileNotFoundError:
            print("Couldn't parse file for", hog, "at",
                  pamldir + "/" + fullpath)
            continue

        # Every result must have a tree; surplus trees are dropped.
        n_trees = len(parsed_trees)
        n_results = len(parsed_results)
        if n_trees < n_results:
            print("Warning, too few trees for number of results for", hog,
                  "in", results_file)
            continue
        if n_trees > n_results:
            parsed_trees = {key: parsed_trees[key] for key in parsed_results}

        if hog not in final_results:
            final_results[hog] = {
                'trees': parsed_trees,
                'results': parsed_results
            }
            continue

        # HOG already present: append, shifting the tree numbers past the
        # ones stored so far.
        stored = final_results[hog]
        cur_len = len(stored['trees'])
        if cur_len != len(stored['results']):
            print("Warning, something went wrong!!")

        shifted_trees = {}
        for key in parsed_trees:
            shifted_trees[int(key) + cur_len] = parsed_trees[key]
        shifted_results = {}
        for key in parsed_results:
            shifted_results[int(key) + cur_len] = parsed_results[key]
        stored['trees'].update(shifted_trees)
        stored['results'].update(shifted_results)

    return final_results
Esempio n. 12
0
def free_ratios_worker(orthogroup, workingdir):
    """Run codeml with ``model=1`` and ``NSsites=[0]`` for one orthogroup.

    Input and output files live in ``workingdir`` and are named after
    ``orthogroup``: ``og_cds_<og>.afa`` (alignment), ``og_<og>.tree`` (tree),
    ``og_<og>.alt`` (codeml output) and ``og_<og>_working`` (working
    directory). Returns None.
    """
    file_args = {
        "alignment": "%s/og_cds_%s.afa" % (workingdir, orthogroup),
        "tree": "%s/og_%s.tree" % (workingdir, orthogroup),
        "out_file": "%s/og_%s.alt" % (workingdir, orthogroup),
        "working_dir": "%s/og_%s_working" % (workingdir, orthogroup),
    }
    cml = codeml.Codeml(**file_args)

    # NOTE(review): ``verbose`` is passed as the boolean True here, while the
    # other options are numeric; confirm codeml receives the intended value.
    cml.set_options(
        runmode=0,
        fix_blength=0,
        seqtype=1,
        CodonFreq=2,
        model=1,
        icode=0,
        clock=0,
        aaDist=0,
        Mgene=0,
        fix_kappa=0,
        kappa=2,
        fix_omega=0,
        omega=1,
        getSE=0,
        RateAncestor=0,
        cleandata=0,
        Small_Diff=.45e-6,
        verbose=True,
        NSsites=[0],
    )
    cml.print_options()
    cml.run(
        command="/Genomics/kocherlab/berubin/local/src/paml4.9e/bin/codeml",
        verbose=True,
    )