def testParseNgene2Mgene34(self):
    """Check the parsed structure of each ngene2_mgene34 results file."""
    folder = os.path.join(self.results_dir, "codeml", "ngene2_mgene34")
    for fname in os.listdir(folder):
        # The version tag sits between the first '-' and the following '.'
        tag = fname.split("-")[1].split(".")[0]
        version_msg = "Improper parsing for version %s" % tag.replace("_", ".")
        parsed = codeml.read(os.path.join(folder, fname))
        self.assertEqual(len(parsed), 4, version_msg)
        self.assertIn("NSsites", parsed, version_msg)
        nssites = parsed["NSsites"]
        self.assertEqual(len(nssites), 1, version_msg)
        self.assertIn(0, nssites, version_msg)
        m0 = nssites[0]
        self.assertEqual(len(m0), 5, version_msg)
        self.assertIn("parameters", m0, version_msg)
        parameters = m0["parameters"]
        # Model 0 carries fewer parameters for this type of run
        self.assertEqual(len(parameters), 3, version_msg)
        self.assertIn("rates", parameters, version_msg)
        self.assertEqual(len(parameters["rates"]), 2, version_msg)
        self.assertIn("genes", parameters, version_msg)
        self.assertEqual(len(parameters["genes"]), 2, version_msg)
def testParseFreeRatio(self):
    """Check the parsed structure of each free-ratio results file."""
    folder = os.path.join(self.results_dir, "codeml", "freeratio")
    for fname in os.listdir(folder):
        # The version tag sits between the first '-' and the following '.'
        tag = fname.split("-")[1].split(".")[0]
        version_msg = "Improper parsing for version %s" % tag.replace("_", ".")
        parsed = codeml.read(os.path.join(folder, fname))
        self.assertEqual(len(parsed), 4, version_msg)
        self.assertIn("NSsites", parsed, version_msg)
        nssites = parsed["NSsites"]
        self.assertEqual(len(nssites), 1, version_msg)
        self.assertIn(0, nssites, version_msg)
        m0 = nssites[0]
        # The free ratio model yields 3 extra trees (dN tree, dS tree and
        # omega tree), hence 8 items instead of 5
        self.assertEqual(len(m0), 8, version_msg)
        self.assertIn("parameters", m0, version_msg)
        parameters = m0["parameters"]
        self.assertEqual(len(parameters), SITECLASS_PARAMS[0], version_msg)
        self.assertIn("branches", parameters, version_msg)
        # 7 branches are expected
        self.assertEqual(len(parameters["branches"]), 7, version_msg)
        self.assertIn("omega", parameters, version_msg)
        self.assertEqual(len(parameters["omega"]), 7, version_msg)
def testParseAA(self):
    """Check item counts in the parsed amino-acid model 0 results file."""
    path = os.path.join("PAML", "Results", "codeml", "codeml_aa_model0.out")
    parsed = codeml.read(path)
    self.assertEqual(len(parsed), 5)
    self.assertEqual(len(parsed["distances"]), 1)
def testParseCladeModelC(self):
    """Check item counts in the parsed clade model C results file."""
    path = os.path.join("PAML", "Results", "codeml", "codeml_clademodelC.out")
    parsed = codeml.read(path)
    self.assertEqual(len(parsed), 5)
    classes = parsed["NSsites"][2]["parameters"]["site classes"]
    self.assertEqual(len(classes), 3)
def testParseNSsite3(self):
    """Check the parsed structure of each NSsites model 3 results file."""
    res_dir = os.path.join(self.results_dir, "codeml", "NSsite3")
    for results_file in os.listdir(res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split('-')[1].split('.')[0]
        version_msg = ("Improper parsing for version %s"
                       % version.replace('_', '.'))
        results_path = os.path.join(res_dir, results_file)
        results = codeml.read(results_path)
        # There should be 5 top-level items: 'codon model', 'model',
        # 'version', 'NSsites' & site-class model, the last of which
        # is only there when only one NSsites class is used
        self.assertEqual(len(results), 5, version_msg)
        self.assertIn('site-class model', results, version_msg)
        self.assertEqual(results['site-class model'], 'discrete',
                         version_msg)
        self.assertIn("NSsites", results, version_msg)
        # There should be 1 NSsites class: 3
        self.assertEqual(len(results["NSsites"]), 1, version_msg)
        # Each site class model should have 5 sub-items: 'lnL', 'tree',
        # 'description', 'parameters', & 'tree length'. It should
        # have the correct number of parameters also.
        model = results["NSsites"][3]
        self.assertEqual(len(model), 5, version_msg)
        self.assertIn("parameters", model, version_msg)
        params = model["parameters"]
        # Use the full message here too (the bare version tag was passed
        # before, giving an uninformative failure message)
        self.assertEqual(len(params), SITECLASS_PARAMS[3], version_msg)
        self.assertIn("site classes", params, version_msg)
        site_classes = params["site classes"]
        self.assertEqual(len(site_classes), 4, version_msg)
def testParseBranchSiteA(self):
    """Check the parsed structure of each branch-site A results file."""
    res_dir = os.path.join(self.results_dir, "codeml", "branchsiteA")
    for results_file in os.listdir(res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split('-')[1].split('.')[0]
        version_msg = ("Improper parsing for version %s"
                       % version.replace('_', '.'))
        results_path = os.path.join(res_dir, results_file)
        results = codeml.read(results_path)
        # There are 5 top-level items in this case:
        # 'codon model', 'model', 'version', 'NSsites' & 'site-class model'
        self.assertEqual(len(results), 5, version_msg)
        self.assertIn("NSsites", results, version_msg)
        models = results["NSsites"]
        # Only site class model 2 is simulated for Branch Site A
        self.assertEqual(len(models), 1, version_msg)
        self.assertIn(2, models, version_msg)
        model = models[2]
        self.assertEqual(len(model), 5, version_msg)
        self.assertIn("parameters", model, version_msg)
        params = model["parameters"]
        # Branch Site A results lack a "branches" parameter
        self.assertEqual(len(params), SITECLASS_PARAMS[2] - 1, version_msg)
        self.assertIn("site classes", params, version_msg)
        site_classes = params["site classes"]
        # Branch Site A adds another site class. Use the full message here
        # too (the bare version tag was passed before).
        self.assertEqual(len(site_classes), SITECLASSES[2] + 1, version_msg)
        for class_num in [0, 1, 2, 3]:
            self.assertIn(class_num, site_classes, version_msg)
            site_class = site_classes[class_num]
            self.assertEqual(len(site_class), 2, version_msg)
            self.assertIn("branch types", site_class, version_msg)
            branches = site_class["branch types"]
            self.assertEqual(len(branches), 2, version_msg)
def testParseAllNSsites(self):
    """Check the parsed structure of each results file in all_NSsites."""
    folder = os.path.join(self.results_dir, "codeml", "all_NSsites")
    for fname in os.listdir(folder):
        # The version tag sits between the first '-' and the following '.'
        tag = fname.split("-")[1].split(".")[0]
        version_msg = "Improper parsing for version %s" % tag.replace("_", ".")
        parsed = codeml.read(os.path.join(folder, fname))
        # Expected top-level items: 'codon model', 'model', 'version'
        # and 'NSsites'
        self.assertEqual(len(parsed), 4, version_msg)
        self.assertIn("NSsites", parsed, version_msg)
        # Six NSsites models are expected: 0, 1, 2, 3, 7 and 8
        self.assertEqual(len(parsed["NSsites"]), 6, version_msg)
        # Every model should hold 5 sub-items ('lnL', 'tree', 'description',
        # 'parameters' and 'tree length') and the expected parameter count
        for model_num in (0, 1, 2, 3, 7, 8):
            entry = parsed["NSsites"][model_num]
            self.assertEqual(len(entry), 5, version_msg)
            self.assertIn("parameters", entry, version_msg)
            parameters = entry["parameters"]
            expected_params = SITECLASS_PARAMS[model_num]
            self.assertEqual(len(parameters), expected_params, version_msg)
            self.assertIn("branches", parameters, version_msg)
            # These particular test cases have 7 branches
            self.assertEqual(len(parameters["branches"]), 7, version_msg)
            if "site classes" in parameters:
                self.assertEqual(len(parameters["site classes"]),
                                 SITECLASSES[model_num], version_msg)
def testParseCladeModelC(self):
    """Check the parsed structure of each clade model C results file."""
    cladeC_res_dir = os.path.join(self.results_dir, "codeml",
                                  "clademodelC")
    for results_file in os.listdir(cladeC_res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split('-')[1].split('.')[0]
        version_msg = ("Improper parsing for version %s"
                       % version.replace('_', '.'))
        results_path = os.path.join(cladeC_res_dir, results_file)
        results = codeml.read(results_path)
        # 5 top-level items again in this case
        self.assertEqual(len(results), 5, version_msg)
        self.assertIn("NSsites", results, version_msg)
        models = results["NSsites"]
        # Only site class model 2 is simulated for Clade Model C
        self.assertEqual(len(models), 1, version_msg)
        self.assertIn(2, models, version_msg)
        model = models[2]
        self.assertEqual(len(model), 5, version_msg)
        self.assertIn("parameters", model, version_msg)
        params = model["parameters"]
        # Clade Model C results lack a "branches" parameter
        self.assertEqual(len(params), SITECLASS_PARAMS[2] - 1, version_msg)
        self.assertIn("site classes", params, version_msg)
        site_classes = params["site classes"]
        # Use the full message here too (the bare version tag was passed
        # before, giving an uninformative failure message)
        self.assertEqual(len(site_classes), SITECLASSES[2], version_msg)
        for class_num in [0, 1, 2]:
            self.assertIn(class_num, site_classes, version_msg)
            site_class = site_classes[class_num]
            self.assertEqual(len(site_class), 2, version_msg)
            self.assertIn("branch types", site_class, version_msg)
            branches = site_class["branch types"]
            self.assertEqual(len(branches), 2, version_msg)
def testParseNgene2Mgene34(self):
    """Check item counts in the parsed ngene2/mgene34 results file."""
    path = os.path.join("PAML", "Results", "codeml",
                        "codeml_ngene2_mgene34.out")
    parsed = codeml.read(path)
    self.assertEqual(len(parsed), 4)
    genes = parsed["NSsites"][0]["parameters"]["genes"]
    self.assertEqual(len(genes), 2)
def testParsePairwise(self):
    """Check item counts in the parsed pairwise results file."""
    path = os.path.join("PAML", "Results", "codeml", "codeml_pairwise.out")
    parsed = codeml.read(path)
    self.assertEqual(len(parsed), 5)
    self.assertEqual(len(parsed["pairwise"]), 5)
def testParseAllNSsites(self):
    """Check the parsed structure of each results file in all_NSsites."""
    res_dir = os.path.join(self.results_dir, "codeml", "all_NSsites")
    for results_file in os.listdir(res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split('-')[1].split('.')[0]
        version_msg = "Improper parsing for version %s" \
            % version.replace('_', '.')
        results_path = os.path.join(res_dir, results_file)
        results = codeml.read(results_path)
        # There should be 4 top-level items: 'codon model', 'model',
        # 'version', & 'NSsites'
        self.assertEqual(len(results), 4, version_msg)
        # assertIn reports the missing key and the container on failure
        self.assertIn("NSsites", results, version_msg)
        # There should be 6 NSsites classes: 0, 1, 2, 3, 7 & 8
        self.assertEqual(len(results["NSsites"]), 6, version_msg)
        # Each site class model should have 5 sub-items: 'lnL', 'tree',
        # 'description', 'parameters', & 'tree length'. It should
        # have the correct number of parameters also.
        for model_num in [0, 1, 2, 3, 7, 8]:
            model = results["NSsites"][model_num]
            self.assertEqual(len(model), 5, version_msg)
            self.assertIn("parameters", model, version_msg)
            params = model["parameters"]
            self.assertEqual(len(params), SITECLASS_PARAMS[model_num],
                             version_msg)
            self.assertIn("branches", params, version_msg)
            branches = params["branches"]
            # There are 7 branches in the test case (specific to these
            # test cases)
            self.assertEqual(len(branches), 7, version_msg)
            if "site classes" in params:
                self.assertEqual(len(params["site classes"]),
                                 SITECLASSES[model_num], version_msg)
def testParseNSsite3(self):
    """Check the parsed structure of each NSsites model 3 results file."""
    res_dir = os.path.join(self.results_dir, "codeml", "NSsite3")
    for results_file in os.listdir(res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split('-')[1].split('.')[0]
        version_msg = "Improper parsing for version %s" \
            % version.replace('_', '.')
        results_path = os.path.join(res_dir, results_file)
        results = codeml.read(results_path)
        # There should be 5 top-level items: 'codon model', 'model',
        # 'version', 'NSsites' & site-class model, the last of which
        # is only there when only one NSsites class is used
        self.assertEqual(len(results), 5, version_msg)
        # assertIn reports the missing key and the container on failure
        self.assertIn('site-class model', results, version_msg)
        self.assertEqual(results['site-class model'], 'discrete',
                         version_msg)
        self.assertIn("NSsites", results, version_msg)
        # There should be 1 NSsites class: 3
        self.assertEqual(len(results["NSsites"]), 1, version_msg)
        # Each site class model should have 5 sub-items: 'lnL', 'tree',
        # 'description', 'parameters', & 'tree length'. It should
        # have the correct number of parameters also.
        model = results["NSsites"][3]
        self.assertEqual(len(model), 5, version_msg)
        self.assertIn("parameters", model, version_msg)
        params = model["parameters"]
        # Use the full message here too (the bare version tag was passed
        # before, giving an uninformative failure message)
        self.assertEqual(len(params), SITECLASS_PARAMS[3], version_msg)
        self.assertIn("site classes", params, version_msg)
        site_classes = params["site classes"]
        self.assertEqual(len(site_classes), 4, version_msg)
def testParseSitesParamsForPairwise(self):
    """Verify that pairwise site estimates are indeed parsed.

    Fixes #483.
    """
    res_dir = os.path.join(self.results_dir, "codeml", "pairwise")
    for results_file in os.listdir(res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split('-')[1].split('.')[0]
        version_msg = ("Improper parsing for version %s"
                       % version.replace('_', '.'))
        results_path = os.path.join(res_dir, results_file)
        results = codeml.read(results_path)
        self.assertIn("pairwise", results, version_msg)
        seqs = list(results["pairwise"].keys())
        self.assertGreaterEqual(
            len(seqs), 2,
            version_msg + ": should have at least two sequences")
        for seq1, seq2 in itertools.combinations(seqs, 2):
            params = results["pairwise"][seq1][seq2]
            self.assertEqual(
                len(params), 7,
                version_msg + ": wrong number of parsed parameters" +
                " for %s-%s" % (seq1, seq2))
            for param in ("t", "S", "N", "omega", "dN", "dS", "lnL"):
                # Dedicated assertions report the offending value on
                # failure, unlike assertTrue on a boolean expression
                self.assertIn(
                    param, params,
                    version_msg + ": '%s' not in parsed parameters"
                    % (param))
                self.assertIsInstance(params[param], float, version_msg)
                # lnL is the only parameter allowed to be negative here
                if param != "lnL":
                    self.assertGreaterEqual(params[param], 0, version_msg)
def get_pairwise_dn_ds(self):
    """Gather pairwise dN/dS, dN and dS values and pass them to printhash.

    Runs codeml on the nucleotide alignment when the expected output file
    does not exist yet, otherwise reads the existing output. The output file
    is then filtered through a shell pipeline into ``<outfile>.table`` and
    each row of that table is stored under ``dr['pairwise']``.

    NOTE(review): this block uses Python 2 print-statement syntax.
    """
    aligned_phylip = self.nucl_align_file()
    dr = collections.OrderedDict()
    dr['pairwise'] = {}
    # Control file for the pairwise run, looked up in the module-level filehash
    nullctl = filehash['ALL']['pamlctl']['pairwise']
    outfile = aligned_phylip + "." + 'pairwise' + "." + ".phylip"
    if not os.path.isfile(outfile):
        print >> sys.stderr, 'working_dir = ', filehash['ALL']['workdir']['NA'],'aligned_phylip=', \
            aligned_phylip
        cml = codeml.Codeml(alignment = aligned_phylip, out_file = aligned_phylip + "." +'pairwise' + "." +".phylip", \
            working_dir = filehash['ALL']['workdir']['NA'])
        cml.read_ctl_file(nullctl)
        # NOTE(review): the return value of get_option is discarded
        cml.get_option("NSsites")
        print 'cml=', cml
        results = cml.run()
    else:
        results = codeml.read(outfile)
    # `results` is only echoed to stderr; the values stored below come from
    # the shell-generated table instead
    print >> sys.stderr, results
    # NOTE(review): this is not a raw string, so Python turns \n and \t into
    # real newline/tab characters before the shell sees the command, and
    # outfile is concatenated into the command unquoted — confirm both are
    # intended. The meaning of the selected cut columns (1,4,22,25,28) depends
    # on the codeml output layout; verify against a sample output file.
    paircapture = "cat " + outfile + " | perl -p -e \'s/\n/\t/g\' | grep -oP \"(?<=\t)([0-9]+)\s+\([a-zA-Z0-9]+\)\s+\.\.\.\s+([0-9]+)\s+\([a-zA-Z0-9]+\)\tlnL\s*=\s*[0-9\-\.]+\t\s+[0-9\.\-]+\s+[0-9\.\-]+\t\t([0-9a-zA-Z=\S \.\-]+)\t\" | perl -p -e \'s/\s*=\s*/\t=\t/g\' | perl -p -e \'s/[ \t]+/\t/g\' | cut -f 1,4,22,25,28 > " + outfile + ".table"
    os.system(paircapture)
    with open(outfile + ".table") as f:
        for line in f:
            print >> sys.stderr, line
            line = line.rstrip('\n')
            fields = line.split()
            # Keys are "<field0>..<field1>:<measure>"; values stay as strings
            dr['pairwise'][fields[0] + ".." + fields[1] + ":dN/dS"] = fields[2]
            dr['pairwise'][fields[0] + ".." + fields[1] + ":dN"] = fields[3]
            dr['pairwise'][fields[0] + ".." + fields[1] + ":dS"] = fields[4]
    printhash(dr, self.name, 'paml')
def testParseCladeModelC(self):
    """Check the parsed structure of each clade model C results file."""
    cladeC_res_dir = os.path.join(self.results_dir, "codeml",
                                  "clademodelC")
    for results_file in os.listdir(cladeC_res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split('-')[1].split('.')[0]
        version_msg = "Improper parsing for version %s" \
            % version.replace('_', '.')
        results_path = os.path.join(cladeC_res_dir, results_file)
        results = codeml.read(results_path)
        # 5 top-level items again in this case
        self.assertEqual(len(results), 5, version_msg)
        # assertIn reports the missing key and the container on failure
        self.assertIn("NSsites", results, version_msg)
        models = results["NSsites"]
        # Only site class model 2 is simulated for Clade Model C
        self.assertEqual(len(models), 1, version_msg)
        self.assertIn(2, models, version_msg)
        model = models[2]
        self.assertEqual(len(model), 5, version_msg)
        self.assertIn("parameters", model, version_msg)
        params = model["parameters"]
        # Clade Model C results lack a "branches" parameter
        self.assertEqual(len(params), SITECLASS_PARAMS[2] - 1, version_msg)
        self.assertIn("site classes", params, version_msg)
        site_classes = params["site classes"]
        # Use the full message here too (the bare version tag was passed
        # before, giving an uninformative failure message)
        self.assertEqual(len(site_classes), SITECLASSES[2], version_msg)
        for class_num in [0, 1, 2]:
            self.assertIn(class_num, site_classes, version_msg)
            site_class = site_classes[class_num]
            self.assertEqual(len(site_class), 2, version_msg)
            self.assertIn("branch types", site_class, version_msg)
            branches = site_class["branch types"]
            self.assertEqual(len(branches), 2, version_msg)
def testParseSitesParamsForPairwise(self):
    """Verify that pairwise site estimates are indeed parsed.

    Fixes #483.
    """
    res_dir = os.path.join(self.results_dir, "codeml", "pairwise")
    for results_file in os.listdir(res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split('-')[1].split('.')[0]
        version_msg = ("Improper parsing for version %s"
                       % version.replace('_', '.'))
        results_path = os.path.join(res_dir, results_file)
        results = codeml.read(results_path)
        self.assertIn("pairwise", results, version_msg)
        seqs = list(results["pairwise"].keys())
        self.assertGreaterEqual(len(seqs), 2,
                                version_msg +
                                ": should have at least two sequences")
        for seq1, seq2 in itertools.combinations(seqs, 2):
            params = results["pairwise"][seq1][seq2]
            self.assertEqual(len(params), 7,
                             version_msg +
                             ": wrong number of parsed parameters" +
                             " for %s-%s" % (seq1, seq2))
            for param in ("t", "S", "N", "omega", "dN", "dS", "lnL"):
                # Dedicated assertions report the offending value on
                # failure, unlike assertTrue on a boolean expression
                self.assertIn(param, params,
                              version_msg +
                              ": '%s' not in parsed parameters"
                              % (param))
                self.assertIsInstance(params[param], float, version_msg)
                # lnL is the only parameter allowed to be negative here
                if param != "lnL":
                    self.assertGreaterEqual(params[param], 0, version_msg)
def testParsePairwise(self):
    """Check the parsed structure of each pairwise results file."""
    folder = os.path.join(self.results_dir, "codeml", "pairwise")
    for fname in os.listdir(folder):
        # The version tag sits between the first '-' and the following '.'
        tag = fname.split("-")[1].split(".")[0]
        version_msg = "Improper parsing for version %s" % tag.replace("_", ".")
        parsed = codeml.read(os.path.join(folder, fname))
        # Pairwise runs add one more top-level item: 'pairwise'
        self.assertEqual(len(parsed), 5, version_msg)
        self.assertIn("pairwise", parsed, version_msg)
        pairwise = parsed["pairwise"]
        self.assertGreaterEqual(
            len(pairwise), 2,
            version_msg + ": should have at least two sequences")
        count_msg = version_msg + ": wrong number of parameters parsed"
        for first, second in itertools.combinations(pairwise.keys(), 2):
            # Each pair must be reachable in both directions
            for row, col in ((first, second), (second, first)):
                self.assertEqual(len(pairwise[row][col]), 7, count_msg)
def testParseAAPairwise(self):
    """Check item counts in the parsed amino-acid pairwise results file."""
    path = os.path.join("PAML", "Results", "codeml", "codeml_aa_pairwise.out")
    parsed = codeml.read(path)
    self.assertEqual(len(parsed), 4)
    self.assertEqual(len(parsed["distances"]), 2)
def testParseFreeBranch(self):
    """Check item counts in the parsed free-branch results file."""
    path = os.path.join("PAML", "Results", "codeml", "codeml_freebranch.out")
    parsed = codeml.read(path)
    self.assertEqual(len(parsed), 4)
    branch_table = parsed["NSsites"][0]["parameters"]["branches"]
    self.assertEqual(len(branch_table), 7)
def testParseBranchSiteA(self):
    """Check item counts in the parsed branch-site A results file."""
    path = os.path.join("PAML", "Results", "codeml", "codeml_branchsiteA.out")
    parsed = codeml.read(path)
    self.assertEqual(len(parsed), 5)
    classes = parsed["NSsites"][2]["parameters"]["site classes"]
    self.assertEqual(len(classes), 4)
def testParseBranchSiteA(self):
    """Check the parsed structure of each branch-site A results file."""
    res_dir = os.path.join(self.results_dir, "codeml", "branchsiteA")
    for results_file in os.listdir(res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split('-')[1].split('.')[0]
        version_msg = "Improper parsing for version %s" \
            % version.replace('_', '.')
        results_path = os.path.join(res_dir, results_file)
        results = codeml.read(results_path)
        # There are 5 top-level items in this case:
        # 'codon model', 'model', 'version', 'NSsites' & 'site-class model'
        self.assertEqual(len(results), 5, version_msg)
        # assertIn reports the missing key and the container on failure
        self.assertIn("NSsites", results, version_msg)
        models = results["NSsites"]
        # Only site class model 2 is simulated for Branch Site A
        self.assertEqual(len(models), 1, version_msg)
        self.assertIn(2, models, version_msg)
        model = models[2]
        self.assertEqual(len(model), 5, version_msg)
        self.assertIn("parameters", model, version_msg)
        params = model["parameters"]
        # Branch Site A results lack a "branches" parameter
        self.assertEqual(len(params), SITECLASS_PARAMS[2] - 1, version_msg)
        self.assertIn("site classes", params, version_msg)
        site_classes = params["site classes"]
        # Branch Site A adds another site class. Use the full message here
        # too (the bare version tag was passed before).
        self.assertEqual(len(site_classes), SITECLASSES[2] + 1, version_msg)
        for class_num in [0, 1, 2, 3]:
            self.assertIn(class_num, site_classes, version_msg)
            site_class = site_classes[class_num]
            self.assertEqual(len(site_class), 2, version_msg)
            self.assertIn("branch types", site_class, version_msg)
            branches = site_class["branch types"]
            self.assertEqual(len(branches), 2, version_msg)
def testParseAllNSsites(self):
    """Check model and sub-item counts in the parsed all-NSsites results file."""
    path = os.path.join("PAML", "Results", "codeml", "codeml_NSsites_all.out")
    nssites = codeml.read(path).get("NSsites")
    self.assertEqual(len(nssites), 6)
    # Every parsed model is expected to hold 5 items
    for key in nssites:
        entry = nssites.get(key)
        self.assertEqual(len(entry), 5)
def testParseSEs(self):
    """Check that every parsed model in the SE results file has an "SEs" parameter."""
    SE_results_file = os.path.join("PAML", "Results", "codeml",
                                   "codeml_SE.out")
    SE_results = codeml.read(SE_results_file)
    SE_models = SE_results.get("NSsites")
    # Fail with a clear assertion instead of a TypeError in the loop below
    # when the "NSsites" section is missing
    self.assertIsNotNone(SE_models)
    for model in SE_models:
        SE_model = SE_models.get(model)
        SE_parameters = SE_model.get("parameters")
        self.assertIsNotNone(SE_parameters.get("SEs"))
def testParsePairwise(self):
    """Check the parsed structure of each pairwise results file."""
    res_dir = os.path.join(self.results_dir, "codeml", "pairwise")
    for results_file in os.listdir(res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split("-")[1].split(".")[0]
        version_msg = "Improper parsing for version %s" % version.replace("_", ".")
        results_path = os.path.join(res_dir, results_file)
        results = codeml.read(results_path)
        # Pairwise models have an extra top-level item: pairwise
        self.assertEqual(len(results), 5, version_msg)
        # assertIn reports the missing key and the container on failure
        self.assertIn("pairwise", results, version_msg)
        pairwise = results["pairwise"]
        self.assertEqual(len(pairwise), 5, version_msg)
def testParseAllVersions(self):
    """Check the NSsites item counts of every codeml file in the versions folder."""
    versions_dir = os.path.join("PAML", "Results", "codeml", "versions")
    for results_file in os.listdir(versions_dir):
        # os.listdir yields bare names, so the directory has to be prepended
        # before testing or reading the file. Previously the bare name was
        # tested and a different directory was read, so the assertions below
        # did not run against the listed files.
        results_path = os.path.join(versions_dir, results_file)
        if os.path.isfile(results_path) and results_file.startswith("codeml"):
            results = codeml.read(results_path)
            self.assertEqual(len(results["NSsites"]), 6)
            self.assertEqual(len(results["NSsites"][0]), 7)
            self.assertEqual(len(results["NSsites"][1]), 5)
            self.assertEqual(len(results["NSsites"][2]), 5)
            self.assertEqual(len(results["NSsites"][3]), 5)
            self.assertEqual(len(results["NSsites"][7]), 6)
            self.assertEqual(len(results["NSsites"][8]), 6)
def testTreeParseVersatility(self):
    """Test finding trees in the results.

    In response to bug #453, where trees like (A, (B, C)); weren't being
    caught.
    """
    res_file = os.path.join(self.results_dir, "codeml",
                            "tree_regexp_versatility.out")
    results = codeml.read(res_file)
    # Dedicated assertions report the container/value on failure
    self.assertIn("NSsites", results)
    nssites = results["NSsites"]
    self.assertIn(0, nssites)
    m0 = nssites[0]
    self.assertIn("tree", m0)
    self.assertIsNotNone(m0["tree"])
    self.assertNotEqual(len(m0["tree"]), 0)
def testTreeParseVersatility(self):
    """Test finding trees in the results.

    In response to bug #453, where trees like (A, (B, C)); weren't being
    caught.
    """
    res_file = os.path.join(self.results_dir, "codeml",
                            "tree_regexp_versatility.out")
    results = codeml.read(res_file)
    self.assertIn("NSsites", results)
    nssites = results["NSsites"]
    self.assertIn(0, nssites)
    m0 = nssites[0]
    self.assertIn("tree", m0)
    # assertIsNotNone gives a clearer failure than assertTrue(x is not None)
    self.assertIsNotNone(m0["tree"])
    self.assertNotEqual(len(m0["tree"]), 0)
def testParseM2arel(self):
    """Check the parsed NSsites model 22 entry of each m2a_rel results file."""
    folder = os.path.join(self.results_dir, "codeml", "m2a_rel")
    for fname in os.listdir(folder):
        # The version tag sits between the first '-' and the following '.'
        version = fname.split("-")[1].split(".")[0]
        version_msg = f"Improper parsing for version {version.replace('_', '.')}"
        parsed = codeml.read(os.path.join(folder, fname))
        self.assertIn("NSsites", parsed)
        nssites = parsed["NSsites"]
        self.assertIn(22, nssites)
        m22 = nssites[22]
        self.assertEqual(len(m22), 5, version_msg)
        self.assertEqual(len(m22["parameters"]), SITECLASS_PARAMS[22],
                         version_msg)
def testParsePairwise(self):
    """Check the parsed structure of each pairwise results file."""
    res_dir = os.path.join(self.results_dir, "codeml", "pairwise")
    for results_file in os.listdir(res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split('-')[1].split('.')[0]
        version_msg = "Improper parsing for version %s" \
            % version.replace('_', '.')
        results_path = os.path.join(res_dir, results_file)
        results = codeml.read(results_path)
        # Pairwise models have an extra top-level item: pairwise
        self.assertEqual(len(results), 5, version_msg)
        # assertIn reports the missing key and the container on failure
        self.assertIn("pairwise", results, version_msg)
        pairwise = results["pairwise"]
        self.assertEqual(len(pairwise), 5, version_msg)
def testParseAAPairwise(self):
    """Check the parsed structure of each amino-acid pairwise results file."""
    res_dir = os.path.join(self.results_dir, "codeml", "aa_pairwise")
    for results_file in os.listdir(res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split("-")[1].split(".")[0]
        version_msg = "Improper parsing for version %s" % version.replace("_", ".")
        results_path = os.path.join(res_dir, results_file)
        results = codeml.read(results_path)
        # Pairwise AA analysis has one top-level fewer than non-pairwise
        self.assertEqual(len(results), 4, version_msg)
        # assertIn reports the missing key and the container on failure
        self.assertIn("lnL max", results, version_msg)
        self.assertIn("distances", results, version_msg)
        distances = results["distances"]
        # Pairwise AA analysis has ML & raw distances
        self.assertEqual(len(distances), 2, version_msg)
def get_dn_ds(aligned_phylip, control_file, marker, dr):
    """Obtain codeml results for an alignment and pass them to rprint.

    Runs codeml with the given control file when the expected output file
    ``<aligned_phylip>.<marker>..phylip`` does not exist yet; otherwise the
    existing output file is read. Returns whatever
    ``rprint(results, 'start', marker, dr)`` returns.

    NOTE(review): this block uses Python 2 print-statement syntax.
    """
    print >> sys.stderr, 'marker = in getdnds', marker
    if not os.path.isfile(aligned_phylip + "." + marker + "." + ".phylip"):
        cml = codeml.Codeml(alignment = aligned_phylip, out_file = aligned_phylip + "." +marker + "." +".phylip", \
            working_dir = filehash['ALL']['workdir']['NA'])
        print >> sys.stderr, 'cml=', cml
        cml.read_ctl_file(control_file)
        # NOTE(review): the return value of get_option is discarded
        cml.get_option("NSsites")
        results = cml.run()
    else:
        # Reuse the output of an earlier run
        results = codeml.read(aligned_phylip + "." + marker + "." + ".phylip")
    print >> sys.stderr, results
    dr2 = rprint(results, 'start', marker, dr)
    return dr2
def testParseAAPairwise(self):
    """Check the parsed structure of each amino-acid pairwise results file."""
    folder = os.path.join(self.results_dir, "codeml", "aa_pairwise")
    for fname in os.listdir(folder):
        # The version tag sits between the first '-' and the following '.'
        version = fname.split("-")[1].split(".")[0]
        version_msg = f"Improper parsing for version {version.replace('_', '.')}"
        parsed = codeml.read(os.path.join(folder, fname))
        # One top-level item fewer than the non-pairwise AA analysis
        self.assertEqual(len(parsed), 4, version_msg)
        self.assertIn("lnL max", parsed, version_msg)
        self.assertIn("distances", parsed, version_msg)
        # Both ML and raw distances are expected
        self.assertEqual(len(parsed["distances"]), 2, version_msg)
def testParseM2arel(self):
    """Check the parsed NSsites model 22 entry of each m2a_rel results file."""
    folder = os.path.join(self.results_dir, "codeml", "m2a_rel")
    for fname in os.listdir(folder):
        # The version tag sits between the first '-' and the following '.'
        tag = fname.split("-")[1].split(".")[0]
        version_msg = "Improper parsing for version %s" % tag.replace("_", ".")
        parsed = codeml.read(os.path.join(folder, fname))
        self.assertIn("NSsites", parsed)
        nssites = parsed["NSsites"]
        self.assertIn(22, nssites)
        m22 = nssites[22]
        self.assertEqual(len(m22), 5, version_msg)
        self.assertEqual(len(m22["parameters"]), SITECLASS_PARAMS[22],
                         version_msg)
def testParseNgene2Mgene1(self):
    """Check the parsed structure of each ngene2_mgene1 results file."""
    folder = os.path.join(self.results_dir, "codeml", "ngene2_mgene1")
    for fname in os.listdir(folder):
        # The version tag sits between the first '-' and the following '.'
        version = fname.split("-")[1].split(".")[0]
        version_msg = f"Improper parsing for version {version.replace('_', '.')}"
        parsed = codeml.read(os.path.join(folder, fname))
        self.assertEqual(len(parsed), 4, version_msg)
        self.assertIn("genes", parsed, version_msg)
        gene_entries = parsed["genes"]
        self.assertEqual(len(gene_entries), 2, version_msg)
        # Only the first gene entry is inspected in detail
        first_gene = gene_entries[0]
        self.assertEqual(len(first_gene), 5, version_msg)
        self.assertIn("parameters", first_gene, version_msg)
        self.assertEqual(len(first_gene["parameters"]), SITECLASS_PARAMS[0],
                         version_msg)
def testParseNgene2Mgene1(self):
    """Check the parsed structure of each ngene2_mgene1 results file."""
    res_dir = os.path.join(self.results_dir, "codeml", "ngene2_mgene1")
    for results_file in os.listdir(res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split("-")[1].split(".")[0]
        version_msg = "Improper parsing for version %s" % version.replace("_", ".")
        results_path = os.path.join(res_dir, results_file)
        results = codeml.read(results_path)
        self.assertEqual(len(results), 4, version_msg)
        # assertIn reports the missing key and the container on failure
        self.assertIn("genes", results, version_msg)
        genes = results["genes"]
        self.assertEqual(len(genes), 2, version_msg)
        model = genes[0]
        self.assertEqual(len(model), 5, version_msg)
        self.assertIn("parameters", model, version_msg)
        params = model["parameters"]
        self.assertEqual(len(params), SITECLASS_PARAMS[0], version_msg)
def testParsePairwise(self):
    """Check the parsed structure of each pairwise results file."""
    folder = os.path.join(self.results_dir, "codeml", "pairwise")
    for fname in os.listdir(folder):
        # The version tag sits between the first '-' and the following '.'
        tag = fname.split("-")[1].split(".")[0]
        version_msg = "Improper parsing for version %s" % tag.replace("_", ".")
        parsed = codeml.read(os.path.join(folder, fname))
        # Pairwise runs add one more top-level item: 'pairwise'
        self.assertEqual(len(parsed), 5, version_msg)
        self.assertIn("pairwise", parsed, version_msg)
        pairwise = parsed["pairwise"]
        self.assertGreaterEqual(
            len(pairwise), 2,
            version_msg + ": should have at least two sequences")
        count_msg = version_msg + ": wrong number of parameters parsed"
        for first, second in itertools.combinations(pairwise.keys(), 2):
            # Each pair must be reachable in both directions
            for row, col in ((first, second), (second, first)):
                self.assertEqual(len(pairwise[row][col]), 7, count_msg)
def testParseAA(self):
    """Check the parsed structure of each amino-acid model 0 results file."""
    folder = os.path.join(self.results_dir, "codeml", "aa_model0")
    for fname in os.listdir(folder):
        # The version tag sits between the first '-' and the following '.'
        version = fname.split("-")[1].split(".")[0]
        version_msg = f"Improper parsing for version {version.replace('_', '.')}"
        parsed = codeml.read(os.path.join(folder, fname))
        # Amino acid analysis has its own top-level items:
        # 'NSsites', 'model', 'version', 'lnL max', 'distances'.
        # Version 4.1 doesn't seem to produce distances in the results.
        has_distances = version != "4_1"
        expected_items = 5 if has_distances else 4
        self.assertEqual(len(parsed), expected_items, version_msg)
        self.assertIn("lnL max", parsed, version_msg)
        if has_distances:
            self.assertIn("distances", parsed, version_msg)
            # Non-pairwise AA analysis only gives raw distances
            self.assertEqual(len(parsed["distances"]), 1, version_msg)
def testParseSEs(self):
    """Check that each SE results file parses with an "SEs" parameter entry."""
    folder = os.path.join(self.results_dir, "codeml", "SE")
    for fname in os.listdir(folder):
        # The version tag sits between the first '-' and the following '.'
        version = fname.split("-")[1].split(".")[0]
        version_msg = f"Improper parsing for version {version.replace('_', '.')}"
        parsed = codeml.read(os.path.join(folder, fname))
        self.assertEqual(len(parsed), 4, version_msg)
        self.assertIn("NSsites", parsed, version_msg)
        nssites = parsed["NSsites"]
        # Site class model 0 is the only one simulated
        self.assertEqual(len(nssites), 1, version_msg)
        self.assertIn(0, nssites, version_msg)
        m0 = nssites[0]
        self.assertEqual(len(m0), 5, version_msg)
        self.assertIn("parameters", m0, version_msg)
        parameters = m0["parameters"]
        # "SEs" is one extra item on top of the usual model 0 parameters
        self.assertEqual(len(parameters), SITECLASS_PARAMS[0] + 1,
                         version_msg)
        self.assertIn("SEs", parameters, version_msg)
def testParseSEs(self):
    """Check that each SE results file parses with an "SEs" parameter entry."""
    res_dir = os.path.join(self.results_dir, "codeml", "SE")
    for results_file in os.listdir(res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split("-")[1].split(".")[0]
        version_msg = "Improper parsing for version %s" % version.replace("_", ".")
        results_path = os.path.join(res_dir, results_file)
        results = codeml.read(results_path)
        self.assertEqual(len(results), 4, version_msg)
        # assertIn reports the missing key and the container on failure
        self.assertIn("NSsites", results, version_msg)
        models = results["NSsites"]
        # Only site class model 0 was simulated
        self.assertEqual(len(models), 1, version_msg)
        self.assertIn(0, models, version_msg)
        model = models[0]
        self.assertEqual(len(model), 5, version_msg)
        self.assertIn("parameters", model, version_msg)
        params = model["parameters"]
        # There should be one new item in the parameters, "SEs"
        self.assertEqual(len(params), SITECLASS_PARAMS[0] + 1, version_msg)
        self.assertIn("SEs", params, version_msg)
def testParseAA(self):
    """Check the parsed structure of each amino-acid model 0 results file."""
    res_dir = os.path.join(self.results_dir, "codeml", "aa_model0")
    for results_file in os.listdir(res_dir):
        # The version tag sits between the first '-' and the following '.'
        version = results_file.split("-")[1].split(".")[0]
        version_msg = "Improper parsing for version %s" % version.replace("_", ".")
        results_path = os.path.join(res_dir, results_file)
        results = codeml.read(results_path)
        # Amino Acid analysis has different top-levels:
        # 'NSsites', 'model', 'version', 'lnL max', 'distances'
        # Version 4.1 doesn't seem to produce distances in the results
        if version == "4_1":
            self.assertEqual(len(results), 4, version_msg)
            # assertIn reports the missing key and the container on failure
            self.assertIn("lnL max", results, version_msg)
        else:
            self.assertEqual(len(results), 5, version_msg)
            self.assertIn("lnL max", results, version_msg)
            self.assertIn("distances", results, version_msg)
            distances = results["distances"]
            # non-pairwise AA analysis only gives raw distances
            self.assertEqual(len(distances), 1, version_msg)
def testParseNgene2Mgene02(self):
    """Parse every result file under ``codeml/ngene2_mgene02``.

    Each file must yield 4 top-level entries, a single site-class model
    (model 0) with 5 entries, 4 parameters, and a "rates" parameter of
    length 2.  Membership checks use ``assertIn`` so that a failure
    reports the missing key and the container.
    """
    res_dir = os.path.join(self.results_dir, "codeml", "ngene2_mgene02")
    for results_file in os.listdir(res_dir):
        # Version tag: second "-"-separated field, cut at its first ".".
        version = results_file.split("-")[1].split(".")[0]
        version_msg = "Improper parsing for version %s" % version.replace("_", ".")
        results_path = os.path.join(res_dir, results_file)
        results = codeml.read(results_path)
        self.assertEqual(len(results), 4, version_msg)
        self.assertIn("NSsites", results, version_msg)
        models = results["NSsites"]
        self.assertEqual(len(models), 1, version_msg)
        self.assertIn(0, models, version_msg)
        model = models[0]
        self.assertEqual(len(model), 5, version_msg)
        self.assertIn("parameters", model, version_msg)
        params = model["parameters"]
        # This type of model has fewer parameters for model 0
        self.assertEqual(len(params), 4, version_msg)
        self.assertIn("rates", params, version_msg)
        rates = params["rates"]
        self.assertEqual(len(rates), 2, version_msg)
IGC_geo) + '_Sim_' + str(sim_num) + '.fasta' ctl_loc = wk_dir + 'geo_' + str(IGC_geo) + '_Sim_' + str( sim_num) + '_codeml.ctl' out_file = wk_dir + 'unrooted_MG94_geo_' + str( IGC_geo) + '_Sim_' + str(sim_num) + '_codeml_output.txt' ## prepare_ctl(tree_loc, seq_loc, out_file, ctl_loc) ## run_paml(wk_dir, ctl_loc)#, "/Users/Xiang/Software/paml4.8/bin/codeml") out_tree1_file = out_file.replace('_output.txt', '_tree1_output.txt') out_tree2_file = out_file.replace('_output.txt', '_tree2_output.txt') out_tree_files = [out_tree1_file, out_tree2_file] Seperate_codeml_result(out_file, out_tree_files) if os.path.isfile(out_tree1_file): codeml_result = codeml.read(out_tree1_file) tree1_file = out_file.replace('codeml_output.txt', 'codeml_tree1_est.newick') with open(tree1_file, 'w+') as f: f.write(codeml_result['NSsites'][0]['tree'] + '\n') edge_to_blen, edge_list_1 = get_tree(tree1_file, name_tree_1st) if sim_num == 0: edge_list_1_fix = deepcopy(edge_list_1) summary = [ codeml_result['NSsites'][0]['lnL'], codeml_result['NSsites'][0]['parameters']['kappa'], codeml_result['NSsites'][0]['parameters']['omega'] ] summary.extend( [edge_to_blen[edge] for edge in edge_list_1_fix])
import time
import glob
from math import sqrt
from rpy2 import robjects

r = robjects.r
# Number of "*.out" entries in the current directory.  glob.glob() replaces
# the undocumented glob.glob1() helper and returns the same names here.
# NOTE(review): this rebinds the builtin ``len`` to an int for the rest of
# the module; the name is kept because it is exported to R below and later
# code may read it -- consider renaming once all uses are known.
len = len(glob.glob("*.out"))


def compare_models(m1_lnl, m2_lnl, df):
    """Return the chi-square p-value for comparing two log-likelihoods.

    The statistic is twice the absolute difference of the two lnL values;
    the p-value is ``1 - pchisq(statistic, df)`` evaluated through R.
    """
    likelihood = 2 * (abs(m2_lnl - m1_lnl))
    p = 1 - robjects.r.pchisq(likelihood, df)[0]
    return p


# Read the codeml output named on the command line and pull the lnL of
# site models 1, 2, 7 and 8 out of its "NSsites" section.
results = codeml.read(sys.argv[1])
nssites = results.get("NSsites")

m1 = nssites.get(1)
m1_lnl = m1.get("lnL")
m2 = nssites.get(2)
m2_lnl = m2.get("lnL")
m7 = nssites.get(7)
m7_lnl = m7.get("lnL")
m8 = nssites.get(8)
m8_lnl = m8.get("lnL")

# Model 1 is compared with model 2 and model 7 with model 8, both with df=2.
m2_p_pos = compare_models(m1_lnl, m2_lnl, 2)
m8_p_pos = compare_models(m7_lnl, m8_lnl, 2)

# Export the p-values and the file count to the R session.
r.assign('m2_p_pos', m2_p_pos)
r.assign('m8_p_pos', m8_p_pos)
r.assign('len', len)
import shutil
import time
import glob
from math import sqrt
from rpy2 import robjects

r = robjects.r
# Number of "*.out" entries in the current directory.  glob.glob() replaces
# the undocumented glob.glob1() helper and returns the same names here.
# NOTE(review): this rebinds the builtin ``len`` to an int for the rest of
# the module; the name is kept because later code may read it -- consider
# renaming once all uses are known.
len = len(glob.glob("*.out"))


def compare_models(null_lnl, alt_lnl, df):
    """Return the chi-square p-value for comparing two log-likelihoods.

    The statistic is twice the absolute difference of the two lnL values;
    the p-value is ``1 - pchisq(statistic, df)`` evaluated through R.
    """
    likelihood = 2 * (abs(null_lnl - alt_lnl))
    p = 1 - robjects.r.pchisq(likelihood, df)[0]
    return p


# First argument: codeml output of the null run; second: the alternative run.
null_results = codeml.read(sys.argv[1])
alt_results = codeml.read(sys.argv[2])

null_nssites = null_results.get("NSsites")
alt_nssites = alt_results.get("NSsites")
#null_model = null_results.get("model")
#alt_model = alt_results.get("model")

# Both runs are read from the NSsites entry keyed 2.
null_value = null_nssites.get(2)
null_lnl = null_value.get("lnL")
alt_value = alt_nssites.get(2)
alt_lnl = alt_value.get("lnL")

bs_p_pos = compare_models(null_lnl, alt_lnl, 1)
def read_results(out_file, hyp):
    """Use the Biopython codeml output parser to read the output file and
    extract all the relevant information.

    Args:
        out_file: Path of the codeml output file, passed to ``codeml.read``.
        hyp: Which model was fit: 'Null', 'Ha1', 'Ha2' or 'Ha3'.

    Returns:
        A dict whose keys are prefixed with the lower-cased hypothesis name:
        ``<hyp>_lnl``, ``<hyp>_kappa``, one ``..._dnds`` entry per site class
        (0-2) and, for the Ha models, per branch type, then
        ``<hyp>_omega<i>prop`` for each site class.  ``lnl`` is stored as
        returned by the parser; every other value is a string, and 'NA'
        stands in for anything the parsed results do not contain.

    Raises:
        ValueError: If ``hyp`` is not one of the four supported names
            (previously this surfaced as an UnboundLocalError at the end of
            the function, after the file had already been parsed).
    """
    # hyp -> (key prefix, NSsites model key, branch labels by branch-type #).
    # An empty label tuple means a single omega per site class is read.
    layouts = {
        'Null': ('null', 22, ()),
        # For this model, #1 is maize/tandem, #0 is grass
        'Ha1': ('ha1', 2, ('grass', 'maize')),
        # For this model, #1 is tandem, #0 is grass/maize
        'Ha2': ('ha2', 2, ('grass', 'tandem')),
        # Recall that for this model, #1 is tandem and #2 is maize
        'Ha3': ('ha3', 2, ('grass', 'tandem', 'maize')),
    }
    if hyp not in layouts:
        raise ValueError(
            'unknown hypothesis %r; expected one of %s'
            % (hyp, ', '.join(sorted(layouts))))
    prefix, model_key, branch_labels = layouts[hyp]

    res = codeml.read(out_file)
    model = res['NSsites'][model_key]
    # A missing 'parameters' section behaves like an empty one: kappa and
    # every per-class value then come out as 'NA'.
    params = model['parameters'] if 'parameters' in model else {}
    site_classes = params['site classes'] if 'site classes' in params else None

    def class_value(idx, field=None, branch_type=None):
        # One per-site-class estimate as a string, or 'NA' when the results
        # carry no 'site classes' section at all.
        if site_classes is None:
            return 'NA'
        site_class = site_classes[idx]
        if branch_type is None:
            return str(site_class.get(field, 'NA'))
        return str(site_class['branch types'].get(branch_type, 'NA'))

    dat = {
        prefix + '_lnl': model.get('lnL', 'NA'),
        prefix + '_kappa': str(params.get('kappa', 'NA')),
    }
    # dN/dS entries first, grouped by site class ...
    for idx in range(3):
        if not branch_labels:
            key = '%s_omega%ddnds' % (prefix, idx)
            dat[key] = class_value(idx, field='omega')
            continue
        for branch_type, label in enumerate(branch_labels):
            key = '%s_omega%s%ddnds' % (prefix, label, idx)
            dat[key] = class_value(idx, branch_type=branch_type)
    # ... then the proportion of each site class.
    for idx in range(3):
        key = '%s_omega%dprop' % (prefix, idx)
        dat[key] = class_value(idx, field='proportion')
    return dat
import csv
from Bio.Phylo.PAML import codeml

# runparse.sh supplies the following args:
#   sys.argv[1] is described as a directory
#   sys.argv[2] is geneID
# NOTE(review): sys.argv[1] is handed straight to codeml.read() below, so it
# presumably names the result file itself -- confirm against runparse.sh.
fname = sys.argv[1]
gene = sys.argv[2]
filename = sys.argv[3]

# A header row (gene-null, lnL, branch length, then prop/background/
# foreground for each site class) used to be appended to `filename` here;
# that code is disabled.

results = codeml.read(fname)

# Everything below comes from the NSsites entry keyed 2.
model_2 = results['NSsites'][2]
lnl = model_2['lnL']
treel = model_2['tree length']

site_classes = model_2['parameters']['site classes']

# Site class 0: proportion, background omega, foreground omega.
class_0 = site_classes[0]
p0 = class_0['proportion']
b0 = class_0['branch types']['background']
f0 = class_0['branch types']['foreground']

# Site class 1: same three values.
class_1 = site_classes[1]
p1 = class_1['proportion']
b1 = class_1['branch types']['background']
f1 = class_1['branch types']['foreground']
os.chdir('/Users/xji3/GitFolders/Genconv/PAMLCheck')

if __name__ == '__main__':
    # Each line of the pair list is "<gene1>_<gene2>"; split it into its parts.
    all_pairs = './Filtered_pairs.txt'
    with open(all_pairs, 'r') as f:
        pairs = [line.replace('\n', '').split('_') for line in f]

##    for paralog in pairs:
##        initialize(paralog)
##        ctl_file = '/Users/xji3/GitFolders/Genconv/PAMLCheck/output/' + '_'.join(paralog) + '/' + '_'.join(paralog) + '_codeml.ctl'
##        run_paml(paralog, ctl_file)

    summary_mat = []
    finished_list = []
    label = ['MG94_codeml_tree_length', 'MG94_codeml_lnL']
    footer = ' '.join(label)

    #pairs = pairs[0:2]
    for pair in pairs:
        pair_name = '_'.join(pair)
        codeml_result = codeml.read(
            '/Users/xji3/GitFolders/Genconv/PAMLCheck/output/'
            + pair_name + '/' + pair_name + '_codeml_result.txt')
        model_0 = codeml_result['NSsites'][0]
        summary_mat.append([model_0['tree length'], model_0['lnL']])
        finished_list.append(pair)

    header = ' '.join(['_'.join(pair) for pair in finished_list])  # column labels
    # One column per pair, one row per label.  The file is opened in a
    # ``with`` block so the handle is flushed and closed once the table is
    # written (it was previously left open).
    with open('/Users/xji3/GitFolders/Genconv/PAMLCheck/paml_summary.txt', 'w+') as out:
        np.savetxt(out, np.matrix(summary_mat).T,
                   delimiter=' ', footer=footer, header=header)
tree=tree_file, out_file=m0_out) cmlM0.set_options(seqtype=1) cmlM0.set_options(model=0) cmlM0.set_options(NSsites=[0]) cmlM0.set_options(omega=0.5) cmlM0.set_options(CodonFreq=2) cmlM0.set_options(ndata=1) cmlM0.set_options(fix_alpha=1) cmlM0.set_options(Small_Diff=5e-7) # Run the M0 model cmlM0.run(command="/Users/kmoney/Documents/paml4.9e/bin/codeml") # Get tree from m0 results m0result = codeml.read(m0_out) NSsites_dict = m0result.get("NSsites") NSsites0_dict = NSsites_dict.get(0) estimated_tree = NSsites0_dict.get("tree") # Write tree to output tree file f = open(estimated_tree_name, "w") f.write(estimated_tree) f.close() # Now run all sites models cml = codeml.Codeml(alignment=alignment_file, tree=estimated_tree_name, out_file=final_out) cml.set_options(seqtype=1) cml.set_options(model=0)
for IGC_geo in IGC_geo_list: label = ['ll', 'kappa', 'omega'] header = [] summary_mat = [] for sim_num in range(100): #wk_dir = '/Users/xji3/GitFolders/IGCCodonSimulation/YDR418W_YEL054C/IGCgeo_' + str(IGC_geo) + '/sim_' + str(sim_num) + '/' wk_dir = '/Users/xji3/GitFolders/IGCCodonSimulation/YDR418W_YEL054C_estimatedTau/IGCgeo_' + str(IGC_geo) + '/sim_' + str(sim_num) + '/' seq_loc = wk_dir + 'YDR418W_YEL054C_MG94_geo_' + str(IGC_geo) + '_Sim_' + str(sim_num) + '.fasta' ctl_loc = wk_dir + 'geo_' + str(IGC_geo) + '_Sim_' + str(sim_num) + '_localTree_' + str(local_tree_num) + '_codeml.ctl' out_file = wk_dir + 'unrooted_MG94_geo_' + str(IGC_geo) + '_Sim_' + str(sim_num) + '_localTree_' + str(local_tree_num) + '_codeml_output.txt' ## prepare_ctl(tree_loc, seq_loc, out_file, ctl_loc) ## run_paml(wk_dir, ctl_loc)#, "/Users/xji3/Software/paml4.8/bin/codeml") ## if os.path.isfile(out_file): codeml_result = codeml.read(out_file) tree_file = out_file.replace('codeml_output.txt', 'codeml_est.newick') with open(tree_file, 'w+') as f: f.write(codeml_result['NSsites'][0]['tree'] + '\n') edge_to_blen, edge_list = get_tree(tree_file, name_tree) if sim_num == 0: edge_list_fix = deepcopy(edge_list) summary = [codeml_result['NSsites'][0]['lnL'], codeml_result['NSsites'][0]['parameters']['kappa'], codeml_result['NSsites'][0]['parameters']['omega']] summary.extend([edge_to_blen[edge] for edge in edge_list_fix]) summary_mat.append(summary) header.append('geo_' + str(IGC_geo) + '_sim_' + str(sim_num))
sim_list.extend(range(1, 100)) for sim_num in sim_list: #wk_dir = '/Users/xji3/GitFolders/IGCSimulation/YDR418W_YEL054C/IGCgeo_' + str(IGC_geo) + '/sim_' + str(sim_num) + '/' wk_dir = '/Users/xji3/GitFolders/IGCSimulation/YDR418W_YEL054C_estimatedTau/IGCgeo_' + str(IGC_geo) + '/sim_' + str(sim_num) + '/' seq_loc = wk_dir + 'YDR418W_YEL054C_MG94_geo_' + str(IGC_geo) + '_Sim_' + str(sim_num) + '.fasta' ctl_loc = wk_dir + 'geo_' + str(IGC_geo) + '_Sim_' + str(sim_num) + '_codeml.ctl' out_file = wk_dir + 'unrooted_MG94_geo_' + str(IGC_geo) + '_Sim_' + str(sim_num) + '_codeml_output.txt' #prepare_ctl(tree_loc, seq_loc, out_file, ctl_loc) #run_paml(wk_dir, ctl_loc)#, "/Users/Xiang/Software/paml4.8/bin/codeml") out_tree1_file = out_file.replace('_output.txt', '_tree1_output.txt') out_tree2_file = out_file.replace('_output.txt', '_tree2_output.txt') out_tree_files = [out_tree1_file, out_tree2_file] Seperate_codeml_result(out_file, out_tree_files) if os.path.isfile(out_tree1_file): codeml_result = codeml.read(out_tree1_file) tree1_file = out_file.replace('codeml_output.txt', 'codeml_tree1_est.newick') with open(tree1_file, 'w+') as f: f.write(codeml_result['NSsites'][0]['tree'] + '\n') edge_to_blen, edge_list_1 = get_tree(tree1_file, name_tree_1st) if sim_num == 0: edge_list_1_fix = deepcopy(edge_list_1) summary = [codeml_result['NSsites'][0]['lnL'], codeml_result['NSsites'][0]['parameters']['kappa'], codeml_result['NSsites'][0]['parameters']['omega']] summary.extend([edge_to_blen[edge] for edge in edge_list_1_fix]) summary_mat.append(summary) header.append('geo_' + str(IGC_geo) + '_sim_' + str(sim_num)) edge_to_blen = None
num_list = range(1, 1673) for num in num_list: try: wdir = "prot_" + str(num) acc_file = "all_prot.part-" + str(num) + "_filtered.acc" acc = str(open(acc_file).readline().rstrip()) infile_null = wdir + "/paml_results_null.out" infile_alt = wdir + "/paml_results_alt.out" outfile2 = wdir + "/CodemlCompare_prot" + str(num) + ".txt" print(wdir) print(acc) # read in data and parse it for relevant values results_null = codeml.read(infile_null) results_alt = codeml.read(infile_alt) lnL_null = results_null.get("NSsites").get(0).get("lnL") lnL_alt = results_alt.get("NSsites").get(0).get("lnL") likelihood_ratio = -2 * (lnL_null - lnL_alt) p_value = chi2.sf(likelihood_ratio, 1) arth_omega = results_alt.get("NSsites").get(0).get("parameters").get( "omega")[1] background_omega = results_alt.get("NSsites").get(0).get( "parameters").get("omega")[0] dS_0 = results_null.get("NSsites").get(0).get("parameters").get("dS") dN_0 = results_null.get("NSsites").get(0).get("parameters").get("dN") dS_2 = results_alt.get("NSsites").get(0).get("parameters").get("dS")
#wk_dir = '/Users/xji3/GitFolders/IGCCodonSimulation/YDR418W_YEL054C/IGCgeo_' + str(IGC_geo) + '/sim_' + str(sim_num) + '/' wk_dir = '/Users/xji3/GitFolders/IGCCodonSimulation/YDR418W_YEL054C_estimatedTau/IGCgeo_' + str( IGC_geo) + '/sim_' + str(sim_num) + '/' seq_loc = wk_dir + 'YDR418W_YEL054C_MG94_geo_' + str( IGC_geo) + '_Sim_' + str(sim_num) + '.fasta' ctl_loc = wk_dir + 'geo_' + str(IGC_geo) + '_Sim_' + str( sim_num) + '_localTree_' + str( local_tree_num) + '_codeml.ctl' out_file = wk_dir + 'unrooted_MG94_geo_' + str( IGC_geo) + '_Sim_' + str(sim_num) + '_localTree_' + str( local_tree_num) + '_codeml_output.txt' ## prepare_ctl(tree_loc, seq_loc, out_file, ctl_loc) ## run_paml(wk_dir, ctl_loc)#, "/Users/xji3/Software/paml4.8/bin/codeml") ## if os.path.isfile(out_file): codeml_result = codeml.read(out_file) tree_file = out_file.replace('codeml_output.txt', 'codeml_est.newick') with open(tree_file, 'w+') as f: f.write(codeml_result['NSsites'][0]['tree'] + '\n') edge_to_blen, edge_list = get_tree(tree_file, name_tree) if sim_num == 0: edge_list_fix = deepcopy(edge_list) summary = [ codeml_result['NSsites'][0]['lnL'], codeml_result['NSsites'][0]['parameters']['kappa'], codeml_result['NSsites'][0]['parameters']['omega'] ] summary.extend( [edge_to_blen[edge] for edge in edge_list_fix])
# Module 10
#
# 1. Run another Python script from within Python.
import os

os.system("python /home/arina/other_script.py")

# 2. Create a new file with the “>” redirection operator.
os.system("python /home/arina/other_script.py > logfile")

# 3. Run the program “codeml” (in the subfolder paml). It requires an input
#    file, which you can copy or explicitly define as an argument.
from Bio.Phylo.PAML import codeml

cml = codeml.Codeml(
    alignment="/home/arina/codeml/alignment.phylip",
    tree="/home/arina/codeml/species.tree",
    out_file="results.out",
    working_dir="/home/arina/codeml/",
)
results = cml.run()
# NOTE(review): `command` presumably has to name the PAML `codeml`
# executable itself, not Biopython's codeml.py module -- confirm against the
# Biopython documentation before retrying the call quoted below.
'''
>> Now here I stuck! :(
I am getting an error - OSError: [Errno 2] No such file or directory
and when trying to indicate path to codeml explicitly
results = cml.run(command="/usr/local/lib/python2.7/dist-packages/Bio/Phylo/PAML/codeml.py")
getting error - OSError: [Errno 13] Permission denied
'''

# 4. Obtain the likelihood from the codeml output file (file codeml_output;
#    it is on the line with "lnL = ") using a script that prints it.
#    >> I suppose it should be something like this:
# NOTE(review): the per-model value may be stored under
# results["NSsites"][<model>]["lnL"] rather than "lnL max" -- confirm.
results = codeml.read("results.out")
print(results.get("lnL max"))
# Analysis options shared by every run.
cml.set_options(
    seqtype=1,
    omega=0.4,
    getSE=0,
    noisy=3,
    Mgene=0,
    kappa=2,
    model=0,
    ndata=1,
)

##### PROGRAM #####

# Folders are named "<number>PAML"; only their count is used, and the
# numbers 1..count are regenerated for naming.
paml_folders = glob.glob('*PAML')
paml_folder = list(range(1, len(paml_folders) + 1))

for folder in paml_folder:  # Loops through all current PAML folders
    folder_name = '%sPAML' % folder              # ID of the folder name
    curr_dir = "C:\\analysis\\" + folder_name    # Sets current directory
    curr_test = folder_name[:-4]                 # Test number without "PAML"

    # Assigning file names
    aln_file = curr_test + 'fix.afa'
    tree_file = curr_test + '.ph'

    # Setting up files for PAML analysis
    analysis_dir = curr_dir + "\\analysis"
    cml.alignment = curr_dir + "\\" + aln_file
    cml.tree = curr_dir + "\\" + tree_file
    cml.out_file = analysis_dir + "\\output.txt"
    cml.working_dir = analysis_dir

    cml.run(verbose=True,
            command="C:\\analysis\\Phylogenetic\\paml4.8\\bin\\codeml.exe")
    results = codeml.read(cml.out_file)