def OrthologuesWorkflow(workingDir_ogs,
                        orthofinderResultsDir,
                        speciesToUse, nSpAll,
                        clustersFilename_pairs,
                        tree_options,
                        msa_method,
                        tree_method,
                        nHighParallel,
                        nLowParrallel,
                        userSpeciesTree = None,
                        qStopAfterSeqs = False,
                        qStopAfterAlign = False,
                        qStopAfterTrees = False,
                        qMSA = False,
                        qPhyldog = False,
                        pickleDir=None):
    """
    Run the orthologue-inference workflow and return a text summary of the results.

    Stages:
      1. Setup: build the OrthoGroupsSet and create a new "Orthologues_" results
         directory under orthofinderResultsDir.
      2. Gene trees: via msa.TreesForOrthogroups when qMSA or qPhyldog is set,
         otherwise via DendroBLASTTrees (ReadAndPickle + RunAnalysis).
      3. Species tree: convert the user-supplied tree if one was given, otherwise
         ask rfd.GetRoot for the best outgroup(s).
      4. Reconciliation and orthologue inference, once per candidate root.
      5. Clean up (delete BLAST matrices, clean the working directory).

    Early exits:
      - qStopAfterSeqs / qStopAfterAlign (only honoured on the qMSA/qPhyldog path):
        return a string listing the sequence (and alignment) directories.
      - qPhyldog: hands over to trees_from_phyldog.RunPhyldogAnalysis and returns.
      - qStopAfterTrees: return after the gene trees / rooted species tree(s)
        have been written, without reconciliation.

    Arguments:
      workingDir_ogs, speciesToUse, nSpAll, clustersFilename_pairs, pickleDir -
          forwarded to OrthoGroupsSet.
      orthofinderResultsDir - prefix under which the results directory is created.
      tree_options, msa_method, tree_method - forwarded to msa.TreesForOrthogroups.
      nHighParallel - forwarded to the tree generation, rooting and reconciliation
          steps (presumably a process count - confirm against callers).
      nLowParrallel - forwarded to DendroBLASTTrees (presumably a process count).
      userSpeciesTree - optional user-supplied species tree; when truthy no
          species tree is inferred or rooted.

    Returns:
      str - human-readable description of the results locations.
    """
    ogSet = OrthoGroupsSet(workingDir_ogs, speciesToUse, nSpAll, clustersFilename_pairs,
                           idExtractor=util.FirstWordExtractor, pickleDir=pickleDir)

    # NOTE: the class that is going to run the analysis needs to check the
    # dependencies; the CanRunOrthologueDependencies check that used to live
    # here is disabled.

    resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_")

    # === 1 ===  (ust = UserSpeciesTree)
    # MSA:                Sequences  Alignments                    GeneTrees  db  SpeciesTree
    # Phyldog:            Sequences  Alignments                    GeneTrees  db  SpeciesTree
    # Dendroblast:                               DistanceMatrices  GeneTrees  db  SpeciesTree
    # MSA (ust):          Sequences  Alignments                    GeneTrees  db
    # Phyldog (ust):      Sequences  Alignments                    GeneTrees  db
    # Dendroblast (ust):                         DistanceMatrices  GeneTrees  db
    if not (qMSA or qPhyldog):
        util.PrintUnderline("Calculating gene distances")
        db = DendroBLASTTrees(ogSet, resultsDir, nLowParrallel)
        db.ReadAndPickle()
        # Only the species tree (IDs) filename is needed later; the unpacking
        # still checks that RunAnalysis returns four values.
        _nOGs, _D, spTreeFN_ids, _spTreeUnrootedFN = db.RunAnalysis()
    else:
        tree_builder = msa.TreesForOrthogroups(tree_options, msa_method, tree_method, resultsDir, workingDir_ogs)
        seqs_alignments_dirs = tree_builder.DoTrees(ogSet.OGs(qInclAll=True),
                                                    ogSet.Spec_SeqDict(),
                                                    nHighParallel,
                                                    qStopAfterSeqs,
                                                    qStopAfterAlign or qPhyldog)
        if qStopAfterSeqs:
            print("")
            return ("\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0])
        if qStopAfterAlign:
            print("")
            return "".join(["\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0],
                            "\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1]])
        db = DendroBLASTTrees(ogSet, resultsDir, nLowParrallel)
        if not userSpeciesTree:
            util.PrintUnderline("Inferring species tree (calculating gene distances)")
            print("Loading BLAST scores")
            db.ReadAndPickle()
            spTreeFN_ids, _spTreeUnrootedFN = db.SpeciesTreeOnly()
        if qPhyldog:
            trees_from_phyldog.RunPhyldogAnalysis(resultsDir + "WorkingDirectory/phyldog/", ogSet.OGs(), speciesToUse)
            return "Running Phyldog" + "\n".join(seqs_alignments_dirs)

    # === 2 ===
    # Check can continue with analysis
    if len(ogSet.speciesToUse) < 4:
        print("ERROR: Not enough species to infer species tree")
        util.Fail()

    # === 3 ===
    # MSA:                RootSpeciesTree
    # Phyldog:            RootSpeciesTree
    # Dendroblast:        RootSpeciesTree
    # MSA (ust):          ConvertSpeciesTreeIDs
    # Phyldog (ust):      ConvertSpeciesTreeIDs
    # Dendroblast (ust):  ConvertSpeciesTreeIDs
    if userSpeciesTree:
        util.PrintUnderline("Using user-supplied species tree")
        userSpeciesTree = ConvertUserSpeciesTree(db.workingDir + "Trees_ids/", userSpeciesTree, ogSet.SpeciesDict())
        rootedSpeciesTreeFN = [userSpeciesTree]
        roots = [None]
        qMultiple = False
    else:
        util.PrintUnderline("Best outgroup(s) for species tree")
        spDict = ogSet.SpeciesDict()
        geneTreesDir = os.path.split(db.TreeFilename_IDs(0))[0] + "/"
        roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(spTreeFN_ids, geneTreesDir, rfd.GeneToSpecies_dash, nHighParallel, treeFmt = 1)
        dupCounts = (len(clusters), nSupport, len(clusters) - nSupport)
        if len(roots) > 1:
            print("Observed %d duplications. %d support the best roots and %d contradict them." % dupCounts)
            print("Best outgroups for species tree:")
        else:
            print("Observed %d duplications. %d support the best root and %d contradict it." % dupCounts)
            print("Best outgroup for species tree:")
        for outgroup in roots:
            print(" " + ", ".join([spDict[s] for s in outgroup]))
        qMultiple = len(roots) > 1

    if qStopAfterTrees:
        if userSpeciesTree:
            # Nothing to root: just report where the trees (and MSA outputs) are.
            summaryParts = []
            if qMSA:
                summaryParts.append("\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0])
                summaryParts.append("\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1])
            summaryParts.append("\nGene trees:\n %s\n" % (resultsDir + "Gene_Trees/"))
            return "".join(summaryParts)
        # otherwise, root species tree
        resultsSpeciesTrees = []
        for iRoot, (_outgroup, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            treeName = "SpeciesTree_rooted.txt" if len(roots) == 1 else "SpeciesTree_rooted_at_outgroup_%d.txt" % iRoot
            resultsSpeciesTrees.append(resultsDir + treeName)
            util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)
        db.DeleteBlastMatrices()
        CleanWorkingDir(db.workingDir)
        return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None, False)

    if qMultiple:
        util.PrintUnderline("\nAnalysing each of the potential species tree roots", True)
    resultsSpeciesTrees = []
    for iRoot, (outgroup, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        heading = "Reconciling gene trees and species tree"
        if qMultiple:
            heading = heading + " (root %d)" % iRoot
        util.PrintUnderline(heading)
        if qMultiple:
            resultsDir_new = resultsDir + "Orthologues_using_outgroup_%d/" % iRoot
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees_using_outgroup_%d/" % iRoot
            resultsSpeciesTrees.append(resultsDir_new + "SpeciesTree_rooted_at_outgroup_%d.txt" % iRoot)
        else:
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
        # A user-supplied tree has no inferred outgroup to report (its root is None).
        if qMultiple or not userSpeciesTree:
            print("Outgroup: " + ", ".join([spDict[s] for s in outgroup]))
        os.mkdir(resultsDir_new)
        util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)
        ReconciliationAndOrthologues(db.TreeFilename_IDs, db.ogSet, speciesTree_fn, db.workingDir,
                                     resultsDir_new, reconTreesRenamedDir, nHighParallel,
                                     iRoot if qMultiple else None, pickleDir=pickleDir)

    db.DeleteBlastMatrices()
    CleanWorkingDir(db.workingDir)
    util.PrintUnderline("Writing results files", True)

    return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None)
def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, speciesToUse, nSpAll, clustersFilename_pairs, nProcesses):
    """
    Infer orthologues from DendroBLAST gene trees and return a results summary.

    Steps (as announced on stdout):
      1. Check the required programs are installed (CanRunDependencies).
      2. Calculate gene distances / trees (DendroBLASTTrees.ReadAndPickle, RunAnalysis).
      4. Find the best outgroup(s) for the species tree (rfd.GetRoot).
      5. Reconcile gene and species trees with RunDlcpar, once per candidate root.
      6. Infer orthologues from the reconciled trees (pt.get_orthologue_lists).
      7. Report the results files.

    Arguments:
      orthofinderWorkingDir, speciesToUse, nSpAll, clustersFilename_pairs -
          forwarded to OrthoGroupsSet.
      orthofinderResultsDir - prefix under which the "Orthologues_" results
          directory is created.
      nProcesses - forwarded to DendroBLASTTrees and rfd.GetRoot (presumably the
          number of worker processes - confirm against callers).

    Returns:
      The value of GetResultsFilesString for the rooted species tree file(s).

    Exits the program (util.Fail / sys.exit) if fewer than four species are
    available or the dependency check fails.
    """
    ogSet = OrthoGroupsSet(orthofinderWorkingDir, speciesToUse, nSpAll, clustersFilename_pairs, idExtractor=util.FirstWordExtractor)
    if len(ogSet.speciesToUse) < 4:
        print("ERROR: Not enough species to infer species tree")
        util.Fail()

    print("\n1. Checking required programs are installed")
    print("-------------------------------------------")
    if not CanRunDependencies(orthofinderWorkingDir):
        print("Orthogroups have been inferred but the dependencies for inferring gene trees and\northologues have not been met. Please review previous messages for more information.")
        sys.exit()

    print("\n2. Calculating gene distances")
    print("-----------------------------")
    resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_")

    db = DendroBLASTTrees(ogSet, resultsDir, nProcesses)
    db.ReadAndPickle()
    nOGs, D, spPairs, spTreeFN_ids = db.RunAnalysis()

    print("\n4. Best outgroup(s) for species tree")
    print("------------------------------------")
    spDict = ogSet.SpeciesDict()
    roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(spTreeFN_ids, os.path.split(db.treesPatIDs)[0] + "/", rfd.GeneToSpecies_dash, nProcesses, treeFmt=1)
    if len(roots) > 1:
        print("Observed %d duplications. %d support the best roots and %d contradict them." % (len(clusters), nSupport, len(clusters) - nSupport))
        print("Best outgroups for species tree:")
    else:
        print("Observed %d duplications. %d support the best root and %d contradict it." % (len(clusters), nSupport, len(clusters) - nSupport))
        print("Best outgroup for species tree:")
    for r in roots:
        print(" " + (", ".join([spDict[s] for s in r])))

    qMultiple = len(roots) > 1
    if qMultiple:
        print("\nAnalysing each of the potential species tree roots.")
    resultsSpeciesTrees = []
    for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        # With several candidate roots each one gets its own output directories.
        if qMultiple:
            resultsDir_new = resultsDir + "Orthologues_using_outgroup_%d/" % i
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees_using_outgroup_%d/" % i
            resultsSpeciesTrees.append(resultsDir_new + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
        else:
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
        os.mkdir(resultsDir_new)
        util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)

        print("\n5%s. Reconciling gene and species trees" % ("-%d" % i if qMultiple else ""))
        print("-------------------------------------" + ("--" if qMultiple else ""))
        print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        dlcparResultsDir = RunDlcpar(db.treesPatIDs, ogSet, nOGs, speciesTree_fn, db.workingDir)
        os.mkdir(reconTreesRenamedDir)
        # `range` rather than `xrange`: xrange does not exist on Python 3, and
        # range behaves the same here on Python 2.
        for iog in range(len(db.ogSet.OGs())):
            util.RenameTreeTaxa(dlcparResultsDir + "OG%07d_tree_id.locus.tree" % iog,
                                reconTreesRenamedDir + "OG%07d_tree.txt" % iog,
                                db.ogSet.Spec_SeqDict(),
                                qFixNegatives=False,
                                inFormat=8)

        # Orthologue lists
        print("\n6%s. Inferring orthologues from gene trees" % ("-%d" % i if qMultiple else ""))
        print("----------------------------------------" + ("--" if qMultiple else ""))
        pt.get_orthologue_lists(ogSet, resultsDir_new, dlcparResultsDir, db.workingDir)

    # Clean up only once every candidate root has been processed.
    CleanWorkingDir(db)
    print("\n7. Writing results files")
    print("------------------------")

    return GetResultsFilesString(resultsSpeciesTrees)
def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs, nProcesses):
    """
    Infer orthologues from DendroBLAST gene trees and return a results summary.

    Progress is announced on stdout as numbered steps: dependency check (1),
    reading similarity scores / running the DendroBLAST analysis (2), choosing
    the best species-tree outgroup(s) (4), then, for every candidate root,
    DLCpar reconciliation (5) and orthologue inference (6), and finally the
    results listing (7).

    Arguments:
      orthofinderWorkingDir, clustersFilename_pairs - forwarded to OrthoGroupsSet.
      orthofinderResultsDir - prefix under which the "Orthologues_" results
          directory is created.
      nProcesses - forwarded to DendroBLASTTrees and rfd.GetRoot (presumably the
          number of worker processes - confirm against callers).

    Returns:
      The value of GetResultsFilesString for the rooted species tree file(s).

    Calls orthofinder.Fail() if fewer than four species are available or the
    dependency check fails.
    """
    ogSet = OrthoGroupsSet(orthofinderWorkingDir,
                           clustersFilename_pairs,
                           idExtractor=orthofinder.FirstWordExtractor)
    if len(ogSet.speciesToUse) < 4:
        print("ERROR: Not enough species to infer species tree")
        orthofinder.Fail()

    print("\n1. Checking required programs are installed")
    print("-------------------------------------------")
    if not CanRunDependencies(orthofinderWorkingDir):
        orthofinder.Fail()

    print("\n2. Reading sequence similarity scores")
    print("-------------------------------------")
    resultsDir = orthofinder.util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_")

    db = DendroBLASTTrees(ogSet, resultsDir, nProcesses)
    db.ReadAndPickle()
    # D and spPairs are not used below; the unpacking is kept for its arity check.
    nOGs, _D, _spPairs, spTreeFN_ids = db.RunAnalysis()

    print("\n4. Best outgroup(s) for species tree")
    print("------------------------------------")
    spDict = ogSet.SpeciesDict()
    idTreesDir = os.path.split(db.treesPatIDs)[0] + "/"
    roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(
        spTreeFN_ids, idTreesDir, rfd.GeneToSpecies_dash, nProcesses, treeFmt=1)
    dupCounts = (len(clusters), nSupport, len(clusters) - nSupport)
    if len(roots) > 1:
        print("Observed %d duplications. %d support the best roots and %d contradict them." % dupCounts)
        print("Best outgroups for species tree:")
    else:
        print("Observed %d duplications. %d support the best root and %d contradict it." % dupCounts)
        print("Best outgroup for species tree:")
    for outgroup in roots:
        print(" " + ", ".join([spDict[s] for s in outgroup]))

    qMultiple = len(roots) > 1
    if qMultiple:
        print("\nAnalysing each of the potential species tree roots.")

    resultsSpeciesTrees = []
    for iRoot, (outgroup, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        # Step labels get a "-<root index>" suffix (and a longer underline)
        # only when several roots are being analysed.
        stepSuffix = "-%d" % iRoot if qMultiple else ""
        extraUnderline = "--" if qMultiple else ""

        resultsDir_new = resultsDir + ("Orthologues_for_potential_outgroup_%d/" % iRoot if qMultiple else "Orthologues/")
        os.mkdir(resultsDir_new)
        rootedTreeOut = resultsDir_new + "SpeciesTree_rooted.txt"
        resultsSpeciesTrees.append(rootedTreeOut)
        db.RenameTreeTaxa(speciesTree_fn, rootedTreeOut, db.ogSet.SpeciesDict(), qFixNegatives=True)

        print("\n5%s. Reconciling gene and species trees" % stepSuffix)
        print("-------------------------------------" + extraUnderline)
        print("Root: " + ", ".join([spDict[s] for s in outgroup]))
        dlcparResultsDir = RunDlcpar(db.treesPatIDs, ogSet, nOGs, speciesTree_fn, db.workingDir)

        # Orthologue lists
        print("\n6%s. Inferring orthologues from gene trees" % stepSuffix)
        print("----------------------------------------" + extraUnderline)
        pt.get_orthologue_lists(ogSet, resultsDir_new, dlcparResultsDir, db.workingDir)

    print("\n7. Writing results files")
    print("------------------------")

    return GetResultsFilesString(resultsSpeciesTrees)