def RunAnalysis(self, qSpeciesTree=True):
    """
    Calculate the gene distance matrices, then infer the gene trees and the
    unrooted species tree.

    qSpeciesTree - if True the species tree is also written using species names
                   and its ID-labelled filename is returned

    Returns (spTreeFN_ids, qSTAG) when qSpeciesTree is True, otherwise
    (None, qSTAG). qSTAG says whether stag.Run_ForOrthoFinder was used.
    """
    util.PrintUnderline("Calculating gene distances")
    ogs, ogMatrices_partial = self.GetOGMatrices_FullParallel()
    ogMatrices = self.CompleteAndWriteOGMatrices(ogs, ogMatrices_partial)
    util.PrintTime("Done")

    commandLists = self.PrepareGeneTreeCommand()
    species = self.ogSet.seqsInfo.speciesToUse
    if len(species) < 4:
        # Fewer than four species: the species tree is written directly
        qSTAG = False
        spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        WriteSpeciesTreeIDs_TwoThree(species, spTreeFN_ids)
    else:
        qSTAG = self.EnoughOGsForSTAG(ogs, species)
        if not qSTAG:
            print("Using fallback species tree inference method")
            D, spPairs = self.SpeciesTreeDistances(ogs, ogMatrices)
            cmd_spTree, spTreeFN_ids = self.PrepareSpeciesTreeCommand(D, spPairs)
            # The species tree command is run ahead of the gene tree commands
            commandLists = [[cmd_spTree]] + commandLists

    util.PrintUnderline("Inferring gene and species trees")
    util.RunParallelOrderedCommandLists(self.nProcesses, commandLists)

    if qSTAG:
        # Trees must have been completed
        print("")
        spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        stag.Run_ForOrthoFinder(
            files.FileHandler.GetOGsTreeDir(),
            files.FileHandler.GetWorkingDirectory_Write(),
            species,
            spTreeFN_ids,
        )

    # Gene trees: IDs -> accessions
    seqDict = self.ogSet.Spec_SeqDict()
    nOGs = len(self.ogSet.OGs())
    for iog in xrange(nOGs):
        util.RenameTreeTaxa(
            files.FileHandler.GetOGsTreeFN(iog),
            files.FileHandler.GetOGsTreeFN(iog, True),
            seqDict,
            qSupport=False,
            qFixNegatives=True,
        )

    if not qSpeciesTree:
        return None, qSTAG
    util.RenameTreeTaxa(
        spTreeFN_ids,
        files.FileHandler.GetSpeciesTreeUnrootedFN(True),
        self.ogSet.SpeciesDict(),
        qSupport=False,
        qFixNegatives=True,
    )
    return spTreeFN_ids, qSTAG
def OrthologuesFromTrees(groupsDir, workingDir, nHighParallel, speciesTree_fn = None, pickleDir=None):
    """
    Just infer orthologues from trees, don't do any of the preceding steps.

    groupsDir - directory with orthogroups file in
    workingDir - orthologues 'WorkingDirectory'
    nHighParallel - passed on to ReconciliationAndOrthologues
    speciesTree_fn - None if not supplied, otherwise rooted tree using user
                     species names (not orthofinder IDs)
    pickleDir - passed on to ReconciliationAndOrthologues

    Returns a message string naming the orthologues results directory.
    """
    # Check species tree
    qUserSpTree = speciesTree_fn is not None
    if qUserSpTree:
        if not os.path.exists(speciesTree_fn):
            print("\nERROR: %s does not exist\n" % speciesTree_fn)
            util.Fail()
    else:
        # Look for a previously rooted species tree in the working directory
        possibilities = ["SpeciesTree_ids_0_rooted.txt", "SpeciesTree_ids_1_rooted.txt", "SpeciesTree_user_ids.txt"]  # etc (only need to determine if unique)
        nTrees = 0
        for p in possibilities:
            fn = workingDir + "Trees_ids/" + p
            if os.path.exists(fn):
                nTrees += 1
                speciesTree_fn = fn
        if nTrees == 0:
            print("\nERROR: There is a problem with the specified directory. The rooted species tree %s or %s is not present." % (possibilities[0], possibilities[2]))
            print("Please rectify the problem or alternatively use the -s option to specify the species tree to use.\n")
            util.Fail()
        if nTrees > 1:
            print("\nERROR: There is more than one rooted species tree in the specified directory structure. Please use the -s option to specify which species tree should be used\n")
            util.Fail()

    def TreePatIDs(iog):
        return workingDir + ("Trees_ids/OG%07d_tree_id.txt" % iog)

    reconTreesRenamedDir = workingDir + "Recon_Gene_Trees/"
    # for the Orthologues_Species/ directories
    resultsDir_new = workingDir + "../Orthologues"
    resultsDir_new = util.CreateNewWorkingDirectory(resultsDir_new + "_")

    orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = util.GetOGsFile(groupsDir)
    speciesToUse, nSpAll = util.GetSpeciesToUse(orthofinderWorkingDir + "SpeciesIDs.txt")
    ogSet = OrthoGroupsSet(orthofinderWorkingDir, speciesToUse, nSpAll, clustersFilename_pairs, idExtractor = util.FirstWordExtractor)
    if qUserSpTree:
        # The user tree is checked against the species names and then converted;
        # the converted tree's filename replaces speciesTree_fn
        speciesToUseNames = ogSet.SpeciesDict().values()
        CheckUserSpeciesTree(speciesTree_fn, speciesToUseNames)
        speciesTree_fn = ConvertUserSpeciesTree(workingDir + "Trees_ids/", speciesTree_fn, ogSet.SpeciesDict())

    util.PrintUnderline("Running Orthologue Prediction", True)
    util.PrintUnderline("Reconciling gene and species trees")
    ReconciliationAndOrthologues(TreePatIDs, ogSet, speciesTree_fn, workingDir, resultsDir_new, reconTreesRenamedDir, nHighParallel, pickleDir=pickleDir)
    util.PrintUnderline("Writing results files")
    CleanWorkingDir(workingDir)
    return "Species-by-species orthologues directory:\n %s\n" % resultsDir_new
def DoTrees(self, ogs, idDict, nProcesses, qStopAfterSeqs, qStopAfterAlignments):
    """
    Write orthogroup sequence files, then infer alignments and gene trees.

    ogs - orthogroups, passed to WriteFastaFiles and the command builders
    idDict - maps sequence IDs to accessions
    nProcesses - number of parallel processes for the external commands
    qStopAfterSeqs - return once the sequence files are written
    qStopAfterAlignments - return once the alignments have been run

    Returns the list of results directories (all those created when stopping
    after sequences, otherwise the first two).
    """
    # 0 - create the output directories (ID version first, then accession version)
    resultsDirsFullPath = []
    filenameGetters = [self.GetFastaFilename, self.GetAlignmentFilename, self.GetTreeFilename]
    for getter in filenameGetters:
        for qAccessions in (False, True):
            directory = os.path.split(getter(0, qAccessions))[0]
            if not os.path.exists(directory):
                os.mkdir(directory)
            if qAccessions:
                resultsDirsFullPath.append(directory)
        if qStopAfterSeqs:
            break
        if qStopAfterAlignments and getter == self.GetAlignmentFilename:
            break

    # 1. sequences
    fastaWriter = FastaWriter(self.ogsWorkingDir)
    self.WriteFastaFiles(fastaWriter, ogs, idDict)
    if qStopAfterSeqs:
        return resultsDirsFullPath

    # 2 progress message
    if qStopAfterAlignments:
        util.PrintUnderline("Inferring multiple sequence alignments")
    else:
        util.PrintUnderline("Inferring multiple sequence alignments and gene trees")

    # 3 alignments only
    alignCommands_and_filenames = self.GetAlignmentCommandsAndNewFilenames(ogs)
    if qStopAfterAlignments:
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, alignCommands_and_filenames, False)
        return resultsDirsFullPath[:2]

    # Otherwise, alignments and trees
    nAlign = len(alignCommands_and_filenames)
    alignmentFilesToUse = [self.GetAlignmentFilename(iAlign) for iAlign in xrange(nAlign)]
    treeCommands_and_filenames = self.GetTreeCommands(alignmentFilesToUse, ogs)
    nTrees = len(treeCommands_and_filenames)
    commands_and_filenames = []
    # Orthogroups with a tree command get alignment + tree ...
    for k in xrange(nTrees):
        commands_and_filenames.append([alignCommands_and_filenames[k], treeCommands_and_filenames[k]])
    # ... the remaining ones get the alignment only
    for k in xrange(nTrees, nAlign):
        commands_and_filenames.append([alignCommands_and_filenames[k]])
    pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)

    # Convert ids to accessions
    for k, idsAlignFN in enumerate(alignmentFilesToUse):
        with open(idsAlignFN, 'rb') as infile, open(self.GetAlignmentFilename(k, True), 'wb') as outfile:
            for line in infile:
                if not line.startswith(">"):
                    outfile.write(line)
                    continue
                outfile.write(">" + idDict[line[1:].rstrip()] + "\n")
        treeFN = self.GetTreeFilename(k)
        if os.path.exists(treeFN):
            util.RenameTreeTaxa(self.GetTreeFilename(k), self.GetTreeFilename(k, True), idDict, qFixNegatives=True)
    return resultsDirsFullPath[:2]
def ReconciliationAndOrthologues(treesIDsPatFn, ogSet, speciesTree_fn, workingDir, resultsDir, reconTreesRenamedDir, nParallel, iSpeciesTree=None, pickleDir = None):
    """
    Reconcile the gene trees with the species tree and write the orthologue lists.

    treesIDsPatFn - function returning name of filename
    ogSet - info about the orthogroups, species etc
    speciesTree_fn - the species tree
    workingDir - Orthologues working dir
    resultsDir - where the Orthologues top level results directory will go (should exist already)
    reconTreesRenamedDir - where to put the reconciled trees that use the gene accessions
    nParallel - passed on to RunDlcpar
    iSpeciesTree - which of the potential roots of the species tree is this
    pickleDir - directory handed to rt.create_orthologue_lists; when None,
                workingDir + "matrices_orthologues/" is used
    """
    dlcparResultsDir = RunDlcpar(treesIDsPatFn, ogSet, speciesTree_fn, workingDir, nParallel)
    if not os.path.exists(reconTreesRenamedDir):
        os.mkdir(reconTreesRenamedDir)
    # Build the ID -> accession dictionary once rather than once per orthogroup
    spec_seq_dict = ogSet.Spec_SeqDict()
    for iog in xrange(len(ogSet.OGs())):
        util.RenameTreeTaxa(dlcparResultsDir + "OG%07d_tree_id.dlcpar.locus.tree" % iog, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, spec_seq_dict, qFixNegatives=False, inFormat=8)

    # Orthologue lists
    util.PrintUnderline("Inferring orthologues from gene trees" + (" (root %d)"%iSpeciesTree if iSpeciesTree is not None else ""))
    qDelDir = False
    if pickleDir is None:
        pickleDir = workingDir + "matrices_orthologues/"
        if not os.path.exists(pickleDir):
            os.mkdir(pickleDir)
            # NOTE(review): the flag is set only when the directory is created
            # here, following the "was created" comment below - confirm
            qDelDir = True
    rt.create_orthologue_lists(ogSet, resultsDir, dlcparResultsDir, pickleDir)

    # If a temporary matrices directory was created, delete it now
    if qDelDir:
        if os.path.exists(pickleDir):
            try:
                os.rmdir(pickleDir)
            except OSError:
                # Best effort: os.rmdir only removes an empty directory
                pass
def RunAnalysis(self, qSpeciesTree=True):
    """
    Build the distance matrices, infer the species tree and the gene trees,
    then rename the tree taxa.

    qSpeciesTree - if True the species tree is also written to
                   workingDir + "SpeciesTree_unrooted.txt"

    Returns (number of orthogroups, D, spTreeFN_ids, spTreeUnrootedFN); the
    last two entries are None when qSpeciesTree is False.
    """
    ogs, partialMatrices = self.GetOGMatrices()
    ogMatrices = self.CompleteAndWriteOGMatrices(ogs, partialMatrices)
    D, spPairs = self.SpeciesTreeDistances(ogs, ogMatrices)
    cmd_spTree, spTreeFN_ids = self.PrepareSpeciesTreeCommand(D, spPairs)
    # Species tree command first, followed by the gene tree commands
    commandLists = [[cmd_spTree]] + self.PrepareGeneTreeCommand()

    util.PrintUnderline("Inferring gene and species trees")
    util.RunParallelOrderedCommandLists(self.nProcesses, commandLists, qHideStdout=True)

    # Gene trees: IDs -> accessions
    seqDict = self.ogSet.Spec_SeqDict()
    nTrees = len(self.ogSet.OGs())
    for iog in xrange(nTrees):
        util.RenameTreeTaxa(self.TreeFilename_IDs(iog), self.treesPat % iog, seqDict, qFixNegatives=True)

    if not qSpeciesTree:
        return len(ogs), D, None, None
    spTreeUnrootedFN = self.workingDir + "SpeciesTree_unrooted.txt"
    util.RenameTreeTaxa(spTreeFN_ids, spTreeUnrootedFN, self.ogSet.SpeciesDict(), qFixNegatives=True)
    return len(ogs), D, spTreeFN_ids, spTreeUnrootedFN
def OrthologuesFromTrees(recon_method, nHighParallel, userSpeciesTree_fn, qAddSpeciesToIDs):
    """
    Just infer orthologues from trees, don't do any of the preceding steps.

    recon_method - passed on to ReconciliationAndOrthologues
    nHighParallel - passed on to ReconciliationAndOrthologues
    userSpeciesTree_fn - None if not supplied, otherwise rooted tree using user
                         species names (not orthofinder IDs)
    qAddSpeciesToIDs - passed on to OrthoGroupsSet

    Returns a message string naming the orthologues results directory.
    """
    speciesToUse, nSpAll, _ = util.GetSpeciesToUse(files.FileHandler.GetSpeciesIDsFN())
    ogSet = OrthoGroupsSet(files.FileHandler.GetWorkingDirectory1_Read(), speciesToUse, nSpAll, qAddSpeciesToIDs, idExtractor = util.FirstWordExtractor)
    if userSpeciesTree_fn is not None:
        # Check the user tree against the species names, then convert it and
        # write it to the rooted species tree (IDs) file
        speciesDict = files.FileHandler.GetSpeciesDict()
        speciesToUseNames = [speciesDict[str(iSp)] for iSp in ogSet.speciesToUse]
        CheckUserSpeciesTree(userSpeciesTree_fn, speciesToUseNames)
        speciesTreeFN_ids = files.FileHandler.GetSpeciesTreeIDsRootedFN()
        ConvertUserSpeciesTree(userSpeciesTree_fn, speciesDict, speciesTreeFN_ids)
    util.PrintUnderline("Running Orthologue Prediction", True)
    util.PrintUnderline("Reconciling gene and species trees")
    ReconciliationAndOrthologues(recon_method, ogSet, nHighParallel)
    util.PrintUnderline("Writing results files")
    util.PrintTime("Writing results files")
    files.FileHandler.CleanWorkingDir2()
    return "Species-by-species orthologues directory:\n %s\n" % files.FileHandler.GetOrthologuesDirectory()
def ReconciliationAndOrthologues(recon_method, ogSet, nParallel, iSpeciesTree=None, all_stride_dup_genes=None):
    """
    Infer orthologues from the gene trees using the chosen method and write
    the orthologue statistics.

    recon_method - any value containing "dlcpar" runs DLCpar
                   ("dlcpar_convergedsearch" turns on the deep search),
                   "phyldog" uses the Phyldog route, anything else uses the
                   OrthoFinder route ("only_overlap" sets qNoRecon)
    ogSet - info about the orthogroups, species etc
    nParallel - passed on to RunDlcpar
    iSpeciesTree - which of the potential roots of the species tree is this
    all_stride_dup_genes - passed on to DoOrthologuesForOrthoFinder
    """
    speciesTree_ids_fn = files.FileHandler.GetSpeciesTreeIDsRootedFN()
    labeled_tree_fn = files.FileHandler.GetSpeciesTreeResultsNodeLabelsFN()
    util.RenameTreeTaxa(speciesTree_ids_fn, labeled_tree_fn, ogSet.SpeciesDict(), qSupport=False, qFixNegatives=True, label='N')
    workingDir = files.FileHandler.GetWorkingDirectory_Write()    # workingDir - Orthologues working dir
    resultsDir_ologs = files.FileHandler.GetOrthologuesDirectory()
    reconTreesRenamedDir = files.FileHandler.GetOGsReconTreeDir(True)
    if "dlcpar" in recon_method:
        qDeepSearch = (recon_method == "dlcpar_convergedsearch")
        util.PrintTime("Starting DLCpar")
        dlcparResultsDir, dlcparLocusTreePat = trees2ologs_dlcpar.RunDlcpar(ogSet, speciesTree_ids_fn, workingDir, nParallel, qDeepSearch)
        util.PrintTime("Done DLCpar")
        spec_seq_dict = ogSet.Spec_SeqDict()
        for iog in xrange(len(ogSet.OGs())):
            util.RenameTreeTaxa(dlcparResultsDir + dlcparLocusTreePat % iog, files.FileHandler.GetOGsReconTreeFN(iog), spec_seq_dict, qSupport=False, qFixNegatives=False, inFormat=8, label='n')

        # Orthologue lists
        util.PrintUnderline("Inferring orthologues from gene trees" + (" (root %d)"%iSpeciesTree if iSpeciesTree is not None else ""))
        pickleDir = files.FileHandler.GetPickleDir()
        nOrthologues_SpPair = trees2ologs_dlcpar.create_orthologue_lists(ogSet, resultsDir_ologs, dlcparResultsDir, pickleDir)
    elif "phyldog" == recon_method:
        util.PrintTime("Starting Orthologues from Phyldog")
        nOrthologues_SpPair = trees2ologs_of.DoOrthologuesForOrthoFinder_Phyldog(ogSet, workingDir, trees2ologs_of.GeneToSpecies_dash, resultsDir_ologs, reconTreesRenamedDir)
        util.PrintTime("Done Orthologues from Phyldog")
    else:
        util.PrintTime("Starting OF Orthologues")
        qNoRecon = ("only_overlap" == recon_method)
        nOrthologues_SpPair = trees2ologs_of.DoOrthologuesForOrthoFinder(ogSet, speciesTree_ids_fn, trees2ologs_of.GeneToSpecies_dash, all_stride_dup_genes, qNoRecon)
        util.PrintTime("Done OF Orthologues")
    # Add the result of TwoAndThreeGeneOrthogroups to the per-method counts
    nOrthologues_SpPair += TwoAndThreeGeneOrthogroups(ogSet, resultsDir_ologs)
    WriteOrthologuesStats(ogSet, nOrthologues_SpPair)
def DoTrees(self, ogs, ogMatrix, idDict, speciesIdDict, speciesToUse, nProcesses, qStopAfterSeqs, qStopAfterAlignments, qDoSpeciesTree):
    """
    Write orthogroup sequences, infer alignments and gene trees and, if
    requested, a species tree from a concatenated alignment.

    ogs - orthogroups, passed to WriteFastaFiles and the command builders
    ogMatrix - passed to DetermineOrthogroupsForSpeciesTree
    idDict - maps sequence IDs to accessions (updated in place with speciesIdDict)
    speciesIdDict - maps species IDs to names
    speciesToUse - passed to FastaWriter
    nProcesses - number of parallel processes for the external commands
    qStopAfterSeqs - return once the sequence files are written
    qStopAfterAlignments - return once the alignments are done
    qDoSpeciesTree - also build the concatenated alignment / species tree

    Returns the results directories (all three when stopping after the
    sequences, otherwise the first two).
    """
    idDict.update(speciesIdDict) # same code will then also convert concatenated alignment for species tree
    # 0
    resultsDirsFullPath = [files.FileHandler.GetResultsSeqsDir(), files.FileHandler.GetResultsAlignDir(), files.FileHandler.GetResultsTreesDir()]

    # 1.
    fastaWriter = FastaWriter(files.FileHandler.GetSpeciesSeqsDir(), speciesToUse)
    self.WriteFastaFiles(fastaWriter, ogs, idDict, True)
    if qStopAfterSeqs:
        return resultsDirsFullPath

    # 3
    # Get OGs to use for species tree
    if qDoSpeciesTree:
        iOgsForSpeciesTree, fSingleCopy = DetermineOrthogroupsForSpeciesTree(ogMatrix)
        concatenated_algn_fn = files.FileHandler.GetSpeciesTreeConcatAlignFN()
    else:
        iOgsForSpeciesTree = []
    alignCommands_and_filenames = self.GetAlignmentCommandsAndNewFilenames(ogs)
    if qStopAfterAlignments:
        util.PrintUnderline("Inferring multiple sequence alignments")
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, alignCommands_and_filenames, False)
        if qDoSpeciesTree:
            CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename, concatenated_algn_fn, fSingleCopy)
            # write OGs used to file
            dSpeciesTree = os.path.split(files.FileHandler.GetSpeciesTreeResultsFN(0, True))[0] + "/"
            with open(dSpeciesTree + "Orthogroups_for_concatenated_alignment.txt", 'wb') as outfile:
                for iog in iOgsForSpeciesTree:
                    outfile.write("OG%07d\n" % iog)
        # ids -> accessions
        alignmentFilesToUse = [self.GetAlignmentFilename(i) for i, _ in enumerate(alignCommands_and_filenames)]
        accessionAlignmentFNs = [self.GetAlignmentFilename(i, True) for i in xrange(len(alignmentFilesToUse))]
        if qDoSpeciesTree:
            alignmentFilesToUse.append(concatenated_algn_fn)
            accessionAlignmentFNs.append(files.FileHandler.GetSpeciesTreeConcatAlignFN(True))
        self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)
        return resultsDirsFullPath[:2]

    # Otherwise, alignments and trees
    # Strategy is
    # 1. Do alignments (and trees) require for species tree
    # 2. Create concatenated alignment
    # 3. Create second list of commands [speciestree] + [remaining alignments and trees]
    alignmentFilesToUse = [self.GetAlignmentFilename(i) for i, _ in enumerate(alignCommands_and_filenames)]
    treeCommands_and_filenames = self.GetTreeCommands(alignmentFilesToUse, ogs)
    commands_and_filenames = []
    if qDoSpeciesTree:
        print("Species tree: Using %d orthogroups with minimum of %0.1f%% of species having single-copy genes in any orthogroup" % (len(iOgsForSpeciesTree), 100.*fSingleCopy))
        util.PrintUnderline("Inferring multiple sequence alignments for species tree")
        # Do required alignments and trees
        speciesTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        for i in iOgsForSpeciesTree:
            commands_and_filenames.append([alignCommands_and_filenames[i], treeCommands_and_filenames[i]])
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)
        CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename, concatenated_algn_fn, fSingleCopy)
        # write OGs used to file
        dSpeciesTree = os.path.split(files.FileHandler.GetSpeciesTreeResultsFN(0, True))[0] + "/"
        with open(dSpeciesTree + "Orthogroups_for_concatenated_alignment.txt", 'wb') as outfile:
            for iog in iOgsForSpeciesTree:
                outfile.write("OG%07d\n" % iog)
        # Add species tree to list of commands to run
        commands_and_filenames = [self.program_caller.GetTreeCommands(self.tree_program, [concatenated_algn_fn], [speciesTreeFN_ids], ["SpeciesTree"])]
        util.PrintUnderline("Inferring remaining multiple sequence alignments and gene trees")
    else:
        util.PrintUnderline("Inferring multiple sequence alignments and gene trees")

    # Now continue as before
    # Orthogroups already processed for the species tree are skipped
    iOgsForSpeciesTree = set(iOgsForSpeciesTree)
    for i in xrange(len(treeCommands_and_filenames)):
        if i in iOgsForSpeciesTree:
            continue
        commands_and_filenames.append([alignCommands_and_filenames[i], treeCommands_and_filenames[i]])
    for i in xrange(len(treeCommands_and_filenames), len(alignCommands_and_filenames)):
        if i in iOgsForSpeciesTree:
            continue
        commands_and_filenames.append([alignCommands_and_filenames[i]])
    pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)

    # Convert ids to accessions
    accessionAlignmentFNs = [self.GetAlignmentFilename(i, True) for i in xrange(len(alignmentFilesToUse))]
    # Add concatenated Alignment
    if qDoSpeciesTree:
        alignmentFilesToUse.append(concatenated_algn_fn)
        accessionAlignmentFNs.append(files.FileHandler.GetSpeciesTreeConcatAlignFN(True))
        qHaveSupport = util.HaveSupportValues(speciesTreeFN_ids)
        if os.path.exists(speciesTreeFN_ids):
            util.RenameTreeTaxa(speciesTreeFN_ids, files.FileHandler.GetSpeciesTreeUnrootedFN(True), idDict, qSupport=qHaveSupport, qFixNegatives=True)
        else:
            text = "ERROR: Species tree inference failed"
            files.FileHandler.LogFailAndExit(text)
    self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)
    # Support values are checked on the first gene tree found and that answer
    # is reused for all the others
    qHaveSupport = None
    for i in xrange(len(treeCommands_and_filenames)):
        infn = self.GetTreeFilename(i)
        if os.path.exists(infn):
            if qHaveSupport is None:
                qHaveSupport = util.HaveSupportValues(infn)
            util.RenameTreeTaxa(infn, self.GetTreeFilename(i, True), idDict, qSupport=qHaveSupport, qFixNegatives=True)
    return resultsDirsFullPath[:2]
def DoTrees(self, ogs, ogMatrix, idDict, speciesIdDict, nProcesses, qStopAfterSeqs, qStopAfterAlignments, qDoSpeciesTree):
    """
    Write orthogroup sequences, infer alignments and gene trees and, if
    requested, a species tree from a concatenated alignment.

    ogs - orthogroups, passed to WriteFastaFiles and the command builders
    ogMatrix - passed to DetermineOrthogroupsForSpeciesTree
    idDict - maps sequence IDs to accessions (updated in place with speciesIdDict)
    speciesIdDict - maps species IDs to names
    nProcesses - number of parallel processes for the external commands
    qStopAfterSeqs - return once the sequence files are written
    qStopAfterAlignments - return once the alignments are done
    qDoSpeciesTree - also build the concatenated alignment / species tree

    Returns the results directories (all those created when stopping after
    the sequences, otherwise the first two).
    """
    idDict.update(speciesIdDict) # same code will then also convert concatenated alignment for species tree
    # 0 - create the output directories (ID version first, then accession version)
    resultsDirsFullPath = []
    for fn in [self.GetFastaFilename, self.GetAlignmentFilename, self.GetTreeFilename]:
        for qIDs in [True, False]:
            d = os.path.split(fn(0, not qIDs))[0]
            if not os.path.exists(d):
                os.mkdir(d)
            if not qIDs:
                resultsDirsFullPath.append(d)
        if qStopAfterSeqs:
            break
        if qStopAfterAlignments and fn == self.GetAlignmentFilename:
            break

    # 1.
    fastaWriter = FastaWriter(self.ogsWorkingDir)
    self.WriteFastaFiles(fastaWriter, ogs, idDict)
    if qStopAfterSeqs:
        return resultsDirsFullPath

    # 3
    # Get OGs to use for species tree
    if qDoSpeciesTree:
        iOgsForSpeciesTree, fSingleCopy = DetermineOrthogroupsForSpeciesTree(ogMatrix)
        concatenated_algn_fn = os.path.split(self.GetAlignmentFilename(0))[0] + "/SpeciesTreeAlignment.fa"
    else:
        iOgsForSpeciesTree = []
    alignCommands_and_filenames = self.GetAlignmentCommandsAndNewFilenames(ogs)
    if qStopAfterAlignments:
        util.PrintUnderline("Inferring multiple sequence alignments")
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, alignCommands_and_filenames, False)
        # concatenated_algn_fn and fSingleCopy only exist when a species tree
        # was requested, so the concatenated alignment is guarded by the flag
        if qDoSpeciesTree:
            CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename, concatenated_algn_fn, fSingleCopy)
        # ids -> accessions
        alignmentFilesToUse = [self.GetAlignmentFilename(i) for i, _ in enumerate(alignCommands_and_filenames)]
        accessionAlignmentFNs = [self.GetAlignmentFilename(i, True) for i in xrange(len(alignmentFilesToUse))]
        if qDoSpeciesTree:
            alignmentFilesToUse.append(concatenated_algn_fn)
            accessionAlignmentFNs.append(os.path.split(self.GetAlignmentFilename(0, True))[0] + "/SpeciesTreeAlignment.fa")
        self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)
        return resultsDirsFullPath[:2]

    # Otherwise, alignments and trees
    # Strategy is
    # 1. Do alignments (and trees) require for species tree
    # 2. Create concatenated alignment
    # 3. Create second list of commands [speciestree] + [remaining alignments and trees]
    alignmentFilesToUse = [self.GetAlignmentFilename(i) for i, _ in enumerate(alignCommands_and_filenames)]
    treeCommands_and_filenames = self.GetTreeCommands(alignmentFilesToUse, ogs)
    commands_and_filenames = []
    if qDoSpeciesTree:
        print("Species tree: Using %d orthogroups with minimum of %0.1f%% of species having single-copy genes in any orthogroup" % (len(iOgsForSpeciesTree), 100.*fSingleCopy))
        util.PrintUnderline("Inferring multiple sequence alignments for species tree")
        # Do required alignments and trees
        # Only the directory part of the tree filename is used, so index 0 is
        # used explicitly instead of relying on a leaked loop variable
        speciesTreeFN_ids = os.path.split(self.GetTreeFilename(0))[0] + "/SpeciesTree_unrooted.txt"
        for i in iOgsForSpeciesTree:
            commands_and_filenames.append([alignCommands_and_filenames[i], treeCommands_and_filenames[i]])
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)
        CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename, concatenated_algn_fn, fSingleCopy)
        # Add species tree to list of commands to run
        commands_and_filenames = [self.program_caller.GetTreeCommands(self.tree_program, [concatenated_algn_fn], [speciesTreeFN_ids], ["SpeciesTree"])]
        util.PrintUnderline("Inferring remaining multiple sequence alignments and gene trees")
    else:
        util.PrintUnderline("Inferring multiple sequence alignments and gene trees")

    # Now continue as before
    # Orthogroups already processed for the species tree are skipped
    iOgsForSpeciesTree = set(iOgsForSpeciesTree)
    for i in xrange(len(treeCommands_and_filenames)):
        if i in iOgsForSpeciesTree:
            continue
        commands_and_filenames.append([alignCommands_and_filenames[i], treeCommands_and_filenames[i]])
    for i in xrange(len(treeCommands_and_filenames), len(alignCommands_and_filenames)):
        if i in iOgsForSpeciesTree:
            continue
        commands_and_filenames.append([alignCommands_and_filenames[i]])
    pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)

    # Convert ids to accessions
    accessionAlignmentFNs = [self.GetAlignmentFilename(i, True) for i in xrange(len(alignmentFilesToUse))]
    # Add concatenated Alignment
    if qDoSpeciesTree:
        alignmentFilesToUse.append(concatenated_algn_fn)
        accessionAlignmentFNs.append(os.path.split(self.GetAlignmentFilename(0, True))[0] + "/SpeciesTreeAlignment.fa")
    # Alignments are renamed whether or not a species tree was requested
    self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)
    # speciesTreeFN_ids is only defined when a species tree was requested
    if qDoSpeciesTree:
        if os.path.exists(speciesTreeFN_ids):
            util.RenameTreeTaxa(speciesTreeFN_ids, self.workingDir + "SpeciesTree_unrooted.txt", idDict, qFixNegatives=True)
        else:
            print("ERROR: Species tree inference failed")
            util.Fail()
    for i in xrange(len(treeCommands_and_filenames)):
        if os.path.exists(self.GetTreeFilename(i)):
            util.RenameTreeTaxa(self.GetTreeFilename(i), self.GetTreeFilename(i, True), idDict, qFixNegatives=True)
    return resultsDirsFullPath[:2]
def OrthologuesWorkflow(workingDir_ogs, orthofinderResultsDir, speciesToUse, nSpAll, clustersFilename_pairs, tree_options, msa_method, tree_method, nHighParallel, nLowParrallel, userSpeciesTree = None, qStopAfterSeqs = False, qStopAfterAlign = False, qStopAfterTrees = False, qMSA = False, qPhyldog = False, pickleDir=None):
    """
    Run the orthologue inference workflow and return a results message string.

    1. Setup:
        - ogSet, directories
        - DendroBLASTTrees - object
    2. DendroBLAST:
        - read scores
        - RunAnalysis: Get distance matrices, do trees
    3. Root species tree
    4. Reconciliation/Orthologues
    5. Clean up

    Variables:
    - ogSet - all the relevant information about the orthogroups, species etc.
    """
    ogSet = OrthoGroupsSet(workingDir_ogs, speciesToUse, nSpAll, clustersFilename_pairs, idExtractor = util.FirstWordExtractor, pickleDir=pickleDir)

    # Class that is going to run the analysis needs to check the dependencies
    # if not CanRunOrthologueDependencies(workingDir_ogs, qMSA, qStopAfterTrees, userSpeciesTree == None):
    #     print("Orthogroups have been inferred but the dependencies for inferring gene trees and")
    #     print("orthologues have not been met. Please review previous messages for more information.")
    #     sys.exit()
    resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_")

    # === 1 ===
    # ust = UserSpeciesTree
    # MSA:               Sequences  Alignments                    GeneTrees  db  SpeciesTree
    # Phyldog:           Sequences  Alignments                    GeneTrees  db  SpeciesTree
    # Dendroblast:                              DistanceMatrices  GeneTrees  db  SpeciesTree
    # MSA (ust):         Sequences  Alignments                    GeneTrees  db
    # Phyldog (ust):     Sequences  Alignments                    GeneTrees  db
    # Dendroblast (ust):                        DistanceMatrices  GeneTrees  db
    if not (qMSA or qPhyldog):
        util.PrintUnderline("Calculating gene distances")
        db = DendroBLASTTrees(ogSet, resultsDir, nLowParrallel)
        db.ReadAndPickle()
        nOGs, D, spTreeFN_ids, spTreeUnrootedFN = db.RunAnalysis()
    else:
        treeGen = msa.TreesForOrthogroups(tree_options, msa_method, tree_method, resultsDir, workingDir_ogs)
        seqs_alignments_dirs = treeGen.DoTrees(ogSet.OGs(qInclAll=True), ogSet.Spec_SeqDict(), nHighParallel, qStopAfterSeqs, qStopAfterAlign or qPhyldog)
        if qStopAfterSeqs:
            print("")
            return ("\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0])
        elif qStopAfterAlign:
            print("")
            message = "\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0]
            message += "\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1]
            return message
        db = DendroBLASTTrees(ogSet, resultsDir, nLowParrallel)
        if not userSpeciesTree:
            util.PrintUnderline("Inferring species tree (calculating gene distances)")
            print("Loading BLAST scores")
            db.ReadAndPickle()
            spTreeFN_ids, spTreeUnrootedFN = db.SpeciesTreeOnly()
        if qPhyldog:
            trees_from_phyldog.RunPhyldogAnalysis(resultsDir + "WorkingDirectory/phyldog/", ogSet.OGs(), speciesToUse)
            return "Running Phyldog" + "\n".join(seqs_alignments_dirs)

    # === 2 ===
    # Check can continue with analysis
    if len(ogSet.speciesToUse) < 4:
        print("ERROR: Not enough species to infer species tree")
        util.Fail()

    # === 3 ===
    # MSA:               RootSpeciesTree
    # Phyldog:           RootSpeciesTree
    # Dendroblast:       RootSpeciesTree
    # MSA (ust):         ConvertSpeciesTreeIDs
    # Phyldog (ust):     ConvertSpeciesTreeIDs
    # Dendroblast (ust): ConvertSpeciesTreeIDs
    if userSpeciesTree:
        util.PrintUnderline("Using user-supplied species tree")
        userSpeciesTree = ConvertUserSpeciesTree(db.workingDir + "Trees_ids/", userSpeciesTree, ogSet.SpeciesDict())
        rootedSpeciesTreeFN = [userSpeciesTree]
        roots = [None]
        qMultiple = False
    else:
        util.PrintUnderline("Best outgroup(s) for species tree")
        spDict = ogSet.SpeciesDict()
        roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(spTreeFN_ids, os.path.split(db.TreeFilename_IDs(0))[0] + "/", rfd.GeneToSpecies_dash, nHighParallel, treeFmt = 1)
        if len(roots) > 1:
            print("Observed %d duplications. %d support the best roots and %d contradict them." % (len(clusters), nSupport, len(clusters) - nSupport))
            print("Best outgroups for species tree:")
        else:
            print("Observed %d duplications. %d support the best root and %d contradict it." % (len(clusters), nSupport, len(clusters) - nSupport))
            print("Best outgroup for species tree:")
        for r in roots:
            print(" " + (", ".join([spDict[s] for s in r])) )
        qMultiple = len(roots) > 1

    if qStopAfterTrees:
        if userSpeciesTree:
            message = ""
            if qMSA:
                message += "\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0]
                message += "\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1]
            message += "\nGene trees:\n %s\n" % (resultsDir + "Gene_Trees/")
            return message
        # otherwise, root species tree
        resultsSpeciesTrees = []
        for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            if len(roots) == 1:
                rootedFN = resultsDir + "SpeciesTree_rooted.txt"
            else:
                rootedFN = resultsDir + "SpeciesTree_rooted_at_outgroup_%d.txt" % i
            resultsSpeciesTrees.append(rootedFN)
            util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)
        db.DeleteBlastMatrices()
        CleanWorkingDir(db.workingDir)
        return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None, False)

    if qMultiple:
        util.PrintUnderline("\nAnalysing each of the potential species tree roots", True)
    resultsSpeciesTrees = []
    for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        util.PrintUnderline("Reconciling gene trees and species tree" + (" (root %d)"%i if qMultiple else ""))
        if qMultiple:
            # One results directory per candidate root
            resultsDir_new = resultsDir + "Orthologues_using_outgroup_%d/" % i
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees_using_outgroup_%d/" % i
            resultsSpeciesTrees.append(resultsDir_new + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
            print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        else:
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
            # The outgroup is only reported when the tree was rooted here
            if not userSpeciesTree:
                print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        os.mkdir(resultsDir_new)
        util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)
        ReconciliationAndOrthologues(db.TreeFilename_IDs, db.ogSet, speciesTree_fn, db.workingDir, resultsDir_new, reconTreesRenamedDir, nHighParallel, i if qMultiple else None, pickleDir=pickleDir)

    db.DeleteBlastMatrices()
    CleanWorkingDir(db.workingDir)
    util.PrintUnderline("Writing results files", True)
    return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None)
arg = args.pop(0) if arg == "-t" or arg == "--threads": if len(args) == 0: print("Missing option for command line argument -t") util.Fail() arg = args.pop(0) try: nProcesses = int(arg) except: print("Incorrect argument for number of threads: %s" % arg) util.Fail() else: userDir = arg # Check arguments util.PrintUnderline("0. Getting Orthologues") if nProcesses == None: print( """\nNumber of parallel processes has not been specified, will use the default value. Number of parallel processes can be specified using the -t option.""") nProcesses = util.nThreadsDefault print("Using %d threads for alignments and trees" % nProcesses) orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = util.GetOGsFile( userDir) speciesToUse, nSpAll = util.GetSpeciesToUse(orthofinderWorkingDir + "SpeciesIDs.txt") resultsString = OrthologuesWorkflow(orthofinderWorkingDir, orthofinderResultsDir, speciesToUse, nSpAll, clustersFilename_pairs, nProcesses)
def DoTrees(self, ogs, ogMatrix, idDict, speciesIdDict, speciesToUse,
            qOutputCommands, nProcesses, qStopAfterSeqs,
            qStopAfterAlignments, qDoSpeciesTree):
    """Write orthogroup FASTA files, then align them and infer trees.

    The amount of work depends on the flags:

    * ``qStopAfterSeqs``: only the FASTA files are written.
    * ``qStopAfterAlignments``: alignments (and, if requested, the
      concatenated species-tree alignment) are produced, then taxa are
      renamed from ids to accessions.
    * otherwise: alignments and gene trees are produced and, when
      ``qDoSpeciesTree`` is set, the species tree is inferred from the
      concatenated alignment before the remaining gene trees.

    When ``qOutputCommands`` is true the alignment/tree steps are not run;
    job files are created through the ``Create*Job`` helpers instead and
    their names are printed so they can be executed by the user.

    Args:
        ogs: orthogroups, forwarded to the FASTA writer and command builders.
        ogMatrix: forwarded to ``DetermineOrthogroupsForSpeciesTree``.
        idDict: id -> name mapping used for renaming taxa. NOTE: it is
            updated in place with ``speciesIdDict``.
        speciesIdDict: species id mapping merged into ``idDict``.
        speciesToUse: forwarded to ``FastaWriter``.
        qOutputCommands: write job files instead of running commands.
        nProcesses: parallelism passed to the command runner.
        qStopAfterSeqs: return right after writing the FASTA files.
        qStopAfterAlignments: return after the alignment stage.
        qDoSpeciesTree: also build the concatenated alignment/species tree.

    Returns:
        The list ``[seqs dir, alignments dir, trees dir]`` when stopping
        after sequences, otherwise only its first two entries.
    """
    # The same renaming code then also converts the concatenated alignment
    # used for the species tree.
    idDict.update(speciesIdDict)

    # 0. Results directories reported back to the caller.
    resultsDirsFullPath = [
        files.FileHandler.GetResultsSeqsDir(),
        files.FileHandler.GetResultsAlignDir(),
        files.FileHandler.GetResultsTreesDir(),
    ]

    # 1. Sequences.
    fastaWriter = FastaWriter(files.FileHandler.GetSpeciesSeqsDir(), speciesToUse)
    self.WriteFastaFiles(fastaWriter, ogs, idDict, True)
    if qStopAfterSeqs:
        return resultsDirsFullPath

    job_files = []

    # 2. Orthogroups to use for the species tree.
    if qDoSpeciesTree:
        iOgsForSpeciesTree, fSingleCopy = DetermineOrthogroupsForSpeciesTree(ogMatrix)
        concatenated_algn_fn = files.FileHandler.GetSpeciesTreeConcatAlignFN()
    else:
        iOgsForSpeciesTree = []

    alignCommands_and_filenames = self.GetAlignmentCommandsAndNewFilenames(ogs)

    if qStopAfterAlignments:
        util.PrintUnderline("Inferring multiple sequence alignments")
        if qOutputCommands:
            job_files.append(CreateMsaJob(alignCommands_and_filenames, len(job_files)))
        else:
            pc.RunParallelCommandsAndMoveResultsFile(nProcesses, alignCommands_and_filenames, False)
        if qDoSpeciesTree:
            if qOutputCommands:
                job_files.append(
                    CreateConcatenatedAlignmentJob(iOgsForSpeciesTree, concatenated_algn_fn,
                                                   fSingleCopy, len(job_files)))
            else:
                CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename,
                                            concatenated_algn_fn, fSingleCopy)
        # ids -> accessions
        alignmentFilesToUse = [self.GetAlignmentFilename(i)
                               for i, _ in enumerate(alignCommands_and_filenames)]
        accessionAlignmentFNs = [self.GetAlignmentFilename(i, True)
                                 for i, _ in enumerate(alignmentFilesToUse)]
        if qDoSpeciesTree:
            alignmentFilesToUse.append(concatenated_algn_fn)
            accessionAlignmentFNs.append(files.FileHandler.GetSpeciesTreeConcatAlignFN(True))
        if qOutputCommands:
            # TODO: make rename alignment taxa command
            util.PrintUnderline("Execute the commands in " + ','.join(job_files))
        else:
            self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)
        return resultsDirsFullPath[:2]

    # Otherwise, alignments and trees. Strategy is:
    # 1. Do the alignments (and trees) required for the species tree
    # 2. Create the concatenated alignment
    # 3. Create a second list of commands:
    #    [species tree] + [remaining alignments and trees]
    alignmentFilesToUse = [self.GetAlignmentFilename(i)
                           for i, _ in enumerate(alignCommands_and_filenames)]
    treeCommands_and_filenames = self.GetTreeCommands(alignmentFilesToUse, ogs)
    nTrees = len(treeCommands_and_filenames)
    commands_and_filenames = []
    if qDoSpeciesTree:
        print("Species tree: Using %d orthogroups with minimum of %0.1f%% of species having single-copy genes in any orthogroup" % (len(iOgsForSpeciesTree), 100. * fSingleCopy))
        util.PrintUnderline("Inferring multiple sequence alignments for species tree")
        # Do required alignments and trees
        speciesTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        for i in iOgsForSpeciesTree:
            commands_and_filenames.append([alignCommands_and_filenames[i],
                                           treeCommands_and_filenames[i]])
        if qOutputCommands:
            job_files.append(CreateMsaJob(commands_and_filenames, len(job_files)))
            job_files.append(
                CreateConcatenatedAlignmentJob(iOgsForSpeciesTree, concatenated_algn_fn,
                                               fSingleCopy, len(job_files)))
        else:
            pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)
            CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename,
                                        concatenated_algn_fn, fSingleCopy)
        # Add species tree to list of commands to run
        commands_and_filenames = [
            self.program_caller.GetTreeCommands(self.tree_program, [concatenated_algn_fn],
                                                [speciesTreeFN_ids], ["SpeciesTree"])
        ]
        if qOutputCommands:
            # The species tree gets its own job file, so the remaining gene
            # tree commands start from an empty list.
            job_files.append(CreateSpeciesTreeJob(commands_and_filenames, len(job_files)))
            commands_and_filenames = []
        util.PrintUnderline("Inferring remaining multiple sequence alignments and gene trees")
    else:
        util.PrintUnderline("Inferring multiple sequence alignments and gene trees")

    # Now continue as before: everything not already handled for the species
    # tree. Orthogroups with a tree command get [alignment, tree]; any
    # alignment commands beyond the tree commands are run on their own.
    species_tree_ogs = set(iOgsForSpeciesTree)
    commands_and_filenames.extend(
        [alignCommands_and_filenames[i], treeCommands_and_filenames[i]]
        for i in range(nTrees) if i not in species_tree_ogs)
    commands_and_filenames.extend(
        [alignCommands_and_filenames[i]]
        for i in range(nTrees, len(alignCommands_and_filenames))
        if i not in species_tree_ogs)
    if qOutputCommands:
        job_files.append(CreateOGTreesJob(commands_and_filenames, len(job_files)))
    else:
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)

    # Convert ids to accessions
    accessionAlignmentFNs = [self.GetAlignmentFilename(i, True)
                             for i, _ in enumerate(alignmentFilesToUse)]
    # Add concatenated alignment (and rename the species tree)
    if qDoSpeciesTree:
        if qOutputCommands:
            job_files.append(
                CreateRenameTaxaJob(
                    [(concatenated_algn_fn, files.FileHandler.GetSpeciesTreeConcatAlignFN(True))],
                    [(speciesTreeFN_ids, files.FileHandler.GetSpeciesTreeUnrootedFN(True))],
                    len(job_files)))
        else:
            alignmentFilesToUse.append(concatenated_algn_fn)
            accessionAlignmentFNs.append(files.FileHandler.GetSpeciesTreeConcatAlignFN(True))
            if os.path.exists(speciesTreeFN_ids):
                # Only probe the tree for support values once we know the
                # file is there, so a missing tree takes the failure path
                # below rather than being handed to the probe.
                qHaveSupport = util.HaveSupportValues(speciesTreeFN_ids)
                util.RenameTreeTaxa(speciesTreeFN_ids,
                                    files.FileHandler.GetSpeciesTreeUnrootedFN(True),
                                    idDict, qSupport=qHaveSupport, qFixNegatives=True)
            else:
                text = "ERROR: Species tree inference failed"
                files.FileHandler.LogFailAndExit(text)

    if qOutputCommands:
        # list(...) so the pairs are a real sequence on Python 3 as well,
        # where zip() is a single-use iterator.
        job_files.append(
            CreateRenameTaxaJob(
                list(zip(alignmentFilesToUse, accessionAlignmentFNs)),
                [(self.GetTreeFilename(i), self.GetTreeFilename(i, True))
                 for i in range(nTrees)],
                len(job_files)))
        print("Run the commands contained in these files (each depends on the previous):\n" + "\n".join(job_files))
        files.FileHandler.LogWorkingDirectoryTrees()
    else:
        self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)
        # Support-value detection is done once, on the first tree found,
        # and reused for all remaining gene trees.
        qHaveSupport = None
        for i in range(nTrees):
            infn = self.GetTreeFilename(i)
            if not os.path.exists(infn):
                continue
            if qHaveSupport is None:
                qHaveSupport = util.HaveSupportValues(infn)
            util.RenameTreeTaxa(infn, self.GetTreeFilename(i, True), idDict,
                                qSupport=qHaveSupport, qFixNegatives=True)
    return resultsDirsFullPath[:2]
def OrthologuesWorkflow(speciesToUse, nSpAll,
                        tree_options,
                        msa_method,
                        tree_method,
                        recon_method,
                        nHighParallel,
                        nLowParrallel,
                        qDoubleBlast,
                        qAddSpeciesToIDs,
                        userSpeciesTree = None,
                        qStopAfterSeqs = False,
                        qStopAfterAlign = False,
                        qStopAfterTrees = False,
                        qMSA = False,
                        qPhyldog = False,
                        results_name = ""):
    """
    Drive the orthologue analysis from orthogroups to results files.

    Stages:
      1. Setup: build the OrthoGroupsSet and the results directory.
      2. Gene trees: either via MSA/phyldog (trees_msa.TreesForOrthogroups)
         or via DendroBLAST (DendroBLASTTrees.RunAnalysis); a species tree in
         id format is obtained on the way (user supplied, hard-coded for
         fewer than four species, or inferred).
      3. Root the species tree (phyldog labelled tree, user tree, two-taxon
         special case, or STRIDE).
      4. Reconciliation / orthologue inference using the first root.
      5. Clean up and build the results summary.

    The qStopAfterSeqs / qStopAfterAlign / qStopAfterTrees flags end the
    workflow early after the corresponding stage.

    Returns a string describing where the results were written: a hand-built
    summary for the early-stop cases, otherwise the value of
    GetResultsFilesString().
    """
    ogSet = OrthoGroupsSet(files.FileHandler.GetWorkingDirectory1_Read(), speciesToUse, nSpAll,
                           qAddSpeciesToIDs, idExtractor = util.FirstWordExtractor)

    qTreesFromMSA = qMSA or qPhyldog
    tree_generation_method = "msa" if qTreesFromMSA else "dendroblast"
    if qStopAfterSeqs:
        stop_after = "seqs"
    elif qStopAfterAlign:
        stop_after = "align"
    else:
        stop_after = ""
    files.FileHandler.MakeResultsDirectory2(tree_generation_method, stop_after, results_name)

    # === 1 ===  (ust = UserSpeciesTree)
    # MSA:               Sequences  Alignments                    GeneTrees  db  SpeciesTree
    # Phyldog:           Sequences  Alignments                    GeneTrees  db  SpeciesTree
    # Dendroblast:                              DistanceMatrices  GeneTrees  db  SpeciesTree
    # MSA (ust):         Sequences  Alignments                    GeneTrees  db
    # Phyldog (ust):     Sequences  Alignments                    GeneTrees  db
    # Dendroblast (ust):                        DistanceMatrices  GeneTrees  db
    qDB_SpeciesTree = False
    if userSpeciesTree:
        util.PrintUnderline("Using user-supplied species tree")
        spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        ConvertUserSpeciesTree(userSpeciesTree, ogSet.SpeciesDict(), spTreeFN_ids)

    if qTreesFromMSA:
        qLessThanFourSpecies = len(ogSet.seqsInfo.speciesToUse) < 4
        treeGen = trees_msa.TreesForOrthogroups(tree_options, msa_method, tree_method)
        if (not userSpeciesTree) and qLessThanFourSpecies:
            # Too few species to infer a tree: write the fixed topology.
            spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
            WriteSpeciesTreeIDs_TwoThree(ogSet.seqsInfo.speciesToUse, spTreeFN_ids)
            util.RenameTreeTaxa(spTreeFN_ids, files.FileHandler.GetSpeciesTreeUnrootedFN(True),
                                ogSet.SpeciesDict(), qSupport=False, qFixNegatives=True)
        qDoMSASpeciesTree = not (qLessThanFourSpecies or userSpeciesTree)
        util.PrintTime("Starting MSA/Trees")
        seqs_alignments_dirs = treeGen.DoTrees(ogSet.OGs(qInclAll=True),
                                               ogSet.OrthogroupMatrix(),
                                               ogSet.Spec_SeqDict(),
                                               ogSet.SpeciesDict(),
                                               ogSet.speciesToUse,
                                               nHighParallel,
                                               qStopAfterSeqs,
                                               qStopAfterAlign or qPhyldog,
                                               qDoSpeciesTree=qDoMSASpeciesTree)
        util.PrintTime("Done MSA/Trees")
        if qDoMSASpeciesTree:
            spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()

        # Early exits: sequences only, or sequences + alignments.
        if qStopAfterSeqs or qStopAfterAlign:
            print("")
            summary = "\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0]
            if qStopAfterSeqs:
                return summary
            summary += "\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1]
            return summary

        db = DendroBLASTTrees(ogSet, nLowParrallel, qDoubleBlast)
        if qDB_SpeciesTree and not userSpeciesTree and not qLessThanFourSpecies:
            util.PrintUnderline("Inferring species tree (calculating gene distances)")
            print("Loading BLAST scores")
            spTreeFN_ids = db.SpeciesTreeOnly()
        if qPhyldog:
            # (Disabled) species tree for phyldog:
            # util.PrintTime("Do species tree for phyldog")
            # spTreeFN_ids, spTreeUnrootedFN = db.SpeciesTreeOnly()
            if userSpeciesTree:
                userSpeciesTree = ConvertUserSpeciesTree(userSpeciesTree, ogSet.SpeciesDict(),
                                                         files.FileHandler.GetSpeciesTreeUnrootedFN())
            util.PrintTime("Starting phyldog")
            species_tree_ids_labelled_phyldog = wrapper_phyldog.RunPhyldogAnalysis(
                files.FileHandler.GetPhyldogWorkingDirectory(), ogSet.OGs(), speciesToUse, nHighParallel)
    else:
        db = DendroBLASTTrees(ogSet, nLowParrallel, qDoubleBlast)
        spTreeFN_ids, qSTAG = db.RunAnalysis()
    files.FileHandler.LogWorkingDirectoryTrees()

    # Support values are only reported for a STAG-inferred species tree.
    if userSpeciesTree or qMSA or qPhyldog:
        qSpeciesTreeSupports = False
    else:
        qSpeciesTreeSupports = qSTAG

    # SpeciesTree:
    # spTreeFN_ids, or equivalently FileHandler.GetSpeciesTreeUnrootedFN(), in
    # all cases (user, inferred etc). Thus, we always have the species tree in
    # ids format. With phyldog, we also have species_tree_ids_labelled_phyldog
    # - with the node labels given by phyldog.

    # === 2 ===
    # Check can continue with analysis (currently disabled):
    # if len(ogSet.speciesToUse) < 4:
    #     print("ERROR: Not enough species to infer species tree")
    #     util.Fail()

    # === 3 ===
    # MSA:               RootSpeciesTree
    # Phyldog:           RootSpeciesTree
    # Dendroblast:       RootSpeciesTree
    # MSA (ust):         ConvertSpeciesTreeIDs
    # Phyldog (ust):     ConvertSpeciesTreeIDs
    # Dendroblast (ust): ConvertSpeciesTreeIDs

    # Defaults shared by every case that does not run STRIDE.
    roots = [None]
    qMultiple = False
    all_stride_dup_genes = None
    if qPhyldog:
        rootedSpeciesTreeFN = [species_tree_ids_labelled_phyldog]
    elif userSpeciesTree:
        rootedSpeciesTreeFN = [spTreeFN_ids]
    elif len(ogSet.seqsInfo.speciesToUse) == 2:
        rootedSpeciesTreeFN = [GetSpeciesTreeRoot_TwoTaxa(ogSet.seqsInfo.speciesToUse)]
    else:
        util.PrintUnderline("Best outgroup(s) for species tree")
        util.PrintTime("Starting STRIDE")
        roots, clusters_counter, rootedSpeciesTreeFN, nSupport, _, _, all_stride_dup_genes = stride.GetRoot(
            spTreeFN_ids, files.FileHandler.GetOGsTreeDir(), stride.GeneToSpecies_dash,
            nHighParallel, qWriteRootedTree=True)
        util.PrintTime("Done STRIDE")
        nAll = sum(clusters_counter.values())
        nFP_mp = nAll - nSupport
        n_non_trivial = sum(count for cluster, count in clusters_counter.items() if len(cluster) > 1)
        duplication_counts = (n_non_trivial, n_non_trivial-nFP_mp, nFP_mp)
        qMultiple = len(roots) > 1
        if qMultiple:
            print("Observed %d well-supported, non-terminal duplications. %d support the best roots and %d contradict them." % duplication_counts)
            print("Best outgroups for species tree:")
        else:
            print("Observed %d well-supported, non-terminal duplications. %d support the best root and %d contradict it." % duplication_counts)
            print("Best outgroup for species tree:")
        spDict = ogSet.SpeciesDict()
        for outgroup in roots:
            print(" " + (", ".join([spDict[s] for s in outgroup])) )
    shutil.copy(rootedSpeciesTreeFN[0], files.FileHandler.GetSpeciesTreeIDsRootedFN())

    # SpeciesTree:
    # We now have a list of rooted species trees: rootedSpeciesTreeFN (this
    # should be recorded by the file handler).

    if qStopAfterTrees:
        if userSpeciesTree:
            pieces = []
            if qMSA:
                pieces.append("\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0])
                pieces.append("\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1])
            pieces.append("\nGene trees:\n %s\n" % (files.FileHandler.GetResultsTreesDir()))
            return "".join(pieces)
        # otherwise, root species tree
        resultsSpeciesTrees = []
        for iRoot, (_, rooted_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            resultsSpeciesTrees.append(files.FileHandler.GetSpeciesTreeResultsFN(iRoot, not qMultiple))
            util.RenameTreeTaxa(rooted_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(),
                                qSupport=qSpeciesTreeSupports, qFixNegatives=True)
            labeled_tree_fn = files.FileHandler.GetSpeciesTreeResultsNodeLabelsFN()
            util.RenameTreeTaxa(rooted_fn, labeled_tree_fn, db.ogSet.SpeciesDict(),
                                qSupport=False, qFixNegatives=True, label='N')
        files.FileHandler.CleanWorkingDir2()
        return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None, False)

    # === 4 === Reconciliation / orthologues, using the first root only.
    if qMultiple:
        util.PrintUnderline("\nMultiple potential species tree roots were identified, only one will be analyed.", True)
    first_root = roots[0]
    first_rooted_fn = rootedSpeciesTreeFN[0]
    util.PrintUnderline("Reconciling gene trees and species tree")
    resultsSpeciesTrees = [files.FileHandler.GetSpeciesTreeResultsFN(0, True)]
    # An outgroup only exists when STRIDE picked the root (spDict is set there).
    if not (userSpeciesTree or qPhyldog or len(ogSet.seqsInfo.speciesToUse) == 2):
        print("Outgroup: " + (", ".join([spDict[s] for s in first_root])))
    util.RenameTreeTaxa(first_rooted_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(),
                        qSupport=qSpeciesTreeSupports, qFixNegatives=True)
    util.PrintTime("Starting Recon and orthologues")
    ReconciliationAndOrthologues(recon_method, db.ogSet, nHighParallel,
                                 0 if qMultiple else None,
                                 all_stride_dup_genes=all_stride_dup_genes)
    util.PrintTime("Done Recon")

    if qMultiple:
        # Also write out every candidate rooting, labelled, for reference.
        for iRoot, (_, rooted_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            unanalysedSpeciesTree = files.FileHandler.GetSpeciesTreeResultsFN(iRoot, False)
            util.RenameTreeTaxa(rooted_fn, unanalysedSpeciesTree, db.ogSet.SpeciesDict(),
                                qSupport=qSpeciesTreeSupports, qFixNegatives=True, label='N')

    # SpeciesTree: If it's been inferred, there is now at least one rooted
    # results species tree: GetSpeciesTreeResultsFN()

    # === 5 === Clean up.
    files.FileHandler.CleanWorkingDir2()
    util.PrintUnderline("Writing results files", True)
    return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None)