def RunAnalysis(self, qSpeciesTree=True):
    """
    Compute gene distances, then infer the gene trees and an unrooted species tree.

    qSpeciesTree - if True, the species tree is also written with its taxa
                   renamed using self.ogSet.SpeciesDict()

    Returns (spTreeFN_ids, qSTAG):
    spTreeFN_ids - filename of the species tree labelled with IDs, or None
                   when qSpeciesTree is False
    qSTAG - True if the species tree was produced by stag.Run_ForOrthoFinder
    """
    util.PrintUnderline("Calculating gene distances")
    ogs, partialMatrices = self.GetOGMatrices_FullParallel()
    ogMatrices = self.CompleteAndWriteOGMatrices(ogs, partialMatrices)
    util.PrintTime("Done")
    treeCommands = self.PrepareGeneTreeCommand()

    if len(self.ogSet.seqsInfo.speciesToUse) >= 4:
        qSTAG = self.EnoughOGsForSTAG(ogs, self.ogSet.seqsInfo.speciesToUse)
        if not qSTAG:
            print("Using fallback species tree inference method")
            D, spPairs = self.SpeciesTreeDistances(ogs, ogMatrices)
            spTreeCommand, spTreeFN_ids = self.PrepareSpeciesTreeCommand(D, spPairs)
            # The species tree command list is placed ahead of the gene tree command lists
            treeCommands = [[spTreeCommand]] + treeCommands
    else:
        # Fewer than four species: the species tree is written directly
        qSTAG = False
        spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        WriteSpeciesTreeIDs_TwoThree(self.ogSet.seqsInfo.speciesToUse, spTreeFN_ids)

    util.PrintUnderline("Inferring gene and species trees")
    util.RunParallelOrderedCommandLists(self.nProcesses, treeCommands)

    if qSTAG:
        # Trees must have been completed
        print("")
        spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        stag.Run_ForOrthoFinder(files.FileHandler.GetOGsTreeDir(),
                                files.FileHandler.GetWorkingDirectory_Write(),
                                self.ogSet.seqsInfo.speciesToUse,
                                spTreeFN_ids)

    # Write a renamed copy of every gene tree
    idToName = self.ogSet.Spec_SeqDict()
    nTrees = len(self.ogSet.OGs())
    for iTree in xrange(nTrees):
        util.RenameTreeTaxa(files.FileHandler.GetOGsTreeFN(iTree),
                            files.FileHandler.GetOGsTreeFN(iTree, True),
                            idToName,
                            qSupport=False,
                            qFixNegatives=True)

    if not qSpeciesTree:
        return None, qSTAG
    util.RenameTreeTaxa(spTreeFN_ids,
                        files.FileHandler.GetSpeciesTreeUnrootedFN(True),
                        self.ogSet.SpeciesDict(),
                        qSupport=False,
                        qFixNegatives=True)
    return spTreeFN_ids, qSTAG
def RunAnalysis(self, qSpeciesTree=True):
    """
    Infer the gene trees and the distance-based species tree in one parallel run.

    qSpeciesTree - if True, also write the species tree with renamed taxa to
                   self.workingDir + "SpeciesTree_unrooted.txt"

    Returns (number of orthogroups, D, spTreeFN_ids, spTreeUnrootedFN); the
    last two entries are None when qSpeciesTree is False.
    """
    ogs, partialMatrices = self.GetOGMatrices()
    ogMatrices = self.CompleteAndWriteOGMatrices(ogs, partialMatrices)
    D, spPairs = self.SpeciesTreeDistances(ogs, ogMatrices)
    spTreeCommand, spTreeFN_ids = self.PrepareSpeciesTreeCommand(D, spPairs)
    geneTreeCommands = self.PrepareGeneTreeCommand()

    util.PrintUnderline("Inferring gene and species trees")
    allCommands = [[spTreeCommand]] + geneTreeCommands
    util.RunParallelOrderedCommandLists(self.nProcesses, allCommands, qHideStdout=True)

    # Write a renamed copy of every gene tree
    idToName = self.ogSet.Spec_SeqDict()
    nTrees = len(self.ogSet.OGs())
    for iTree in xrange(nTrees):
        util.RenameTreeTaxa(self.TreeFilename_IDs(iTree), self.treesPat % iTree, idToName, qFixNegatives=True)

    if not qSpeciesTree:
        return len(ogs), D, None, None
    spTreeUnrootedFN = self.workingDir + "SpeciesTree_unrooted.txt"
    util.RenameTreeTaxa(spTreeFN_ids, spTreeUnrootedFN, self.ogSet.SpeciesDict(), qFixNegatives=True)
    return len(ogs), D, spTreeFN_ids, spTreeUnrootedFN
def ReconciliationAndOrthologues(treesIDsPatFn, ogSet, speciesTree_fn, workingDir, resultsDir, reconTreesRenamedDir, nParallel, iSpeciesTree=None, pickleDir=None):
    """
    Reconcile the gene trees with DLCpar, write renamed copies of the
    reconciled trees and create the orthologue lists.

    treesIDsPatFn - passed unchanged to RunDlcpar (described by the original
                    author as a function returning name of filename)
    ogSet - info about the orthogroups, species etc
    speciesTree_fn - the species tree
    workingDir - Orthologues working dir
    resultsDir - where the Orthologues top level results directory will go (should exist already)
    reconTreesRenamedDir - where to put the reconciled trees that use the gene accessions
                           (created if it does not exist)
    nParallel - passed unchanged to RunDlcpar
    iSpeciesTree - which of the potential roots of the species tree is this
                   (only used in the progress heading)
    pickleDir - directory handed to rt.create_orthologue_lists; when None,
                workingDir + "matrices_orthologues/" is used and, if this
                function had to create it, removed again afterwards
    """
    dlcparResultsDir = RunDlcpar(treesIDsPatFn, ogSet, speciesTree_fn, workingDir, nParallel)
    if not os.path.exists(reconTreesRenamedDir):
        os.mkdir(reconTreesRenamedDir)

    # Build the ID -> accession dictionary once rather than once per orthogroup
    nOGs = len(ogSet.OGs())
    spec_seq_dict = ogSet.Spec_SeqDict()
    for iog in xrange(nOGs):
        util.RenameTreeTaxa(dlcparResultsDir + "OG%07d_tree_id.dlcpar.locus.tree" % iog,
                            reconTreesRenamedDir + "OG%07d_tree.txt" % iog,
                            spec_seq_dict,
                            qFixNegatives=False,
                            inFormat=8)

    # Orthologue lists
    util.PrintUnderline("Inferring orthologues from gene trees" + (" (root %d)" % iSpeciesTree if iSpeciesTree is not None else ""))
    qDelDir = False
    if pickleDir is None:
        pickleDir = workingDir + "matrices_orthologues/"
        if not os.path.exists(pickleDir):
            os.mkdir(pickleDir)
            # Only a directory created here is considered temporary
            qDelDir = True
    rt.create_orthologue_lists(ogSet, resultsDir, dlcparResultsDir, pickleDir)

    # If a temporary matrices directory was created, delete it now
    if qDelDir and os.path.exists(pickleDir):
        try:
            os.rmdir(pickleDir)
        except OSError:
            # Best effort: os.rmdir fails if the directory still has contents,
            # in which case it is deliberately left in place
            pass
def DoOrthologuesForOrthoFinder_Phyldog(ogSet, workingDirectory, GeneToSpecies, output_dir, reconTreesRenamedDir):
    """
    Write orthologue files and renamed reconciled trees from Phyldog results.

    ogSet - info about the orthogroups, species etc
    workingDirectory - not used by this function
    GeneToSpecies - passed unchanged to GetOrthologues_from_phyldog_tree
    output_dir - directory in which the per-species "Orthologues_<species>/"
                 directories are created
    reconTreesRenamedDir - where the renamed reconciled trees are written

    Returns the running total started from util.nOrtho_sp(nspecies) and
    incremented with the result of AppendOrthologuesToFiles for each orthogroup.
    """
    # Create directory structure
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    # Write directory and file structure
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    for index1 in xrange(nspecies):
        name1 = speciesDict[str(speciesIDs[index1])]
        d = output_dir + "Orthologues_" + name1 + "/"
        if not os.path.exists(d):
            os.mkdir(d)
        for index2 in xrange(nspecies):
            if index2 == index1:
                continue
            name2 = speciesDict[str(speciesIDs[index2])]
            # Each species-pair file starts with just a header row
            with open(d + '%s__v__%s.tsv' % (name1, name2), 'wb') as outfile:
                writer1 = csv.writer(outfile, delimiter="\t")
                writer1.writerow(("Orthogroup", name1, name2))

    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    # Loop invariants: the sequence ID dictionary was previously rebuilt twice
    # per orthogroup and the progress stride recomputed on every iteration
    spec_seq_dict = ogSet.Spec_SeqDict()
    progressStride = 10 if nOgs <= 200 else 100 if nOgs <= 2000 else 1000
    with open(files.FileHandler.GetDuplicationsFN(), 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            recon_tree = files.FileHandler.GetPhyldogOGResultsTreeFN(iog)
            orthologues = GetOrthologues_from_phyldog_tree(iog, recon_tree, GeneToSpecies,
                                                           dupsWriter=dupWriter,
                                                           seqIDs=spec_seq_dict,
                                                           spIDs=ogSet.SpeciesDict())
            allOrthologues = [(iog, orthologues)]
            util.RenameTreeTaxa(recon_tree, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, spec_seq_dict,
                                qSupport=False, qFixNegatives=True, label='n')
            if iog % progressStride == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse,
                                                            SequenceDict, output_dir, False)
    return nOrthologues_SpPair
def SpeciesTreeOnly(self):
    """
    Infer only the species tree from the orthogroup distance matrices.

    Returns (spTreeFN_ids, spTreeUnrootedFN): the filename of the species tree
    labelled with IDs and the filename of the copy with renamed taxa, written
    to self.workingDir + "SpeciesTree_unrooted.txt".
    """
    ogs, partialMatrices = self.GetOGMatrices()
    completedMatrices = self.CompleteOGMatrices(ogs, partialMatrices)
    D, spPairs = self.SpeciesTreeDistances(ogs, completedMatrices)
    spTreeCommand, spTreeFN_ids = self.PrepareSpeciesTreeCommand(D, spPairs, True)
    util.RunOrderedCommandList([spTreeCommand], True)

    # Write the copy of the tree that uses the names from SpeciesDict()
    spTreeUnrootedFN = self.workingDir + "SpeciesTree_unrooted.txt"
    speciesNames = self.ogSet.SpeciesDict()
    util.RenameTreeTaxa(spTreeFN_ids, spTreeUnrootedFN, speciesNames, qFixNegatives=True)
    return spTreeFN_ids, spTreeUnrootedFN
def ReconciliationAndOrthologues(recon_method, ogSet, nParallel, iSpeciesTree=None, all_stride_dup_genes=None):
    """
    Write the node-labelled species tree, infer orthologues with the requested
    reconciliation method and write the orthologue statistics.

    recon_method - selects the branch taken:
                   any string containing "dlcpar" runs DLCpar (the converged
                   search is used only for "dlcpar_convergedsearch");
                   "phyldog" uses the Phyldog results;
                   anything else uses the OrthoFinder method, with
                   "only_overlap" setting qNoRecon
    ogSet - info about the orthogroups, species etc
    nParallel - passed to trees2ologs_dlcpar.RunDlcpar (DLCpar branch only)
    iSpeciesTree - which of the potential roots of the species tree is this
                   (only used in the DLCpar progress heading)
    all_stride_dup_genes - passed to trees2ologs_of.DoOrthologuesForOrthoFinder
                           (OrthoFinder branch only)

    All input and output locations are obtained from files.FileHandler.
    """
    speciesTree_ids_fn = files.FileHandler.GetSpeciesTreeIDsRootedFN()
    labeled_tree_fn = files.FileHandler.GetSpeciesTreeResultsNodeLabelsFN()
    util.RenameTreeTaxa(speciesTree_ids_fn, labeled_tree_fn, ogSet.SpeciesDict(), qSupport=False, qFixNegatives=True, label='N')
    workingDir = files.FileHandler.GetWorkingDirectory_Write()    # workingDir - Orthologues working dir
    resultsDir_ologs = files.FileHandler.GetOrthologuesDirectory()
    reconTreesRenamedDir = files.FileHandler.GetOGsReconTreeDir(True)
    if "dlcpar" in recon_method:
        qDeepSearch = (recon_method == "dlcpar_convergedsearch")
        util.PrintTime("Starting DLCpar")
        dlcparResultsDir, dlcparLocusTreePat = trees2ologs_dlcpar.RunDlcpar(ogSet, speciesTree_ids_fn, workingDir, nParallel, qDeepSearch)
        util.PrintTime("Done DLCpar")
        spec_seq_dict = ogSet.Spec_SeqDict()
        for iog in xrange(len(ogSet.OGs())):
            util.RenameTreeTaxa(dlcparResultsDir + dlcparLocusTreePat % iog,
                                files.FileHandler.GetOGsReconTreeFN(iog),
                                spec_seq_dict,
                                qSupport=False, qFixNegatives=False, inFormat=8, label='n')

        # Orthologue lists
        util.PrintUnderline("Inferring orthologues from gene trees" + (" (root %d)" % iSpeciesTree if iSpeciesTree is not None else ""))
        pickleDir = files.FileHandler.GetPickleDir()
        nOrthologues_SpPair = trees2ologs_dlcpar.create_orthologue_lists(ogSet, resultsDir_ologs, dlcparResultsDir, pickleDir)
    elif "phyldog" == recon_method:
        util.PrintTime("Starting Orthologues from Phyldog")
        nOrthologues_SpPair = trees2ologs_of.DoOrthologuesForOrthoFinder_Phyldog(ogSet, workingDir, trees2ologs_of.GeneToSpecies_dash, resultsDir_ologs, reconTreesRenamedDir)
        util.PrintTime("Done Orthologues from Phyldog")
    else:
        util.PrintTime("Starting OF Orthologues")
        qNoRecon = ("only_overlap" == recon_method)
        nOrthologues_SpPair = trees2ologs_of.DoOrthologuesForOrthoFinder(ogSet, speciesTree_ids_fn, trees2ologs_of.GeneToSpecies_dash, all_stride_dup_genes, qNoRecon)
        util.PrintTime("Done OF Orthologues")
    # Whichever method ran, add the result of TwoAndThreeGeneOrthogroups before writing the statistics
    nOrthologues_SpPair += TwoAndThreeGeneOrthogroups(ogSet, resultsDir_ologs)
    WriteOrthologuesStats(ogSet, nOrthologues_SpPair)
def SpeciesTreeOnly(self):
    """
    Produce only the species tree and write a copy with renamed taxa.

    With fewer than four species the tree is written directly by
    WriteSpeciesTreeIDs_TwoThree; otherwise it is inferred from the orthogroup
    distance matrices.

    Returns the filename of the species tree labelled with IDs.
    """
    speciesToUse = self.ogSet.seqsInfo.speciesToUse
    if len(speciesToUse) >= 4:
        ogs, partialMatrices = self.GetOGMatrices_FullParallel()
        completedMatrices = self.CompleteOGMatrices(ogs, partialMatrices)
        D, spPairs = self.SpeciesTreeDistances(ogs, completedMatrices)
        spTreeCommand, spTreeFN_ids = self.PrepareSpeciesTreeCommand(D, spPairs, True)
        util.RunOrderedCommandList([spTreeCommand], True)
    else:
        spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        WriteSpeciesTreeIDs_TwoThree(self.ogSet.seqsInfo.speciesToUse, spTreeFN_ids)

    # Write the copy of the tree that uses the names from SpeciesDict()
    renamedTreeFN = files.FileHandler.GetSpeciesTreeUnrootedFN(True)
    util.RenameTreeTaxa(spTreeFN_ids,
                        renamedTreeFN,
                        self.ogSet.SpeciesDict(),
                        qSupport=False,
                        qFixNegatives=True)
    return spTreeFN_ids
def DoOrthologuesForOrthoFinder(ogSet, treesIDsPatFn, species_tree_rooted_fn, GeneToSpecies, workingDir, output_dir, reconTreesRenamedDir, all_stride_dup_genes):
    """
    Infer orthologues from each gene tree and write the results files.

    ogSet - info about the orthogroups, species etc
    treesIDsPatFn - called with the orthogroup index; its result is passed to
                    GetOrthologues_for_tree
    species_tree_rooted_fn - file loaded with tree_lib.Tree as the rooted species tree
    GeneToSpecies - passed unchanged to GetOrthologues_for_tree
    workingDir - not used by this function
    output_dir - directory receiving "Putative_Xenologues/" and the
                 per-species "Orthologues_<species>/" directories
    reconTreesRenamedDir - where the renamed reconciled trees are written;
                           "Duplications.csv" is written to its parent
    all_stride_dup_genes - passed unchanged to GetOrthologues_for_tree

    Returns the running total started from util.nOrtho_sp(nspecies) and
    incremented with the result of AppendOrthologuesToFiles for each orthogroup.
    """
    # Create directory structure
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    # Write directory and file structure
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    dSuspect = output_dir + "Putative_Xenologues/"
    if not os.path.exists(dSuspect):
        os.mkdir(dSuspect)
    for index1 in xrange(nspecies):
        name1 = speciesDict[str(speciesIDs[index1])]
        with open(dSuspect + '%s.csv' % name1, 'wb') as outfile:
            writer1 = csv.writer(outfile, delimiter="\t")
            writer1.writerow(("Orthogroup", name1, "Other"))
        d = output_dir + "Orthologues_" + name1 + "/"
        if not os.path.exists(d):
            os.mkdir(d)
        for index2 in xrange(nspecies):
            if index2 == index1:
                continue
            name2 = speciesDict[str(speciesIDs[index2])]
            # Each species-pair file starts with just a header row
            with open(d + '%s__v__%s.csv' % (name1, name2), 'wb') as outfile:
                writer1 = csv.writer(outfile, delimiter="\t")
                writer1.writerow(("Orthogroup", name1, name2))

    # Infer orthologues and write them to file
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    neighbours = GetSpeciesNeighbours(species_tree_rooted)
    # Label nodes of species tree
    species_tree_rooted.name = "N0"
    iNode = 1
    for n in species_tree_rooted.traverse():
        if (not n.is_leaf()) and (not n.is_root()):
            n.name = "N%d" % iNode
            iNode += 1

    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    species = list(speciesDict.keys())
    # Loop invariants: previously the sequence ID dictionary was rebuilt twice
    # per orthogroup and the progress stride recomputed on every iteration
    spec_seq_dict = ogSet.Spec_SeqDict()
    progressStride = 10 if nOgs <= 200 else 100 if nOgs <= 2000 else 1000
    with open(reconTreesRenamedDir + "../Duplications.csv", 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            orthologues, recon_tree, suspect_genes = GetOrthologues_for_tree(
                iog, treesIDsPatFn(iog), species_tree_rooted, GeneToSpecies, neighbours,
                dupsWriter=dupWriter, seqIDs=spec_seq_dict, spIDs=ogSet.SpeciesDict(),
                all_stride_dup_genes=all_stride_dup_genes)
            for index0 in xrange(nspecies):
                strsp0 = species[index0]
                strsp0_ = strsp0 + "_"
                these_genes = [g for g in suspect_genes if g.startswith(strsp0_)]
                if len(these_genes) > 0:
                    hgtFN = output_dir + "Orthologues_" + speciesDict[strsp0] + "/Putative_Horizontal_Gene_Transfer.txt"
                    # A separate name is used so the Duplications file handle is not shadowed
                    with open(hgtFN, 'ab') as hgtFile:
                        # Write every suspect gene of this species. The previous code wrote only
                        # SequenceDict[g], where g was whatever value the list comprehension
                        # above had leaked, i.e. a single gene of possibly another species.
                        hgtFile.write("\n".join([SequenceDict[g] for g in these_genes]) + "\n")
            allOrthologues = [(iog, orthologues)]
            util.RenameTreeTaxa(recon_tree, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, spec_seq_dict,
                                qSupport=False, qFixNegatives=True, label='n')
            if iog % progressStride == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse,
                                                            SequenceDict, output_dir, True)
    return nOrthologues_SpPair
def DoTrees(self, ogs, idDict, nProcesses, qStopAfterSeqs, qStopAfterAlignments):
    """
    Write the orthogroup sequence files, then infer alignments and gene trees.

    ogs - the orthogroups, passed to the fasta, alignment and tree helpers
    idDict - maps the IDs found in the files to the names written in the
             converted alignments and trees
    nProcesses - passed to pc.RunParallelCommandsAndMoveResultsFile
    qStopAfterSeqs - return once the sequence files have been written
    qStopAfterAlignments - return once the alignments have been run

    Returns the list of results directories collected while creating the
    directory structure (all collected directories when stopping after the
    sequences, otherwise the first two).
    """
    # 0. Create the directories for each file type, for both values of the filename flag
    resultsDirsFullPath = []
    for getFilename in (self.GetFastaFilename, self.GetAlignmentFilename, self.GetTreeFilename):
        firstDir = os.path.split(getFilename(0, False))[0]
        if not os.path.exists(firstDir):
            os.mkdir(firstDir)
        secondDir = os.path.split(getFilename(0, True))[0]
        if not os.path.exists(secondDir):
            os.mkdir(secondDir)
        # Only the directory obtained with the flag set to True is reported
        resultsDirsFullPath.append(secondDir)
        if qStopAfterSeqs:
            break
        if qStopAfterAlignments and getFilename == self.GetAlignmentFilename:
            break

    # 1. Sequences
    writer = FastaWriter(self.ogsWorkingDir)
    self.WriteFastaFiles(writer, ogs, idDict)
    if qStopAfterSeqs:
        return resultsDirsFullPath

    # 2. Heading
    heading = "Inferring multiple sequence alignments"
    if not qStopAfterAlignments:
        heading = "Inferring multiple sequence alignments and gene trees"
    util.PrintUnderline(heading)

    # 3. Alignments only
    alignCommands = self.GetAlignmentCommandsAndNewFilenames(ogs)
    if qStopAfterAlignments:
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, alignCommands, False)
        return resultsDirsFullPath[:2]

    # Otherwise, alignments and trees
    alignmentFilesToUse = [self.GetAlignmentFilename(k) for k in xrange(len(alignCommands))]
    treeCommands = self.GetTreeCommands(alignmentFilesToUse, ogs)
    nTrees = len(treeCommands)
    # Alignments that have a tree command are paired with it; the rest run alone
    jobs = [[alignCommands[k], treeCommands[k]] for k in xrange(nTrees)]
    jobs += [[alignCommands[k]] for k in xrange(nTrees, len(alignCommands))]
    pc.RunParallelCommandsAndMoveResultsFile(nProcesses, jobs, True)

    # Convert ids to accessions
    for iog, alignFN in enumerate(alignmentFilesToUse):
        with open(alignFN, 'rb') as infile, open(self.GetAlignmentFilename(iog, True), 'wb') as outfile:
            for line in infile:
                if not line.startswith(">"):
                    outfile.write(line)
                    continue
                outfile.write(">" + idDict[line[1:].rstrip()] + "\n")
        treeFN = self.GetTreeFilename(iog)
        if os.path.exists(treeFN):
            util.RenameTreeTaxa(self.GetTreeFilename(iog), self.GetTreeFilename(iog, True), idDict, qFixNegatives=True)
    return resultsDirsFullPath[:2]
def RunDlcpar(ogSet, speciesTreeFN, workingDir, nParallel, qDeepSearch):
    """
    Run dlcpar_search on every orthogroup tree.

    Implementation:
    - (skip: label species tree)
    - sort out trees (root them arbitrarily in the dlcpar directory)
    - write renamed copies of the gene trees and the gene-species map
    - run one dlcpar_search command per orthogroup

    qDeepSearch - if True, use iteration and convergence limits that grow with
                  the number of genes in the orthogroup instead of "-x 1"

    Returns (dlcparResultsDir, filename pattern of the locus trees).
    """
    ogs = ogSet.OGs()
    nOGs = len(ogs)
    dlcparResultsDir = workingDir + 'dlcpar/'
    if not os.path.exists(dlcparResultsDir):
        os.mkdir(dlcparResultsDir)
    RootGeneTreesArbitrarily(nOGs, dlcparResultsDir)

    idToName = ogSet.Spec_SeqDict()
    for iTree in xrange(len(ogs)):
        util.RenameTreeTaxa(files.FileHandler.GetOGsTreeFN(iTree),
                            files.FileHandler.GetOGsTreeFN(iTree, True),
                            idToName,
                            qSupport=False,
                            qFixNegatives=True,
                            qViaCopy=False)

    geneMapFN = WriteGeneSpeciesMap(dlcparResultsDir, ogSet.SpeciesDict())
    # The trees handed to dlcpar are the files of the same name in the dlcpar directory
    filenames = []
    for iTree in xrange(nOGs):
        treeName = os.path.split(files.FileHandler.GetOGsTreeFN(iTree))[1]
        filenames.append(dlcparResultsDir + treeName)

    dlcCommands = []
    if qDeepSearch:
        deepTemplate = 'dlcpar_search -s %s -S %s -D 1 -C 0.125 %s -I .txt -i %d --nprescreen 100 --nconverge %d'
        nTaxa = [len(og) for og in ogs[:nOGs]]
        for treeFN, n in zip(filenames, nTaxa):
            # Search effort is tiered by orthogroup size
            if n < 25:
                nIter, nNoImprov = 1000, 100
            elif n < 200:
                nIter, nNoImprov = 25000, 1000
            else:
                nIter, nNoImprov = 50000, 2000
            dlcCommands.append(deepTemplate % (speciesTreeFN, geneMapFN, treeFN, nIter, nNoImprov))
    else:
        quickTemplate = 'dlcpar_search -s %s -S %s -D 1 -C 0.125 %s -I .txt -x 1'
        for treeFN in filenames:
            dlcCommands.append(quickTemplate % (speciesTreeFN, geneMapFN, treeFN))

    util.RunParallelOrderedCommandLists(nParallel, [[command] for command in dlcCommands])
    return dlcparResultsDir, "OG%07d_tree_id.dlcpar.locus.tree"
def RunAnalysis(self):
    """
    Infer the gene trees and the species tree in one parallel run and write a
    renamed copy of every gene tree.

    The species tree itself is not renamed here.

    Returns (number of orthogroups, D, spPairs, spTreeFN_ids).
    """
    ogs, partialMatrices = self.GetOGMatrices()
    ogMatrices = self.WriteOGMatrices(ogs, partialMatrices)
    D, spPairs = self.SpeciesTreeDistances(ogs, ogMatrices)
    spTreeCommand, spTreeFN_ids = self.PrepareSpeciesTreeCommand(D, spPairs)
    geneTreeCommands = self.PrepareGeneTreeCommand()

    print("\n3. Inferring gene and species trees")
    print("-----------------------------------")
    allCommands = [[spTreeCommand]] + geneTreeCommands
    util.RunParallelOrderedCommandLists(self.nProcesses, allCommands, qHideStdout=True)

    idToName = self.ogSet.Spec_SeqDict()
    nTrees = len(self.ogSet.OGs())
    for iTree in xrange(nTrees):
        util.RenameTreeTaxa(self.treesPatIDs % iTree, self.treesPat % iTree, idToName, qFixNegatives=True)
    return len(ogs), D, spPairs, spTreeFN_ids
def DoOrthologuesForOrthoFinder(ogSet, treesIDsPatFn, species_tree_rooted_fn, GeneToSpecies, workingDir, output_dir, reconTreesRenamedDir, all_stride_dup_genes):
    """
    Infer orthologues from each gene tree and write the results files.

    ogSet - info about the orthogroups, species etc
    treesIDsPatFn - called with the orthogroup index; its result is passed to
                    GetOrthologues_for_tree
    species_tree_rooted_fn - file loaded with tree_lib.Tree as the rooted species tree
    GeneToSpecies - passed unchanged to GetOrthologues_for_tree
    workingDir - not used by this function
    output_dir - directory receiving the per-species "Orthologues_<species>/" directories
    reconTreesRenamedDir - where the renamed reconciled trees are written;
                           "Duplications.csv" is written to its parent
    all_stride_dup_genes - passed unchanged to GetOrthologues_for_tree

    Returns util.nOrtho_sp(nspecies) incremented with the result of
    AppendOrthologuesToFiles.
    """
    # Create directory structure
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    # Write directory and file structure
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    for index1 in xrange(nspecies):
        name1 = speciesDict[str(speciesIDs[index1])]
        d = output_dir + "Orthologues_" + name1 + "/"
        if not os.path.exists(d):
            os.mkdir(d)
        for index2 in xrange(nspecies):
            if index2 == index1:
                continue
            name2 = speciesDict[str(speciesIDs[index2])]
            # Each species-pair file starts with just a header row
            with open(d + '%s__v__%s.csv' % (name1, name2), 'wb') as outfile:
                writer1 = csv.writer(outfile, delimiter="\t")
                writer1.writerow(("Orthogroup", name1, name2))

    # Infer orthologues and write them to file
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    # Label nodes of species tree
    species_tree_rooted.name = "N0"
    iNode = 1
    for n in species_tree_rooted.traverse():
        if (not n.is_leaf()) and (not n.is_root()):
            n.name = "N%d" % iNode
            iNode += 1

    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    allOrthologues = []
    # Loop invariants: previously the sequence ID dictionary was rebuilt twice
    # per orthogroup and the progress stride recomputed on every iteration
    spec_seq_dict = ogSet.Spec_SeqDict()
    progressStride = 10 if nOgs <= 200 else 100 if nOgs <= 2000 else 1000
    with open(reconTreesRenamedDir + "../Duplications.csv", 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            orthologues, recon_tree = GetOrthologues_for_tree(
                iog, treesIDsPatFn(iog), species_tree_rooted, GeneToSpecies,
                dupsWriter=dupWriter, seqIDs=spec_seq_dict, spIDs=ogSet.SpeciesDict(),
                all_stride_dup_genes=all_stride_dup_genes)
            allOrthologues.append((iog, orthologues))
            util.RenameTreeTaxa(recon_tree, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, spec_seq_dict,
                                qFixNegatives=True, label='n')
            if iog % progressStride == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
    # The orthologues are accumulated across orthogroups, so they are written
    # once after the loop rather than once per orthogroup
    nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict, output_dir)
    return nOrthologues_SpPair
def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, speciesToUse, nSpAll, clustersFilename_pairs, nProcesses):
    """
    Run the orthologue inference pipeline: gene and species trees, rooting of
    the species tree, reconciliation with DLCpar and orthologue lists.

    orthofinderWorkingDir - passed to OrthoGroupsSet and CanRunDependencies
    orthofinderResultsDir - a new "Orthologues_" working directory is created under it
    speciesToUse, nSpAll, clustersFilename_pairs - passed to OrthoGroupsSet
    nProcesses - passed to DendroBLASTTrees and rfd.GetRoot

    When rfd.GetRoot returns more than one root, reconciliation and
    orthologue inference are repeated for each of them.

    Returns the result of GetResultsFilesString for the renamed rooted
    species tree(s). Fails via util.Fail() with fewer than four species and
    exits via sys.exit() if CanRunDependencies reports a problem.
    """
    ogSet = OrthoGroupsSet(orthofinderWorkingDir, speciesToUse, nSpAll, clustersFilename_pairs, idExtractor=util.FirstWordExtractor)
    if len(ogSet.speciesToUse) < 4:
        print("ERROR: Not enough species to infer species tree")
        util.Fail()

    print("\n1. Checking required programs are installed")
    print("-------------------------------------------")
    if not CanRunDependencies(orthofinderWorkingDir):
        print("Orthogroups have been inferred but the dependencies for inferring gene trees and\northologues have not been met. Please review previous messages for more information.")
        sys.exit()

    print("\n2. Calculating gene distances")
    print("-----------------------------")
    resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_")
    db = DendroBLASTTrees(ogSet, resultsDir, nProcesses)
    db.ReadAndPickle()
    nOGs, D, spPairs, spTreeFN_ids = db.RunAnalysis()

    print("\n4. Best outgroup(s) for species tree")
    print("------------------------------------")
    spDict = ogSet.SpeciesDict()
    roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(
        spTreeFN_ids, os.path.split(db.treesPatIDs)[0] + "/", rfd.GeneToSpecies_dash, nProcesses, treeFmt=1)
    qMultiple = len(roots) > 1
    duplicationCounts = (len(clusters), nSupport, len(clusters) - nSupport)
    if qMultiple:
        print("Observed %d duplications. %d support the best roots and %d contradict them." % duplicationCounts)
        print("Best outgroups for species tree:")
    else:
        print("Observed %d duplications. %d support the best root and %d contradict it." % duplicationCounts)
        print("Best outgroup for species tree:")
    for r in roots:
        print(" " + (", ".join([spDict[s] for s in r])))

    if qMultiple:
        print("\nAnalysing each of the potential species tree roots.")
    resultsSpeciesTrees = []
    for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        if qMultiple:
            resultsDir_new = resultsDir + "Orthologues_using_outgroup_%d/" % i
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees_using_outgroup_%d/" % i
            resultsSpeciesTrees.append(resultsDir_new + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
        else:
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
        os.mkdir(resultsDir_new)
        util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)

        # Step headings carry the root index only when several roots are analysed
        stepSuffix = "-%d" % i if qMultiple else ""
        underlineSuffix = "--" if qMultiple else ""
        print("\n5%s. Reconciling gene and species trees" % stepSuffix)
        print("-------------------------------------" + underlineSuffix)
        print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        dlcparResultsDir = RunDlcpar(db.treesPatIDs, ogSet, nOGs, speciesTree_fn, db.workingDir)
        os.mkdir(reconTreesRenamedDir)
        # Build the ID -> accession dictionary once per root rather than once per orthogroup
        spec_seq_dict = db.ogSet.Spec_SeqDict()
        for iog in xrange(len(db.ogSet.OGs())):
            util.RenameTreeTaxa(dlcparResultsDir + "OG%07d_tree_id.locus.tree" % iog,
                                reconTreesRenamedDir + "OG%07d_tree.txt" % iog,
                                spec_seq_dict,
                                qFixNegatives=False,
                                inFormat=8)

        # Orthologue lists
        print("\n6%s. Inferring orthologues from gene trees" % stepSuffix)
        print("----------------------------------------" + underlineSuffix)
        pt.get_orthologue_lists(ogSet, resultsDir_new, dlcparResultsDir, db.workingDir)

    CleanWorkingDir(db)
    print("\n7. Writing results files")
    print("------------------------")
    return GetResultsFilesString(resultsSpeciesTrees)
def DoTrees(self, ogs, ogMatrix, idDict, speciesIdDict, speciesToUse, nProcesses, qStopAfterSeqs, qStopAfterAlignments, qDoSpeciesTree):
    """
    Write the orthogroup sequence files, then infer alignments, gene trees
    and (optionally) a species tree from a concatenated alignment.

    ogs - the orthogroups, passed to the fasta, alignment and tree helpers
    ogMatrix - passed to DetermineOrthogroupsForSpeciesTree
    idDict - maps IDs to the names written in converted alignments and trees;
             NOTE: it is updated in place with speciesIdDict
    speciesIdDict - species ID mapping merged into idDict
    speciesToUse - passed to FastaWriter
    nProcesses - passed to pc.RunParallelCommandsAndMoveResultsFile
    qStopAfterSeqs - return once the sequence files have been written
    qStopAfterAlignments - return once the alignments have been run
    qDoSpeciesTree - also build the concatenated alignment and species tree

    Returns the list of results directories (sequences, alignments, trees)
    when stopping after the sequences, otherwise the first two of them.
    """
    def _WriteOGsForSpeciesTree(iOgs):
        # write OGs used to file
        dSpeciesTree = os.path.split(files.FileHandler.GetSpeciesTreeResultsFN(0, True))[0] + "/"
        with open(dSpeciesTree + "Orthogroups_for_concatenated_alignment.txt", 'wb') as outfile:
            for iog in iOgs:
                outfile.write("OG%07d\n" % iog)

    idDict.update(speciesIdDict)    # same code will then also convert concatenated alignment for species tree
    # 0
    resultsDirsFullPath = [files.FileHandler.GetResultsSeqsDir(),
                           files.FileHandler.GetResultsAlignDir(),
                           files.FileHandler.GetResultsTreesDir()]

    # 1.
    fastaWriter = FastaWriter(files.FileHandler.GetSpeciesSeqsDir(), speciesToUse)
    self.WriteFastaFiles(fastaWriter, ogs, idDict, True)
    if qStopAfterSeqs:
        return resultsDirsFullPath

    # 3
    # Get OGs to use for species tree
    if qDoSpeciesTree:
        iOgsForSpeciesTree, fSingleCopy = DetermineOrthogroupsForSpeciesTree(ogMatrix)
        concatenated_algn_fn = files.FileHandler.GetSpeciesTreeConcatAlignFN()
    else:
        iOgsForSpeciesTree = []
    alignCommands_and_filenames = self.GetAlignmentCommandsAndNewFilenames(ogs)
    if qStopAfterAlignments:
        util.PrintUnderline("Inferring multiple sequence alignments")
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, alignCommands_and_filenames, False)
        if qDoSpeciesTree:
            CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename, concatenated_algn_fn, fSingleCopy)
            _WriteOGsForSpeciesTree(iOgsForSpeciesTree)
        # ids -> accessions
        alignmentFilesToUse = [self.GetAlignmentFilename(i) for i, _ in enumerate(alignCommands_and_filenames)]
        accessionAlignmentFNs = [self.GetAlignmentFilename(i, True) for i in xrange(len(alignmentFilesToUse))]
        if qDoSpeciesTree:
            alignmentFilesToUse.append(concatenated_algn_fn)
            accessionAlignmentFNs.append(files.FileHandler.GetSpeciesTreeConcatAlignFN(True))
        self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)
        return resultsDirsFullPath[:2]

    # Otherwise, alignments and trees
    # Strategy is
    # 1. Do alignments (and trees) required for species tree
    # 2. Create concatenated alignment
    # 3. Create second list of commands [speciestree] + [remaining alignments and trees]
    alignmentFilesToUse = [self.GetAlignmentFilename(i) for i, _ in enumerate(alignCommands_and_filenames)]
    treeCommands_and_filenames = self.GetTreeCommands(alignmentFilesToUse, ogs)
    commands_and_filenames = []
    if qDoSpeciesTree:
        print("Species tree: Using %d orthogroups with minimum of %0.1f%% of species having single-copy genes in any orthogroup" % (len(iOgsForSpeciesTree), 100.*fSingleCopy))
        util.PrintUnderline("Inferring multiple sequence alignments for species tree")
        # Do required alignments and trees
        speciesTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        for i in iOgsForSpeciesTree:
            commands_and_filenames.append([alignCommands_and_filenames[i], treeCommands_and_filenames[i]])
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)
        CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename, concatenated_algn_fn, fSingleCopy)
        _WriteOGsForSpeciesTree(iOgsForSpeciesTree)
        # Add species tree to list of commands to run
        commands_and_filenames = [self.program_caller.GetTreeCommands(self.tree_program, [concatenated_algn_fn], [speciesTreeFN_ids], ["SpeciesTree"])]
        util.PrintUnderline("Inferring remaining multiple sequence alignments and gene trees")
    else:
        util.PrintUnderline("Inferring multiple sequence alignments and gene trees")

    # Now continue as before, skipping the orthogroups already done for the species tree
    iOgsForSpeciesTree = set(iOgsForSpeciesTree)
    for i in xrange(len(treeCommands_and_filenames)):
        if i in iOgsForSpeciesTree:
            continue
        commands_and_filenames.append([alignCommands_and_filenames[i], treeCommands_and_filenames[i]])
    for i in xrange(len(treeCommands_and_filenames), len(alignCommands_and_filenames)):
        if i in iOgsForSpeciesTree:
            continue
        commands_and_filenames.append([alignCommands_and_filenames[i]])
    pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)

    # Convert ids to accessions
    accessionAlignmentFNs = [self.GetAlignmentFilename(i, True) for i in xrange(len(alignmentFilesToUse))]
    # Add concatenated Alignment
    if qDoSpeciesTree:
        alignmentFilesToUse.append(concatenated_algn_fn)
        accessionAlignmentFNs.append(files.FileHandler.GetSpeciesTreeConcatAlignFN(True))
        # Check that the tree exists before inspecting it, so a failed species
        # tree inference reaches the intended error message
        # NOTE(review): how util.HaveSupportValues treats a missing file is not visible here
        if os.path.exists(speciesTreeFN_ids):
            qHaveSupport = util.HaveSupportValues(speciesTreeFN_ids)
            util.RenameTreeTaxa(speciesTreeFN_ids, files.FileHandler.GetSpeciesTreeUnrootedFN(True), idDict, qSupport=qHaveSupport, qFixNegatives=True)
        else:
            text = "ERROR: Species tree inference failed"
            files.FileHandler.LogFailAndExit(text)
    self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)

    # Support values are probed on the first gene tree found and assumed for the rest
    qHaveSupport = None
    for i in xrange(len(treeCommands_and_filenames)):
        infn = self.GetTreeFilename(i)
        if os.path.exists(infn):
            if qHaveSupport is None:
                qHaveSupport = util.HaveSupportValues(infn)
            util.RenameTreeTaxa(infn, self.GetTreeFilename(i, True), idDict, qSupport=qHaveSupport, qFixNegatives=True)
    return resultsDirsFullPath[:2]
def DoOrthologuesForOrthoFinder(ogSet, species_tree_rooted_fn, GeneToSpecies, all_stride_dup_genes, qNoRecon):
    """
    Root each gene tree, infer orthologues from it and write the results files.

    ogSet - info about the orthogroups, species etc
    species_tree_rooted_fn - file loaded with tree_lib.Tree as the rooted species tree
    GeneToSpecies - passed unchanged to CheckAndRootTree and GetOrthologues_from_tree
    all_stride_dup_genes - passed unchanged to GetOrthologues_from_tree
    qNoRecon - passed unchanged to GetOrthologues_from_tree

    All output locations are obtained from files.FileHandler. Orthogroups for
    which CheckAndRootTree returns no tree are skipped.

    Returns the running total started from util.nOrtho_sp(nspecies) and
    incremented with the result of AppendOrthologuesToFiles for each
    processed orthogroup.
    """
    # Create directory structure
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    # Write directory and file structure
    qInitialisedSuspectGenesDirs = False
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    dResultsOrthologues = files.FileHandler.GetOrthologuesDirectory()
    for iFirst in xrange(nspecies):
        firstName = speciesDict[str(speciesIDs[iFirst])]
        speciesDir = dResultsOrthologues + "Orthologues_" + firstName + "/"
        if not os.path.exists(speciesDir):
            os.mkdir(speciesDir)
        for iSecond in xrange(nspecies):
            if iSecond == iFirst:
                continue
            secondName = speciesDict[str(speciesIDs[iSecond])]
            # Each species-pair file starts with just a header row
            with open(speciesDir + '%s__v__%s.tsv' % (firstName, secondName), 'wb') as pairFile:
                pairWriter = csv.writer(pairFile, delimiter="\t")
                pairWriter.writerow(("Orthogroup", firstName, secondName))

    # Infer orthologues and write them to file
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    neighbours = GetSpeciesNeighbours(species_tree_rooted)
    # Label nodes of species tree
    species_tree_rooted.name = "N0"
    nextLabel = 1
    for node in species_tree_rooted.traverse():
        if node.is_leaf() or node.is_root():
            continue
        node.name = "N%d" % nextLabel
        nextLabel += 1

    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    species = speciesDict.keys()
    reconTreesRenamedDir = files.FileHandler.GetOGsReconTreeDir(True)
    spec_seq_dict = ogSet.Spec_SeqDict()
    progressStride = 10 if nOgs <= 200 else 100 if nOgs <= 2000 else 1000
    with open(files.FileHandler.GetDuplicationsFN(), 'wb') as dupFile:
        dupWriter = csv.writer(dupFile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            # this can be parallelised easily
            rooted_tree_ids, qHaveSupport = CheckAndRootTree(files.FileHandler.GetOGsTreeFN(iog), species_tree_rooted, GeneToSpecies)
            if rooted_tree_ids is None:
                continue
            # Write rooted tree with accessions
            util.RenameTreeTaxa(rooted_tree_ids,
                                files.FileHandler.GetOGsTreeFN(iog, True),
                                spec_seq_dict,
                                qSupport=qHaveSupport,
                                qFixNegatives=True,
                                qViaCopy=True)
            orthologues, recon_tree, suspect_genes = GetOrthologues_from_tree(
                iog, rooted_tree_ids, species_tree_rooted, GeneToSpecies, neighbours,
                dupsWriter=dupWriter,
                seqIDs=spec_seq_dict,
                spIDs=ogSet.SpeciesDict(),
                all_stride_dup_genes=all_stride_dup_genes,
                qNoRecon=qNoRecon)
            qContainsSuspectGenes = len(suspect_genes) > 0

            # The suspect-gene outputs are only set up once the first suspect gene is seen
            if qContainsSuspectGenes and not qInitialisedSuspectGenesDirs:
                qInitialisedSuspectGenesDirs = True
                dSuspectGenes = files.FileHandler.GetSuspectGenesDir()
                dSuspectOrthologues = files.FileHandler.GetPutativeXenelogsDir()
                for iFirst in xrange(nspecies):
                    with open(dSuspectOrthologues + '%s.tsv' % speciesDict[str(speciesIDs[iFirst])], 'wb') as xenoFile:
                        xenoWriter = csv.writer(xenoFile, delimiter="\t")
                        xenoWriter.writerow(("Orthogroup", speciesDict[str(speciesIDs[iFirst])], "Other"))

            # Append this orthogroup's suspect genes to the file of the species they belong to
            for iSp in xrange(nspecies):
                spKey = species[iSp]
                genePrefix = spKey + "_"
                these_genes = [g for g in suspect_genes if g.startswith(genePrefix)]
                if len(these_genes) > 0:
                    with open(dSuspectGenes + speciesDict[spKey] + ".txt", 'ab') as suspectFile:
                        suspectFile.write("\n".join([SequenceDict[g] for g in these_genes]) + "\n")

            allOrthologues = [(iog, orthologues)]
            # don't relabel nodes, they've already been done
            util.RenameTreeTaxa(recon_tree,
                                reconTreesRenamedDir + "OG%07d_tree.txt" % iog,
                                spec_seq_dict,
                                qSupport=False,
                                qFixNegatives=True)
            if iog % progressStride == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse,
                                                            SequenceDict, dResultsOrthologues, qContainsSuspectGenes)
    return nOrthologues_SpPair
def DoTrees(self, ogs, ogMatrix, idDict, speciesIdDict, nProcesses, qStopAfterSeqs, qStopAfterAlignments, qDoSpeciesTree):
    """
    Write orthogroup sequences, infer alignments and gene trees and,
    optionally, a species tree from a concatenated alignment.

    Args:
        ogs: orthogroups, passed to WriteFastaFiles and the command builders.
        ogMatrix: passed to DetermineOrthogroupsForSpeciesTree.
        idDict: id -> accession mapping. NOTE: updated in place with
            speciesIdDict so the same mapping also converts the concatenated
            alignment and the species tree.
        speciesIdDict: species id -> name mapping merged into idDict.
        nProcesses: number of parallel processes for the external commands.
        qStopAfterSeqs: return once the fasta files have been written.
        qStopAfterAlignments: return once the alignments have been produced.
        qDoSpeciesTree: also infer a species tree.

    Returns:
        List of results directories (accession versions): all that were
        created when stopping after sequences, otherwise the first two.
    """
    # same code will then also convert concatenated alignment for species tree
    idDict.update(speciesIdDict)

    # 0. Create the output directories (ID and accession versions)
    resultsDirsFullPath = []
    for fn in [self.GetFastaFilename, self.GetAlignmentFilename, self.GetTreeFilename]:
        for qIDs in [True, False]:
            d = os.path.split(fn(0, not qIDs))[0]
            if not os.path.exists(d):
                os.mkdir(d)
            if not qIDs:
                resultsDirsFullPath.append(d)
        if qStopAfterSeqs:
            break
        if qStopAfterAlignments and fn == self.GetAlignmentFilename:
            break

    # 1. Sequences
    fastaWriter = FastaWriter(self.ogsWorkingDir)
    self.WriteFastaFiles(fastaWriter, ogs, idDict)
    if qStopAfterSeqs:
        return resultsDirsFullPath

    # 3. Get OGs to use for species tree
    if qDoSpeciesTree:
        iOgsForSpeciesTree, fSingleCopy = DetermineOrthogroupsForSpeciesTree(ogMatrix)
        concatenated_algn_fn = os.path.split(self.GetAlignmentFilename(0))[0] + "/SpeciesTreeAlignment.fa"
    else:
        iOgsForSpeciesTree = []
    alignCommands_and_filenames = self.GetAlignmentCommandsAndNewFilenames(ogs)

    if qStopAfterAlignments:
        util.PrintUnderline("Inferring multiple sequence alignments")
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, alignCommands_and_filenames, False)
        # The concatenated alignment only exists when a species tree was
        # requested; without this guard its filename and fSingleCopy are unbound.
        if qDoSpeciesTree:
            CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename, concatenated_algn_fn, fSingleCopy)
        # ids -> accessions
        alignmentFilesToUse = [self.GetAlignmentFilename(i) for i, _ in enumerate(alignCommands_and_filenames)]
        accessionAlignmentFNs = [self.GetAlignmentFilename(i, True) for i in xrange(len(alignmentFilesToUse))]
        if qDoSpeciesTree:
            alignmentFilesToUse.append(concatenated_algn_fn)
            accessionAlignmentFNs.append(os.path.split(self.GetAlignmentFilename(0, True))[0] + "/SpeciesTreeAlignment.fa")
        self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)
        return resultsDirsFullPath[:2]

    # Otherwise, alignments and trees
    # Strategy is
    # 1. Do alignments (and trees) required for species tree
    # 2. Create concatenated alignment
    # 3. Create second list of commands [speciestree] + [remaining alignments and trees]
    alignmentFilesToUse = [self.GetAlignmentFilename(i) for i, _ in enumerate(alignCommands_and_filenames)]
    treeCommands_and_filenames = self.GetTreeCommands(alignmentFilesToUse, ogs)
    commands_and_filenames = []
    if qDoSpeciesTree:
        print("Species tree: Using %d orthogroups with minimum of %0.1f%% of species having single-copy genes in any orthogroup" % (len(iOgsForSpeciesTree), 100.*fSingleCopy))
        util.PrintUnderline("Inferring multiple sequence alignments for species tree")
        # Only the directory part is used, so index 0 is sufficient (the
        # directory-setup loop above makes the same choice). This no longer
        # relies on a loop variable leaking out of a list comprehension.
        speciesTreeFN_ids = os.path.split(self.GetTreeFilename(0))[0] + "/SpeciesTree_unrooted.txt"
        # Do required alignments and trees
        for i in iOgsForSpeciesTree:
            commands_and_filenames.append([alignCommands_and_filenames[i], treeCommands_and_filenames[i]])
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)
        CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename, concatenated_algn_fn, fSingleCopy)
        # Add species tree to list of commands to run
        commands_and_filenames = [self.program_caller.GetTreeCommands(self.tree_program, [concatenated_algn_fn], [speciesTreeFN_ids], ["SpeciesTree"])]
        util.PrintUnderline("Inferring remaining multiple sequence alignments and gene trees")
    else:
        util.PrintUnderline("Inferring multiple sequence alignments and gene trees")

    # Now continue as before: everything not already done for the species tree
    iOgsForSpeciesTree = set(iOgsForSpeciesTree)
    for i in xrange(len(treeCommands_and_filenames)):
        if i in iOgsForSpeciesTree:
            continue
        commands_and_filenames.append([alignCommands_and_filenames[i], treeCommands_and_filenames[i]])
    # Orthogroups that have an alignment command but no tree command
    for i in xrange(len(treeCommands_and_filenames), len(alignCommands_and_filenames)):
        if i in iOgsForSpeciesTree:
            continue
        commands_and_filenames.append([alignCommands_and_filenames[i]])
    pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)

    # Convert ids to accessions
    accessionAlignmentFNs = [self.GetAlignmentFilename(i, True) for i in xrange(len(alignmentFilesToUse))]
    # Add concatenated Alignment
    if qDoSpeciesTree:
        alignmentFilesToUse.append(concatenated_algn_fn)
        accessionAlignmentFNs.append(os.path.split(self.GetAlignmentFilename(0, True))[0] + "/SpeciesTreeAlignment.fa")
    self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)
    # The species tree file is only expected when one was requested
    if qDoSpeciesTree:
        if os.path.exists(speciesTreeFN_ids):
            util.RenameTreeTaxa(speciesTreeFN_ids, self.workingDir + "SpeciesTree_unrooted.txt", idDict, qFixNegatives=True)
        else:
            print("ERROR: Species tree inference failed")
            util.Fail()
    # Gene trees: a missing tree file is skipped rather than treated as an error
    for i in xrange(len(treeCommands_and_filenames)):
        if os.path.exists(self.GetTreeFilename(i)):
            util.RenameTreeTaxa(self.GetTreeFilename(i), self.GetTreeFilename(i, True), idDict, qFixNegatives=True)
    return resultsDirsFullPath[:2]
def OrthologuesWorkflow(workingDir_ogs,
                        orthofinderResultsDir,
                        speciesToUse, nSpAll,
                        clustersFilename_pairs,
                        tree_options,
                        msa_method,
                        tree_method,
                        nHighParallel,
                        nLowParrallel,
                        userSpeciesTree = None,
                        qStopAfterSeqs = False,
                        qStopAfterAlign = False,
                        qStopAfterTrees = False,
                        qMSA = False,
                        qPhyldog = False,
                        pickleDir=None):
    """
    Run the gene tree / species tree / orthologue inference workflow.

    1. Setup:
        - ogSet, directories
        - DendroBLASTTrees object
    2. DendroBLAST:
        - read scores
        - RunAnalysis: Get distance matrices, do trees
    3. Root species tree
    4. Reconciliation/Orthologues
    5. Clean up

    Variables:
    - ogSet - all the relevant information about the orthogroups, species etc.

    Returns:
        A string for the caller to display. Depending on the stop-after flags
        this is either a short list of the sequence/alignment/tree
        directories, the "Running Phyldog" message, or the value of
        GetResultsFilesString for the rooted species tree(s).
    """
    ogSet = OrthoGroupsSet(workingDir_ogs, speciesToUse, nSpAll, clustersFilename_pairs, idExtractor = util.FirstWordExtractor, pickleDir=pickleDir)

    # Class that is going to run the analysis needs to check the dependencies
#    if not CanRunOrthologueDependencies(workingDir_ogs, qMSA, qStopAfterTrees, userSpeciesTree == None):
#        print("Orthogroups have been inferred but the dependencies for inferring gene trees and")
#        print("orthologues have not been met. Please review previous messages for more information.")
#        sys.exit()

    resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_")
    """ === 1 === ust = UserSpeciesTree MSA: Sequences Alignments GeneTrees db SpeciesTree Phyldog: Sequences Alignments GeneTrees db SpeciesTree Dendroblast: DistanceMatrices GeneTrees db SpeciesTree MSA (ust): Sequences Alignments GeneTrees db Phyldog (ust): Sequences Alignments GeneTrees db Dendroblast (ust): DistanceMatrices GeneTrees db """
    if qMSA or qPhyldog:
        # Sequence/alignment based route. Phyldog infers the trees itself, so
        # for Phyldog DoTrees is asked to stop after the alignments.
        treeGen = msa.TreesForOrthogroups(tree_options, msa_method, tree_method, resultsDir, workingDir_ogs)
        seqs_alignments_dirs = treeGen.DoTrees(ogSet.OGs(qInclAll=True), ogSet.Spec_SeqDict(), nHighParallel, qStopAfterSeqs, qStopAfterAlign or qPhyldog)
        if qStopAfterSeqs:
            print("")
            return ("\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0])
        elif qStopAfterAlign:
            print("")
            st = "\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0]
            st += "\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1]
            return st
        db = DendroBLASTTrees(ogSet, resultsDir, nLowParrallel)
        if not userSpeciesTree:
            # No user tree: the species tree still comes from the BLAST-score distances
            util.PrintUnderline("Inferring species tree (calculating gene distances)")
            print("Loading BLAST scores")
            db.ReadAndPickle()
            spTreeFN_ids, spTreeUnrootedFN = db.SpeciesTreeOnly()
        if qPhyldog:
            # The Phyldog route ends here; nothing below this point runs for it
            trees_from_phyldog.RunPhyldogAnalysis(resultsDir + "WorkingDirectory/phyldog/", ogSet.OGs(), speciesToUse)
            return "Running Phyldog" + "\n".join(seqs_alignments_dirs)
    else:
        # DendroBLAST route: gene trees and species tree from distance matrices
        util.PrintUnderline("Calculating gene distances")
        db = DendroBLASTTrees(ogSet, resultsDir, nLowParrallel)
        db.ReadAndPickle()
        nOGs, D, spTreeFN_ids, spTreeUnrootedFN = db.RunAnalysis()
    """ === 2 === Check can continue with analysis """
    # NOTE(review): this check runs after the tree inference above, not before it
    if len(ogSet.speciesToUse) < 4:
        print("ERROR: Not enough species to infer species tree")
        util.Fail()
    """ === 3 === MSA: RootSpeciesTree Phyldog: RootSpeciesTree Dendroblast: RootSpeciesTree MSA (ust): ConvertSpeciesTreeIDs Phyldog (ust): ConvertSpeciesTreeIDs Dendroblast (ust): ConvertSpeciesTreeIDs """
    if userSpeciesTree:
        util.PrintUnderline("Using user-supplied species tree")
        # userSpeciesTree is rebound to whatever ConvertUserSpeciesTree returns
        userSpeciesTree = ConvertUserSpeciesTree(db.workingDir + "Trees_ids/", userSpeciesTree, ogSet.SpeciesDict())
        rootedSpeciesTreeFN = [userSpeciesTree]
        roots = [None]
        qMultiple = False
    else:
        util.PrintUnderline("Best outgroup(s) for species tree")
        # spDict is only defined on this branch; it is only read on code paths
        # that are not taken when a user species tree was supplied
        spDict = ogSet.SpeciesDict()
        roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(spTreeFN_ids, os.path.split(db.TreeFilename_IDs(0))[0] + "/", rfd.GeneToSpecies_dash, nHighParallel, treeFmt = 1)
        if len(roots) > 1:
            print("Observed %d duplications. %d support the best roots and %d contradict them." % (len(clusters), nSupport, len(clusters) - nSupport))
            print("Best outgroups for species tree:")
        else:
            print("Observed %d duplications. %d support the best root and %d contradict it." % (len(clusters), nSupport, len(clusters) - nSupport))
            print("Best outgroup for species tree:")
        for r in roots:
            print(" " + (", ".join([spDict[s] for s in r])) )
        qMultiple = len(roots) > 1
    if qStopAfterTrees:
        if userSpeciesTree:
            # With a user tree there is no species tree to write: list the directories
            st = ""
            if qMSA:
                st += "\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0]
                st += "\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1]
            st += "\nGene trees:\n %s\n" % (resultsDir + "Gene_Trees/")
            return st
        # otherwise, root species tree
        resultsSpeciesTrees = []
        for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            if len(roots) == 1:
                resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
            else:
                resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
            util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)
        db.DeleteBlastMatrices()
        CleanWorkingDir(db.workingDir)
        return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None, False)
    if qMultiple:
        util.PrintUnderline("\nAnalysing each of the potential species tree roots", True)
    # One reconciliation run, with its own results directory, per candidate root
    resultsSpeciesTrees = []
    for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        util.PrintUnderline("Reconciling gene trees and species tree" + (" (root %d)"%i if qMultiple else ""))
        if qMultiple:
            resultsDir_new = resultsDir + "Orthologues_using_outgroup_%d/" % i
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees_using_outgroup_%d/" % i
            resultsSpeciesTrees.append(resultsDir_new + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
            print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        elif userSpeciesTree:
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
        else:
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
            print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        # Must precede RenameTreeTaxa: with multiple roots the species tree is
        # written inside resultsDir_new
        os.mkdir(resultsDir_new)
        util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)
        ReconciliationAndOrthologues(db.TreeFilename_IDs, db.ogSet, speciesTree_fn, db.workingDir, resultsDir_new, reconTreesRenamedDir, nHighParallel, i if qMultiple else None, pickleDir=pickleDir)
    # Clean up
    db.DeleteBlastMatrices()
    CleanWorkingDir(db.workingDir)
    util.PrintUnderline("Writing results files", True)
    return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None)
def OrthologuesWorkflow(speciesToUse, nSpAll,
                        tree_options,
                        msa_method,
                        tree_method,
                        recon_method,
                        nHighParallel,
                        nLowParrallel,
                        qDoubleBlast,
                        qAddSpeciesToIDs,
                        userSpeciesTree = None,
                        qStopAfterSeqs = False,
                        qStopAfterAlign = False,
                        qStopAfterTrees = False,
                        qMSA = False,
                        qPhyldog = False,
                        results_name = ""):
    """
    Run the gene tree / species tree / orthologue inference workflow, with all
    file locations supplied by files.FileHandler.

    1. Setup:
        - ogSet, directories
        - DendroBLASTTrees object
    2. DendroBLAST:
        - read scores
        - RunAnalysis: Get distance matrices, do trees
    3. Root species tree
    4. Reconciliation/Orthologues
    5. Clean up

    Variables:
    - ogSet - all the relevant information about the orthogroups, species etc.

    Returns:
        A string for the caller to display: a list of the sequence/alignment/
        tree directories when stopping early, otherwise the value of
        GetResultsFilesString for the rooted species tree.
    """
    ogSet = OrthoGroupsSet(files.FileHandler.GetWorkingDirectory1_Read(), speciesToUse, nSpAll, qAddSpeciesToIDs, idExtractor = util.FirstWordExtractor)

    tree_generation_method = "msa" if qMSA or qPhyldog else "dendroblast"
    stop_after = "seqs" if qStopAfterSeqs else "align" if qStopAfterAlign else ""
    files.FileHandler.MakeResultsDirectory2(tree_generation_method, stop_after, results_name)
    """ === 1 === ust = UserSpeciesTree MSA: Sequences Alignments GeneTrees db SpeciesTree Phyldog: Sequences Alignments GeneTrees db SpeciesTree Dendroblast: DistanceMatrices GeneTrees db SpeciesTree MSA (ust): Sequences Alignments GeneTrees db Phyldog (ust): Sequences Alignments GeneTrees db Dendroblast (ust): DistanceMatrices GeneTrees db """
    # Hard-coded off: the DendroBLAST-only species tree step below never runs
    qDB_SpeciesTree = False
    if userSpeciesTree:
        util.PrintUnderline("Using user-supplied species tree")
        spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        ConvertUserSpeciesTree(userSpeciesTree, ogSet.SpeciesDict(), spTreeFN_ids)

    if qMSA or qPhyldog:
        qLessThanFourSpecies = len(ogSet.seqsInfo.speciesToUse) < 4
        treeGen = trees_msa.TreesForOrthogroups(tree_options, msa_method, tree_method)
        if (not userSpeciesTree) and qLessThanFourSpecies:
            # With two or three species the species tree is written directly
            spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
            WriteSpeciesTreeIDs_TwoThree(ogSet.seqsInfo.speciesToUse, spTreeFN_ids)
            util.RenameTreeTaxa(spTreeFN_ids, files.FileHandler.GetSpeciesTreeUnrootedFN(True), ogSet.SpeciesDict(), qSupport=False, qFixNegatives=True)
        qDoMSASpeciesTree = (not qLessThanFourSpecies) and (not userSpeciesTree)
        util.PrintTime("Starting MSA/Trees")
        # For Phyldog, DoTrees is asked to stop after the alignments
        seqs_alignments_dirs = treeGen.DoTrees(ogSet.OGs(qInclAll=True), ogSet.OrthogroupMatrix(), ogSet.Spec_SeqDict(), ogSet.SpeciesDict(), ogSet.speciesToUse, nHighParallel, qStopAfterSeqs, qStopAfterAlign or qPhyldog, qDoSpeciesTree=qDoMSASpeciesTree)
        util.PrintTime("Done MSA/Trees")
        if qDoMSASpeciesTree:
            spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        if qStopAfterSeqs:
            print("")
            return ("\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0])
        elif qStopAfterAlign:
            print("")
            st = "\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0]
            st += "\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1]
            return st
        db = DendroBLASTTrees(ogSet, nLowParrallel, qDoubleBlast)
        if qDB_SpeciesTree and not userSpeciesTree and not qLessThanFourSpecies:
            util.PrintUnderline("Inferring species tree (calculating gene distances)")
            print("Loading BLAST scores")
            spTreeFN_ids = db.SpeciesTreeOnly()
        if qPhyldog:
#            util.PrintTime("Do species tree for phyldog")
#            spTreeFN_ids, spTreeUnrootedFN = db.SpeciesTreeOnly()
            if userSpeciesTree:
                # NOTE(review): the user tree was already converted above, where the
                # return value is ignored; here userSpeciesTree is rebound to that
                # return value and is later tested for truthiness when
                # qStopAfterTrees is set - confirm what ConvertUserSpeciesTree returns
                userSpeciesTree = ConvertUserSpeciesTree(userSpeciesTree, ogSet.SpeciesDict(), files.FileHandler.GetSpeciesTreeUnrootedFN())
            util.PrintTime("Starting phyldog")
            species_tree_ids_labelled_phyldog = wrapper_phyldog.RunPhyldogAnalysis(files.FileHandler.GetPhyldogWorkingDirectory(), ogSet.OGs(), speciesToUse, nHighParallel)
    else:
        # DendroBLAST route: gene trees and species tree from distance matrices
        db = DendroBLASTTrees(ogSet, nLowParrallel, qDoubleBlast)
        spTreeFN_ids, qSTAG = db.RunAnalysis()
    files.FileHandler.LogWorkingDirectoryTrees()
    # qSTAG is only bound on the DendroBLAST route; the conditional expression
    # only evaluates it on that route
    qSpeciesTreeSupports = False if (userSpeciesTree or qMSA or qPhyldog) else qSTAG
    """ SpeciesTree spTreeFN_ids, or equivalently FileHandler.GetSpeciesTreeUnrootedFN() in all cases (user, inferred etc) Thus, we always have the species tree ids format With phyldog, we also have species_tree_ids_labelled_phyldog - with the node labels given by phyldog """
    """ === 2 === Check can continue with analysis """
    # if len(ogSet.speciesToUse) < 4:
    #     print("ERROR: Not enough species to infer species tree")
    #     util.Fail()
    """ === 3 === MSA: RootSpeciesTree Phyldog: RootSpeciesTree Dendroblast: RootSpeciesTree MSA (ust): ConvertSpeciesTreeIDs Phyldog (ust): ConvertSpeciesTreeIDs Dendroblast (ust): ConvertSpeciesTreeIDs """
    if qPhyldog:
        rootedSpeciesTreeFN = [species_tree_ids_labelled_phyldog]
        roots = [None]
        qMultiple = False
        all_stride_dup_genes = None
    elif userSpeciesTree:
        rootedSpeciesTreeFN = [spTreeFN_ids]
        roots = [None]
        qMultiple = False
        all_stride_dup_genes = None
    elif len(ogSet.seqsInfo.speciesToUse) == 2:
        hardcodeSpeciesTree = GetSpeciesTreeRoot_TwoTaxa(ogSet.seqsInfo.speciesToUse)
        rootedSpeciesTreeFN = [hardcodeSpeciesTree]
        roots = [None]
        qMultiple = False
        all_stride_dup_genes = None
    else:
        # Root the inferred species tree with STRIDE
        util.PrintUnderline("Best outgroup(s) for species tree")
        util.PrintTime("Starting STRIDE")
        roots, clusters_counter, rootedSpeciesTreeFN, nSupport, _, _, all_stride_dup_genes = stride.GetRoot(spTreeFN_ids, files.FileHandler.GetOGsTreeDir(), stride.GeneToSpecies_dash, nHighParallel, qWriteRootedTree=True)
        util.PrintTime("Done STRIDE")
        nAll = sum(clusters_counter.values())
        nFP_mp = nAll - nSupport
        # Only keys of length > 1 are counted as non-trivial
        n_non_trivial = sum([v for k, v in clusters_counter.items() if len(k) > 1])
        if len(roots) > 1:
            print("Observed %d well-supported, non-terminal duplications. %d support the best roots and %d contradict them." % (n_non_trivial, n_non_trivial-nFP_mp, nFP_mp))
            print("Best outgroups for species tree:")
        else:
            print("Observed %d well-supported, non-terminal duplications. %d support the best root and %d contradict it." % (n_non_trivial, n_non_trivial-nFP_mp, nFP_mp))
            print("Best outgroup for species tree:")
        # spDict is only defined on this branch; the "Outgroup" print further
        # down is guarded by the same conditions
        spDict = ogSet.SpeciesDict()
        for r in roots:
            print(" " + (", ".join([spDict[s] for s in r])) )
        qMultiple = len(roots) > 1
    # Only the first rooted tree is copied to the rooted-ids location
    shutil.copy(rootedSpeciesTreeFN[0], files.FileHandler.GetSpeciesTreeIDsRootedFN())
    """ SpeciesTree: We now have a list of rooted species trees: rootedSpeciesTreeFN (this should be recorded by the file handler) """
    if qStopAfterTrees:
        if userSpeciesTree:
            # With a user tree there is no species tree to write: list the directories
            st = ""
            if qMSA:
                st += "\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0]
                st += "\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1]
            st += "\nGene trees:\n %s\n" % (files.FileHandler.GetResultsTreesDir())
            return st
        # otherwise, root species tree
        resultsSpeciesTrees = []
        for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            resultsSpeciesTrees.append(files.FileHandler.GetSpeciesTreeResultsFN(i, not qMultiple))
            util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qSupport=qSpeciesTreeSupports, qFixNegatives=True)
            # Second copy of the tree, written with label='N' and without support values
            labeled_tree_fn = files.FileHandler.GetSpeciesTreeResultsNodeLabelsFN()
            util.RenameTreeTaxa(speciesTree_fn, labeled_tree_fn, db.ogSet.SpeciesDict(), qSupport=False, qFixNegatives=True, label='N')
        files.FileHandler.CleanWorkingDir2()
        return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None, False)
    if qMultiple:
        util.PrintUnderline("\nMultiple potential species tree roots were identified, only one will be analyed.", True)
    # Only the first root is reconciled, even if several were found
    resultsSpeciesTrees = []
    i = 0
    r = roots[0]
    speciesTree_fn = rootedSpeciesTreeFN[0]
    util.PrintUnderline("Reconciling gene trees and species tree")
    resultsSpeciesTrees.append(files.FileHandler.GetSpeciesTreeResultsFN(0, True))
    if (not userSpeciesTree) and (not qPhyldog) and len(ogSet.seqsInfo.speciesToUse) != 2:
        print("Outgroup: " + (", ".join([spDict[s] for s in r])))
    util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qSupport=qSpeciesTreeSupports, qFixNegatives=True)
    util.PrintTime("Starting Recon and orthologues")
    ReconciliationAndOrthologues(recon_method, db.ogSet, nHighParallel, i if qMultiple else None, all_stride_dup_genes=all_stride_dup_genes)
    util.PrintTime("Done Recon")
    if qMultiple:
        # Write every candidate rooted tree to the results, although only the
        # first one was analysed
        for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            unanalysedSpeciesTree = files.FileHandler.GetSpeciesTreeResultsFN(i, False)
            util.RenameTreeTaxa(speciesTree_fn, unanalysedSpeciesTree, db.ogSet.SpeciesDict(), qSupport=qSpeciesTreeSupports, qFixNegatives=True, label='N')
    """ SpeciesTree: If it's been inferred, there is now at least one rooted results species trees: GetSpeciesTreeResultsFN() """
    files.FileHandler.CleanWorkingDir2()
    util.PrintUnderline("Writing results files", True)
    return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None)
def DoTrees(self, ogs, ogMatrix, idDict, speciesIdDict, speciesToUse, qOutputCommands, nProcesses, qStopAfterSeqs, qStopAfterAlignments, qDoSpeciesTree):
    """
    Write orthogroup sequences, then infer alignments, gene trees and
    (optionally) a species tree - or, when qOutputCommands is set, write job
    files containing the commands instead of running them.

    Args:
        ogs: orthogroups, passed to WriteFastaFiles and the command builders.
        ogMatrix: passed to DetermineOrthogroupsForSpeciesTree.
        idDict: id -> accession mapping. NOTE: updated in place with
            speciesIdDict so the same mapping also converts the concatenated
            alignment and the species tree.
        speciesIdDict: species id -> name mapping merged into idDict.
        speciesToUse: passed to FastaWriter.
        qOutputCommands: create job files rather than running the commands.
        nProcesses: number of parallel processes when commands are run.
        qStopAfterSeqs: return once the fasta files have been written.
        qStopAfterAlignments: return once the alignment step is done.
        qDoSpeciesTree: also infer a species tree.

    Returns:
        [sequences dir, alignments dir, trees dir] when stopping after
        sequences, otherwise only the first two entries.
    """
    # same code will then also convert concatenated alignment for species tree
    idDict.update(speciesIdDict)

    # 0. Results directories
    resultsDirsFullPath = [
        files.FileHandler.GetResultsSeqsDir(),
        files.FileHandler.GetResultsAlignDir(),
        files.FileHandler.GetResultsTreesDir(),
    ]

    # 1. Sequences
    fastaWriter = FastaWriter(files.FileHandler.GetSpeciesSeqsDir(), speciesToUse)
    self.WriteFastaFiles(fastaWriter, ogs, idDict, True)
    if qStopAfterSeqs:
        return resultsDirsFullPath

    # Job files created so far; each Create*Job call receives the running count
    job_files = []

    # 3. Get OGs to use for species tree
    if qDoSpeciesTree:
        iOgsForSpeciesTree, fSingleCopy = DetermineOrthogroupsForSpeciesTree(ogMatrix)
        concatenated_algn_fn = files.FileHandler.GetSpeciesTreeConcatAlignFN()
    else:
        iOgsForSpeciesTree = []
    alignCommands_and_filenames = self.GetAlignmentCommandsAndNewFilenames(ogs)

    if qStopAfterAlignments:
        util.PrintUnderline("Inferring multiple sequence alignments")
        if qOutputCommands:
            job_files.append(CreateMsaJob(alignCommands_and_filenames, len(job_files)))
        else:
            pc.RunParallelCommandsAndMoveResultsFile(nProcesses, alignCommands_and_filenames, False)
        if qDoSpeciesTree:
            if qOutputCommands:
                job_files.append(CreateConcatenatedAlignmentJob(iOgsForSpeciesTree, concatenated_algn_fn, fSingleCopy, len(job_files)))
            else:
                CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename, concatenated_algn_fn, fSingleCopy)
        # ids -> accessions
        alignmentFilesToUse = [self.GetAlignmentFilename(i) for i, _ in enumerate(alignCommands_and_filenames)]
        accessionAlignmentFNs = [self.GetAlignmentFilename(i, True) for i in xrange(len(alignmentFilesToUse))]
        if qDoSpeciesTree:
            alignmentFilesToUse.append(concatenated_algn_fn)
            accessionAlignmentFNs.append(files.FileHandler.GetSpeciesTreeConcatAlignFN(True))
        if qOutputCommands:
            # TODO: make rename alignment taxa command
            util.PrintUnderline("Execute the commands in " + ','.join(job_files))
        else:
            self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)
        return resultsDirsFullPath[:2]

    # Otherwise, alignments and trees
    # Strategy is
    # 1. Do alignments (and trees) required for species tree
    # 2. Create concatenated alignment
    # 3. Create second list of commands [speciestree] + [remaining alignments and trees]
    alignmentFilesToUse = [self.GetAlignmentFilename(i) for i, _ in enumerate(alignCommands_and_filenames)]
    treeCommands_and_filenames = self.GetTreeCommands(alignmentFilesToUse, ogs)
    commands_and_filenames = []
    if qDoSpeciesTree:
        print("Species tree: Using %d orthogroups with minimum of %0.1f%% of species having single-copy genes in any orthogroup" % (len(iOgsForSpeciesTree), 100. * fSingleCopy))
        util.PrintUnderline("Inferring multiple sequence alignments for species tree")
        # Do required alignments and trees
        speciesTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        for i in iOgsForSpeciesTree:
            commands_and_filenames.append([alignCommands_and_filenames[i], treeCommands_and_filenames[i]])
        if qOutputCommands:
            job_files.append(CreateMsaJob(commands_and_filenames, len(job_files)))
            job_files.append(CreateConcatenatedAlignmentJob(iOgsForSpeciesTree, concatenated_algn_fn, fSingleCopy, len(job_files)))
        else:
            pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)
            CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename, concatenated_algn_fn, fSingleCopy)
        # Add species tree to list of commands to run
        commands_and_filenames = [self.program_caller.GetTreeCommands(self.tree_program, [concatenated_algn_fn], [speciesTreeFN_ids], ["SpeciesTree"])]
        if qOutputCommands:
            # The species tree gets its own job file, so the remaining gene
            # tree commands start from an empty list
            job_files.append(CreateSpeciesTreeJob(commands_and_filenames, len(job_files)))
            commands_and_filenames = []
        util.PrintUnderline("Inferring remaining multiple sequence alignments and gene trees")
    else:
        util.PrintUnderline("Inferring multiple sequence alignments and gene trees")

    # Now continue as before: everything not already done for the species tree
    iOgsForSpeciesTree = set(iOgsForSpeciesTree)
    for i in xrange(len(treeCommands_and_filenames)):
        if i in iOgsForSpeciesTree:
            continue
        commands_and_filenames.append([alignCommands_and_filenames[i], treeCommands_and_filenames[i]])
    # Orthogroups that have an alignment command but no tree command
    for i in xrange(len(treeCommands_and_filenames), len(alignCommands_and_filenames)):
        if i in iOgsForSpeciesTree:
            continue
        commands_and_filenames.append([alignCommands_and_filenames[i]])
    if qOutputCommands:
        job_files.append(CreateOGTreesJob(commands_and_filenames, len(job_files)))
    else:
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)

    # Convert ids to accessions
    accessionAlignmentFNs = [self.GetAlignmentFilename(i, True) for i in xrange(len(alignmentFilesToUse))]
    # Add concatenated Alignment
    if qDoSpeciesTree:
        if qOutputCommands:
            job_files.append(CreateRenameTaxaJob(
                [(concatenated_algn_fn, files.FileHandler.GetSpeciesTreeConcatAlignFN(True))],
                [(speciesTreeFN_ids, files.FileHandler.GetSpeciesTreeUnrootedFN(True))],
                len(job_files)))
        else:
            alignmentFilesToUse.append(concatenated_algn_fn)
            accessionAlignmentFNs.append(files.FileHandler.GetSpeciesTreeConcatAlignFN(True))
            if os.path.exists(speciesTreeFN_ids):
                # Only inspect the tree once it is known to exist, so a failed
                # inference always reaches the explicit error below
                qHaveSupport = util.HaveSupportValues(speciesTreeFN_ids)
                util.RenameTreeTaxa(speciesTreeFN_ids,
                                    files.FileHandler.GetSpeciesTreeUnrootedFN(True),
                                    idDict,
                                    qSupport=qHaveSupport,
                                    qFixNegatives=True)
            else:
                text = "ERROR: Species tree inference failed"
                files.FileHandler.LogFailAndExit(text)

    if qOutputCommands:
        job_files.append(CreateRenameTaxaJob(
            zip(alignmentFilesToUse, accessionAlignmentFNs),
            [(self.GetTreeFilename(i), self.GetTreeFilename(i, True)) for i in xrange(len(treeCommands_and_filenames))],
            len(job_files)))
        print("Run the commands contained in these files (each depends on the previous):\n" + "\n".join(job_files))
        files.FileHandler.LogWorkingDirectoryTrees()
    else:
        self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)
        # Support values are detected once, from the first gene tree that
        # exists, and that result is reused for all remaining trees
        qHaveSupport = None
        for i in xrange(len(treeCommands_and_filenames)):
            infn = self.GetTreeFilename(i)
            if os.path.exists(infn):
                if qHaveSupport is None:
                    qHaveSupport = util.HaveSupportValues(infn)
                util.RenameTreeTaxa(infn, self.GetTreeFilename(i, True), idDict, qSupport=qHaveSupport, qFixNegatives=True)
    return resultsDirsFullPath[:2]