def one_to_one_efficient(orthodict, genenumbers, speciesLabels, iSpecies, outputDir):
    """
    Build and pickle the sparse orthologue matrices for one query species.

    speciesLabels is an ordered list of the speciesIDs; iSpecies is the
    ordinal position of the query species in that list, not its label.

    For every species j ordered before iSpecies a
    genenumbers[iSpecies] x genenumbers[j] int8 LIL matrix is created and a 1
    is stored at [iGene, jGene] for each orthologue pair found in orthodict
    (keys and values are '<speciesLabel>_<geneIndex>' strings).  Slots for
    j >= iSpecies are None.  Every slot, None included, is pickled to
    outputDir + 'ortholog_<iSpecies>_<j>_matrix.pic'.

    Returns the list of matrices indexed by species ordinal.
    """
    util.PrintTime("Processing orthologues for species %d" % iSpecies)
    # species label -> ordinal position
    ordinalOfLabel = dict((label, pos) for pos, label in enumerate(speciesLabels))
    # One matrix per earlier species, None for the rest
    matrixlist = [
        sparse.lil_matrix((genenumbers[iSpecies], genenumbers[j]), dtype=np.dtype(np.int8))
        if j < iSpecies else None
        for j in range(len(speciesLabels))
    ]
    # Fill matrices with orthodata
    queryPrefix = '%d_' % speciesLabels[iSpecies]
    queryGenes = [name for name in orthodict if name.startswith(queryPrefix)]
    for queryGene in queryGenes:
        _, iGene = [int(part) for part in queryGene.split('_')]
        for hitGene in orthodict[queryGene]:
            jLabel, jGene = [int(part) for part in hitGene.split('_')]
            jSp = ordinalOfLabel[jLabel]
            if jSp < iSpecies:
                matrixlist[jSp][iGene, jGene] = 1
    for j, matrix in enumerate(matrixlist):
        picklePath = outputDir + 'ortholog_%d_%d_matrix.pic' % (iSpecies, j)
        with open(picklePath, 'wb') as outfile:
            pic.dump(matrix, outfile)
    return matrixlist
def RunAnalysis(self, qSpeciesTree=True):
    """
    Compute the gene distance matrices, run the gene tree (and, if needed,
    species tree) commands and write renamed copies of the trees.

    The species tree is obtained by one of three routes:
      - fewer than four species: WriteSpeciesTreeIDs_TwoThree writes it directly
      - EnoughOGsForSTAG is true: stag.Run_ForOrthoFinder is run once the tree
        commands have finished
      - otherwise: the fallback command from PrepareSpeciesTreeCommand is
        prepended to the gene tree command lists

    qSpeciesTree - when True the unrooted species tree is also passed through
                   util.RenameTreeTaxa and its IDs-format filename is returned

    Returns (spTreeFN_ids, qSTAG), or (None, qSTAG) when qSpeciesTree is False.
    """
    util.PrintUnderline("Calculating gene distances")
    ogs, partialMatrices = self.GetOGMatrices_FullParallel()
    ogMatrices = self.CompleteAndWriteOGMatrices(ogs, partialMatrices)
    util.PrintTime("Done")
    treeCommands = self.PrepareGeneTreeCommand()
    speciesToUse = self.ogSet.seqsInfo.speciesToUse
    if len(speciesToUse) < 4:
        qSTAG = False
        spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        WriteSpeciesTreeIDs_TwoThree(speciesToUse, spTreeFN_ids)
    else:
        qSTAG = self.EnoughOGsForSTAG(ogs, speciesToUse)
        if not qSTAG:
            print("Using fallback species tree inference method")
            D, spPairs = self.SpeciesTreeDistances(ogs, ogMatrices)
            spTreeCommand, spTreeFN_ids = self.PrepareSpeciesTreeCommand(D, spPairs)
            treeCommands = [[spTreeCommand]] + treeCommands
    util.PrintUnderline("Inferring gene and species trees")
    util.RunParallelOrderedCommandLists(self.nProcesses, treeCommands)
    if qSTAG:
        # Trees must have been completed
        print("")
        spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        stag.Run_ForOrthoFinder(files.FileHandler.GetOGsTreeDir(),
                                files.FileHandler.GetWorkingDirectory_Write(),
                                speciesToUse,
                                spTreeFN_ids)
    seqDict = self.ogSet.Spec_SeqDict()
    nOGs = len(self.ogSet.OGs())
    for iog in xrange(nOGs):
        idsTreeFN = files.FileHandler.GetOGsTreeFN(iog)
        namesTreeFN = files.FileHandler.GetOGsTreeFN(iog, True)
        util.RenameTreeTaxa(idsTreeFN, namesTreeFN, seqDict, qSupport=False, qFixNegatives=True)
    if not qSpeciesTree:
        return None, qSTAG
    util.RenameTreeTaxa(spTreeFN_ids,
                        files.FileHandler.GetSpeciesTreeUnrootedFN(True),
                        self.ogSet.SpeciesDict(),
                        qSupport=False,
                        qFixNegatives=True)
    return spTreeFN_ids, qSTAG
def GetOGMatrices(self):
    """
    Build one normalised score matrix per orthogroup.

    ogMatrices contains matrix M for each OG where:
        Mij = 0.5*max(Bij, Bmin_i)/Bmax_i
    B being the "Bit" matrix loaded for the species pair of genes i and j, and
    Bmin_i / Bmax_i being accumulated from lil_min(B) / lil_max(B) over all
    the matrices loaded for gene i's species.

    Warnings are suppressed while the matrices are being filled.

    Returns (ogs, ogMatrices).
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ogs = self.ogSet.OGs()
        speciesToUse = self.ogSet.seqsInfo.speciesToUse
        # For each OG and each species: (gene, position of gene in OG) pairs
        genesBySpecies = []
        for og in ogs:
            perSpecies = []
            for iSp in speciesToUse:
                perSpecies.append([(g, pos) for pos, g in enumerate(og) if g.iSp == iSp])
            genesBySpecies.append(perSpecies)
        nSeqs = self.ogSet.seqsInfo.nSeqsPerSpecies
        ogMatrices = [np.zeros((len(og), len(og))) for og in ogs]
        for iiSp, sp1 in enumerate(speciesToUse):
            util.PrintTime("Processing species %d" % sp1)
            Bs = [matrices.LoadMatrix("Bit", self.ogSet.fileInfo, sp1, sp2) for sp2 in speciesToUse]
            columnShape = (nSeqs[sp1], 1)
            mins = np.ones(columnShape, dtype=np.float64)*9e99
            maxes = np.zeros(columnShape, dtype=np.float64)
            for B in Bs:
                mins = np.minimum(mins, lil_min(B))
                maxes = np.maximum(maxes, lil_max(B))
            for jjSp, B in enumerate(Bs):
                for perSpecies, m in zip(genesBySpecies, ogMatrices):
                    queryGenes = perSpecies[iiSp]
                    hitGenes = perSpecies[jjSp]
                    for gi, i in queryGenes:
                        for gj, j in hitGenes:
                            # inf if i doesn't hit anything but is hit
                            m[i, j] = 0.5*max(B[gi.iSeq, gj.iSeq], mins[gi.iSeq]) / maxes[gi.iSeq]
    return ogs, ogMatrices
def DoOrthologuesForOrthoFinder_Phyldog(ogSet, workingDirectory, GeneToSpecies, output_dir, reconTreesRenamedDir):
    """
    Write the species-pair orthologue files from the Phyldog result trees.

    In order:
      - for every species in use, make output_dir/Orthologues_<species>/ (if
        absent) and write a header-only <sp1>__v__<sp2>.tsv for each other
        species
      - open the duplications file and, for each orthogroup, get the
        orthologues from its Phyldog result tree, write the renamed tree to
        reconTreesRenamedDir + "OG%07d_tree.txt" and append the orthologues
        to the species-pair files

    workingDirectory is accepted but not used in this function.

    Returns the orthologue counts: initialised by util.nOrtho_sp(nspecies) and
    incremented with the result of each AppendOrthologuesToFiles call.
    """
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    # Write directory and file structure
    for iFirst in xrange(nspecies):
        firstName = speciesDict[str(speciesIDs[iFirst])]
        speciesDir = output_dir + "Orthologues_" + firstName + "/"
        if not os.path.exists(speciesDir):
            os.mkdir(speciesDir)
        for iSecond in xrange(nspecies):
            if iSecond == iFirst:
                continue
            secondName = speciesDict[str(speciesIDs[iSecond])]
            with open(speciesDir + '%s__v__%s.tsv' % (firstName, secondName), 'wb') as outfile:
                headerWriter = csv.writer(outfile, delimiter="\t")
                headerWriter.writerow(("Orthogroup", firstName, secondName))
    nOgs = len(ogSet.OGs())
    # Report progress every 10, 100 or 1000 orthogroups depending on the total
    if nOgs <= 200:
        reportEvery = 10
    elif nOgs <= 2000:
        reportEvery = 100
    else:
        reportEvery = 1000
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    with open(files.FileHandler.GetDuplicationsFN(), 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            recon_tree = files.FileHandler.GetPhyldogOGResultsTreeFN(iog)
            orthologues = GetOrthologues_from_phyldog_tree(iog,
                                                           recon_tree,
                                                           GeneToSpecies,
                                                           dupsWriter=dupWriter,
                                                           seqIDs=ogSet.Spec_SeqDict(),
                                                           spIDs=ogSet.SpeciesDict())
            renamedTreeFN = reconTreesRenamedDir + "OG%07d_tree.txt" % iog
            util.RenameTreeTaxa(recon_tree, renamedTreeFN, ogSet.Spec_SeqDict(), qSupport=False, qFixNegatives=True, label='n')
            if iog % reportEvery == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            nOrthologues_SpPair += AppendOrthologuesToFiles([(iog, orthologues)],
                                                            speciesDict,
                                                            ogSet.speciesToUse,
                                                            SequenceDict,
                                                            output_dir,
                                                            False)
    return nOrthologues_SpPair
def Worker_BlastScores(cmd_queue, seqsInfo, fileInfo, nProcesses, nToDo):
    """
    Queue worker that turns BLAST results into stored "Bit" score matrices.

    Jobs are (i, args) tuples taken from cmd_queue; the worker returns once a
    get() has waited one second without receiving a job.  For each job, args
    is unpacked into BlastFileProcessor.GetBLAST6Scores (self hits are not
    excluded) and the result is stored with matrices.DumpMatrix, using
    args[0] and args[1] to identify it.

    Progress is printed as "Done <nDone> of <nToDo>" with
    nDone = i - nProcesses + 1, every 10, 100 or 1000 jobs depending on nToDo.
    """
    if nToDo <= 200:
        reportEvery = 10
    elif nToDo <= 2000:
        reportEvery = 100
    else:
        reportEvery = 1000
    while True:
        try:
            iJob, args = cmd_queue.get(True, 1)
            nDone = iJob - nProcesses + 1
            if nDone >= 0 and nDone % reportEvery == 0:
                util.PrintTime("Done %d of %d" % (nDone, nToDo))
            scores = BlastFileProcessor.GetBLAST6Scores(seqsInfo, fileInfo, *args, qExcludeSelfHits = False)
            matrices.DumpMatrix("Bit", scores, fileInfo, args[0], args[1])
        except Queue.Empty:
            return
def Worker_BlastScores(cmd_queue, seqsInfo, fileInfo, nProcesses, nToDo):
    """
    Queue worker that turns BLAST results into pickled "Bit" score matrices.

    Jobs are (i, args) tuples taken from cmd_queue; the worker returns once a
    get() has waited one second without receiving a job.  For each job, args
    is unpacked into BlastFileProcessor.GetBLAST6Scores (self hits are not
    excluded) and the result is pickled, with protocol util.picProtocol, to
    fileInfo.workingDir + "Bit%d_%d.pic" % args.

    Progress is printed as "Done <nDone> of <nToDo>" with
    nDone = i - nProcesses + 1, every 10, 100 or 1000 jobs depending on nToDo.
    """
    if nToDo <= 200:
        reportEvery = 10
    elif nToDo <= 2000:
        reportEvery = 100
    else:
        reportEvery = 1000
    while True:
        try:
            iJob, args = cmd_queue.get(True, 1)
            nDone = iJob - nProcesses + 1
            if nDone >= 0 and nDone % reportEvery == 0:
                util.PrintTime("Done %d of %d" % (nDone, nToDo))
            scores = BlastFileProcessor.GetBLAST6Scores(seqsInfo, fileInfo, *args, qExcludeSelfHits = False)
            pickleFN = fileInfo.workingDir + "Bit%d_%d.pic" % args
            with open(pickleFN, 'wb') as outfile:
                pic.dump(scores, outfile, protocol = util.picProtocol)
        except Queue.Empty:
            return
def DoOrthologuesForOrthoFinder(ogSet, treesIDsPatFn, species_tree_rooted_fn, GeneToSpecies, workingDir, output_dir, reconTreesRenamedDir, all_stride_dup_genes):
    """
    Infer orthologues from the gene trees and write the results files.

    In order:
      - create output_dir/Putative_Xenologues/ with a header-only
        <species>.csv per species, and output_dir/Orthologues_<species>/ with
        a header-only <sp1>__v__<sp2>.csv for every other species
      - load the rooted species tree and label its nodes N0 (root), N1, ...
      - for each orthogroup: get the orthologues, reconciled tree and suspect
        genes from GetOrthologues_for_tree; append each species' suspect genes
        to its Putative_Horizontal_Gene_Transfer.txt; write the renamed tree
        to reconTreesRenamedDir + "OG%07d_tree.txt"; append the orthologues to
        the species-pair files

    treesIDsPatFn - called with the orthogroup index to get its tree filename
    workingDir - accepted but not used in this function

    Returns the orthologue counts: initialised by util.nOrtho_sp(nspecies) and
    incremented with the result of each AppendOrthologuesToFiles call.
    """
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    # Write directory and file structure
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    speciesNames = [speciesDict[str(sp)] for sp in speciesIDs]
    dSuspect = output_dir + "Putative_Xenologues/"
    if not os.path.exists(dSuspect):
        os.mkdir(dSuspect)
    for index1, name1 in enumerate(speciesNames):
        with open(dSuspect + '%s.csv' % name1, 'wb') as outfile:
            writer1 = csv.writer(outfile, delimiter="\t")
            writer1.writerow(("Orthogroup", name1, "Other"))
        d = output_dir + "Orthologues_" + name1 + "/"
        if not os.path.exists(d):
            os.mkdir(d)
        for index2, name2 in enumerate(speciesNames):
            if index2 == index1:
                continue
            with open(d + '%s__v__%s.csv' % (name1, name2), 'wb') as outfile:
                writer1 = csv.writer(outfile, delimiter="\t")
                writer1.writerow(("Orthogroup", name1, name2))
    # Infer orthologues and write them to file
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    neighbours = GetSpeciesNeighbours(species_tree_rooted)
    # Label nodes of species tree
    species_tree_rooted.name = "N0"
    iNode = 1
    for n in species_tree_rooted.traverse():
        if (not n.is_leaf()) and (not n.is_root()):
            n.name = "N%d" % iNode
            iNode += 1
    nOgs = len(ogSet.OGs())
    # Report progress every 10, 100 or 1000 orthogroups depending on the total
    reportEvery = 10 if nOgs <= 200 else 100 if nOgs <= 2000 else 1000
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    with open(reconTreesRenamedDir + "../Duplications.csv", 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            orthologues, recon_tree, suspect_genes = GetOrthologues_for_tree(iog,
                                                                            treesIDsPatFn(iog),
                                                                            species_tree_rooted,
                                                                            GeneToSpecies,
                                                                            neighbours,
                                                                            dupsWriter=dupWriter,
                                                                            seqIDs=ogSet.Spec_SeqDict(),
                                                                            spIDs=ogSet.SpeciesDict(),
                                                                            all_stride_dup_genes=all_stride_dup_genes)
            # Iterate the species in use (whose Orthologues_ directories were
            # created above) rather than the first nspecies dictionary keys.
            for sp, name in zip(speciesIDs, speciesNames):
                strsp0_ = str(sp) + "_"
                these_genes = [g for g in suspect_genes if g.startswith(strsp0_)]
                if these_genes:
                    with open(output_dir + "Orthologues_" + name + "/Putative_Horizontal_Gene_Transfer.txt", 'ab') as outfile:
                        # Write every suspect gene of this species.  Joining
                        # [SequenceDict[g]] instead would write one gene only,
                        # using the variable leaked by the comprehension above.
                        outfile.write("\n".join([SequenceDict[g] for g in these_genes]) + "\n")
            allOrthologues = [(iog, orthologues)]
            util.RenameTreeTaxa(recon_tree, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, ogSet.Spec_SeqDict(), qSupport=False, qFixNegatives=True, label='n')
            if iog % reportEvery == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict, output_dir, True)
    return nOrthologues_SpPair
def OrthologuesFromTrees(recon_method, nHighParallel, userSpeciesTree_fn, qAddSpeciesToIDs):
    """
    Just infer orthologues from trees, don't do any of the preceding steps.

    recon_method - passed on to ReconciliationAndOrthologues
    nHighParallel - passed on to ReconciliationAndOrthologues
    userSpeciesTree_fn - None if not supplied, otherwise rooted tree using user
                         species names (not orthofinder IDs).  When supplied it
                         is checked against the names of the species in use and
                         converted into the rooted species tree IDs file.
    qAddSpeciesToIDs - passed on to OrthoGroupsSet

    Returns a message giving the species-by-species orthologues directory.
    """
    speciesToUse, nSpAll, _ = util.GetSpeciesToUse(files.FileHandler.GetSpeciesIDsFN())
    ogSet = OrthoGroupsSet(files.FileHandler.GetWorkingDirectory1_Read(), speciesToUse, nSpAll, qAddSpeciesToIDs, idExtractor = util.FirstWordExtractor)
    # Identity test: None is a singleton, so don't rely on the argument's __ne__
    if userSpeciesTree_fn is not None:
        speciesDict = files.FileHandler.GetSpeciesDict()
        speciesToUseNames = [speciesDict[str(iSp)] for iSp in ogSet.speciesToUse]
        CheckUserSpeciesTree(userSpeciesTree_fn, speciesToUseNames)
        speciesTreeFN_ids = files.FileHandler.GetSpeciesTreeIDsRootedFN()
        ConvertUserSpeciesTree(userSpeciesTree_fn, speciesDict, speciesTreeFN_ids)
    util.PrintUnderline("Running Orthologue Prediction", True)
    util.PrintUnderline("Reconciling gene and species trees")
    ReconciliationAndOrthologues(recon_method, ogSet, nHighParallel)
    util.PrintUnderline("Writing results files")
    util.PrintTime("Writing results files")
    files.FileHandler.CleanWorkingDir2()
    return "Species-by-species orthologues directory:\n %s\n" % files.FileHandler.GetOrthologuesDirectory()
def DoOrthologuesForOrthoFinder(ogSet, treesIDsPatFn, species_tree_rooted_fn, GeneToSpecies, workingDir, output_dir, reconTreesRenamedDir, all_stride_dup_genes):
    """
    Infer orthologues from the gene trees and write the results files.

    In order:
      - for every species in use, make output_dir/Orthologues_<species>/ (if
        absent) and write a header-only <sp1>__v__<sp2>.csv for each other
        species
      - load the rooted species tree and label its nodes N0 (root), N1, ...
      - for each orthogroup, get the orthologues and reconciled tree from
        GetOrthologues_for_tree and write the renamed tree to
        reconTreesRenamedDir + "OG%07d_tree.txt"
      - append the orthologues of all orthogroups to the species-pair files

    treesIDsPatFn - called with the orthogroup index to get its tree filename
    workingDir - accepted but not used in this function

    Returns the orthologue counts: util.nOrtho_sp(nspecies) plus the result of
    AppendOrthologuesToFiles.
    """
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    # Write directory and file structure
    for iFirst in xrange(nspecies):
        firstName = speciesDict[str(speciesIDs[iFirst])]
        speciesDir = output_dir + "Orthologues_" + firstName + "/"
        if not os.path.exists(speciesDir):
            os.mkdir(speciesDir)
        for iSecond in xrange(nspecies):
            if iSecond == iFirst:
                continue
            secondName = speciesDict[str(speciesIDs[iSecond])]
            with open(speciesDir + '%s__v__%s.csv' % (firstName, secondName), 'wb') as outfile:
                headerWriter = csv.writer(outfile, delimiter="\t")
                headerWriter.writerow(("Orthogroup", firstName, secondName))
    # Infer orthologues and write them to file
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    # Label nodes of species tree
    species_tree_rooted.name = "N0"
    iNode = 1
    for node in species_tree_rooted.traverse():
        if node.is_leaf() or node.is_root():
            continue
        node.name = "N%d" % iNode
        iNode += 1
    nOgs = len(ogSet.OGs())
    # Report progress every 10, 100 or 1000 orthogroups depending on the total
    if nOgs <= 200:
        reportEvery = 10
    elif nOgs <= 2000:
        reportEvery = 100
    else:
        reportEvery = 1000
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    allOrthologues = []
    with open(reconTreesRenamedDir + "../Duplications.csv", 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            orthologues, recon_tree = GetOrthologues_for_tree(iog,
                                                              treesIDsPatFn(iog),
                                                              species_tree_rooted,
                                                              GeneToSpecies,
                                                              dupsWriter=dupWriter,
                                                              seqIDs=ogSet.Spec_SeqDict(),
                                                              spIDs=ogSet.SpeciesDict(),
                                                              all_stride_dup_genes=all_stride_dup_genes)
            allOrthologues.append((iog, orthologues))
            renamedTreeFN = reconTreesRenamedDir + "OG%07d_tree.txt" % iog
            util.RenameTreeTaxa(recon_tree, renamedTreeFN, ogSet.Spec_SeqDict(), qFixNegatives=True, label='n')
            if iog % reportEvery == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
    # NOTE(review): the source arrived with its indentation collapsed; this call
    # is placed after the loop because allOrthologues accumulates every
    # orthogroup and is never reset - confirm against the original layout.
    nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict, output_dir)
    return nOrthologues_SpPair
def ReconciliationAndOrthologues(recon_method, ogSet, nParallel, iSpeciesTree=None, all_stride_dup_genes=None):
    """
    Reconcile the gene trees with the rooted species tree, write the
    orthologue files and then the orthologue statistics.

    recon_method - selects the route:
        any name containing "dlcpar" - run DLCpar (converged search when the
            name is exactly "dlcpar_convergedsearch"), rename its locus trees
            and build the orthologue lists from its results
        "phyldog" - orthologues from the Phyldog trees
        anything else - the trees2ologs_of method; "only_overlap" sets its
            qNoRecon flag
    ogSet - info about the orthogroups, species etc
    nParallel - passed to DLCpar as its parallelism argument
    iSpeciesTree - which of the potential roots of the species tree this is;
        only used in the DLCpar progress heading
    all_stride_dup_genes - passed on to trees2ologs_of.DoOrthologuesForOrthoFinder

    The node-labelled species tree is written first in every case, and the
    counts from TwoAndThreeGeneOrthogroups are added before the statistics
    are written.
    """
    idsSpeciesTreeFN = files.FileHandler.GetSpeciesTreeIDsRootedFN()
    labelledSpeciesTreeFN = files.FileHandler.GetSpeciesTreeResultsNodeLabelsFN()
    util.RenameTreeTaxa(idsSpeciesTreeFN, labelledSpeciesTreeFN, ogSet.SpeciesDict(), qSupport=False, qFixNegatives=True, label='N')
    # Orthologues working dir, results dir and renamed reconciled trees dir
    workingDir = files.FileHandler.GetWorkingDirectory_Write()
    ologsDir = files.FileHandler.GetOrthologuesDirectory()
    renamedReconTreesDir = files.FileHandler.GetOGsReconTreeDir(True)
    if "dlcpar" in recon_method:
        qDeepSearch = (recon_method == "dlcpar_convergedsearch")
        util.PrintTime("Starting DLCpar")
        dlcparDir, locusTreePattern = trees2ologs_dlcpar.RunDlcpar(ogSet, idsSpeciesTreeFN, workingDir, nParallel, qDeepSearch)
        util.PrintTime("Done DLCpar")
        seqNames = ogSet.Spec_SeqDict()
        nOGs = len(ogSet.OGs())
        for iog in xrange(nOGs):
            locusTreeFN = dlcparDir + locusTreePattern % iog
            util.RenameTreeTaxa(locusTreeFN, files.FileHandler.GetOGsReconTreeFN(iog), seqNames, qSupport=False, qFixNegatives=False, inFormat=8, label='n')
        # Orthologue lists
        if iSpeciesTree is None:
            rootNote = ""
        else:
            rootNote = " (root %d)" % iSpeciesTree
        util.PrintUnderline("Inferring orthologues from gene trees" + rootNote)
        pickleDir = files.FileHandler.GetPickleDir()
        nOrthologues_SpPair = trees2ologs_dlcpar.create_orthologue_lists(ogSet, ologsDir, dlcparDir, pickleDir)
    elif recon_method == "phyldog":
        util.PrintTime("Starting Orthologues from Phyldog")
        nOrthologues_SpPair = trees2ologs_of.DoOrthologuesForOrthoFinder_Phyldog(ogSet,
                                                                                 workingDir,
                                                                                 trees2ologs_of.GeneToSpecies_dash,
                                                                                 ologsDir,
                                                                                 renamedReconTreesDir)
        util.PrintTime("Done Orthologues from Phyldog")
    else:
        util.PrintTime("Starting OF Orthologues")
        qNoRecon = (recon_method == "only_overlap")
        nOrthologues_SpPair = trees2ologs_of.DoOrthologuesForOrthoFinder(ogSet,
                                                                         idsSpeciesTreeFN,
                                                                         trees2ologs_of.GeneToSpecies_dash,
                                                                         all_stride_dup_genes,
                                                                         qNoRecon)
        util.PrintTime("Done OF Orthologues")
    nOrthologues_SpPair += TwoAndThreeGeneOrthogroups(ogSet, ologsDir)
    WriteOrthologuesStats(ogSet, nOrthologues_SpPair)
def DoOrthologuesForOrthoFinder(ogSet, species_tree_rooted_fn, GeneToSpecies, all_stride_dup_genes, qNoRecon):
    """
    Root the gene trees, infer orthologues from them and write the results.

    In order:
      - for every species in use, make Orthologues_<species>/ under the
        orthologues results directory (if absent) and write a header-only
        <sp1>__v__<sp2>.tsv for each other species
      - load the rooted species tree and label its nodes N0 (root), N1, ...
      - for each orthogroup: root its tree with CheckAndRootTree (orthogroups
        for which that returns None are skipped entirely), write the rooted
        tree with accessions, get orthologues / reconciled tree / suspect
        genes from GetOrthologues_from_tree, record the suspect genes per
        species, write the renamed reconciled tree and append the orthologues
        to the species-pair files

    The suspect-gene directories and the per-species header files in the
    putative xenologues directory are only set up the first time an
    orthogroup actually contains suspect genes.

    Returns the orthologue counts: initialised by util.nOrtho_sp(nspecies) and
    incremented with the result of each AppendOrthologuesToFiles call.
    """
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    qSuspectDirsReady = False
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    dResultsOrthologues = files.FileHandler.GetOrthologuesDirectory()
    # Write directory and file structure
    for iFirst in xrange(nspecies):
        firstName = speciesDict[str(speciesIDs[iFirst])]
        speciesDir = dResultsOrthologues + "Orthologues_" + firstName + "/"
        if not os.path.exists(speciesDir):
            os.mkdir(speciesDir)
        for iSecond in xrange(nspecies):
            if iSecond == iFirst:
                continue
            secondName = speciesDict[str(speciesIDs[iSecond])]
            with open(speciesDir + '%s__v__%s.tsv' % (firstName, secondName), 'wb') as outfile:
                headerWriter = csv.writer(outfile, delimiter="\t")
                headerWriter.writerow(("Orthogroup", firstName, secondName))
    # Infer orthologues and write them to file
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    neighbours = GetSpeciesNeighbours(species_tree_rooted)
    # Label nodes of species tree
    species_tree_rooted.name = "N0"
    iNode = 1
    for node in species_tree_rooted.traverse():
        if node.is_leaf() or node.is_root():
            continue
        node.name = "N%d" % iNode
        iNode += 1
    nOgs = len(ogSet.OGs())
    # Report progress every 10, 100 or 1000 orthogroups depending on the total
    if nOgs <= 200:
        reportEvery = 10
    elif nOgs <= 2000:
        reportEvery = 100
    else:
        reportEvery = 1000
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    species = speciesDict.keys()
    reconTreesRenamedDir = files.FileHandler.GetOGsReconTreeDir(True)
    spec_seq_dict = ogSet.Spec_SeqDict()
    with open(files.FileHandler.GetDuplicationsFN(), 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            # this can be parallelised easily
            rooted_tree_ids, qHaveSupport = CheckAndRootTree(files.FileHandler.GetOGsTreeFN(iog), species_tree_rooted, GeneToSpecies)
            if rooted_tree_ids is None:
                continue
            # Write rooted tree with accessions
            util.RenameTreeTaxa(rooted_tree_ids,
                                files.FileHandler.GetOGsTreeFN(iog, True),
                                spec_seq_dict,
                                qSupport=qHaveSupport,
                                qFixNegatives=True,
                                qViaCopy=True)
            orthologues, recon_tree, suspect_genes = GetOrthologues_from_tree(iog,
                                                                             rooted_tree_ids,
                                                                             species_tree_rooted,
                                                                             GeneToSpecies,
                                                                             neighbours,
                                                                             dupsWriter=dupWriter,
                                                                             seqIDs=spec_seq_dict,
                                                                             spIDs=ogSet.SpeciesDict(),
                                                                             all_stride_dup_genes=all_stride_dup_genes,
                                                                             qNoRecon=qNoRecon)
            qContainsSuspectGenes = len(suspect_genes) > 0
            if qContainsSuspectGenes and not qSuspectDirsReady:
                # First orthogroup with suspect genes: set up the output files
                qSuspectDirsReady = True
                dSuspectGenes = files.FileHandler.GetSuspectGenesDir()
                dSuspectOrthologues = files.FileHandler.GetPutativeXenelogsDir()
                for iHeader in xrange(nspecies):
                    headerName = speciesDict[str(speciesIDs[iHeader])]
                    with open(dSuspectOrthologues + '%s.tsv' % headerName, 'wb') as xenFile:
                        xenWriter = csv.writer(xenFile, delimiter="\t")
                        xenWriter.writerow(("Orthogroup", headerName, "Other"))
            for index0 in xrange(nspecies):
                strsp0 = species[index0]
                genePrefix = strsp0 + "_"
                these_genes = [g for g in suspect_genes if g.startswith(genePrefix)]
                if these_genes:
                    with open(dSuspectGenes + speciesDict[strsp0] + ".txt", 'ab') as suspectFile:
                        geneNames = [SequenceDict[g] for g in these_genes]
                        suspectFile.write("\n".join(geneNames) + "\n")
            # don't relabel nodes, they've already been done
            renamedTreeFN = reconTreesRenamedDir + "OG%07d_tree.txt" % iog
            util.RenameTreeTaxa(recon_tree, renamedTreeFN, spec_seq_dict, qSupport=False, qFixNegatives=True)
            if iog % reportEvery == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            nOrthologues_SpPair += AppendOrthologuesToFiles([(iog, orthologues)],
                                                            speciesDict,
                                                            ogSet.speciesToUse,
                                                            SequenceDict,
                                                            dResultsOrthologues,
                                                            qContainsSuspectGenes)
    return nOrthologues_SpPair
def OrthologuesWorkflow(speciesToUse, nSpAll, tree_options, msa_method, tree_method, recon_method, nHighParallel, nLowParrallel, qDoubleBlast, qAddSpeciesToIDs, userSpeciesTree = None, qStopAfterSeqs = False, qStopAfterAlign = False, qStopAfterTrees = False, qMSA = False, qPhyldog = False, results_name = ""):
    """
    Run the orthologue inference workflow from gene trees to results files.

    1. Setup:
        - ogSet, directories
        - DendroBLASTTrees - object
    2. DendroBLAST:
        - read scores
        - RunAnalysis: Get distance matrices, do trees
    3. Root species tree
    4. Reconciliation/Orthologues
    5. Clean up

    Variables:
        - ogSet - all the relevant information about the orthogroups, species etc.

    Arguments (as used in this function):
        - speciesToUse, nSpAll, qAddSpeciesToIDs - passed to OrthoGroupsSet
        - tree_options, msa_method, tree_method - passed to trees_msa.TreesForOrthogroups
        - recon_method - passed to ReconciliationAndOrthologues
        - nHighParallel - passed to the MSA/tree, phyldog, STRIDE and reconciliation steps
        - nLowParrallel, qDoubleBlast - passed to DendroBLASTTrees
        - userSpeciesTree - if supplied it is converted to the IDs format and
          used as the (already rooted) species tree instead of running STRIDE
        - qStopAfterSeqs, qStopAfterAlign - only examined on the MSA/Phyldog
          route; return early with a message listing the output directories
        - qStopAfterTrees - return after the species tree step, before reconciliation
        - qMSA, qPhyldog - either one selects the MSA route, otherwise DendroBLAST
        - results_name - passed to MakeResultsDirectory2

    Returns a string describing where the results were written.
    """
    ogSet = OrthoGroupsSet(files.FileHandler.GetWorkingDirectory1_Read(), speciesToUse, nSpAll, qAddSpeciesToIDs, idExtractor = util.FirstWordExtractor)
    tree_generation_method = "msa" if qMSA or qPhyldog else "dendroblast"
    stop_after = "seqs" if qStopAfterSeqs else "align" if qStopAfterAlign else ""
    files.FileHandler.MakeResultsDirectory2(tree_generation_method, stop_after, results_name)
    """ === 1 === ust = UserSpeciesTree MSA: Sequences Alignments GeneTrees db SpeciesTree Phyldog: Sequences Alignments GeneTrees db SpeciesTree Dendroblast: DistanceMatrices GeneTrees db SpeciesTree MSA (ust): Sequences Alignments GeneTrees db Phyldog (ust): Sequences Alignments GeneTrees db Dendroblast (ust): DistanceMatrices GeneTrees db """
    # Hard-wired off: the db.SpeciesTreeOnly() call guarded by this flag below never runs
    qDB_SpeciesTree = False
    if userSpeciesTree:
        util.PrintUnderline("Using user-supplied species tree")
        spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        ConvertUserSpeciesTree(userSpeciesTree, ogSet.SpeciesDict(), spTreeFN_ids)
    if qMSA or qPhyldog:
        # --- MSA route: sequences -> alignments -> gene trees (-> species tree) ---
        qLessThanFourSpecies = len(ogSet.seqsInfo.speciesToUse) < 4
        treeGen = trees_msa.TreesForOrthogroups(tree_options, msa_method, tree_method)
        if (not userSpeciesTree) and qLessThanFourSpecies:
            # Two or three species: the species tree is written directly, not inferred
            spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
            WriteSpeciesTreeIDs_TwoThree(ogSet.seqsInfo.speciesToUse, spTreeFN_ids)
            util.RenameTreeTaxa(spTreeFN_ids, files.FileHandler.GetSpeciesTreeUnrootedFN(True), ogSet.SpeciesDict(), qSupport=False, qFixNegatives=True)
        qDoMSASpeciesTree = (not qLessThanFourSpecies) and (not userSpeciesTree)
        util.PrintTime("Starting MSA/Trees")
        # With phyldog the tree step is told to stop after the alignments
        seqs_alignments_dirs = treeGen.DoTrees(ogSet.OGs(qInclAll=True), ogSet.OrthogroupMatrix(), ogSet.Spec_SeqDict(), ogSet.SpeciesDict(), ogSet.speciesToUse, nHighParallel, qStopAfterSeqs, qStopAfterAlign or qPhyldog, qDoSpeciesTree=qDoMSASpeciesTree)
        util.PrintTime("Done MSA/Trees")
        if qDoMSASpeciesTree:
            spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        if qStopAfterSeqs:
            print("")
            return ("\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0])
        elif qStopAfterAlign:
            print("")
            st = "\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0]
            st += "\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1]
            return st
        # db is still needed on this route: db.ogSet is used for the renaming/reconciliation below
        db = DendroBLASTTrees(ogSet, nLowParrallel, qDoubleBlast)
        if qDB_SpeciesTree and not userSpeciesTree and not qLessThanFourSpecies:
            util.PrintUnderline("Inferring species tree (calculating gene distances)")
            print("Loading BLAST scores")
            spTreeFN_ids = db.SpeciesTreeOnly()
        if qPhyldog:
            # util.PrintTime("Do species tree for phyldog")
            # spTreeFN_ids, spTreeUnrootedFN = db.SpeciesTreeOnly()
            if userSpeciesTree:
                # NOTE(review): userSpeciesTree is rebound to ConvertUserSpeciesTree's
                # return value here, and later `if userSpeciesTree` tests see that
                # value - confirm what ConvertUserSpeciesTree returns.
                userSpeciesTree = ConvertUserSpeciesTree(userSpeciesTree, ogSet.SpeciesDict(), files.FileHandler.GetSpeciesTreeUnrootedFN())
            util.PrintTime("Starting phyldog")
            species_tree_ids_labelled_phyldog = wrapper_phyldog.RunPhyldogAnalysis(files.FileHandler.GetPhyldogWorkingDirectory(), ogSet.OGs(), speciesToUse, nHighParallel)
    else:
        # --- DendroBLAST route: distance matrices -> gene trees -> species tree ---
        db = DendroBLASTTrees(ogSet, nLowParrallel, qDoubleBlast)
        # NOTE(review): spTreeFN_ids is rebound here even when a user species tree
        # was converted above - confirm RunAnalysis does not replace the user's tree.
        spTreeFN_ids, qSTAG = db.RunAnalysis()
    files.FileHandler.LogWorkingDirectoryTrees()
    # qSTAG is only bound on the DendroBLAST route; the condition keeps it from being read otherwise
    qSpeciesTreeSupports = False if (userSpeciesTree or qMSA or qPhyldog) else qSTAG
    """ SpeciesTree spTreeFN_ids, or equivalently FileHandler.GetSpeciesTreeUnrootedFN() in all cases (user, inferred etc) Thus, we always have the species tree ids format With phyldog, we also have 
species_tree_ids_labelled_phyldog - with the node labels given by phyldog """
    """ === 2 === Check can continue with analysis """
    # if len(ogSet.speciesToUse) < 4:
    # print("ERROR: Not enough species to infer species tree")
    # util.Fail()
    """ === 3 === MSA: RootSpeciesTree Phyldog: RootSpeciesTree Dendroblast: RootSpeciesTree MSA (ust): ConvertSpeciesTreeIDs Phyldog (ust): ConvertSpeciesTreeIDs Dendroblast (ust): ConvertSpeciesTreeIDs """
    # Every branch binds rootedSpeciesTreeFN (list of filenames), roots, qMultiple
    # and all_stride_dup_genes; only the STRIDE branch binds spDict.
    if qPhyldog:
        rootedSpeciesTreeFN = [species_tree_ids_labelled_phyldog]
        roots = [None]
        qMultiple = False
        all_stride_dup_genes = None
    elif userSpeciesTree:
        rootedSpeciesTreeFN = [spTreeFN_ids]
        roots = [None]
        qMultiple = False
        all_stride_dup_genes = None
    elif len(ogSet.seqsInfo.speciesToUse) == 2:
        hardcodeSpeciesTree = GetSpeciesTreeRoot_TwoTaxa(ogSet.seqsInfo.speciesToUse)
        rootedSpeciesTreeFN = [hardcodeSpeciesTree]
        roots = [None]
        qMultiple = False
        all_stride_dup_genes = None
    else:
        util.PrintUnderline("Best outgroup(s) for species tree")
        util.PrintTime("Starting STRIDE")
        roots, clusters_counter, rootedSpeciesTreeFN, nSupport, _, _, all_stride_dup_genes = stride.GetRoot(spTreeFN_ids, files.FileHandler.GetOGsTreeDir(), stride.GeneToSpecies_dash, nHighParallel, qWriteRootedTree=True)
        util.PrintTime("Done STRIDE")
        nAll = sum(clusters_counter.values())
        # Total count minus the supporting count: reported below as contradicting the best root(s)
        nFP_mp = nAll - nSupport
        # Only clusters whose key has more than one member are reported
        n_non_trivial = sum([v for k, v in clusters_counter.items() if len(k) > 1])
        if len(roots) > 1:
            print("Observed %d well-supported, non-terminal duplications. %d support the best roots and %d contradict them." % (n_non_trivial, n_non_trivial-nFP_mp, nFP_mp))
            print("Best outgroups for species tree:")
        else:
            print("Observed %d well-supported, non-terminal duplications. %d support the best root and %d contradict it." % (n_non_trivial, n_non_trivial-nFP_mp, nFP_mp))
            print("Best outgroup for species tree:")
        spDict = ogSet.SpeciesDict()
        for r in roots: print(" " + (", ".join([spDict[s] for s in r])) )
        qMultiple = len(roots) > 1
    # The first rooted tree is the one the reconciliation step reads
    shutil.copy(rootedSpeciesTreeFN[0], files.FileHandler.GetSpeciesTreeIDsRootedFN())
    """ SpeciesTree: We now have a list of rooted species trees: rootedSpeciesTreeFN (this should be recorded by the file handler) """
    if qStopAfterTrees:
        if userSpeciesTree:
            # User tree supplied: nothing to root or rename, just report the directories
            st = ""
            if qMSA:
                st += "\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0]
                st += "\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1]
            st += "\nGene trees:\n %s\n" % (files.FileHandler.GetResultsTreesDir())
            return st
        # otherwise, root species tree
        resultsSpeciesTrees = []
        for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            resultsSpeciesTrees.append(files.FileHandler.GetSpeciesTreeResultsFN(i, not qMultiple))
            util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qSupport=qSpeciesTreeSupports, qFixNegatives=True)
            labeled_tree_fn = files.FileHandler.GetSpeciesTreeResultsNodeLabelsFN()
            util.RenameTreeTaxa(speciesTree_fn, labeled_tree_fn, db.ogSet.SpeciesDict(), qSupport=False, qFixNegatives=True, label='N')
        files.FileHandler.CleanWorkingDir2()
        return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None, False)
    if qMultiple: util.PrintUnderline("\nMultiple potential species tree roots were identified, only one will be analyed.", True)
    # Only the first root / rooted tree is reconciled
    resultsSpeciesTrees = []
    i = 0
    r = roots[0]
    speciesTree_fn = rootedSpeciesTreeFN[0]
    util.PrintUnderline("Reconciling gene trees and species tree")
    resultsSpeciesTrees.append(files.FileHandler.GetSpeciesTreeResultsFN(0, True))
    # Same condition as reaching the STRIDE branch above, so spDict is bound here
    if (not userSpeciesTree) and (not qPhyldog) and len(ogSet.seqsInfo.speciesToUse) != 2:
        print("Outgroup: " + (", ".join([spDict[s] for s in r])))
    util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qSupport=qSpeciesTreeSupports, qFixNegatives=True)
    util.PrintTime("Starting Recon and orthologues")
    ReconciliationAndOrthologues(recon_method, db.ogSet, nHighParallel, i if qMultiple else None, all_stride_dup_genes=all_stride_dup_genes)
    util.PrintTime("Done Recon")
    if qMultiple:
        # Also write out every candidate rooted species tree
        for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            unanalysedSpeciesTree = files.FileHandler.GetSpeciesTreeResultsFN(i, False)
            util.RenameTreeTaxa(speciesTree_fn, unanalysedSpeciesTree, db.ogSet.SpeciesDict(), qSupport=qSpeciesTreeSupports, qFixNegatives=True, label='N')
    """ SpeciesTree: If it's been inferred, there is now at least one rooted results species trees: GetSpeciesTreeResultsFN() """
    files.FileHandler.CleanWorkingDir2()
    util.PrintUnderline("Writing results files", True)
    return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None)