def ReadAlignment(fn):
    """
    Read a FASTA-format multiple sequence alignment and return it wrapped in an MSA object.

    fn - path of the alignment file

    Each '>' line starts a new record; the accession is everything after the '>'.
    The following lines (right-stripped) are concatenated to form the sequence.
    Text before the first '>' line is ignored.

    All sequences must have the same length as the first one read; on a mismatch
    an error is printed and util.Fail() is called.
    """
    def _CheckedLength(seq, expected):
        # The first sequence defines the alignment length; later ones must match it.
        if expected is None:
            return len(seq)
        if len(seq) != expected:
            print("ERROR: Sequence length mismatch in MSA: %s & %d" % (expected, len(seq)))
            util.Fail()
        return expected

    msa = dict()
    accession = None
    length = None
    chunks = []
    # Text mode so that the str comparisons below also work on Python 3
    with open(fn, 'r') as infile:
        for line in infile:
            line = line.rstrip()
            if line.startswith(">"):
                if accession is not None:
                    seq = "".join(chunks)
                    length = _CheckedLength(seq, length)
                    msa[accession] = seq
                accession = line[1:]
                chunks = []
            else:
                chunks.append(line)
    # Flush the final record
    if accession is not None:
        seq = "".join(chunks)
        length = _CheckedLength(seq, length)
        msa[accession] = seq
    return MSA(msa)
def OrthologuesFromTrees(groupsDir, workingDir, nHighParallel, speciesTree_fn = None, pickleDir=None):
    """
    Just infer orthologues from existing trees, don't do any of the preceding steps.

    groupsDir - directory with orthogroups file in
    workingDir - orthologues 'WorkingDirectory'
    nHighParallel - passed through to ReconciliationAndOrthologues
    speciesTree_fn - None if not supplied, otherwise rooted tree using user species
                     names (not orthofinder IDs)
    pickleDir - passed through to ReconciliationAndOrthologues

    Returns a message string giving the new orthologues results directory.
    """
    treesDir = workingDir + "Trees_ids/"

    # Check species tree
    qUserSpTree = speciesTree_fn is not None
    if qUserSpTree:
        if not os.path.exists(speciesTree_fn):
            print("\nERROR: %s does not exist\n" % speciesTree_fn)
            util.Fail()
    else:
        # etc (only need to determine if unique)
        possibilities = ["SpeciesTree_ids_0_rooted.txt", "SpeciesTree_ids_1_rooted.txt", "SpeciesTree_user_ids.txt"]
        candidates = [treesDir + p for p in possibilities]
        present = [fn for fn in candidates if os.path.exists(fn)]
        if present:
            speciesTree_fn = present[-1]
        if len(present) == 0:
            print("\nERROR: There is a problem with the specified directory. The rooted species tree %s or %s is not present." % (possibilities[0], possibilities[2]))
            print("Please rectify the problem or alternatively use the -s option to specify the species tree to use.\n")
            util.Fail()
        if len(present) > 1:
            print("\nERROR: There is more than one rooted species tree in the specified directory structure. Please use the -s option to specify which species tree should be used\n")
            util.Fail()

    def TreePatIDs(iog):
        return workingDir + ("Trees_ids/OG%07d_tree_id.txt" % iog)

    reconTreesRenamedDir = workingDir + "Recon_Gene_Trees/"
    # for the Orthologues_Species/ directories
    resultsDir_new = util.CreateNewWorkingDirectory(workingDir + "../Orthologues" + "_")

    orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = util.GetOGsFile(groupsDir)
    speciesToUse, nSpAll = util.GetSpeciesToUse(orthofinderWorkingDir + "SpeciesIDs.txt")
    ogSet = OrthoGroupsSet(orthofinderWorkingDir, speciesToUse, nSpAll, clustersFilename_pairs, idExtractor = util.FirstWordExtractor)

    if qUserSpTree:
        # Validate the user's tree against the species names, then convert it to IDs
        speciesToUseNames = ogSet.SpeciesDict().values()
        CheckUserSpeciesTree(speciesTree_fn, speciesToUseNames)
        speciesTree_fn = ConvertUserSpeciesTree(treesDir, speciesTree_fn, ogSet.SpeciesDict())

    util.PrintUnderline("Running Orthologue Prediction", True)
    util.PrintUnderline("Reconciling gene and species trees")
    ReconciliationAndOrthologues(TreePatIDs, ogSet, speciesTree_fn, workingDir, resultsDir_new, reconTreesRenamedDir, nHighParallel, pickleDir=pickleDir)
    util.PrintUnderline("Writing results files")
    CleanWorkingDir(workingDir)
    return "Species-by-species orthologues directory:\n %s\n" % resultsDir_new
def GetBLAST6Scores(seqsInfo, fileInfo, iSpecies, jSpecies, qExcludeSelfHits = True, sep = "_", qDoubleBlast=True):
    """
    Read the bit scores for hits between species iSpecies and jSpecies from a
    tab-separated BLAST results file into a sparse matrix.

    seqsInfo - object whose nSeqsPerSpecies[k] gives the number of sequences in species k
    fileInfo - object whose workingDir (with trailing separator) holds the Blast%d_%d.txt files
    iSpecies, jSpecies - species indices for the rows and columns of the result
    qExcludeSelfHits - if True and iSpecies == jSpecies, hits of a sequence to itself are skipped
    sep - separator between the species and sequence parts of an ID; the sequence
          index is the integer after the first occurrence
    qDoubleBlast - if False and iSpecies > jSpecies, the file for (jSpecies, iSpecies)
                   is read instead, with the query and hit columns swapped

    Returns a scipy.sparse lil_matrix of shape (nSeqs_i, nSeqs_j) holding, for each
    pair, the highest bit score (12th field) seen. "<file>.gz" is read in preference
    to "<file>" when it exists.

    Malformatted lines are reported on stderr and the exception is re-raised; a
    sequence index beyond the known number of sequences results in util.Fail().
    """
    qCheckForSelfHits = qExcludeSelfHits and (iSpecies == jSpecies)
    qRev = (not qDoubleBlast) and (iSpecies > jSpecies)
    if qRev:
        iQ, iH = 1, 0
        iSpeciesOpen, jSpeciesOpen = jSpecies, iSpecies
    else:
        iQ, iH = 0, 1
        iSpeciesOpen, jSpeciesOpen = iSpecies, jSpecies
    nSeqs_i = seqsInfo.nSeqsPerSpecies[iSpecies]
    nSeqs_j = seqsInfo.nSeqsPerSpecies[jSpecies]
    B = sparse.lil_matrix((nSeqs_i, nSeqs_j))
    row = ""
    fn = fileInfo.workingDir + "Blast%d_%d.txt" % (iSpeciesOpen, jSpeciesOpen)
    # csv.reader needs a text-mode file on Python 3 but a binary-mode one on Python 2
    read_mode = 'rb' if sys.version_info[0] < 3 else 'rt'
    try:
        with (gzip.open(fn + ".gz", read_mode) if os.path.exists(fn + ".gz") else open(fn, read_mode)) as blastfile:
            blastreader = csv.reader(blastfile, delimiter='\t')
            for row in blastreader:
                # Get hit and query IDs
                try:
                    sequence1ID = int(row[iQ].split(sep, 1)[1])
                    sequence2ID = int(row[iH].split(sep, 1)[1])
                except (IndexError, ValueError):
                    sys.stderr.write("\nERROR: Query or hit sequence ID in BLAST results file was missing or incorrectly formatted.\n")
                    raise
                # Get bit score for pair
                try:
                    score = float(row[11])
                except (IndexError, ValueError):
                    sys.stderr.write("\nERROR: 12th field in BLAST results file line should be the bit-score for the hit\n")
                    raise
                if qCheckForSelfHits and sequence1ID == sequence2ID:
                    continue
                # store bit score (keep the best one for each pair)
                try:
                    if score > B[sequence1ID, sequence2ID]:
                        B[sequence1ID, sequence2ID] = score
                except IndexError:
                    def _Ordinal(n):
                        return str(n) + ("th" if 4 <= n % 100 <= 20 else {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th"))
                    sys.stderr.write("\nERROR: Inconsistent input files.\n")
                    kSpecies, nSeqs_k, sequencekID = (iSpecies, nSeqs_i, sequence1ID) if sequence1ID >= nSeqs_i else (jSpecies, nSeqs_j, sequence2ID)
                    sys.stderr.write("Species%d.fa contains only %d sequences " % (kSpecies, nSeqs_k))
                    # Name the file that was actually opened (differs from i_j when reversed)
                    sys.stderr.write("but found a query/hit in the Blast%d_%d.txt for sequence %d_%d (i.e. %s sequence in species %d).\n" % (iSpeciesOpen, jSpeciesOpen, kSpecies, sequencekID, _Ordinal(sequencekID+1), kSpecies))
                    util.Fail()
    except Exception:
        sys.stderr.write("Malformatted line in %sBlast%d_%d.txt\nOffending line was:\n" % (fileInfo.workingDir, iSpeciesOpen, jSpeciesOpen))
        sys.stderr.write("\t".join(row) + "\n")
        raise
    return B
def _FindFromTrees(self, orthologuesDir, userSpeciesTree):
    """
    Set self.wd_trees and self.speciesTreeRootedIDsFN for a run that starts from existing trees.

    orthologuesDir - directory (with trailing separator) containing "WorkingDirectory/"
    userSpeciesTree - path of the species tree to use, or None to use the existing tree

    With userSpeciesTree == None, exactly one of the known rooted species tree files
    must exist in the working directory or its "Trees_ids/" subdirectory; otherwise
    an error is printed and util.Fail() is called.
    """
    print("\nFind from trees:")
    print((orthologuesDir, userSpeciesTree))
    self.wd_trees = orthologuesDir + "WorkingDirectory/"

    if userSpeciesTree is not None:
        if not os.path.exists(userSpeciesTree):
            print("\nERROR: %s does not exist\n" % userSpeciesTree)
            util.Fail()
        self.speciesTreeRootedIDsFN = userSpeciesTree
        return

    # Find species tree (only need to determine if unique)
    possibilities = [
        "SpeciesTree_ids_0_rooted.txt",
        "SpeciesTree_ids_1_rooted.txt",
        "SpeciesTree_user_ids.txt",
        "SpeciesTree_unrooted_0_rooted.txt",
        "STAG_SpeciesTree_ids_0_rooted.txt",
    ]
    searchDirs = [self.wd_trees, self.wd_trees + "Trees_ids/"]
    present = [d + p for p in possibilities for d in searchDirs if os.path.exists(d + p)]
    if len(present) == 0:
        print("\nERROR: There is a problem with the specified directory. The rooted species tree %s or %s is not present." % (possibilities[0], possibilities[2]))
        print("Please rectify the problem or alternatively use the -s option to specify the species tree to use.\n")
        util.Fail()
    if len(present) > 1:
        print("\nERROR: There is more than one rooted species tree in the specified directory structure. Please use the -s option to specify which species tree should be used\n")
        util.Fail()
    self.speciesTreeRootedIDsFN = present[-1]
def SequenceDict(self):
    """
    Return the ID -> name dictionary for the sequences.

    The extractor result is created on first use from self.seqIDsFN using
    self._extractor and cached in self.seqIDsEx. If the extractor raises a
    RuntimeError whose message starts with "ERROR" the program fails via
    util.Fail(); for any other RuntimeError the full accession line is used
    instead (util.FullAccession).
    """
    if self.seqIDsEx is None:
        try:
            self.seqIDsEx = self._extractor(self.seqIDsFN)
        except RuntimeError as error:
            # str(error) rather than error.message: the attribute does not exist on Python 3
            message = str(error)
            print(message)
            if message.startswith("ERROR"):
                util.Fail()
            else:
                print("Tried to use only the first part of the accession in order to list the sequences in each orthogroup\nmore concisely but these were not unique. The full accession line will be used instead.\n")
                self.seqIDsEx = util.FullAccession(self.seqIDsFN)
    return self.seqIDsEx.GetIDToNameDict()
def CheckUserSpeciesTree(speciesTreeFN, expSpecies):
    """
    Validate a user-supplied species tree; print the problems found and call
    util.Fail() if any check does not pass.

    speciesTreeFN - path of the species tree file
    expSpecies - iterable of the species names the tree is expected to contain

    Checks, in order: the file exists; leaf names in the tree are unique; every
    expected species is in the tree; expected names are unique; the tree has no
    additional species; the root has exactly two children (i.e. tree is rooted).
    """
    def _Duplicates(names):
        # (name, count) for every name occurring more than once; [] for empty input
        return [(sp, n) for sp, n in Counter(names).most_common() if n != 1]

    def _PrintDuplicates(duplicates):
        for sp, n in duplicates:
            print("Species '%s' appears %d times" % (sp, n))

    # File exists
    if not os.path.exists(speciesTreeFN):
        print("Species tree file does not exist: %s" % speciesTreeFN)
        util.Fail()
    # Species in tree are unique
    t = tree.Tree(speciesTreeFN)
    actSpecies = t.get_leaf_names()
    duplicates = _Duplicates(actSpecies)
    if duplicates:
        print("ERROR: Species names in species tree are not unique")
        _PrintDuplicates(duplicates)
        util.Fail()
    # expSpecies is traversed several times below, so take a copy in case it is a one-shot iterable
    expSpecies = list(expSpecies)
    # All required species are present
    actSpecies = set(actSpecies)
    ok = True
    for sp in expSpecies:
        if sp not in actSpecies:
            print("ERROR: '%s' is missing from species tree" % sp)
            ok = False
    # expected species are unique
    duplicates = _Duplicates(expSpecies)
    if duplicates:
        print("ERROR: Species names are not unique")
        _PrintDuplicates(duplicates)
        util.Fail()
    # No unexpected species in the tree
    expSpecies = set(expSpecies)
    for sp in actSpecies:
        if sp not in expSpecies:
            print("ERROR: Additional species '%s' in species tree" % sp)
            ok = False
    if not ok:
        util.Fail()
    # Tree is rooted
    if len(t.get_children()) != 2:
        print("ERROR: Species tree is not rooted")
        util.Fail()
def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, speciesToUse, nSpAll, clustersFilename_pairs, nProcesses):
    """
    Run the numbered orthologue-inference steps, printing a heading for each:
    check dependencies, calculate gene distances / trees (DendroBLASTTrees),
    root the species tree (rfd.GetRoot), then for every candidate root reconcile
    the gene trees with the species tree (RunDlcpar) and write orthologue lists.

    Requires at least 4 species, otherwise util.Fail() is called. Exits via
    sys.exit() if CanRunDependencies reports that dependencies are missing.

    Returns the string from GetResultsFilesString for the rooted species tree file(s).
    """
    ogSet = OrthoGroupsSet(orthofinderWorkingDir, speciesToUse, nSpAll, clustersFilename_pairs, idExtractor=util.FirstWordExtractor)
    if len(ogSet.speciesToUse) < 4:
        print("ERROR: Not enough species to infer species tree")
        util.Fail()

    print("\n1. Checking required programs are installed")
    print("-------------------------------------------")
    if not CanRunDependencies(orthofinderWorkingDir):
        print("Orthogroups have been inferred but the dependencies for inferring gene trees and\northologues have not been met. Please review previous messages for more information.")
        sys.exit()

    print("\n2. Calculating gene distances")
    print("-----------------------------")
    resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_")
    db = DendroBLASTTrees(ogSet, resultsDir, nProcesses)
    db.ReadAndPickle()
    nOGs, D, spPairs, spTreeFN_ids = db.RunAnalysis()

    print("\n4. Best outgroup(s) for species tree")
    print("------------------------------------")
    spDict = ogSet.SpeciesDict()
    treesDir = os.path.split(db.treesPatIDs)[0] + "/"
    roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(spTreeFN_ids, treesDir, rfd.GeneToSpecies_dash, nProcesses, treeFmt=1)
    qMultiple = len(roots) > 1
    duplicationCounts = (len(clusters), nSupport, len(clusters) - nSupport)
    if qMultiple:
        print("Observed %d duplications. %d support the best roots and %d contradict them." % duplicationCounts)
        print("Best outgroups for species tree:")
    else:
        print("Observed %d duplications. %d support the best root and %d contradict it." % duplicationCounts)
        print("Best outgroup for species tree:")
    for r in roots:
        print(" " + ", ".join([spDict[s] for s in r]))

    if qMultiple:
        print("\nAnalysing each of the potential species tree roots.")
    resultsSpeciesTrees = []
    for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        # Step labels get a "-<i>" suffix (and longer underline) when there are several roots
        stepSuffix = ("-%d" % i) if qMultiple else ""
        underlineExtra = "--" if qMultiple else ""
        if qMultiple:
            resultsDir_new = resultsDir + "Orthologues_using_outgroup_%d/" % i
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees_using_outgroup_%d/" % i
            resultsSpeciesTrees.append(resultsDir_new + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
        else:
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
        os.mkdir(resultsDir_new)
        util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)

        print("\n5%s. Reconciling gene and species trees" % stepSuffix)
        print("-------------------------------------" + underlineExtra)
        print("Outgroup: " + ", ".join([spDict[s] for s in r]))
        dlcparResultsDir = RunDlcpar(db.treesPatIDs, ogSet, nOGs, speciesTree_fn, db.workingDir)
        os.mkdir(reconTreesRenamedDir)
        for iog in xrange(len(db.ogSet.OGs())):
            util.RenameTreeTaxa(dlcparResultsDir + "OG%07d_tree_id.locus.tree" % iog,
                                reconTreesRenamedDir + "OG%07d_tree.txt" % iog,
                                db.ogSet.Spec_SeqDict(),
                                qFixNegatives=False,
                                inFormat=8)

        # Orthologue lists
        print("\n6%s. Inferring orthologues from gene trees" % stepSuffix)
        print("----------------------------------------" + underlineExtra)
        pt.get_orthologue_lists(ogSet, resultsDir_new, dlcparResultsDir, db.workingDir)

    CleanWorkingDir(db)
    print("\n7. Writing results files")
    print("------------------------")
    return GetResultsFilesString(resultsSpeciesTrees)
if __name__ == "__main__": if len(sys.argv) < 2 or sys.argv[1] == "--help" or sys.argv[1] == "-h": PrintHelp() sys.exit() # get arguments userDir = None nProcesses = None args = sys.argv[1:] while len(args) != 0: arg = args.pop(0) if arg == "-t" or arg == "--threads": if len(args) == 0: print("Missing option for command line argument -t") util.Fail() arg = args.pop(0) try: nProcesses = int(arg) except: print("Incorrect argument for number of threads: %s" % arg) util.Fail() else: userDir = arg # Check arguments print("0. Getting Orthologues") print("----------------------") if nProcesses == None: print( """\nNumber of parallel processes has not been specified, will use the default value.
def GetRoots(tree, species_tree_rooted, GeneToSpecies):
    """
    Find candidate root positions in a gene tree given a rooted species tree.

    tree - gene tree; iterating it yields its leaves
    species_tree_rooted - species tree whose root must have exactly two children
    GeneToSpecies - function mapping a gene (leaf) name to its species name

    Returns (roots, fail, GeneMap):
      roots - list of the gene tree nodes identified as root positions
      fail - number of leaves whose walk up the tree ended at a non-root node
             that does not have exactly two children
      GeneMap - the GeneMap of the RootMap built from the two species sets
    If all genes come from a single species, returns ([], 0, None).
    """
    species = set([GeneToSpecies(g) for g in tree.get_leaf_names()])
    if len(species) == 1:
        return [], 0, None

    rootChildren = species_tree_rooted.get_children()
    if len(rootChildren) != 2:
        print("ERROR: Species tree is not rooted")
        util.Fail()
    n1, n2 = rootChildren
    t1 = set(n1.get_leaf_names())
    t2 = set(n2.get_leaf_names())
    have1 = len(species.intersection(t1)) != 0
    have2 = len(species.intersection(t2)) != 0
    while not (have1 and have2):
        # Doesn't contain outgroup, step down in species tree until it does
        n = n1 if have1 else n2
        n1, n2 = n.get_children()
        t1 = n1.get_leaf_names()
        t2 = n2.get_leaf_names()
        have1 = len(species.intersection(t1)) != 0
        have2 = len(species.intersection(t2)) != 0

    n1, n2 = species_tree_rooted.get_children()
    root_mapper = RootMap(t1, t2, GeneToSpecies)
    GeneMap = root_mapper.GeneMap
    # NOTE(review): presumably this is what sets the sp_down / sp_up attributes read below - confirm
    StoreSpeciesSets(tree, GeneMap)

    found = set()
    TF = set([True, False])
    TFfr = frozenset([True, False])
    Tfr = frozenset([True])
    Ffr = frozenset([False])
    fail = 0
    for m in tree:
        # Walk up from the leaf until reaching the root or a node with mixed sp_down
        n = m.up
        while not n.is_root() and n.sp_down != TF:
            m = n
            n = m.up
        if n.sp_down != TF:
            continue
        children = n.get_children()
        if n.is_root():
            colour = m.sp_down
            if any([x.sp_down != colour and len(x.sp_down) == 1 for x in children]):
                comb = Counter([frozenset(x.sp_down) for x in children])
                # case 0
                if comb[TFfr] == 0:
                    # case 0A - one of the branches is the root
                    for c in children:
                        if sum([c.sp_down == x.sp_down for x in children]) == 1:
                            # only holds for one of them
                            found.add(c)
                            break
                elif comb[TFfr] == 1 and (comb[Tfr] == 2 or comb[Ffr] == 2):
                    # case 0B - one mixed branch, two identical True/False branches
                    # we don't know this is the division, stepping down in the mixed branch
                    # might still be all same as the single state ones
                    # we'll find this division while walking up the tree
                    pass
                elif comb[TFfr] == 1 and comb[Tfr] == 1:
                    # case 0C - one mixed branch, one True & one False
                    found.add([c for c in children if c.sp_down == TF][0])
                else:
                    # case 0D - two mixed branches
                    # will find the transition while walking up the tree
                    pass
        elif len(children) == 2:
            c1, c2 = children
            single_state = c1.sp_down if len(c1.sp_down) == 1 else c2.sp_down
            if len(c1.sp_down) == 1 and len(c2.sp_down) == 1:
                # Case 1 - Both single state
                if len(n.sp_up) == 1:
                    # Case 1A - 3rd clade also single state
                    # Root is the bipartition separating True from False
                    found.add(c1 if n.sp_up == c2.sp_down else c2)
                else:
                    # Case 1B - 3rd clade is mixed
                    found.add(n)
            elif single_state != n.sp_up:
                # Case 2A - only one is single state and it's not the same as the 3rd clade
                found.add(c1 if len(c1.sp_down) == 1 else c2)
            # Otherwise the single state matches the 3rd clade: root is in the mixed
            # clade and will be found while walking up that
        else:
            fail += 1
    return list(found), fail, GeneMap
def DoTrees(self, ogs, ogMatrix, idDict, speciesIdDict, nProcesses, qStopAfterSeqs, qStopAfterAlignments, qDoSpeciesTree):
    """
    Write orthogroup sequence files and, unless stopped early, infer the multiple
    sequence alignments and gene trees (plus the concatenated-alignment species
    tree when qDoSpeciesTree is set).

    ogs - the orthogroups
    ogMatrix - passed to DetermineOrthogroupsForSpeciesTree
    idDict - ID -> accession dictionary; NOTE: updated in place with speciesIdDict
    speciesIdDict - species ID -> name dictionary
    nProcesses - number of parallel processes for the alignment/tree commands
    qStopAfterSeqs - return after writing the sequence files
    qStopAfterAlignments - return after inferring (and renaming) the alignments
    qDoSpeciesTree - also build the concatenated alignment and species tree

    Returns the list of results directories created: all of those created so far
    when qStopAfterSeqs, otherwise the first two.
    Calls util.Fail() if the species tree was requested but its file is not produced.
    """
    idDict.update(speciesIdDict)  # same code will then also convert concatenated alignment for species tree
    # 0. Create the output directories (ID and accession versions)
    resultsDirsFullPath = []
    for fn in [self.GetFastaFilename, self.GetAlignmentFilename, self.GetTreeFilename]:
        for qIDs in [True, False]:
            d = os.path.split(fn(0, not qIDs))[0]
            if not os.path.exists(d):
                os.mkdir(d)
            if not qIDs:
                resultsDirsFullPath.append(d)
        if qStopAfterSeqs:
            break
        if qStopAfterAlignments and fn == self.GetAlignmentFilename:
            break

    # 1. Sequence files
    fastaWriter = FastaWriter(self.ogsWorkingDir)
    self.WriteFastaFiles(fastaWriter, ogs, idDict)
    if qStopAfterSeqs:
        return resultsDirsFullPath

    # 3. Get OGs to use for species tree
    if qDoSpeciesTree:
        iOgsForSpeciesTree, fSingleCopy = DetermineOrthogroupsForSpeciesTree(ogMatrix)
        concatenated_algn_fn = os.path.split(self.GetAlignmentFilename(0))[0] + "/SpeciesTreeAlignment.fa"
        accession_concatenated_algn_fn = os.path.split(self.GetAlignmentFilename(0, True))[0] + "/SpeciesTreeAlignment.fa"
    else:
        iOgsForSpeciesTree = []
    alignCommands_and_filenames = self.GetAlignmentCommandsAndNewFilenames(ogs)
    alignmentFilesToUse = [self.GetAlignmentFilename(i) for i, _ in enumerate(alignCommands_and_filenames)]
    accessionAlignmentFNs = [self.GetAlignmentFilename(i, True) for i, _ in enumerate(alignmentFilesToUse)]

    if qStopAfterAlignments:
        util.PrintUnderline("Inferring multiple sequence alignments")
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, alignCommands_and_filenames, False)
        # The concatenated alignment only exists when a species tree was requested
        if qDoSpeciesTree:
            CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename, concatenated_algn_fn, fSingleCopy)
        # ids -> accessions
        idAlignmentFNs = list(alignmentFilesToUse)
        if qDoSpeciesTree:
            idAlignmentFNs.append(concatenated_algn_fn)
            accessionAlignmentFNs.append(accession_concatenated_algn_fn)
        self.RenameAlignmentTaxa(idAlignmentFNs, accessionAlignmentFNs, idDict)
        return resultsDirsFullPath[:2]

    # Otherwise, alignments and trees
    # Strategy is
    # 1. Do alignments (and trees) required for species tree
    # 2. Create concatenated alignment
    # 3. Create second list of commands [speciestree] + [remaining alignments and trees]
    treeCommands_and_filenames = self.GetTreeCommands(alignmentFilesToUse, ogs)
    commands_and_filenames = []
    if qDoSpeciesTree:
        print("Species tree: Using %d orthogroups with minimum of %0.1f%% of species having single-copy genes in any orthogroup" % (len(iOgsForSpeciesTree), 100.*fSingleCopy))
        util.PrintUnderline("Inferring multiple sequence alignments for species tree")
        # Index 0 is only used to get the trees directory (as for the alignments above);
        # the original relied on a loop variable leaked from a list comprehension
        speciesTreeFN_ids = os.path.split(self.GetTreeFilename(0))[0] + "/SpeciesTree_unrooted.txt"
        # Do required alignments and trees
        for i in iOgsForSpeciesTree:
            commands_and_filenames.append([alignCommands_and_filenames[i], treeCommands_and_filenames[i]])
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)
        CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename, concatenated_algn_fn, fSingleCopy)
        # Add species tree to list of commands to run
        commands_and_filenames = [self.program_caller.GetTreeCommands(self.tree_program, [concatenated_algn_fn], [speciesTreeFN_ids], ["SpeciesTree"])]
        util.PrintUnderline("Inferring remaining multiple sequence alignments and gene trees")
    else:
        util.PrintUnderline("Inferring multiple sequence alignments and gene trees")

    # Now continue as before, skipping orthogroups already done for the species tree
    iOgsForSpeciesTree = set(iOgsForSpeciesTree)
    nTrees = len(treeCommands_and_filenames)
    for i in range(nTrees):
        if i in iOgsForSpeciesTree:
            continue
        commands_and_filenames.append([alignCommands_and_filenames[i], treeCommands_and_filenames[i]])
    # Orthogroups that get an alignment but no tree
    for i in range(nTrees, len(alignCommands_and_filenames)):
        if i in iOgsForSpeciesTree:
            continue
        commands_and_filenames.append([alignCommands_and_filenames[i]])
    pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)

    # Convert ids to accessions (adding the concatenated alignment if there is one)
    if qDoSpeciesTree:
        alignmentFilesToUse.append(concatenated_algn_fn)
        accessionAlignmentFNs.append(accession_concatenated_algn_fn)
    self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)
    if qDoSpeciesTree:
        if os.path.exists(speciesTreeFN_ids):
            util.RenameTreeTaxa(speciesTreeFN_ids, self.workingDir + "SpeciesTree_unrooted.txt", idDict, qFixNegatives=True)
        else:
            print("ERROR: Species tree inference failed")
            util.Fail()
    for i in range(nTrees):
        if os.path.exists(self.GetTreeFilename(i)):
            util.RenameTreeTaxa(self.GetTreeFilename(i), self.GetTreeFilename(i, True), idDict, qFixNegatives=True)
    return resultsDirsFullPath[:2]
def OrthologuesWorkflow(workingDir_ogs, orthofinderResultsDir, speciesToUse, nSpAll, clustersFilename_pairs, tree_options, msa_method, tree_method, nHighParallel, nLowParrallel, userSpeciesTree = None, qStopAfterSeqs = False, qStopAfterAlign = False, qStopAfterTrees = False, qMSA = False, qPhyldog = False, pickleDir=None):
    """
    1. Setup:
        - ogSet, directories
        - DendroBLASTTrees - object
    2. DendroBLAST:
        - read scores
        - RunAnalysis: Get distance matrices, do trees
    3. Root species tree
    4. Reconciliation/Orthologues
    5. Clean up

    Variables:
    - ogSet - all the relevant information about the orthogroups, species etc.

    Returns a string describing the results (or the stage the workflow stopped at).
    """
    ogSet = OrthoGroupsSet(workingDir_ogs, speciesToUse, nSpAll, clustersFilename_pairs, idExtractor = util.FirstWordExtractor, pickleDir=pickleDir)

    # Class that is going to run the analysis needs to check the dependencies
    resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_")

    # === 1 === (ust = UserSpeciesTree)
    # MSA:               Sequences    Alignments         GeneTrees    db    SpeciesTree
    # Phyldog:           Sequences    Alignments         GeneTrees    db    SpeciesTree
    # Dendroblast:                    DistanceMatrices   GeneTrees    db    SpeciesTree
    # MSA (ust):         Sequences    Alignments         GeneTrees    db
    # Phyldog (ust):     Sequences    Alignments         GeneTrees    db
    # Dendroblast (ust):              DistanceMatrices   GeneTrees    db
    if qMSA or qPhyldog:
        treeGen = msa.TreesForOrthogroups(tree_options, msa_method, tree_method, resultsDir, workingDir_ogs)
        seqs_alignments_dirs = treeGen.DoTrees(ogSet.OGs(qInclAll=True), ogSet.Spec_SeqDict(), nHighParallel, qStopAfterSeqs, qStopAfterAlign or qPhyldog)
        if qStopAfterSeqs:
            print("")
            return ("\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0])
        if qStopAfterAlign:
            print("")
            seqsText = "\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0]
            alignText = "\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1]
            return seqsText + alignText
        db = DendroBLASTTrees(ogSet, resultsDir, nLowParrallel)
        if not userSpeciesTree:
            util.PrintUnderline("Inferring species tree (calculating gene distances)")
            print("Loading BLAST scores")
            db.ReadAndPickle()
            spTreeFN_ids, spTreeUnrootedFN = db.SpeciesTreeOnly()
        if qPhyldog:
            trees_from_phyldog.RunPhyldogAnalysis(resultsDir + "WorkingDirectory/phyldog/", ogSet.OGs(), speciesToUse)
            return "Running Phyldog" + "\n".join(seqs_alignments_dirs)
    else:
        util.PrintUnderline("Calculating gene distances")
        db = DendroBLASTTrees(ogSet, resultsDir, nLowParrallel)
        db.ReadAndPickle()
        nOGs, D, spTreeFN_ids, spTreeUnrootedFN = db.RunAnalysis()

    # === 2 === Check can continue with analysis
    if len(ogSet.speciesToUse) < 4:
        print("ERROR: Not enough species to infer species tree")
        util.Fail()

    # === 3 ===
    # Without a user species tree: root the inferred species tree
    # With a user species tree: convert its species names to IDs
    if userSpeciesTree:
        util.PrintUnderline("Using user-supplied species tree")
        userSpeciesTree = ConvertUserSpeciesTree(db.workingDir + "Trees_ids/", userSpeciesTree, ogSet.SpeciesDict())
        rootedSpeciesTreeFN = [userSpeciesTree]
        roots = [None]
        qMultiple = False
    else:
        util.PrintUnderline("Best outgroup(s) for species tree")
        spDict = ogSet.SpeciesDict()
        roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(spTreeFN_ids, os.path.split(db.TreeFilename_IDs(0))[0] + "/", rfd.GeneToSpecies_dash, nHighParallel, treeFmt = 1)
        duplicationCounts = (len(clusters), nSupport, len(clusters) - nSupport)
        if len(roots) > 1:
            print("Observed %d duplications. %d support the best roots and %d contradict them." % duplicationCounts)
            print("Best outgroups for species tree:")
        else:
            print("Observed %d duplications. %d support the best root and %d contradict it." % duplicationCounts)
            print("Best outgroup for species tree:")
        for r in roots:
            print(" " + (", ".join([spDict[s] for s in r])))
        qMultiple = len(roots) > 1

    if qStopAfterTrees:
        if userSpeciesTree:
            parts = []
            if qMSA:
                parts.append("\nSequences for orthogroups:\n %s\n" % seqs_alignments_dirs[0])
                parts.append("\nMultiple sequence alignments:\n %s\n" % seqs_alignments_dirs[1])
            parts.append("\nGene trees:\n %s\n" % (resultsDir + "Gene_Trees/"))
            return "".join(parts)
        # otherwise, root species tree
        resultsSpeciesTrees = []
        for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            if len(roots) == 1:
                resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
            else:
                resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
            util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)
        db.DeleteBlastMatrices()
        CleanWorkingDir(db.workingDir)
        return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None, False)

    # === 4 === Reconciliation and orthologues, once per candidate root
    if qMultiple:
        util.PrintUnderline("\nAnalysing each of the potential species tree roots", True)
    resultsSpeciesTrees = []
    for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        util.PrintUnderline("Reconciling gene trees and species tree" + (" (root %d)"%i if qMultiple else ""))
        if qMultiple:
            resultsDir_new = resultsDir + "Orthologues_using_outgroup_%d/" % i
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees_using_outgroup_%d/" % i
            resultsSpeciesTrees.append(resultsDir_new + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
            print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        else:
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
            # There is no inferred outgroup to report for a user-supplied tree
            if not userSpeciesTree:
                print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        os.mkdir(resultsDir_new)
        util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)
        ReconciliationAndOrthologues(db.TreeFilename_IDs, db.ogSet, speciesTree_fn, db.workingDir, resultsDir_new, reconTreesRenamedDir, nHighParallel, i if qMultiple else None, pickleDir=pickleDir)

    # === 5 === Clean up
    db.DeleteBlastMatrices()
    CleanWorkingDir(db.workingDir)
    util.PrintUnderline("Writing results files", True)
    return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None)
def _GetOGsFile(self, userArg):
    """
    returns the WorkingDirectory, ResultsDirectory and clusters_id_pairs filename

    userArg - either a clusters_OrthoFinder_*.txt_id_pairs.txt file or a directory
              from an OrthoFinder run

    If a file is given, its directory is used as both the working and results
    directory. If a directory is given, the working directory is that directory
    or its "WorkingDirectory" subdirectory (whichever self._IsWorkingDirectory
    accepts) and exactly one clusters file and one orthogroups file must be found.
    Any problem is reported and util.Fail() is called.
    """
    # None and "" both mean that nothing was specified
    if not userArg:
        print("ERROR: orthofinder_results_directory has not been specified")
        util.Fail()

    if os.path.isfile(userArg):
        # user has specified specific results file
        parentDir, fn = os.path.split(userArg)
        if ("clusters_OrthoFinder_" not in fn) or ("txt_id_pairs.txt" not in fn):
            print("ERROR:\n %s\nis neither a directory or a clusters_OrthoFinder_*.txt_id_pairs.txt file." % userArg)
            util.Fail()
        # A bare filename has no directory part: use the current directory, not the filesystem root
        orthofinderWorkingDir = (parentDir if parentDir else os.curdir) + os.sep
        if not self._IsWorkingDirectory(orthofinderWorkingDir):
            print("ERROR: cannot find files from OrthoFinder run in directory:\n %s" % orthofinderWorkingDir)
            util.Fail()
        print("\nUsing orthogroups in file:\n %s" % userArg)
        return orthofinderWorkingDir, orthofinderWorkingDir, userArg

    if userArg[-1] != os.path.sep:
        userArg += os.path.sep

    # find required files
    orthofinderWorkingDir = userArg
    if not self._IsWorkingDirectory(orthofinderWorkingDir):
        orthofinderWorkingDir = userArg + "WorkingDirectory" + os.sep
        if not self._IsWorkingDirectory(orthofinderWorkingDir):
            print("ERROR: cannot find files from OrthoFinder run in directory:\n %s\nor\n %s\n" % (userArg, orthofinderWorkingDir))
            util.Fail()

    # identify orthogroups file
    orthogroupPatterns = ["OrthologousGroups*.txt", "Orthogroups*.txt"]
    clustersFiles = glob.glob(orthofinderWorkingDir + "clusters_OrthoFinder_*.txt_id_pairs.txt")
    searchDirs = [orthofinderWorkingDir]
    if orthofinderWorkingDir != userArg:
        searchDirs.append(userArg)
    orthogroupFiles = []
    for d in searchDirs:
        for pattern in orthogroupPatterns:
            orthogroupFiles += glob.glob(d + pattern)
    # User may have specified a WorkingDirectory and results could be in directory above
    if len(orthogroupFiles) < len(clustersFiles):
        for pattern in orthogroupPatterns:
            orthogroupFiles += glob.glob(userArg + ".." + os.sep + pattern)
    clustersFiles = sorted(clustersFiles)
    orthogroupFiles = sorted(orthogroupFiles)

    if len(clustersFiles) > 1 or len(orthogroupFiles) > 1:
        print("ERROR: Results from multiple OrthoFinder runs found\n")
        print("Tab-delimiter Orthogroups*.txt/OrthologousGroups*.txt files:")
        for fn in orthogroupFiles:
            print(" " + fn)
        print("With corresponding cluster files:")
        for fn in clustersFiles:
            print(" " + fn)
        print("\nPlease run with only one set of results in directories or specifiy the specific clusters_OrthoFinder_*.txt_id_pairs.txt file on the command line")
        util.Fail()
    if len(clustersFiles) != 1 or len(orthogroupFiles) != 1:
        print("ERROR: Results not found in <orthofinder_results_directory> or <orthofinder_results_directory>/WorkingDirectory")
        print("\nCould not find:\n Orthogroups*.txt/OrthologousGroups*.txt\nor\n clusters_OrthoFinder_*.txt_id_pairs.txt")
        util.Fail()

    print("\nUsing orthogroups in file:\n %s" % orthogroupFiles[0])
    print("and corresponding clusters file:\n %s" % clustersFiles[0])
    return orthofinderWorkingDir, userArg, clustersFiles[0]
def _ProcessLog(self, logFN):
    """
    Get all relevant data from log file.
    Checks the paths saved do still exist.
    Should work with relative paths to allow directory to move.
    Other methods can then check that the data required for a particular run is available.

    logFN - path of the log file

    Sets, when the corresponding entry is present in the log:
      self.species_ids_lines - the lines following "Species used:" up to the next blank line
      self.wd_base_prev - self.GetWDBaseChain() of the "WorkingDirectory_Base: " directory
      self.clustersFilename_pairs - the "FN_Orthogroups: " file
      self.wd_trees, self.speciesTreeRootedIDsFN - from the "WorkingDirectory_Trees: " directory
    If a recorded path no longer exists, the same results/working directory names
    are tried relative to the directory above the log file; if that does not
    exist either, an error is printed and util.Fail() is called.
    """
    wd_base_str = "WorkingDirectory_Base: "
    wd_trees_str = "WorkingDirectory_Trees: "
    clusters_str = "FN_Orthogroups: "
    logDir = os.path.split(logFN)[0]
    with open(logFN, 'r') as infile:
        for line in infile:
            if line.startswith("Species used:"):
                self.species_ids_lines = ""
                # next(..., "") so that a list running to the end of the file ends cleanly
                line = next(infile, "")
                while line.rstrip() != "":
                    self.species_ids_lines += line
                    line = next(infile, "")
            if line.startswith(wd_base_str):
                wd_base_anchor = line.rstrip()[len(wd_base_str):]
                if not os.path.exists(wd_base_anchor):
                    # try to see if it's a relative directory to current one
                    path, d_wd = os.path.split(wd_base_anchor[:-1])
                    path, d_res = os.path.split(path)
                    wd_base_anchor = logDir + ("/../%s/%s/" % (d_res, d_wd))
                    if not os.path.exists(wd_base_anchor):
                        print("ERROR: Missing directory: %s" % wd_base_anchor)
                        util.Fail()
                self.wd_base_prev = self.GetWDBaseChain(wd_base_anchor)
            if line.startswith(clusters_str):
                clusters_fn_full_path = line.rstrip()[len(clusters_str):]
                self.clustersFilename_pairs = clusters_fn_full_path
                if not os.path.exists(self.clustersFilename_pairs):
                    # try to see if it's a relative directory to current one
                    path, clusters_fn = os.path.split(self.clustersFilename_pairs)
                    path, d_wd = os.path.split(path)
                    path, d_res = os.path.split(path)
                    self.clustersFilename_pairs = logDir + ("/../%s/%s/%s" % (d_res, d_wd, clusters_fn))
                    if not os.path.exists(self.clustersFilename_pairs):
                        print("ERROR: Missing orthogroups file: %s or %s" % (self.clustersFilename_pairs, clusters_fn_full_path))
                        util.Fail()
            if line.startswith(wd_trees_str):
                self.wd_trees = line.rstrip()[len(wd_trees_str):]
                if not os.path.exists(self.wd_trees):
                    # try to see if it's a relative directory to current one
                    path, d_wd = os.path.split(self.wd_trees[:-1])
                    path, d_res = os.path.split(path)
                    self.wd_trees = logDir + ("/../%s/%s/" % (d_res, d_wd))
                    if not os.path.exists(self.wd_trees):
                        print("ERROR: Missing directory: %s" % self.wd_trees)
                        util.Fail()
                # Derived after the directory has been resolved so that it follows a moved directory
                self.speciesTreeRootedIDsFN = self.wd_trees + "SpeciesTree_rooted_ids.txt"
def LogFailAndExit(self, text=""):
    """
    Report an error and stop: print text (unless it is the empty string), write an
    error entry containing text to the log via self.WriteToLog, then call util.Fail().
    """
    if not (text == ""):
        print(text)
    logEntry = "\nERROR: An error occurred\n" + text
    self.WriteToLog(logEntry)
    util.Fail()