Exemple #1
0
def ReadAlignment(fn):
    """
    Read a FASTA-format multiple sequence alignment and wrap it in an MSA.

    A line starting with ">" begins a new record: the text after ">" is the
    accession and all following lines up to the next ">" are joined (after
    stripping trailing whitespace) to form its sequence. Text before the first
    ">" is discarded. A repeated accession overwrites the earlier entry.

    Every sequence must have the same length as the first one read; otherwise
    an error is printed and util.Fail() is called.

    fn - path of the alignment file
    Returns: MSA(dict of accession -> sequence)
    """
    msa = dict()
    accession = None
    length = None       # length of the first sequence, all others must match
    parts = []          # lines of the sequence currently being read

    def _Finish(parts, length):
        # Join the buffered lines and check the result against the expected length
        seq = "".join(parts)
        if length is not None and len(seq) != length:
            print("ERROR: Sequence length mismatch in MSA: %s & %d" %
                  (length, len(seq)))
            util.Fail()
        return seq

    # Text mode so that the comparison with ">" is str against str
    with open(fn, 'r') as infile:
        for line in infile:
            line = line.rstrip()
            if line.startswith(">"):
                if accession is not None:
                    seq = _Finish(parts, length)
                    if length is None:
                        length = len(seq)
                    msa[accession] = seq
                accession = line[1:]
                parts = []
            else:
                parts.append(line)
    # The final record has no following ">" line to trigger storing it
    if accession is not None:
        msa[accession] = _Finish(parts, length)
    return MSA(msa)
def OrthologuesFromTrees(groupsDir, workingDir, nHighParallel, speciesTree_fn = None, pickleDir=None):
    """
    Just infer orthologues from existing gene trees, don't do any of the preceding steps.

    groupsDir - directory passed to util.GetOGsFile to locate the orthogroups files
    workingDir - orthologues 'WorkingDirectory'; used as a string prefix, so it
                 needs to end with a path separator
    nHighParallel - passed through to ReconciliationAndOrthologues
    speciesTree_fn - None to use the single rooted species tree found in
                     workingDir + "Trees_ids/", otherwise the path of a user-supplied
                     tree (checked with CheckUserSpeciesTree and converted with
                     ConvertUserSpeciesTree)
    pickleDir - passed through to ReconciliationAndOrthologues

    Returns a message string giving the newly created results directory.
    """
    treesIDsDir = workingDir + "Trees_ids/"

    # Check species tree
    qUserSpTree = speciesTree_fn is not None
    if not qUserSpTree:
        candidates = ["SpeciesTree_ids_0_rooted.txt", "SpeciesTree_ids_1_rooted.txt", "SpeciesTree_user_ids.txt"] # etc (only need to determine if unique)
        present = [treesIDsDir + name for name in candidates if os.path.exists(treesIDsDir + name)]
        if present:
            speciesTree_fn = present[-1]
        if len(present) == 0:
            print("\nERROR: There is a problem with the specified directory. The rooted species tree %s or %s is not present." % (candidates[0], candidates[2]))
            print("Please rectify the problem or alternatively use the -s option to specify the species tree to use.\n")
            util.Fail()
        if len(present) > 1:
            print("\nERROR: There is more than one rooted species tree in the specified directory structure. Please use the -s option to specify which species tree should be used\n")
            util.Fail()
    elif not os.path.exists(speciesTree_fn):
        print("\nERROR: %s does not exist\n" % speciesTree_fn)
        util.Fail()

    def TreePatIDs(iog):
        # Filename of the ID-labelled gene tree for orthogroup number iog
        return treesIDsDir + "OG%07d_tree_id.txt" % iog

    reconTreesRenamedDir = workingDir + "Recon_Gene_Trees/"
    # for the Orthologues_Species/ directories
    resultsDir_new = util.CreateNewWorkingDirectory(workingDir + "../Orthologues_")

    orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = util.GetOGsFile(groupsDir)
    speciesToUse, nSpAll = util.GetSpeciesToUse(orthofinderWorkingDir + "SpeciesIDs.txt")
    ogSet = OrthoGroupsSet(orthofinderWorkingDir, speciesToUse, nSpAll, clustersFilename_pairs, idExtractor = util.FirstWordExtractor)

    if qUserSpTree:
        # A user tree is validated against the species names, then converted
        speciesToUseNames = ogSet.SpeciesDict().values()
        CheckUserSpeciesTree(speciesTree_fn, speciesToUseNames)
        speciesTree_fn = ConvertUserSpeciesTree(treesIDsDir, speciesTree_fn, ogSet.SpeciesDict())

    util.PrintUnderline("Running Orthologue Prediction", True)
    util.PrintUnderline("Reconciling gene and species trees")
    ReconciliationAndOrthologues(TreePatIDs, ogSet, speciesTree_fn, workingDir, resultsDir_new, reconTreesRenamedDir, nHighParallel, pickleDir=pickleDir)
    util.PrintUnderline("Writing results files")
    CleanWorkingDir(workingDir)
    return "Species-by-species orthologues directory:\n   %s\n" % resultsDir_new
def GetBLAST6Scores(seqsInfo, fileInfo, iSpecies, jSpecies, qExcludeSelfHits = True, sep = "_", qDoubleBlast=True): 
    """
    Read the tab-separated BLAST results for a pair of species into a sparse
    matrix of bit scores.

    seqsInfo - object with nSeqsPerSpecies, indexable by species number
    fileInfo - object with workingDir, the string prefix for the Blast files
    iSpecies, jSpecies - species numbers; rows of the result are sequences of
                         iSpecies, columns are sequences of jSpecies
    qExcludeSelfHits - if True and iSpecies == jSpecies, hits of a sequence
                       against itself are skipped
    sep - separator between species number and sequence number in the IDs;
          the sequence number is the part after the first separator
    qDoubleBlast - if False and iSpecies > jSpecies, the file for the reversed
                   pair (Blast<j>_<i>.txt) is read with the query and hit
                   columns swapped

    The file fn + ".gz" is used in preference to fn if it exists. Column 12
    of each line is the bit score; the highest score seen for a pair is kept.

    Returns: scipy.sparse lil_matrix of shape (nSeqs_i, nSeqs_j)
    Raises: re-raises any exception hit while reading, after writing the
            file name and the offending line to stderr. A sequence number
            beyond the size of the matrix is reported and util.Fail() called.
    """
    qCheckForSelfHits = qExcludeSelfHits and iSpecies == jSpecies
    qRev = (not qDoubleBlast) and iSpecies > jSpecies
    if qRev:
        # Reading the reversed file: the hit column holds the iSpecies sequence
        iQ, iH = 1, 0
        iSpeciesOpen, jSpeciesOpen = jSpecies, iSpecies
    else:
        iQ, iH = 0, 1
        iSpeciesOpen, jSpeciesOpen = iSpecies, jSpecies
    nSeqs_i = seqsInfo.nSeqsPerSpecies[iSpecies]
    nSeqs_j = seqsInfo.nSeqsPerSpecies[jSpecies]
    B = sparse.lil_matrix((nSeqs_i, nSeqs_j))
    row = ""
    blastName = "Blast%d_%d.txt" % (iSpeciesOpen, jSpeciesOpen)   # the file actually read
    fn = fileInfo.workingDir + blastName
    # csv.reader needs a binary-mode file on Python 2 but a text-mode file on Python 3
    csvReadMode = 'rb' if sys.version_info[0] < 3 else 'rt'
    try:
        with (gzip.open(fn + ".gz", csvReadMode) if os.path.exists(fn + ".gz") else open(fn, csvReadMode)) as blastfile:
            blastreader = csv.reader(blastfile, delimiter='\t')
            for row in blastreader:
                # Get hit and query IDs
                try:
                    sequence1ID = int(row[iQ].split(sep, 1)[1])
                    sequence2ID = int(row[iH].split(sep, 1)[1])
                except (IndexError, ValueError):
                    sys.stderr.write("\nERROR: Query or hit sequence ID in BLAST results file was missing or incorrectly formatted.\n")
                    raise
                # Get bit score for pair
                try:
                    score = float(row[11])
                except (IndexError, ValueError):
                    sys.stderr.write("\nERROR: 12th field in BLAST results file line should be the bit-score for the hit\n")
                    raise
                if qCheckForSelfHits and sequence1ID == sequence2ID:
                    continue
                # store bit score, keeping the best one seen for this pair
                try:
                    if score > B[sequence1ID, sequence2ID]:
                        B[sequence1ID, sequence2ID] = score
                except IndexError:
                    def _Ordinal(n):
                        # 1 -> "1st", 2 -> "2nd", 11 -> "11th", ...
                        return str(n)+("th" if 4<=n%100<=20 else {1:"st",2:"nd",3:"rd"}.get(n%10, "th"))
                    sys.stderr.write("\nERROR: Inconsistent input files.\n")
                    # Report whichever of the two sequence numbers is out of range
                    kSpecies, nSeqs_k, sequencekID = (iSpecies,  nSeqs_i, sequence1ID) if sequence1ID >= nSeqs_i else (jSpecies,  nSeqs_j, sequence2ID)
                    sys.stderr.write("Species%d.fa contains only %d sequences " % (kSpecies,  nSeqs_k))
                    sys.stderr.write("but found a query/hit in the %s for sequence %d_%d (i.e. %s sequence in species %d).\n" %  (blastName, kSpecies, sequencekID, _Ordinal(sequencekID+1), kSpecies))
                    util.Fail()
    except Exception:
        sys.stderr.write("Malformatted line in %s\nOffending line was:\n" % fn)
        sys.stderr.write("\t".join(row) + "\n")
        raise
    return B
Exemple #4
0
 def _FindFromTrees(self, orthologuesDir, userSpeciesTree):
     """
     Set self.wd_trees and self.speciesTreeRootedIDsFN for a run that starts
     from existing trees.

     orthologuesDir - string prefix; "WorkingDirectory/" is appended to it
     userSpeciesTree - path of a species tree to use, or None to use the
                       single existing rooted species tree found in
                       self.wd_trees or self.wd_trees + "Trees_ids/"

     Calls util.Fail() if a supplied tree does not exist, or if zero or more
     than one existing tree is found.
     """
     print("\nFind from trees:")
     print((orthologuesDir, userSpeciesTree))
     self.wd_trees = orthologuesDir + "WorkingDirectory/"

     # User-supplied tree: it only has to exist
     if userSpeciesTree is not None:
         if not os.path.exists(userSpeciesTree):
             print("\nERROR: %s does not exist\n" % userSpeciesTree)
             util.Fail()
         self.speciesTreeRootedIDsFN = userSpeciesTree
         return

     # Otherwise find the species tree among the known filenames
     candidates = [
         "SpeciesTree_ids_0_rooted.txt", "SpeciesTree_ids_1_rooted.txt",
         "SpeciesTree_user_ids.txt",
         "SpeciesTree_unrooted_0_rooted.txt",
         "STAG_SpeciesTree_ids_0_rooted.txt"
     ]  # etc (only need to determine if unique)
     searchDirs = [self.wd_trees, self.wd_trees + "Trees_ids/"]
     existing = [
         directory + name
         for name in candidates
         for directory in searchDirs
         if os.path.exists(directory + name)
     ]
     if not existing:
         print(
             "\nERROR: There is a problem with the specified directory. The rooted species tree %s or %s is not present."
             % (candidates[0], candidates[2]))
         print(
             "Please rectify the problem or alternatively use the -s option to specify the species tree to use.\n"
         )
         util.Fail()
     if len(existing) > 1:
         print(
             "\nERROR: There is more than one rooted species tree in the specified directory structure. Please use the -s option to specify which species tree should be used\n"
         )
         util.Fail()
     self.speciesTreeRootedIDsFN = existing[-1]
 def SequenceDict(self):
     """
     Return the ID -> name dictionary for the sequences.

     On first use the sequence IDs are loaded with self._extractor(self.seqIDsFN)
     and cached in self.seqIDsEx. If the extractor raises a RuntimeError whose
     text starts with "ERROR", util.Fail() is called; for any other
     RuntimeError the IDs are loaded with util.FullAccession instead.
     """
     if self.seqIDsEx is None:
         try:
             self.seqIDsEx = self._extractor(self.seqIDsFN)
         except RuntimeError as error:
             # str(error) rather than error.message: the attribute has been
             # deprecated since Python 2.6 and does not exist on Python 3
             message = str(error)
             print(message)
             if message.startswith("ERROR"):
                 util.Fail()
             else:
                 print("Tried to use only the first part of the accession in order to list the sequences in each orthogroup\nmore concisely but these were not unique. The full accession line will be used instead.\n")
                 self.seqIDsEx = util.FullAccession(self.seqIDsFN)
     return self.seqIDsEx.GetIDToNameDict()
def CheckUserSpeciesTree(speciesTreeFN, expSpecies):
    """
    Validate a user-supplied species tree against the expected species names.

    speciesTreeFN - path of the tree file, loaded with tree.Tree
    expSpecies - iterable of the species names that the tree should contain

    The checks, in order: the file exists; leaf names in the tree are unique;
    every expected species is in the tree; the expected names are unique; the
    tree has no additional species; the root has exactly two children.
    A failed check prints an error and calls util.Fail(). Missing and
    additional species are all reported before util.Fail() is called.
    """
    def FailIfDuplicated(counts, header):
        # counts is a Counter of names; every name must have been seen once only
        if counts.most_common()[0][1] == 1:
            return
        print(header)
        for name, nTimes in counts.most_common():
            if nTimes != 1:
                print("Species '%s' appears %d times" % (name, nTimes))
        util.Fail()

    # File exists
    if not os.path.exists(speciesTreeFN):
        print("Species tree file does not exist: %s" % speciesTreeFN)
        util.Fail()

    # Species in tree are unique
    t = tree.Tree(speciesTreeFN)
    leafNames = t.get_leaf_names()
    FailIfDuplicated(Counter(leafNames), "ERROR: Species names in species tree are not unique")

    # All required species are present
    inTree = set(leafNames)
    missing = [sp for sp in expSpecies if sp not in inTree]
    for sp in missing:
        print("ERROR: '%s' is missing from species tree" % sp)

    # expected species are unique
    FailIfDuplicated(Counter(expSpecies), "ERROR: Species names are not unique")

    # No species in the tree other than the expected ones
    expected = set(expSpecies)
    additional = [sp for sp in inTree if sp not in expected]
    for sp in additional:
        print("ERROR: Additional species '%s' in species tree" % sp)

    if missing or additional:
        util.Fail()

    # Tree is rooted
    if len(t.get_children()) != 2:
        print("ERROR: Species tree is not rooted")
        util.Fail()
Exemple #7
0
def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, speciesToUse,
                   nSpAll, clustersFilename_pairs, nProcesses):
    """
    Run the orthologue inference steps and return the results summary string.

    Steps, as announced on stdout:
      1. check the dependencies (CanRunDependencies)
      2. gene distances and trees (DendroBLASTTrees)
      4. root the species tree (rfd.GetRoot)
      5. reconcile gene and species trees (RunDlcpar), once per root
      6. orthologue lists from the reconciled trees (pt.get_orthologue_lists)
      7. clean up (CleanWorkingDir)

    The first five arguments are passed to OrthoGroupsSet; orthofinderResultsDir
    is also the prefix of the new "Orthologues_" results directory. nProcesses
    is passed to DendroBLASTTrees and rfd.GetRoot.

    Calls util.Fail() if fewer than 4 species are in use and sys.exit() if the
    dependencies are not met.

    Returns: GetResultsFilesString(list of the rooted species tree files written)
    """
    ogSet = OrthoGroupsSet(orthofinderWorkingDir, speciesToUse, nSpAll,
                           clustersFilename_pairs,
                           idExtractor=util.FirstWordExtractor)
    if len(ogSet.speciesToUse) < 4:
        print("ERROR: Not enough species to infer species tree")
        util.Fail()

    print("\n1. Checking required programs are installed")
    print("-------------------------------------------")
    dependenciesMet = CanRunDependencies(orthofinderWorkingDir)
    if not dependenciesMet:
        print("Orthogroups have been inferred but the dependencies for inferring gene trees and\northologues have not been met. Please review previous messages for more information.")
        sys.exit()

    print("\n2. Calculating gene distances")
    print("-----------------------------")
    resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_")
    db = DendroBLASTTrees(ogSet, resultsDir, nProcesses)
    db.ReadAndPickle()
    nOGs, D, spPairs, spTreeFN_ids = db.RunAnalysis()

    print("\n4. Best outgroup(s) for species tree")
    print("------------------------------------")
    spDict = ogSet.SpeciesDict()
    geneTreesDir = os.path.split(db.treesPatIDs)[0] + "/"
    roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(
        spTreeFN_ids, geneTreesDir, rfd.GeneToSpecies_dash, nProcesses,
        treeFmt=1)

    # More than one equally good root: each one is analysed separately below
    qMultiple = len(roots) > 1
    nDuplications = len(clusters)
    if qMultiple:
        summary = "Observed %d duplications. %d support the best roots and %d contradict them."
        heading = "Best outgroups for species tree:"
    else:
        summary = "Observed %d duplications. %d support the best root and %d contradict it."
        heading = "Best outgroup for species tree:"
    print(summary % (nDuplications, nSupport, nDuplications - nSupport))
    print(heading)
    for r in roots:
        print("  " + ", ".join([spDict[s] for s in r]))

    if qMultiple:
        print("\nAnalysing each of the potential species tree roots.")
    resultsSpeciesTrees = []
    for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        # Output locations and heading decorations depend on whether there are several roots
        if qMultiple:
            stepSuffix = "-%d" % i
            underlineExtra = "--"
            resultsDir_new = resultsDir + "Orthologues_using_outgroup_%d/" % i
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees_using_outgroup_%d/" % i
            speciesTreeOut = resultsDir_new + "SpeciesTree_rooted_at_outgroup_%d.txt" % i
        else:
            stepSuffix = ""
            underlineExtra = ""
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            speciesTreeOut = resultsDir + "SpeciesTree_rooted.txt"
        resultsSpeciesTrees.append(speciesTreeOut)
        os.mkdir(resultsDir_new)
        util.RenameTreeTaxa(speciesTree_fn, speciesTreeOut,
                            db.ogSet.SpeciesDict(), qFixNegatives=True)

        print("\n5%s. Reconciling gene and species trees" % stepSuffix)
        print("-------------------------------------" + underlineExtra)
        print("Outgroup: " + ", ".join([spDict[s] for s in r]))
        dlcparResultsDir = RunDlcpar(db.treesPatIDs, ogSet, nOGs,
                                     speciesTree_fn, db.workingDir)
        os.mkdir(reconTreesRenamedDir)
        nGeneTrees = len(db.ogSet.OGs())
        for iog in xrange(nGeneTrees):
            reconTreeIn = dlcparResultsDir + "OG%07d_tree_id.locus.tree" % iog
            reconTreeOut = reconTreesRenamedDir + "OG%07d_tree.txt" % iog
            util.RenameTreeTaxa(reconTreeIn, reconTreeOut,
                                db.ogSet.Spec_SeqDict(),
                                qFixNegatives=False, inFormat=8)

        # Orthologue lists
        print("\n6%s. Inferring orthologues from gene trees" % stepSuffix)
        print("----------------------------------------" + underlineExtra)
        pt.get_orthologue_lists(ogSet, resultsDir_new, dlcparResultsDir,
                                db.workingDir)

    CleanWorkingDir(db)
    print("\n7. Writing results files")
    print("------------------------")

    return GetResultsFilesString(resultsSpeciesTrees)
Exemple #8
0
if __name__ == "__main__":
    if len(sys.argv) < 2 or sys.argv[1] == "--help" or sys.argv[1] == "-h":
        PrintHelp()
        sys.exit()

    # get arguments
    userDir = None
    nProcesses = None

    args = sys.argv[1:]
    while len(args) != 0:
        arg = args.pop(0)
        if arg == "-t" or arg == "--threads":
            if len(args) == 0:
                print("Missing option for command line argument -t")
                util.Fail()
            arg = args.pop(0)
            try:
                nProcesses = int(arg)
            except:
                print("Incorrect argument for number of threads: %s" % arg)
                util.Fail()
        else:
            userDir = arg

    # Check arguments
    print("0. Getting Orthologues")
    print("----------------------")
    if nProcesses == None:
        print(
            """\nNumber of parallel processes has not been specified, will use the default value.  
def GetRoots(tree, species_tree_rooted, GeneToSpecies):
    """
    Find the nodes of a gene tree that are candidate positions for its root,
    given a rooted species tree.

    tree - gene tree; must provide get_leaf_names() and iteration over nodes
    species_tree_rooted - species tree whose root must have exactly 2 children,
                          otherwise an error is printed and util.Fail() called
    GeneToSpecies - function mapping a gene (leaf) name to its species

    Returns a 3-tuple (found, fail, GeneMap):
      found - list of the gene tree nodes collected as candidate roots
      fail - number of times the walk ended at a non-root node that did not
             have exactly 2 children
      GeneMap - the GeneMap attribute of the RootMap built for the split used
    If all genes are from one species the result is ([], 0, None).
    """
    species = set([GeneToSpecies(g) for g in tree.get_leaf_names()])
    if len(species) == 1:
        return [], 0, None
    ch = species_tree_rooted.get_children()
    if len(ch) != 2:
        print("ERROR: Species tree is not rooted")
        util.Fail()
    n1, n2 = ch
    t1 = set(n1.get_leaf_names())
    t2 = set(n2.get_leaf_names())
    # Does the gene tree have species on each side of this species tree split?
    have1 = len(species.intersection(t1)) != 0
    have2 = len(species.intersection(t2)) != 0
    #    print(tree)
    while not (have1 and have2):
        # Doesn't contain outgroup, step down in species tree until it does
        if have1:
            n = n1
        else:
            n = n2
        n1, n2 = n.get_children()
        # From here on t1 and t2 are the values returned by get_leaf_names(), not sets
        t1 = n1.get_leaf_names()
        t2 = n2.get_leaf_names()
        have1 = len(species.intersection(t1)) != 0
        have2 = len(species.intersection(t2)) != 0

    # NOTE(review): n1 and n2 are not read again after this assignment
    n1, n2 = species_tree_rooted.get_children()
    # t1 and t2 are the two sides of the species tree split found above
    root_mapper = RootMap(t1, t2, GeneToSpecies)
    GeneMap = root_mapper.GeneMap
    # presumably sets the sp_down and sp_up attributes read below - TODO confirm
    StoreSpeciesSets(tree, GeneMap)
    found = set()
    # sp_down values are compared with these sets of booleans. TF is the
    # 'mixed' state (both values present), a set of length 1 is 'single state'
    TF = set([True, False])
    TFfr = frozenset([True, False])
    Tfr = frozenset([True])
    Ffr = frozenset([False])
    fail = 0
    for m in tree:
        # Walk up from m until reaching the root or a node with mixed sp_down.
        # Afterwards n is that node and m is the child the walk came through
        n = m.up
        while not n.is_root() and n.sp_down != TF:
            m = n
            n = m.up
        if n.sp_down == TF:
            children = n.get_children()
            if n.is_root():
                colour = m.sp_down
                # Only continue if some child is single state and differs from the child we arrived through
                if any([
                        x.sp_down != colour and len(x.sp_down) == 1
                        for x in children
                ]):
                    # Number of children of the root in each state
                    comb = Counter([frozenset(x.sp_down) for x in children])
                    # case 0
                    if comb[TFfr] == 0:
                        # case 0A - one of the branches is the root
                        for c in children:
                            # the child whose sp_down is shared with no other child
                            if sum([c.sp_down == x.sp_down
                                    for x in children]) == 1:
                                found.add(c)  # only holds for one of them
                                break
                    elif comb[TFfr] == 1 and (comb[Tfr] == 2
                                              or comb[Ffr] == 2):
                        # case 0B - one mixed branch, two identical True/False branches
                        # we don't know this is the division, stepping down in the mixed branch might still be all same as the single state ones
                        # we'll find this division while walking up the tree
                        pass
                    elif comb[TFfr] == 1 and comb[Tfr] == 1:
                        # case 0C - one mixed branch, one True & one False
                        # the mixed branch is the one added
                        found.add([c for c in children if c.sp_down == TF][0])
                    else:
                        # case 0D - two mixed branches
                        # will find the transition while walking up the tree
                        pass
#                    found.add(n)
#                    print("*** Root1 ***")
            elif len(children) == 2:
                #                found.add(n)
                c1, c2 = children
                # sp_down of c1 if c1 is single state, otherwise that of c2
                single_state = c1.sp_down if len(
                    c1.sp_down) == 1 else c2.sp_down
                if len(c1.sp_down) == 1 and len(c2.sp_down) == 1:
                    # Case 1 - Both single state
                    if len(n.sp_up) == 1:
                        # Case 1A - 3rd clade also single state
                        # Root is the bipartition separating True from False
                        found.add(c1 if n.sp_up == c2.sp_down else c2)
                    else:
                        # Case 1B - 3rd clade is mixed
                        found.add(n)
                else:
                    # Case 2 - only one is single state
                    if single_state != n.sp_up:
                        # Case 2A - only one is single state and it's not the same as the 3rd clade
                        #                        print("*** Root3 ***")
                        found.add(c1 if len(c1.sp_down) == 1 else c2)


#                    else:
#                        # Case 2B - only one is single state and it's the same as the 3rd clade
#                        # root is in the mixed clade and will be found while walking up that
#                        pass
            else:
                # Mixed non-root node that does not have exactly two children
                fail += 1
    return list(found), fail, GeneMap
Exemple #10
0
    def DoTrees(self, ogs, ogMatrix, idDict, speciesIdDict, nProcesses, qStopAfterSeqs, qStopAfterAlignments, qDoSpeciesTree):
        """
        Write the orthogroup sequence files, then infer alignments, gene trees
        and (optionally) the species tree, converting IDs to accessions in
        the results.

        ogs - orthogroups, passed to the FASTA writing and command generation methods
        ogMatrix - passed to DetermineOrthogroupsForSpeciesTree
        idDict - ID -> name dictionary; NOTE: updated in place with speciesIdDict
        speciesIdDict - species ID -> name dictionary
        nProcesses - number of parallel processes for the commands
        qStopAfterSeqs - return once the sequence files have been written
        qStopAfterAlignments - return once the alignments have been inferred
        qDoSpeciesTree - also build the concatenated alignment and species tree

        Returns a list of results directories: all those collected if
        qStopAfterSeqs, otherwise the first two.
        Calls util.Fail() if the species tree was requested but its file was
        not produced.
        """
        idDict.update(speciesIdDict) # same code will then also convert concatenated alignment for species tree
        # 0. Create the output directories. Each filename method is called for
        # both values of its second argument; only the directories for True
        # (the value used below for the accession-named files) are reported
        resultsDirsFullPath = []
        for fn in [self.GetFastaFilename, self.GetAlignmentFilename, self.GetTreeFilename]:
            for qIDs in [True, False]:
                d = os.path.split(fn(0, not qIDs))[0]
                if not os.path.exists(d): os.mkdir(d)
                if not qIDs: resultsDirsFullPath.append(d)
            if qStopAfterSeqs: break
            if qStopAfterAlignments and fn == self.GetAlignmentFilename: break

        # 1. Sequence files
        fastaWriter = FastaWriter(self.ogsWorkingDir)
        self.WriteFastaFiles(fastaWriter, ogs, idDict)
        if qStopAfterSeqs: return resultsDirsFullPath

        # 3
        # Get OGs to use for species tree
        if qDoSpeciesTree:
            iOgsForSpeciesTree, fSingleCopy = DetermineOrthogroupsForSpeciesTree(ogMatrix)
            concatenated_algn_fn = os.path.split(self.GetAlignmentFilename(0))[0] + "/SpeciesTreeAlignment.fa"
        else:
            iOgsForSpeciesTree = []
        alignCommands_and_filenames = self.GetAlignmentCommandsAndNewFilenames(ogs)
        if qStopAfterAlignments:
            util.PrintUnderline("Inferring multiple sequence alignments")
            pc.RunParallelCommandsAndMoveResultsFile(nProcesses, alignCommands_and_filenames, False)
            # The concatenated alignment only exists when a species tree was requested
            if qDoSpeciesTree:
                CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename, concatenated_algn_fn, fSingleCopy)
            # ids -> accessions
            alignmentFilesToUse = [self.GetAlignmentFilename(i) for i, _ in enumerate(alignCommands_and_filenames)]
            accessionAlignmentFNs = [self.GetAlignmentFilename(i, True) for i in xrange(len(alignmentFilesToUse))]
            if qDoSpeciesTree:
                alignmentFilesToUse.append(concatenated_algn_fn)
                accessionAlignmentFNs.append(os.path.split(self.GetAlignmentFilename(0, True))[0] + "/SpeciesTreeAlignment.fa")
            self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)
            return resultsDirsFullPath[:2]

        # Otherwise, alignments and trees
        # Strategy is
        # 1. Do alignments (and trees) required for species tree
        # 2. Create concatenated alignment
        # 3. Create second list of commands [speciestree] + [remaining alignments and trees]
        alignmentFilesToUse = [self.GetAlignmentFilename(i) for i, _ in enumerate(alignCommands_and_filenames)]
        treeCommands_and_filenames = self.GetTreeCommands(alignmentFilesToUse, ogs)
        commands_and_filenames = []
        if qDoSpeciesTree:
            print("Species tree: Using %d orthogroups with minimum of %0.1f%% of species having single-copy genes in any orthogroup" % (len(iOgsForSpeciesTree), 100.*fSingleCopy))
            util.PrintUnderline("Inferring multiple sequence alignments for species tree")
            # Do required alignments and trees
            # Only the directory of the tree files is needed, so use index 0 as
            # is done for the alignment directory (no loop variable exists yet)
            speciesTreeFN_ids = os.path.split(self.GetTreeFilename(0))[0] + "/SpeciesTree_unrooted.txt"
            for i in iOgsForSpeciesTree:
                commands_and_filenames.append([alignCommands_and_filenames[i], treeCommands_and_filenames[i]])
            pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)
            CreateConcatenatedAlignment(iOgsForSpeciesTree, ogs, self.GetAlignmentFilename, concatenated_algn_fn, fSingleCopy)
            # Add species tree to list of commands to run
            commands_and_filenames = [self.program_caller.GetTreeCommands(self.tree_program, [concatenated_algn_fn], [speciesTreeFN_ids], ["SpeciesTree"])]
            util.PrintUnderline("Inferring remaining multiple sequence alignments and gene trees")
        else:
            util.PrintUnderline("Inferring multiple sequence alignments and gene trees")

        # Now continue as before
        iOgsForSpeciesTree = set(iOgsForSpeciesTree)
        # Orthogroups with a tree command: alignment followed by tree
        for i in xrange(len(treeCommands_and_filenames)):
            if i in iOgsForSpeciesTree: continue
            commands_and_filenames.append([alignCommands_and_filenames[i], treeCommands_and_filenames[i]])
        # Any further alignment commands have no tree command: alignment only
        for i in xrange(len(treeCommands_and_filenames), len(alignCommands_and_filenames)):
            if i in iOgsForSpeciesTree: continue
            commands_and_filenames.append([alignCommands_and_filenames[i]])
        pc.RunParallelCommandsAndMoveResultsFile(nProcesses, commands_and_filenames, True)

        # Convert ids to accessions
        accessionAlignmentFNs = [self.GetAlignmentFilename(i, True) for i in xrange(len(alignmentFilesToUse))]
        # Add concatenated Alignment
        if qDoSpeciesTree:
            alignmentFilesToUse.append(concatenated_algn_fn)
            accessionAlignmentFNs.append(os.path.split(self.GetAlignmentFilename(0, True))[0] + "/SpeciesTreeAlignment.fa")
            # NOTE(review): the alignments are only renamed when the species tree
            # is requested - confirm that this is intended
            self.RenameAlignmentTaxa(alignmentFilesToUse, accessionAlignmentFNs, idDict)
            if os.path.exists(speciesTreeFN_ids):
                util.RenameTreeTaxa(speciesTreeFN_ids, self.workingDir + "SpeciesTree_unrooted.txt", idDict, qFixNegatives=True)
            else:
                print("ERROR: Species tree inference failed")
                util.Fail()
        # Gene trees that were not produced are skipped
        for i in xrange(len(treeCommands_and_filenames)):
            if os.path.exists(self.GetTreeFilename(i)):
                util.RenameTreeTaxa(self.GetTreeFilename(i), self.GetTreeFilename(i, True), idDict, qFixNegatives=True)
        return resultsDirsFullPath[:2]
def OrthologuesWorkflow(workingDir_ogs, 
                       orthofinderResultsDir, 
                       speciesToUse, nSpAll, 
                       clustersFilename_pairs, 
                       tree_options,
                       msa_method,
                       tree_method,
                       nHighParallel,
                       nLowParrallel,
                       userSpeciesTree = None, 
                       qStopAfterSeqs = False,
                       qStopAfterAlign = False,
                       qStopAfterTrees = False, 
                       qMSA = False,
                       qPhyldog = False,
                       pickleDir=None):
    """
    Run the orthologue-inference workflow for a set of orthogroups and return
    a text summary describing where the results were written.

    Steps:
    1. Setup:
        - ogSet, directories
        - DendroBLASTTrees - object
    2. Gene trees (plus an unrooted species tree unless one was supplied):
        - qMSA/qPhyldog: treeGen.DoTrees, then db.SpeciesTreeOnly
        - otherwise (DendroBLAST): read scores, then
          RunAnalysis: Get distance matrices, do trees
    3. Root species tree (or convert the user-supplied species tree)
    4. Reconciliation/Orthologues (once per candidate root)
    5. Clean up

    Arguments:
    workingDir_ogs - passed to OrthoGroupsSet and msa.TreesForOrthogroups
    orthofinderResultsDir - prefix under which a new "Orthologues_*" results
                            directory is created
    speciesToUse, nSpAll, clustersFilename_pairs - passed to OrthoGroupsSet
    tree_options, msa_method, tree_method - passed to msa.TreesForOrthogroups
                                            (only used if qMSA or qPhyldog)
    nHighParallel - passed to DoTrees, rfd.GetRoot and ReconciliationAndOrthologues
    nLowParrallel - passed to DendroBLASTTrees
    userSpeciesTree - if supplied it is converted with ConvertUserSpeciesTree
                      and used instead of inferring and rooting a species tree
    qStopAfterSeqs, qStopAfterAlign - return early after DoTrees; only examined
                                      when qMSA or qPhyldog is set
    qStopAfterTrees - return once the species tree(s) have been written,
                      without running the reconciliation
    qMSA, qPhyldog - select the MSA/Phyldog branch rather than DendroBLAST
    pickleDir - passed to OrthoGroupsSet and ReconciliationAndOrthologues

    Returns:
    A string for display to the user (which directories/files hold the results).

    Variables:
    - ogSet - all the relevant information about the orthogroups, species etc.
    """
    ogSet = OrthoGroupsSet(workingDir_ogs, speciesToUse, nSpAll, clustersFilename_pairs, idExtractor = util.FirstWordExtractor, pickleDir=pickleDir)
    
    # Class that is going to run the analysis needs to check the dependencies
#    if not CanRunOrthologueDependencies(workingDir_ogs, qMSA, qStopAfterTrees, userSpeciesTree == None): 
#        print("Orthogroups have been inferred but the dependencies for inferring gene trees and")
#        print("orthologues have not been met. Please review previous messages for more information.")
#        sys.exit()
    
    # All output of this run goes beneath a newly created "Orthologues_*" directory
    resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_")
    """ === 1 === ust = UserSpeciesTree
    MSA:               Sequences    Alignments                        GeneTrees    db    SpeciesTree
    Phyldog:           Sequences    Alignments                        GeneTrees    db    SpeciesTree  
    Dendroblast:                                  DistanceMatrices    GeneTrees    db    SpeciesTree
    MSA (ust):         Sequences    Alignments                        GeneTrees    db
    Phyldog (ust):     Sequences    Alignments                        GeneTrees    db      
    Dendroblast (ust):                            DistanceMatrices    GeneTrees    db        
    """
    if qMSA or qPhyldog:
        treeGen = msa.TreesForOrthogroups(tree_options, msa_method, tree_method, resultsDir, workingDir_ogs)
        # For Phyldog the last argument is always True, i.e. the same value as when stopping after the alignments
        seqs_alignments_dirs = treeGen.DoTrees(ogSet.OGs(qInclAll=True), ogSet.Spec_SeqDict(), nHighParallel, qStopAfterSeqs, qStopAfterAlign or qPhyldog) 
        if qStopAfterSeqs:
            print("")
            return ("\nSequences for orthogroups:\n   %s\n" % seqs_alignments_dirs[0])
        elif qStopAfterAlign:
            print("")
            st = "\nSequences for orthogroups:\n   %s\n" % seqs_alignments_dirs[0]
            st += "\nMultiple sequence alignments:\n   %s\n" % seqs_alignments_dirs[1]
            return st
        db = DendroBLASTTrees(ogSet, resultsDir, nLowParrallel)
        if not userSpeciesTree:
            # No species tree supplied: get one from the BLAST scores
            util.PrintUnderline("Inferring species tree (calculating gene distances)")
            print("Loading BLAST scores")
            db.ReadAndPickle()
            spTreeFN_ids, spTreeUnrootedFN = db.SpeciesTreeOnly()
        if qPhyldog:
            # The Phyldog route ends here; none of the later steps are run
            trees_from_phyldog.RunPhyldogAnalysis(resultsDir + "WorkingDirectory/phyldog/", ogSet.OGs(), speciesToUse)
            return "Running Phyldog" + "\n".join(seqs_alignments_dirs)       
    else:
        util.PrintUnderline("Calculating gene distances")
        db = DendroBLASTTrees(ogSet, resultsDir, nLowParrallel)
        db.ReadAndPickle()
        nOGs, D, spTreeFN_ids, spTreeUnrootedFN = db.RunAnalysis()
    
    """ === 2 ===
    Check can continue with analysis 
    """
    # NOTE(review): this check runs after the tree inference above - confirm that ordering is intended
    if len(ogSet.speciesToUse) < 4: 
        print("ERROR: Not enough species to infer species tree")
        util.Fail()
     
    """ === 3 ===
    MSA:               RootSpeciesTree
    Phyldog:           RootSpeciesTree    
    Dendroblast:       RootSpeciesTree  
    MSA (ust):         ConvertSpeciesTreeIDs
    Phyldog (ust):     ConvertSpeciesTreeIDs
    Dendroblast (ust): ConvertSpeciesTreeIDs
    """    
    if userSpeciesTree:
        util.PrintUnderline("Using user-supplied species tree") 
        userSpeciesTree = ConvertUserSpeciesTree(db.workingDir + "Trees_ids/", userSpeciesTree, ogSet.SpeciesDict())
        # Single tree, no outgroup to report: lists of length one so the loops below handle both cases
        rootedSpeciesTreeFN = [userSpeciesTree]
        roots = [None]
        qMultiple = False
    else:
        util.PrintUnderline("Best outgroup(s) for species tree") 
        # spDict is only defined on this branch; it is only used below when an outgroup is printed
        spDict = ogSet.SpeciesDict()
        roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(spTreeFN_ids, os.path.split(db.TreeFilename_IDs(0))[0] + "/", rfd.GeneToSpecies_dash, nHighParallel, treeFmt = 1)
        if len(roots) > 1:
            print("Observed %d duplications. %d support the best roots and %d contradict them." % (len(clusters), nSupport, len(clusters) - nSupport))
            print("Best outgroups for species tree:")  
        else:
            print("Observed %d duplications. %d support the best root and %d contradict it." % (len(clusters), nSupport, len(clusters) - nSupport))
            print("Best outgroup for species tree:")  
        for r in roots: print("  " + (", ".join([spDict[s] for s in r]))  )
        qMultiple = len(roots) > 1
        
    if qStopAfterTrees:
        if userSpeciesTree:
            # Nothing to root or rename: just report where the existing outputs are
            st = ""
            if qMSA:
                st += "\nSequences for orthogroups:\n   %s\n" % seqs_alignments_dirs[0]
                st += "\nMultiple sequence alignments:\n   %s\n" % seqs_alignments_dirs[1]
            st += "\nGene trees:\n   %s\n" % (resultsDir + "Gene_Trees/")
            return st
        # otherwise, root species tree
        resultsSpeciesTrees = []
        for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            if len(roots) == 1:
                resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
            else:
                resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
            # Write each rooted tree to the results directory, renaming taxa via the species dictionary
            util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)
        db.DeleteBlastMatrices()
        CleanWorkingDir(db.workingDir)
        return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None, False)
    
    # === 4 === Reconcile the gene trees against each candidate rooted species tree
    if qMultiple: util.PrintUnderline("\nAnalysing each of the potential species tree roots", True)
    resultsSpeciesTrees = []
    for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
        util.PrintUnderline("Reconciling gene trees and species tree" + (" (root %d)"%i if qMultiple else "")) 
        if qMultiple: 
            # Several candidate roots: each gets its own numbered output directories
            resultsDir_new = resultsDir + "Orthologues_using_outgroup_%d/" % i
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees_using_outgroup_%d/" % i
            resultsSpeciesTrees.append(resultsDir_new + "SpeciesTree_rooted_at_outgroup_%d.txt" % i)
            print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        elif userSpeciesTree:
            # Same locations as the single inferred root below, but there is no outgroup to print
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
        else:
            resultsDir_new = resultsDir + "Orthologues/"
            reconTreesRenamedDir = db.workingDir + "Recon_Gene_Trees/"
            resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt")
            print("Outgroup: " + (", ".join([spDict[s] for s in r])))
        # The directory must exist before the renamed species tree is written (it may be written inside it)
        os.mkdir(resultsDir_new)
        util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True)
        ReconciliationAndOrthologues(db.TreeFilename_IDs, db.ogSet, speciesTree_fn, db.workingDir, resultsDir_new, reconTreesRenamedDir, nHighParallel, i if qMultiple else None, pickleDir=pickleDir) 
    
    # === 5 === Clean up and report
    db.DeleteBlastMatrices()
    CleanWorkingDir(db.workingDir)
    util.PrintUnderline("Writing results files", True)
    
    return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None)
Exemple #12
0
    def _GetOGsFile(self, userArg):
        """
        Locate the results of a previous OrthoFinder run.

        userArg - either an OrthoFinder results directory (or its
                  WorkingDirectory), or the path of a specific
                  clusters_OrthoFinder_*.txt_id_pairs.txt file.

        Returns the WorkingDirectory, ResultsDirectory and clusters_id_pairs
        filename:
          - file given:      (directory of file, directory of file, userArg)
          - directory given: (working directory, userArg with a trailing
                              separator, the single clusters file found)

        Prints an error and calls util.Fail() if nothing was specified, the
        required files cannot be found, or results from more than one run are
        present.
        """
        # An empty string is just as unusable as None (and used to crash on userArg[-1])
        if not userArg:
            print(
                "ERROR: orthofinder_results_directory has not been specified")
            util.Fail()

        if os.path.isfile(userArg):
            # user has specified specific results file
            fn = os.path.basename(userArg)
            if ("clusters_OrthoFinder_" not in fn) or ("txt_id_pairs.txt"
                                                       not in fn):
                print(
                    "ERROR:\n    %s\nis neither a directory or a clusters_OrthoFinder_*.txt_id_pairs.txt file."
                    % userArg)
                util.Fail()
            # A bare filename has an empty directory part; without the
            # fallback to os.curdir the result would be os.sep alone, i.e.
            # the filesystem root rather than the current directory.
            orthofinderWorkingDir = (os.path.dirname(userArg)
                                     or os.curdir) + os.sep
            if not self._IsWorkingDirectory(orthofinderWorkingDir):
                print(
                    "ERROR: cannot find files from OrthoFinder run in directory:\n   %s"
                    % orthofinderWorkingDir)
                util.Fail()
            print("\nUsing orthogroups in file:\n    %s" % userArg)
            return orthofinderWorkingDir, orthofinderWorkingDir, userArg

        # Otherwise treat userArg as a directory; normalise to a trailing separator
        if not userArg.endswith(os.path.sep):
            userArg += os.path.sep

        # find required files: try the directory itself, then its WorkingDirectory
        orthofinderWorkingDir = userArg
        if not self._IsWorkingDirectory(orthofinderWorkingDir):
            orthofinderWorkingDir = userArg + "WorkingDirectory" + os.sep
            if not self._IsWorkingDirectory(orthofinderWorkingDir):
                print(
                    "ERROR: cannot find files from OrthoFinder run in directory:\n   %s\nor\n   %s\n"
                    % (userArg, orthofinderWorkingDir))
                util.Fail()

        # identify orthogroups file
        orthogroupPatterns = ("OrthologousGroups*.txt", "Orthogroups*.txt")

        def FindOrthogroupFiles(directory):
            # Both the old and the new naming of the tab-delimited results file
            return [
                fn for pattern in orthogroupPatterns
                for fn in glob.glob(directory + pattern)
            ]

        clustersFiles = glob.glob(orthofinderWorkingDir +
                                  "clusters_OrthoFinder_*.txt_id_pairs.txt")
        orthogroupFiles = FindOrthogroupFiles(orthofinderWorkingDir)
        if orthofinderWorkingDir != userArg:
            orthogroupFiles += FindOrthogroupFiles(userArg)
        # User may have specified a WorkingDirectory and results could be in directory above
        if len(orthogroupFiles) < len(clustersFiles):
            orthogroupFiles += FindOrthogroupFiles(userArg + ".." + os.sep)
        clustersFiles = sorted(clustersFiles)
        orthogroupFiles = sorted(orthogroupFiles)

        if len(clustersFiles) > 1 or len(orthogroupFiles) > 1:
            print("ERROR: Results from multiple OrthoFinder runs found\n")
            print(
                "Tab-delimiter Orthogroups*.txt/OrthologousGroups*.txt files:"
            )
            for fn in orthogroupFiles:
                print("    " + fn)
            print("With corresponding cluster files:")
            for fn in clustersFiles:
                print("    " + fn)
            print(
                "\nPlease run with only one set of results in directories or specifiy the specific clusters_OrthoFinder_*.txt_id_pairs.txt file on the command line"
            )
            util.Fail()

        if len(clustersFiles) != 1 or len(orthogroupFiles) != 1:
            print(
                "ERROR: Results not found in <orthofinder_results_directory> or <orthofinder_results_directory>/WorkingDirectory"
            )
            print(
                "\nCould not find:\n    Orthogroups*.txt/OrthologousGroups*.txt\nor\n    clusters_OrthoFinder_*.txt_id_pairs.txt"
            )
            util.Fail()

        print("\nUsing orthogroups in file:\n    %s" % orthogroupFiles[0])
        print("and corresponding clusters file:\n    %s" %
              clustersFiles[0])
        return orthofinderWorkingDir, userArg, clustersFiles[0]
Exemple #13
0
    def _ProcessLog(self, logFN):
        """
        Get all relevant data from log file.
        Checks the paths saved do exist still
        Should work with relative paths to allow directory to move
        Other methods can then check that the data required for a particular run is available

        Sets (when the corresponding line is present in the log):
          - self.species_ids_lines: the lines following "Species used:" up to
            the next blank line (or the end of the file)
          - self.wd_base_prev: self.GetWDBaseChain() of the base working directory
          - self.clustersFilename_pairs: path of the orthogroups clusters file
          - self.wd_trees and self.speciesTreeRootedIDsFN

        If a logged path no longer exists, the same final path components are
        tried relative to the directory containing logFN; if that is missing
        too an error is printed and util.Fail() is called.
        """
        wd_base_str = "WorkingDirectory_Base: "
        wd_trees_str = "WorkingDirectory_Trees: "
        clusters_str = "FN_Orthogroups: "
        log_dir = os.path.split(logFN)[0]
        # NOTE(review): comparing lines read in binary mode with str prefixes
        # relies on Python 2 string semantics - confirm before porting.
        with open(logFN, 'rb') as infile:
            for line in infile:
                if line.startswith("Species used:"):
                    self.species_ids_lines = ""
                    # next(..., "") yields "" at end of file so a species
                    # block that is the last thing in the log ends the loop
                    # instead of raising StopIteration.
                    line = next(infile, "")
                    while line.rstrip() != "":
                        self.species_ids_lines += line
                        line = next(infile, "")
                if line.startswith(wd_base_str):
                    wd_base_anchor = line.rstrip()[len(wd_base_str):]
                    if not os.path.exists(wd_base_anchor):
                        # try to see if it's a relative directory to current one
                        path, d_wd = os.path.split(wd_base_anchor[:-1])
                        path, d_res = os.path.split(path)
                        wd_base_anchor = log_dir + (
                            "/../%s/%s/" % (d_res, d_wd))
                        if not os.path.exists(wd_base_anchor):
                            print("ERROR: Missing directory: %s" %
                                  wd_base_anchor)
                            util.Fail()
                    self.wd_base_prev = self.GetWDBaseChain(wd_base_anchor)
                if line.startswith(clusters_str):
                    clusters_fn_full_path = line.rstrip()[len(clusters_str):]
                    self.clustersFilename_pairs = clusters_fn_full_path
                    if not os.path.exists(self.clustersFilename_pairs):
                        # try to see if it's a relative directory to current one
                        path, clusters_fn = os.path.split(
                            self.clustersFilename_pairs)
                        path, d_wd = os.path.split(path)
                        path, d_res = os.path.split(path)
                        self.clustersFilename_pairs = log_dir + (
                            "/../%s/%s/%s" % (d_res, d_wd, clusters_fn))
                        if not os.path.exists(self.clustersFilename_pairs):
                            print("ERROR: Missing orthogroups file: %s or %s" %
                                  (self.clustersFilename_pairs,
                                   clusters_fn_full_path))
                            util.Fail()
                if line.startswith(wd_trees_str):
                    self.wd_trees = line.rstrip()[len(wd_trees_str):]
                    if not os.path.exists(self.wd_trees):
                        # try to see if it's a relative directory to current one
                        path, d_wd = os.path.split(self.wd_trees[:-1])
                        path, d_res = os.path.split(path)
                        self.wd_trees = log_dir + (
                            "/../%s/%s/" % (d_res, d_wd))
                        if not os.path.exists(self.wd_trees):
                            print("ERROR: Missing directory: %s" %
                                  self.wd_trees)
                            util.Fail()
                    # Derive the species tree path only once wd_trees has been
                    # resolved, so it follows the directory if it was relocated.
                    self.speciesTreeRootedIDsFN = self.wd_trees + "SpeciesTree_rooted_ids.txt"
Exemple #14
0
 def LogFailAndExit(self, text=""):
     """
     Print text (unless it is the empty string), write an error entry
     containing it to the log via self.WriteToLog, then call util.Fail().
     """
     if text != "":
         print(text)
     log_entry = "\nERROR: An error occurred\n" + text
     self.WriteToLog(log_entry)
     util.Fail()