コード例 #1
0
def one_to_one_efficient(orthodict, genenumbers, speciesLabels, iSpecies,
                         outputDir):
    """Build, pickle and return the orthologue matrices for one species.

    For every species ordinal j below iSpecies a sparse int8 matrix of shape
    (genenumbers[iSpecies], genenumbers[j]) is created; entry [iGene, jGene]
    is set to 1 when orthodict lists gene jGene of species j for gene iGene
    of species iSpecies. Positions j >= iSpecies hold None.

    orthodict     - maps "<speciesLabel>_<geneIndex>" to an iterable of
                    identifiers written in the same format
    genenumbers   - indexable by species ordinal; supplies matrix dimensions
    speciesLabels - ordered list of the species labels; a label's position
                    in this list is its ordinal
    iSpecies      - ordinal (not the label) of the query species
    outputDir     - prefix to which 'ortholog_<i>_<j>_matrix.pic' is
                    appended directly (no path separator is inserted)

    Every list entry, including the None placeholders, is pickled to its own
    file. Returns the list, one entry per species.
    """
    util.PrintTime("Processing orthologues for species %d" % iSpecies)
    species_count = len(speciesLabels)
    # Reverse lookup: species label -> ordinal position
    ordinal_of_label = {}
    for ordinal, label in enumerate(speciesLabels):
        ordinal_of_label[label] = ordinal
    # Only lower-indexed partner species get a matrix; None keeps the list
    # positions aligned with the species ordinals.
    matrixlist = [
        sparse.lil_matrix((genenumbers[iSpecies], genenumbers[other]),
                          dtype=np.dtype(np.int8))
        if other < iSpecies else None
        for other in range(species_count)
    ]
    # Collect the keys that belong to the query species
    query_genes = []
    for key in orthodict:
        if key.startswith('%d_' % speciesLabels[iSpecies]):
            query_genes.append(key)
    # Mark each (query gene, partner gene) pair in the partner's matrix
    for query in query_genes:
        _, iGene = [int(token) for token in query.split('_')]
        for hit in orthodict[query]:
            hit_label, jGene = [int(token) for token in hit.split('_')]
            jSp = ordinal_of_label[hit_label]
            if jSp < iSpecies:
                matrixlist[jSp][iGene, jGene] = 1
    for j, matrix in enumerate(matrixlist):
        pickle_path = outputDir + 'ortholog_%d_%d_matrix.pic' % (iSpecies, j)
        with open(pickle_path, 'wb') as handle:
            pic.dump(matrix, handle)
    return matrixlist
コード例 #2
0
 def RunAnalysis(self, qSpeciesTree=True):
     """Compute gene distances, then infer the gene trees and a species tree.

     Steps, in order: build and write the orthogroup distance matrices,
     prepare the gene tree commands, decide how the species tree is obtained
     (a directly written tree for fewer than four species, STAG when
     EnoughOGsForSTAG allows it, otherwise a distance-based fallback whose
     command is placed ahead of the gene tree commands), run all commands,
     and finally write renamed copies of the trees via util.RenameTreeTaxa.

     qSpeciesTree - when true the species tree is renamed as well and its
                    IDs-format filename is returned

     Returns (spTreeFN_ids, qSTAG) if qSpeciesTree is true, else (None, qSTAG).
     """
     fh = files.FileHandler
     util.PrintUnderline("Calculating gene distances")
     ogs, partial_matrices = self.GetOGMatrices_FullParallel()
     ogMatrices = self.CompleteAndWriteOGMatrices(ogs, partial_matrices)
     util.PrintTime("Done")
     tree_commands = self.PrepareGeneTreeCommand()
     if len(self.ogSet.seqsInfo.speciesToUse) >= 4:
         qSTAG = self.EnoughOGsForSTAG(ogs, self.ogSet.seqsInfo.speciesToUse)
         if not qSTAG:
             print("Using fallback species tree inference method")
             D, spPairs = self.SpeciesTreeDistances(ogs, ogMatrices)
             cmd_spTree, spTreeFN_ids = self.PrepareSpeciesTreeCommand(D, spPairs)
             # The species tree command goes first in the command lists
             tree_commands = [[cmd_spTree]] + tree_commands
     else:
         # Fewer than four species: the species tree is written directly
         qSTAG = False
         spTreeFN_ids = fh.GetSpeciesTreeUnrootedFN()
         WriteSpeciesTreeIDs_TwoThree(self.ogSet.seqsInfo.speciesToUse, spTreeFN_ids)
     util.PrintUnderline("Inferring gene and species trees")
     util.RunParallelOrderedCommandLists(self.nProcesses, tree_commands)
     if qSTAG:
         # STAG runs on the gene trees, so they must have been completed
         print("")
         spTreeFN_ids = fh.GetSpeciesTreeUnrootedFN()
         stag.Run_ForOrthoFinder(fh.GetOGsTreeDir(),
                                 fh.GetWorkingDirectory_Write(),
                                 self.ogSet.seqsInfo.speciesToUse,
                                 spTreeFN_ids)
     seqDict = self.ogSet.Spec_SeqDict()
     for iog in xrange(len(self.ogSet.OGs())):
         util.RenameTreeTaxa(fh.GetOGsTreeFN(iog),
                             fh.GetOGsTreeFN(iog, True),
                             seqDict,
                             qSupport=False,
                             qFixNegatives=True)
     if not qSpeciesTree:
         return None, qSTAG
     util.RenameTreeTaxa(spTreeFN_ids,
                         fh.GetSpeciesTreeUnrootedFN(True),
                         self.ogSet.SpeciesDict(),
                         qSupport=False,
                         qFixNegatives=True)
     return spTreeFN_ids, qSTAG
コード例 #3
0
 def GetOGMatrices(self):
     """
     Build one square score matrix per orthogroup from the "Bit" matrices.

     ogMatrices contains matrix M for each OG where:
         Mij = 0.5*max(Bij, Bmin_i)/Bmax_i

     Warnings raised while computing are suppressed.
     Returns (ogs, ogMatrices), with ogMatrices[k] of size len(ogs[k]) squared.
     """
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         ogs = self.ogSet.OGs()
         # For each OG, and each species in use, the (gene, position in OG)
         # pairs of that species' genes
         ogsPerSpecies = []
         for og in ogs:
             members_by_species = []
             for iSp in self.ogSet.seqsInfo.speciesToUse:
                 members_by_species.append(
                     [(gene, pos) for pos, gene in enumerate(og) if gene.iSp == iSp])
             ogsPerSpecies.append(members_by_species)
         og_sizes = [len(og) for og in ogs]
         nSeqs = self.ogSet.seqsInfo.nSeqsPerSpecies
         ogMatrices = [np.zeros((size, size)) for size in og_sizes]
         for iiSp, sp1 in enumerate(self.ogSet.seqsInfo.speciesToUse):
             util.PrintTime("Processing species %d" % sp1)
             Bs = []
             for sp2 in self.ogSet.seqsInfo.speciesToUse:
                 Bs.append(matrices.LoadMatrix("Bit", self.ogSet.fileInfo, sp1, sp2))
             # Per-sequence extremes taken across all partner species
             mins = np.ones((nSeqs[sp1], 1), dtype=np.float64)*9e99
             maxes = np.zeros((nSeqs[sp1], 1), dtype=np.float64)
             for B in Bs:
                 mins = np.minimum(mins, lil_min(B))
                 maxes = np.maximum(maxes, lil_max(B))
             for jjSp, B in enumerate(Bs):
                 for members_by_species, m in zip(ogsPerSpecies, ogMatrices):
                     for gi, i in members_by_species[iiSp]:
                         for gj, j in members_by_species[jjSp]:
                             floored = max(B[gi.iSeq, gj.iSeq], mins[gi.iSeq])
                             # inf if i doesn't hit anything but is hit
                             m[i, j] = 0.5*floored / maxes[gi.iSeq]
         return ogs, ogMatrices
コード例 #4
0
def DoOrthologuesForOrthoFinder_Phyldog(ogSet, workingDirectory, GeneToSpecies, output_dir, reconTreesRenamedDir):
    """Write orthologue files from the Phyldog reconciled trees.

    First creates one "Orthologues_<species>/" directory per species under
    output_dir, each holding a header-only '<sp1>__v__<sp2>.tsv' file for
    every other species. Then, for each orthogroup, extracts orthologues
    from its Phyldog tree (also recording rows in the duplications file),
    writes a renamed copy of the tree into reconTreesRenamedDir and appends
    the orthologues to the species-pair files.

    workingDirectory is accepted but not used here.
    Returns the accumulated orthologue counts, which start from
    util.nOrtho_sp(nspecies).
    """
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    # Directory and header-only file structure
    for index1, sp1 in enumerate(speciesIDs):
        name1 = speciesDict[str(sp1)]
        d = output_dir + "Orthologues_" + name1 + "/"
        if not os.path.exists(d):
            os.mkdir(d)
        for index2, sp2 in enumerate(speciesIDs):
            if index2 == index1:
                continue
            name2 = speciesDict[str(sp2)]
            with open(d + '%s__v__%s.tsv' % (name1, name2), 'wb') as outfile:
                header_writer = csv.writer(outfile, delimiter="\t")
                header_writer.writerow(("Orthogroup", name1, name2))
    nOgs = len(ogSet.OGs())
    # Progress is reported every 10, 100 or 1000 orthogroups depending on nOgs
    if nOgs <= 200:
        report_every = 10
    elif nOgs <= 2000:
        report_every = 100
    else:
        report_every = 1000
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    with open(files.FileHandler.GetDuplicationsFN(), 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            recon_tree = files.FileHandler.GetPhyldogOGResultsTreeFN(iog)
            orthologues = GetOrthologues_from_phyldog_tree(
                iog,
                recon_tree,
                GeneToSpecies,
                dupsWriter=dupWriter,
                seqIDs=ogSet.Spec_SeqDict(),
                spIDs=ogSet.SpeciesDict())
            renamed_tree_fn = reconTreesRenamedDir + "OG%07d_tree.txt" % iog
            util.RenameTreeTaxa(recon_tree,
                                renamed_tree_fn,
                                ogSet.Spec_SeqDict(),
                                qSupport=False,
                                qFixNegatives=True,
                                label='n')
            if iog % report_every == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            nOrthologues_SpPair += AppendOrthologuesToFiles(
                [(iog, orthologues)], speciesDict, ogSet.speciesToUse,
                SequenceDict, output_dir, False)
    return nOrthologues_SpPair
コード例 #5
0
def Worker_BlastScores(cmd_queue, seqsInfo, fileInfo, nProcesses, nToDo):
    """Worker loop: compute BLAST score matrices for queued jobs.

    Each queue item is (i, args). The scores come from
    BlastFileProcessor.GetBLAST6Scores(seqsInfo, fileInfo, *args) with
    self-hits kept, and are stored with matrices.DumpMatrix under the "Bit"
    name for (args[0], args[1]). Progress is printed every 10, 100 or 1000
    jobs depending on nToDo, using i - nProcesses + 1 as the count done.

    Returns once Queue.Empty is raised, i.e. when no item arrives within
    the one second timeout.
    """
    try:
        while True:
            job_index, args = cmd_queue.get(True, 1)
            nDone = job_index - nProcesses + 1
            if nDone >= 0:
                if nToDo <= 200:
                    report_every = 10
                elif nToDo <= 2000:
                    report_every = 100
                else:
                    report_every = 1000
                if nDone % report_every == 0:
                    util.PrintTime("Done %d of %d" % (nDone, nToDo))
            scores = BlastFileProcessor.GetBLAST6Scores(seqsInfo, fileInfo, *args, qExcludeSelfHits = False)
            matrices.DumpMatrix("Bit", scores, fileInfo, args[0], args[1])
    except Queue.Empty:
        return
コード例 #6
0
def Worker_BlastScores(cmd_queue, seqsInfo, fileInfo, nProcesses, nToDo):
    """Worker loop: compute BLAST score matrices and pickle them.

    Each queue item is (i, args). The scores come from
    BlastFileProcessor.GetBLAST6Scores(seqsInfo, fileInfo, *args) with
    self-hits kept, and are pickled to "Bit%d_%d.pic" % args inside
    fileInfo.workingDir using util.picProtocol. Progress is printed every
    10, 100 or 1000 jobs depending on nToDo, using i - nProcesses + 1 as the
    count done.

    Returns once Queue.Empty is raised, i.e. when no item arrives within
    the one second timeout.
    """
    try:
        while True:
            job_index, args = cmd_queue.get(True, 1)
            nDone = job_index - nProcesses + 1
            if nDone >= 0:
                if nToDo <= 200:
                    report_every = 10
                elif nToDo <= 2000:
                    report_every = 100
                else:
                    report_every = 1000
                if nDone % report_every == 0:
                    util.PrintTime("Done %d of %d" % (nDone, nToDo))
            scores = BlastFileProcessor.GetBLAST6Scores(seqsInfo, fileInfo, *args, qExcludeSelfHits = False)
            pickle_fn = fileInfo.workingDir + "Bit%d_%d.pic" % args
            with open(pickle_fn, 'wb') as outfile:
                pic.dump(scores, outfile, protocol = util.picProtocol)
    except Queue.Empty:
        return
コード例 #7
0
ファイル: trees2ologs_of.py プロジェクト: pythseq/OrthoFinder
def DoOrthologuesForOrthoFinder(ogSet, treesIDsPatFn, species_tree_rooted_fn, GeneToSpecies, workingDir, output_dir, reconTreesRenamedDir, all_stride_dup_genes):
    """Infer orthologues from the gene trees and write the results files.

    Creates under output_dir a "Putative_Xenologues/" directory with one
    header-only '<species>.csv' per species, and one
    "Orthologues_<species>/" directory per species holding a header-only
    '<sp1>__v__<sp2>.csv' for every other species. Then, for each
    orthogroup, calls GetOrthologues_for_tree (which also receives the
    writer for "../Duplications.csv" relative to reconTreesRenamedDir),
    appends the names of suspect genes to each species'
    "Putative_Horizontal_Gene_Transfer.txt", writes a renamed copy of the
    reconciled tree and appends the orthologues to the species-pair files.

    ogSet                  - provides SpeciesDict, SequenceDict, Spec_SeqDict,
                             OGs and speciesToUse
    treesIDsPatFn          - callable giving the gene tree filename for an
                             orthogroup index
    species_tree_rooted_fn - filename of the rooted species tree
    GeneToSpecies          - passed through to GetOrthologues_for_tree
    workingDir             - accepted but not used here
    output_dir             - directory prefix (ending with a separator) for
                             the results
    reconTreesRenamedDir   - directory prefix for the renamed reconciled trees
    all_stride_dup_genes   - passed through to GetOrthologues_for_tree

    Returns the accumulated orthologue counts, which start from
    util.nOrtho_sp(nspecies).
    """
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    # Write directory and file structure
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    dSuspect = output_dir + "Putative_Xenologues/"
    if not os.path.exists(dSuspect): os.mkdir(dSuspect)
    for index1 in xrange(nspecies):
        with open(dSuspect + '%s.csv' % speciesDict[str(speciesIDs[index1])], 'wb') as outfile:
            writer1 = csv.writer(outfile, delimiter="\t")
            writer1.writerow(("Orthogroup", speciesDict[str(speciesIDs[index1])], "Other"))
        d = output_dir + "Orthologues_" + speciesDict[str(speciesIDs[index1])] + "/"
        if not os.path.exists(d): os.mkdir(d)
        for index2 in xrange(nspecies):
            if index2 == index1: continue
            with open(d + '%s__v__%s.csv' % (speciesDict[str(speciesIDs[index1])], speciesDict[str(speciesIDs[index2])]), 'wb') as outfile:
                writer1 = csv.writer(outfile, delimiter="\t")
                writer1.writerow(("Orthogroup", speciesDict[str(speciesIDs[index1])], speciesDict[str(speciesIDs[index2])]))
    # Infer orthologues and write them to file
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    neighbours = GetSpeciesNeighbours(species_tree_rooted)
    # Label nodes of species tree: root is N0, other internal nodes N1, N2, ...
    species_tree_rooted.name = "N0"
    iNode = 1
    for n in species_tree_rooted.traverse():
        if (not n.is_leaf()) and (not n.is_root()):
            n.name = "N%d" % iNode
            iNode += 1
    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    # list() so the keys can be indexed even where keys() returns a view.
    # NOTE(review): the loop below indexes the first nspecies keys of
    # speciesDict, not speciesIDs - confirm the two always correspond.
    species = list(speciesDict.keys())
    with open(reconTreesRenamedDir + "../Duplications.csv", 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            orthologues, recon_tree, suspect_genes = GetOrthologues_for_tree(iog, treesIDsPatFn(iog), species_tree_rooted, GeneToSpecies, neighbours, dupsWriter=dupWriter, seqIDs=ogSet.Spec_SeqDict(), spIDs=ogSet.SpeciesDict(), all_stride_dup_genes=all_stride_dup_genes)
            for index0 in xrange(nspecies):
                strsp0 = species[index0]
                strsp0_ = strsp0+"_"
                these_genes = [g for g in suspect_genes if g.startswith(strsp0_)]
                if these_genes:
                    with open(output_dir + "Orthologues_" + speciesDict[strsp0] + "/Putative_Horizontal_Gene_Transfer.txt", 'ab') as outfile:
                        # Write every suspect gene of this species. The
                        # previous code referenced `g` outside the
                        # comprehension, so at best only the last gene was
                        # written.
                        outfile.write("\n".join([SequenceDict[g] for g in these_genes]) + "\n")
            allOrthologues = [(iog, orthologues)]
            util.RenameTreeTaxa(recon_tree, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, ogSet.Spec_SeqDict(), qSupport=False, qFixNegatives=True, label='n')
            # Progress every 10, 100 or 1000 orthogroups depending on nOgs
            if iog >= 0 and divmod(iog, 10 if nOgs <= 200 else 100 if nOgs <= 2000 else 1000)[1] == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict, output_dir, True)
    return nOrthologues_SpPair
コード例 #8
0
def OrthologuesFromTrees(recon_method, nHighParallel, userSpeciesTree_fn, qAddSpeciesToIDs):
    """
    Just infer orthologues from trees, don't do any of the preceding steps.

    recon_method - passed to ReconciliationAndOrthologues
    nHighParallel - passed to ReconciliationAndOrthologues
    userSpeciesTree_fn - None if not supplied, otherwise rooted tree using user species names (not orthofinder IDs)
    qAddSpeciesToIDs - passed to OrthoGroupsSet

    When a user species tree is supplied it is checked against the names of
    the species in use and converted into the rooted species tree IDs file.

    Returns a message giving the species-by-species orthologues directory.
    """
    speciesToUse, nSpAll, _ = util.GetSpeciesToUse(files.FileHandler.GetSpeciesIDsFN())
    ogSet = OrthoGroupsSet(files.FileHandler.GetWorkingDirectory1_Read(), speciesToUse, nSpAll, qAddSpeciesToIDs, idExtractor = util.FirstWordExtractor)
    # Identity test: None is the "not supplied" marker for this argument
    if userSpeciesTree_fn is not None:
        speciesDict = files.FileHandler.GetSpeciesDict()
        speciesToUseNames = [speciesDict[str(iSp)] for iSp in ogSet.speciesToUse]
        CheckUserSpeciesTree(userSpeciesTree_fn, speciesToUseNames)
        speciesTreeFN_ids = files.FileHandler.GetSpeciesTreeIDsRootedFN()
        ConvertUserSpeciesTree(userSpeciesTree_fn, speciesDict, speciesTreeFN_ids)
    util.PrintUnderline("Running Orthologue Prediction", True)
    util.PrintUnderline("Reconciling gene and species trees")
    ReconciliationAndOrthologues(recon_method, ogSet, nHighParallel)
    util.PrintUnderline("Writing results files")
    util.PrintTime("Writing results files")
    files.FileHandler.CleanWorkingDir2()
    return "Species-by-species orthologues directory:\n   %s\n" % files.FileHandler.GetOrthologuesDirectory()
コード例 #9
0
def DoOrthologuesForOrthoFinder(ogSet, treesIDsPatFn, species_tree_rooted_fn, GeneToSpecies, workingDir, output_dir, reconTreesRenamedDir, all_stride_dup_genes):
    """Infer orthologues from all gene trees, then write them in one batch.

    Creates one "Orthologues_<species>/" directory per species under
    output_dir, each with a header-only '<sp1>__v__<sp2>.csv' for every
    other species. For each orthogroup, GetOrthologues_for_tree is called
    (it also receives the writer for "../Duplications.csv" relative to
    reconTreesRenamedDir) and a renamed copy of the reconciled tree is
    written. The collected orthologues are appended to the species-pair
    files after the loop.

    workingDir is accepted but not used here.
    Returns the accumulated orthologue counts, which start from
    util.nOrtho_sp(nspecies).
    """
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    # Directory and header-only file structure
    for index1, sp1 in enumerate(speciesIDs):
        name1 = speciesDict[str(sp1)]
        d = output_dir + "Orthologues_" + name1 + "/"
        if not os.path.exists(d):
            os.mkdir(d)
        for index2, sp2 in enumerate(speciesIDs):
            if index2 == index1:
                continue
            name2 = speciesDict[str(sp2)]
            with open(d + '%s__v__%s.csv' % (name1, name2), 'wb') as outfile:
                header_writer = csv.writer(outfile, delimiter="\t")
                header_writer.writerow(("Orthogroup", name1, name2))
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    # Label the species tree: root is N0, other internal nodes N1, N2, ...
    species_tree_rooted.name = "N0"
    iNode = 1
    for node in species_tree_rooted.traverse():
        if node.is_leaf() or node.is_root():
            continue
        node.name = "N%d" % iNode
        iNode += 1
    nOgs = len(ogSet.OGs())
    # Progress is reported every 10, 100 or 1000 orthogroups depending on nOgs
    if nOgs <= 200:
        report_every = 10
    elif nOgs <= 2000:
        report_every = 100
    else:
        report_every = 1000
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    allOrthologues = []
    with open(reconTreesRenamedDir + "../Duplications.csv", 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            orthologues, recon_tree = GetOrthologues_for_tree(
                iog,
                treesIDsPatFn(iog),
                species_tree_rooted,
                GeneToSpecies,
                dupsWriter=dupWriter,
                seqIDs=ogSet.Spec_SeqDict(),
                spIDs=ogSet.SpeciesDict(),
                all_stride_dup_genes=all_stride_dup_genes)
            allOrthologues.append((iog, orthologues))
            renamed_tree_fn = reconTreesRenamedDir + "OG%07d_tree.txt" % iog
            util.RenameTreeTaxa(recon_tree,
                                renamed_tree_fn,
                                ogSet.Spec_SeqDict(),
                                qFixNegatives=True,
                                label='n')
            if iog % report_every == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
    nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict, output_dir)
    return nOrthologues_SpPair
コード例 #10
0
def ReconciliationAndOrthologues(recon_method, ogSet, nParallel, iSpeciesTree=None, all_stride_dup_genes=None):
    """
    Reconcile gene trees with the rooted species tree and write orthologues.

    recon_method - any value containing "dlcpar" runs DLCpar (the exact value
                   "dlcpar_convergedsearch" switches on its deep search),
                   "phyldog" uses the Phyldog results, anything else runs the
                   OrthoFinder method ("only_overlap" sets qNoRecon)
    ogSet - info about the orthogroups, species etc
    nParallel - parallelism passed to DLCpar
    iSpeciesTree - which of the potential roots of the species tree is this;
                   only used in a progress message, None omits it
    all_stride_dup_genes - passed through to the OrthoFinder method

    All input and output locations are obtained from files.FileHandler.
    The per-species-pair orthologue counts, including those from
    TwoAndThreeGeneOrthogroups, are handed to WriteOrthologuesStats.
    Returns None.
    """
    speciesTree_ids_fn = files.FileHandler.GetSpeciesTreeIDsRootedFN()
    labeled_tree_fn = files.FileHandler.GetSpeciesTreeResultsNodeLabelsFN()
    # Write the species tree with node labels into the results
    util.RenameTreeTaxa(speciesTree_ids_fn, labeled_tree_fn, ogSet.SpeciesDict(), qSupport=False, qFixNegatives=True, label='N')
    workingDir = files.FileHandler.GetWorkingDirectory_Write()    # workingDir - Orthologues working dir
    resultsDir_ologs = files.FileHandler.GetOrthologuesDirectory()
    reconTreesRenamedDir = files.FileHandler.GetOGsReconTreeDir(True)
    if "dlcpar" in recon_method:
        qDeepSearch = (recon_method == "dlcpar_convergedsearch")
        util.PrintTime("Starting DLCpar")
        dlcparResultsDir, dlcparLocusTreePat = trees2ologs_dlcpar.RunDlcpar(ogSet, speciesTree_ids_fn, workingDir, nParallel, qDeepSearch)
        util.PrintTime("Done DLCpar")
        spec_seq_dict = ogSet.Spec_SeqDict()
        for iog in xrange(len(ogSet.OGs())):
            util.RenameTreeTaxa(dlcparResultsDir + dlcparLocusTreePat % iog, files.FileHandler.GetOGsReconTreeFN(iog), spec_seq_dict, qSupport=False, qFixNegatives=False, inFormat=8, label='n')

        # Orthologue lists
        # Identity test: None means no root index should be reported
        root_note = " (root %d)" % iSpeciesTree if iSpeciesTree is not None else ""
        util.PrintUnderline("Inferring orthologues from gene trees" + root_note)
        pickleDir = files.FileHandler.GetPickleDir()
        nOrthologues_SpPair = trees2ologs_dlcpar.create_orthologue_lists(ogSet, resultsDir_ologs, dlcparResultsDir, pickleDir)

    elif "phyldog" == recon_method:
        util.PrintTime("Starting Orthologues from Phyldog")
        nOrthologues_SpPair = trees2ologs_of.DoOrthologuesForOrthoFinder_Phyldog(ogSet, workingDir, trees2ologs_of.GeneToSpecies_dash, resultsDir_ologs, reconTreesRenamedDir)
        util.PrintTime("Done Orthologues from Phyldog")
    else:
        util.PrintTime("Starting OF Orthologues")
        qNoRecon = ("only_overlap" == recon_method)
        nOrthologues_SpPair = trees2ologs_of.DoOrthologuesForOrthoFinder(ogSet, speciesTree_ids_fn, trees2ologs_of.GeneToSpecies_dash, all_stride_dup_genes, qNoRecon)
        util.PrintTime("Done OF Orthologues")
    nOrthologues_SpPair += TwoAndThreeGeneOrthogroups(ogSet, resultsDir_ologs)
    WriteOrthologuesStats(ogSet, nOrthologues_SpPair)
コード例 #11
0
def DoOrthologuesForOrthoFinder(ogSet, species_tree_rooted_fn, GeneToSpecies,
                                all_stride_dup_genes, qNoRecon):
    """
    Root each gene tree, infer orthologues from it and write the results.

    ogSet - provides SpeciesDict, SequenceDict, Spec_SeqDict, OGs and
            speciesToUse
    species_tree_rooted_fn - filename of the rooted species tree
    GeneToSpecies - passed to CheckAndRootTree and GetOrthologues_from_tree
    all_stride_dup_genes - passed to GetOrthologues_from_tree
    qNoRecon - passed to GetOrthologues_from_tree

    All output locations come from files.FileHandler. Orthogroups for which
    CheckAndRootTree returns no tree are skipped entirely. The suspect-gene
    output files are only initialised once the first orthogroup containing
    suspect genes is encountered.

    Returns the accumulated orthologue counts, which start from
    util.nOrtho_sp(nspecies).
    """
    # Create directory structure
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    # Write directory and file structure
    qInitialisedSuspectGenesDirs = False
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    dResultsOrthologues = files.FileHandler.GetOrthologuesDirectory()
    # One directory per species, holding a header-only file for each other species
    for index1 in xrange(nspecies):
        d = dResultsOrthologues + "Orthologues_" + speciesDict[str(
            speciesIDs[index1])] + "/"
        if not os.path.exists(d): os.mkdir(d)
        for index2 in xrange(nspecies):
            if index2 == index1: continue
            with open(
                    d + '%s__v__%s.tsv' %
                (speciesDict[str(speciesIDs[index1])], speciesDict[str(
                    speciesIDs[index2])]), 'wb') as outfile:
                writer1 = csv.writer(outfile, delimiter="\t")
                writer1.writerow(
                    ("Orthogroup", speciesDict[str(speciesIDs[index1])],
                     speciesDict[str(speciesIDs[index2])]))
    # Infer orthologues and write them to file
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    neighbours = GetSpeciesNeighbours(species_tree_rooted)
    # Label nodes of species tree: root is N0, other internal nodes N1, N2, ...
    species_tree_rooted.name = "N0"
    iNode = 1
    for n in species_tree_rooted.traverse():
        if (not n.is_leaf()) and (not n.is_root()):
            n.name = "N%d" % iNode
            iNode += 1
    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    # NOTE(review): indexed below by position, which needs a list (Python 2
    # keys()); the first nspecies keys are used rather than speciesIDs -
    # confirm the two always correspond.
    species = speciesDict.keys()
    reconTreesRenamedDir = files.FileHandler.GetOGsReconTreeDir(True)
    spec_seq_dict = ogSet.Spec_SeqDict()
    with open(files.FileHandler.GetDuplicationsFN(), 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow([
            "Orthogroup", "Species Tree Node", "Gene Tree Node", "Support",
            "Type", "Genes 1", "Genes 2"
        ])
        for iog in xrange(nOgs):
            rooted_tree_ids, qHaveSupport = CheckAndRootTree(
                files.FileHandler.GetOGsTreeFN(iog), species_tree_rooted,
                GeneToSpecies)  # this can be parallelised easily
            # No rooted tree for this orthogroup: nothing to write, and no
            # progress message is printed for it either
            if rooted_tree_ids is None: continue
            # Write rooted tree with accessions
            util.RenameTreeTaxa(rooted_tree_ids,
                                files.FileHandler.GetOGsTreeFN(iog, True),
                                spec_seq_dict,
                                qSupport=qHaveSupport,
                                qFixNegatives=True,
                                qViaCopy=True)
            orthologues, recon_tree, suspect_genes = GetOrthologues_from_tree(
                iog,
                rooted_tree_ids,
                species_tree_rooted,
                GeneToSpecies,
                neighbours,
                dupsWriter=dupWriter,
                seqIDs=spec_seq_dict,
                spIDs=ogSet.SpeciesDict(),
                all_stride_dup_genes=all_stride_dup_genes,
                qNoRecon=qNoRecon)
            qContainsSuspectGenes = len(suspect_genes) > 0
            # Lazy, one-off initialisation: dSuspectGenes is only bound here,
            # and is only read further down when suspect genes are present
            if (not qInitialisedSuspectGenesDirs) and qContainsSuspectGenes:
                qInitialisedSuspectGenesDirs = True
                dSuspectGenes = files.FileHandler.GetSuspectGenesDir()
                dSuspectOrthologues = files.FileHandler.GetPutativeXenelogsDir(
                )
                for index1 in xrange(nspecies):
                    with open(
                            dSuspectOrthologues +
                            '%s.tsv' % speciesDict[str(speciesIDs[index1])],
                            'wb') as outfile:
                        writer1 = csv.writer(outfile, delimiter="\t")
                        writer1.writerow(
                            ("Orthogroup",
                             speciesDict[str(speciesIDs[index1])], "Other"))
            # Append this orthogroup's suspect genes to the per-species lists;
            # genes are assigned to a species by their "<speciesID>_" prefix
            for index0 in xrange(nspecies):
                strsp0 = species[index0]
                strsp0_ = strsp0 + "_"
                these_genes = [
                    g for g in suspect_genes if g.startswith(strsp0_)
                ]
                if len(these_genes) > 0:
                    with open(dSuspectGenes + speciesDict[strsp0] + ".txt",
                              'ab') as outfile:
                        outfile.write(
                            "\n".join([SequenceDict[g]
                                       for g in these_genes]) + "\n")
            allOrthologues = [(iog, orthologues)]
            # don't relabel nodes, they've already been done
            util.RenameTreeTaxa(recon_tree,
                                reconTreesRenamedDir + "OG%07d_tree.txt" % iog,
                                spec_seq_dict,
                                qSupport=False,
                                qFixNegatives=True)
            # Progress every 10, 100 or 1000 orthogroups depending on nOgs
            if iog >= 0 and divmod(
                    iog, 10
                    if nOgs <= 200 else 100 if nOgs <= 2000 else 1000)[1] == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            nOrthologues_SpPair += AppendOrthologuesToFiles(
                allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict,
                dResultsOrthologues, qContainsSuspectGenes)
    return nOrthologues_SpPair
コード例 #12
0
def OrthologuesWorkflow(speciesToUse, nSpAll, 
                       tree_options,
                       msa_method,
                       tree_method,
                       recon_method,
                       nHighParallel,
                       nLowParrallel,
                       qDoubleBlast,
                       qAddSpeciesToIDs,
                       userSpeciesTree = None, 
                       qStopAfterSeqs = False,
                       qStopAfterAlign = False,
                       qStopAfterTrees = False, 
                       qMSA = False,
                       qPhyldog = False,
                       results_name = ""):
    """
    Run the workflow from gene trees through to orthologues.

    1. Setup:
        - ogSet, directories
        - DendroBLASTTrees - object
    2. DendroBLAST:
        - read scores
        - RunAnalysis: Get distance matrices, do trees
    3. Root species tree
    4. Reconciliation/Orthologues
    5. Clean up
    
    Variables:
    - ogSet - all the relevant information about the orthogroups, species etc.

    Parameters:
    - speciesToUse, nSpAll, qAddSpeciesToIDs - passed to OrthoGroupsSet
    - tree_options, msa_method, tree_method - passed to trees_msa.TreesForOrthogroups
    - recon_method - passed to ReconciliationAndOrthologues
    - nHighParallel, nLowParrallel - parallelism settings; the latter is given
      to DendroBLASTTrees, the former to the other steps
    - qDoubleBlast - passed to DendroBLASTTrees
    - userSpeciesTree - user-supplied species tree filename, or None
    - qStopAfterSeqs, qStopAfterAlign, qStopAfterTrees - return early after
      that stage (the first two are only checked on the MSA/Phyldog path)
    - qMSA, qPhyldog - infer trees from multiple sequence alignments rather
      than with DendroBLAST; qPhyldog additionally runs Phyldog
    - results_name - passed to FileHandler.MakeResultsDirectory2

    Returns a string describing where the results files are.
    """
    ogSet = OrthoGroupsSet(files.FileHandler.GetWorkingDirectory1_Read(), speciesToUse, nSpAll, qAddSpeciesToIDs, idExtractor = util.FirstWordExtractor)
    
    tree_generation_method = "msa" if qMSA or qPhyldog else "dendroblast"
    stop_after = "seqs" if qStopAfterSeqs else "align" if qStopAfterAlign else ""
    files.FileHandler.MakeResultsDirectory2(tree_generation_method, stop_after, results_name)    
    """ === 1 === ust = UserSpeciesTree
    MSA:               Sequences    Alignments                        GeneTrees    db    SpeciesTree
    Phyldog:           Sequences    Alignments                        GeneTrees    db    SpeciesTree  
    Dendroblast:                                  DistanceMatrices    GeneTrees    db    SpeciesTree
    MSA (ust):         Sequences    Alignments                        GeneTrees    db
    Phyldog (ust):     Sequences    Alignments                        GeneTrees    db      
    Dendroblast (ust):                            DistanceMatrices    GeneTrees    db        
    """
    # Always False here, so the DendroBLAST-only species tree branch below
    # is never taken on the MSA/Phyldog path
    qDB_SpeciesTree = False
    if userSpeciesTree:
        util.PrintUnderline("Using user-supplied species tree") 
        spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        ConvertUserSpeciesTree(userSpeciesTree, ogSet.SpeciesDict(), spTreeFN_ids)
    
    if qMSA or qPhyldog:
        qLessThanFourSpecies = len(ogSet.seqsInfo.speciesToUse) < 4
        treeGen = trees_msa.TreesForOrthogroups(tree_options, msa_method, tree_method)       
        if (not userSpeciesTree) and qLessThanFourSpecies:
            # Too few species to infer a species tree: write it directly
            spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
            WriteSpeciesTreeIDs_TwoThree(ogSet.seqsInfo.speciesToUse, spTreeFN_ids)
            util.RenameTreeTaxa(spTreeFN_ids, files.FileHandler.GetSpeciesTreeUnrootedFN(True), ogSet.SpeciesDict(), qSupport=False, qFixNegatives=True)
        qDoMSASpeciesTree = (not qLessThanFourSpecies) and (not userSpeciesTree)
        util.PrintTime("Starting MSA/Trees")
        seqs_alignments_dirs = treeGen.DoTrees(ogSet.OGs(qInclAll=True), ogSet.OrthogroupMatrix(), ogSet.Spec_SeqDict(), ogSet.SpeciesDict(), ogSet.speciesToUse, nHighParallel, qStopAfterSeqs, qStopAfterAlign or qPhyldog, qDoSpeciesTree=qDoMSASpeciesTree) 
        util.PrintTime("Done MSA/Trees")
        if qDoMSASpeciesTree:
            spTreeFN_ids = files.FileHandler.GetSpeciesTreeUnrootedFN()
        # Early exits requested by the caller
        if qStopAfterSeqs:
            print("")
            return ("\nSequences for orthogroups:\n   %s\n" % seqs_alignments_dirs[0])
        elif qStopAfterAlign:
            print("")
            st = "\nSequences for orthogroups:\n   %s\n" % seqs_alignments_dirs[0]
            st += "\nMultiple sequence alignments:\n   %s\n" % seqs_alignments_dirs[1]
            return st
        db = DendroBLASTTrees(ogSet, nLowParrallel, qDoubleBlast)
        if qDB_SpeciesTree and not userSpeciesTree and not qLessThanFourSpecies:
            util.PrintUnderline("Inferring species tree (calculating gene distances)")
            print("Loading BLAST scores")
            spTreeFN_ids = db.SpeciesTreeOnly()
        if qPhyldog:
#            util.PrintTime("Do species tree for phyldog")
#            spTreeFN_ids, spTreeUnrootedFN = db.SpeciesTreeOnly()
            # NOTE(review): this rebinds userSpeciesTree to the return value
            # of ConvertUserSpeciesTree, which later truthiness tests then
            # see - confirm what that function returns.
            if userSpeciesTree: 
                userSpeciesTree = ConvertUserSpeciesTree(userSpeciesTree, ogSet.SpeciesDict(), files.FileHandler.GetSpeciesTreeUnrootedFN())
            util.PrintTime("Starting phyldog")
            species_tree_ids_labelled_phyldog = wrapper_phyldog.RunPhyldogAnalysis(files.FileHandler.GetPhyldogWorkingDirectory(), ogSet.OGs(), speciesToUse, nHighParallel)
    else:
        db = DendroBLASTTrees(ogSet, nLowParrallel, qDoubleBlast)
        spTreeFN_ids, qSTAG = db.RunAnalysis()
    files.FileHandler.LogWorkingDirectoryTrees()
    # qSTAG is only bound on the DendroBLAST path; the conditional expression
    # only evaluates it when that path was taken
    qSpeciesTreeSupports = False if (userSpeciesTree or qMSA or qPhyldog) else qSTAG
    """
    SpeciesTree
    spTreeFN_ids, or equivalently FileHandler.GetSpeciesTreeUnrootedFN() in all cases (user, inferred etc)
    Thus, we always have the species tree ids format
    
    With phyldog, we also have species_tree_ids_labelled_phyldog - with the node labels given by phyldog
    """    
    
    """ === 2 ===
    Check can continue with analysis 
    """
#    if len(ogSet.speciesToUse) < 4: 
#        print("ERROR: Not enough species to infer species tree")
#        util.Fail()
     
    """ === 3 ===
    MSA:               RootSpeciesTree
    Phyldog:           RootSpeciesTree    
    Dendroblast:       RootSpeciesTree  
    MSA (ust):         ConvertSpeciesTreeIDs
    Phyldog (ust):     ConvertSpeciesTreeIDs
    Dendroblast (ust): ConvertSpeciesTreeIDs
    """    
    # The first three branches each provide a single, already-rooted tree;
    # only the final branch runs STRIDE and may yield several candidate roots
    if qPhyldog:
        rootedSpeciesTreeFN = [species_tree_ids_labelled_phyldog]
        roots = [None]
        qMultiple = False
        all_stride_dup_genes = None
    elif userSpeciesTree:
        rootedSpeciesTreeFN = [spTreeFN_ids]
        roots = [None]
        qMultiple = False
        all_stride_dup_genes = None
    elif len(ogSet.seqsInfo.speciesToUse) == 2:
        hardcodeSpeciesTree = GetSpeciesTreeRoot_TwoTaxa(ogSet.seqsInfo.speciesToUse)
        rootedSpeciesTreeFN = [hardcodeSpeciesTree]
        roots = [None]
        qMultiple = False
        all_stride_dup_genes = None
    else:
        util.PrintUnderline("Best outgroup(s) for species tree") 
        util.PrintTime("Starting STRIDE")
        roots, clusters_counter, rootedSpeciesTreeFN, nSupport, _, _, all_stride_dup_genes = stride.GetRoot(spTreeFN_ids, files.FileHandler.GetOGsTreeDir(), stride.GeneToSpecies_dash, nHighParallel, qWriteRootedTree=True)
        util.PrintTime("Done STRIDE")
        nAll = sum(clusters_counter.values())
        nFP_mp = nAll - nSupport
        n_non_trivial = sum([v for k, v in clusters_counter.items() if len(k) > 1])
        if len(roots) > 1:
            print("Observed %d well-supported, non-terminal duplications. %d support the best roots and %d contradict them." % (n_non_trivial, n_non_trivial-nFP_mp, nFP_mp))
            print("Best outgroups for species tree:")  
        else:
            print("Observed %d well-supported, non-terminal duplications. %d support the best root and %d contradict it." % (n_non_trivial, n_non_trivial-nFP_mp, nFP_mp))
            print("Best outgroup for species tree:")  
        # spDict is only bound in this branch; its later use is guarded by a
        # condition that matches this branch having been taken
        spDict = ogSet.SpeciesDict()
        for r in roots: print("  " + (", ".join([spDict[s] for s in r]))  )
        qMultiple = len(roots) > 1
    # The first rooted tree becomes the rooted species tree IDs file
    shutil.copy(rootedSpeciesTreeFN[0], files.FileHandler.GetSpeciesTreeIDsRootedFN())
        
    """
    SpeciesTree:
    We now have a list of rooted species trees: rootedSpeciesTreeFN (this should be recorded by the file handler)
    """
        
    if qStopAfterTrees:
        if userSpeciesTree:
            st = ""
            if qMSA:
                st += "\nSequences for orthogroups:\n   %s\n" % seqs_alignments_dirs[0]
                st += "\nMultiple sequence alignments:\n   %s\n" % seqs_alignments_dirs[1]
            st += "\nGene trees:\n   %s\n" % (files.FileHandler.GetResultsTreesDir())
            return st
        # otherwise, root species tree
        resultsSpeciesTrees = []
        for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            resultsSpeciesTrees.append(files.FileHandler.GetSpeciesTreeResultsFN(i, not qMultiple))
            util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qSupport=qSpeciesTreeSupports, qFixNegatives=True)
            labeled_tree_fn = files.FileHandler.GetSpeciesTreeResultsNodeLabelsFN()
            util.RenameTreeTaxa(speciesTree_fn, labeled_tree_fn, db.ogSet.SpeciesDict(), qSupport=False, qFixNegatives=True, label='N')
        files.FileHandler.CleanWorkingDir2()
        return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None, False)
    
    # Only the first candidate root is reconciled against the gene trees
    if qMultiple: util.PrintUnderline("\nMultiple potential species tree roots were identified, only one will be analyed.", True)
    resultsSpeciesTrees = []
    i = 0
    r = roots[0]
    speciesTree_fn = rootedSpeciesTreeFN[0]
    util.PrintUnderline("Reconciling gene trees and species tree")         
    resultsSpeciesTrees.append(files.FileHandler.GetSpeciesTreeResultsFN(0, True))
    # True exactly when the STRIDE branch above ran, so spDict is bound
    if (not userSpeciesTree) and (not qPhyldog) and len(ogSet.seqsInfo.speciesToUse) != 2:
        print("Outgroup: " + (", ".join([spDict[s] for s in r])))
    util.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qSupport=qSpeciesTreeSupports, qFixNegatives=True)
    util.PrintTime("Starting Recon and orthologues")
    ReconciliationAndOrthologues(recon_method, db.ogSet, nHighParallel, i if qMultiple else None, all_stride_dup_genes=all_stride_dup_genes) 
    util.PrintTime("Done Recon")
    
    # Also write out every candidate rooted tree so the alternatives are kept
    if qMultiple:
        for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)):
            unanalysedSpeciesTree = files.FileHandler.GetSpeciesTreeResultsFN(i, False)
            util.RenameTreeTaxa(speciesTree_fn, unanalysedSpeciesTree, db.ogSet.SpeciesDict(), qSupport=qSpeciesTreeSupports, qFixNegatives=True, label='N')
    
    """
    SpeciesTree: If it's been inferred, there is now at least one rooted results species trees: GetSpeciesTreeResultsFN()
    """
    
    files.FileHandler.CleanWorkingDir2()
    util.PrintUnderline("Writing results files", True)
    
    return GetResultsFilesString(resultsSpeciesTrees, seqs_alignments_dirs if qMSA else None)