Ejemplo n.º 1
0
def DoOrthologuesForOrthoFinder_Phyldog(ogSet, workingDirectory, GeneToSpecies, output_dir, reconTreesRenamedDir):
    """
    Extract orthologues from the PHYLDOG reconciled tree of every orthogroup and write them to file.

    A directory "Orthologues_<species>/" is created under output_dir for each
    species in ogSet.speciesToUse and seeded with one header-only
    "<species>__v__<other species>.tsv" file per other species. Each
    orthogroup's reconciled tree is then analysed (duplications are logged to
    the file named by files.FileHandler.GetDuplicationsFN()), a copy with
    renamed taxa is written to reconTreesRenamedDir and the orthologues are
    appended to the pairwise files by AppendOrthologuesToFiles.

    Args:
        ogSet - supplies SpeciesDict(), SequenceDict(), Spec_SeqDict(), speciesToUse and OGs()
        workingDirectory - not referenced by this function
        GeneToSpecies - forwarded unchanged to GetOrthologues_from_phyldog_tree
        output_dir - prefix for the "Orthologues_*" directories; it is concatenated
                     directly, so it must already end with a path separator
        reconTreesRenamedDir - prefix for the renamed tree files (also concatenated directly)
    Returns:
        The util.nOrtho_sp object accumulated over all orthogroups
    """
    spNames = ogSet.SpeciesDict()
    seqNames = ogSet.SequenceDict()
    spIDsToUse = ogSet.speciesToUse
    nSp = len(spIDsToUse)
    # One directory per species, one header-only file per ordered species pair
    for i, iSp in enumerate(spIDsToUse):
        name_i = spNames[str(iSp)]
        d = output_dir + "Orthologues_" + name_i + "/"
        if not os.path.exists(d):
            os.mkdir(d)
        for j, jSp in enumerate(spIDsToUse):
            if j == i:
                continue
            name_j = spNames[str(jSp)]
            with open(d + '%s__v__%s.tsv' % (name_i, name_j), 'wb') as outfile:
                csv.writer(outfile, delimiter="\t").writerow(("Orthogroup", name_i, name_j))
    nOgs = len(ogSet.OGs())
    # Progress is reported every 10, 100 or 1000 orthogroups depending on how many there are
    if nOgs <= 200:
        step = 10
    elif nOgs <= 2000:
        step = 100
    else:
        step = 1000
    total = util.nOrtho_sp(nSp)
    dupHeader = ["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"]
    with open(files.FileHandler.GetDuplicationsFN(), 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(dupHeader)
        for iog in xrange(nOgs):
            recon_tree = files.FileHandler.GetPhyldogOGResultsTreeFN(iog)
            orthologues = GetOrthologues_from_phyldog_tree(iog, recon_tree, GeneToSpecies, dupsWriter=dupWriter, seqIDs=ogSet.Spec_SeqDict(), spIDs=ogSet.SpeciesDict())
            renamed_fn = reconTreesRenamedDir + "OG%07d_tree.txt" % iog
            util.RenameTreeTaxa(recon_tree, renamed_fn, ogSet.Spec_SeqDict(), qSupport=False, qFixNegatives=True, label='n')
            if iog % step == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            # Orthologues are appended one orthogroup at a time
            total += AppendOrthologuesToFiles([(iog, orthologues)], spNames, ogSet.speciesToUse, seqNames, output_dir, False)
    return total
Ejemplo n.º 2
0
def DoOrthologuesForOrthoFinder_Phyldog(ogSet, workingDirectory, GeneToSpecies, workingDir, output_dir, reconTreesRenamedDir):
    """
    Extract orthologues from the PHYLDOG reconciled tree of every orthogroup and write them to file.

    Reconciled trees are read from workingDirectory + "phyldog/Results/". A
    directory "Orthologues_<species>/" is created under output_dir for every
    species in ogSet.speciesToUse, holding one header-only
    "<species>__v__<other species>.csv" file (tab separated) per other species.
    Duplications are logged to reconTreesRenamedDir + "../Duplications.csv",
    each tree is re-written with renamed taxa into reconTreesRenamedDir and the
    orthologues are appended to the pairwise files by AppendOrthologuesToFiles.

    Args:
        ogSet - supplies SpeciesDict(), SequenceDict(), Spec_SeqDict(), speciesToUse and OGs()
        workingDirectory - prefix of the "phyldog/Results/" directory (concatenated directly)
        GeneToSpecies - forwarded unchanged to GetOrthologues_from_phyldog_tree
        workingDir - not referenced by this function
        output_dir - prefix for the "Orthologues_*" directories (concatenated directly)
        reconTreesRenamedDir - prefix for the renamed tree files (concatenated directly)
    Returns:
        The util.nOrtho_sp object accumulated over all orthogroups
    """
    phyldogResultsDir = workingDirectory + "phyldog/Results/"
    spNames = ogSet.SpeciesDict()
    seqNames = ogSet.SequenceDict()
    spIDsToUse = ogSet.speciesToUse
    nSp = len(spIDsToUse)
    # One directory per species, one header-only file per ordered species pair
    for i, iSp in enumerate(spIDsToUse):
        name_i = spNames[str(iSp)]
        d = output_dir + "Orthologues_" + name_i + "/"
        if not os.path.exists(d):
            os.mkdir(d)
        for j, jSp in enumerate(spIDsToUse):
            if j == i:
                continue
            name_j = spNames[str(jSp)]
            with open(d + '%s__v__%s.csv' % (name_i, name_j), 'wb') as outfile:
                csv.writer(outfile, delimiter="\t").writerow(("Orthogroup", name_i, name_j))
    nOgs = len(ogSet.OGs())
    # Progress is reported every 10, 100 or 1000 orthogroups depending on how many there are
    if nOgs <= 200:
        step = 10
    elif nOgs <= 2000:
        step = 100
    else:
        step = 1000
    total = util.nOrtho_sp(nSp)
    dupHeader = ["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"]
    with open(reconTreesRenamedDir + "../Duplications.csv", 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(dupHeader)
        for iog in xrange(nOgs):
            recon_tree = phyldogResultsDir + "OG%07d.ReconciledTree.txt" % iog
            orthologues = GetOrthologues_from_phyldog_tree(iog, recon_tree, GeneToSpecies, dupsWriter=dupWriter, seqIDs=ogSet.Spec_SeqDict(), spIDs=ogSet.SpeciesDict())
            renamed_fn = reconTreesRenamedDir + "OG%07d_tree.txt" % iog
            util.RenameTreeTaxa(recon_tree, renamed_fn, ogSet.Spec_SeqDict(), qSupport=False, qFixNegatives=True, label='n')
            if iog % step == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            # Orthologues are appended one orthogroup at a time
            total += AppendOrthologuesToFiles([(iog, orthologues)], spNames, ogSet.speciesToUse, seqNames, output_dir, False)
    return total
Ejemplo n.º 3
0
def AppendOrthologuesToFiles(orthologues_alltrees, speciesDict, iSpeciesToUse, sequenceDict, resultsDir):
    """
    Append the orthologues found in a set of trees to the pairwise species files and count them.

    For every unordered pair of species in iSpeciesToUse, one row per
    orthologous relationship is appended to both
    "Orthologues_<sp0>/<sp0>__v__<sp1>.csv" and
    "Orthologues_<sp1>/<sp1>__v__<sp0>.csv" (tab separated, opened in append mode).

    Args:
        orthologues_alltrees - iterable of (iog, orthologues) pairs. Each orthologues value is
                               an iterable of (leavesL, leavesR) pairs of mappings from species
                               ID string to the list of that species' gene IDs on that side.
                               Both mappings are indexed for every species, so they need to
                               return an empty list for absent species (e.g. defaultdict(list))
        speciesDict - species ID string -> species name used in directory and file names
        iSpeciesToUse - species IDs to process; their order defines the indices into the
                        nOrtho count matrices
        sequenceDict - "<species ID>_<gene ID>" -> sequence name written to the files
        resultsDir - prefix of the "Orthologues_<species>/" directories (concatenated directly)
    Returns:
        A util.nOrtho_sp object holding the counts for the rows written by this call
    """
    # Iterate over the species actually in use (not speciesDict.keys()): sp_to_index is only
    # defined for these, so any additional species in speciesDict would raise a KeyError
    species = [str(sp) for sp in iSpeciesToUse]
    sp_to_index = {sp: i for i, sp in enumerate(species)}
    nOrtho = util.nOrtho_sp(len(species))
    # reorder orthologues on a per-species basis
    for i, sp0 in enumerate(species):
        strsp0 = sp0 + "_"
        isp0 = sp_to_index[sp0]
        d0 = resultsDir + "Orthologues_" + speciesDict[sp0] + "/"
        for sp1 in species[i + 1:]:
            if sp1 == sp0: continue
            strsp1 = sp1 + "_"
            isp1 = sp_to_index[sp1]
            d1 = resultsDir + "Orthologues_" + speciesDict[sp1] + "/"
            with open(d0 + '%s__v__%s.csv' % (speciesDict[sp0], speciesDict[sp1]), 'ab') as outfile1, open(d1 + '%s__v__%s.csv' % (speciesDict[sp1], speciesDict[sp0]), 'ab') as outfile2:
                writer1 = csv.writer(outfile1, delimiter="\t")
                writer2 = csv.writer(outfile2, delimiter="\t")
                for iog, orthologues_onetree in orthologues_alltrees:
                    og = "OG%07d" % iog
                    for leavesL, leavesR in orthologues_onetree:
                        nL0 = len(leavesL[sp0])
                        nR0 = len(leavesR[sp0])
                        nL1 = len(leavesL[sp1])
                        nR1 = len(leavesR[sp1])
                        if nL0*nR1 + nL1*nR0 == 0: continue # no orthologues
                        # each species can be in only one of L and R at most: they might both be in the same half
                        if nL0 > 0:
                            # then nR0 == 0 so nR1 > 0 since checked (nL0*nR1 + nL1*nR0 != 0)
                            genes0, genes1 = leavesL[sp0], leavesR[sp1]
                        else:
                            genes0, genes1 = leavesR[sp0], leavesL[sp1]
                        n0 = len(genes0)
                        n1 = len(genes1)
                        text0 = ", ".join([sequenceDict[strsp0 + g] for g in genes0])
                        text1 = ", ".join([sequenceDict[strsp1 + g] for g in genes1])
                        writer1.writerow((og, text0, text1))
                        writer2.writerow((og, text1, text0))
                        # Gene counts, overall and split by relationship type (1:1, 1:many, many:1, many:many)
                        nOrtho.n[isp0, isp1] += n0
                        nOrtho.n[isp1, isp0] += n1
                        if n0 == 1 and n1 == 1:
                            nOrtho.n_121[isp0, isp1] += 1
                            nOrtho.n_121[isp1, isp0] += 1
                        elif n0 == 1:
                            nOrtho.n_12m[isp0, isp1] += 1
                            nOrtho.n_m21[isp1, isp0] += n1
                        elif n1 == 1:
                            nOrtho.n_m21[isp0, isp1] += n0
                            nOrtho.n_12m[isp1, isp0] += 1
                        else:
                            nOrtho.n_m2m[isp0, isp1] += n0
                            nOrtho.n_m2m[isp1, isp0] += n1
    return nOrtho
Ejemplo n.º 4
0
def DoOrthologuesForOrthoFinder(ogSet, treesIDsPatFn, species_tree_rooted_fn, GeneToSpecies, workingDir, output_dir, reconTreesRenamedDir, all_stride_dup_genes):
    """
    Infer orthologues from every orthogroup tree and write them, and any suspect genes, to file.

    Under output_dir this creates "Putative_Xenologues/<species>.csv" and
    "Orthologues_<species>/<species>__v__<other>.csv" (header-only, tab
    separated) for the species in ogSet.speciesToUse. The internal nodes of the
    rooted species tree are then named N0, N1, ... and each orthogroup tree is
    analysed with GetOrthologues_for_tree. Duplications are logged to
    reconTreesRenamedDir + "../Duplications.csv", suspect genes are appended to
    "Orthologues_<species>/Putative_Horizontal_Gene_Transfer.txt", the
    reconciled tree is written with renamed taxa to reconTreesRenamedDir and
    the orthologues are appended by AppendOrthologuesToFiles.

    Args:
        ogSet - supplies SpeciesDict(), SequenceDict(), Spec_SeqDict(), speciesToUse and OGs()
        treesIDsPatFn - callable mapping an orthogroup index to the argument identifying its tree
        species_tree_rooted_fn - argument for tree_lib.Tree giving the rooted species tree
        GeneToSpecies - forwarded unchanged to GetOrthologues_for_tree
        workingDir - not referenced by this function
        output_dir - prefix for the output directories (concatenated directly)
        reconTreesRenamedDir - prefix for the renamed tree files (concatenated directly)
        all_stride_dup_genes - forwarded unchanged to GetOrthologues_for_tree
    Returns:
        The util.nOrtho_sp object accumulated over all orthogroups
    """
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    # Write directory and file structure
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    dSuspect = output_dir + "Putative_Xenologues/"
    if not os.path.exists(dSuspect): os.mkdir(dSuspect)
    for index1 in xrange(nspecies):
        name1 = speciesDict[str(speciesIDs[index1])]
        with open(dSuspect + '%s.csv' % name1, 'wb') as outfile:
            writer1 = csv.writer(outfile, delimiter="\t")
            writer1.writerow(("Orthogroup", name1, "Other"))
        d = output_dir + "Orthologues_" + name1 + "/"
        if not os.path.exists(d): os.mkdir(d)
        for index2 in xrange(nspecies):
            if index2 == index1: continue
            name2 = speciesDict[str(speciesIDs[index2])]
            with open(d + '%s__v__%s.csv' % (name1, name2), 'wb') as outfile:
                writer1 = csv.writer(outfile, delimiter="\t")
                writer1.writerow(("Orthogroup", name1, name2))
    # Infer orthologues and write them to file
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    neighbours = GetSpeciesNeighbours(species_tree_rooted)
    # Label nodes of species tree
    species_tree_rooted.name = "N0"
    iNode = 1
    for n in species_tree_rooted.traverse():
        if (not n.is_leaf()) and (not n.is_root()):
            n.name = "N%d" % iNode
            iNode += 1
    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    with open(reconTreesRenamedDir + "../Duplications.csv", 'wb') as dup_file:
        dupWriter = csv.writer(dup_file, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            orthologues, recon_tree, suspect_genes = GetOrthologues_for_tree(iog, treesIDsPatFn(iog), species_tree_rooted, GeneToSpecies, neighbours, dupsWriter=dupWriter, seqIDs=ogSet.Spec_SeqDict(), spIDs=ogSet.SpeciesDict(), all_stride_dup_genes=all_stride_dup_genes)
            # Iterate over the species in use: their "Orthologues_*" directories are the ones created above
            for iSp in speciesIDs:
                strsp0 = str(iSp)
                strsp0_ = strsp0 + "_"
                these_genes = [g for g in suspect_genes if g.startswith(strsp0_)]
                if len(these_genes) > 0:
                    with open(output_dir + "Orthologues_" + speciesDict[strsp0] + "/Putative_Horizontal_Gene_Transfer.txt", 'ab') as hgt_file:
                        # Write every suspect gene of this species, one per line
                        hgt_file.write("\n".join([SequenceDict[g] for g in these_genes]) + "\n")
            allOrthologues = [(iog, orthologues)]
            util.RenameTreeTaxa(recon_tree, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, ogSet.Spec_SeqDict(), qSupport=False, qFixNegatives=True, label='n')
            # Report progress every 10, 100 or 1000 orthogroups depending on how many there are
            if iog >= 0 and divmod(iog, 10 if nOgs <= 200 else 100 if nOgs <= 2000 else 1000)[1] == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict, output_dir, True)
    return nOrthologues_SpPair
Ejemplo n.º 5
0
def DoOrthologuesForOrthoFinder(ogSet, treesIDsPatFn, species_tree_rooted_fn, GeneToSpecies, workingDir, output_dir, reconTreesRenamedDir, all_stride_dup_genes):
    """
    Infer orthologues from every orthogroup tree and write them, and any suspect genes, to file.

    Under output_dir this creates "Putative_Xenologues/<species>.csv" and
    "Orthologues_<species>/<species>__v__<other>.csv" (header-only, tab
    separated) for the species in ogSet.speciesToUse. The internal nodes of the
    rooted species tree are then named N0, N1, ... and each orthogroup tree is
    analysed with GetOrthologues_from_tree. Duplications are logged to
    reconTreesRenamedDir + "../Duplications.csv", suspect genes are appended to
    "Orthologues_<species>/Putative_Horizontal_Gene_Transfer.txt", the
    reconciled tree is written with renamed taxa to reconTreesRenamedDir and
    the orthologues are appended by AppendOrthologuesToFiles.

    Args:
        ogSet - supplies SpeciesDict(), SequenceDict(), Spec_SeqDict(), speciesToUse and OGs()
        treesIDsPatFn - callable mapping an orthogroup index to the argument identifying its tree
        species_tree_rooted_fn - argument for tree_lib.Tree giving the rooted species tree
        GeneToSpecies - forwarded unchanged to GetOrthologues_from_tree
        workingDir - not referenced by this function
        output_dir - prefix for the output directories (concatenated directly)
        reconTreesRenamedDir - prefix for the renamed tree files (concatenated directly)
        all_stride_dup_genes - forwarded unchanged to GetOrthologues_from_tree
    Returns:
        The util.nOrtho_sp object accumulated over all orthogroups
    """
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    # Write directory and file structure
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    dSuspect = output_dir + "Putative_Xenologues/"
    if not os.path.exists(dSuspect): os.mkdir(dSuspect)
    for index1 in xrange(nspecies):
        name1 = speciesDict[str(speciesIDs[index1])]
        with open(dSuspect + '%s.csv' % name1, 'wb') as outfile:
            writer1 = csv.writer(outfile, delimiter="\t")
            writer1.writerow(("Orthogroup", name1, "Other"))
        d = output_dir + "Orthologues_" + name1 + "/"
        if not os.path.exists(d): os.mkdir(d)
        for index2 in xrange(nspecies):
            if index2 == index1: continue
            name2 = speciesDict[str(speciesIDs[index2])]
            with open(d + '%s__v__%s.csv' % (name1, name2), 'wb') as outfile:
                writer1 = csv.writer(outfile, delimiter="\t")
                writer1.writerow(("Orthogroup", name1, name2))
    # Infer orthologues and write them to file
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    neighbours = GetSpeciesNeighbours(species_tree_rooted)
    # Label nodes of species tree
    species_tree_rooted.name = "N0"
    iNode = 1
    for n in species_tree_rooted.traverse():
        if (not n.is_leaf()) and (not n.is_root()):
            n.name = "N%d" % iNode
            iNode += 1
    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    with open(reconTreesRenamedDir + "../Duplications.csv", 'wb') as dup_file:
        dupWriter = csv.writer(dup_file, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            orthologues, recon_tree, suspect_genes = GetOrthologues_from_tree(iog, treesIDsPatFn(iog), species_tree_rooted, GeneToSpecies, neighbours, dupsWriter=dupWriter, seqIDs=ogSet.Spec_SeqDict(), spIDs=ogSet.SpeciesDict(), all_stride_dup_genes=all_stride_dup_genes)
            # Iterate over the species in use: their "Orthologues_*" directories are the ones created above
            for iSp in speciesIDs:
                strsp0 = str(iSp)
                strsp0_ = strsp0 + "_"
                these_genes = [g for g in suspect_genes if g.startswith(strsp0_)]
                if len(these_genes) > 0:
                    with open(output_dir + "Orthologues_" + speciesDict[strsp0] + "/Putative_Horizontal_Gene_Transfer.txt", 'ab') as hgt_file:
                        # Write every suspect gene of this species, one per line
                        hgt_file.write("\n".join([SequenceDict[g] for g in these_genes]) + "\n")
            allOrthologues = [(iog, orthologues)]
            util.RenameTreeTaxa(recon_tree, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, ogSet.Spec_SeqDict(), qSupport=False, qFixNegatives=True, label='n')
            # Report progress every 10, 100 or 1000 orthogroups depending on how many there are
            if iog >= 0 and divmod(iog, 10 if nOgs <= 200 else 100 if nOgs <= 2000 else 1000)[1] == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict, output_dir, True)
    return nOrthologues_SpPair
Ejemplo n.º 6
0
def _GenesBySpecies(genes):
    """Return a defaultdict(list) mapping str(gene.iSp) to the list of str(gene.iSeq) for the given genes."""
    d = defaultdict(list)
    for g in genes:
        d[str(g.iSp)].append(str(g.iSeq))
    return d


def TwoAndThreeGeneOrthogroups(ogSet, resultsDir):
    """
    Write the orthologues implied directly by orthogroups containing two or three genes.

    The orthologues are derived from the species composition alone:
      - 2 genes from different species: the two genes are orthologues
      - 3 genes from 3 species: the first gene v the other two, then the second v the third
      - 3 genes from 2 species: the gene(s) of one species v the gene(s) of the other
    Genes from a single species give no orthologues.

    Args:
        ogSet - supplies SpeciesDict(), SequenceDict(), speciesToUse and OGs(qInclAll=True).
                Processing stops at the first single-gene orthogroup, which presumably
                relies on the orthogroups being ordered largest first - TODO confirm
        resultsDir - passed to trees2ologs_of.AppendOrthologuesToFiles as the results directory
    Returns:
        The util.nOrtho_sp object with the counts of the orthologues written
    """
    speciesDict = ogSet.SpeciesDict()
    sequenceDict = ogSet.SequenceDict()
    ogs = ogSet.OGs(qInclAll=True)
    nOrthologues_SpPair = util.nOrtho_sp(len(ogSet.speciesToUse))
    all_orthologues = []
    # Shared placeholder for the two "suspect genes" slots, which are not used here
    d_empty = defaultdict(list)
    for iog, og in enumerate(ogs):
        n = len(og)
        if n == 1: break
        # Only sizes 2 and 3 are handled. This also skips an empty orthogroup, which would
        # otherwise re-use the orthologues left over from a previous iteration
        if n not in (2, 3): continue
        # orthologues is a list of tuples of dictionaries
        # each dictionary is sp->list of genes in species
        if n == 2:
            if og[0].iSp == og[1].iSp: continue
            orthologues = [(_GenesBySpecies([og[0]]), _GenesBySpecies([og[1]]), d_empty, d_empty)]
        else:
            c = Counter([g.iSp for g in og])
            nSp = len(c)
            if nSp == 3:
                orthologues = [(_GenesBySpecies([og[0]]), _GenesBySpecies([og[1], og[2]]), d_empty, d_empty),
                               (_GenesBySpecies([og[1]]), _GenesBySpecies([og[2]]), d_empty, d_empty)]
            elif nSp == 2:
                sp0, sp1 = c.keys()
                d0 = _GenesBySpecies([g for g in og if g.iSp == sp0])
                d1 = _GenesBySpecies([g for g in og if g.iSp == sp1])
                orthologues = [(d0, d1, d_empty, d_empty)]
            else:
                continue # no orthologues
        all_orthologues.append((iog, orthologues))
    nOrthologues_SpPair += trees2ologs_of.AppendOrthologuesToFiles(all_orthologues, speciesDict, ogSet.speciesToUse, sequenceDict, resultsDir, False)
    return nOrthologues_SpPair
Ejemplo n.º 7
0
def species_write_all(ogSet, pickleDir, resultsDir):
    """
    Find and write the orthologues for every pair of species.

    Makes sure an "Orthologues_<species>" directory exists under resultsDir for
    each species in ogSet.speciesToUse. Then, for each pair of species indices
    (lower, higher), runs multiply and find_all and hands the result to
    WriteOrthologues together with the shared counts object (which
    WriteOrthologues presumably updates - TODO confirm).

    Args:
        ogSet - supplies SpeciesDict() and speciesToUse; also passed on to WriteOrthologues
        pickleDir - passed to multiply
        resultsDir - prefix for the "Orthologues_*" directories (concatenated directly);
                     also passed to WriteOrthologues
    Returns:
        The util.nOrtho_sp object that was passed to every WriteOrthologues call
    """
    spNames = ogSet.SpeciesDict()
    spIDs = ogSet.speciesToUse
    nSp = len(spIDs)
    counts = util.nOrtho_sp(nSp)
    for iSp in spIDs:
        d = resultsDir + "Orthologues_" + spNames[str(iSp)]
        if not os.path.exists(d):
            os.mkdir(d)
    # Each unordered pair once, lower index first
    for iLow, iHigh in itertools.combinations(xrange(nSp), 2):
        product, M = multiply(iLow, iHigh, pickleDir)
        orthologues = find_all(product, M)
        # Note: the higher-indexed species is passed first
        WriteOrthologues(resultsDir, spIDs[iHigh], spIDs[iLow], orthologues, ogSet, counts, iHigh, iLow)
    return counts
Ejemplo n.º 8
0
def DoOrthologuesForOrthoFinder(ogSet, treesIDsPatFn, species_tree_rooted_fn, GeneToSpecies, workingDir, output_dir, reconTreesRenamedDir, all_stride_dup_genes):
    """
    Infer orthologues from every orthogroup tree and write them to file in a single pass at the end.

    Creates "Orthologues_<species>/<species>__v__<other>.csv" (header-only, tab
    separated) under output_dir for the species in ogSet.speciesToUse and names
    the internal nodes of the rooted species tree N0, N1, .... Each orthogroup
    tree is analysed with GetOrthologues_for_tree (duplications are logged to
    reconTreesRenamedDir + "../Duplications.csv") and the reconciled tree is
    written with renamed taxa to reconTreesRenamedDir. The orthologues of all
    orthogroups are collected and passed to AppendOrthologuesToFiles once.

    Args:
        ogSet - supplies SpeciesDict(), SequenceDict(), Spec_SeqDict(), speciesToUse and OGs()
        treesIDsPatFn - callable mapping an orthogroup index to the argument identifying its tree
        species_tree_rooted_fn - argument for tree_lib.Tree giving the rooted species tree
        GeneToSpecies - forwarded unchanged to GetOrthologues_for_tree
        workingDir - not referenced by this function
        output_dir - prefix for the "Orthologues_*" directories (concatenated directly)
        reconTreesRenamedDir - prefix for the renamed tree files (concatenated directly)
        all_stride_dup_genes - forwarded unchanged to GetOrthologues_for_tree
    Returns:
        The util.nOrtho_sp object with the counts of the orthologues written
    """
    spNames = ogSet.SpeciesDict()
    seqNames = ogSet.SequenceDict()
    spIDsToUse = ogSet.speciesToUse
    nSp = len(spIDsToUse)
    # One directory per species, one header-only file per ordered species pair
    for i, iSp in enumerate(spIDsToUse):
        name_i = spNames[str(iSp)]
        d = output_dir + "Orthologues_" + name_i + "/"
        if not os.path.exists(d):
            os.mkdir(d)
        for j, jSp in enumerate(spIDsToUse):
            if j == i:
                continue
            name_j = spNames[str(jSp)]
            with open(d + '%s__v__%s.csv' % (name_i, name_j), 'wb') as outfile:
                csv.writer(outfile, delimiter="\t").writerow(("Orthogroup", name_i, name_j))
    # Load the species tree and number its internal nodes: the root is N0, the rest follow in traversal order
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    species_tree_rooted.name = "N0"
    internal_nodes = (node for node in species_tree_rooted.traverse() if not (node.is_leaf() or node.is_root()))
    for iNode, node in enumerate(internal_nodes, 1):
        node.name = "N%d" % iNode
    nOgs = len(ogSet.OGs())
    # Progress is reported every 10, 100 or 1000 orthogroups depending on how many there are
    if nOgs <= 200:
        step = 10
    elif nOgs <= 2000:
        step = 100
    else:
        step = 1000
    total = util.nOrtho_sp(nSp)
    collected = []
    dupHeader = ["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"]
    with open(reconTreesRenamedDir + "../Duplications.csv", 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(dupHeader)
        for iog in xrange(nOgs):
            orthologues, recon_tree = GetOrthologues_for_tree(iog, treesIDsPatFn(iog), species_tree_rooted, GeneToSpecies, dupsWriter=dupWriter, seqIDs=ogSet.Spec_SeqDict(), spIDs=ogSet.SpeciesDict(), all_stride_dup_genes=all_stride_dup_genes)
            collected.append((iog, orthologues))
            renamed_fn = reconTreesRenamedDir + "OG%07d_tree.txt" % iog
            util.RenameTreeTaxa(recon_tree, renamed_fn, ogSet.Spec_SeqDict(), qFixNegatives=True, label='n')
            if iog % step == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
    # All orthogroups are written together once the duplications file has been closed
    total += AppendOrthologuesToFiles(collected, spNames, ogSet.speciesToUse, seqNames, output_dir)
    return total
Ejemplo n.º 9
0
def AppendOrthologuesToFiles(orthologues_alltrees, speciesDict, iSpeciesToUse, sequenceDict, resultsDir, qContainsSuspectOlogs):
    """
    Append the orthologues found in a set of trees to the pairwise species files and count them.

    For every unordered pair of species in iSpeciesToUse, one row per
    orthologous relationship is appended to both
    "Orthologues_<sp0>/<sp0>__v__<sp1>.csv" and
    "Orthologues_<sp1>/<sp1>__v__<sp0>.csv". When qContainsSuspectOlogs is set,
    relationships involving suspect genes are also appended to
    "Putative_Xenologues/<species>.csv" for both species. Only the regular
    orthologues contribute to the returned counts.

    Args:
        orthologues_alltrees - iterable of (iog, orthologues) pairs. Each orthologues value is
                               an iterable of (leavesL, leavesR, leavesL_sus, leavesR_sus)
                               tuples of mappings from species ID string to a list of gene IDs.
                               The mappings are indexed for every species, so they need to
                               return an empty list for absent species (e.g. defaultdict(list))
        speciesDict - species ID string -> species name used in directory and file names
        iSpeciesToUse - species IDs to process; their order defines the indices into the
                        nOrtho count matrices
        sequenceDict - "<species ID>_<gene ID>" -> sequence name written to the files
        resultsDir - prefix of the output directories (concatenated directly)
        qContainsSuspectOlogs - if False the two suspect mappings are never read and no
                                "Putative_Xenologues" file is opened
    Returns:
        A util.nOrtho_sp object holding the counts for the regular rows written by this call
    """
    # Sort the orthologues according to species pairs
    sp_to_index = {str(sp):i for i, sp in enumerate(iSpeciesToUse)}
    nOrtho = util.nOrtho_sp(len(iSpeciesToUse))
    # reorder orthologues on a per-species basis
    nSpecies = len(iSpeciesToUse)
    for i in xrange(nSpecies):
        sp0 = str(iSpeciesToUse[i])
        outfile1_sus = None
        try:
            if qContainsSuspectOlogs:
                outfile1_sus = open(resultsDir + "Putative_Xenologues/%s.csv" % speciesDict[sp0], 'ab')
                writer1_sus = csv.writer(outfile1_sus, delimiter="\t")
            strsp0 = sp0 + "_"
            isp0 = sp_to_index[sp0]
            d0 = resultsDir + "Orthologues_" + speciesDict[sp0] + "/"
            for j in xrange(i, nSpecies):
                sp1 = str(iSpeciesToUse[j])
                if sp1 == sp0: continue
                strsp1 = sp1 + "_"
                isp1 = sp_to_index[sp1]
                d1 = resultsDir + "Orthologues_" + speciesDict[sp1] + "/"
                with open(d0 + '%s__v__%s.csv' % (speciesDict[sp0], speciesDict[sp1]), 'ab') as outfile1, open(d1 + '%s__v__%s.csv' % (speciesDict[sp1], speciesDict[sp0]), 'ab') as outfile2:
                    outfile2_sus = None
                    try:
                        if qContainsSuspectOlogs:
                            outfile2_sus = open(resultsDir + "Putative_Xenologues/%s.csv" % speciesDict[sp1], 'ab')
                            writer2_sus = csv.writer(outfile2_sus, delimiter="\t")
                        writer1 = csv.writer(outfile1, delimiter="\t")
                        writer2 = csv.writer(outfile2, delimiter="\t")
                        for iog, orthologues_onetree in orthologues_alltrees:
                            og = "OG%07d" % iog
                            for leavesL, leavesR, leavesL_sus, leavesR_sus in orthologues_onetree:
                                # suspect_genes are the genes which, for this level, the orthologues should be considered suspect as the gene appears misplaced (at this level)
                                nL0 = len(leavesL[sp0])
                                nR0 = len(leavesR[sp0])
                                nL1 = len(leavesL[sp1])
                                nR1 = len(leavesR[sp1])
                                if nL0*nR1 + nL1*nR0 != 0:
                                    # each species can be in only one of L and R at most: they might both be in the same half
                                    if nL0 > 0:
                                        # then nR0 == 0 so nR1 > 0 since checked (nL0*nR1 + nL1*nR0 != 0)
                                        n0 = nL0
                                        n1 = nR1
                                        text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesL[sp0]])
                                        text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesR[sp1]])
                                    else:
                                        n0 = nR0
                                        n1 = nL1
                                        text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesR[sp0]])
                                        text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesL[sp1]])
                                    writer1.writerow((og, text0, text1))
                                    writer2.writerow((og, text1, text0))
                                    # Gene counts, overall and split by relationship type (1:1, 1:many, many:1, many:many)
                                    nOrtho.n[isp0, isp1] += n0
                                    nOrtho.n[isp1, isp0] += n1
                                    if n0 == 1 and n1 == 1:
                                        nOrtho.n_121[isp0, isp1] += 1
                                        nOrtho.n_121[isp1, isp0] += 1
                                    elif n0 == 1:
                                        nOrtho.n_12m[isp0, isp1] += 1
                                        nOrtho.n_m21[isp1, isp0] += n1
                                    elif n1 == 1:
                                        nOrtho.n_m21[isp0, isp1] += n0
                                        nOrtho.n_12m[isp1, isp0] += 1
                                    else:
                                        nOrtho.n_m2m[isp0, isp1] += n0
                                        nOrtho.n_m2m[isp1, isp0] += n1
                                # Write suspect orthologues
                                if not qContainsSuspectOlogs: continue
                                nL0s = len(leavesL_sus[sp0])
                                nR0s = len(leavesR_sus[sp0])
                                nL1s = len(leavesL_sus[sp1])
                                nR1s = len(leavesR_sus[sp1])
                                # suspect genes of sp0 on one side v all genes (regular + suspect) of sp1 on the other side
                                if nL0s*(nR1+nR1s) + (nL1+nL1s)*nR0s != 0:
                                    # NOTE(review): the branch below is chosen on nL0s alone, which
                                    # assumes the suspect genes of sp0 are on one side only - confirm
                                    if nL0s > 0:
                                        text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesL_sus[sp0]])
                                        text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesR[sp1]+leavesR_sus[sp1]])
                                    else:
                                        text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesR_sus[sp0]])
                                        text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesL[sp1]+leavesL_sus[sp1]])
                                    writer1_sus.writerow((og, text0, text1))
                                    writer2_sus.writerow((og, text1, text0))
                    finally:
                        # Close the second species' suspect file even if writing failed
                        if outfile2_sus is not None:
                            outfile2_sus.close()
        finally:
            # Close the first species' suspect file even if writing failed
            if outfile1_sus is not None:
                outfile1_sus.close()
    return nOrtho
Ejemplo n.º 10
0
def DoOrthologuesForOrthoFinder(ogSet, species_tree_rooted_fn, GeneToSpecies,
                                all_stride_dup_genes, qNoRecon):
    """
    Root each orthogroup tree, infer its orthologues and write all results to file.

    All output locations are obtained from files.FileHandler. For the species
    in ogSet.speciesToUse this creates
    "Orthologues_<species>/<species>__v__<other>.tsv" (header-only) in the
    orthologues directory and names the internal nodes of the rooted species
    tree N0, N1, .... Each orthogroup tree is then rooted with
    CheckAndRootTree (orthogroups for which it returns None are skipped),
    written with renamed taxa, and analysed with GetOrthologues_from_tree.
    Duplications are logged to the duplications file, suspect genes are
    appended to "<species>.txt" in the suspect genes directory, the reconciled
    tree is written with renamed taxa and the orthologues are appended by
    AppendOrthologuesToFiles. The putative xenologue files are only created
    once the first orthogroup with suspect genes is encountered.

    Args:
        ogSet - supplies SpeciesDict(), SequenceDict(), Spec_SeqDict(), speciesToUse and OGs()
        species_tree_rooted_fn - argument for tree_lib.Tree giving the rooted species tree
        GeneToSpecies - forwarded unchanged to CheckAndRootTree and GetOrthologues_from_tree
        all_stride_dup_genes - forwarded unchanged to GetOrthologues_from_tree
        qNoRecon - forwarded unchanged to GetOrthologues_from_tree
    Returns:
        The util.nOrtho_sp object accumulated over all orthogroups
    """
    # Create directory structure
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    # Write directory and file structure
    qInitialisedSuspectGenesDirs = False
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    dResultsOrthologues = files.FileHandler.GetOrthologuesDirectory()
    for index1 in xrange(nspecies):
        name1 = speciesDict[str(speciesIDs[index1])]
        d = dResultsOrthologues + "Orthologues_" + name1 + "/"
        if not os.path.exists(d): os.mkdir(d)
        for index2 in xrange(nspecies):
            if index2 == index1: continue
            name2 = speciesDict[str(speciesIDs[index2])]
            with open(d + '%s__v__%s.tsv' % (name1, name2), 'wb') as outfile:
                writer1 = csv.writer(outfile, delimiter="\t")
                writer1.writerow(("Orthogroup", name1, name2))
    # Infer orthologues and write them to file
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    neighbours = GetSpeciesNeighbours(species_tree_rooted)
    # Label nodes of species tree
    species_tree_rooted.name = "N0"
    iNode = 1
    for n in species_tree_rooted.traverse():
        if (not n.is_leaf()) and (not n.is_root()):
            n.name = "N%d" % iNode
            iNode += 1
    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    # Species are taken from the list in use rather than by position from speciesDict.keys():
    # the two need not have the same size or order
    species = [str(iSp) for iSp in speciesIDs]
    reconTreesRenamedDir = files.FileHandler.GetOGsReconTreeDir(True)
    spec_seq_dict = ogSet.Spec_SeqDict()
    with open(files.FileHandler.GetDuplicationsFN(), 'wb') as dup_file:
        dupWriter = csv.writer(dup_file, delimiter="\t")
        dupWriter.writerow([
            "Orthogroup", "Species Tree Node", "Gene Tree Node", "Support",
            "Type", "Genes 1", "Genes 2"
        ])
        for iog in xrange(nOgs):
            rooted_tree_ids, qHaveSupport = CheckAndRootTree(
                files.FileHandler.GetOGsTreeFN(iog), species_tree_rooted,
                GeneToSpecies)  # this can be parallelised easily
            if rooted_tree_ids is None: continue
            # Write rooted tree with accessions
            util.RenameTreeTaxa(rooted_tree_ids,
                                files.FileHandler.GetOGsTreeFN(iog, True),
                                spec_seq_dict,
                                qSupport=qHaveSupport,
                                qFixNegatives=True,
                                qViaCopy=True)
            orthologues, recon_tree, suspect_genes = GetOrthologues_from_tree(
                iog,
                rooted_tree_ids,
                species_tree_rooted,
                GeneToSpecies,
                neighbours,
                dupsWriter=dupWriter,
                seqIDs=spec_seq_dict,
                spIDs=ogSet.SpeciesDict(),
                all_stride_dup_genes=all_stride_dup_genes,
                qNoRecon=qNoRecon)
            qContainsSuspectGenes = len(suspect_genes) > 0
            if (not qInitialisedSuspectGenesDirs) and qContainsSuspectGenes:
                # First orthogroup with suspect genes: create the header-only xenologue files
                qInitialisedSuspectGenesDirs = True
                dSuspectGenes = files.FileHandler.GetSuspectGenesDir()
                dSuspectOrthologues = files.FileHandler.GetPutativeXenelogsDir()
                for index1 in xrange(nspecies):
                    name1 = speciesDict[str(speciesIDs[index1])]
                    with open(dSuspectOrthologues + '%s.tsv' % name1,
                              'wb') as xeno_file:
                        writer1 = csv.writer(xeno_file, delimiter="\t")
                        writer1.writerow(("Orthogroup", name1, "Other"))
            for strsp0 in species:
                strsp0_ = strsp0 + "_"
                these_genes = [
                    g for g in suspect_genes if g.startswith(strsp0_)
                ]
                if len(these_genes) > 0:
                    # dSuspectGenes is set by now: suspect genes exist, so the block above has run
                    with open(dSuspectGenes + speciesDict[strsp0] + ".txt",
                              'ab') as suspect_file:
                        suspect_file.write(
                            "\n".join([SequenceDict[g]
                                       for g in these_genes]) + "\n")
            allOrthologues = [(iog, orthologues)]
            # don't relabel nodes, they've already been done
            util.RenameTreeTaxa(recon_tree,
                                reconTreesRenamedDir + "OG%07d_tree.txt" % iog,
                                spec_seq_dict,
                                qSupport=False,
                                qFixNegatives=True)
            # Report progress every 10, 100 or 1000 orthogroups depending on how many there are
            if iog >= 0 and divmod(
                    iog, 10
                    if nOgs <= 200 else 100 if nOgs <= 2000 else 1000)[1] == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            nOrthologues_SpPair += AppendOrthologuesToFiles(
                allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict,
                dResultsOrthologues, qContainsSuspectGenes)
    return nOrthologues_SpPair
Ejemplo n.º 11
0
def AppendOrthologuesToFiles(orthologues_alltrees, speciesDict, iSpeciesToUse,
                             sequenceDict, resultsDir, ortholog_file_writers,
                             suspect_genes_file_writers,
                             qContainsSuspectOlogs):
    """Write orthologue rows for every species pair and tally the counts.

    Each unordered pair of distinct species (i < j) is visited once. For
    every (leavesL, leavesR, leavesL_sus, leavesR_sus) entry of every tree,
    a row is written to both directions of the pair whenever the two species
    have genes on opposite sides, and the pairwise counters are updated.

    Args:
        orthologues_alltrees: iterable of (iog, entries) pairs; it is walked
            once per species pair, so it must be re-iterable. Each entry is a
            4-tuple of mappings indexed by the species id string.
        speciesDict: not used by this variant.
        iSpeciesToUse: sequence of species ids; str() of each id is the key
            used in the leaf mappings and the prefix of the gene ids.
        sequenceDict: maps "<species>_<gene>" to the name that is written.
        resultsDir: not used by this variant.
        ortholog_file_writers: ortholog_file_writers[i][j] is an object with
            writerow() receiving the rows of species i versus species j.
        suspect_genes_file_writers: suspect_genes_file_writers[i] is the
            writerow() object for species i; only read when
            qContainsSuspectOlogs is true.
        qContainsSuspectOlogs: when true, rows for the suspect leaf sets are
            written as well.

    Returns:
        The object created by util.nOrtho_sp, with its n, n_121, n_12m, n_m21
        and n_m2m tables updated.
    """
    def names(prefix, gene_ids):
        # Look up the written name of each gene and join them for one cell.
        return ", ".join([sequenceDict[prefix + g] for g in gene_ids])

    species = [str(sp) for sp in iSpeciesToUse]
    index_of = {name: pos for pos, name in enumerate(species)}
    counts = util.nOrtho_sp(len(iSpeciesToUse))
    if qContainsSuspectOlogs:
        # The returned path is not used here; the call is kept because it may
        # have side effects (presumably creating the directory - TODO confirm).
        files.FileHandler.GetPutativeXenelogsDir()

    for i, sp0 in enumerate(species):
        if qContainsSuspectOlogs:
            sus_writer0 = suspect_genes_file_writers[i]
        prefix0 = sp0 + "_"
        idx0 = index_of[sp0]
        for j in xrange(i, len(species)):
            sp1 = species[j]
            if sp1 == sp0:
                continue
            prefix1 = sp1 + "_"
            idx1 = index_of[sp1]
            if qContainsSuspectOlogs:
                sus_writer1 = suspect_genes_file_writers[j]
            writer01 = ortholog_file_writers[i][j]
            writer10 = ortholog_file_writers[j][i]
            for iog, tree_orthologues in orthologues_alltrees:
                og_name = "OG%07d" % iog
                for left, right, left_sus, right_sus in tree_orthologues:
                    left0 = left[sp0]
                    right0 = right[sp0]
                    left1 = left[sp1]
                    right1 = right[sp1]
                    n_left0, n_right0 = len(left0), len(right0)
                    n_left1, n_right1 = len(left1), len(right1)
                    # A row is due only if the two species sit on opposite
                    # sides of this split.
                    if (n_left0 and n_right1) or (n_left1 and n_right0):
                        # The code relies on a species occurring on at most
                        # one side: sp0 on the left implies sp1 on the right.
                        if n_left0 > 0:
                            genes0, genes1 = left0, right1
                        else:
                            genes0, genes1 = right0, left1
                        n0 = len(genes0)
                        n1 = len(genes1)
                        text0 = names(prefix0, genes0)
                        text1 = names(prefix1, genes1)
                        writer01.writerow((og_name, text0, text1))
                        writer10.writerow((og_name, text1, text0))
                        counts.n[idx0, idx1] += n0
                        counts.n[idx1, idx0] += n1
                        single0 = n0 == 1
                        single1 = n1 == 1
                        if single0 and single1:
                            counts.n_121[idx0, idx1] += 1
                            counts.n_121[idx1, idx0] += 1
                        elif single0:
                            counts.n_12m[idx0, idx1] += 1
                            counts.n_m21[idx1, idx0] += n1
                        elif single1:
                            counts.n_m21[idx0, idx1] += n0
                            counts.n_12m[idx1, idx0] += 1
                        else:
                            counts.n_m2m[idx0, idx1] += n0
                            counts.n_m2m[idx1, idx0] += n1
                    if qContainsSuspectOlogs:
                        # Suspect genes: genes that appear misplaced at this
                        # level, so their orthologues are reported separately.
                        # NOTE(review): a row is only triggered by suspect
                        # genes of sp0 (the lower-indexed species) - confirm
                        # that this asymmetry is intended.
                        sus_left0 = left_sus[sp0]
                        sus_right0 = right_sus[sp0]
                        sus_left1 = left_sus[sp1]
                        sus_right1 = right_sus[sp1]
                        n_pairs = (len(sus_left0) * (n_right1 + len(sus_right1)) +
                                   (n_left1 + len(sus_left1)) * len(sus_right0))
                        if n_pairs != 0:
                            # Suspect sp0 genes are paired with all sp1 genes
                            # (regular and suspect) on the opposite side.
                            if len(sus_left0) > 0:
                                text0 = names(prefix0, sus_left0)
                                text1 = names(prefix1, right1 + sus_right1)
                            else:
                                text0 = names(prefix0, sus_right0)
                                text1 = names(prefix1, left1 + sus_left1)
                            sus_writer0.writerow((og_name, text0, text1))
                            sus_writer1.writerow((og_name, text1, text0))
    return counts
Ejemplo n.º 12
0
def _WriteOrthologuesForSpeciesPair(orthologues_alltrees, sequenceDict, nOrtho, sp0, sp1, isp0, isp1, writer1, writer2, writer1_sus, writer2_sus):
    """Write the rows and update the counts for a single pair of species.

    Private helper of AppendOrthologuesToFiles. sp0/sp1 are the species id
    strings, isp0/isp1 their indices in the nOrtho tables. writer1 receives
    rows as (og, sp0 genes, sp1 genes) and writer2 the mirrored row. The
    suspect writers are either both set or both None; None disables the
    suspect output.
    """
    strsp0 = sp0 + "_"
    strsp1 = sp1 + "_"
    for iog, ortholouges_onetree in orthologues_alltrees:
        og = "OG%07d" % iog
        for leavesL, leavesR, leavesL_sus, leavesR_sus in ortholouges_onetree:
            # suspect genes are the genes which, for this level, appear misplaced,
            # so their orthologues should be considered suspect (at this level)
            nL0 = len(leavesL[sp0])
            nR0 = len(leavesR[sp0])
            nL1 = len(leavesL[sp1])
            nR1 = len(leavesR[sp1])
            if nL0*nR1 + nL1*nR0 != 0:
                # each species can be in only one of L and R at most: they might both be in the same half
                if nL0 > 0:
                    # then nR0 == 0 so nR1 > 0 since checked (nL0*nR1 + nL1*nR0 != 0)
                    n0 = nL0
                    n1 = nR1
                    text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesL[sp0]])
                    text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesR[sp1]])
                else:
                    n0 = nR0
                    n1 = nL1
                    text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesR[sp0]])
                    text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesL[sp1]])
                writer1.writerow((og, text0, text1))
                writer2.writerow((og, text1, text0))
                nOrtho.n[isp0, isp1] += n0
                nOrtho.n[isp1, isp0] += n1
                if n0 == 1 and n1 == 1:
                    nOrtho.n_121[isp0, isp1] += 1
                    nOrtho.n_121[isp1, isp0] += 1
                elif n0 == 1:
                    nOrtho.n_12m[isp0, isp1] += 1
                    nOrtho.n_m21[isp1, isp0] += n1
                elif n1 == 1:
                    nOrtho.n_m21[isp0, isp1] += n0
                    nOrtho.n_12m[isp1, isp0] += 1
                else:
                    nOrtho.n_m2m[isp0, isp1] += n0
                    nOrtho.n_m2m[isp1, isp0] += n1
            # Write suspect orthologues
            if writer1_sus is None: continue
            nL0s = len(leavesL_sus[sp0])
            nR0s = len(leavesR_sus[sp0])
            nL1s = len(leavesL_sus[sp1])
            nR1s = len(leavesR_sus[sp1])
            # NOTE(review): a row is only triggered by suspect genes of sp0 - confirm this asymmetry is intended
            if nL0s*(nR1+nR1s) + (nL1+nL1s)*nR0s != 0:
                # suspect genes of sp0 are paired with all genes of sp1 (regular and suspect) in the other half
                if nL0s > 0:
                    text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesL_sus[sp0]])
                    text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesR[sp1]+leavesR_sus[sp1]])
                else:
                    text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesR_sus[sp0]])
                    text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesL[sp1]+leavesL_sus[sp1]])
                writer1_sus.writerow((og, text0, text1))
                writer2_sus.writerow((og, text1, text0))


def AppendOrthologuesToFiles(orthologues_alltrees, speciesDict, iSpeciesToUse, sequenceDict, resultsDir, qContainsSuspectOlogs):
    """Append the orthologues of every species pair to the per-pair files.

    For each unordered pair of distinct species the two files
    resultsDir/Orthologues_<A>/<A>__v__<B>.csv and
    resultsDir/Orthologues_<B>/<B>__v__<A>.csv are opened in append mode and
    receive one tab-separated row per orthologue relationship found in
    orthologues_alltrees. When qContainsSuspectOlogs is true, rows involving
    suspect genes are appended to resultsDir/Putative_Xenologues/<species>.csv.

    Args:
        orthologues_alltrees: iterable of (iog, entries) pairs, walked once per
            species pair (so it must be re-iterable). Each entry is a 4-tuple
            (leavesL, leavesR, leavesL_sus, leavesR_sus) of mappings indexed by
            the species id string.
        speciesDict: maps the species id string to the name used in paths.
        iSpeciesToUse: sequence of species ids.
        sequenceDict: maps "<species>_<gene>" to the name that is written.
        resultsDir: output directory prefix (concatenated as-is, so it is
            expected to end with a path separator).
        qContainsSuspectOlogs: whether the suspect rows are written.

    Returns:
        The object created by util.nOrtho_sp with its count tables updated.
    """
    # Sort the orthologues according to species pairs
    sp_to_index = {str(sp):i for i, sp in enumerate(iSpeciesToUse)}
    nOrtho = util.nOrtho_sp(len(iSpeciesToUse))
    nSpecies = len(iSpeciesToUse)
    for i in xrange(nSpecies):
        sp0 = str(iSpeciesToUse[i])
        outfile1_sus = None
        writer1_sus = None
        # try/finally so the suspect file is closed even if writing a row fails
        try:
            if qContainsSuspectOlogs:
                outfile1_sus = open(resultsDir + "Putative_Xenologues/%s.csv" % speciesDict[sp0], 'ab')
                writer1_sus = csv.writer(outfile1_sus, delimiter="\t")
            isp0 = sp_to_index[sp0]
            d0 = resultsDir + "Orthologues_" + speciesDict[sp0] + "/"
            for j in xrange(i, nSpecies):
                sp1 = str(iSpeciesToUse[j])
                if sp1 == sp0: continue
                isp1 = sp_to_index[sp1]
                d1 = resultsDir + "Orthologues_" + speciesDict[sp1] + "/"
                with open(d0 + '%s__v__%s.csv' % (speciesDict[sp0], speciesDict[sp1]), 'ab') as outfile1, open(d1 + '%s__v__%s.csv' % (speciesDict[sp1], speciesDict[sp0]), 'ab') as outfile2:
                    outfile2_sus = None
                    writer2_sus = None
                    try:
                        if qContainsSuspectOlogs:
                            outfile2_sus = open(resultsDir + "Putative_Xenologues/%s.csv" % speciesDict[sp1], 'ab')
                            writer2_sus = csv.writer(outfile2_sus, delimiter="\t")
                        writer1 = csv.writer(outfile1, delimiter="\t")
                        writer2 = csv.writer(outfile2, delimiter="\t")
                        _WriteOrthologuesForSpeciesPair(orthologues_alltrees, sequenceDict, nOrtho, sp0, sp1, isp0, isp1, writer1, writer2, writer1_sus, writer2_sus)
                    finally:
                        if outfile2_sus is not None:
                            outfile2_sus.close()
        finally:
            if outfile1_sus is not None:
                outfile1_sus.close()
    return nOrtho