def DoOrthologuesForOrthoFinder_Phyldog(ogSet, workingDirectory, GeneToSpecies, output_dir, reconTreesRenamedDir):
    """
    Read the PHYLDOG reconciled tree of every orthogroup, infer the orthologues
    from it and append them to the species-pair tables.

    Args:
        ogSet - orthogroup set; SpeciesDict(), SequenceDict(), Spec_SeqDict(),
                OGs() and speciesToUse are read from it
        workingDirectory - not used by this function
        GeneToSpecies - passed straight to GetOrthologues_from_phyldog_tree
        output_dir - directory prefix (ending in a separator) under which the
                     "Orthologues_<species>/" directories are created
        reconTreesRenamedDir - directory prefix (ending in a separator) for the
                               renamed reconciled trees
    Returns:
        the util.nOrtho_sp object accumulated over all orthogroups
    """
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)

    # One directory per species, holding one header-only table per other species
    for i_a, sp_a in enumerate(speciesIDs):
        name_a = speciesDict[str(sp_a)]
        sp_dir = output_dir + "Orthologues_" + name_a + "/"
        if not os.path.exists(sp_dir):
            os.mkdir(sp_dir)
        for i_b, sp_b in enumerate(speciesIDs):
            if i_b == i_a:
                continue
            name_b = speciesDict[str(sp_b)]
            with open(sp_dir + '%s__v__%s.tsv' % (name_a, name_b), 'wb') as outfile:
                header_writer = csv.writer(outfile, delimiter="\t")
                header_writer.writerow(("Orthogroup", name_a, name_b))

    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)

    # Progress is reported every 10, 100 or 1000 orthogroups, depending on how many there are
    if nOgs <= 200:
        report_every = 10
    elif nOgs <= 2000:
        report_every = 100
    else:
        report_every = 1000

    with open(files.FileHandler.GetDuplicationsFN(), 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            recon_tree = files.FileHandler.GetPhyldogOGResultsTreeFN(iog)
            orthologues = GetOrthologues_from_phyldog_tree(
                iog, recon_tree, GeneToSpecies,
                dupsWriter=dupWriter,
                seqIDs=ogSet.Spec_SeqDict(),
                spIDs=ogSet.SpeciesDict())
            util.RenameTreeTaxa(recon_tree,
                                reconTreesRenamedDir + "OG%07d_tree.txt" % iog,
                                ogSet.Spec_SeqDict(),
                                qSupport=False, qFixNegatives=True, label='n')
            if iog % report_every == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            # The tables are appended to one orthogroup at a time
            nOrthologues_SpPair += AppendOrthologuesToFiles(
                [(iog, orthologues)], speciesDict, ogSet.speciesToUse,
                SequenceDict, output_dir, False)
    return nOrthologues_SpPair
def DoOrthologuesForOrthoFinder_Phyldog(ogSet, workingDirectory, GeneToSpecies, workingDir, output_dir, reconTreesRenamedDir):
    """
    Infer orthologues from the PHYLDOG reconciled trees found under
    <workingDirectory>phyldog/Results/ and append them to the species-pair
    tables.

    Args:
        ogSet - orthogroup set; SpeciesDict(), SequenceDict(), Spec_SeqDict(),
                OGs() and speciesToUse are read from it
        workingDirectory - directory prefix (ending in a separator) containing
                           "phyldog/Results/"
        GeneToSpecies - passed straight to GetOrthologues_from_phyldog_tree
        workingDir - not used by this function
        output_dir - directory prefix (ending in a separator) under which the
                     "Orthologues_<species>/" directories are created
        reconTreesRenamedDir - directory prefix (ending in a separator) for the
                               renamed reconciled trees; "Duplications.csv" is
                               written one level above it
    Returns:
        the util.nOrtho_sp object accumulated over all orthogroups
    """
    resultsDir = workingDirectory + "phyldog/Results/"
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)

    # One directory per species, holding one header-only table per other species
    for i_a, sp_a in enumerate(speciesIDs):
        name_a = speciesDict[str(sp_a)]
        sp_dir = output_dir + "Orthologues_" + name_a + "/"
        if not os.path.exists(sp_dir):
            os.mkdir(sp_dir)
        for i_b, sp_b in enumerate(speciesIDs):
            if i_b == i_a:
                continue
            name_b = speciesDict[str(sp_b)]
            with open(sp_dir + '%s__v__%s.csv' % (name_a, name_b), 'wb') as outfile:
                header_writer = csv.writer(outfile, delimiter="\t")
                header_writer.writerow(("Orthogroup", name_a, name_b))

    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)

    # Progress is reported every 10, 100 or 1000 orthogroups, depending on how many there are
    if nOgs <= 200:
        report_every = 10
    elif nOgs <= 2000:
        report_every = 100
    else:
        report_every = 1000

    with open(reconTreesRenamedDir + "../Duplications.csv", 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            recon_tree = resultsDir + "OG%07d.ReconciledTree.txt" % iog
            orthologues = GetOrthologues_from_phyldog_tree(
                iog, recon_tree, GeneToSpecies,
                dupsWriter=dupWriter,
                seqIDs=ogSet.Spec_SeqDict(),
                spIDs=ogSet.SpeciesDict())
            util.RenameTreeTaxa(recon_tree,
                                reconTreesRenamedDir + "OG%07d_tree.txt" % iog,
                                ogSet.Spec_SeqDict(),
                                qSupport=False, qFixNegatives=True, label='n')
            if iog % report_every == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            # The tables are appended to one orthogroup at a time
            nOrthologues_SpPair += AppendOrthologuesToFiles(
                [(iog, orthologues)], speciesDict, ogSet.speciesToUse,
                SequenceDict, output_dir, False)
    return nOrthologues_SpPair
def AppendOrthologuesToFiles(orthologues_alltrees, speciesDict, iSpeciesToUse, sequenceDict, resultsDir):
    """
    Append the orthologues from a set of trees to the species-pair tables and
    count them.

    Args:
        orthologues_alltrees - iterable of (iog, orthologues_onetree); each
            orthologues_onetree is an iterable of (leavesL, leavesR) pairs,
            both indexed by species ID string and giving gene ID strings.
            NOTE(review): indexing a species that is absent must give an empty
            list (e.g. a defaultdict(list)) - confirm against the callers.
        speciesDict - species ID string -> species name
        iSpeciesToUse - IDs of the species being analysed; their positions are
            the row/column indices of the returned counts
        sequenceDict - "<species ID>_<gene ID>" -> name written to the tables
        resultsDir - directory prefix (ending in a separator) containing the
            "Orthologues_<species>/" directories; tables are opened for append
    Returns:
        util.nOrtho_sp object with the counts for these trees
    """
    sp_to_index = {str(sp): i for i, sp in enumerate(iSpeciesToUse)}
    nOrtho = util.nOrtho_sp(len(iSpeciesToUse))
    # Iterate over the species in use rather than over every key of speciesDict:
    # sp_to_index only knows the species in use, so a species that is in the
    # dictionary but not being analysed would otherwise raise a KeyError.
    species = [str(sp) for sp in iSpeciesToUse]
    nSpecies = len(species)
    for i in xrange(nSpecies):
        sp0 = species[i]
        strsp0 = sp0 + "_"
        isp0 = sp_to_index[sp0]
        d0 = resultsDir + "Orthologues_" + speciesDict[sp0] + "/"
        for j in xrange(i + 1, nSpecies):
            sp1 = species[j]
            if sp1 == sp0:
                continue
            strsp1 = sp1 + "_"
            isp1 = sp_to_index[sp1]
            d1 = resultsDir + "Orthologues_" + speciesDict[sp1] + "/"
            fn01 = d0 + '%s__v__%s.csv' % (speciesDict[sp0], speciesDict[sp1])
            fn10 = d1 + '%s__v__%s.csv' % (speciesDict[sp1], speciesDict[sp0])
            with open(fn01, 'ab') as outfile1, open(fn10, 'ab') as outfile2:
                writer1 = csv.writer(outfile1, delimiter="\t")
                writer2 = csv.writer(outfile2, delimiter="\t")
                for iog, ortholouges_onetree in orthologues_alltrees:
                    og = "OG%07d" % iog
                    for leavesL, leavesR in ortholouges_onetree:
                        nL0 = len(leavesL[sp0])
                        nR0 = len(leavesR[sp0])
                        nL1 = len(leavesL[sp1])
                        nR1 = len(leavesR[sp1])
                        if nL0*nR1 + nL1*nR0 == 0:
                            continue    # no orthologues between these two species here
                        # each species can be in only one of L and R at most: they might both be in the same half
                        if nL0 > 0:
                            # then nR0 == 0, so nR1 > 0 because of the check above
                            genes0 = leavesL[sp0]
                            genes1 = leavesR[sp1]
                        else:
                            genes0 = leavesR[sp0]
                            genes1 = leavesL[sp1]
                        n0 = len(genes0)
                        n1 = len(genes1)
                        text0 = ", ".join([sequenceDict[strsp0 + g] for g in genes0])
                        text1 = ", ".join([sequenceDict[strsp1 + g] for g in genes1])
                        # the same relationship is recorded in both species' tables
                        writer1.writerow((og, text0, text1))
                        writer2.writerow((og, text1, text0))
                        nOrtho.n[isp0, isp1] += n0
                        nOrtho.n[isp1, isp0] += n1
                        if n0 == 1 and n1 == 1:
                            nOrtho.n_121[isp0, isp1] += 1
                            nOrtho.n_121[isp1, isp0] += 1
                        elif n0 == 1:
                            nOrtho.n_12m[isp0, isp1] += 1
                            nOrtho.n_m21[isp1, isp0] += n1
                        elif n1 == 1:
                            nOrtho.n_m21[isp0, isp1] += n0
                            nOrtho.n_12m[isp1, isp0] += 1
                        else:
                            nOrtho.n_m2m[isp0, isp1] += n0
                            nOrtho.n_m2m[isp1, isp0] += n1
    return nOrtho
def DoOrthologuesForOrthoFinder(ogSet, treesIDsPatFn, species_tree_rooted_fn, GeneToSpecies, workingDir, output_dir, reconTreesRenamedDir, all_stride_dup_genes):
    """
    Infer the orthologues for every orthogroup tree and write them, the
    putative xenologues and the genes flagged as suspect to the results files.

    Args:
        ogSet - orthogroup set; SpeciesDict(), SequenceDict(), Spec_SeqDict(),
                OGs() and speciesToUse are read from it
        treesIDsPatFn - callable, iog -> value handed to GetOrthologues_for_tree
                        as the tree for that orthogroup
        species_tree_rooted_fn - argument for tree_lib.Tree giving the rooted
                                 species tree
        GeneToSpecies - passed straight to GetOrthologues_for_tree
        workingDir - not used by this function
        output_dir - directory prefix (ending in a separator) for the results
        reconTreesRenamedDir - directory prefix (ending in a separator) for the
                               renamed reconciled trees; "Duplications.csv" is
                               written one level above it
        all_stride_dup_genes - passed straight to GetOrthologues_for_tree
    Returns:
        the util.nOrtho_sp object accumulated over all orthogroups
    """
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()

    # Write directory and file structure: header-only tables that are appended to later
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    dSuspect = output_dir + "Putative_Xenologues/"
    if not os.path.exists(dSuspect):
        os.mkdir(dSuspect)
    for index1 in xrange(nspecies):
        name1 = speciesDict[str(speciesIDs[index1])]
        with open(dSuspect + '%s.csv' % name1, 'wb') as outfile:
            writer1 = csv.writer(outfile, delimiter="\t")
            writer1.writerow(("Orthogroup", name1, "Other"))
        d = output_dir + "Orthologues_" + name1 + "/"
        if not os.path.exists(d):
            os.mkdir(d)
        for index2 in xrange(nspecies):
            if index2 == index1:
                continue
            name2 = speciesDict[str(speciesIDs[index2])]
            with open(d + '%s__v__%s.csv' % (name1, name2), 'wb') as outfile:
                writer1 = csv.writer(outfile, delimiter="\t")
                writer1.writerow(("Orthogroup", name1, name2))

    # Infer orthologues and write them to file
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    neighbours = GetSpeciesNeighbours(species_tree_rooted)
    # Label nodes of species tree: root is N0, other internal nodes N1, N2, ... in traversal order
    species_tree_rooted.name = "N0"
    iNode = 1
    for n in species_tree_rooted.traverse():
        if (not n.is_leaf()) and (not n.is_root()):
            n.name = "N%d" % iNode
            iNode += 1

    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    species = speciesDict.keys()
    with open(reconTreesRenamedDir + "../Duplications.csv", 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            orthologues, recon_tree, suspect_genes = GetOrthologues_for_tree(
                iog, treesIDsPatFn(iog), species_tree_rooted, GeneToSpecies,
                neighbours, dupsWriter=dupWriter, seqIDs=ogSet.Spec_SeqDict(),
                spIDs=ogSet.SpeciesDict(), all_stride_dup_genes=all_stride_dup_genes)
            for index0 in xrange(nspecies):
                strsp0 = species[index0]
                strsp0_ = strsp0 + "_"
                these_genes = [g for g in suspect_genes if g.startswith(strsp0_)]
                if len(these_genes) > 0:
                    with open(output_dir + "Orthologues_" + speciesDict[strsp0] + "/Putative_Horizontal_Gene_Transfer.txt", 'ab') as outfile:
                        # Write every suspect gene of this species. Looking up a
                        # bare `g` here would only see whatever the comprehension
                        # above left behind (Python 2) or nothing at all (Python 3).
                        outfile.write("\n".join([SequenceDict[g] for g in these_genes]) + "\n")
            allOrthologues = [(iog, orthologues)]
            util.RenameTreeTaxa(recon_tree, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, ogSet.Spec_SeqDict(), qSupport=False, qFixNegatives=True, label='n')
            # report progress every 10, 100 or 1000 orthogroups depending on how many there are
            if iog >= 0 and divmod(iog, 10 if nOgs <= 200 else 100 if nOgs <= 2000 else 1000)[1] == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict, output_dir, True)
    return nOrthologues_SpPair
def DoOrthologuesForOrthoFinder(ogSet, treesIDsPatFn, species_tree_rooted_fn, GeneToSpecies, workingDir, output_dir, reconTreesRenamedDir, all_stride_dup_genes):
    """
    Infer the orthologues for every orthogroup tree and write them, the
    putative xenologues and the genes flagged as suspect to the results files.

    Args:
        ogSet - orthogroup set; SpeciesDict(), SequenceDict(), Spec_SeqDict(),
                OGs() and speciesToUse are read from it
        treesIDsPatFn - callable, iog -> value handed to
                        GetOrthologues_from_tree as the tree for that orthogroup
        species_tree_rooted_fn - argument for tree_lib.Tree giving the rooted
                                 species tree
        GeneToSpecies - passed straight to GetOrthologues_from_tree
        workingDir - not used by this function
        output_dir - directory prefix (ending in a separator) for the results
        reconTreesRenamedDir - directory prefix (ending in a separator) for the
                               renamed reconciled trees; "Duplications.csv" is
                               written one level above it
        all_stride_dup_genes - passed straight to GetOrthologues_from_tree
    Returns:
        the util.nOrtho_sp object accumulated over all orthogroups
    """
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()

    # Write directory and file structure: header-only tables that are appended to later
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    dSuspect = output_dir + "Putative_Xenologues/"
    if not os.path.exists(dSuspect):
        os.mkdir(dSuspect)
    for index1 in xrange(nspecies):
        name1 = speciesDict[str(speciesIDs[index1])]
        with open(dSuspect + '%s.csv' % name1, 'wb') as outfile:
            writer1 = csv.writer(outfile, delimiter="\t")
            writer1.writerow(("Orthogroup", name1, "Other"))
        d = output_dir + "Orthologues_" + name1 + "/"
        if not os.path.exists(d):
            os.mkdir(d)
        for index2 in xrange(nspecies):
            if index2 == index1:
                continue
            name2 = speciesDict[str(speciesIDs[index2])]
            with open(d + '%s__v__%s.csv' % (name1, name2), 'wb') as outfile:
                writer1 = csv.writer(outfile, delimiter="\t")
                writer1.writerow(("Orthogroup", name1, name2))

    # Infer orthologues and write them to file
    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    neighbours = GetSpeciesNeighbours(species_tree_rooted)
    # Label nodes of species tree: root is N0, other internal nodes N1, N2, ... in traversal order
    species_tree_rooted.name = "N0"
    iNode = 1
    for n in species_tree_rooted.traverse():
        if (not n.is_leaf()) and (not n.is_root()):
            n.name = "N%d" % iNode
            iNode += 1

    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    species = speciesDict.keys()
    with open(reconTreesRenamedDir + "../Duplications.csv", 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            orthologues, recon_tree, suspect_genes = GetOrthologues_from_tree(
                iog, treesIDsPatFn(iog), species_tree_rooted, GeneToSpecies,
                neighbours, dupsWriter=dupWriter, seqIDs=ogSet.Spec_SeqDict(),
                spIDs=ogSet.SpeciesDict(), all_stride_dup_genes=all_stride_dup_genes)
            for index0 in xrange(nspecies):
                strsp0 = species[index0]
                strsp0_ = strsp0 + "_"
                these_genes = [g for g in suspect_genes if g.startswith(strsp0_)]
                if len(these_genes) > 0:
                    with open(output_dir + "Orthologues_" + speciesDict[strsp0] + "/Putative_Horizontal_Gene_Transfer.txt", 'ab') as outfile:
                        # Write every suspect gene of this species. Looking up a
                        # bare `g` here would only see whatever the comprehension
                        # above left behind (Python 2) or nothing at all (Python 3).
                        outfile.write("\n".join([SequenceDict[g] for g in these_genes]) + "\n")
            allOrthologues = [(iog, orthologues)]
            util.RenameTreeTaxa(recon_tree, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, ogSet.Spec_SeqDict(), qSupport=False, qFixNegatives=True, label='n')
            # report progress every 10, 100 or 1000 orthogroups depending on how many there are
            if iog >= 0 and divmod(iog, 10 if nOgs <= 200 else 100 if nOgs <= 2000 else 1000)[1] == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict, output_dir, True)
    return nOrthologues_SpPair
def TwoAndThreeGeneOrthogroups(ogSet, resultsDir):
    """
    Work out the orthologues of the two- and three-gene orthogroups directly
    from their species composition and append them to the results tables.

    Args:
        ogSet - orthogroup set; SpeciesDict(), SequenceDict(),
                OGs(qInclAll=True) and speciesToUse are read from it
        resultsDir - passed to trees2ologs_of.AppendOrthologuesToFiles
    Returns:
        util.nOrtho_sp object with the counts for these orthogroups
    """
    speciesDict = ogSet.SpeciesDict()
    sequenceDict = ogSet.SequenceDict()
    ogs = ogSet.OGs(qInclAll=True)
    nOrthologues_SpPair = util.nOrtho_sp(len(ogSet.speciesToUse))
    all_orthologues = []
    # shared placeholder for the two "suspect" entries of every tuple
    d_empty = defaultdict(list)

    def one_gene(sp_str, seq_str):
        """Species -> genes mapping that holds a single gene"""
        d = defaultdict(list)
        d[sp_str].append(seq_str)
        return d

    for iog, og in enumerate(ogs):
        n = len(og)
        if n == 1:
            # processing stops at the first single-gene orthogroup
            break
        if n >= 4:
            continue
        # orthologues is a list of tuples of dictionaries; each dictionary is
        # species -> list of genes in that species
        if n == 2:
            gene_a, gene_b = og[0], og[1]
            if gene_a.iSp == gene_b.iSp:
                continue
            left = one_gene(str(gene_a.iSp), str(gene_a.iSeq))
            right = one_gene(str(gene_b.iSp), str(gene_b.iSeq))
            orthologues = [(left, right, d_empty, d_empty)]
        elif n == 3:
            c = Counter([gene.iSp for gene in og])
            nSp = len(c)
            if nSp == 3:
                ids = [(str(gene.iSp), str(gene.iSeq)) for gene in og]
                # first gene against the other two ...
                left = one_gene(ids[0][0], ids[0][1])
                right = defaultdict(list)
                right[ids[1][0]].append(ids[1][1])
                right[ids[2][0]].append(ids[2][1])
                orthologues = [(left, right, d_empty, d_empty)]
                # ... then the second gene against the third
                left = one_gene(ids[1][0], ids[1][1])
                right = one_gene(ids[2][0], ids[2][1])
                orthologues.append((left, right, d_empty, d_empty))
            elif nSp == 2:
                sp0, sp1 = c.keys()
                left = defaultdict(list)
                left[str(sp0)] = [str(gene.iSeq) for gene in og if gene.iSp == sp0]
                right = defaultdict(list)
                right[str(sp1)] = [str(gene.iSeq) for gene in og if gene.iSp == sp1]
                orthologues = [(left, right, d_empty, d_empty)]
            else:
                continue    # all three genes are from one species: no orthologues
        all_orthologues.append((iog, orthologues))

    nOrthologues_SpPair += trees2ologs_of.AppendOrthologuesToFiles(
        all_orthologues, speciesDict, ogSet.speciesToUse, sequenceDict, resultsDir, False)
    return nOrthologues_SpPair
def species_write_all(ogSet, pickleDir, resultsDir):
    """
    Create the per-species orthologue directories, then find and write the
    orthologues of every pair of species.

    Args:
        ogSet - orthogroup set; SpeciesDict() and speciesToUse are read from
                it and it is passed on to WriteOrthologues
        pickleDir - passed to multiply
        resultsDir - directory prefix (ending in a separator) in which the
                     "Orthologues_<species>" directories are created
    Returns:
        the util.nOrtho_sp object that was handed to WriteOrthologues for
        every species pair
    """
    speciesDict = ogSet.SpeciesDict()
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)

    for sp in speciesIDs:
        sp_dir = resultsDir + "Orthologues_" + speciesDict[str(sp)]
        if not os.path.exists(sp_dir):
            os.mkdir(sp_dir)

    # every unordered pair once, lower index first
    for iLow, iHigh in itertools.combinations(xrange(nspecies), 2):
        product, M = multiply(iLow, iHigh, pickleDir)
        orthologues = find_all(product, M)
        # note that the higher-index species is passed first
        WriteOrthologues(resultsDir, speciesIDs[iHigh], speciesIDs[iLow],
                         orthologues, ogSet, nOrthologues_SpPair, iHigh, iLow)
    return nOrthologues_SpPair
def DoOrthologuesForOrthoFinder(ogSet, treesIDsPatFn, species_tree_rooted_fn, GeneToSpecies, workingDir, output_dir, reconTreesRenamedDir, all_stride_dup_genes):
    """
    Infer the orthologues for every orthogroup tree, then write all of them to
    the species-pair tables in one go.

    Args:
        ogSet - orthogroup set; SpeciesDict(), SequenceDict(), Spec_SeqDict(),
                OGs() and speciesToUse are read from it
        treesIDsPatFn - callable, iog -> value handed to GetOrthologues_for_tree
                        as the tree for that orthogroup
        species_tree_rooted_fn - argument for tree_lib.Tree giving the rooted
                                 species tree
        GeneToSpecies - passed straight to GetOrthologues_for_tree
        workingDir - not used by this function
        output_dir - directory prefix (ending in a separator) for the results
        reconTreesRenamedDir - directory prefix (ending in a separator) for the
                               renamed reconciled trees; "Duplications.csv" is
                               written one level above it
        all_stride_dup_genes - passed straight to GetOrthologues_for_tree
    Returns:
        util.nOrtho_sp object with the orthologue counts
    """
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)

    # One directory per species, holding one header-only table per other species
    for i_a, sp_a in enumerate(speciesIDs):
        name_a = speciesDict[str(sp_a)]
        sp_dir = output_dir + "Orthologues_" + name_a + "/"
        if not os.path.exists(sp_dir):
            os.mkdir(sp_dir)
        for i_b, sp_b in enumerate(speciesIDs):
            if i_b == i_a:
                continue
            name_b = speciesDict[str(sp_b)]
            with open(sp_dir + '%s__v__%s.csv' % (name_a, name_b), 'wb') as outfile:
                header_writer = csv.writer(outfile, delimiter="\t")
                header_writer.writerow(("Orthogroup", name_a, name_b))

    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    # Name the root N0 and the remaining internal nodes N1, N2, ... in traversal order
    species_tree_rooted.name = "N0"
    iNode = 1
    for node in species_tree_rooted.traverse():
        if node.is_leaf() or node.is_root():
            continue
        node.name = "N%d" % iNode
        iNode += 1

    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)

    # Progress is reported every 10, 100 or 1000 orthogroups, depending on how many there are
    if nOgs <= 200:
        report_every = 10
    elif nOgs <= 2000:
        report_every = 100
    else:
        report_every = 1000

    allOrthologues = []
    with open(reconTreesRenamedDir + "../Duplications.csv", 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            orthologues, recon_tree = GetOrthologues_for_tree(
                iog, treesIDsPatFn(iog), species_tree_rooted, GeneToSpecies,
                dupsWriter=dupWriter,
                seqIDs=ogSet.Spec_SeqDict(),
                spIDs=ogSet.SpeciesDict(),
                all_stride_dup_genes=all_stride_dup_genes)
            allOrthologues.append((iog, orthologues))
            util.RenameTreeTaxa(recon_tree,
                                reconTreesRenamedDir + "OG%07d_tree.txt" % iog,
                                ogSet.Spec_SeqDict(),
                                qFixNegatives=True, label='n')
            if iog % report_every == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
    # the orthologues of all orthogroups are written together at the end
    nOrthologues_SpPair += AppendOrthologuesToFiles(
        allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict, output_dir)
    return nOrthologues_SpPair
def AppendOrthologuesToFiles(orthologues_alltrees, speciesDict, iSpeciesToUse, sequenceDict, resultsDir, qContainsSuspectOlogs):
    """
    Append the orthologues (and, optionally, the suspect orthologues) from a
    set of trees to the species-pair tables and count the orthologues.

    Args:
        orthologues_alltrees - iterable of (iog, orthologues_onetree); each
            orthologues_onetree is an iterable of
            (leavesL, leavesR, leavesL_sus, leavesR_sus), all indexed by
            species ID string and giving lists of gene ID strings.
            NOTE(review): indexing a species that is absent must give an empty
            list (e.g. a defaultdict(list)) - confirm against the callers.
        speciesDict - species ID string -> species name
        iSpeciesToUse - IDs of the species being analysed; their positions are
            the row/column indices of the returned counts
        sequenceDict - "<species ID>_<gene ID>" -> name written to the tables
        resultsDir - directory prefix (ending in a separator) containing the
            "Orthologues_<species>/" and "Putative_Xenologues/" directories;
            all tables are opened for append
        qContainsSuspectOlogs - if True the suspect orthologues are written to
            the "Putative_Xenologues/" tables as well
    Returns:
        util.nOrtho_sp object with the counts for these trees (suspect
        orthologues are not counted)
    """
    sp_to_index = {str(sp): i for i, sp in enumerate(iSpeciesToUse)}
    nOrtho = util.nOrtho_sp(len(iSpeciesToUse))
    nSpecies = len(iSpeciesToUse)
    for i in xrange(nSpecies):
        sp0 = str(iSpeciesToUse[i])
        outfile1_sus = None
        # try/finally so the suspect-orthologue files are closed even if a
        # lookup or a write fails part way through
        try:
            if qContainsSuspectOlogs:
                outfile1_sus = open(resultsDir + "Putative_Xenologues/%s.csv" % speciesDict[sp0], 'ab')
                writer1_sus = csv.writer(outfile1_sus, delimiter="\t")
            strsp0 = sp0 + "_"
            isp0 = sp_to_index[sp0]
            d0 = resultsDir + "Orthologues_" + speciesDict[sp0] + "/"
            for j in xrange(i, nSpecies):
                sp1 = str(iSpeciesToUse[j])
                if sp1 == sp0:
                    continue
                strsp1 = sp1 + "_"
                isp1 = sp_to_index[sp1]
                d1 = resultsDir + "Orthologues_" + speciesDict[sp1] + "/"
                fn01 = d0 + '%s__v__%s.csv' % (speciesDict[sp0], speciesDict[sp1])
                fn10 = d1 + '%s__v__%s.csv' % (speciesDict[sp1], speciesDict[sp0])
                with open(fn01, 'ab') as outfile1, open(fn10, 'ab') as outfile2:
                    outfile2_sus = None
                    try:
                        if qContainsSuspectOlogs:
                            outfile2_sus = open(resultsDir + "Putative_Xenologues/%s.csv" % speciesDict[sp1], 'ab')
                            writer2_sus = csv.writer(outfile2_sus, delimiter="\t")
                        writer1 = csv.writer(outfile1, delimiter="\t")
                        writer2 = csv.writer(outfile2, delimiter="\t")
                        for iog, ortholouges_onetree in orthologues_alltrees:
                            og = "OG%07d" % iog
                            for leavesL, leavesR, leavesL_sus, leavesR_sus in ortholouges_onetree:
                                # suspect_genes are the genes which, for this level, the orthologues should be
                                # considered suspect as the gene appears misplaced (at this level)
                                nL0 = len(leavesL[sp0])
                                nR0 = len(leavesR[sp0])
                                nL1 = len(leavesL[sp1])
                                nR1 = len(leavesR[sp1])
                                if nL0*nR1 + nL1*nR0 != 0:
                                    # each species can be in only one of L and R at most: they might both be in the same half
                                    if nL0 > 0:
                                        # then nR0 == 0, so nR1 > 0 because of the check above
                                        n0 = nL0
                                        n1 = nR1
                                        text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesL[sp0]])
                                        text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesR[sp1]])
                                    else:
                                        n0 = nR0
                                        n1 = nL1
                                        text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesR[sp0]])
                                        text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesL[sp1]])
                                    # the same relationship is recorded in both species' tables
                                    writer1.writerow((og, text0, text1))
                                    writer2.writerow((og, text1, text0))
                                    nOrtho.n[isp0, isp1] += n0
                                    nOrtho.n[isp1, isp0] += n1
                                    if n0 == 1 and n1 == 1:
                                        nOrtho.n_121[isp0, isp1] += 1
                                        nOrtho.n_121[isp1, isp0] += 1
                                    elif n0 == 1:
                                        nOrtho.n_12m[isp0, isp1] += 1
                                        nOrtho.n_m21[isp1, isp0] += n1
                                    elif n1 == 1:
                                        nOrtho.n_m21[isp0, isp1] += n0
                                        nOrtho.n_12m[isp1, isp0] += 1
                                    else:
                                        nOrtho.n_m2m[isp0, isp1] += n0
                                        nOrtho.n_m2m[isp1, isp0] += n1
                                # Write suspect orthologues
                                if not qContainsSuspectOlogs:
                                    continue
                                nL0s = len(leavesL_sus[sp0])
                                nR0s = len(leavesR_sus[sp0])
                                nL1s = len(leavesL_sus[sp1])
                                nR1s = len(leavesR_sus[sp1])
                                # suspect genes of sp0 are paired with every gene of sp1 (suspect or not) in the other half
                                if nL0s*(nR1+nR1s) + (nL1+nL1s)*nR0s != 0:
                                    if nL0s > 0:
                                        text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesL_sus[sp0]])
                                        text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesR[sp1]+leavesR_sus[sp1]])
                                    else:
                                        text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesR_sus[sp0]])
                                        text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesL[sp1]+leavesL_sus[sp1]])
                                    writer1_sus.writerow((og, text0, text1))
                                    writer2_sus.writerow((og, text1, text0))
                    finally:
                        if outfile2_sus is not None:
                            outfile2_sus.close()
        finally:
            if outfile1_sus is not None:
                outfile1_sus.close()
    return nOrtho
def DoOrthologuesForOrthoFinder(ogSet, species_tree_rooted_fn, GeneToSpecies, all_stride_dup_genes, qNoRecon):
    """
    Root each orthogroup tree, infer the orthologues from it and write the
    orthologues, duplications, suspect genes and renamed trees to the
    locations supplied by files.FileHandler.

    Args:
        ogSet - orthogroup set; SpeciesDict(), SequenceDict(), Spec_SeqDict(),
                OGs() and speciesToUse are read from it
        species_tree_rooted_fn - argument for tree_lib.Tree giving the rooted
                                 species tree
        GeneToSpecies - passed to CheckAndRootTree and GetOrthologues_from_tree
        all_stride_dup_genes - passed straight to GetOrthologues_from_tree
        qNoRecon - passed straight to GetOrthologues_from_tree
    Returns:
        the util.nOrtho_sp object accumulated over all orthogroups
    """
    speciesDict = ogSet.SpeciesDict()
    SequenceDict = ogSet.SequenceDict()
    # the suspect-gene outputs are only set up once a first suspect gene is seen
    qInitialisedSuspectGenesDirs = False
    speciesIDs = ogSet.speciesToUse
    nspecies = len(speciesIDs)
    dResultsOrthologues = files.FileHandler.GetOrthologuesDirectory()

    # One directory per species, holding one header-only table per other species
    for i_a, sp_a in enumerate(speciesIDs):
        name_a = speciesDict[str(sp_a)]
        sp_dir = dResultsOrthologues + "Orthologues_" + name_a + "/"
        if not os.path.exists(sp_dir):
            os.mkdir(sp_dir)
        for i_b, sp_b in enumerate(speciesIDs):
            if i_b == i_a:
                continue
            name_b = speciesDict[str(sp_b)]
            with open(sp_dir + '%s__v__%s.tsv' % (name_a, name_b), 'wb') as outfile:
                header_writer = csv.writer(outfile, delimiter="\t")
                header_writer.writerow(("Orthogroup", name_a, name_b))

    species_tree_rooted = tree_lib.Tree(species_tree_rooted_fn)
    neighbours = GetSpeciesNeighbours(species_tree_rooted)
    # Name the root N0 and the remaining internal nodes N1, N2, ... in traversal order
    species_tree_rooted.name = "N0"
    iNode = 1
    for node in species_tree_rooted.traverse():
        if node.is_leaf() or node.is_root():
            continue
        node.name = "N%d" % iNode
        iNode += 1

    nOgs = len(ogSet.OGs())
    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
    species = speciesDict.keys()
    reconTreesRenamedDir = files.FileHandler.GetOGsReconTreeDir(True)
    spec_seq_dict = ogSet.Spec_SeqDict()

    # Progress is reported every 10, 100 or 1000 orthogroups, depending on how many there are
    if nOgs <= 200:
        report_every = 10
    elif nOgs <= 2000:
        report_every = 100
    else:
        report_every = 1000

    with open(files.FileHandler.GetDuplicationsFN(), 'wb') as outfile:
        dupWriter = csv.writer(outfile, delimiter="\t")
        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
        for iog in xrange(nOgs):
            # this can be parallelised easily
            rooted_tree_ids, qHaveSupport = CheckAndRootTree(
                files.FileHandler.GetOGsTreeFN(iog), species_tree_rooted, GeneToSpecies)
            if rooted_tree_ids is None:
                continue
            # Rooted tree, written with accessions in place of IDs
            util.RenameTreeTaxa(rooted_tree_ids,
                                files.FileHandler.GetOGsTreeFN(iog, True),
                                spec_seq_dict,
                                qSupport=qHaveSupport, qFixNegatives=True, qViaCopy=True)
            orthologues, recon_tree, suspect_genes = GetOrthologues_from_tree(
                iog, rooted_tree_ids, species_tree_rooted, GeneToSpecies, neighbours,
                dupsWriter=dupWriter,
                seqIDs=spec_seq_dict,
                spIDs=ogSet.SpeciesDict(),
                all_stride_dup_genes=all_stride_dup_genes,
                qNoRecon=qNoRecon)
            qContainsSuspectGenes = len(suspect_genes) > 0

            if qContainsSuspectGenes and not qInitialisedSuspectGenesDirs:
                qInitialisedSuspectGenesDirs = True
                dSuspectGenes = files.FileHandler.GetSuspectGenesDir()
                dSuspectOrthologues = files.FileHandler.GetPutativeXenelogsDir()
                for sp in speciesIDs:
                    sp_name = speciesDict[str(sp)]
                    with open(dSuspectOrthologues + '%s.tsv' % sp_name, 'wb') as outfile:
                        header_writer = csv.writer(outfile, delimiter="\t")
                        header_writer.writerow(("Orthogroup", sp_name, "Other"))

            # Suspect genes are listed per species, selected by their "<species ID>_" prefix
            for index0 in xrange(nspecies):
                strsp0 = species[index0]
                prefix = strsp0 + "_"
                these_genes = [gene for gene in suspect_genes if gene.startswith(prefix)]
                if these_genes:
                    with open(dSuspectGenes + speciesDict[strsp0] + ".txt", 'ab') as outfile:
                        names = [SequenceDict[gene] for gene in these_genes]
                        outfile.write("\n".join(names) + "\n")

            # don't relabel nodes, they've already been done
            util.RenameTreeTaxa(recon_tree,
                                reconTreesRenamedDir + "OG%07d_tree.txt" % iog,
                                spec_seq_dict,
                                qSupport=False, qFixNegatives=True)
            if iog % report_every == 0:
                util.PrintTime("Done %d of %d" % (iog, nOgs))
            # The tables are appended to one orthogroup at a time
            nOrthologues_SpPair += AppendOrthologuesToFiles(
                [(iog, orthologues)], speciesDict, ogSet.speciesToUse,
                SequenceDict, dResultsOrthologues, qContainsSuspectGenes)
    return nOrthologues_SpPair
def AppendOrthologuesToFiles(orthologues_alltrees, speciesDict, iSpeciesToUse, sequenceDict, resultsDir, ortholog_file_writers, suspect_genes_file_writers, qContainsSuspectOlogs):
    """
    Write the orthologues (and, optionally, the suspect orthologues) from a set
    of trees using already-open writers, and count the orthologues.

    Args:
        orthologues_alltrees - iterable of (iog, orthologues_onetree); each
            orthologues_onetree is an iterable of
            (leavesL, leavesR, leavesL_sus, leavesR_sus), all indexed by
            species ID string and giving lists of gene ID strings
        speciesDict - not used by this function
        iSpeciesToUse - IDs of the species being analysed; their positions
            index the writers and the returned counts
        sequenceDict - "<species ID>_<gene ID>" -> name written to the tables
        resultsDir - not used by this function
        ortholog_file_writers - ortholog_file_writers[i][j] is the writer for
            the table of species i versus species j
        suspect_genes_file_writers - suspect_genes_file_writers[i] is the
            writer for the suspect orthologues of species i; only read when
            qContainsSuspectOlogs is True
        qContainsSuspectOlogs - if True the suspect orthologues are written too
    Returns:
        util.nOrtho_sp object with the counts for these trees (suspect
        orthologues are not counted)
    """
    sp_to_index = {str(sp): i for i, sp in enumerate(iSpeciesToUse)}
    nOrtho = util.nOrtho_sp(len(iSpeciesToUse))
    if qContainsSuspectOlogs:
        # The value is not used below; the call is kept in case it has side effects
        dSuspect = files.FileHandler.GetPutativeXenelogsDir()

    def gene_names(prefix, genes):
        """Comma-separated names of the genes of the species with this ID prefix"""
        return ", ".join([sequenceDict[prefix + gene] for gene in genes])

    nSpecies = len(iSpeciesToUse)
    for i in xrange(nSpecies):
        sp0 = str(iSpeciesToUse[i])
        if qContainsSuspectOlogs:
            writer1_sus = suspect_genes_file_writers[i]
        strsp0 = sp0 + "_"
        isp0 = sp_to_index[sp0]
        for j in xrange(i, nSpecies):
            sp1 = str(iSpeciesToUse[j])
            if sp1 == sp0:
                continue
            strsp1 = sp1 + "_"
            isp1 = sp_to_index[sp1]
            if qContainsSuspectOlogs:
                writer2_sus = suspect_genes_file_writers[j]
            writer1 = ortholog_file_writers[i][j]
            writer2 = ortholog_file_writers[j][i]
            for iog, ortholouges_onetree in orthologues_alltrees:
                og = "OG%07d" % iog
                for leavesL, leavesR, leavesL_sus, leavesR_sus in ortholouges_onetree:
                    # the *_sus entries hold the genes whose orthologues should be treated as
                    # suspect at this level, because the gene appears misplaced there
                    nL0 = len(leavesL[sp0])
                    nR0 = len(leavesR[sp0])
                    nL1 = len(leavesL[sp1])
                    nR1 = len(leavesR[sp1])
                    if nL0 * nR1 + nL1 * nR0 != 0:
                        # a species is in at most one of L and R (both species may be in the same half)
                        if nL0 > 0:
                            # so nR0 == 0, and nR1 > 0 follows from the test above
                            n0, n1 = nL0, nR1
                            text0 = gene_names(strsp0, leavesL[sp0])
                            text1 = gene_names(strsp1, leavesR[sp1])
                        else:
                            n0, n1 = nR0, nL1
                            text0 = gene_names(strsp0, leavesR[sp0])
                            text1 = gene_names(strsp1, leavesL[sp1])
                        # the same relationship goes into both species' tables
                        writer1.writerow((og, text0, text1))
                        writer2.writerow((og, text1, text0))
                        nOrtho.n[isp0, isp1] += n0
                        nOrtho.n[isp1, isp0] += n1
                        if n0 == 1 and n1 == 1:
                            nOrtho.n_121[isp0, isp1] += 1
                            nOrtho.n_121[isp1, isp0] += 1
                        elif n0 == 1:
                            nOrtho.n_12m[isp0, isp1] += 1
                            nOrtho.n_m21[isp1, isp0] += n1
                        elif n1 == 1:
                            nOrtho.n_m21[isp0, isp1] += n0
                            nOrtho.n_12m[isp1, isp0] += 1
                        else:
                            nOrtho.n_m2m[isp0, isp1] += n0
                            nOrtho.n_m2m[isp1, isp0] += n1
                    if qContainsSuspectOlogs:
                        nL0s = len(leavesL_sus[sp0])
                        nR0s = len(leavesR_sus[sp0])
                        nL1s = len(leavesL_sus[sp1])
                        nR1s = len(leavesR_sus[sp1])
                        # suspect genes of sp0 against every gene of sp1 (suspect or not) in the other half
                        if nL0s * (nR1 + nR1s) + (nL1 + nL1s) * nR0s != 0:
                            if nL0s > 0:
                                text0 = gene_names(strsp0, leavesL_sus[sp0])
                                text1 = gene_names(strsp1, leavesR[sp1] + leavesR_sus[sp1])
                            else:
                                text0 = gene_names(strsp0, leavesR_sus[sp0])
                                text1 = gene_names(strsp1, leavesL[sp1] + leavesL_sus[sp1])
                            writer1_sus.writerow((og, text0, text1))
                            writer2_sus.writerow((og, text1, text0))
    return nOrtho
def AppendOrthologuesToFiles(orthologues_alltrees, speciesDict, iSpeciesToUse, sequenceDict, resultsDir, qContainsSuspectOlogs):
    """
    Append the orthologues (and, optionally, the suspect orthologues) from a
    set of trees to the species-pair tables and count the orthologues.

    Args:
        orthologues_alltrees - iterable of (iog, orthologues_onetree); each
            orthologues_onetree is an iterable of
            (leavesL, leavesR, leavesL_sus, leavesR_sus), all indexed by
            species ID string and giving lists of gene ID strings.
            NOTE(review): indexing a species that is absent must give an empty
            list (e.g. a defaultdict(list)) - confirm against the callers.
        speciesDict - species ID string -> species name
        iSpeciesToUse - IDs of the species being analysed; their positions are
            the row/column indices of the returned counts
        sequenceDict - "<species ID>_<gene ID>" -> name written to the tables
        resultsDir - directory prefix (ending in a separator) containing the
            "Orthologues_<species>/" and "Putative_Xenologues/" directories;
            all tables are opened for append
        qContainsSuspectOlogs - if True the suspect orthologues are written to
            the "Putative_Xenologues/" tables as well
    Returns:
        util.nOrtho_sp object with the counts for these trees (suspect
        orthologues are not counted)
    """
    sp_to_index = {str(sp): i for i, sp in enumerate(iSpeciesToUse)}
    nOrtho = util.nOrtho_sp(len(iSpeciesToUse))
    nSpecies = len(iSpeciesToUse)
    for i in xrange(nSpecies):
        sp0 = str(iSpeciesToUse[i])
        outfile1_sus = None
        # try/finally so the suspect-orthologue files are closed even if a
        # lookup or a write fails part way through
        try:
            if qContainsSuspectOlogs:
                outfile1_sus = open(resultsDir + "Putative_Xenologues/%s.csv" % speciesDict[sp0], 'ab')
                writer1_sus = csv.writer(outfile1_sus, delimiter="\t")
            strsp0 = sp0 + "_"
            isp0 = sp_to_index[sp0]
            d0 = resultsDir + "Orthologues_" + speciesDict[sp0] + "/"
            for j in xrange(i, nSpecies):
                sp1 = str(iSpeciesToUse[j])
                if sp1 == sp0:
                    continue
                strsp1 = sp1 + "_"
                isp1 = sp_to_index[sp1]
                d1 = resultsDir + "Orthologues_" + speciesDict[sp1] + "/"
                fn01 = d0 + '%s__v__%s.csv' % (speciesDict[sp0], speciesDict[sp1])
                fn10 = d1 + '%s__v__%s.csv' % (speciesDict[sp1], speciesDict[sp0])
                with open(fn01, 'ab') as outfile1, open(fn10, 'ab') as outfile2:
                    outfile2_sus = None
                    try:
                        if qContainsSuspectOlogs:
                            outfile2_sus = open(resultsDir + "Putative_Xenologues/%s.csv" % speciesDict[sp1], 'ab')
                            writer2_sus = csv.writer(outfile2_sus, delimiter="\t")
                        writer1 = csv.writer(outfile1, delimiter="\t")
                        writer2 = csv.writer(outfile2, delimiter="\t")
                        for iog, ortholouges_onetree in orthologues_alltrees:
                            og = "OG%07d" % iog
                            for leavesL, leavesR, leavesL_sus, leavesR_sus in ortholouges_onetree:
                                # suspect_genes are the genes which, for this level, the orthologues should be
                                # considered suspect as the gene appears misplaced (at this level)
                                nL0 = len(leavesL[sp0])
                                nR0 = len(leavesR[sp0])
                                nL1 = len(leavesL[sp1])
                                nR1 = len(leavesR[sp1])
                                if nL0*nR1 + nL1*nR0 != 0:
                                    # each species can be in only one of L and R at most: they might both be in the same half
                                    if nL0 > 0:
                                        # then nR0 == 0, so nR1 > 0 because of the check above
                                        n0 = nL0
                                        n1 = nR1
                                        text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesL[sp0]])
                                        text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesR[sp1]])
                                    else:
                                        n0 = nR0
                                        n1 = nL1
                                        text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesR[sp0]])
                                        text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesL[sp1]])
                                    # the same relationship is recorded in both species' tables
                                    writer1.writerow((og, text0, text1))
                                    writer2.writerow((og, text1, text0))
                                    nOrtho.n[isp0, isp1] += n0
                                    nOrtho.n[isp1, isp0] += n1
                                    if n0 == 1 and n1 == 1:
                                        nOrtho.n_121[isp0, isp1] += 1
                                        nOrtho.n_121[isp1, isp0] += 1
                                    elif n0 == 1:
                                        nOrtho.n_12m[isp0, isp1] += 1
                                        nOrtho.n_m21[isp1, isp0] += n1
                                    elif n1 == 1:
                                        nOrtho.n_m21[isp0, isp1] += n0
                                        nOrtho.n_12m[isp1, isp0] += 1
                                    else:
                                        nOrtho.n_m2m[isp0, isp1] += n0
                                        nOrtho.n_m2m[isp1, isp0] += n1
                                # Write suspect orthologues
                                if not qContainsSuspectOlogs:
                                    continue
                                nL0s = len(leavesL_sus[sp0])
                                nR0s = len(leavesR_sus[sp0])
                                nL1s = len(leavesL_sus[sp1])
                                nR1s = len(leavesR_sus[sp1])
                                # suspect genes of sp0 are paired with every gene of sp1 (suspect or not) in the other half
                                if nL0s*(nR1+nR1s) + (nL1+nL1s)*nR0s != 0:
                                    if nL0s > 0:
                                        text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesL_sus[sp0]])
                                        text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesR[sp1]+leavesR_sus[sp1]])
                                    else:
                                        text0 = ", ".join([sequenceDict[strsp0 + g] for g in leavesR_sus[sp0]])
                                        text1 = ", ".join([sequenceDict[strsp1 + g] for g in leavesL[sp1]+leavesL_sus[sp1]])
                                    writer1_sus.writerow((og, text0, text1))
                                    writer2_sus.writerow((og, text1, text0))
                    finally:
                        if outfile2_sus is not None:
                            outfile2_sus.close()
        finally:
            if outfile1_sus is not None:
                outfile1_sus.close()
    return nOrtho