def main(proteinTree, outFile, sortAttr=None):
    """Sort the leaves of every tree in a forest and write the trees out.

    Args:
        proteinTree: forest source handed to ``myProteinTree.loadTree``.
        outFile: destination handed to ``myFile.openFile`` in write mode.
        sortAttr: optional attribute name. When truthy, the sort key is
            ``ProteinTree_getnodeattr(..., attrname=sortAttr)``; otherwise
            ``ProteinTree_getId`` is used.
    """
    if sortAttr:
        def get_attribute(*args):
            # Forward everything and pin the attribute name chosen by the caller.
            return ProteinTree_getnodeattr(*args, attrname=sortAttr)
    else:
        get_attribute = ProteinTree_getId

    with myFile.openFile(outFile, 'w') as destination:
        for current_tree in myProteinTree.loadTree(proteinTree):
            ProteinTree_LeafSort(current_tree, get_attribute)
            current_tree.printTree(destination)
def main(ensembltree, outputfile):
    """Prune gene-split subtrees from every tree of a forest and rewrite it.

    A child whose ``info[...]['Duplication']`` equals 10 is treated as a gene
    split: the child and all of its descendants are dropped from ``tree.info``
    and ``tree.data``, and the child is unlinked from its parent.  Counters
    are reported on ``stderr`` once all trees have been written.

    Args:
        ensembltree: forest source handed to ``ProteinTree.loadTree``.
        outputfile: destination handed to ``myFile.openFile`` in write mode.
    """
    def get_ch(tree, node):
        # Children ids only; nodes without an entry in tree.data have none.
        return [x[0] for x in tree.data.get(node, [])]

    def get_chd(tree, nodedist):
        # Children as (id, distance) pairs for a (node, distance) item.
        return tree.data.get(nodedist[0], [])

    count_trees = 0
    count_treenodes = []
    count_splits = 0
    count_split_desc = 0

    with myFile.openFile(outputfile, 'w') as out:
        for tree in ProteinTree.loadTree(ensembltree):
            # Was never incremented before, so the report always said "0 trees".
            count_trees += 1
            count_treenodes.append(0)
            for (node, dist), childrendists in dfw_descendants_generalized(
                    tree, get_chd, queue=[(None, (tree.root, 0))]):
                count_treenodes[-1] += 1
                # Same key spelling as the test below ('Duplication').
                assert tree.info[node]['Duplication'] != 10, \
                    "Unexpected. parent node is a split gene: %s: %s" % \
                    (node, tree.info[node])

                # Iterate over a snapshot: the list may be tree.data[node]
                # itself, and we remove entries from it inside the loop.
                for child, chdist in list(childrendists):
                    if tree.info[child]['Duplication'] == 10:
                        # It's a gene split: remove all of its descendants.
                        count_splits += 1
                        doomed = list(dfw_pairs_generalized(
                            tree, get_ch, queue=[(None, child)],
                            include_root=True))
                        for _, GS_descendant in reversed(doomed):
                            tree.info.pop(GS_descendant)
                            # get_ch tolerates nodes missing from tree.data,
                            # so popping must tolerate them too.
                            tree.data.pop(GS_descendant, None)
                            count_split_desc += 1
                        tree.data[node].remove((child, chdist))
            tree.printTree(out)

    print("%d trees" % count_trees, file=stderr)
    print("treenodes:", " ".join(str(nn) for nn in count_treenodes),
          file=stderr)
    print("Splits: %d  Split descendants: %d" % (count_splits,
                                                  count_split_desc),
          file=stderr)
def loadVals(filename, func):
    """Load per-branch values from a tab-separated file.

    Each line holds two species/ancestor names and a number.  The two names
    must be directly linked in the module-level ``phylTree`` (one is the
    parent of the other).  The stored value is
    ``func(number, abs(age difference))`` and is registered under both
    ``(s1, s2)`` and ``(s2, s1)``.

    Args:
        filename: path of the file to read; an empty string means "no file".
        func: callable taking ``(value, age_difference)`` as two floats.

    Returns:
        dict mapping name pairs to the computed value (empty when
        ``filename`` is empty).
    """
    vals = {}
    if filename == "":
        return vals

    # The context manager closes the file even if a line is malformed.
    with myFile.openFile(filename, "r") as f:
        for l in f:
            t = l.replace('\n', '').split("\t")
            s1 = sys.intern(t[0])
            s2 = sys.intern(t[1])
            assert (s1 == phylTree.parent[s2][0]) or (
                s2 == phylTree.parent[s1][0]), (s1, s2)
            x = func(float(t[2]),
                     float(abs(phylTree.ages[s1] - phylTree.ages[s2])))
            vals[(s1, s2)] = x
            vals[(s2, s1)] = x
            print('%s/%s (%s)' % (s1, s2, t[2]), file=sys.stderr)
    return vals
def ancgenes_extract_genecounts(ancgenes_file, ensembl_version,
                                spset=set(('H**o sapiens', ))):
    """Count, per ancestral gene, the descendant genes found in each species.

    Every line of the file is whitespace-split into an ancestral gene name
    followed by its member genes.  Each member is mapped to a species with
    ``convert_gene2species(gene, ensembl_version)``.

    Args:
        ancgenes_file: path opened through ``myFile.openFile``.
        ensembl_version: forwarded to ``convert_gene2species``.
        spset: species whose gene names are also collected (not only counted).

    Returns:
        Three parallel lists, one entry per line: the ancestral gene names,
        the ``species -> count`` mappings and the ``species -> [genes]``
        mappings (only for species in ``spset``).
    """
    ancgenes, anc_genecounts, anc_spgenes = [], [], []

    with myFile.openFile(ancgenes_file, 'r') as handle:
        for row in handle:
            ancgene, *members = row.split()
            ancgenes.append(ancgene)

            per_species_count = defaultdict(int)
            per_species_genes = defaultdict(list)
            for member in members:
                sp = convert_gene2species(member, ensembl_version)
                per_species_count[sp] += 1
                if sp in spset:
                    per_species_genes[sp].append(member)

            anc_genecounts.append(per_species_count)
            anc_spgenes.append(per_species_genes)

    return ancgenes, anc_genecounts, anc_spgenes
def readerDependingOnFileWithDebAndEnd(fileName):
    """Return a lazy reader of ``(c, beg, end, strand, gName)`` tuples.

    The number of tab-separated columns on the first line selects the format:

    * 6 columns: ``c, beg, end, s, gName, transcriptName`` (the transcript
      name is dropped);
    * 5 columns: ``c, beg, end, s, gNames`` where ``gNames`` may hold several
      space-separated names; only the first one is kept.

    ``beg``, ``end`` and ``strand`` go through ``intOrNone``.

    Args:
        fileName: path of the tabular genome file.

    Returns:
        A generator over the converted rows.

    Raises:
        ValueError: if the first line has neither 5 nor 6 columns.
    """
    # Peek at the first line only, then release the handle: readTabular
    # re-opens the file by name.
    with myFile.openFile(fileName, 'r') as f:
        nbColumns = len(myFile.firstLineBuffer(f).firstLine.split("\t"))

    if nbColumns == 6:
        print("(c, beg, end, s, gName, transcriptName) -> (c, s, gName)",
              file=sys.stderr)
        rows = myFile.myTSV.readTabular(fileName,
                                        [str, str, str, str, str, str])
        reader = ((c, intOrNone(beg), intOrNone(end), intOrNone(strand), gName)
                  for (c, beg, end, strand, gName, tName) in rows)
    elif nbColumns == 5:
        print("(c, beg, end, s, gName) -> (c, s, gName)", file=sys.stderr)
        rows = myFile.myTSV.readTabular(fileName, [str, str, str, str, str])
        # The previous "several names?" test (len(split) > 0) was always true,
        # so the first name was always taken.  split(' ')[0] is the whole
        # field when it holds a single name, hence one code path is enough.
        reader = ((c, intOrNone(beg), intOrNone(end), intOrNone(strand),
                   gNames.split(' ')[0])
                  for (c, beg, end, strand, gNames) in rows)
    else:
        raise ValueError("%s file is badly formatted" % fileName)
    return reader
def main():
    """Extract ancestral gene families and write one file per ancestor.

    Command-line arguments (parsed by ``myTools.checkArgs``):
        phylTree.conf: species tree file.
        proteinTree: protein-tree forest.
        out:ancGenesFiles: output template with one ``%s`` placeholder,
            filled with ``phylTree.fileName[anc]``; nothing is written when
            it is empty.
        reuseNames: forwarded to ``extractGeneFamilies``.
    """
    # Arguments
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File), ("proteinTree", myTools.File)],
        [("out:ancGenesFiles", str, ""), ("reuseNames", bool, False)],
        __doc__)

    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])
    proteinTrees = myProteinTree.loadTree(arguments["proteinTree"])

    count, dupCount, geneFamilies = extractGeneFamilies(
        phylTree, proteinTrees, arguments["reuseNames"])

    outTemplate = arguments["out:ancGenesFiles"]
    if outTemplate:
        for (anc, lst) in geneFamilies.items():
            print("Ecriture des familles de %s ..." % anc, end=' ',
                  file=sys.stderr)
            # Context manager: the file is closed even if a write fails.
            with myFile.openFile(outTemplate % phylTree.fileName[anc],
                                 "w") as f:
                for gg in lst:
                    print(" ".join(gg), file=f)
            print(len(lst), "OK", file=sys.stderr)
rankOfChrHasChanged = False if arguments['orderChromosomesBy'] == 'decreasingNbOfGenes': if not myTools.isSorted(list(genomeListByChr.items()), key=lambda x: len(x[1]), stricly=False, increasingOrder=False): genomeListByChr = collections.OrderedDict(sorted(list(genomeListByChr.items()), key=lambda x: len(x[1]), reverse=True)) rankOfChrHasChanged = True elif arguments['orderChromosomesBy'] == 'names': if not myTools.isSorted(list(genomeListByChr.items()), key=lambda x: myTools.keyNaturalSort(x[0])): genomeListByChr = collections.OrderedDict(sorted(list(genomeListByChr.items()), key=lambda x: myTools.keyNaturalSort(x[0]), reverse=True)) rankOfChrHasChanged = True if rankOfChrHasChanged: print('The rank of at least one chromosome has changed while sorting chrNames using the length of chromosomes', file=sys.stderr) # 2) If necessary rank genes by increasing beg coordinates geneRankHasChanged = False for chr, chrom in genomeListByChr.items(): if not myTools.isSorted(chrom, key=lambda x: x[0], stricly=False, increasingOrder=True): chrom.sort(key=lambda x: x[0]) geneRankHasChanged = True if geneRankHasChanged: print('The rank of at least one gene has changed while sorting using the 5\' extremities', file=sys.stderr) assert sum(len(chrom) for chrom in list(genomeListByChr.values())) == iniGenomeLength # 3) Print the genome f = myFile.openFile(arguments['out:genome'], 'w') for chr, chrom in genomeListByChr.items(): for (beg, end, s, gNames) in chrom: print(myFile.myTSV.printLine([chr, beg, end, s, gNames]), file=f) f.close()
# Les genes des especes modernes genes[e] = myGenomes.Genome(arguments["in:genesFiles"] % phylTree.fileName[e]) diags[e] = [] for (c, l) in genes[e].lstGenes.items(): diags[e].append([((c, i), l[i].strand) for i in range(len(l))]) for a in phylTree.listAncestr: # Les genes ancestraux genes[a] = myGenomes.Genome(arguments["in:ancGenesFiles"] % phylTree.fileName[a]) # Les diagonales diags[a] = [] # On en profite pour lister les diagonales et les genes seuls notseen = set(range(len(genes[a].lstGenes[None]))) f = myFile.openFile(arguments["in:diagsFiles"] % phylTree.fileName[a], "r") for l in f: t = l.split("\t") d = [int(x) for x in t[2].split()] s = [int(x) for x in t[3].split()] s = [2 * int(x >= 0) - 1 for x in s] diags[a].append(list(zip([(None, i) for i in d], s))) notseen.difference_update(d) f.close() assert len(notseen) == 0 # print >> sys.stderr, len(notseen) # diags[a].extend( [((None,g),0)] for g in notseen) # Creation des dictionnaires genes -> diags for (esp, lst) in diags.items(): dic = {}
#!/usr/bin/env python3 """ Convertit un genome (scaffolds = suite de contigs) en genome (uniquement des contigs) """ import sys from LibsDyogen import myDiags, myFile, myTools, myGenomes arguments = myTools.checkArgs([("scaffoldsFile", file), ("contigsFile", file)], [], __doc__) (diags, singletons) = myDiags.loadIntegr(arguments["scaffoldsFile"]) ref = {} f = myFile.openFile(arguments["contigsFile"], "r") for (i, l) in enumerate(f): ref[i + 1] = l f.close() for (chrom, weights) in diags: li = [] ls = [] lw = [] n = 0 for (i, (c, s)) in enumerate(chrom): t = ref.pop(c)[:-1].split("\t") if i >= 1: lw.append(weights[i - 1]) n += len(t[2].split()) if s > 0:
#print(extantGenes, file=sys.stderr) for x in lstAncGenomes: if arguments["except2XSpecies"] == "True": lstDescSpecies = [ y for y in phylTree.listSpecies if phylTree.dicParents[y][x] == x and y not in phylTree.lstEsp2X ] else: lstDescSpecies = [ y for y in phylTree.listSpecies if phylTree.dicParents[y][x] == x ] if len(lstDescSpecies) > 0: f = myFile.openFile( arguments["OUT.ancGenesFiles"] % phylTree.fileName[x], "w") print(x, lstDescSpecies, file=sys.stdout) for ancGene in phylTree.dicGenomes[x]: #print(ancGene, file=sys.stderr) nbDesc = {} ancGenename = ancGene.names[0] for descSpecies in lstDescSpecies: nbDesc[descSpecies] = 0 for modernGene in ancGene.names[1:]: if (modernGene in extantGenes): #print("modernGene:", modernGene,extantGenes[modernGene], file=sys.stderr) nbDesc[extantGenes[modernGene]] += 1 else: next
def sort_children(tree):
    """Sort, in place, the children list of every node in ``tree.data``."""
    for children in tree.data.values():
        children.sort()


if __name__ == '__main__':
    cli_args = argv[1:]
    if {'-h', '--help'} & set(cli_args):
        print(__doc__)
        exit()
    elif len(argv) > 3:
        print(__doc__)
        exit(1)
    else:
        # Missing positional arguments fall back to the standard streams.
        outfile = argv[2] if len(argv) > 2 else stdout
        infile = argv[1] if len(argv) > 1 else stdin

    with myFile.openFile(outfile, 'w') as out:
        for tree in myProteinTree.loadTree(infile):
            sort_children(tree)
            tree.printTree(out)
def main():
    """Summarise block-length statistics of ancestors for several runs.

    For every directory of ``dirList`` and every ancestor of the species
    tree, the second tab-separated column of each line of the diags file is
    read as a block length.  Per ancestor, a row is built with the totals,
    coverage percentages, ``myMaths.myStats.valSummary`` figures and the
    number of largest blocks needed to reach 75% of the genes in blocks.

    Output is either tab-separated text on stdout (``outputODS`` empty) or an
    OpenDocument spreadsheet saved to ``outputODS``.
    """
    # Arguments
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File), ("dirList", myTools.FileList(1))],
        [("diagsFile", str, "diags/integr/diags.%s.list.bz2"),
         ("outputODS", str, "")],
        __doc__
    )

    # The phylogenetic tree
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    lstEspeces = sorted(set(phylTree.listAncestr))

    titles = [
        "AncGenes", "Blocks", "Genes in blocks", "%Cov", "NbInt", "%CovInt",
        "Min", "25%", "50%", "75%", "N75", "N50", "N25", "Max", "Mean",
        "LongBlocks"
    ]

    alldata = {}
    # Never filled in this function; kept because the text output consults it.
    alldiff = {}
    allEvents = list(arguments["dirList"])

    for events in allEvents:
        print(events, "...", end=' ', file=sys.stderr)

        # Collect the block lengths
        alldata[events] = data = {}
        for e in lstEspeces:
            lst = []    # lengths of real blocks (>= 2 genes)
            sing = 0    # number of entries shorter than 2
            tot = 0     # sum of all lengths
            interv = 0  # number of intervals inside blocks
            path = events + "/" + (arguments["diagsFile"] %
                                   phylTree.fileName[e])
            # Context manager: the file is closed even on a malformed line.
            with myFile.openFile(path, "r") as f:
                for l in f:
                    x = int(l.split("\t")[1])
                    tot += x
                    if x >= 2:
                        lst.append(x)
                        interv += (x - 1)
                    else:
                        sing += 1

            # NOTE(review): the constant 20 in the %CovInt denominator is not
            # explained anywhere in this function -- confirm its meaning.
            data[e] = [
                e, phylTree.ages[e], tot,
                len(lst), tot - sing, (100. * (tot - sing)) / tot, interv,
                (100. * interv) / (tot - 20.)
            ]
            data[e].extend(myMaths.myStats.valSummary(lst)[:-2])

            # Take blocks from the largest down until 75% of the genes in
            # blocks are covered; nbBlock is how many were needed.
            lstSort = sorted(lst)
            nbBlock = 0
            ValKaryo75 = (tot - sing) * 75 / 100
            Karyo75 = 0
            while Karyo75 < ValKaryo75:
                Karyo75 += lstSort.pop()
                nbBlock += 1
            data[e].append(nbBlock)
            print(e, "...", nbBlock, "...", end=' ', file=sys.stderr)

        print("OK", file=sys.stderr)

    if arguments["outputODS"] == "":
        for events in allEvents:
            print(events, file=sys.stdout)
            print(myFile.myTSV.printLine(["Ancestor", "Age (My)"] + titles))
            for e in lstEspeces:
                print(myFile.myTSV.printLine(alldata[events][e]))
            if events in alldiff:
                print(
                    myFile.myTSV.printLine(
                        ["Ancestor", "Age (My)", "%Useful Gene Loss"] +
                        titles))
                for e in lstEspeces:
                    print(myFile.myTSV.printLine(alldiff[events][e]))
    else:
        import odf.opendocument
        from odfpy_datatable import DataTable

        textdoc = odf.opendocument.OpenDocumentSpreadsheet()

        def addSheet(rows, name):
            # Wrap `rows` in a DataTable and append it as a named sheet.
            table = DataTable(rows)
            table.datasourcehaslabels = "both"
            t = table()
            t.setAttribute('name', name)
            textdoc.spreadsheet.addElement(t)

        # One sheet per run with the raw statistics
        for events in allEvents:
            val = [["Ancestor", "Age (My)"] + titles]
            for e in lstEspeces:
                val.append(alldata[events][e])
            addSheet(val, events)

        # One sheet per ancestor, one row per run
        for esp in lstEspeces:
            val = [["events"] + titles]
            for events in allEvents:
                val.append([events] + alldata[events][esp][2:])
            addSheet(val, esp)

        # Final summary: for each metric, a header row followed by one row
        # per run.  The index is the column of the per-ancestor data row.
        summaryColumns = [("N50", 13), ("Mean", 16), ("NbBlocks", 3),
                          ("MaxLength", 15), ("LongBlocks", 17)]
        val = []
        for (label, idx) in summaryColumns:
            val.append([label] + ["events"] + list(lstEspeces))
            for events in allEvents:
                val.append([""] + [events] +
                           [alldata[events][e][idx] for e in lstEspeces])
        addSheet(val, "Summary")

        textdoc.save(arguments["outputODS"])
def calc(anc, val): nesp = len(phylTree.species[anc]) n2X = len(phylTree.lstEsp2X.intersection(phylTree.species[anc])) # La moitie des especes non 2X a vu la duplication (au minimum 1 espece) return round(max(1., val * (nesp - n2X)) / nesp, 3) - 2e-3 minDuplicationScore = {} try: # Une limite pour tout le monde val = float(arguments["cutoff"]) for anc in phylTree.listAncestr: minDuplicationScore[anc] = calc(anc, val) except ValueError: f = myFile.openFile(arguments["cutoff"], "r") for l in f: t = l.split() anc = phylTree.officialName[t[0]] minDuplicationScore[anc] = calc(anc, float(t[1])) f.close() print("minDuplicationScore:", minDuplicationScore, file=sys.stderr) # Les scores dans l'abre pour les especes modernes valent toujours 1, on doit toujours les accepter for esp in phylTree.listSpecies: minDuplicationScore[esp] = 0 myProteinTree.nextNodeID = arguments["newNodeID"] @myTools.memoize
def setupScoring(phylTree, scoreMethod=1, cutoff=-1):
    """Build the ``hasLowScore(tree, rnode)`` predicate for duplication nodes.

    Args:
        phylTree: species tree; ``listAncestr``, ``listSpecies``, ``species``,
            ``lstEsp2X`` and ``officialName`` are read from it.
        scoreMethod: 1 or 3 use the cutoff as is; 2 rescales it by the
            fraction of non-2X species under the ancestor; 3 additionally
            restricts the species sets to non-2X species when scoring.
        cutoff: either a number applied to every ancestor (a negative value
            disables scoring) or the path of a file whose lines are
            ``<ancestor> <value>``.

    Returns:
        ``alwaysFalse`` when the numeric cutoff is negative, otherwise a
        function ``hasLowScore(tree, rnode)`` returning True/False.

    Raises:
        ValueError: if thresholds are needed and ``scoreMethod`` is not
            1, 2 or 3 (previously a NameError on the undefined ``calc``).
    """
    # Automatic duplication-score thresholds
    if scoreMethod in (1, 3):
        def calc(anc, val):
            return val
    elif scoreMethod == 2:
        def calc(anc, val):
            nesp = len(phylTree.species[anc])
            n2X = len(phylTree.lstEsp2X.intersection(phylTree.species[anc]))
            # Half of the non-2X species saw the duplication (at least one).
            return round(max(1., val*(nesp-n2X)) / nesp, 3) - 2e-3
    else:
        calc = None

    # Only the conversion is guarded, so that a ValueError raised elsewhere
    # is never mistaken for "cutoff is a file name".
    try:
        val = float(cutoff)
    except ValueError:
        val = None

    # Shortcut
    if val is not None and val < 0:
        return alwaysFalse

    if calc is None:
        raise ValueError("Unknown scoreMethod: %r" % (scoreMethod,))

    minDuplicationScore = {}
    if val is not None:
        # One threshold for everybody
        for anc in phylTree.listAncestr:
            minDuplicationScore[anc] = calc(anc, val)
    else:
        # Per-ancestor thresholds read from a file
        with myFile.openFile(cutoff, "r") as f:
            for l in f:
                t = l.split()
                anc = phylTree.officialName[t[0]]
                minDuplicationScore[anc] = calc(anc, float(t[1]))
    logger.debug("minDuplicationScore:\n%s", minDuplicationScore)

    # Thresholds for extant species are 0: with a threshold of 0 the test in
    # hasLowScore is only True when no species is shared.
    for esp in phylTree.listSpecies:
        minDuplicationScore[esp] = 0

    @myTools.memoize
    def goodSpecies(anc):
        return phylTree.species[anc].difference(phylTree.lstEsp2X)

    # The score compares the species present on more than one side of the
    # duplication with the species present on any side.
    def hasLowScore(tree, rnode):
        logger.debug("# hasLowScore is used.")

        @myTools.memoize
        def getSpeciesSets(node):
            if node in tree.data:
                return set().union(*(getSpeciesSets(x)
                                     for (x, _) in tree.data[node]))
            else:
                logger.debug('Node without data (leaf) at %r',
                             tree.info[node]["taxon_name"])
                assert tree.info[node]["taxon_name"] in phylTree.listSpecies
                return set([tree.info[node]["taxon_name"]])

        if rnode not in tree.data:
            return False

        speciessets = [getSpeciesSets(x) for (x, _) in tree.data[rnode]]
        inters = set()
        for (s1, s2) in itertools.combinations(speciessets, 2):
            inters.update(s1.intersection(s2))
        allSpecies = set().union(*speciessets)
        anc = tree.info[rnode]["taxon_name"]
        if scoreMethod == 3:
            inters.intersection_update(goodSpecies(anc))
            allSpecies.intersection_update(goodSpecies(anc))
        # TODO: this should update the 'duplication_confidence_score' tag.
        return (((len(inters) == 0) and (minDuplicationScore[anc] == 0))
                or (len(inters) < (minDuplicationScore[anc] * len(allSpecies))))

    return hasLowScore
Run the XMLfile BIOMART Query Usage: ./ENSEMBL.biomartQuery.py XMLfiles/BIOMART.HumanProteinCodingGene.xml -> will generate ouput.txt ./ENSEMBL.biomartQuery.py XMLfiles/BIOMART.HumanProteinCodingGene.xml -outputFileName=HumanProteinCodingGene.txt """ from __future__ import print_function import sys import urllib.request, urllib.parse, urllib.error from LibsDyogen import myFile, myTools # Arguments arguments = myTools.checkArgs( [("xmlRequest", myTools.File)], [("biomartServer", str, "http://www.ensembl.org/biomart/martservice"), ("outputFileName", str, "output.txt")], __doc__) # La requete with myFile.openFile(arguments["xmlRequest"], "r") as f: request = f.read() print("Downloading XML Query", end=' ', file=sys.stderr) urllib.request.urlretrieve(arguments["biomartServer"], filename=arguments["outputFileName"], data=urllib.parse.urlencode({ "query": request }).encode()) print("OK", file=sys.stderr)
def main():
    """Summarise block-length statistics of ancestors for several runs.

    For every directory of ``dirList`` and every ancestor, the second
    tab-separated column of each line of the diags file is read as a block
    length.  Per ancestor, a row is built with the totals, coverage
    percentages, ``myMaths.myStats.valSummary2`` figures and the number of
    largest blocks needed to reach 75% of the genes in blocks.

    Output is either tab-separated text on stdout (``outputODS`` empty) or an
    OpenDocument spreadsheet saved to ``outputODS``.
    """
    # Arguments.  ``file`` (a Python 2 builtin) is a NameError under
    # Python 3; ``myTools.File`` is what the sibling scripts use.
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File), ("dirList", myTools.FileList(1))],
        [("diagsFile", str, "diags/integr/final/anc/diags.%s.list.bz2"),
         ("outputODS", str, "")],
        __doc__
    )

    # The phylogenetic tree
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    # Ancestors in a hand-picked display order; falls back to alphabetical
    # order when one of the reference names is missing from the tree.
    todo = set(phylTree.listAncestr)
    try:
        byAge = lambda e: phylTree.ages[e]
        l1 = phylTree.dicLinks["Euteleostomi"]["H**o sapiens"][:-1]
        todo.difference_update(l1)
        l2 = phylTree.dicLinks["Glires"]["Murinae"]
        todo.difference_update(l2)
        l3 = sorted((e for e in todo if phylTree.isChildOf(e, "Mammalia")),
                    key=byAge, reverse=True)
        todo.difference_update(l3)
        l4 = sorted((e for e in todo
                     if phylTree.isChildOf(e, "Clupeocephala")),
                    key=byAge, reverse=True)
        todo.difference_update(l4)
        l5 = sorted((e for e in todo if phylTree.isChildOf(e, "Amniota")),
                    key=byAge, reverse=True)
        todo.difference_update(l5)
        l6 = sorted(todo, key=byAge, reverse=True)
        lstEspeces = l6 + l5 + l4 + l1 + l3 + l2
    except KeyError:
        lstEspeces = sorted(phylTree.listAncestr)

    titles = [
        "AncGenes", "Blocks", "Genes in blocks", "%Cov", "NbInt", "%CovInt",
        "Min", "25%", "50%", "75%", "N75", "N50", "N25", "WeigthedAverage",
        "Max", "Mean", "LongBlocks"
    ]

    alldata = {}
    # Never filled in this function; kept because the text output consults it.
    alldiff = {}
    allEvents = list(arguments["dirList"])

    for events in allEvents:
        print(events, "...", end=' ', file=sys.stderr)

        # Collect the block lengths
        alldata[events] = data = {}
        for e in lstEspeces:
            lst = []    # lengths of real blocks (>= 2 genes)
            sing = 0    # number of entries shorter than 2
            tot = 0     # sum of all lengths
            interv = 0  # number of intervals inside blocks
            path = events + "/" + (arguments["diagsFile"] %
                                   phylTree.fileName[e])
            # Context manager: the file is closed even on a malformed line.
            with myFile.openFile(path, "r") as f:
                for l in f:
                    x = int(l.split("\t")[1])
                    tot += x
                    if x >= 2:
                        lst.append(x)
                        interv += (x - 1)
                    else:
                        sing += 1

            # NOTE(review): the constant 20 in the %CovInt denominator is not
            # explained anywhere in this function -- confirm its meaning.
            data[e] = [
                e, phylTree.ages[e], tot,
                len(lst), tot - sing, (100. * (tot - sing)) / tot, interv,
                (100. * interv) / (tot - 20.)
            ]
            data[e].extend(myMaths.myStats.valSummary2(lst)[:-2])

            # Take blocks from the largest down until 75% of the genes in
            # blocks are covered; nbBlock is how many were needed.
            lstSort = sorted(lst)
            nbBlock = 0
            ValKaryo75 = (tot - sing) * 75 / 100
            Karyo75 = 0
            while Karyo75 < ValKaryo75:
                Karyo75 += lstSort.pop()
                nbBlock += 1
            data[e].append(nbBlock)
            print(e, "...", nbBlock, "...", end=' ', file=sys.stderr)

        print("OK", file=sys.stderr)

    if arguments["outputODS"] == "":
        for events in allEvents:
            print(events, file=sys.stdout)
            print(myFile.myTSV.printLine(["Ancestor", "Age (My)"] + titles))
            for e in lstEspeces:
                print(myFile.myTSV.printLine(alldata[events][e]))
            if events in alldiff:
                print(
                    myFile.myTSV.printLine(
                        ["Ancestor", "Age (My)", "%Useful Gene Loss"] +
                        titles))
                for e in lstEspeces:
                    print(myFile.myTSV.printLine(alldiff[events][e]))
    else:
        import odf.opendocument
        from odfpy_datatable import DataTable

        textdoc = odf.opendocument.OpenDocumentSpreadsheet()

        def addSheet(rows, name):
            # Wrap `rows` in a DataTable and append it as a named sheet.
            table = DataTable(rows)
            table.datasourcehaslabels = "both"
            t = table()
            t.setAttribute('name', name)
            textdoc.spreadsheet.addElement(t)

        # One sheet per run with the raw statistics
        for events in allEvents:
            val = [["Ancestor", "Age (My)"] + titles]
            for e in lstEspeces:
                val.append(alldata[events][e])
            addSheet(val, events)

        # One sheet per ancestor, one row per run
        for esp in lstEspeces:
            val = [["events"] + titles]
            for events in allEvents:
                val.append([events] + alldata[events][esp][2:])
            addSheet(val, esp)

        # Data for the curves: for each metric, a header row followed by one
        # row per run.  Each entry is (label, column of the per-ancestor data
        # row, conversion applied to the value).
        keep = lambda v: v
        summaryColumns = [("AncGenes", 2, keep),
                          ("WeigthedAverage", 15, int),
                          ("N50", 13, keep),
                          ("Mean", 17, keep),
                          ("NbBlocks", 3, keep),
                          ("MaxLength", 16, keep),
                          ("LongBlocks", 18, keep)]
        val = []
        for (label, idx, convert) in summaryColumns:
            val.append([label] + ["events"] + list(lstEspeces))
            for events in allEvents:
                val.append([""] + [events] +
                           [convert(alldata[events][e][idx])
                            for e in lstEspeces])
        addSheet(val, "Summary")

        textdoc.save(arguments["outputODS"])
("OUT.geneInfo", str, "geneInfoFromTrees.txt"), ], __doc__) phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"]) ############################################## # Chargement de la base de donnees d'Ensembl # ############################################## # On charge les liens taxon_id -> species name ############################################### print("Chargement des liens taxon_id -> species_name ...", end=' ', file=sys.stderr) taxonName = {} f = myFile.openFile( os.path.join(arguments["IN.EnsemblURL"], arguments["IN.genome_db"]), "r") for ligne in myFile.myTSV.MySQLFileLoader(f): t = ligne.split("\t") taxonName[t[1]] = t[2] f.close() print(len(taxonName), "especes OK", file=sys.stderr) # On charge les liens member_id -> protein name ################################################ print("Chargement des liens member_id -> protein_name ...", end=' ', file=sys.stderr) tmpLinks = {} f = myFile.openFile( os.path.join(arguments["IN.EnsemblURL"], arguments["IN.member"]), "r") for ligne in myFile.myTSV.MySQLFileLoader(f):
info["gene_name"] = x print("%sinfo\t%s" % (indent, info)) if node in tree.items: indent = indent + "\t" for (e, l) in tree.items[node]: print("%slen\t%g" % (indent, l)) printTree(indent, e) printTree("", tree.root) arguments = myTools.checkArgs([("tree", myTools.File)], [], __doc__) f = myFile.openFile(arguments["tree"], "r") totalNbLines = sum(1 for line in f) f.close() f = myFile.openFile(arguments["tree"], "r") progressBar = myTools.ProgressBar(totalNbLines) cptLines = 0 for line in f: cptLines += 1 progressBar.printProgressIn(sys.stderr, cptLines) if len(line.replace(" ", "").replace("\n", "")) == 0: #Do nothing : empty line continue elif line.find(";\n"): processData(line) else: raise NameError(
def main():
    """Compare block-length statistics of ancestors across several cutoffs.

    For every directory of ``dirList`` and every ancestor, the second
    tab-separated column of each line of the diags file is read as a block
    length and summarised.  The first directory is the reference: for every
    directory a "diff" row (value minus reference, from column 2 onwards) is
    also computed.

    Output is either tab-separated text on stdout (``outputODS`` empty) or an
    OpenDocument spreadsheet saved to ``outputODS``.
    """
    # Arguments
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File), ("dirList", myTools.FileList(1))],
        [("diagsFile", str, "diags/integr/final/anc/diags.%s.list.bz2"),
         ("outputODS", str, "")],
        __doc__
    )

    # The phylogenetic tree
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    # Ancestors in a hand-picked display order; falls back to alphabetical
    # order when one of the reference names is missing from the tree.
    todo = set(phylTree.listAncestr)
    try:
        byAge = lambda e: phylTree.ages[e]
        l1 = phylTree.dicLinks["Euteleostomi"]["H**o sapiens"][:-1]
        todo.difference_update(l1)
        l2 = phylTree.dicLinks["Glires"]["Murinae"]
        todo.difference_update(l2)
        l3 = sorted((e for e in todo if phylTree.isChildOf(e, "Mammalia")),
                    key=byAge, reverse=True)
        todo.difference_update(l3)
        l4 = sorted((e for e in todo
                     if phylTree.isChildOf(e, "Clupeocephala")),
                    key=byAge, reverse=True)
        todo.difference_update(l4)
        l5 = sorted((e for e in todo if phylTree.isChildOf(e, "Amniota")),
                    key=byAge, reverse=True)
        todo.difference_update(l5)
        l6 = sorted(todo, key=byAge, reverse=True)
        lstEspeces = l6 + l5 + l4 + l1 + l3 + l2
    except KeyError:
        lstEspeces = sorted(phylTree.listAncestr)

    allCutoff = arguments["dirList"]

    titles = ["AncGenes", "Blocks", "Genes in blocks", "%Cov", "NbInt",
              "%CovInt", "Min", "25%", "50%", "75%", "N75", "N50", "N25",
              "Max", "Mean"]

    alldata = {}
    alldiff = {}
    for cutoff in allCutoff:
        print(cutoff, "...", end=' ', file=sys.stderr)

        # Collect the block lengths
        alldata[cutoff] = data = {}
        for e in lstEspeces:
            lst = []    # lengths of real blocks (>= 2 genes)
            sing = 0    # number of entries shorter than 2
            tot = 0     # sum of all lengths
            interv = 0  # number of intervals inside blocks
            path = cutoff + "/" + (arguments["diagsFile"] %
                                   phylTree.fileName[e])
            # Context manager: the file is closed even on a malformed line.
            with myFile.openFile(path, "r") as f:
                for l in f:
                    x = int(l.split("\t")[1])
                    tot += x
                    if x >= 2:
                        lst.append(x)
                        interv += (x - 1)
                    else:
                        sing += 1

            # NOTE(review): the constant 20 in the %CovInt denominator is not
            # explained anywhere in this function -- confirm its meaning.
            data[e] = [e, phylTree.ages[e], tot, len(lst), tot - sing,
                       (100. * (tot - sing)) / tot, interv,
                       (100. * interv) / (tot - 20.)]
            data[e].extend(myMaths.myStats.valSummary(lst)[:-2])

        # The first directory is the reference for the differences.  The
        # diff is computed for the reference too (all zeros from column 2).
        if cutoff == allCutoff[0]:
            ref = data
        alldiff[cutoff] = diff = {}
        for e in lstEspeces:
            newdata = [(x - ref[e][i] if i >= 2 else x)
                       for (i, x) in enumerate(data[e])]
            # Extra column 2: "%Useful Gene Loss", None when the AncGenes
            # difference is 0.
            newdata.insert(2, 100 * (1. - float(newdata[4]) / newdata[2])
                           if newdata[2] != 0 else None)
            diff[e] = newdata

        print("OK", file=sys.stderr)

    if arguments["outputODS"] == "":
        for cutoff in allCutoff:
            print(myFile.myTSV.printLine(["Ancestor", "Age (My)"] + titles))
            for e in lstEspeces:
                print(myFile.myTSV.printLine(alldata[cutoff][e]))
            if cutoff in alldiff:
                print(myFile.myTSV.printLine(
                    ["Ancestor", "Age (My)", "%Useful Gene Loss"] + titles))
                for e in lstEspeces:
                    print(myFile.myTSV.printLine(alldiff[cutoff][e]))
    else:
        import odf.opendocument
        from odfpy_datatable import DataTable

        textdoc = odf.opendocument.OpenDocumentSpreadsheet()

        def addSheet(rows, name):
            # Wrap `rows` in a DataTable and append it as a named sheet.
            table = DataTable(rows)
            table.datasourcehaslabels = "both"
            t = table()
            t.setAttribute('name', name)
            textdoc.spreadsheet.addElement(t)

        for cutoff in allCutoff:
            valCutoff = cutoff.split("/")[-1]

            # First sheet: raw statistics
            val = [["Ancestor", "Age (My)"] + titles]
            for e in lstEspeces:
                val.append(alldata[cutoff][e])
            addSheet(val, valCutoff)

            if cutoff in alldiff:
                # Second sheet: differences with respect to the reference
                val = [["Ancestor", "Age (My)", "%Useful Gene Loss"] + titles]
                for e in lstEspeces:
                    val.append(alldiff[cutoff][e])
                addSheet(val, "d" + valCutoff)

        # One sheet per ancestor, one row per cutoff
        for esp in lstEspeces:
            val = [["cutoff"] + titles]
            for cutoff in allCutoff:
                valCutoff = cutoff.split("/")[-1]
                val.append([valCutoff] + alldata[cutoff][esp][2:])
            addSheet(val, esp)

        # Final summary: mean absolute and relative gains over all ancestors.
        # Indices address the diff rows; the raw rows use index - 1 because
        # of the extra column inserted in the diff rows.
        # NOTE(review): with the titles above, diff index 12 maps to the
        # "75%" column although the header says "Median" -- confirm against
        # the layout returned by valSummary.
        gainColumns = [17, 12, 14, 6, 8]
        refdata = alldata[allCutoff[0]]
        val = [["cutoff", "Mean gain", "Median gain", "N50 gain",
                "%Cov gain", "%CovInt gain", "BlockLength %gain (mean)",
                "BlockLength %gain (Median)", "BlockLength %gain (N50)",
                "Cov %gain", "CovInt %gain"]]
        for cutoff in allCutoff:
            valCutoff = cutoff.split("/")[-1]
            absGains = [
                myMaths.myStats.mean(
                    [alldiff[cutoff][e][i] for e in lstEspeces])
                for i in gainColumns]
            relGains = [
                myMaths.myStats.mean(
                    [100 * float(alldata[cutoff][e][i - 1] -
                                 refdata[e][i - 1]) / refdata[e][i - 1]
                     for e in lstEspeces])
                for i in gainColumns]
            val.append([valCutoff] + absGains + relGains)
        addSheet(val, "cutoff")

        textdoc.save(arguments["outputODS"])
#! /usr/bin/env python
"""
    Decoupe un fichier d'arbres en fichiers separes

    usage:
        ./splitTrees.py GeneTreeForest.phylTree.bz2 Fam.%s
"""
# The module docstring above is handed to checkArgs as the usage text, so its
# wording is left untouched.  In English: split a forest file into one file
# per tree.

from LibsDyogen import myFile, myTools, myProteinTree

arguments = myTools.checkArgs([("proteinTree", myTools.File),
                               ("output", str)], [], __doc__)

# Trees are numbered from 1 in the output names; the 0-based index is echoed
# on stdout as progress information.
for (i, tree) in enumerate(myProteinTree.loadTree(arguments["proteinTree"])):
    print(i)
    # Context manager: the file is closed even if printTree fails.
    with myFile.openFile(arguments["output"] % (i + 1), "w") as f:
        tree.printTree(f)
Min [Q25/Q50/Q75] [N75/N50/N25] Max [Mean/Stddev-Length] Usage: ./printStats.py filename ./printStats.py filename +long +colNames """ from LibsDyogen import myFile, myMaths, myTools arguments = myTools.checkArgs([("file", file)], [("long", bool, False), ("colNames", bool, False)], __doc__) lst = [] f = myFile.openFile(arguments["file"], 'r') for l in f: c = l.split() for x in c: try: x = int(x) except ValueError: x = float(x) lst.append(x) f.close() # returns results if arguments["long"]: if arguments["colNames"]:
def main():
    """Print, per ancestor, the number of species comparisons and block stats.

    For every ancestor of the species tree one tab-separated line is printed
    with: the total number of pairwise comparisons between its child clades
    and the outgroup, the ingroup/outgroup and ingroup/ingroup comparison
    counts, its age, three block statistics taken from
    ``myMaths.myStats.valSummary2`` (positions 9, 6 and 7) and the number of
    comparisons per unit of age.

    Command-line arguments (parsed by ``myTools.checkArgs``):
        phylTree.conf: species tree file.
        diags: template with one ``%s`` placeholder for the per-ancestor
            blocks file; when empty, the block statistics are printed as
            ``NONE``.
        colNames: print a header line first.
    """
    # Arguments
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File)],
        [("diags", str, ""), ("colNames", bool, False)],
        __doc__
    )

    # The phylogenetic tree
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    if arguments["colNames"]:
        print(myFile.myTSV.printLine([
            "Ancestor", "NbComp", "Nb(In/Out)Comp", "Nb(In/In)Comp", "Age",
            "MeanSize_OfBlocks", "N50Size_OfBlocks", "WASize_OfBlocks",
            "NbComp/Age"
        ]), file=sys.stdout)

    for anc in phylTree.listAncestr:
        # Number of outgroup species
        nb_outgroup = len(phylTree.outgroupSpecies[anc])

        # Number of species under each child of the ancestor
        nbInSpec = [len(phylTree.species[x])
                    for (x, _) in phylTree.items[anc]]

        # Comparisons ingroup/outgroup
        compInOut = sum(nb_outgroup * n1 for n1 in nbInSpec)

        # Comparisons ingroup/ingroup
        compInIn = sum(n1 * n2
                       for (n1, n2) in itertools.combinations(nbInSpec, 2))

        # All comparisons (child clades plus the outgroup as one more group)
        nbc = sum(n1 * n2 for (n1, n2) in
                  itertools.combinations(nbInSpec + [nb_outgroup], 2))

        # Block statistics.  Without a diags template there is nothing to
        # summarise: emit placeholders instead of indexing an empty list
        # (which used to raise IndexError).
        if arguments["diags"] != "":
            r = []
            with myFile.openFile(
                    arguments["diags"] % phylTree.fileName[anc], "r") as f:
                for line in f:
                    x = int(line.split("\t")[1])
                    if x > 1:
                        r.append(x)
            totalStat = myMaths.myStats.valSummary2(r)
            blockStats = [totalStat[9], totalStat[6], int(totalStat[7])]
        else:
            blockStats = ["NONE", "NONE", "NONE"]

        print(
            myFile.myTSV.printLine(
                [anc, nbc, compInOut, compInIn, phylTree.ages[anc]] +
                blockStats +
                [float(nbc) / phylTree.ages[anc]]))
newAnc, newLastWritten)) else: # when the node is a leaf allGenes = [tree.info[node]["gene_name"]] for a in toWrite: # 'a'= name of the ancestor to print geneFamilies[a].append( [currName] + allGenes ) # write the name of the gene of Anc followed by the names of the genes of the children (this is done for all the species in toWrite) #FIXME geneFamilies is defined in the main, it is modified whereas it is not even a parameter return allGenes # for the recurrence geneFamilies = collections.defaultdict(list) for tree in myProteinTree.loadTree( arguments["geneTreeForest"]): # for all gene trees in the forest extractGeneFamilies( tree.root, tree.info[tree.root]["tree_name"], None, None ) # FIXME this function modifies tree and geneFamilies even if tree and gene families are not parameters tree.printTree(sys.stdout) for (anc, lst) in geneFamilies.items(): print("Write %s family ..." % anc, end=' ', file=sys.stderr) f = myFile.openFile(arguments["out:ancGenes"] % speciesTree.fileName[anc], "w") for gg in lst: print(" ".join(gg), file=f) f.close() print(len(lst), "OK", file=sys.stderr)
""" import sys import io from LibsDyogen import myFile, myTools, myPhylTree arguments = myTools.checkArgs([("IN.protein_tree_tag", file)], [], __doc__) dicTaxonName = {} dicTaxonID = {} dicTaxonAlias = {} # Chargement des donnees print("Chargement des tags ...", end=' ', file=sys.stderr) f = myFile.openFile(arguments["IN.protein_tree_tag"], "r") for ligne in f: t = ligne[:-1].split("\t") if t[1] == "taxon_name": dicTaxonName[t[0]] = t[2] elif t[1] == "taxon_id": dicTaxonID[t[0]] = t[2] elif (t[1] == "taxon_alias") and (t[0] not in dicTaxonAlias): dicTaxonAlias[t[0]] = t[2] elif t[1] == "taxon_alias_mya": dicTaxonAlias[t[0]] = t[2] elif t[1] == "species_tree_string": tree = t[2] print("OK (lengths:", len(dicTaxonName), len(dicTaxonID),