def main():
    """Command-line entry point: load, process and output protein trees.

    Arguments are parsed by `myTools.checkArgs`.  The species tree and the
    protein trees are loaded, the trees are passed through `process`, and
    the result is either handed to `extractGeneFamilies` (option ``fam``)
    or printed tree by tree on standard output.
    """
    logging.basicConfig()
    logger.setLevel(logging.INFO)

    required = [("phylTree.conf",myTools.File), ("ensemblTree",myTools.File)]
    optional = [("flatten",bool,False),
                ("rebuild",bool,False),
                ("fam",bool,False),
                ("cutoff",str,"-1"),
                ("defaultFamName",str,"FAM%08d"),
                ("scoreMethod",int,[1,2,3]),
                ("newNodeID",float,1e8),
                ("recurs",bool,False),
                ("indicator",bool,False),
                ("debug",bool,False)]
    arguments = myTools.checkArgs(required, optional, __doc__)

    if arguments['debug']:
        logger.setLevel(logging.DEBUG)

    # For the rebuild step.
    myProteinTree.nextNodeID = int(arguments["newNodeID"])

    species_tree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])
    low_score_test = setupScoring(species_tree,
                                  arguments["scoreMethod"],
                                  arguments["cutoff"])

    forest = process(myProteinTree.loadTree(arguments["ensemblTree"]),
                     species_tree,
                     low_score_test,
                     arguments["defaultFamName"],
                     arguments["flatten"],
                     arguments["rebuild"],
                     arguments["recurs"],
                     arguments["indicator"])

    if not arguments["fam"]:
        for tree in forest:
            tree.printTree(sys.stdout)
        return

    # Will not work on previous versions of ToolsDyogen.
    from treeTools.ALL.extractGeneFamilies import extractGeneFamilies
    _count, _dup_count, _families = extractGeneFamilies(species_tree, forest)
def main(inputnwk, outputnwk, ensembl_version=ENSEMBL_VERSION,
         phyltreefile=PHYLTREEFILE, treebest=False,
         include_singleton_branches=False, include_singleton_root=False):
    """Write the paralogy trees built from one gene tree, one newick per line.

    Parameters:
        inputnwk: newick source passed to `ete3.Tree`; when falsy, the
            newick text is read from standard input.
        outputnwk: output file name; when falsy, standard output is used.
        ensembl_version: value substituted into `phyltreefile` with
            `str.format`, and forwarded to `buildparalogies`.
        phyltreefile: format template of the species tree file name.
        treebest: select `get_taxon_treebest` instead of `get_taxon`.
        include_singleton_branches: forwarded to `buildparalogies`.
        include_singleton_root: accepted for interface compatibility; it is
            not used by this function.
    """
    phyltree = myPhylTree.PhylogeneticTree(
        phyltreefile.format(ensembl_version))

    # Alternation of 'root', the species names, then the ancestor names
    # with the longest ones first; spaces are matched as a literal dot.
    taxon_names = (list(phyltree.listSpecies)
                   + sorted(phyltree.listAncestr, key=len, reverse=True))
    ancgene2sp = re.compile(r'('
                            + r'root|'
                            + r'|'.join(taxon_names).replace(' ', r'\.')
                            + r')(.*)$')

    genetree = ete3.Tree(inputnwk or stdin.read(), format=1)

    # A distinct local name is required: assigning to `get_taxon` here
    # would make it a local variable and reading it would raise
    # UnboundLocalError whenever `treebest` is false.
    taxon_getter = get_taxon_treebest if treebest else get_taxon

    out = open(outputnwk, 'w') if outputnwk else stdout
    try:
        for paralogy in buildparalogies(genetree, taxon_getter, ancgene2sp,
                                        ensembl_version,
                                        include_singleton_branches):
            outtext = paralogy.write(format=1, format_root_node=True,
                                     features=['S', 'D', 'P', 'A'])
            out.write(outtext + '\n')
    finally:
        # Only close what was opened here; stdout stays usable.
        if out is not stdout:
            out.close()
def main(forestfile, phyltreefile, speciesfile, invert=False): phyltree = myPhylTree.PhylogeneticTree(phyltreefile) with open(speciesfile) as f: badspecies = [line.rstrip() for line in f if not line.startswith('#')] subroot, subtree = phyltree.getSubTree(badspecies) for tree in myProteinTree.loadTree(forestfile):
def extractMultipleGeneTrees(proteinTree, family_name, field='family_name',
                             toNewick=False, withAncSpeciesNames=False,
                             withAncGenesNames=False, withTags=False,
                             phyltree=None, output=None, force=False,
                             mkdirs=False, firstmatch=False):
    """Extract the trees of the requested families from a protein tree forest.

    Parameters:
        proteinTree: forest source passed to `myProteinTree.loadTree`.
        family_name: iterable of family names to look for.
        field: key of the root node's info dict that holds the family name;
            only the part before the first '.' is compared.
        toNewick: print with `printNewick` instead of `printTree`.
        withAncSpeciesNames, withAncGenesNames, withTags: forwarded to
            `printNewick` (`withTags` is also passed as `withID`).
        phyltree: species tree file; when given, each extracted tree is
            rebuilt with `tree.rebuildTree`.
        output: file name template formatted with ``genetree=<family>``;
            when falsy, trees are printed on standard output.
        force: overwrite an already existing output file.
        mkdirs: create the output directory if opening the file fails.
        firstmatch: keep only the first tree of each family and stop as
            soon as every family has been found.

    A warning listing the names that were never found is printed on stderr.
    """
    if phyltree:
        phyltree = myPhylTree.PhylogeneticTree(phyltree)

    # Number of trees already written for each requested family.
    family_names = dict.fromkeys(family_name, 0)

    for tree in myProteinTree.loadTree(proteinTree):
        family = tree.info[tree.root][field].split('.')[0]
        if family in family_names:
            print("Found", family, end=' ', file=sys.stderr)
            wasfound = family_names[family]
            outfile = output.format(genetree=family) if output else '<stdout>'

            if (os.path.isfile(outfile) and not wasfound
                    and not firstmatch and not force):
                # FIXME so that you can omit the --force option but append
                # to file
                print("%s exists. Skipping. (use --force)" % outfile,
                      file=sys.stderr)
                family_names.pop(family)
            else:
                if phyltree is not None:
                    tree.rebuildTree(phyltree)

                # TODO: start in new thread.
                # Further trees of the same family are appended.
                filemode = 'a' if wasfound else 'w'
                try:
                    out = open(outfile, filemode) if output else sys.stdout
                except IOError:
                    if mkdirs:
                        os.makedirs(os.path.split(outfile)[0])
                        out = open(outfile, filemode)
                    else:
                        raise

                try:
                    if toNewick:
                        print("Output to newick format", file=sys.stderr)
                        tree.printNewick(out, withDist=True, withTags=withTags,
                                         withAncSpeciesNames=withAncSpeciesNames,
                                         withAncGenesNames=withAncGenesNames,
                                         withID=withTags)
                    else:
                        tree.printTree(out)
                finally:
                    # Close the file even if printing the tree failed.
                    if output:
                        out.close()

                if firstmatch:
                    family_names.pop(family)
                else:
                    family_names[family] += 1

        if firstmatch and not family_names:
            break

    notfound = {fam for fam, wasfound in family_names.items() if not wasfound}
    if notfound:
        print('WARNING: %d names were not found in field %r: %s' % (
              len(notfound), field, ' '.join(notfound)), file=sys.stderr)
def load_phyltree(self, phyltreefile=None, ensembl_version=None):
    """Load the species tree and order the taxa from oldest to youngest.

    `phyltreefile` and `ensembl_version` default to the attributes of the
    same name on the instance.  The file name is obtained by formatting
    `phyltreefile` with `ensembl_version` and expanding ``~``.  The tree is
    stored in `self.phyltree`; `self.taxa` and `self.taxa_evt` (sorted on
    the first item of each element) are then sorted in place by
    decreasing age.
    """
    template = phyltreefile or self.phyltreefile
    version = ensembl_version or self.ensembl_version
    path = os.path.expanduser(template.format(version))
    self.phyltree = PhylTree.PhylogeneticTree(path)

    def negated_age(taxon):
        return -self.phyltree.ages[taxon]

    self.taxa.sort(key=negated_age)
    self.taxa_evt.sort(key=lambda taxon_evt: negated_age(taxon_evt[0]))
def time_fromspeciestreeIO(treefile, phyltreefile, outfile=None):
    """Apply `time_fromspeciestree` to every tree of a multi-newick file.

    Parameters:
        treefile: file containing the newick trees (split with
            `read_multinewick`).
        phyltreefile: species tree file loaded with `myPhylTree`.
        outfile: output file name; when falsy, standard output is used.

    Each processed tree is written as one newick line (format 1, root node
    not formatted).
    """
    phyltree = myPhylTree.PhylogeneticTree(phyltreefile)

    with open(treefile) as f:
        lines = f.readlines()

    out = open(outfile, 'w') if outfile else stdout
    try:
        for treetxt in read_multinewick(lines):
            tree = ete3.Tree(treetxt, format=1)
            time_fromspeciestree(tree, phyltree)
            newick = tree.write(format=1, format_root_node=False)
            out.write(newick + '\n')
    finally:
        # Never close stdout: the caller may still need it afterwards.
        if out is not stdout:
            out.close()
def main(phyltreefile, genomequal, refphyltreefile=None, column_qual='Qual'):
    """Annotate a species tree with genome quality classes and print it.

    The tab-separated `genomequal` file must have a 'Species' column and a
    `column_qual` column holding an integer.  Species are dispatched into
    `lstEspFull` (quality <= 3), `lstEsp6X` (quality 4) or `lstEsp2X`
    (anything above); a species missing from the file is logged and given
    quality 1.  When `refphyltreefile` is given, the string common names
    found in that reference tree are copied over.  The annotated tree is
    printed as newick.
    """
    phyltree = myPhylTree.PhylogeneticTree(phyltreefile)
    if refphyltreefile:
        refphyltree = myPhylTree.PhylogeneticTree(refphyltreefile)

    with open(genomequal, newline='') as gf:
        species_qual = {
            row['Species'].rstrip().replace('.', ' '): int(row[column_qual])
            for row in csv.DictReader(gf, dialect='excel-tab')}

    # Warning multiple subspecies per species with different qualities
    full_coverage, six_x, two_x = set(), set(), set()
    commonNames = {}

    for sp in phyltree.listSpecies:
        quality = species_qual.get(sp)
        if quality is None:
            logger.error('No quality found for %s', sp)
            quality = 1

        if quality <= 3:
            bucket = full_coverage
        elif quality <= 4:
            bucket = six_x
        else:
            bucket = two_x
        bucket.add(sp)

        if refphyltreefile:
            try:
                names = refphyltree.commonNames[sp]
            except KeyError:
                logger.error('No %r common names found in reference tree', sp)
            else:
                commonNames[sp] = [n for n in names if isinstance(n, str)]

    phyltree.lstEsp2X = two_x
    phyltree.lstEsp6X = six_x
    phyltree.lstEspFull = full_coverage
    if refphyltreefile:
        phyltree.commonNames = commonNames

    phyltree.printNewick(commonnames=True, symbols=True)
def main():
    """Print every pair of species together with the ancestor separating them.

    For each ancestor of the species tree and each pair of its child
    branches, one tab-separated line ``species1  species2  ancestor`` is
    printed for every combination of a species from the first branch with
    a species from the second branch.
    """
    arguments = myTools.checkArgs([("phylTree.conf", myTools.File)], [],
                                  __doc__)
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    for ancestor in phylTree.listAncestr:
        children = [child for (child, _) in phylTree.items[ancestor]]
        for (left, right) in itertools.combinations(children, 2):
            left_species = list(phylTree.species[left])
            right_species = list(phylTree.species[right])
            for (sp1, sp2) in itertools.product(left_species, right_species):
                print("%s\t%s\t%s" % (sp1, sp2, ancestor), file=sys.stdout)
def main():
    """Command-line entry point: process the trees and extract gene families.

    Reads the species tree and the protein tree file named on the command
    line, raises the recursion limit, sets the first ID available for new
    nodes, then feeds the output of `processTrees` to
    `extractGeneFamilies`.
    """
    required = [("phylTree.conf", myTools.File),
                ("ensemblTree", myTools.File)]
    optional = [("newNodeID", int, int(1e9)),
                ("reuseNames", bool, False)]
    arguments = myTools.checkArgs(required, optional, __doc__)

    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    setrecursionlimit(20000)  # !important
    myProteinTree.nextNodeID = arguments["newNodeID"]

    processed = processTrees(arguments["ensemblTree"], phylTree)
    _count, _dup_count, _families = extractGeneFamilies(
        phylTree, processed, arguments["reuseNames"])
def main(phyltreefile, specieslistfile=None, reverse=False):
    """Prune a set of species from a species tree and print the result.

    Without `specieslistfile`, the pruned species are those of
    `lstEsp2X` and `lstEsp6X`.  Otherwise they are read from the file, one
    name per line, and names absent from the tree are reported on stderr.
    With `reverse`, the complement of the set is pruned instead.
    """
    phyltree = myPhylTree.PhylogeneticTree(phyltreefile)

    if specieslistfile is not None:
        with open(specieslistfile) as f:
            badspeciesset = {line.rstrip() for line in f}
        unknown = badspeciesset - phyltree.listSpecies
        if unknown:
            print('WARNING: unknown species:', ', '.join(unknown),
                  file=stderr)
    else:
        badspeciesset = phyltree.lstEsp2X | phyltree.lstEsp6X

    if reverse:
        badspeciesset = phyltree.listSpecies - badspeciesset

    phyltree.pruneSpecies(badspeciesset, inplace=True)
    phyltree.printNewick(commonnames=True, symbols=True)
def main(phyltreefile, attr=None, keys=None):
    """Print an attribute of a species tree, optionally for selected keys.

    Parameters:
        phyltreefile: species tree file loaded with `myPhylTree`.
        attr: attribute name; when empty, the non-callable public
            attributes of the tree are listed with their type.
        keys: optional list of keys to look up in the attribute.

    Output rules:
        * a str/int/float value is printed as is;
        * with several keys, one ``key<TAB>value`` line is printed per key
          (a missing key gives an empty value; if the attribute cannot be
          indexed, the value is the membership test ``key in value``);
        * with a single key, the looked-up value is printed following the
          same rules as a whole attribute;
        * a mapping is printed as sorted ``key<TAB>value`` lines, another
          sortable iterable as one sorted item per line, anything else as
          its type followed by its value.
    """
    scalar_types = (str, int, float)
    phyltree = myPhylTree.PhylogeneticTree(phyltreefile)

    if not attr:
        output = ('Available attributes:\n' + '\n'.join(
            '%-25s %s' % (a, type(getattr(phyltree, a)))
            for a in sorted(phyltree.__dict__)
            if not a.startswith('_')
            and not callable(getattr(phyltree, a))))
        print(output)
        return

    value = getattr(phyltree, attr)
    if isinstance(value, scalar_types):
        print(value)
        return

    if keys:
        def lookup(key):
            try:
                return value[key]
            except KeyError:
                return ''

        try:
            values = [(k, lookup(k)) for k in keys]
        except TypeError:
            # The attribute cannot be indexed with these keys: report
            # membership instead.
            values = [(k, k in value) for k in keys]

        if len(keys) > 1:
            print('\n'.join('%s\t%s' % item for item in values))
            return

        value = values[0][1]
        # A scalar obtained through a single key must be printed directly;
        # otherwise a string would reach `sorted()` below and be printed
        # one sorted character per line.
        if isinstance(value, scalar_types):
            print(value)
            return

    try:
        output = '\n'.join('%s\t%s' % (k, v)
                           for k, v in sorted(value.items()))
    except (AttributeError, TypeError):
        try:
            output = '\n'.join(str(v) for v in sorted(value))
        except TypeError:
            output = '%s\n%s' % (type(value), value)
    print(output)
def detachAfter(phyltree, nodes):
    """Transform the given nodes into leaves (remove their subtrees).

    Parameters:
        phyltree: the species tree; it is not modified (its `items` and
            `officialName` are deep-copied).
        nodes: iterable of node names whose descendants must be removed.
            Nodes may be nested in one another or repeated.

    Returns:
        A new `myPhylTree.PhylogeneticTree` built from the pruned copies
        and re-initialised with `reinitTree`.
    """
    items = deepcopy(phyltree.items)
    officialname = deepcopy(phyltree.officialName)

    for node in nodes:
        for descendant in phyltree.allDescendants[node]:
            if descendant != node:
                # A default is needed: the descendant may already have been
                # removed when `nodes` holds nested or repeated nodes.
                officialname.pop(descendant, None)
            # Needed here because of an assertion test in `reinitTree`
            # that doesn't like disconnected nodes.
            items.pop(descendant, None)

    newtree = myPhylTree.PhylogeneticTree((items, phyltree.root, officialname))
    newtree.reinitTree()
    return newtree
def main(treefile, speciesfile, ensembl_version, from_phyltree=False,
         outfile=None, keep_single_node_trees=False):
    """Thin the gene trees of a multi-newick file according to a species list.

    Parameters:
        treefile: multi-newick file (split with `read_multinewick`).
        speciesfile: with `from_phyltree`, a species tree file whose
            species are the ones to KEEP; otherwise a text file listing,
            one per line ('#' lines ignored), the species to REMOVE.
        ensembl_version: forwarded to `ultimate_seq2sp` to convert a leaf
            name into a species.
        from_phyltree: see `speciesfile`.
        outfile: output file name; standard output when None.
        keep_single_node_trees: also output trees reduced to a single node.

    Nothing is written (and no output file is created) when no tree is
    kept.
    """
    if from_phyltree:
        phyltree = myPhylTree.PhylogeneticTree(speciesfile)
        specieslist = phyltree.listSpecies
    else:
        with open(speciesfile) as f:
            specieslist = [line.rstrip() for line in f
                           if not line.startswith('#')]

    outtrees = []
    with open(treefile) as newick:
        for treetxt in read_multinewick(newick):
            tree = ete3.Tree(treetxt, format=1)

            if from_phyltree:
                keptleaves = [leaf for leaf in tree.iter_leaves()
                              if ultimate_seq2sp(leaf.name, ensembl_version)
                              in specieslist]
            else:
                keptleaves = [leaf for leaf in tree.iter_leaves()
                              if ultimate_seq2sp(leaf.name, ensembl_version)
                              not in specieslist]

            newtree = thin(tree, keptleaves)
            if newtree and (len(newtree) > 1 or keep_single_node_trees):
                # NOTE(review): `tree` is serialised here, not `newtree`;
                # this presumably relies on `thin` editing `tree` in
                # place -- confirm.  Extra node features are not written.
                outtrees.append(tree.write(outfile=None, format=1,
                                           format_root_node=True))

    if not outtrees:
        # Previously `outfile.close()` was attempted on the file *name*
        # in this case, raising AttributeError.
        return

    text = '\n'.join(outtrees)
    if outfile is None:
        print(text)
    else:
        with open(outfile, 'w') as out:
            print(text, file=out)
def main(conversionfile, treefile=None, parser='ete3'):
    """Rename the nodes of a tree and print it as newick.

    Parameters:
        conversionfile: file loaded with `load_conversion`.
        treefile: tree source; standard input when None.
        parser: 'ete3' (newick read by `ete3.Tree`) or 'myPhylTree'
            (read by `myPhylTree.PhylogeneticTree`, then converted with
            `PhylTree_to_ete3`).

    Raises:
        ValueError: if `parser` is not one of the supported names.
    """
    # Validate first: an unknown parser used to surface only later as an
    # UnboundLocalError on `tree`, after the input had been read.
    if parser not in ('ete3', 'myPhylTree'):
        raise ValueError("Unknown parser %r (expected 'ete3' or 'myPhylTree')"
                         % (parser,))

    conversion = load_conversion(conversionfile)

    if parser == 'ete3':
        if treefile is None:
            treefile = stdin.read()
        tree = ete3.Tree(treefile, format=1)
    else:
        from LibsDyogen import myPhylTree
        if treefile is None:
            treefile = stdin
        tree = PhylTree_to_ete3(myPhylTree.PhylogeneticTree(treefile),
                                nosinglechild=False)

    rename(tree, conversion)
    print(tree.write(format=1, format_root_node=True))
def main(phyltreefile, forestfile=None):
    """Thin every protein tree down to the taxa known to the species tree.

    Trees are read from `forestfile` (standard input when None).  Leaves
    whose 'taxon_name' is in `phyltree.allNames` are kept; the tree is
    thinned with `thin_prottree`, fixed with `fix_thinned_dups` and
    printed from its new root.  A tree for which no root remains is
    discarded with a warning.
    """
    phyltree = myPhylTree.PhylogeneticTree(phyltreefile)
    source = stdin if forestfile is None else forestfile

    for tree in myProteinTree.loadTree(source):
        # Leaves are the nodes that have info but no entry in `data`.
        leaves = set(tree.info).difference(tree.data)
        keptleaves = {leaf for leaf in leaves
                      if tree.info[leaf]['taxon_name'] in phyltree.allNames}

        newroot, _ = thin_prottree(tree, tree.root, 0, keptleaves)
        if newroot is None:
            logger.warning('Discard tree %d', tree.root)
            continue

        fix_thinned_dups(phyltree, tree, newroot)
        tree.printTree(stdout, newroot)
def main():
    """Convert a species tree between the indented and the newick formats.

    With option ``fromNewick`` (the default) the tree is printed in the
    indented format: one node per line, preceded by one tab per depth
    level, followed by its string common names separated by '|'.
    Otherwise the tree is printed as a newick string whose internal nodes
    are labelled ``name|age``.
    """
    arguments = myTools.checkArgs([("phylTree.conf", myTools.File)],
                                  [("fromNewick", bool, True)], __doc__)
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    def print_indented(node, depth):
        """Print `node` and, recursively, its children one level deeper."""
        node = node.replace("*", "")
        aliases = [name for name in phylTree.commonNames.get(node, "")
                   if isinstance(name, str) and (name != node)]
        names = myFile.myTSV.printLine([node] + aliases, delim="|")
        print("\t" * depth + "%s" % names)
        if node in phylTree.items:
            for (child, _) in phylTree.items[node]:
                print_indented(child, depth + 1)

    def to_newick(anc):
        """Return the newick text of the subtree rooted at `anc`."""
        label = phylTree.fileName[anc]
        if anc in phylTree.listSpecies:
            return label
        branches = [to_newick(child) + ":" + str(length)
                    for (child, length) in phylTree.items[anc]]
        return ("(" + ",".join(branches)
                + ")%s|%d" % (label, phylTree.ages[anc]))

    if arguments["fromNewick"]:
        # Returns the phyltree format (with indentation)
        print_indented(phylTree.root, 0)
    else:
        # Returns the newick tree
        print(to_newick(phylTree.root), ";")
def main():
    """Extract the gene families of a forest and optionally write them.

    The families returned by `extractGeneFamilies` are written, when the
    ``out:ancGenesFiles`` template is not empty, to one file per ancestor
    (the template is filled with the ancestor's file name), one family per
    line with its members separated by spaces.  Progress is reported on
    stderr.
    """
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File), ("proteinTree", myTools.File)],
        [("out:ancGenesFiles", str, ""), ("reuseNames", bool, False)],
        __doc__)

    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])
    forest = myProteinTree.loadTree(arguments["proteinTree"])
    _count, _dup_count, geneFamilies = extractGeneFamilies(
        phylTree, forest, arguments["reuseNames"])

    outTemplate = arguments["out:ancGenesFiles"]
    if not outTemplate:
        return

    for (anc, families) in geneFamilies.items():
        print("Ecriture des familles de %s ..." % anc, end=' ',
              file=sys.stderr)
        handle = myFile.openFile(outTemplate % phylTree.fileName[anc], "w")
        for members in families:
            print(" ".join(members), file=handle)
        handle.close()
        print(len(families), "OK", file=sys.stderr)
def main():
    """Extract from each protein tree the subtrees under a given taxon.

    For every tree of ``iniTree``, the topmost nodes whose taxon satisfies
    `phylTree.isChildOf(taxon, rootSpecies)` are collected and each one is
    printed as a tree of its own.  A single subtree inherits the name of
    the original tree; several subtrees receive that name followed by a
    duplication suffix.  The number of extracted trees is reported on
    stderr.
    """
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File), ("iniTree", myTools.File),
         ("rootSpecies", str)],
        [], __doc__)
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])
    root_species = arguments["rootSpecies"]

    def find_subroots(tree, node):
        """Return the list of nodes under the new root species."""
        if phylTree.isChildOf(tree.info[node]['taxon_name'], root_species):
            return [node]
        if node not in tree.data:
            return []
        found = []
        for (child, _) in tree.data[node]:
            found.extend(find_subroots(tree, child))
        return found

    extracted = 0
    for tree in myProteinTree.loadTree(arguments["iniTree"]):
        subroots = find_subroots(tree, tree.root)
        extracted += len(subroots)

        if len(subroots) == 1:
            only = subroots[0]
            tree.info[only]["tree_name"] = tree.info[tree.root]["tree_name"]
            myProteinTree.printTree(sys.stdout, tree.data, tree.info, only)
            continue

        for (rank, subroot) in enumerate(subroots, 1):
            suffix = myProteinTree.getDupSuffix(rank, True)
            tree.info[subroot]["tree_name"] = (
                tree.info[tree.root]["tree_name"] + suffix)
            myProteinTree.printTree(sys.stdout, tree.data, tree.info, subroot)

    print(extracted, "extracted trees", file=sys.stderr)
def main():
    """Plot a species tree with its branches coloured by a rate column.

    Command line: ``speciestreefile ratefile [ratecolumn] [-o OUTFILE]``.
    The rate file is a headerless tab-separated table with the columns
    taxon, length, D, L, T (taxon is the index); `ratecolumn` defaults to
    'D'.  The figure is saved to OUTFILE when given, otherwise shown in a
    window using the TkAgg backend.
    """
    parser = ap.ArgumentParser(description=__doc__)
    parser.add_argument('speciestreefile')
    parser.add_argument('ratefile')
    parser.add_argument('ratecolumn', nargs='?', default='D')
    parser.add_argument('-o', '--outfile', help='Plot output file')
    args = parser.parse_args()

    phyltree = myPhylTree.PhylogeneticTree(args.speciestreefile)
    column_names = ['taxon', 'length', 'D', 'L', 'T']
    rates = pd.read_csv(args.ratefile, sep='\t', names=column_names,
                        index_col=0)

    if args.outfile is None:
        plt.switch_backend('TkAgg')

    _lines, _anc_coords, _ = plottree(
        phyltree,
        get_logitems,
        phyltree_methods.get_label,
        edge_colors=rates[args.ratecolumn],
        edge_cmap='viridis',
        label_nodes=True,
        label_params={'alpha': 0.7},
        age_from_root=True)

    figure_size = (15, 10)
    plt.gcf().set_size_inches(figure_size)

    if args.outfile:
        plt.savefig(args.outfile, bbox_inches=False)
    else:
        plt.show()
#!/usr/bin/env python3 """ Extrait (des genomes reels) la liste des evenements de duplications/pertes/gains sur chaque branche de l'arbre """ from LibsDyogen import myMaths, myTools, myGenomes, myPhylTree arguments = myTools.checkArgs([("phylTree.conf", file)], [("rootSpecies", str, ""), ("genesFile", str, ""), ("ancGenesFile", str, "")], __doc__) phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"]) @myTools.memoize def getGenome(e): if e in phylTree.listSpecies: return myGenomes.Genome(arguments["genesFile"] % phylTree.fileName[e]) else: return myGenomes.Genome(arguments["ancGenesFile"] % phylTree.fileName[e]) def transformName(esp, xxx_todo_changeme): (c, i) = xxx_todo_changeme return getGenome(esp).lstGenes[c][i].names[0] def do(node):
def iter_from_phyltree(treefile, *args, **kwargs):
    """Yield the single species tree loaded from `treefile`.

    Extra positional and keyword arguments are forwarded to
    `myPhylTree.PhylogeneticTree`.  Being a generator, nothing is imported
    or loaded until the first item is requested.
    """
    from LibsDyogen import myPhylTree

    phyltree = myPhylTree.PhylogeneticTree(treefile, *args, **kwargs)
    yield phyltree
) stat_params['alfsahmmc'] = stat_params['alfsa'] = stat_params['al'] stat_params['codemlfsahmmc'] = stat_params['codemlfsa'] = stat_params['codeml'] = stat_params['cl'] stat_params['cleaningfsa'] = stat_params['cleaning'] dataset_params = ["freq_null_dist", "freq_null_t", "freq_null_dS", "freq_null_dN", "null_dist_before", "null_t_before", "null_dS_before", "null_dN_before", "null_dist_after", "null_t_after", "null_dS_after", "null_dN_after"] # + ["Ndup", "Nspe"] rate_params = [statname % m for statname in ('%s_rate', '%s_rate_std') for m in MEASURES] dataset_params_dS = ['freq_null_dS', 'null_dS_before', 'null_dS_after'] rate_params_dS = ['dS_rate_local', 'dS_rate_std_local', 'dS_rate_nonlocal', 'dS_rate_std_nonlocal'] workspace = Path.home() / 'ws7' phyltree = myPhylTree.PhylogeneticTree(str(workspace / 'DUPLI_data93/PhylTree.TimeTree201901.Ensembl93-like.goodQual.nwk')) timetree_ages_CI = pd.read_csv(str(workspace / 'databases/timetree/Primates_conf-int_201909.txt'), sep='\s+', header=None, index_col=0, names=['taxon', 'timetree_CI95_inf', 'timetree_CI95_sup'])#.rename_axis(index='taxon') dosreis_ages_CI = pd.read_csv( str(workspace / 'databases/DosReis2018_Primates-timescale/Primates_dates.tsv'), sep='\t', header=0, index_col=0, keep_default_na=False) ordered_simii_anc = ['Platyrrhini', 'Cebidae', 'Catarrhini', 'Cercopithecidae', 'Cercopithecinae', 'Papionini', 'Macaca', 'Hominoidea',
def main():
    """Summarise block-length statistics of every ancestor for several runs.

    For each directory of ``dirList`` and each ancestor of the species
    tree, the ancestor's "diags" file is read and a row of statistics is
    built.  The rows are printed as tab-separated text, or saved as an
    OpenDocument spreadsheet when ``outputODS`` is given.
    """
    # Arguments
    arguments = myTools.checkArgs( \
        [("phylTree.conf", myTools.File), ("dirList", myTools.FileList(1))], \
        [("diagsFile", str, "diags/integr/diags.%s.list.bz2"), ("outputODS", str, "")], \
        __doc__ \
    )

    # The phylogenetic tree
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    # Ancestors are processed in alphabetical order.
    lstEspeces = sorted(set(phylTree.listAncestr))

    allCutoff = arguments["dirList"]

    # Column titles of a statistics row, after the ancestor name and age.
    titles = [
        "AncGenes", "Blocks", "Genes in blocks", "%Cov", "NbInt", "%CovInt",
        "Min", "25%", "50%", "75%", "N75", "N50", "N25", "Max", "Mean",
        "LongBlocks"
    ]

    alldata = {}
    # Never filled in this function, so the branches testing it are inactive.
    alldiff = {}
    allEvents = []
    for cutoff in allCutoff:
        allEvents.append(cutoff)

    for events in allEvents:

        print(events, "...", end=' ', file=sys.stderr)

        # Collect the block length data
        alldata[events] = data = {}
        for e in lstEspeces:

            f = myFile.openFile(
                events + "/" + (arguments["diagsFile"] % phylTree.fileName[e]), "r")
            lst = []   # values >= 2, one per block
            sing = 0   # number of lines whose value is below 2
            tot = 0    # sum of all values
            interv = 0
            for l in f:
                # Second tab-separated field -- presumably the number of
                # genes of the block; verify against the file format.
                x = int(l.split("\t")[1])
                tot += x
                if x >= 2:
                    lst.append(x)
                    interv += (x - 1)
                else:
                    sing += 1
            f.close()

            # NOTE(review): the constant 20 in the last term is not
            # explained anywhere in this function.
            data[e] = [
                e, phylTree.ages[e], tot, len(lst), tot - sing,
                (100. * (tot - sing)) / tot, interv,
                (100. * interv) / (tot - 20.)
            ]
            data[e].extend(myMaths.myStats.valSummary(lst)[:-2])

            # Sort the list of blocks by block size.
            lstSort = list(lst)
            lstSort.sort()

            # Count how many of the largest blocks are needed to reach 75%
            # of (tot - sing); stored in the "LongBlocks" column.
            nbBlock = 0
            ValKaryo75 = (tot - sing) * 75 / 100
            Karyo75 = 0
            while Karyo75 < ValKaryo75:
                tmp = lstSort.pop()
                Karyo75 += tmp
                nbBlock += 1
            data[e].append(nbBlock)
            print(e, "...", nbBlock, "...", end=' ', file=sys.stderr)

        if events == allEvents[0]:
            # The first directory is kept as reference (not used further
            # in this function).
            ref = data

        print("OK", file=sys.stderr)

    if arguments["outputODS"] == "":
        # Plain text output: one table per directory.
        for events in allEvents:
            print(events, file=sys.stdout)
            print(myFile.myTSV.printLine(["Ancestor", "Age (My)"] + titles))
            for e in lstEspeces:
                print(myFile.myTSV.printLine(alldata[events][e]))
            if events in alldiff:
                print(
                    myFile.myTSV.printLine(
                        ["Ancestor", "Age (My)", "%Useful Gene Loss"] + titles))
                for e in lstEspeces:
                    print(myFile.myTSV.printLine(alldiff[events][e]))

    else:
        import odf.opendocument
        from odfpy_datatable import DataTable

        textdoc = odf.opendocument.OpenDocumentSpreadsheet()

        for events in allEvents:

            valevents = events

            # First table with the raw statistics
            val = [["Ancestor", "Age (My)"] + titles]
            for e in lstEspeces:
                val.append(alldata[events][e])

            table = DataTable(val)
            table.datasourcehaslabels = "both"
            t = table()
            t.setAttribute('name', valevents)
            textdoc.spreadsheet.addElement(t)

        # One table per ancestor
        for esp in lstEspeces:
            val = [["events"] + titles]
            for events in allEvents:
                valevents = events
                val.append([valevents] + alldata[events][esp][2:])
            table = DataTable(val)
            table.datasourcehaslabels = "both"
            t = table()
            t.setAttribute('name', esp)
            textdoc.spreadsheet.addElement(t)

        # Final summary: the indices below are positions in a statistics
        # row (0 = ancestor, 1 = age, then the columns of `titles`).
        val = [["N50"] + ["events"] + [esp for esp in lstEspeces]]
        for events in allEvents:
            valevents = events
            val.append([""] + [valevents] +
                       [alldata[events][e][13] for e in lstEspeces])

        val.append(["Mean"] + ["events"] + [esp for esp in lstEspeces])
        for events in allEvents:
            valevents = events
            val.append([""] + [valevents] +
                       [alldata[events][e][16] for e in lstEspeces])

        val.append(["NbBlocks"] + ["events"] + [esp for esp in lstEspeces])
        for events in allEvents:
            valevents = events
            val.append([""] + [valevents] +
                       [alldata[events][e][3] for e in lstEspeces])

        val.append(["MaxLength"] + ["events"] + [esp for esp in lstEspeces])
        for events in allEvents:
            valevents = events
            val.append([""] + [valevents] +
                       [alldata[events][e][15] for e in lstEspeces])

        val.append(["LongBlocks"] + ["events"] + [esp for esp in lstEspeces])
        for events in allEvents:
            valevents = events
            val.append([""] + [valevents] +
                       [alldata[events][e][17] for e in lstEspeces])

        table = DataTable(val)
        table.datasourcehaslabels = "both"
        t = table()
        t.setAttribute('name', "Summary")
        textdoc.spreadsheet.addElement(t)

        textdoc.save(arguments["outputODS"])
def main():
    """Compare block-length statistics of every ancestor across several runs.

    For each directory of ``dirList`` and each ancestor, the ancestor's
    "diags" file is read and a row of statistics is built.  A second row
    holds the difference with the first directory, which is the reference.
    The tables are printed as tab-separated text, or saved as an
    OpenDocument spreadsheet when ``outputODS`` is given.
    """
    # Arguments
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File), ("dirList", myTools.FileList(1))],
        [("diagsFile", str, "diags/integr/final/anc/diags.%s.list.bz2"),
         ("outputODS", str, "")],
        __doc__
    )

    # The phylogenetic tree
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    def byAge(taxa):
        """Return `taxa` sorted from the oldest to the youngest."""
        return sorted(taxa, key=lambda e: phylTree.ages[e], reverse=True)

    # List of the ancestors in display order.  If one of the expected
    # lineages is missing from the tree (KeyError), fall back to the
    # alphabetical order.
    todo = set(phylTree.listAncestr)
    try:
        # The key must be the real species name; it was garbled as
        # "H**o sapiens", which made this lookup fail on every tree.
        l1 = phylTree.dicLinks["Euteleostomi"]["Homo sapiens"][:-1]
        todo.difference_update(l1)
        l2 = phylTree.dicLinks["Glires"]["Murinae"]
        todo.difference_update(l2)
        l3 = byAge([e for e in todo if phylTree.isChildOf(e, "Mammalia")])
        todo.difference_update(l3)
        l4 = byAge([e for e in todo if phylTree.isChildOf(e, "Clupeocephala")])
        todo.difference_update(l4)
        l5 = byAge([e for e in todo if phylTree.isChildOf(e, "Amniota")])
        todo.difference_update(l5)
        l6 = byAge(todo)
        lstEspeces = l6 + l5 + l4 + l1 + l3 + l2
    except KeyError:
        lstEspeces = sorted(phylTree.listAncestr)

    allCutoff = arguments["dirList"]

    # Column titles of a statistics row, after the ancestor name and age.
    titles = ["AncGenes", "Blocks", "Genes in blocks", "%Cov", "NbInt",
              "%CovInt", "Min", "25%", "50%", "75%", "N75", "N50", "N25",
              "Max", "Mean"]

    alldata = {}
    alldiff = {}
    for cutoff in allCutoff:

        print(cutoff, "...", end=' ', file=sys.stderr)

        # Collect the block length data
        alldata[cutoff] = data = {}
        for e in lstEspeces:

            f = myFile.openFile(
                cutoff + "/" + (arguments["diagsFile"] % phylTree.fileName[e]),
                "r")
            lst = []   # values >= 2, one per block
            sing = 0   # number of lines whose value is below 2
            tot = 0    # sum of all values
            interv = 0
            try:
                for l in f:
                    # Second tab-separated field -- presumably the number
                    # of genes of the block; verify against the format.
                    x = int(l.split("\t")[1])
                    tot += x
                    if x >= 2:
                        lst.append(x)
                        interv += (x - 1)
                    else:
                        sing += 1
            finally:
                # Close the file even if a line cannot be parsed.
                f.close()

            # NOTE(review): the constant 20 in the last term is not
            # explained anywhere in this function.
            data[e] = [e, phylTree.ages[e], tot, len(lst), tot - sing,
                       (100. * (tot - sing)) / tot, interv,
                       (100. * interv) / (tot - 20.)]
            data[e].extend(myMaths.myStats.valSummary(lst)[:-2])

        if cutoff == allCutoff[0]:
            ref = data

        # Difference with the reference, computed for every directory
        # (including the reference itself, which gives zeros).
        alldiff[cutoff] = diff = {}
        for e in lstEspeces:
            newdata = [(x - ref[e][i] if i >= 2 else x)
                       for (i, x) in enumerate(data[e])]
            # Extra third column, titled "%Useful Gene Loss"; every later
            # column is therefore shifted by one in `alldiff` rows.
            newdata.insert(
                2,
                100 * (1. - float(newdata[4]) / newdata[2])
                if newdata[2] != 0 else None)
            diff[e] = newdata

        print("OK", file=sys.stderr)

    if arguments["outputODS"] == "":
        # Plain text output: raw table then difference table, per directory.
        for cutoff in allCutoff:
            print(myFile.myTSV.printLine(["Ancestor", "Age (My)"] + titles))
            for e in lstEspeces:
                print(myFile.myTSV.printLine(alldata[cutoff][e]))
            if cutoff in alldiff:
                print(myFile.myTSV.printLine(
                    ["Ancestor", "Age (My)", "%Useful Gene Loss"] + titles))
                for e in lstEspeces:
                    print(myFile.myTSV.printLine(alldiff[cutoff][e]))

    else:
        import odf.opendocument
        from odfpy_datatable import DataTable

        textdoc = odf.opendocument.OpenDocumentSpreadsheet()

        def addSheet(name, rows):
            """Append `rows` to the document as a sheet called `name`."""
            table = DataTable(rows)
            table.datasourcehaslabels = "both"
            t = table()
            t.setAttribute('name', name)
            textdoc.spreadsheet.addElement(t)

        for cutoff in allCutoff:

            valCutoff = cutoff.split("/")[-1]

            # First table with the raw statistics
            val = [["Ancestor", "Age (My)"] + titles]
            for e in lstEspeces:
                val.append(alldata[cutoff][e])
            addSheet(valCutoff, val)

            if cutoff in alldiff:
                # Second table with the differences to the reference
                val = [["Ancestor", "Age (My)", "%Useful Gene Loss"] + titles]
                for e in lstEspeces:
                    val.append(alldiff[cutoff][e])
                addSheet("d" + valCutoff, val)

        # One table per ancestor
        for esp in lstEspeces:
            val = [["cutoff"] + titles]
            for cutoff in allCutoff:
                valCutoff = cutoff.split("/")[-1]
                val.append([valCutoff] + alldata[cutoff][esp][2:])
            addSheet(esp, val)

        # Final summary: mean over the ancestors of the absolute gains
        # (from `alldiff`) and of the relative gains (from `alldata`, whose
        # columns sit one position before their `alldiff` counterpart).
        # NOTE(review): index 12 maps to the "75%" column although the
        # header says "Median" -- confirm against `valSummary`'s layout.
        val = [["cutoff", "Mean gain", "Median gain", "N50 gain", "%Cov gain",
                "%CovInt gain", "BlockLength %gain (mean)",
                "BlockLength %gain (Median)", "BlockLength %gain (N50)",
                "Cov %gain", "CovInt %gain"]]
        for cutoff in allCutoff:
            valCutoff = cutoff.split("/")[-1]
            val.append(
                [valCutoff] +
                [myMaths.myStats.mean(
                    [alldiff[cutoff][e][i] for e in lstEspeces])
                 for i in [17, 12, 14, 6, 8]] +
                [myMaths.myStats.mean(
                    [100 * float(alldata[cutoff][e][i - 1]
                                 - alldata[allCutoff[0]][e][i - 1])
                     / alldata[allCutoff[0]][e][i - 1]
                     for e in lstEspeces])
                 for i in [17, 12, 14, 6, 8]]
            )
        addSheet("cutoff", val)

        textdoc.save(arguments["outputODS"])
def main():
    """Summarise block-length statistics of every ancestor for several runs.

    For each directory of ``dirList`` and each ancestor, the ancestor's
    "diags" file is read and a row of statistics is built.  The rows are
    printed as tab-separated text, or saved as an OpenDocument spreadsheet
    (with per-run, per-ancestor and summary sheets) when ``outputODS`` is
    given.
    """
    # Arguments
    # `myTools.File` replaces the Python 2 builtin `file`, which does not
    # exist in Python 3 (NameError).
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File), ("dirList", myTools.FileList(1))],
        [("diagsFile", str, "diags/integr/final/anc/diags.%s.list.bz2"),
         ("outputODS", str, "")],
        __doc__
    )

    # The phylogenetic tree
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    def byAge(taxa):
        """Return `taxa` sorted from the oldest to the youngest."""
        return sorted(taxa, key=lambda e: phylTree.ages[e], reverse=True)

    # List of the ancestors in display order.  If one of the expected
    # lineages is missing from the tree (KeyError), fall back to the
    # alphabetical order.
    todo = set(phylTree.listAncestr)
    try:
        # The key must be the real species name; it was garbled as
        # "H**o sapiens", which made this lookup fail on every tree.
        l1 = phylTree.dicLinks["Euteleostomi"]["Homo sapiens"][:-1]
        todo.difference_update(l1)
        l2 = phylTree.dicLinks["Glires"]["Murinae"]
        todo.difference_update(l2)
        l3 = byAge([e for e in todo if phylTree.isChildOf(e, "Mammalia")])
        todo.difference_update(l3)
        l4 = byAge([e for e in todo if phylTree.isChildOf(e, "Clupeocephala")])
        todo.difference_update(l4)
        l5 = byAge([e for e in todo if phylTree.isChildOf(e, "Amniota")])
        todo.difference_update(l5)
        l6 = byAge(todo)
        lstEspeces = l6 + l5 + l4 + l1 + l3 + l2
    except KeyError:
        lstEspeces = sorted(phylTree.listAncestr)

    allCutoff = arguments["dirList"]

    # Column titles of a statistics row, after the ancestor name and age.
    titles = [
        "AncGenes", "Blocks", "Genes in blocks", "%Cov", "NbInt", "%CovInt",
        "Min", "25%", "50%", "75%", "N75", "N50", "N25", "WeigthedAverage",
        "Max", "Mean", "LongBlocks"
    ]

    alldata = {}
    # The computation of the differences with the first run is disabled in
    # this version: `alldiff` stays empty and the branch testing it below
    # is inactive.
    alldiff = {}
    allEvents = list(allCutoff)

    for events in allEvents:

        print(events, "...", end=' ', file=sys.stderr)

        # Collect the block length data
        alldata[events] = data = {}
        for e in lstEspeces:

            f = myFile.openFile(
                events + "/" + (arguments["diagsFile"] % phylTree.fileName[e]),
                "r")
            lst = []   # values >= 2, one per block
            sing = 0   # number of lines whose value is below 2
            tot = 0    # sum of all values
            interv = 0
            try:
                for l in f:
                    # Second tab-separated field -- presumably the number
                    # of genes of the block; verify against the format.
                    x = int(l.split("\t")[1])
                    tot += x
                    if x >= 2:
                        lst.append(x)
                        interv += (x - 1)
                    else:
                        sing += 1
            finally:
                # Close the file even if a line cannot be parsed.
                f.close()

            # NOTE(review): the constant 20 in the last term is not
            # explained anywhere in this function.
            data[e] = [
                e, phylTree.ages[e], tot, len(lst), tot - sing,
                (100. * (tot - sing)) / tot, interv,
                (100. * interv) / (tot - 20.)
            ]
            data[e].extend(myMaths.myStats.valSummary2(lst)[:-2])

            # Sort the blocks by size, then count how many of the largest
            # ones are needed to reach 75% of (tot - sing); the count is
            # stored in the "LongBlocks" column.
            lstSort = sorted(lst)
            nbBlock = 0
            ValKaryo75 = (tot - sing) * 75 / 100
            Karyo75 = 0
            while Karyo75 < ValKaryo75:
                Karyo75 += lstSort.pop()
                nbBlock += 1
            data[e].append(nbBlock)
            print(e, "...", nbBlock, "...", end=' ', file=sys.stderr)

        print("OK", file=sys.stderr)

    if arguments["outputODS"] == "":
        # Plain text output: one table per directory.
        for events in allEvents:
            print(events, file=sys.stdout)
            print(myFile.myTSV.printLine(["Ancestor", "Age (My)"] + titles))
            for e in lstEspeces:
                print(myFile.myTSV.printLine(alldata[events][e]))
            if events in alldiff:
                print(myFile.myTSV.printLine(
                    ["Ancestor", "Age (My)", "%Useful Gene Loss"] + titles))
                for e in lstEspeces:
                    print(myFile.myTSV.printLine(alldiff[events][e]))

    else:
        import odf.opendocument
        from odfpy_datatable import DataTable

        textdoc = odf.opendocument.OpenDocumentSpreadsheet()

        def addSheet(name, rows):
            """Append `rows` to the document as a sheet called `name`."""
            table = DataTable(rows)
            table.datasourcehaslabels = "both"
            t = table()
            t.setAttribute('name', name)
            textdoc.spreadsheet.addElement(t)

        # First tables with the raw statistics, one per directory
        for events in allEvents:
            val = [["Ancestor", "Age (My)"] + titles]
            for e in lstEspeces:
                val.append(alldata[events][e])
            addSheet(events, val)

        # One table per ancestor
        for esp in lstEspeces:
            val = [["events"] + titles]
            for events in allEvents:
                val.append([events] + alldata[events][esp][2:])
            addSheet(esp, val)

        # Summary sheet, laid out for plotting curves: for each statistic,
        # a header row then one row per directory.  `index` is the position
        # in a statistics row (0 = ancestor, 1 = age, then `titles`).
        val = []

        def addSeries(label, index, convert=None):
            val.append([label] + ["events"] + [esp for esp in lstEspeces])
            for events in allEvents:
                column = [alldata[events][e][index] for e in lstEspeces]
                if convert is not None:
                    column = [convert(x) for x in column]
                val.append([""] + [events] + column)

        addSeries("AncGenes", 2)
        addSeries("WeigthedAverage", 15, int)
        addSeries("N50", 13)
        addSeries("Mean", 17)
        addSeries("NbBlocks", 3)
        addSeries("MaxLength", 16)
        addSeries("LongBlocks", 18)
        addSheet("Summary", val)

        textdoc.save(arguments["outputODS"])
import sys
import collections

from LibsDyogen import myFile
from LibsDyogen import myTools
from LibsDyogen import myPhylTree
from LibsDyogen import myProteinTree

# Command-line arguments: two mandatory files, two optional settings.
arguments = myTools.checkArgs(
    [("speciesTree", myTools.File), ("geneTreeForest", myTools.File)],
    [("out:ancGenes", str, ""), ("reuseNames", bool, False)],
    __doc__)

speciesTree = myPhylTree.PhylogeneticTree(arguments["speciesTree"])

# Per-name counter of the duplications seen so far.
dupCount = collections.defaultdict(int)


def futureName(name, dup):
    """Return the name to use below a node, suffixed after a duplication.

    When `dup` is at least 2 the counter of `name` in `dupCount` is
    incremented and the suffix given by `myProteinTree.getDupSuffix` for the
    new count is appended; otherwise `name` is returned unchanged.
    """
    if not (dup >= 2):
        return name
    # A duplication: bump the counter first so the suffix reflects this event.
    dupCount[name] += 1
    suffix = myProteinTree.getDupSuffix(dupCount[name], False)
    return name + suffix


def getRoots(node, previousAnc, lastWrittenAnc):
    """finds out the roots in gene families"""
def main():
    """Print one TSV line of comparison statistics per ancestor.

    For every ancestor in ``phylTree.listAncestr`` the line holds: the
    ancestor name, the number of pairwise comparisons between all of its
    groups (child subtrees plus outgroup), the ingroup/outgroup and
    ingroup/ingroup parts of that count, the ancestor's age, three block-size
    statistics and the number of comparisons divided by the age.

    Command-line options:
        phylTree.conf: species tree file (mandatory).
        diags: ``%``-format pattern applied to ``phylTree.fileName[anc]`` to
            locate a tab-separated file whose second column is parsed as an
            integer block size. When empty, the three block-size columns are
            printed as ``NONE``.
        colNames: when true, print a header line first.
    """
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File)],
        [("diags", str, ""), ("colNames", bool, False)],
        __doc__
    )

    # The phylogenetic tree
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    if arguments["colNames"]:
        print(myFile.myTSV.printLine([
            "Ancestor", "NbComp", "Nb(In/Out)Comp", "Nb(In/In)Comp", "Age",
            "MeanSize_OfBlocks", "N50Size_OfBlocks", "WASize_OfBlocks",
            "NbComp/Age"
        ]), file=sys.stdout)

    for anc in phylTree.listAncestr:
        # Number of outgroup species
        nb_outgroup = len(phylTree.outgroupSpecies[anc])

        # Number of species under each child of the ancestor
        nbInSpec = [len(phylTree.species[x]) for (x, _) in phylTree.items[anc]]
        # Same sizes with the outgroup added as one more group
        groupSizes = nbInSpec + [nb_outgroup]

        # Comparisons ingroup species / outgroup species
        compInOut = sum(nb_outgroup * n1 for n1 in nbInSpec)
        # Comparisons between species of two different child subtrees
        compInIn = sum(n1 * n2 for (n1, n2) in itertools.combinations(nbInSpec, 2))
        # All comparisons between two different groups
        nbc = sum(n1 * n2 for (n1, n2) in itertools.combinations(groupSizes, 2))

        # Block statistics
        if arguments["diags"] != "":
            r = []
            f = myFile.openFile(arguments["diags"] % phylTree.fileName[anc], "r")
            try:
                for line in f:
                    x = int(line.split("\t")[1])
                    # Values of 1 or less are left out of the statistics
                    if x > 1:
                        r.append(x)
            finally:
                # Close the file even if a line cannot be parsed
                f.close()
            totalStat = myMaths.myStats.valSummary2(r)
            # Items 9, 6 and 7 of the summary fill the MeanSize, N50Size and
            # WASize columns, in that order.
            blockStats = [totalStat[9], totalStat[6], int(totalStat[7])]
        else:
            # No block files given: print placeholders instead of indexing an
            # empty list (which raised IndexError).
            blockStats = ["NONE"] * 3

        print(
            myFile.myTSV.printLine(
                [anc, nbc, compInOut, compInIn, phylTree.ages[anc]]
                + blockStats
                + [float(nbc) / phylTree.ages[anc]]
            ))
def main():
    """Print every taxon of the tree with its age, one per line.

    Lines are ordered by increasing age, ties being broken by taxon name.
    Each line is the taxon, a tab, then the age formatted with ``%7g``.
    """
    args = myTools.checkArgs([('phyltree', myTools.File)], [], __doc__)
    phyltree = myPhylTree.PhylogeneticTree(args['phyltree'])

    def by_age_then_name(entry):
        # entry is a (taxon, age) pair from phyltree.ages
        name, years = entry
        return (years, name)

    ordered = sorted(phyltree.ages.items(), key=by_age_then_name)
    for name, years in ordered:
        print('\t'.join((name, '%7g' % years)))
# Context phyl_items = { 'Hominoidea': [('Hominidae', 1), ('Nomascus leucogenys', 1)], 'Hominidae': [('Homininae', 1), ('Pongo abelii', 1)], 'Homininae': [('HomoPan', 1), ('Gorilla gorilla', 1)], 'HomoPan': [('Pan', 1), ('H**o sapiens', 1)], 'Pan': [('Pan troglodytes', 1), ('Pan paniscus', 1)] } phyl_officialNames = { name: name for name in (set(phyl_items) | set(t for v in phyl_items.values() for t, _ in v)) } phyltree = myPhylTree.PhylogeneticTree( (phyl_items, 'Hominoidea', phyl_officialNames), skipInit=False, # No effect if giving items stream=stderr) phyltree.reinitTree(stream=stderr) data = [ myProteinTree.ProteinTree(data={1: [(2, 0.1), (3, 0.1)]}, info={ 1: { 'Duplication': 2, 'taxon_name': 'H**o sapiens' }, 2: { 'Duplication': 0, 'taxon_name': 'H**o sapiens' }, 3: {
def processData(data):
    """Parse one tree from text and print it as id/info/len records.

    `data` is wrapped in an ``io.StringIO`` and handed to
    ``myPhylTree.PhylogeneticTree``. The tree is then printed depth-first to
    standard output: for each node an ``id`` line, an ``info`` line holding a
    dict built from the node's tags, and, for each child, a ``len`` line
    followed by the child's own records, indented by one more tab.

    The module-level counters ``nodeid`` and ``ntree`` are read and
    incremented: ``nodeid`` once per node, ``ntree`` once per tree root.
    """
    tree = myPhylTree.PhylogeneticTree(io.StringIO(data))

    def printTree(indent, node):
        """Print `node` and its subtree, each line prefixed with `indent`."""
        global nodeid, ntree
        print("%sid\t%d" % (indent, nodeid))
        nodeid += 1

        tags = tree.info[node]
        # Keys are inserted in a fixed order because the dict is printed as-is.
        info = {}
        if "B" in tags:
            info["Bootstrap"] = int(tags["B"])

        if "D" in tags and tags["D"] == "N":
            info["Duplication"] = 0
            if "E" in tags:
                # NOTE(review): this branch splits on "=-$" while every other
                # branch splits on "=$-" - confirm which separator is right.
                info["taxon_lost"] = tags["E"].split("=-$")[1].split("-")
            if "S" in tags:
                # str.capitalize() replaces string.capitalize(), which no
                # longer exists in Python 3.
                info["taxon_name"] = tags["S"].capitalize()
        elif "D" in tags and tags["D"] == "Y":
            if "DD" in tags and tags["DD"] == "Y":
                info["Duplication"] = 1
                info["dubious_duplication"] = 1
            else:
                info["Duplication"] = 2
            if "E" in tags:
                info["taxon_lost"] = tags["E"].split("=$-")[1].split("-")
            if "S" in tags:
                info["taxon_name"] = tags["S"].capitalize()
            if "SIS" in tags:
                info["duplication_confidence_score"] = float(tags["SIS"]) / 100
        elif "E" in tags:
            info["taxon_lost"] = tags["E"].split("=$-")[1].split("-")
            if "S" in tags:
                info["Duplication"] = 0
                info["taxon_name"] = tags["S"].capitalize()
        elif "S" in tags:
            info["Duplication"] = 0
            info["taxon_name"] = tags["S"].capitalize()

        # Only the root call uses an empty indent: name the tree there.
        if indent == "":
            info["tree_name"] = "TreeBeST%06d" % ntree
            ntree += 1

        if node not in tree.items:
            # Modified by Alex to keep the "_" in the gene name: the name used
            # to be node.rpartition("_")[0]; the full node name is now kept.
            info["gene_name"] = node

        print("%sinfo\t%s" % (indent, info))

        if node in tree.items:
            indent = indent + "\t"
            for (child, length) in tree.items[node]:
                print("%slen\t%g" % (indent, length))
                printTree(indent, child)

    printTree("", tree.root)