def _first_line(tar, member):
    """Return the first line of archive *member* as a native string.

    The member's file handle is closed before returning.
    """
    handle = tar.extractfile(member)
    try:
        line = handle.readline()
    finally:
        handle.close()
    # On Python 3 tar members are read as bytes, which can neither be split
    # on a str separator nor compared with str seed ids. On Python 2 the
    # line is already a str and this is a no-op.
    # NOTE(review): UTF-8 is assumed for the dump content -- confirm.
    if not isinstance(line, str):
        line = line.decode("utf-8")
    return line


def phylomedump_tree_iterator(tarfn, verbose=0):
    """PhylomeDB all_trees.tar.gz dump treeobj generator.

    Walks the archive in order. Every ``*.nw`` member is loaded into a
    PhyloTree whose ``seedid`` and ``method`` attributes are taken from the
    first two dot-separated fields of the member's base name
    (``Phy000CWA9_YEAST.JTT.nw`` --> ``Phy000CWA9_YEAST``, ``JTT``).
    Every ``*.lk`` member supplies ``seedid<TAB>method<TAB>lk`` on its first
    line; the likelihood is stored as ``lk`` on the most recently loaded
    tree and that tree is yielded.

    A ``.lk`` entry is reported on stderr and skipped when no tree has been
    loaded yet, when the likelihood is zero, or when its seedid/method do
    not match the most recently loaded tree.

    Args:
        tarfn: Path to the tar archive; opened as gzip when the name ends
            with ``.gz``.
        verbose: When truthy, write progress and a final summary to stderr.

    Yields:
        PhyloTree objects carrying ``seedid``, ``method`` and ``lk``.
    """
    mode = "r:gz" if tarfn.endswith(".gz") else "r"
    # i counts loaded .nw trees, k counts trees actually yielded.
    i = k = 0
    # Most recently loaded tree; None until the first .nw entry is seen.
    t = None
    # The context manager closes the archive when the generator is
    # exhausted, closed early, or aborted by an exception.
    with tarfile.open(tarfn, mode) as tar:
        for m in tar:
            if not m.isfile():
                continue
            if m.name.endswith(".nw"):
                i += 1
                t = PhyloTree(_first_line(tar, m))
                # Phy000CWA9_YEAST.JTT.nw --> Phy000CWA9_YEAST JTT
                t.seedid, t.method = os.path.basename(m.name).split(".")[:2]
            elif m.name.endswith(".lk"):
                if t is None:
                    # A likelihood entry with no tree loaded before it
                    # cannot be attached to anything.
                    sys.stderr.write(
                        " Err: Likelihood without preceding tree: %s\n" % m.name)
                    continue
                seedid, method, lk = _first_line(tar, m).split('\t')[:3]
                t.lk = float(lk)
                if not t.lk:
                    sys.stderr.write(
                        " Err: Zero likelihood (%s) for: %s\n"
                        % (t.lk, ", ".join((t.seedid, t.method))))
                    continue
                if seedid != t.seedid or t.method != method:
                    sys.stderr.write(
                        " Err: Seedid and/or method doesn't match: %s\n"
                        % ", ".join((seedid, t.seedid, method, t.method)))
                    continue
                k += 1
                if verbose and not i % 100:
                    sys.stderr.write(" %6i\r" % i)
                yield t
    if verbose:
        sys.stderr.write(
            " %s out of %s trees succesfully parsed [memory: %s KB]\n"
            % (k, i, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))