def GET(self, tracker):
    """Serve the cached data of *tracker* rendered as an HTML data table."""
    cache = Cache.Cache(tracker, mode="r")
    tree = DataTree.fromCache(cache)
    table, row_headers, col_headers = DataTree.tree2table(tree)
    return render.data_table(table, row_headers, col_headers)
def getData(self, path):
    """Return data for track and slice, caching results persistently.

    For function trackers *path* is the empty tuple; the result is
    stored under the special key ``"all"``.
    """
    key = DataTree.path2str(path) if path else "all"

    result = None
    fromcache = False
    # trackers with options are not cached
    if not self.nocache and not self.tracker_options:
        try:
            result = self.cache[key]
            fromcache = True
        except KeyError:
            pass
        except RuntimeError as msg:
            raise RuntimeError(
                "error when accessing key %s from cache: %s "
                "- potential problem with unpickable object?" % (key, msg))

    kwargs = {'options': self.tracker_options} if self.tracker_options else {}

    if result is None:
        try:
            result = self.tracker(*path, **kwargs)
        except Exception as msg:
            self.warn("exception for tracker '%s', path '%s': msg=%s" %
                      (str(self.tracker),
                       DataTree.path2str(path),
                       msg))
            if VERBOSE:
                self.warn(traceback.format_exc())
            raise

    # store in cache
    if not self.nocache and not fromcache:
        # exception - do not store data frames
        # test with None fails for some reason
        self.cache[key] = result

    return result
def render(self, data):
    """Collect text stored under Tracker keywords into a ResultBlocks tree."""
    # initiate output structure
    results = ResultBlocks(title='user')

    labels = DataTree.getPaths(data)
    # walk the nodes one level above the leaves
    for path, branch in DataTree.getNodes(data, len(labels) - 2):
        present = [kw for kw in Utils.TrackerKeywords if kw in branch]
        for keyword in present:
            # add a result block per matching keyword
            results.append(
                ResultBlock(branch[keyword], title=path2str(path)))

    return results
def transform(self, data):
    """Compute 2- or 3-way set intersection counts for a Venn diagram.

    Expects a melted dataframe with exactly one column; groups on all
    index levels and builds one set per group.
    """
    if len(data.columns) != 1:
        raise ValueError(
            'transformer requires dataframe with '
            'a single column, got %s' % data.columns)
    column = data.columns[0]

    # one set per lowest-level group, optionally skipping background
    genesets = {}
    nlevels = Utils.getDataFrameLevels(data)
    for key, group in data.groupby(level=list(range(nlevels))):
        if "background" in key and not self.background:
            continue
        genesets[key] = set(group[column])

    keys = list(genesets.keys())
    members = [set(genesets[k]) for k in keys]

    if len(members) == 2:
        a, b = members
        values = [("10", len(a - b)),
                  ("01", len(b - a)),
                  ("11", len(a & b)),
                  ("labels", list(map(path2str, keys)))]
    elif len(members) == 3:
        a, b, c = members
        values = [("100", len(a - b - c)),
                  ("010", len(b - a - c)),
                  ("001", len(c - a - b)),
                  ("110", len((a & b) - c)),
                  ("101", len((a & c) - b)),
                  ("011", len((b & c) - a)),
                  ("111", len((a & b) & c)),
                  ("labels", list(map(path2str, keys)))]
    else:
        raise ValueError(
            "Can currently only cope with 2 or 3 way intersections")

    return DataTree.listAsDataFrame(values)
def __call__(self, *args, **kwargs):
    """Run the full dispatch pipeline: parse, collect, transform,
    filter, group and render.

    Returns a ResultBlocks tree, or None if no data was collected.
    Each stage is wrapped so that a failure is reported as an
    exception block instead of aborting the build.

    FIX: the original used bare ``except:`` clauses, which also
    swallow KeyboardInterrupt/SystemExit; narrowed to
    ``except Exception:`` throughout.
    """
    try:
        self.parseArguments(*args, **kwargs)
    except Exception:
        self.error("%s: exception in parsing" % self)
        return ResultBlocks(ResultBlocks(Utils.buildException("parsing")))

    # collect no data if tracker is the empty tracker
    # and go straight to rendering
    try:
        if self.tracker.getTracks() == ["empty"]:
            # isinstance does not work because of module mapping
            # type(Tracker.Empty) == CGATReport.Tracker.Empty
            # type(self.tracker) == Tracker.Empty
            result = self.renderer()
            return ResultBlocks(result)
    except AttributeError:
        # for function trackers
        pass

    self.debug("profile: started: tracker: %s" % (self.tracker))

    # collecting data
    try:
        self.collect()
    except Exception:
        self.error("%s: exception in collection" % self)
        return ResultBlocks(ResultBlocks(
            Utils.buildException("collection")))
    finally:
        self.debug("profile: finished: tracker: %s" % (self.tracker))

    if self.tree is None or len(self.tree) == 0:
        self.info("%s: no data - processing complete" % self.tracker)
        return None

    data_paths = DataTree.getPaths(self.tree)
    self.debug("%s: after collection: %i data_paths: %s" %
               (self, len(data_paths), str(data_paths)))

    # special Renderers - do not process data further but render
    # directly. Note that no transformations will be applied.
    if isinstance(self.renderer, Renderer.User):
        results = ResultBlocks(title="main")
        results.append(self.renderer(self.tree))
        return results
    elif isinstance(self.renderer, Renderer.Debug):
        results = ResultBlocks(title="main")
        results.append(self.renderer(self.tree))
        return results

    # merge all data to hierarchical indexed dataframe
    self.data = DataTree.asDataFrame(self.tree)

    self.debug("dataframe memory usage: total=%i,data=%i,index=%i,col=%i" %
               (self.data.values.nbytes +
                self.data.index.nbytes +
                self.data.columns.nbytes,
                self.data.values.nbytes,
                self.data.index.nbytes,
                self.data.columns.nbytes))

    # if tracks are set by tracker, call tracker with dataframe
    if self.indexFromTracker:
        self.tracker.setIndex(self.data)

    # transform data
    try:
        self.transform()
    except Exception:
        self.error("%s: exception in transformation" % self)
        return ResultBlocks(ResultBlocks(
            Utils.buildException("transformation")))

    # restrict to selected paths
    try:
        self.filterPaths(self.restrict_paths, mode="restrict")
    except Exception:
        self.error("%s: exception in restrict" % self)
        return ResultBlocks(ResultBlocks(
            Utils.buildException("restrict")))

    # drop excluded paths
    try:
        self.filterPaths(self.exclude_paths, mode="exclude")
    except Exception:
        self.error("%s: exception in exclude" % self)
        return ResultBlocks(ResultBlocks(Utils.buildException("exclude")))

    # No pruning - maybe enable later as a user option
    self.pruned = []

    try:
        self.group()
    except Exception:
        self.error("%s: exception in grouping" % self)
        return ResultBlocks(ResultBlocks(Utils.buildException("grouping")))

    if self.renderer is not None:
        self.debug("profile: started: renderer: %s" % (self.renderer))
        try:
            result = self.render()
        except Exception:
            self.error("%s: exception in rendering" % self)
            return ResultBlocks(ResultBlocks(
                Utils.buildException("rendering")))
        finally:
            self.debug("profile: finished: renderer: %s" % (self.renderer))
    else:
        result = ResultBlocks(title="")

    return result
def collect(self):
    '''collect all data.

    Data is stored in a multi-level dictionary (DataTree)
    '''
    self.tree = odict()

    self.debug("%s: collecting data paths." % (self.tracker))
    is_function, datapaths = self.getDataPaths(self.tracker)
    self.debug("%s: collected data paths." % (self.tracker))

    # function trackers have no data paths - fetch once and store
    # under the single leaf ("all",)
    if is_function:
        leaf = self.getData(())
        DataTree.setLeaf(self.tree, ("all",), leaf)
        self.debug("%s: collecting data finished for function." %
                   (self.tracker))
        return

    def _no_tracks(paths):
        # true when there is nothing to iterate over
        return len(paths) == 0 or len(paths[0]) == 0

    if _no_tracks(datapaths):
        self.warn("%s: no tracks found - no output" % self.tracker)
        return

    # filter data paths
    self.debug("%s: filtering data paths: %s" % (self.tracker, datapaths))
    datapaths = self.filterDataPaths(datapaths)
    self.debug("%s: filtered data paths: %s" % (self.tracker, datapaths))

    if _no_tracks(datapaths):
        self.warn(
            "%s: no tracks remain after filtering "
            "- no output" % self.tracker)
        return

    self.debug("%s: building all_paths" % (self.tracker))
    if len(datapaths) > MAX_PATH_NESTING:
        self.warn("%s: number of nesting in data paths too large: %i" % (
            self.tracker, len(datapaths)))
        raise ValueError(
            "%s: number of nesting in data paths too large: %i" % (
                self.tracker, len(datapaths)))

    combinations = list(itertools.product(*datapaths))

    self.debug(
        "%s: collecting data started for %i data paths" % (
            self.tracker, len(combinations)))

    self.tree = odict()
    for path in combinations:
        node = self.getData(path)
        # ignore empty data sets
        if node is None:
            continue
        # save in data tree as leaf
        DataTree.setLeaf(self.tree, path, node)

    self.debug(
        "%s: collecting data finished for %i data paths" % (
            self.tracker, len(combinations)))

    return self.tree
def transform(self, data):
    """Test pairwise list overlaps against a background set with the
    hypergeometric test.

    Expects a melted dataframe with a single column; one list per
    index group, exactly one of which must be named 'background'.
    Returns a dataframe of enrichment fold-changes and P-values.

    FIX: the original assigned ``keys = genesets.keys()`` and then
    subscripted it (``keys[0]``, ``keys[1]``) - ``dict_keys`` is not
    subscriptable in Python 3, so any run reaching those branches
    raised TypeError. ``keys`` is now materialized as a list. The
    ``groupby`` level is also passed as a list (as the sibling Venn
    transformer already does) for pandas compatibility.
    """
    # check if data is melted:
    if len(data.columns) != 1:
        raise ValueError(
            'transformer requires dataframe with'
            'a single column, got %s' % data.columns)
    column = data.columns[0]

    # iterate over lowest levels to build a dictionary of sets,
    # keyed by the stringified path
    genesets = {}
    nlevels = Utils.getDataFrameLevels(data)
    for key, group in data.groupby(level=list(range(nlevels))):
        genesets[path2str(key)] = set(group[column])

    keys = list(genesets.keys())

    background = None
    foreground = []
    for key in keys:
        if "background" in key:
            background = genesets[key]
        else:
            foreground.append(key)

    if len(keys) < 3 or background is None:
        raise ValueError(
            "Expected at least 3 lists, with one called background, "
            "instead got %i lists called %s" %
            (len(keys), ", ".join(keys)))

    # every foreground item must be contained in the background
    missing = {
        y: [str(x) for x in genesets[y] if x not in background]
        for y in foreground}

    if any([len(missing[x]) > 0 for x in missing]):
        missing_items = "\n\t".join(
            ["%s:\t%s" % (x, ",".join(missing[x])) for x in missing])
        raise ValueError(
            "Found items in lists not in background. "
            "Missing items:\n\t %s" % missing_items)

    M = len(set(background))
    if len(keys) == 2:
        # NOTE(review): unreachable given the ``len(keys) < 3`` guard
        # above; kept from the original in case the guard changes.
        n = len(set(genesets[keys[1]]))
        N = len(set(genesets[keys[0]]))
        x = len(set(genesets[keys[0]]) & set(genesets[keys[1]]))
        p = scipy.stats.hypergeom.sf(x, M, n, N)
        fc = ((x + 0.0) / N) / ((n + 0.0) / M)
        values = [("Enrichment", fc),
                  ("P-value", p)]
    else:
        enrichments = []
        pvals = []
        As = []
        Bs = []
        for a, b in itertools.combinations(keys, 2):
            N = len(set(genesets[a]))
            n = len(set(genesets[b]))
            x = len(set(genesets[a]) & set(genesets[b]))
            p = scipy.stats.hypergeom.sf(x, M, n, N)
            fc = ((x + 0.0) / N) / ((n + 0.0) / M)
            As.append(a)
            Bs.append(b)
            pvals.append(p)
            enrichments.append(fc)
        values = [("ListA", As),
                  ("ListB", Bs),
                  ("Enrichment", enrichments),
                  ("P-value", pvals)]

    return DataTree.listAsDataFrame(values, values_are_rows=True)
def asSpreadSheet(self, dataframe, row_headers, col_headers, title):
    '''save the table as an xls file.

    Multiple files of the same Renderer/Tracker combination are
    distinguished by the title.

    Returns a ResultBlock carrying the openpyxl workbook in its
    ``xls`` attribute.
    '''
    self.debug("%s: saving %i x %i table as spread-sheet'" %
               (id(self), len(row_headers), len(col_headers)))

    # FIX: pandas.core.index.MultiIndex is a private path that was
    # removed in modern pandas; pandas.MultiIndex is the public,
    # version-stable name for the same class.
    is_hierarchical = isinstance(dataframe.index, pandas.MultiIndex)

    split = is_hierarchical and len(dataframe.index.levels) > 1
    quick = len(dataframe) > 10000

    if quick and not split:
        # quick writing, only append method works
        # NOTE(review): optimized_write was renamed write_only in
        # newer openpyxl - confirm against the pinned version
        wb = openpyxl.Workbook(optimized_write=True)

        def fillWorksheet(ws, dataframe, title):
            # stream rows: header first, then one row per index entry
            ws.append([""] + list(col_headers))
            for x, row in enumerate(dataframe.iterrows()):
                ws.append([path2str(row[0])] + list(row[1]))
            # patch: maximum title length seems to be 31
            ws.title = title[:30]
    else:
        # do it cell-by-cell, this might be slow
        wb = openpyxl.Workbook(optimized_write=False)

        def fillWorksheet(ws, dataframe, title):
            # regex to detect rst hyperlinks
            regex_link = re.compile('`(.*) <(.*)>`_')
            # write row names
            for row, row_name in enumerate(dataframe.index):
                # rows and columns start at 1
                c = ws.cell(row=row + 2, column=1)
                c.value = row_name
            # write columns
            for column, column_name in enumerate(dataframe.columns):
                # set column title
                # rows and columns start at 1
                c = ws.cell(row=1, column=column + 2)
                c.value = column_name
                # set column values; object columns may contain rst
                # links which are converted to real hyperlinks
                dataseries = dataframe[column_name]
                if dataseries.dtype == object:
                    for row, value in enumerate(dataseries):
                        c = ws.cell(row=row + 2, column=column + 2)
                        value = str(value)
                        if value.startswith('`'):
                            c.value, c.hyperlink = \
                                regex_link.match(value).groups()
                        else:
                            c.value = value
                else:
                    for row, value in enumerate(dataseries):
                        c = ws.cell(row=row + 2, column=column + 2)
                        c.value = value
            # patch: maximum title length seems to be 31
            ws.title = re.sub("/", "_", title)[:30]

    if len(wb.worksheets) == 0:
        wb.create_sheet()

    if split:
        # create separate worksheets for nested indices, plus a
        # summary sheet linking to each of them
        nlevels = len(dataframe.index.levels)
        paths = map(tuple, DataTree.unique(
            [x[:nlevels - 1] for x in dataframe.index.unique()]))

        ws = wb.worksheets[0]
        ws.title = 'Summary'
        ws.append([""] * (nlevels - 1) + ["Worksheet", "Rows"])

        for row, path in enumerate(paths):
            # select data frame as cross-section
            work = dataframe.xs(path, axis=0)
            title = path2str(path)
            if len(title) > 30:
                title = "sheet%i" % row
            ws.append(list(path) + [title, len(work)])
            c = ws.cell(row=row + 1, column=nlevels)
            # this does not work in oocalc
            c.hyperlink = "#%s!A1" % title
            fillWorksheet(wb.create_sheet(), work, title=title)
    else:
        fillWorksheet(wb.worksheets[0], dataframe, title=title)

    # write result block referencing the attached workbook
    lines = []
    lines.append("`%i x %i table <#$xls %s$#>`__" %
                 (len(row_headers), len(col_headers), title))
    lines.append("")
    r = ResultBlock("\n".join(lines), title=title)
    r.xls = wb

    self.debug("%s: saved %i x %i table as spread-sheet'" %
               (id(self), len(row_headers), len(col_headers)))
    return r
def main():
    """Command line entry point: list or dump the contents of a
    tracker's persistent cache as tsv/csv.

    FIX: ``options.tracker == None`` replaced with ``is None``
    (identity comparison for None), and the accidental adjacent
    string literal ``"please supply a tracker.""`` collapsed into a
    single literal (same runtime message).
    """
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=USAGE)

    parser.add_option("-v", "--verbose", dest="loglevel", type="int",
                      help="loglevel. The higher, the more output "
                      "[default=%default]")

    parser.add_option("-i", "--view", dest="view", action="store_true",
                      help="view keys in cache [default=%default]")

    parser.add_option("-t", "--tracker", dest="tracker", type="string",
                      help="tracker to use [default=%default]")

    parser.add_option("-a", "--tracks", dest="tracks", type="string",
                      help="tracks to include [default=%default]")

    parser.add_option("-s", "--slices", dest="slices", type="string",
                      help="slices to include [default=%default]")

    parser.add_option("-g", "--groupby", dest="groupby", type="choice",
                      choices=("track", "slice", "all"),
                      help="groupby by track or slice [default=%default]")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("tsv", "csv"),
                      help="output format [default=%default]")

    parser.set_defaults(
        loglevel=2,
        view=False,
        tracker=None,
        tracks=None,
        slices=None,
        groupby="slice",
        format="tsv",
    )

    (options, args) = parser.parse_args()

    if len(args) != 1 and options.tracker is None:
        print(USAGE)
        raise ValueError("please supply a tracker.")

    tracker = options.tracker if options.tracker else args[0]

    cache = Cache.Cache(tracker, mode="r")

    if options.view:
        # only list the available track/slice keys, then exit
        keys = [x.split("/") for x in list(cache.keys())]
        sys.stdout.write("# available tracks\n")
        sys.stdout.write("track\n%s" % "\n".join(set([x[0] for x in keys])))
        sys.stdout.write("\n")
        sys.stdout.write("# available slices\n")
        sys.stdout.write("slice\n%s" % "\n".join(set([x[1] for x in keys])))
        sys.stdout.write("\n")
        return

    data = DataTree.fromCache(cache,
                              tracks=options.tracks,
                              slices=options.slices,
                              groupby=options.groupby)

    table, row_headers, col_headers = DataTree.tree2table(data)

    if options.format in ("tsv", "csv"):
        if options.format == "tsv":
            sep = "\t"
        elif options.format == "csv":
            sep = ","
        # header line, then one line per row
        sys.stdout.write(sep + sep.join(col_headers) + "\n")
        for h, row in zip(row_headers, table):
            sys.stdout.write("%s%s%s\n" % (h, sep, sep.join(row)))