def render(self, dataframe, path):
    """Render rows of *dataframe* as an HTML slideshow embedded in rst.

    Each row must provide 'filename' and 'name' entries; rendering is
    aborted with a warning (returning the blocks built so far) if
    either key is missing.
    """
    blocks = ResultBlocks()
    options = self.get_slideshow_options()
    lines = [self.prefix % options]
    for title, row in dataframe.iterrows():
        # drop missing values so zip() pairs only present fields
        row = row[row.notnull()]
        values = row.tolist()
        headers = list(row.index)
        dataseries = dict(zip(headers, values))
        try:
            # return value is a series
            filename = dataseries['filename']
        except KeyError:
            self.warn(
                "no 'filename' key in path %s" % (path2str(path)))
            return blocks
        try:
            # return value is a series
            name = dataseries['name']
        except KeyError:
            self.warn(
                "no 'name' key in path %s" % (path2str(path)))
            return blocks
        description, title = os.path.split(name)
        lines.extend(self.add_image(filename, title, description))
    # close the slideshow container opened by self.prefix
    lines.append("""</div>""")
    lines.append(self.skin % options)
    lines.append("""</div>""")
    # re-split so that multi-line entries get indented uniformly below
    lines = "\n".join(lines).split("\n")
    # NOTE(review): ".. only::html" lacks a space before "html" —
    # docutils expects ".. only:: html"; confirm against rendered output.
    lines = [".. only::html\n"] +\
        [" .. raw:: html\n"] +\
        [" " + x for x in lines]
    lines = "\n".join(lines)
    blocks.append(ResultBlock(text=lines, title=path2str(path)))
    return blocks
def fillWorksheet(ws, dataframe, title):
    # Append-only fill of worksheet *ws* from *dataframe*: one header
    # row followed by one row per dataframe row (index rendered via
    # path2str).
    # NOTE(review): ``col_headers`` is not defined in this scope — this
    # looks like a closure fragment of asSpreadSheet relying on the
    # enclosing function's variables; confirm intended scoping.
    ws.append([""] + list(col_headers))
    for x, row in enumerate(dataframe.iterrows()):
        ws.append([path2str(row[0])] + list(row[1]))
    # patch: maximum title length seems to be 31
    ws.title = title[:30]
def __call__(self, dataframe, path):
    """Render *dataframe* as rst, additionally writing a file or
    spread-sheet version when the table is large or self.separate is
    set."""
    results = ResultBlocks()
    if dataframe is None:
        return results

    title = path2str(path)
    row_headers = dataframe.index
    col_headers = dataframe.columns

    # do not output large matrices as rst files
    too_large = (len(row_headers) > self.max_rows or
                 len(col_headers) > self.max_cols)
    if self.separate or (too_large and not self.force):
        if self.large == "xls":
            block = self.asSpreadSheet(dataframe, row_headers,
                                       col_headers, title)
        else:
            block = self.asFile(dataframe, row_headers,
                                col_headers, title)
        results.append(block)

    results.append(self.asRST(dataframe, row_headers, col_headers, title))
    return results
def __call__(self, dataframe, path):
    """Render *dataframe* as a plain-text rst literal block.

    Honours self.head/self.tail (truncated views, joined with an
    ellipsis), self.summary (describe()), otherwise the full frame.
    """
    result = ResultBlocks()
    texts = []
    if self.head or self.tail:
        if self.head:
            texts.append(str(dataframe.head(self.head)))
        if self.tail:
            texts.append(str(dataframe.tail(self.tail)))
    elif self.summary:
        texts.append(str(dataframe.describe()))
    else:
        texts.append(str(dataframe))
    # add indentation
    texts = ['\n'.join([' %s' % y for y in x.split('\n')])
             for x in texts]
    # NOTE(review): the exact newline layout of this literal was lost
    # in the source formatting; reconstructed as a standard rst
    # literal block — confirm against rendered output.
    formatted = '''
::

%s

''' % '\n ...\n'.join(texts)
    result.append(ResultBlock(formatted, title=path2str(path)))
    return result
def endPlot(self, work, path):
    """Collect the current R device as a single plot placeholder."""
    device_id = getCurrentRDevice()
    placeholder = "\n".join(("#$rpl %i$#" % device_id, ""))
    return ResultBlocks(
        ResultBlock(placeholder, title=path2str(path)))
def endPlot(self, layout, legend, path):
    """Wrap a holoviews *layout* in a placeholder result block."""
    title = path2str(path)
    block = ResultBlock(text="#$hv {}$#".format(title), title=title)
    # attach the layout so the builder can substitute the placeholder
    block.hv = layout
    return ResultBlocks(block)
def render(self, work, path):
    """render the data.

    Builds a matrix from *work* and emits it as an rst csv-table;
    large matrices are written to a separate file instead.
    """
    results = ResultBlocks(title=path)
    matrix, rows, columns = self.buildMatrix(work)
    title = path2str(path)
    # an empty matrix yields an empty block
    if len(rows) == 0:
        return ResultBlocks(ResultBlock("", title=title))
    # do not output large matrices as rst files
    # separate and force need to be mixed in.
    if self.separate or (not self.force and
                         (len(rows) > self.max_rows or
                          len(columns) > self.max_cols)):
        return ResultBlocks(self.asFile(pandas.DataFrame(matrix,
                                                         index=rows,
                                                         columns=columns),
                                        rows,
                                        columns,
                                        title),
                            title=path)
    lines = []
    lines.append(".. csv-table:: %s" % title)
    lines.append(' :header: "track","%s" ' % '","'.join(columns))
    lines.append('')
    for row in range(len(rows)):
        lines.append(
            ' "%s","%s"' % (rows[row], '","'.join(
                [self.toString(x) for x in matrix[row]])))
    lines.append("")
    if path is None:
        subtitle = ""
    else:
        subtitle = path2str(path)
    results.append(ResultBlock("\n".join(lines), title=subtitle))
    return results
def __call__(self, dataframe, path):
    """Render a status dataframe as an rst table, optionally followed
    by a glossary of test descriptions."""
    # index has test names
    # columns are description, info, status
    columns = ('description', 'info', 'status', 'name')
    if set(dataframe.columns) != set(columns):
        raise ValueError("invalid columns: expected '%s', got '%s' " %
                         (columns, dataframe.columns))
    lines = []
    dirname = os.path.join(os.path.dirname(
        sys.modules["CGATReport"].__file__), "images")
    descriptions = {}
    title = "status"
    # add header
    # NOTE(review): these csv-table header lines are discarded below
    # when ``lines`` is rebound to table2rst output — dead code?
    lines.append(".. csv-table:: %s" % "table")
    lines.append(' :header: "Track", "Test", "", "Status", "Info"')
    lines.append('')
    rows = []
    for index, values in dataframe.iterrows():
        testname = values['name']
        status = values['status']
        try:
            # map the status code to its icon; unknown codes get none
            image = ".. image:: {}\n :width: 32".format(
                os.path.join(dirname, self.map_code2image[status.upper()]))
        except KeyError:
            image = ""
        rows.append({
            "test": testname,
            "description": values["description"],
            "info": values['info'],
            "status": status,
            "track": path2str(index),
            "image": image,
        })
        descriptions[testname] = values["description"]
    # filter and sort table
    table = [self.columns]
    table.extend([[row[x] for x in self.columns] for row in rows])
    lines = Utils.table2rst(table).split("\n")
    if self.display_legend:
        lines.append(".. glossary::")
        lines.append("")
        for test, description in descriptions.items():
            lines.append('%s\n%s\n' % (Utils.indent(test, 3),
                                       Utils.indent(description, 6)))
    return ResultBlocks(ResultBlock("\n".join(lines), title=""))
def render(self, dataframe, path): R.library('ggplot2') # add all indices as columns dataframe.reset_index(inplace=True) rframe = pandas.rpy.common.convert_to_r_dataframe(dataframe) # for the issue below, see: # http://stackoverflow.com/questions/12865218/getting-rid-of-asis-class-attribute unAsIs = R('''function (x) { if(typeof(x) %in% c("integer","double")) { class(x) <- "numeric" return (x)} else if (typeof(x) == "character") { class(x) <- "character" return (x) } else { return(x) } }''') rframe = R["as.data.frame"](R.lapply(rframe, unAsIs)) R.assign("rframe", rframe) # start plot R('''gp = ggplot(rframe)''') # add aesthetics and geometries try: pp = R('''gp + %s ''' % self.statement) except ValueError as msg: raise ValueError( "could not interprete R statement: " "gp + %s; msg=%s" % (self.statement, msg)) figname = re.sub('/', '_', path2str(path)) r = ResultBlock('#$ggplot %s$#' % figname, title=path2str(path)) r.rggplot = pp r.figname = figname return ResultBlocks(r)
def render(self, dataframe, path):
    """Draw an R boxplot, one box per series in *dataframe*."""
    self.startPlot()
    series = Utils.toMultipleSeries(dataframe)
    labels = [path2str(item[0]) for item in series]
    values = [item[1] for item in series]
    R.boxplot(values, names=labels)
    return self.endPlot(dataframe, path)
def render(self, dataframe, path): R.library('ggplot2') # add all indices as columns dataframe.reset_index(inplace=True) rframe = rpy2.robjects.pandas2ri.py2ri(dataframe) # for the issue below, see: # http://stackoverflow.com/questions/12865218/getting-rid-of-asis-class-attribute unAsIs = R('''function (x) { if("AsIs" %in% class(x)) { class(x) <- class(x)[-match("AsIs", class(x))] } return (x) } ''') rframe = R["as.data.frame"](R.lapply(rframe, unAsIs)) R.assign("rframe", rframe) # start plot R('''gp = ggplot(rframe)''') # add aesthetics and geometries try: pp = R('''gp + %s ''' % self.statement) except ValueError as msg: raise ValueError( "could not interprete R statement: " "gp + %s; msg=%s" % (self.statement, msg)) figname = re.sub('/', '_', path2str(path)) r = ResultBlock('#$ggplot %s$#' % figname, title=path2str(path)) r.rggplot = pp r.figname = figname return ResultBlocks(r)
def __call__(self, dataframe, path):
    """Always render *dataframe* as a spread-sheet result block."""
    results = ResultBlocks()
    if dataframe is None:
        return results
    title = path2str(path)
    row_headers = dataframe.index
    col_headers = dataframe.columns
    sheet = self.asSpreadSheet(dataframe, row_headers,
                               col_headers, title)
    results.append(sheet)
    return results
def __call__(self, dataframe, path):
    """Render *dataframe* as a csv table, diverting large tables to a
    file or spread-sheet."""
    # modify table (adding/removing columns) according to user options
    dataframe = self.modifyTable(dataframe)
    title = path2str(path)
    results = ResultBlocks()
    row_headers = dataframe.index
    col_headers = dataframe.columns
    # as of sphinx 1.3.1, tables with more than 100 columns cause an
    # error:
    # Exception occurred:
    # File ".../docutils/writers/html4css1/__init__.py", line 642,
    #   in write_colspecs
    # colwidth = int(node['colwidth'] * 100.0 / width + 0.5)
    # ZeroDivisionError: float division by zero
    #
    # Thus, for table with more than 100 columns, force will be
    # disabled and max_cols set to a low value in order to make
    # sure the table is not displayed inline
    if len(col_headers) >= 90:
        self.force = False
        self.max_cols = 10
    # do not output large matrices as rst files
    if self.separate or (not self.force and
                         (len(row_headers) > self.max_rows or
                          len(col_headers) > self.max_cols)):
        if self.large == "xls":
            results.append(self.asSpreadSheet(dataframe, row_headers,
                                              col_headers, title))
        else:
            results.append(self.asFile(dataframe, row_headers,
                                       col_headers, title))
        if self.preview:
            # NOTE(review): the truncation below is unreachable — the
            # raise precedes it; left as-is pending the preview feature.
            raise NotImplementedError('preview not implemented')
            row_headers = row_headers[:self.max_rows]
            col_headers = col_headers[:self.max_cols]
        else:
            return results
    results.append(self.asCSV(dataframe, row_headers, col_headers, title))
    return results
def __call__(self, dataframe, path):
    """Render a status dataframe as an rst csv-table followed by a
    glossary of test descriptions."""
    # index has test names
    # columns are description, info, status
    columns = ('description', 'info', 'status', 'name')
    if set(dataframe.columns) != set(columns):
        raise ValueError("invalid columns: expected '%s', got '%s' " %
                         (columns, dataframe.columns))
    lines = []
    dirname = os.path.join(os.path.dirname(
        sys.modules["CGATReport"].__file__), "images")
    descriptions = {}
    title = "status"
    # add header
    lines.append(".. csv-table:: %s" % "table")
    lines.append(' :header: "Track", "Test", "", "Status", "Info"')
    lines.append('')
    for index, values in dataframe.iterrows():
        testname = values['name']
        description = values['description']
        info = values['info']
        status = values['status']
        track = path2str(index)
        descriptions[testname] = description
        try:
            # map the status code to its icon; unknown codes get none
            image = ".. image:: %s" %\
                os.path.join(dirname, self.map_code2image[status.upper()])
        except KeyError:
            image = ""
        lines.append(
            ' "%(track)s", ":term:`%(testname)s`", "%(image)s", "%(status)s", "%(info)s"' % locals())
    lines.append("")
    lines.append(".. glossary::")
    lines.append("")
    for test, description in descriptions.items():
        lines.append('%s\n%s\n' % (Utils.indent(test, 3),
                                   Utils.indent(description, 6)))
    return ResultBlocks(ResultBlock("\n".join(lines), title=""))
def __call__(self, dataframe, path):
    """Render each dataframe row as an entry in an rst glossary."""
    results = ResultBlocks()
    if dataframe is None:
        return results
    title = path2str(path)
    lines = [".. glossary::", ""]
    for header, data in dataframe.iterrows():
        body = "\n ".join([part.strip()
                           for part in str(data).split("\n")])
        lines.append(' %s\n %s\n' % (path2str(header), body))
    lines.append("")
    results.append(ResultBlock("\n".join(lines), title=title))
    return results
def render(self, dataframe, path):
    """Draw an R boxplot from the series contained in *dataframe*."""
    self.startPlot()
    series = Utils.toMultipleSeries(dataframe)
    labels = [path2str(entry[0]) for entry in series]
    values = [entry[1] for entry in series]
    R.boxplot(values, names=labels)
    return self.endPlot(dataframe, path)
def render(self, data):
    """Collect tracker keyword entries from *data* as result blocks."""
    # initiate output structure
    results = ResultBlocks(title='user')
    labels = DataTree.getPaths(data)
    # visit the nodes one level above the leaves
    for path, branch in DataTree.getNodes(data, len(labels) - 2):
        for keyword in Utils.TrackerKeywords:
            if keyword in branch:
                # add a result block for this keyword's content
                results.append(
                    ResultBlock(branch[keyword], title=path2str(path)))
    return results
def endPlot(self, plts, legends, path):
    """Close plots.

    Emits a ``#$bkh <id>$#`` placeholder for the current bokeh figure
    and attaches the figure to the result block so the document
    builder can substitute it later.
    """
    title = path2str(path)
    # fixed: removed dead assignment ``figid = 10`` that was
    # immediately overwritten by the real figure id below
    figid = self.bokeh_figure._id
    lines = []
    lines.append("")
    lines.append("#$bkh %s$#" % figid)
    lines.append("")
    r = ResultBlock("\n".join(lines), title=title)
    r.bokeh = self.bokeh_figure
    return ResultBlocks(r)
def endPlot(self, plots, legends, path):
    """Close plots, emitting one placeholder block per bokeh figure."""
    result = ResultBlocks()
    title = path2str(path)
    for plot in plots:
        placeholder = "\n".join(["", "#$bkh %s$#" % plot._id, ""])
        block = ResultBlock(placeholder, title=title)
        # attach the figure for later substitution by the builder
        block.bokeh = plot
        result.append(block)
    return result
def render(self, dataframe, path):
    """Plot consecutive pairs of index paths as x/y plots in R.

    The hierarchical index is expanded to the product of its levels;
    entries are taken pairwise as (x, y) series.

    Raises ValueError if a pair differs in length.
    """
    fig = self.startPlot()
    labels = dataframe.index.levels
    paths = list(itertools.product(*labels))
    self.initPlot(fig, dataframe, path)
    nplotted = 0
    for idx in range(0, len(paths), 2):
        self.initLine(path, dataframe)
        xpath = paths[idx]
        ypath = paths[idx + 1]
        xvalues, yvalues = dataframe.ix[xpath], dataframe.ix[ypath]
        if len(xvalues) != len(yvalues):
            # fixed: format arguments must be a tuple — the original
            # ``% len(xvalues), len(yvalues)`` raised a TypeError
            # instead of the intended ValueError message
            raise ValueError(
                "length of x,y tuples not consistent: %i != %i" %
                (len(xvalues), len(yvalues)))
        R.plot(xvalues, yvalues)
        self.initCoords(xvalues, yvalues)
        nplotted += 1
    self.finishPlot(fig, dataframe, path)
    figid = getCurrentRDevice()
    blocks = ResultBlocks(
        ResultBlock("\n".join(("#$rpl %i$#" % (figid), "")),
                    path2str(path)))
    return blocks
def __call__(self, dataframe, path):
    """Render *dataframe* as a csv table, diverting large tables to a
    file or spread-sheet."""
    # modify table (adding/removing columns) according to user options
    dataframe = self.modifyTable(dataframe)
    title = path2str(path)
    results = ResultBlocks()
    row_headers = dataframe.index
    col_headers = dataframe.columns
    # do not output large matrices as rst files
    if self.separate or (not self.force and
                         (len(row_headers) > self.max_rows or
                          len(col_headers) > self.max_cols)):
        if self.large == "xls":
            results.append(self.asSpreadSheet(dataframe, row_headers,
                                              col_headers, title))
        else:
            results.append(self.asFile(dataframe, row_headers,
                                       col_headers, title))
        if self.preview:
            # NOTE(review): the truncation below is unreachable — the
            # raise precedes it; left as-is pending the preview feature.
            raise NotImplementedError('preview not implemented')
            row_headers = row_headers[:self.max_rows]
            col_headers = col_headers[:self.max_cols]
        else:
            return results
    results.append(self.asCSV(dataframe, row_headers, col_headers, title))
    return results
def transform(self, data):
    """Test pairwise gene-list overlaps with the hypergeometric test.

    *data* must be a single-column dataframe whose hierarchical index
    defines the gene lists; exactly one list name must contain
    'background' and every other list must be a subset of it.

    Returns a dataframe of list pairs with enrichment fold changes
    and hypergeometric P-values.

    Raises ValueError on malformed input.
    """
    # check if data is melted:
    if len(data.columns) != 1:
        raise ValueError(
            'transformer requires dataframe with'
            'a single column, got %s' % data.columns)
    column = data.columns[0]

    # iterate over lowest levels to build a dictionary of sets
    genesets = {}
    nlevels = Utils.getDataFrameLevels(data)
    # fixed: pass an explicit list — newer pandas rejects a bare range
    for key, group in data.groupby(level=list(range(nlevels))):
        genesets[path2str(key)] = set(group[column])

    # fixed: dict.keys() is a non-indexable view on python 3; the
    # keys[0]/keys[1] indexing below requires a list
    keys = list(genesets.keys())
    background = None
    foreground = []
    for key in keys:
        if "background" in key:
            background = genesets[key]
        else:
            foreground.append(key)

    if len(keys) < 3 or background is None:
        raise ValueError(
            "Expected at least 3 lists, with one called background, "
            "instead got %i lists called %s" %
            (len(keys), ", ".join(keys)))

    # items present in a foreground list but missing from background
    missing = {
        y: [str(x) for x in genesets[y] if x not in background]
        for y in foreground}

    if any([len(missing[x]) > 0 for x in missing]):
        missing_items = "\n\t".join(
            ["%s:\t%s" % (x, ",".join(missing[x])) for x in missing])
        raise ValueError(
            "Found items in lists not in background. "
            "Missing items:\n\t %s" % missing_items)

    M = len(set(background))
    if len(keys) == 2:
        # NOTE(review): unreachable — len(keys) < 3 raises above;
        # retained to preserve the original structure.
        n = len(set(genesets[keys[1]]))
        N = len(set(genesets[keys[0]]))
        x = len(set(genesets[keys[0]]) & set(genesets[keys[1]]))
        p = scipy.stats.hypergeom.sf(x, M, n, N)
        fc = ((x + 0.0) / N) / ((n + 0.0) / M)
        values = [("Enrichment", fc), ("P-value", p)]
    else:
        enrichments = []
        pvals = []
        As = []
        Bs = []
        for a, b in itertools.combinations(keys, 2):
            N = len(set(genesets[a]))
            n = len(set(genesets[b]))
            x = len(set(genesets[a]) & set(genesets[b]))
            p = scipy.stats.hypergeom.sf(x, M, n, N)
            fc = ((x + 0.0) / N) / ((n + 0.0) / M)
            As.append(a)
            Bs.append(b)
            pvals.append(p)
            enrichments.append(fc)
        values = [("ListA", As), ("ListB", Bs),
                  ("Enrichment", enrichments), ("P-value", pvals)]
    return DataTree.listAsDataFrame(values, values_are_rows=True)
def __call__(self, dataframe, path):
    '''iterate over leaves/branches in data structure.

    This method will call the:meth:`render` method.

    If self.split_at is set and the dataframe has at least that many
    index groups, the frame is rendered in chunks of split_at groups;
    groups matching self.split_always patterns are repeated in every
    chunk.
    '''
    result = ResultBlocks()
    if not self.split_at:
        result.extend(self.render(dataframe, path))
    else:
        # split dataframe at first index
        level = Utils.getGroupLevels(dataframe)
        grouper = dataframe.groupby(level=level)
        if len(grouper) < self.split_at:
            result.extend(self.render(dataframe, path))
        else:
            # build groups: collect the groups that must appear in
            # every chunk and remember their keys so they are not
            # added twice
            always, remove_always = [], set()
            if self.split_always:
                for key, work in grouper:
                    for pat in self.split_always:
                        rx = re.compile(pat)
                        if rx.search(path2str(key)):
                            always.append((key, work))
                            remove_always.add(key)
                # groupby iterators are single-use: rebuild it
                grouper = dataframe.groupby(level=level)

            def _group_group(grouper, always, remove_always):
                # yield chunks of split_at groups, each seeded with
                # the always-included groups
                group = always[:]
                for key, work in grouper:
                    if key in remove_always:
                        continue
                    group.append((key, work))
                    if len(group) >= self.split_at:
                        yield group
                        group = always[:]
                # trailing partial chunk
                yield group

            first = True
            for group in _group_group(grouper, always, remove_always):
                # do not plot last dataframe that contains
                # only the common tracks to plot
                if not first and len(group) == len(always):
                    continue
                first = False
                df = pandas.concat(
                    [x[1] for x in group])
                # reconcile index names
                df.index.names = dataframe.index.names
                result.extend(self.render(df, path))
    return result
def __call__(self, dataframe, path):
    '''iterate over leaves/branches in data structure.

    This method will call the:meth:`render` method.

    Large dataframes are split into multiple, smaller rendered
    objects if self.split_at is not zero.

    By default, dataframes are split along the hierachical
    index. However, if there is only a single index, but multiple
    columns, the split is performed on the columns instead. This
    is used when splitting coordinate data as a result of the
    histogram transformation.
    '''
    result = ResultBlocks()
    if not self.split_at:
        result.extend(self.render(dataframe, path))
    else:
        # split dataframe at first index
        level = Utils.getGroupLevels(dataframe)
        grouper = dataframe.groupby(level=level)

        # split dataframe column wise if only one index
        # and multiple columns
        if len(grouper) == 1 and len(dataframe.columns) > self.split_at:
            columns = list(dataframe.columns)
            always = []
            if self.split_keep_first_column:
                always.append(columns[0])
            # columns to always keep
            always.extend([c for c in columns if c in self.split_always])
            columns = [c for c in columns if c not in always]
            for x in range(0, len(columns), self.split_at):
                # extract a set of columns
                result.extend(self.render(
                    dataframe.loc[:, always + columns[x:x + self.split_at]],
                    path))
        # split dataframe along index
        elif len(grouper) >= self.split_at:
            # build groups: groups matching split_always patterns are
            # repeated in every chunk
            always, remove_always = [], set()
            if self.split_always:
                for key, work in grouper:
                    for pat in self.split_always:
                        rx = re.compile(pat)
                        if rx.search(path2str(key)):
                            always.append((key, work))
                            remove_always.add(key)
                # groupby iterators are single-use: rebuild it
                grouper = dataframe.groupby(level=level)

            def _group_group(grouper, always, remove_always):
                # yield chunks of split_at groups, each seeded with
                # the always-included groups
                group = always[:]
                for key, work in grouper:
                    if key in remove_always:
                        continue
                    group.append((key, work))
                    if len(group) >= self.split_at:
                        yield group
                        group = always[:]
                # trailing partial chunk
                yield group

            first = True
            for group in _group_group(grouper, always, remove_always):
                # do not plot last dataframe that contains
                # only the common tracks to plot
                if not first and len(group) == len(always):
                    continue
                first = False
                df = pandas.concat(
                    [x[1] for x in group])
                # reconcile index names
                df.index.names = dataframe.index.names
                result.extend(self.render(df, path))
        else:
            # do not split dataframe
            result.extend(self.render(dataframe, path))
    return result
def asSpreadSheet(self, dataframe, row_headers, col_headers, title):
    '''save the table as an xls file.

    Multiple files of the same Renderer/Tracker combination are
    distinguished by the title.

    Hierarchical indices with more than one level are split into one
    worksheet per outer index path, with a hyperlinked Summary sheet.
    '''
    self.debug("%s: saving %i x %i table as spread-sheet'" %
               (id(self), len(row_headers), len(col_headers)))
    # NOTE(review): pandas.core.index.MultiIndex was moved in modern
    # pandas (pandas.MultiIndex) — confirm the supported version.
    is_hierarchical = isinstance(dataframe.index,
                                 pandas.core.index.MultiIndex)
    split = is_hierarchical and len(dataframe.index.levels) > 1
    quick = len(dataframe) > 10000
    if quick and not split:
        # quick writing, only append method works
        wb = openpyxl.Workbook(optimized_write=True)

        def fillWorksheet(ws, dataframe, title):
            # append-only fill: header row, then one row per record
            ws.append([""] + list(col_headers))
            for x, row in enumerate(dataframe.iterrows()):
                ws.append([path2str(row[0])] + list(row[1]))
            # patch: maximum title length seems to be 31
            ws.title = title[:30]
    else:
        # do it cell-by-cell, this might be slow
        wb = openpyxl.Workbook(optimized_write=False)

        def fillWorksheet(ws, dataframe, title):
            # regex to detect rst hypelinks
            regex_link = re.compile('`(.*) <(.*)>`_')
            # write row names
            for row, row_name in enumerate(dataframe.index):
                # rows and columns start at 1
                c = ws.cell(row=row + 2, column=1)
                c.value = row_name
            # write columns
            for column, column_name in enumerate(dataframe.columns):
                # set column title
                # rows and columns start at 1
                c = ws.cell(row=1, column=column + 2)
                c.value = column_name
                # set column values
                dataseries = dataframe[column_name]
                if dataseries.dtype == object:
                    for row, value in enumerate(dataseries):
                        c = ws.cell(row=row + 2,
                                    column=column + 2)
                        value = str(value)
                        if value.startswith('`'):
                            # rst hyperlink: store text and target
                            c.value, c.hyperlink =\
                                regex_link.match(value).groups()
                        else:
                            c.value = value
                else:
                    for row, value in enumerate(dataseries):
                        c = ws.cell(row=row + 2,
                                    column=column + 2)
                        c.value = value
            # patch: maximum title length seems to be 31
            ws.title = re.sub("/", "_", title)[:30]

    if len(wb.worksheets) == 0:
        wb.create_sheet()
    if split:
        # create separate worksheets for nested indices
        nlevels = len(dataframe.index.levels)
        paths = map(tuple, DataTree.unique(
            [x[:nlevels - 1] for x in dataframe.index.unique()]))
        ws = wb.worksheets[0]
        ws.title = 'Summary'
        ws.append(
            [""] * (nlevels - 1) + ["Worksheet", "Rows"])
        for row, path in enumerate(paths):
            # select data frame as cross-section
            work = dataframe.xs(path, axis=0)
            title = path2str(path)
            if len(title) > 30:
                title = "sheet%i" % row
            ws.append(list(path) + [title, len(work)])
            c = ws.cell(row=row + 1, column=nlevels)
            # this does not work in oocalc
            c.hyperlink = "#%s!A1" % title
            fillWorksheet(wb.create_sheet(), work, title=title)
    else:
        fillWorksheet(wb.worksheets[0], dataframe, title=title)
    # write result block with an xls placeholder link
    lines = []
    lines.append("`%i x %i table <#$xls %s$#>`__" %
                 (len(row_headers), len(col_headers), title))
    lines.append("")
    r = ResultBlock("\n".join(lines), title=title)
    # attach the workbook for the document builder
    r.xls = wb
    self.debug("%s: saved %i x %i table as spread-sheet'" %
               (id(self), len(row_headers), len(col_headers)))
    return r