def transform(self, data):
    # check if data is melted:
    if len(data.columns) != 1:
        raise ValueError(
            'transformer requires dataframe with '
            'a single column, got %s' % data.columns)
    column = data.columns[0]

    # iterate over lowest levels to build a dictionary of sets
    genesets = {}
    nlevels = Utils.getDataFrameLevels(data)
    for key, group in data.groupby(level=list(range(nlevels))):
        if "background" in key and not self.background:
            continue
        genesets[key] = set(group[column])

    values = []
    keys = list(genesets.keys())
    if len(genesets) == 2:
        a, b = genesets[keys[0]], genesets[keys[1]]
        values.append(("10", len(a - b)))
        values.append(("01", len(b - a)))
        values.append(("11", len(a & b)))
        values.append(("labels", list(map(path2str, keys))))
    elif len(genesets) == 3:
        a, b, c = (genesets[keys[0]],
                   genesets[keys[1]],
                   genesets[keys[2]])
        values.append(("100", len(a - b - c)))
        values.append(("010", len(b - a - c)))
        values.append(("001", len(c - a - b)))
        values.append(("110", len((a & b) - c)))
        values.append(("101", len((a & c) - b)))
        values.append(("011", len((b & c) - a)))
        values.append(("111", len((a & b) & c)))
        values.append(("labels", list(map(path2str, keys))))
    else:
        raise ValueError(
            "Can currently only cope with 2 or 3 way intersections")

    return DataTree.listAsDataFrame(values)
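# A minimal sketch (not part of the original module) of the Venn
# encoding produced by transform() above, assuming a melted dataframe
# with a two-level index and a single column. The labels "setA"/"setB"
# and the column name "gene" are invented for the example.
#
#     import pandas
#     df = pandas.DataFrame(
#         {"gene": ["g1", "g2", "g3", "g2", "g3", "g4"]},
#         index=pandas.MultiIndex.from_tuples(
#             [("setA", i) for i in range(3)] +
#             [("setB", i) for i in range(3)]))
#     a = set(df.loc["setA", "gene"])   # {'g1', 'g2', 'g3'}
#     b = set(df.loc["setB", "gene"])   # {'g2', 'g3', 'g4'}
#     # "10" = only in A, "01" = only in B, "11" = in both
#     counts = {"10": len(a - b), "01": len(b - a), "11": len(a & b)}
#     # counts == {'10': 1, '01': 1, '11': 2}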
def render(self):
    '''supply the :class:`Renderer.Renderer` with the data to render.

    The data supplied will depend on the ``groupby`` option.

    returns a ResultBlocks data structure.
    '''
    self.debug("%s: rendering data started for %i items" %
               (self, len(self.data)))

    # initiate output structure
    results = ResultBlocks(title="")

    dataframe = self.data

    if dataframe is None:
        self.warn("%s: no data after conversion" % self)
        raise ValueError("no data for renderer")

    # special patch: set column names to pruned levels
    # if there are no column names
    if len(dataframe.columns) == len(self.pruned):
        if list(dataframe.columns) == list(range(len(dataframe.columns))):
            dataframe.columns = [x[1] for x in self.pruned]

    nlevels = Utils.getDataFrameLevels(dataframe)

    self.debug("%s: rendering data started. "
               "levels=%i, group_level=%s" %
               (self, nlevels, str(self.group_level)))

    if self.group_level < 0:
        # no grouping for renderers that accept a dataframe with any
        # number of index levels and for which no explicit grouping
        # has been requested.
        results.append(self.renderer(dataframe, path=()))
    else:
        level = Utils.getGroupLevels(
            dataframe,
            max_level=self.group_level + 1)

        self.debug("%s: grouping by levels: %s" %
                   (self, str(level)))

        for key, work in dataframe.groupby(level=level):
            try:
                results.append(self.renderer(work, path=key))
            except Exception:
                self.error("%s: exception in rendering" % self)
                results.append(
                    ResultBlocks(Utils.buildException("rendering")))

    if len(results) == 0:
        self.warn("renderer returned no data.")
        raise ValueError("renderer returned no data.")

    self.debug("%s: rendering data finished with %i blocks" %
               (self.tracker, len(results)))

    return results
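# A minimal sketch (not from the original code) of the grouping loop
# in render() above: grouping a hierarchically indexed dataframe by
# index levels yields (key, sub-dataframe) pairs, and each
# sub-dataframe is handed to the renderer with the key as its path.
# The index labels below are invented for illustration.
#
#     import pandas
#     df = pandas.DataFrame(
#         {"value": [1, 2, 3, 4]},
#         index=pandas.MultiIndex.from_tuples(
#             [("track1", "sliceA"), ("track1", "sliceB"),
#              ("track2", "sliceA"), ("track2", "sliceB")]))
#     for key, work in df.groupby(level=0):
#         ...  # key is "track1", then "track2"; work has 2 rows each
#     for key, work in df.groupby(level=[0, 1]):
#         ...  # key is a tuple such as ("track1", "sliceA"), which
#              # render() would pass to the renderer as path=key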
def group(self):
    '''rearrange dataframe for desired grouping.

    Through grouping the dataframe is rearranged such that the level
    at which data will be grouped becomes the first level in the
    hierarchical index.

    If grouping by "track" is set, an additional level will be added
    to ensure that grouping will happen.

    If grouping by "slice" is set, the first two levels will be
    swapped.

    The group level indicates at which level in the nested dictionary
    the data will be grouped, with 0 indicating that everything will
    be grouped together.
    '''
    nlevels = Utils.getDataFrameLevels(self.data)

    try:
        default_level = self.renderer.group_level
    except AttributeError:
        # user renderers that are plain functions do not have a
        # group_level attribute
        default_level = 0

    groupby = self.groupby

    if str(default_level).startswith("force"):
        groupby = default_level[len("force-"):]
    elif groupby == "default":
        groupby = default_level

    if groupby == "none":
        self.group_level = nlevels - 1
    elif groupby == "track":
        self.group_level = 0
    elif groupby == "slice":
        # rearrange first two levels in data tree
        if nlevels > 1:
            self.data = self.data.reorder_levels(
                [1, 0] + list(range(2, nlevels)))
        self.group_level = 0
    elif groupby == "all":
        # group everything together
        self.group_level = -1
    elif isinstance(groupby, int):
        # get group level from Renderer
        if groupby < 0:
            # negative levels - subtract from lowest level
            g = nlevels + groupby - 1
        else:
            g = groupby
        self.group_level = g
    else:
        self.group_level = 0

    self.debug("grouping: nlevel=%i, groupby=%s, default=%s, group=%i" %
               (nlevels, groupby, str(default_level), self.group_level))

    return self.data
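# A minimal sketch (example data invented) of the "slice"
# rearrangement above: reorder_levels swaps the first two index
# levels so that grouping happens on what used to be level 1.
#
#     import pandas
#     df = pandas.DataFrame(
#         {"value": [1, 2, 3, 4]},
#         index=pandas.MultiIndex.from_tuples(
#             [("track1", "sliceA"), ("track1", "sliceB"),
#              ("track2", "sliceA"), ("track2", "sliceB")]))
#     nlevels = df.index.nlevels  # 2
#     swapped = df.reorder_levels([1, 0] + list(range(2, nlevels)))
#     # index is now ("sliceA", "track1"), ("sliceB", "track1"), ...
#     # so grouping by level 0 now groups by slice instead of track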
def transform(self, data):
    # check if data is melted:
    if len(data.columns) != 1:
        raise ValueError(
            'transformer requires dataframe with '
            'a single column, got %s' % data.columns)
    column = data.columns[0]

    # iterate over lowest levels to build a dictionary of sets
    genesets = {}
    nlevels = Utils.getDataFrameLevels(data)
    for key, group in data.groupby(level=list(range(nlevels))):
        genesets[path2str(key)] = set(group[column])

    keys = list(genesets.keys())
    background = None
    foreground = []

    for key in keys:
        if "background" in key:
            background = genesets[key]
        else:
            foreground.append(key)

    if len(keys) < 3 or background is None:
        raise ValueError(
            "Expected at least 3 lists, with one called background, "
            "instead got %i lists called %s" %
            (len(keys), ", ".join(keys)))

    # all foreground items must be part of the background
    missing = {
        y: [str(x) for x in genesets[y] if x not in background]
        for y in foreground}

    if any([len(missing[x]) > 0 for x in missing]):
        missing_items = "\n\t".join(
            ["%s:\t%s" % (x, ",".join(missing[x])) for x in missing])
        raise ValueError(
            "Found items in lists not in background. "
            "Missing items:\n\t %s" % missing_items)

    M = len(set(background))

    if len(foreground) == 2:
        # a single comparison between the two foreground sets
        n = len(set(genesets[foreground[1]]))
        N = len(set(genesets[foreground[0]]))
        x = len(set(genesets[foreground[0]]) &
                set(genesets[foreground[1]]))
        p = scipy.stats.hypergeom.sf(x, M, n, N)
        fc = ((x + 0.0) / N) / ((n + 0.0) / M)
        values = [("Enrichment", fc), ("P-value", p)]
    else:
        # all pairwise comparisons between the foreground sets
        enrichments = []
        pvals = []
        As = []
        Bs = []
        for a, b in itertools.combinations(foreground, 2):
            N = len(set(genesets[a]))
            n = len(set(genesets[b]))
            x = len(set(genesets[a]) & set(genesets[b]))
            p = scipy.stats.hypergeom.sf(x, M, n, N)
            fc = ((x + 0.0) / N) / ((n + 0.0) / M)
            As.append(a)
            Bs.append(b)
            pvals.append(p)
            enrichments.append(fc)
        values = [("ListA", As),
                  ("ListB", Bs),
                  ("Enrichment", enrichments),
                  ("P-value", pvals)]

    return DataTree.listAsDataFrame(values, values_are_rows=True)
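# A minimal, self-contained sketch (numbers invented) of the
# enrichment statistic computed above: with a background of M items,
# one list of N items, another of n items and an overlap of x items,
# the fold enrichment is (x / N) / (n / M) and the p-value comes from
# the hypergeometric survival function.
#
#     import scipy.stats
#     M, n, N, x = 1000, 100, 80, 20
#     p = scipy.stats.hypergeom.sf(x, M, n, N)
#     fc = (x / N) / (n / M)   # = 0.25 / 0.1 = 2.5-fold enrichment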
def as_dataframe(data, tracker=None):
    '''convert data tree to pandas DataFrame.

    The data frame is multi-indexed according to the depth within the
    data tree. If the data tree has only one level, the data will be
    single-indexed because pandas will not tolerate a single level
    MultiIndex.

    The code assumes that the data tree has a uniform depth and
    structure.

    The inner-most level in the *data* tree will be columns. However,
    if *data* is only a single-level dictionary, the keys in the
    dictionary will be row labels and the resultant dataframe will
    have only one column.

    Depending on the type of the leaf, the data frame is constructed
    as follows:

    Leaves are multiple arrays of the same size
        The data is assumed to be coordinate type data (x, y, z
        values). Leaves will be added to a dataframe as multiple
        columns.

    Leaves are a single array or arrays of dissimilar size
        A melted data frame will be constructed where the
        hierarchical index contains the path information and the data
        frame has a single column with the value.

    Leaf is a dataframe
        Dataframes will be concatenated. Existing indices of the
        dataframes will be preserved with the exception of the
        trivial index for the row numbers.

        Requires: all dataframes need to have the same columns.

    Leaf is a scalar
        Dataframes will be built from a nested dictionary.

    Special cases for backwards compatibility:

    1. Lowest level dictionary contains the arrays ``rows``,
       ``columns`` and ``matrix``: a numpy matrix, convert to
       dataframe and apply as above.

    2. Lowest level dictionary contains the keys ``'01'``, ``'10'``,
       ``'11'``: Venn 2-set data, convert columns; the keys
       ``'001'``, ``'010'``, ...: Venn 3-set data, convert columns.

    Pandas attempts to find a column data type that will fit all
    values in a column. Thus, if a column is numeric but also
    contains values such as "inf" or "Inf", the column dtype might
    fall back to object.
    '''
    if data is None or len(data) == 0:
        return None

    logger = Component.get_logger()

    levels = getDepths(data)
    if len(levels) == 0:
        return None

    mi, ma = min(levels), max(levels)
    if mi != ma:
        raise NotImplementedError(
            'data tree not of uniform depth, min=%i, max=%i' %
            (mi, ma))

    labels = getPaths(data)

    ######################################################
    ######################################################
    ######################################################
    # check special cases
    MATRIX = ('rows', 'columns', 'matrix')
    VENN2 = ('10', '01', '11')
    VENN3 = ('010', '001', '011')

    dataframe_prune_index = True
    branches = list(getNodes(data, len(labels) - 2))

    for path, branch in branches:
        # numpy matrix - dictionary with keys matrix, rows, columns
        if len(set(branch.keys()).intersection(MATRIX)) == len(MATRIX):
            df = pandas.DataFrame(branch['matrix'],
                                  columns=branch['columns'],
                                  index=branch['rows'])
            setLeaf(data, path, df)
            dataframe_prune_index = False
        elif len(set(branch.keys()).intersection(VENN2)) == len(VENN2) or \
                len(set(branch.keys()).intersection(VENN3)) == len(VENN3):
            # sort so that 'labels' is not the first item
            # specify data such that 'labels' will be a single tuple entry
            values = sorted(branch.items())
            df = listAsDataFrame(values)
            dataframe_prune_index = False
            setLeaf(data, path, df)

    ######################################################
    ######################################################
    ######################################################
    labels = getPaths(data)

    # build multi-index
    leaves = list(getNodes(data, len(labels) - 1))

    # if set to a number, any superfluous levels in the hierarchical
    # index of the final dataframe will be removed.
    expected_levels = None

    leaf = leaves[0][1]

    if is_array(leaf):
        # build dataframe from arrays
        dataframes = []
        index_tuples = []

        # not a nested dictionary
        if len(labels) == 1:
            branches = [(('all',), data)]
        else:
            branches = list(getNodes(data, max(0, len(labels) - 2)))

        # check if it is coordinate data:
        # all arrays need to have the same length
        is_coordinate = True
        for path, subtree in branches:
            lengths = [len(x) for x in list(subtree.values())]
            if len(lengths) == 0:
                continue

            # all arrays have the same length - coordinate data
            if len(lengths) == 1 or min(lengths) != max(lengths):
                is_coordinate = False
                break

        if is_coordinate:
            logger.debug('dataframe conversion: from array - coordinates')
            for path, subtree in branches:
                # skip empty leaves
                if len(subtree) == 0:
                    continue
                dataframes.append(pandas.DataFrame(subtree))
                index_tuples.append(path)
        else:
            logger.debug('dataframe conversion: from array - series')
            # arrays of unequal length are measurements;
            # build a melted data frame with a single column
            # given by the name of the path.
            for path, leaf_data in leaves:
                # skip empty leaves
                if len(leaf_data) == 0:
                    continue
                index_tuples.append(path)
                dataframes.append(
                    pandas.DataFrame(leaf_data, columns=('value',)))

        expected_levels = len(index_tuples[0])
        df = concatDataFrames(dataframes, index_tuples)

    elif is_dataframe(leaf):
        logger.debug('dataframe conversion: from dataframe')

        # build dataframe from list of dataframes by concatenation.
        # Existing indices of the dataframes will be added as columns.
        dataframes = []
        index_tuples = []
        path_lengths = []
        levels = []
        for path, dataframe in leaves:
            if len(dataframe) == 0:
                continue
            path_lengths.append(len(path))
            if len(path) == 1:
                # if only one level, do not use tuple
                index_tuples.append(path[0])
            else:
                index_tuples.append(path)
            dataframes.append(dataframe)

            levels.append(Utils.getDataFrameLevels(
                dataframe,
                test_for_trivial=True))

        if len(path_lengths) == 0:
            return None

        assert min(path_lengths) == max(path_lengths)
        assert min(levels) == max(levels)

        # if only a single dataframe without given tracks,
        # return dataframe
        if index_tuples == ["all"]:
            df = dataframes[0]
            # if index is a simple numeric list, change to "all"
            if isinstance(df.index, pandas.Int64Index) and \
                    df.index.name is None:
                df.index = ["all"] * len(df)
            return df

        expected_levels = min(path_lengths) + min(levels)
        df = concatDataFrames(dataframes, index_tuples)

    else:
        logger.debug('dataframe conversion: from values')
        if len(labels) == 1:
            # { 'x': 1, 'y': 2 } -> DF with one column and two rows (x, y)
            df = pandas.DataFrame(list(data.values()),
                                  index=list(data.keys()))
        elif len(labels) == 2:
            # { 'a': {'x': 1, 'y': 2}, 'b': {'y': 2} }
            # -> DF with two columns (x, y) and two rows (a, b)
            df = pandas.DataFrame.from_dict(data).transpose()
            # reorder so that order of columns corresponds to data
            df = df[labels[-1]]
        else:
            # we are dealing with a simple nested dictionary
            branches = list(getNodes(data, max(0, len(labels) - 3)))
            dataframes = []
            index_tuples = []
            for path, nested_dict in branches:
                # transpose to invert columns and rows:
                # in cgatreport convention, the deeper level in a
                # dictionary holds the columns, while in pandas it
                # holds the rows.
                df = pandas.DataFrame(nested_dict).transpose()
                dataframes.append(df)
                index_tuples.append(path)

            df = concatDataFrames(dataframes, index_tuples)

    # remove index with row numbers
    if expected_levels is not None and dataframe_prune_index:
        Utils.pruneDataFrameIndex(df, expected_levels)

    # rename levels in hierarchical index
    is_hierarchical = isinstance(df.index, pandas.MultiIndex)

    if is_hierarchical:
        names = list(df.index.names)
        # default level names, overridden by the tracker's "levels"
        # attribute if present
        level_names = (["track", "slice"] +
                       ["level%i" % x for x in range(len(names))])
        if tracker is not None:
            level_names = getattr(tracker, "levels", level_names)
        for x, y in enumerate(names):
            if y is None:
                names[x] = level_names[x]
        df.index.names = names
    else:
        df.index.name = 'track'

    return df
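# A minimal sketch (example data invented) of the two-level scalar
# case handled above: a nested dictionary of scalars becomes a
# dataframe with the outer keys as rows and the inner keys as
# columns, matching the pandas.DataFrame.from_dict(...).transpose()
# branch.
#
#     import pandas
#     data = {"trackA": {"x": 1, "y": 2},
#             "trackB": {"x": 3, "y": 4}}
#     df = pandas.DataFrame.from_dict(data).transpose()
#     #         x  y
#     # trackA  1  2
#     # trackB  3  4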