def summarize(self, dframe, groups=None, no_cache=False):
    """Build and return a summary of the data in this dataset.

    Reload this object, then delegate to the module-level ``summarize``
    helper (the bare name inside a method body resolves to module scope,
    not to this method).

    :param dframe: The dframe to summarize; forwarded unchanged to the
        helper.  NOTE(review): earlier documentation said ``None`` makes
        the helper fetch a dframe -- that is not visible here, confirm.
    :param groups: A list of columns to group on.  ``None`` (the default)
        means no grouping and is forwarded as a new empty list.
    :param no_cache: Do not fetch a cached summary (forwarded unchanged).

    :returns: Whatever the helper returns; documented as a dict summary
        in which numeric columns are summarized by the arithmetic mean,
        standard deviation, and percentiles, and dimensional columns by
        counts.
    """
    # Build the empty list per call: a literal ``[]`` default is created
    # once at definition time and would be shared (and possibly mutated)
    # across every call that omits ``groups``.
    if groups is None:
        groups = []

    self.reload()
    return summarize(self, dframe, groups, no_cache)
def summarize(self, dframe, groups=None, no_cache=False, update=False,
              flat=False):
    """Build and return a summary of the data in this dataset.

    Reload this object, delegate to the module-level ``summarize`` helper
    (the bare name inside a method body resolves to module scope), and
    optionally flatten the nested result into a list of dicts.

    :param dframe: dframe to summarize.
    :param groups: A list of columns to group on.  ``None`` (the default)
        means no grouping and is forwarded as a new empty list.
    :param no_cache: Do not fetch a cached summary.
    :param update: Forwarded to the helper as the ``update`` keyword.
    :param flat: Return a flattened list of groups.

    :returns: The helper's summary; when `flat` is true, a list holding
        one merged dict per inner entry of that summary.  Documented as
        summarizing numeric columns by the arithmetic mean, standard
        deviation, and percentiles, and dimensional columns by counts.
    """
    # Build the empty list per call: a literal ``[]`` default is created
    # once at definition time and would be shared across calls.
    if groups is None:
        groups = []

    self.reload()
    summary = summarize(self, dframe, groups, no_cache, update=update)

    if flat:
        flat_summary = []

        # The summary is walked as a two-level mapping: outer keys are
        # split into column names, inner keys into the matching values.
        for cols, v in summary.iteritems():
            cols = self.split_groups(cols)

            for k, data in v.iteritems():
                col_values = self.split_groups(k)
                # Substitute ',' for every strip_pattern match, then drop
                # the first and last character of each value (presumably
                # surrounding quotes -- TODO confirm).
                col_values = [
                    strip_pattern.sub(',', i)[1:-1] for i in col_values
                ]
                # One flat record: column -> value pairs merged with the
                # entry's own data.
                flat_summary.append(
                    combine_dicts(dict(zip(cols, col_values)), data))

        summary = flat_summary

    return summary
def summarize(self, query=None, select=None, group_str=None, limit=0,
              order_by=None):
    """Summarize the rows of this dataset, optionally filtered and grouped.

    The rows/values are narrowed by `query` and `select`, loaded as a
    dframe, and handed to the module-level ``summarize`` helper together
    with the grouping columns.

    :param query: An optional MongoDB query to limit the data summarized.
    :param select: An optional JSON-encoded dictionary limiting the
        columns summarized.  The grouping columns are added to it.
    :param group_str: A column in the dataset as a string, or a string of
        comma separated columns to group on.  Falsy means ``self.ALL``.
    :param limit: Forwarded to ``self.dframe``.
    :param order_by: Forwarded to ``self.dframe``.

    :raises ArgumentError: If `select` decodes to anything but a dict.

    :returns: The helper's result; documented as a JSON summary in which
        numeric columns are summarized by the arithmetic mean, standard
        deviation, and percentiles, and dimensional columns by counts.
    """
    # No group given: fall back to the "all" group.
    group_str = group_str or self.ALL

    # A multigroup arrives as a single string; break it into columns.
    groups = self.split_groups(group_str)

    if select:
        fields = json.loads(select)

        if not isinstance(fields, dict):
            raise ArgumentError('select argument must be a JSON dictionary'
                                ', found: %s.' % fields)

        # Mark every grouping column as selected so it is still present
        # in the dframe that gets grouped.
        fields.update(dict.fromkeys(groups, 1))
        select = json.dumps(fields)

    self.reload()
    dframe = self.dframe(
        query=query, select=select, limit=limit, order_by=order_by)

    return summarize(self, dframe, groups, group_str, query or select)
def summarize(self, dframe, groups=None, no_cache=False, update=False,
              flat=False):
    """Build and return a summary of the data in this dataset.

    Return a summary of dframe grouped by `groups`, or the overall
    summary if no groups are specified.  The work is delegated to the
    module-level ``summarize`` helper (the bare name inside a method body
    resolves to module scope) after this object has been reloaded.

    :param dframe: dframe to summarize.
    :param groups: A list of columns to group on.  Defaults to ``None``,
        which is replaced by a new empty list before delegating.
    :param no_cache: Do not fetch a cached summary.
    :param update: Passed to the helper as the ``update`` keyword.
    :param flat: Return a flattened list of groups.

    :returns: The helper's summary, or -- when `flat` is true -- a list
        with one merged dict per inner entry of it.  Documented as
        summarizing numeric columns by the arithmetic mean, standard
        deviation, and percentiles, and dimensional columns by counts.
    """
    # Never use a list literal as the default: it is evaluated once and
    # the same object would be reused by every call omitting ``groups``.
    groups = [] if groups is None else groups

    self.reload()
    summary = summarize(self, dframe, groups, no_cache, update=update)

    if not flat:
        return summary

    flat_summary = []

    # Outer keys split into column names, inner keys into their values.
    for cols, v in summary.iteritems():
        cols = self.split_groups(cols)

        for k, data in v.iteritems():
            # Replace each strip_pattern match with ',' and trim one
            # character from both ends of every value (presumably the
            # enclosing quotes -- TODO confirm).
            col_values = [strip_pattern.sub(',', i)[1:-1]
                          for i in self.split_groups(k)]
            # Merge the column -> value pairs with the entry's data.
            flat_summary.append(
                combine_dicts(dict(zip(cols, col_values)), data))

    return flat_summary