Example #1
0
File: dataset.py Project: j/bamboo
    def summarize(self, dframe, groups=None, no_cache=False):
        """Build and return a summary of the data in this dataset.

        Reload this dataset, then return a summary of `dframe` grouped by
        `groups`, or the overall summary if no groups are specified. The
        actual work is delegated to the module-level `summarize` function.

        :param dframe: An optional dframe to summarize, if None fetch a dframe
        :param groups: A list of columns to group on. Omitting it (or passing
            None) means no grouping, i.e. an empty list.
        :param no_cache: Do not fetch a cached summary.

        :returns: A summary of the dataset as a dict. Numeric columns will be
            summarized by the arithmetic mean, standard deviation, and
            percentiles. Dimensional columns will be summarized by counts.
        """
        # Use a fresh list per call: a literal `[]` default is created once
        # and shared by every call, so any downstream mutation would leak
        # into later calls.
        if groups is None:
            groups = []

        self.reload()

        return summarize(self, dframe, groups, no_cache)
Example #2
0
    def summarize(self,
                  dframe,
                  groups=None,
                  no_cache=False,
                  update=False,
                  flat=False):
        """Build and return a summary of the data in this dataset.

        Reload this dataset, then return a summary of dframe grouped by
        `groups`, or the overall summary if no groups are specified.

        :param dframe: dframe to summarize
        :param groups: A list of columns to group on. Omitting it (or passing
            None) means no grouping, i.e. an empty list.
        :param no_cache: Do not fetch a cached summary.
        :param update: Forwarded as the `update` keyword to the module-level
            `summarize` function.
        :param flat: Return a flattened list of groups.

        :returns: A summary of the dataset as a dict. Numeric columns will be
            summarized by the arithmetic mean, standard deviation, and
            percentiles. Dimensional columns will be summarized by counts.
            When `flat` is true, a list is returned instead, holding one
            `combine_dicts` result per group value.
        """
        # Use a fresh list per call: a literal `[]` default is created once
        # and shared by every call, so any downstream mutation would leak
        # into later calls.
        if groups is None:
            groups = []

        self.reload()

        summary = summarize(self, dframe, groups, no_cache, update=update)

        if not flat:
            return summary

        flat_summary = []

        for cols, v in summary.iteritems():
            # The outer key is the joined group columns; split it back into
            # the individual column names.
            cols = self.split_groups(cols)

            for k, data in v.iteritems():
                # The inner key is the joined group values. Each value has
                # `strip_pattern` matches replaced by ',' and its first and
                # last characters dropped (presumably surrounding quotes --
                # confirm against where the keys are built).
                col_values = [
                    strip_pattern.sub(',', i)[1:-1]
                    for i in self.split_groups(k)
                ]
                # Pair each group column with its value and merge that
                # mapping with the summary data for this group.
                flat_summary.append(
                    combine_dicts(dict(zip(cols, col_values)), data))

        return flat_summary
Example #3
0
    def summarize(self, query=None, select=None,
                  group_str=None, limit=0, order_by=None):
        """Summarize the rows of this dataset, optionally grouped.

        The rows/values are filtered by `query` and `select` and grouped by
        `group_str`; with no group the overall summary is produced.

        :param query: An optional MongoDB query to limit the data summarized.
        :param select: An optional select, as a JSON dictionary string, to
          limit the columns summarized. The group columns are added to it.
        :param group_str: A column in the dataset as a string or a list comma
          separated columns to group on.
        :param limit: Forwarded to `self.dframe` when loading the data.
        :param order_by: Forwarded to `self.dframe` when loading the data.

        :raises: `ArgumentError` if `select` does not decode to a dictionary.

        :returns: A JSON summary of the dataset. Numeric columns will be
            summarized by the arithmetic mean, standard deviation, and
            percentiles. Dimensional columns will be summarized by counts.
        """
        # no group means summarize everything
        group_str = group_str or self.ALL

        # a multigroup string is broken into its individual columns
        groups = self.split_groups(group_str)

        if select:
            select_spec = json.loads(select)

            if not isinstance(select_spec, dict):
                raise ArgumentError('select argument must be a JSON dictionary'
                                    ', found: %s.' % select_spec)

            # the grouped columns must be part of the selection
            select_spec.update(dict.fromkeys(groups, 1))
            select = json.dumps(select_spec)

        self.reload()
        dframe = self.dframe(
            query=query, select=select, limit=limit, order_by=order_by)

        return summarize(self, dframe, groups, group_str, query or select)
Example #4
0
    def summarize(self, dframe, groups=None, no_cache=False, update=False,
                  flat=False):
        """Build and return a summary of the data in this dataset.

        Reload this dataset, then return a summary of dframe grouped by
        `groups`, or the overall summary if no groups are specified.

        :param dframe: dframe to summarize
        :param groups: A list of columns to group on. Omitting it (or passing
            None) means no grouping, i.e. an empty list.
        :param no_cache: Do not fetch a cached summary.
        :param update: Forwarded as the `update` keyword to the module-level
            `summarize` function.
        :param flat: Return a flattened list of groups.

        :returns: A summary of the dataset as a dict. Numeric columns will be
            summarized by the arithmetic mean, standard deviation, and
            percentiles. Dimensional columns will be summarized by counts.
            When `flat` is true, a list is returned instead, holding one
            `combine_dicts` result per group value.
        """
        # A literal `[]` default would be a single list shared by all calls;
        # build a new one each time so downstream mutation cannot leak.
        if groups is None:
            groups = []

        self.reload()

        summary = summarize(self, dframe, groups, no_cache, update=update)

        if not flat:
            return summary

        flat_summary = []

        for cols, v in summary.iteritems():
            # Outer keys are joined group column names.
            cols = self.split_groups(cols)

            for k, data in v.iteritems():
                # Inner keys are joined group values; replace
                # `strip_pattern` matches with ',' and drop the first and
                # last character of each value (presumably quoting --
                # confirm against where the keys are built).
                col_values = [strip_pattern.sub(',', i)[1:-1]
                              for i in self.split_groups(k)]
                # Merge the column -> value mapping with the group's data.
                flat_summary.append(
                    combine_dicts(dict(zip(cols, col_values)), data))

        return flat_summary