Esempio n. 1
0
def summarize(dataset, dframe, groups, no_cache, update=False):
    """Raises a ColumnTypeError if grouping on a non-dimensional column."""
    # refuse to group on numeric (non-factor) columns
    for column in groups:
        if not dataset.is_factor(column):
            raise ColumnTypeError("group: '%s' is not a dimension." % column)

    key = dataset.join_groups(groups) or dataset.ALL

    # look for a cached summary for this grouping
    cached_stats = dataset.stats
    result = cached_stats.get(key)

    if update or no_cache or not result:
        if groups:
            result = summarize_with_groups(dframe, groups, dataset)
        else:
            result = summarize_df(dframe, dataset)

        if not no_cache:
            if update:
                # merge freshly computed stats into the previously cached ones
                result = combine_dicts(cached_stats.get(key, {}), result)

            cached_stats.update({key: result})
            dataset.update({dataset.STATS: dict_for_mongo(cached_stats)})

    summary = dict_from_mongo(result)

    # grouped summaries are keyed by the joined group string
    return {key: summary} if groups else summary
Esempio n. 2
0
def summarize(dataset, dframe, groups, no_cache, update=False):
    """Raises a ColumnTypeError if grouping on a non-dimensional column."""
    # grouping is only allowed on dimensional (factor) columns
    for g in groups:
        if dataset.is_factor(g):
            continue
        raise ColumnTypeError("group: '%s' is not a dimension." % g)

    group_str = dataset.join_groups(groups) or dataset.ALL

    stats = dataset.stats
    group_stats = stats.get(group_str)

    needs_compute = no_cache or update or not group_stats

    if needs_compute:
        group_stats = (summarize_with_groups(dframe, groups, dataset)
                       if groups else summarize_df(dframe, dataset))

        if not no_cache:
            if update:
                # fold the new stats into whatever was cached before
                previous = stats.get(group_str, {})
                group_stats = combine_dicts(previous, group_stats)

            stats.update({group_str: group_stats})
            dataset.update({dataset.STATS: dict_for_mongo(stats)})

    stats_dict = dict_from_mongo(group_stats)

    if groups:
        # nest the grouped summary under its joined group key
        stats_dict = {group_str: stats_dict}

    return stats_dict
Esempio n. 3
0
    def encode(self, encoding, query):
        """Encode query, order_by, and select given an encoding.

        The query will be combined with the existing query.

        :param encoding: A dict to encode the QueryArgs fields with.
        :param query: An additional dict to combine with the existing query.
        """
        # merge the extra query into the current one, then remap its keys
        merged_query = combine_dicts(self.query, query)
        self.query = replace_keys(merged_query, encoding)

        # falsy order_by / select are left untouched
        if self.order_by:
            self.order_by = replace_keys(dict(self.order_by), encoding).items()

        if self.select:
            self.select = replace_keys(self.select, encoding)
Esempio n. 4
0
    def encode(self, encoding, query):
        """Encode query, order_by, and select given an encoding.

        The query will be combined with the existing query.

        :param encoding: A dict to encode the QueryArgs fields with.
        :param query: An additional dict to combine with the existing query.
        """
        self.query = replace_keys(combine_dicts(self.query, query), encoding)

        if self.order_by:
            # re-key the ordering pairs, then restore the pair-list shape
            remapped = replace_keys(dict(self.order_by), encoding)
            self.order_by = remapped.items()

        if self.select:
            self.select = replace_keys(self.select, encoding)
Esempio n. 5
0
def __create_aggregator(dataset, formula, name, groups, dframe=None):
    """Build an Aggregator for *formula* named *name*, grouped by *groups*.

    Fetches a dframe restricted to the group columns and the columns the
    formula depends on.
    """
    # TODO this should work with index eventually
    columns = parse_columns(dataset, formula, name, dframe, no_index=True)

    dependent_columns = Parser.dependent_columns(formula, dataset)
    aggregation = Parser.parse_aggregation(formula)

    # build a projection covering the group and dependent columns only
    projection = {}
    for column in groups:
        projection[column] = 1
    for column in dependent_columns:
        projection[column] = 1

    # ensure at least one column (MONGO_ID) for the count aggregation
    query_args = QueryArgs(select=projection or {MONGO_ID: 1})
    frame = dataset.dframe(query_args=query_args,
                           keep_mongo_keys=not projection)

    return Aggregator(frame, groups, aggregation, name, columns)
Esempio n. 6
0
    def update(cls, dataset, index, record):
        """Update a dataset row by index.

        The record dictionary will update, not replace, the data in the row at
        index.

        :param dataset: The dataset to update a row for.
        :param index: The index of the row to update.
        :param record: The dictionary to update the row with.
        """
        # fetch the stored row and drop its mongo ID so it is not re-saved
        previous_record = cls.find_one(dataset, index).record
        previous_record.pop(MONGO_ID)
        # merge the incoming changes over the stored row
        # (presumably *record* values win — confirm combine_dicts semantics)
        record = combine_dicts(previous_record, record)
        record = update_calculations(record, dataset)

        record = cls.encode(record, dataset=dataset)

        # replace-by-delete: remove the old row, then persist the merged one
        cls.delete(dataset, index)

        super(cls, cls()).save(record)
Esempio n. 7
0
    def update(cls, dataset, index, record):
        """Update a dataset row by index.

        The record dictionary will update, not replace, the data in the row
        at index.

        :param dataset: The dataset to update a row for.
        :param index: The index of the row to update.
        :param record: The dictionary to update the row with.
        """
        # existing row data, minus the mongo ID which must not be re-saved
        existing = cls.find_one(dataset, index).record
        existing.pop(MONGO_ID)

        merged = combine_dicts(existing, record)
        merged = update_calculations(merged, dataset)
        merged = cls.encode(merged, dataset=dataset)

        # remove the old row, then persist the merged replacement
        cls.delete(dataset, index)
        super(cls, cls()).save(merged)
Esempio n. 8
0
    def summarize(self,
                  dframe,
                  groups=None,
                  no_cache=False,
                  update=False,
                  flat=False):
        """Build and return a summary of the data in this dataset.

        Return a summary of dframe grouped by `groups`, or the overall
        summary if no groups are specified.

        :param dframe: dframe to summarize
        :param groups: A list of columns to group on.
        :param no_cache: Do not fetch a cached summary.
        :param update: Update the cached summary with fresh results.
        :param flat: Return a flattened list of groups.

        :returns: A summary of the dataset as a dict. Numeric columns will be
            summarized by the arithmetic mean, standard deviation, and
            percentiles. Dimensional columns will be summarized by counts.
        """
        # fix: avoid a shared mutable default argument ([]); use a sentinel
        if groups is None:
            groups = []

        self.reload()

        summary = summarize(self, dframe, groups, no_cache, update=update)

        if flat:
            flat_summary = []

            for cols, v in summary.iteritems():
                cols = self.split_groups(cols)

                for k, data in v.iteritems():
                    col_values = self.split_groups(k)
                    # normalize separators and drop the enclosing characters
                    # (assumes each value is wrapped by one char on each side
                    # — TODO confirm against split_groups output)
                    col_values = [
                        strip_pattern.sub(',', i)[1:-1] for i in col_values
                    ]
                    flat_summary.append(
                        combine_dicts(dict(zip(cols, col_values)), data))

            summary = flat_summary

        return summary
Esempio n. 9
0
    def summarize(self, dframe, groups=None, no_cache=False, update=False,
                  flat=False):
        """Build and return a summary of the data in this dataset.

        Return a summary of dframe grouped by `groups`, or the overall
        summary if no groups are specified.

        :param dframe: dframe to summarize
        :param groups: A list of columns to group on.
        :param no_cache: Do not fetch a cached summary.
        :param update: Update the cached summary with fresh results.
        :param flat: Return a flattened list of groups.

        :returns: A summary of the dataset as a dict. Numeric columns will be
            summarized by the arithmetic mean, standard deviation, and
            percentiles. Dimensional columns will be summarized by counts.
        """
        # fix: mutable default argument replaced by a None sentinel
        groups = [] if groups is None else groups

        self.reload()

        summary = summarize(self, dframe, groups, no_cache, update=update)

        if flat:
            flat_summary = []

            for cols, v in summary.iteritems():
                cols = self.split_groups(cols)

                for k, data in v.iteritems():
                    col_values = self.split_groups(k)
                    # normalize separators and strip one wrapping char per
                    # value — presumably quoting; verify against caller
                    col_values = [strip_pattern.sub(',', i)[1:-1]
                                  for i in col_values]
                    flat_summary.append(
                        combine_dicts(dict(zip(cols, col_values)), data))

            summary = flat_summary

        return summary
Esempio n. 10
0
    def test_parse_formula_dependent_columns(self):
        """Dependent columns derived per formula match the fixtures."""
        expected_deps = combine_dicts(AGG_CALCS_TO_DEPS, CALCS_TO_DEPS)

        for formula, expected in expected_deps.iteritems():
            actual = Parser.dependent_columns(formula, self.dataset)
            self.assertEqual(set(expected), actual)
Esempio n. 11
0
    def test_parse_formula_dependent_columns(self):
        # exercise both aggregate and plain calculation fixtures
        cases = combine_dicts(AGG_CALCS_TO_DEPS, CALCS_TO_DEPS)

        for formula, deps in cases.iteritems():
            self.assertEqual(set(deps),
                             Parser.dependent_columns(formula, self.dataset))
Esempio n. 12
0
    def test_parse_formula_dependent_columns(self):
        """parse_formula reports the dependent columns for each fixture."""
        fixtures = combine_dicts(AGG_CALCS_TO_DEPS, CALCS_TO_DEPS)

        for formula, expected_columns in fixtures.iteritems():
            # the parsed functions are irrelevant to this assertion
            _, dependent_columns = self.parser.parse_formula(formula)
            self.assertEqual(set(expected_columns), dependent_columns)