Example #1
    def _test_calculator(self):
        self.dframe = self.dataset.dframe()

        columns = self.dframe.columns.tolist()
        self.start_num_cols = len(columns)
        self.added_num_cols = 0

        column_labels_to_slugs = {
            column_attrs[Dataset.LABEL]: column_name
            for (column_name, column_attrs) in self.dataset.schema.items()
        }
        self.label_list, self.slugified_key_list = [
            list(ary) for ary in zip(*column_labels_to_slugs.items())
        ]

        for idx, formula in enumerate(self.calculations):
            name = 'test-%s' % idx

            Parser.validate_formula(formula, self.dataset)

            calculation = Calculation()
            calculation.save(self.dataset, formula, name, self.group)
            self.now = now()
            calculate_columns(self.dataset, [calculation])

            self.column_labels_to_slugs = self.dataset.schema.labels_to_slugs

            self._test_calculation_results(name, formula)
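A note on the zip(*...) unpacking above: it splits the label-to-slug
mapping into parallel label and slug lists. A standalone sketch, with
made-up labels and slugs (ordering is dict-dependent):

column_labels_to_slugs = {'Amount': 'amount', 'Food Type': 'food_type'}
labels, slugs = [list(ary) for ary in zip(*column_labels_to_slugs.items())]
print(labels)  # e.g. ['Amount', 'Food Type']
print(slugs)   # e.g. ['amount', 'food_type']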
Example #2
def parse_columns(dataset, formula, name, dframe=None, no_index=False):
    """Parse a formula and return columns resulting from its functions.

    Parse a formula into a list of functions then apply those functions to
    the Data Frame and return the resulting columns.

    :param formula: The formula to parse.
    :param name: Name of the formula.
    :param dframe: A DataFrame to apply functions to.
    :param no_index: Drop the index on result columns.
    """
    functions = Parser.parse_functions(formula)
    dependent_columns = Parser.dependent_columns(formula, dataset)

    # make select from dependent_columns
    if dframe is None:
        select = {col: 1 for col in dependent_columns or [MONGO_ID]}

        dframe = dataset.dframe(
            query_args=QueryArgs(select=select),
            keep_mongo_keys=True).set_index(MONGO_ID_ENCODED)

        if not dependent_columns:
            # constant column, use dummy
            dframe['dummy'] = 0

    return __build_columns(dataset, dframe, functions, name, no_index)
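The select built above keeps only the columns the formula depends on,
falling back to the Mongo ID for constant formulas so the query still
returns one row per document. A standalone sketch of that fallback
(MONGO_ID's actual value is an assumption):

MONGO_ID = '_id'  # assumption: bamboo's Mongo ID key

def build_select(dependent_columns):
    # mirrors the comprehension in parse_columns above
    return {col: 1 for col in dependent_columns or [MONGO_ID]}

print(build_select(['amount', 'rating']))  # {'amount': 1, 'rating': 1}
print(build_select([]))                    # {'_id': 1}, constant formula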
Example #3
class TestParser(TestBase):
    def setUp(self):
        self.parser = Parser()
        self.row = {"VAR": 1}
        TestBase.setUp(self)

    def _check_func(self, parse_result):
        functions = parse_result
        for func in functions:
            self.assertEqual(func.func.func_name, "eval")
        return functions[0]

    def test_parse_formula(self):
        func = self._check_func(self.parser.parse_formula("VAR"))
        self.assertEqual(func(self.row, self.parser.context), 1)

    def test_bnf(self):
        self.parser._Parser__build_bnf()
        self.assertNotEqual(self.parser.bnf, None)

    def test_parse_formula_with_var(self):
        func = self._check_func(self.parser.parse_formula("VAR + 1"))
        self.assertEqual(func(self.row, self.parser.context), 2)

    def test_parse_formula_bad_formula(self):
        bad_formulas = ["=BAD +++ FOR", "2 +>+ 1", "1 ** 2"]

        for bad_formula in bad_formulas:
            self.assertRaises(ParseError, self.parser.parse_formula, bad_formula)
Example #6
class TestCalculator(TestBase):

    def setUp(self):
        TestBase.setUp(self)
        self.dataset = Dataset()
        self.dataset.save(
            self.test_dataset_ids['good_eats_with_calculations.csv'])
        dframe = recognize_dates(
            self.get_data('good_eats_with_calculations.csv'))
        self.dataset.save_observations(dframe)
        self.group = None
        self.parser = Parser(self.dataset)
        self.places = 5

    def _equal_msg(self, calculated, stored, formula):
        return '(calculated %s) %s != (stored %s) %s ' % (type(calculated),
               calculated, type(stored), stored) +\
            '(within %s places), formula: %s' % (self.places, formula)

    def _test_calculator(self):
        self.dframe = self.dataset.dframe()
        row = self.dframe.irow(0)

        columns = self.dframe.columns.tolist()
        self.start_num_cols = len(columns)
        self.added_num_cols = 0

        column_labels_to_slugs = {
            column_attrs[Dataset.LABEL]: column_name
            for (column_name, column_attrs) in self.dataset.schema.items()
        }
        self.label_list, self.slugified_key_list = [
            list(ary) for ary in zip(*column_labels_to_slugs.items())
        ]

        for idx, formula in enumerate(self.calculations):
            name = 'test-%s' % idx
            self.parser.validate_formula(formula, row)

            calculator = Calculator(self.dataset)

            groups = self.dataset.split_groups(self.group)
            calculation = Calculation()
            calculation.save(self.dataset, formula, name, self.group)
            calculator.calculate_columns([calculation])

            self.column_labels_to_slugs = self.dataset.schema.labels_to_slugs

            self._test_calculation_results(name, formula)
Example #8
    def save(self, dataset, formula, name, group_str=None):
        """Parse, save, and calculate a formula.

        Validate `formula` and `group_str` for the given `dataset`. If the
        formula and group are valid for the dataset, then save a new
        calculation for them under `name`. Finally, create a background task
        to compute the calculation.

        Calculations are initially saved in a **pending** state; once the
        calculation has finished processing it moves to a **ready** state.

        :param dataset: The dataset to save the calculation for.
        :param formula: The formula to save.
        :param name: The name of the formula.
        :param group_str: Columns to group on.
        :type group_str: String, list of strings, or None.

        :raises: `ParseError` if an invalid formula was supplied.
        """
        # ensure that the formula is parsable
        groups = self.split_groups(group_str) if group_str else []
        Parser.validate(dataset, formula, groups)
        aggregation = Parser.parse_aggregation(formula)

        if aggregation:
            # set group if aggregation and group unset
            group_str = group_str or ''

            # check that name is unique for aggregation
            aggregated_dataset = dataset.aggregated_dataset(groups)

            if aggregated_dataset:
                name = _check_name_and_make_unique(name, aggregated_dataset)

        else:
            # check that the name is unique for the dataset
            name = _check_name_and_make_unique(name, dataset)

        record = {
            DATASET_ID: dataset.dataset_id,
            self.AGGREGATION: aggregation,
            self.FORMULA: formula,
            self.GROUP: group_str,
            self.NAME: name,
            self.STATE: self.STATE_PENDING,
        }
        super(self.__class__, self).save(record)

        return self
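A hedged usage sketch for this method; the import paths and the
pre-existing dataset_id are assumptions about the surrounding package,
not confirmed by the code above:

from bamboo.models.calculation import Calculation  # assumed path
from bamboo.models.dataset import Dataset          # assumed path

dataset = Dataset.find_one(dataset_id)  # dataset_id obtained elsewhere
calc = Calculation().save(dataset, 'amount + 1', 'amount_plus_one')
# the record is persisted in the pending state; a background task
# (see calculate_task in Example #16) later marks it ready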
Example #9
def __create_aggregator(dataset, formula, name, groups, dframe=None):
    # TODO this should work with index eventually
    columns = parse_columns(dataset, formula, name, dframe, no_index=True)

    dependent_columns = Parser.dependent_columns(formula, dataset)
    aggregation = Parser.parse_aggregation(formula)

    # get dframe with only the necessary columns
    select = combine_dicts({group: 1 for group in groups},
                           {col: 1 for col in dependent_columns})

    # ensure at least one column (MONGO_ID) for the count aggregation
    query_args = QueryArgs(select=select or {MONGO_ID: 1})
    dframe = dataset.dframe(query_args=query_args, keep_mongo_keys=not select)

    return Aggregator(dframe, groups, aggregation, name, columns)
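The select here merges the group columns with the formula's dependent
columns. A standalone sketch of that merge, assuming combine_dicts is a
plain left-to-right dict merge (column names are made up):

def combine_dicts(*dicts):
    # assumed behavior: merge left to right, later keys win
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged

select = combine_dicts({group: 1 for group in ['food_type']},
                       {col: 1 for col in ['amount']})
print(select)  # {'food_type': 1, 'amount': 1}
# an empty select falls back to {MONGO_ID: 1} so a count still sees rows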
Example #10
    def setUp(self):
        TestBase.setUp(self)
        self.dataset = Dataset()
        self.dataset.save(
            self.test_dataset_ids['good_eats_with_calculations.csv'])
        dframe = recognize_dates(
            self.get_data('good_eats_with_calculations.csv'))
        self.dataset.save_observations(dframe)
        self.group = None
        self.parser = Parser(self.dataset)
        self.places = 5
Example #11
class TestParser(TestBase):
    def setUp(self):
        TestBase.setUp(self)
        self.dataset_id = self._post_file()
        self.dataset = Dataset.find_one(self.dataset_id)
        self.parser = Parser(self.dataset)
        self.row = {"amount": 1}

    def _parse_and_check_func(self, formula):
        functions, _ = self.parser.parse_formula(formula)
        for func in functions:
            self.assertEqual(func.func.func_name, "eval")
        return functions[0]

    def test_parse_formula(self):
        func = self._parse_and_check_func("amount")
        self.assertEqual(func(self.row, self.parser.dataset), 1)

    def test_bnf(self):
        self.parser._Parser__build_bnf()
        self.assertNotEqual(self.parser.bnf, None)

    def test_parse_formula_with_var(self):
        func = self._parse_and_check_func("amount + 1")
        self.assertEqual(func(self.row, self.parser.dataset), 2)

    def test_parse_formula_dependent_columns(self):
        formulas_to_deps = combine_dicts(AGG_CALCS_TO_DEPS, CALCS_TO_DEPS)

        for formula, column_list in formulas_to_deps.iteritems():
            functions, dependent_columns = self.parser.parse_formula(formula)
            self.assertEqual(set(column_list), dependent_columns)

    def test_parse_formula_bad_formula(self):
        bad_formulas = ["=BAD +++ FOR", "2 +>+ 1", "1 ** 2"]

        for bad_formula in bad_formulas:
            self.assertRaises(ParseError, self.parser.parse_formula, bad_formula)
Example #12
class TestParser(TestBase):

    def setUp(self):
        self.parser = Parser()
        self.row = {'VAR': 1}
        TestBase.setUp(self)

    def _check_func(self, parse_result):
        agg, functions = parse_result
        for func in functions:
            self.assertEqual(func.func.func_name, 'eval')
        return functions[0]

    def test_parse_formula(self):
        func = self._check_func(
            self.parser.parse_formula('VAR'))
        self.assertEqual(func(self.row, self.parser.context), 1)

    def test_bnf(self):
        self.parser._build_bnf()
        self.assertNotEqual(self.parser.bnf, None)

    def test_parse_formula_with_var(self):
        func = self._check_func(
            self.parser.parse_formula('VAR + 1'))
        self.assertEqual(func(self.row, self.parser.context), 2)

    def test_parse_formula_bad_formula(self):
        bad_formulas = [
            '=BAD +++ FOR',
            '2 +>+ 1',
            '1 ** 2',
        ]

        for bad_formula in bad_formulas:
            self.assertRaises(ParseError, self.parser.parse_formula,
                              bad_formula)
Example #13
def __add_calculations(dataset, new_dframe):
    labels_to_slugs = dataset.schema.labels_to_slugs

    for calculation in dataset.calculations(include_aggs=False):
        function = Parser.parse_function(calculation.formula)
        new_column = new_dframe.apply(function, axis=1, args=(dataset, ))
        potential_name = calculation.name

        if potential_name not in dataset.dframe().columns:
            if potential_name in labels_to_slugs:
                new_column.name = labels_to_slugs[potential_name]
        else:
            new_column.name = potential_name

        new_dframe = new_dframe.join(new_column)

    return new_dframe
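The column-naming rule in __add_calculations is easy to misread; a
standalone sketch of the same branching (names and mappings are made up):

def resolve_column_name(name, existing_columns, labels_to_slugs):
    # mirrors __add_calculations: reuse the name if it is already a
    # dframe column, slugify it if it is a known label, else leave unset
    if name in existing_columns:
        return name
    return labels_to_slugs.get(name)

print(resolve_column_name('amount_x2', ['amount_x2'], {}))  # 'amount_x2'
print(resolve_column_name('Amount x2', [], {'Amount x2': 'amount_x2'}))
# 'amount_x2'
print(resolve_column_name('other', [], {}))  # None, name left unset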
Example #14
class TestParser(TestBase):
    def setUp(self):
        TestBase.setUp(self)
        self.dataset_id = self._post_file()
        self.dataset = Dataset.find_one(self.dataset_id)
        self.parser = Parser()
        self.row = {'amount': 1}

    def _parse_and_check_func(self, formula):
        functions = Parser.parse_functions(formula)
        for func in functions:
            self.assertEqual(func.func.func_name, 'eval')
        return functions[0]

    def test_parse_formula(self):
        func = self._parse_and_check_func('amount')
        self.assertEqual(func(self.row, self.dataset), 1)

    def test_bnf(self):
        bnf = self.parser._Parser__build_bnf()
        self.assertNotEqual(bnf, None)

    def test_parse_formula_with_var(self):
        func = self._parse_and_check_func('amount + 1')
        self.assertEqual(func(self.row, self.dataset), 2)

    def test_parse_formula_dependent_columns(self):
        formulas_to_deps = combine_dicts(AGG_CALCS_TO_DEPS, CALCS_TO_DEPS)

        for formula, column_list in formulas_to_deps.iteritems():
            columns = Parser.dependent_columns(formula, self.dataset)
            self.assertEqual(set(column_list), columns)

    def test_parse_formula_bad_formula(self):
        bad_formulas = [
            '=BAD +++ FOR',
            '2 +>+ 1',
            '1 ** 2',
        ]

        for bad_formula in bad_formulas:
            self.assertRaises(ParseError, Parser.parse, bad_formula)
Example #16
def calculate_task(calculations, dataset):
    """Background task to run a calculation.

    Set calculation to failed and raise if an exception occurs.

    :param calculations: The calculations to run.
    :param dataset: Dataset to run calculation on.
    """
    # block until other calculations for this dataset are finished
    calculations[0].restart_if_has_pending(dataset, calculations[1:])

    calculate_columns(dataset.reload(), calculations)

    for calculation in calculations:
        calculation.add_dependencies(
            dataset, Parser.dependent_columns(calculation.formula, dataset))

        if calculation.aggregation is not None:
            aggregated_id = dataset.aggregated_datasets_dict[calculation.group]
            calculation.set_aggregation_id(aggregated_id)

        calculation.ready()
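A hedged dispatch sketch; whether calculate_task goes through the
call_async helper seen in Example #22, and with this argument order, is
an assumption:

calculations = [calculation]  # saved, still-pending calculations
call_async(calculate_task, calculations, dataset)  # assumed dispatch
# the task blocks on earlier pending calculations, computes the columns,
# records column dependencies, then marks each calculation ready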
Example #17
    def setUp(self):
        self.parser = Parser()
        self.row = {"VAR": 1}
        TestBase.setUp(self)
Example #18
    def setUp(self):
        TestBase.setUp(self)
        self.dataset_id = self._post_file()
        self.dataset = Dataset.find_one(self.dataset_id)
        self.parser = Parser()
        self.row = {"amount": 1}
Example #19
    def test_parse_formula_dependent_columns(self):
        formulas_to_deps = combine_dicts(AGG_CALCS_TO_DEPS, CALCS_TO_DEPS)

        for formula, column_list in formulas_to_deps.iteritems():
            columns = Parser.dependent_columns(formula, self.dataset)
            self.assertEqual(set(column_list), columns)
Example #20
    def _parse_and_check_func(self, formula):
        functions = Parser.parse_functions(formula)
        for func in functions:
            self.assertEqual(func.func.func_name, "eval")
        return functions[0]
Example #21
    def __init__(self, dataset):
        self.dataset = dataset.reload()
        self.parser = Parser(dataset)
Example #22
class Calculator(object):
    """Perform and store calculations and recalculations on update."""

    dframe = None

    def __init__(self, dataset):
        self.dataset = dataset.reload()
        self.parser = Parser(dataset)

    def validate(self, formula, groups):
        """Validate `formula` and `groups` for calculator's dataset.

        Validate the formula and group string by attempting to get a row from
        the dframe for the dataset and then running parser validation on this
        row. Additionally, ensure that the groups in the group string are
        columns in the dataset.

        :param formula: The formula to validate.
        :param groups: A list of columns to group by.

        :returns: The aggregation (or None) for the formula.
        """
        dframe = self.dataset.dframe(limit=1)
        row = dframe.irow(0) if len(dframe) else {}

        aggregation = self.parser.validate_formula(formula, row)

        for group in groups:
            if group not in dframe.columns:
                raise ParseError(
                    'Group %s not in dataset columns.' % group)

        return aggregation

    def calculate_column(self, formula, name, groups=None):
        """Calculate a new column based on `formula` store as `name`.

        The new column is joined to `dframe` and stored in `self.dataset`.
        The `groups` argument is only applicable to aggregations, where it
        lists the columns to group on.

        .. note::

            This can result in race-conditions when:

            - deleting ``controllers.Datasets.DELETE``
            - updating ``controllers.Datasets.POST([dataset_id])``

            Therefore, perform these actions asynchronously.

        :param formula: The formula parsed by `self.parser` and applied to
            `self.dframe`.
        :param name: The name of the new column or aggregate column.
        :param groups: A list of columns to group on for aggregate
            calculations.
        """
        self._ensure_dframe()

        aggregation, new_columns = self.make_columns(formula, name)

        if aggregation:
            agg = Aggregator(self.dataset, self.dataset.dframe(),
                             groups, aggregation, name)
            agg.save(new_columns)
        else:
            self.dataset.replace_observations(self.dframe.join(new_columns[0]))

        # propagate calculation to any merged child datasets
        for merged_dataset in self.dataset.merged_datasets:
            merged_calculator = Calculator(merged_dataset)
            merged_calculator.propagate_column(self.dataset)

    def dependent_columns(self):
        return self.parser.context.dependent_columns

    def propagate_column(self, parent_dataset):
        """Propagate columns in `parent_dataset` to this dataset.

        This is used when there has been a new calculation added to
        a dataset and that new column needs to be propagated to all
        child (merged) datasets.

        :param parent_dataset: The dataset to propagate to `self.dataset`.
        """
        # delete the rows in this dataset from the parent
        self.dataset.remove_parent_observations(parent_dataset.dataset_id)

        # get this dataset without the out-of-date parent rows
        dframe = self.dataset.dframe(keep_parent_ids=True)

        # create new dframe from the updated parent and add parent id
        parent_dframe = parent_dataset.dframe().add_parent_column(
            parent_dataset.dataset_id)

        # merge this new dframe with the existing dframe
        updated_dframe = concat([dframe, parent_dframe])

        # save new dframe (updates schema)
        self.dataset.replace_observations(updated_dframe)
        self.dataset.clear_summary_stats()

        # recur
        for merged_dataset in self.dataset.merged_datasets:
            merged_calculator = Calculator(merged_dataset)
            merged_calculator.propagate_column(self.dataset)

    @task(default_retry_delay=5)
    def calculate_updates(self, new_data, new_dframe_raw=None,
                          parent_dataset_id=None, update_id=None):
        """Update dataset with `new_data`.

        This can result in race-conditions when:

        - deleting ``controllers.Datasets.DELETE``
        - updating ``controllers.Datasets.POST([dataset_id])``

        Therefore, perform these actions asynchronously.

        :param new_data: Data to update this dataset with.
        :param new_dframe_raw: DataFrame to update this dataset with.
        :param parent_dataset_id: If passed, add this ID as a parent ID
            column, default is None.
        :param update_id: The ID of this update, used to defer the task
            until prior updates finish and to mark the update complete.
        """
        self._ensure_dframe()
        self._ensure_ready(update_id)

        labels_to_slugs = self.dataset.schema.labels_to_slugs

        if new_dframe_raw is None:
            new_dframe_raw = self.dframe_from_update(new_data, labels_to_slugs)

        self._check_update_is_valid(new_dframe_raw)

        new_dframe = new_dframe_raw.recognize_dates_from_schema(
            self.dataset.schema)

        new_dframe, aggregations = self._add_calcs_and_find_aggregations(
            new_dframe, labels_to_slugs)

        # set parent id if provided
        if parent_dataset_id:
            new_dframe = new_dframe.add_parent_column(parent_dataset_id)

        existing_dframe = self.dataset.dframe(keep_parent_ids=True)

        # merge the two dframes
        updated_dframe = concat([existing_dframe, new_dframe])

        # update (overwrite) the dataset with the new merged dframe
        self.dframe = self.dataset.replace_observations(
            updated_dframe, set_num_columns=False)
        self.dataset.clear_summary_stats()

        self._update_aggregate_datasets(aggregations, new_dframe)
        self._update_merged_datasets(new_data, labels_to_slugs)
        self._update_joined_datasets(new_dframe_raw)

        self.dataset.update_complete(update_id)

    def _check_update_is_valid(self, new_dframe_raw):
        """Check if the update is valid.

        Check whether this dataset is the right-hand side of any joins and
        deny the update if it would produce an invalid join as a result.

        :raises: `NonUniqueJoinError` if update is illegal given joins of
            dataset.
        """
        for direction, _, on, __ in self.dataset.joined_datasets:
            if direction != 'left':
                continue

            if on in new_dframe_raw.columns and on in self.dframe.columns:
                merged_join_column = concat(
                    [new_dframe_raw[on], self.dframe[on]])

                if len(merged_join_column) != merged_join_column.nunique():
                    raise NonUniqueJoinError(
                        'Cannot update. This is the right hand join and the '
                        'column "%s" will become non-unique.' % on)

    def make_columns(self, formula, name, dframe=None):
        """Parse formula into function and variables."""
        if dframe is None:
            dframe = self.dataset.dframe()

        aggregation, functions = self.parser.parse_formula(formula)

        new_columns = []

        for function in functions:
            new_column = dframe.apply(
                function, axis=1, args=(self.parser.context, ))
            new_column.name = name
            new_columns.append(new_column)

        return aggregation, new_columns

    def _ensure_dframe(self):
        """Ensure `dframe` for the calculator's dataset is defined."""
        if self.dframe is None:
            self.dframe = self.dataset.dframe()

    def _ensure_ready(self, update_id):
        # dataset must not be pending
        if not self.dataset.is_ready or (
                update_id and self.dataset.has_pending_updates(update_id)):
            self.dataset.reload()
            raise self.calculate_updates.retry()

    def _add_calcs_and_find_aggregations(self, new_dframe, labels_to_slugs):
        aggregations = []
        calculations = self.dataset.calculations()

        for calculation in calculations:
            if calculation.aggregation is not None:
                aggregations.append(calculation)
            else:
                _, function = self.parser.parse_formula(calculation.formula)
                new_column = new_dframe.apply(function[0], axis=1,
                                              args=(self.parser.context, ))
                potential_name = calculation.name

                if potential_name not in self.dframe.columns:
                    if potential_name in labels_to_slugs:
                        new_column.name = labels_to_slugs[potential_name]
                else:
                    new_column.name = potential_name

                new_dframe = new_dframe.join(new_column)

        return new_dframe, aggregations

    def _update_merged_datasets(self, new_data, labels_to_slugs):
        # store slugs as labels for child datasets
        slugified_data = []

        if not isinstance(new_data, list):
            new_data = [new_data]

        for row in new_data:
            # iterate over a copy of the items since the row is mutated
            for key, value in list(row.items()):
                if labels_to_slugs.get(key) and key not in MONGO_RESERVED_KEYS:
                    del row[key]
                    row[labels_to_slugs[key]] = value

            slugified_data.append(row)

        # update the merged datasets with new_dframe
        for merged_dataset in self.dataset.merged_datasets:
            merged_calculator = Calculator(merged_dataset)
            call_async(merged_calculator.calculate_updates,
                       merged_calculator,
                       slugified_data,
                       parent_dataset_id=self.dataset.dataset_id)

    def _update_joined_datasets(self, new_dframe_raw):
        # update any joined datasets
        for direction, other_dataset, on, joined_dataset in\
                self.dataset.joined_datasets:
            if direction == 'left':
                if on in new_dframe_raw.columns:
                    # only proceed if on in new dframe
                    other_dframe = other_dataset.dframe(padded=True)

                    if len(set(new_dframe_raw[on]).intersection(
                            set(other_dframe[on]))):
                        # only proceed if new on value is in on column in lhs
                        merged_dframe = other_dframe.join_dataset(
                            self.dataset, on)
                        joined_dataset.replace_observations(merged_dframe)
            else:
                merged_dframe = new_dframe_raw

                if on in merged_dframe:
                    merged_dframe = new_dframe_raw.join_dataset(
                        other_dataset, on)

                joined_calculator = Calculator(joined_dataset)
                call_async(joined_calculator.calculate_updates,
                           joined_calculator, merged_dframe.to_jsondict(),
                           parent_dataset_id=self.dataset.dataset_id)

    def dframe_from_update(self, new_data, labels_to_slugs):
        """Make a single-row dataframe for the additional data to add."""
        self._ensure_dframe()

        if not isinstance(new_data, list):
            new_data = [new_data]

        filtered_data = []
        columns = self.dframe.columns
        dframe_empty = not len(columns)

        if dframe_empty:
            columns = self.dataset.schema.keys()

        for row in new_data:
            filtered_row = dict()
            for col, val in row.iteritems():
                # special case for reserved keys (e.g. _id)
                if col in MONGO_RESERVED_KEYS:
                    if (not len(columns) or col in columns) and\
                            col not in filtered_row.keys():
                        filtered_row[col] = val
                else:
                    # if col is a label take slug, if it's a slug take col
                    slug = labels_to_slugs.get(
                        col, col if col in labels_to_slugs.values() else None)

                    # if slug is valid or there is an empty dframe
                    if (slug or col in labels_to_slugs.keys()) and (
                            dframe_empty or slug in columns):
                        filtered_row[slug] = self.dataset.schema.convert_type(
                            slug, val)

            filtered_data.append(filtered_row)

        return BambooFrame(filtered_data)

    def _update_aggregate_datasets(self, calculations, new_dframe):
        calcs_to_data = self._create_calculations_to_groups_and_datasets(
            calculations)

        for formula, slug, group_str, dataset in calcs_to_data:
            groups = self.dataset.split_groups(group_str)
            self._update_aggregate_dataset(formula, new_dframe, slug, groups,
                                           dataset)

    def _update_aggregate_dataset(self, formula, new_dframe, name, groups,
                                  agg_dataset):
        """Update the aggregated dataset built for `self` with `calculation`.

        Proceed with the following steps:

            - delete the rows in this dataset from the parent
            - recalculate aggregated dataframe from aggregation
            - update aggregated dataset with new dataframe and add parent id
            - recur on all merged datasets descending from the aggregated
              dataset

        :param formula: The formula to execute.
        :param new_dframe: The DataFrame to aggregate on.
        :param name: The name of the aggregation.
        :param groups: A column or columns to group on.
        :type groups: String, list of strings, or None.
        :param agg_dataset: The DataSet to store the aggregation in.
        """
        # parse aggregation and build column arguments
        aggregation, new_columns = self.make_columns(
            formula, name, new_dframe)

        agg = Aggregator(self.dataset, self.dframe,
                         groups, aggregation, name)
        new_agg_dframe = agg.update(agg_dataset, self, formula, new_columns)

        # jsondict from new dframe
        new_data = new_agg_dframe.to_jsondict()

        for merged_dataset in agg_dataset.merged_datasets:
            # remove rows in child from this merged dataset
            merged_dataset.remove_parent_observations(
                agg_dataset.dataset_id)

            # calculate updates on the child
            merged_calculator = Calculator(merged_dataset)
            call_async(merged_calculator.calculate_updates, merged_calculator,
                       new_data, parent_dataset_id=agg_dataset.dataset_id)

    def _create_calculations_to_groups_and_datasets(self, calculations):
        """Create list of groups and calculations."""
        calcs_to_data = defaultdict(list)

        names_to_formulas = {
            calc.name: calc.formula for calc in calculations
        }
        calculations = set([calc.name for calc in calculations])

        for group, dataset in self.dataset.aggregated_datasets.items():
            labels_to_slugs = dataset.schema.labels_to_slugs
            calculations_for_dataset = list(set(
                labels_to_slugs.keys()).intersection(calculations))

            for calc in calculations_for_dataset:
                calcs_to_data[calc].append((
                    names_to_formulas[calc],
                    labels_to_slugs[calc],
                    group,
                    dataset
                ))

        return [
            item for sublist in calcs_to_data.values() for item in sublist
        ]

    def __getstate__(self):
        """Get state for pickle."""
        return [self.dataset, self.parser]

    def __setstate__(self, state):
        self.dataset, self.parser = state
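A standalone sketch of the uniqueness rule enforced in
_check_update_is_valid: an update is rejected when appending the new rows
would make the join column non-unique (requires pandas; data is made up):

from pandas import DataFrame, concat

def update_keeps_join_unique(existing_dframe, new_dframe, on):
    # mirrors the nunique() check in _check_update_is_valid
    merged_join_column = concat([existing_dframe[on], new_dframe[on]])
    return len(merged_join_column) == merged_join_column.nunique()

existing = DataFrame({'code': ['a', 'b']})
print(update_keeps_join_unique(existing, DataFrame({'code': ['c']}), 'code'))
# True
print(update_keeps_join_unique(existing, DataFrame({'code': ['a']}), 'code'))
# False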