class TestParser(TestBase): def setUp(self): self.parser = Parser() self.row = {"VAR": 1} TestBase.setUp(self) def _check_func(self, parse_result): functions = parse_result for func in functions: self.assertEqual(func.func.func_name, "eval") return functions[0] def test_parse_formula(self): func = self._check_func(self.parser.parse_formula("VAR")) self.assertEqual(func(self.row, self.parser.context), 1) def test_bnf(self): result = self.parser._Parser__build_bnf() self.assertNotEqual(self.parser.bnf, None) def test_parse_formula_with_var(self): func = self._check_func(self.parser.parse_formula("VAR + 1")) self.assertEqual(func(self.row, self.parser.context), 2) def test_parse_formula_bad_formula(self): bad_formulas = ["=BAD +++ FOR", "2 +>+ 1", "1 ** 2"] for bad_formula in bad_formulas: self.assertRaises(ParseError, self.parser.parse_formula, bad_formula)
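# A minimal usage sketch (illustration only, not part of the test file) of
# the API exercised above, assuming the same `Parser` as in the tests:
# `parse_formula` returns a list of row-wise functions, each evaluated
# against a row dict and the parser's context.
if __name__ == "__main__":
    parser = Parser()
    functions = parser.parse_formula("VAR + 1")
    print functions[0]({"VAR": 41}, parser.context)  # 42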
class TestParser(TestBase):

    def setUp(self):
        TestBase.setUp(self)
        self.dataset_id = self._post_file()
        self.dataset = Dataset.find_one(self.dataset_id)
        self.parser = Parser(self.dataset)
        self.row = {"amount": 1}

    def _parse_and_check_func(self, formula):
        functions, _ = self.parser.parse_formula(formula)

        for func in functions:
            self.assertEqual(func.func.func_name, "eval")

        return functions[0]

    def test_parse_formula(self):
        func = self._parse_and_check_func("amount")
        self.assertEqual(func(self.row, self.parser.dataset), 1)

    def test_bnf(self):
        self.parser._Parser__build_bnf()
        self.assertNotEqual(self.parser.bnf, None)

    def test_parse_formula_with_var(self):
        func = self._parse_and_check_func("amount + 1")
        self.assertEqual(func(self.row, self.parser.dataset), 2)

    def test_parse_formula_dependent_columns(self):
        formulas_to_deps = combine_dicts(AGG_CALCS_TO_DEPS, CALCS_TO_DEPS)

        for formula, column_list in formulas_to_deps.iteritems():
            _, dependent_columns = self.parser.parse_formula(formula)
            self.assertEqual(set(column_list), dependent_columns)

    def test_parse_formula_bad_formula(self):
        bad_formulas = ["=BAD +++ FOR", "2 +>+ 1", "1 ** 2"]

        for bad_formula in bad_formulas:
            self.assertRaises(ParseError, self.parser.parse_formula,
                              bad_formula)
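# A usage sketch (illustration only) of the two-tuple this version of
# `parse_formula` returns: the row-wise functions plus the set of columns
# the formula reads, which callers can use to track which columns a
# calculation depends on. Assumes a saved dataset as in setUp above;
# `dataset_id` is a hypothetical stand-in.
if __name__ == "__main__":
    dataset = Dataset.find_one(dataset_id)
    parser = Parser(dataset)
    functions, dependent_columns = parser.parse_formula("amount + 1")
    print dependent_columns  # e.g. set(["amount"])
    print functions[0]({"amount": 1}, parser.dataset)  # 2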
class TestParser(TestBase):

    def setUp(self):
        self.parser = Parser()
        self.row = {'VAR': 1}
        TestBase.setUp(self)

    def _check_func(self, parse_result):
        _, functions = parse_result

        for func in functions:
            self.assertEqual(func.func.func_name, 'eval')

        return functions[0]

    def test_parse_formula(self):
        func = self._check_func(self.parser.parse_formula('VAR'))
        self.assertEqual(func(self.row, self.parser.context), 1)

    def test_bnf(self):
        self.parser._build_bnf()
        self.assertNotEqual(self.parser.bnf, None)

    def test_parse_formula_with_var(self):
        func = self._check_func(self.parser.parse_formula('VAR + 1'))
        self.assertEqual(func(self.row, self.parser.context), 2)

    def test_parse_formula_bad_formula(self):
        bad_formulas = [
            '=BAD +++ FOR',
            '2 +>+ 1',
            '1 ** 2',
        ]

        for bad_formula in bad_formulas:
            self.assertRaises(ParseError, self.parser.parse_formula,
                              bad_formula)
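# A sketch (illustration only) of the `(aggregation, functions)` pair this
# version of `parse_formula` returns, as unpacked in `_check_func` above:
# for a plain arithmetic formula the aggregation slot is empty and only
# the functions are applied row-wise.
if __name__ == '__main__':
    parser = Parser()
    agg, functions = parser.parse_formula('VAR + 1')
    assert not agg  # falsy for non-aggregation formulas
    print functions[0]({'VAR': 1}, parser.context)  # 2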
# stdlib / third-party imports; project-internal imports (Parser,
# ParseError, Aggregator, BambooFrame, NonUniqueJoinError, call_async,
# MONGO_RESERVED_KEYS) are elided in this excerpt
from collections import defaultdict

from celery.task import task
from pandas import concat


class Calculator(object):
    """Perform and store calculations and recalculations on update."""

    dframe = None

    def __init__(self, dataset):
        self.dataset = dataset.reload()
        self.parser = Parser(dataset)

    def validate(self, formula, groups):
        """Validate `formula` and `groups` for the calculator's dataset.

        Validate the formula and groups by fetching a row from the dframe
        for the dataset and running parser validation on that row.
        Additionally, ensure that each group is a column in the dataset.

        :param formula: The formula to validate.
        :param groups: A list of columns to group by.

        :returns: The aggregation (or None) for the formula.
        """
        dframe = self.dataset.dframe(limit=1)
        row = dframe.irow(0) if len(dframe) else {}

        aggregation = self.parser.validate_formula(formula, row)

        for group in groups:
            if group not in dframe.columns:
                raise ParseError(
                    'Group %s not in dataset columns.' % group)

        return aggregation

    def calculate_column(self, formula, name, groups=None):
        """Calculate a new column based on `formula`, stored as `name`.

        The new column is joined to `dframe` and stored in `self.dataset`.
        The `groups` list is only applicable to aggregations.

        .. note::

            This can result in race-conditions when:

            - deleting ``controllers.Datasets.DELETE``
            - updating ``controllers.Datasets.POST([dataset_id])``

            Therefore, perform these actions asynchronously.

        :param formula: The formula parsed by `self.parser` and applied to
            `self.dframe`.
        :param name: The name of the new column or aggregate column.
        :param groups: A list of columns to group on for aggregate
            calculations.
        """
        self._ensure_dframe()

        aggregation, new_columns = self.make_columns(formula, name)

        if aggregation:
            agg = Aggregator(self.dataset, self.dataset.dframe(),
                             groups, aggregation, name)
            agg.save(new_columns)
        else:
            self.dataset.replace_observations(
                self.dframe.join(new_columns[0]))

        # propagate calculation to any merged child datasets
        for merged_dataset in self.dataset.merged_datasets:
            merged_calculator = Calculator(merged_dataset)
            merged_calculator.propagate_column(self.dataset)

    def dependent_columns(self):
        return self.parser.context.dependent_columns

    def propagate_column(self, parent_dataset):
        """Propagate columns in `parent_dataset` to this dataset.

        This is used when a new calculation has been added to a dataset
        and the new column needs to be propagated to all child (merged)
        datasets.

        :param parent_dataset: The dataset to propagate to `self.dataset`.
        """
        # delete the rows in this dataset from the parent
        self.dataset.remove_parent_observations(parent_dataset.dataset_id)

        # get this dataset without the out-of-date parent rows
        dframe = self.dataset.dframe(keep_parent_ids=True)

        # create new dframe from the updated parent and add parent id
        parent_dframe = parent_dataset.dframe().add_parent_column(
            parent_dataset.dataset_id)

        # merge this new dframe with the existing dframe
        updated_dframe = concat([dframe, parent_dframe])

        # save new dframe (updates schema)
        self.dataset.replace_observations(updated_dframe)
        self.dataset.clear_summary_stats()

        # recur
        for merged_dataset in self.dataset.merged_datasets:
            merged_calculator = Calculator(merged_dataset)
            merged_calculator.propagate_column(self.dataset)

    @task(default_retry_delay=5)
    def calculate_updates(self, new_data, new_dframe_raw=None,
                          parent_dataset_id=None, update_id=None):
        """Update dataset with `new_data`.

        This can result in race-conditions when:

        - deleting ``controllers.Datasets.DELETE``
        - updating ``controllers.Datasets.POST([dataset_id])``

        Therefore, perform these actions asynchronously.

        :param new_data: Data to update this dataset with.
        :param new_dframe_raw: DataFrame to update this dataset with.
        :param parent_dataset_id: If passed, add this ID as a parent ID
            column, default is None.
        :param update_id: ID of the update being calculated, default is
            None.
        """
        self._ensure_dframe()
        self._ensure_ready(update_id)

        labels_to_slugs = self.dataset.schema.labels_to_slugs

        if new_dframe_raw is None:
            new_dframe_raw = self.dframe_from_update(
                new_data, labels_to_slugs)

        self._check_update_is_valid(new_dframe_raw)

        new_dframe = new_dframe_raw.recognize_dates_from_schema(
            self.dataset.schema)

        new_dframe, aggregations = self._add_calcs_and_find_aggregations(
            new_dframe, labels_to_slugs)

        # set parent id if provided
        if parent_dataset_id:
            new_dframe = new_dframe.add_parent_column(parent_dataset_id)

        existing_dframe = self.dataset.dframe(keep_parent_ids=True)

        # merge the two dframes
        updated_dframe = concat([existing_dframe, new_dframe])

        # update (overwrite) the dataset with the new merged dframe
        self.dframe = self.dataset.replace_observations(
            updated_dframe, set_num_columns=False)
        self.dataset.clear_summary_stats()

        self._update_aggregate_datasets(aggregations, new_dframe)
        self._update_merged_datasets(new_data, labels_to_slugs)
        self._update_joined_datasets(new_dframe_raw)

        self.dataset.update_complete(update_id)

    def _check_update_is_valid(self, new_dframe_raw):
        """Check if the update is valid.

        Check whether this dataset is the right-hand side of any joins,
        and deny the update if it would produce an invalid join as a
        result.

        :raises: `NonUniqueJoinError` if the update is illegal given the
            joins of the dataset.
        """
        for direction, _, on, __ in self.dataset.joined_datasets:
            if direction == 'left' and on in new_dframe_raw.columns and\
                    on in self.dframe.columns:
                merged_join_column = concat(
                    [new_dframe_raw[on], self.dframe[on]])

                if len(merged_join_column) != merged_join_column.nunique():
                    raise NonUniqueJoinError(
                        'Cannot update. This is the right hand join and '
                        'the column "%s" will become non-unique.' % on)

    def make_columns(self, formula, name, dframe=None):
        """Parse `formula` into functions and apply them to `dframe`."""
        if dframe is None:
            dframe = self.dataset.dframe()

        aggregation, functions = self.parser.parse_formula(formula)

        new_columns = []

        for function in functions:
            new_column = dframe.apply(function, axis=1,
                                      args=(self.parser.context, ))
            new_column.name = name
            new_columns.append(new_column)

        return aggregation, new_columns

    def _ensure_dframe(self):
        """Ensure `dframe` for the calculator's dataset is defined."""
        if self.dframe is None:
            self.dframe = self.dataset.dframe()

    def _ensure_ready(self, update_id):
        # dataset must not be pending
        if not self.dataset.is_ready or (
                update_id and self.dataset.has_pending_updates(update_id)):
            self.dataset.reload()
            raise self.calculate_updates.retry()

    def _add_calcs_and_find_aggregations(self, new_dframe, labels_to_slugs):
        aggregations = []
        calculations = self.dataset.calculations()

        for calculation in calculations:
            if calculation.aggregation is not None:
                aggregations.append(calculation)
            else:
                _, functions = self.parser.parse_formula(
                    calculation.formula)
                new_column = new_dframe.apply(
                    functions[0], axis=1, args=(self.parser.context, ))
                potential_name = calculation.name

                if potential_name not in self.dframe.columns:
                    if potential_name in labels_to_slugs:
                        new_column.name = labels_to_slugs[potential_name]
                else:
                    new_column.name = potential_name

                new_dframe = new_dframe.join(new_column)

        return new_dframe, aggregations

    def _update_merged_datasets(self, new_data, labels_to_slugs):
        # store slugs as labels for child datasets
        slugified_data = []

        if not isinstance(new_data, list):
            new_data = [new_data]

        for row in new_data:
            # iterate over a copy of the items since keys are replaced
            # in place
            for key, value in row.items():
                if labels_to_slugs.get(key) and\
                        key not in MONGO_RESERVED_KEYS:
                    del row[key]
                    row[labels_to_slugs[key]] = value

            slugified_data.append(row)

        # update the merged datasets with the slugified data
        for merged_dataset in self.dataset.merged_datasets:
            merged_calculator = Calculator(merged_dataset)
            call_async(merged_calculator.calculate_updates,
                       merged_calculator, slugified_data,
                       parent_dataset_id=self.dataset.dataset_id)

    def _update_joined_datasets(self, new_dframe_raw):
        # update any joined datasets
        for direction, other_dataset, on, joined_dataset in\
                self.dataset.joined_datasets:
            if direction == 'left':
                # only proceed if `on` is in the new dframe
                if on in new_dframe_raw.columns:
                    other_dframe = other_dataset.dframe(padded=True)

                    # only proceed if a new `on` value is in the `on`
                    # column of the left-hand side
                    if len(set(new_dframe_raw[on]).intersection(
                            set(other_dframe[on]))):
                        merged_dframe = other_dframe.join_dataset(
                            self.dataset, on)
                        joined_dataset.replace_observations(merged_dframe)
            else:
                merged_dframe = new_dframe_raw

                if on in merged_dframe:
                    merged_dframe = new_dframe_raw.join_dataset(
                        other_dataset, on)

                joined_calculator = Calculator(joined_dataset)
                call_async(joined_calculator.calculate_updates,
                           joined_calculator, merged_dframe.to_jsondict(),
                           parent_dataset_id=self.dataset.dataset_id)

    def dframe_from_update(self, new_data, labels_to_slugs):
        """Make a single-row dataframe for the additional data to add."""
        self._ensure_dframe()

        if not isinstance(new_data, list):
            new_data = [new_data]

        filtered_data = []
        columns = self.dframe.columns
        dframe_empty = not len(columns)

        if dframe_empty:
            columns = self.dataset.schema.keys()

        for row in new_data:
            filtered_row = dict()

            for col, val in row.iteritems():
                # special case for reserved keys (e.g. _id)
                if col in MONGO_RESERVED_KEYS:
                    if (not len(columns) or col in columns) and\
                            col not in filtered_row.keys():
                        filtered_row[col] = val
                else:
                    # if col is a label take its slug, if col is a slug
                    # take col
                    slug = labels_to_slugs.get(
                        col,
                        col if col in labels_to_slugs.values() else None)

                    # if slug is valid or the dframe is empty
                    if (slug or col in labels_to_slugs.keys()) and (
                            dframe_empty or slug in columns):
                        filtered_row[slug] =\
                            self.dataset.schema.convert_type(slug, val)

            filtered_data.append(filtered_row)

        return BambooFrame(filtered_data)

    def _update_aggregate_datasets(self, calculations, new_dframe):
        calcs_to_data = self._create_calculations_to_groups_and_datasets(
            calculations)

        for formula, slug, group_str, dataset in calcs_to_data:
            groups = self.dataset.split_groups(group_str)
            self._update_aggregate_dataset(formula, new_dframe, slug,
                                           groups, dataset)

    def _update_aggregate_dataset(self, formula, new_dframe, name, groups,
                                  agg_dataset):
        """Update the aggregated dataset built for `self` with `formula`.

        Proceed with the following steps:

            - delete the rows in this dataset from the parent
            - recalculate aggregated dataframe from aggregation
            - update aggregated dataset with new dataframe and add parent
              id
            - recur on all merged datasets descending from the aggregated
              dataset

        :param formula: The formula to execute.
        :param new_dframe: The DataFrame to aggregate on.
        :param name: The name of the aggregation.
        :param groups: A column or columns to group on.
        :type groups: String, list of strings, or None.
        :param agg_dataset: The Dataset to store the aggregation in.
        """
        # parse aggregation and build column arguments
        aggregation, new_columns = self.make_columns(
            formula, name, new_dframe)

        agg = Aggregator(self.dataset, self.dframe,
                         groups, aggregation, name)
        new_agg_dframe = agg.update(agg_dataset, self, formula, new_columns)

        # jsondict from new dframe
        new_data = new_agg_dframe.to_jsondict()

        for merged_dataset in agg_dataset.merged_datasets:
            # remove rows in child from this merged dataset
            merged_dataset.remove_parent_observations(
                agg_dataset.dataset_id)

            # calculate updates on the child
            merged_calculator = Calculator(merged_dataset)
            call_async(merged_calculator.calculate_updates,
                       merged_calculator, new_data,
                       parent_dataset_id=agg_dataset.dataset_id)

    def _create_calculations_to_groups_and_datasets(self, calculations):
        """Create a list of (formula, slug, group, dataset) tuples."""
        calcs_to_data = defaultdict(list)

        names_to_formulas = {
            calc.name: calc.formula for calc in calculations
        }
        calculation_names = set([calc.name for calc in calculations])

        for group, dataset in self.dataset.aggregated_datasets.items():
            labels_to_slugs = dataset.schema.labels_to_slugs
            calculations_for_dataset = list(set(
                labels_to_slugs.keys()).intersection(calculation_names))

            for calc in calculations_for_dataset:
                calcs_to_data[calc].append((
                    names_to_formulas[calc], labels_to_slugs[calc], group,
                    dataset))

        return [
            item for sublist in calcs_to_data.values() for item in sublist
        ]

    def __getstate__(self):
        """Get state for pickle."""
        return [self.dataset, self.parser]

    def __setstate__(self, state):
        """Set state from pickle."""
        self.dataset, self.parser = state
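# A minimal end-to-end sketch (illustration only, not part of the module)
# of the Calculator API above: validate a formula against a dataset, then
# compute and store it as a new column. `some_dataset` is a hypothetical,
# already-saved Dataset; the calls themselves are the API defined in this
# file.
if __name__ == '__main__':
    calculator = Calculator(some_dataset)

    # raises ParseError on a bad formula or an unknown group column;
    # returns None for a non-aggregation formula
    calculator.validate('amount + 1', groups=[])

    # joins the computed column to the dframe and saves the dataset;
    # an aggregation formula would be routed through Aggregator instead
    calculator.calculate_column('amount + 1', 'amount_plus_one')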