def parse_columns(dataset, formula, name, dframe=None, no_index=False):
    """Build the columns produced by applying a formula's functions.

    The formula is parsed into a list of functions, and those functions are
    handed, together with a DataFrame, to `__build_columns`.  When the
    caller supplies no DataFrame, one is loaded from `dataset` containing
    only the columns the formula depends on.

    :param dataset: Dataset used to resolve the formula's dependent columns
        and, when `dframe` is None, to load the DataFrame.
    :param formula: The formula to parse.
    :param name: Name of the formula.
    :param dframe: A DataFrame to apply functions to.  Default None, in
        which case the DataFrame is loaded from `dataset`.
    :param no_index: Drop the index on result columns (passed through to
        `__build_columns`).

    :returns: The result of `__build_columns` for the parsed functions.
    """
    parsed_functions = Parser.parse_functions(formula)
    dependencies = Parser.dependent_columns(formula, dataset)

    if dframe is None:
        # Restrict the query to the columns the formula needs; a formula
        # with no dependencies still selects MONGO_ID so that the query
        # has something to project.
        columns_to_load = dependencies or [MONGO_ID]
        query_args = QueryArgs(select=dict.fromkeys(columns_to_load, 1))

        loaded = dataset.dframe(query_args=query_args, keep_mongo_keys=True)
        dframe = loaded.set_index(MONGO_ID_ENCODED)

        if not dependencies:
            # constant column, use dummy
            dframe['dummy'] = 0

    return __build_columns(dataset, dframe, parsed_functions, name, no_index)
def __create_aggregator(dataset, formula, name, groups, dframe=None):
    """Create an Aggregator for an aggregation formula.

    The columns to aggregate come from `parse_columns`; the DataFrame given
    to the Aggregator is always loaded from `dataset`, restricted to the
    group columns plus the columns the formula depends on.

    :param dataset: Dataset to load the DataFrame from.
    :param formula: The aggregation formula to parse.
    :param name: Name of the formula.
    :param groups: Iterable of column names to group on.
    :param dframe: Optional DataFrame, forwarded only to `parse_columns`.

    :returns: An Aggregator built from the loaded DataFrame, the groups,
        the parsed aggregation, the name, and the parsed columns.
    """
    # TODO this should work with index eventually
    columns = parse_columns(dataset, formula, name, dframe, no_index=True)

    dependencies = Parser.dependent_columns(formula, dataset)
    aggregation = Parser.parse_aggregation(formula)

    # get dframe with only the necessary columns
    group_select = dict.fromkeys(groups, 1)
    dependency_select = dict.fromkeys(dependencies, 1)
    select = combine_dicts(group_select, dependency_select)

    if select:
        query_args = QueryArgs(select=select)
    else:
        # ensure at least one column (MONGO_ID) for the count aggregation
        query_args = QueryArgs(select={MONGO_ID: 1})

    # Mongo keys are kept only when nothing else was selected.
    selected_dframe = dataset.dframe(
        query_args=query_args, keep_mongo_keys=not select)

    return Aggregator(selected_dframe, groups, aggregation, name, columns)
def calculate_task(calculations, dataset):
    """Run a batch of calculations on a dataset and mark each one ready.

    The first calculation is asked to restart the batch if the dataset has
    pending calculations, then the columns are calculated on a reloaded
    dataset.  Afterwards every calculation records its dependencies, has
    its aggregation id set when it is an aggregation, and is marked ready.

    NOTE(review): the earlier documentation described this as a background
    task that sets the calculation to failed and raises if an exception
    occurs.  No such handling appears in this function body; presumably it
    is supplied by a wrapper -- confirm.

    :param calculations: Non-empty sequence of calculations to run.
    :param dataset: Dataset to run calculations on.
    """
    first, remaining = calculations[0], calculations[1:]

    # block until other calculations for this dataset are finished
    first.restart_if_has_pending(dataset, remaining)

    calculate_columns(dataset.reload(), calculations)

    for calc in calculations:
        depends_on = Parser.dependent_columns(calc.formula, dataset)
        calc.add_dependencies(dataset, depends_on)

        if calc.aggregation is not None:
            # Link the calculation to the aggregated dataset for its group.
            calc.set_aggregation_id(
                dataset.aggregated_datasets_dict[calc.group])

        calc.ready()
def test_parse_formula_dependent_columns(self):
    # Verify that Parser.dependent_columns returns, for every formula in
    # the combined aggregation and non-aggregation fixtures, exactly the
    # set of columns the fixtures list for it.
    expected_by_formula = combine_dicts(AGG_CALCS_TO_DEPS, CALCS_TO_DEPS)

    for formula, expected_columns in expected_by_formula.iteritems():
        actual = Parser.dependent_columns(formula, self.dataset)
        expected = set(expected_columns)
        self.assertEqual(expected, actual)