def __propagate_column(dataset, parent_dataset):
    """Propagate columns in `parent_dataset` to `dataset`.

    When a new calculation is added to a dataset this will propagate the
    new column to all child (merged) datasets.

    :param dataset: The child dataset.
    :param parent_dataset: The dataset to propagate.
    """
    # delete the rows in this dataset that came from the parent
    dataset.remove_parent_observations(parent_dataset.dataset_id)

    # get this dataset without the out-of-date parent rows
    dframe = dataset.dframe(keep_parent_ids=True)

    # create a new dframe from the updated parent and add the parent id
    parent_dframe = add_parent_column(parent_dataset.dframe(),
                                      parent_dataset.dataset_id)

    # merge this new dframe with the existing dframe
    updated_dframe = concat([dframe, parent_dframe])

    # save the new dframe (updates the schema)
    dataset.replace_observations(updated_dframe)
    dataset.clear_summary_stats()

    # recur into merged datasets
    for merged_dataset in dataset.merged_datasets:
        __propagate_column(merged_dataset, dataset)
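# `add_parent_column` is used throughout this module but not defined in
# it. A minimal sketch of what it could look like, assuming pandas
# DataFrames and a reserved `parent_dataset_id` column name (both the
# implementation and the column name are assumptions):
PARENT_DATASET_ID = 'parent_dataset_id'  # assumed reserved column name


def add_parent_column(dframe, parent_dataset_id):
    """Tag every row in `dframe` with the ID of its parent dataset."""
    dframe = dframe.copy()
    dframe[PARENT_DATASET_ID] = parent_dataset_id
    return dframe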
def __merge_datasets(datasets, mapping):
    """Merge two or more datasets."""
    dframes = []

    if not mapping:
        mapping = {}

    for dataset in datasets:
        dframe = dataset.dframe()
        column_map = mapping.get(dataset.dataset_id)

        if column_map:
            dframe = dframe.rename(columns=column_map)

        dframe = add_parent_column(dframe, dataset.dataset_id)
        dframes.append(dframe)

    return concat(dframes, ignore_index=True)
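# The merge above is plain pandas: a caller-supplied column mapping
# renames columns so the frames share a schema, then
# `concat(..., ignore_index=True)` stacks them and rebuilds the row
# index. A standalone, hypothetical illustration (the column names are
# made up):
def __merge_example():
    from pandas import DataFrame

    left = DataFrame({'amount': [1, 2]})
    right = DataFrame({'total': [3, 4]})

    # a mapping entry of {'total': 'amount'} aligns the schemas
    right = right.rename(columns={'total': 'amount'})

    return concat([left, right], ignore_index=True)
    #    amount
    # 0       1
    # 1       2
    # 2       3
    # 3       4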
def save(self, dataset):
    """Save this aggregation.

    If an aggregated dataset for this aggregation's group already
    exists, store the aggregation in that dataset; if not, create a new
    aggregated dataset and store the aggregation there.
    """
    new_dframe = self.aggregation.eval(self.columns)
    new_dframe = add_parent_column(new_dframe, dataset.dataset_id)

    a_dataset = dataset.aggregated_dataset(self.groups)

    if a_dataset is None:
        a_dataset = aggregated_dataset(dataset, new_dframe, self.groups)
    else:
        a_dframe = a_dataset.dframe()
        new_dframe = group_join(self.groups, a_dframe, new_dframe)
        a_dataset.replace_observations(new_dframe)

    self.new_dframe = new_dframe
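# `group_join` is referenced above but not defined in this module. One
# plausible sketch, assuming pandas and that `groups` is a list of
# group column names; the real signature and join strategy are
# assumptions:
def group_join(groups, left, right):
    """Join the new aggregate columns in `right` onto `left`, matching
    rows by their group values."""
    return left.merge(right, on=groups, how='outer')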
def update(self, dataset, child_dataset, formula, reducible):
    """Attempt to reduce an update and store."""
    parent_dataset_id = dataset.dataset_id

    # get a dframe including only the rows from this parent
    dframe = rows_for_parent_id(
        child_dataset.dframe(keep_parent_ids=True, reload_=True),
        parent_dataset_id)

    # remove this parent's rows from the child
    child_dataset.remove_parent_observations(parent_dataset_id)

    if reducible and self.__is_reducible():
        dframe = self.aggregation.reduce(dframe, self.columns)
    else:
        dframe = self.updated_dframe(dataset, formula, dframe)

    new_a_dframe = concat([child_dataset.dframe(), dframe])
    new_a_dframe = add_parent_column(new_a_dframe, parent_dataset_id)
    child_dataset.replace_observations(new_a_dframe)

    return child_dataset.dframe()
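# The reducible branch relies on the aggregation knowing how to fold an
# update into its stored result without re-reading the full dataset. As
# an illustration only (this class and its `reduce` signature are
# assumptions, not the module's actual aggregation API), a sum can be
# reduced by adding the sum of the incoming column to the existing
# aggregate:
class SumAggregationSketch(object):
    def __init__(self, name):
        self.name = name  # name of the aggregated column

    def reduce(self, dframe, columns):
        # fold the sum of the incoming values into the stored aggregate
        dframe = dframe.copy()
        dframe[self.name] += columns[0].sum()
        return dframe

# For example, reducing new rows summing to 5 into a stored total of 10
# yields 15, with no rescan of the parent dataset.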
def calculate_updates(dataset, new_data, new_dframe_raw=None,
                      parent_dataset_id=None, update_id=None):
    """Update dataset with `new_data`.

    This can result in race conditions when:

    - deleting ``controllers.Datasets.DELETE``
    - updating ``controllers.Datasets.POST([dataset_id])``

    Therefore, perform these actions asynchronously.

    :param new_data: Data to update this dataset with.
    :param new_dframe_raw: DataFrame to update this dataset with.
    :param parent_dataset_id: If passed, add this ID as the parent ID
        column, default is None.
    """
    if not __update_is_valid(dataset, new_dframe_raw):
        dataset.remove_pending_update(update_id)
        return

    __ensure_ready(dataset, update_id)

    if new_dframe_raw is None:
        new_dframe_raw = dframe_from_update(dataset, new_data)

    new_dframe = recognize_dates(new_dframe_raw, dataset.schema)
    new_dframe = __add_calculations(dataset, new_dframe)

    # set the parent id if provided
    if parent_dataset_id:
        new_dframe = add_parent_column(new_dframe, parent_dataset_id)

    dataset.append_observations(new_dframe)
    dataset.clear_summary_stats()

    propagate(dataset, new_dframe=new_dframe,
              update={'add': new_dframe_raw})

    dataset.update_complete(update_id)
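# A hedged usage sketch. `Dataset.find_one`, the dataset ID, and the
# payload shape are assumptions; only `calculate_updates` itself is
# defined above. The docstring's warning about races with concurrent
# DELETE/POST is why callers should queue this rather than run it
# inline:
def __example_update():
    dataset = Dataset.find_one('abc123')  # hypothetical lookup
    new_data = [{'amount': 5, 'date': '2013-01-01'}]

    calculate_updates(dataset, new_data, update_id='update-1')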