def import_from_csv(self, csv_file):
    """Import data from a CSV file.

    .. note::

        Write to a named tempfile in order to get a handle for pandas'
        `read_csv` function.

    :param csv_file: The CSV File to create a dataset from.

    :returns: The created dataset.
    """
    # Unwrap upload wrappers to the underlying file object. hasattr() is
    # the idiomatic replacement for "'file' in dir(...)" and also covers
    # attributes provided dynamically via __getattr__.
    if hasattr(csv_file, 'file'):
        csv_file = csv_file.file

    # delete=False here: the reader is responsible for removing the
    # tempfile (delete=True in the partial below).
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.write(csv_file.read())

    # pandas needs a closed file for *read_csv*
    tmpfile.close()

    call_async(import_dataset, self, partial(
        csv_file_reader, tmpfile.name, delete=True))

    return self
def create_from_list_or_dict(cls, dataset, calculations):
    """Create calculations from a JSON list or dict of calculations.

    :param dataset: The dataset to add the calculations to.
    :param calculations: A dict or list of dicts with formula, name and
        optional groups keys describing the calculations to create.

    :raises: `ArgumentError` if the input is not a non-empty list or a
        required key is missing.
    """
    calculations = to_list(calculations)

    # Check the type *before* truthiness: calling len() on an unsized
    # value would raise TypeError instead of the intended ArgumentError.
    if not isinstance(calculations, list) or not calculations:
        raise ArgumentError('Improper format for JSON calculations.')

    parsed_calculations = []

    # Pull out args to check JSON format
    try:
        for c in calculations:
            groups = c.get("groups")

            if not isinstance(groups, list):
                groups = [groups]

            for group in groups:
                parsed_calculations.append([
                    c[cls.FORMULA], c[cls.NAME], group])
    except KeyError as e:
        raise ArgumentError('Required key %s not found in JSON' % e)

    calculations = [cls().save(dataset, formula, name, group)
                    for formula, name, group in parsed_calculations]

    call_async(calculate_task, calculations, dataset.clear_cache())
def save(self, dframe, dataset):
    """Save data in `dframe` with the `dataset`.

    Encode `dframe` for MongoDB, and add fields to identify it with the
    passed in `dataset`. All column names in `dframe` are converted to
    slugs using the dataset's schema. The dataset is updated to store the
    size of the stored data. A background task to cache a summary of the
    dataset is launched.

    :param dframe: The DataFrame (or BambooFrame) to store.
    :param dataset: The dataset to store the dframe in.
    """
    # build schema for the dataset after having read it from file.
    if not dataset.schema:
        dataset.build_schema(dframe)

    # save the data, if there is any
    num_rows = 0

    if dframe is not None:
        # "x not in y" is the idiomatic form of "not x in y"
        if DATASET_OBSERVATION_ID not in dframe.columns:
            dframe = dataset.add_id_column_to_dframe(dframe)

        self.batch_save(dframe)
        num_rows = len(dframe)

    # add metadata to dataset, discount ID column
    dataset.update({
        dataset.NUM_ROWS: num_rows,
        dataset.STATE: self.STATE_READY,
    })

    call_async(dataset.summarize, dataset)
def delete(self, dataset):
    """Delete this calculation.

    First ensure that there are no other calculations which depend on this
    one. If not, start a background task to delete the calculation.

    :param dataset: Dataset for this calculation.

    :raises: `DependencyError` if dependent calculations exist.
    :raises: `ArgumentError` if group is not in DataSet or calculation
        does not exist for DataSet.
    """
    if len(self.dependent_calculations):
        msg = 'Cannot delete, calculations %s depend on this calculation.'
        raise DependencyError(msg % self.dependent_calculations)

    # "is not None" is the idiomatic form of "not ... is None"
    if self.group is not None:
        # it is an aggregate calculation
        dataset = dataset.aggregated_dataset(self.group)

        if not dataset:
            msg = 'Aggregation with group "%s" does not exist for dataset.'
            raise ArgumentError(msg % self.group)

    call_async(delete_task, self, dataset)
def merge_dataset_ids(dataset_ids):
    """Decode a JSON array of dataset IDs and launch a background merge.

    IDs whose dataset has no backing record are silently dropped, so a
    bad ID only causes an error when it leaves fewer than two usable
    datasets.

    :param dataset_ids: A JSON array of dataset IDs to merge.

    :raises: `MergeError` if fewer than 2 datasets are found.

    :returns: The new dataset the merge result will be stored in.
    """
    datasets = []

    for dataset_id in json.loads(dataset_ids):
        candidate = Dataset.find_one(dataset_id)

        if candidate.record:
            datasets.append(candidate)

    if len(datasets) < 2:
        raise MergeError(
            'merge requires 2 datasets (found %s)' % len(datasets))

    new_dataset = Dataset()
    new_dataset.save()

    call_async(_merge_datasets_task, new_dataset, datasets)

    return new_dataset
def delete(self, query=None, countdown=0):
    """Delete this dataset via a background task.

    The cache is cleared before the task is queued so stale cached data
    is not served while the delete is pending.

    :param query: Optional query forwarded to the delete task —
        presumably restricts which observations are removed; confirm
        against `delete_task`.
    :param countdown: Delete dataset after this number of seconds.
    """
    call_async(delete_task, self.clear_cache(), query=query,
               countdown=countdown)
def _update_joined_datasets(self, new_dframe_raw):
    """Propagate newly added rows to datasets joined with this one.

    For each joined dataset: if this dataset was the left side of the
    join ('left' direction), rebuild the joined dframe from the padded
    right-hand dataset — but only when the new rows share an `on` value
    with it — and replace the joined dataset's observations. Otherwise,
    join the new rows against the other dataset (when the `on` column is
    present) and queue an update calculation on the joined dataset.

    :param new_dframe_raw: The newly added rows as a dframe.
    """
    # update any joined datasets
    for direction, other_dataset, on, joined_dataset in\
            self.dataset.joined_datasets:
        if direction == 'left':
            if on in new_dframe_raw.columns:
                # only proceed if on in new dframe
                other_dframe = other_dataset.dframe(padded=True)

                if len(set(new_dframe_raw[on]).intersection(
                        set(other_dframe[on]))):
                    # only proceed if new on value is in on column in lhs
                    merged_dframe = other_dframe.join_dataset(
                        self.dataset, on)
                    joined_dataset.replace_observations(merged_dframe)
        else:
            # this dataset is the right side: join the new rows against
            # the other dataset when possible, else pass them through.
            merged_dframe = new_dframe_raw

            if on in merged_dframe:
                merged_dframe = new_dframe_raw.join_dataset(
                    other_dataset, on)

            joined_calculator = Calculator(joined_dataset)
            call_async(joined_calculator.calculate_updates,
                       joined_calculator, merged_dframe.to_jsondict(),
                       parent_dataset_id=self.dataset.dataset_id)
def import_from_csv(self, csv_file, na_values=None):
    """Import data from a CSV file.

    .. note::

        Write to a named tempfile in order to get a handle for pandas'
        `read_csv` function.

    :param csv_file: The CSV File to create a dataset from.
    :param na_values: Optional list of extra strings to treat as NA,
        passed through to the CSV reader. Defaults to an empty list.

    :returns: The created dataset.
    """
    # None default + normalization: a mutable default argument ([]) is
    # shared across calls and a classic Python pitfall.
    if na_values is None:
        na_values = []

    # Unwrap upload wrappers to the underlying file object.
    if hasattr(csv_file, 'file'):
        csv_file = csv_file.file

    # delete=False here: the reader removes the tempfile itself
    # (delete=True in the partial below).
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.write(csv_file.read())

    # pandas needs a closed file for *read_csv*
    tmpfile.close()

    call_async(
        import_dataset, self,
        partial(csv_file_reader, tmpfile.name, na_values=na_values,
                delete=True))

    return self
def delete(self, dataset):
    """Delete this calculation.

    First ensure that there are no other calculations which depend on this
    one. If not, start a background task to delete the calculation.

    :param dataset: Dataset for this calculation.

    :raises: `DependencyError` if dependent calculations exist.
    :raises: `ArgumentError` if group is not in DataSet or calculation
        does not exist for DataSet.
    """
    if len(self.dependent_calculations):
        raise DependencyError(
            'Cannot delete, the calculations %s depend on this calculation'
            % self.dependent_calculations)

    # "is not None" is the idiomatic form of "not ... is None"
    if self.group is not None:
        # it is an aggregate calculation
        dataset = dataset.aggregated_dataset(self.group)

        if not dataset:
            raise ArgumentError(
                'Aggregation with group "%s" does not exist for '
                'dataset' % self.group)

    call_async(delete_task, self, dataset)
def create_dataset_from_csv(csv_file):
    """Build and save a new dataset, then queue a CSV import into it.

    .. note::

        The upload is spooled to a named tempfile so that pandas'
        `read_csv` has a real filename to work with.

    :param csv_file: The CSV File to create a dataset from.

    :returns: The created dataset.
    """
    # Spool the upload to disk; the reader deletes the file once read.
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.write(csv_file.file.read())

    # pandas needs a closed file for *read_csv*
    tmpfile.close()

    dataset = Dataset()
    dataset.save()

    call_async(import_dataset, dataset,
               file_reader=partial(_file_reader, tmpfile.name,
                                   delete=True))

    return dataset
def create_from_list_or_dict(cls, dataset, calculations):
    """Create calculations from a JSON list or dict of calculations.

    :param dataset: The dataset to add the calculations to.
    :param calculations: A dict or list of dicts with formula, name and
        optional groups keys describing the calculations to create.

    :raises: `ArgumentError` if the input is not a non-empty list of
        dicts or a required key is missing.
    """
    calculations = to_list(calculations)

    # Validate the type before truthiness/element checks so malformed
    # input raises ArgumentError rather than TypeError.
    if not isinstance(calculations, list) or not calculations or\
            any(not isinstance(e, dict) for e in calculations):
        raise ArgumentError('Improper format for JSON calculations.')

    parsed_calculations = []

    # Pull out args to check JSON format
    try:
        for c in calculations:
            groups = c.get("groups")

            if not isinstance(groups, list):
                groups = [groups]

            for group in groups:
                parsed_calculations.append(
                    [c[cls.FORMULA], c[cls.NAME], group])
    except KeyError as e:
        raise ArgumentError('Required key %s not found in JSON' % e)

    # Save instead of create so that we calculate on all at once.
    calculations = [
        cls().save(dataset, formula, name, group)
        for formula, name, group in parsed_calculations
    ]

    call_async(calculate_task, calculations, dataset.clear_cache())
def import_from_json(self, json_file):
    """Import data from a JSON file.

    :param json_file: JSON file to import.

    :returns: The dataset the import was queued for (self).
    """
    content = json_file.file.read()

    call_async(import_dataset, self, partial(json_file_reader, content))

    return self
def delete_observation(self, index):
    """Remove the observation at `index` and refresh dataset metadata.

    :param index: The index of an observation to delete.
    """
    Observation.delete(self, index)

    remaining = self.dframe()

    # keep the stored row count and schema in sync with the new data
    self.update({self.NUM_ROWS: len(remaining)})
    self.build_schema(remaining, overwrite=True)

    call_async(propagate, self, update={'delete': index})
def add_observations(self, new_data):
    """Update `dataset` with `new_data`."""
    update_id = uuid.uuid4().hex
    self.add_pending_update(update_id)

    rows = to_list(new_data)

    # fetch data before other updates
    pending_dframe = dframe_from_update(self, rows)

    call_async(calculate_updates, self, rows,
               new_dframe_raw=pending_dframe, update_id=update_id)
def create_dataset_from_json(json_file):
    """Create and save a dataset, then queue an import of the JSON file.

    :param json_file: The JSON file upload to read.

    :returns: The created dataset.
    """
    content = json_file.file.read()

    dataset = Dataset()
    dataset.save()

    def _reader(payload):
        # materialize the JSON payload as a DataFrame
        return pd.DataFrame(json.loads(payload))

    call_async(import_dataset, dataset,
               file_reader=partial(_reader, content))

    return dataset
def import_data_from_json(dataset, json_file):
    """Import data from a JSON file.

    :param dataset: Dataset to save in.
    :param json_file: JSON file to import.

    :returns: The dataset the import was queued for.
    """
    content = json_file.file.read()

    def file_reader(content):
        # materialize the JSON payload as a DataFrame
        return pd.DataFrame(json.loads(content))

    call_async(import_dataset, dataset,
               file_reader=partial(file_reader, content))

    return dataset
def add_observations(self, json_data):
    """Update `dataset` with new `data`.

    Register a pending update, decode and validate the rows against the
    current schema, then queue the update calculation in the background.

    :param json_data: JSON-encoded row or rows to add.
    """
    update_id = uuid.uuid4().hex
    self.add_pending_update(update_id)

    new_data = json.loads(json_data)

    # NOTE(review): dropped the unused local `record = self.record`.
    calculator = Calculator(self)

    new_dframe_raw = calculator.dframe_from_update(
        new_data, self.schema.labels_to_slugs)
    calculator._check_update_is_valid(new_dframe_raw)

    call_async(calculator.calculate_updates, calculator, new_data,
               new_dframe_raw=new_dframe_raw, update_id=update_id)
def add_observations(self, new_data):
    """Update `dataset` with `new_data`."""
    update_id = uuid.uuid4().hex
    self.add_pending_update(update_id)

    rows = to_list(new_data)

    calculator = Calculator(self)

    dframe_update = calculator.dframe_from_update(
        rows, self.schema.labels_to_slugs)
    calculator._check_update_is_valid(dframe_update)

    # drop any cached state before the background recalculation runs
    calculator.dataset.clear_cache()

    call_async(calculator.calculate_updates, calculator, rows,
               new_dframe_raw=dframe_update, update_id=update_id)
def import_from_url(self, url, na_values=None, allow_local_file=False):
    """Load a URL, read from a CSV, add data to dataset.

    :param url: URL to load file from.
    :param na_values: Optional list of extra strings to treat as NA,
        passed through to the CSV reader. Defaults to an empty list.
    :param allow_local_file: Allow URL to refer to a local file.

    :raises: `IOError` for an unreadable file or a bad URL.

    :returns: The created dataset.
    """
    # None default + normalization: a mutable default argument ([]) is
    # shared across calls and a classic Python pitfall.
    if na_values is None:
        na_values = []

    # reject file:// style URLs unless explicitly allowed
    if not allow_local_file and isinstance(url, basestring) and\
            url.startswith('file'):
        raise IOError

    call_async(import_dataset, self,
               partial(csv_file_reader, url, na_values=na_values))

    return self
def import_schema_for_dataset(dataset, schema):
    """Attach an SDF (JSON) schema to `dataset` and queue its import.

    :param dataset: The dataset to set the schema on.
    :param schema: The SDF (JSON) file upload, or a raw JSON string.

    :returns: The dataset the schema was set on.
    """
    # Accept either an upload object (with a .file handle) or a raw
    # JSON string (EAFP: try the file form first).
    try:
        schema = json.loads(schema.file.read())
    except AttributeError:
        schema = json.loads(schema)

    dataset.set_schema(schema)

    call_async(import_dataset, dataset)

    return dataset
def add_observations(self, new_data):
    """Update `dataset` with `new_data`.

    Register a pending update, validate the rows against the current
    schema, then queue the update calculation in the background.

    :param new_data: A row (dict) or list of rows to add.
    """
    update_id = uuid.uuid4().hex
    self.add_pending_update(update_id)

    # NOTE(review): dropped the unused local `record = self.record`.
    if not isinstance(new_data, list):
        new_data = [new_data]

    calculator = Calculator(self)

    new_dframe_raw = calculator.dframe_from_update(
        new_data, self.schema.labels_to_slugs)
    calculator._check_update_is_valid(new_dframe_raw)

    call_async(calculator.calculate_updates, calculator, new_data,
               new_dframe_raw=new_dframe_raw, update_id=update_id)
def save(self, dataset, formula, name, group_str=None):
    """Parse, save, and calculate a formula.

    Validate `formula` and `group_str` for the given `dataset`. If the
    formula and group are valid for the dataset, then save a new
    calculation for them under `name`. Finally, create a background task
    to compute the calculation.

    Calculations are initially saved in a **pending** state, after the
    calculation has finished processing it will be in a **ready** state.

    :param dataset: The DataSet to save.
    :param formula: The formula to save.
    :param name: The name of the formula.
    :param group_str: Columns to group on.
    :type group_str: String, list or strings, or None.

    :raises: `ParseError` if an invalid formula was supplied.

    :returns: The saved record dict.
    """
    calculator = Calculator(dataset)

    # ensure that the formula is parsable
    groups = self.split_groups(group_str) if group_str else []
    aggregation = calculator.validate(formula, groups)

    if aggregation:
        # set group if aggregation and group unset
        if not group_str:
            group_str = ''
    else:
        # ensure the name is unique among dataset labels and schema keys
        name = make_unique(name, dataset.labels + dataset.schema.keys())

    record = {
        DATASET_ID: dataset.dataset_id,
        self.AGGREGATION: aggregation,
        self.FORMULA: formula,
        self.GROUP: group_str,
        self.NAME: name,
        self.STATE: self.STATE_PENDING,
    }
    # NOTE(review): super(self.__class__, self) recurses infinitely if
    # this class is subclassed — consider naming the class explicitly.
    super(self.__class__, self).save(record)

    call_async(calculate_task, self, dataset)

    return record
def import_from_url(self, url, na_values=None, allow_local_file=False):
    """Load a URL, read from a CSV, add data to dataset.

    :param url: URL to load file from.
    :param na_values: Optional list of extra strings to treat as NA,
        passed through to the CSV reader. Defaults to an empty list.
    :param allow_local_file: Allow URL to refer to a local file.

    :raises: `IOError` for an unreadable file or a bad URL.

    :returns: The created dataset.
    """
    # None default + normalization: a mutable default argument ([]) is
    # shared across calls and a classic Python pitfall.
    if na_values is None:
        na_values = []

    # reject file:// style URLs unless explicitly allowed
    if not allow_local_file and isinstance(url, basestring)\
            and url[0:4] == 'file':
        raise IOError

    call_async(import_dataset, self,
               partial(csv_file_reader, url, na_values=na_values))

    return self
def import_data_from_url(dataset, url, allow_local_file=False):
    """Fetch a CSV from `url` and queue its import into `dataset`.

    :param dataset: Dataset to save in.
    :param url: URL to load file from.
    :param allow_local_file: Allow URL to refer to a local file.

    :raises: `IOError` for an unreadable file or a bad URL.

    :returns: The created dataset.
    """
    # refuse local file URLs unless the caller explicitly opted in
    if not allow_local_file and isinstance(url, basestring)\
            and url[0:4] == 'file':
        raise IOError

    call_async(import_dataset, dataset,
               file_reader=partial(_file_reader, url))

    return dataset
def create_dataset_from_url(url, allow_local_file=False):
    """Create a dataset and queue an import of the CSV found at `url`.

    :param url: URL to load file from.
    :param allow_local_file: Allow URL to refer to a local file.

    :raises: `IOError` for an unreadable file or a bad URL.

    :returns: The created dataset.
    """
    # refuse local file URLs unless the caller explicitly opted in
    if not allow_local_file and isinstance(url, basestring)\
            and url[0:4] == 'file':
        raise IOError

    dataset = Dataset()
    dataset.save()

    call_async(import_dataset, dataset,
               file_reader=partial(_file_reader, url))

    return dataset
def _update_aggregate_dataset(self, formula, new_dframe, name, groups, agg_dataset): """Update the aggregated dataset built for `self` with `calculation`. Proceed with the following steps: - delete the rows in this dataset from the parent - recalculate aggregated dataframe from aggregation - update aggregated dataset with new dataframe and add parent id - recur on all merged datasets descending from the aggregated dataset :param formula: The formula to execute. :param new_dframe: The DataFrame to aggregate on. :param name: The name of the aggregation. :param groups: A column or columns to group on. :type group: String, list of strings, or None. :param agg_dataset: The DataSet to store the aggregation in. """ # parse aggregation and build column arguments aggregation, new_columns = self.make_columns( formula, name, new_dframe) agg = Aggregator(self.dataset, self.dframe, groups, aggregation, name) new_agg_dframe = agg.update(agg_dataset, self, formula, new_columns) # jsondict from new dframe new_data = new_agg_dframe.to_jsondict() for merged_dataset in agg_dataset.merged_datasets: # remove rows in child from this merged dataset merged_dataset.remove_parent_observations( agg_dataset.dataset_id) # calculate updates on the child merged_calculator = Calculator(merged_dataset) call_async(merged_calculator.calculate_updates, merged_calculator, new_data, parent_dataset_id=agg_dataset.dataset_id)
def _update_merged_datasets(self, new_data, labels_to_slugs):
    """Propagate `new_data` to all datasets merged from this one.

    Rows are re-keyed from labels to slugs (child datasets index columns
    by slug), then an update calculation is queued for every merged
    dataset.

    :param new_data: A row (dict) or list of rows to propagate.
    :param labels_to_slugs: Mapping of column labels to column slugs.
    """
    # store slugs as labels for child datasets
    slugified_data = []

    if not isinstance(new_data, list):
        new_data = [new_data]

    for row in new_data:
        # iterate over a snapshot via .items(): the original iterated
        # row.iteritems() while deleting/adding keys, which raises
        # "dictionary changed size during iteration".
        for key, value in row.items():
            if labels_to_slugs.get(key) and key not in\
                    MONGO_RESERVED_KEYS:
                del row[key]
                row[labels_to_slugs[key]] = value

        slugified_data.append(row)

    # update the merged datasets with new_dframe
    for merged_dataset in self.dataset.merged_datasets:
        merged_calculator = Calculator(merged_dataset)
        call_async(merged_calculator.calculate_updates, merged_calculator,
                   slugified_data,
                   parent_dataset_id=self.dataset.dataset_id)
def merge_dataset_ids(dataset_ids, mapping):
    """Start a background merge of the datasets named in `dataset_ids`.

    IDs with no backing record are dropped, so a bad ID only raises when
    fewer than two usable datasets remain.

    :param dataset_ids: An array of dataset IDs to merge.
    :param mapping: Column mapping forwarded to the merge task.

    :raises: `MergeError` if fewer than 2 datasets are found.

    :returns: The new dataset the merge result will be stored in.
    """
    found = [Dataset.find_one(each_id) for each_id in dataset_ids]
    datasets = [d for d in found if d.record]

    if len(datasets) < 2:
        raise MergeError('merge requires 2 datasets (found %s)' %
                         len(datasets))

    new_dataset = Dataset.create()

    call_async(__merge_datasets_task, new_dataset, datasets, mapping)

    return new_dataset
def create(cls, dataset, formula, name, group=None):
    """Create and save a single calculation, then queue its computation."""
    new_calculation = super(cls, cls).create(
        dataset, formula, name, group)

    call_async(calculate_task, [new_calculation], dataset.clear_cache())

    return new_calculation
def update_observation(self, index, data):
    """Replace the observation at `index` with `data`.

    :param index: Index of the observation to update.
    :param data: The new values for the observation.
    """
    # check that update is valid (raises on malformed input)
    dframe_from_update(self, [data])

    Observation.update(self, index, data)

    call_async(propagate, self, update={'edit': [index, data]})
def delete(self, countdown=0):
    """Delete this dataset via a background task.

    :param countdown: Delete dataset after this number of seconds.
    """
    call_async(delete_task, self, countdown=countdown)
def delete(self, countdown=0):
    """Queue a background task that deletes this dataset.

    :param countdown: Delete dataset after this number of seconds.
    """
    call_async(delete_task, self, countdown=countdown)