Example #1
File: io.py Project: myf/bamboo
    def import_from_csv(self, csv_file):
        """Import data from a CSV file.

        .. note::

            Write to a named tempfile in order to get a handle for pandas'
            `read_csv` function.

        :param csv_file: The CSV File to create a dataset from.

        :returns: The created dataset.
        """
        if 'file' in dir(csv_file):
            csv_file = csv_file.file

        tmpfile = tempfile.NamedTemporaryFile(delete=False)
        tmpfile.write(csv_file.read())

        # pandas needs a closed file for *read_csv*
        tmpfile.close()

        call_async(import_dataset, self, partial(
            csv_file_reader, tmpfile.name, delete=True))

        return self
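
Every example on this page hands its heavy lifting to `call_async`. The helper
itself is not shown in these snippets; the following is a minimal sketch of
what it might look like, assuming it simply forwards its arguments to a Celery
task's `apply_async` and falls back to a direct call when background execution
is disabled (for example, during tests). The `BAMBOO_ASYNC` switch and the
fallback branch are assumptions for illustration, not the project's actual
implementation.

import os

# Hypothetical switch: queue Celery tasks unless background execution is off.
ASYNCHRONOUS_TASKS = os.getenv('BAMBOO_ASYNC', '1') == '1'


def call_async(task, *args, **kwargs):
    """Run `task` with `args` and `kwargs`, in the background when possible.

    `task` is assumed to be a Celery task: when asynchronous execution is
    enabled it is queued via `apply_async`, otherwise it is invoked directly
    so the caller blocks until the work finishes.
    """
    if ASYNCHRONOUS_TASKS:
        task.apply_async(args=args, kwargs=kwargs)
    else:
        task(*args, **kwargs)
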
Example #2
    def create_from_list_or_dict(cls, dataset, calculations):
        calculations = to_list(calculations)

        if not isinstance(calculations, list) or not len(calculations):
            raise ArgumentError('Improper format for JSON calculations.')

        parsed_calculations = []

        # Pull out args to check JSON format
        try:
            for c in calculations:
                groups = c.get("groups")

                if not isinstance(groups, list):
                    groups = [groups]

                for group in groups:
                    parsed_calculations.append([
                        c[cls.FORMULA],
                        c[cls.NAME], group])
        except KeyError as e:
            raise ArgumentError('Required key %s not found in JSON' % e)

        calculations = [cls().save(dataset, formula, name, group)
                        for formula, name, group in parsed_calculations]
        call_async(calculate_task, calculations, dataset.clear_cache())
Example #3
    def save(self, dframe, dataset):
        """Save data in `dframe` with the `dataset`.

        Encode `dframe` for MongoDB, and add fields to identify it with the
        passed in `dataset`. All column names in `dframe` are converted to
        slugs using the dataset's schema. The dataset is updated to store the
        size of the stored data. A background task to cache a summary of the
        dataset is launched.

        :param dframe: The DataFrame (or BambooFrame) to store.
        :param dataset: The dataset to store the dframe in.
        """
        # build schema for the dataset after having read it from file.
        if not dataset.schema:
            dataset.build_schema(dframe)

        # save the data, if there is any
        num_rows = 0

        if dframe is not None:
            if DATASET_OBSERVATION_ID not in dframe.columns:
                dframe = dataset.add_id_column_to_dframe(dframe)

            self.batch_save(dframe)
            num_rows = len(dframe)

        # add metadata to dataset, discount ID column
        dataset.update({
            dataset.NUM_ROWS: num_rows,
            dataset.STATE: self.STATE_READY,
        })

        call_async(dataset.summarize, dataset)
Example #4
    def delete(self, dataset):
        """Delete this calculation.

        First ensure that there are no other calculations which depend on this
        one. If not, start a background task to delete the calculation.

        :param dataset: Dataset for this calculation.

        :raises: `DependencyError` if dependent calculations exist.
        :raises: `ArgumentError` if group is not in DataSet or calculation does
            not exist for DataSet.
        """
        if len(self.dependent_calculations):
            msg = 'Cannot delete, calculations %s depend on this calculation.'
            raise DependencyError(msg % self.dependent_calculations)

        if self.group is not None:
            # it is an aggregate calculation
            dataset = dataset.aggregated_dataset(self.group)

            if not dataset:
                msg = 'Aggregation with group "%s" does not exist for dataset.'
                raise ArgumentError(msg % self.group)

        call_async(delete_task, self, dataset)
Example #5
def merge_dataset_ids(dataset_ids):
    """Load a JSON array of dataset IDs and start a background merge task.

    :param dataset_ids: An array of dataset IDs to merge.

    :raises: `MergeError` if fewer than 2 datasets are provided. If a dataset
        cannot be found for a dataset ID it is ignored. Therefore, if 2 dataset
        IDs are provided and one of them is bad, an error is raised; however,
        if 3 dataset IDs are provided and one of them is bad, no error is
        raised.
    """
    dataset_ids = json.loads(dataset_ids)
    datasets = [Dataset.find_one(dataset_id) for dataset_id in dataset_ids]
    datasets = [dataset for dataset in datasets if dataset.record]

    if len(datasets) < 2:
        raise MergeError(
            'merge requires 2 datasets (found %s)' % len(datasets))

    new_dataset = Dataset()
    new_dataset.save()

    call_async(_merge_datasets_task, new_dataset, datasets)

    return new_dataset
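
A quick usage sketch for the function above: the merged dataset record is
created and returned right away, while `_merge_datasets_task` fills it with
rows in the background. The dataset IDs below are made up for illustration.

merged = merge_dataset_ids('["dataset-id-1", "dataset-id-2"]')
merged_id = merged.dataset_id  # usable immediately; rows appear once the merge task runs
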
Example #6
    def delete(self, query=None, countdown=0):
        """Delete this dataset.

        :param countdown: Delete dataset after this number of seconds.
        """
        call_async(delete_task, self.clear_cache(), query=query,
                   countdown=countdown)
Example #7
    def _update_joined_datasets(self, new_dframe_raw):
        # update any joined datasets
        for direction, other_dataset, on, joined_dataset in\
                self.dataset.joined_datasets:
            if direction == 'left':
                if on in new_dframe_raw.columns:
                    # only proceed if on in new dframe
                    other_dframe = other_dataset.dframe(padded=True)

                    if len(set(new_dframe_raw[on]).intersection(
                            set(other_dframe[on]))):
                        # only proceed if new on value is in on column in lhs
                        merged_dframe = other_dframe.join_dataset(
                            self.dataset, on)
                        joined_dataset.replace_observations(merged_dframe)
            else:
                merged_dframe = new_dframe_raw

                if on in merged_dframe:
                    merged_dframe = new_dframe_raw.join_dataset(
                        other_dataset, on)

                joined_calculator = Calculator(joined_dataset)
                call_async(joined_calculator.calculate_updates,
                           joined_calculator, merged_dframe.to_jsondict(),
                           parent_dataset_id=self.dataset.dataset_id)
Example #8
    def import_from_csv(self, csv_file, na_values=[]):
        """Import data from a CSV file.

        .. note::

            Write to a named tempfile in order to get a handle for pandas'
            `read_csv` function.

        :param csv_file: The CSV File to create a dataset from.

        :returns: The created dataset.
        """
        if 'file' in dir(csv_file):
            csv_file = csv_file.file

        tmpfile = tempfile.NamedTemporaryFile(delete=False)
        tmpfile.write(csv_file.read())

        # pandas needs a closed file for *read_csv*
        tmpfile.close()

        call_async(
            import_dataset, self,
            partial(csv_file_reader,
                    tmpfile.name,
                    na_values=na_values,
                    delete=True))

        return self
Example #9
    def delete(self, dataset):
        """Delete this calculation.

        First ensure that there are no other calculations which depend on this
        one. If not, start a background task to delete the calculation.

        :param dataset: Dataset for this calculation.

        :raises: `DependencyError` if dependent calculations exist.
        :raises: `ArgumentError` if group is not in DataSet or calculation does
            not exist for DataSet.
        """
        if len(self.dependent_calculations):
            raise DependencyError(
                'Cannot delete, the calculations %s depend on this calculation'
                % self.dependent_calculations)

        if self.group is not None:
            # it is an aggregate calculation
            dataset = dataset.aggregated_dataset(self.group)

            if not dataset:
                raise ArgumentError(
                    'Aggregation with group "%s" does not exist for '
                    'dataset' % self.group)

        call_async(delete_task, self, dataset)
Example #10
File: io.py Project: helioid/bamboo
def create_dataset_from_csv(csv_file):
    """Create a dataset from a CSV file.

    .. note::

        Write to a named tempfile in order to get a handle for pandas'
        `read_csv` function.

    :param csv_file: The CSV File to create a dataset from.

    :returns: The created dataset.
    """
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.write(csv_file.file.read())

    # pandas needs a closed file for *read_csv*
    tmpfile.close()

    dataset = Dataset()
    dataset.save()

    call_async(import_dataset, dataset,
               file_reader=partial(_file_reader, tmpfile.name, delete=True))

    return dataset
Example #11
    def create_from_list_or_dict(cls, dataset, calculations):
        calculations = to_list(calculations)

        if not isinstance(calculations, list) or not len(calculations) or\
                any([not isinstance(e, dict) for e in calculations]):
            raise ArgumentError('Improper format for JSON calculations.')

        parsed_calculations = []

        # Pull out args to check JSON format
        try:
            for c in calculations:
                groups = c.get("groups")

                if not isinstance(groups, list):
                    groups = [groups]

                for group in groups:
                    parsed_calculations.append(
                        [c[cls.FORMULA], c[cls.NAME], group])
        except KeyError as e:
            raise ArgumentError('Required key %s not found in JSON' % e)

        # Save instead of create so that we calculate on all at once.
        calculations = [
            cls().save(dataset, formula, name, group)
            for formula, name, group in parsed_calculations
        ]
        call_async(calculate_task, calculations, dataset.clear_cache())
Example #12
    def import_from_json(self, json_file):
        """Impor data from a JSON file.

        :param json_file: JSON file to import.
        """
        content = json_file.file.read()
        call_async(import_dataset, self, partial(json_file_reader, content))

        return self
Example #13
    def delete(self, query=None, countdown=0):
        """Delete this dataset.

        :param countdown: Delete dataset after this number of seconds.
        """
        call_async(delete_task,
                   self.clear_cache(),
                   query=query,
                   countdown=countdown)
Example #14
    def import_from_json(self, json_file):
        """Impor data from a JSON file.

        :param json_file: JSON file to import.
        """
        content = json_file.file.read()
        call_async(import_dataset, self, partial(json_file_reader, content))

        return self
Example #15
    def delete_observation(self, index):
        """Delete observation at index.

        :param index: The index of an observation to delete.
        """
        Observation.delete(self, index)

        dframe = self.dframe()
        self.update({self.NUM_ROWS: len(dframe)})
        self.build_schema(dframe, overwrite=True)
        call_async(propagate, self, update={'delete': index})
Example #16
    def delete_observation(self, index):
        """Delete observation at index.

        :param index: The index of an observation to delete.
        """
        Observation.delete(self, index)

        dframe = self.dframe()
        self.update({self.NUM_ROWS: len(dframe)})
        self.build_schema(dframe, overwrite=True)
        call_async(propagate, self, update={'delete': index})
Example #17
    def add_observations(self, new_data):
        """Update `dataset` with `new_data`."""
        update_id = uuid.uuid4().hex
        self.add_pending_update(update_id)

        new_data = to_list(new_data)

        # fetch data before other updates
        new_dframe_raw = dframe_from_update(self, new_data)

        call_async(calculate_updates, self, new_data,
                   new_dframe_raw=new_dframe_raw, update_id=update_id)
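
A rough usage sketch for the method above (the dataset ID and column names are
made up): the new rows are turned into a DataFrame up front with
`dframe_from_update`, and only the recalculation is deferred via `call_async`.

dataset = Dataset.find_one('some-dataset-id')
dataset.add_observations([
    {'rating': 'delectible', 'amount': 2},
    {'rating': 'epic_eat', 'amount': 3},
])
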
Example #18
File: io.py Project: helioid/bamboo
def create_dataset_from_json(json_file):
    content = json_file.file.read()

    dataset = Dataset()
    dataset.save()

    def file_reader(content):
        return pd.DataFrame(json.loads(content))

    call_async(import_dataset, dataset,
               file_reader=partial(file_reader, content))

    return dataset
Example #19
    def add_observations(self, new_data):
        """Update `dataset` with `new_data`."""
        update_id = uuid.uuid4().hex
        self.add_pending_update(update_id)

        new_data = to_list(new_data)

        # fetch data before other updates
        new_dframe_raw = dframe_from_update(self, new_data)

        call_async(calculate_updates,
                   self,
                   new_data,
                   new_dframe_raw=new_dframe_raw,
                   update_id=update_id)
Example #20
def import_data_from_json(dataset, json_file):
    """Impor data from a JSON file.

    :param dataset: Dataset to save in.
    :param json_file: JSON file to import.
    """
    content = json_file.file.read()

    def file_reader(content):
        return pd.DataFrame(json.loads(content))

    call_async(import_dataset, dataset,
               file_reader=partial(file_reader, content))

    return dataset
Example #21
    def add_observations(self, json_data):
        """Update `dataset` with new `data`."""
        record = self.record
        update_id = uuid.uuid4().hex
        self.add_pending_update(update_id)

        new_data = json.loads(json_data)
        calculator = Calculator(self)

        new_dframe_raw = calculator.dframe_from_update(
            new_data, self.schema.labels_to_slugs)
        calculator._check_update_is_valid(new_dframe_raw)

        call_async(calculator.calculate_updates, calculator, new_data,
                   new_dframe_raw=new_dframe_raw, update_id=update_id)
Example #22
    def add_observations(self, new_data):
        """Update `dataset` with `new_data`."""
        update_id = uuid.uuid4().hex
        self.add_pending_update(update_id)

        new_data = to_list(new_data)

        calculator = Calculator(self)

        new_dframe_raw = calculator.dframe_from_update(
            new_data, self.schema.labels_to_slugs)
        calculator._check_update_is_valid(new_dframe_raw)
        calculator.dataset.clear_cache()

        call_async(calculator.calculate_updates, calculator, new_data,
                   new_dframe_raw=new_dframe_raw, update_id=update_id)
Example #23
File: io.py Project: j/bamboo
    def import_from_url(self, url, na_values=[], allow_local_file=False):
        """Load a URL, read from a CSV, add data to dataset.

        :param url: URL to load file from.
        :param allow_local_file: Allow URL to refer to a local file.

        :raises: `IOError` for an unreadable file or a bad URL.

        :returns: The created dataset.
        """
        if not allow_local_file and isinstance(url, basestring)\
                and url[0:4] == 'file':
            raise IOError

        call_async(import_dataset, self,
                   partial(csv_file_reader, url, na_values=na_values))

        return self
Example #24
def import_schema_for_dataset(dataset, schema):
    """Create a dataset from a SDF schema file (JSON).

    :param schema: The SDF (JSON) file to create a dataset from.

    :returns: The created dataset.
    """
    try:
        schema = json.loads(schema.file.read())
    except AttributeError:
        schema = json.loads(schema)

    dataset.set_schema(schema)

    call_async(import_dataset, dataset)

    return dataset
Example #25
File: dataset.py Project: j/bamboo
    def add_observations(self, new_data):
        """Update `dataset` with `new_data`."""
        record = self.record
        update_id = uuid.uuid4().hex
        self.add_pending_update(update_id)

        if not isinstance(new_data, list):
            new_data = [new_data]

        calculator = Calculator(self)

        new_dframe_raw = calculator.dframe_from_update(
            new_data, self.schema.labels_to_slugs)
        calculator._check_update_is_valid(new_dframe_raw)

        call_async(calculator.calculate_updates, calculator, new_data,
                   new_dframe_raw=new_dframe_raw, update_id=update_id)
Example #26
    def save(self, dataset, formula, name, group_str=None):
        """Parse, save, and calculate a formula.

        Validate `formula` and `group_str` for the given `dataset`. If the
        formula and group are valid for the dataset, then save a new
        calculation for them under `name`. Finally, create a background task
        to compute the calculation.

        Calculations are initially saved in a **pending** state, after the
        calculation has finished processing it will be in a **ready** state.

        :param dataset: The DataSet to save.
        :param formula: The formula to save.
        :param name: The name of the formula.
        :param group_str: Columns to group on.
        :type group_str: String, list of strings, or None.

        :raises: `ParseError` if an invalid formula was supplied.
        """
        calculator = Calculator(dataset)

        # ensure that the formula is parsable
        groups = self.split_groups(group_str) if group_str else []
        aggregation = calculator.validate(formula, groups)

        if aggregation:
            # set group if aggregation and group unset
            if not group_str:
                group_str = ''
        else:
            # ensure the name is unique
            name = make_unique(name, dataset.labels + dataset.schema.keys())

        record = {
            DATASET_ID: dataset.dataset_id,
            self.AGGREGATION: aggregation,
            self.FORMULA: formula,
            self.GROUP: group_str,
            self.NAME: name,
            self.STATE: self.STATE_PENDING,
        }
        super(self.__class__, self).save(record)

        call_async(calculate_task, self, dataset)

        return record
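
A hedged usage sketch for `save` above (the class name, dataset, and formula
are assumptions): the returned record starts out pending, and the background
`calculate_task` later flips the stored calculation to the ready state.

calculation = Calculation()
record = calculation.save(dataset, 'amount * 2', 'amount_doubled')
assert record[Calculation.STATE] == Calculation.STATE_PENDING
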
Example #27
    def import_from_url(self, url, na_values=[], allow_local_file=False):
        """Load a URL, read from a CSV, add data to dataset.

        :param url: URL to load file from.
        :param allow_local_file: Allow URL to refer to a local file.

        :raises: `IOError` for an unreadable file or a bad URL.

        :returns: The created dataset.
        """
        if not allow_local_file and isinstance(url, basestring)\
                and url[0:4] == 'file':
            raise IOError

        call_async(import_dataset, self,
                   partial(csv_file_reader, url, na_values=na_values))

        return self
Example #28
def import_data_from_url(dataset, url, allow_local_file=False):
    """Load a URL, read from a CSV, add data to dataset.

    :param dataset: Dataset to save in.
    :param url: URL to load file from.
    :param allow_local_file: Allow URL to refer to a local file.

    :raises: `IOError` for an unreadable file or a bad URL.

    :returns: The created dataset.
    """
    if not allow_local_file and isinstance(url, basestring)\
            and url[0:4] == 'file':
        raise IOError

    call_async(import_dataset, dataset, file_reader=partial(_file_reader, url))

    return dataset
Example #29
File: io.py Project: helioid/bamboo
def create_dataset_from_url(url, allow_local_file=False):
    """Load a URL, read from a CSV, create a dataset and return the unique ID.

    :param url: URL to load file from.
    :param allow_local_file: Allow URL to refer to a local file.

    :raises: `IOError` for an unreadable file or a bad URL.

    :returns: The created dataset.
    """
    if not allow_local_file and isinstance(url, basestring)\
            and url[0:4] == 'file':
        raise IOError

    dataset = Dataset()
    dataset.save()
    call_async(import_dataset, dataset, file_reader=partial(_file_reader, url))

    return dataset
Example #30
    def _update_aggregate_dataset(self, formula, new_dframe, name, groups,
                                  agg_dataset):
        """Update the aggregated dataset built for `self` with `calculation`.

        Proceed with the following steps:

            - delete the rows in this dataset from the parent
            - recalculate aggregated dataframe from aggregation
            - update aggregated dataset with new dataframe and add parent id
            - recur on all merged datasets descending from the aggregated
              dataset

        :param formula: The formula to execute.
        :param new_dframe: The DataFrame to aggregate on.
        :param name: The name of the aggregation.
        :param groups: A column or columns to group on.
        :type groups: String, list of strings, or None.
        :param agg_dataset: The DataSet to store the aggregation in.
        """
        # parse aggregation and build column arguments
        aggregation, new_columns = self.make_columns(
            formula, name, new_dframe)

        agg = Aggregator(self.dataset, self.dframe,
                         groups, aggregation, name)
        new_agg_dframe = agg.update(agg_dataset, self, formula, new_columns)

        # jsondict from new dframe
        new_data = new_agg_dframe.to_jsondict()

        for merged_dataset in agg_dataset.merged_datasets:
            # remove rows in child from this merged dataset
            merged_dataset.remove_parent_observations(
                agg_dataset.dataset_id)

            # calculate updates on the child
            merged_calculator = Calculator(merged_dataset)
            call_async(merged_calculator.calculate_updates, merged_calculator,
                       new_data, parent_dataset_id=agg_dataset.dataset_id)
Example #31
    def _update_merged_datasets(self, new_data, labels_to_slugs):
        # store slugs as labels for child datasets
        slugified_data = []

        if not isinstance(new_data, list):
            new_data = [new_data]

        for row in new_data:
            # iterate over a copy of the items so the row can be mutated safely
            for key, value in row.items():
                if labels_to_slugs.get(key) and key not in MONGO_RESERVED_KEYS:
                    del row[key]
                    row[labels_to_slugs[key]] = value

            slugified_data.append(row)

        # update the merged datasets with new_dframe
        for merged_dataset in self.dataset.merged_datasets:
            merged_calculator = Calculator(merged_dataset)
            call_async(merged_calculator.calculate_updates,
                       merged_calculator,
                       slugified_data,
                       parent_dataset_id=self.dataset.dataset_id)
Example #32
def merge_dataset_ids(dataset_ids, mapping):
    """Load a JSON array of dataset IDs and start a background merge task.

    :param dataset_ids: An array of dataset IDs to merge.

    :raises: `MergeError` if fewer than 2 datasets are provided. If a dataset
        cannot be found for a dataset ID it is ignored. Therefore, if 2 dataset
        IDs are provided and one of them is bad, an error is raised; however,
        if 3 dataset IDs are provided and one of them is bad, no error is
        raised.
    """
    datasets = [Dataset.find_one(dataset_id) for dataset_id in dataset_ids]
    datasets = [dataset for dataset in datasets if dataset.record]

    if len(datasets) < 2:
        raise MergeError('merge requires 2 datasets (found %s)' %
                         len(datasets))

    new_dataset = Dataset.create()

    call_async(__merge_datasets_task, new_dataset, datasets, mapping)

    return new_dataset
Example #33
    def create(cls, dataset, formula, name, group=None):
        calculation = super(cls, cls).create(dataset, formula, name, group)
        call_async(calculate_task, [calculation], dataset.clear_cache())

        return calculation
Example #34
    def create(cls, dataset, formula, name, group=None):
        calculation = super(cls, cls).create(dataset, formula, name, group)
        call_async(calculate_task, [calculation], dataset.clear_cache())
        return calculation
Example #35
    def update_observation(self, index, data):
        # check that update is valid
        dframe_from_update(self, [data])
        Observation.update(self, index, data)
        call_async(propagate, self, update={'edit': [index, data]})
Example #36
    def delete(self, countdown=0):
        """Delete this dataset."""
        call_async(delete_task, self, countdown=countdown)
Example #37
File: dataset.py Project: j/bamboo
    def delete(self, countdown=0):
        """Delete this dataset.

        :param countdown: Delete dataset after this number of seconds.
        """
        call_async(delete_task, self, countdown=countdown)
Example #38
    def update_observation(self, index, data):
        # check that update is valid
        dframe_from_update(self, [data])
        Observation.update(self, index, data)
        call_async(propagate, self, update={'edit': [index, data]})