Example #1
File: io.py Project: helioid/bamboo
def create_dataset_from_csv(csv_file):
    """Create a dataset from a CSV file.

    .. note::

        Write to a named tempfile in order to get a handle for pandas'
        `read_csv` function.

    :param csv_file: The CSV file to create a dataset from.

    :returns: The created dataset.
    """
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.write(csv_file.file.read())

    # pandas needs a closed file for *read_csv*
    tmpfile.close()

    dataset = Dataset()
    dataset.save()

    call_async(import_dataset, dataset,
               file_reader=partial(_file_reader, tmpfile.name, delete=True))

    return dataset
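The note in the docstring is the crux here: the uploaded `csv_file` only exposes a file-like `read()`, while pandas' `read_csv` wants a path it can open itself. A minimal standalone sketch of that hand-off, where `upload` is a hypothetical stand-in for the request's file wrapper:

import os
import tempfile

import pandas as pd


def csv_upload_to_dframe(upload):
    # Spool the upload to a named temp file so pandas has a real path to open.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.write(upload.read())
    tmp.close()  # close first so read_csv sees the fully flushed file

    try:
        return pd.read_csv(tmp.name)
    finally:
        os.unlink(tmp.name)  # mirrors the delete=True passed to _file_reader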
Example #2
def merge_dataset_ids(dataset_ids):
    """Load a JSON array of dataset IDs and start a background merge task.

    :param dataset_ids: An array of dataset IDs to merge.

    :raises: `MergeError` if fewer than 2 valid datasets are provided. If a
        dataset cannot be found for a dataset ID, that ID is ignored.
        Therefore, if 2 dataset IDs are provided and one of them is bad, an
        error is raised; however, if three dataset IDs are provided and one
        of them is bad, no error is raised.
    """
    dataset_ids = json.loads(dataset_ids)
    datasets = [Dataset.find_one(dataset_id) for dataset_id in dataset_ids]
    datasets = [dataset for dataset in datasets if dataset.record]

    if len(datasets) < 2:
        raise MergeError(
            'merge requires 2 datasets (found %s)' % len(datasets))

    new_dataset = Dataset()
    new_dataset.save()

    call_async(_merge_datasets_task, new_dataset, datasets)

    return new_dataset
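The behavior described under `:raises:` is easiest to see at the call site. A sketch with hypothetical IDs (`MergeError` is the exception used above):

import json

# Two IDs, one unknown: only one dataset survives the record filter, so the
# length check fails and MergeError is raised.
try:
    merge_dataset_ids(json.dumps(['good-id', 'bad-id']))
except MergeError:
    pass  # 'merge requires 2 datasets (found 1)'

# Three IDs, one unknown: two datasets survive, so the merge proceeds and
# the bad ID is silently dropped.
merged = merge_dataset_ids(json.dumps(['good-id-1', 'good-id-2', 'bad-id']))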
Example #3
def test_update(self):
    for dataset_name in self.TEST_DATASETS:
        dataset = Dataset()
        dataset.save(self.test_dataset_ids[dataset_name])
        self.assertFalse('field' in dataset.record)
        dataset.update({'field': {'key': 'value'}})
        dataset = Dataset.find_one(self.test_dataset_ids[dataset_name])
        self.assertTrue('field' in dataset.record)
        self.assertEqual(dataset.record['field'], {'key': 'value'})
Example #4
def test_delete(self):
    for dataset_name in self.TEST_DATASETS:
        record = Dataset()
        record.save(self.test_dataset_ids[dataset_name])
        records = Dataset.find(self.test_dataset_ids[dataset_name])
        self.assertNotEqual(records, [])
        record.delete()
        records = Dataset.find(self.test_dataset_ids[dataset_name])
        self.assertEqual(records, [])
Example #5
File: io.py Project: helioid/bamboo
def create_dataset_from_json(json_file):
    """Create a dataset from a JSON file.

    :param json_file: The JSON file to create a dataset from.

    :returns: The created dataset.
    """
    content = json_file.file.read()

    dataset = Dataset()
    dataset.save()

    def file_reader(content):
        return pd.DataFrame(json.loads(content))

    call_async(import_dataset, dataset,
               file_reader=partial(file_reader, content))

    return dataset
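Note the `partial(file_reader, content)` line: `call_async` defers the import, so the request body is bound into a zero-argument callable now and read back later, after the HTTP request is gone. The same pattern in isolation (the payload is illustrative, not bamboo's API):

import json
from functools import partial

import pandas as pd


def file_reader(content):
    return pd.DataFrame(json.loads(content))

# Bind the payload now; whoever runs the task later just calls reader().
reader = partial(file_reader, '[{"a": 1}, {"a": 2}]')
dframe = reader()  # DataFrame with a single column 'a'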
Example #6
class TestCalculator(TestBase):

    def setUp(self):
        TestBase.setUp(self)
        self.dataset = Dataset()
        self.dataset.save(
            self.test_dataset_ids['good_eats_with_calculations.csv'])
        dframe = recognize_dates(
            self.get_data('good_eats_with_calculations.csv'))
        self.dataset.save_observations(dframe)
        self.group = None
        self.parser = Parser(self.dataset)
        self.places = 5

    def _equal_msg(self, calculated, stored, formula):
        return '(calculated %s) %s != (stored %s) %s ' % (type(calculated),
               calculated, type(stored), stored) +\
            '(within %s places), formula: %s' % (self.places, formula)

    def _test_calculator(self):
        self.dframe = self.dataset.dframe()
        row = self.dframe.irow(0)

        columns = self.dframe.columns.tolist()
        self.start_num_cols = len(columns)
        self.added_num_cols = 0

        column_labels_to_slugs = {
            column_attrs[Dataset.LABEL]: (column_name) for
            (column_name, column_attrs) in self.dataset.schema.items()
        }
        self.label_list, self.slugified_key_list = [
            list(ary) for ary in zip(*column_labels_to_slugs.items())
        ]

        for idx, formula in enumerate(self.calculations):
            name = 'test-%s' % idx
            self.parser.validate_formula(formula, row)

            calculator = Calculator(self.dataset)

            groups = self.dataset.split_groups(self.group)
            calculation = Calculation()
            calculation.save(self.dataset, formula, name, self.group)
            calculator.calculate_columns([calculation])

            self.column_labels_to_slugs = self.dataset.schema.labels_to_slugs

            self._test_calculation_results(name, formula)
Example #7
    def test_dframe(self):
        dataset = Dataset()
        dataset.save(self.test_dataset_ids['good_eats.csv'])
        dataset.save_observations(
            recognize_dates(self.get_data('good_eats.csv')))
        records = [x for x in Observation.find(dataset)]
        dframe = dataset.dframe()

        self.assertTrue(isinstance(dframe, DataFrame))
        self.assertTrue(all(self.get_data('good_eats.csv').reindex(
                        columns=dframe.columns).eq(dframe)))
        columns = dframe.columns
        # ensure no reserved keys
        for key in MONGO_RESERVED_KEY_STRS:
            self.assertFalse(key in columns)
        # ensure date is converted
        self.assertTrue(isinstance(dframe.submit_date[0], datetime))
Example #8
class TestCalculator(TestBase):
    def setUp(self):
        TestBase.setUp(self)
        self.dataset = Dataset()
        self.dataset.save(
            self.test_dataset_ids['good_eats_with_calculations.csv'])
        dframe = recognize_dates(
            self.get_data('good_eats_with_calculations.csv'))
        self.dataset.save_observations(dframe)
        self.group = None
        self.places = 5

    def _equal_msg(self, calculated, stored, formula):
        return '(calculated %s) %s != (stored %s) %s ' % (type(calculated),
               calculated, type(stored), stored) +\
            '(within %s places), formula: %s' % (self.places, formula)

    def _test_calculator(self):
        self.dframe = self.dataset.dframe()

        columns = self.dframe.columns.tolist()
        self.start_num_cols = len(columns)
        self.added_num_cols = 0

        column_labels_to_slugs = {
            column_attrs[Dataset.LABEL]: (column_name)
            for (column_name, column_attrs) in self.dataset.schema.items()
        }
        self.label_list, self.slugified_key_list = [
            list(ary) for ary in zip(*column_labels_to_slugs.items())
        ]

        for idx, formula in enumerate(self.calculations):
            name = 'test-%s' % idx

            Parser.validate_formula(formula, self.dataset)

            calculation = Calculation()
            calculation.save(self.dataset, formula, name, self.group)
            self.now = now()
            calculate_columns(self.dataset, [calculation])

            self.column_labels_to_slugs = self.dataset.schema.labels_to_slugs

            self._test_calculation_results(name, formula)
Example #9
File: io.py Project: helioid/bamboo
def create_dataset_from_url(url, allow_local_file=False):
    """Load a URL, read from a CSV, create a dataset and return the unique ID.

    :param url: URL to load file from.
    :param allow_local_file: Allow URL to refer to a local file.

    :raises: `IOError` for an unreadable file or a bad URL.

    :returns: The created dataset.
    """
    if not allow_local_file and isinstance(url, basestring)\
            and url[0:4] == 'file':
        raise IOError

    dataset = Dataset()
    dataset.save()
    call_async(import_dataset, dataset, file_reader=partial(_file_reader, url))

    return dataset
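A quick sketch of the guard in use (the URLs are illustrative):

create_dataset_from_url('http://example.com/data.csv')  # imported async
create_dataset_from_url('file:///etc/passwd')           # raises IOError

# Local files are only allowed when explicitly requested, e.g. in tests.
create_dataset_from_url('file:///tmp/local.csv', allow_local_file=True)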
Example #10
File: io.py Project: helioid/bamboo
def create_dataset_from_schema(schema):
    """Create a dataset from a SDF schema file (JSON).

    :param schema: The SDF (JSON) file to create a dataset from.

    :returns: The created dataset.
    """
    try:
        schema = json.loads(schema.file.read())
    except AttributeError:
        schema = json.loads(schema)

    dataset = Dataset()
    dataset.save()
    dataset.set_schema(schema)

    call_async(import_dataset, dataset)

    return dataset
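The `try`/`except AttributeError` here is duck typing: the same argument may be an uploaded file wrapper (with a `.file` attribute) or a raw JSON string. Reduced to its skeleton, with a hypothetical helper name:

import json


def load_schema(schema):
    try:
        # Uploaded file wrapper: has a .file attribute exposing read().
        return json.loads(schema.file.read())
    except AttributeError:
        # Plain string: parse it directly.
        return json.loads(schema)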
Example #11
    def __create_or_update(self, url=None, csv_file=None, json_file=None,
                           schema=None, na_values=[], perish=0,
                           dataset_id=None):
        result = None
        error = 'url, csv_file or schema required'

        try:
            if schema or url or csv_file or json_file:
                if dataset_id is None:
                    dataset = Dataset()
                    dataset.save()
                else:
                    dataset = Dataset.find_one(dataset_id)
                    Observation.delete_all(dataset)

                if schema:
                    dataset.import_schema(schema)

                na_values = safe_json_loads(na_values)

                if url:
                    dataset.import_from_url(url, na_values=na_values)
                elif csv_file:
                    dataset.import_from_csv(csv_file, na_values=na_values)
                elif json_file:
                    dataset.import_from_json(json_file)

                result = {Dataset.ID: dataset.dataset_id}

            perish = parse_int(perish)
            perish and dataset.delete(countdown=perish)
        except urllib2.URLError:
            error = 'could not load: %s' % url
        except IOError:
            error = 'could not get a filehandle for: %s' % csv_file
        except JSONError as e:
            error = e.__str__()

        self.set_response_params(result, success_status_code=201)

        return self._dump_or_error(result, error)
Example #12
    def test_build_schema(self):
        illegal_col_regex = re.compile(r'\W')

        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset()
            dataset.save(self.test_dataset_ids[dataset_name])
            dataset.build_schema(self.get_data(dataset_name))

            # get dataset with new schema
            dataset = Dataset.find_one(self.test_dataset_ids[dataset_name])

            for key in [
                    Dataset.CREATED_AT, Dataset.SCHEMA, Dataset.UPDATED_AT]:
                self.assertTrue(key in dataset.record.keys())

            df_columns = self.get_data(dataset_name).columns.tolist()
            seen_columns = []

            for column_name, column_attributes in dataset.schema.items():
                # check column_name is unique
                self.assertFalse(column_name in seen_columns)
                seen_columns.append(column_name)

                # check column name is only legal chars
                self.assertFalse(illegal_col_regex.search(column_name))

                # check has required attributes
                self.assertTrue(SIMPLETYPE in column_attributes)
                self.assertTrue(OLAP_TYPE in column_attributes)
                self.assertTrue(Dataset.LABEL in column_attributes)

                # check label is an original column
                self.assertTrue(column_attributes[Dataset.LABEL] in df_columns)
                df_columns.remove(column_attributes[Dataset.LABEL])

                # check not reserved key
                self.assertFalse(column_name in MONGO_RESERVED_KEY_STRS)

            # ensure every original column has a corresponding stored column
            self.assertTrue(len(df_columns) == 0)
Example #13
class TestCalculation(TestBase):

    def setUp(self):
        TestBase.setUp(self)
        self.dataset = Dataset()
        self.dataset.save(self.test_dataset_ids['good_eats.csv'])
        self.formula = 'rating'
        self.name = 'test'

    def _save_calculation(self, formula):
        if not formula:
            formula = self.formula
        return self.calculation.save(self.dataset, formula, self.name)

    def _save_observations(self):
        self.dataset.save_observations(self.get_data('good_eats.csv'))

    def _save_observations_and_calculation(self, formula=None):
        self._save_observations()
        self.calculation = Calculation()
        return self._save_calculation(formula)

    def test_save(self):
        record = self._save_observations_and_calculation()
        self.assertTrue(isinstance(record, dict))
        self.assertTrue(Calculation.FORMULA in record.keys())
        self.assertTrue(Calculation.STATE in record.keys())
        record = Calculation.find(self.dataset)[0].record
        self.assertEqual(record[Calculation.STATE], Calculation.STATE_READY)
        self.assertTrue(Calculation(record).is_ready)

    def test_save_set_status(self):
        record = self._save_observations_and_calculation()
        self.assertTrue(isinstance(record, dict))
        self.assertTrue(Calculation.FORMULA in record.keys())

    def test_save_set_aggregation(self):
        record = self._save_observations_and_calculation('max(amount)')
        calculation = Calculation.find(self.dataset)[0]
        self.assertEqual('max', calculation.aggregation)

    def test_save_set_aggregation_id(self):
        record = self._save_observations_and_calculation('max(amount)')
        agg_id = self.dataset.aggregated_datasets_dict['']
        calculation = Calculation.find(self.dataset)[0]
        self.assertEqual(agg_id, calculation.aggregation_id)

    def test_save_improper_formula(self):
        assert_raises(ParseError, self._save_observations_and_calculation,
                      'NON_EXISTENT_COLUMN')
        try:
            self._save_observations_and_calculation('NON_EXISTENT_COLUMN')
        except ParseError as e:
            self.assertTrue('Missing column' in e.__str__())

    def test_save_unparsable_formula(self):
        assert_raises(ParseError, self._save_observations_and_calculation,
                      '=NON_EXISTENT_COLUMN')
        try:
            self._save_observations_and_calculation(
                '=NON_EXISTENT_COLUMN')
        except ParseError as e:
            self.assertTrue('Parse Failure' in e.__str__())

    def test_save_improper_formula_no_data(self):
        assert_raises(ParseError, Calculation().save, self.dataset,
                      'NON_EXISTENT_COLUMN', self.name)
        try:
            Calculation().save(self.dataset, 'NON_EXISTENT_COLUMN',
                               self.name)
        except ParseError as e:
            self.assertTrue('No schema' in e.__str__())

    def test_save_unparsable_formula_no_data(self):
        assert_raises(ParseError, Calculation().save, self.dataset,
                      '=NON_EXISTENT_COLUMN', self.name)
        try:
            Calculation().save(self.dataset, '=NON_EXISTENT_COLUMN',
                               self.name)
        except ParseError as e:
            self.assertTrue('Parse Failure' in e.__str__())

    def test_save_non_existent_group(self):
        self._save_observations()
        assert_raises(ParseError, Calculation().save, self.dataset,
                      self.formula, self.name, group_str='NON_EXISTENT_GROUP')
        try:
            Calculation().save(self.dataset, self.formula, self.name,
                               group_str='NON_EXISTENT_GROUP')
        except ParseError as e:
            self.assertTrue('Group' in e.__str__())

    def test_find(self):
        record = self._save_observations_and_calculation()
        rows = Calculation.find(self.dataset)
        new_record = rows[0].record
        status = new_record.pop(Calculation.STATE)
        self.assertEqual(status, Calculation.STATE_READY)

    def test_sets_dependent_calculations(self):
        record = self._save_observations_and_calculation()
        self.name = 'test1'
        record = self._save_calculation('test')
        calculation = Calculation.find_one(self.dataset.dataset_id, 'test')
        self.assertEqual(calculation.dependent_calculations, ['test1'])

    def test_removes_dependent_calculations(self):
        record = self._save_observations_and_calculation()
        self.name = 'test1'
        record = self._save_calculation('test')
        calculation = Calculation.find_one(self.dataset.dataset_id, 'test')
        self.assertEqual(calculation.dependent_calculations, ['test1'])
        calculation = Calculation.find_one(self.dataset.dataset_id, 'test1')
        calculation.delete(self.dataset)
        calculation = Calculation.find_one(self.dataset.dataset_id, 'test')
        self.assertEqual(calculation.dependent_calculations, [])

    def test_disallow_delete_dependent_calculation(self):
        record = self._save_observations_and_calculation()
        self.name = 'test1'
        record = self._save_calculation('test')
        calculation = Calculation.find_one(self.dataset.dataset_id, 'test')
        self.assertEqual(calculation.dependent_calculations, ['test1'])
        calculation = Calculation.find_one(self.dataset.dataset_id, 'test')
        assert_raises(DependencyError, calculation.delete, self.dataset)
Example #14
class TestObservation(TestBase):

    def setUp(self):
        TestBase.setUp(self)
        self.dataset = Dataset()
        self.dataset.save(self.test_dataset_ids['good_eats.csv'])
        self.query_args = QueryArgs({"rating": "delectible"})

    def _save_records(self):
        Observation.save(self.get_data('good_eats.csv'),
                         self.dataset)
        records = Observation.find(self.dataset)
        self.assertTrue(isinstance(records, list))
        self.assertTrue(isinstance(records[0], dict))
        self.assertTrue('_id' in records[0].keys())
        return records

    def _save_observations(self):
        return Observation.save(
            recognize_dates(self.get_data('good_eats.csv')), self.dataset)

    def test_save(self):
        records = self._save_records()
        self.assertEqual(len(records), 19)

    def test_save_over_bulk(self):
        Observation.save(self.get_data('good_eats_large.csv'),
                         self.dataset)
        records = Observation.find(self.dataset)
        self.assertEqual(len(records), 1001)

    def test_find(self):
        self._save_observations()
        rows = Observation.find(self.dataset)
        self.assertTrue(isinstance(rows, list))

    def test_find_with_query(self):
        self._save_observations()
        rows = Observation.find(self.dataset, self.query_args)
        self.assertTrue(isinstance(rows, list))

    def test_find_with_select(self):
        self._save_observations()
        query_args = QueryArgs(select={"rating": 1})
        rows = Observation.find(self.dataset, query_args)
        self.assertTrue(isinstance(rows, list))
        self.assertEquals(sorted(rows[0].keys()), ['_id', 'rating'])

    def test_find_with_select_and_query(self):
        self._save_observations()
        self.query_args.select = {"rating": 1}
        rows = Observation.find(self.dataset, self.query_args)
        self.assertTrue(isinstance(rows, list))
        self.assertEquals(sorted(rows[0].keys()), ['_id', 'rating'])

    def test_delete(self):
        self._save_observations()
        records = Observation.find(self.dataset)
        self.assertNotEqual(records, [])
        Observation.delete_all(self.dataset)
        records = [x for x in Observation.find(self.dataset)]
        self.assertEqual(records, [])
Example #15
    def create(self, url=None, csv_file=None, json_file=None, schema=None,
               na_values=[], perish=0):
        """Create a dataset by URL, CSV or schema file.

        If `url` is provided, create a dataset by downloading a CSV from that
        URL. If `url` is not provided and `csv_file` is provided, create a
        dataset with the data in the passed `csv_file`. If both `url` and
        `csv_file` are provided, `csv_file` is ignored. If `schema` is
        supplied, an empty dataset is created with the associated column
        structure.

        .. note::

            The following words are reserved and will be slugified by adding
            underscores (or multiple underscores to ensure uniqueness) if used
            as column names:

                - all
                - and
                - case
                - date
                - default
                - in
                - not
                - or
                - sum
                - years

        :param url: A URL to load a CSV file from. The URL must point to a CSV
            file.
        :param csv_file: An uploaded CSV file to read from.
        :param json_file: An uploaded JSON file to read from.
        :param schema: An SDF schema file (JSON).
        :param na_values: A JSON list of values to interpret as missing data.
        :param perish: Number of seconds after which to delete the dataset.

        :returns: An error message if `url`, `csv_file`, or `schema` are not
            provided. An error message if an improperly formatted value raises
            a ValueError, e.g. an improperly formatted CSV file. An error
            message if the URL could not be loaded. Otherwise returns a JSON
            string with the dataset ID of the newly created dataset.  Note that
            the dataset will not be fully loaded until its state is set to
            ready.
        """
        result = None
        error = 'url, csv_file or schema required'

        try:
            if schema or url or csv_file or json_file:
                dataset = Dataset()
                dataset.save()

                if schema:
                    dataset.import_schema(schema)
                if na_values:
                    na_values = safe_json_loads(na_values)

                if url:
                    dataset.import_from_url(url, na_values=na_values)
                elif csv_file:
                    dataset.import_from_csv(csv_file, na_values=na_values)
                elif json_file:
                    dataset.import_from_json(json_file)

                result = {Dataset.ID: dataset.dataset_id}

            perish = parse_int(perish)
            if perish:
                dataset.delete(countdown=perish)
        except urllib2.URLError:
            error = 'could not load: %s' % url
        except IOError:
            error = 'could not get a filehandle for: %s' % csv_file

        self.set_response_params(result, success_status_code=201)
        return self._dump_or_error(result, error)
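The reserved-word note above implies a rename pass when column names are slugified. A minimal sketch of that rule, assuming nothing about bamboo's actual implementation:

RESERVED_WORDS = frozenset([
    'all', 'and', 'case', 'date', 'default', 'in', 'not', 'or', 'sum',
    'years'])


def slugify_reserved(column, taken):
    # Append underscores until the name is neither reserved nor already used.
    while column in RESERVED_WORDS or column in taken:
        column += '_'
    return column

print(slugify_reserved('sum', set()))     # sum_
print(slugify_reserved('sum', {'sum_'}))  # sum__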
Example #16
class TestObservation(TestBase):

    def setUp(self):
        TestBase.setUp(self)
        self.dataset = Dataset()
        self.dataset.save(self.test_dataset_ids['good_eats.csv'])
        self.query_args = QueryArgs({"rating": "delectible"})

    def __save_records(self):
        Observation.save(self.get_data('good_eats.csv'),
                         self.dataset)
        records = Observation.find(self.dataset)
        self.assertTrue(isinstance(records, list))
        self.assertTrue(isinstance(records[0], dict))
        self.assertTrue('_id' in records[0].keys())

        return records

    def __decode(self, row):
        return Observation.encode(row,
                                  encoding=Observation.decoding(self.dataset))

    def test_encoding(self):
        self.__save_records()
        encoding = Observation.encoding(self.dataset)

        for column in self.dataset.dframe().columns:
            if column == MONGO_ID:
                column = MONGO_ID_ENCODED

            self.assertTrue(column in encoding.keys())

        for v in encoding.values():
            self.assertTrue(isinstance(int(v), int))

    def test_encode_no_dataset(self):
        records = self.__save_records()

        for record in records:
            encoded = Observation.encode(record)
            self.assertEqual(dump_mongo_json(encoded), dump_mongo_json(record))

    def test_save(self):
        records = self.__save_records()
        self.assertEqual(len(records), 19)

    def test_save_over_bulk(self):
        Observation.save(self.get_data('good_eats_large.csv'),
                         self.dataset)
        records = Observation.find(self.dataset)

        self.assertEqual(len(records), 1001)

    def test_find(self):
        self.__save_records()
        rows = Observation.find(self.dataset)

        self.assertTrue(isinstance(rows, list))

    def test_find_with_query(self):
        self.__save_records()
        rows = Observation.find(self.dataset, self.query_args)

        self.assertTrue(isinstance(rows, list))

    def test_find_with_select(self):
        self.__save_records()
        query_args = QueryArgs(select={"rating": 1})
        rows = Observation.find(self.dataset, query_args)

        self.assertTrue(isinstance(rows, list))

        row = self.__decode(rows[0])

        self.assertEquals(sorted(row.keys()), ['_id', 'rating'])

    def test_find_with_select_and_query(self):
        self.__save_records()
        self.query_args.select = {"rating": 1}
        rows = Observation.find(self.dataset, self.query_args)
        self.assertTrue(isinstance(rows, list))

        row = self.__decode(rows[0])

        self.assertEquals(sorted(row.keys()), ['_id', 'rating'])

    def test_delete_all(self):
        self.__save_records()
        records = Observation.find(self.dataset)
        self.assertNotEqual(records, [])
        Observation.delete_all(self.dataset)
        records = Observation.find(self.dataset)

        self.assertEqual(records, [])

    def test_delete_one(self):
        self.__save_records()
        records = Observation.find(self.dataset)
        self.assertNotEqual(records, [])

        row = self.__decode(records[0])

        Observation.delete(self.dataset, row[INDEX])
        new_records = Observation.find(self.dataset)

        # Dump to avoid problems with nan != nan.
        self.assertEqual(dump_mongo_json(records[1:]),
                         dump_mongo_json(new_records))

    def test_delete_encoding(self):
        self.__save_records()
        encoding = Observation.encoding(self.dataset)

        self.assertTrue(isinstance(encoding, dict))

        Observation.delete_encoding(self.dataset)
        encoding = Observation.encoding(self.dataset)

        self.assertEqual(encoding, None)
Example #17
def _create_dataset_from_url(self, url):
    dataset = Dataset()
    dataset.save()
    return import_data_from_url(
        dataset, url, allow_local_file=True).dataset_id
Example #18
def _post_file(self, file_name='good_eats.csv'):
    dataset = Dataset()
    dataset.save()
    return import_data_from_csv(
        dataset,
        self._file_mock(self._fixture_path_prefix(file_name))).dataset_id
Example #19
class TestCalculation(TestBase):
    def setUp(self):
        TestBase.setUp(self)
        self.dataset = Dataset()
        self.dataset.save(self.test_dataset_ids['good_eats.csv'])
        self.formula = 'rating'
        self.name = 'test'

    def _save_calculation(self, formula):
        if not formula:
            formula = self.formula
        return Calculation.create(self.dataset, formula, self.name)

    def _save_observations(self):
        self.dataset.save_observations(self.get_data('good_eats.csv'))

    def _save_observations_and_calculation(self, formula=None):
        self._save_observations()
        return self._save_calculation(formula)

    def test_save(self):
        calculation = self._save_observations_and_calculation()

        self.assertTrue(isinstance(calculation, Calculation))

        record = calculation.record

        self.assertTrue(isinstance(record, dict))
        self.assertTrue(Calculation.FORMULA in record.keys())
        self.assertTrue(Calculation.STATE in record.keys())

        record = Calculation.find(self.dataset)[0].record

        self.assertEqual(record[Calculation.STATE], Calculation.STATE_READY)
        self.assertTrue(Calculation(record).is_ready)

    def test_save_set_status(self):
        record = self._save_observations_and_calculation().record

        self.assertTrue(isinstance(record, dict))
        self.assertTrue(Calculation.FORMULA in record.keys())

    def test_save_set_aggregation(self):
        calculation = self._save_observations_and_calculation('max(amount)')

        self.assertEqual('max', calculation.aggregation)

    def test_save_set_aggregation_id(self):
        calculation = self._save_observations_and_calculation('max(amount)')
        agg_id = self.dataset.aggregated_datasets_dict['']

        self.assertEqual(agg_id, calculation.aggregation_id)

    def test_save_improper_formula(self):
        assert_raises(ParseError, self._save_observations_and_calculation,
                      'NON_EXISTENT_COLUMN')
        try:
            self._save_observations_and_calculation('NON_EXISTENT_COLUMN')
        except ParseError as e:
            self.assertTrue('Missing column' in e.__str__())

    def test_save_unparsable_formula(self):
        assert_raises(ParseError, self._save_observations_and_calculation,
                      '=NON_EXISTENT_COLUMN')
        try:
            self._save_observations_and_calculation('=NON_EXISTENT_COLUMN')
        except ParseError as e:
            self.assertTrue('Parse Failure' in e.__str__())

    def test_save_improper_formula_no_data(self):
        assert_raises(ParseError,
                      Calculation().save, self.dataset, 'NON_EXISTENT_COLUMN',
                      self.name)
        try:
            Calculation().save(self.dataset, 'NON_EXISTENT_COLUMN', self.name)
        except ParseError as e:
            self.assertTrue('No schema' in e.__str__())

    def test_save_unparsable_formula_no_data(self):
        assert_raises(ParseError,
                      Calculation().save, self.dataset, '=NON_EXISTENT_COLUMN',
                      self.name)
        try:
            Calculation().save(self.dataset, '=NON_EXISTENT_COLUMN', self.name)
        except ParseError as e:
            self.assertTrue('Parse Failure' in e.__str__())

    def test_save_non_existent_group(self):
        self._save_observations()
        assert_raises(ParseError,
                      Calculation().save,
                      self.dataset,
                      self.formula,
                      self.name,
                      group_str='NON_EXISTENT_GROUP')
        try:
            Calculation().save(self.dataset,
                               self.formula,
                               self.name,
                               group_str='NON_EXISTENT_GROUP')
        except ParseError as e:
            self.assertTrue('Group' in e.__str__())

    def test_find(self):
        self._save_observations_and_calculation()
        rows = Calculation.find(self.dataset)
        new_record = rows[0].record
        status = new_record.pop(Calculation.STATE)
        self.assertEqual(status, Calculation.STATE_READY)

    def test_sets_dependent_calculations(self):
        self._save_observations_and_calculation()
        self.name = 'test1'
        self._save_calculation('test')
        calculation = Calculation.find_one(self.dataset.dataset_id, 'test')
        self.assertEqual(calculation.dependent_calculations, ['test1'])

    def test_removes_dependent_calculations(self):
        self._save_observations_and_calculation()
        self.name = 'test1'
        self._save_calculation('test')
        calculation = Calculation.find_one(self.dataset.dataset_id, 'test')
        self.assertEqual(calculation.dependent_calculations, ['test1'])
        calculation = Calculation.find_one(self.dataset.dataset_id, 'test1')
        calculation.delete(self.dataset)
        calculation = Calculation.find_one(self.dataset.dataset_id, 'test')
        self.assertEqual(calculation.dependent_calculations, [])

    def test_disallow_delete_dependent_calculation(self):
        self._save_observations_and_calculation()
        self.name = 'test1'
        self._save_calculation('test')
        calculation = Calculation.find_one(self.dataset.dataset_id, 'test')
        self.assertEqual(calculation.dependent_calculations, ['test1'])
        calculation = Calculation.find_one(self.dataset.dataset_id, 'test')
        assert_raises(DependencyError, calculation.delete, self.dataset)