Example #1
    def test_create_from_schema_and_update(self):
        self._upload_good_eats_schema()
        results = json.loads(self.controller.show(self.dataset_id))

        self.assertFalse(len(results))

        dataset = Dataset.find_one(self.dataset_id)

        self.assertEqual(dataset.num_rows, 0)

        old_schema = dataset.schema
        self._put_row_updates()
        results = json.loads(self.controller.show(self.dataset_id))

        self.assertTrue(len(results))

        for result in results:
            self.assertTrue(isinstance(result, dict))
            self.assertTrue(len(result.keys()))

        dataset = Dataset.find_one(self.dataset_id)

        self.assertEqual(dataset.num_rows, 1)

        new_schema = dataset.schema

        self.assertEqual(set(old_schema.keys()), set(new_schema.keys()))

        for column in new_schema.keys():
            if new_schema.cardinality(column):
                self.assertEqual(new_schema.cardinality(column), 1)
Example #2
    def test_merge_datasets_async(self):
        dataset_id1 = self._post_file()
        dataset_id2 = self._post_file()

        self.assertEqual(
            Dataset.find_one(dataset_id1).state,
            Dataset.STATE_PENDING)
        self.assertEqual(
            Dataset.find_one(dataset_id2).state,
            Dataset.STATE_PENDING)

        result = json.loads(self.controller.merge(
            datasets=json.dumps([dataset_id1, dataset_id2])))

        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)

        merged_id = result[Dataset.ID]

        # wait for background tasks to finish
        while True:
            results1 = json.loads(self.controller.show(dataset_id1))
            results2 = json.loads(self.controller.show(dataset_id2))
            results3 = json.loads(self.controller.show(merged_id))

            if all([len(res) for res in [results1, results2, results3]]):
                break

            sleep(self.SLEEP_DELAY)

        while True:
            datasets = [Dataset.find_one(dataset_id)
                        for dataset_id in [dataset_id1, dataset_id2]]

            if all([dataset.is_ready for dataset in datasets]):
                break

            sleep(self.SLEEP_DELAY)

        for dataset in datasets:
            self.assertTrue(merged_id in dataset.merged_dataset_ids)

        dframe1 = datasets[0].dframe()
        merged_dataset = Dataset.find_one(merged_id)
        merged_rows = merged_dataset.observations()

        for row in merged_rows:
            self.assertTrue(PARENT_DATASET_ID in row.keys())

        merged_dframe = merged_dataset.dframe()

        self.assertEqual(len(merged_dframe), 2 * len(dframe1))

        expected_dframe = concat([dframe1, dframe1],
                                 ignore_index=True)

        self.assertEqual(list(merged_dframe.columns),
                         list(expected_dframe.columns))

        self._check_dframes_are_equal(merged_dframe, expected_dframe)
Example #3
    def test_update_after_agg(self):
        dataset_id = self._post_file('wp_data.csv')
        results = json.loads(self.controller.create(dataset_id,
                             'newest(submit_date,wp_id)', 'wp_newest'))

        dataset = Dataset.find_one(dataset_id)
        previous_num_rows = dataset.num_rows

        self.assertTrue(self.controller.SUCCESS in results)
        self.assertFalse(dataset.aggregated_dataset('') is None)

        update = {
            'submit_date': '2013-01-05',
            'wp_id': 'D',
            'functional': 'no',
        }
        self.__post_update(dataset_id, update)
        update = {
            'wp_id': 'E',
            'functional': 'no',
        }
        self.__post_update(dataset_id, update)

        dataset = Dataset.find_one(dataset_id)
        current_num_rows = dataset.num_rows
        agg_df = dataset.aggregated_dataset('').dframe()

        self.assertEqual(agg_df.get_value(0, 'wp_newest'), 'D')
        self.assertEqual(current_num_rows, previous_num_rows + 2)
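The `newest(submit_date,wp_id)` aggregation above reduces the dataset to the `wp_id` found on the row with the latest `submit_date`, which is why the test expects 'D' after the updates. As a rough sketch of that semantics only, in pandas (the helper below is invented for illustration and is not bamboo's implementation):

import pandas as pd


def newest(df, order_col, value_col, group_col=None):
    # Sort so the row with the greatest order_col comes last, then take
    # value_col from that row (one value per group if group_col is given).
    ordered = df.sort_values(order_col)

    if group_col is None:
        return ordered[value_col].iloc[-1]

    return ordered.groupby(group_col)[value_col].last()


df = pd.DataFrame({'submit_date': ['2013-01-02', '2013-01-05'],
                   'wp_id': ['C', 'D']})
assert newest(df, 'submit_date', 'wp_id') == 'D'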
Example #4
    def test_find(self):
        for dataset_name in self.TEST_DATASETS:
            record = Dataset().save(self.test_dataset_ids[dataset_name])
            rows = Dataset.find(self.test_dataset_ids[dataset_name])
            self.assertEqual(record, rows[0].record)
            self.assertEqual(record, Dataset.find_one(
                self.test_dataset_ids[dataset_name]).record)
Example #5
    def test_merge_datasets(self):
        dataset_id1 = self._post_file()
        dataset_id2 = self._post_file()
        result = json.loads(
            self.controller.merge(
                dataset_ids=json.dumps([dataset_id1, dataset_id2])))

        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)

        datasets = [
            Dataset.find_one(dataset_id)
            for dataset_id in [dataset_id1, dataset_id2]
        ]

        for dataset in datasets:
            self.assertTrue(result[Dataset.ID] in dataset.merged_dataset_ids)

        dframe1 = datasets[0].dframe()
        merged_dataset = Dataset.find_one(result[Dataset.ID])
        merged_dframe = merged_dataset.dframe(keep_parent_ids=True)

        for _, row in merged_dframe.iterrows():
            self.assertTrue(PARENT_DATASET_ID in row.keys())

        merged_dframe = merged_dataset.dframe()

        self.assertEqual(len(merged_dframe), 2 * len(dframe1))

        expected_dframe = concat([dframe1, dframe1], ignore_index=True)

        self.assertEqual(list(merged_dframe.columns),
                         list(expected_dframe.columns))

        self._check_dframes_are_equal(merged_dframe, expected_dframe)
Example #6
File: io.py Project: helioid/bamboo
def create_dataset_from_csv(csv_file):
    """Create a dataset from a CSV file.

    .. note::

        Write to a named tempfile in order to get a handle for pandas'
        `read_csv` function.

    :param csv_file: The CSV file to create a dataset from.

    :returns: The created dataset.
    """
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.write(csv_file.file.read())

    # pandas needs a closed file for *read_csv*
    tmpfile.close()

    dataset = Dataset()
    dataset.save()

    call_async(import_dataset, dataset,
               file_reader=partial(_file_reader, tmpfile.name, delete=True))

    return dataset
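The `_file_reader` callable handed to `call_async` is not shown in this excerpt. A plausible sketch of such a helper, assuming it simply reads the CSV back with pandas and optionally removes the tempfile (the real helper in bamboo's io.py may differ, e.g. by recognizing date columns):

import os

import pandas as pd


def _file_reader(name, delete=False):
    # Load the dataset back from the named tempfile.
    dframe = pd.read_csv(name)

    # Remove the tempfile once its contents are in memory.
    if delete:
        os.unlink(name)

    return dframe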
Example #7
    def test_join_datasets_different_columns(self):
        left_dataset_id = self._post_file()
        right_dataset_id = self._post_file('good_eats_aux_join.csv')
        on_lhs = 'food_type'
        on_rhs = 'also_food_type'
        on = '%s,%s' % (on_lhs, on_rhs)
        results = json.loads(self.controller.join(
            left_dataset_id, right_dataset_id, on=on))

        self.assertTrue(isinstance(results, dict))
        self.assertTrue(Datasets.SUCCESS in results.keys())
        self.assertTrue(Dataset.ID in results.keys())

        joined_dataset_id = results[Dataset.ID]
        data = json.loads(self.controller.show(joined_dataset_id))

        self.assertTrue('code' in data[0].keys())

        left_dataset = Dataset.find_one(left_dataset_id)
        right_dataset = Dataset.find_one(right_dataset_id)

        self.assertEqual([('right', right_dataset_id, on, joined_dataset_id)],
                         left_dataset.joined_dataset_ids)
        self.assertEqual([('left', left_dataset_id, on, joined_dataset_id)],
                         right_dataset.joined_dataset_ids)
Example #8
    def test_create_two_from_schema_and_join(self):
        self._upload_good_eats_schema()
        left_dataset_id = self.dataset_id

        schema = open('tests/fixtures/good_eats_aux.schema.json')
        mock_uploaded_file = MockUploadedFile(schema)
        result = json.loads(
            self.controller.create(schema=mock_uploaded_file))
        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)
        right_dataset_id = result[Dataset.ID]

        on = 'food_type'
        dataset_id_tuples = [
            (left_dataset_id, right_dataset_id),
            (right_dataset_id, left_dataset_id),
        ]

        for dataset_ids in dataset_id_tuples:
            result = json.loads(self.controller.join(*dataset_ids, on=on))
            expected_schema_keys = set(sum([
                Dataset.find_one(dataset_id).schema.keys()
                for dataset_id in dataset_ids], []))

            self.assertTrue(isinstance(result, dict))
            self.assertTrue(Dataset.ID in result)
            merge_dataset_id = result[Dataset.ID]
            dataset = Dataset.find_one(merge_dataset_id)
            self.assertEqual(dataset.num_rows, 0)
            self.assertEqual(dataset.num_columns, len(expected_schema_keys))
            schema_keys = set(dataset.schema.keys())
            self.assertEqual(schema_keys, expected_schema_keys)
Example #9
    def test_find_one(self):
        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset.create(self.test_dataset_ids[dataset_name])
            record = dataset.record
            row = Dataset.find_one(self.test_dataset_ids[dataset_name])

            self.assertEqual(record, row.record)
Example #10
def merge_dataset_ids(dataset_ids):
    """Load a JSON array of dataset IDs and start a background merge task.

    :param dataset_ids: An array of dataset IDs to merge.

    :raises: `MergeError` if fewer than 2 datasets are found. A dataset ID
        for which no dataset can be found is ignored, so if 2 dataset IDs
        are provided and one of them is bad, an error is raised; but if 3
        are provided and only one is bad, no error is raised.
    """
    dataset_ids = json.loads(dataset_ids)
    datasets = [Dataset.find_one(dataset_id) for dataset_id in dataset_ids]
    datasets = [dataset for dataset in datasets if dataset.record]

    if len(datasets) < 2:
        raise MergeError(
            'merge requires 2 datasets (found %s)' % len(datasets))

    new_dataset = Dataset()
    new_dataset.save()

    call_async(_merge_datasets_task, new_dataset, datasets)

    return new_dataset
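A short usage sketch of the semantics described in the docstring, assuming `id1` and `id2` are IDs of existing datasets ('no-such-id' is deliberately bogus):

import json

# Both IDs resolve: a placeholder dataset is returned immediately and the
# actual merge runs in the background task.
merged = merge_dataset_ids(json.dumps([id1, id2]))

# The bad ID is silently dropped, leaving a single dataset, so this raises.
try:
    merge_dataset_ids(json.dumps([id1, 'no-such-id']))
except MergeError:
    pass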
Example #11
    def test_find_one(self):
        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset.create(self.test_dataset_ids[dataset_name])
            record = dataset.record
            row = Dataset.find_one(self.test_dataset_ids[dataset_name])

            self.assertEqual(record, row.record)
Example #12
    def test_merge_datasets(self):
        dataset_id1 = self._post_file()
        dataset_id2 = self._post_file()
        result = json.loads(self.controller.merge(
            dataset_ids=json.dumps([dataset_id1, dataset_id2])))

        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)

        datasets = [Dataset.find_one(dataset_id)
                    for dataset_id in [dataset_id1, dataset_id2]]

        for dataset in datasets:
            self.assertTrue(result[Dataset.ID] in dataset.merged_dataset_ids)

        dframe1 = datasets[0].dframe()
        merged_dataset = Dataset.find_one(result[Dataset.ID])
        merged_rows = merged_dataset.observations()

        for row in merged_rows:
            self.assertTrue(PARENT_DATASET_ID in row.keys())

        merged_dframe = merged_dataset.dframe()

        self.assertEqual(len(merged_dframe), 2 * len(dframe1))

        expected_dframe = concat([dframe1, dframe1],
                                 ignore_index=True)

        self.assertEqual(list(merged_dframe.columns),
                         list(expected_dframe.columns))

        self._check_dframes_are_equal(merged_dframe, expected_dframe)
Example #13
    def test_create_one_from_schema_and_join(self):
        self._upload_good_eats_schema()
        left_dataset_id = self.dataset_id
        right_dataset_id = self._post_file('good_eats_aux.csv')

        on = 'food_type'
        dataset_id_tuples = [
            (left_dataset_id, right_dataset_id),
            (right_dataset_id, left_dataset_id),
        ]

        for dataset_ids in dataset_id_tuples:
            result = json.loads(self.controller.join(*dataset_ids, on=on))
            expected_schema_keys = set(sum([
                Dataset.find_one(dataset_id).schema.keys()
                for dataset_id in dataset_ids], []))

            self.assertTrue(isinstance(result, dict))
            self.assertTrue(Dataset.ID in result)
            merge_dataset_id = result[Dataset.ID]
            dataset = Dataset.find_one(merge_dataset_id)
            self.assertEqual(dataset.num_rows, 0)
            self.assertEqual(dataset.num_columns, len(expected_schema_keys))
            schema_keys = set(dataset.schema.keys())
            self.assertEqual(schema_keys, expected_schema_keys)
Example #14
    def test_delete(self):
        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset.create(self.test_dataset_ids[dataset_name])
            records = Dataset.find(self.test_dataset_ids[dataset_name])
            self.assertNotEqual(records, [])
            dataset.delete()
            records = Dataset.find(self.test_dataset_ids[dataset_name])
            self.assertEqual(records, [])
Example #15
    def test_update(self):
        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset.create(self.test_dataset_ids[dataset_name])
            self.assertFalse('field' in dataset.record)
            dataset.update({'field': {'key': 'value'}})
            dataset = Dataset.find_one(self.test_dataset_ids[dataset_name])
            self.assertTrue('field' in dataset.record)
            self.assertEqual(dataset.record['field'], {'key': 'value'})
Example #16
    def test_update_after_agg_group(self):
        dataset_id = self._post_file('wp_data.csv')
        group = 'wp_id'
        self._wait_for_dataset_state(dataset_id)

        results = json.loads(self.controller.create(dataset_id,
                             'newest(submit_date,functional)', 'wp_functional',
                             group=group))
        results = json.loads(self.controller.create(dataset_id,
                             'max(submit_date)', 'latest_submit_date',
                             group=group))

        dataset = Dataset.find_one(dataset_id)
        previous_num_rows = dataset.num_rows

        while True:
            dataset = Dataset.find_one(dataset_id)

            if dataset.aggregated_datasets.get(group) and all(
                    [c.is_ready for c in dataset.calculations()]):
                break
            sleep(1)

        agg_dframe = dataset.aggregated_datasets[group].dframe()
        self.assertEqual(
            set(['wp_id', 'wp_functional', 'latest_submit_date']),
            set(agg_dframe.columns.tolist()))

        self.assertTrue(self.controller.SUCCESS in results.keys())

        update = {
            'wp_id': 'D',
            'functional': 'yes',
        }
        self._post_update(dataset_id, update)
        update = {
            'submit_date': '2013-01-08',
            'wp_id': 'A',
            'functional': 'no',
        }
        self._post_update(dataset_id, update)

        while True:
            dataset = Dataset.find_one(dataset_id)
            current_num_rows = dataset.num_rows

            if not len(dataset.pending_updates):
                break

            sleep(1)

        dataset = Dataset.find_one(dataset_id)
        agg_dframe = dataset.aggregated_datasets[group].dframe()

        self.assertEqual(agg_dframe.get_value(0, 'wp_id'), 'A')
        self.assertEqual(current_num_rows, previous_num_rows + 2)
        self.assertEqual(set(agg_dframe[group]),
                         set(['A', 'B', 'C', 'D', 'n/a']))
Example #17
    def test_update(self):
        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset.create(self.test_dataset_ids[dataset_name])
            self.assertFalse('field' in dataset.record)
            dataset.update({'field': {'key': 'value'}})
            dataset = Dataset.find_one(self.test_dataset_ids[dataset_name])

            self.assertTrue('field' in dataset.record)
            self.assertEqual(dataset.record['field'], {'key': 'value'})
Example #18
    def test_update(self):
        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset.create(self.test_dataset_ids[dataset_name])
            self.assertFalse("field" in dataset.record)
            dataset.update({"field": {"key": "value"}})
            dataset = Dataset.find_one(self.test_dataset_ids[dataset_name])

            self.assertTrue("field" in dataset.record)
            self.assertEqual(dataset.record["field"], {"key": "value"})
Example #19
    def test_delete_update_summary(self):
        self.__post_formula()

        dataset = Dataset.find_one(self.dataset_id)
        self.assertTrue(self.name in dataset.stats.get(Dataset.ALL).keys())

        json.loads(self.controller.delete(self.dataset_id, self.name))

        dataset = Dataset.find_one(self.dataset_id)
        self.assertTrue(self.name not in dataset.stats.get(Dataset.ALL).keys())
Example #20
    def test_delete(self):
        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset.create(self.test_dataset_ids[dataset_name])
            records = Dataset.find(self.test_dataset_ids[dataset_name])
            self.assertNotEqual(records, [])
            dataset.delete()
            records = Dataset.find(self.test_dataset_ids[dataset_name])

            self.assertEqual(records, [])
            self.assertEqual(Observation.encoding(dataset), None)
Example #21
    def setUp(self):
        TestBase.setUp(self)
        self.dataset = Dataset()
        self.dataset.save(
            self.test_dataset_ids['good_eats_with_calculations.csv'])
        dframe = recognize_dates(
            self.get_data('good_eats_with_calculations.csv'))
        self.dataset.save_observations(dframe)
        self.group = None
        self.places = 5
Example #22
    def test_merge_datasets_async(self):
        dataset_id1 = self._post_file()
        dataset_id2 = self._post_file()

        self.assertEqual(
            Dataset.find_one(dataset_id1).state, Dataset.STATE_PENDING)
        self.assertEqual(
            Dataset.find_one(dataset_id2).state, Dataset.STATE_PENDING)

        result = json.loads(
            self.controller.merge(
                dataset_ids=json.dumps([dataset_id1, dataset_id2])))

        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)

        merged_id = result[Dataset.ID]

        while True:
            datasets = [
                Dataset.find_one(dataset_id)
                for dataset_id in [merged_id, dataset_id1, dataset_id2]
            ]

            if all([dataset.record_ready for dataset in datasets]) and all(
                [d.merged_dataset_ids for d in datasets[1:]]):
                break

            sleep(self.SLEEP_DELAY)

        datasets = [
            Dataset.find_one(dataset_id)
            for dataset_id in [dataset_id1, dataset_id2]
        ]

        for dataset in datasets:
            self.assertTrue(merged_id in dataset.merged_dataset_ids)

        dframe1 = datasets[0].dframe()
        merged_dataset = Dataset.find_one(merged_id)
        merged_dframe = merged_dataset.dframe(keep_parent_ids=True)

        for _, row in merged_dframe.iterrows():
            self.assertTrue(PARENT_DATASET_ID in row.keys())

        merged_dframe = merged_dataset.dframe()

        self.assertEqual(len(merged_dframe), 2 * len(dframe1))

        expected_dframe = concat([dframe1, dframe1], ignore_index=True)

        self.assertEqual(list(merged_dframe.columns),
                         list(expected_dframe.columns))

        self._check_dframes_are_equal(merged_dframe, expected_dframe)
Example #23
    def test_create_dataset_with_duplicate_column_names(self):
        formula_names = [
            'water_not_functioning_none',  # an already slugged column
            'water_not_functioning/none',  # a non-slug column
            'region',                # an existing column
            'sum',                   # a reserved key
            'date',                  # a reserved key and an existing column
        ]

        for formula_name in formula_names:
            dataset_id = self._post_file('water_points.csv')
            dframe_before = Dataset.find_one(dataset_id).dframe()

            # a calculation
            response = json.loads(self.controller.create(
                dataset_id,
                'water_source_type in ["borehole"]',
                formula_name))

            self.assertTrue(isinstance(response, dict))
            self.assertTrue(self.controller.SUCCESS in response)

            dataset = Dataset.find_one(dataset_id)
            name = dataset.calculations()[-1].name

            # an aggregation
            response = json.loads(self.controller.create(
                dataset_id,
                'newest(date_, water_functioning)',
                formula_name))

            self.assertTrue(isinstance(response, dict))
            self.assertTrue(self.controller.SUCCESS in response)

            dframe_after = dataset.dframe()
            slug = dataset.schema.labels_to_slugs[name]

            self.assertEqual(len(dframe_before), len(dframe_after))
            self.assertTrue(slug not in dframe_before.columns)
            self.assertTrue(slug in dframe_after.columns)
            self.assertEqual(
                len(dframe_before.columns) + 1, len(dframe_after.columns))

            # check OK on update
            update = {
                'date': '2013-01-05',
                'water_source_type': 'borehole',
            }
            result = self._post_update(dataset_id, update)
            dataset = Dataset.find_one(dataset_id)
            dframe_after_update = dataset.dframe()
            self.assertEqual(len(dframe_after) + 1, len(dframe_after_update))
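The loop above checks that a formula named after an existing column, an existing slug, or a reserved key still yields exactly one new, uniquely slugged column. A minimal sketch of that kind of slug de-duplication, assuming a numeric-suffix scheme like the `food_type_2` naming seen in the merge tests (illustrative only, not bamboo's schema code):

import re


def unique_slug(name, taken, reserved=('sum', 'date')):
    # Slugify: lowercase, runs of non-word characters become underscores.
    slug = re.sub(r'\W+', '_', name.lower())
    used = set(taken) | set(reserved)

    if slug not in used:
        return slug

    # On collision, append _2, _3, ... until the slug is free.
    i = 2
    while '%s_%s' % (slug, i) in used:
        i += 1

    return '%s_%s' % (slug, i)


assert unique_slug('region', ['region']) == 'region_2'
assert unique_slug('water_not_functioning/none', []) == \
    'water_not_functioning_none'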
Example #24
    def test_delete_calculation_not_in_dataset(self):
        self.__post_formula()

        # Remove column from dataset
        dataset = Dataset.find_one(self.dataset_id)
        dataset.delete_columns([self.name])

        result = json.loads(self.controller.delete(self.dataset_id, self.name))

        self.assertTrue(AbstractController.SUCCESS in result)

        dataset = Dataset.find_one(self.dataset_id)
        self.assertTrue(self.name not in dataset.schema.labels_to_slugs)
Example #25
File: io.py Project: helioid/bamboo
def create_dataset_from_json(json_file):
    content = json_file.file.read()

    dataset = Dataset()
    dataset.save()

    def file_reader(content):
        return pd.DataFrame(json.loads(content))

    call_async(import_dataset, dataset,
               file_reader=partial(file_reader, content))

    return dataset
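The handler only touches the upload through its `.file` attribute, so any object exposing a readable `.file` can drive it directly in a test. A hedged sketch (`FakeUpload` is invented for illustration; `StringIO` matches the Python 2 era of this code base):

import json
from StringIO import StringIO


class FakeUpload(object):
    # Stand-in for an uploaded file: only `.file` is ever accessed.
    def __init__(self, content):
        self.file = StringIO(content)


rows = [{'amount': 1, 'food_type': 'lunch'}]
dataset = create_dataset_from_json(FakeUpload(json.dumps(rows)))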
Example #26
    def test_delete_aggregation(self):
        self.formula = 'sum(amount)'
        self.name = 'test'
        response = json.loads(self._post_formula())
        result = json.loads(
            self.controller.delete(self.dataset_id, self.name, ''))

        self.assertTrue(AbstractController.SUCCESS in result)

        dataset = Dataset.find_one(self.dataset_id)
        agg_dataset = Dataset.find_one(dataset.aggregated_datasets_dict[''])

        self.assertTrue(self.name not in agg_dataset.schema.labels_to_slugs)
Example #27
    def test_update_diff_schema(self):
        dataset_id = self._post_file()
        dataset = Dataset.find_one(dataset_id)

        column = 'amount'
        update = json.dumps({column: '2'})

        expected_col_schema = dataset.schema[column]

        self.controller.update(dataset_id=dataset_id, update=update)
        dataset = Dataset.find_one(dataset_id)

        self.assertEqual(dataset.num_rows, self.NUM_ROWS + 1)
        self.assertEqual(expected_col_schema, dataset.schema[column])
Example #28
    def test_update_diff_schema(self):
        dataset_id = self._post_file()
        dataset = Dataset.find_one(dataset_id)

        column = 'amount'
        update = json.dumps({column: '2'})

        expected_col_schema = dataset.schema[column]

        self.controller.update(dataset_id=dataset_id, update=update)
        dataset = Dataset.find_one(dataset_id)

        self.assertEqual(dataset.num_rows, self.NUM_ROWS + 1)
        self.assertEqual(expected_col_schema, dataset.schema[column])
Example #29
    def __verify_create(self, response):
        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertTrue(self.dataset_id in response[self.controller.SUCCESS])

        self.__wait_for_calculation_ready(self.dataset_id, self.name)

        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue(self.name in dataset.schema.keys())

        dataset = Dataset.find_one(self.dataset_id)

        self.assertEqual(TestAbstractDatasets.NUM_ROWS, len(dataset.dframe()))
Example #30
    def test_create_remove_summary(self):
        dataset_id = self._post_file()
        Datasets().summary(
            dataset_id,
            select=Datasets.SELECT_ALL_FOR_SUMMARY)
        dataset = Dataset.find_one(dataset_id)

        self.assertTrue(isinstance(dataset.stats, dict))
        self.assertTrue(isinstance(dataset.stats[Dataset.ALL], dict))

        self._post_formula()
        # stats should have new column for calculation
        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue(self.name in dataset.stats.get(Dataset.ALL).keys())
Example #31
    def test_merge_with_map(self):
        dataset_id1 = self._post_file()
        dataset_id2 = self._post_file('good_eats_aux.csv')
        merged_dataset_id = self._post_merge([dataset_id1, dataset_id2])

        expected_columns = Dataset.find_one(
            dataset_id1).dframe().columns.tolist()
        expected_columns.remove("food_type")
        expected_columns.append("food_type_2")
        expected_columns = set(expected_columns)

        merged_dataset = Dataset.find_one(merged_dataset_id)
        new_columns = set(merged_dataset.dframe().columns)

        self.assertEqual(expected_columns, new_columns)
Example #32
    def test_merge_with_map(self):
        dataset_id1 = self._post_file()
        dataset_id2 = self._post_file('good_eats_aux.csv')
        merged_dataset_id = self._post_merge([dataset_id1, dataset_id2])

        expected_columns = Dataset.find_one(
            dataset_id1).dframe().columns.tolist()
        expected_columns.remove("food_type")
        expected_columns.append("food_type_2")
        expected_columns = set(expected_columns)

        merged_dataset = Dataset.find_one(merged_dataset_id)
        new_columns = set(merged_dataset.dframe().columns)

        self.assertEqual(expected_columns, new_columns)
Example #33
    def test_aggregations_datasets_with_multigroup(self):
        self.dataset_id = self._post_file()
        group = 'food_type,rating'
        self._post_calculations(self.default_formulae + ['sum(amount)'], group)
        results = self._test_aggregations([group])
        # only so we can split
        dataset = Dataset()
        row_keys = sorted(dataset.split_groups(group) + ['sum_amount_'])

        for row in results:
            sorted_row_keys = sorted(row.keys())
            self.assertEqual(sorted_row_keys, row_keys)
            self.assertTrue(isinstance(row.values()[0], basestring))
            self.assertTrue(isinstance(row.values()[1], basestring))
            self.assertTrue(isinstance(row.values()[2], float))
Example #34
    def test_update_row(self):
        dataset_id = self._post_file()
        index = 0
        update = {"amount": 10, "food_type": "breakfast"}
        expected_dframe = Dataset.find_one(dataset_id).dframe()
        expected_row = expected_dframe.ix[0].to_dict()
        expected_row.update(update)
        expected_dframe.ix[0] = Series(expected_row)

        results = json.loads(
            self.controller.row_update(dataset_id, index, json.dumps(update)))
        self.assertTrue(Datasets.SUCCESS in results.keys())

        dframe = Dataset.find_one(dataset_id).dframe()
        self.assertEqual(self.NUM_ROWS, len(dframe))
        self._check_dframes_are_equal(expected_dframe, dframe)
Example #35
class TestCalculator(TestBase):

    def setUp(self):
        TestBase.setUp(self)
        self.dataset = Dataset()
        self.dataset.save(
            self.test_dataset_ids['good_eats_with_calculations.csv'])
        dframe = recognize_dates(
            self.get_data('good_eats_with_calculations.csv'))
        self.dataset.save_observations(dframe)
        self.group = None
        self.parser = Parser(self.dataset)
        self.places = 5

    def _equal_msg(self, calculated, stored, formula):
        return '(calculated %s) %s != (stored %s) %s ' % (type(calculated),
               calculated, type(stored), stored) +\
            '(within %s places), formula: %s' % (self.places, formula)

    def _test_calculator(self):
        self.dframe = self.dataset.dframe()

        columns = self.dframe.columns.tolist()
        self.start_num_cols = len(columns)
        self.added_num_cols = 0

        column_labels_to_slugs = {
            column_attrs[Dataset.LABEL]: (column_name) for
            (column_name, column_attrs) in self.dataset.schema.items()
        }
        self.label_list, self.slugified_key_list = [
            list(ary) for ary in zip(*column_labels_to_slugs.items())
        ]

        for idx, formula in enumerate(self.calculations):
            name = 'test-%s' % idx

            self.parser.validate_formula(formula)

            calculator = Calculator(self.dataset)

            calculation = Calculation()
            calculation.save(self.dataset, formula, name, self.group)
            calculator.calculate_columns([calculation])

            self.column_labels_to_slugs = self.dataset.schema.labels_to_slugs

            self._test_calculation_results(name, formula)
Example #36
    def test_create_two_from_schema_and_join_and_update_lhs_rhs(self):
        left_dataset_id = self.dataset_id

        result = self._upload_from_schema('good_eats_aux.schema.json')
        right_dataset_id = result[Dataset.ID]

        on = 'food_type'
        result = json.loads(self.controller.join(
            left_dataset_id, right_dataset_id, on=on))
        merged_dataset_id = self.dataset_id = result[Dataset.ID]

        num_rows = 0
        results = json.loads(self.controller.show(merged_dataset_id))
        self.assertEqual(num_rows, len(results))

        num_rows += 1
        self._put_row_updates(left_dataset_id,
                              file_name='good_eats_update_bg.json')
        results = json.loads(self.controller.show(merged_dataset_id))
        self.assertEqual(num_rows, len(results))

        prev_num_cols = len(
            Dataset.find_one(merged_dataset_id).dframe().columns)
        self.dataset_id = right_dataset_id
        self._put_row_updates(right_dataset_id,
                              file_name='good_eats_aux_update.json')
        results = json.loads(self.controller.show(merged_dataset_id))
        self.assertEqual(num_rows, len(results))
        result = results[0]
        self.assertTrue('code' in result.keys())
        self.assertFalse(result['code'] is None)
Example #37
    def test_create_async_sets_calculation_status(self):
        self.dataset_id = create_dataset_from_url(
            '%s%s' % (self._local_fixture_prefix(), 'good_eats_huge.csv'),
            allow_local_file=True).dataset_id

        self._wait_for_dataset_state(self.dataset_id)

        response = json.loads(self._post_formula())

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertTrue(self.dataset_id in response[self.controller.SUCCESS])

        response = json.loads(self.controller.show(self.dataset_id))[0]

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(Calculation.STATE in response)
        self.assertEqual(response[Calculation.STATE],
                         Calculation.STATE_PENDING)

        self._wait_for_calculation_ready(self.dataset_id, self.name)

        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue(self.name in dataset.schema.keys())
Example #38
    def test_merge_with_map_update(self):
        dataset_id1 = self._post_file()
        dataset_id2 = self._post_file('good_eats_aux.csv')
        merged_dataset_id = self._post_merge([dataset_id1, dataset_id2])

        original_ds2 = json.loads(self.controller.show(dataset_id2))
        original_length = len(original_ds2)
        original_merge = json.loads(self.controller.show(merged_dataset_id))
        original_merge_length = len(original_merge)

        self._put_row_updates(dataset_id2, 'good_eats_aux_update.json')
        response = json.loads(self.controller.show(dataset_id2))
        new_length = len(response)

        for new_row in response:
            if new_row not in original_ds2:
                break

        response = json.loads(self.controller.show(merged_dataset_id))

        for new_merge_row in response:
            if new_merge_row not in original_merge:
                break

        new_merge_length = len(response)

        self.assertEqual(original_length + 1, new_length)
        self.assertEqual(original_merge_length + 1, new_merge_length)
        self.assertEqual(new_row['food_type'], new_merge_row['food_type_2'])
        self.assertEqual(new_row['code'], new_merge_row['comments'])
        merged_dataset = Dataset.find_one(merged_dataset_id)
Example #39
    def test_save(self):
        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset().save(self.test_dataset_ids[dataset_name])
            record = dataset.record

            self.assertTrue(isinstance(record, dict))
            self.assertTrue('_id' in record.keys())
Example #40
    def test_create_update_summary(self):
        dataset_id = self._post_file()
        Datasets().summary(
            dataset_id,
            select=Datasets.SELECT_ALL_FOR_SUMMARY)
        dataset = Dataset.find_one(dataset_id)

        self.assertTrue(isinstance(dataset.stats, dict))
        self.assertTrue(isinstance(dataset.stats[Dataset.ALL], dict))

        self.__post_formula()

        # stats should have new column for calculation
        dataset = Dataset.find_one(self.dataset_id)
        stats = dataset.stats.get(Dataset.ALL)
        self.assertTrue(self.name in stats.keys())
Example #41
    def test_aggregations_datasets_with_multigroup(self):
        self.dataset_id = self._post_file()
        group = 'food_type,rating'
        self._post_calculations(self.default_formulae + ['sum(amount)'], group)
        results = self._test_aggregations([group])
        # only so we can split
        dataset = Dataset()
        row_keys = sorted(dataset.split_groups(group) + ['sum_amount_'])

        for row in results:
            sorted_row_keys = sorted(row.keys())
            self.assertEqual(sorted_row_keys, row_keys)
            self.assertTrue(isinstance(row.values()[0], basestring))
            self.assertTrue(isinstance(row.values()[1], basestring))
            self.assertTrue(isinstance(row.values()[2], float))
Example #42
    def test_delete(self):
        self.__post_formula()
        result = json.loads(self.controller.delete(self.dataset_id, self.name))

        self.assertTrue(AbstractController.SUCCESS in result)

        dataset = Dataset.find_one(self.dataset_id)
        self.assertTrue(self.name not in dataset.schema.labels_to_slugs)
Example #43
    def __init__(self, db, dataset_id=None, dataset=None):
        self._db = db
        if dataset_id:
            self.dataset = Dataset.find_one(dataset_id)
        if dataset:
            self.dataset = dataset
        if not dataset_id and not dataset:
            raise Exception(u"Please specify a dataset_id")
Example #44
    def test_update_diff_schema_unconvertable(self):
        dataset_id = self._post_file()
        dataset = Dataset.find_one(dataset_id)

        column = 'amount'
        update = json.dumps({column: 'a'})

        expected_col_schema = dataset.schema[column]

        result = json.loads(
            self.controller.update(dataset_id=dataset_id, update=update))
        dataset = Dataset.find_one(dataset_id)

        # the update is rejected
        self.assertTrue(Datasets.ERROR in result)
        self.assertEqual(dataset.num_rows, self.NUM_ROWS)
        self.assertEqual(expected_col_schema, dataset.schema[column])
Example #45
class TestCalculator(TestBase):
    def setUp(self):
        TestBase.setUp(self)
        self.dataset = Dataset()
        self.dataset.save(
            self.test_dataset_ids['good_eats_with_calculations.csv'])
        dframe = recognize_dates(
            self.get_data('good_eats_with_calculations.csv'))
        self.dataset.save_observations(dframe)
        self.group = None
        self.places = 5

    def _equal_msg(self, calculated, stored, formula):
        return '(calculated %s) %s != (stored %s) %s ' % (type(calculated),
               calculated, type(stored), stored) +\
            '(within %s places), formula: %s' % (self.places, formula)

    def _test_calculator(self):
        self.dframe = self.dataset.dframe()

        columns = self.dframe.columns.tolist()
        self.start_num_cols = len(columns)
        self.added_num_cols = 0

        column_labels_to_slugs = {
            column_attrs[Dataset.LABEL]: (column_name)
            for (column_name, column_attrs) in self.dataset.schema.items()
        }
        self.label_list, self.slugified_key_list = [
            list(ary) for ary in zip(*column_labels_to_slugs.items())
        ]

        for idx, formula in enumerate(self.calculations):
            name = 'test-%s' % idx

            Parser.validate_formula(formula, self.dataset)

            calculation = Calculation()
            calculation.save(self.dataset, formula, name, self.group)
            self.now = now()
            calculate_columns(self.dataset, [calculation])

            self.column_labels_to_slugs = self.dataset.schema.labels_to_slugs

            self._test_calculation_results(name, formula)
Example #46
        def action(dataset):
            other_dataset = Dataset.find_one(other_dataset_id)

            if other_dataset.record:
                merged_dataset = dataset.join(other_dataset, on)

                return self._success('joined dataset %s to %s on %s' % (
                    other_dataset_id, dataset_id, on),
                    merged_dataset.dataset_id)
Example #47
    def test_delete_with_query(self):
        dataset_id = self._post_file()
        query = {'food_type': 'caffeination'}
        dataset = Dataset.find_one(dataset_id)
        dframe = dataset.dframe(query_args=QueryArgs(query=query))
        len_after_delete = len(dataset.dframe()) - len(dframe)

        query = json.dumps(query)
        result = json.loads(self.controller.delete(dataset_id, query=query))
        message = result[Datasets.SUCCESS]

        self.assertTrue('deleted dataset' in message)
        self.assertTrue(query in message)
        self.assertEqual(result[Dataset.ID], dataset_id)

        dframe = Dataset.find_one(dataset_id).dframe()

        self.assertEqual(len(dframe), len_after_delete)
Example #48
    def test_summary_with_multigroup(self):
        dataset_id = self._post_file()
        group_columns = 'rating,food_type'

        results = self.controller.summary(
            dataset_id,
            group=group_columns,
            select=self.controller.SELECT_ALL_FOR_SUMMARY)

        results = self._test_summary_results(results)

        self.assertFalse(Datasets.ERROR in results.keys())
        self.assertTrue(group_columns in results.keys())
        # for split
        dataset = Dataset()
        self.assertEqual(
            len(dataset.split_groups(results[group_columns].keys()[0])),
            len(dataset.split_groups(group_columns)))
Example #49
    def _wait_for_dataset_state(self, dataset_id):
        while True:
            dataset = Dataset.find_one(dataset_id)

            if dataset.state != Dataset.STATE_PENDING:
                break

            sleep(self.SLEEP_DELAY)

        return dataset
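Like the inline while-loops in the merge tests above, this helper spins forever if a background task dies. A sketch of a deadline-capped variant (`wait_for` is an invented name, not part of bamboo):

from time import sleep, time


def wait_for(predicate, delay=0.2, timeout=30):
    # Poll until predicate() is truthy or the deadline passes.
    deadline = time() + timeout

    while time() < deadline:
        if predicate():
            return

        sleep(delay)

    raise AssertionError('timed out after %s seconds' % timeout)


# Usage:
# wait_for(
#     lambda: Dataset.find_one(dataset_id).state != Dataset.STATE_PENDING)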
Example #50
    def test_edit_row_with_join_invalid(self):
        index = 0
        update = {'food_type': 'deserts'}

        left_dataset_id = self._post_file()
        right_dataset_id = self._post_file('good_eats_aux.csv')
        num_rows_before = Dataset.find_one(right_dataset_id).num_rows
        on = 'food_type'
        json.loads(
            self.controller.join(left_dataset_id, right_dataset_id, on=on))

        results = json.loads(
            self.controller.row_update(right_dataset_id, index,
                                       json.dumps(update)))
        self.assertTrue(Datasets.SUCCESS in results.keys())

        dataset = Dataset.find_one(right_dataset_id)
        self.assertEqual(num_rows_before, dataset.num_rows)
        self.assertEqual(dataset.pending_updates, [])
Example #51
    def test_create_from_csv(self):
        result = self.__upload_mocked_file()
        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)

        # test parse type as date correctly
        dframe = Dataset.find_one(result[Dataset.ID]).dframe()
        self.assertTrue(isinstance(dframe.submit_date[0], datetime))

        results = self._test_summary_built(result)
        self._test_summary_no_group(results)
Example #52
    def test_edit_row(self):
        dataset_id = self._post_file()
        index = 0
        update = {'amount': 10, 'food_type': 'breakfast'}
        expected_dframe = Dataset.find_one(dataset_id).dframe()
        expected_row = expected_dframe.ix[0].to_dict()
        expected_row.update(update)
        expected_dframe.ix[0] = Series(expected_row)

        results = json.loads(
            self.controller.row_update(dataset_id, index, json.dumps(update)))
        self.assertTrue(Datasets.SUCCESS in results.keys())

        dataset = Dataset.find_one(dataset_id)
        dframe = dataset.dframe()
        self.assertEqual(self.NUM_ROWS, len(dframe))
        self._check_dframes_are_equal(expected_dframe, dframe)

        # check that previous row exists
        all_observations = Observation.find(dataset, include_deleted=True)
        self.assertEqual(self.NUM_ROWS + 1, len(all_observations))
Example #53
    def test_create_aggregation(self):
        self.formula = 'sum(amount)'
        self.name = 'test'
        response = json.loads(self.__post_formula())

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertEqual(response[Dataset.ID], self.dataset_id)

        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue('' in dataset.aggregated_datasets_dict.keys())
Example #54
    def test_newest(self):
        expected_dataset = {
            u'wp_functional': {0: u'no', 1: u'yes', 2: u'no', 3: u'yes'},
            u'id': {0: 1, 1: 2, 2: 3, 3: 4}}
        dataset_id = self._post_file('newest_test.csv')
        self.controller.create(dataset_id,
                               'newest(submit_date,functional)',
                               'wp_functional', group='id')
        dataset = Dataset.find_one(dataset_id)
        agg_ds = dataset.aggregated_dataset('id')

        self.assertEqual(expected_dataset, agg_ds.dframe().to_dict())