Example #1
    def test_create_from_schema_and_update(self):
        self._upload_good_eats_schema()
        results = json.loads(self.controller.show(self.dataset_id))

        self.assertFalse(len(results))

        dataset = Dataset.find_one(self.dataset_id)

        self.assertEqual(dataset.num_rows, 0)

        old_schema = dataset.schema
        self._put_row_updates()
        results = json.loads(self.controller.show(self.dataset_id))

        self.assertTrue(len(results))

        for result in results:
            self.assertTrue(isinstance(result, dict))
            self.assertTrue(len(result.keys()))

        dataset = Dataset.find_one(self.dataset_id)

        self.assertEqual(dataset.num_rows, 1)

        new_schema = dataset.schema

        self.assertEqual(set(old_schema.keys()), set(new_schema.keys()))

        for column in new_schema.keys():
            if new_schema.cardinality(column):
                self.assertEqual(new_schema.cardinality(column), 1)
Example #2
    def test_merge_datasets(self):
        dataset_id1 = self._post_file()
        dataset_id2 = self._post_file()
        result = json.loads(
            self.controller.merge(
                dataset_ids=json.dumps([dataset_id1, dataset_id2])))

        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)

        datasets = [
            Dataset.find_one(dataset_id)
            for dataset_id in [dataset_id1, dataset_id2]
        ]

        for dataset in datasets:
            self.assertTrue(result[Dataset.ID] in dataset.merged_dataset_ids)

        dframe1 = datasets[0].dframe()
        merged_dataset = Dataset.find_one(result[Dataset.ID])
        merged_dframe = merged_dataset.dframe(keep_parent_ids=True)

        for _, row in merged_dframe.iterrows():
            self.assertTrue(PARENT_DATASET_ID in row.keys())

        merged_dframe = merged_dataset.dframe()

        self.assertEqual(len(merged_dframe), 2 * len(dframe1))

        expected_dframe = concat([dframe1, dframe1], ignore_index=True)

        self.assertEqual(list(merged_dframe.columns),
                         list(expected_dframe.columns))

        self._check_dframes_are_equal(merged_dframe, expected_dframe)
Example #3
    def test_update_after_agg(self):
        dataset_id = self._post_file('wp_data.csv')
        results = json.loads(self.controller.create(dataset_id,
                             'newest(submit_date,wp_id)', 'wp_newest'))

        dataset = Dataset.find_one(dataset_id)
        previous_num_rows = dataset.num_rows

        self.assertTrue(self.controller.SUCCESS in results)
        self.assertFalse(dataset.aggregated_dataset('') is None)

        update = {
            'submit_date': '2013-01-05',
            'wp_id': 'D',
            'functional': 'no',
        }
        self.__post_update(dataset_id, update)
        update = {
            'wp_id': 'E',
            'functional': 'no',
        }
        self.__post_update(dataset_id, update)

        dataset = Dataset.find_one(dataset_id)
        current_num_rows = dataset.num_rows
        agg_df = dataset.aggregated_dataset('').dframe()

        self.assertEqual(agg_df.get_value(0, 'wp_newest'), 'D')
        self.assertEqual(current_num_rows, previous_num_rows + 2)
Example #4
    def test_create_two_from_schema_and_join(self):
        self._upload_good_eats_schema()
        left_dataset_id = self.dataset_id

        schema = open('tests/fixtures/good_eats_aux.schema.json')
        mock_uploaded_file = MockUploadedFile(schema)
        result = json.loads(
            self.controller.create(schema=mock_uploaded_file))
        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)
        right_dataset_id = result[Dataset.ID]

        on = 'food_type'
        dataset_id_tuples = [
            (left_dataset_id, right_dataset_id),
            (right_dataset_id, left_dataset_id),
        ]

        for dataset_ids in dataset_id_tuples:
            result = json.loads(self.controller.join(*dataset_ids, on=on))
            expected_schema_keys = set(sum([
                Dataset.find_one(dataset_id).schema.keys()
                for dataset_id in dataset_ids], []))

            self.assertTrue(isinstance(result, dict))
            self.assertTrue(Dataset.ID in result)
            merge_dataset_id = result[Dataset.ID]
            dataset = Dataset.find_one(merge_dataset_id)
            self.assertEqual(dataset.num_rows, 0)
            self.assertEqual(dataset.num_columns, len(expected_schema_keys))
            schema_keys = set(dataset.schema.keys())
            self.assertEqual(schema_keys, expected_schema_keys)
Example #5
    def test_create_one_from_schema_and_join(self):
        self._upload_good_eats_schema()
        left_dataset_id = self.dataset_id
        right_dataset_id = self._post_file('good_eats_aux.csv')

        on = 'food_type'
        dataset_id_tuples = [
            (left_dataset_id, right_dataset_id),
            (right_dataset_id, left_dataset_id),
        ]

        for dataset_ids in dataset_id_tuples:
            result = json.loads(self.controller.join(*dataset_ids, on=on))
            expected_schema_keys = set(sum([
                Dataset.find_one(dataset_id).schema.keys()
                for dataset_id in dataset_ids], []))

            self.assertTrue(isinstance(result, dict))
            self.assertTrue(Dataset.ID in result)
            merge_dataset_id = result[Dataset.ID]
            dataset = Dataset.find_one(merge_dataset_id)
            self.assertEqual(dataset.num_rows, 0)
            self.assertEqual(dataset.num_columns, len(expected_schema_keys))
            schema_keys = set(dataset.schema.keys())
            self.assertEqual(schema_keys, expected_schema_keys)
Example #6
    def test_merge_datasets(self):
        dataset_id1 = self._post_file()
        dataset_id2 = self._post_file()
        result = json.loads(self.controller.merge(
            dataset_ids=json.dumps([dataset_id1, dataset_id2])))

        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)

        datasets = [Dataset.find_one(dataset_id)
                    for dataset_id in [dataset_id1, dataset_id2]]

        for dataset in datasets:
            self.assertTrue(result[Dataset.ID] in dataset.merged_dataset_ids)

        dframe1 = datasets[0].dframe()
        merged_dataset = Dataset.find_one(result[Dataset.ID])
        merged_rows = merged_dataset.observations()

        for row in merged_rows:
            self.assertTrue(PARENT_DATASET_ID in row.keys())

        merged_dframe = merged_dataset.dframe()

        self.assertEqual(len(merged_dframe), 2 * len(dframe1))

        expected_dframe = concat([dframe1, dframe1],
                                 ignore_index=True)

        self.assertEqual(list(merged_dframe.columns),
                         list(expected_dframe.columns))

        self._check_dframes_are_equal(merged_dframe, expected_dframe)
Example #7
    def test_join_datasets_different_columns(self):
        left_dataset_id = self._post_file()
        right_dataset_id = self._post_file('good_eats_aux_join.csv')
        on_lhs = 'food_type'
        on_rhs = 'also_food_type'
        on = '%s,%s' % (on_lhs, on_rhs)
        results = json.loads(self.controller.join(
            left_dataset_id, right_dataset_id, on=on))

        self.assertTrue(isinstance(results, dict))
        self.assertTrue(Datasets.SUCCESS in results.keys())
        self.assertTrue(Dataset.ID in results.keys())

        joined_dataset_id = results[Dataset.ID]
        data = json.loads(self.controller.show(joined_dataset_id))

        self.assertTrue('code' in data[0].keys())

        left_dataset = Dataset.find_one(left_dataset_id)
        right_dataset = Dataset.find_one(right_dataset_id)

        self.assertEqual([('right', right_dataset_id, on, joined_dataset_id)],
                         left_dataset.joined_dataset_ids)
        self.assertEqual([('left', left_dataset_id, on, joined_dataset_id)],
                         right_dataset.joined_dataset_ids)
Example #8
    def test_merge_datasets_async(self):
        dataset_id1 = self._post_file()
        dataset_id2 = self._post_file()

        self.assertEqual(
            Dataset.find_one(dataset_id1).state,
            Dataset.STATE_PENDING)
        self.assertEqual(
            Dataset.find_one(dataset_id2).state,
            Dataset.STATE_PENDING)

        result = json.loads(self.controller.merge(
            datasets=json.dumps([dataset_id1, dataset_id2])))

        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)

        merged_id = result[Dataset.ID]

        # wait for background tasks to finish
        while True:
            results1 = json.loads(self.controller.show(dataset_id1))
            results2 = json.loads(self.controller.show(dataset_id2))
            results3 = json.loads(self.controller.show(merged_id))

            if all([len(res) for res in [results1, results2, results3]]):
                break

            sleep(self.SLEEP_DELAY)

        while True:
            datasets = [Dataset.find_one(dataset_id)
                        for dataset_id in [dataset_id1, dataset_id2]]

            if all([dataset.is_ready for dataset in datasets]):
                break

            sleep(self.SLEEP_DELAY)

        for dataset in datasets:
            self.assertTrue(merged_id in dataset.merged_dataset_ids)

        dframe1 = datasets[0].dframe()
        merged_dataset = Dataset.find_one(merged_id)
        merged_rows = merged_dataset.observations()

        for row in merged_rows:
            self.assertTrue(PARENT_DATASET_ID in row.keys())

        merged_dframe = merged_dataset.dframe()

        self.assertEqual(len(merged_dframe), 2 * len(dframe1))

        expected_dframe = concat([dframe1, dframe1],
                                 ignore_index=True)

        self.assertEqual(list(merged_dframe.columns),
                         list(expected_dframe.columns))

        self._check_dframes_are_equal(merged_dframe, expected_dframe)
Example #9
    def test_update_after_agg_group(self):
        dataset_id = self._post_file('wp_data.csv')
        group = 'wp_id'
        self._wait_for_dataset_state(dataset_id)

        results = json.loads(self.controller.create(dataset_id,
                             'newest(submit_date,functional)', 'wp_functional',
                             group=group))
        results = json.loads(self.controller.create(dataset_id,
                             'max(submit_date)', 'latest_submit_date',
                             group=group))

        dataset = Dataset.find_one(dataset_id)
        previous_num_rows = dataset.num_rows

        while True:
            dataset = Dataset.find_one(dataset_id)

            if dataset.aggregated_datasets.get(group) and all(
                    [c.is_ready for c in dataset.calculations()]):
                break
            sleep(1)

        agg_dframe = dataset.aggregated_datasets[group].dframe()
        self.assertEqual(
            set(['wp_id', 'wp_functional', 'latest_submit_date']),
            set(agg_dframe.columns.tolist()))

        self.assertTrue(self.controller.SUCCESS in results.keys())

        update = {
            'wp_id': 'D',
            'functional': 'yes',
        }
        self._post_update(dataset_id, update)
        update = {
            'submit_date': '2013-01-08',
            'wp_id': 'A',
            'functional': 'no',
        }
        self._post_update(dataset_id, update)

        while True:
            dataset = Dataset.find_one(dataset_id)
            current_num_rows = dataset.num_rows

            if not len(dataset.pending_updates):
                break

            sleep(1)

        dataset = Dataset.find_one(dataset_id)
        agg_dframe = dataset.aggregated_datasets[group].dframe()

        self.assertEqual(agg_dframe.get_value(0, 'wp_id'), 'A')
        self.assertEqual(current_num_rows, previous_num_rows + 2)
        self.assertEqual(set(agg_dframe[group]),
                         set(['A', 'B', 'C', 'D', 'n/a']))
Example #10
    def test_delete_update_summary(self):
        self.__post_formula()

        dataset = Dataset.find_one(self.dataset_id)
        self.assertTrue(self.name in dataset.stats.get(Dataset.ALL).keys())

        json.loads(self.controller.delete(self.dataset_id, self.name))

        dataset = Dataset.find_one(self.dataset_id)
        self.assertTrue(self.name not in dataset.stats.get(Dataset.ALL).keys())
Example #11
    def test_merge_datasets_async(self):
        dataset_id1 = self._post_file()
        dataset_id2 = self._post_file()

        self.assertEqual(
            Dataset.find_one(dataset_id1).state, Dataset.STATE_PENDING)
        self.assertEqual(
            Dataset.find_one(dataset_id2).state, Dataset.STATE_PENDING)

        result = json.loads(
            self.controller.merge(
                dataset_ids=json.dumps([dataset_id1, dataset_id2])))

        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)

        merged_id = result[Dataset.ID]

        while True:
            datasets = [
                Dataset.find_one(dataset_id)
                for dataset_id in [merged_id, dataset_id1, dataset_id2]
            ]

            if all([dataset.record_ready for dataset in datasets]) and all(
                [d.merged_dataset_ids for d in datasets[1:]]):
                break

            sleep(self.SLEEP_DELAY)

        datasets = [
            Dataset.find_one(dataset_id)
            for dataset_id in [dataset_id1, dataset_id2]
        ]

        for dataset in datasets:
            self.assertTrue(merged_id in dataset.merged_dataset_ids)

        dframe1 = datasets[0].dframe()
        merged_dataset = Dataset.find_one(merged_id)
        merged_dframe = merged_dataset.dframe(keep_parent_ids=True)

        for _, row in merged_dframe.iterrows():
            self.assertTrue(PARENT_DATASET_ID in row.keys())

        merged_dframe = merged_dataset.dframe()

        self.assertEqual(len(merged_dframe), 2 * len(dframe1))

        expected_dframe = concat([dframe1, dframe1], ignore_index=True)

        self.assertEqual(list(merged_dframe.columns),
                         list(expected_dframe.columns))

        self._check_dframes_are_equal(merged_dframe, expected_dframe)
Example #12
    def test_create_dataset_with_duplicate_column_names(self):
        formula_names = [
            'water_not_functioning_none',  # an already slugged column
            'water_not_functioning/none',  # a non-slug column
            'region',                # an existing column
            'sum',                   # a reserved key
            'date',                  # a reserved key and an existing column
        ]

        for formula_name in formula_names:
            dataset_id = self._post_file('water_points.csv')
            dframe_before = Dataset.find_one(dataset_id).dframe()

            # a calculation
            response = json.loads(self.controller.create(
                dataset_id,
                'water_source_type in ["borehole"]',
                formula_name))

            self.assertTrue(isinstance(response, dict))
            self.assertTrue(self.controller.SUCCESS in response)

            dataset = Dataset.find_one(dataset_id)
            name = dataset.calculations()[-1].name

            # an aggregation
            response = json.loads(self.controller.create(
                dataset_id,
                'newest(date_, water_functioning)',
                formula_name))

            self.assertTrue(isinstance(response, dict))
            self.assertTrue(self.controller.SUCCESS in response)

            dframe_after = dataset.dframe()
            slug = dataset.schema.labels_to_slugs[name]

            self.assertEqual(len(dframe_before), len(dframe_after))
            self.assertTrue(slug not in dframe_before.columns)
            self.assertTrue(slug in dframe_after.columns)
            self.assertEqual(
                len(dframe_before.columns) + 1, len(dframe_after.columns))

            # check OK on update
            update = {
                'date': '2013-01-05',
                'water_source_type': 'borehole',
            }
            result = self._post_update(dataset_id, update)
            dataset = Dataset.find_one(dataset_id)
            dframe_after_update = dataset.dframe()
            self.assertEqual(len(dframe_after) + 1, len(dframe_after_update))
Example #13
    def test_delete_calculation_not_in_dataset(self):
        self.__post_formula()

        # Remove column from dataset
        dataset = Dataset.find_one(self.dataset_id)
        dataset.delete_columns([self.name])

        result = json.loads(self.controller.delete(self.dataset_id, self.name))

        self.assertTrue(AbstractController.SUCCESS in result)

        dataset = Dataset.find_one(self.dataset_id)
        self.assertTrue(self.name not in dataset.schema.labels_to_slugs)
Example #14
    def test_delete_aggregation(self):
        self.formula = 'sum(amount)'
        self.name = 'test'
        response = json.loads(self._post_formula())
        result = json.loads(
            self.controller.delete(self.dataset_id, self.name, ''))

        self.assertTrue(AbstractController.SUCCESS in result)

        dataset = Dataset.find_one(self.dataset_id)
        agg_dataset = Dataset.find_one(dataset.aggregated_datasets_dict[''])

        self.assertTrue(self.name not in agg_dataset.schema.labels_to_slugs)
Example #15
    def test_update_diff_schema(self):
        dataset_id = self._post_file()
        dataset = Dataset.find_one(dataset_id)

        column = 'amount'
        update = json.dumps({column: '2'})

        expected_col_schema = dataset.schema[column]

        self.controller.update(dataset_id=dataset_id, update=update)
        dataset = Dataset.find_one(dataset_id)

        self.assertEqual(dataset.num_rows, self.NUM_ROWS + 1)
        self.assertEqual(expected_col_schema, dataset.schema[column])
Example #16
    def test_update_diff_schema(self):
        dataset_id = self._post_file()
        dataset = Dataset.find_one(dataset_id)

        column = 'amount'
        update = json.dumps({column: '2'})

        expected_col_schema = dataset.schema[column]

        self.controller.update(dataset_id=dataset_id, update=update)
        dataset = Dataset.find_one(dataset_id)

        self.assertEqual(dataset.num_rows, self.NUM_ROWS + 1)
        self.assertEqual(expected_col_schema, dataset.schema[column])
Example #17
    def __verify_create(self, response):
        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertTrue(self.dataset_id in response[self.controller.SUCCESS])

        self.__wait_for_calculation_ready(self.dataset_id, self.name)

        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue(self.name in dataset.schema.keys())

        dataset = Dataset.find_one(self.dataset_id)

        self.assertEqual(TestAbstractDatasets.NUM_ROWS, len(dataset.dframe()))
Example #18
    def test_merge_with_map(self):
        dataset_id1 = self._post_file()
        dataset_id2 = self._post_file('good_eats_aux.csv')
        merged_dataset_id = self._post_merge([dataset_id1, dataset_id2])

        expected_columns = Dataset.find_one(
            dataset_id1).dframe().columns.tolist()
        expected_columns.remove("food_type")
        expected_columns.append("food_type_2")
        expected_columns = set(expected_columns)

        merged_dataset = Dataset.find_one(merged_dataset_id)
        new_columns = set(merged_dataset.dframe().columns)

        self.assertEqual(expected_columns, new_columns)
Example #19
    def test_create_remove_summary(self):
        dataset_id = self._post_file()
        Datasets().summary(
            dataset_id,
            select=Datasets.SELECT_ALL_FOR_SUMMARY)
        dataset = Dataset.find_one(dataset_id)

        self.assertTrue(isinstance(dataset.stats, dict))
        self.assertTrue(isinstance(dataset.stats[Dataset.ALL], dict))

        self._post_formula()
        # stats should have new column for calculation
        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue(self.name in dataset.stats.get(Dataset.ALL).keys())
Example #20
    def test_merge_with_map(self):
        dataset_id1 = self._post_file()
        dataset_id2 = self._post_file('good_eats_aux.csv')
        merged_dataset_id = self._post_merge([dataset_id1, dataset_id2])

        expected_columns = Dataset.find_one(
            dataset_id1).dframe().columns.tolist()
        expected_columns.remove("food_type")
        expected_columns.append("food_type_2")
        expected_columns = set(expected_columns)

        merged_dataset = Dataset.find_one(merged_dataset_id)
        new_columns = set(merged_dataset.dframe().columns)

        self.assertEqual(expected_columns, new_columns)
Example #21
    def test_update_row(self):
        dataset_id = self._post_file()
        index = 0
        update = {"amount": 10, "food_type": "breakfast"}
        expected_dframe = Dataset.find_one(dataset_id).dframe()
        expected_row = expected_dframe.ix[0].to_dict()
        expected_row.update(update)
        expected_dframe.ix[0] = Series(expected_row)

        results = json.loads(
            self.controller.row_update(dataset_id, index, json.dumps(update)))
        self.assertTrue(Datasets.SUCCESS in results.keys())

        dframe = Dataset.find_one(dataset_id).dframe()
        self.assertEqual(self.NUM_ROWS, len(dframe))
        self._check_dframes_are_equal(expected_dframe, dframe)
Example #22
    def test_create_update_summary(self):
        dataset_id = self._post_file()
        Datasets().summary(
            dataset_id,
            select=Datasets.SELECT_ALL_FOR_SUMMARY)
        dataset = Dataset.find_one(dataset_id)

        self.assertTrue(isinstance(dataset.stats, dict))
        self.assertTrue(isinstance(dataset.stats[Dataset.ALL], dict))

        self.__post_formula()

        # stats should have new column for calculation
        dataset = Dataset.find_one(self.dataset_id)
        stats = dataset.stats.get(Dataset.ALL)
        self.assertTrue(self.name in stats.keys())
Example #23
    def test_find(self):
        for dataset_name in self.TEST_DATASETS:
            record = Dataset().save(self.test_dataset_ids[dataset_name])
            rows = Dataset.find(self.test_dataset_ids[dataset_name])
            self.assertEqual(record, rows[0].record)
            self.assertEqual(record, Dataset.find_one(
                             self.test_dataset_ids[dataset_name]).record)
Example #24
    def test_create_two_from_schema_and_join_and_update_lhs_rhs(self):
        left_dataset_id = self.dataset_id

        result = self._upload_from_schema('good_eats_aux.schema.json')
        right_dataset_id = result[Dataset.ID]

        on = 'food_type'
        result = json.loads(self.controller.join(
            left_dataset_id, right_dataset_id, on=on))
        merged_dataset_id = self.dataset_id = result[Dataset.ID]

        num_rows = 0
        results = json.loads(self.controller.show(merged_dataset_id))
        self.assertEqual(num_rows, len(results))

        num_rows += 1
        self._put_row_updates(left_dataset_id,
                              file_name='good_eats_update_bg.json')
        results = json.loads(self.controller.show(merged_dataset_id))
        self.assertEqual(num_rows, len(results))

        prev_num_cols = len(
            Dataset.find_one(merged_dataset_id).dframe().columns)
        self.dataset_id = right_dataset_id
        self._put_row_updates(right_dataset_id,
                              file_name='good_eats_aux_update.json')
        results = json.loads(self.controller.show(merged_dataset_id))
        self.assertEqual(num_rows, len(results))
        result = results[0]
        self.assertTrue('code' in result.keys())
        self.assertFalse(result['code'] is None)
Example #25
    def test_find_one(self):
        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset.create(self.test_dataset_ids[dataset_name])
            record = dataset.record
            row = Dataset.find_one(self.test_dataset_ids[dataset_name])

            self.assertEqual(record, row.record)
Example #26
    def test_merge_with_map_update(self):
        dataset_id1 = self._post_file()
        dataset_id2 = self._post_file('good_eats_aux.csv')
        merged_dataset_id = self._post_merge([dataset_id1, dataset_id2])

        original_ds2 = json.loads(self.controller.show(dataset_id2))
        original_length = len(original_ds2)
        original_merge = json.loads(self.controller.show(merged_dataset_id))
        original_merge_length = len(original_merge)

        self._put_row_updates(dataset_id2, 'good_eats_aux_update.json')
        response = json.loads(self.controller.show(dataset_id2))
        new_length = len(response)

        for new_row in response:
            if new_row not in original_ds2:
                break

        response = json.loads(self.controller.show(merged_dataset_id))

        for new_merge_row in response:
            if new_merge_row not in original_merge:
                break

        new_merge_length = len(response)

        self.assertEqual(original_length + 1, new_length)
        self.assertEqual(original_merge_length + 1, new_merge_length)
        self.assertEqual(new_row['food_type'], new_merge_row['food_type_2'])
        self.assertEqual(new_row['code'], new_merge_row['comments'])
        merged_dataset = Dataset.find_one(merged_dataset_id)
Example #27
def merge_dataset_ids(dataset_ids):
    """Load a JSON array of dataset IDs and start a background merge task.

    :param dataset_ids: An array of dataset IDs to merge.

    :raises: `MergeError` if fewer than 2 datasets are provided. If a dataset
        cannot be found for a dataset ID it is ignored. Therefore, if 2
        dataset IDs are provided and one of them is bad, an error is raised;
        however, if 3 dataset IDs are provided and one of them is bad, no
        error is raised.
    """
    dataset_ids = json.loads(dataset_ids)
    datasets = [Dataset.find_one(dataset_id) for dataset_id in dataset_ids]
    datasets = [dataset for dataset in datasets if dataset.record]

    if len(datasets) < 2:
        raise MergeError(
            'merge requires 2 datasets (found %s)' % len(datasets))

    new_dataset = Dataset()
    new_dataset.save()

    call_async(_merge_datasets_task, new_dataset, datasets)

    return new_dataset
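
A minimal usage sketch for `merge_dataset_ids`, assuming the `json` import and the `Dataset`/`MergeError` definitions that the function above relies on; the two dataset IDs below are hypothetical placeholders:

# Hypothetical IDs: merge_dataset_ids expects a JSON-encoded array string.
payload = json.dumps(['dataset_id_1', 'dataset_id_2'])

try:
    placeholder = merge_dataset_ids(payload)
    # The merge itself runs in the background via call_async; the returned
    # dataset is saved immediately, but its merged rows only appear once
    # _merge_datasets_task completes.
    print(placeholder.dataset_id)
except MergeError as err:
    # Raised when fewer than 2 of the given IDs resolve to saved datasets.
    print('merge failed: %s' % err)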
Example #28
    def test_create_async_sets_calculation_status(self):
        self.dataset_id = create_dataset_from_url(
            '%s%s' % (self._local_fixture_prefix(), 'good_eats_huge.csv'),
            allow_local_file=True).dataset_id

        self._wait_for_dataset_state(self.dataset_id)

        response = json.loads(self._post_formula())

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertTrue(self.dataset_id in response[self.controller.SUCCESS])

        response = json.loads(self.controller.show(self.dataset_id))[0]

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(Calculation.STATE in response)
        self.assertEqual(response[Calculation.STATE],
                         Calculation.STATE_PENDING)

        self._wait_for_calculation_ready(self.dataset_id, self.name)

        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue(self.name in dataset.schema.keys())
Example #29
    def test_find_one(self):
        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset.create(self.test_dataset_ids[dataset_name])
            record = dataset.record
            row = Dataset.find_one(self.test_dataset_ids[dataset_name])

            self.assertEqual(record, row.record)
Example #30
    def test_delete(self):
        self.__post_formula()
        result = json.loads(self.controller.delete(self.dataset_id, self.name))

        self.assertTrue(AbstractController.SUCCESS in result)

        dataset = Dataset.find_one(self.dataset_id)
        self.assertTrue(self.name not in dataset.schema.labels_to_slugs)
Example #31
    def test_update(self):
        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset.create(self.test_dataset_ids[dataset_name])
            self.assertFalse('field' in dataset.record)
            dataset.update({'field': {'key': 'value'}})
            dataset = Dataset.find_one(self.test_dataset_ids[dataset_name])
            self.assertTrue('field' in dataset.record)
            self.assertEqual(dataset.record['field'], {'key': 'value'})
Example #32
    def test_update_diff_schema_unconvertable(self):
        dataset_id = self._post_file()
        dataset = Dataset.find_one(dataset_id)

        column = 'amount'
        update = json.dumps({column: 'a'})

        expected_col_schema = dataset.schema[column]

        result = json.loads(
            self.controller.update(dataset_id=dataset_id, update=update))
        dataset = Dataset.find_one(dataset_id)

        # the update is rejected
        self.assertTrue(Datasets.ERROR in result)
        self.assertEqual(dataset.num_rows, self.NUM_ROWS)
        self.assertEqual(expected_col_schema, dataset.schema[column])
Example #33
    def test_update_diff_schema_unconvertable(self):
        dataset_id = self._post_file()
        dataset = Dataset.find_one(dataset_id)

        column = 'amount'
        update = json.dumps({column: 'a'})

        expected_col_schema = dataset.schema[column]

        result = json.loads(self.controller.update(dataset_id=dataset_id,
                                                   update=update))
        dataset = Dataset.find_one(dataset_id)

        # the update is rejected
        self.assertTrue(Datasets.ERROR in result)
        self.assertEqual(dataset.num_rows, self.NUM_ROWS)
        self.assertEqual(expected_col_schema, dataset.schema[column])
Example #34
    def __init__(self, db, dataset_id=None, dataset=None):
        self._db = db
        if dataset_id:
            self.dataset = Dataset.find_one(dataset_id)
        if dataset:
            self.dataset = dataset
        if not dataset_id and not dataset:
            raise Exception(u"Please specify a dataset_id")
Example #35
    def test_delete_row(self):
        dataset_id = self._post_file()
        index = 0
        expected_dframe = Dataset.find_one(
            dataset_id).dframe()[index + 1:].reset_index()
        del expected_dframe["index"]

        results = json.loads(self.controller.row_delete(dataset_id, index))
        self.assertTrue(Datasets.SUCCESS in results.keys())

        dataset = Dataset.find_one(dataset_id)
        dframe = dataset.dframe()
        self.assertEqual(self.NUM_ROWS - 1, len(dframe))
        self._check_dframes_are_equal(expected_dframe, dframe)

        # check info updated
        info = dataset.info()
        self.assertEqual(self.NUM_ROWS - 1, info[Dataset.NUM_ROWS])
Example #36
    def __init__(self, db, dataset_id=None, dataset=None):
        self._db = db
        if dataset_id:
            self.dataset = Dataset.find_one(dataset_id)
        if dataset:
            self.dataset = dataset
        if not dataset_id and not dataset:
            raise Exception(u"Please specify a dataset_id")
Example #37
    def test_delete_with_query(self):
        dataset_id = self._post_file()
        query = {'food_type': 'caffeination'}
        dataset = Dataset.find_one(dataset_id)
        dframe = dataset.dframe(query_args=QueryArgs(query=query))
        len_after_delete = len(dataset.dframe()) - len(dframe)

        query = json.dumps(query)
        result = json.loads(self.controller.delete(dataset_id, query=query))
        message = result[Datasets.SUCCESS]

        self.assertTrue('deleted dataset' in message)
        self.assertTrue(query in message)
        self.assertEqual(result[Dataset.ID], dataset_id)

        dframe = Dataset.find_one(dataset_id).dframe()

        self.assertEqual(len(dframe), len_after_delete)
Example #38
    def test_update(self):
        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset.create(self.test_dataset_ids[dataset_name])
            self.assertFalse('field' in dataset.record)
            dataset.update({'field': {'key': 'value'}})
            dataset = Dataset.find_one(self.test_dataset_ids[dataset_name])

            self.assertTrue('field' in dataset.record)
            self.assertEqual(dataset.record['field'], {'key': 'value'})
Example #39
        def action(dataset):
            other_dataset = Dataset.find_one(other_dataset_id)

            if other_dataset.record:
                merged_dataset = dataset.join(other_dataset, on)

                return self._success('joined dataset %s to %s on %s' % (
                    other_dataset_id, dataset_id, on),
                    merged_dataset.dataset_id)
Example #40
    def test_update(self):
        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset.create(self.test_dataset_ids[dataset_name])
            self.assertFalse("field" in dataset.record)
            dataset.update({"field": {"key": "value"}})
            dataset = Dataset.find_one(self.test_dataset_ids[dataset_name])

            self.assertTrue("field" in dataset.record)
            self.assertEqual(dataset.record["field"], {"key": "value"})
Example #41
    def test_delete_with_query(self):
        dataset_id = self._post_file()
        query = {'food_type': 'caffeination'}
        dataset = Dataset.find_one(dataset_id)
        dframe = dataset.dframe(query_args=QueryArgs(query=query))
        len_after_delete = len(dataset.dframe()) - len(dframe)

        query = json.dumps(query)
        result = json.loads(self.controller.delete(dataset_id, query=query))
        message = result[Datasets.SUCCESS]

        self.assertTrue('deleted dataset' in message)
        self.assertTrue(query in message)
        self.assertEqual(result[Dataset.ID], dataset_id)

        dframe = Dataset.find_one(dataset_id).dframe()

        self.assertEqual(len(dframe), len_after_delete)
Example #42
    def _wait_for_dataset_state(self, dataset_id):
        while True:
            dataset = Dataset.find_one(dataset_id)

            if dataset.state != Dataset.STATE_PENDING:
                break

            sleep(self.SLEEP_DELAY)

        return dataset
Example #43
    def test_edit_row_with_join_invalid(self):
        index = 0
        update = {'food_type': 'deserts'}

        left_dataset_id = self._post_file()
        right_dataset_id = self._post_file('good_eats_aux.csv')
        num_rows_before = Dataset.find_one(right_dataset_id).num_rows
        on = 'food_type'
        json.loads(
            self.controller.join(left_dataset_id, right_dataset_id, on=on))

        results = json.loads(
            self.controller.row_update(right_dataset_id, index,
                                       json.dumps(update)))
        self.assertTrue(Datasets.SUCCESS in results.keys())

        dataset = Dataset.find_one(right_dataset_id)
        self.assertEqual(num_rows_before, dataset.num_rows)
        self.assertEqual(dataset.pending_updates, [])
Example #44
    def test_create_from_csv(self):
        result = self.__upload_mocked_file()
        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)

        # test parse type as date correctly
        dframe = Dataset.find_one(result[Dataset.ID]).dframe()
        self.assertTrue(isinstance(dframe.submit_date[0], datetime))

        results = self._test_summary_built(result)
        self._test_summary_no_group(results)
Example #45
    def test_edit_row(self):
        dataset_id = self._post_file()
        index = 0
        update = {'amount': 10, 'food_type': 'breakfast'}
        expected_dframe = Dataset.find_one(dataset_id).dframe()
        expected_row = expected_dframe.ix[0].to_dict()
        expected_row.update(update)
        expected_dframe.ix[0] = Series(expected_row)

        results = json.loads(
            self.controller.row_update(dataset_id, index, json.dumps(update)))
        self.assertTrue(Datasets.SUCCESS in results.keys())

        dataset = Dataset.find_one(dataset_id)
        dframe = dataset.dframe()
        self.assertEqual(self.NUM_ROWS, len(dframe))
        self._check_dframes_are_equal(expected_dframe, dframe)

        # check that previous row exists
        all_observations = Observation.find(dataset, include_deleted=True)
        self.assertEqual(self.NUM_ROWS + 1, len(all_observations))
Example #46
    def test_summary_async(self):
        dataset_id = self._post_file()
        results = self.controller.summary(
            dataset_id, select=self.controller.SELECT_ALL_FOR_SUMMARY)
        dataset = Dataset.find_one(dataset_id)

        self.assertEqual(dataset.state, Dataset.STATE_PENDING)

        results = self._test_summary_results(results)

        self.assertTrue(Datasets.ERROR in results.keys())
        self.assertTrue('not finished' in results[Datasets.ERROR])
Example #47
    def test_newest(self):
        expected_dataset = {
            u'wp_functional': {0: u'no', 1: u'yes', 2: u'no', 3: u'yes'},
            u'id': {0: 1, 1: 2, 2: 3, 3: 4}}
        dataset_id = self._post_file('newest_test.csv')
        self.controller.create(dataset_id,
                               'newest(submit_date,functional)',
                               'wp_functional', group='id')
        dataset = Dataset.find_one(dataset_id)
        agg_ds = dataset.aggregated_dataset('id')

        self.assertEqual(expected_dataset, agg_ds.dframe().to_dict())
Example #48
    def test_create_aggregation(self):
        self.formula = 'sum(amount)'
        self.name = 'test'
        response = json.loads(self.__post_formula())

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertEqual(response[Dataset.ID], self.dataset_id)

        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue('' in dataset.aggregated_datasets_dict.keys())
Example #49
    def test_create_from_json(self):
        mock = self._file_mock(self._fixture_path_prefix('good_eats.json'))
        result = json.loads(self.controller.create(json_file=mock))

        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)

        # test parse type as date correctly
        dframe = Dataset.find_one(result[Dataset.ID]).dframe()
        self.assertTrue(isinstance(dframe.submit_date[0], datetime))

        results = self._test_summary_built(result)
        self._test_summary_no_group(results)
Example #50
    def test_plot_index(self):
        dataset_id = self._post_file()
        dataset = Dataset.find_one(dataset_id)

        column = 'amount'
        select = {column: 1}
        result = self.controller.plot(dataset_id,
                                      select=json.dumps(select),
                                      index='submit_date')
        dframe = dataset.dframe(QueryArgs(select=select))
        self.__test_result(result, dframe)
Example #51
    def test_join_datasets(self):
        left_dataset_id = self._post_file()
        right_dataset_id = self._post_file('good_eats_aux.csv')
        on = 'food_type'
        results = json.loads(
            self.controller.join(left_dataset_id, right_dataset_id, on=on))

        self.assertTrue(Datasets.SUCCESS in results.keys())
        self.assertTrue(Dataset.ID in results.keys())

        joined_dataset_id = results[Dataset.ID]
        data = json.loads(self.controller.show(joined_dataset_id))

        self.assertTrue('code' in data[0].keys())

        left_dataset = Dataset.find_one(left_dataset_id)
        right_dataset = Dataset.find_one(right_dataset_id)

        self.assertEqual([('right', right_dataset_id, on, joined_dataset_id)],
                         left_dataset.joined_dataset_ids)
        self.assertEqual([('left', left_dataset_id, on, joined_dataset_id)],
                         right_dataset.joined_dataset_ids)
Example #52
    def test_create_async_not_ready(self):
        self.dataset_id = self._create_dataset_from_url(
            '%s%s' % (self._local_fixture_prefix(), 'good_eats_huge.csv'))
        response = json.loads(self.__post_formula())
        dataset = Dataset.find_one(self.dataset_id)

        self.assertFalse(dataset.is_ready)
        self.assertTrue(isinstance(response, dict))
        self.assertFalse(DATASET_ID in response)

        self._wait_for_dataset_state(self.dataset_id)

        self.assertFalse(self.name in dataset.schema.keys())
Example #53
    def test_bad_date(self):
        dataset_id = self._post_file('bad_date.csv')
        dataset = Dataset.find_one(dataset_id)

        self.assertEqual(dataset.num_rows, 1)
        self.assertEqual(len(dataset.schema.keys()), 3)

        result = json.loads(
            self.controller.summary(
                dataset_id,
                select=self.controller.SELECT_ALL_FOR_SUMMARY,
                group='name'))

        self.assertTrue('name' in result.keys())