def test_find_one(self):
    """A created dataset can be retrieved again via ``find_one``."""
    for name in self.TEST_DATASETS:
        dataset_id = self.test_dataset_ids[name]
        created = Dataset.create(dataset_id)
        expected_record = created.record
        found = Dataset.find_one(dataset_id)
        self.assertEqual(expected_record, found.record)
def test_delete(self):
    """Deleting a dataset removes all of its stored records."""
    for name in self.TEST_DATASETS:
        dataset_id = self.test_dataset_ids[name]
        dataset = Dataset.create(dataset_id)
        # the dataset must exist before deletion for the check to mean anything
        self.assertNotEqual(Dataset.find(dataset_id), [])
        dataset.delete()
        # after deletion no records remain
        self.assertEqual(Dataset.find(dataset_id), [])
def test_update(self):
    """``update`` persists a new field onto the stored dataset record."""
    for name in self.TEST_DATASETS:
        dataset_id = self.test_dataset_ids[name]
        dataset = Dataset.create(dataset_id)
        # the field must not exist prior to the update
        self.assertNotIn('field', dataset.record)
        dataset.update({'field': {'key': 'value'}})
        # reload from storage to confirm the update was persisted
        stored = Dataset.find_one(dataset_id)
        self.assertIn('field', stored.record)
        self.assertEqual(stored.record['field'], {'key': 'value'})
def test_update(self):
    """Updating a dataset stores the new field and its value."""
    for name in self.TEST_DATASETS:
        dataset = Dataset.create(self.test_dataset_ids[name])
        self.assertFalse("field" in dataset.record)
        dataset.update({"field": {"key": "value"}})
        # fetch a fresh copy so we verify what was actually saved
        refreshed = Dataset.find_one(self.test_dataset_ids[name])
        self.assertTrue("field" in refreshed.record)
        self.assertEqual({"key": "value"}, refreshed.record["field"])
def test_delete(self):
    """Deleting a dataset removes its records and its observation encoding."""
    for name in self.TEST_DATASETS:
        dataset_id = self.test_dataset_ids[name]
        dataset = Dataset.create(dataset_id)
        before = Dataset.find(dataset_id)
        self.assertNotEqual(before, [])
        dataset.delete()
        after = Dataset.find(dataset_id)
        self.assertEqual(after, [])
        # the per-dataset encoding must be cleaned up as well
        self.assertEqual(Observation.encoding(dataset), None)
def test_dframe(self):
    """``dframe`` returns the saved observations as a pandas DataFrame."""
    dataset = Dataset.create(self.test_dataset_ids["good_eats.csv"])
    dataset.save_observations(
        recognize_dates(self.get_data("good_eats.csv")))
    result = dataset.dframe()
    self.assertTrue(isinstance(result, DataFrame))
    # frame content must match the source data, column for column
    expected = self.get_data("good_eats.csv").reindex(
        columns=result.columns)
    self.assertTrue(all(expected.eq(result)))
    # the reserved mongo id key must not leak into the frame
    self.assertFalse(MONGO_ID_ENCODED in result.columns)
    # dates should have been decoded into datetime objects
    self.assertTrue(isinstance(result.submit_date[0], datetime))
def test_dframe(self):
    """Saved observations come back as a DataFrame with dates decoded."""
    dataset = Dataset.create(self.test_dataset_ids['good_eats.csv'])
    source = recognize_dates(self.get_data('good_eats.csv'))
    dataset.save_observations(source)
    dframe = dataset.dframe()
    self.assertTrue(isinstance(dframe, DataFrame))
    # align the fixture to the frame's column order before comparing
    aligned = self.get_data('good_eats.csv').reindex(
        columns=dframe.columns)
    self.assertTrue(all(aligned.eq(dframe)))
    columns = dframe.columns
    # reserved keys must be stripped from the stored frame
    self.assertFalse(MONGO_ID_ENCODED in columns)
    # submit_date must be decoded into a real datetime
    self.assertTrue(isinstance(dframe.submit_date[0], datetime))
def test_dframe(self):
    """``dframe`` reconstructs the saved data without reserved keys.

    Saves the good_eats fixture, then checks that the returned frame
    matches the source data, contains none of the MongoDB-reserved
    column names, and has its date column decoded into ``datetime``
    objects.
    """
    dataset = Dataset.create(self.test_dataset_ids['good_eats.csv'])
    dataset.save_observations(
        recognize_dates(self.get_data('good_eats.csv')))
    # NOTE: a previous version fetched every Observation into an unused
    # local here; that dead query has been removed.
    dframe = dataset.dframe()
    self.assertTrue(isinstance(dframe, DataFrame))
    self.assertTrue(all(self.get_data('good_eats.csv').reindex(
        columns=dframe.columns).eq(dframe)))
    columns = dframe.columns
    # ensure no reserved keys leak into the frame
    for key in MONGO_RESERVED_KEY_STRS:
        self.assertFalse(key in columns)
    # ensure date is converted
    self.assertTrue(isinstance(dframe.submit_date[0], datetime))
def test_build_schema(self):
    """``build_schema`` derives a complete, well-formed schema.

    For each fixture: every schema column must be unique, contain only
    legal characters, carry the required attributes, and map back to
    exactly one original dataframe column; the reserved mongo id key
    must not appear as a column name.
    """
    for dataset_name in self.TEST_DATASETS:
        dataset = Dataset.create(self.test_dataset_ids[dataset_name])
        dataset.build_schema(self.get_data(dataset_name))

        # get dataset with new schema
        dataset = Dataset.find_one(self.test_dataset_ids[dataset_name])

        for key in [
                Dataset.CREATED_AT, Dataset.SCHEMA, Dataset.UPDATED_AT]:
            self.assertTrue(key in dataset.record.keys())

        df_columns = self.get_data(dataset_name).columns.tolist()
        seen_columns = []

        for column_name, column_attributes in dataset.schema.items():
            # check column_name is unique
            self.assertFalse(column_name in seen_columns)
            seen_columns.append(column_name)

            # check column name is only legal chars
            self.assertFalse(RE_ENCODED_COLUMN.search(column_name))

            # check has required attributes
            self.assertTrue(SIMPLETYPE in column_attributes)
            self.assertTrue(OLAP_TYPE in column_attributes)
            self.assertTrue(Dataset.LABEL in column_attributes)

            # check label is an original column
            original_col = column_attributes[Dataset.LABEL]
            error_msg = '%s not in %s' % (original_col, df_columns)
            self.assertTrue(original_col in df_columns, error_msg)
            df_columns.remove(column_attributes[Dataset.LABEL])

            # check not reserved key
            self.assertFalse(column_name == MONGO_ID_ENCODED)

        # every original column must be consumed by the schema;
        # assertEqual reports any leftover columns on failure
        self.assertEqual(df_columns, [])
def test_build_schema(self):
    """``build_schema`` derives a complete, well-formed schema.

    For each fixture: every schema column must be unique, contain only
    legal characters, carry the required attributes, and map back to
    exactly one original dataframe column; no MongoDB-reserved key may
    appear as a column name.
    """
    for dataset_name in self.TEST_DATASETS:
        dataset = Dataset.create(self.test_dataset_ids[dataset_name])
        dataset.build_schema(self.get_data(dataset_name))

        # get dataset with new schema
        dataset = Dataset.find_one(self.test_dataset_ids[dataset_name])

        for key in [
                Dataset.CREATED_AT, Dataset.SCHEMA, Dataset.UPDATED_AT]:
            self.assertTrue(key in dataset.record.keys())

        df_columns = self.get_data(dataset_name).columns.tolist()
        seen_columns = []

        for column_name, column_attributes in dataset.schema.items():
            # check column_name is unique
            self.assertFalse(column_name in seen_columns)
            seen_columns.append(column_name)

            # check column name is only legal chars
            self.assertFalse(RE_ENCODED_COLUMN.search(column_name))

            # check has required attributes
            self.assertTrue(SIMPLETYPE in column_attributes)
            self.assertTrue(OLAP_TYPE in column_attributes)
            self.assertTrue(Dataset.LABEL in column_attributes)

            # check label is an original column
            original_col = column_attributes[Dataset.LABEL]
            error_msg = '%s not in %s' % (original_col, df_columns)
            self.assertTrue(original_col in df_columns, error_msg)
            df_columns.remove(column_attributes[Dataset.LABEL])

            # check not reserved key
            self.assertFalse(column_name in MONGO_RESERVED_KEY_STRS)

        # every original column must be consumed by the schema;
        # assertEqual reports any leftover columns on failure
        self.assertEqual(df_columns, [])
def merge_dataset_ids(dataset_ids, mapping):
    """Load a JSON array of dataset IDs and start a background merge task.

    :param dataset_ids: An array of dataset IDs to merge.
    :param mapping: Passed through unchanged to the merge task.

    :raises: `MergeError` if less than 2 datasets are provided. If a
        dataset cannot be found for a dataset ID it is ignored. Therefore
        if 2 dataset IDs are provided and one of them is bad an error is
        raised.  However, if three dataset IDs are provided and one of
        them is bad, an error is not raised.
    """
    found = [Dataset.find_one(dataset_id) for dataset_id in dataset_ids]
    # datasets with no backing record could not be found; drop them
    datasets = [d for d in found if d.record]

    if len(datasets) < 2:
        raise MergeError(
            'merge requires 2 datasets (found %s)' % len(datasets))

    new_dataset = Dataset.create()
    call_async(__merge_datasets_task, new_dataset, datasets, mapping)

    return new_dataset
def merge_dataset_ids(dataset_ids, mapping):
    """Load a JSON array of dataset IDs and start a background merge task.

    :param dataset_ids: An array of dataset IDs to merge.
    :param mapping: Passed through unchanged to the merge task.

    :raises: `MergeError` if less than 2 datasets are provided. If a
        dataset cannot be found for a dataset ID it is ignored. Therefore
        if 2 dataset IDs are provided and one of them is bad an error is
        raised.  However, if three dataset IDs are provided and one of
        them is bad, an error is not raised.
    """
    datasets = []

    for dataset_id in dataset_ids:
        candidate = Dataset.find_one(dataset_id)
        # unknown IDs yield a dataset with no record; skip them silently
        if candidate.record:
            datasets.append(candidate)

    if len(datasets) < 2:
        raise MergeError('merge requires 2 datasets (found %s)' %
                         len(datasets))

    new_dataset = Dataset.create()
    call_async(__merge_datasets_task, new_dataset, datasets, mapping)

    return new_dataset
def _post_file(self, file_name='good_eats.csv'):
    """Create a dataset by importing *file_name* and return its ID."""
    dataset = Dataset.create()
    mock_file = self._file_mock(self._fixture_path_prefix(file_name))
    return dataset.import_from_csv(mock_file).dataset_id
def _create_dataset_from_url(self, url):
    """Create a dataset by importing *url* and return its ID."""
    new_dataset = Dataset.create()
    imported = new_dataset.import_from_url(url, allow_local_file=True)
    return imported.dataset_id
def test_create(self):
    """``Dataset.create`` returns a Dataset instance for every fixture."""
    for name in self.TEST_DATASETS:
        created = Dataset.create(self.test_dataset_ids[name])
        self.assertTrue(isinstance(created, Dataset))
def test_count(self):
    """``count`` agrees with the length of the materialized dframe."""
    dataset = Dataset.create(self.test_dataset_ids["good_eats.csv"])
    dataset.save_observations(
        recognize_dates(self.get_data("good_eats.csv")))
    expected = len(dataset.dframe())
    self.assertEqual(expected, dataset.count())
def test_count(self):
    """The stored row count matches the dataframe length."""
    dataset = Dataset.create(self.test_dataset_ids['good_eats.csv'])
    source = recognize_dates(self.get_data('good_eats.csv'))
    dataset.save_observations(source)
    frame_length = len(dataset.dframe())
    self.assertEqual(frame_length, dataset.count())