class TestMimirDatasetAnnotations(unittest.TestCase):

    def setUp(self):
        """Create empty server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.mkdir(SERVER_DIR)
        self.fileserver = FileSystemFilestore(FILESERVER_DIR)
        self.db = MimirDatastore(DATASTORE_DIRECTORY)

    def tearDown(self):
        """Delete server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_dataset_annotations(self):
        """Run test for Mimir datastore."""
        dh = self.db.load_dataset(
            f_handle=self.fileserver.upload_file(DATA_FILE))
        ds = self.db.get_dataset(dh.identifier)
        rows = ds.fetch_rows()
        print(ds.row_ids)
        for row in rows:
            print(str(row.identifier) + '\t' + str(row.values))
        for row_id in ds.row_ids:
            for anno in ds.get_annotations(column_id=1, row_id=row_id):
                print(str(row_id) + '\t' + anno.key + '=' + str(anno.value))
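# The test classes in this section assume a shared module-level setup. The
# sketch below is an assumption about what that setup looks like: the real
# test modules define their own paths, fixture files, and vizier imports,
# and the concrete values here are illustrative only.
import os
import shutil
import unittest

SERVER_DIR = '.tmp'
DATASTORE_DIR = os.path.join(SERVER_DIR, 'ds')
DATASTORE_DIRECTORY = DATASTORE_DIR
FILESTORE_DIR = os.path.join(SERVER_DIR, 'fs')
FILESERVER_DIR = FILESTORE_DIR
DATASET_NAME = 'people'
DATA_FILE = os.path.join('tests', 'data', 'dataset.csv')
CSV_FILE = DATA_FILE
# Similar constants (GEO_FILE, KEY_REPAIR_FILE, INCOMPLETE_CSV_FILE,
# PICKER_FILE, SORT_FILE, CSV_FILE_1, CSV_FILE_2, DOWNLOAD_URL, ...) point
# at the other fixture files and resources used below.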
class TestSQLProcessor(unittest.TestCase):

    def setUp(self):
        """Create an instance of the Mimir processor for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.datastore = MimirDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_run_sql_query(self):
        """Test running a SQL query without materializing the result."""
        f_handle = self.filestore.upload_file(CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        cmd = sql_cell(
            source='SELECT grade_or_service_category FROM ' + DATASET_NAME +
            ' WHERE program = \'GENERAL EDUCATION\'',
            validate=True)
        result = SQLTaskProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                datasets={DATASET_NAME: ds.identifier},
                datastore=self.datastore,
                filestore=self.filestore))
        self.assertTrue(result.is_success)
        self.assertIsNone(result.provenance.read)
        self.assertIsNone(result.provenance.write)
        self.assertTrue(len(result.outputs.stdout) > 0)
        self.assertEqual(len(result.outputs.stderr), 0)
        # Materialize the result
        cmd = sql_cell(
            source='SELECT grade_or_service_category FROM ' + DATASET_NAME +
            ' WHERE program = \'GENERAL EDUCATION\'',
            output_dataset='ge',
            validate=True)
        result = SQLTaskProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                datasets={DATASET_NAME: ds.identifier},
                datastore=self.datastore,
                filestore=self.filestore))
        self.assertTrue(result.is_success)
        self.assertIsNone(result.provenance.read)
        self.assertIsNotNone(result.provenance.write)
        self.assertTrue('ge' in result.provenance.write)
        self.assertTrue(len(result.outputs.stdout) > 0)
        self.assertEqual(len(result.outputs.stderr), 0)
def get_datastore(self, identifier):
    """Get the datastore instance for the project with the given identifier.

    Parameters
    ----------
    identifier: string
        Unique identifier for datastore

    Returns
    -------
    vizier.datastore.base.Datastore
    """
    datastore_dir = os.path.join(self.base_path, identifier)
    return MimirDatastore(datastore_dir)
def test_mimir_config(self):
    """Run workflows for Mimir configurations."""
    # Create new work trail and retrieve the HEAD workflow of the default
    # branch
    import vizier.mimir as mimir  # noqa: F401
    self.run_workflow(MimirDatastore(DATASTORE_DIR))
class TestMimirDatastore(unittest.TestCase):

    def setup_fileserver(self):
        """Create a fresh file server."""
        if os.path.isdir(FILESERVER_DIR):
            shutil.rmtree(FILESERVER_DIR)
        os.mkdir(FILESERVER_DIR)
        self.fileserver = FileSystemFilestore(FILESERVER_DIR)

    def set_up(self):
        """Create empty data store directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.mkdir(SERVER_DIR)
        self.db = MimirDatastore(DATASTORE_DIR)

    def tear_down(self):
        """Delete data store directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_mimir_datastore(self):
        """Run test for Mimir datastore."""
        self.set_up()
        self.dataset_load()
        self.tear_down()
        self.set_up()
        self.datastore_init()
        self.tear_down()
        self.set_up()
        self.dataset_read()
        self.tear_down()
        self.set_up()
        self.dataset_column_index()
        self.tear_down()

    def datastore_init(self):
        """Test initializing a datastore with existing datasets."""
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        self.db = MimirDatastore(DATASTORE_DIR)

    def dataset_column_index(self):
        """Test the column by id index of the dataset handle."""
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        # Ensure that the first three column identifiers map to Name, Age,
        # and Salary
        self.assertEqual(ds.column_by_id(0).name.upper(), 'NAME')
        self.assertEqual(ds.column_by_id(1).name.upper(), 'AGE')
        self.assertEqual(ds.column_by_id(2).name.upper(), 'SALARY')
        with self.assertRaises(ValueError):
            ds.column_by_id(5)
        ds.columns.append(DatasetColumn(identifier=5, name='NEWNAME'))
        self.assertEqual(ds.column_by_id(5).name.upper(), 'NEWNAME')
        with self.assertRaises(ValueError):
            ds.column_by_id(4)

    def dataset_load(self):
        """Test create and delete dataset."""
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        # Ensure that the project data has three columns and two rows
        self.assertEqual(len(ds.columns), 3)
        self.assertEqual(len(ds.fetch_rows()), 2)
        self.assertEqual(ds.row_count, 2)

    def dataset_read(self):
        """Test reading a dataset."""
        self.setup_fileserver()
        dh = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        ds = self.db.get_dataset(dh.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(dh.identifier, ds.identifier)
        self.assertEqual(len(dh.columns), len(ds.columns))
        self.assertEqual(len(dh.fetch_rows()), len(ds_rows))
        self.assertEqual(dh.row_count, len(ds_rows))
        # Name,Age,Salary
        # Alice,23,35K
        # Bob,32,30K
        self.assertEqual(ds.column_index('Name'), 0)
        self.assertEqual(ds.column_index('Age'), 1)
        self.assertEqual(ds.column_index('Salary'), 2)
        row = ds_rows[0]
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(int(row.values[1]), 23)
        self.assertEqual(row.values[2], '35K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(int(row.values[1]), 32)
        self.assertEqual(row.values[2], '30K')
class TestDatasetPaginationReader(unittest.TestCase):

    def set_up(self, engine):
        """Create an empty file server repository."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.mkdir(SERVER_DIR)
        # Setup file server
        self.fs = FileSystemFilestore(FILESERVER_DIR)
        # Setup the respective datastore and Vizual engine
        if engine == ENGINEENV_DEFAULT:
            self.datastore = FileSystemDatastore(DATASTORE_DIR)
        elif engine == ENGINEENV_MIMIR:
            self.datastore = MimirDatastore(DATASTORE_DIR)

    def tear_down(self, engine):
        """Clean up by dropping the file server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_default_engine(self):
        """Test functionality for the default setup."""
        self.run_tests(ENGINEENV_DEFAULT)

    def test_mimir_engine(self):
        """Test functionality for the Mimir setup."""
        import vizier.mimir as mimir  # noqa: F401
        self.run_tests(ENGINEENV_MIMIR)

    def run_tests(self, engine):
        """Run sequence of tests for given configuration."""
        self.set_up(engine)
        ds = self.datastore.load_dataset(self.fs.upload_file(CSV_FILE_1))
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 7)
        rows = ds.fetch_rows(offset=1)
        self.assertEqual(len(rows), 6)
        self.assertEqual(rows[0].values[0], 'Bob')
        self.assertEqual(rows[5].values[0], 'Gertrud')
        rows = ds.fetch_rows(limit=2)
        self.assertEqual(len(rows), 2)
        self.assertEqual(rows[0].values[0], 'Alice')
        self.assertEqual(rows[1].values[0], 'Bob')
        rows = ds.fetch_rows(offset=4, limit=3)
        self.assertEqual(len(rows), 3)
        self.assertEqual(rows[0].values[0], 'Eileen')
        self.assertEqual(rows[2].values[0], 'Gertrud')
        rows = ds.fetch_rows(offset=5, limit=3)
        self.assertEqual(len(rows), 2)
        self.assertEqual(rows[0].values[0], 'Frank')
        self.assertEqual(rows[1].values[0], 'Gertrud')
        rows = ds.fetch_rows(offset=6, limit=3)
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0].values[0], 'Gertrud')
        # Test larger dataset with deletes
        ds = self.datastore.load_dataset(self.fs.upload_file(CSV_FILE_2))
        rows = ds.fetch_rows(offset=0, limit=10)
        self.assertEqual(len(rows), 10)
        rows = ds.fetch_rows(offset=10, limit=20)
        self.assertEqual(len(rows), 20)
        rows = ds.fetch_rows(offset=60, limit=10)
        self.assertEqual(len(rows), 3)
        self.tear_down(engine)
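# The pagination assertions above follow plain offset/limit slicing: skip
# `offset` rows, then return at most `limit` rows, truncated at the end of
# the dataset. A minimal pure-Python model of that contract; the helper
# below is hypothetical and not part of the vizier API.
from typing import List, Optional


def paginate(rows: List, offset: int = 0, limit: Optional[int] = None) -> List:
    """Return at most `limit` rows starting at `offset` (all if limit is None)."""
    end = None if limit is None else offset + limit
    return rows[offset:end]


# e.g. a 7-row dataset with offset=5, limit=3 yields only the 2 trailing rows
assert len(paginate(list(range(7)), offset=5, limit=3)) == 2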
class TestMimirProcessor(unittest.TestCase):
    """Individual tests for Mimir lenses. Run separately since each test has
    to initialize and shut down the Mimir gateway.
    """
    def setUp(self):
        """Create an instance of the Mimir processor for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.processor = MimirProcessor()
        self.datastore = MimirDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_domain_lens(self):
        """Test DOMAIN lens."""
        # Upload the file and create the dataset
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        col_age = ds.column_by_name('Age')
        command = cmd.mimir_domain(DATASET_NAME, col_age.identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        self.assertNotEqual(rows[2].values[ds.column_index('Age')], '')
        # Introduce an error. Make sure command formatting is correct
        command = cmd.mimir_domain('MY DS', 'MY COL')
        with self.assertRaises(ValueError):
            result = self.processor.compute(
                command_id=command.command_id,
                arguments=command.arguments,
                context=TaskContext(
                    datastore=self.datastore,
                    filestore=self.filestore,
                    datasets={DATASET_NAME: ds.identifier}))

    def test_geocode_lens(self):
        """Test GEOCODE lens."""
        # Upload the file and create the dataset
        f_handle = self.filestore.upload_file(GEO_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Geocode lens
        command = cmd.mimir_geocode(
            DATASET_NAME,
            'GOOGLE',
            house_nr=ds.column_by_name('STRNUMBER').identifier,
            street=ds.column_by_name('STRNAME').identifier,
            city=ds.column_by_name('CITY').identifier,
            state=ds.column_by_name('STATE').identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 6)
        self.assertTrue('LATITUDE' in columns)
        self.assertTrue('LONGITUDE' in columns)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 8)
        self.assertTrue('LATITUDE_1' in columns)
        self.assertTrue('LONGITUDE_1' in columns)
        self.assertEqual(len(ds.columns), 8)

    def test_key_repair_lens(self):
        """Test KEY REPAIR lens."""
        # Upload the file and create the dataset
        f_handle = self.filestore.upload_file(KEY_REPAIR_FILE)
        ds1 = self.datastore.load_dataset(f_handle=f_handle)
        # Key repair lens
        command = cmd.mimir_key_repair(DATASET_NAME,
                                       ds1.column_by_name('Empid').identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds1.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 4)
        self.assertEqual(ds.row_count, 3)
        names = set()
        empids = set()
        rowids = set()
        for row in ds.fetch_rows():
            rowids.add(row.identifier)
            empids.add(int(row.get_value('empid')))
            names.add(row.get_value('name'))
        self.assertTrue(1 in empids)
        self.assertTrue(2 in rowids)
        self.assertTrue('Alice' in names)
        self.assertTrue('Carla' in names)
        # Test error case and command text
        command = cmd.mimir_key_repair('MY DS', 'MY COL')
        with self.assertRaises(ValueError):
            self.processor.compute(
                command_id=command.command_id,
                arguments=command.arguments,
                context=TaskContext(
                    datastore=self.datastore,
                    filestore=self.filestore,
                    datasets={DATASET_NAME: ds.identifier}))

    def test_missing_value_lens(self):
        """Test MISSING_VALUE lens."""
        # Upload the file and create the dataset
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing value lens
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{'column': ds.column_by_name('AGE').identifier}])
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        self.assertNotEqual(rows[2].values[ds.column_index('Age')], '')
        # Missing value lens with value constraint
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{
                'column': ds.column_by_name('AGE').identifier,
                'constraint': '> 30'
            }],
        )
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        self.assertTrue(rows[2].values[ds.column_index('Age')] > 30)

    def test_missing_key_lens(self):
        """Test MISSING_KEY lens."""
        # Upload the file and create the dataset
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing key lens
        age_col = ds.column_by_name('Age').identifier
        command = cmd.mimir_missing_key(DATASET_NAME, age_col)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 24)
        command = cmd.mimir_missing_key(
            DATASET_NAME, ds.column_by_name('Salary').identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 55)

    def test_picker_lens(self):
        """Test PICKER lens."""
        # Upload the file and create the dataset
        f_handle = self.filestore.upload_file(PICKER_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        command = cmd.mimir_picker(
            DATASET_NAME,
            [{'pickFrom': ds.column_by_name('Age').identifier},
             {'pickFrom': ds.column_by_name('Salary').identifier}])
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        print(columns)
        self.assertEqual(len(ds.columns), 5)
        self.assertTrue('PICK_ONE_AGE_SALARY' in columns)
        # Pick another column, this time with custom name
        command = cmd.mimir_picker(
            DATASET_NAME,
            [{'pickFrom': ds.column_by_name('Age').identifier},
             {'pickFrom': ds.column_by_name('Salary').identifier}],
            pick_as='My_Column')
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(ds.columns), 6)
        self.assertTrue('PICK_ONE_AGE_SALARY' in columns)
        self.assertTrue('MY_COLUMN' in columns)
        # Pick from a picked column
        command = cmd.mimir_picker(
            DATASET_NAME,
            [{'pickFrom': ds.column_by_name('Age').identifier},
             {'pickFrom': ds.column_by_name('PICK_ONE_AGE_SALARY').identifier}],
            pick_as='My_Next_Column')
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertTrue('MY_NEXT_COLUMN' in columns)

    def test_schema_matching_lens(self):
        """Test SCHEMA_MATCHING lens."""
        # Upload the file and create the dataset
        f_handle = self.filestore.upload_file(CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Schema matching lens
        command = cmd.mimir_schema_matching(
            DATASET_NAME,
            [{'column': 'BDate', 'type': 'int'},
             {'column': 'PName', 'type': 'varchar'}],
            'new_' + DATASET_NAME)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write['new_' + DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 2)
        self.assertEqual(ds.row_count, 2)

    def test_type_inference_lens(self):
        """Test TYPE INFERENCE lens."""
        # Upload the file and create the dataset
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Infer type
        command = cmd.mimir_type_inference(DATASET_NAME, 0.6)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds2 = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds2.columns), 3)
        self.assertEqual(ds2.row_count, 7)
        ds1_rows = ds.fetch_rows()
        ds2_rows = ds2.fetch_rows()
        for i in range(ds2.row_count):
            self.assertEqual(ds1_rows[i].values, ds2_rows[i].values)
class TestDefaultVizualApi(unittest.TestCase):
    api: MimirVizualApi

    def setUp(self):
        """Create an instance of the default vizier API for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.api = MimirVizualApi()
        self.datastore = MimirDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_api(self):
        """Run all tests after we initialize Mimir. Make sure to create a
        fresh environment after each test.
        """
        self.delete_column()
        self.setUp()
        self.delete_row()
        self.setUp()
        self.filter_columns()
        self.setUp()
        self.insert_column()
        self.setUp()
        self.insert_row()
        self.setUp()
        self.load_dataset()
        self.setUp()
        self.move_column()
        self.setUp()
        self.move_row()
        self.setUp()
        self.rename_column()
        self.setUp()
        self.sequence_of_steps()
        self.setUp()
        self.sort_dataset()
        self.setUp()
        self.update_cell()

    def delete_column(self):
        """Test functionality to delete a column."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        row_ids = [row.identifier for row in ds_rows]
        # Delete Age column
        col_id = ds.column_by_name('AGE').identifier
        result = self.api.delete_column(ds.identifier, col_id, self.datastore)
        # Resulting dataset should differ from previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve modified dataset and ensure that it contains the following
        #
        # Name, Salary
        # ------------
        # Alice, 35K
        # Bob, 30K
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Schema is Name, Salary
        self.assertEqual(len(ds.columns), 2)
        self.assertEqual(ds.columns[0].name.upper(), 'NAME')
        self.assertEqual(ds.columns[1].name.upper(), 'SALARY')
        # Make sure that all rows only have two columns
        row = ds_rows[0]
        self.assertEqual(len(row.values), 2)
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(row.values[1], '35K')
        row = ds_rows[1]
        self.assertEqual(len(row.values), 2)
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(row.values[1], '30K')
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(MimirError):
            self.api.delete_column('unknown:uri', 0, self.datastore)
        # Ensure exception is thrown if column identifier is unknown
        with self.assertRaises(ValueError):
            self.api.delete_column(ds.identifier, 100, self.datastore)

    def delete_row(self):
        """Test functionality to delete a row."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Delete second row
        result = self.api.delete_row(ds.identifier, row_ids[1], self.datastore)
        del row_ids[1]
        # Resulting dataset should differ from previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve modified dataset and ensure that it contains the following
        # data:
        #
        # Name, Age, Salary
        # -----------------
        # Alice, 23, 35K
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Schema is Name, Age, Salary
        col_names = ['Name', 'Age', 'Salary']
        self.assertEqual(len(ds.columns), len(col_names))
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].name.upper(), col_names[i].upper())
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].identifier, col_ids[i])
        # There should only be one row
        self.assertEqual(len(ds_rows), 1)
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Ensure exception is thrown if dataset is unknown
        with self.assertRaises(MimirError):
            self.api.delete_row('unknown:uri', 0, self.datastore)

    def filter_columns(self):
        """Test projection of a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier).dataset
        result = self.api.filter_columns(
            ds.identifier, [2, 0], ['BD', None], self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        self.assertEqual(len(ds.columns), 2)
        self.assertEqual(ds.columns[0].name.upper(), 'BD')
        self.assertEqual(ds.columns[1].name.upper(), 'NAME')
        rows = ds.fetch_rows()
        self.assertEqual(rows[0].values, ['35K', 'Alice'])
        self.assertEqual(rows[1].values, ['30K', 'Bob'])

    def insert_column(self):
        """Test functionality to insert a column."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Insert column at position 1
        col_ids.insert(1, ds.max_column_id() + 1)
        result = self.api.insert_column(
            ds.identifier, 1, 'Height', self.datastore)
        # Resulting dataset should differ from previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve dataset and ensure that it has the following schema:
        # Name, Height, Age, Salary
        ds = self.datastore.get_dataset(result.dataset.identifier)
        col_names = ['Name', 'Height', 'Age', 'Salary']
        # Ensure that there are four columns
        self.assertEqual(len(ds.columns), len(col_names))
        print(ds.columns)
        for i in range(len(col_names)):
            col = ds.columns[i]
            self.assertEqual(col.name.upper(), col_names[i].upper())
        # Insert column at last position
        col_ids.append(ds.max_column_id() + 1)
        col_names.append('Weight')
        result = self.api.insert_column(
            ds.identifier, 4, 'Weight', self.datastore)
        # Resulting dataset should differ from previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve dataset and ensure that it has the following schema:
        # Name, Height, Age, Salary, Weight
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Ensure that there are five columns
        self.assertEqual(len(ds.columns), len(col_names))
        for i in range(len(col_names)):
            col = ds.columns[i]
            self.assertEqual(col.name.upper(), col_names[i].upper())
        # The cell values for the new columns are None; all other values are
        # not None
        for row in ds_rows:
            for i in range(len(ds.columns)):
                if i == 1 or i == 4:
                    self.assertIsNone(row.values[i])
                else:
                    self.assertTrue(row.values[i])
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(MimirError):
            self.api.insert_column('unknown:uri', 1, 'Height', self.datastore)
        # Ensure exception is thrown if column name is invalid
        self.api.insert_column(
            ds.identifier, 1, 'Height_from_ground', self.datastore)
        with self.assertRaises(ValueError):
            self.api.insert_column(
                ds.identifier, 1, 'Height from ground!@#', self.datastore)
        # Ensure exception is thrown if column position is out of bounds
        with self.assertRaises(ValueError):
            self.api.insert_column(ds.identifier, 100, 'Height', self.datastore)

    def insert_row(self):
        """Test functionality to insert a row."""
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier).dataset
        # Keep track of column and row identifiers
        ds_rows = ds.fetch_rows()
        row_ids = [row.identifier for row in ds_rows]
        # Insert row at index position 1
        row_ids.insert(1, None)
        # Result should indicate that one row was inserted. The identifier of
        # the resulting dataset should differ from the identifier of the
        # original dataset
        result = self.api.insert_row(ds.identifier, 1, self.datastore)
        # Resulting dataset should differ from previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve modified dataset
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Ensure that there are three rows
        self.assertEqual(len(ds_rows), 3)
        # The second row has empty values for each column
        row = ds_rows[1]
        self.assertEqual(len(row.values), len(ds.columns))
        for i in range(len(ds.columns)):
            self.assertIsNone(row.values[i])
        # Append row at end of current dataset
        row_ids.append(None)
        result = self.api.insert_row(ds.identifier, 3, self.datastore)
        # Resulting dataset should differ from previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Ensure that there are four rows
        self.assertEqual(len(ds_rows), 4)
        # The next to last row has non-empty values for each column
        row = ds_rows[2]
        self.assertEqual(len(row.values), len(ds.columns))
        for i in range(len(ds.columns)):
            self.assertIsNotNone(row.values[i])
        # The last row has empty values for each column
        row = ds_rows[3]
        self.assertEqual(len(row.values), len(ds.columns))
        for i in range(len(ds.columns)):
            self.assertIsNone(row.values[i])
        # Ensure that row ids haven't changed
        # ## July 16, 2020 by OK: Bug in mimir that is going to take a bunch of
        # ## heavy lifting to fix: https://github.com/UBOdin/mimir-api/issues/11
        # for i in range(len(ds_rows)):
        #     if row_ids[i] is not None:
        #         self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i]))
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(MimirError):
            self.api.insert_row('unknown:uri', 1, self.datastore)
        # Ensure no exception is raised
        self.api.insert_row(ds.identifier, 4, self.datastore)

    def load_dataset(self) -> None:
        """Test functionality to load a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        result = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier)
        ds = result.dataset
        resources = result.resources
        assert isinstance(ds, DatasetHandle)
        ds_rows = ds.fetch_rows()
        self.assertEqual(len(ds.columns), 3)
        self.assertEqual(len(ds_rows), 2)
        for row in ds_rows:
            self.assertTrue(isinstance(row.values[1], int))
        self.assertIsNotNone(resources)
        self.assertEqual(resources[RESOURCE_FILEID], fh.identifier)
        self.assertEqual(resources[RESOURCE_DATASET], ds.identifier)
        # Delete the file handle; loading it again should raise an exception
        self.filestore.delete_file(fh.identifier)
        with self.assertRaises(ValueError):
            self.api.load_dataset(
                datastore=self.datastore,
                filestore=self.filestore,
                file_id=fh.identifier)
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(ValueError):
            self.api.load_dataset(
                datastore=self.datastore,
                filestore=self.filestore,
                file_id='unknown:uri')
        # Test loading file from external resource. Skip if DOWNLOAD_URL is
        # None
        if DOWNLOAD_URL is None:
            print('Skipping download test')
            return
        result = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            url=DOWNLOAD_URL,
            options=[{'delimiter': '\t'}])
        ds = result.dataset
        resources = result.resources
        ds_rows = ds.fetch_rows()
        self.assertEqual(len(ds.columns), 4)
        self.assertEqual(len(ds_rows), 54)
        self.assertIsNotNone(resources)
        self.assertEqual(resources[RESOURCE_URL], DOWNLOAD_URL)
        self.assertEqual(resources[RESOURCE_DATASET], ds.identifier)
        # Attempt to simulate re-running without downloading again. Set the
        # Uri to some fake Uri that would raise an exception if an attempt was
        # made to download
        url = 'some fake uri'
        resources[RESOURCE_URL] = url
        result = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            url=url,
            resources=resources)
        # The cached dataset should be reused
        self.assertEqual(result.dataset.identifier, ds.identifier)
        prev_id = result.dataset.identifier
        # If we re-run with reload flag true a new dataset should be returned
        resources[RESOURCE_URL] = DOWNLOAD_URL
        result = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            url=DOWNLOAD_URL,
            resources=resources,
            reload=True,
            options=[{'delimiter': '\t'}])
        self.assertNotEqual(result.dataset.identifier, prev_id)

    def move_column(self):
        """Test functionality to move a column."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Swap first two columns
        c = col_ids[0]
        del col_ids[0]
        col_ids.insert(1, c)
        result = self.api.move_column(
            ds.identifier,
            ds.column_by_name('Name').identifier,
            1,
            self.datastore)
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'Age'.upper())
        self.assertEqual(ds.columns[1].name.upper(), 'Name'.upper())
        self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper())
        row = ds_rows[0]
        self.assertEqual(row.values[0], 23)
        self.assertEqual(row.values[1], 'Alice')
        self.assertEqual(row.values[2], '35K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 32)
        self.assertEqual(row.values[1], 'Bob')
        self.assertEqual(row.values[2], '30K')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i]))
        # Swap last two columns
        c = col_ids[1]
        del col_ids[1]
        col_ids.append(c)
        result = self.api.move_column(
            ds.identifier,
            ds.column_by_name('Salary').identifier,
            1,
            self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'Age'.upper())
        self.assertEqual(ds.columns[1].name.upper(), 'Salary'.upper())
        self.assertEqual(ds.columns[2].name.upper(), 'Name'.upper())
        row = ds_rows[0]
        self.assertEqual(row.values[0], 23)
        self.assertEqual(row.values[1], '35K')
        self.assertEqual(row.values[2], 'Alice')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 32)
        self.assertEqual(row.values[1], '30K')
        self.assertEqual(row.values[2], 'Bob')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i]))
        # No changes if source and target position are the same
        result = self.api.move_column(
            ds.identifier, ds.columns[1].identifier, 1, self.datastore)
        self.assertEqual(ds.identifier, result.dataset.identifier)
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(MimirError):
            self.api.move_column('unknown:uri', 0, 1, self.datastore)
        # Raise error if source column is out of bounds
        with self.assertRaises(ValueError):
            self.api.move_column(ds.identifier, 40, 1, self.datastore)
        # Raise error if target position is out of bounds
        with self.assertRaises(ValueError):
            self.api.move_column(
                ds.identifier,
                ds.column_by_name('Name').identifier,
                -1,
                self.datastore)
        with self.assertRaises(ValueError):
            self.api.move_column(
                ds.identifier,
                ds.column_by_name('Name').identifier,
                4,
                self.datastore)

    def move_row(self):
        """Test functionality to move a row."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        row_ids = [row.identifier for row in ds_rows]
        # Swap first two rows
        result = self.api.move_row(ds.identifier, row_ids[0], 1, self.datastore)
        row_ids = [row for row in reversed(row_ids)]
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'Name'.upper())
        self.assertEqual(ds.columns[1].name.upper(), 'Age'.upper())
        self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper())
        row = ds_rows[0]
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(row.values[1], 32)
        self.assertEqual(row.values[2], '30K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(row.values[1], 23)
        self.assertEqual(row.values[2], '35K')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Swap last two rows
        result = self.api.move_row(ds.identifier, row_ids[1], 0, self.datastore)
        row_ids = [row for row in reversed(row_ids)]
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'Name'.upper())
        self.assertEqual(ds.columns[1].name.upper(), 'Age'.upper())
        self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper())
        row = ds_rows[0]
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(row.values[1], 23)
        self.assertEqual(row.values[2], '35K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(row.values[1], 32)
        self.assertEqual(row.values[2], '30K')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i]))
        # Move first row to the end
        result = self.api.move_row(ds.identifier, row_ids[0], 2, self.datastore)
        row_ids = [row for row in reversed(row_ids)]
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        row = ds_rows[0]
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(row.values[1], 32)
        self.assertEqual(row.values[2], '30K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(row.values[1], 23)
        self.assertEqual(row.values[2], '35K')
        # Ensure that row ids haven't changed
        # ## July 16, 2020 by OK: Bug in mimir that is going to take a bunch of
        # ## heavy lifting to fix: https://github.com/UBOdin/mimir-api/issues/11
        # for i in range(len(ds_rows)):
        #     self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i]))
        # No changes if source and target position are the same
        result = self.api.move_row(ds.identifier, row_ids[1], 1, self.datastore)
        # ## July 21, 2020 by OK: It would be fantastic if we could easily
        # ## detect no-op vizual, but for now skip this check
        # self.assertEqual(ds.identifier, result.dataset.identifier)
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(MimirError):
            self.api.move_row('unknown:uri', 0, 1, self.datastore)
        # Raise error if target position is out of bounds
        # ## July 21, 2020 by OK: Skipping this check for now
        # with self.assertRaises(ValueError):
        #     self.api.move_row(ds.identifier, 0, -1, self.datastore)
        # with self.assertRaises(ValueError):
        #     self.api.move_row(ds.identifier, 1, 4, self.datastore)

    def rename_column(self):
        """Test functionality to rename a column."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        row_ids = [row.identifier for row in ds_rows]
        # Rename first column to Firstname
        result = self.api.rename_column(
            ds.identifier,
            ds.column_by_name('Name').identifier,
            'Firstname',
            self.datastore)
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        self.assertEqual(ds.columns[0].name.upper(), 'Firstname'.upper())
        self.assertEqual(ds.columns[1].name.upper(), 'Age'.upper())
        self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper())
        result = self.api.rename_column(
            ds.identifier,
            ds.column_by_name('Age').identifier,
            'BDate',
            self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'Firstname'.upper())
        self.assertEqual(ds.columns[1].name, 'BDate')
        self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper())
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i]))
        # No changes if the old and new column name are the same (except for
        # differences in upper and lower case).
        result = self.api.rename_column(
            ds.identifier,
            ds.column_by_name('BDate').identifier,
            'BDate',
            self.datastore)
        # ## July 21, 2020 by OK: It would be fantastic if we could easily
        # ## detect no-op vizual, but for now skip this check
        # self.assertEqual(ds.identifier, result.dataset.identifier)
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(MimirError):
            self.api.rename_column('unknown:uri', 0, 'Firstname', self.datastore)
        # Ensure exception is thrown for invalid column id
        with self.assertRaises(ValueError):
            self.api.rename_column(ds.identifier, 500, 'BDate', self.datastore)

    def sequence_of_steps(self):
        """Test sequence of calls that modify a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier).dataset
        ds = self.api.insert_row(ds.identifier, 1, self.datastore).dataset
        row_ids = [row.identifier for row in ds.fetch_rows()]
        row0 = row_ids[0]
        row1 = row_ids[1]
        row2 = row_ids[2]
        ds = self.api.insert_column(
            ds.identifier, 3, 'HDate', self.datastore).dataset
        ds = self.api.update_cell(
            ds.identifier, ds.column_by_name('HDate').identifier, row0, '180',
            self.datastore).dataset
        ds = self.api.update_cell(
            ds.identifier, ds.column_by_name('HDate').identifier, row2, '160',
            self.datastore).dataset
        ds = self.api.rename_column(
            ds.identifier, ds.column_by_name('HDate').identifier, 'Height',
            self.datastore).dataset
        ds = self.api.update_cell(
            ds.identifier, ds.column_by_name('Height').identifier, row1, '170',
            self.datastore).dataset
        ds = self.api.move_row(ds.identifier, row1, 2, self.datastore).dataset
        ds = self.api.update_cell(
            ds.identifier, ds.column_by_name('Name').identifier, row2, 'Carla',
            self.datastore).dataset
        ds = self.api.update_cell(
            ds.identifier, ds.column_by_name('Age').identifier, row2, '45',
            self.datastore).dataset
        ds = self.api.update_cell(
            ds.identifier, ds.column_by_name('Salary').identifier, row2, '56K',
            self.datastore).dataset
        ds = self.api.move_column(
            ds.identifier, ds.column_by_name('Salary').identifier, 4,
            self.datastore).dataset
        ds = self.api.delete_column(
            ds.identifier, ds.column_by_name('Age').identifier,
            self.datastore).dataset
        ds = self.api.delete_row(ds.identifier, row0, self.datastore).dataset
        ds = self.api.delete_row(ds.identifier, row1, self.datastore).dataset
        ds = self.datastore.get_dataset(ds.identifier)
        ds_rows = ds.fetch_rows()
        names = ['Name', 'Height', 'Salary']
        self.assertEqual(len(ds.columns), len(names))
        for i in range(len(names)):
            col = ds.columns[i]
            self.assertEqual(col.name.upper(), names[i].upper())
        self.assertEqual(len(ds_rows), 1)
        self.assertEqual(ds_rows[0].values, ['Carla', '160', '56K'])

    def sort_dataset(self):
        """Test sorting a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(SORT_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier).dataset
        result = self.api.sort_dataset(
            ds.identifier, [1, 2, 0], [False, False, True], self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        rows = ds.fetch_rows()
        names = ['Alice', 'Bob', 'Dave', 'Gertrud', 'Frank']
        result = list()
        for row in rows:
            name = row.values[0]
            if name in names:
                result.append(name)
        for i in range(len(names)):
            self.assertEqual(names[i], result[i])
        result = self.api.sort_dataset(
            ds.identifier, [2, 1, 0], [True, False, True], self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        rows = ds.fetch_rows()
        names = ['Gertrud', 'Frank', 'Bob', 'Alice', 'Dave']
        result = list()
        for row in rows:
            name = row.values[0]
            if name in names:
                result.append(name)
        for i in range(len(names)):
            self.assertEqual(names[i], result[i])
        # Raises error for invalid column identifier
        with self.assertRaises(ValueError):
            self.api.sort_dataset(
                ds.identifier, [2, 10, 0], [True, False, True], self.datastore)

    def update_cell(self):
        """Test functionality to update a dataset cell."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        row_ids = [row.identifier for row in ds_rows]
        # Update cell [0, 0]. Ensure that one row was updated and a new
        # identifier is generated. Also ensure that the resulting dataset
        # has the new value in cell [0, 0]
        row_id = row_ids[0]
        result = self.api.update_cell(
            ds.identifier, 0, row_id, 'MyValue', self.datastore)
        self.assertNotEqual(ds.identifier, result.dataset.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        row = None
        for r in ds.fetch_rows():
            if r.identifier == row_id:
                row = r
                break
        self.assertEqual(row.values[0], 'MyValue')
        result = self.api.update_cell(
            ds.identifier, ds.column_by_name('Name').identifier, row_id,
            'AValue', self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        row = None
        for r in ds.fetch_rows():
            if r.identifier == row_id:
                row = r
                break
        self.assertEqual(row.values[0], 'AValue')
        self.assertEqual(row.values[ds.column_index('Name')], 'AValue')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i]))
        # Set value to None
        result = self.api.update_cell(
            ds.identifier, ds.column_by_name('Name').identifier, row_id, None,
            self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        row = None
        for r in ds.fetch_rows():
            if r.identifier == row_id:
                row = r
                break
        self.assertIsNone(row.values[0])
        self.assertIsNone(row.values[ds.column_index('Name')])
        # Ensure exception is thrown if dataset is unknown
        with self.assertRaises(MimirError):
            self.api.update_cell('unknown:uri', 0, 0, 'MyValue', self.datastore)
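# The vizual tests above repeat one invariant after nearly every operation:
# positional row identifiers must survive the edit. A hypothetical helper
# (not part of the test suite) that captures the repeated loop:
def assert_row_ids_unchanged(test, ds_rows, row_ids):
    """Assert that the i-th fetched row still carries the i-th original id."""
    for row, expected in zip(ds_rows, row_ids):
        test.assertEqual(int(row.identifier), int(expected))

# usage inside a test: assert_row_ids_unchanged(self, ds.fetch_rows(), row_ids)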
class TestMimirProcessor(unittest.TestCase):
    """Individual tests for Mimir lenses. Run separately since each test has
    to initialize and shut down the Mimir gateway.
    """
    def setUp(self):
        """Create an instance of the Mimir processor for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.processor = MimirProcessor()
        self.datastore = MimirDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)
        self.available_lenses = set(mimir.getAvailableLensTypes())

    def tearDown(self):
        """Clean up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def compute_lens_result(self, ds, command):
        return self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                project_id=1,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={DATASET_NAME: ds}))

    def test_geocode_lens(self):
        """Test GEOCODE lens."""
        if lens_types.MIMIR_GEOCODE not in self.available_lenses:
            self.skipTest("Mimir Geocoding Lens not initialized.")
        # Upload the file and create the dataset
        f_handle = self.filestore.upload_file(GEO_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Geocode lens
        command = cmd.mimir_geocode(
            DATASET_NAME,
            'GOOGLE',
            house_nr=ds.column_by_name('STRNUMBER').identifier,
            street=ds.column_by_name('STRNAME').identifier,
            city=ds.column_by_name('CITY').identifier,
            state=ds.column_by_name('STATE').identifier)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 6)
        self.assertTrue('LATITUDE' in columns)
        self.assertTrue('LONGITUDE' in columns)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 8)
        self.assertTrue('LATITUDE_1' in columns)
        self.assertTrue('LONGITUDE_1' in columns)
        self.assertEqual(len(ds.columns), 8)

    def test_key_repair_lens(self):
        """Test KEY REPAIR lens."""
        # Upload the file and create the dataset
        f_handle = self.filestore.upload_file(KEY_REPAIR_FILE)
        ds1 = self.datastore.load_dataset(f_handle=f_handle)
        # Key repair lens
        command = cmd.mimir_key_repair(DATASET_NAME,
                                       ds1.column_by_name('Empid').identifier)
        result = self.compute_lens_result(ds1, command)
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 4)
        self.assertEqual(ds.row_count, 2)
        names = set()
        empids = set()
        for row in ds.fetch_rows():
            empids.add(int(row.values[0]))
            names.add(row.values[1])
        self.assertTrue(1 in empids)
        self.assertTrue('Alice' in names or 'Bob' in names)
        self.assertFalse('Alice' in names and 'Bob' in names)
        self.assertTrue('Carla' in names)
        # Test error case and command text
        with self.assertRaises(ValueError):
            command = cmd.mimir_key_repair('MY DS', 'MY COL')
            result = self.compute_lens_result(ds, command)

    def test_missing_value_lens(self):
        """Test MISSING_VALUE lens."""
        # Upload the file and create the dataset
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing value lens
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{'column': ds.column_by_name('AGE').identifier}])
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        self.assertNotEqual(rows[2].values[ds.column_index('Age')], '')
        # Missing value lens with value constraint
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{
                'column': ds.column_by_name('AGE').identifier,
                'constraint': '> 30'
            }],
        )
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        print(rows[2].values)
        # We shouldn't be imputing a value lower than the minimum value in
        # the dataset
        self.assertTrue(rows[2].values[ds.column_index('Age')] >= 23)

    def test_missing_key_lens(self):
        """Test MISSING_KEY lens."""
        # Upload the file and create the dataset
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing key lens
        age_col = ds.column_by_name('Age').identifier
        command = cmd.mimir_missing_key(DATASET_NAME, age_col)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        # Depending on the implementation this could be either 22 or 24, as
        # there are two rows with missing values for the key column.
        # Currently, Mimir discards such rows, but if this suddenly turns
        # into a 24, that's not incorrect either.
        self.assertEqual(len(rows), 22)
        command = cmd.mimir_missing_key(
            DATASET_NAME, ds.column_by_name('Salary').identifier)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 31)

    def test_picker_lens(self):
        """Test PICKER lens."""
        # Upload the file and create the dataset
        f_handle = self.filestore.upload_file(PICKER_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        command = cmd.mimir_picker(
            DATASET_NAME,
            [{'pickFrom': ds.column_by_name('Age').identifier},
             {'pickFrom': ds.column_by_name('Salary').identifier}])
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        result_ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in result_ds.columns]
        # print(columns)
        self.assertEqual(len(result_ds.columns), 3)
        self.assertTrue('AGE_1' in columns)
        # Pick another column, this time with custom name
        command = cmd.mimir_picker(
            DATASET_NAME,
            [{'pickFrom': ds.column_by_name('Age').identifier},
             {'pickFrom': ds.column_by_name('Salary').identifier}],
            pick_as='My_Column')
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        result_ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in result_ds.columns]
        self.assertEqual(len(result_ds.columns), 3)
        self.assertTrue('MY_COLUMN' in columns)

    def test_type_inference_lens(self):
        """Test TYPE INFERENCE lens."""
        # Upload the file and create the dataset
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Infer type
        command = cmd.mimir_type_inference(DATASET_NAME, 0.6)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds2 = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds2.columns), 3)
        self.assertEqual(ds2.row_count, 7)
        ds1_rows = ds.fetch_rows()
        ds2_rows = ds2.fetch_rows()
        for i in range(ds2.row_count):
            self.assertEqual(ds1_rows[i].values, ds2_rows[i].values)