Example #1
0
class TestMimirDatasetAnnotations(unittest.TestCase):
    """Exercise per-cell annotations on a Mimir-backed dataset."""

    def setUp(self):
        """Start from an empty server directory with fresh stores."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.mkdir(SERVER_DIR)
        self.fileserver = FileSystemFilestore(FILESERVER_DIR)
        self.db = MimirDatastore(DATASTORE_DIRECTORY)

    def tearDown(self):
        """Remove the server directory created in setUp."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_dataset_annotations(self):
        """Load a dataset and dump the annotations of column 1."""
        handle = self.db.load_dataset(
            f_handle=self.fileserver.upload_file(DATA_FILE))
        dataset = self.db.get_dataset(handle.identifier)
        fetched = dataset.fetch_rows()
        print(dataset.row_ids)
        for r in fetched:
            print(f'{r.identifier}\t{r.values}')
        for rid in dataset.row_ids:
            for annotation in dataset.get_annotations(column_id=1, row_id=rid):
                print(f'{rid}\t{annotation.key}={annotation.value}')
Example #2
0
class TestSQLProcessor(unittest.TestCase):
    """Tests for running SQL cells against a Mimir datastore."""

    def setUp(self):
        """Create an instance of the Mimir processor for an empty server
        directory.
        """
        # Start from a clean slate: drop any leftover server directory.
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.datastore = MimirDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def _execute(self, command, ds):
        """Run a single SQL command through a fresh SQLTaskProcessor."""
        return SQLTaskProcessor().compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datasets={DATASET_NAME: ds.identifier},
                                datastore=self.datastore,
                                filestore=self.filestore))

    def test_run_sql_query(self):
        """Run a SQL query first without and then with materializing the
        result.
        """
        ds = self.datastore.load_dataset(
            f_handle=self.filestore.upload_file(CSV_FILE))
        query = ('SELECT grade_or_service_category FROM ' + DATASET_NAME +
                 " WHERE program = 'GENERAL EDUCATION'")
        # Without an output dataset nothing is written.
        outcome = self._execute(sql_cell(source=query, validate=True), ds)
        self.assertTrue(outcome.is_success)
        self.assertIsNone(outcome.provenance.read)
        self.assertIsNone(outcome.provenance.write)
        self.assertTrue(len(outcome.outputs.stdout) > 0)
        self.assertEqual(len(outcome.outputs.stderr), 0)
        # Materialize the result into dataset 'ge'.
        outcome = self._execute(
            sql_cell(source=query, output_dataset='ge', validate=True), ds)
        self.assertTrue(outcome.is_success)
        self.assertIsNone(outcome.provenance.read)
        self.assertIsNotNone(outcome.provenance.write)
        self.assertTrue('ge' in outcome.provenance.write)
        self.assertTrue(len(outcome.outputs.stdout) > 0)
        self.assertEqual(len(outcome.outputs.stderr), 0)
Example #3
0
class TestMimirDatastore(unittest.TestCase):
    """Sequence of Mimir datastore tests, run inside a single test method so
    that the Mimir gateway is only initialized once. Uses manual
    set_up/tear_down helpers (not unittest's setUp/tearDown) on purpose.
    """

    def setup_fileserver(self):
        """Create a fresh file server."""
        if os.path.isdir(FILESERVER_DIR):
            shutil.rmtree(FILESERVER_DIR)
        os.mkdir(FILESERVER_DIR)
        self.fileserver = FileSystemFilestore(FILESERVER_DIR)

    def set_up(self):
        """Create empty data store directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.mkdir(SERVER_DIR)
        self.db = MimirDatastore(DATASTORE_DIR)

    def tear_down(self):
        """Delete data store directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_mimir_datastore(self):
        """Run each sub-test against a fresh datastore directory.

        Fix: the original repeated the set_up/sub-test/tear_down boilerplate
        four times and called tear_down() twice after the last sub-test.
        """
        sub_tests = [
            self.dataset_load,
            self.datastore_init,
            self.dataset_read,
            self.dataset_column_index,
        ]
        for sub_test in sub_tests:
            self.set_up()
            try:
                sub_test()
            finally:
                # Always clean up, even if a sub-test assertion fails.
                self.tear_down()

    def datastore_init(self):
        """Test initalizing a datastore with existing datasets."""
        self.setup_fileserver()
        # Load a dataset purely for its side effect, then re-open the
        # datastore over the same directory (the unused `ds` local is gone).
        self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        self.db = MimirDatastore(DATASTORE_DIR)

    def dataset_column_index(self):
        """Test the column by id index of the dataset handle."""
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        # Ensure that the project data has three columns and two rows
        self.assertEqual(ds.column_by_id(0).name.upper(), 'NAME')
        self.assertEqual(ds.column_by_id(1).name.upper(), 'AGE')
        self.assertEqual(ds.column_by_id(2).name.upper(), 'SALARY')
        with self.assertRaises(ValueError):
            ds.column_by_id(5)
        ds.columns.append(DatasetColumn(identifier=5, name='NEWNAME'))
        self.assertEqual(ds.column_by_id(5).name.upper(), 'NEWNAME')
        with self.assertRaises(ValueError):
            ds.column_by_id(4)

    def dataset_load(self):
        """Test create and delete dataset."""
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        # Ensure that the project data has three columns and two rows
        self.assertEqual(len(ds.columns), 3)
        self.assertEqual(len(ds.fetch_rows()), 2)
        self.assertEqual(ds.row_count, 2)

    def dataset_read(self):
        """Test reading a dataset."""
        self.setup_fileserver()
        dh = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        ds = self.db.get_dataset(dh.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(dh.identifier, ds.identifier)
        self.assertEqual(len(dh.columns), len(ds.columns))
        # Fix: the original asserted fetch_rows() length twice in a row.
        self.assertEqual(len(dh.fetch_rows()), len(ds_rows))
        self.assertEqual(dh.row_count, len(ds_rows))
        # Name,Age,Salary
        # Alice,23,35K
        # Bob,32,30K
        self.assertEqual(ds.column_index('Name'), 0)
        self.assertEqual(ds.column_index('Age'), 1)
        self.assertEqual(ds.column_index('Salary'), 2)
        row = ds_rows[0]
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(int(row.values[1]), 23)
        self.assertEqual(row.values[2], '35K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(int(row.values[1]), 32)
        self.assertEqual(row.values[2], '30K')
Example #4
0
class TestDatasetPaginationReader(unittest.TestCase):
    """Test fetch_rows() pagination (offset/limit) for both the default and
    the Mimir datastore backends.
    """

    def set_up(self, engine):
        """Create an empty file server repository and the datastore for the
        given engine environment.

        Fix: raise explicitly for an unknown engine instead of silently
        leaving ``self.datastore`` unset (which caused a confusing
        AttributeError later).
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.mkdir(SERVER_DIR)
        # Setup file server
        self.fs = FileSystemFilestore(FILESERVER_DIR)
        # Setup the respective datastore and Vizual engine
        if engine == ENGINEENV_DEFAULT:
            self.datastore = FileSystemDatastore(DATASTORE_DIR)
        elif engine == ENGINEENV_MIMIR:
            self.datastore = MimirDatastore(DATASTORE_DIR)
        else:
            raise ValueError('unknown engine environment {}'.format(engine))

    def tear_down(self, engine):
        """Clean-up by dropping file server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_default_engine(self):
        """Test functionality for the default setup."""
        self.run_tests(ENGINEENV_DEFAULT)

    def test_mimir_engine(self):
        """Test functionality for the Mimir setup."""
        import vizier.mimir as mimir # noqa: F401
        self.run_tests(ENGINEENV_MIMIR)

    def run_tests(self, engine):
        """Run sequence of pagination tests for the given configuration.

        Fix: wrap the body in try/finally so the server directory is removed
        even when an assertion fails mid-sequence.
        """
        self.set_up(engine)
        try:
            ds = self.datastore.load_dataset(self.fs.upload_file(CSV_FILE_1))
            rows = ds.fetch_rows()
            self.assertEqual(len(rows), 7)
            # Offset only: skip the first row.
            rows = ds.fetch_rows(offset=1)
            self.assertEqual(len(rows), 6)
            self.assertEqual(rows[0].values[0], 'Bob')
            self.assertEqual(rows[5].values[0], 'Gertrud')
            # Limit only: first two rows.
            rows = ds.fetch_rows(limit=2)
            self.assertEqual(len(rows), 2)
            self.assertEqual(rows[0].values[0], 'Alice')
            self.assertEqual(rows[1].values[0], 'Bob')
            # Offset + limit fully inside the dataset.
            rows = ds.fetch_rows(offset=4, limit=3)
            self.assertEqual(len(rows), 3)
            self.assertEqual(rows[0].values[0], 'Eileen')
            self.assertEqual(rows[2].values[0], 'Gertrud')
            # Offset + limit running past the end: truncated result.
            rows = ds.fetch_rows(offset=5, limit=3)
            self.assertEqual(len(rows), 2)
            self.assertEqual(rows[0].values[0], 'Frank')
            self.assertEqual(rows[1].values[0], 'Gertrud')
            rows = ds.fetch_rows(offset=6, limit=3)
            self.assertEqual(len(rows), 1)
            self.assertEqual(rows[0].values[0], 'Gertrud')
            # Test larger dataset with deletes
            ds = self.datastore.load_dataset(self.fs.upload_file(CSV_FILE_2))
            rows = ds.fetch_rows(offset=0, limit=10)
            self.assertEqual(len(rows), 10)
            rows = ds.fetch_rows(offset=10, limit=20)
            self.assertEqual(len(rows), 20)
            rows = ds.fetch_rows(offset=60, limit=10)
            self.assertEqual(len(rows), 3)
        finally:
            self.tear_down(engine)
Example #5
0
class TestMimirProcessor(unittest.TestCase):
    """Individual test for Mimir lenses. Run separately since each test has to
    initialize and shut down the Mimir gateway.
    """
    def setUp(self):
        """Create an instance of the Mimir processor for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.processor = MimirProcessor()
        self.datastore = MimirDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory.
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_domain_lens(self):
        """Test DOMAIN lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        col_age = ds.column_by_name('Age')
        command = cmd.mimir_domain(DATASET_NAME, col_age.identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # The lens writes a new dataset version; fetch it via provenance.
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        # Row 2 had a missing Age in the input; the lens must have filled it.
        self.assertNotEqual(rows[2].values[ds.column_index('Age')], '')
        # Introduce an error. Make sure command formating is correct
        command = cmd.mimir_domain('MY DS', 'MY COL')
        with self.assertRaises(ValueError):
            result = self.processor.compute(
                command_id=command.command_id,
                arguments=command.arguments,
                context=TaskContext(datastore=self.datastore,
                                    filestore=self.filestore,
                                    datasets={DATASET_NAME: ds.identifier}))

    def test_geocode_lens(self):
        """Test GEOCODE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(GEO_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Geocode Lens
        command = cmd.mimir_geocode(
            DATASET_NAME,
            'GOOGLE',
            house_nr=ds.column_by_name('STRNUMBER').identifier,
            street=ds.column_by_name('STRNAME').identifier,
            city=ds.column_by_name('CITY').identifier,
            state=ds.column_by_name('STATE').identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        # Geocoding appends LATITUDE/LONGITUDE to the four input columns.
        self.assertEqual(len(columns), 6)
        self.assertTrue('LATITUDE' in columns)
        self.assertTrue('LONGITUDE' in columns)
        # Applying the lens a second time must not clash: the new columns
        # get a _1 suffix.
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 8)
        self.assertTrue('LATITUDE_1' in columns)
        self.assertTrue('LONGITUDE_1' in columns)
        self.assertEqual(len(ds.columns), 8)

    def test_key_repair_lens(self):
        """Test KEY REPAIR lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(KEY_REPAIR_FILE)
        ds1 = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        command = cmd.mimir_key_repair(DATASET_NAME,
                                       ds1.column_by_name('Empid').identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds1.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 4)
        # Key repair collapses duplicate Empid rows down to 3 rows.
        self.assertEqual(ds.row_count, 3)
        names = set()
        empids = set()
        rowids = set()
        for row in ds.fetch_rows():
            rowids.add(row.identifier)
            empids.add(int(row.get_value('empid')))
            names.add(row.get_value('name'))
        self.assertTrue(1 in empids)
        self.assertTrue(2 in rowids)
        self.assertTrue('Alice' in names)
        self.assertTrue('Carla' in names)
        # Test error case and command text
        command = cmd.mimir_key_repair('MY DS', 'MY COL')
        with self.assertRaises(ValueError):
            self.processor.compute(command_id=command.command_id,
                                   arguments=command.arguments,
                                   context=TaskContext(
                                       datastore=self.datastore,
                                       filestore=self.filestore,
                                       datasets={DATASET_NAME: ds.identifier}))

    def test_missing_value_lens(self):
        """Test MISSING_VALUE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{
                'column': ds.column_by_name('AGE').identifier
            }])
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        # Every Age cell must now hold an (imputed) value.
        for row in rows:
            self.assertIsNotNone(row.values[1])
        self.assertNotEqual(rows[2].values[ds.column_index('Age')], '')
        # MISSING VALUE Lens with value constraint
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{
                'column': ds.column_by_name('AGE').identifier,
                'constraint': '> 30'
            }],
        )
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        # The '> 30' constraint must be respected by the imputed value.
        self.assertTrue(rows[2].values[ds.column_index('Age')] > 30)

    def test_missing_key_lens(self):
        """Test MISSING_KEY lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        age_col = ds.column_by_name('Age').identifier
        command = cmd.mimir_missing_key(DATASET_NAME, age_col)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        # The lens adds rows for keys missing from the Age sequence.
        self.assertEqual(len(rows), 24)
        command = cmd.mimir_missing_key(DATASET_NAME,
                                        ds.column_by_name('Salary').identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 55)

    def test_picker_lens(self):
        """Test PICKER lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(PICKER_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        command = cmd.mimir_picker(
            DATASET_NAME, [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('Salary').identifier
            }])
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        print(columns)
        # Without pick_as the new column gets a generated PICK_ONE_ name.
        self.assertEqual(len(ds.columns), 5)
        self.assertTrue('PICK_ONE_AGE_SALARY' in columns)
        # Pick another column, this time with custom name
        command = cmd.mimir_picker(
            DATASET_NAME, [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('Salary').identifier
            }],
            pick_as='My_Column')
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(ds.columns), 6)
        self.assertTrue('PICK_ONE_AGE_SALARY' in columns)
        self.assertTrue('MY_COLUMN' in columns)
        # Pick from a picked column
        command = cmd.mimir_picker(
            DATASET_NAME,
            [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('PICK_ONE_AGE_SALARY').identifier
            }],
            pick_as='My_Next_Column')
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertTrue('MY_NEXT_COLUMN' in columns)

    def test_schema_matching_lens(self):
        """Test SCHEMA_MATCHING lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        command = cmd.mimir_schema_matching(DATASET_NAME, [{
            'column': 'BDate',
            'type': 'int'
        }, {
            'column': 'PName',
            'type': 'varchar'
        }], 'new_' + DATASET_NAME)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        # Schema matching writes under the new dataset name.
        ds = self.datastore.get_dataset(
            result.provenance.write['new_' + DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 2)
        self.assertEqual(ds.row_count, 2)

    def test_type_inference_lens(self):
        """Test TYPE INFERENCE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Infer type
        command = cmd.mimir_type_inference(DATASET_NAME, 0.6)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(datastore=self.datastore,
                                filestore=self.filestore,
                                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds2 = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds2.columns), 3)
        self.assertEqual(ds2.row_count, 7)
        # Type inference must not change any cell values, only types.
        ds1_rows = ds.fetch_rows()
        ds2_rows = ds2.fetch_rows()
        for i in range(ds2.row_count):
            self.assertEqual(ds1_rows[i].values, ds2_rows[i].values)
class TestMimirProcessor(unittest.TestCase):
    """Individual test for Mimir lenses. Run separately since each test has to
    initialize and shout down the Mimir gateway.
    """
    def setUp(self):
        """Create an instance of the Mimir processor for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.processor = MimirProcessor()
        self.datastore = MimirDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)
        self.available_lenses = set(mimir.getAvailableLensTypes())

    def tearDown(self):
        """Clean-up by dropping the server directory.
        """
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def compute_lens_result(self, ds, command):
        return self.processor.compute(command_id=command.command_id,
                                      arguments=command.arguments,
                                      context=TaskContext(
                                          project_id=1,
                                          datastore=self.datastore,
                                          filestore=self.filestore,
                                          artifacts={DATASET_NAME: ds}))

    def test_geocode_lens(self):
        if lens_types.MIMIR_GEOCODE not in self.available_lenses:
            self.skipTest("Mimir Geocoding Lens not initialized.")
        """Test GEOCODE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(GEO_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Geocode Lens
        command = cmd.mimir_geocode(
            DATASET_NAME,
            'GOOGLE',
            house_nr=ds.column_by_name('STRNUMBER').identifier,
            street=ds.column_by_name('STRNAME').identifier,
            city=ds.column_by_name('CITY').identifier,
            state=ds.column_by_name('STATE').identifier)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)

        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 6)
        self.assertTrue('LATITUDE' in columns)
        self.assertTrue('LONGITUDE' in columns)

        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 8)
        self.assertTrue('LATITUDE_1' in columns)
        self.assertTrue('LONGITUDE_1' in columns)
        self.assertEqual(len(ds.columns), 8)

    def test_key_repair_lens(self):
        """Test KEY REPAIR lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(KEY_REPAIR_FILE)
        ds1 = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        command = cmd.mimir_key_repair(DATASET_NAME,
                                       ds1.column_by_name('Empid').identifier)
        result = self.compute_lens_result(ds1, command)
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 4)
        self.assertEqual(ds.row_count, 2)
        names = set()
        empids = set()
        for row in ds.fetch_rows():
            empids.add(int(row.values[0]))
            names.add(row.values[1])
        self.assertTrue(1 in empids)
        self.assertTrue('Alice' in names or 'Bob' in names)
        self.assertFalse('Alice' in names and 'Bob' in names)
        self.assertTrue('Carla' in names)
        # Test error case and command text
        with self.assertRaises(ValueError):
            command = cmd.mimir_key_repair('MY DS', 'MY COL')
            result = self.compute_lens_result(ds, command)

    def test_missing_value_lens(self):
        """Test MISSING_VALUE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{
                'column': ds.column_by_name('AGE').identifier
            }])
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        self.assertNotEqual(rows[2].values[ds.column_index('Age')], '')
        # MISSING VALUE Lens with value constraint
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{
                'column': ds.column_by_name('AGE').identifier,
                'constraint': '> 30'
            }],
        )
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        print(rows[2].values)
        # we shouldn't be imputing a value lower than the minimum value in the dataset
        self.assertTrue(rows[2].values[ds.column_index('Age')] >= 23)

    def test_missing_key_lens(self):
        """Test MISSING_KEY lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        age_col = ds.column_by_name('Age').identifier
        command = cmd.mimir_missing_key(DATASET_NAME, age_col)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        # Depending on implementation this could be either 22 or 24, as there are two rows
        # with missing values for the key column.  Currently, Mimir discards such rows, but
        # if this suddenly turns into a 24, that's not incorrect either.
        self.assertEqual(len(rows), 22)
        command = cmd.mimir_missing_key(DATASET_NAME,
                                        ds.column_by_name('Salary').identifier)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 31)

    def test_picker_lens(self):
        """Test PICKER lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(PICKER_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        command = cmd.mimir_picker(
            DATASET_NAME, [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('Salary').identifier
            }])
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        result_ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in result_ds.columns]
        # print(columns)
        self.assertEqual(len(result_ds.columns), 3)
        self.assertTrue('AGE_1' in columns)
        # Pick another column, this time with custom name
        command = cmd.mimir_picker(
            DATASET_NAME, [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('Salary').identifier
            }],
            pick_as='My_Column')
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        result_ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in result_ds.columns]
        self.assertEqual(len(result_ds.columns), 3)
        self.assertTrue('MY_COLUMN' in columns)

    def test_type_inference_lens(self):
        """Test TYPE INFERENCE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Infer type
        command = cmd.mimir_type_inference(DATASET_NAME, 0.6)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds2 = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds2.columns), 3)
        self.assertEqual(ds2.row_count, 7)
        ds1_rows = ds.fetch_rows()
        ds2_rows = ds2.fetch_rows()
        for i in range(ds2.row_count):
            self.assertEqual(ds1_rows[i].values, ds2_rows[i].values)