Example #1
 def setUp(self):
     """Create an empty work trails repository."""
     # Cleanup first
     self.cleanUp()
     self.datastore = MimirDataStore(DATASTORE_DIR)
     self.fileserver = DefaultFileServer(FILESERVER_DIR)
     self.vizual = MimirVizualEngine(self.datastore, self.fileserver)
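The setUp above calls self.cleanUp(), which is not shown in this snippet. Examples #12 and #18 define that helper as removing the datastore and file server directories; a minimal sketch, assuming the same DATASTORE_DIR and FILESERVER_DIR constants and that os and shutil are imported at module level:

 def cleanUp(self):
     """Remove datastore and fileserver directories if they exist."""
     for d in [DATASTORE_DIR, FILESERVER_DIR]:
         if os.path.isdir(d):
             shutil.rmtree(d)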
Example #2
 def setUp(self):
     """Create an empty file server repository."""
     # Drop project descriptor directory
     if os.path.isdir(SERVER_DIR):
         shutil.rmtree(SERVER_DIR)
     # Setup project repository
     self.db = DefaultFileServer(SERVER_DIR)
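Example #2 shows no matching tearDown; a hedged sketch, mirroring the directory removal used by the other examples in this collection:

 def tearDown(self):
     """Remove the file server directory created by setUp."""
     if os.path.isdir(SERVER_DIR):
         shutil.rmtree(SERVER_DIR)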
Example #3
 def test_mem_client(self):
     """Run tests for default engine and in-memory data store."""
     self.fs = DefaultFileServer(SERVER_DIR)
     self.ds = InMemDataStore()
     self.run_client_tests(
         VizierDBClient(self.ds, dict(),
                        DefaultVizualEngine(self.ds, self.fs)))
Example #4
 def test_fs_client(self):
     """Run tests for default engine and file server data store."""
     self.fs = DefaultFileServer(SERVER_DIR)
     self.ds = FileSystemDataStore(DATASTORE_DIR)
     self.run_client_tests(
         VizierDBClient(self.ds, dict(),
                        DefaultVizualEngine(self.ds, self.fs)))
Example #5
 def test_mimir_client(self):
     """Run tests for default engine and Mimir data store."""
     mimir.initialize()
     self.fs = DefaultFileServer(SERVER_DIR)
     self.ds = MimirDataStore(DATASTORE_DIR)
     self.run_client_tests(
         VizierDBClient(self.ds, dict(),
                        DefaultVizualEngine(self.ds, self.fs)))
     mimir.finalize()
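In Example #5, mimir.finalize() is skipped whenever run_client_tests raises. If shutdown must always happen, a try/finally variant (a sketch, not part of the original test) guarantees it:

 def test_mimir_client(self):
     """Run tests for default engine and Mimir data store."""
     mimir.initialize()
     try:
         self.fs = DefaultFileServer(SERVER_DIR)
         self.ds = MimirDataStore(DATASTORE_DIR)
         self.run_client_tests(
             VizierDBClient(self.ds, dict(),
                            DefaultVizualEngine(self.ds, self.fs)))
     finally:
         # Shut Mimir down even if an assertion fails
         mimir.finalize()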
Example #6
 def setUp(self):
     """Create an empty work trails repository."""
     # Create fresh set of directories
     for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
         if os.path.isdir(d):
             shutil.rmtree(d)
         os.mkdir(d)
     self.datastore = MimirDataStore(DATASTORE_DIR)
     self.fileserver = DefaultFileServer(FILESERVER_DIR)
     vizual = MimirVizualEngine(self.datastore, self.fileserver)
     self.db = FileSystemViztrailRepository(VIZTRAILS_DIR,
                                            {ENV.identifier: ENV})
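ENV (and the module-level ENGINE_ID used in Example #14) is assumed to be constructed elsewhere; a sketch consistent with how Examples #9, #11, and #13 build their execution environments (the package list is illustrative):

ENV = ExecEnv(
    FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
    packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON]
).from_dict({'datastore': {'directory': DATASTORE_DIR}})
ENGINE_ID = ENV.identifier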
Example #7
 def setUp(self):
     """Create empty data store directory."""
     # Setup file server and upload file
     if os.path.isdir(FILESERVER_DIR):
         shutil.rmtree(FILESERVER_DIR)
     os.mkdir(FILESERVER_DIR)
     self.fileserver = DefaultFileServer(FILESERVER_DIR)
     # Remove directory if it exists
     if os.path.isdir(DATASTORE_DIRECTORY):
         shutil.rmtree(DATASTORE_DIRECTORY)
     os.mkdir(DATASTORE_DIRECTORY)
     self.db = FileSystemDataStore(DATASTORE_DIRECTORY)
Example #8
 def set_up(self, engine):
     """Create an empty file server repository."""
     # Drop project descriptor directory
     if os.path.isdir(FILESERVER_DIR):
         shutil.rmtree(FILESERVER_DIR)
     # Setup project repository
     self.fs = DefaultFileServer(FILESERVER_DIR)
     if engine == ENGINEENV_DEFAULT:
         self.datastore = FileSystemDataStore(DATASTORE_DIR)
         self.vizual = DefaultVizualEngine(self.datastore, self.fs)
     elif engine == ENGINEENV_MIMIR:
         self.datastore = MimirDataStore(DATASTORE_DIR)
         self.vizual = MimirVizualEngine(self.datastore, self.fs)
Example #9
 def set_up_default(self):
     """Setup configuration using default Vizual engine."""
     env = ExecEnv(
             FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
             packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON]
         ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
     self.ENGINE_ID = env.identifier
     self.set_up()
     self.datastore = FileSystemDataStore(DATASTORE_DIR)
     self.fileserver = DefaultFileServer(FILESERVER_DIR)
     self.db = FileSystemViztrailRepository(
         VIZTRAILS_DIR,
         {env.identifier: env}
     )
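Example #9 only covers the default engine. A hypothetical Mimir counterpart, mirroring the stores used in Examples #6 and #14 (the exact ExecEnv settings for Mimir are not shown in the source, so this is an assumption):

 def set_up_mimir(self):
     """Setup configuration using the Mimir Vizual engine (hypothetical
     counterpart to set_up_default; the ExecEnv settings are assumed)."""
     env = ExecEnv(
         FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
         packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON]
     ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
     self.ENGINE_ID = env.identifier
     self.set_up()
     self.datastore = MimirDataStore(DATASTORE_DIR)
     self.fileserver = DefaultFileServer(FILESERVER_DIR)
     self.db = FileSystemViztrailRepository(
         VIZTRAILS_DIR,
         {env.identifier: env}
     )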
Example #10
class TestDataStore(unittest.TestCase):

    def setUp(self):
        """Create empty data store directory."""
        # Setup file server and upload file
        if os.path.isdir(FILESERVER_DIR):
            shutil.rmtree(FILESERVER_DIR)
        os.mkdir(FILESERVER_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        # Remove directory if it exists
        if os.path.isdir(DATASTORE_DIRECTORY):
            shutil.rmtree(DATASTORE_DIRECTORY)
        os.mkdir(DATASTORE_DIRECTORY)
        self.db = FileSystemDataStore(DATASTORE_DIRECTORY)

    def tearDown(self):
        """Delete data store directory.
        """
        for d in [DATASTORE_DIRECTORY, FILESERVER_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def test_datastore(self):
        """Test functionality of the file server data store."""
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        self.assertEquals(ds.column_counter, 3)
        self.assertEquals(ds.row_counter, 2)
        self.assertEquals(ds.row_count, 2)
        ds = self.db.get_dataset(ds.identifier)
        names = ['Name', 'Age', 'Salary']
        for i in range(3):
            col = ds.columns[i]
            self.assertEquals(col.identifier, i)
            self.assertEquals(col.name, names[i])
        rows = ds.fetch_rows()
        self.assertEquals(len(rows), ds.row_count)
        for i in range(len(rows)):
            row = rows[i]
            self.assertEquals(row.identifier, i)
        rows[0].values[0] = 'Jane'
        ds = self.db.create_dataset(columns=ds.columns, rows=rows)
        ds = self.db.get_dataset(ds.identifier)
        for i in range(3):
            col = ds.columns[i]
            self.assertEquals(col.identifier, i)
            self.assertEquals(col.name, names[i])
        rows = ds.fetch_rows()
        self.assertEquals(len(rows), ds.row_count)
        for i in range(len(rows)):
            row = rows[i]
            self.assertEquals(row.identifier, i)
        self.assertEquals(rows[0].values[0], 'Jane')
Example #11
 def setUp(self):
     """Create an empty work trails repository."""
     # Create fresh set of directories
     self.config = AppConfig()
     env = ExecEnv(
         FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
         packages=[PACKAGE_VIZUAL, PACKAGE_PLOT]
     ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
     self.ENGINE_ID = env.identifier
     self.config.envs[self.ENGINE_ID] = env
     self.config.fileserver = env.fileserver
     for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
         if os.path.isdir(d):
             shutil.rmtree(d)
         os.mkdir(d)
     self.datastore = FileSystemDataStore(DATASTORE_DIR)
     self.fileserver = DefaultFileServer(FILESERVER_DIR)
     self.db = FileSystemViztrailRepository(VIZTRAILS_DIR,
                                            {env.identifier: env})
     self.api = VizierWebService(self.db, self.datastore, self.fileserver,
                                 self.config)
Example #12
class TestLoadMimirDataset(unittest.TestCase):
    def setUp(self):
        """Create an empty work trails repository."""
        # Cleanup first
        self.cleanUp()
        self.datastore = MimirDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)

    def tearDown(self):
        """Clean-up by deleting directories.
        """
        self.cleanUp()

    def cleanUp(self):
        """Remove datastore and fileserver directory."""
        # Delete directories
        for d in [DATASTORE_DIR, FILESERVER_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def test_load(self):
        """Run workflow with default configuration."""
        # Ignore files that raised errors (or are taking too much time to load)
        ignore_files = ['JSONOUTPUTWIDE.csv']
        data_types = set()
        mimir.initialize()
        for filename in os.listdir(LOAD_DIR):
            if filename in ignore_files:
                continue
            print('LOAD ' + filename)
            filename = os.path.join(LOAD_DIR, filename)
            f_handle = self.fileserver.upload_file(filename)
            ds = self.datastore.load_dataset(f_handle)
            ds_load = self.datastore.get_dataset(ds.identifier)
            for col in ds_load.columns:
                data_types.add(col.data_type)
                print('\t' + col.name_in_rdb + ' AS ' + col.name + '(' + col.data_type + ')')
            print('\t' + str(ds.row_count) + ' row(s)')
            self.assertEquals(len(ds.columns), len(ds_load.columns))
            self.assertEquals(ds.column_counter, ds_load.column_counter)
            self.assertEquals(ds.row_counter, ds_load.row_counter)
            rows = ds.fetch_rows()
            self.assertEquals(ds.row_counter, len(rows))
            self.assertEquals(ds.row_count, len(rows))
            for i in range(len(rows)):
                row = rows[i]
                self.assertEquals(row.identifier, i)
                self.assertEquals(len(row.values), len(ds.columns))
        mimir.finalize()
        print(data_types)
Example #13
 def setUp(self):
     """Create an new Web Service API."""
     # Clear various directories
     for d in [WORKTRAILS_DIR, DATASTORE_DIR, FILESERVER_DIR]:
         if os.path.isdir(d):
             shutil.rmtree(d)
         os.mkdir(d)
     # Setup datastore and API
     self.config = AppConfig()
     self.ENV = ExecEnv(
         FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
         packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON]
     ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
     self.ENGINE_ID = self.ENV.identifier
     self.config.envs[self.ENGINE_ID] = self.ENV
     self.config.fileserver = self.ENV.fileserver
     self.datastore = FileSystemDataStore(DATASTORE_DIR)
     self.fileserver = DefaultFileServer(FILESERVER_DIR)
     self.api = VizierWebService(
         FileSystemViztrailRepository(WORKTRAILS_DIR,
                                      {self.ENV.identifier: self.ENV}),
         self.datastore, self.fileserver, self.config)
Example #14
class TestMimirAnnotations(unittest.TestCase):
    def setUp(self):
        """Create an empty work trails repository."""
        # Create fresh set of directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)
            os.mkdir(d)
        self.datastore = MimirDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        vizual = MimirVizualEngine(self.datastore, self.fileserver)
        self.db = FileSystemViztrailRepository(VIZTRAILS_DIR,
                                               {ENV.identifier: ENV})

    def tearDown(self):
        """Clean-up by dropping the MongoDB colelction used by the engine.
        """
        # Delete directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def test_annotations(self):
        """Test DOMAIN lens."""
        # Create new work trail and create dataset from CSV file
        mimir.initialize()
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name': 'My Project'})
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.load_dataset(
                                           f_handle.identifier, DS_NAME))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        # Missing Value Lens
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_missing_value(
                DS_NAME,
                ds.column_by_name('AGE').identifier))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        annos = ds.get_annotations(column_id=1, row_id=2)
        self.assertEquals(len(annos), 2)
        for anno in annos:
            self.assertEquals(anno.key, ANNO_UNCERTAIN)
        mimir.finalize()
Example #15
 def setUp(self):
     """Create an new Web Service API."""
     # Clear various directories
     for d in [WORKTRAILS_DIRECTORY, DATASTORE_DIRECTORY, FILESERVER_DIR]:
         if os.path.isdir(d):
             shutil.rmtree(d)
         os.mkdir(d)
     # Setup datastore and API
     self.config = AppConfig(configuration_file=CONFIG_FILE)
     self.fileserver = DefaultFileServer(FILESERVER_DIR)
     self.config.envs = {
         'default': TestEnv(),
         'blocked': self.config.envs[ENGINEENV_DEFAULT]
     }
     self.datastore = FileSystemDataStore(DATASTORE_DIRECTORY)
     self.api = VizierWebService(
         FileSystemViztrailRepository(
             WORKTRAILS_DIRECTORY,
             self.config.envs
         ),
         self.datastore,
         self.fileserver,
         self.config
     )
Example #16
configuration is used.
"""

config = AppConfig()

# Create the app and enable cross-origin resource sharing
app = Flask(__name__)
app.config['APPLICATION_ROOT'] = config.api.app_path
app.config['DEBUG'] = config.debug
# Set size limit for uploaded files
app.config['MAX_CONTENT_LENGTH'] = config.fileserver.max_file_size

CORS(app)

# Currently uses the default file server
fileserver = DefaultFileServer(config.fileserver.directory)

# Create datastore for the API. Different execution environments may use
# different data stores. The API needs to be able to serve datasets from all
# of them. Thus, if more than one execution environment is specified we need
# to use a federated datastore. Individual viztrails will create their own
# instances of their respective data store.
datastores = list()
for env_id in config.envs:
    env_conf = config.envs[env_id]
    if env_id == ENGINEENV_DEFAULT:
        datastores.append(FileSystemDataStore(env_conf.datastore.directory))
    elif env_id == ENGINEENV_MIMIR:
        datastores.append(MimirDataStore(env_conf.datastore.directory))
    else:
        raise RuntimeError('unknown execution environment \'' + env_id + '\'')
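The comment above explains why a federated datastore is needed when several execution environments are configured, but the snippet ends before the stores are combined. A sketch of that final step, assuming a FederatedDataStore wrapper class (the class name is an assumption and may differ in the actual codebase):

# Combine the per-environment stores into the single datastore used by the API.
# FederatedDataStore is assumed here; with only one environment the store can
# be used directly.
if len(datastores) == 1:
    datastore = datastores[0]
else:
    datastore = FederatedDataStore(datastores)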
Example #17
class TestVizualEngine(unittest.TestCase):
    def set_up(self, engine):
        """Create an empty file server repository."""
        # Drop project descriptor directory
        if os.path.isdir(FILESERVER_DIR):
            shutil.rmtree(FILESERVER_DIR)
        # Setup project repository
        self.fs = DefaultFileServer(FILESERVER_DIR)
        if engine == ENGINEENV_DEFAULT:
            self.datastore = FileSystemDataStore(DATASTORE_DIR)
            self.vizual = DefaultVizualEngine(self.datastore, self.fs)
        elif engine == ENGINEENV_MIMIR:
            self.datastore = MimirDataStore(DATASTORE_DIR)
            self.vizual = MimirVizualEngine(self.datastore, self.fs)
        self.file = self.fs.upload_file(CSV_FILE)

    def tear_down(self, engine):
        """Clean-up by dropping file server directory.
        """
        # Drop data store directory
        if os.path.isdir(DATASTORE_DIR):
            shutil.rmtree(DATASTORE_DIR)
        # Drop project descriptor directory
        if os.path.isdir(FILESERVER_DIR):
            shutil.rmtree(FILESERVER_DIR)

    def test_default_engine(self):
        """Test functionality if the default VizUAL engine."""
        self.run_engine_tests(ENGINEENV_DEFAULT)

    def test_mimir_engine(self):
        """Test functionality if the Mimir VizUAL engine."""
        import vistrails.packages.mimir.init as mimir
        mimir.initialize()
        self.run_engine_tests(ENGINEENV_MIMIR)
        mimir.finalize()

    def run_engine_tests(self, engine):
        """Run sequence of tests for given engine."""
        self.load_dataset(engine)
        self.insert_column(engine)
        self.insert_row(engine)
        self.delete_column(engine)
        self.delete_row(engine)
        self.move_column(engine)
        self.move_row(engine)
        self.rename_column(engine)
        self.update_cell(engine)
        self.filter_columns(engine)
        self.sort_dataset(engine)
        self.sequence_of_steps(engine)

    def delete_column(self, engine):
        """Test functionality to delete a column."""
        self.set_up(engine)
        # Create a new dataset
        ds = self.vizual.load_dataset(self.file.identifier)
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifier
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Delete Age column
        col_id = ds.column_by_name('AGE').identifier
        col_count, id1 = self.vizual.delete_column(ds.identifier, col_id)
        del col_ids[1]
        # Result should indicate that one column was deleted. The identifier of
        # the resulting dataset should differ from the identifier of the
        # original dataset
        self.assertEquals(col_count, 1)
        self.assertNotEquals(id1, ds.identifier)
        # Retrieve modified dataset and ensure that it contains the following
        #
        # Name, Salary
        # ------------
        # Alice, 35K
        # Bob, 30K
        ds = self.datastore.get_dataset(id1)
        ds_rows = ds.fetch_rows()
        # Schema is Name, Salary
        self.assertEquals(len(ds.columns), 2)
        self.assertEquals(ds.columns[0].name.upper(), 'NAME')
        self.assertEquals(ds.columns[1].name.upper(), 'SALARY')
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEquals(ds.columns[i].identifier, col_ids[i])
        # Make sure that all rows only have two columns
        row = ds_rows[0]
        self.assertEquals(len(row.values), 2)
        self.assertEquals(len(row.values), 2)
        self.assertEquals(row.values[0], 'Alice')
        self.assertEquals(row.values[1], '35K')
        row = ds_rows[1]
        self.assertEquals(len(row.values), 2)
        self.assertEquals(len(row.values), 2)
        self.assertEquals(row.values[0], 'Bob')
        self.assertEquals(row.values[1], '30K')
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEquals(ds_rows[i].identifier, row_ids[i])
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(ValueError):
            self.vizual.delete_column('unknown:uri', 0)
        self.tear_down(engine)

    def delete_row(self, engine):
        """Test functionality to delete a row."""
        self.set_up(engine)
        # Create a new dataset
        ds = self.vizual.load_dataset(self.file.identifier)
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifier
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Delete second row
        row_count, id1 = self.vizual.delete_row(ds.identifier, 1)
        del row_ids[1]
        # Result should indicate that one row was deleted. The identifier of the
        #  resulting dataset should differ from the identifier of the original
        # dataset
        self.assertEquals(row_count, 1)
        self.assertNotEquals(id1, ds.identifier)
        # Retrieve modified dataset and ensure that it contains the following
        # data:
        #
        # Name, Age, Salary
        # ------------
        # Alice, 23, 35K
        ds = self.datastore.get_dataset(id1)
        ds_rows = ds.fetch_rows()
        # Schema is Name, Age, Salary
        col_names = ['Name', 'Age', 'Salary']
        self.assertEquals(len(ds.columns), len(col_names))
        for i in range(len(ds.columns)):
            self.assertEquals(ds.columns[i].name.upper(), col_names[i].upper())
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEquals(ds.columns[i].identifier, col_ids[i])
        # There should only be one row
        self.assertEquals(len(ds_rows), 1)
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEquals(ds_rows[i].identifier, row_ids[i])
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(ValueError):
            self.vizual.delete_row('unknown:uri', 1)
        # Ensure exception is thrown if row index is out of bounds
        with self.assertRaises(ValueError):
            self.vizual.delete_row(ds.identifier, 100)
        self.tear_down(engine)

    def filter_columns(self, engine):
        """Test projection of a dataset."""
        self.set_up(engine)
        # Create a new dataset
        ds = self.vizual.load_dataset(self.file.identifier)
        count, ds_id = self.vizual.filter_columns(ds.identifier, [2, 0],
                                                  ['BD', None])
        ds = self.datastore.get_dataset(ds_id)
        self.assertEquals(len(ds.columns), 2)
        self.assertEquals(ds.columns[0].identifier, 2)
        self.assertEquals(ds.columns[0].name.upper(), 'BD')
        self.assertEquals(ds.columns[1].identifier, 0)
        self.assertEquals(ds.columns[1].name.upper(), 'NAME')
        rows = ds.fetch_rows()
        self.assertEquals(rows[0].values, ['35K', 'Alice'])
        self.assertEquals(rows[1].values, ['30K', 'Bob'])
        with self.assertRaises(ValueError):
            self.vizual.filter_columns(ds.identifier, [0, 1], ['BD', None])

        self.tear_down(engine)

    def insert_column(self, engine):
        """Test functionality to insert a columns."""
        self.set_up(engine)
        # Create a new dataset
        ds = self.vizual.load_dataset(self.file.identifier)
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifier
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Insert column at position 1
        col_ids.insert(1, ds.column_counter)
        col_count, id1 = self.vizual.insert_column(ds.identifier, 1, 'Height')
        # Result should indicate that one column was inserted. The identifier of
        # the resulting dataset should differ from the identifier of the
        # original dataset
        self.assertEquals(col_count, 1)
        self.assertNotEquals(id1, ds.identifier)
        # Retrieve dataset and ensure that it has the following schema:
        # Name, Height, Age, Salary
        ds = self.datastore.get_dataset(id1)
        col_names = ['Name', 'Height', 'Age', 'Salary']
        # Ensure that there are four columns
        self.assertEquals(len(ds.columns), len(col_names))
        for i in range(len(col_names)):
            col = ds.columns[i]
            self.assertEquals(col.identifier, col_ids[i])
            self.assertEquals(col.name.upper(), col_names[i].upper())
        # Insert column at last position
        col_ids.append(ds.column_counter)
        col_names.append('Weight')
        col_count, id2 = self.vizual.insert_column(id1, 4, 'Weight')
        # Result should indicate that one column was inserted. The identifier of
        # the resulting dataset should differ from the identifier of the
        # previous dataset
        self.assertEquals(col_count, 1)
        self.assertNotEquals(id1, id2)
        # Retrieve dataset and ensure that it has the following schema:
        # Name, Height, Age, Salary, Weight
        ds = self.datastore.get_dataset(id2)
        ds_rows = ds.fetch_rows()
        # Ensure that there are five columns
        self.assertEquals(len(ds.columns), len(col_names))
        for i in range(len(col_names)):
            col = ds.columns[i]
            self.assertEquals(col.identifier, col_ids[i])
            self.assertEquals(col.name.upper(), col_names[i].upper())
        # The cell values for the new columns are None; all other values are not None
        for row in ds_rows:
            for i in range(len(ds.columns)):
                if i == 1 or i == 4:
                    self.assertTrue(is_null(row.values[i]))
                else:
                    self.assertFalse(is_null(row.values[i]))
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEquals(ds_rows[i].identifier, row_ids[i])
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(ValueError):
            self.vizual.insert_column('unknown:uri', 1, 'Height')
        # A column name with spaces is valid and does not raise an exception
        self.vizual.insert_column(ds.identifier, 1, 'Height from ground')
        # Ensure exception is thrown if the column name contains invalid characters
        with self.assertRaises(ValueError):
            self.vizual.insert_column(ds.identifier, 1,
                                      'Height from ground!@#')
        # Ensure exception is thrown if column position is out of bounds
        with self.assertRaises(ValueError):
            self.vizual.insert_column(ds.identifier, 100, 'Height')
        self.tear_down(engine)

    def insert_row(self, engine):
        """Test functionality to insert a row."""
        self.set_up(engine)
        # Create a new dataset
        ds = self.vizual.load_dataset(self.file.identifier)
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifier
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Insert row at index position 1
        row_ids.insert(1, ds.row_counter)
        # Result should indicate that one row was inserted. The identifier of
        # the resulting dataset should differ from the identifier of the
        # original dataset
        row_count, id1 = self.vizual.insert_row(ds.identifier, 1)
        self.assertEquals(row_count, 1)
        self.assertNotEquals(id1, ds.identifier)
        # Retrieve modified dataset
        ds = self.datastore.get_dataset(id1)
        ds_rows = ds.fetch_rows()
        # Ensure that there are three rows
        self.assertEquals(len(ds_rows), 3)
        # The second row has empty values for each column
        row = ds_rows[1]
        self.assertEquals(len(row.values), len(ds.columns))
        for i in range(len(ds.columns)):
            self.assertTrue(is_null(row.values[i]))
        # Append row at the end of the current dataset
        row_ids.append(ds.row_counter)
        row_count, id2 = self.vizual.insert_row(id1, 3)
        self.assertEquals(row_count, 1)
        self.assertNotEquals(id1, id2)
        ds = self.datastore.get_dataset(id2)
        ds_rows = ds.fetch_rows()
        # Ensure that there are four rows
        self.assertEquals(len(ds_rows), 4)
        # The next to last row has non-empty values for each column
        row = ds_rows[2]
        self.assertEquals(len(row.values), len(ds.columns))
        for i in range(len(ds.columns)):
            self.assertFalse(is_null(row.values[i]))
        # The last row has empty values for each column
        row = ds_rows[3]
        self.assertEquals(len(row.values), len(ds.columns))
        for i in range(len(ds.columns)):
            self.assertTrue(is_null(row.values[i]))
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEquals(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEquals(ds.columns[i].identifier, col_ids[i])
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(ValueError):
            self.vizual.insert_row('unknown:uri', 1)
        # Ensure exception is thrown if row index is out of bounds
        with self.assertRaises(ValueError):
            self.vizual.insert_row(ds.identifier, 5)
        # Ensure no exception is raised
        self.vizual.insert_row(ds.identifier, 4)
        self.tear_down(engine)

    def load_dataset(self, engine):
        """Test functionality to load a dataset."""
        self.set_up(engine)
        # Create a new dataset
        ds = self.vizual.load_dataset(self.file.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEquals(len(ds.columns), 3)
        self.assertEquals(len(ds_rows), 2)
        for row in ds_rows:
            self.assertTrue(isinstance(row.values[1], int))
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(ValueError):
            self.vizual.load_dataset('unknown:uri')
        self.tear_down(engine)

    def move_column(self, engine):
        """Test functionality to move a column."""
        self.set_up(engine)
        # Create a new dataset
        ds = self.vizual.load_dataset(self.file.identifier)
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifier
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Swap first two columns
        c = col_ids[0]
        del col_ids[0]
        col_ids.insert(1, c)
        col_count, id1 = self.vizual.move_column(
            ds.identifier,
            ds.column_by_name('Name').identifier, 1)
        self.assertEquals(col_count, 1)
        self.assertNotEquals(id1, ds.identifier)
        ds = self.datastore.get_dataset(id1)
        ds_rows = ds.fetch_rows()
        self.assertEquals(ds.columns[0].name.upper(), 'Age'.upper())
        self.assertEquals(ds.columns[1].name.upper(), 'Name'.upper())
        self.assertEquals(ds.columns[2].name.upper(), 'Salary'.upper())
        row = ds_rows[0]
        self.assertEquals(row.values[0], 23)
        self.assertEquals(row.values[1], 'Alice')
        self.assertEquals(row.values[2], '35K')
        row = ds_rows[1]
        self.assertEquals(row.values[0], 32)
        self.assertEquals(row.values[1], 'Bob')
        self.assertEquals(row.values[2], '30K')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEquals(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEquals(ds.columns[i].identifier, col_ids[i])
        # Swap last two columns
        c = col_ids[1]
        del col_ids[1]
        col_ids.append(c)
        col_count, id2 = self.vizual.move_column(
            id1,
            ds.column_by_name('Salary').identifier, 1)
        ds = self.datastore.get_dataset(id2)
        ds_rows = ds.fetch_rows()
        self.assertEquals(ds.columns[0].name.upper(), 'Age'.upper())
        self.assertEquals(ds.columns[1].name.upper(), 'Salary'.upper())
        self.assertEquals(ds.columns[2].name.upper(), 'Name'.upper())
        row = ds_rows[0]
        self.assertEquals(row.values[0], 23)
        self.assertEquals(row.values[1], '35K')
        self.assertEquals(row.values[2], 'Alice')
        row = ds_rows[1]
        self.assertEquals(row.values[0], 32)
        self.assertEquals(row.values[1], '30K')
        self.assertEquals(row.values[2], 'Bob')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEquals(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEquals(ds.columns[i].identifier, col_ids[i])
        # Raise error if source column is out of bounds
        with self.assertRaises(ValueError):
            self.vizual.move_column(id2, 40, 1)
        # Raise error if target position is out of bounds
        with self.assertRaises(ValueError):
            self.vizual.move_column(id2,
                                    ds.column_by_name('Name').identifier, -1)
        with self.assertRaises(ValueError):
            self.vizual.move_column(id2,
                                    ds.column_by_name('Name').identifier, 4)
        self.tear_down(engine)

    def move_row(self, engine):
        """Test functionality to move a row."""
        self.set_up(engine)
        # Create a new dataset
        ds = self.vizual.load_dataset(self.file.identifier)
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifier
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Swap first two rows
        row_ids = [row for row in reversed(row_ids)]
        row_count, id1 = self.vizual.move_row(ds.identifier, 0, 1)
        self.assertEquals(row_count, 1)
        self.assertNotEquals(id1, ds.identifier)
        ds = self.datastore.get_dataset(id1)
        ds_rows = ds.fetch_rows()
        self.assertEquals(ds.columns[0].name.upper(), 'Name'.upper())
        self.assertEquals(ds.columns[1].name.upper(), 'Age'.upper())
        self.assertEquals(ds.columns[2].name.upper(), 'Salary'.upper())
        row = ds_rows[0]
        self.assertEquals(row.values[0], 'Bob')
        self.assertEquals(row.values[1], 32)
        self.assertEquals(row.values[2], '30K')
        row = ds_rows[1]
        self.assertEquals(row.values[0], 'Alice')
        self.assertEquals(row.values[1], 23)
        self.assertEquals(row.values[2], '35K')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEquals(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEquals(ds.columns[i].identifier, col_ids[i])
        # Swap last two rows
        row_ids = [row for row in reversed(row_ids)]
        row_count, id2 = self.vizual.move_row(id1, 1, 0)
        ds = self.datastore.get_dataset(id2)
        ds_rows = ds.fetch_rows()
        self.assertEquals(ds.columns[0].name.upper(), 'Name'.upper())
        self.assertEquals(ds.columns[1].name.upper(), 'Age'.upper())
        self.assertEquals(ds.columns[2].name.upper(), 'Salary'.upper())
        row = ds_rows[0]
        self.assertEquals(row.values[0], 'Alice')
        self.assertEquals(row.values[1], 23)
        self.assertEquals(row.values[2], '35K')
        row = ds_rows[1]
        self.assertEquals(row.values[0], 'Bob')
        self.assertEquals(row.values[1], 32)
        self.assertEquals(row.values[2], '30K')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEquals(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEquals(ds.columns[i].identifier, col_ids[i])
        # Move first row to the end
        row_count, id3 = self.vizual.move_row(id2, 0, 2)
        row_ids = [row for row in reversed(row_ids)]
        ds = self.datastore.get_dataset(id3)
        ds_rows = ds.fetch_rows()
        row = ds_rows[0]
        self.assertEquals(row.values[0], 'Bob')
        self.assertEquals(row.values[1], 32)
        self.assertEquals(row.values[2], '30K')
        row = ds_rows[1]
        self.assertEquals(row.values[0], 'Alice')
        self.assertEquals(row.values[1], 23)
        self.assertEquals(row.values[2], '35K')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEquals(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEquals(ds.columns[i].identifier, col_ids[i])
        # Raise error if source row is out of bounds
        with self.assertRaises(ValueError):
            self.vizual.move_row(id2, 3, 1)
        # Raise error if target position is out of bounds
        with self.assertRaises(ValueError):
            self.vizual.move_row(id2, 0, -1)
        with self.assertRaises(ValueError):
            self.vizual.move_row(id2, 1, 4)
        self.tear_down(engine)

    def rename_column(self, engine):
        """Test functionality to rename a column."""
        self.set_up(engine)
        # Create a new dataset
        ds = self.vizual.load_dataset(self.file.identifier)
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifier
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Rename first column to Firstname
        col_count, id1 = self.vizual.rename_column(
            ds.identifier,
            ds.column_by_name('Name').identifier, 'Firstname')
        self.assertEquals(col_count, 1)
        self.assertNotEquals(id1, ds.identifier)
        ds = self.datastore.get_dataset(id1)
        self.assertEquals(ds.columns[0].name.upper(), 'Firstname'.upper())
        self.assertEquals(ds.columns[1].name.upper(), 'Age'.upper())
        self.assertEquals(ds.columns[2].name.upper(), 'Salary'.upper())
        col_count, id2 = self.vizual.rename_column(
            id1,
            ds.column_by_name('Age').identifier, 'BDate')
        ds = self.datastore.get_dataset(id2)
        ds_rows = ds.fetch_rows()
        self.assertEquals(ds.columns[0].name.upper(), 'Firstname'.upper())
        self.assertEquals(ds.columns[1].name, 'BDate')
        self.assertEquals(ds.columns[2].name.upper(), 'Salary'.upper())
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEquals(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEquals(ds.columns[i].identifier, col_ids[i])
        # Ensure exception is thrown if dataset identifier is unknown
        with self.assertRaises(ValueError):
            self.vizual.rename_column('unknown:uri', 0, 'Firstname')
        # Ensure exception is thrown for invalid column id
        with self.assertRaises(ValueError):
            self.vizual.rename_column(id2, 500, 'BDate')
        self.tear_down(engine)

    def sequence_of_steps(self, engine):
        """Test sequence of calls that modify a dataset."""
        self.set_up(engine)
        # Create a new dataset
        ds = self.vizual.load_dataset(self.file.identifier)
        count, ds_id = self.vizual.insert_row(ds.identifier, 1)
        ds = self.datastore.get_dataset(ds_id)
        count, ds_id = self.vizual.insert_column(ds_id, 3, 'HDate')
        ds = self.datastore.get_dataset(ds_id)
        count, ds_id = self.vizual.update_cell(
            ds_id,
            ds.column_by_name('HDate').identifier, 0, '180')
        ds = self.datastore.get_dataset(ds_id)
        count, ds_id = self.vizual.update_cell(
            ds_id,
            ds.column_by_name('HDate').identifier, 1, '160')
        ds = self.datastore.get_dataset(ds_id)
        count, ds_id = self.vizual.rename_column(
            ds_id,
            ds.column_by_name('HDate').identifier, 'Height')
        ds = self.datastore.get_dataset(ds_id)
        count, ds_id = self.vizual.update_cell(
            ds_id,
            ds.column_by_name('Height').identifier, 2, '170')
        ds = self.datastore.get_dataset(ds_id)
        count, ds_id = self.vizual.move_row(ds_id, 1, 2)
        ds = self.datastore.get_dataset(ds_id)
        count, ds_id = self.vizual.update_cell(
            ds_id,
            ds.column_by_name('Name').identifier, 2, 'Carla')
        ds = self.datastore.get_dataset(ds_id)
        count, ds_id = self.vizual.update_cell(
            ds_id,
            ds.column_by_name('Age').identifier, 2, '45')
        ds = self.datastore.get_dataset(ds_id)
        count, ds_id = self.vizual.update_cell(
            ds_id,
            ds.column_by_name('Salary').identifier, 2, '56K')
        ds = self.datastore.get_dataset(ds_id)
        count, ds_id = self.vizual.move_column(
            ds_id,
            ds.column_by_name('Salary').identifier, 4)
        ds = self.datastore.get_dataset(ds_id)
        count, ds_id = self.vizual.delete_column(
            ds_id,
            ds.column_by_name('Age').identifier)
        ds = self.datastore.get_dataset(ds_id)
        count, ds_id = self.vizual.delete_row(ds_id, 0)
        ds = self.datastore.get_dataset(ds_id)
        count, ds_id = self.vizual.delete_row(ds_id, 0)
        ds = self.datastore.get_dataset(ds_id)
        ds_rows = ds.fetch_rows()
        names = ['Name', 'Height', 'Salary']
        self.assertEquals(len(ds.columns), len(names))
        for i in range(len(names)):
            col = ds.columns[i]
            self.assertEquals(col.name.upper(), names[i].upper())
        self.assertEquals([col.identifier for col in ds.columns], [0, 3, 2])
        self.assertEquals(len(ds_rows), 1)
        self.assertEquals(ds_rows[0].values, ['Carla', '160', '56K'])
        self.assertEquals(ds_rows[0].identifier, 2)
        self.tear_down(engine)

    def sort_dataset(self, engine):
        """Test sorting a dataset."""
        self.set_up(engine)
        # Create a new dataset
        fh = self.fs.upload_file(SORT_FILE)
        ds = self.vizual.load_dataset(fh.identifier)
        count, ds_id = self.vizual.sort_dataset(ds.identifier, [1, 2, 0],
                                                [False, False, True])
        ds = self.datastore.get_dataset(ds_id)
        rows = ds.fetch_rows()
        names = ['Alice', 'Bob', 'Dave', 'Gertrud', 'Frank']
        result = list()
        for row in rows:
            name = row.values[0]
            if name in names:
                result.append(name)
        for i in range(len(names)):
            self.assertEquals(names[i], result[i])
        count, ds_id = self.vizual.sort_dataset(ds.identifier, [2, 1, 0],
                                                [True, False, True])
        ds = self.datastore.get_dataset(ds_id)
        rows = ds.fetch_rows()
        names = ['Gertrud', 'Frank', 'Bob', 'Alice', 'Dave']
        result = list()
        for row in rows:
            name = row.values[0]
            if name in names:
                result.append(name)
        for i in range(len(names)):
            self.assertEquals(names[i], result[i])
        self.tear_down(engine)

    def update_cell(self, engine):
        """Test functionality to update a dataset cell."""
        self.set_up(engine)
        # Create a new dataset
        ds = self.vizual.load_dataset(self.file.identifier)
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifier
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Update cell [0, 0]. Ensure that one row was updated and a new
        # identifier is generated. Also ensure that the resulting dataset
        # has the new value in cell [0, 0]
        upd_rows, id1 = self.vizual.update_cell(ds.identifier, 0, 0, 'MyValue')
        self.assertEquals(upd_rows, 1)
        self.assertNotEquals(ds.identifier, id1)
        ds = self.datastore.get_dataset(id1)
        ds_rows = ds.fetch_rows()
        self.assertEquals(ds_rows[0].values[0], 'MyValue')
        upd_rows, id2 = self.vizual.update_cell(
            id1,
            ds.column_by_name('Name').identifier, 0, 'AValue')
        ds = self.datastore.get_dataset(id2)
        ds_rows = ds.fetch_rows()
        self.assertEquals(ds_rows[0].values[0], 'AValue')
        self.assertEquals(ds_rows[0].values[ds.column_index('Name')], 'AValue')
        # Ensure that row ids haven't changed
        for i in range(len(ds_rows)):
            self.assertEquals(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEquals(ds.columns[i].identifier, col_ids[i])
        # Set value to None
        upd_rows, id3 = self.vizual.update_cell(
            id2,
            ds.column_by_name('Name').identifier, 0, None)
        ds = self.datastore.get_dataset(id3)
        ds_rows = ds.fetch_rows()
        self.assertIsNone(ds_rows[0].values[0])
        self.assertIsNone(ds_rows[0].values[ds.column_index('Name')])
        # Ensure exception is thrown if column is unknown
        with self.assertRaises(ValueError):
            self.vizual.update_cell(ds.identifier, 100, 0, 'MyValue')
        # Ensure exception is thrown if row index is out of bounds
        with self.assertRaises(ValueError):
            self.vizual.update_cell(ds.identifier, 0, 100, 'MyValue')
        self.tear_down(engine)
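The per-operation methods in Example #17 are not named test_*, so the unittest runner only discovers test_default_engine and test_mimir_engine, which drive them through run_engine_tests. A standard entry point (assumed, not shown in the source) would be:

if __name__ == '__main__':
    unittest.main()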
Example #18
class TestLoadMimirDataset(unittest.TestCase):

    def setUp(self):
        """Create an empty work trails repository."""
        # Cleanup first
        self.cleanUp()
        self.datastore = MimirDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.vizual = MimirVizualEngine(self.datastore, self.fileserver)

    def tearDown(self):
        """Clean-up by deleting directories.
        """
        self.cleanUp()

    def cleanUp(self):
        """Remove datastore and fileserver directory."""
        # Delete directories
        for d in [DATASTORE_DIR, FILESERVER_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def test_load(self):
        """Run workflow with default configuration."""
        mimir.initialize()
        self.update_cell(CSV_FILE, 2, 0, 'int', 10)
        self.update_cell(CSV_FILE, 2, 0, 'int', 10.3, result_type='real')
        self.update_cell(CSV_FILE, 2, 0, 'int', None)
        self.update_cell(CSV_FILE, 3, 0, 'real', 10.3)
        self.update_cell(CSV_FILE, 3, 0, 'real', 10, result_value=10.0)
        self.update_cell(CSV_FILE, 3, 0, 'real', 'A', result_type='varchar')
        self.update_cell(CSV_FILE, 3, 0, 'real', None)
        self.update_cell(CSV_FILE, 4, 0, 'varchar', 'A')
        self.update_cell(CSV_FILE, 4, 0, 'varchar', 10, result_value='10')
        self.update_cell(CSV_FILE, 4, 0, 'varchar', 10.87, result_value='10.87')
        self.update_cell(CSV_FILE, 4, 0, 'varchar', None)
        self.update_cell(CSV_FILE, 8, 0, 'bool', 'False', result_value=False)
        self.update_cell(CSV_FILE, 8, 0, 'bool', '0', result_value=False)
        self.update_cell(CSV_FILE, 8, 0, 'bool', None)
        self.update_cell(CSV_FILE, 8, 1, 'bool', True, result_value=True)
        self.update_cell(CSV_FILE, 8, 1, 'bool', '1', result_value=True)
        self.update_cell(CSV_FILE, 8, 1, 'bool', 'A', result_value='A', result_type='varchar')
        self.update_cell(CSV_FILE, 8, 1, 'bool', 10.87, result_value='10.87', result_type='varchar')
        self.update_cell(CSV_FILE_DT, 1, 0, 'date', '2018-05-09')
        self.update_cell(CSV_FILE_DT, 1, 0, 'date', '20180509', result_value='20180509', result_type='varchar')
        self.update_cell(CSV_FILE_DT, 1, 0, 'date', None)
        self.update_cell(CSV_FILE_DT, 0, 0, 'datetime', '2018-05-09 12:03:22.0000')
        self.update_cell(CSV_FILE_DT, 0, 0, 'datetime', 'ABC', result_value='ABC', result_type='varchar')
        self.update_cell(CSV_FILE_DT, 0, 0, 'datetime', None)
        mimir.finalize()

    def update_cell(self, filename, col, row, data_type, value, result_value=None, result_type=None):
        """Update the value of the given cell. The column data type is expected
        to match the given datatype. The optional result value is the expected
        value of the cell in the modified dataset.
        """
        f_handle = self.fileserver.upload_file(filename)
        ds = self.datastore.load_dataset(f_handle)
        #print [c.name_in_rdb + ' AS ' + c.name + '(' + c.data_type + ')' for c in ds.columns]
        self.assertEquals(ds.columns[col].data_type, data_type)
        rows = ds.fetch_rows()
        self.assertNotEquals(rows[row].values[col], value)
        _, ds_id = self.vizual.update_cell(ds.identifier, col, row, value)
        ds = self.datastore.get_dataset(ds_id)
        #print [c.name_in_rdb + ' AS ' + c.name + '(' + c.data_type + ')' for c in ds.columns]
        if result_type is None:
            self.assertEquals(ds.columns[col].data_type, data_type)
        else:
            self.assertEquals(ds.columns[col].data_type, result_type)
        rows = ds.fetch_rows()
        if result_value is None:
            self.assertEquals(rows[row].values[col], value)
        else:
            self.assertEquals(rows[row].values[col], result_value)
        self.fileserver.delete_file(f_handle.identifier)
Example #19
class TestWebServiceAPI(unittest.TestCase):
    def setUp(self):
        """Create an new Web Service API."""
        # Clear various directories
        for d in [WORKTRAILS_DIR, DATASTORE_DIR, FILESERVER_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)
            os.mkdir(d)
        # Setup datastore and API
        self.config = AppConfig()
        self.ENV = ExecEnv(
            FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
            packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON]
        ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
        self.ENGINE_ID = self.ENV.identifier
        self.config.envs[self.ENGINE_ID] = self.ENV
        self.config.fileserver = self.ENV.fileserver
        self.datastore = FileSystemDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.api = VizierWebService(
            FileSystemViztrailRepository(WORKTRAILS_DIR,
                                         {self.ENV.identifier: self.ENV}),
            self.datastore, self.fileserver, self.config)

    def tearDown(self):
        """Clean-up by deleting created directories.
        """
        for d in [WORKTRAILS_DIR, DATASTORE_DIR, FILESERVER_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def test_service_descriptors(self):
        """Ensure validity of the service descriptor and build information."""
        desc = self.api.service_overview()
        # The descriptor is expected to contain four elements: name, envs,
        # properties, and links. The name should be the same as in the default config
        self.validate_keys(desc, ['name', 'envs', 'properties', 'links'])
        self.assertEquals(desc['name'], self.config.name)
        self.assertFalse(len(desc['envs']) == 0)
        for env in desc['envs']:
            self.validate_keys(
                env, ['id', 'name', 'description', 'default', 'packages'])
        # Expect seven references in the link listing: self, build, doc,
        # upload, projects, notebooks, and files
        self.validate_links(desc['links'], [
            'self', 'build', 'doc', 'upload', 'projects', 'notebooks', 'files'
        ])
        # The build information should have two elements: components and links
        build = self.api.system_build()
        self.assertEquals(len(build), 2)
        for key in ['components', 'links']:
            self.assertTrue(key in build)
        # The components list should include three entries (datastore, fileserver,
        # and viztrails), each with name and version information.
        components = {c['name']: c['build'] for c in build['components']}
        self.assertEquals(len(components), 3)
        for key in ['datastore', 'fileserver', 'viztrails']:
            self.assertTrue(key in components)
            for info in ['name', 'version']:
                self.assertTrue(info in components[key])

    def test_files(self):
        """Test API calls to upload and retrieve datasets."""
        # Upload a new dataset
        fh = self.api.upload_file(CSV_FILE)
        # The result should contain five elements: id, name, columns, rows, and
        # links
        self.validate_file_handle(fh)
        # Retrieve the full dataset from the API
        fh = self.api.get_file(fh['id'])
        self.validate_file_handle(fh)
        # Retrieving an unknown dataset should return None
        self.assertIsNone(self.api.get_file('invalid id'))
        self.assertIsNone(self.api.get_file('f0f0f0f0f0f0f0f0f0f0f0f0'))
        self.validate_file_listing(self.api.list_files(), 1)
        self.api.upload_file(TSV_FILE)
        self.validate_file_listing(self.api.list_files(), 2)
        self.validate_file_handle(self.api.rename_file(fh['id'], 'myfile'))
        self.validate_file_listing(self.api.list_files(), 2)
        self.assertIsNone(self.api.rename_file('invalid id', 'afile'))
        self.assertTrue(self.api.delete_file(fh['id']))
        self.validate_file_listing(self.api.list_files(), 1)

    def test_datasets(self):
        """Test retireval of datasets."""
        ds = self.datastore.load_dataset(self.fileserver.upload_file(CSV_FILE))
        ds = self.datastore.create_dataset(columns=ds.columns,
                                           rows=ds.fetch_rows())
        self.validate_dataset_handle(self.api.get_dataset(ds.identifier))
        anno = self.api.update_dataset_annotation(ds.identifier,
                                                  column_id=0,
                                                  key='comment',
                                                  value='Hello')
        anno_id = anno['annotations'][0]['id']
        self.api.update_dataset_annotation(ds.identifier,
                                           row_id=1,
                                           key='comment',
                                           value='World')
        self.api.update_dataset_annotation(ds.identifier,
                                           column_id=1,
                                           row_id=0,
                                           key='comment',
                                           value='!')
        self.validate_dataset_annotations(ds.identifier,
                                          column_id=0,
                                          expected={'comment': 'Hello'})
        self.validate_dataset_annotations(ds.identifier,
                                          row_id=1,
                                          expected={'comment': 'World'})
        self.validate_dataset_annotations(ds.identifier,
                                          column_id=1,
                                          row_id=0,
                                          expected={'comment': '!'})
        # Update annotations
        self.api.update_dataset_annotation(ds.identifier,
                                           anno_id=anno_id,
                                           column_id=0,
                                           key='comment',
                                           value='Some Name')
        self.validate_dataset_annotations(ds.identifier,
                                          column_id=0,
                                          expected={'comment': 'Some Name'})
        # Make sure unknown datasets are handled correctly
        self.assertIsNone(self.api.get_dataset('someunknonwidentifier'))
        self.assertIsNone(
            self.api.get_dataset_annotations('someunknonwidentifier'))

    def test_projects(self):
        """Test API calls to create and manipulate projects."""
        # Create a new project
        ph = self.api.create_project(self.ENV.identifier,
                                     {'name': 'My Project'})
        self.validate_project_descriptor(ph)
        self.validate_project_handle(self.api.get_project(ph['id']))
        # Project listing
        self.validate_project_listing(self.api.list_projects(), 1)
        ph = self.api.create_project(self.ENV.identifier,
                                     {'name': 'A Project'})
        self.validate_project_handle(self.api.get_project(ph['id']))
        self.validate_project_listing(self.api.list_projects(), 2)
        # Update project properties
        props = {p['key']: p['value'] for p in ph['properties']}
        self.assertEquals(props['name'], 'A Project')
        ph = self.api.update_project_properties(ph['id'], {'name': 'New Name'})
        self.validate_project_descriptor(ph)
        props = {p['key']: p['value'] for p in ph['properties']}
        self.assertEquals(props['name'], 'New Name')
        # Module specifications
        modules = self.api.list_module_specifications_for_project(ph['id'])
        self.assertEquals(len(modules), 3)
        self.validate_keys(modules, ['modules', 'project', 'links'])
        self.validate_project_descriptor(modules['project'])
        for m in modules['modules']:
            self.validate_keys(m, ['type', 'id', 'name', 'arguments'],
                               optional_keys=['group'])
            arg_keys = ['id', 'label', 'name', 'datatype', 'index', 'required']
            for arg in m['arguments']:
                self.assertTrue(len(arg) >= len(arg_keys))
                for k in arg_keys:
                    self.assertTrue(k in arg)
        # Delete project
        self.assertTrue(self.api.delete_project(ph['id']))
        self.validate_project_listing(self.api.list_projects(), 1)
        # Retrieve non-existing project should return None
        self.assertIsNone(self.api.get_project('invalid-id'))
        # Deleting a non-existing project should return False
        self.assertFalse(self.api.delete_project(ph['id']))
        # Updating a non-existing project should return None
        self.assertIsNone(
            self.api.update_project_properties(ph['id'], {'name': 'New Name'}))

    def test_spreadsheet(self):
        """Ensure that the includeDataset option is working for spreadsheet
        updates."""
        # Upload file
        fh = self.fileserver.upload_file(CSV_FILE)
        # Create project
        ph = self.api.create_project(self.ENV.identifier,
                                     {'name': 'My Project'})
        pid = ph['id']
        # Load dataset
        DS_NAME = 'myDS'
        cmd = load_dataset(fh.identifier, DS_NAME)
        result = self.api.append_module(pid, DEFAULT_BRANCH, -1, cmd)
        self.validate_keys(result, ['workflow', 'modules', 'datasets'])
        # Update cell and request to include dataset
        cmd = update_cell(DS_NAME, 0, 0, 'A')
        result = self.api.append_module(pid,
                                        DEFAULT_BRANCH,
                                        -1,
                                        cmd,
                                        includeDataset={
                                            'name': DS_NAME,
                                            'offset': 0
                                        })
        self.validate_keys(result, ['workflow', 'dataset'])
        self.validate_dataset_handle(result['dataset'])
        # In case of an error the result contains the module listing instead of the dataset
        cmd = update_cell(DS_NAME, 100, 0, 'A')
        result = self.api.append_module(pid,
                                        DEFAULT_BRANCH,
                                        -1,
                                        cmd,
                                        includeDataset={
                                            'name': DS_NAME,
                                            'offset': 0
                                        })
        self.validate_keys(result, ['workflow', 'modules', 'datasets'])

    def test_workflows(self):
        """Test API calls to retrieve and manipulate workflows."""
        # Create a new project
        ph = self.api.create_project(self.ENV.identifier,
                                     {'name': 'My Project'})
        self.validate_branch_listing(self.api.list_branches(ph['id']), 1)
        self.validate_branch_handle(
            self.api.get_branch(ph['id'], DEFAULT_BRANCH))
        wf = self.api.get_workflow(ph['id'], DEFAULT_BRANCH)
        self.validate_workflow_handle(wf)
        # Raise exception when creating branch of empty workflow
        with self.assertRaises(ValueError):
            self.api.create_branch(ph['id'], DEFAULT_BRANCH, -1, 0,
                                   {'name': 'My Branch'})
        # Creating a branch from a non-existing source branch raises an exception
        with self.assertRaises(ValueError):
            self.api.create_branch(ph['id'], 'unknown', -1, 0,
                                   {'name': 'My Branch'})
        # Execute a new command
        last_modified = ph['lastModifiedAt']
        result = self.api.append_module(ph['id'], DEFAULT_BRANCH, -1,
                                        python_cell('2+2'))
        self.validate_workflow_update_result(result)
        wf = self.api.get_workflow(ph['id'], DEFAULT_BRANCH)
        self.validate_workflow_handle(wf)
        modules = self.api.get_workflow_modules(ph['id'], DEFAULT_BRANCH, -1)
        self.validate_workflow_modules(modules, number_of_modules=1)
        self.assertNotEquals(last_modified, wf['project']['lastModifiedAt'])
        last_modified = wf['project']['lastModifiedAt']
        # Create a new branch
        time.sleep(1)
        desc = self.api.create_branch(ph['id'], DEFAULT_BRANCH, -1, 0,
                                      {'name': 'My Branch'})
        self.validate_branch_handle(desc)
        branch_wf = self.api.get_workflow(ph['id'], desc['id'])
        self.assertNotEquals(last_modified,
                             branch_wf['project']['lastModifiedAt'])
        last_modified = branch_wf['project']['lastModifiedAt']
        self.validate_workflow_handle(branch_wf)
        modules = self.api.get_workflow_modules(ph['id'], desc['id'], -1)
        self.validate_workflow_modules(modules, number_of_modules=1)
        # Replace module in new branch
        time.sleep(1)
        result = self.api.replace_module(ph['id'], desc['id'], -1, 0,
                                         python_cell('3+3'))
        self.validate_workflow_update_result(result)
        modules = self.api.get_workflow_modules(ph['id'], desc['id'], -1)
        self.validate_workflow_modules(modules, number_of_modules=1)
        branch_wf = self.api.get_workflow(ph['id'], desc['id'])
        # Ensure that the last modified date of the project has changed
        self.assertNotEquals(last_modified,
                             branch_wf['project']['lastModifiedAt'])
        # Append module to new branch
        self.validate_workflow_update_result(
            self.api.append_module(ph['id'], desc['id'], -1,
                                   python_cell('4+4')))
        modules = self.api.get_workflow_modules(ph['id'], desc['id'], -1)
        self.validate_workflow_modules(modules, number_of_modules=2)
        branch_wf = self.api.get_workflow_modules(ph['id'], desc['id'])
        self.assertEquals(len(branch_wf['modules']), 2)
        wf = self.api.get_workflow_modules(ph['id'], DEFAULT_BRANCH)
        self.assertEquals(len(wf['modules']), 1)
        self.validate_branch_listing(self.api.list_branches(ph['id']), 2)
        # Update new branch name
        branch_wf = self.api.update_branch(ph['id'], desc['id'],
                                           {'name': 'Some Branch'})
        self.validate_branch_handle(branch_wf)
        n1 = self.api.get_branch(ph['id'],
                                 DEFAULT_BRANCH)['properties'][0]['value']
        n2 = self.api.get_branch(ph['id'],
                                 desc['id'])['properties'][0]['value']
        self.assertEquals(n2, 'Some Branch')
        self.assertNotEquals(n1, n2)
        # Retrieving the workflow for an unknown project should return None
        self.assertIsNone(self.api.get_workflow('invalid id', DEFAULT_BRANCH))
        self.assertIsNone(
            self.api.get_workflow('f0f0f0f0f0f0f0f0f0f0f0f0', DEFAULT_BRANCH))
        # Delete branch
        self.assertTrue(self.api.delete_branch(ph['id'], desc['id']))
        self.assertFalse(self.api.delete_branch(ph['id'], desc['id']))
        with self.assertRaises(ValueError):
            self.api.delete_branch(ph['id'], DEFAULT_BRANCH)

    def test_workflow_commands(self):
        """Test API calls to execute workflow modules."""
        # Create a new project
        pj = self.api.create_project(self.ENV.identifier,
                                     {'name': 'My Project'})
        # Use Python cell commands to test module execution
        self.api.append_module(pj['id'], DEFAULT_BRANCH, -1,
                               python_cell('2+2'))
        self.api.append_module(pj['id'], DEFAULT_BRANCH, -1,
                               python_cell('3+3'))
        wf_master = self.api.get_workflow_modules(pj['id'], DEFAULT_BRANCH)
        content = list_modules_arguments_values(wf_master['modules'])
        self.assertEquals(len(content), 2)
        self.assertEquals(content[0], '2+2')
        self.assertEquals(content[1], '3+3')
        branch_id = self.api.create_branch(pj['id'], DEFAULT_BRANCH, -1, 0,
                                           {'name': 'My Name'})['id']
        wf_branch = self.api.get_workflow_modules(pj['id'], branch_id)
        content = list_modules_arguments_values(wf_branch['modules'])
        self.assertEquals(len(content), 1)
        self.assertEquals(content[0], '2+2')
        # Replace the first module in the master branch and append a module to the new branch
        self.api.replace_module(pj['id'], DEFAULT_BRANCH, -1, 0,
                                python_cell('4+4'))
        self.api.append_module(pj['id'], branch_id, -1, python_cell('5+5'))
        wf_master = self.api.get_workflow_modules(pj['id'], DEFAULT_BRANCH)
        content = list_modules_arguments_values(wf_master['modules'])
        self.assertEquals(len(content), 2)
        self.assertEquals(content[0], '4+4')
        self.assertEquals(content[1], '3+3')
        wf_branch = self.api.get_workflow_modules(pj['id'], branch_id)
        content = list_modules_arguments_values(wf_branch['modules'])
        self.assertEquals(len(content), 2)
        self.assertEquals(content[0], '2+2')
        self.assertEquals(content[1], '5+5')
        # Delete module
        m_count = len(wf_branch['modules'])
        m_id = wf_branch['modules'][-1]['id']
        self.api.delete_module(pj['id'], branch_id, -1, m_id)
        wf_branch = self.api.get_workflow_modules(pj['id'], branch_id)
        self.assertEquals(len(wf_branch['modules']), m_count - 1)
        for m in wf_branch['modules']:
            self.assertNotEquals(m['id'], m_id)
        self.assertIsNone(self.api.delete_module(pj['id'], branch_id, -1, 100))

    def validate_branch_listing(self, listing, number_of_branches):
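        """Validate the structure of a branch listing and its number of branches."""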
        self.validate_keys(listing, ['branches', 'links'])
        self.validate_links(listing['links'], ['self', 'create', 'project'])
        self.assertEquals(len(listing['branches']), number_of_branches)
        for br in listing['branches']:
            self.validate_branch_descriptor(br)

    def validate_branch_descriptor(self, branch):
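        """Validate the keys and links of a branch descriptor."""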
        self.validate_keys(branch, ['id', 'properties', 'links'])
        self.validate_links(branch['links'],
                            ['self', 'delete', 'head', 'project', 'update'])

    def validate_branch_handle(self, branch):
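        """Validate a full branch handle, including its project descriptor and
        its workflow descriptors."""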
        self.validate_keys(
            branch, ['id', 'project', 'workflows', 'properties', 'links'])
        self.validate_links(branch['links'],
                            ['self', 'delete', 'head', 'project', 'update'])
        self.validate_project_descriptor(branch['project'])
        for wf in branch['workflows']:
            self.validate_workflow_descriptor(wf)

    def validate_dataset_handle(self, ds):
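        """Validate a dataset handle, its columns, rows, and links."""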
        self.validate_keys(ds, [
            'id', 'columns', 'rows', 'links', 'offset', 'rowcount',
            'annotatedCells'
        ])
        for col in ds['columns']:
            self.validate_keys(col, ['id', 'name'])
        for row in ds['rows']:
            self.validate_keys(row, ['id', 'index', 'values'])
        self.validate_links(
            ds['links'],
            ['self', 'download', 'annotations', 'pagefirst', 'pagefirstanno'])

    def validate_dataset_annotations(self,
                                     ds_id,
                                     column_id=-1,
                                     row_id=-1,
                                     expected=dict()):
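        """Fetch annotations for the given dataset component and compare
        them against the expected key/value pairs."""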
        annotations = self.api.get_dataset_annotations(ds_id,
                                                       column_id=column_id,
                                                       row_id=row_id)
        keys = ['links', 'annotations']
        if column_id >= 0:
            keys.append('column')
        if row_id >= 0:
            keys.append('row')
        self.validate_keys(annotations, keys)
        annos = annotations['annotations']
        self.assertEquals(len(annos), len(expected))
        for anno in annos:
            self.validate_keys(anno, ['id', 'key', 'value'])
            key = anno['key']
            self.assertTrue(key in expected)
            self.assertEquals(anno['value'], expected[key])

    def validate_file_handle(self, fh):
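        """Validate the metadata keys and links of a file handle."""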
        self.validate_keys(fh, [
            'id', 'name', 'columns', 'rows', 'filesize', 'createdAt',
            'lastModifiedAt', 'links'
        ])
        links = {l['rel']: l['href'] for l in fh['links']}
        self.validate_links(fh['links'],
                            ['self', 'delete', 'rename', 'download'])

    def validate_file_listing(self, fl, number_of_files):
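        """Validate the file listing and its number of file handles."""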
        self.validate_keys(fl, ['files', 'links'])
        links = {l['rel']: l['href'] for l in fl['links']}
        self.validate_links(fl['links'], ['self', 'upload'])
        self.assertEquals(len(fl['files']), number_of_files)
        for fh in fl['files']:
            self.validate_file_handle(fh)

    def validate_keys(self, obj, keys, optional_keys=list()):
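        """Validate the keys of the given object. If the object has more
        entries than expected, every key must be expected or optional;
        otherwise, every expected key must be present."""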
        if len(obj) > len(keys):
            for key in obj:
                self.assertTrue(key in keys or key in optional_keys,
                                msg='Invalid key ' + key)
        else:
            for key in keys:
                self.assertTrue(key in obj, msg='Missing key ' + key)

    def validate_links(self, links, keys):
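        """Validate the 'rel' keys of a list of reference links."""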
        self.validate_keys({l['rel']: l['href'] for l in links}, keys)

    def validate_module_handle(self, module):
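        """Validate a module handle, its command, outputs, datasets, and links."""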
        self.validate_keys(module, [
            'id', 'command', 'text', 'stdout', 'stderr', 'datasets', 'links',
            'views'
        ])
        self.validate_keys(module['command'], ['type', 'id', 'arguments'])
        self.validate_links(module['links'], ['delete', 'insert', 'replace'])
        for ds in module['datasets']:
            self.validate_keys(ds, ['id', 'name'])

    def validate_project_descriptor(self, pd):
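        """Validate the metadata keys and links of a project descriptor."""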
        self.validate_keys(pd, [
            'id', 'environment', 'createdAt', 'lastModifiedAt', 'properties',
            'links'
        ])
        links = {l['rel']: l['href'] for l in pd['links']}
        self.validate_keys(
            links,
            ['self', 'delete', 'home', 'update', 'branches', 'environment'])

    def validate_project_handle(self, ph, br_count=1):
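        """Validate a full project handle, its environment, and its branches."""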
        self.validate_keys(ph, [
            'id', 'environment', 'createdAt', 'lastModifiedAt', 'properties',
            'branches', 'links'
        ])
        self.validate_links(
            ph['links'],
            ['self', 'delete', 'home', 'update', 'branches', 'environment'])
        self.validate_keys(ph['environment'], ['id', 'modules'])
        self.assertEquals(len(ph['branches']), br_count)
        for br in ph['branches']:
            self.validate_branch_descriptor(br)

    def validate_project_listing(self, pl, number_of_projects):
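        """Validate the project listing and its number of project descriptors."""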
        self.validate_keys(pl, ['projects', 'links'])
        self.validate_links(pl['links'], ['self', 'create', 'home'])
        self.assertEquals(len(pl['projects']), number_of_projects)
        for pj in pl['projects']:
            self.validate_project_descriptor(pj)

    def validate_workflow_descriptor(self, wf):
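        """Validate the keys and links of a workflow descriptor."""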
        self.validate_keys(wf, [
            'version', 'links', 'createdAt', 'packageId', 'commandId',
            'action', 'statement'
        ])
        self.validate_links(
            wf['links'], ['self', 'branch', 'branches', 'append', 'modules'])

    def validate_workflow_handle(self, wf):
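        """Validate a full workflow handle, its project descriptor, and its state."""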
        self.validate_keys(wf, [
            'project', 'branch', 'version', 'createdAt', 'state', 'links',
            'readOnly'
        ])
        self.validate_links(
            wf['links'], ['self', 'branch', 'branches', 'append', 'modules'])
        self.validate_project_descriptor(wf['project'])
        state = wf['state']
        self.validate_keys(state,
                           ['datasets', 'charts', 'hasError', 'moduleCount'])

    def validate_workflow_modules(self, wf, number_of_modules=0):
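        """Validate a workflow module listing and its number of module handles."""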
        self.validate_keys(wf, [
            'project', 'branch', 'version', 'modules', 'createdAt', 'links',
            'datasets', 'readOnly'
        ])
        self.validate_links(wf['links'], ['self', 'workflow'])
        self.validate_project_descriptor(wf['project'])
        self.assertEquals(len(wf['modules']), number_of_modules)
        for m in wf['modules']:
            self.validate_module_handle(m)

    def validate_workflow_update_result(self, wf):
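        """Validate the result of a workflow update, including the returned
        workflow handle, modules, and datasets."""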
        self.validate_keys(wf, ['workflow', 'modules', 'datasets'])
        self.validate_workflow_handle(wf['workflow'])
        for m in wf['modules']:
            self.validate_module_handle(m)
        for ds in wf['datasets']:
            self.validate_dataset_handle(ds)
Example #20
0
FILESERVER_DIR = './env/fs'

CSV_FILE = '../data/mimir/Employee.csv' #pick.csv


def cleanUp():
    """Remove datastore and fileserver directory."""
    # Delete directories
    for d in [DATASTORE_DIR, FILESERVER_DIR]:
        if os.path.isdir(d):
            shutil.rmtree(d)

cleanUp()

datastore = MimirDataStore(DATASTORE_DIR)
fileserver = DefaultFileServer(FILESERVER_DIR)
vizual = MimirVizualEngine(datastore, fileserver)

mimir.initialize()

filename = CSV_FILE
print 'LOAD ' + filename
f_handle = fileserver.upload_file(filename)
ds = datastore.load_dataset(f_handle)

ds_load = datastore.get_dataset(ds.identifier)
print [col.name_in_rdb + ' AS ' + col.name + '(' + col.data_type + ')' for col in ds_load.columns]
print str(ds.row_count) + ' row(s)'
rows = ds.fetch_rows()
for i in range(len(rows)):
    row = rows[i]
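    # Hypothetical addition: echo each row's values (mirrors the column listing printed above)
    print [str(v) for v in row.values]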
class TestDatasetPaginationReader(unittest.TestCase):
    def set_up(self, engine):
        """Create an empty file server repository."""
        # Drop project descriptor directory
        if os.path.isdir(FILESERVER_DIR):
            shutil.rmtree(FILESERVER_DIR)
        # Setup project repository
        self.fs = DefaultFileServer(FILESERVER_DIR)
        if engine == ENGINEENV_DEFAULT:
            self.datastore = FileSystemDataStore(DATASTORE_DIR)
            self.vizual = DefaultVizualEngine(self.datastore, self.fs)
        elif engine == ENGINEENV_MIMIR:
            self.datastore = MimirDataStore(DATASTORE_DIR)
            self.vizual = MimirVizualEngine(self.datastore, self.fs)

    def tear_down(self, engine):
        """Clean-up by dropping file server directory.
        """
        # Drop data store directory
        if os.path.isdir(DATASTORE_DIR):
            shutil.rmtree(DATASTORE_DIR)
        # Drop project descriptor directory
        if os.path.isdir(FILESERVER_DIR):
            shutil.rmtree(FILESERVER_DIR)

    def test_default_engine(self):
        """Test functionality for the default setup."""
        self.run_tests(ENGINEENV_DEFAULT)

    def test_mimir_engine(self):
        """Test functionality for the Mimir setup."""
        import vistrails.packages.mimir.init as mimir
        mimir.initialize()
        self.run_tests(ENGINEENV_MIMIR)
        mimir.finalize()

    def run_tests(self, engine):
        """Run sequence of tests for given configuration."""
        self.set_up(engine)
        ds = self.vizual.load_dataset(self.fs.upload_file(CSV_FILE).identifier)
        rows = ds.fetch_rows()
        self.assertEquals(len(rows), 7)
        rows = ds.fetch_rows(offset=1)
        self.assertEquals(len(rows), 6)
        self.assertEquals(rows[0].values[0], 'Bob')
        self.assertEquals(rows[5].values[0], 'Gertrud')
        rows = ds.fetch_rows(limit=2)
        self.assertEquals(len(rows), 2)
        self.assertEquals(rows[0].values[0], 'Alice')
        self.assertEquals(rows[1].values[0], 'Bob')
        rows = ds.fetch_rows(offset=4, limit=3)
        self.assertEquals(len(rows), 3)
        self.assertEquals(rows[0].values[0], 'Eileen')
        self.assertEquals(rows[2].values[0], 'Gertrud')
        rows = ds.fetch_rows(offset=5, limit=3)
        self.assertEquals(len(rows), 2)
        self.assertEquals(rows[0].values[0], 'Frank')
        self.assertEquals(rows[1].values[0], 'Gertrud')
        rows = ds.fetch_rows(offset=6, limit=3)
        self.assertEquals(len(rows), 1)
        self.assertEquals(rows[0].values[0], 'Gertrud')
        # Test larger dataset with deletes
        ds = self.vizual.load_dataset(self.fs.upload_file(TSV_FILE).identifier)
        rows = ds.fetch_rows(offset=0, limit=10)
        self.assertEquals(len(rows), 10)
        self.assertEquals([r.identifier for r in rows],
                          [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
        _, id1 = self.vizual.delete_row(ds.identifier, 2)  # ID=2
        _, id2 = self.vizual.delete_row(id1, 4)  # ID=5
        ds = self.datastore.get_dataset(id2)
        rows = ds.fetch_rows(offset=0, limit=10)
        self.assertEquals([r.identifier for r in rows],
                          [0, 1, 3, 4, 6, 7, 8, 9, 10, 11])
        _, id1 = self.vizual.move_row(ds.identifier, 9, 1)  # ID=11
        _, id2 = self.vizual.move_row(id1, 9, 1)  # ID=10
        ds = self.datastore.get_dataset(id2)
        rows = ds.fetch_rows(offset=1, limit=10)
        self.assertEquals([r.identifier for r in rows],
                          [10, 11, 1, 3, 4, 6, 7, 8, 9, 12])
        rows = ds.fetch_rows(offset=2, limit=10)
        self.assertEquals([r.identifier for r in rows],
                          [11, 1, 3, 4, 6, 7, 8, 9, 12, 13])
        rows = ds.fetch_rows(offset=3, limit=10)
        self.assertEquals([r.identifier for r in rows],
                          [1, 3, 4, 6, 7, 8, 9, 12, 13, 14])
        self.tear_down(engine)
Example #22
0
class TestUnicodeHandling(unittest.TestCase):

    def tearDown(self):
        """Clean-up by dropping the MongoDB colelction used by the engine.
        """
        # Delete directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def set_up(self):
        """Create an empty work trails repository."""
        # Create fresh set of directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)
            os.mkdir(d)

    def set_up_default(self):
        """Setup configuration using default Vizual engine."""
        env = ExecEnv(
                FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
                packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON]
            ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
        self.ENGINE_ID = env.identifier
        self.set_up()
        self.datastore = FileSystemDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(
            VIZTRAILS_DIR,
            {env.identifier: env}
        )

    def set_up_mimir(self):
        """Setup configuration using Mimir engine."""
        env = ExecEnv(
                FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
                identifier=ENGINEENV_MIMIR,
                packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON, PACKAGE_MIMIR]
            ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
        self.ENGINE_ID = env.identifier
        self.set_up()
        self.datastore = MimirDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(
            VIZTRAILS_DIR,
            {env.identifier: env}
        )

    def test_vt_default(self):
        """Run workflow with default configuration."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        self.set_up_default()
        self.run_workflow()

    def test_vt_mimir(self):
        """Run workflows for Mimir configurations."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        self.set_up_mimir()
        self.run_workflow()
        mimir.finalize()

    def run_workflow(self):
        """Test functionality to execute a Python script that creates a dataset
        containing unicode characters."""
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name' : 'My Project'})
        # LOAD DATASET
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        # RUN Python Script
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(PYTHON_SCRIPT)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        if wf.has_error:
            print wf.modules[-1].stderr
        self.assertFalse(wf.has_error)
        #print wf.modules[-1].stdout[0]['data']
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        names = set(c.name.upper().replace('_', ' ') for c in ds.columns)
        self.assertEquals(len(names), 4)
        for name in ['DATE', 'IMO CODE', 'PORT OF DEPARTURE', 'PLACE OF RECEIPT']:
            self.assertTrue(name in names)
Example #23
0
class TestDataStore(unittest.TestCase):
    def setUp(self):
        """Create empty data store directory."""
        # Setup file server and upload file
        if os.path.isdir(FILESERVER_DIR):
            shutil.rmtree(FILESERVER_DIR)
        os.mkdir(FILESERVER_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        # Remove directory if it exists
        if os.path.isdir(DATASTORE_DIRECTORY):
            shutil.rmtree(DATASTORE_DIRECTORY)
        os.mkdir(DATASTORE_DIRECTORY)
        self.db = MimirDataStore(DATASTORE_DIRECTORY)

    def tearDown(self):
        """Delete data store directory.
        """
        for d in [DATASTORE_DIRECTORY, FILESERVER_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def test_datastore(self):
        """Test functionality of the file server data store."""
        mimir.initialize()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        self.assertEquals(ds.column_counter, 3)
        self.assertEquals(ds.row_counter, 2)
        self.assertEquals(ds.row_count, 2)
        cols = [('NAME', COL_PREFIX + '0', 'varchar'),
                ('AGE', COL_PREFIX + '1', 'int'),
                ('SALARY', COL_PREFIX + '2', 'varchar')]
        control_rows = [(0, ['Alice', 23, '35K']), (1, ['Bob', 32, '30K'])]
        for column in ds.columns:
            self.validate_column(column, cols[column.identifier])
        self.validate_rowid_column(ds.rowid_column)
        self.validate_rows(ds.fetch_rows(), control_rows)
        # Get dataset and repeat tests
        ds = self.db.get_dataset(ds.identifier)
        self.assertEquals(ds.column_counter, 3)
        self.assertEquals(ds.row_counter, 2)
        self.assertEquals(len(ds.row_ids), 2)
        for column in ds.columns:
            self.validate_column(column, cols[column.identifier])
        self.validate_rowid_column(ds.rowid_column)
        self.validate_rows(ds.fetch_rows(), control_rows)
        # Create dataset
        names = ['NAME', 'AGE', 'SALARY']
        rows = ds.fetch_rows()
        rows[0].values[0] = 'Jane'
        rows = [rows[1], rows[0]]
        ds = self.db.create_dataset(columns=ds.columns, rows=rows)
        ds = self.db.get_dataset(ds.identifier)
        for i in range(3):
            col = ds.columns[i]
            self.assertEquals(col.identifier, i)
            self.assertEquals(col.name, names[i])
        rows = ds.fetch_rows()
        for i in range(len(rows)):
            row = rows[(len(rows) - 1) - i]
            self.assertEquals(row.identifier, i)
        self.assertEquals(rows[1].values[0], 'Jane')
        # DONE
        mimir.finalize()

    def validate_column(self, column, col_props):
        """Validate that column name and data type are as expected."""
        name, name_in_rdb, data_type = col_props
        self.assertEquals(column.name, name)
        self.assertEquals(column.name_in_rdb, name_in_rdb)
        self.assertEquals(column.data_type, data_type)

    def validate_rowid_column(self, col):
        """Ensure the row id column has the correct name and a data type."""
        self.assertEquals(col.name, col.name_in_rdb)
        self.assertEquals(col.name, ROW_ID)
        self.assertEquals(col.data_type, 'int')

    def validate_rows(self, dataset_rows, control_rows):
        """Make sure all data is read correctly."""
        self.assertEquals(len(dataset_rows), len(control_rows))
        for i in range(len(dataset_rows)):
            ds_row = dataset_rows[i]
            row_id, values = control_rows[i]
            self.assertEquals(ds_row.identifier, row_id)
            self.assertEquals(ds_row.values, values)
Example #24
0
class TestFileServer(unittest.TestCase):
    def setUp(self):
        """Create an empty file server repository."""
        # Drop project descriptor directory
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        # Setup project repository
        self.db = DefaultFileServer(SERVER_DIR)

    def tearDown(self):
        """Clean-up by dropping file server directory.
        """
        shutil.rmtree(SERVER_DIR)

    def test_delete_file(self):
        """Test delete file method."""
        f = self.db.upload_file(CSV_FILE)
        f = self.db.get_file(f.identifier)
        self.assertIsNotNone(f)
        self.assertTrue(self.db.delete_file(f.identifier))
        f = self.db.get_file(f.identifier)
        self.assertIsNone(f)

    def test_get_file(self):
        """Test file get method."""
        f = self.db.upload_file(CSV_FILE)
        f = self.db.get_file(f.identifier)
        self.assertEquals(f.columns, 3)
        self.assertEquals(f.rows, 2)
        self.assertEquals(f.name, 'dataset.csv')
        # Ensure that the file content can be parsed using the detected delimiter
        with f.open() as csvfile:
            rows = 0
            for row in csv.reader(csvfile, delimiter=f.delimiter):
                self.assertEquals(len(row), f.columns)
                rows += 1
        self.assertEquals(rows - 1, f.rows)

    def test_list_file(self):
        """Test upload of different file types and the list files method."""
        fh = self.db.upload_file(CSV_FILE)
        self.assertFalse(fh.compressed)
        self.assertEquals(fh.delimiter, ',')
        fh = self.db.upload_file(GZIP_CSV_FILE)
        self.assertTrue(fh.compressed)
        self.assertEquals(fh.delimiter, ',')
        fh = self.db.upload_file(TSV_FILE)
        self.assertFalse(fh.compressed)
        self.assertEquals(fh.delimiter, '\t')
        fh = self.db.upload_file(GZIP_TSV_FILE)
        self.assertTrue(fh.compressed)
        self.assertEquals(fh.delimiter, '\t')
        files = self.db.list_files()
        self.assertEquals(len(files), 4)
        # Ensure that each uploaded file can be parsed using its detected delimiter
        for f in files:
            with f.open() as csvfile:
                rows = 0
                for row in csv.reader(csvfile, delimiter=f.delimiter):
                    self.assertEquals(len(row), f.columns)
                    rows += 1
            self.assertEquals(rows - 1, f.rows)

    def test_rename_file(self):
        """Test rename file method."""
        f = self.db.upload_file(CSV_FILE)
        f = self.db.get_file(f.identifier)
        f = self.db.rename_file(f.identifier, 'somename')
        self.assertEquals(f.name, 'somename')
        f = self.db.get_file(f.identifier)
        f = self.db.rename_file(f.identifier, 'somename')
        self.assertEquals(f.name, 'somename')
        f = self.db.rename_file(f.identifier, 'somename')
        self.assertEquals(f.name, 'somename')

    def test_upload_file(self):
        """Test file upload."""
        f = self.db.upload_file(CSV_FILE)
        self.assertEquals(f.columns, 3)
        self.assertEquals(f.rows, 2)
        self.assertEquals(f.name, 'dataset.csv')
Example #25
0
class TestDataStore(unittest.TestCase):

    def setup_fileserver(self):
        """Create a fresh file server."""
        if os.path.isdir(FILESERVER_DIR):
            shutil.rmtree(FILESERVER_DIR)
        os.mkdir(FILESERVER_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)

    def set_up(self, store_type):
        """Create empty data store directory."""
        if store_type == MEM_DATASTORE:
            self.db = InMemDataStore()
        else:
            # Remove directory if it exists
            if os.path.isdir(DATASTORE_DIRECTORY):
                shutil.rmtree(DATASTORE_DIRECTORY)
            os.mkdir(DATASTORE_DIRECTORY)
            if store_type == FS_DATASTORE:
                self.db = FileSystemDataStore(DATASTORE_DIRECTORY)
            elif store_type == MIMIR_DATASTORE:
                self.db = MimirDataStore(DATASTORE_DIRECTORY)

    def tear_down(self, store_type):
        """Delete data store directory.
        """
        for d in [DATASTORE_DIRECTORY, FILESERVER_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def test_federated_datastore(self):
        """Test functionality of the federated data store."""
        self.setup_fileserver()
        store1 = InMemDataStore()
        store2 = InMemDataStore()
        fh = self.fileserver.upload_file(CSV_FILE)
        ds1 = store1.load_dataset(fh)
        ds2 = store2.load_dataset(fh)
        fed_store = FederatedDataStore([store1, store2])
        self.assertIsNotNone(fed_store.get_dataset(ds1.identifier))
        self.assertIsNotNone(fed_store.get_dataset(ds2.identifier))
        self.assertIsNone(fed_store.get_dataset('UNDEFINED'))
        with self.assertRaises(NotImplementedError):
            fed_store.load_dataset(fh)
        self.assertIsNotNone(fed_store.update_annotation(ds1.identifier, column_id=0, key='name', value='My Name'))
        self.assertIsNotNone(fed_store.update_annotation(ds2.identifier, column_id=0, key='name', value='My Name'))
        self.assertIsNone(fed_store.update_annotation('UNDEFINED', column_id=0, key='name', value='My Name'))

    def test_fs_datastore(self):
        """Run test for file system datastore."""
        self.run_tests(FS_DATASTORE)

    def test_mem_datastore(self):
        """Run test for in-memory datastore."""
        self.run_tests(MEM_DATASTORE)

    def test_mimir_datastore(self):
        """Run test for Mimir datastore."""
        mimir.initialize()
        self.run_tests(MIMIR_DATASTORE)
        self.set_up(MIMIR_DATASTORE)
        self.load_tsv()
        self.tear_down(MIMIR_DATASTORE)
        mimir.finalize()

    def test_volatile_datastore(self):
        """Test volatile data store on top of a file system data store."""
        self.set_up(FS_DATASTORE)
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        ds_rows = ds.fetch_rows()
        self.assertEquals(len(ds_rows), ds.row_count)
        v_store = VolatileDataStore(self.db)
        # Make sure the existing dataset is accessible via the volatile store
        v_ds = DatasetClient(dataset=v_store.get_dataset(ds.identifier))
        self.assertIsNotNone(v_ds)
        self.assertEquals(v_ds.get_cell('Salary', 1), '30K')
        # Create an updated dataset. The original should be the same in both
        # stores
        v_ds.rows[1].set_value('Salary', '40K')
        v_ds = v_store.create_dataset(columns=v_ds.columns, rows=v_ds.rows)
        self.assertEquals(DatasetClient(dataset=self.db.get_dataset(ds.identifier)).get_cell('Salary', 1), '30K')
        self.assertEquals(DatasetClient(dataset=v_store.get_dataset(ds.identifier)).get_cell('Salary', 1), '30K')
        self.assertEquals(DatasetClient(dataset=v_store.get_dataset(v_ds.identifier)).get_cell('Salary', 1), '40K')
        self.assertIsNone(self.db.get_dataset(v_ds.identifier))
        # Delete both datasets. The volatile store is empty. The original should
        # be unchanged.
        self.assertTrue(v_store.delete_dataset(ds.identifier))
        self.assertTrue(v_store.delete_dataset(v_ds.identifier))
        self.assertFalse(v_store.delete_dataset(ds.identifier))
        self.assertFalse(v_store.delete_dataset(v_ds.identifier))
        self.assertIsNone(v_store.get_dataset(ds.identifier))
        self.assertIsNone(v_store.get_dataset(v_ds.identifier))
        self.assertEquals(DatasetClient(dataset=self.db.get_dataset(ds.identifier)).get_cell('Salary', 1), '30K')
        self.tear_down(FS_DATASTORE)

    def run_tests(self, store_type):
        """Run sequence of test for given data store type."""
        self.set_up(store_type)
        self.dataset_life_cycle()
        self.tear_down(store_type)
        self.set_up(store_type)
        self.datastore_init(store_type)
        self.tear_down(store_type)
        self.set_up(store_type)
        self.dataset_read()
        self.tear_down(store_type)
        self.set_up(store_type)
        self.dataset_column_index()
        self.tear_down(store_type)

    def datastore_init(self, store_type):
        """Test initalizing a datastore with existing datasets."""
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        if store_type == MEM_DATASTORE:
            self.db = InMemDataStore()
        elif store_type == FS_DATASTORE:
            self.db = FileSystemDataStore(DATASTORE_DIRECTORY)
        elif store_type == MIMIR_DATASTORE:
            self.db = MimirDataStore(DATASTORE_DIRECTORY)

    def dataset_column_index(self):
        """Test the column by id index of the dataset handle."""
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        # Ensure that columns can be looked up by their identifier
        self.assertEquals(ds.column_by_id(0).name.upper(), 'NAME')
        self.assertEquals(ds.column_by_id(1).name.upper(), 'AGE')
        self.assertEquals(ds.column_by_id(2).name.upper(), 'SALARY')
        with self.assertRaises(ValueError):
            ds.column_by_id(5)
        ds.columns.append(DatasetColumn(identifier=5, name='NEWNAME'))
        self.assertEquals(ds.column_by_id(5).name.upper(), 'NEWNAME')
        with self.assertRaises(ValueError):
            ds.column_by_id(4)

    def dataset_life_cycle(self):
        """Test create and delete dataset."""
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        # Ensure that the project data has three columns and two rows
        self.assertEquals(len(ds.columns), 3)
        self.assertEquals(len(ds.fetch_rows()), 2)
        self.assertEquals(ds.row_count, 2)
        # Delete the dataset and ensure that a second delete attempt fails
        self.assertTrue(self.db.delete_dataset(ds.identifier))
        self.assertFalse(self.db.delete_dataset(ds.identifier))

    def dataset_read(self):
        """Test reading a dataset."""
        self.setup_fileserver()
        dh = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        ds = self.db.get_dataset(dh.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEquals(dh.identifier, ds.identifier)
        self.assertEquals(len(dh.columns), len(ds.columns))
        self.assertEquals(len(dh.fetch_rows()), len(ds_rows))
        self.assertEquals(len(dh.fetch_rows()), len(ds_rows))
        self.assertEquals(dh.row_count, len(ds_rows))
        # Name,Age,Salary
        # Alice,23,35K
        # Bob,32,30K
        self.assertEquals(ds.column_index('Name'), 0)
        self.assertEquals(ds.column_index('Age'), 1)
        self.assertEquals(ds.column_index('Salary'), 2)
        row = ds_rows[0]
        self.assertEquals(row.values[0], 'Alice')
        self.assertEquals(int(row.values[1]), 23)
        self.assertEquals(row.values[2], '35K')
        row = ds_rows[1]
        self.assertEquals(row.values[0], 'Bob')
        self.assertEquals(int(row.values[1]), 32)
        self.assertEquals(row.values[2], '30K')

    def load_tsv(self):
        """Test writing a dataset with duplicate name twice."""
        self.setup_fileserver()
        fh = self.fileserver.upload_file(TSV_FILE)
        ds = self.db.load_dataset(fh)
        self.assertEquals(len(ds.columns), 3)
        self.assertEquals(ds.row_count, 2)
Example #26
0
 def setup_fileserver(self):
     """Create a fresh file server."""
     if os.path.isdir(FILESERVER_DIR):
         shutil.rmtree(FILESERVER_DIR)
     os.mkdir(FILESERVER_DIR)
     self.fileserver = DefaultFileServer(FILESERVER_DIR)
Example #27
0
class TestMimirLenses(unittest.TestCase):

    def setUp(self):
        """Create an empty work trails repository."""
        # Create fresh set of directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)
            os.mkdir(d)
        self.datastore = MimirDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        vizual = MimirVizualEngine(self.datastore, self.fileserver)
        self.db = FileSystemViztrailRepository(
            VIZTRAILS_DIR,
            {ENV.identifier: ENV}
        )

    def tearDown(self):
        """Clean-up by dropping the MongoDB colelction used by the engine.
        """
        # Delete directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def test_domain_lens(self):
        """Test DOMAIN lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        col_age = ds.column_by_name('Age')
        self.assertFalse(wf.has_error)
        # Domain Lens
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_domain(DS_NAME, col_age.identifier)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        if wf.has_error:
            print wf.modules[-1].stderr[0]
        self.assertEquals(wf.modules[-1].command_text.upper(), 'DOMAIN FOR AGE IN PEOPLE')
        self.assertFalse(wf.has_error)
        self.assertEquals(len(wf.modules), 2)
        # Get dataset
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        rows = ds.fetch_rows()
        self.assertNotEquals(rows[2].values[ds.column_index('Age')], '')
        # Introduce an error. Make sure command formatting is correct
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_domain('MY DS', 'MY COL')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'DOMAIN FOR \'MY COL\' IN \'MY DS\'')
        mimir.finalize()

    def test_geocode_lens(self):
        """Test GEOCODE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        f_handle = self.fileserver.upload_file(GEO_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertFalse(wf.has_error)
        # Geocode Lens
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_geocode(
                DS_NAME,
                'GOOGLE',
                house_nr=ds.column_by_name('STRNUMBER').identifier,
                street=ds.column_by_name('STRNAME').identifier,
                city=ds.column_by_name('CITY').identifier,
                state=ds.column_by_name('STATE').identifier
            )
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        if wf.has_error:
            print wf.modules[-1].stderr[0]
        self.assertEquals(wf.modules[-1].command_text.upper(), 'GEOCODE HOUSE_NUMBER=STRNUMBER,STREET=STRNAME,CITY=CITY,STATE=STATE PEOPLE USING GOOGLE')
        self.assertFalse(wf.has_error)
        self.assertEquals(len(wf.modules), 2)
        # Get dataset
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertEquals(len(ds.columns), 6)
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_geocode(
                DS_NAME,
                'GOOGLE'
            )
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        if wf.has_error:
            print wf.modules[-1].stderr[0]
        self.assertEquals(wf.modules[-1].command_text.upper(), 'GEOCODE PEOPLE USING GOOGLE')
        self.assertFalse(wf.has_error)
        self.assertEquals(len(wf.modules), 3)
        # Get dataset
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertEquals(len(ds.columns), 8)
        mimir.finalize()

    def test_key_repair_lens(self):
        """Test KEY REPAIR lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        f_handle = self.fileserver.upload_file(KEY_REPAIR_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        ds1 = self.datastore.get_dataset(wf.modules[0].datasets[DS_NAME])
        # Key Repair Lens
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_key_repair(DS_NAME, ds1.column_by_name('Empid').identifier)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'KEY REPAIR FOR EMPID IN ' + DS_NAME.upper())
        # Get dataset
        ds2 = self.datastore.get_dataset(wf.modules[0].datasets[DS_NAME])
        self.assertEquals(ds1.row_count, ds2.row_count)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertEquals(len(ds.columns), 4)
        self.assertEquals(ds.row_count, 2)
        names = set()
        empids = set()
        rowids = set()
        for row in DatasetClient(dataset=ds).rows:
            rowids.add(row.identifier)
            empids.add(int(row.get_value('empid')))
            names.add(row.get_value('name'))
        self.assertTrue(1 in empids)
        self.assertTrue(2 in rowids)
        self.assertTrue('Alice' in names)
        self.assertTrue('Carla' in names)
        # Test error case and command text
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_key_repair('MY DS', 'MY COL')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'KEY REPAIR FOR \'MY COL\' IN \'MY DS\'')
        mimir.finalize()

    def test_missing_value_lens(self):
        """Test MISSING_VALUE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertFalse(wf.has_error)
        # Missing Value Lens
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_missing_value(DS_NAME, ds.column_by_name('AGE').identifier)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'MISSING VALUES FOR AGE IN ' + DS_NAME.upper())
        self.assertEquals(len(wf.modules), 2)
        # Get dataset
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        rows = ds.fetch_rows()
        self.assertNotEquals(rows[2].values[ds.column_index('Age')], '')
        # Annotations
        annotations = ds.get_annotations(column_id=1, row_id=4)
        self.assertEquals(len(annotations), 2)
        # MISSING VALUE Lens with value constraint
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'New Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_missing_value(
                DS_NAME,
                ds.column_by_name('AGE').identifier,
                constraint='> 30')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        if wf.has_error:
            print wf.modules[-1].stderr[0]
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'MISSING VALUES FOR AGE IN ' + DS_NAME.upper() + ' WITH CONSTRAINT > 30')
        #self.assertEquals(wf.modules[-1].command_text.upper(), 'MISSING VALUES FOR AGE IN ' + DS_NAME.upper())
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        rows = ds.fetch_rows()
        self.assertTrue(rows[2].values[ds.column_index('Age')] > 30)
        # Command text in case of error
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_missing_value('MY DS', '?', constraint='A B')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        cmd_text = wf.modules[-1].command_text.upper()
        expected_text = 'MISSING VALUES FOR ? IN \'MY DS\'' + ' WITH CONSTRAINT A B'
        self.assertEquals(cmd_text, expected_text)
        mimir.finalize()

    def test_missing_key_lens(self):
        """Test MISSING_KEY lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        # Missing Key Lens
        age_col = ds.columns[ds.column_index('Age')].identifier
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_missing_key(DS_NAME, age_col, missing_only=True)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'MISSING KEYS FOR AGE IN ' + DS_NAME.upper())
        self.assertFalse(wf.has_error)
        # Get dataset
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertEquals(len(ds.columns), 3)
        rows = ds.fetch_rows()
        self.assertEquals(len(rows), 24)
        #self.db.append_workflow_module(
        #    viztrail_id=vt.identifier,
        #    command=cmd.load_dataset(f_handle.identifier, DS_NAME + '2')
        #)
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_missing_key(
                DS_NAME,
                ds.columns[ds.column_index('Salary')].identifier,
                missing_only=True
            )
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        # Get dataset
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertEquals(len(ds.columns), 3)
        rows = ds.fetch_rows()
        self.assertEquals(len(rows), 55)
        mimir.finalize()

    def test_picker_lens(self):
        """Test PICKER lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        f_handle = self.fileserver.upload_file(PICKER_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        # Picker Lens
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_picker(DS_NAME, [
                {'pickFrom': ds.column_by_name('Age').identifier},
                {'pickFrom': ds.column_by_name('Salary').identifier}
            ])
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        if wf.modules[-1].has_error:
            print wf.modules[-1].stderr
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'PICK FROM AGE,SALARY IN ' + DS_NAME.upper())
        # Get dataset
        self.assertEquals(len(wf.modules[-1].datasets), 1)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        columns = [c.name for c in ds.columns]
        self.assertEquals(len(ds.columns), 5)
        self.assertTrue('PICK_ONE_AGE_SALARY' in columns)
        # Pick another column, this time with custom name
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_picker(DS_NAME, [
                {'pickFrom': ds.column_by_name('Age').identifier},
                {'pickFrom': ds.column_by_name('Salary').identifier}
            ],
            pick_as='My Column')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        # Get dataset
        self.assertEquals(len(wf.modules[-1].datasets), 1)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        columns = [c.name for c in ds.columns]
        self.assertEquals(len(ds.columns), 6)
        self.assertTrue('PICK_ONE_AGE_SALARY' in columns)
        self.assertTrue('My Column' in columns)
        # Pick from a picked column
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_picker(DS_NAME, [
                {'pickFrom': ds.column_by_name('Age').identifier},
                {'pickFrom': ds.column_by_name('PICK_ONE_AGE_SALARY').identifier}
            ],
            pick_as='My Column')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        if wf.modules[-1].has_error:
            print wf.modules[-1].stderr
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'PICK FROM AGE,PICK_ONE_AGE_SALARY AS \'MY COLUMN\' IN ' + DS_NAME.upper())
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        mimir.finalize()

    def test_schema_matching_lens(self):
        """Test SCHEMA_MATCHING lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        # Schema Matching Lens
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_schema_matching(DS_NAME, [
                {'column': 'BDate', 'type': 'int'},
                {'column': 'PName', 'type': 'varchar'}
            ], 'new_' + DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'SCHEMA MATCHING PEOPLE (BDATE INT, PNAME VARCHAR) AS NEW_' + DS_NAME.upper())
        # Get dataset
        self.assertEquals(len(wf.modules[-1].datasets), 2)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets['new_' + DS_NAME])
        self.assertEquals(len(ds.columns), 2)
        self.assertEquals(ds.row_count, 2)
        # Error if adding an existing dataset
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_schema_matching(
                DS_NAME,
                [{'column': 'BDate', 'type': 'int'}],
                'new_' + DS_NAME
            )
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        self.db.replace_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_schema_matching(
                DS_NAME,
                [{'column': 'BDate', 'type': 'int'}],
                'a_new_' + DS_NAME
            ),
            module_id=wf.modules[-1].identifier,
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'SCHEMA MATCHING PEOPLE (BDATE INT) AS A_NEW_' + DS_NAME.upper())
        # Error when adding a dataset with an invalid name
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_schema_matching(
                DS_NAME,
                [{'column': 'BDate', 'type': 'int'}],
                'SOME NAME'
            )
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'SCHEMA MATCHING PEOPLE (BDATE INT) AS \'SOME NAME\'')
        mimir.finalize()

    def test_type_inference_lens(self):
        """Test TYPE INFERENCE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds1 = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertFalse(wf.has_error)
        # Infer type
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_type_inference(DS_NAME, 0.6)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        print(wf.modules[-1].command_text.upper())
        self.assertEquals(wf.modules[-1].command_text.upper(), 'TYPE INFERENCE FOR COLUMNS IN ' + DS_NAME.upper() + ' WITH PERCENT_CONFORM = 0.6')
        # Get dataset
        ds2 = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertEquals(len(ds2.columns), 3)
        self.assertEquals(ds2.row_count, 7)
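        # Type inference should only adjust column types; the cell values must
        # match the original dataset row by row.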
        ds1_rows = ds1.fetch_rows()
        ds2_rows = ds2.fetch_rows()
        for i in range(ds2.row_count):
            self.assertEquals(ds1_rows[i].values, ds2_rows[i].values)
        mimir.finalize()
Example #28
0
class TestWorkflows(unittest.TestCase):
    def tearDown(self):
        """Clean-up by dropping the MongoDB colelction used by the engine.
        """
        # Delete directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def set_up(self):
        """Create an empty work trails repository."""
        # Create fresh set of directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)
            os.mkdir(d)

    def set_up_default(self):
        """Setup configuration using default Vizual engine."""
        env = ExecEnv(
            FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
            packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON]
        ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
        self.ENGINE_ID = env.identifier
        self.set_up()
        self.datastore = FileSystemDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(VIZTRAILS_DIR,
                                               {env.identifier: env})

    def set_up_mimir(self):
        """Setup configuration using Mimir engine."""
        env = ExecEnv(
            FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
            identifier=ENGINEENV_MIMIR,
            packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON, PACKAGE_MIMIR]
        ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
        self.ENGINE_ID = env.identifier
        self.set_up()
        self.datastore = MimirDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(VIZTRAILS_DIR,
                                               {env.identifier: env})

    def test_vt_default(self):
        """Run workflow with default configuration."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        self.set_up_default()
        self.run_python_workflow()
        self.set_up_default()
        self.run_mixed_workflow()
        self.set_up_default()
        self.run_delete_modules()
        self.set_up_default()
        self.run_erroneous_workflow()
        self.set_up_default()
        self.run_update_datasets()

    def test_vt_mimir(self):
        """Run workflows for Mimir configurations."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        self.set_up_mimir()
        self.run_python_workflow()
        self.set_up_mimir()
        self.run_mixed_workflow()
        self.set_up_mimir()
        self.run_delete_modules()
        self.set_up_mimir()
        self.run_erroneous_workflow()
        mimir.finalize()

    def run_delete_modules(self):
        """Test deletion of modules."""
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        #print '(1) CREATE DATASET'
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.load_dataset(
                                           f_handle.identifier, DS_NAME))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        col_age = ds.column_by_name('Age')
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.update_cell(
                                           DS_NAME, col_age.identifier, 0,
                                           '28'))
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.update_cell(
                                           DS_NAME, col_age.identifier, 1,
                                           '42'))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        ds = DatasetClient(
            self.datastore.get_dataset(wf.modules[-1].datasets['people']))
        self.assertEquals(int(ds.rows[0].get_value('Age')), 28)
        self.assertEquals(int(ds.rows[1].get_value('Age')), 42)
        # DELETE UPDATE CELL
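        # Deleting the module that set Age to 28 re-executes the workflow, so the
        # value for row 0 reverts to the original 23 from the CSV file.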
        self.db.delete_workflow_module(viztrail_id=vt.identifier,
                                       module_id=wf.modules[1].identifier)
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        ds = DatasetClient(
            self.datastore.get_dataset(wf.modules[-1].datasets['people']))
        self.assertEquals(int(ds.rows[0].get_value('Age')), 23)
        # DELETE LOAD (will introduce error)
        self.db.delete_workflow_module(viztrail_id=vt.identifier,
                                       module_id=wf.modules[0].identifier)
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        # DELETE last remaining module
        self.db.delete_workflow_module(viztrail_id=vt.identifier,
                                       module_id=wf.modules[0].identifier)
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)

    def run_erroneous_workflow(self):
        """Test workflow that has errors."""
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        #print '(1) CREATE DATASET'
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.load_dataset(
                                           f_handle.identifier, DS_NAME))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        col_age = ds.column_by_name('Age')
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.update_cell(
                                           DS_NAME, col_age.identifier, 0,
                                           '28'))
        # This should create an error because of the invalid column name
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.rename_column(
                                           DS_NAME, col_age.identifier, ''))
        # This should not have any effect
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.update_cell(
                                           DS_NAME, col_age.identifier, 0,
                                           '29'))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        # Make sure that all workflow modules have a non-negative identifier
        # and that they are all unique
        identifier = set()
        for m in wf.modules:
            self.assertTrue(m.identifier >= 0)
            self.assertTrue(not m.identifier in identifier)
            identifier.add(m.identifier)

    def run_mixed_workflow(self):
        """Test functionality to execute a workflow module."""
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        #print '(1) CREATE DATASET'
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.load_dataset(
                                           f_handle.identifier, DS_NAME))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        self.assertEquals(cmd_text,
                          'LOAD DATASET people FROM FILE dataset.csv')
        #print '(2) INSERT ROW'
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.insert_row(DS_NAME, 1))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        self.assertEquals(cmd_text, 'INSERT ROW INTO people AT POSITION 1')
        #print '(3) Set name to Bobby and set variables'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(SET_VARIABLES_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        self.assertEquals(cmd_text, SET_VARIABLES_PY)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        #print '(4) Set age to 28'
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.update_cell(
                                           DS_NAME,
                                           ds.column_by_name('Age').identifier,
                                           1, '28'))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        self.assertEquals(cmd_text.upper(), 'UPDATE PEOPLE SET [AGE,1] = 28')
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        #print '(5) Change Alice to Bob'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.update_cell(DS_NAME,
                                    ds.column_by_name('Name').identifier, 0,
                                    'Bob'))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        self.assertEquals(cmd_text.upper(),
                          'UPDATE PEOPLE SET [NAME,0] = \'BOB\'')
        #print '(6) UPDATE DATASET WITH FILTER'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(UPDATE_DATASET_WITH_FILTER_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        cmd_text = wf.modules[-1].command_text
        self.assertEquals(cmd_text, UPDATE_DATASET_WITH_FILTER_PY)
        self.assertFalse(wf.has_error)
        # Ensure that all names are Bobby
        ds = DatasetClient(
            self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]))
        age = [23, 28, 32]
        for i in range(len(ds.rows)):
            row = ds.rows[i]
            self.assertEquals(row.get_value('Name'), 'Bobby')
            self.assertEquals(int(row.get_value('Age')), age[i])

    def run_python_workflow(self):
        """Test functionality to execute a workflow module."""
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        #print '(1) CREATE DATASET'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(CREATE_DATASET_PY))
        # from vizier.database.client import VizierDBClient\nv = VizierDBClient(__vizierdb__)
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        modules = set()
        for m in wf.modules:
            self.assertNotEquals(m.identifier, -1)
            self.assertFalse(m.identifier in modules)
            modules.add(m.identifier)
        self.assertEquals(wf.version, 0)
        self.assertEquals(len(wf.modules), 1)
        self.assertTrue(len(wf.modules[0].stdout) == 0)
        self.assertTrue(len(wf.modules[0].stderr) == 0)
        self.assertEquals(len(wf.modules[0].datasets), 1)
        self.assertTrue(DS_NAME in wf.modules[0].datasets)
        #print '(2) PRINT DATASET'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(PRINT_DATASET_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        prev_modules = modules
        modules = set()
        for m in wf.modules:
            self.assertNotEquals(m.identifier, -1)
            self.assertFalse(m.identifier in modules)
            modules.add(m.identifier)
        # Ensure that the identifier of previous modules did not change
        for id in prev_modules:
            self.assertTrue(id in modules)
        self.assertEquals(wf.version, 1)
        self.assertEquals(len(wf.modules), 2)
        self.assertTrue(len(wf.modules[0].stdout) == 0)
        self.assertTrue(len(wf.modules[0].stderr) == 0)
        self.assertEquals(len(wf.modules[0].datasets), 1)
        self.assertTrue(DS_NAME in wf.modules[0].datasets)
        self.assertTrue(len(wf.modules[1].stdout) == 1)
        self.assertTrue(len(wf.modules[1].stderr) == 0)
        self.assertEquals(wf.modules[1].stdout[0]['data'], 'Alice\nBob')
        self.assertEquals(len(wf.modules[1].datasets), 1)
        self.assertTrue(DS_NAME in wf.modules[1].datasets)
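        # Remember the dataset identifier so later steps can check which modules
        # still reference this dataset version after re-execution.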
        ds_id = wf.modules[1].datasets[DS_NAME]
        #print '(3) UPDATE DATASET'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(UPDATE_DATASET_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        prev_modules = modules
        modules = set()
        for m in wf.modules:
            self.assertNotEquals(m.identifier, -1)
            self.assertFalse(m.identifier in modules)
            modules.add(m.identifier)
        # Ensure that the identifier of previous modules did not change
        for id in prev_modules:
            self.assertTrue(id in modules)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.version, 2)
        self.assertEquals(len(wf.modules), 3)
        self.assertTrue(len(wf.modules[0].stdout) == 0)
        self.assertTrue(len(wf.modules[0].stderr) == 0)
        self.assertEquals(len(wf.modules[0].datasets), 1)
        self.assertTrue(DS_NAME in wf.modules[0].datasets)
        self.assertEquals(wf.modules[0].datasets[DS_NAME], ds_id)
        self.assertTrue(len(wf.modules[1].stdout) == 1)
        self.assertTrue(len(wf.modules[1].stderr) == 0)
        self.assertEquals(wf.modules[1].stdout[0]['data'], 'Alice\nBob')
        self.assertEquals(len(wf.modules[1].datasets), 1)
        self.assertTrue(DS_NAME in wf.modules[1].datasets)
        self.assertEquals(wf.modules[1].datasets[DS_NAME], ds_id)
        self.assertTrue(len(wf.modules[2].stdout) == 0)
        self.assertTrue(len(wf.modules[2].stderr) == 0)
        self.assertEquals(len(wf.modules[2].datasets), 1)
        self.assertTrue(DS_NAME in wf.modules[2].datasets)
        self.assertNotEquals(wf.modules[2].datasets[DS_NAME], ds_id)
        #print '(4) PRINT DATASET'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(PRINT_DATASET_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        prev_modules = modules
        modules = set()
        for m in wf.modules:
            self.assertNotEquals(m.identifier, -1)
            self.assertFalse(m.identifier in modules)
            modules.add(m.identifier)
        # Ensure that the identifier of previous modules did not change
        for id in prev_modules:
            self.assertTrue(id in modules)
        self.assertEquals(wf.version, 3)
        self.assertEquals(len(wf.modules), 4)
        self.assertEquals(wf.modules[1].stdout[0]['data'], 'Alice\nBob')
        self.assertTrue(len(wf.modules[3].stdout) == 1)
        self.assertTrue(len(wf.modules[3].stderr) == 0)
        self.assertEquals(wf.modules[3].stdout[0]['data'], 'NoName\nNoName')
        #print '(5) UPDATE DATASET WITH FILTER'
        self.db.replace_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[2].identifier,
            command=cmd.python_cell(UPDATE_DATASET_WITH_FILTER_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        prev_modules = modules
        modules = set()
        for m in wf.modules:
            self.assertNotEquals(m.identifier, -1)
            self.assertFalse(m.identifier in modules)
            modules.add(m.identifier)
        # Ensure that the identifier of previous modules did not change
        for id in prev_modules:
            self.assertTrue(id in modules)
        self.assertTrue(wf.has_error)
        self.assertEquals(wf.version, 4)
        self.assertEquals(len(wf.modules), 4)
        # print '(6) INSERT SET VARIABLES BEFORE UPDATE'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(SET_VARIABLES_ONLY_PY),
            before_id=wf.modules[2].identifier)
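        # Inserting the variable-definition cell before the failing filter module
        # should resolve the error; the print module now sits at position 4.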
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[4].stdout[0]['data'], 'Alice\nBobby')
        #print '(7) INTRODUCE ERROR'
        self.db.replace_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[1].identifier,
            command=cmd.python_cell(PRINT_UNKNOWN_DATASET_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        prev_modules = modules
        modules = set()
        for m in wf.modules:
            self.assertNotEquals(m.identifier, -1)
            self.assertFalse(m.identifier in modules)
            modules.add(m.identifier)
        # Ensure that the identifier of previous modules did not change
        for id in prev_modules:
            self.assertTrue(id in modules)
        self.assertTrue(wf.has_error)
        # Ensure that the second module has output to stderr
        self.assertNotEquals(len(wf.modules[1].stderr), 0)
        # Ensure that the last two modules have no output (either to STDOUT or
        # STDERR)
        for m in wf.modules[2:]:
            self.assertEquals(len(m.stdout), 0)
            self.assertEquals(len(m.stderr), 0)
        #print '(8) FIX ERROR'
        self.db.replace_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[1].identifier,
            command=cmd.python_cell(PRINT_DATASET_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        prev_modules = modules
        modules = set()
        for m in wf.modules:
            self.assertNotEquals(m.identifier, -1)
            self.assertFalse(m.identifier in modules)
            modules.add(m.identifier)
        # Ensure that the identifier of previous modules did not change
        for id in prev_modules:
            self.assertTrue(id in modules)
        #print '(9) DELETE MODULE UPDATE_DATASET_WITH_FILTER_PY'
        self.db.delete_workflow_module(viztrail_id=vt.identifier,
                                       module_id=wf.modules[3].identifier)
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[3].stdout[0]['data'], 'Alice\nBob')

    def run_update_datasets(self):
        """Test dropping and renaming of datasets."""
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.load_dataset(
                                           f_handle.identifier, DS_NAME))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertTrue(DS_NAME in wf.modules[-1].datasets)
        new_name = DS_NAME + '_renamed'
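        # Renaming affects only modules after the rename; the first module should
        # still list the dataset under its original name.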
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.rename_dataset(
                                           DS_NAME, new_name))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertTrue(DS_NAME in wf.modules[0].datasets)
        self.assertFalse(new_name in wf.modules[0].datasets)
        self.assertFalse(DS_NAME in wf.modules[-1].datasets)
        self.assertTrue(new_name in wf.modules[-1].datasets)
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.drop_dataset(new_name))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertFalse(new_name in wf.modules[-1].datasets)
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.drop_dataset(new_name))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        # Delete the Drop Dataset that failed and replace the first drop with
        # a Python module that prints names
        self.db.delete_workflow_module(viztrail_id=vt.identifier,
                                       module_id=wf.modules[-1].identifier)
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.db.replace_workflow_module(viztrail_id=vt.identifier,
                                        module_id=wf.modules[-1].identifier,
                                        command=cmd.python_cell("""
for row in vizierdb.get_dataset('""" + new_name + """').rows:
    print row.get_value('Name')
"""))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].stdout[0]['data'], 'Alice\nBob')
        self.assertFalse(DS_NAME in wf.modules[-1].datasets)
        self.assertTrue(new_name in wf.modules[-1].datasets)
Example #29
0
class TestWorkflowUpdates(unittest.TestCase):
    def setUp(self):
        """Create an empty work trails repository."""
        # Create fresh set of directories
        self.config = AppConfig()
        env = ExecEnv(
            FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
            packages=[PACKAGE_VIZUAL, PACKAGE_PLOT]
        ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
        self.ENGINE_ID = env.identifier
        self.config.envs[self.ENGINE_ID] = env
        self.config.fileserver = env.fileserver
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)
            os.mkdir(d)
        self.datastore = FileSystemDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(VIZTRAILS_DIR,
                                               {env.identifier: env})
        self.api = VizierWebService(self.db, self.datastore, self.fileserver,
                                    self.config)

    def tearDown(self):
        """Clean-up by dropping the MongoDB collection used by the engine.
        """
        # Delete directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def test_view_urls(self):
        """Ensure that the urls for workflow views get updated correctly when
        the workflow is modified."""
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        #print '(1) CREATE DATASET'
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.load_dataset(
                                           f_handle.identifier, DS_NAME))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        #print '(2) PLOT'
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.create_plot(
                                           DS_NAME,
                                           CHART_NAME,
                                           series=[{
                                               'series_column': 2
                                           }]))
        url = self.api.get_workflow(
            vt.identifier,
            DEFAULT_BRANCH)['state']['charts'][0]['links'][0]['href']
        self.assertTrue('master/workflows/1/modules/1/views' in url)
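        # Appending another module creates a new workflow version, so the chart
        # view link should point at workflow 2 / module 2 afterwards.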
        # print '(3) UPDATE CELL'
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.update_cell(
                                           DS_NAME, 0, 0, '28'))
        url = self.api.get_workflow(
            vt.identifier,
            DEFAULT_BRANCH)['state']['charts'][0]['links'][0]['href']
        self.assertTrue('master/workflows/2/modules/2/views' in url)
Example #30
0
class TestVizierClient(unittest.TestCase):
    def setUp(self):
        """Delete metadata file if it exists."""
        # Drop directorie
        self.tearDown()

    def tearDown(self):
        """Clean-up by dropping file server directory.
        """
        if os.path.isdir(DATASTORE_DIR):
            shutil.rmtree(DATASTORE_DIR)
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_fs_client(self):
        """Run tests for default engine and file server data store."""
        self.fs = DefaultFileServer(SERVER_DIR)
        self.ds = FileSystemDataStore(DATASTORE_DIR)
        self.run_client_tests(
            VizierDBClient(self.ds, dict(),
                           DefaultVizualEngine(self.ds, self.fs)))

    def test_mem_client(self):
        """Run tests for default engine and in-memory data store."""
        self.fs = DefaultFileServer(SERVER_DIR)
        self.ds = InMemDataStore()
        self.run_client_tests(
            VizierDBClient(self.ds, dict(),
                           DefaultVizualEngine(self.ds, self.fs)))

    def test_mimir_client(self):
        """Run tests for default engine and Mimir data store."""
        mimir.initialize()
        self.fs = DefaultFileServer(SERVER_DIR)
        self.ds = MimirDataStore(DATASTORE_DIR)
        self.run_client_tests(
            VizierDBClient(self.ds, dict(),
                           DefaultVizualEngine(self.ds, self.fs)))
        mimir.finalize()

    def run_client_tests(self, client):
        """Test creating and updating a dataset via the client."""
        ds = DatasetClient()
        ds.insert_column('Name')
        ds.insert_column('Age')
        ds.insert_row(['Alice', '23'])
        ds.insert_row(['Bob', '25'])
        client.create_dataset('MyDataset', ds)
        # Ensure the returned dataset contains the input data
        ds = client.get_dataset('MyDataset')
        self.assertEquals([c.name for c in ds.columns], ['Name', 'Age'])
        self.assertEquals([str(v) for v in ds.rows[0].values], ['Alice', '23'])
        self.assertEquals([str(v) for v in ds.rows[1].values], ['Bob', '25'])
        # Update dataset
        ds.rows[1].set_value('Age', '26')
        client.update_dataset('MyDataset', ds)
        ds = client.get_dataset('MyDataset')
        self.assertEquals([str(v) for v in ds.rows[1].values], ['Bob', '26'])
        # Value error when creating dataset with existing name
        with self.assertRaises(ValueError):
            client.create_dataset('MyDataset', ds)
        # Value error when retrieving unknown dataset
        with self.assertRaises(ValueError):
            client.get_dataset('SomeDataset')
        # Ensure the returned dataset contains the modified data
        client.rename_dataset('MyDataset', 'SomeDataset')
        ds = client.get_dataset('SomeDataset')
        client.update_dataset('SomeDataset', ds)
        # Move columns around
        ds = self.ds.load_dataset(self.fs.upload_file(CSV_FILE))
        ds = client.create_dataset('people', DatasetClient(ds))
        col_1 = [row.get_value(1) for row in ds.rows]
        ds.insert_column('empty', 2)
        ds = client.update_dataset('people', ds)
        col_2 = [row.get_value(2) for row in ds.rows]
        ds.move_column('empty', 1)
        ds = client.update_dataset('people', ds)
        for i in range(len(ds.rows)):
            row = ds.rows[i]
            self.assertEquals(row.values[1], col_2[i])
            self.assertEquals(row.values[2], col_1[i])
        # Rename
        ds.columns[1].name = 'allnone'
        ds = client.update_dataset('people', ds)
        for i in range(len(ds.rows)):
            row = ds.rows[i]
            self.assertEquals(row.get_value('allnone'), col_2[i])
            self.assertEquals(row.values[2], col_1[i])
        # Insert row
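        # A newly inserted row receives the next row identifier and None for any
        # columns that are not explicitly set.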
        row = ds.insert_row()
        row.set_value('Name', 'Zoe')
        ds = client.create_dataset('upd', ds)
        self.assertEquals(len(ds.rows), 3)
        r2 = ds.rows[2]
        self.assertEquals(r2.identifier, 2)
        self.assertEquals(r2.values, ['Zoe', None, None, None])
        # Annotations
        ds = client.get_dataset('people')
        annotations = ds.rows[0].annotations('Age')
        annotations.add('user:comment', 'My Comment')
        ds = client.update_dataset('people', ds)
        annotations = ds.rows[0].annotations('Age').find_all('user:comment')
        self.assertEquals(len(annotations), 1)
        anno = annotations[0]
        self.assertEquals(anno.key, 'user:comment')
        self.assertEquals(anno.value, 'My Comment')
        ds.rows[0].annotations('Age').add('user:comment', 'Another Comment')
        ds = client.update_dataset('people', ds)
        annotations = ds.rows[0].annotations('Age').find_all('user:comment')
        self.assertEquals(len(annotations), 2)
        self.assertEquals(ds.rows[0].annotations('Age').keys(),
                          ['user:comment'])
        values = [a.value for a in annotations]
        for val in ['My Comment', 'Another Comment']:
            self.assertTrue(val in values)
        ds.rows[0].annotations('Age').update(identifier=anno.identifier,
                                             key='user:issue',
                                             value='Some Issue')
        ds = client.update_dataset('people', ds)
        annotations = ds.rows[0].annotations('Age').find_all('user:comment')
        self.assertEquals(len(annotations), 1)
        keys = ds.rows[0].annotations('Age').keys()
        for key in ['user:comment', 'user:issue']:
            self.assertTrue(key in keys)
        values = [
            a.value
            for a in ds.rows[0].annotations('Age').find_all('user:comment')
        ]
        for val in ['Another Comment']:
            self.assertTrue(val in values)
        values = [
            a.value
            for a in ds.rows[0].annotations('Age').find_all('user:issue')
        ]
        for val in ['Some Issue']:
            self.assertTrue(val in values)
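        # Updating an annotation with only its identifier (no key or value)
        # removes it; the 'user:issue' annotation should be gone afterwards.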
        ds.rows[0].annotations('Age').update(identifier=anno.identifier)
        ds = client.update_dataset('people', ds)
        annotations = ds.rows[0].annotations('Age').find_all('user:issue')
        self.assertEquals(len(annotations), 0)
        annotations = ds.rows[0].annotations('Age').find_all('user:comment')
        self.assertEquals(len(annotations), 1)
        # Delete column
        ds = client.get_dataset('people')
        ds.delete_column('Age')
        client.update_dataset('people', ds)
        ds = client.get_dataset('people')
        names = [col.name.upper() for col in ds.columns]
        self.assertTrue('NAME' in names)
        self.assertFalse('AGE' in names)