# Beispiel #1 (Example #1 — separator from the original source dump)
# 0
class TestMimirAnnotations(unittest.TestCase):
    """Test cell annotations produced by Mimir lenses."""

    def setUp(self):
        """Create an empty work trails repository.

        Re-creates the datastore, file server and viztrails directories so
        that every test starts from a clean slate.
        """
        # Create fresh set of directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)
            os.mkdir(d)
        self.datastore = MimirDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        # Instantiate the engine for its construction side effects only; the
        # original bound it to a local variable that was never used.
        MimirVizualEngine(self.datastore, self.fileserver)
        self.db = FileSystemViztrailRepository(VIZTRAILS_DIR,
                                               {ENV.identifier: ENV})

    def tearDown(self):
        """Clean-up by deleting all directories created by setUp."""
        # Delete directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def test_annotations(self):
        """Test annotations created by the Mimir MISSING VALUE lens."""
        # Create new work trail and create dataset from CSV file
        mimir.initialize()
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name': 'My Project'})
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.load_dataset(
                                           f_handle.identifier, DS_NAME))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        # Missing Value Lens
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_missing_value(
                DS_NAME,
                ds.column_by_name('AGE').identifier))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        annos = ds.get_annotations(column_id=1, row_id=2)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(len(annos), 2)
        for anno in annos:
            self.assertEqual(anno.key, ANNO_UNCERTAIN)
        mimir.finalize()
# Beispiel #2 (Example #2 — separator from the original source dump)
# 0
 def test_viztrail_workflow(self):
     """Test basic functionality of retrieving a workflow.

     Verifies lookup behavior for valid and unknown viztrail/branch/version
     combinations, both before and after re-loading the repository from disk.
     """
     repo = FileSystemViztrailRepository(VIZTRAIL_DIR, repos)
     viztrail = repo.create_viztrail(ENGINEENV_DEFAULT, {'name': 'Name A'})
     # A freshly created viztrail has an empty workflow at the branch HEAD;
     # unknown branches, viztrails, or versions yield None.
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(len(repo.get_workflow(viztrail.identifier, DEFAULT_BRANCH).modules), 0)
     self.assertIsNone(repo.get_workflow(viztrail.identifier, 'unknown'))
     self.assertIsNone(repo.get_workflow('unknown', DEFAULT_BRANCH))
     self.assertIsNone(repo.get_workflow(viztrail_id=viztrail.identifier, branch_id=DEFAULT_BRANCH, workflow_version=10))
     # Re-load repository to verify that the state was persisted on disk
     repo = FileSystemViztrailRepository(VIZTRAIL_DIR, repos)
     self.assertEqual(len(repo.get_workflow(viztrail.identifier, DEFAULT_BRANCH).modules), 0)
     self.assertIsNone(repo.get_workflow(viztrail.identifier, 'unknown'))
     self.assertIsNone(repo.get_workflow('unknown', DEFAULT_BRANCH))
     self.assertIsNone(repo.get_workflow(viztrail_id=viztrail.identifier, branch_id=DEFAULT_BRANCH, workflow_version=10))
# Beispiel #3 (Example #3 — separator from the original source dump)
# 0
class TestUnicodeHandling(unittest.TestCase):
    """Run a workflow whose Python cell creates a dataset with unicode
    characters, against both the default and the Mimir engine."""

    def tearDown(self):
        """Clean-up by deleting all directories created during set-up."""
        # Delete directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def set_up(self):
        """Create an empty work trails repository.

        Not named setUp on purpose: the engine-specific set_up_* helpers call
        it explicitly after building their configuration.
        """
        # Create fresh set of directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)
            os.mkdir(d)

    def set_up_default(self):
        """Setup configuration using the default Vizual engine."""
        env = ExecEnv(
                FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
                packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON]
            ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
        self.ENGINE_ID = env.identifier
        self.set_up()
        self.datastore = FileSystemDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(
            VIZTRAILS_DIR,
            {env.identifier: env}
        )

    def set_up_mimir(self):
        """Setup configuration using the Mimir engine."""
        env = ExecEnv(
                FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
                identifier=ENGINEENV_MIMIR,
                packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON, PACKAGE_MIMIR]
            ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
        self.ENGINE_ID = env.identifier
        self.set_up()
        self.datastore = MimirDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(
            VIZTRAILS_DIR,
            {env.identifier: env}
        )

    def test_vt_default(self):
        """Run workflow with default configuration."""
        self.set_up_default()
        self.run_workflow()

    def test_vt_mimir(self):
        """Run workflow with the Mimir configuration."""
        mimir.initialize()
        self.set_up_mimir()
        self.run_workflow()
        mimir.finalize()

    def run_workflow(self):
        """Execute a Python script that creates a dataset containing unicode
        characters and verify the resulting column names."""
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name' : 'My Project'})
        # LOAD DATASET
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        # RUN Python Script
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(PYTHON_SCRIPT)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        if wf.has_error:
            # Surface the script error before the assertion below fails.
            # print() works identically in Python 2 for a single expression.
            print(wf.modules[-1].stderr)
        self.assertFalse(wf.has_error)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        names = set(c.name.upper().replace('_', ' ') for c in ds.columns)
        # BUG FIX: the original assertTrue(len(names), 4) always passed
        # because the second argument is the failure message, not an expected
        # value. Assert the intended column count explicitly.
        self.assertEqual(len(names), 4)
        for name in ['DATE', 'IMO CODE', 'PORT OF DEPARTURE', 'PLACE OF RECEIPT']:
            self.assertIn(name, names)
class TestWorkflowUpdates(unittest.TestCase):
    """Test that chart-view URLs are updated when a workflow is modified."""

    def setUp(self):
        """Create an empty work trails repository and the web service API."""
        # Create fresh set of directories
        self.config = AppConfig()
        env = ExecEnv(
            FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
            packages=[PACKAGE_VIZUAL, PACKAGE_PLOT
                      ]).from_dict({'datastore': {
                          'directory': DATASTORE_DIR
                      }})
        self.ENGINE_ID = env.identifier
        self.config.envs[self.ENGINE_ID] = env
        self.config.fileserver = env.fileserver
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)
            os.mkdir(d)
        self.datastore = FileSystemDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(VIZTRAILS_DIR,
                                               {env.identifier: env})
        self.api = VizierWebService(self.db, self.datastore, self.fileserver,
                                    self.config)

    def tearDown(self):
        """Clean-up by deleting all directories created by setUp."""
        # Delete directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def test_view_urls(self):
        """Ensure that the urls for workflow views get updated correctly when
        the workflow is modified."""
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        # (1) CREATE DATASET
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.load_dataset(
                                           f_handle.identifier, DS_NAME))
        # The returned workflow was never used; keep the call in case it has
        # side effects, but drop the unused binding.
        self.db.get_workflow(viztrail_id=vt.identifier)
        # (2) PLOT
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.create_plot(
                                           DS_NAME,
                                           CHART_NAME,
                                           series=[{
                                               'series_column': 2
                                           }]))
        url = self.api.get_workflow(
            vt.identifier,
            DEFAULT_BRANCH)['state']['charts'][0]['links'][0]['href']
        # assertIn gives a clearer failure message than assertTrue(... in ...)
        self.assertIn('master/workflows/1/modules/1/views', url)
        # (3) UPDATE CELL
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.update_cell(
                                           DS_NAME, 0, 0, '28'))
        url = self.api.get_workflow(
            vt.identifier,
            DEFAULT_BRANCH)['state']['charts'][0]['links'][0]['href']
        self.assertIn('master/workflows/2/modules/2/views', url)
# Beispiel #5 (Example #5 — separator from the original source dump)
# 0
class TestWorkflows(unittest.TestCase):
    """End-to-end workflow tests (append, replace, delete modules) executed
    against both the default Vizual engine and the Mimir engine.

    The engine-specific set_up_* helpers are called explicitly from the
    test_vt_* entry points instead of using unittest's setUp, because each
    test method runs several independent scenarios on a fresh repository.
    """

    def tearDown(self):
        """Clean-up by deleting the directories created during set-up.
        """
        # Delete directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def set_up(self):
        """Create an empty work trails repository."""
        # Create fresh set of directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)
            os.mkdir(d)

    def set_up_default(self):
        """Setup configuration using default Vizual engine."""
        env = ExecEnv(
            FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
            packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON
                      ]).from_dict({'datastore': {
                          'directory': DATASTORE_DIR
                      }})
        self.ENGINE_ID = env.identifier
        self.set_up()
        self.datastore = FileSystemDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(VIZTRAILS_DIR,
                                               {env.identifier: env})

    def set_up_mimir(self):
        """Setup configuration using Mimir engine."""
        env = ExecEnv(
            FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
            identifier=ENGINEENV_MIMIR,
            packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON, PACKAGE_MIMIR
                      ]).from_dict({'datastore': {
                          'directory': DATASTORE_DIR
                      }})
        self.ENGINE_ID = env.identifier
        self.set_up()
        self.datastore = MimirDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(VIZTRAILS_DIR,
                                               {env.identifier: env})

    def test_vt_default(self):
        """Run all workflow scenarios with the default configuration.

        Each scenario gets a fresh repository via set_up_default().
        """
        self.set_up_default()
        self.run_python_workflow()
        self.set_up_default()
        self.run_mixed_workflow()
        self.set_up_default()
        self.run_delete_modules()
        self.set_up_default()
        self.run_erroneous_workflow()
        self.set_up_default()
        self.run_update_datasets()

    def test_vt_mimir(self):
        """Run workflow scenarios for the Mimir configuration.

        NOTE(review): run_update_datasets is not exercised here, unlike
        test_vt_default — possibly intentional, possibly an omission.
        """
        mimir.initialize()
        self.set_up_mimir()
        self.run_python_workflow()
        self.set_up_mimir()
        self.run_mixed_workflow()
        self.set_up_mimir()
        self.run_delete_modules()
        self.set_up_mimir()
        self.run_erroneous_workflow()
        mimir.finalize()

    def run_delete_modules(self):
        """Test deletion of modules.

        Loads a dataset, applies two cell updates, then deletes modules one
        by one and checks the resulting workflow state after each deletion.
        """
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        #print '(1) CREATE DATASET'
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.load_dataset(
                                           f_handle.identifier, DS_NAME))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        col_age = ds.column_by_name('Age')
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.update_cell(
                                           DS_NAME, col_age.identifier, 0,
                                           '28'))
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.update_cell(
                                           DS_NAME, col_age.identifier, 1,
                                           '42'))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        # NOTE(review): 'people' is presumably the value of DS_NAME — confirm.
        ds = DatasetClient(
            self.datastore.get_dataset(wf.modules[-1].datasets['people']))
        self.assertEquals(int(ds.rows[0].get_value('Age')), 28)
        self.assertEquals(int(ds.rows[1].get_value('Age')), 42)
        # DELETE UPDATE CELL
        self.db.delete_workflow_module(viztrail_id=vt.identifier,
                                       module_id=wf.modules[1].identifier)
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        ds = DatasetClient(
            self.datastore.get_dataset(wf.modules[-1].datasets['people']))
        # Row 0 reverts to its original value once the first update is gone.
        self.assertEquals(int(ds.rows[0].get_value('Age')), 23)
        # DELETE LOAD (will introduce error)
        self.db.delete_workflow_module(viztrail_id=vt.identifier,
                                       module_id=wf.modules[0].identifier)
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        # DELETE last remaining module
        self.db.delete_workflow_module(viztrail_id=vt.identifier,
                                       module_id=wf.modules[0].identifier)
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)

    def run_erroneous_workflow(self):
        """Test workflow that has errors.

        An invalid column rename puts the workflow into an error state; the
        following module must have no effect, and module identifiers must
        remain unique and non-negative.
        """
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        #print '(1) CREATE DATASET'
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.load_dataset(
                                           f_handle.identifier, DS_NAME))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        col_age = ds.column_by_name('Age')
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.update_cell(
                                           DS_NAME, col_age.identifier, 0,
                                           '28'))
        # This should create an error because of the invalid column name
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.rename_column(
                                           DS_NAME, col_age.identifier, ''))
        # This should not have any effect
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.update_cell(
                                           DS_NAME, col_age.identifier, 0,
                                           '29'))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        # Make sure that all workflow modules have a non-negative identifier
        # and that they are all unique
        identifier = set()
        for m in wf.modules:
            self.assertTrue(m.identifier >= 0)
            self.assertTrue(not m.identifier in identifier)
            identifier.add(m.identifier)

    def run_mixed_workflow(self):
        """Test functionality to execute a workflow module.

        Mixes VizUAL commands and Python cells and checks the rendered
        command text after each appended module.
        """
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        #print '(1) CREATE DATASET'
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.load_dataset(
                                           f_handle.identifier, DS_NAME))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        self.assertEquals(cmd_text,
                          'LOAD DATASET people FROM FILE dataset.csv')
        #print '(2) INSERT ROW'
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.insert_row(DS_NAME, 1))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        self.assertEquals(cmd_text, 'INSERT ROW INTO people AT POSITION 1')
        #print '(3) Set name to Bobby and set variables'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(SET_VARIABLES_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        # For Python cells the command text is the script itself.
        self.assertEquals(cmd_text, SET_VARIABLES_PY)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        #print '(4) Set age to 28'
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.update_cell(
                                           DS_NAME,
                                           ds.column_by_name('Age').identifier,
                                           1, '28'))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        self.assertEquals(cmd_text.upper(), 'UPDATE PEOPLE SET [AGE,1] = 28')
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        #print '(5) Change Alice to Bob'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.update_cell(DS_NAME,
                                    ds.column_by_name('Name').identifier, 0,
                                    'Bob'))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        self.assertEquals(cmd_text.upper(),
                          'UPDATE PEOPLE SET [NAME,0] = \'BOB\'')
        #print '(6) UPDATE DATASET WITH FILTER'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(UPDATE_DATASET_WITH_FILTER_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        cmd_text = wf.modules[-1].command_text
        self.assertEquals(cmd_text, UPDATE_DATASET_WITH_FILTER_PY)
        self.assertFalse(wf.has_error)
        # Ensure that all names are Bobby
        ds = DatasetClient(
            self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]))
        age = [23, 28, 32]
        for i in range(len(ds.rows)):
            row = ds.rows[i]
            self.assertEquals(row.get_value('Name'), 'Bobby')
            self.assertEquals(int(row.get_value('Age')), age[i])

    def run_python_workflow(self):
        """Test functionality to execute a workflow module.

        Builds a workflow of Python cells, then exercises append, replace,
        and delete while verifying that module identifiers are stable and
        that stdout/stderr and dataset references evolve as expected.
        """
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        #print '(1) CREATE DATASET'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(CREATE_DATASET_PY))
        # from vizier.database.client import VizierDBClient\nv = VizierDBClient(__vizierdb__)
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        modules = set()
        for m in wf.modules:
            self.assertNotEquals(m.identifier, -1)
            self.assertFalse(m.identifier in modules)
            modules.add(m.identifier)
        self.assertEquals(wf.version, 0)
        self.assertEquals(len(wf.modules), 1)
        self.assertTrue(len(wf.modules[0].stdout) == 0)
        self.assertTrue(len(wf.modules[0].stderr) == 0)
        self.assertEquals(len(wf.modules[0].datasets), 1)
        self.assertTrue(DS_NAME in wf.modules[0].datasets)
        #print '(2) PRINT DATASET'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(PRINT_DATASET_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        prev_modules = modules
        modules = set()
        for m in wf.modules:
            self.assertNotEquals(m.identifier, -1)
            self.assertFalse(m.identifier in modules)
            modules.add(m.identifier)
        # Ensure that the identifier of previous modules did not change
        for id in prev_modules:
            self.assertTrue(id in modules)
        self.assertEquals(wf.version, 1)
        self.assertEquals(len(wf.modules), 2)
        self.assertTrue(len(wf.modules[0].stdout) == 0)
        self.assertTrue(len(wf.modules[0].stderr) == 0)
        self.assertEquals(len(wf.modules[0].datasets), 1)
        self.assertTrue(DS_NAME in wf.modules[0].datasets)
        self.assertTrue(len(wf.modules[1].stdout) == 1)
        self.assertTrue(len(wf.modules[1].stderr) == 0)
        self.assertEquals(wf.modules[1].stdout[0]['data'], 'Alice\nBob')
        self.assertEquals(len(wf.modules[1].datasets), 1)
        self.assertTrue(DS_NAME in wf.modules[1].datasets)
        ds_id = wf.modules[1].datasets[DS_NAME]
        #print '(3) UPDATE DATASET'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(UPDATE_DATASET_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        prev_modules = modules
        modules = set()
        for m in wf.modules:
            self.assertNotEquals(m.identifier, -1)
            self.assertFalse(m.identifier in modules)
            modules.add(m.identifier)
        # Ensure that the identifier of previous modules did not change
        for id in prev_modules:
            self.assertTrue(id in modules)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.version, 2)
        self.assertEquals(len(wf.modules), 3)
        self.assertTrue(len(wf.modules[0].stdout) == 0)
        self.assertTrue(len(wf.modules[0].stderr) == 0)
        self.assertEquals(len(wf.modules[0].datasets), 1)
        self.assertTrue(DS_NAME in wf.modules[0].datasets)
        # Unchanged upstream modules keep referencing the same dataset id.
        self.assertEquals(wf.modules[0].datasets[DS_NAME], ds_id)
        self.assertTrue(len(wf.modules[1].stdout) == 1)
        self.assertTrue(len(wf.modules[1].stderr) == 0)
        self.assertEquals(wf.modules[1].stdout[0]['data'], 'Alice\nBob')
        self.assertEquals(len(wf.modules[1].datasets), 1)
        self.assertTrue(DS_NAME in wf.modules[1].datasets)
        self.assertEquals(wf.modules[1].datasets[DS_NAME], ds_id)
        self.assertTrue(len(wf.modules[2].stdout) == 0)
        self.assertTrue(len(wf.modules[2].stderr) == 0)
        self.assertEquals(len(wf.modules[2].datasets), 1)
        self.assertTrue(DS_NAME in wf.modules[2].datasets)
        # The updating module produces a new dataset version.
        self.assertNotEquals(wf.modules[2].datasets[DS_NAME], ds_id)
        #print '(4) PRINT DATASET'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(PRINT_DATASET_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        prev_modules = modules
        modules = set()
        for m in wf.modules:
            self.assertNotEquals(m.identifier, -1)
            self.assertFalse(m.identifier in modules)
            modules.add(m.identifier)
        # Ensure that the identifier of previous modules did not change
        for id in prev_modules:
            self.assertTrue(id in modules)
        self.assertEquals(wf.version, 3)
        self.assertEquals(len(wf.modules), 4)
        self.assertEquals(wf.modules[1].stdout[0]['data'], 'Alice\nBob')
        self.assertTrue(len(wf.modules[3].stdout) == 1)
        self.assertTrue(len(wf.modules[3].stderr) == 0)
        self.assertEquals(wf.modules[3].stdout[0]['data'], 'NoName\nNoName')
        #print '(5) UPDATE DATASET WITH FILTER'
        # Replacing module 2 triggers re-execution of all downstream modules.
        self.db.replace_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[2].identifier,
            command=cmd.python_cell(UPDATE_DATASET_WITH_FILTER_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        prev_modules = modules
        modules = set()
        for m in wf.modules:
            self.assertNotEquals(m.identifier, -1)
            self.assertFalse(m.identifier in modules)
            modules.add(m.identifier)
        # Ensure that the identifier of previous modules did not change
        for id in prev_modules:
            self.assertTrue(id in modules)
        # Expected to fail here: the filter script depends on variables that
        # are only defined by the module inserted in step (6).
        self.assertTrue(wf.has_error)
        self.assertEquals(wf.version, 4)
        self.assertEquals(len(wf.modules), 4)
        # print '(6) INSERT SET VARIABLES BEFORE UPDATE'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(SET_VARIABLES_ONLY_PY),
            before_id=wf.modules[2].identifier)
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[4].stdout[0]['data'], 'Alice\nBobby')
        #print '(7) INTRODUCE ERROR'
        self.db.replace_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[1].identifier,
            command=cmd.python_cell(PRINT_UNKNOWN_DATASET_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        prev_modules = modules
        modules = set()
        for m in wf.modules:
            self.assertNotEquals(m.identifier, -1)
            self.assertFalse(m.identifier in modules)
            modules.add(m.identifier)
        # Ensure that the identifier of previous modules did not change
        for id in prev_modules:
            self.assertTrue(id in modules)
        self.assertTrue(wf.has_error)
        # Ensure that the second module has output to stderr
        self.assertNotEquals(len(wf.modules[1].stderr), 0)
        # Ensure that the last two modules hav no output (either to STDOUT or
        # STDERR)
        for m in wf.modules[2:]:
            self.assertEquals(len(m.stdout), 0)
            self.assertEquals(len(m.stderr), 0)
        #print '(8) FIX ERROR'
        self.db.replace_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[1].identifier,
            command=cmd.python_cell(PRINT_DATASET_PY))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        prev_modules = modules
        modules = set()
        for m in wf.modules:
            self.assertNotEquals(m.identifier, -1)
            self.assertFalse(m.identifier in modules)
            modules.add(m.identifier)
        # Ensure that the identifier of previous modules did not change
        for id in prev_modules:
            self.assertTrue(id in modules)
        #print (9) DELETE MODULE UPDATE_DATASET_WITH_FILTER_PY
        self.db.delete_workflow_module(viztrail_id=vt.identifier,
                                       module_id=wf.modules[3].identifier)
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[3].stdout[0]['data'], 'Alice\nBob')

    def run_update_datasets(self):
        """Test dropping and renaming of datasets.

        Verifies that renames only affect downstream modules, that dropping
        a missing dataset is an error, and that delete/replace recover the
        workflow.
        """
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.load_dataset(
                                           f_handle.identifier, DS_NAME))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertTrue(DS_NAME in wf.modules[-1].datasets)
        new_name = DS_NAME + '_renamed'
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.rename_dataset(
                                           DS_NAME, new_name))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        # The rename is visible downstream but not in the earlier module.
        self.assertTrue(DS_NAME in wf.modules[0].datasets)
        self.assertFalse(new_name in wf.modules[0].datasets)
        self.assertFalse(DS_NAME in wf.modules[-1].datasets)
        self.assertTrue(new_name in wf.modules[-1].datasets)
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.drop_dataset(new_name))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertFalse(new_name in wf.modules[-1].datasets)
        # Dropping the same dataset twice must fail.
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=cmd.drop_dataset(new_name))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        # Delete the Drop Dataset that failed and replace the first drop with
        # a Python module that prints names
        self.db.delete_workflow_module(viztrail_id=vt.identifier,
                                       module_id=wf.modules[-1].identifier)
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.db.replace_workflow_module(viztrail_id=vt.identifier,
                                        module_id=wf.modules[-1].identifier,
                                        command=cmd.python_cell("""
for row in vizierdb.get_dataset('""" + new_name + """').rows:
    print row.get_value('Name')
"""))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].stdout[0]['data'], 'Alice\nBob')
        self.assertFalse(DS_NAME in wf.modules[-1].datasets)
        self.assertTrue(new_name in wf.modules[-1].datasets)
class TestFileSystemViztrailRepository(unittest.TestCase):
    """Unit tests for the file-system backed viztrail repository: appending,
    replacing and deleting workflow modules, branching, and the life cycle of
    viztrails and their on-disk workflow version files.
    """

    def setUp(self):
        """Create an empty work trails repository."""
        # Clear VisTrails directory
        if os.path.isdir(VIZTRAILS_DIRECTORY):
            shutil.rmtree(VIZTRAILS_DIRECTORY)
        # Setup project repository
        self.db = FileSystemViztrailRepository(VIZTRAILS_DIRECTORY,
                                               {ENV.identifier: ENV})

    def tearDown(self):
        """Clean-up by dropping viztrails directory.
        """
        shutil.rmtree(VIZTRAILS_DIRECTORY)

    def test_append_module(self):
        """Test appending modules."""
        # Create new viztrail.
        vt = self.db.create_viztrail(ENV.identifier, {'name': 'My Project'})
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=python_cell('abc'))
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=load_dataset('file', 'name'))
        # The default branch should have two versions. The first version
        # contains one module and the second version contains two modules.
        self.assertEquals(len(vt.branches[DEFAULT_BRANCH].workflows), 2)
        v1 = self.db.get_workflow(
            viztrail_id=vt.identifier,
            workflow_version=vt.branches[DEFAULT_BRANCH].workflows[0].version)
        v2 = self.db.get_workflow(
            viztrail_id=vt.identifier,
            workflow_version=vt.branches[DEFAULT_BRANCH].workflows[1].version)
        head = self.db.get_workflow(viztrail_id=vt.identifier,
                                    branch_id=DEFAULT_BRANCH)
        self.assertEquals(len(v1.modules), 1)
        self.assertEquals(len(v2.modules), 2)
        self.assertEquals(len(head.modules), 2)
        # Ensure that all modules have non-negative identifier
        for m in head.modules:
            self.assertTrue(m.identifier >= 0)
        self.assertEquals(head.modules[0].command.module_type, PACKAGE_PYTHON)
        self.assertEquals(head.modules[1].command.module_type, PACKAGE_VIZUAL)
        self.assertEquals(head.version, 1)
        # Re-load the viztrails to ensure that all information has been
        # persisted properly
        self.db = FileSystemViztrailRepository(VIZTRAILS_DIRECTORY,
                                               {ENV.identifier: ENV})
        vt = self.db.get_viztrail(vt.identifier)
        self.assertEquals(len(vt.branches[DEFAULT_BRANCH].workflows), 2)
        v1 = self.db.get_workflow(
            viztrail_id=vt.identifier,
            workflow_version=vt.branches[DEFAULT_BRANCH].workflows[0].version)
        v2 = self.db.get_workflow(
            viztrail_id=vt.identifier,
            workflow_version=vt.branches[DEFAULT_BRANCH].workflows[1].version)
        head = self.db.get_workflow(viztrail_id=vt.identifier,
                                    branch_id=DEFAULT_BRANCH)
        self.assertEquals(len(v1.modules), 1)
        self.assertEquals(len(v2.modules), 2)
        self.assertEquals(len(head.modules), 2)
        # Ensure that all modules have non-negative identifier
        for m in head.modules:
            self.assertTrue(m.identifier >= 0)
        self.assertEquals(head.modules[0].command.module_type, PACKAGE_PYTHON)
        self.assertEquals(head.modules[1].command.module_type, PACKAGE_VIZUAL)
        self.assertEquals(head.version, 1)
        # Append a third module to the head of the default branch
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=python_cell('def'))
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertEquals(len(wf.modules), 3)
        for m in wf.modules:
            self.assertTrue(m.identifier >= 0)
            self.assertEquals(m.stdout[0]['data'],
                              'SUCCESS ' + str(m.identifier))
        self.assertEquals(wf.modules[0].command.module_type, PACKAGE_PYTHON)
        self.assertEquals(wf.modules[1].command.module_type, PACKAGE_VIZUAL)
        self.assertEquals(wf.modules[2].command.module_type, PACKAGE_PYTHON)
        self.assertEquals(wf.version, 2)
        # Append a module to the first version in the branch. The resulting new
        # branch HEAD is expected to contain only two modules then.
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       workflow_version=0,
                                       command=python_cell('def'))
        self.db = FileSystemViztrailRepository(VIZTRAILS_DIRECTORY,
                                               {ENV.identifier: ENV})
        vt = self.db.get_viztrail(vt.identifier)
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertEquals(len(wf.modules), 2)
        for m in wf.modules:
            self.assertTrue(m.identifier >= 0)
            self.assertEquals(m.stdout[0]['data'],
                              'SUCCESS ' + str(m.identifier))
        self.assertEquals(wf.modules[0].command.module_type, PACKAGE_PYTHON)
        self.assertEquals(wf.modules[1].command.module_type, PACKAGE_PYTHON)
        self.assertEquals(wf.version, 3)

    def test_branching(self):
        """Test creating, persisting, and modifying branches of a viztrail."""
        # Create new viztrail and ensure that it contains exactly one branch
        vt = self.db.create_viztrail(ENV.identifier, {'name': 'My Project'})
        self.assertEquals(len(vt.branches), 1)
        self.assertTrue(DEFAULT_BRANCH in vt.branches)
        self.assertEquals(vt.branches[DEFAULT_BRANCH].identifier,
                          DEFAULT_BRANCH)
        # Append two modules to the default branch
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=python_cell('abc'))
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=load_dataset('file', 'name'))
        # Create a branch at the end of the default branch. The new branch
        # contains one workflow with two modules; the version number is 2
        newbranch = self.db.create_branch(viztrail_id=vt.identifier,
                                          properties={'name': 'New Branch'})
        self.assertEquals(len(newbranch.workflows), 1)
        self.assertEquals(newbranch.workflows[-1].version, 2)
        wf = vt.get_workflow(branch_id=newbranch.identifier)
        self.assertEquals(wf.version, 2)
        self.assertEquals(len(wf.modules), 2)
        self.assertTrue(newbranch.identifier in vt.branches)
        # Ensure that everything has been persisted properly
        self.db = FileSystemViztrailRepository(VIZTRAILS_DIRECTORY,
                                               {ENV.identifier: ENV})
        vt = self.db.get_viztrail(vt.identifier)
        newbranch = vt.branches[newbranch.identifier]
        self.assertEquals(len(newbranch.workflows), 1)
        self.assertEquals(newbranch.workflows[-1].version, 2)
        wf = vt.get_workflow(branch_id=newbranch.identifier)
        self.assertEquals(wf.version, 2)
        self.assertEquals(len(wf.modules), 2)
        self.assertTrue(newbranch.identifier in vt.branches)
        self.assertEquals(newbranch.properties.get_properties()['name'],
                          'New Branch')
        # Create a third branch from the start of the master branch
        thirdbranch = self.db.create_branch(viztrail_id=vt.identifier,
                                            properties={'name': 'Next Branch'},
                                            module_id=0)
        wf = vt.get_workflow(branch_id=thirdbranch.identifier)
        self.assertEquals(wf.version, 3)
        self.assertEquals(len(wf.modules), 1)
        # Append modules at end of master and at beginning of thirdbranch
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=python_cell('abc'))
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       branch_id=thirdbranch.identifier,
                                       command=python_cell('def'),
                                       before_id=0)
        master_head = vt.get_workflow()
        self.assertEquals(len(master_head.modules), 3)
        self.assertEquals(master_head.modules[0].command.module_type,
                          PACKAGE_PYTHON)
        self.assertEquals(master_head.modules[1].command.module_type,
                          PACKAGE_VIZUAL)
        self.assertEquals(master_head.modules[2].command.module_type,
                          PACKAGE_PYTHON)
        b2_head = vt.get_workflow(branch_id=newbranch.identifier)
        self.assertEquals(len(b2_head.modules), 2)
        self.assertEquals(b2_head.modules[0].command.module_type,
                          PACKAGE_PYTHON)
        self.assertEquals(b2_head.modules[1].command.module_type,
                          PACKAGE_VIZUAL)
        b3_head = vt.get_workflow(branch_id=thirdbranch.identifier)
        self.assertEquals(len(b3_head.modules), 2)
        self.assertEquals(b3_head.modules[0].command.module_type,
                          PACKAGE_PYTHON)
        self.assertEquals(b3_head.modules[1].command.module_type,
                          PACKAGE_PYTHON)
        # Replace second module of third branch
        self.db.replace_workflow_module(
            viztrail_id=vt.identifier,
            branch_id=thirdbranch.identifier,
            module_id=b3_head.modules[1].identifier,
            command=load_dataset('file', 'name'))
        b3_head = vt.get_workflow(branch_id=thirdbranch.identifier)
        self.assertEquals(len(b3_head.modules), 2)
        self.assertEquals(b3_head.modules[0].command.module_type,
                          PACKAGE_PYTHON)
        self.assertEquals(b3_head.modules[1].command.module_type,
                          PACKAGE_VIZUAL)
        # The other branches must be unaffected by the replacement
        master_head = vt.get_workflow()
        self.assertEquals(len(master_head.modules), 3)
        self.assertEquals(master_head.modules[0].command.module_type,
                          PACKAGE_PYTHON)
        self.assertEquals(master_head.modules[1].command.module_type,
                          PACKAGE_VIZUAL)
        self.assertEquals(master_head.modules[2].command.module_type,
                          PACKAGE_PYTHON)
        b2_head = vt.get_workflow(branch_id=newbranch.identifier)
        self.assertEquals(len(b2_head.modules), 2)
        self.assertEquals(b2_head.modules[0].command.module_type,
                          PACKAGE_PYTHON)
        self.assertEquals(b2_head.modules[1].command.module_type,
                          PACKAGE_VIZUAL)
        # Ensure there are exceptions raised when branching off an unknown
        # branch or module
        with self.assertRaises(ValueError):
            self.db.create_branch(viztrail_id=vt.identifier,
                                  source_branch='unknonw-branch',
                                  properties={'name': 'New Branch'})
        with self.assertRaises(ValueError):
            self.db.create_branch(viztrail_id=vt.identifier,
                                  properties={'name': 'New Branch'},
                                  module_id=100)
        with self.assertRaises(ValueError):
            self.db.create_branch(viztrail_id=vt.identifier)
        # Test branch provenance
        self.assertEquals(newbranch.provenance.source_branch, DEFAULT_BRANCH)
        self.assertEquals(newbranch.provenance.workflow_version, 1)
        self.assertEquals(newbranch.provenance.module_id, 1)
        self.assertEquals(thirdbranch.provenance.source_branch, DEFAULT_BRANCH)
        self.assertEquals(thirdbranch.provenance.workflow_version, 1)
        self.assertEquals(thirdbranch.provenance.module_id, 0)

    def test_eval_command(self):
        """Test functionality to execute a workflow module."""
        # Create new work trail, append a module and retrieve the resulting
        # workflow from default branch HEAD.
        vt = self.db.create_viztrail(ENV.identifier, {'name': 'My Project'})
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=python_cell('abc'))
        wf = vt.get_workflow()
        self.assertEquals(wf.version, 0)
        self.assertEquals(len(wf.modules), 1)
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=python_cell('def'))
        wf = vt.get_workflow(branch_id=DEFAULT_BRANCH)
        self.assertEquals(wf.version, 1)
        self.assertEquals(len(wf.modules), 2)
        self.assertEquals(len(wf.modules[0].stdout), 1)
        self.assertEquals(wf.modules[0].command.module_type, PACKAGE_PYTHON)
        self.assertEquals(wf.modules[0].command.command_identifier,
                          PYTHON_CODE)
        self.assertEquals(wf.modules[0].command.arguments[PYTHON_SOURCE],
                          'abc')
        self.assertEquals(len(wf.modules[1].stdout), 1)
        self.assertEquals(wf.modules[1].command.module_type, PACKAGE_PYTHON)
        self.assertEquals(wf.modules[1].command.command_identifier,
                          PYTHON_CODE)
        self.assertEquals(wf.modules[1].command.arguments[PYTHON_SOURCE],
                          'def')
        # Replace the first module with a load-dataset command. A new workflow
        # version is expected.
        self.db.replace_workflow_module(viztrail_id=vt.identifier,
                                        module_id=0,
                                        command=load_dataset('file', 'ds'))
        wf = vt.get_workflow()
        self.assertEquals(wf.version, 2)
        self.assertEquals(len(wf.modules), 2)
        self.assertEquals(len(wf.modules[0].stdout), 1)
        self.assertEquals(wf.modules[0].command.module_type, PACKAGE_VIZUAL)
        self.assertEquals(wf.modules[0].command.command_identifier,
                          VIZUAL_LOAD)
        self.assertEquals(wf.modules[0].command.arguments[PARA_FILE]['fileid'],
                          'file')
        self.assertEquals(wf.modules[0].command.arguments[PARA_NAME], 'ds')
        self.assertEquals(len(wf.modules[1].stdout), 2)
        self.assertEquals(wf.modules[1].command.module_type, PACKAGE_PYTHON)
        self.assertEquals(wf.modules[1].command.command_identifier,
                          PYTHON_CODE)
        self.assertEquals(wf.modules[1].command.arguments[PYTHON_SOURCE],
                          'def')

    def test_workflow_life_cycle(self):
        """Test creation and deletion of branches and the workflow version
        files they own on disk.
        """
        # Create new work trail.
        vt = self.db.create_viztrail(ENV.identifier, {'name': 'My Project'})
        # Append two modules
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=python_cell('abc'))
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       command=load_dataset('file', 'name'))
        # Create a branch at the end of the default branch
        newbranch = self.db.create_branch(viztrail_id=vt.identifier,
                                          properties={'name': 'New Branch'})
        # Append modules at end of new branch
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       branch_id=newbranch.identifier,
                                       command=python_cell('xyz'))
        self.db.append_workflow_module(viztrail_id=vt.identifier,
                                       branch_id=newbranch.identifier,
                                       command=load_dataset('file', 'myname'),
                                       before_id=0)
        # Ensure that all version files exist
        self.check_files(vt.identifier, vt.branches[DEFAULT_BRANCH].workflows,
                         True)
        new_versions = vt.branches[newbranch.identifier].workflows
        self.check_files(vt.identifier, new_versions, True)
        # Delete new branch. Ensure that only the master versions exist
        self.assertTrue(
            self.db.delete_branch(viztrail_id=vt.identifier,
                                  branch_id=newbranch.identifier))
        self.check_files(vt.identifier, vt.branches[DEFAULT_BRANCH].workflows,
                         True)
        self.check_files(vt.identifier, new_versions, False)
        # Deleting a non-existing branch should return False
        self.assertFalse(
            self.db.delete_branch(viztrail_id=vt.identifier,
                                  branch_id=newbranch.identifier))
        self.assertFalse(
            self.db.delete_branch(viztrail_id=vt.identifier,
                                  branch_id='unknown'))
        # Deleting master branch should raise exception
        with self.assertRaises(ValueError):
            self.db.delete_branch(viztrail_id=vt.identifier,
                                  branch_id=DEFAULT_BRANCH)

    def test_viztrail_life_cycle(self):
        """Test API methods to create and delete work trails."""
        # Create work trail and ensure that deleting it returns True
        vt = self.db.create_viztrail(ENV.identifier, {'name': 'My Project'})
        # Ensure that the viztrail has property name = 'My Project'
        self.assertEquals(vt.properties.get_properties()['name'], 'My Project')
        self.assertEquals(len(self.db.list_viztrails()), 1)
        self.assertTrue(self.db.delete_viztrail(vt.identifier))
        self.assertEquals(len(self.db.list_viztrails()), 0)
        # Multiple deletes should return False
        self.assertFalse(self.db.delete_viztrail(vt.identifier))
        # Deleting an unknown work trail should return False
        self.assertFalse(self.db.delete_viztrail('invalid id'))
        self.assertFalse(self.db.delete_viztrail('f0f0f0f0f0f0f0f0f0f0f0f0'))
        # Cannot create viztrail for unknown engine
        with self.assertRaises(ValueError):
            self.db.create_viztrail('UNKNOWN', {'name': 'My Project'})

    def check_files(self, viztrail_id, versions, check_exists):
        """Assert that the <version>.yaml file for each workflow version in
        versions does (check_exists=True) or does not (check_exists=False)
        exist in the viztrail's directory.
        """
        for wf_desc in versions:
            filename = os.path.join(VIZTRAILS_DIRECTORY, viztrail_id,
                                    str(wf_desc.version) + '.yaml')
            self.assertEquals(os.path.isfile(filename), check_exists)
# Beispiel #7
# 0
class TestMimirLenses(unittest.TestCase):

    def setUp(self):
        """Create a fresh datastore, file server and viztrail repository."""
        # Recreate every working directory from scratch so that no state
        # leaks between test cases.
        for directory in (DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR):
            if os.path.isdir(directory):
                shutil.rmtree(directory)
            os.mkdir(directory)
        self.datastore = MimirDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        # NOTE(review): the engine instance is never used afterwards —
        # presumably constructed for its side effects; confirm.
        vizual = MimirVizualEngine(self.datastore, self.fileserver)
        self.db = FileSystemViztrailRepository(VIZTRAILS_DIR,
                                               {ENV.identifier: ENV})

    def tearDown(self):
        """Remove all working directories created by setUp."""
        for directory in (DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR):
            if os.path.isdir(directory):
                shutil.rmtree(directory)

    def test_domain_lens(self):
        """Test DOMAIN lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        col_age = ds.column_by_name('Age')
        self.assertFalse(wf.has_error)
        # Domain lens on the Age column
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_domain(DS_NAME, col_age.identifier)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        # Print the lens error for debugging before the assertion fails
        if wf.has_error:
            print wf.modules[-1].stderr[0]
        self.assertEquals(wf.modules[-1].command_text.upper(), 'DOMAIN FOR AGE IN PEOPLE')
        self.assertFalse(wf.has_error)
        self.assertEquals(len(wf.modules), 2)
        # Get dataset; the previously empty Age cell must now have a value
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        rows = ds.fetch_rows()
        self.assertNotEquals(rows[2].values[ds.column_index('Age')], '')
        # Introduce an error. Make sure command formatting is correct
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_domain('MY DS', 'MY COL')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'DOMAIN FOR \'MY COL\' IN \'MY DS\'')
        mimir.finalize()

    def test_geocode_lens(self):
        """Test GEOCODE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        f_handle = self.fileserver.upload_file(GEO_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertFalse(wf.has_error)
        # Geocode lens with all address columns given explicitly
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_geocode(
                DS_NAME,
                'GOOGLE',
                house_nr=ds.column_by_name('STRNUMBER').identifier,
                street=ds.column_by_name('STRNAME').identifier,
                city=ds.column_by_name('CITY').identifier,
                state=ds.column_by_name('STATE').identifier
            )
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        # Print the lens error for debugging before the assertion fails
        if wf.has_error:
            print wf.modules[-1].stderr[0]
        self.assertEquals(wf.modules[-1].command_text.upper(), 'GEOCODE HOUSE_NUMBER=STRNUMBER,STREET=STRNAME,CITY=CITY,STATE=STATE PEOPLE USING GOOGLE')
        self.assertFalse(wf.has_error)
        self.assertEquals(len(wf.modules), 2)
        # Get dataset; six columns after geocoding (presumably the four
        # address columns plus generated coordinates — TODO confirm)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertEquals(len(ds.columns), 6)
        # Geocode again without naming any address columns
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_geocode(
                DS_NAME,
                'GOOGLE'
            )
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        # Print the lens error for debugging before the assertion fails
        if wf.has_error:
            print wf.modules[-1].stderr[0]
        self.assertEquals(wf.modules[-1].command_text.upper(), 'GEOCODE PEOPLE USING GOOGLE')
        self.assertFalse(wf.has_error)
        self.assertEquals(len(wf.modules), 3)
        # Get dataset; the second geocode pass adds two more columns
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertEquals(len(ds.columns), 8)
        mimir.finalize()

    def test_key_repair_lens(self):
        """Test KEY REPAIR lens."""
        mimir.initialize()
        # Load the key-repair CSV file into a new viztrail.
        uploaded = self.fileserver.upload_file(KEY_REPAIR_FILE)
        viztrail = self.db.create_viztrail(ENGINE_ID, {'name': 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=viztrail.identifier,
            command=cmd.load_dataset(uploaded.identifier, DS_NAME)
        )
        workflow = self.db.get_workflow(viztrail_id=viztrail.identifier)
        self.assertFalse(workflow.has_error)
        loaded = self.datastore.get_dataset(
            workflow.modules[0].datasets[DS_NAME]
        )
        # Apply the key-repair lens to the Empid column.
        self.db.append_workflow_module(
            viztrail_id=viztrail.identifier,
            command=cmd.mimir_key_repair(
                DS_NAME,
                loaded.column_by_name('Empid').identifier
            )
        )
        workflow = self.db.get_workflow(viztrail_id=viztrail.identifier)
        self.assertFalse(workflow.has_error)
        self.assertEquals(
            workflow.modules[-1].command_text.upper(),
            'KEY REPAIR FOR EMPID IN ' + DS_NAME.upper()
        )
        # The input dataset of the first module must be unchanged.
        reloaded = self.datastore.get_dataset(
            workflow.modules[0].datasets[DS_NAME]
        )
        self.assertEquals(loaded.row_count, reloaded.row_count)
        # The repaired dataset has four columns and only two rows.
        repaired = self.datastore.get_dataset(
            workflow.modules[-1].datasets[DS_NAME]
        )
        self.assertEquals(len(repaired.columns), 4)
        self.assertEquals(repaired.row_count, 2)
        names = set()
        empids = set()
        rowids = set()
        for row in DatasetClient(dataset=repaired).rows:
            rowids.add(row.identifier)
            empids.add(int(row.get_value('empid')))
            names.add(row.get_value('name'))
        self.assertTrue(1 in empids)
        self.assertTrue(2 in rowids)
        self.assertTrue('Alice' in names)
        self.assertTrue('Carla' in names)
        # An unknown dataset yields an error module whose command text quotes
        # the unresolved identifiers.
        self.db.append_workflow_module(
            viztrail_id=viztrail.identifier,
            command=cmd.mimir_key_repair('MY DS', 'MY COL')
        )
        workflow = self.db.get_workflow(viztrail_id=viztrail.identifier)
        self.assertTrue(workflow.has_error)
        self.assertEquals(
            workflow.modules[-1].command_text.upper(),
            'KEY REPAIR FOR \'MY COL\' IN \'MY DS\''
        )
        mimir.finalize()

    def test_missing_value_lens(self):
        """Test MISSING_VALUE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertFalse(wf.has_error)
        # Missing Value Lens on the AGE column
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_missing_value(DS_NAME, ds.column_by_name('AGE').identifier)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'MISSING VALUES FOR AGE IN ' + DS_NAME.upper())
        self.assertEquals(len(wf.modules), 2)
        # Get dataset; the previously empty Age cell must now have a value
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        rows = ds.fetch_rows()
        self.assertNotEquals(rows[2].values[ds.column_index('Age')], '')
        # The repaired cell is expected to carry two annotations
        annotations = ds.get_annotations(column_id=1, row_id=4)
        self.assertEquals(len(annotations), 2)
        # MISSING VALUE Lens with value constraint
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'New Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_missing_value(
                DS_NAME,
                ds.column_by_name('AGE').identifier,
                constraint='> 30')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        # Print the lens error for debugging before the assertion fails
        if wf.has_error:
            print wf.modules[-1].stderr[0]
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'MISSING VALUES FOR AGE IN ' + DS_NAME.upper() + ' WITH CONSTRAINT > 30')
        # The imputed value must satisfy the constraint
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        rows = ds.fetch_rows()
        self.assertTrue(rows[2].values[ds.column_index('Age')] > 30)
        # Command text in case of error
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_missing_value('MY DS', '?', constraint='A B')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        cmd_text = wf.modules[-1].command_text.upper()
        expected_text = 'MISSING VALUES FOR ? IN \'MY DS\'' + ' WITH CONSTRAINT A B'
        self.assertEquals(cmd_text, expected_text)
        mimir.finalize()

    def test_missing_key_lens(self):
        """Test MISSING_KEY lens."""
        mimir.initialize()
        # Load the incomplete CSV file into a new viztrail.
        uploaded = self.fileserver.upload_file(INCOMPLETE_CSV_FILE)
        viztrail = self.db.create_viztrail(ENGINE_ID, {'name': 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=viztrail.identifier,
            command=cmd.load_dataset(uploaded.identifier, DS_NAME)
        )
        workflow = self.db.get_workflow(viztrail_id=viztrail.identifier)
        self.assertFalse(workflow.has_error)
        dataset = self.datastore.get_dataset(
            workflow.modules[-1].datasets[DS_NAME]
        )
        # Apply the missing-key lens, keyed on the Age column.
        age_col = dataset.columns[dataset.column_index('Age')].identifier
        self.db.append_workflow_module(
            viztrail_id=viztrail.identifier,
            command=cmd.mimir_missing_key(DS_NAME, age_col, missing_only=True)
        )
        workflow = self.db.get_workflow(viztrail_id=viztrail.identifier)
        self.assertEquals(
            workflow.modules[-1].command_text.upper(),
            'MISSING KEYS FOR AGE IN ' + DS_NAME.upper()
        )
        self.assertFalse(workflow.has_error)
        # The lens keeps the schema but extends the dataset to 24 rows.
        dataset = self.datastore.get_dataset(
            workflow.modules[-1].datasets[DS_NAME]
        )
        self.assertEquals(len(dataset.columns), 3)
        rows = dataset.fetch_rows()
        self.assertEquals(len(rows), 24)
        # Apply the lens a second time, now keyed on the Salary column.
        self.db.append_workflow_module(
            viztrail_id=viztrail.identifier,
            command=cmd.mimir_missing_key(
                DS_NAME,
                dataset.columns[dataset.column_index('Salary')].identifier,
                missing_only=True
            )
        )
        workflow = self.db.get_workflow(viztrail_id=viztrail.identifier)
        self.assertFalse(workflow.has_error)
        # Schema unchanged again; the row count grows to 55.
        dataset = self.datastore.get_dataset(
            workflow.modules[-1].datasets[DS_NAME]
        )
        self.assertEquals(len(dataset.columns), 3)
        rows = dataset.fetch_rows()
        self.assertEquals(len(rows), 55)
        mimir.finalize()

    def test_picker_lens(self):
        """Test PICKER lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        f_handle = self.fileserver.upload_file(PICKER_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        # Picker lens over the Age and Salary columns
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_picker(DS_NAME, [
                {'pickFrom': ds.column_by_name('Age').identifier},
                {'pickFrom': ds.column_by_name('Salary').identifier}
            ])
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        # Print the lens error for debugging before the assertion fails
        if wf.modules[-1].has_error:
            print wf.modules[-1].stderr
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'PICK FROM AGE,SALARY IN ' + DS_NAME.upper())
        # Get dataset; the picked column gets a generated default name
        self.assertEquals(len(wf.modules[-1].datasets), 1)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        columns = [c.name for c in ds.columns]
        self.assertEquals(len(ds.columns), 5)
        self.assertTrue('PICK_ONE_AGE_SALARY' in columns)
        # Pick another column, this time with custom name
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_picker(DS_NAME, [
                {'pickFrom': ds.column_by_name('Age').identifier},
                {'pickFrom': ds.column_by_name('Salary').identifier}
            ],
            pick_as='My Column')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        # Get dataset; both the generated and the custom-named column exist
        self.assertEquals(len(wf.modules[-1].datasets), 1)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        columns = [c.name for c in ds.columns]
        self.assertEquals(len(ds.columns), 6)
        self.assertTrue('PICK_ONE_AGE_SALARY' in columns)
        self.assertTrue('My Column' in columns)
        # Pick from a picked column
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_picker(DS_NAME, [
                {'pickFrom': ds.column_by_name('Age').identifier},
                {'pickFrom': ds.column_by_name('PICK_ONE_AGE_SALARY').identifier}
            ],
            pick_as='My Column')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        # Print the lens error for debugging before the assertion fails
        if wf.modules[-1].has_error:
            print wf.modules[-1].stderr
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'PICK FROM AGE,PICK_ONE_AGE_SALARY AS \'MY COLUMN\' IN ' + DS_NAME.upper())
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        mimir.finalize()

    def test_schema_matching_lens(self):
        """Test SCHEMA_MATCHING lens.

        Verifies that the lens materializes a new dataset with the matched
        schema, that re-using an existing dataset name or an invalid name
        puts the workflow into an error state, and that a failed module can
        be repaired with replace_workflow_module.
        """
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        # Schema matching lens: map the source schema onto (BDate int,
        # PName varchar) and store the result under a new dataset name.
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_schema_matching(DS_NAME, [
                {'column': 'BDate', 'type': 'int'},
                {'column': 'PName', 'type': 'varchar'}
            ], 'new_' + DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'SCHEMA MATCHING PEOPLE (BDATE INT, PNAME VARCHAR) AS NEW_' + DS_NAME.upper())
        # Get dataset: the module now exposes both the source dataset and
        # the newly created one.
        self.assertEquals(len(wf.modules[-1].datasets), 2)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets['new_' + DS_NAME])
        self.assertEquals(len(ds.columns), 2)
        self.assertEquals(ds.row_count, 2)
        # Error if adding an existing dataset
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_schema_matching(
                DS_NAME,
                [{'column': 'BDate', 'type': 'int'}],
                'new_' + DS_NAME
            )
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        # Replacing the failed module with one that uses a fresh dataset
        # name clears the workflow error.
        self.db.replace_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_schema_matching(
                DS_NAME,
                [{'column': 'BDate', 'type': 'int'}],
                'a_new_' + DS_NAME
            ),
            module_id=wf.modules[-1].identifier,
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'SCHEMA MATCHING PEOPLE (BDATE INT) AS A_NEW_' + DS_NAME.upper())
        # Error when adding a dataset with an invalid name (contains a
        # space); the rendered command text quotes the invalid name.
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_schema_matching(
                DS_NAME,
                [{'column': 'BDate', 'type': 'int'}],
                'SOME NAME'
            )
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        self.assertEquals(wf.modules[-1].command_text.upper(), 'SCHEMA MATCHING PEOPLE (BDATE INT) AS \'SOME NAME\'')
        mimir.finalize()

    def test_type_inference_lens(self):
        """Test TYPE INFERENCE lens.

        Applies the lens (with a 0.6 conformance threshold) to a dataset
        loaded from an incomplete CSV file and verifies that the command
        text is rendered correctly and that schema, row count, and cell
        values are unchanged by the inference step.
        """
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        mimir.initialize()
        f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        # Keep a handle on the pre-lens dataset for the value comparison below
        ds1 = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertFalse(wf.has_error)
        # Infer type
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_type_inference(DS_NAME, 0.6)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        # NOTE(review): removed a leftover debug print of the command text.
        self.assertEquals(wf.modules[-1].command_text.upper(), 'TYPE INFERENCE FOR COLUMNS IN ' + DS_NAME.upper() + ' WITH PERCENT_CONFORM = 0.6')
        # Get dataset: schema, row count, and cell values must be unchanged
        ds2 = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        self.assertEquals(len(ds2.columns), 3)
        self.assertEquals(ds2.row_count, 7)
        ds1_rows = ds1.fetch_rows()
        ds2_rows = ds2.fetch_rows()
        for i in range(ds2.row_count):
            self.assertEquals(ds1_rows[i].values, ds2_rows[i].values)
        mimir.finalize()
# Beispiel #8
# 0
# Set up the execution environment: file server, Mimir vizual engine, and a
# file-system based viztrail repository. (`datastore` is presumably created
# just above this chunk — not visible here; verify against the full file.)
fileserver = DefaultFileServer(FILESERVER_DIR)
vizual = MimirVizualEngine(datastore, fileserver)
db = FileSystemViztrailRepository(VIZTRAILS_DIR, {ENV.identifier: ENV})

mimir.initialize()

vt = db.create_viztrail(ENV.identifier, {'name': 'My Project'})

#
# LOAD DATASET
#
# Upload the CSV file and append a LOAD DATASET module to the new viztrail.
f_handle = fileserver.upload_file(CSV_FILE)
db.append_workflow_module(viztrail_id=vt.identifier,
                          command=cmd.load_dataset(f_handle.identifier,
                                                   DS_NAME))
wf = db.get_workflow(viztrail_id=vt.identifier)
# Fetch the dataset produced by the last module and print it for inspection.
ds = datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
print_dataset(ds)
"""
#
# PICKER LENS
#
db.append_workflow_module(
    viztrail_id=vt.identifier,
    command=cmd.mimir_picker(
        DS_NAME,
        [
            {'pickFrom': 'A'},
            {'pickFrom': 'B'}
        ],
        pick_as='A_B')