Example #1
0
    def execute_script(self, args, context):
        """Execute a R script in the given context.

        While the script is evaluated, sys.stdout and sys.stderr are replaced
        by capturing streams. Everything written to them is added to the
        module outputs afterwards. The original streams are restored in all
        cases, including when resolving the context datasets fails.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult

        Raises
        ------
        ValueError
            If the datastore returns None for a dataset listed in the context
        """
        # Get R script from user arguments
        source = args.get_value(cmd.PARA_R_SOURCE)
        # Remember the original streams so that they can be restored
        out = sys.stdout
        err = sys.stderr
        stream = list()
        try:
            # Redirect standard output and standard error streams. The
            # redirection happens inside the try block so that the finally
            # clause undoes it no matter where the code below fails.
            sys.stdout = OutputStream(tag='out', stream=stream)
            sys.stderr = OutputStream(tag='err', stream=stream)
            outputs = ModuleOutputs()
            # Map the dataset names in the context to dataset identifiers
            mimir_table_names = dict()
            for ds_name_o in context.datasets:
                dataset_id = context.datasets[ds_name_o]
                dataset = context.datastore.get_dataset(dataset_id)
                if dataset is None:
                    # Not a script error: propagate to the caller
                    raise ValueError('unknown dataset \'' + ds_name_o + '\'')
                mimir_table_names[ds_name_o] = dataset.identifier
            # Run the r code. Exceptions raised by the evaluation are handed
            # to outputs.error() instead of being propagated.
            try:
                evalresp = mimir.evalR(mimir_table_names, source)
                ostd = evalresp['stdout']
                oerr = evalresp['stderr']
                if not ostd == '':
                    outputs.stdout.append(HtmlOutput(ostd))
                if not oerr == '':
                    outputs.stderr.append(TextOutput(oerr))
            except Exception as ex:
                outputs.error(ex)
        finally:
            # Make sure to reverse redirection of output streams
            sys.stdout = out
            sys.stderr = err
        # Set module outputs from everything that was written to the
        # redirected streams
        for tag, text in stream:
            text = ''.join(text).strip()
            if tag == 'out':
                outputs.stdout.append(HtmlOutput(text))
            else:
                outputs.stderr.append(TextOutput(text))
        provenance = ModuleProvenance()
        # Return execution result. The task counts as successful only if
        # nothing was added to the error stream.
        return ExecResult(
            is_success=(len(outputs.stderr) == 0),
            outputs=outputs,
            provenance=provenance
        )
Example #2
0
 def show(self, value, mime_type=None, force_to_string=True):
     """Append a value to the standard output stream.

     Values that are already output objects are appended unchanged. Any
     other value is wrapped in an output object first. The wrapper is
     chosen from the given mime type or, if none is given, from the type
     of the value.

     Parameters
     ----------
     value: any
         Value that is added to the output stream
     mime_type: string, optional
         If given, the value is wrapped in a generic output object of
         this type
     force_to_string: bool, optional
         If True, values without a more specific representation are
         converted using str(). If False, such values are ignored.
     """
     value_type = type(value)
     if issubclass(value_type, OutputObject):
         # Nothing to convert
         self.stdout.append(value)
         return
     if mime_type is not None:
         output = OutputObject(value=value, type=mime_type)
     elif value_type is str:
         output = TextOutput(value=value)
     elif value_type is DatasetClient:
         # Imported locally, as in the original implementation
         from vizier.api.webservice import server
         # Request the dataset with offset 0 and a limit of 10
         output = DatasetOutput(
             server.api.datasets.get_dataset(
                 project_id=self.project_id,
                 dataset_id=value.dataset.identifier,
                 offset=0,
                 limit=10))
     elif issubclass(value_type, BokehLayout):
         output = HtmlOutput(value=vizier_bokeh_render(value))
     elif issubclass(value_type, MatplotlibFigure):
         output = HtmlOutput(value=vizier_matplotlib_render(value))
     elif issubclass(value_type, MatplotlibAxes):
         # Axes are rendered via the figure they belong to
         figure = value.get_figure()
         output = HtmlOutput(value=vizier_matplotlib_render(figure))
     else:
         # Prefer an HTML representation if the value offers one
         html_renderer = getattr(value, "_repr_html_", None)
         if html_renderer is not None:
             output = HtmlOutput(str(html_renderer()))
         elif force_to_string:
             output = TextOutput(value=str(value))
         else:
             return
     self.stdout.append(output)
Example #3
0
    def execute_script(self, args, context):
        """Execute a Markdown script in the given context.

        The Markdown source is not validated. If it is not empty it is added
        unchanged to the standard output of the module. Text written to
        sys.stdout or sys.stderr while the streams are redirected is added
        to the module outputs as well.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        markdown = args.get_value(cmd.PARA_MARKDOWN_SOURCE)
        # Swap the process streams for capturing streams
        saved_stdout = sys.stdout
        saved_stderr = sys.stderr
        captured = []
        sys.stdout = OutputStream(tag='out', stream=captured)
        sys.stderr = OutputStream(tag='err', stream=captured)
        outputs = ModuleOutputs()
        try:
            # Placeholder for validation: at present there is never any
            # error text
            error_text = ''
            if markdown != '':
                outputs.stdout.append(MarkdownOutput(markdown))
            if error_text != '':
                outputs.stderr.append(TextOutput(error_text))
        except Exception as ex:
            outputs.error(ex)
        finally:
            # Always put the original streams back
            sys.stdout = saved_stdout
            sys.stderr = saved_stderr
        # Copy captured stream content to the module outputs
        for tag, chunks in captured:
            joined = ''.join(chunks).strip()
            if tag != 'out':
                outputs.stderr.append(TextOutput(joined))
                continue
            outputs.stdout.append(MarkdownOutput(joined))
        # Success means that the error stream is empty
        no_errors = len(outputs.stderr) == 0
        return ExecResult(is_success=no_errors,
                          outputs=outputs,
                          provenance=ModuleProvenance())
 def test_single_append(self):
     """Append one module to an empty viztrail branch and make sure that
     the result can be read back from disk.
     """
     viztrail_dir = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(viztrail_dir)
     viztrail = OSViztrailHandle.create_viztrail(
         identifier='ABC',
         properties={},
         base_path=viztrail_dir)
     default_branch = viztrail.get_default_branch()
     cell = python_cell(source='print 2+2')
     now = get_current_time()
     new_module = OSModuleHandle.create_module(
         command=cell,
         external_form='print 2+2',
         state=MODULE_SUCCESS,
         outputs=ModuleOutputs(stdout=[TextOutput('4')]),
         provenance=ModuleProvenance(),
         timestamp=ModuleTimestamp(
             created_at=now,
             started_at=now,
             finished_at=now),
         module_folder=viztrail.modules_folder,
         object_store=viztrail.object_store)
     workflow = default_branch.append_workflow(
         modules=[new_module],
         action=ACTION_INSERT,
         command=cell)
     # Both the workflow handle and the new module are expected to have
     # been written to a file
     workflow_file = os.path.join(
         default_branch.base_path, workflow.identifier)
     self.assertTrue(os.path.isfile(workflow_file))
     module_file = os.path.join(workflow.modules[-1].module_path)
     self.assertTrue(os.path.isfile(module_file))
     # Reload the viztrail and inspect the last module at the branch head
     reloaded = OSViztrailHandle.load_viztrail(viztrail_dir)
     head_module = reloaded.get_default_branch().get_head().modules[-1]
     self.assertEqual(head_module.external_form, 'print 2+2')
     self.assertEqual(head_module.outputs.stdout[-1].value, '4')
Example #5
0
    def compute_drop_dataset(self, args, context):
        """Execute drop dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Dataset names are handled in lower case
        ds_name = args.get_value(pckg.PARA_DATASET).lower()
        # The lookup result is not needed. The call is expected to raise an
        # exception if the specified dataset does not exist.
        context.get_dataset(ds_name)
        # Remove the entry from a copy of the context datasets. The copy is
        # discarded; the deletion fails with a KeyError if the name is not a
        # key of context.datasets.
        remaining = dict(context.datasets)
        del remaining[ds_name]
        message = TextOutput('Dataset \'' + ds_name + '\' deleted')
        # The provenance records the dataset as deleted; nothing is read or
        # written
        provenance = ModuleProvenance(
            read=dict(),
            write=dict(),
            delete=[ds_name])
        return ExecResult(
            outputs=ModuleOutputs(stdout=[message]),
            provenance=provenance)
Example #6
0
def execute(task_id, project_id, command_doc, context, resources):
    """Execute the given command and report the outcome to the workflow
    controller of the project.

    The controller is first told that the task is running. After the command
    was executed it is told that the task either succeeded or failed. A
    command from a package without a registered processor results in an
    error.

    Parameters:
    -----------
    task_id: string
        Unique task identifier
    project_id: string
        Unique project identifier
    command_doc : dict
        Dictionary serialization of the module command
    context: dict
        Dictionary of available resources in the database state. The key is
        the resource name. Values are resource identifiers.
    resources: dict
        Optional information about resources that were generated during a
        previous execution of the command
    """
    # The remote controller receives all state changes of the task
    controller = worker_env.get_controller(project_id)
    controller.set_running(task_id=task_id, started_at=get_current_time())
    command = ModuleCommand.from_dict(command_doc)
    package_id = command.package_id
    if package_id not in worker_env.processors:
        # No processor for the package: create an error result
        error_text = 'unknown package \'' + str(package_id) + '\''
        exec_result = ExecResult(
            is_success=False,
            outputs=ModuleOutputs(stderr=[TextOutput(error_text)])
        )
    else:
        processor = worker_env.processors[package_id]
        # The task context combines the project stores with the database
        # state that was sent along with the task
        task_context = TaskContext(
            project_id=project_id,
            datastore=worker_env.datastores.get_datastore(project_id),
            filestore=worker_env.filestores.get_filestore(project_id),
            datasets=context[labels.CONTEXT_DATASETS],
            resources=resources,
            dataobjects=context[labels.CONTEXT_DATAOBJECTS]
        )
        _, exec_result = exec_command(
            task_id=task_id,
            command=command,
            context=task_context,
            processor=processor
        )
    # Tell the controller how the task ended
    if not exec_result.is_success:
        controller.set_error(
            task_id=task_id,
            outputs=exec_result.outputs
        )
        return
    controller.set_success(
        task_id=task_id,
        outputs=exec_result.outputs,
        provenance=exec_result.provenance
    )
 def test_load_with_dataset_delete(self):
     """Test loading workflows where each module creates a new dataset and
     deletes the previous dataset (except for the first module).
     """
     viztrail_dir = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(viztrail_dir)
     viztrail = OSViztrailHandle.create_viztrail(
         identifier='ABC',
         properties={},
         base_path=viztrail_dir)
     branch = viztrail.get_default_branch()
     # Append five modules. Module i writes dataset DS<i> with i columns
     # and, from the second module on, deletes DS<i-1>.
     for i in range(5):
         now = get_current_time()
         dropped = ['DS' + str(i - 1)] if i > 0 else list()
         expression = 'print ' + str(i) + '+' + str(i)
         command = python_cell(source=expression)
         written = {
             'DS' + str(i): DatasetDescriptor(
                 identifier=str(i),
                 name='DS' + str(i),
                 columns=[
                     DatasetColumn(identifier=j, name=str(j))
                     for j in range(i)
                 ],
             )
         }
         module = OSModuleHandle.create_module(
             command=command,
             external_form=expression,
             state=MODULE_SUCCESS,
             outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
             provenance=ModuleProvenance(write=written, delete=dropped),
             timestamp=ModuleTimestamp(
                 created_at=now,
                 started_at=now,
                 finished_at=now),
             module_folder=viztrail.modules_folder,
             object_store=viztrail.object_store)
         # The new workflow version is the current head plus the module
         head = branch.head
         modules = [module] if head is None else head.modules + [module]
         branch.append_workflow(
             modules=modules,
             action=ACTION_INSERT,
             command=command)
     # Reload from disk and replay the database state module by module
     reloaded = OSViztrailHandle.load_viztrail(viztrail_dir)
     workflow = reloaded.get_default_branch().get_head()
     self.assertEqual(len(workflow.modules), 5)
     state = {}
     for i, module in enumerate(workflow.modules[:5]):
         state = module.provenance.get_database_state(state)
         # Only the dataset written by the current module remains
         self.assertEqual(len(state), 1)
         name = 'DS' + str(i)
         self.assertTrue(name in state)
         self.assertEqual(len(state[name].columns), i)
Example #8
0
def print_lens_annotations(outputs, annotations):
    """Add annotation information for given lens to cell output.

    The given value is converted to an integer. Two lines are added to the
    standard output stream if the result is greater than zero. Nothing is
    written if the value is None, cannot be converted to a finite number, or
    is not positive.

    Parameters
    ----------
    outputs: vizier.workflow.module.ModuleOutputs
        Cell output streams
    annotations: number or string
        Value that is reported as the number of repairs in the first 200
        rows of the queried lens
    """
    if annotations is None:
        return
    try:
        count = int(float(annotations))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are ignored (wrong type, non-numeric
        # text, NaN, infinity). Other exceptions, e.g. KeyboardInterrupt,
        # are no longer swallowed.
        count = 0
    if count > 0:
        outputs.stdout.append(TextOutput('Repairs in first 200 rows:'))
        outputs.stdout.append(TextOutput(str(count)))
 def test_load_active(self):
     """Test loading workflows with active modules."""
     viztrail_dir = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(viztrail_dir)
     viztrail = OSViztrailHandle.create_viztrail(
         identifier='ABC',
         properties=None,
         base_path=viztrail_dir)
     branch = viztrail.get_default_branch()
     # Append five modules, each one as a new workflow version
     for i in range(5):
         now = get_current_time()
         expression = 'print ' + str(i) + '+' + str(i)
         command = python_cell(source=expression)
         module = OSModuleHandle.create_module(
             command=command,
             external_form=expression,
             state=MODULE_SUCCESS,
             datasets=dict(),
             outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
             provenance=ModuleProvenance(),
             timestamp=ModuleTimestamp(
                 created_at=now,
                 started_at=now,
                 finished_at=now),
             module_folder=viztrail.modules_folder,
             object_store=viztrail.object_store)
         head = branch.head
         modules = [module] if head is None else head.modules + [module]
         branch.append_workflow(
             modules=modules,
             action=ACTION_INSERT,
             command=command)
         self.assertEqual(len(branch.get_history()), (i + 1))
     # This is a hack to simulate loading workflows with active modules:
     # the last two modules in the branch head are written to disk in an
     # active state
     for position in (-2, -1):
         active = branch.get_head().modules[position]
         active.state = MODULE_RUNNING
         active.write_module()
     viztrail = OSViztrailHandle.load_viztrail(viztrail_dir)
     branch = viztrail.get_default_branch()
     # After loading, the first three modules are still successful while
     # the two active modules are canceled
     for position in range(3):
         self.assertTrue(branch.get_head().modules[position].is_success)
     for position in (3, 4):
         self.assertTrue(branch.get_head().modules[position].is_canceled)
     # Change state of last module in second workflow to an active state
     active = branch.get_head().modules[1]
     active.state = MODULE_RUNNING
     active.write_module()
     viztrail = OSViztrailHandle.load_viztrail(viztrail_dir)
     branch = viztrail.get_default_branch()
     second = branch.get_workflow(branch.get_history()[1].identifier)
     self.assertTrue(second.modules[0].is_success)
     self.assertTrue(second.modules[1].is_canceled)
Example #10
0
def print_dataset_schema(outputs, name, columns):
    """Add schema information for given dataset to cell output.

    The schema is written as one line with the dataset name and an opening
    bracket, one indented line per column, and a line with the closing
    bracket. All column lines but the last one end with a comma.

    Parameters
    ----------
    outputs: vizier.workflow.module.ModuleOutputs
        Cell output streams
    name: string
        Dataset name
    columns: list(vizier.datastore.base.DatasetColumn)
        Columns in the dataset schema
    """
    emit = outputs.stdout.append
    emit(TextOutput(name + ' ('))
    last_index = len(columns) - 1
    for index, column in enumerate(columns):
        # The comma separates a column from the one that follows it
        separator = ',' if index != last_index else ''
        emit(TextOutput('  ' + str(column) + separator))
    emit(TextOutput(')'))
Example #11
0
    def compute_empty_dataset(self, args, context):
        """Execute empty dataset command.

        Creates a view with a single default column and registers it as a
        new dataset under the user-provided name. Failures while creating or
        registering the view are reported in the module outputs.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult

        Raises
        ------
        ValueError
            If the dataset name is already in use or is not a valid name
        """
        outputs = ModuleOutputs()
        # Pairs of (default value expression, column name)
        default_columns = [("''", "unnamed_column")]
        ds_name = args.get_value(pckg.PARA_NAME).lower()
        if ds_name in context.datasets:
            raise ValueError('dataset \'' + ds_name + '\' exists')
        if not is_valid_name(ds_name):
            raise ValueError('invalid dataset name \'' + ds_name + '\'')
        try:
            # Build a query that selects the default value of each column
            select_items = [
                value + " AS " + column_name
                for value, column_name in default_columns
            ]
            source = "SELECT {};".format(", ".join(select_items))
            view_name, _ = mimir.createView(dict(), source)
            # Column identifiers are the positions in the default list
            columns = [
                MimirDatasetColumn(identifier=position,
                                   name_in_dataset=definition[1])
                for position, definition in enumerate(default_columns)
            ]
            ds = context.datastore.register_dataset(
                table_name=view_name,
                columns=columns,
                row_counter=1)
            descriptor = DatasetDescriptor(
                identifier=ds.identifier,
                columns=ds.columns,
                row_count=ds.row_count)
            # The empty read dictionary explicitly declares that the module
            # has no dependencies
            provenance = ModuleProvenance(
                write={ds_name: descriptor},
                read=dict())
            outputs.stdout.append(
                TextOutput("Empty dataset '{}' created".format(ds_name)))
        except Exception as ex:
            provenance = ModuleProvenance()
            outputs.error(ex)
        # The command succeeded if nothing was written to the error stream
        no_errors = len(outputs.stderr) == 0
        return ExecResult(is_success=no_errors,
                          outputs=outputs,
                          provenance=provenance)
 def test_init(self):
     """Test initialization of and access to the output streams."""
     # Without arguments both streams start out as empty lists
     initial = ModuleOutputs()
     self.assertEqual(len(initial.stderr), 0)
     self.assertEqual(len(initial.stdout), 0)
     # Add one object per stream and create a new object from the streams
     initial.stdout.append(TextOutput(value='Hello World'))
     initial.stderr.append(OutputObject(type='ERROR', value='Some Error'))
     rebuilt = ModuleOutputs(stdout=initial.stdout, stderr=initial.stderr)
     self.assertEqual(len(rebuilt.stderr), 1)
     self.assertEqual(len(rebuilt.stdout), 1)
     # Only the object on standard output is a text object
     self.assertTrue(rebuilt.stdout[0].is_text)
     self.assertFalse(rebuilt.stderr[0].is_text)
Example #13
0
 def write_safe(self):
     """Write the current module state to the object store without raising
     an exception.

     If writing fails the module is set into error state: the exception
     message becomes the only error output and the module datasets are
     cleared. This ensures that the workflow cannot be executed further
     after a failed state change.
     """
     try:
         self.write_module()
     except Exception as err:
         # The in-memory state reflects the failure; nothing is raised
         self.state = mstate.MODULE_ERROR
         failure = TextOutput(str(err))
         self.outputs = ModuleOutputs(stderr=[failure])
         self.datasets = {}
 def test_completed_append(self):
     """Test appending a completed workflow to a branch."""
     viztrail_dir = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(viztrail_dir)
     viztrail = OSViztrailHandle.create_viztrail(
         identifier='ABC',
         properties=None,
         base_path=viztrail_dir)
     branch = viztrail.get_default_branch()

     def check_delete_workflow(workflow):
         # Expected properties of the workflow that results from deleting
         # the last module
         self.assertEqual(len(workflow.modules), 9)
         self.assertEqual(workflow.descriptor.identifier, '0000000A')
         self.assertEqual(workflow.descriptor.action, ACTION_DELETE)
         self.assertEqual(workflow.descriptor.package_id, PACKAGE_PYTHON)
         self.assertEqual(workflow.descriptor.command_id, PYTHON_CODE)

     # Create a history of ten workflow versions with one to ten modules
     for i in range(10):
         now = get_current_time()
         expression = 'print ' + str(i) + '+' + str(i)
         command = python_cell(source=expression)
         module = OSModuleHandle.create_module(
             command=command,
             external_form=expression,
             state=MODULE_SUCCESS,
             datasets=dict(),
             outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
             provenance=ModuleProvenance(),
             timestamp=ModuleTimestamp(
                 created_at=now,
                 started_at=now,
                 finished_at=now),
             module_folder=viztrail.modules_folder,
             object_store=viztrail.object_store)
         head = branch.head
         modules = [module] if head is None else head.modules + [module]
         branch.append_workflow(
             modules=modules,
             action=ACTION_INSERT,
             command=command)
     # Append a workflow that contains all but the last module of the head
     current = branch.get_head().modules
     check_delete_workflow(
         branch.append_workflow(
             modules=current[:-1],
             action=ACTION_DELETE,
             command=current[-1].command))
     # The same workflow is expected at the branch head after reloading
     viztrail = OSViztrailHandle.load_viztrail(viztrail_dir)
     branch = viztrail.get_default_branch()
     self.assertEqual(len(branch.get_history()), 11)
     check_delete_workflow(branch.get_head())
 def test_state(self):
     """Ensure that exactly one of the state flags is True after each
     state change.
     """
     module = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         module_folder=MODULE_DIR,
         timestamp=ModuleTimestamp(),
         outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
         provenance=ModuleProvenance(
             read={'DS1': 'ID1'},
             write={
                 'DS1': DatasetDescriptor(identifier='ID2', name='ID2')
             }))
     # The flags are checked in this order after every transition
     state_flags = (
         'is_pending',
         'is_canceled',
         'is_error',
         'is_running',
         'is_success',
     )

     def assert_only(active_flag):
         # The named flag has to be set, all other flags have to be unset
         for flag in state_flags:
             flag_value = getattr(module, flag)
             if flag == active_flag:
                 self.assertTrue(flag_value)
             else:
                 self.assertFalse(flag_value)

     # Pending
     assert_only('is_pending')
     # Running
     module.set_running(external_form='TEST MODULE')
     assert_only('is_running')
     # Canceled
     module.set_canceled()
     assert_only('is_canceled')
     # Error
     module.set_error()
     assert_only('is_error')
     # Success
     module.set_success()
     assert_only('is_success')
 def test_running(self):
     """Update module state from pending to running."""

     def check_running(handle):
         # A running module has a start time, no datasets and no outputs,
         # while all provenance components are still present
         self.assertTrue(handle.is_running)
         self.assertIsNotNone(handle.timestamp.started_at)
         self.assertEqual(len(handle.datasets), 0)
         self.assertEqual(len(handle.outputs.stderr), 0)
         self.assertEqual(len(handle.outputs.stdout), 0)
         self.assertIsNotNone(handle.provenance.read)
         self.assertIsNotNone(handle.provenance.write)
         self.assertIsNotNone(handle.provenance.resources)

     def check_optional_arguments(handle):
         # Values that were passed to set_running as optional arguments
         self.assertEqual(handle.timestamp.started_at,
                          handle.timestamp.created_at)
         self.assertEqual(handle.external_form, 'Some form')

     def reload(handle):
         # Read the module as it was written to the object store
         return OSModuleHandle.load_module(
             identifier=handle.identifier,
             module_path=handle.module_path)

     # Create original module
     module = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         module_folder=MODULE_DIR,
         timestamp=ModuleTimestamp(),
         datasets={'DS1': DS1},
         outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
         provenance=ModuleProvenance(
             read={'DS1': 'ID1'},
             write={'DS1': DatasetDescriptor(identifier='ID2')},
             resources={'fileid': '0123456789'}))
     self.assertTrue(module.is_pending)
     module.set_running(external_form='TEST MODULE')
     check_running(module)
     # Read module from object store and ensure that all changes have been
     # materialized properly
     module = reload(module)
     check_running(module)
     # Set running with all optional parameters
     module.set_running(started_at=module.timestamp.created_at,
                        external_form='Some form')
     check_optional_arguments(module)
     module = reload(module)
     check_optional_arguments(module)
Example #17
0
    def create_exec_result(self,
                           dataset_name,
                           input_dataset=None,
                           output_dataset=None,
                           database_state=None,
                           stdout=None,
                           resources=None):
        """Create execution result object for a successfully completed task.
        Assumes that a single dataset has been modified.

        Note that this method is not suitable to generate the result object for
        the drop dataset and rename dataset commands.

        Parameters
        ----------
        dataset_name: string
            Name of the manipulated dataset
        input_dataset: vizier.datastore.dataset.DatasetDescriptor, optional
            Descriptor for the input dataset. If None, the read provenance
            of the result is None.
        output_dataset: vizier.datastore.dataset.DatasetDescriptor, optional
            Descriptor for the resulting dataset. If None, the dataset name
            is mapped to None in the write provenance.
        database_state: dict, optional
            Identifier for datasets in the database state against which a
            task was executed (keyed by user-provided name). Currently not
            used by this method.
        stdout: list(string), optional
            Lines in the command output. If None, the result has no output
            lines.
        resources: dict, optional
            Optional resources that were generated by the command

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Create a descriptor for the output dataset from the values that
        # are relevant for the provenance
        if output_dataset is not None:
            ds = DatasetDescriptor(identifier=output_dataset.identifier,
                                   columns=output_dataset.columns,
                                   row_count=output_dataset.row_count)
        else:
            ds = None
        read = None
        if input_dataset is not None:
            read = {dataset_name: input_dataset.identifier}
        # The default for stdout is None, which cannot be iterated over
        lines = stdout if stdout is not None else []
        return ExecResult(
            outputs=ModuleOutputs(stdout=[TextOutput(line)
                                          for line in lines]),
            provenance=ModuleProvenance(
                read=read,
                write={dataset_name: ds},
                resources=resources))
 def test_multi_append(self):
     """Test appending modules to viztrail branch."""
     viztrail_dir = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(viztrail_dir)
     viztrail = OSViztrailHandle.create_viztrail(
         identifier='ABC',
         properties=None,
         base_path=viztrail_dir)
     branch = viztrail.get_default_branch()
     # Append ten modules. Each module is added as a new workflow version.
     for i in range(10):
         now = get_current_time()
         expression = 'print ' + str(i) + '+' + str(i)
         command = python_cell(source=expression)
         module = OSModuleHandle.create_module(
             command=command,
             external_form=expression,
             state=MODULE_SUCCESS,
             datasets=dict(),
             outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
             provenance=ModuleProvenance(),
             timestamp=ModuleTimestamp(
                 created_at=now,
                 started_at=now,
                 finished_at=now),
             module_folder=viztrail.modules_folder,
             object_store=viztrail.object_store)
         head = branch.head
         modules = [module] if head is None else head.modules + [module]
         branch.append_workflow(
             modules=modules,
             action=ACTION_INSERT,
             command=command)
         self.assertEqual(len(branch.get_history()), (i + 1))
     # Reload the viztrail. Version i of the history is expected to
     # contain the first i + 1 modules.
     viztrail = OSViztrailHandle.load_viztrail(viztrail_dir)
     branch = viztrail.get_default_branch()
     history = branch.get_history()
     self.assertEqual(len(history), 10)
     for version in range(10):
         workflow = branch.get_workflow(history[version].identifier)
         self.assertEqual(len(workflow.modules), (version + 1))
         for m in range(version + 1):
             loaded = workflow.modules[m]
             self.assertEqual(loaded.external_form,
                              'print ' + str(m) + '+' + str(m))
             self.assertEqual(loaded.outputs.stdout[-1].value, str(m + m))
Example #19
0
    def compute_rename_dataset(self, args, context):
        """Execute rename dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult

        Raises
        ------
        ValueError
            If the new name is already in use or is not a valid name
        """
        # Both names are handled in lower case
        old_name = args.get_value(pckg.PARA_DATASET).lower()
        new_name = args.get_value(pckg.PARA_NAME).lower()
        # The new name has to be unused and valid
        if new_name in context.datasets:
            raise ValueError('dataset \'' + new_name + '\' exists')
        if not is_valid_name(new_name):
            raise ValueError('invalid dataset name \'' + new_name + '\'')
        # The lookup is expected to raise an exception if the dataset does
        # not exist
        ds = context.get_dataset(old_name)
        # Apply the change to a copy of the database state. The copy is not
        # returned; the deletion fails with a KeyError if the old name is
        # not a key of context.datasets.
        adjusted = dict(context.datasets)
        del adjusted[old_name]
        adjusted[new_name] = ds
        # The provenance records the dataset as written under the new name
        # and deleted under the old name
        renamed = DatasetDescriptor(
            identifier=ds.identifier,
            columns=ds.columns,
            row_count=ds.row_count)
        provenance = ModuleProvenance(
            read=dict(),
            write={new_name: renamed},
            delete=[old_name])
        return ExecResult(
            outputs=ModuleOutputs(stdout=[TextOutput('1 dataset renamed')]),
            provenance=provenance)
 def test_load_with_missing_modules(self):
     """Test loading a viztrail when the file of one module is missing.

     Builds a branch with five workflow versions, removes the file of the
     third module of the head workflow and asserts that, after the viztrail
     is loaded again, both the head workflow and that module are in error
     state.
     """
     base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(base_path)
     vt = OSViztrailHandle.create_viztrail(identifier='ABC',
                                           properties=None,
                                           base_path=base_path)
     branch = vt.get_default_branch()
     # Append five modules, creating one workflow version per module
     for i in range(5):
         ts = get_current_time()
         source = 'print ' + str(i) + '+' + str(i)
         command = python_cell(source=source)
         module = OSModuleHandle.create_module(
             command=command,
             external_form=source,
             state=MODULE_SUCCESS,
             datasets=dict(),
             outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
             provenance=ModuleProvenance(),
             timestamp=ModuleTimestamp(created_at=ts,
                                       started_at=ts,
                                       finished_at=ts),
             module_folder=vt.modules_folder,
             object_store=vt.object_store)
         # The first workflow has no predecessor to copy modules from
         if branch.head is not None:
             modules = branch.head.modules + [module]
         else:
             modules = [module]
         branch.append_workflow(modules=modules,
                                action=ACTION_INSERT,
                                command=command)
         self.assertEqual(len(branch.get_history()), (i + 1))
     # Delete the file for the third module to simulate an error condition in
     # which a file wasn't written properly
     os.remove(branch.head.modules[2].module_path)
     self.assertFalse(os.path.isfile(branch.head.modules[2].module_path))
     vt = OSViztrailHandle.load_viztrail(base_path)
     branch = vt.get_default_branch()
     self.assertTrue(branch.head.get_state().is_error)
     self.assertTrue(branch.head.modules[2].is_error)
 def test_safe_write(self):
     """Check the module state after a state update with a write error.

     A pending module is set to running and then asked to store outputs
     that contain a None entry. The test expects the in-memory handle to
     report an error state while the module that is loaded from disk is
     still in running state.
     """
     # Assemble the components of the initial (pending) module
     command = python_cell(source='print 2+2')
     timestamp = ModuleTimestamp()
     outputs = ModuleOutputs(stdout=[TextOutput('ABC')])
     descriptor = DatasetDescriptor(identifier='ID2', name='ID2')
     provenance = ModuleProvenance(read={'DS1': 'ID1'},
                                   write={'DS1': descriptor})
     handle = OSModuleHandle.create_module(command=command,
                                           external_form='TEST MODULE',
                                           state=MODULE_PENDING,
                                           module_folder=MODULE_DIR,
                                           timestamp=timestamp,
                                           outputs=outputs,
                                           provenance=provenance)
     self.assertTrue(handle.is_pending)
     handle.set_running(external_form='TEST MODULE')
     self.assertTrue(handle.is_running)
     # NOTE(review): the None entry presumably cannot be serialized, which
     # is what triggers the write error -- confirm against OSModuleHandle.
     broken_outputs = ModuleOutputs(stderr=[None])
     handle.set_success(outputs=broken_outputs)
     self.assertTrue(handle.is_error)
     # The persisted module is expected to be unaffected by the failed write
     reloaded = OSModuleHandle.load_module(identifier=handle.identifier,
                                           module_path=handle.module_path)
     self.assertTrue(reloaded.is_running)
Example #22
0
 def test_cache_active_workflows(self):
     """Test caching for workflows that are active.

     A workflow with a pending module is expected to stay in the branch
     cache even when the cache grows beyond DEFAULT_CACHE_SIZE. Once the
     pending module is set to error state, the next append is expected to
     evict that workflow together with the next-oldest cached workflow.
     """
     base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(base_path)
     vt = OSViztrailHandle.create_viztrail(identifier='ABC',
                                           properties={},
                                           base_path=base_path)
     branch = vt.get_default_branch()
     command = python_cell(source='print 2+2')

     def append_success_module():
         """Append a new successful module to the current branch head."""
         module = OSModuleHandle.create_module(
             command=command,
             external_form='print 2+2',
             state=MODULE_SUCCESS,
             timestamp=ModuleTimestamp(created_at=get_current_time(),
                                       started_at=get_current_time(),
                                       finished_at=get_current_time()),
             outputs=ModuleOutputs(stdout=[TextOutput('4')]),
             provenance=ModuleProvenance(),
             module_folder=vt.modules_folder,
             object_store=vt.object_store)
         branch.append_workflow(modules=branch.head.modules + [module],
                                action=ACTION_INSERT,
                                command=command)

     def cached_ids():
         """Return the identifiers of all workflows in the branch cache."""
         return [w.identifier for w in branch.cache]

     pending_module = OSModuleHandle.create_module(
         command=command,
         external_form='print 2+2',
         state=MODULE_PENDING,
         timestamp=ModuleTimestamp(created_at=get_current_time()),
         outputs=ModuleOutputs(),
         provenance=ModuleProvenance(),
         module_folder=vt.modules_folder,
         object_store=vt.object_store)
     wf = branch.append_workflow(modules=[pending_module],
                                 action=ACTION_INSERT,
                                 command=command)
     # The workflow that was just appended is not part of the cache yet
     self.assertNotIn(wf.identifier, cached_ids())
     # Fill the cache up to its default size
     for i in range(DEFAULT_CACHE_SIZE):
         append_success_module()
         self.assertEqual(len(branch.cache), (i + 1))
         self.assertIn(wf.identifier, cached_ids())
     append_success_module()
     # The active workflow should not be removed, i.e., the cache grows
     # beyond its default size
     self.assertEqual(len(branch.cache), DEFAULT_CACHE_SIZE + 1)
     self.assertIn(wf.identifier, cached_ids())
     # Set module state to error and append another workflow. This should
     # evict two workflows
     second_wf = branch.cache[1]
     third_wf = branch.cache[2]
     pending_module.set_error()
     append_success_module()
     # The first workflow is no longer active: it is evicted together with
     # the second workflow, while the third workflow remains cached
     self.assertEqual(len(branch.cache), DEFAULT_CACHE_SIZE)
     self.assertNotIn(wf.identifier, cached_ids())
     self.assertNotIn(second_wf.identifier, cached_ids())
     self.assertIn(third_wf.identifier, cached_ids())
Example #23
0
    def compute(self, command_id: str, arguments: "ModuleArguments",
                context: TaskContext) -> ExecResult:
        """Compute results for commands in the sampling package using
        the set of user-provided arguments and the current database
        state.

        Depending on the command, either a uniform sample or a sample that
        is stratified on one column of the input dataset is created. The
        result is registered under the output dataset name (defaults to the
        input name with suffix '_sample') and a preview of the first ten
        rows is added to the module outputs.

        Parameters
        ----------
        command_id: string
            Unique identifier for a command in a package declaration
        arguments: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult

        Raises
        ------
        ValueError
            If the input dataset is unknown.
        Exception
            If the sampling rate is not a number in [0.0, 1.0] or if the
            command identifier is unknown.
        """

        input_ds_name = arguments.get_value(cmd.PARA_INPUT_DATASET).lower()
        input_dataset: DatasetDescriptor = context.get_dataset(input_ds_name)
        if input_dataset is None:
            raise ValueError('unknown dataset \'' + input_ds_name + '\'')

        output_ds_name = arguments.get_value(cmd.PARA_OUTPUT_DATASET,
                                             raise_error=False)
        if output_ds_name is None or output_ds_name == "":
            output_ds_name = input_ds_name + "_SAMPLE"
        output_ds_name = output_ds_name.lower()

        # Load the sampling configuration. Every branch below either assigns
        # sample_mode or raises.
        if command_id == cmd.BASIC_SAMPLE:
            sampling_rate = float(arguments.get_value(cmd.PARA_SAMPLING_RATE))
            # Written as a chained comparison so that NaN (for which every
            # ordering comparison is False) is rejected as well.
            if not (0.0 <= sampling_rate <= 1.0):
                raise Exception("Sampling rate must be between 0.0 and 1.0")
            sample_mode = {
                "mode": cmd.SAMPLING_MODE_UNIFORM_PROBABILITY,
                "probability": sampling_rate
            }
        elif command_id in (cmd.MANUAL_STRATIFIED_SAMPLE,
                            cmd.AUTOMATIC_STRATIFIED_SAMPLE):
            column = arguments.get_value(cmd.PARA_STRATIFICATION_COLUMN)
            column_defn = input_dataset.columns[column]
            if command_id == cmd.MANUAL_STRATIFIED_SAMPLE:
                # One stratum per user-provided (value, probability) pair.
                # NOTE(review): unlike the basic sample, probabilities are
                # passed on without float conversion -- confirm intended.
                strata = [{
                    "value":
                    stratum.get_value(cmd.PARA_STRATUM_VALUE),
                    "probability":
                    stratum.get_value(cmd.PARA_SAMPLING_RATE)
                } for stratum in arguments.get_value(cmd.PARA_STRATA)]
            else:
                probability = arguments.get_value(cmd.PARA_SAMPLING_RATE)
                strata = self.get_automatic_strata(input_dataset, column_defn,
                                                   probability)
            sample_mode = {
                "mode": cmd.SAMPLING_MODE_STRATIFIED_ON,
                "column": column_defn.name,
                "type": column_defn.data_type,
                "strata": strata
            }
        else:
            raise Exception("Unknown sampling command: {}".format(command_id))

        # Create the sample in the backend under a unique result name
        table_name, schema = mimir.createSample(input_dataset.identifier,
                                                sample_mode,
                                                result_name="SAMPLE_" +
                                                get_unique_identifier())
        ds = MimirDatasetHandle.from_mimir_result(table_name,
                                                  schema,
                                                  properties={},
                                                  name=output_ds_name)

        # And start rendering some output
        outputs = ModuleOutputs()
        ds_output = server.api.datasets.get_dataset(
            project_id=context.project_id,
            dataset_id=ds.identifier,
            offset=0,
            limit=10)
        if ds_output is not None:
            ds_output['name'] = output_ds_name
            outputs.stdout.append(DatasetOutput(ds_output))
        else:
            outputs.stderr.append(TextOutput("Error displaying dataset"))

        # Record Reads and writes
        provenance = ModuleProvenance(
            read={input_ds_name: input_dataset.identifier},
            write={
                output_ds_name:
                DatasetDescriptor(identifier=ds.identifier,
                                  name=output_ds_name,
                                  columns=ds.columns)
            })

        # Return task result
        return ExecResult(outputs=outputs, provenance=provenance)
Example #24
0
    def execute_script(self, args: ModuleArguments,
                       context: TaskContext) -> ExecResult:
        """Execute a Python script in the given context.

        The cell source is prefixed with the source of all Python objects
        that were exported by previous cells and with a small set of helper
        definitions. The combined source is either executed in-process or
        posted to the sandbox service (if SANDBOX_PYTHON_EXECUTION is
        "True"). Output written to standard output and standard error is
        captured; any output on standard error, or an exception, marks the
        execution as failed.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult

        Raises
        ------
        RuntimeError
            If the read/write information that was collected by the client
            contains keys or values of unexpected type, or references
            unknown artifacts.
        """
        # Get Python script from user arguments.  It is the source for VizierDBClient
        cell_src = args.get_value(cmd.PYTHON_SOURCE)

        # prepend python objects exported in previous cells to the source
        exported_methods = [
            context.datastore.get_object(descriptor.identifier).decode()
            for descriptor in context.dataobjects.values()
            if descriptor.artifact_type == ARTIFACT_TYPE_PYTHON
        ]
        overrides = [
            "def show(x):", "  global vizierdb", "  vizierdb.show(x)",
            "def export(x):", "  global vizierdb",
            "  vizierdb.export_module(x)", "def return_type(dt):",
            "  def wrap(x):", "    return x", "  return wrap", "pass"
        ]

        injected_source = "\n".join(exported_methods + overrides)
        # Number of lines that precede the user's code. Used to shift line
        # numbers in error output back to the cell's own numbering.
        injected_lines = injected_source.count('\n') + 1

        source = injected_source + '\n' + cell_src

        # Initialize the scope variables that are available to the executed
        # Python script. At this point this includes only the client to access
        # and manipulate datasets in the undelying datastore
        #
        # Use "any" type, since there's a (probably unnecessary) hack down
        # below that creates something that pretends to be a client.
        client: Any = VizierDBClient(datastore=context.datastore,
                                     datasets=context.datasets,
                                     source=cell_src,
                                     dataobjects=context.dataobjects,
                                     project_id=context.project_id,
                                     output_format=args.get_value(
                                         cmd.OUTPUT_FORMAT,
                                         default_value=OUTPUT_TEXT))
        variables = {VARS_DBCLIENT: client, VARS_OPEN: client.pycell_open}
        # Redirect standard output and standard error streams
        out = sys.stdout
        err = sys.stderr
        stream: List[Tuple[str, str]] = list()
        sys.stdout = cast(TextIO, OutputStream(tag='out', stream=stream))
        sys.stderr = cast(TextIO, OutputStream(tag='err', stream=stream))
        # Keep track of exception that is thrown by the code
        exception = None
        resdata: Dict[str, Any] = dict()
        # Run the Python code
        try:
            python_cell_preload(variables, client=client)
            if SANDBOX_PYTHON_EXECUTION == "True":
                json_data = {
                    'source': source,
                    'datasets': context.datasets,
                    'dataobjects': context.dataobjects,
                    'datastore': context.datastore.__class__.__name__,
                    'basepath': context.datastore.base_path,
                    'project_id': context.project_id,
                    'output_format': client.output_format
                }
                res = requests.post(SANDBOX_PYTHON_URL, json=json_data)
                resdata = res.json()
                # Build the stand-in client completely before replacing the
                # real one, so that an incomplete response does not leave a
                # half-initialized client behind.
                sandbox_client = DotDict()
                for key, value in resdata['provenance'].items():
                    sandbox_client.setattr(key, value)
                sandbox_client.setattr('descriptors', {})
                sandbox_client.setattr('datastore', context.datastore)
                sandbox_client.setattr('datasets', resdata['datasets'])
                sandbox_client.setattr('dataobjects', resdata['dataobjects'])
                sandbox_client.setattr('output_format',
                                       resdata['output_format'])
                sandbox_client.setattr('stdout', [
                    OutputObject(type=item['type'], value=item['value'])
                    for item in resdata.get('explicit_stdout', [])
                ])
                client = sandbox_client

            else:
                # NOTE: runs user-provided source in this process by design;
                # isolation is only provided by the sandbox branch above.
                exec(source, variables, variables)

        except Exception as ex:
            exception = ex
        finally:
            # Make sure to reverse redirection of output streams
            sys.stdout = out
            sys.stderr = err
        # Set module outputs
        outputs = ModuleOutputs()
        is_success = (exception is None)
        if SANDBOX_PYTHON_EXECUTION == "True":
            # resdata is empty if the sandbox request itself failed, so the
            # keys cannot be assumed to exist here.
            for text in resdata.get('stdout', []):
                outputs.stdout.append(
                    OutputObject(value=text, type=client.output_format))
            for text in resdata.get('stderr', []):
                outputs.stderr.append(TextOutput(text))
                is_success = False
        else:
            for tag, text in stream:
                text = ''.join(text).strip()
                if tag == 'out':
                    outputs.stdout.append(
                        OutputObject(value=text, type=client.output_format))
                else:
                    # Any output on standard error marks the cell as failed
                    outputs.stderr.append(TextOutput(text))
                    is_success = False
        for output in client.stdout:
            outputs.stdout.append(output)

        if is_success:
            # Create provenance information. Ensure that all dictionaries
            # contain elements of expected types, i.e, ensure that the user did
            # not attempt anything tricky.
            read = dict()
            for name in client.read:
                if not isinstance(name, str):
                    raise RuntimeError('invalid key for mapping dictionary')
                # Datasets take precedence over other data objects
                if name in context.datasets:
                    artifact = context.datasets[name]
                elif name in context.dataobjects:
                    artifact = context.dataobjects[name]
                else:
                    raise RuntimeError('Unknown read artifact {}'.format(name))
                read[name] = artifact.identifier
                if not isinstance(read[name], str):
                    raise RuntimeError(
                        'invalid element in read mapping dictionary: {} (expecting str)'
                        .format(read[name]))
            write = dict()
            for name in client.write:
                if not isinstance(name, str):
                    raise RuntimeError('invalid key for mapping dictionary')

                if name in client.datasets:
                    write_descriptor = client.datasets[name]
                elif name in client.dataobjects:
                    write_descriptor = client.dataobjects[name]
                else:
                    raise RuntimeError(
                        'Unknown write artifact {}'.format(name))
                if not isinstance(write_descriptor, ArtifactDescriptor):
                    raise RuntimeError(
                        'invalid element in write mapping dictionary: {} (expecting ArtifactDescriptor)'
                        .format(name))
                write[name] = write_descriptor
            print("Pycell Execution Finished")
            print("     read: {}".format(read))
            print("     write: {}".format(write))
            provenance = ModuleProvenance(read=read,
                                          write=write,
                                          delete=client.delete)
        else:
            print("ERROR: {}".format(exception))
            # The run may have failed only because something was written to
            # standard error; in that case there is no exception to report.
            if exception is not None:
                outputs.error(exception, offset_lines=-injected_lines)
            provenance = ModuleProvenance()
        # Return execution result
        return ExecResult(is_success=is_success,
                          outputs=outputs,
                          provenance=provenance)
Example #25
0
 def test_branch_cache(self):
     """Test the workflow cache that is maintained by a viztrail branch.

     Appends DEFAULT_CACHE_SIZE + 2 workflows and checks the cache size and
     the presence of the first workflow after each step. Afterwards the
     viztrail is loaded from disk and the same expectations are checked for
     workflows that are retrieved via get_workflow.
     """
     repo_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(repo_path)
     viztrail = OSViztrailHandle.create_viztrail(identifier='ABC',
                                                 properties={},
                                                 base_path=repo_path)
     branch = viztrail.get_default_branch()
     cell = python_cell(source='print 2+2')

     def new_module():
         """Create a successful module in the viztrail's module folder."""
         timestamp = ModuleTimestamp(created_at=get_current_time(),
                                     started_at=get_current_time(),
                                     finished_at=get_current_time())
         return OSModuleHandle.create_module(
             command=cell,
             external_form='print 2+2',
             state=MODULE_SUCCESS,
             timestamp=timestamp,
             outputs=ModuleOutputs(stdout=[TextOutput('4')]),
             provenance=ModuleProvenance(),
             module_folder=viztrail.modules_folder,
             object_store=viztrail.object_store)

     def cached_ids(current_branch):
         """List the identifiers of the workflows in the branch cache."""
         return [cached.identifier for cached in current_branch.cache]

     first_wf = branch.append_workflow(modules=[new_module()],
                                       action=ACTION_INSERT,
                                       command=cell)
     # The workflow that was appended last is not in the cache
     self.assertFalse(first_wf.identifier in cached_ids(branch))
     # Every further append adds one workflow to the cache until it is full
     for expected_size in range(1, DEFAULT_CACHE_SIZE + 1):
         appended = new_module()
         branch.append_workflow(modules=branch.head.modules + [appended],
                                action=ACTION_INSERT,
                                command=cell)
         self.assertEqual(len(branch.cache), expected_size)
         self.assertTrue(first_wf.identifier in cached_ids(branch))
     # One more append exceeds the cache size and evicts the first workflow
     appended = new_module()
     branch.append_workflow(modules=branch.head.modules + [appended],
                            action=ACTION_INSERT,
                            command=cell)
     self.assertEqual(len(branch.cache), DEFAULT_CACHE_SIZE)
     self.assertFalse(first_wf.identifier in cached_ids(branch))
     # A freshly loaded viztrail starts with an empty cache
     reloaded = OSViztrailHandle.load_viztrail(repo_path)
     branch = reloaded.get_default_branch()
     self.assertEqual(len(branch.cache), 0)
     self.assertFalse(first_wf.identifier in cached_ids(branch))
     # Retrieving a workflow adds it to the cache
     branch.get_workflow(first_wf.identifier)
     self.assertTrue(first_wf.identifier in cached_ids(branch))
     # Retrieving all other workflows pushes the first one out again
     for descriptor in branch.get_history():
         if descriptor.identifier == first_wf.identifier:
             continue
         branch.get_workflow(descriptor.identifier)
     self.assertEqual(len(branch.cache), DEFAULT_CACHE_SIZE)
     self.assertFalse(first_wf.identifier in cached_ids(branch))
Example #26
0
    def execute_script(self, args, context):
        """Run the Python source of a cell and collect outputs and provenance.

        Standard output and standard error are redirected while the script
        runs. The execution counts as failed if the script raises an
        exception or if anything was written to standard error.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult

        Raises
        ------
        RuntimeError
            If the read/write information collected by the client contains
            keys or values that are not strings.
        """
        script = args.get_value(cmd.PYTHON_SOURCE)
        # The only variable exposed to the script is the datastore client
        client = VizierDBClient(datastore=context.datastore,
                                datasets=context.datasets)
        scope = {VARS_DBCLIENT: client}
        # Swap in capturing streams; remember the originals for restoring
        original_streams = (sys.stdout, sys.stderr)
        captured = []
        sys.stdout = OutputStream(tag='out', stream=captured)
        sys.stderr = OutputStream(tag='err', stream=captured)
        error = None
        try:
            python_cell_preload(scope)
            exec(script, scope, scope)
        except Exception as ex:
            error = ex
        finally:
            # Always undo the stream redirection
            sys.stdout, sys.stderr = original_streams
        # Convert captured output into module outputs
        outputs = ModuleOutputs()
        succeeded = error is None
        for tag, chunks in captured:
            content = ''.join(chunks).strip()
            if tag != 'out':
                # Output on standard error marks the execution as failed
                outputs.stderr.append(TextOutput(content))
                succeeded = False
            else:
                outputs.stdout.append(HtmlOutput(content))
        if not succeeded:
            outputs.error(error)
            provenance = ModuleProvenance()
        else:
            # Validate the names and identifiers that the client recorded
            # before they are turned into provenance information.
            read = {}
            for name in client.read:
                if not isinstance(name, str):
                    raise RuntimeError('invalid key for mapping dictionary')
                if name not in context.datasets:
                    read[name] = None
                    continue
                read[name] = context.datasets[name]
                if not isinstance(read[name], str):
                    raise RuntimeError('invalid element in mapping dictionary')
            write = {}
            for name in client.write:
                if not isinstance(name, str):
                    raise RuntimeError('invalid key for mapping dictionary')
                ds_id = client.datasets[name]
                if ds_id is None:
                    write[name] = None
                elif not isinstance(ds_id, str):
                    raise RuntimeError('invalid value in mapping dictionary')
                elif ds_id in client.descriptors:
                    # Prefer descriptors that the client already holds
                    write[name] = client.descriptors[ds_id]
                else:
                    write[name] = client.datastore.get_descriptor(ds_id)
            provenance = ModuleProvenance(read=read,
                                          write=write,
                                          delete=client.delete)
        return ExecResult(is_success=succeeded,
                          outputs=outputs,
                          provenance=provenance)
Example #27
0
    def execute_query(self, args: ModuleArguments,
                      context: TaskContext) -> ExecResult:
        """Execute a SQL query in the given context.

        Creates a view for the query in the Mimir backend, adds a preview of
        the first ten result rows to the module outputs and records the
        datasets and data objects that the query depends on. Any exception
        that is raised while the view is created or rendered is reported in
        the module outputs and results in an unsuccessful ExecResult.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult

        Raises
        ------
        ValueError
            If the datastore does not know a dataset that is listed in the
            context.
        """
        # Get SQL source code that is in this cell and the global
        # variables. The source is passed on unchanged.
        source = args.get_value(cmd.PARA_SQL_SOURCE)
        ds_name = args.get_value(cmd.PARA_OUTPUT_DATASET, raise_error=False)
        # Get mapping of datasets in the context to their respective table
        # name in the Mimir backend
        mimir_table_names = dict()
        for ds_name_o in context.datasets:
            dataset_id = context.datasets[ds_name_o].identifier
            dataset = context.datastore.get_dataset(dataset_id)
            if dataset is None:
                raise ValueError('unknown dataset \'' + ds_name_o + '\'')
            mimir_table_names[ds_name_o] = dataset.identifier
        # Module outputs
        outputs = ModuleOutputs()
        is_success = True
        # Python data objects in the context, by name
        functions = {
            name: context.dataobjects[name].identifier
            for name in context.dataobjects
            if context.dataobjects[name].obj_type == ARTIFACT_TYPE_PYTHON
        }
        try:
            # Create the view from the SQL source
            # NOTE(review): functionDeps is never used; the function
            # dependencies below are derived from `dependencies` -- confirm
            # that this is intended.
            view_name, dependencies, mimirSchema, properties, functionDeps = mimir.createView(
                datasets=mimir_table_names,
                query=source,
                functions=functions)
            # NOTE(review): the handle is created before the default name is
            # applied, so it receives the user-provided (possibly empty) name.
            ds = MimirDatasetHandle.from_mimir_result(view_name, mimirSchema,
                                                      properties, ds_name)

            # Debug output; written to the process's standard output
            print(mimirSchema)

            if ds_name is None or ds_name == '':
                ds_name = "TEMPORARY_RESULT"

            from vizier.api.webservice import server

            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10)
            if ds_output is None:
                outputs.stderr.append(
                    TextOutput("Error displaying dataset {}".format(ds_name)))
            else:
                ds_output['name'] = ds_name
                outputs.stdout.append(DatasetOutput(ds_output))

            # Resolve each dependency name against the datasets and the data
            # objects in the context; names that are unknown are skipped.
            dependenciesDict: Dict[str, str] = dict()
            functionDepDict: Dict[str, str] = dict()
            for dep_name in dependencies:
                key = dep_name.lower()
                dataset_dep = context.datasets.get(key, None)
                if dataset_dep is not None:
                    dependenciesDict[key] = get_artifact_id(dataset_dep)
            for dep_name in dependencies:
                key = dep_name.lower()
                object_dep = context.dataobjects.get(key, None)
                if object_dep is not None:
                    functionDepDict[key] = get_artifact_id(object_dep)

            # Data object entries override dataset entries of the same name
            provenance = ModuleProvenance(write={
                ds_name:
                DatasetDescriptor(identifier=ds.identifier,
                                  name=ds_name,
                                  columns=ds.columns)
            },
                                          read={
                                              **dependenciesDict,
                                              **functionDepDict
                                          })
        except Exception as ex:
            provenance = ModuleProvenance()
            outputs.error(ex)
            is_success = False
        # Return execution result
        return ExecResult(is_success=is_success,
                          outputs=outputs,
                          provenance=provenance)
Example #28
0
 def test_outputs(self):
     """Check that module outputs are preserved by a create/load round trip.

     Four pending modules are created through OSModuleHandle.create_module,
     each with a different combination of standard output and standard
     error entries, and then read back with OSModuleHandle.load_module.
     The reloaded output streams are compared against what was supplied.
     """
     def roundtrip(outputs):
         """Create a pending test module with *outputs* and load it again."""
         stored = OSModuleHandle.create_module(
             command=python_cell(source='print 2+2'),
             external_form='TEST MODULE',
             state=MODULE_PENDING,
             outputs=outputs,
             provenance=ModuleProvenance(),
             timestamp=ModuleTimestamp(),
             module_folder=MODULE_DIR)
         return OSModuleHandle.load_module(identifier=stored.identifier,
                                           module_path=stored.module_path)

     # Case 1: both streams are empty
     reloaded = roundtrip(ModuleOutputs())
     self.assertEqual(len(reloaded.outputs.stderr), 0)
     self.assertEqual(len(reloaded.outputs.stdout), 0)

     # Case 2: one text entry on standard error, nothing on standard output
     reloaded = roundtrip(ModuleOutputs(stderr=[TextOutput('Some text')]))
     self.assertEqual(len(reloaded.outputs.stderr), 1)
     first_err = reloaded.outputs.stderr[0]
     self.assertTrue(first_err.is_text)
     self.assertEqual(first_err.value, 'Some text')
     self.assertEqual(len(reloaded.outputs.stdout), 0)

     # Case 3: a text entry and a chart entry on standard output only
     reloaded = roundtrip(
         ModuleOutputs(stdout=[
             TextOutput('Some text'),
             OutputObject(type='chart', value='123')
         ]))
     self.assertEqual(len(reloaded.outputs.stdout), 2)
     text_entry = reloaded.outputs.stdout[0]
     self.assertTrue(text_entry.is_text)
     self.assertEqual(text_entry.value, 'Some text')
     chart_entry = reloaded.outputs.stdout[1]
     self.assertFalse(chart_entry.is_text)
     self.assertEqual(chart_entry.value, '123')
     self.assertEqual(len(reloaded.outputs.stderr), 0)

     # Case 4: content on both streams at the same time
     reloaded = roundtrip(
         ModuleOutputs(stderr=[TextOutput('Some text')],
                       stdout=[
                           TextOutput('Some text'),
                           OutputObject(type='chart', value='123')
                       ]))
     self.assertEqual(len(reloaded.outputs.stdout), 2)
     self.assertEqual(len(reloaded.outputs.stderr), 1)
 def test_success(self) -> None:
     """Drive a module from pending via running to success and verify it.

     The success state is set twice: first without any arguments, where the
     test expects empty outputs and empty read/write provenance, and then
     with an explicit finish time, outputs and provenance. After each
     transition the same expectations are checked on the in-memory handle
     and on a handle reloaded through OSModuleHandle.load_module.
     """
     def sample_provenance():
         """Build the read/write provenance used for this test module."""
         return ModuleProvenance(
             read={'DS1': 'ID1'},
             write={'DS1': DatasetDescriptor(identifier='ID2',
                                             name='ID2')})

     def expect_bare_success(handle):
         """Assert the state expected after set_success() with no arguments."""
         self.assertTrue(handle.is_success)
         self.assertIsNotNone(handle.timestamp.started_at)
         self.assertIsNotNone(handle.timestamp.finished_at)
         self.assertEqual(len(handle.outputs.stderr), 0)
         self.assertEqual(len(handle.outputs.stdout), 0)
         self.assertTrue(handle.provenance.read == {})
         self.assertTrue(handle.provenance.write == {})

     def expect_full_success(handle, finished):
         """Assert the state expected after set_success() with all options."""
         self.assertTrue(handle.is_success)
         self.assertIsNotNone(handle.timestamp.started_at)
         self.assertIsNotNone(handle.timestamp.finished_at)
         self.assertEqual(handle.timestamp.finished_at, finished)
         self.assertEqual(len(handle.outputs.stderr), 0)
         self.assertEqual(len(handle.outputs.stdout), 1)
         self.assertEqual(handle.outputs.stdout[0].value, 'XYZ')
         self.assertIsNotNone(handle.provenance.read)
         self.assertEqual(handle.provenance.read['DS1'], 'ID1')
         self.assertIsNotNone(handle.provenance.write)
         self.assertEqual(handle.provenance.write['DS1'].identifier, 'ID2')

     # Start from a pending module that already carries outputs/provenance
     handle = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         module_folder=MODULE_DIR,
         timestamp=ModuleTimestamp(),
         outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
         provenance=sample_provenance())
     self.assertTrue(handle.is_pending)

     # pending -> running -> success, with no optional arguments
     handle.set_running(external_form='TEST MODULE')
     handle.set_success()
     expect_bare_success(handle)

     # Reload the module and make sure all changes were materialized
     handle = OSModuleHandle.load_module(identifier=handle.identifier,
                                         module_path=handle.module_path)
     expect_bare_success(handle)

     # Set success again, this time supplying every optional parameter
     finished = get_current_time()
     handle.set_success(
         finished_at=finished,
         outputs=ModuleOutputs(stdout=[TextOutput('XYZ')]),
         provenance=sample_provenance())
     expect_full_success(handle, finished)

     # NOTE(review): the module is reloaded twice back-to-back and only the
     # second handle (loaded with an empty prev_state) is asserted on;
     # confirm whether the first reload is intentional.
     handle = OSModuleHandle.load_module(identifier=handle.identifier,
                                         module_path=handle.module_path)
     handle = OSModuleHandle.load_module(identifier=handle.identifier,
                                         module_path=handle.module_path,
                                         prev_state=dict())
     expect_full_success(handle, finished)
 def test_error(self):
     """Drive a module from pending to error and verify the stored state.

     The error state is set twice: first without arguments, where the test
     expects empty outputs, and then with an explicit finish time and a
     standard error message. In both cases the provenance given at creation
     time (including the 'fileid' resource) is expected to still be present,
     both on the in-memory handle and after reloading the module.
     """
     def expect_provenance_kept(handle):
         """Assert that read, write and resource provenance are still set."""
         self.assertIsNotNone(handle.provenance.read)
         self.assertIsNotNone(handle.provenance.write)
         self.assertIsNotNone(handle.provenance.resources)
         self.assertEqual(handle.provenance.resources['fileid'], '0123456789')

     def expect_bare_error(handle):
         """Assert the state expected after set_error() with no arguments."""
         self.assertTrue(handle.is_error)
         self.assertIsNotNone(handle.timestamp.finished_at)
         self.assertEqual(len(handle.outputs.stderr), 0)
         self.assertEqual(len(handle.outputs.stdout), 0)
         expect_provenance_kept(handle)

     def expect_reported_error(handle, finished):
         """Assert the state expected after set_error() with time and output."""
         self.assertTrue(handle.is_error)
         self.assertIsNotNone(handle.timestamp.finished_at)
         self.assertEqual(handle.timestamp.finished_at, finished)
         self.assertEqual(len(handle.outputs.stderr), 1)
         self.assertEqual(handle.outputs.stderr[0].value, 'Some Error')
         self.assertEqual(len(handle.outputs.stdout), 0)
         expect_provenance_kept(handle)

     # Start from a pending module with outputs and full provenance
     module = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         module_folder=MODULE_DIR,
         outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
         provenance=ModuleProvenance(
             read={'DS1': 'ID1'},
             write={'DS1': DatasetDescriptor(identifier='ID2', name='ID2')},
             resources={'fileid': '0123456789'}),
         timestamp=ModuleTimestamp())

     # pending -> error, with no optional arguments
     module.set_error()
     expect_bare_error(module)

     # Reload the module and make sure all changes were materialized
     module = OSModuleHandle.load_module(identifier=module.identifier,
                                         module_path=module.module_path)
     expect_bare_error(module)

     # Set the error state again with a finish time and an error message
     ts = get_current_time()
     module.set_error(
         finished_at=ts,
         outputs=ModuleOutputs(stderr=[TextOutput('Some Error')]))
     expect_reported_error(module, ts)

     # The reloaded module must show the same timestamp and error output
     module = OSModuleHandle.load_module(identifier=module.identifier,
                                         module_path=module.module_path)
     expect_reported_error(module, ts)