def execute_script(self, args, context):
    """Execute a R script in the given context.

    Every dataset in the task context is resolved through the datastore and
    handed to ``mimir.evalR`` together with the script source. While this
    happens ``sys.stdout`` and ``sys.stderr`` are redirected into an
    in-memory stream. The original streams are restored in all cases,
    including when a dataset cannot be resolved.

    Parameters
    ----------
    args: vizier.viztrail.command.ModuleArguments
        User-provided command arguments
    context: vizier.engine.task.base.TaskContext
        Context in which a task is being executed

    Returns
    -------
    vizier.engine.task.processor.ExecResult

    Raises
    ------
    ValueError
        If a dataset listed in the context is unknown to the datastore
    """
    # Get R script from user arguments
    source = args.get_value(cmd.PARA_R_SOURCE)
    outputs = ModuleOutputs()
    # Redirect standard output and standard error streams
    out = sys.stdout
    err = sys.stderr
    stream = list()
    sys.stdout = OutputStream(tag='out', stream=stream)
    sys.stderr = OutputStream(tag='err', stream=stream)
    try:
        # Resolve datasets inside the try/finally so that a lookup failure
        # cannot leave the process-wide streams redirected.
        mimir_table_names = dict()
        for ds_name_o in context.datasets:
            dataset_id = context.datasets[ds_name_o]
            dataset = context.datastore.get_dataset(dataset_id)
            if dataset is None:
                raise ValueError('unknown dataset \'' + ds_name_o + '\'')
            mimir_table_names[ds_name_o] = dataset.identifier
        # Run the r code. Errors raised by the evaluation are reported via
        # the module outputs instead of being propagated.
        try:
            evalresp = mimir.evalR(mimir_table_names, source)
            ostd = evalresp['stdout']
            oerr = evalresp['stderr']
            if ostd != '':
                outputs.stdout.append(HtmlOutput(ostd))
            if oerr != '':
                outputs.stderr.append(TextOutput(oerr))
        except Exception as ex:
            outputs.error(ex)
    finally:
        # Make sure to reverse redirection of output streams
        sys.stdout = out
        sys.stderr = err
    # Set module outputs from everything captured while redirected
    for tag, text in stream:
        text = ''.join(text).strip()
        if tag == 'out':
            outputs.stdout.append(HtmlOutput(text))
        else:
            outputs.stderr.append(TextOutput(text))
    provenance = ModuleProvenance()
    # Return execution result. Any entry in STDERR marks the run as failed.
    return ExecResult(
        is_success=(len(outputs.stderr) == 0),
        outputs=outputs,
        provenance=provenance
    )
def compute_empty_dataset(self, args, context):
    """Execute empty dataset command.

    Creates a Mimir view with a single default column and registers it in
    the datastore under the user-provided (lower-cased) name.

    Parameters
    ----------
    args: vizier.viztrail.command.ModuleArguments
        User-provided command arguments
    context: vizier.engine.task.base.TaskContext
        Context in which a task is being executed

    Returns
    -------
    vizier.engine.task.processor.ExecResult

    Raises
    ------
    ValueError
        If the dataset name already exists in the context or is not valid
    """
    outputs = ModuleOutputs()
    # Pairs of (SQL default value expression, column name)
    default_columns = [("''", "unnamed_column")]
    ds_name = args.get_value(pckg.PARA_NAME).lower()
    if ds_name in context.datasets:
        raise ValueError("dataset '" + ds_name + "' exists")
    if not is_valid_name(ds_name):
        raise ValueError("invalid dataset name '" + ds_name + "'")
    try:
        select_list = ", ".join(
            expr + " AS " + name for expr, name in default_columns
        )
        source = "SELECT {};".format(select_list)
        view_name, _dependencies = mimir.createView(dict(), source)
        # Column identifiers are the positions in the default column list
        columns = [
            MimirDatasetColumn(identifier=position, name_in_dataset=column[1])
            for position, column in enumerate(default_columns)
        ]
        ds = context.datastore.register_dataset(
            table_name=view_name,
            columns=columns,
            row_counter=1
        )
        descriptor = DatasetDescriptor(
            identifier=ds.identifier,
            columns=ds.columns,
            row_count=ds.row_count
        )
        # The empty read dictionary explicitly declares a lack of
        # dependencies.
        provenance = ModuleProvenance(write={ds_name: descriptor}, read=dict())
        outputs.stdout.append(
            TextOutput("Empty dataset '{}' created".format(ds_name))
        )
    except Exception as ex:
        provenance = ModuleProvenance()
        outputs.error(ex)
    succeeded = len(outputs.stderr) == 0
    return ExecResult(
        is_success=succeeded,
        outputs=outputs,
        provenance=provenance
    )
def test_init(self):
    """Test initialization of the output streams, with and without given
    stream contents.
    """
    # A fresh object starts with two empty streams
    empty = ModuleOutputs()
    self.assertEqual(len(empty.stderr), 0)
    self.assertEqual(len(empty.stdout), 0)
    # Add one object per stream and re-create the outputs from the streams
    empty.stdout.append(TextOutput(value='Hello World'))
    empty.stderr.append(OutputObject(type='ERROR', value='Some Error'))
    copied = ModuleOutputs(stdout=empty.stdout, stderr=empty.stderr)
    self.assertEqual(len(copied.stderr), 1)
    self.assertEqual(len(copied.stdout), 1)
    first_out = copied.stdout[0]
    first_err = copied.stderr[0]
    self.assertTrue(first_out.is_text)
    self.assertFalse(first_err.is_text)
def execute_script(self, args, context):
    """Execute a Markdown script in the given context.

    The Markdown source is passed through unchanged as a Markdown output.
    Anything written to the redirected standard streams in the meantime is
    appended to the module outputs as well.

    Parameters
    ----------
    args: vizier.viztrail.command.ModuleArguments
        User-provided command arguments
    context: vizier.engine.task.base.TaskContext
        Context in which a task is being executed

    Returns
    -------
    vizier.engine.task.processor.ExecResult
    """
    # Get Markdown script from user arguments
    markdown = args.get_value(cmd.PARA_MARKDOWN_SOURCE)
    # Redirect standard output and standard error streams
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    captured = []
    sys.stdout = OutputStream(tag='out', stream=captured)
    sys.stderr = OutputStream(tag='err', stream=captured)
    outputs = ModuleOutputs()
    try:
        # we should validate the markdown here
        error_text = ''
        if markdown != '':
            outputs.stdout.append(MarkdownOutput(markdown))
        if error_text != '':
            outputs.stderr.append(TextOutput(error_text))
    except Exception as ex:
        outputs.error(ex)
    finally:
        # Make sure to reverse redirection of output streams
        sys.stdout, sys.stderr = saved_stdout, saved_stderr
    # Set module outputs from the captured stream entries
    for tag, text in captured:
        cleaned = ''.join(text).strip()
        if tag != 'out':
            outputs.stderr.append(TextOutput(cleaned))
        else:
            outputs.stdout.append(MarkdownOutput(cleaned))
    provenance = ModuleProvenance()
    # Return execution result
    succeeded = len(outputs.stderr) == 0
    return ExecResult(
        is_success=succeeded,
        outputs=outputs,
        provenance=provenance
    )
def exec_command(task_id, command, context, processor):
    """Run a command through a package task processor.

    Any exception raised by the processor is converted into an unsuccessful
    execution result that carries the error in its outputs.

    Parameters
    ----------
    task_id: string
        Unique task identifier
    command : vizier.viztrail.command.ModuleCommand
        Specification of the command that is to be executed
    context: vizier.engine.task.base.TaskContext
        Context for the executed task
    processor: vizier.engine.task.processor.TaskProcessor
        Task processor to execute the given command

    Returns
    -------
    (string, vizier.engine.task.processor.ExecResult)
    """
    try:
        return task_id, processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=context
        )
    except Exception as ex:
        failure = ModuleOutputs().error(ex)
        return task_id, ExecResult(is_success=False, outputs=failure)
def __init__(self,
             is_success: bool = True,
             outputs: ModuleOutputs = None,
             provenance: ModuleProvenance = None,
             updated_arguments: ModuleArguments = None):
    """Initialize the result components.

    Outputs and provenance default to fresh, empty objects for every
    result. They are deliberately not created in the signature: a default
    built there is evaluated once and would be shared (and mutated) by all
    results that rely on it.

    Parameters
    ----------
    is_success: bool
        Flag indicating if execution was successful
    outputs: vizier.viztrail.module.output.ModuleOutputs, optional
        Outputs to STDOUT and STDERR generated during task execution
    provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
        Provenance information about datasets that were read and writen
        during task execution.
    updated_arguments: vizier.viztrail.command.ModuleArguments, optional
        If provided, the module's arguments will be overridden by the
        provided argument list. This functionality should *only* be used
        when the module needs to infer/guess some of its arguments (e.g.,
        Load Dataset needs to actually try to load the dataset to have
        type/name information for columns). If updated arguments are
        provided, it is up to the processor to guarantee that the updated
        arguments are *idempotent* with its current execution (although
        idempotence with changes to the data and/or processor
        implementation need not be enforced.)
    """
    self.is_success = is_success
    self.outputs = outputs if outputs is not None else ModuleOutputs()
    self.provenance = (
        provenance if provenance is not None else ModuleProvenance()
    )
    self.updated_arguments = updated_arguments
def OUTPUTS(obj):
    """Build a ModuleOutputs object from its default dictionary
    serialization.

    The dictionary is expected to have the streams under 'stdout' and
    'stderr', each a list of objects with 'type' and 'value' entries.

    Parameters
    ----------
    obj: dict
        Default output serialization for a pair of module output streams

    Returns
    -------
    vizier.viztrail.module.output.ModuleOutputs

    Raises
    ------
    ValueError
        If a required key is missing from the serialization
    """
    def deserialize(stream_key):
        # Convert all serialized objects of one stream
        return [
            OutputObject(type=item['type'], value=item['value'])
            for item in obj[stream_key]
        ]

    try:
        return ModuleOutputs(
            stdout=deserialize('stdout'),
            stderr=deserialize('stderr')
        )
    except KeyError as ex:
        raise ValueError(ex)
def compute_drop_dataset(self, args, context):
    """Execute drop dataset command.

    The dataset itself is not touched here. The deletion is expressed only
    through the provenance of the returned result.

    Parameters
    ----------
    args: vizier.viztrail.command.ModuleArguments
        User-provided command arguments
    context: vizier.engine.task.base.TaskContext
        Context in which a task is being executed

    Returns
    -------
    vizier.engine.task.processor.ExecResult
    """
    ds_name = args.get_value(pckg.PARA_DATASET).lower()
    # Both lookups act as existence checks for the dataset name. The lookup
    # in the copied dictionary raises a KeyError for an unknown name.
    context.get_dataset(ds_name)
    remaining = dict(context.datasets)
    del remaining[ds_name]
    message = TextOutput("Dataset '" + ds_name + "' deleted")
    provenance = ModuleProvenance(
        read=dict(),
        write=dict(),
        delete=[ds_name]
    )
    return ExecResult(
        outputs=ModuleOutputs(stdout=[message]),
        provenance=provenance
    )
def test_single_append(self):
    """Test appending a single module to an empty viztrail branch."""
    base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
    os.makedirs(base_path)
    viztrail = OSViztrailHandle.create_viztrail(
        identifier='ABC',
        properties={},
        base_path=base_path
    )
    branch = viztrail.get_default_branch()
    source = 'print 2+2'
    command = python_cell(source=source)
    now = get_current_time()
    module = OSModuleHandle.create_module(
        command=command,
        external_form=source,
        state=MODULE_SUCCESS,
        outputs=ModuleOutputs(stdout=[TextOutput('4')]),
        provenance=ModuleProvenance(),
        timestamp=ModuleTimestamp(
            created_at=now,
            started_at=now,
            finished_at=now
        ),
        module_folder=viztrail.modules_folder,
        object_store=viztrail.object_store
    )
    workflow = branch.append_workflow(
        modules=[module],
        action=ACTION_INSERT,
        command=command
    )
    # There has to be one file for the workflow handle and one for the new
    # module.
    workflow_file = os.path.join(branch.base_path, workflow.identifier)
    self.assertTrue(os.path.isfile(workflow_file))
    module_file = os.path.join(workflow.modules[-1].module_path)
    self.assertTrue(os.path.isfile(module_file))
    # Reload the viztrail and inspect the module at the branch head
    reloaded = OSViztrailHandle.load_viztrail(base_path)
    head_module = reloaded.get_default_branch().get_head().modules[-1]
    self.assertEqual(head_module.external_form, 'print 2+2')
    self.assertEqual(head_module.outputs.stdout[-1].value, '4')
def set_success(self,
                finished_at: Optional[datetime] = None,
                outputs: Optional[ModuleOutputs] = None,
                provenance: Optional[ModuleProvenance] = None,
                updated_arguments: Optional[ModuleArguments] = None
                ):
    """Set status of the module to success and materialize the new state.

    The finished_at property of the timestamp is set to the given value or
    the current time (if None). The defaults are resolved inside the method
    because a default created in the signature is evaluated only once: the
    timestamp would be frozen at definition time and the output/provenance
    objects would be shared between modules.

    Parameters
    ----------
    finished_at: datetime.datetime, optional
        Timestamp when module finished running
    outputs: vizier.viztrail.module.output.ModuleOutputs, optional
        Output streams for module
    provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
        Provenance information about datasets that were read and writen by
        previous execution of the module.
    updated_arguments: vizier.viztrail.command.ModuleArguments, optional
        Replacement for the arguments of the module command
    """
    if finished_at is None:
        finished_at = get_current_time()
    if outputs is None:
        outputs = ModuleOutputs()
    if provenance is None:
        provenance = ModuleProvenance()
    # Update state, timestamp, database state, outputs and provenance
    # information.
    super().set_success(finished_at, outputs, provenance, updated_arguments)
    # Materialize module state
    self.write_safe()
def set_error(
    self,
    task_id: str,
    finished_at: Optional[datetime] = None,
    outputs: Optional[ModuleOutputs] = None
) -> Optional[bool]:
    """Set status of the module that is associated with the given task
    identifier to error. The finished_at property of the timestamp is set
    to the given value or the current time (if None). The module outputs
    are adjusted to the given value. The output streams are empty if no
    value is given for the outputs parameter.

    Cancels all pending modules in the workflow.

    Returns True if the state of the workflow was changed and False
    otherwise. The result is None if the project or task did not exist.

    Implementations have to resolve the None defaults themselves. The
    defaults are not created in the signature since they would be evaluated
    only once, at definition time.

    Parameters
    ----------
    task_id : string
        Unique task identifier
    finished_at: datetime.datetime, optional
        Timestamp when module finished running
    outputs: vizier.viztrail.module.output.ModuleOutputs, optional
        Output streams for module

    Returns
    -------
    bool
    """
    raise NotImplementedError
def set_success(self,
                finished_at: Optional[datetime] = None,
                outputs: Optional[ModuleOutputs] = None,
                provenance: Optional[ModuleProvenance] = None,
                updated_arguments: Optional[ModuleArguments] = None):
    """Set status of the module to success.

    The finished_at property of the timestamp is set to the given value or
    the current time (if None). The module outputs and provenance are
    replaced by the given values, or by new empty objects if None. The
    defaults are resolved here rather than in the signature, where they
    would be evaluated only once and then shared by all calls.

    Parameters
    ----------
    finished_at: datetime.datetime, optional
        Timestamp when module finished running
    outputs: vizier.viztrail.module.output.ModuleOutputs, optional
        Output streams for module
    provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
        Provenance information about datasets that were read and writen by
        previous execution of the module.
    updated_arguments: vizier.viztrail.command.ModuleArguments, optional
        If given, replaces the arguments of the module command
    """
    if finished_at is None:
        finished_at = get_current_time()
    # Update state, timestamp, outputs and provenance information.
    self.state = MODULE_SUCCESS
    self.timestamp.finished_at = finished_at
    # If the module is set to success straight from pending state the
    # started_at timestamp may not have been set.
    if self.timestamp.started_at is None:
        self.timestamp.started_at = self.timestamp.finished_at
    if updated_arguments is not None:
        self.command.arguments = updated_arguments
    self.outputs = outputs if outputs is not None else ModuleOutputs()
    self.provenance = (
        provenance if provenance is not None else ModuleProvenance()
    )
def execute(task_id, project_id, command_doc, context, resources):
    """Execute the given command and report the outcome to the workflow
    controller of the project.

    Parameters
    ----------
    task_id: string
        Unique task identifier
    project_id: string
        Unique project identifier
    command_doc : dict
        Dictionary serialization of the module command
    context: dict
        Dictionary of available resources in the database state. The key is
        the resource name. Values are resource identifiers.
    resources: dict
        Optional information about resources that were generated during a
        previous execution of the command
    """
    # Create a remote workflow controller for the given task and notify it
    # that the task started to run
    controller = worker_env.get_controller(project_id)
    controller.set_running(task_id=task_id, started_at=get_current_time())
    command = ModuleCommand.from_dict(command_doc)
    if command.package_id not in worker_env.processors:
        # Unknown packages result in an error without running anything
        message = "unknown package '" + str(command.package_id) + "'"
        exec_result = ExecResult(
            is_success=False,
            outputs=ModuleOutputs(stderr=[TextOutput(message)])
        )
    else:
        processor = worker_env.processors[command.package_id]
        task_context = TaskContext(
            project_id=project_id,
            datastore=worker_env.datastores.get_datastore(project_id),
            filestore=worker_env.filestores.get_filestore(project_id),
            datasets=context[labels.CONTEXT_DATASETS],
            resources=resources,
            dataobjects=context[labels.CONTEXT_DATAOBJECTS]
        )
        _, exec_result = exec_command(
            task_id=task_id,
            command=command,
            context=task_context,
            processor=processor
        )
    # Notify the workflow controller that the task has finished
    if not exec_result.is_success:
        controller.set_error(task_id=task_id, outputs=exec_result.outputs)
    else:
        controller.set_success(
            task_id=task_id,
            outputs=exec_result.outputs,
            provenance=exec_result.provenance
        )
def test_load_with_dataset_delete(self):
    """Test loading workflows where each module creates a new dataset and
    deletes the previous dataset (except for the first module).
    """
    base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
    os.makedirs(base_path)
    viztrail = OSViztrailHandle.create_viztrail(
        identifier='ABC',
        properties={},
        base_path=base_path
    )
    branch = viztrail.get_default_branch()
    # Append five modules
    for i in range(5):
        now = get_current_time()
        deleted_datasets = ['DS' + str(i - 1)] if i > 0 else []
        source = 'print ' + str(i) + '+' + str(i)
        command = python_cell(source=source)
        ds_name = 'DS' + str(i)
        # Dataset i is created with i columns
        descriptor = DatasetDescriptor(
            identifier=str(i),
            name=ds_name,
            columns=[
                DatasetColumn(identifier=j, name=str(j)) for j in range(i)
            ],
        )
        module = OSModuleHandle.create_module(
            command=command,
            external_form=source,
            state=MODULE_SUCCESS,
            outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
            provenance=ModuleProvenance(
                write={ds_name: descriptor},
                delete=deleted_datasets
            ),
            timestamp=ModuleTimestamp(
                created_at=now,
                started_at=now,
                finished_at=now
            ),
            module_folder=viztrail.modules_folder,
            object_store=viztrail.object_store
        )
        if branch.head is None:
            modules = [module]
        else:
            modules = branch.head.modules + [module]
        branch.append_workflow(
            modules=modules,
            action=ACTION_INSERT,
            command=command
        )
    reloaded = OSViztrailHandle.load_viztrail(base_path)
    workflow = reloaded.get_default_branch().get_head()
    self.assertEqual(len(workflow.modules), 5)
    # After each module exactly one dataset remains in the database state
    datasets = {}
    for i, module in enumerate(workflow.modules[:5]):
        datasets = module.provenance.get_database_state(datasets)
        self.assertEqual(len(datasets), 1)
        key = 'DS' + str(i)
        self.assertTrue(key in datasets)
        self.assertEqual(len(datasets[key].columns), i)
def compute_simple_chart(self, args, context):
    """Execute simple chart command.

    Builds a chart view over a dataset from the user-provided chart type,
    optional x-axis and list of data series, runs the chart query, and
    returns the chart as the single STDOUT output.

    Parameters
    ----------
    args: vizier.viztrail.command.ModuleArguments
        User-provided command arguments
    context: vizier.engine.task.base.TaskContext
        Context in which a task is being executed

    Returns
    -------
    vizier.engine.task.processor.ExecResult

    Raises
    ------
    ValueError
        If the chart name is not a valid name
    """
    # Get dataset name and the associated dataset. This will raise an
    # exception if the dataset name is unknown.
    ds_name = args.get_value(pckg.PARA_DATASET)
    ds = context.get_dataset(ds_name)
    # Get user-provided name for the new chart and verify that it is a
    # valid name. A missing or empty name falls back to the default.
    default_chart_name = ds_name + ' Plot'
    chart_name = args.get_value(
        pckg.PARA_NAME,
        default_value=default_chart_name
    )
    if chart_name is None or chart_name == '':
        chart_name = default_chart_name
    if not is_valid_name(chart_name):
        raise ValueError('invalid chart name \'' + str(chart_name) + '\'')
    chart_args = args.get_value(cmd.PARA_CHART)
    chart_type = chart_args.get_value(cmd.PARA_CHART_TYPE)
    grouped_chart = chart_args.get_value(cmd.PARA_CHART_GROUPED)
    # Create a new chart view handle and add the series definitions
    view = ChartViewHandle(
        dataset_name=ds_name,
        chart_name=chart_name,
        chart_type=chart_type,
        grouped_chart=grouped_chart
    )
    # The data series index for x-axis values is optional
    if args.has(cmd.PARA_XAXIS):
        x_axis = args.get_value(cmd.PARA_XAXIS)
        # NOTE(review): an earlier comment stated that an empty x-axis
        # column is ignored. No such check is visible here; confirm whether
        # add_data_series handles that case.
        add_data_series(
            args=x_axis,
            view=view,
            dataset=ds,
            col_arg_id=cmd.PARA_XAXIS_COLUMN,
            range_arg_id=cmd.PARA_XAXIS_RANGE
        )
        # The x-axis series was added first and is therefore at index 0
        view.x_axis = 0
    # Definition of data series. Each series is a pair of column
    # identifier and a printable label.
    for data_series in args.get_value(cmd.PARA_SERIES):
        add_data_series(args=data_series, view=view, dataset=ds)
    # Execute the query and get the result
    rows = ChartQuery.exec_query(ds, view)
    # Add chart view handle as module output
    return ExecResult(
        outputs=ModuleOutputs(stdout=[ChartOutput(view=view, rows=rows)]),
        provenance=ModuleProvenance(
            read={ds_name: ds.identifier},
            write=dict(),
            charts=[view]
        )
    )
def test_timestamps(self):
    """Test reading and writing modules with different timestamp values."""
    def reload(handle):
        # Read the module for the given handle from the object store
        return OSModuleHandle.load_module(
            identifier=handle.identifier,
            module_path=handle.module_path
        )

    original = OSModuleHandle.create_module(
        command=python_cell(source='print 2+2'),
        external_form='TEST MODULE',
        state=MODULE_PENDING,
        outputs=ModuleOutputs(),
        provenance=ModuleProvenance(),
        timestamp=ModuleTimestamp(),
        module_folder=MODULE_DIR
    )
    m = reload(original)
    # Set started_at and make sure that it is persisted
    created_at = m.timestamp.created_at
    started_at = to_datetime('2018-11-26T13:00:00.000000')
    m.timestamp.started_at = started_at
    m.write_module()
    m = reload(original)
    self.assertEqual(m.timestamp.created_at, created_at)
    self.assertEqual(m.timestamp.started_at, started_at)
    # Overwrite created_at and set finished_at
    finished_at = to_datetime('2018-11-26T13:00:00.000010')
    m.timestamp.created_at = finished_at
    m.timestamp.finished_at = finished_at
    m.write_module()
    m = reload(original)
    self.assertEqual(m.timestamp.created_at, finished_at)
    self.assertEqual(m.timestamp.started_at, started_at)
    self.assertEqual(m.timestamp.finished_at, finished_at)
    # Create a module with a partially initialized timestamp
    partial = OSModuleHandle.create_module(
        command=python_cell(source='print 2+2'),
        external_form='TEST MODULE',
        state=MODULE_PENDING,
        outputs=ModuleOutputs(),
        provenance=ModuleProvenance(),
        timestamp=ModuleTimestamp(
            created_at=created_at,
            started_at=started_at
        ),
        module_folder=MODULE_DIR
    )
    m = reload(partial)
    self.assertEqual(m.timestamp.created_at, created_at)
    self.assertEqual(m.timestamp.started_at, started_at)
    self.assertIsNone(m.timestamp.finished_at)
def test_load_active(self):
    """Test loading workflows with active modules."""
    base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
    os.makedirs(base_path)
    viztrail = OSViztrailHandle.create_viztrail(
        identifier='ABC',
        properties=None,
        base_path=base_path
    )
    branch = viztrail.get_default_branch()
    # Append five modules
    for i in range(5):
        now = get_current_time()
        source = 'print ' + str(i) + '+' + str(i)
        command = python_cell(source=source)
        module = OSModuleHandle.create_module(
            command=command,
            external_form=source,
            state=MODULE_SUCCESS,
            datasets=dict(),
            outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
            provenance=ModuleProvenance(),
            timestamp=ModuleTimestamp(
                created_at=now,
                started_at=now,
                finished_at=now
            ),
            module_folder=viztrail.modules_folder,
            object_store=viztrail.object_store
        )
        if branch.head is None:
            modules = [module]
        else:
            modules = branch.head.modules + [module]
        branch.append_workflow(
            modules=modules,
            action=ACTION_INSERT,
            command=command
        )
        self.assertEqual(len(branch.get_history()), (i + 1))
    # This is a hack to simulate loading workflows with active modules.
    # Change state of last two modules in branch head to an active state.
    for position in (-2, -1):
        m = branch.get_head().modules[position]
        m.state = MODULE_RUNNING
        m.write_module()
    viztrail = OSViztrailHandle.load_viztrail(base_path)
    branch = viztrail.get_default_branch()
    for position in range(3):
        self.assertTrue(branch.get_head().modules[position].is_success)
    for position in (3, 4):
        self.assertTrue(branch.get_head().modules[position].is_canceled)
    # Change state of last module in second workflow to an active state
    m = branch.get_head().modules[1]
    m.state = MODULE_RUNNING
    m.write_module()
    viztrail = OSViztrailHandle.load_viztrail(base_path)
    branch = viztrail.get_default_branch()
    second = branch.get_workflow(branch.get_history()[1].identifier)
    self.assertTrue(second.modules[0].is_success)
    self.assertTrue(second.modules[1].is_canceled)
def test_datasets(self):
    """Test reading and writing modules with dataset information."""
    def create_and_load(state):
        # Create a module in the given state that writes DATASETS and read
        # it back with an empty previous database state
        handle = OSModuleHandle.create_module(
            command=python_cell(source='print 2+2'),
            external_form='TEST MODULE',
            state=state,
            outputs=ModuleOutputs(),
            provenance=ModuleProvenance(write=DATASETS),
            timestamp=ModuleTimestamp(),
            module_folder=MODULE_DIR,
            datasets=DATASETS
        )
        return OSModuleHandle.load_module(
            identifier=handle.identifier,
            module_path=handle.module_path,
            prev_state=dict()
        )

    # A pending module has no datasets after loading
    m = create_and_load(MODULE_PENDING)
    self.assertEqual(len(m.datasets), 0)
    # A successful module has both datasets after loading
    m = create_and_load(MODULE_SUCCESS)
    self.assertEqual(len(m.datasets), 2)
    self.assertEqual(m.datasets['DS1'].identifier, 'ID1')
    self.assertEqual(len(m.datasets['DS1'].columns), 0)
    self.assertEqual(m.datasets['DS1'].row_count, 0)
    ds2 = m.datasets['DS2']
    self.assertEqual(ds2.identifier, 'ID2')
    self.assertEqual(len(ds2.columns), 2)
    first, second = ds2.columns[0], ds2.columns[1]
    self.assertEqual(first.identifier, 0)
    self.assertEqual(first.name, 'ABC')
    self.assertEqual(first.data_type, 'int')
    self.assertEqual(second.identifier, 1)
    self.assertEqual(second.name, 'xyz')
    self.assertEqual(second.data_type, 'real')
    self.assertEqual(ds2.row_count, 100)
def set_canceled(self,
                 finished_at: Optional[datetime] = None,
                 outputs: Optional[ModuleOutputs] = None
                 ) -> None:
    """Set status of the module to canceled and materialize the new state.

    The finished_at property of the timestamp is set to the given value or
    the current time (if None). The module outputs are set to the given
    value. If no outputs are given the module output streams will be empty.

    The defaults are resolved inside the method. A default created in the
    signature is evaluated only once, which would freeze the timestamp at
    definition time and share one outputs object between all modules.

    Parameters
    ----------
    finished_at: datetime.datetime, optional
        Timestamp when module was canceled
    outputs: vizier.viztrail.module.output.ModuleOutputs, optional
        Output streams for module
    """
    if finished_at is None:
        finished_at = get_current_time()
    if outputs is None:
        outputs = ModuleOutputs()
    super().set_canceled(finished_at, outputs)
    # Materialize module state
    self.write_safe()
def set_error(
    self,
    task_id: str,
    finished_at: Optional[datetime] = None,
    outputs: Optional[ModuleOutputs] = None
) -> Optional[bool]:
    """Set status of the module that is associated with the given task
    identifier to error. The finished_at property of the timestamp is set
    to the given value or the current time (if None). The module outputs
    are adjusted to the given value. The output streams are empty if no
    value is given for the outputs parameter.

    Cancels all modules that follow the failed module in the workflow,
    using the same finished_at timestamp.

    Returns True if the state of the workflow was changed and False
    otherwise. The result is None if the project or task did not exist.

    Parameters
    ----------
    task_id : string
        Unique task identifier
    finished_at: datetime.datetime, optional
        Timestamp when module finished running
    outputs: vizier.viztrail.module.output.ModuleOutputs, optional
        Output streams for module

    Returns
    -------
    bool
    """
    # Resolve defaults at call time. Defaults in the signature would be
    # evaluated once, yielding a stale timestamp and a shared outputs
    # object.
    if finished_at is None:
        finished_at = get_current_time()
    if outputs is None:
        outputs = ModuleOutputs()
    print("ERROR: {}".format(task_id))
    with self.backend.lock:
        # Get task handle and remove it from the internal index. The result
        # is None if the task does not exist.
        task = pop_task(tasks=self.tasks, task_id=task_id)
        if task is None:
            return None
        # Get the handle for the head workflow of the specified branch and
        # the index for the module matching the identifier in the task.
        workflow, module_index = self.get_task_module(task)
        if workflow is None or module_index == -1:
            return None
        # Notify the backend that the task is finished
        self.backend.task_finished(task_id)
        module = workflow.modules[module_index]
        if not module.is_active:
            return False
        module.set_error(finished_at=finished_at, outputs=outputs)
        # Cancel everything downstream of the failed module
        for m in workflow.modules[module_index + 1:]:
            m.set_canceled(finished_at=finished_at)
        return True
def test_safe_write(self):
    """Update module state with write error."""
    # Create original module
    provenance = ModuleProvenance(
        read={'DS1': 'ID1'},
        write={'DS1': DatasetDescriptor(identifier='ID2', name='ID2')}
    )
    module = OSModuleHandle.create_module(
        command=python_cell(source='print 2+2'),
        external_form='TEST MODULE',
        state=MODULE_PENDING,
        module_folder=MODULE_DIR,
        timestamp=ModuleTimestamp(),
        outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
        provenance=provenance
    )
    self.assertTrue(module.is_pending)
    module.set_running(external_form='TEST MODULE')
    self.assertTrue(module.is_running)
    # The test expects the None entry in STDERR to make the write fail,
    # which leaves the in-memory module in error state
    broken_outputs = ModuleOutputs(stderr=[None])
    module.set_success(outputs=broken_outputs)
    self.assertTrue(module.is_error)
    # The persisted module is expected to still be in running state
    persisted = OSModuleHandle.load_module(
        identifier=module.identifier,
        module_path=module.module_path
    )
    self.assertTrue(persisted.is_running)
def __init__(self,
             command: ModuleCommand,
             external_form: Optional[str],
             identifier: Optional[str] = None,
             state: int = MODULE_PENDING,
             timestamp: Optional[ModuleTimestamp] = None,
             outputs: Optional[ModuleOutputs] = None,
             provenance: Optional[ModuleProvenance] = None):
    """Initialize the module handle. For new modules, datasets and outputs
    are initially empty.

    Timestamp, outputs and provenance default to None and are replaced by
    new objects for every handle. Creating them in the signature would
    evaluate them once at definition time and share them between all
    handles that rely on the defaults.

    Parameters
    ----------
    command : vizier.viztrail.command.ModuleCommand
        Specification of the module (i.e., package, name, and arguments)
    external_form: string
        Printable representation of module command
    identifier : string, optional
        Unique module identifier
    state: int
        Module state (one of PENDING, RUNNING, CANCELED, ERROR, SUCCESS)
    timestamp: vizier.viztrail.module.timestamp.ModuleTimestamp, optional
        Module timestamp
    outputs: vizier.viztrail.module.output.ModuleOutputs, optional
        Module output streams STDOUT and STDERR
    provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
        Provenance information about datasets that were read and writen by
        previous execution of the module.
    """
    # A state of None falls back to pending
    super(ModuleHandle, self).__init__(
        state=state if state is not None else MODULE_PENDING
    )
    self.identifier = identifier
    self.command = command
    self.external_form = external_form
    self.outputs = outputs if outputs is not None else ModuleOutputs()
    self.provenance = (
        provenance if provenance is not None else ModuleProvenance()
    )
    self.timestamp = (
        timestamp if timestamp is not None else ModuleTimestamp()
    )
def write_safe(self):
    """Write the current module state to the object store without raising.

    Any exception raised by the write is caught. In that case the module
    is put into error state, with the exception text as its only STDERR
    output and an empty dataset dictionary. This ensures that the module is
    in error state (i.e., the workflow cannot further be executed) if a
    state change fails.
    """
    try:
        self.write_module()
    except Exception as ex:
        self.state = mstate.MODULE_ERROR
        error_message = TextOutput(str(ex))
        self.outputs = ModuleOutputs(stderr=[error_message])
        self.datasets = {}
def test_completed_append(self):
    """Test appending a completed workflow to a branch."""
    def check_delete_workflow(workflow):
        # Properties expected for the workflow created by the delete action
        self.assertEqual(len(workflow.modules), 9)
        self.assertEqual(workflow.descriptor.identifier, '0000000A')
        self.assertEqual(workflow.descriptor.action, ACTION_DELETE)
        self.assertEqual(workflow.descriptor.package_id, PACKAGE_PYTHON)
        self.assertEqual(workflow.descriptor.command_id, PYTHON_CODE)

    base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
    os.makedirs(base_path)
    viztrail = OSViztrailHandle.create_viztrail(
        identifier='ABC',
        properties=None,
        base_path=base_path
    )
    branch = viztrail.get_default_branch()
    for i in range(10):
        now = get_current_time()
        source = 'print ' + str(i) + '+' + str(i)
        command = python_cell(source=source)
        module = OSModuleHandle.create_module(
            command=command,
            external_form=source,
            state=MODULE_SUCCESS,
            datasets=dict(),
            outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
            provenance=ModuleProvenance(),
            timestamp=ModuleTimestamp(
                created_at=now,
                started_at=now,
                finished_at=now
            ),
            module_folder=viztrail.modules_folder,
            object_store=viztrail.object_store
        )
        if branch.head is None:
            modules = [module]
        else:
            modules = branch.head.modules + [module]
        branch.append_workflow(
            modules=modules,
            action=ACTION_INSERT,
            command=command
        )
    # Append a workflow that drops the last module of the branch head
    head_modules = branch.get_head().modules
    wf = branch.append_workflow(
        modules=head_modules[:-1],
        action=ACTION_DELETE,
        command=head_modules[-1].command
    )
    check_delete_workflow(wf)
    # Reload the viztrail and verify history and branch head
    viztrail = OSViztrailHandle.load_viztrail(base_path)
    branch = viztrail.get_default_branch()
    history = branch.get_history()
    self.assertEqual(len(history), 11)
    check_delete_workflow(branch.get_head())
def test_running(self):
    """Update module state from pending to running."""
    def reload(handle):
        # Read the module for the given handle from the object store
        return OSModuleHandle.load_module(
            identifier=handle.identifier,
            module_path=handle.module_path
        )

    def check_running(handle):
        # State expected for a module that was just set to running
        self.assertTrue(handle.is_running)
        self.assertIsNotNone(handle.timestamp.started_at)
        self.assertEqual(len(handle.datasets), 0)
        self.assertEqual(len(handle.outputs.stderr), 0)
        self.assertEqual(len(handle.outputs.stdout), 0)
        self.assertIsNotNone(handle.provenance.read)
        self.assertIsNotNone(handle.provenance.write)
        self.assertIsNotNone(handle.provenance.resources)

    def check_explicit_values(handle):
        # Values expected after set_running with all optional parameters
        self.assertEqual(
            handle.timestamp.started_at,
            handle.timestamp.created_at
        )
        self.assertEqual(handle.external_form, 'Some form')

    # Create original module
    module = OSModuleHandle.create_module(
        command=python_cell(source='print 2+2'),
        external_form='TEST MODULE',
        state=MODULE_PENDING,
        module_folder=MODULE_DIR,
        timestamp=ModuleTimestamp(),
        datasets={'DS1': DS1},
        outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
        provenance=ModuleProvenance(
            read={'DS1': 'ID1'},
            write={'DS1': DatasetDescriptor(identifier='ID2')},
            resources={'fileid': '0123456789'}
        )
    )
    self.assertTrue(module.is_pending)
    module.set_running(external_form='TEST MODULE')
    check_running(module)
    # Read module from object store and ensure that all changes have been
    # materialized properly
    module = reload(module)
    check_running(module)
    # Set running with all optional parameters
    module.set_running(
        started_at=module.timestamp.created_at,
        external_form='Some form'
    )
    check_explicit_values(module)
    module = reload(module)
    check_explicit_values(module)
def test_state(self):
    """Ensure that exactly one state flag is True at any time.

    Walks a module through pending, running, canceled, error and success
    and, after every transition, checks all five ``is_*`` flags.
    """
    # Checked in this order after every transition
    state_flags = (
        'is_pending',
        'is_canceled',
        'is_error',
        'is_running',
        'is_success',
    )

    # Create the original module in pending state
    module = OSModuleHandle.create_module(
        command=python_cell(source='print 2+2'),
        external_form='TEST MODULE',
        state=MODULE_PENDING,
        module_folder=MODULE_DIR,
        timestamp=ModuleTimestamp(),
        outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
        provenance=ModuleProvenance(
            read={'DS1': 'ID1'},
            write={'DS1': DatasetDescriptor(identifier='ID2', name='ID2')}
        )
    )

    def expect_only(active_flag):
        # The named flag has to be set, every other flag has to be cleared
        for flag in state_flags:
            value = getattr(module, flag)
            if flag == active_flag:
                self.assertTrue(value)
            else:
                self.assertFalse(value)

    # Pending
    expect_only('is_pending')
    # Running
    module.set_running(external_form='TEST MODULE')
    expect_only('is_running')
    # Canceled
    module.set_canceled()
    expect_only('is_canceled')
    # Error
    module.set_error()
    expect_only('is_error')
    # Success
    module.set_success()
    expect_only('is_success')
def __init__(self,
             identifier: str,
             command: ModuleCommand,
             external_form: str,
             module_path: str,
             state: int = mstate.MODULE_PENDING,
             timestamp: ModuleTimestamp = None,
             outputs: ModuleOutputs = None,
             provenance: ModuleProvenance = None,
             object_store: ObjectStore = None
             ):
    """Initialize the module handle. For new modules, datasets and outputs
    are initially empty.

    The optional components default to None and are created per call.
    Default expressions in a signature are evaluated only once, when the
    function is defined, so instance-valued defaults would be shared by
    every handle that omits them.

    Parameters
    ----------
    identifier : string
        Unique module identifier
    command : vizier.viztrail.command.ModuleCommand
        Specification of the module (i.e., package, name, and arguments)
    external_form: string
        Printable representation of module command
    module_path: string
        Path to module resource in object store
    state: int
        Module state (one of PENDING, RUNNING, CANCELED, ERROR, SUCCESS)
    timestamp: vizier.viztrail.module.timestamp.ModuleTimestamp, optional
        Module timestamp. A new ModuleTimestamp() is created if None.
    outputs: vizier.viztrail.module.output.ModuleOutputs, optional
        Module output streams STDOUT and STDERR. A new ModuleOutputs() is
        created if None.
    provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
        Provenance information about datasets that were read and written
        by previous execution of the module. A new ModuleProvenance() is
        created if None.
    object_store: vizier.core.io.base.ObjectStore, optional
        Object store implementation to access and maintain resources. A
        new DefaultObjectStore() is created if None.
    """
    # Build fresh defaults per call so that handles never share state
    if timestamp is None:
        timestamp = ModuleTimestamp()
    if outputs is None:
        outputs = ModuleOutputs()
    if provenance is None:
        provenance = ModuleProvenance()
    if object_store is None:
        object_store = DefaultObjectStore()
    super(OSModuleHandle, self).__init__(
        identifier=identifier,
        command=command,
        external_form=external_form,
        state=state,
        timestamp=timestamp,
        outputs=outputs,
        provenance=provenance,
    )
    self.module_path = module_path
    self.object_store = object_store
def create_exec_result(self,
                       dataset_name,
                       input_dataset=None,
                       output_dataset=None,
                       database_state=None,
                       stdout=None,
                       resources=None):
    """Create execution result object for a successfully completed task.
    Assumes that a single dataset has been modified.

    Note that this method is not suitable to generate the result object for
    the drop dataset and rename dataset commands.

    Parameters
    ----------
    dataset_name: string
        Name of the manipulated dataset
    input_dataset: vizier.datastore.dataset.DatasetDescriptor, optional
        Descriptor for the input dataset. If None, the read provenance is
        set to None.
    output_dataset: vizier.datastore.dataset.DatasetDescriptor, optional
        Descriptor for the resulting dataset. If None, the dataset name is
        mapped to None in the write provenance.
    database_state: dict, optional
        Identifier for datasets in the database state against which a task
        was executed (keyed by user-provided name). Not used by this
        method.
    stdout: list(string), optional
        Lines in the command output. None is treated as no output lines.
    resources: dict, optional
        Optional resources that were generated by the command

    Returns
    -------
    vizier.engine.task.processor.ExecResult
    """
    # Copy the relevant properties into a plain descriptor for provenance
    if output_dataset is not None:
        ds = DatasetDescriptor(
            identifier=output_dataset.identifier,
            columns=output_dataset.columns,
            row_count=output_dataset.row_count
        )
    else:
        ds = None
    # stdout defaults to None; iterating None would raise a TypeError
    lines = stdout if stdout is not None else []
    if input_dataset is not None:
        read = {dataset_name: input_dataset.identifier}
    else:
        read = None
    return ExecResult(
        outputs=ModuleOutputs(stdout=[TextOutput(line) for line in lines]),
        provenance=ModuleProvenance(
            read=read,
            write={dataset_name: ds},
            resources=resources
        )
    )
def __init__(self, is_success=True, outputs=None, provenance=None):
    """Set up the components of an execution result.

    Parameters
    ----------
    is_success: bool
        Flag indicating if execution was successful
    outputs: vizier.viztrail.module.output.ModuleOutputs, optional
        Outputs to STDOUT and STDERR generated during task execution.
        Replaced by a new ModuleOutputs() if None.
    provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
        Provenance information about datasets that were read and written
        during task execution. Replaced by a new ModuleProvenance() if
        None.
    """
    self.is_success = is_success
    # Fall back to empty components when the caller provides none
    if outputs is None:
        self.outputs = ModuleOutputs()
    else:
        self.outputs = outputs
    if provenance is None:
        self.provenance = ModuleProvenance()
    else:
        self.provenance = provenance
def test_multi_append(self):
    """Append ten workflows to a branch and verify the stored history.

    Each iteration appends a workflow that extends the previous head by one
    successfully completed module and checks the history length. The
    viztrail is then loaded from its base path and every workflow in the
    history is checked for the expected modules and outputs.
    """
    viztrail_dir = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
    os.makedirs(viztrail_dir)
    vt = OSViztrailHandle.create_viztrail(
        identifier='ABC',
        properties=None,
        base_path=viztrail_dir
    )
    branch = vt.get_default_branch()

    # Append ten modules
    for i in range(10):
        now = get_current_time()
        script = 'print {0}+{0}'.format(i)
        command = python_cell(source=script)
        new_module = OSModuleHandle.create_module(
            command=command,
            external_form=script,
            state=MODULE_SUCCESS,
            datasets={},
            outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
            provenance=ModuleProvenance(),
            timestamp=ModuleTimestamp(
                created_at=now,
                started_at=now,
                finished_at=now
            ),
            module_folder=vt.modules_folder,
            object_store=vt.object_store
        )
        # The first workflow has no predecessor to extend
        if branch.head is None:
            workflow_modules = [new_module]
        else:
            workflow_modules = branch.head.modules + [new_module]
        branch.append_workflow(
            modules=workflow_modules,
            action=ACTION_INSERT,
            command=command
        )
        self.assertEqual(len(branch.get_history()), (i + 1))

    # Load the viztrail again and inspect every workflow version
    vt = OSViztrailHandle.load_viztrail(viztrail_dir)
    branch = vt.get_default_branch()
    history = branch.get_history()
    self.assertEqual(len(history), 10)
    for version in range(10):
        expected_count = version + 1
        wf = branch.get_workflow(history[version].identifier)
        self.assertEqual(len(wf.modules), expected_count)
        for position in range(expected_count):
            stored = wf.modules[position]
            self.assertEqual(
                stored.external_form,
                'print {0}+{0}'.format(position)
            )
            self.assertEqual(
                stored.outputs.stdout[-1].value,
                str(position + position)
            )