Example #1
def _get_input_combinator(step: Step,
                          scatter: Optional[Any] = None) -> InputCombinator:
    scatter_inputs = _get_scatter_inputs(scatter)
    # If there are no scatter ports in this step, create a single DotProduct combinator
    if not scatter_inputs:
        input_combinator = DotProductInputCombinator(name=utils.random_name(), step=step)
        for port in step.input_ports.values():
            input_combinator.ports[port.name] = port
        return input_combinator
    # If there are scatter ports
    else:
        other_ports = dict(step.input_ports)
        cartesian_combinator = JupyterCartesianProductInputCombinator(name=utils.random_name(), step=step)
        # Separate scatter ports from the other ones
        scatter_ports = {}
        for port_name, port in step.input_ports.items():
            if port_name in scatter_inputs:
                scatter_ports[port_name] = port
                del other_ports[port_name]
        # Choose the right combinator for the scatter ports, based on the scatter method property
        scatter_combinator = _get_combinator_from_scatter(
            step=step,
            scatter_ports=scatter_ports,
            scatter=scatter)
        cartesian_combinator.ports[scatter_combinator.name] = scatter_combinator
        # Create a CartesianProduct combinator between the scatter ports and the DotProduct of the others
        if other_ports:
            dotproduct_name = utils.random_name()
            dotproduct_combinator = DotProductInputCombinator(name=dotproduct_name, step=step)
            dotproduct_combinator.ports = other_ports
            cartesian_combinator.ports[dotproduct_name] = dotproduct_combinator
        return cartesian_combinator
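
Every snippet on this page calls utils.random_name() to label combinators, jobs, and temporary paths. A minimal sketch of such a helper, assuming it only needs to produce a unique, filesystem-safe string (the real StreamFlow implementation may differ):

import uuid

def random_name() -> str:
    # Assumption: any collision-resistant, filesystem-safe identifier will do;
    # a hex-encoded UUID4 satisfies both properties.
    return uuid.uuid4().hex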
Example #2
 async def run(self) -> None:
     jobs = []
     # If there are input ports, create jobs until a termination token is received
     if self.input_ports:
         if self.input_combinator is None:
             raise WorkflowExecutionException(
                 "No InputCombinator specified for step {step}".format(
                     step=self.name))
         while True:
             # Retrieve input tokens
             inputs = await self.input_combinator.get()
             # Check for termination
             if utils.check_termination(inputs):
                 break
             # Set status to fireable
             self.status = Status.FIREABLE
             # Run job
             jobs.append(
                 asyncio.create_task(self._run_job(inputs),
                                     name=utils.random_name()))
     # Otherwise simply run job
     else:
         jobs.append(
             asyncio.create_task(self._run_job([]),
                                 name=utils.random_name()))
     # Wait for all jobs to terminate
     statuses = await asyncio.gather(*jobs)
     # Terminate step
     self.terminate(_get_step_status(statuses))
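
The loop above is a common asyncio shape: fan out one task per input bundle until a termination token arrives, then gather everything. A self-contained sketch of the same pattern, with a hypothetical sentinel standing in for StreamFlow's termination token:

import asyncio
from typing import Any, List

TERMINATION = object()  # hypothetical sentinel; StreamFlow uses real termination tokens

async def consume_until_terminated(queue: asyncio.Queue) -> List[Any]:
    jobs = []
    while True:
        inputs = await queue.get()
        # Stop spawning jobs once the sentinel is seen
        if inputs is TERMINATION:
            break
        # Each input bundle becomes an independent task (a no-op stand-in here)
        jobs.append(asyncio.create_task(asyncio.sleep(0, result=inputs)))
    # Wait for every spawned job before reporting back
    return await asyncio.gather(*jobs)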
Example #3
 def __init__(self,
              context: StreamFlowContext,
              checkpoint_dir: Optional[Text] = None):
     super().__init__(context)
     self.checkpoint_dir = checkpoint_dir or os.path.join(
         tempfile.gettempdir(), 'streamflow', 'checkpoint',
         utils.random_name())
     self.copy_tasks: MutableSequence = []
Example #4
 async def _get_tmpdir(self, resource: Text):
     scratch_home = '/scratch/home/{username}'.format(
         username=self.username)
     temp_dir = posixpath.join(scratch_home, 'streamflow',
                               utils.random_name())
     async with self._get_ssh_client(resource) as ssh_client:
         await ssh_client.run('mkdir -p {dir}'.format(dir=temp_dir))
     return temp_dir
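
Creating the remote directory relies on an SSH client whose run() coroutine executes a shell command, as asyncssh's SSHClientConnection does. A standalone sketch under that assumption (host and credentials are placeholders):

import posixpath
import uuid

import asyncssh  # assumed transport, matching the ssh_client.run() call above

async def make_remote_tmpdir(host: str, username: str) -> str:
    temp_dir = posixpath.join('/tmp', 'demo', uuid.uuid4().hex)
    async with asyncssh.connect(host, username=username) as conn:
        # check=True raises if mkdir fails, instead of silently continuing
        await conn.run('mkdir -p {}'.format(temp_dir), check=True)
    return temp_dir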
Example #5
 async def _run_with_streamflow(self, cell_name: Text, compiler,
                                ast_nodes: List[Tuple[ast.AST, Text]],
                                cell_config: MutableMapping[Text, Any]):
     # Build the step target from metadata
     cell = JupyterCell(name=cell_name,
                        code=ast_nodes,
                        compiler=compiler,
                        metadata=cell_config)
     translator = JupyterNotebookTranslator(context=self.context)
     step = await translator.translate_cell(cell=cell,
                                            autoawait=self.autoawait,
                                            metadata=cell_config)
     # Inject inputs
     input_injector = BaseJob(name=utils.random_name(),
                              step=BaseStep(utils.random_name(),
                                            self.context),
                              inputs=[])
     await self._inject_inputs(step=step, job=input_injector)
     # Execute the step
     await step.run()
     # Print output log
     output_retriever = utils.random_name()
     d = tempfile.mkdtemp()
     output = await _get_output(step=step,
                                output_retriever=output_retriever,
                                d=d)
     if output:
         print(output)
     # Retrieve output tokens
     if step.status == Status.COMPLETED:
         output_names = {}
         for port_name, port in step.output_ports.items():
             if port_name != executor.CELL_OUTPUT:
                 token_processor = step.output_ports[
                     port_name].token_processor
                 token = await step.output_ports[port_name].get(
                     output_retriever)
                 token = await token_processor.collect_output(token, d)
                 if isinstance(token.job, MutableSequence):
                     output_names[token.name] = utils.flatten_list(
                         [t.value for t in token.value])
                 else:
                     output_names[token.name] = token.value
         # Update namespaces
         self.user_ns.update(output_names)
Example #6
 def _init_dir(self) -> Text:
     if self.step.target is not None:
         path_processor = posixpath
         workdir = self.step.workdir or path_processor.join(
             '/tmp', 'streamflow')
     else:
         path_processor = os.path
         workdir = self.step.workdir or path_processor.join(
             tempfile.gettempdir(), 'streamflow')
     dir_path = path_processor.join(workdir, utils.random_name())
     return dir_path
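
The point of the branch above is path-processor selection: remote targets are always addressed with POSIX semantics, while local runs follow the host OS. The same decision in isolation (a hypothetical helper, not part of the codebase):

import os
import posixpath
import tempfile
import uuid
from typing import Optional

def init_dir(remote: bool, workdir: Optional[str] = None) -> str:
    # posixpath for remote targets, os.path for the local machine
    path_processor = posixpath if remote else os.path
    base = workdir or path_processor.join(
        '/tmp' if remote else tempfile.gettempdir(), 'streamflow')
    # Unique leaf directory, mirroring utils.random_name() above
    return path_processor.join(base, uuid.uuid4().hex)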
Example #7
 async def collect_output(self, token: Token, output_dir: Text) -> Token:
     if (isinstance(token.job, MutableSequence)
             or self.port_type not in ['File', 'Directory']):
         return await super().collect_output(token, output_dir)
     if token.value is not None and self.port_type in ['File', 'Directory']:
         context = self.get_context()
         output_collector = BaseJob(name=random_name(),
                                    step=BaseStep(name=random_name(),
                                                  context=context),
                                    inputs=[],
                                    input_directory=output_dir)
         return token.update(await self._update_file_token(
             job=output_collector,
             src_job=context.scheduler.get_job(token.job),
             token_value=token.value,
             load_listing=LoadListing.deep_listing,
             writable=True))
     else:
         return token
Example #8
def _get_combinator_from_scatter(step: Step,
                                 scatter_ports: MutableMapping[Text, InputPort],
                                 scatter: Optional[MutableMapping[Text, Any]] = None) -> InputCombinator:
    scatter_method = scatter.get('method', 'cartesian')
    combinator_name = utils.random_name()
    if scatter_method == 'cartesian':
        combinator = JupyterCartesianProductInputCombinator(name=combinator_name, step=step)
    else:
        combinator = DotProductInputCombinator(name=combinator_name, step=step)
    if scatter:
        for entry in scatter.get('items') or []:
            if isinstance(entry, str):
                combinator.ports[entry] = scatter_ports[entry]
            else:
                inner_combinator = _get_combinator_from_scatter(
                    step=step,
                    scatter_ports=scatter_ports,
                    scatter=entry)
                combinator.ports[inner_combinator.name] = inner_combinator
    return combinator
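
Reading the accessors backwards, the scatter mapping this function walks appears to hold a method plus a list of items, where each item is either a port name or a nested specification; any method other than 'cartesian' falls back to a dot product. A hedged reconstruction of that shape:

# Inferred from scatter.get('method') and scatter.get('items') above;
# the exact schema is defined elsewhere in jupyter-workflow.
scatter_spec = {
    'method': 'cartesian',
    'items': [
        'port_a',  # plain port name
        {'method': 'dotproduct', 'items': ['port_b', 'port_c']},  # nested combinator
    ],
}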
Example #9
 async def _run_batch_command(
         self,
         helper_file: Text,
         job_name: Text,
         resource: Text,
         workdir: Optional[Text] = None,
         stdin: Optional[Union[int, Text]] = None,
         stdout: Union[int, Text] = asyncio.subprocess.STDOUT,
         stderr: Union[int, Text] = asyncio.subprocess.STDOUT) -> Text:
     batch_command = "{workdir} qsub {stdin} {stdout} {stderr} {helper_file}".format(
         workdir="cd {workdir} &&".format(
             workdir=workdir) if workdir is not None else "",
         stdin="-i \"{stdin}\"".format(
             stdin=stdin) if stdin is not None else "",
         stdout=("-o \"{stdout}\"".format(
             stdout=stdout if stdout != STDOUT else utils.random_name())),
         stderr="-e \"{stderr}\"".format(stderr=stderr)
         if stderr != STDOUT and stderr != stdout else "",
         helper_file=helper_file)
     async with self._get_ssh_client(resource) as ssh_client:
         result = await ssh_client.run(batch_command)
     return result.stdout.strip()
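
The nested str.format calls above are easy to misread; a list-based assembly of the same qsub invocation (a hypothetical rewrite, with shlex.quote added for safety) keeps each optional flag on its own line:

import shlex
from typing import Optional

def build_qsub_command(helper_file: str,
                       workdir: Optional[str] = None,
                       stdin: Optional[str] = None,
                       stdout: Optional[str] = None,
                       stderr: Optional[str] = None) -> str:
    parts = []
    if workdir is not None:
        parts.append('cd {} &&'.format(shlex.quote(workdir)))
    parts.append('qsub')
    if stdin is not None:
        parts.extend(['-i', shlex.quote(stdin)])
    if stdout is not None:
        parts.extend(['-o', shlex.quote(stdout)])
    # As above, skip -e when stderr is merged into stdout
    if stderr is not None and stderr != stdout:
        parts.extend(['-e', shlex.quote(stderr)])
    parts.append(shlex.quote(helper_file))
    return ' '.join(parts)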
Example #10
 async def run(self, output_dir: Optional[Text] = os.getcwd()):
     output_tokens = {}
     # Execute workflow
     for step in self.workflow.steps:
         execution = asyncio.create_task(self._execute(step), name=step)
         self.executions.append(execution)
     # If workflow has output ports
     if self.workflow.output_ports:
         # Retrieve output tokens
         output_consumer = utils.random_name()
         for port_name, port in self.workflow.output_ports.items():
             self.output_tasks.append(asyncio.create_task(port.get(output_consumer), name=port_name))
         while not self.closed:
             output_tokens = await self._wait_outputs(output_consumer, output_dir, output_tokens)
     # Otherwise simply wait for all tasks to finish
     else:
         await asyncio.gather(*self.executions)
     # Check if workflow terminated properly
     for step in self.workflow.steps.values():
         if step.status == Status.FAILED:
             raise WorkflowExecutionException("Workflow execution failed")
     # Print output tokens
     print(json.dumps(output_tokens, sort_keys=True, indent=4))
Example #11
 async def execute(self, job: Job) -> CommandOutput:
     connector = self.step.get_connector()
     # Transfer executor file to remote resource
     executor_path = await self._transfer_file(job, executor.__file__)
     # Modify code, environment and namespaces according to inputs
     input_names = {}
     environment = {}
     for token in job.inputs:
         if token.value is not None:
             command_token = self.input_tokens[token.name]
             token_value = ([token.value] if isinstance(
                 self.step.input_ports[token.name], ScatterInputPort) else
                            token.value)
             # 'file' and 'name' tokens are both injected into the namespace
             if command_token.token_type in ('file', 'name'):
                 input_names[token.name] = token_value
             elif command_token.token_type == 'env':
                 environment[token.name] = token_value
     # List output names to be retrieved from remote context
     output_names = [
         name for name, p in self.step.output_ports.items()
         if name != executor.CELL_OUTPUT
     ]
     # Serialize AST nodes to remote resource
     code_path = await self._serialize_to_remote_file(job, self.ast_nodes)
     # Configure the output file path
     path_processor = get_path_processor(self.step)
     output_path = path_processor.join(job.output_directory, random_name())
     # Extract serializers from command tokens
     input_serializers = {
         k: v.serializer
         for k, v in self.input_tokens.items() if v.serializer is not None
     }
     output_serializers = {
         k: v.serializer
         for k, v in self.output_tokens.items() if v.serializer is not None
     }
     # Serialize namespaces to remote resource
     user_ns_path = await self._serialize_namespace(
         input_serializers=input_serializers,
         job=job,
         namespace=input_names)
     # Create dictionaries of postload input serializers and predump output serializers
     postload_input_serializers = {
         k: {
             'postload': v['postload']
         }
         for k, v in input_serializers.items() if 'postload' in v
     }
     predump_output_serializers = {
         k: {
             'predump': v['predump']
         }
         for k, v in output_serializers.items() if 'predump' in v
     }
     # Parse command
     cmd = [self.interpreter, executor_path]
     if os.path.basename(self.interpreter) == 'ipython':
         cmd.append('--')
     if self.step.workdir:
         cmd.extend(["--workdir", self.step.workdir])
     if self.autoawait:
         cmd.append("--autoawait")
     cmd.extend(["--local-ns-file", user_ns_path])
     if postload_input_serializers:
         postload_serializers_path = await self._serialize_to_remote_file(
             job, postload_input_serializers)
         cmd.extend(
             ["--postload-input-serializers", postload_serializers_path])
     if predump_output_serializers:
         predump_serializers_path = await self._serialize_to_remote_file(
             job, predump_output_serializers)
         cmd.extend(
             ["--predump-output-serializers", predump_serializers_path])
     for name in output_names:
         cmd.extend(["--output-name", name])
     cmd.extend([code_path, output_path])
     # Execute command
     if connector is not None:
         resources = job.get_resources()
         logger.info(
             'Executing job {job} on resource {resource} into directory {outdir}:\n{command}'
             .format(
                 job=job.name,
                 resource=resources[0] if resources else None,
                 outdir=job.output_directory,
                 command=' \\\n\t'.join(cmd),
             ))
         # If step is assigned to multiple resources, add the STREAMFLOW_HOSTS environment variable
         if len(resources) > 1:
             available_resources = await connector.get_available_resources(
                 self.step.target.service)
             hosts = {
                 k: v.hostname
                 for k, v in available_resources.items() if k in resources
             }
             environment['STREAMFLOW_HOSTS'] = ','.join(hosts.values())
         # Configure standard streams
         stdin = self.stdin
         stdout = self.stdout if self.stdout is not None else STDOUT
         stderr = self.stderr if self.stderr is not None else stdout
         # Execute command
         result, exit_code = await connector.run(
             resources[0] if resources else None,
             cmd,
             environment=environment,
             workdir=job.output_directory,
             stdin=stdin,
             stdout=stdout,
             stderr=stderr,
             capture_output=True,
             job_name=job.name)
     else:
         logger.info(
             'Executing job {job} into directory {outdir}: \n{command}'.
             format(job=job.name,
                    outdir=job.output_directory,
                    command=' \\\n\t'.join(cmd)))
         # Configure standard streams
         stdin = open(self.stdin, "rb") if self.stdin is not None else None
         stdout = open(self.stdout,
                       "wb") if self.stdout is not None else None
         stderr = open(self.stderr,
                       "wb") if self.stderr is not None else None
         # Execute command
         proc = await asyncio.create_subprocess_exec(
             *cmd,
             cwd=job.output_directory,
             env={
                 **os.environ,
                 **environment
             },
             stdin=stdin,
             stdout=stdout,
             stderr=stderr)
         result, error = await proc.communicate()
         exit_code = proc.returncode
         # Close streams
         if stdin is not None:
             stdin.close()
         if stdout is not None:
             stdout.close()
         if stderr is not None:
             stderr.close()
     # Retrieve outputs
     with TemporaryDirectory() as d:
         dest_path = os.path.join(d, path_processor.basename(output_path))
         await self.step.context.data_manager.transfer_data(src=output_path,
                                                            src_job=job,
                                                            dst=dest_path,
                                                            dst_job=None)
         with open(dest_path, mode='r') as f:
             json_output = json.load(f)
     # Infer status
     status = Status[json_output[executor.CELL_STATUS]]
     if status == Status.COMPLETED:
         command_stdout = json_output[executor.CELL_OUTPUT]
         # TODO: understand why we obtain a list here
         if isinstance(command_stdout, MutableSequence):
             command_stdout = command_stdout[0]
         user_ns = await self._deserialize_namespace(
             job=job,
             output_serializers=output_serializers,
             remote_path=json_output[executor.CELL_LOCAL_NS])
     else:
         command_stdout = json_output[executor.CELL_OUTPUT]
         user_ns = {}
     # Return the command output object
     return JupyterCommandOutput(value=command_stdout,
                                 status=status,
                                 user_ns=user_ns)
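
The local branch of execute() wires optional file redirections into asyncio.create_subprocess_exec and then closes whatever it opened. The stream handling in miniature, with the error paths simplified:

import asyncio
import os
from typing import Optional, Sequence

async def run_locally(cmd: Sequence[str],
                      stdout_path: Optional[str] = None) -> int:
    # Redirect to a file when requested, otherwise inherit the parent's stdout
    stdout = open(stdout_path, 'wb') if stdout_path is not None else None
    try:
        proc = await asyncio.create_subprocess_exec(
            *cmd, env=dict(os.environ), stdout=stdout)
        await proc.communicate()
        return proc.returncode
    finally:
        # Mirror the explicit close calls in the snippet above
        if stdout is not None:
            stdout.close()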
Example #12
    async def run_workflow(self, notebook):
        result = ExecutionResult(None)

        def error_before_exec(val):
            result.error_before_exec = val
            self.last_execution_succeeded = False
            self.last_execution_result = result
            return result

        cells = [
            self.transform_cell(cell['code']) for cell in notebook['cells']
        ]
        with self.builtin_trap, self.display_trap:
            try:
                # Extract code from the cells
                jupyter_cells = []
                for cell, metadata in zip(cells, [
                        c.get('metadata', {'step': {}})
                        for c in notebook['cells']
                ]):
                    cell_name = self.compile.cache(cell,
                                                   self.execution_count,
                                                   raw_code=cell)
                    code_ast = self.compile.ast_parse(cell, filename=cell_name)
                    code_ast = self.transform_ast(code_ast)
                    to_run = [(node, 'exec') for node in code_ast.body]
                    jupyter_cells.append(
                        JupyterCell(name=cell_name,
                                    code=to_run,
                                    compiler=self.compile,
                                    metadata=metadata))
                # Build workflow
                translator = JupyterNotebookTranslator(context=self.context)
                workflow = await translator.translate(
                    JupyterNotebook(cells=jupyter_cells,
                                    autoawait=self.autoawait,
                                    metadata=notebook.get('metadata')))
                # Inject inputs
                input_injector = BaseJob(name=utils.random_name(),
                                         step=BaseStep(utils.random_name(),
                                                       self.context),
                                         inputs=[])
                for step in workflow.steps.values():
                    await self._inject_inputs(step=step, job=input_injector)
            except self.custom_exceptions as e:
                etype, value, tb = sys.exc_info()
                self.CustomTB(etype, value, tb)
                return error_before_exec(e)
            except (InputRejected, WorkflowDefinitionException) as e:
                self.showtraceback()
                return error_before_exec(e)
            except IndentationError as e:
                self.showindentationerror()
                return error_before_exec(e)
            except (OverflowError, SyntaxError, ValueError, TypeError,
                    MemoryError) as e:
                self.showsyntaxerror()
                return error_before_exec(e)
            self.displayhook.exec_result = result
            # Execute workflow
            d = tempfile.mkdtemp()
            try:
                with open(os.devnull, 'w') as devnull:
                    with redirect_stdout(devnull), redirect_stderr(devnull):
                        await StreamFlowExecutor(
                            context=self.context,
                            workflow=workflow).run(output_dir=d)
                        # Print output logs
                        output_retriever = utils.random_name()
                        d = tempfile.mkdtemp()
                        result.result = {}
                        for step in workflow.steps.values():
                            output = await _get_output(
                                step=step,
                                output_retriever=output_retriever,
                                d=d)
                            if output:
                                result.result[step.name] = output
            except:
                if result:
                    result.error_before_exec = sys.exc_info()[1]
                self.showtraceback()
        return result
Example #13
 async def _build_token_value(
         self,
         job: Job,
         token_value: Any,
         load_contents: Optional[bool] = None,
         load_listing: Optional[LoadListing] = None) -> Any:
     if load_contents is None:
         load_contents = self.load_contents
     if token_value is None:
         return self.default_value
     elif isinstance(token_value, MutableSequence):
         value_tasks = []
         for t in token_value:
             value_tasks.append(
                 asyncio.create_task(
                     self._build_token_value(job, t, load_contents,
                                             load_listing)))
         return await asyncio.gather(*value_tasks)
     elif (isinstance(token_value, MutableMapping) and token_value.get(
             'class', token_value.get('type')) in ['File', 'Directory']):
         step = job.step if job is not None else self.port.step
         # Get filepath
         filepath = get_path_from_token(token_value)
         if filepath is not None:
             # Process secondary files in token value
             sf_map = {}
             if 'secondaryFiles' in token_value:
                 sf_tasks = []
                 for sf in token_value.get('secondaryFiles', []):
                     sf_path = get_path_from_token(sf)
                     path_processor = get_path_processor(step)
                     if not path_processor.isabs(sf_path):
                         sf_path = path_processor.join(
                             path_processor.dirname(filepath), sf_path)
                     sf_tasks.append(
                         asyncio.create_task(
                             _get_file_token(step=step,
                                             job=job,
                                             token_class=sf['class'],
                                             filepath=sf_path,
                                             basename=sf.get('basename'),
                                             load_contents=load_contents,
                                             load_listing=load_listing
                                             or self.load_listing)))
                 sf_map = {
                     get_path_from_token(sf): sf
                     for sf in await asyncio.gather(*sf_tasks)
                 }
             # Compute the new token value
             token_value = await _get_file_token(
                 step=step,
                 job=job,
                 token_class=token_value.get('class',
                                             token_value.get('type')),
                 filepath=filepath,
                 basename=token_value.get('basename'),
                 load_contents=load_contents,
                 load_listing=load_listing or self.load_listing)
             # Compute new secondary files from port specification
             if self.secondary_files:
                 context = utils.build_context(job)
                 context['self'] = token_value
                 sf_tasks, sf_specs = [], []
                 for secondary_file in self.secondary_files:
                     # If pattern is an expression, evaluate it and process result
                     if '$(' in secondary_file.pattern or '${' in secondary_file.pattern:
                         sf_value = utils.eval_expression(
                             expression=secondary_file.pattern,
                             context=context,
                             full_js=self.full_js,
                             expression_lib=self.expression_lib)
                         if isinstance(sf_value, MutableSequence):
                             for sf in sf_value:
                                 sf_tasks.append(
                                     asyncio.create_task(
                                         self._process_secondary_file(
                                             job=job,
                                             secondary_file=sf,
                                             token_value=token_value,
                                             from_expression=True,
                                             existing_sf=sf_map,
                                             load_contents=load_contents,
                                             load_listing=load_listing
                                             or self.load_listing)))
                                 sf_specs.append(secondary_file)
                         else:
                             sf_tasks.append(
                                 asyncio.create_task(
                                     self._process_secondary_file(
                                         job=job,
                                         secondary_file=sf_value,
                                         token_value=token_value,
                                         from_expression=True,
                                         existing_sf=sf_map,
                                         load_contents=load_contents,
                                         load_listing=load_listing
                                         or self.load_listing)))
                             sf_specs.append(secondary_file)
                     # Otherwise, simply process the pattern string
                     else:
                         sf_tasks.append(
                             asyncio.create_task(
                                 self._process_secondary_file(
                                     job=job,
                                     secondary_file=secondary_file.pattern,
                                     token_value=token_value,
                                     from_expression=False,
                                     existing_sf=sf_map,
                                     load_contents=load_contents,
                                     load_listing=load_listing
                                     or self.load_listing)))
                         sf_specs.append(secondary_file)
                 for sf_value, sf_spec in zip(
                         await asyncio.gather(*sf_tasks), sf_specs):
                     if sf_value is not None:
                         sf_map[get_path_from_token(sf_value)] = sf_value
                     elif sf_spec.required:
                         raise WorkflowExecutionException(
                             "Required secondary file {sf} not found".
                             format(sf=sf_spec.pattern))
             # Add all secondary files to the token
             if sf_map:
                 token_value['secondaryFiles'] = list(sf_map.values())
         # If there is only a 'contents' field, create a file on the step's resource and build the token
         elif 'contents' in token_value:
             path_processor = get_path_processor(self.port.step)
             filepath = path_processor.join(
                 job.output_directory,
                 token_value.get('basename', random_name()))
             connector = job.step.get_connector()
             resources = ((job.get_resources() or [None])
                          if job is not None else [None])
             await asyncio.gather(*[
                 asyncio.create_task(
                     remotepath.write(connector, res, filepath,
                                      token_value['contents']))
                 for res in resources
             ])
             token_value = await _get_file_token(
                 step=step,
                 job=job,
                 token_class=token_value.get('class',
                                             token_value.get('type')),
                 filepath=filepath,
                 basename=token_value.get('basename'),
                 load_contents=load_contents,
                 load_listing=load_listing or self.load_listing)
     return token_value
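
Both branches above lean on get_path_from_token to locate File and Directory values. Going by CWL conventions, where such objects expose 'path' and 'location' fields, a plausible sketch (the real helper may also normalize file:// URIs):

from typing import Any, MutableMapping, Optional

def get_path_from_token(token_value: MutableMapping[str, Any]) -> Optional[str]:
    # Assumption: CWL-style File/Directory objects carry their location in
    # 'path' (local) or 'location' (possibly a URI); prefer the former.
    return token_value.get('path') or token_value.get('location')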