async def _undeploy_node(self, name: Text, job_id: Text):
    undeploy_command = "".join([
        "occam-kill ",
        "{job_id}"
    ]).format(job_id=job_id)
    logger.debug("Executing {command}".format(command=undeploy_command))
    async with self._get_ssh_client(name) as ssh_client:
        await ssh_client.run(undeploy_command)
    logger.info("Killed {resource}".format(resource=job_id))
async def _deploy_node(self, name: Text, service: MutableMapping[Text, Any], node: Text):
    deploy_command = "".join([
        "{workdir}"
        "occam-run ",
        "{x11}",
        "{node}",
        "{stdin}"
        "{jobidFile}"
        "{shmSize}"
        "{volumes}"
        "{image} "
        "{command}"
    ]).format(
        workdir="cd {workdir} && ".format(workdir=service.get('workdir')) if 'workdir' in service else "",
        x11=self.get_option("x", service.get('x11')),
        node=self.get_option("n", node),
        stdin=self.get_option("i", service.get('stdin')),
        jobidFile=self.get_option("c", service.get('jobidFile')),
        shmSize=self.get_option("s", service.get('shmSize')),
        volumes=self.get_option("v", service.get('volumes')),
        image=service['image'],
        command=" ".join(service.get('command', "")))
    logger.debug("Executing {command}".format(command=deploy_command))
    async with self._get_ssh_client(name) as ssh_client:
        result = await ssh_client.run(deploy_command)
    output = result.stdout
    search_result = re.findall('({node}-[0-9]+).*'.format(node=node), output, re.MULTILINE)
    if search_result:
        if name not in self.jobs_table:
            self.jobs_table[name] = []
        self.jobs_table[name].append(search_result[0])
        logger.info("Deployed {name} on {resource}".format(
            name=name, resource=search_result[0]))
    else:
        raise Exception("Deployment of {name} failed: no job id found in Occam output".format(name=name))
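# NOTE (editorial sketch, not part of StreamFlow): _deploy_node above assumes a get_option
# helper that turns a (flag, value) pair into an "occam-run" command-line fragment. The real
# implementation is not shown in this listing; a minimal version consistent with the calls
# above could look like the following.
from collections.abc import MutableSequence
from typing import Any, Text

def get_option(name: Text, value: Any) -> Text:
    # Nothing to add when the option is absent
    if value is None:
        return ""
    # Boolean flags (e.g. -x for X11 forwarding) are emitted without a value
    elif isinstance(value, bool):
        return "-{n} ".format(n=name) if value else ""
    # Repeatable options (e.g. several -v volume mounts) repeat the flag
    elif isinstance(value, MutableSequence):
        return "".join("-{n} {v} ".format(n=name, v=v) for v in value)
    # Plain options are rendered as "-flag value "
    else:
        return "-{n} {v} ".format(n=name, v=value)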
async def schedule(self, job: Job, scheduling_policy: Policy = None) -> None:
    async with self.wait_queue:
        model_name = job.step.target.model.name
        connector = self.context.deployment_manager.get_connector(model_name)
        while True:
            available_resources = dict(await connector.get_available_resources(job.step.target.service))
            selected_resources = self._get_resources(job, scheduling_policy, available_resources)
            if selected_resources is not None:
                break
            try:
                await asyncio.wait_for(self.wait_queue.wait(), timeout=self.retry_interval)
            except asyncio.TimeoutError:
                pass
        if len(selected_resources) == 1:
            logger.info("Job {name} allocated on resource {resource}".format(
                name=job.name, resource=selected_resources[0]))
        else:
            logger.info("Job {name} allocated on resources {resources}".format(
                name=job.name, resources=', '.join(selected_resources)))
        self.job_allocations[job.name] = JobAllocation(job, selected_resources, Status.RUNNING)
        for selected_resource in selected_resources:
            if selected_resource not in self.resource_allocations:
                self.resource_allocations[selected_resource] = ResourceAllocation(selected_resource, model_name)
            self.resource_allocations[selected_resource].jobs.append(job.name)
def terminate(self, status: Status):
    if not self.terminated:
        # Add a TerminationToken to each output port
        for port in self.output_ports.values():
            port.put(TerminationToken(name=port.name))
        self.status = status
        self.terminated = True
        logger.info("Step {name} terminated with status {status}".format(
            name=self.name, status=status.name))
async def notify_status(self, job_name: str, status: Status) -> None:
    async with self.wait_queue:
        if job_name in self.job_allocations:
            if status != self.job_allocations[job_name].status:
                self.job_allocations[job_name].status = status
                logger.info("Job {name} changed status to {status}".format(
                    name=job_name, status=status.name))
            if status in [Status.COMPLETED, Status.FAILED]:
                self.wait_queue.notify_all()
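# NOTE (editorial sketch, not part of StreamFlow): schedule() and notify_status() above
# cooperate through the same asyncio.Condition (self.wait_queue): the scheduler retries its
# resource selection whenever a job completes or fails, or at the latest after retry_interval
# seconds. The toy example below, with made-up names, isolates that wait/notify pattern.
import asyncio

async def wait_for_resource(cond: asyncio.Condition, state: dict, retry_interval: float = 5.0):
    async with cond:
        while not state["free"]:
            try:
                # Wake up on notification or, at the latest, after retry_interval seconds
                await asyncio.wait_for(cond.wait(), timeout=retry_interval)
            except asyncio.TimeoutError:
                pass
        print("resource allocated")

async def release_resource(cond: asyncio.Condition, state: dict):
    await asyncio.sleep(0.1)      # simulate a running job
    async with cond:
        state["free"] = True      # the job terminated, its resource is free again
        cond.notify_all()         # wake up every pending scheduling request

async def main():
    cond = asyncio.Condition()
    state = {"free": False}
    await asyncio.gather(wait_for_resource(cond, state), release_resource(cond, state))

asyncio.run(main())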
def main(args):
    args = parser.parse_args(args)
    if args.version:
        from streamflow.version import VERSION
        print("StreamFlow version {version}".format(version=VERSION))
        exit()
    if args.quiet:
        logger.setLevel(logging.WARN)
    if platform.python_implementation() == 'CPython':
        logger.info('CPython detected: using uvloop EventLoop implementation')
        uvloop.install()
    asyncio.run(_async_main(args))
async def undeploy(self, model_name: Text):
    if model_name in dict(self.deployments_map):
        await self.events_map[model_name].wait()
        self.events_map[model_name].clear()
        connector = self.deployments_map[model_name]
        config = self.config_map[model_name]
        if not config.external:
            logger.info("Undeploying model {model}".format(model=model_name))
        await connector.undeploy(config.external)
        del self.deployments_map[model_name]
        del self.config_map[model_name]
        self.events_map[model_name].set()
async def replay_job(self, replay_request: ReplayRequest) -> ReplayResponse:
    sender_job = self.context.scheduler.get_job(replay_request.sender)
    target_job = self.context.scheduler.get_job(replay_request.target)
    if target_job.name not in self.jobs:
        self.jobs[target_job.name] = JobVersion(target_job.name, version=0)
    if target_job.name not in self.wait_queues:
        self.wait_queues[target_job.name] = Condition()
    wait_queue = self.wait_queues[target_job.name]
    async with wait_queue:
        if (target_job.name not in self.replay_cache or
                self.replay_cache[target_job.name].version < replay_request.version):
            # Reschedule job
            logger.info("Rescheduling job {job}".format(job=target_job.name))
            command_output = CommandOutput(value=None, status=Status.FAILED)
            self.replay_cache[target_job.name] = ReplayResponse(
                job=target_job.name,
                outputs=None,
                version=self.jobs[target_job.name].version + 1)
            try:
                if sender_job.step.target is not None:
                    await self.context.scheduler.notify_status(sender_job.name, Status.WAITING)
                command_output = await self._replay_job(self.jobs[target_job.name])
            finally:
                if target_job.step.target is not None:
                    await self.context.scheduler.notify_status(target_job.name, command_output.status)
            # Retrieve output
            output_ports = target_job.step.output_ports.values()
            output_tasks = []
            for output_port in output_ports:
                output_tasks.append(asyncio.create_task(
                    output_port.token_processor.compute_token(target_job, command_output)))
            self.replay_cache[target_job.name].outputs = {
                port.name: token for (port, token) in
                zip(output_ports, await asyncio.gather(*output_tasks))}
            wait_queue.notify_all()
        elif self.replay_cache[target_job.name].outputs is None:
            # Wait for job completion
            await wait_queue.wait()
        return self.replay_cache[target_job.name]
async def execute(self, job: Job) -> CWLCommandOutput:
    context = utils.build_context(job)
    if self.initial_work_dir is not None:
        await self._prepare_work_dir(job, context, self.initial_work_dir)
    logger.info('Evaluating expression for job {job}'.format(job=job.name))
    timeout = self._get_timeout(job)
    result = eval_expression(
        expression=self.expression,
        context=context,
        full_js=self.full_js,
        expression_lib=self.expression_lib,
        timeout=timeout)
    return CWLCommandOutput(value=result, status=Status.COMPLETED, exit_code=0)
async def _recover_path(self,
                        job: Job,
                        resources: MutableSequence[Text],
                        token: Token,
                        path: Text) -> Optional[Text]:
    context = self.get_context()
    connector = self.port.step.get_connector()
    job_resources = job.get_resources() or [None]
    # Check if path is already present in the current job's resources
    for resource in job_resources:
        if await remotepath.exists(connector, resource, path):
            return path
    # Otherwise, get the list of other file locations from the DataManager
    data_locations = set()
    for resource in resources:
        data_locations.update(context.data_manager.get_data_locations(
            resource, path, DataLocationType.PRIMARY))
    # Check if path is still present in the original resources
    for location in data_locations:
        if location.resource in job_resources:
            if await remotepath.exists(connector, location.resource, path):
                return path
            else:
                context.data_manager.invalidate_location(location.resource, path)
    # Check if files are saved locally
    for location in data_locations:
        if location.resource == LOCAL_RESOURCE:
            return await self._transfer_file(None, job, location.path)
    # If not, check if files are stored elsewhere
    for location in data_locations:
        if location.resource not in job_resources and location.resource != LOCAL_RESOURCE:
            location_job = context.scheduler.get_job(location.job)
            location_connector = location_job.step.get_connector()
            available_resources = await location_connector.get_available_resources(
                location_job.step.target.service)
            if (location.resource in available_resources and
                    await remotepath.exists(location_connector, location.resource, location.path)):
                return await self._transfer_file(location_job, job, location.path)
            else:
                context.data_manager.invalidate_location(location.resource, location.path)
    # If the file has been lost, raise an exception
    message = "Failed to recover path {path} for token {token} from job {job}".format(
        path=path, token=token.name, job=token.job)
    logger.info(message)
    raise UnrecoverableTokenException(message, token)
async def deploy(self, model_config: ModelConfig):
    model_name = model_config.name
    while True:
        if model_name not in self.events_map:
            self.events_map[model_name] = Event()
        if model_name not in self.config_map:
            self.config_map[model_name] = model_config
            connector = connector_classes[model_config.connector_type](
                self.streamflow_config_dir, **model_config.config)
            self.deployments_map[model_name] = connector
            if not model_config.external:
                logger.info("Deploying model {model}".format(model=model_name))
            await connector.deploy(model_config.external)
            self.events_map[model_name].set()
            break
        else:
            await self.events_map[model_name].wait()
            if model_name in self.config_map:
                break
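# NOTE (editorial sketch, not part of StreamFlow): deploy() and undeploy() above use one
# asyncio.Event per model to serialize concurrent requests: the first deploy() call builds
# the connector and sets the Event, while every other caller simply waits for it. The toy
# example below, with made-up names, isolates that handshake.
import asyncio

deployments = {}
events = {}

async def deploy_once(name: str):
    if name not in events:
        events[name] = asyncio.Event()
    if name not in deployments:
        deployments[name] = object()      # stand-in for the real connector
        await asyncio.sleep(0.1)          # simulate the actual deployment work
        events[name].set()                # unblock every concurrent caller
    else:
        await events[name].wait()         # someone else is (or was) deploying this model
    return deployments[name]

async def main():
    # Ten concurrent requests, a single deployment
    connectors = await asyncio.gather(*(deploy_once("model-a") for _ in range(10)))
    assert all(c is connectors[0] for c in connectors)

asyncio.run(main())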
async def execute(self, job: Job) -> CommandOutput:
    context = utils.build_context(job)
    logger.info('Executing job {job}'.format(job=job.name))
    # Process expressions
    processed_inputs = {}
    for k, v in self.input_expressions.items():
        context = {**context, **{'self': context['inputs'][k]}}
        processed_inputs[k] = utils.eval_expression(
            expression=v,
            context=context,
            full_js=self.full_js,
            expression_lib=self.expression_lib)
    context['inputs'] = {**context['inputs'], **processed_inputs}
    # If the condition is satisfied, return the updated inputs
    if self._evaulate_condition(context):
        return CWLCommandOutput(value=context['inputs'], status=Status.COMPLETED, exit_code=0)
    # Otherwise, skip and return None
    else:
        return CWLCommandOutput(
            value={t.name: None for t in job.inputs},
            status=Status.SKIPPED,
            exit_code=0)
async def execute(self, job: Job) -> CommandOutput:
    connector = self.step.get_connector()
    # Transfer executor file to remote resource
    executor_path = await self._transfer_file(job, os.path.join(executor.__file__))
    # Modify code, environment and namespaces according to inputs
    input_names = {}
    environment = {}
    for token in job.inputs:
        if token.value is not None:
            command_token = self.input_tokens[token.name]
            token_value = ([token.value]
                           if isinstance(self.step.input_ports[token.name], ScatterInputPort)
                           else token.value)
            if command_token.token_type == 'file':
                input_names[token.name] = token_value
            elif command_token.token_type == 'name':
                input_names[token.name] = token_value
            elif command_token.token_type == 'env':
                environment[token.name] = token_value
    # List output names to be retrieved from remote context
    output_names = [name for name, p in self.step.output_ports.items()
                    if name != executor.CELL_OUTPUT]
    # Serialize AST nodes to remote resource
    code_path = await self._serialize_to_remote_file(job, self.ast_nodes)
    # Configure output file path
    path_processor = get_path_processor(self.step)
    output_path = path_processor.join(job.output_directory, random_name())
    # Extract serializers from command tokens
    input_serializers = {
        k: v.serializer for k, v in self.input_tokens.items()
        if v.serializer is not None}
    output_serializers = {
        k: v.serializer for k, v in self.output_tokens.items()
        if v.serializer is not None}
    # Serialize namespaces to remote resource
    user_ns_path = await self._serialize_namespace(
        input_serializers=input_serializers,
        job=job,
        namespace=input_names)
    # Create dictionaries of postload input serializers and predump output serializers
    postload_input_serializers = {
        k: {'postload': v['postload']}
        for k, v in input_serializers.items() if 'postload' in v}
    predump_output_serializers = {
        k: {'predump': v['predump']}
        for k, v in output_serializers.items() if 'predump' in v}
    # Parse command
    cmd = [self.interpreter, executor_path]
    if os.path.basename(self.interpreter) == 'ipython':
        cmd.append('--')
    if self.step.workdir:
        cmd.extend(["--workdir", self.step.workdir])
    if self.autoawait:
        cmd.append("--autoawait")
    cmd.extend(["--local-ns-file", user_ns_path])
    if postload_input_serializers:
        postload_serializers_path = await self._serialize_to_remote_file(
            job, postload_input_serializers)
        cmd.extend(["--postload-input-serializers", postload_serializers_path])
    if predump_output_serializers:
        predump_serializers_path = await self._serialize_to_remote_file(
            job, predump_output_serializers)
        cmd.extend(["--predump-output-serializers", predump_serializers_path])
    for name in output_names:
        cmd.extend(["--output-name", name])
    cmd.extend([code_path, output_path])
    # Execute command
    if connector is not None:
        resources = job.get_resources()
        logger.info(
            'Executing job {job} on resource {resource} into directory {outdir}:\n{command}'.format(
                job=job.name,
                resource=resources[0] if resources else None,
                outdir=job.output_directory,
                command=' \\\n\t'.join(cmd)))
        # If step is assigned to multiple resources, add the STREAMFLOW_HOSTS environment variable
        if len(resources) > 1:
            available_resources = await connector.get_available_resources(self.step.target.service)
            hosts = {k: v.hostname for k, v in available_resources.items() if k in resources}
            environment['STREAMFLOW_HOSTS'] = ','.join(hosts.values())
        # Configure standard streams
        stdin = self.stdin
        stdout = self.stdout if self.stdout is not None else STDOUT
        stderr = self.stderr if self.stderr is not None else stdout
        # Execute command
        result, exit_code = await connector.run(
            resources[0] if resources else None,
            cmd,
            environment=environment,
            workdir=job.output_directory,
            stdin=stdin,
            stdout=stdout,
            stderr=stderr,
            capture_output=True,
            job_name=job.name)
    else:
        logger.info(
            'Executing job {job} into directory {outdir}:\n{command}'.format(
                job=job.name,
                outdir=job.output_directory,
                command=' \\\n\t'.join(cmd)))
        # Configure standard streams
        stdin = open(self.stdin, "rb") if self.stdin is not None else None
        stdout = open(self.stdout, "wb") if self.stdout is not None else None
        stderr = open(self.stderr, "wb") if self.stderr is not None else None
        # Execute command
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            cwd=job.output_directory,
            env={**os.environ, **environment},
            stdin=stdin,
            stdout=stdout,
            stderr=stderr)
        result, error = await proc.communicate()
        exit_code = proc.returncode
        # Close streams
        if stdin is not None:
            stdin.close()
        if stdout is not None:
            stdout.close()
        if stderr is not None:
            stderr.close()
    # Retrieve outputs
    with TemporaryDirectory() as d:
        dest_path = os.path.join(d, path_processor.basename(output_path))
        await self.step.context.data_manager.transfer_data(
            src=output_path,
            src_job=job,
            dst=dest_path,
            dst_job=None)
        with open(dest_path, mode='r') as f:
            json_output = json.load(f)
    # Infer status
    status = Status[json_output[executor.CELL_STATUS]]
    if status == Status.COMPLETED:
        command_stdout = json_output[executor.CELL_OUTPUT]
        if isinstance(command_stdout, MutableSequence):  # TODO: understand why we obtain a list here
            command_stdout = command_stdout[0]
        user_ns = await self._deserialize_namespace(
            job=job,
            output_serializers=output_serializers,
            remote_path=json_output[executor.CELL_LOCAL_NS])
    else:
        command_stdout = json_output[executor.CELL_OUTPUT]
        user_ns = {}
    # Return the command output object
    return JupyterCommandOutput(value=command_stdout, status=status, user_ns=user_ns)
async def copy(self,
               src: Text,
               dst: Text,
               resources: MutableSequence[Text],
               kind: ConnectorCopyKind,
               source_remote: Optional[Text] = None,
               read_only: bool = False) -> None:
    if kind == ConnectorCopyKind.REMOTE_TO_REMOTE:
        if source_remote is None:
            raise Exception("Source resource is mandatory for remote to remote copy")
        if len(resources) > 1:
            logger.info(
                "Copying {src} on resource {source_remote} to {dst} on resources:\n\t{resources}".format(
                    source_remote=source_remote,
                    src=src,
                    dst=dst,
                    resources='\n\t'.join(resources)))
        else:
            logger.info(
                "Copying {src} on resource {source_remote} to {dst} on resource {resource}".format(
                    source_remote=source_remote,
                    src=src,
                    dst=dst,
                    resource=resources[0]))
        await self._copy_remote_to_remote(
            src=src,
            dst=dst,
            resources=resources,
            source_remote=source_remote,
            read_only=read_only)
    elif kind == ConnectorCopyKind.LOCAL_TO_REMOTE:
        if len(resources) > 1:
            logger.info(
                "Copying {src} on local file-system to {dst} on resources:\n\t{resources}".format(
                    src=src,
                    dst=dst,
                    resources='\n\t'.join(resources)))
        else:
            logger.info(
                "Copying {src} on local file-system to {dst} on resource {resource}".format(
                    src=src,
                    dst=dst,
                    resource=resources[0]))
        await self._copy_local_to_remote(
            src=src,
            dst=dst,
            resources=resources,
            read_only=read_only)
    elif kind == ConnectorCopyKind.REMOTE_TO_LOCAL:
        if len(resources) > 1:
            raise Exception("Copy from multiple resources is not supported")
        logger.info(
            "Copying {src} on resource {resource} to {dst} on local file-system".format(
                src=src,
                dst=dst,
                resource=resources[0]))
        await self._copy_remote_to_local(
            src=src,
            dst=dst,
            resource=resources[0],
            read_only=read_only)
    else:
        raise NotImplementedError
async def handle_failure(self, job: Job, command_output: CommandOutput) -> CommandOutput:
    logger.info("Handling command failure for job {job}".format(job=job.name))
    return await self._do_handle_failure(job)
async def handle_exception(self, job: Job, exception: BaseException) -> CommandOutput:
    logger.info("Handling {exception} failure for job {job}".format(
        job=job.name, exception=type(exception).__name__))
    return await self._do_handle_failure(job)
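# NOTE (editorial sketch, not part of StreamFlow): both handlers above delegate to a
# _do_handle_failure coroutine whose body is not included in this listing. A minimal
# retry-based version, consistent with the JobVersion bookkeeping used by replay_job()
# and assuming a hypothetical self.max_retries attribute, could look like this.
async def _do_handle_failure(self, job: Job) -> CommandOutput:
    if job.name not in self.jobs:
        self.jobs[job.name] = JobVersion(job.name, version=0)
    if self.jobs[job.name].version >= self.max_retries:
        # Give up after too many attempts and let the step fail
        raise FailureHandlingException(
            "Job {job} failed {n} times, no further retries".format(
                job=job.name, n=self.jobs[job.name].version))
    self.jobs[job.name].version += 1
    # Re-execute the job and return its (possibly recovered) command output
    return await self._replay_job(self.jobs[job.name])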
async def _run(self,
               resource: Text,
               command: MutableSequence[Text],
               environment: MutableMapping[Text, Text] = None,
               workdir: Optional[Text] = None,
               stdin: Optional[Union[int, Text]] = None,
               stdout: Union[int, Text] = asyncio.subprocess.STDOUT,
               stderr: Union[int, Text] = asyncio.subprocess.STDOUT,
               job_name: Optional[Text] = None,
               capture_output: bool = False,
               encode: bool = True,
               interactive: bool = False,
               stream: bool = False
               ) -> Union[Optional[Tuple[Optional[Any], int]], asyncio.subprocess.Process]:
    # TODO: find a smarter way to identify detachable jobs when implementing stacked connectors
    if job_name:
        command = utils.create_command(
            command=command,
            environment=environment,
            workdir=workdir)
        logger.debug("Executing command {command} on {resource} {job}".format(
            command=command,
            resource=resource,
            job="for job {job}".format(job=job_name) if job_name else ""))
        helper_file = await self._build_helper_file(command, resource, environment, workdir)
        job_id = await self._run_batch_command(
            helper_file=helper_file,
            job_name=job_name,
            resource=resource,
            workdir=workdir,
            stdin=stdin,
            stdout=stdout,
            stderr=stderr)
        logger.info("Scheduled job {job} with job id {job_id}".format(
            job=job_name, job_id=job_id))
        self.scheduledJobs.append(job_id)
        self.jobsCache.clear()
        while True:
            async with self.jobsCacheLock:
                running_jobs = await self._get_running_jobs(resource)
            if job_id not in running_jobs:
                break
            await asyncio.sleep(self.pollingInterval)
        self.scheduledJobs.remove(job_id)
        return (
            await self._get_output(job_id, resource) if stdout == STDOUT else None,
            await self._get_returncode(job_id, resource))
    else:
        return await super()._run(
            resource=resource,
            command=command,
            environment=environment,
            workdir=workdir,
            stdin=stdin,
            stdout=stdout,
            stderr=stderr,
            job_name=job_name,
            capture_output=capture_output,
            encode=encode,
            interactive=interactive,
            stream=stream)
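# NOTE (editorial sketch, not part of StreamFlow): _run above relies on utils.create_command
# to flatten the command, its environment and the working directory into a single shell line
# before writing the batch helper file. The exact helper is not shown in this listing; a
# minimal version compatible with that usage could be the following.
import shlex
from typing import MutableMapping, MutableSequence, Optional, Text

def create_command(command: MutableSequence[Text],
                   environment: Optional[MutableMapping[Text, Text]] = None,
                   workdir: Optional[Text] = None) -> Text:
    parts = []
    if environment:
        # Export environment variables before running the command
        parts.extend("export {k}={v}".format(k=k, v=shlex.quote(str(v)))
                     for k, v in environment.items())
    if workdir:
        # Move into the working directory first
        parts.append("cd {w}".format(w=shlex.quote(workdir)))
    parts.append(" ".join(command))
    return " && ".join(parts)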
async def _run_job(self, inputs: MutableSequence[Token]) -> Status:
    # Create job
    job = BaseJob(
        name=posixpath.join(self.name, asyncio.current_task().get_name()),
        step=self,
        inputs=inputs)
    logger.info("Job {name} created".format(name=job.name))
    # Initialize command output with default values
    command_output = CommandOutput(value=None, status=Status.FAILED)
    try:
        # Setup runtime environment
        if self.target is not None:
            await self.context.deployment_manager.deploy(self.target.model)
            await self.context.scheduler.schedule(job)
        # Initialize job
        await job.initialize()
        # Update tokens after target assignment
        job.inputs = await asyncio.gather(*[
            asyncio.create_task(
                self.input_ports[token.name].token_processor.update_token(job, token))
            for token in inputs])
        # Run job
        command_output = await job.run()
        if command_output.status == Status.FAILED:
            self.terminate(command_output.status)
    # When receiving a KeyboardInterrupt, propagate it (to allow debugging)
    except KeyboardInterrupt:
        raise
    # When receiving a CancelledError, mark the step as Skipped
    except CancelledError:
        command_output.status = Status.SKIPPED
        self.terminate(command_output.status)
    # When receiving a FailureHandling exception, mark the step as Failed
    except FailureHandlingException:
        command_output.status = Status.FAILED
        self.terminate(command_output.status)
    # When receiving a generic exception, try to handle it
    except BaseException as e:
        logger.exception(e)
        try:
            command_output = await self.context.failure_manager.handle_exception(job, e)
        # If the failure cannot be recovered, simply fail
        except BaseException as ie:
            if ie != e:
                logger.exception(ie)
            command_output.status = Status.FAILED
            self.terminate(command_output.status)
    finally:
        # Notify completion to scheduler
        if self.target is not None:
            await self.context.scheduler.notify_status(job.name, command_output.status)
    # Retrieve output tokens
    if not self.terminated:
        try:
            await asyncio.gather(*[
                asyncio.create_task(_retrieve_output(job, output_port, command_output))
                for output_port in self.output_ports.values()])
        except BaseException as e:
            logger.exception(e)
            command_output.status = Status.FAILED
    # Return job status
    logger.info("Job {name} terminated with status {status}".format(
        name=job.name, status=command_output.status.name))
    return command_output.status
async def execute(self, job: Job) -> CWLCommandOutput:
    context = utils.build_context(job)
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("Job {job} inputs: {inputs}".format(
            job=job.name,
            inputs=json.dumps(context['inputs'], indent=4, sort_keys=True)))
    if self.initial_work_dir is not None:
        await self._prepare_work_dir(job, context, self.initial_work_dir)
    cmd = self._get_executable_command(context)
    parsed_env = {
        k: str(eval_expression(
            expression=v,
            context=context,
            full_js=self.full_js,
            expression_lib=self.expression_lib))
        for (k, v) in self.environment.items()}
    if 'HOME' not in parsed_env:
        parsed_env['HOME'] = job.output_directory
    if 'TMPDIR' not in parsed_env:
        parsed_env['TMPDIR'] = job.tmp_directory
    if self.step.target is None:
        if self.is_shell_command:
            cmd = ["/bin/sh", "-c", " ".join(cmd)]
        # Open streams
        stderr = self._get_stream(job, context, self.stderr, sys.stderr)
        stdin = self._get_stream(job, context, self.stdin, sys.stdin, is_input=True)
        stdout = self._get_stream(job, context, self.stdout, sys.stderr)
        # Execute command
        logger.info(
            'Executing job {job} into directory {outdir}:\n{command}'.format(
                job=job.name,
                outdir=job.output_directory,
                command=' \\\n\t'.join(cmd)))
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            cwd=job.output_directory,
            env=parsed_env,
            stdin=stdin,
            stdout=stdout,
            stderr=stderr)
        result, error = await asyncio.wait_for(proc.communicate(), self._get_timeout(job))
        exit_code = proc.returncode
        # Close streams
        if stdin is not sys.stdin:
            stdin.close()
        if stdout is not sys.stderr:
            stdout.close()
        if stderr is not sys.stderr:
            stderr.close()
    else:
        connector = self.step.get_connector()
        resources = job.get_resources()
        logger.info(
            'Executing job {job} on resource {resource} into directory {outdir}:\n{command}'.format(
                job=job.name,
                resource=resources[0] if resources else None,
                outdir=job.output_directory,
                command=' \\\n\t'.join(
                    ["/bin/sh", "-c", "\"{cmd}\"".format(cmd=" ".join(cmd))]
                    if self.is_shell_command else cmd)))
        if self.is_shell_command:
            cmd = [
                "/bin/sh", "-c",
                "\"$(echo {command} | base64 -d)\"".format(
                    command=base64.b64encode(" ".join(cmd).encode('utf-8')).decode('utf-8'))]
        # If step is assigned to multiple resources, add the STREAMFLOW_HOSTS environment variable
        if len(resources) > 1:
            available_resources = await connector.get_available_resources(self.step.target.service)
            hosts = {k: v.hostname for k, v in available_resources.items() if k in resources}
            parsed_env['STREAMFLOW_HOSTS'] = ','.join(hosts.values())
        # Process streams
        stdin = eval_expression(
            expression=self.stdin,
            context=context,
            full_js=self.full_js,
            expression_lib=self.expression_lib)
        stdout = eval_expression(
            expression=self.stdout,
            context=context,
            full_js=self.full_js,
            expression_lib=self.expression_lib) if self.stdout is not None else STDOUT
        stderr = eval_expression(
            expression=self.stderr,
            context=context,
            full_js=self.full_js,
            expression_lib=self.expression_lib) if self.stderr is not None else stdout
        # Execute remote command
        result, exit_code = await asyncio.wait_for(
            connector.run(
                resources[0] if resources else None,
                cmd,
                environment=parsed_env,
                workdir=job.output_directory,
                stdin=stdin,
                stdout=stdout,
                stderr=stderr,
                capture_output=True,
                job_name=job.name),
            self._get_timeout(job))
    # Handle exit codes
    if self.failure_codes is not None and exit_code in self.failure_codes:
        status = Status.FAILED
    elif (self.success_codes is not None and exit_code in self.success_codes) or exit_code == 0:
        status = Status.COMPLETED
        if result:
            logger.info(result)
    else:
        status = Status.FAILED
    return CWLCommandOutput(value=result, status=status, exit_code=exit_code)