def __init__(
    self,
    builder,  # type: Builder
    joborder,  # type: JSON
    make_path_mapper,  # type: Callable[..., PathMapper]
    requirements,  # type: List[Dict[Text, Text]]
    hints,  # type: List[Dict[Text, Text]]
    name,  # type: Text
    runtime_context,
    url,
    spec,
    remote_storage_url=None,
    token=None,
):
    """Set up a TES-backed task.

    The first six arguments are forwarded unchanged to the parent
    initialiser. ``url`` and ``token`` are used to build the TES HTTP
    client; ``remote_storage_url`` and ``token`` are also kept on the
    instance.
    """
    super(TESTask, self).__init__(
        builder, joborder, make_path_mapper, requirements, hints, name
    )

    self.runtime_context = runtime_context
    self.spec = spec
    self.outputs = None
    self.inplace_update = False

    # Use the current working directory when the runtime context does not
    # provide a (non-empty) base directory.
    base_directory = runtime_context.basedir
    if not base_directory:
        base_directory = os.getcwd()
    self.basedir = base_directory
    self.fs_access = StdFsAccess(self.basedir)

    # Task bookkeeping; nothing has been submitted yet.
    self.id = None
    self.state = "UNKNOWN"
    self.exit_code = None

    # Polling settings.
    self.poll_interval = 1
    self.poll_retries = 10

    self.client = tes.HTTPClient(url, token=token)
    self.remote_storage_url = remote_storage_url
    self.token = token
def __init__(
    self,
    builder,  # type: Builder
    joborder,  # type: JobOrderType
    make_path_mapper,  # type: Callable[..., PathMapper]
    requirements,  # type: List[Dict[Text, Text]]
    hints,  # type: List[Dict[Text, Text]]
    name,  # type: Text
    runtime_context,
    url,
    spec,
):
    """Set up a TES-backed task.

    The first six arguments are forwarded unchanged to the parent
    initialiser; ``url`` is used to build the TES HTTP client.
    """
    super(TESTask, self).__init__(
        builder, joborder, make_path_mapper, requirements, hints, name
    )

    self.runtime_context = runtime_context
    self.spec = spec
    self.outputs = None
    self.inplace_update = False

    # Only a missing (None) base directory triggers the cwd fallback.
    configured_basedir = runtime_context.basedir
    self.basedir = (
        os.getcwd() if configured_basedir is None else configured_basedir
    )
    self.fs_access = StdFsAccess(self.basedir)

    self.id = None
    self.docker_workdir = '/var/spool/cwl'
    self.state = "UNKNOWN"

    # Polling settings.
    self.poll_interval = 1
    self.poll_retries = 10

    self.client = tes.HTTPClient(url)
def __cancel_tes_tasks(
    collection: Collection,
    run_id: str,
    url: str,
    timeout: int = 5,
):
    """Cancel individual TES tasks.

    Repeatedly looks up the TES task IDs recorded for ``run_id``, sends a
    cancel request for every ID not yet handled, waits ``timeout`` seconds
    and then re-reads the run document. The loop ends once the run's
    ``api.state`` is in ``States.FINISHED`` or the run document can no
    longer be found.

    Args:
        collection: Database collection holding the run documents.
        run_id: Identifier of the workflow run whose tasks are cancelled.
        url: Base URL of the TES service.
        timeout: Used both as the HTTP client timeout and as the pause
            (in seconds) between polling rounds.
    """
    tes_client = tes.HTTPClient(url, timeout=timeout)
    # A set gives O(1) membership tests and avoids rebuilding a list on
    # every round.
    canceled = set()

    while True:
        task_ids = db_utils.find_tes_task_ids(
            collection=collection,
            run_id=run_id,
        )
        pending = [item for item in task_ids if item not in canceled]
        for task_id in pending:
            try:
                tes_client.cancel_task(task_id)
            except HTTPError:
                # TODO: handle more robustly: only 400/Bad Request is okay;
                # TODO: other errors (e.g. 500) should be dealt with
                pass
        canceled.update(pending)

        time.sleep(timeout)

        document = collection.find_one(
            filter={'run_id': run_id},
            projection={
                'api.state': True,
                '_id': False,
            },
        )
        # find_one returns None when no document matches; without this
        # guard the subscript below would raise TypeError.
        if document is None:
            break
        if document['api']['state'] in States.FINISHED:
            break
def __init__(self, url, kwargs):
    """Store the options, create the TES client and set up file access.

    ``kwargs`` is a mapping of options; its ``"basedir"`` entry, when
    present and not None, is used as the base directory, otherwise the
    current working directory is used.
    """
    self.threads = []
    self.kwargs = kwargs
    self.client = tes.HTTPClient(url)

    configured_basedir = kwargs.get("basedir")
    self.basedir = (
        os.getcwd() if configured_basedir is None else configured_basedir
    )
    self.fs_access = StdFsAccess(self.basedir)
def __init__(self, url, kwargs):
    """Initialise the parent, create the TES client and set up file access.

    ``kwargs`` is a mapping of options; its ``"basedir"`` entry, when
    present and not None, is used as the base directory, otherwise the
    current working directory is used.
    """
    super(TESPipeline, self).__init__()
    self.kwargs = kwargs
    self.service = tes.HTTPClient(url)

    configured_basedir = kwargs.get("basedir")
    self.basedir = (
        os.getcwd() if configured_basedir is None else configured_basedir
    )
    self.fs_access = StdFsAccess(self.basedir)
def __init__(
    self,
    workflow,
    dag,
    cores,
    jobname="snakejob.{name}.{jobid}.sh",
    printreason=False,
    quiet=False,
    printshellcmds=False,
    cluster_config=None,
    local_input=None,
    restart_times=None,
    assume_shared_fs=False,
    max_status_checks_per_second=0.5,
    tes_url=None,
    container_image=None,
):
    """Configure the TES client, then initialise the parent executor.

    Raises:
        WorkflowError: If the ``tes`` package cannot be imported.
    """
    # py-tes is imported lazily so that it is only needed for this backend.
    try:
        import tes
    except ImportError:
        raise WorkflowError(
            "Unable to import Python package tes. TES backend requires "
            "py-tes to be installed. Please install py-tes, e.g. via "
            "Conda or Pip."
        )

    if not container_image:
        container_image = get_container_image()
    self.container_image = container_image
    logger.info(f"Using {self.container_image} for TES jobs.")

    self.container_workdir = "/tmp"
    self.max_status_checks_per_second = max_status_checks_per_second
    self.tes_url = tes_url
    self.tes_client = tes.HTTPClient(url=self.tes_url)
    logger.info("[TES] Job execution on TES: {url}".format(url=self.tes_url))

    # Options handed through unchanged to the parent; the parent's third
    # positional argument is passed as None rather than ``cores``.
    parent_options = dict(
        jobname=jobname,
        printreason=printreason,
        quiet=quiet,
        printshellcmds=printshellcmds,
        cluster_config=cluster_config,
        local_input=local_input,
        restart_times=restart_times,
        assume_shared_fs=assume_shared_fs,
        max_status_checks_per_second=max_status_checks_per_second,
    )
    super().__init__(workflow, dag, None, **parent_options)
def __init__(
    self,
    config: Config,
    maxCores: float,
    maxMemory: int,
    maxDisk: int,
) -> None:
    """Connect to the TES server and prepare mounts and ID bookkeeping."""
    super().__init__(config, maxCores, maxMemory, maxDisk)

    # Connect to TES, using Funnel-compatible environment variables to fill
    # in credentials if not specified.
    self.tes = tes.HTTPClient(
        config.tes_endpoint,
        user=config.tes_user,
        password=config.tes_password,
        token=config.tes_bearer_token,
    )

    # Ask the server which storage URL bases it supports, so we can tell
    # whether it is likely to be able to mount any of our local files.
    service_info = self.tes.get_service_info()
    logger.debug("Detected TES server info: %s", service_info)
    self.server_storages = service_info.storage or []

    # Directories to mount for each task, as py-tes Input objects.
    self.mounts: List[tes.Input] = []

    if config.jobStore:
        store_kind, store_path = Toil.parseLocator(config.jobStore)
        if store_kind == 'file':
            # A file job store should be mounted at the same path, if we
            # can.
            self._mount_local_path_if_possible(store_path, store_path)

    # If we have AWS credentials, mount them in the home directory if we
    # can.
    home_directory = os.path.expanduser("~")
    aws_credentials_path = os.path.join(home_directory, '.aws')
    if os.path.isdir(aws_credentials_path):
        self._mount_local_path_if_possible(aws_credentials_path, '/root/.aws')

    # Job names are assigned based on a numerical job ID; that
    # functionality is managed by the BatchSystemLocalSupport.

    # Holds the user script resource object if we get one.
    self.user_script: Optional[Resource] = None

    # Get the image to deploy from Toil's configuration.
    self.docker_image = applianceSelf()

    # Two-way mapping between our batch system ID numbers and the TES task
    # IDs from the server.
    self.bs_id_to_tes_id: Dict[int, str] = {}
    self.tes_id_to_bs_id: Dict[str, int] = {}
def __init__(
    self,
    workflow,
    dag,
    cores,
    jobname="snakejob.{name}.{jobid}.sh",
    printreason=False,
    quiet=False,
    printshellcmds=False,
    latency_wait=3,
    cluster_config=None,
    local_input=None,
    restart_times=None,
    assume_shared_fs=False,
    max_status_checks_per_second=0.5,
    tes_url=None,
    container_image=None,
):
    """Configure the TES client and job command, then initialise the parent."""
    import tes

    if not container_image:
        container_image = get_container_image()
    self.container_image = container_image
    self.container_workdir = "/tmp"
    self.max_status_checks_per_second = max_status_checks_per_second
    self.tes_url = tes_url
    self.tes_client = tes.HTTPClient(url=self.tes_url)

    logger.info("[TES] Job execution on TES: {url}".format(url=self.tes_url))

    # Command template for the job; the fragments are joined with a
    # backslash followed by a newline.
    command_fragments = [
        "{envvars} ",
        "mkdir /tmp/conda && cd /tmp && ",
        "snakemake {target} ",
        "--snakefile {snakefile} ",
        "--verbose ",
        "--force -j{cores} ",
        "--keep-target-files ",
        "--keep-remote ",
        "--latency-wait 10 ",
        "--attempt 1 ",
        "{use_threads}",
        "{overwrite_config} {rules} ",
        "--nocolor ",
        "--notemp ",
        "--no-hooks ",
        "--nolock ",
        "--mode {} ".format(Mode.cluster),
    ]
    exec_job = "\\\n".join(command_fragments)

    # Options handed through to the parent; the parent's third positional
    # argument is passed as None rather than ``cores``.
    parent_options = dict(
        jobname=jobname,
        printreason=printreason,
        quiet=quiet,
        printshellcmds=printshellcmds,
        latency_wait=latency_wait,
        cluster_config=cluster_config,
        local_input=local_input,
        restart_times=restart_times,
        exec_job=exec_job,
        assume_shared_fs=assume_shared_fs,
        max_status_checks_per_second=max_status_checks_per_second,
    )
    super().__init__(workflow, dag, None, **parent_options)
def cancel_task(config: Dict, id: str, *args, **kwargs) -> Dict:
    """Cancel a task locally and, where applicable, on its TES instance.

    Args:
        config: Application configuration.
        id: Identifier of the task to cancel.
        **kwargs: If ``user_id`` is present, it must match the owner stored
            with the task.

    Returns:
        An empty dictionary.

    Raises:
        TaskNotFound: No task with the given identifier exists.
        Forbidden: ``user_id`` was supplied and does not match the task's
            owner.
    """
    collection = get_conf(config, 'database', 'collections', 'tasks')
    wanted_fields = {
        'task_id_tes': True,
        'tes_uri': True,
        'task.state': True,
        'user_id': True,
        'worker_id': True,
        '_id': False,
    }
    document = collection.find_one(
        filter={'task_id': id},
        projection=wanted_fields,
    )

    # Unknown task
    if not document:
        logger.error("Task '{id}' not found.".format(id=id))
        raise TaskNotFound

    # Ownership is only checked when a user ID was passed in
    if 'user_id' in kwargs and document['user_id'] != kwargs['user_id']:
        logger.error(
            ("User '{user_id}' is not allowed to access task '{id}'.").format(
                user_id=kwargs['user_id'],
                id=id,
            )
        )
        raise Forbidden

    # Nothing to do unless the task is in a cancelable or undefined state
    current_state = document['task']['state']
    if (
        current_state not in States.CANCELABLE
        and current_state not in States.UNDEFINED
    ):
        return {}

    timeout = get_conf(
        config,
        'api',
        'endpoint_params',
        'timeout_service_calls',
    )

    # Stop the local worker
    current_app.control.revoke(
        document['worker_id'],
        terminate=True,
        signal='SIGKILL',
    )

    # Ask the remote TES instance to cancel, if the task was sent to one
    if (
        document['tes_uri'] is not None
        and document['task_id_tes'] is not None
    ):
        cli = tes.HTTPClient(document['tes_uri'], timeout=timeout)
        try:
            cli.cancel_task(document['task_id_tes'])
        except HTTPError:
            # TODO: handle more robustly: only 400/Bad Request is okay;
            # TODO: other errors (e.g. 500) should be dealt with
            pass

    logger.info(
        ("Task '{id}' (worker ID '{worker_id}') was canceled.").format(
            id=id,
            worker_id=document['worker_id'],
        )
    )

    set_task_state(
        collection=collection,
        task_id=id,
        worker_id=document['worker_id'],
        state='CANCELED',
    )

    return {}
def _poll_task(
    collection: Collection,
    task_id: str,
    worker_id: str,
    tes_uri: str,
    tes_task_id: str,
    initial_state: str = 'UNKNOWN',
    token: str = None,
    interval: float = 2,
    max_missed_heartbeats: int = 100,
    timeout: float = 1.5,
) -> None:
    """Poll a TES instance for a task's state and record changes.

    Polling continues while the last known state is in
    ``States.UNFINISHED``. It stops early if ``max_missed_heartbeats``
    consecutive requests to the TES instance fail.

    Args:
        collection: Database collection passed to ``set_task_state``.
        task_id: Local task identifier.
        worker_id: Identifier of the worker processing the task.
        tes_uri: Base URL of the TES instance.
        tes_task_id: Identifier of the task at the TES instance.
        initial_state: State assumed before the first successful poll.
        token: Currently unused.
        interval: Seconds to wait between polls.
        max_missed_heartbeats: Number of consecutive failed polls tolerated
            before polling is abandoned.
        timeout: Timeout handed to the TES HTTP client.
    """
    logger.info(
        (
            "Starting polling of TES task '{task_id}' with "
            "worker ID '{worker_id}' at TES '{tes_uri}'..."
        ).format(
            task_id=task_id,
            worker_id=worker_id,
            tes_uri=tes_uri,
        )
    )

    # Initialize states and counters
    state = previous_state = initial_state
    heartbeats_left = max_missed_heartbeats

    while state in States.UNFINISHED:

        # Try to fetch the task from the TES instance. The client is built
        # inside the try block so that a failure there also counts as a
        # missed heartbeat.
        try:
            cli = tes.HTTPClient(tes_uri, timeout=timeout)
            response = cli.get_task(tes_task_id, view='MINIMAL')

        # Issue warning if heartbeat was missed
        except Exception as e:
            heartbeats_left -= 1
            logger.warning(
                (
                    "Missed heartbeat for task '{tes_task_id}' at TES "
                    "'{tes_uri}'. {heartbeats_left} heartbeats left. Original "
                    "error message: {type}: {msg}"
                ).format(
                    tes_task_id=tes_task_id,
                    tes_uri=tes_uri,
                    heartbeats_left=heartbeats_left,
                    type=type(e).__name__,
                    msg=e,
                )
            )
            # Give up once the budget is exhausted instead of retrying
            # forever.
            if heartbeats_left <= 0:
                logger.error(
                    (
                        "Giving up polling of TES task '{tes_task_id}' at "
                        "TES '{tes_uri}': too many missed heartbeats."
                    ).format(
                        tes_task_id=tes_task_id,
                        tes_uri=tes_uri,
                    )
                )
                break
            # Wait before retrying so a failing endpoint is not hammered.
            sleep(interval)
            continue

        # Reset heartbeat counter
        heartbeats_left = max_missed_heartbeats

        # Update state in database only when it actually changed
        state = response.state
        if state != previous_state:
            set_task_state(
                collection=collection,
                task_id=task_id,
                worker_id=worker_id,
                state=state,
            )
            previous_state = state

        # Sleep for specified interval
        sleep(interval)
def _send_task(
    tes_uris: List[str],
    request: Dict,
    token: str,
    timeout: float = 5
) -> Tuple[str, str]:
    """Send task to TES instance.

    The request is converted to py-tes model objects and then offered to
    each TES instance in ``tes_uris`` in order until one accepts it.

    NOTE: ``request`` (including its nested ``logs`` entries) is modified
    in place during sanitization.

    Args:
        tes_uris: Base URLs of the TES instances to try, in order.
        request: Task request as a dictionary.
        token: Currently unused.
        timeout: Timeout handed to the TES HTTP client.

    Returns:
        Tuple of the task ID returned by the TES instance and the URL of
        the instance that accepted the task.

    Raises:
        BadRequest: The request could not be turned into a ``tes.Task``.
        ConnectionError: No TES instance accepted the task.
    """
    # Process/sanitize request for use with py-tes
    time_now = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
    if 'creation_time' not in request:
        request['creation_time'] = parse_time(time_now)
    if 'inputs' in request:
        request['inputs'] = [
            tes.models.Input(**task_input) for task_input in request['inputs']
        ]
    if 'outputs' in request:
        request['outputs'] = [
            tes.models.Output(**task_output)
            for task_output in request['outputs']
        ]
    if 'resources' in request:
        request['resources'] = tes.models.Resources(**request['resources'])
    if 'executors' in request:
        request['executors'] = [
            tes.models.Executor(**executor)
            for executor in request['executors']
        ]
    if 'logs' in request:
        for task_log in request['logs']:
            task_log['start_time'] = time_now
            task_log['end_time'] = time_now
            if 'logs' in task_log:
                for executor_log in task_log['logs']:
                    executor_log['start_time'] = time_now
                    executor_log['end_time'] = time_now
                task_log['logs'] = [
                    tes.models.ExecutorLog(**executor_log)
                    for executor_log in task_log['logs']
                ]
            if 'outputs' in task_log:
                for output_log in task_log['outputs']:
                    output_log['size_bytes'] = 0
                task_log['outputs'] = [
                    tes.models.OutputFileLog(**output_log)
                    for output_log in task_log['outputs']
                ]
            if 'system_logs' in task_log:
                task_log['system_logs'] = [
                    tes.models.SystemLog(**system_log)
                    for system_log in task_log['system_logs']
                ]
        request['logs'] = [
            tes.models.TaskLog(**task_log) for task_log in request['logs']
        ]

    # Create Task object
    try:
        task = tes.Task(**request)
    except Exception as e:
        logger.error(
            (
                "Task object could not be created. Original error message: "
                "{type}: {msg}"
            ).format(
                type=type(e).__name__,
                msg=e,
            )
        )
        # Keep the underlying error attached for debugging.
        raise BadRequest from e

    # Iterate over known TES URIs
    for tes_uri in tes_uris:

        # Try to submit task to TES instance
        try:
            cli = tes.HTTPClient(tes_uri, timeout=timeout)
            task_id = cli.create_task(task)

        # Issue warning and try next TES instance if task submission failed
        except Exception as e:
            logger.warning(
                (
                    "Task could not be submitted to TES instance '{tes_uri}'. "
                    'Trying next TES instance in list. Original error '
                    "message: {type}: {msg}"
                ).format(
                    tes_uri=tes_uri,
                    type=type(e).__name__,
                    msg=e,
                )
            )
            continue

        # Return task ID and URL of TES instance
        return (task_id, tes_uri)

    # Raise error if no suitable TES instance was found
    raise ConnectionError(
        'Task could not be submitted to any known TES instance.'
    )
def __attrs_post_init__(self):
    """Create the TES client from ``tes_url``/``timeout`` and clear the ID."""
    client = tes.HTTPClient(self.tes_url, timeout=self.timeout)
    self.__tes_client = client
    self.__id = None
def run_workflow(
    config: Dict,
    body: Dict,
    sender: str,
    *args,
    **kwargs
) -> Dict:
    """Relays task to best TES instance; returns universally unique task id.

    Args:
        config: Application configuration.
        body: TES task request.
        sender: Currently unused.

    Returns:
        Dictionary with the single key ``id`` holding the amended task ID.

    Raises:
        Forbidden: An access token is required but could not be obtained
            or validated.
        InternalServerError: The task could not be submitted.
    """
    # Get config parameters
    authorization_required = get_conf(
        config,
        'security',
        'authorization_required'
    )
    # NOTE(review): declared as a list here but indexed with string keys
    # below - confirm the intended type against `get_conf_type`.
    endpoint_params = get_conf_type(
        config,
        'tes',
        'endpoint_params',
        types=(list),
    )
    security_params = get_conf_type(
        config,
        'security',
        'jwt',
    )
    remote_urls = get_conf_type(
        config,
        'tes',
        'service-list',
        types=(list),
    )

    # Get associated workflow run
    # TODO: get run_id, task_id and user_id
    # NOTE(review): `document` is used below but is not defined until the
    # TODO above is implemented.

    # Set initial task state
    # TODO:

    # Set access token
    if authorization_required:
        try:
            access_token = request_access_token(
                user_id=document['user_id'],
                token_endpoint=endpoint_params['token_endpoint'],
                timeout=endpoint_params['timeout_token_request'],
            )
            validate_token(
                token=access_token,
                key=security_params['public_key'],
                identity_claim=security_params['identity_claim'],
            )
        except Exception as e:
            logger.exception(
                (
                    'Could not get access token from token endpoint '
                    "'{token_endpoint}'. Original error message {type}: {msg}"
                ).format(
                    token_endpoint=endpoint_params['token_endpoint'],
                    type=type(e).__name__,
                    msg=e,
                )
            )
            raise Forbidden from e
    else:
        access_token = None

    # Order TES instances by priority
    testribute = TEStribute_Interface()
    remote_urls_ordered = testribute.order_endpoint_list(
        tes_json=body,
        endpoints=remote_urls,
        access_token=access_token,
        method=endpoint_params['tes_distribution_method'],
    )

    # Send task to best TES instance
    try:
        remote_id, remote_url = __send_task(
            urls=remote_urls_ordered,
            body=body,
            access_token=access_token,
            timeout=endpoint_params['timeout_tes_submission'],
        )
    except Exception as e:
        logger.exception(
            '{type}: {msg}'.format(
                type=type(e).__name__,
                msg=e,
            )
        )
        raise InternalServerError from e

    # Poll TES instance for state updates
    __initiate_state_polling(
        task_id=remote_id,
        run_id=document['run_id'],
        url=remote_url,
        interval_polling=endpoint_params['interval_polling'],
        timeout_polling=endpoint_params['timeout_polling'],
        max_time_polling=endpoint_params['max_time_polling'],
    )

    # Generate universally unique ID
    local_id = __amend_task_id(
        remote_id=remote_id,
        remote_url=remote_url,
        separator=endpoint_params['id_separator'],
        encoding=endpoint_params['id_encoding'],
    )

    # Format and return response
    response = {'id': local_id}
    return response


def request_access_token(
    user_id: str,
    token_endpoint: str,
    timeout: int = 5
) -> str:
    """Get access token from token endpoint.

    Raises:
        ConnectionError: The endpoint answered with a status code other
            than 200.
    """
    # Errors raised by the request itself propagate to the caller.
    response = post(
        token_endpoint,
        data={'user_id': user_id},
        timeout=timeout
    )
    if response.status_code != 200:
        raise ConnectionError(
            (
                "Could not access token endpoint '{endpoint}'. Received "
                "status code '{code}'."
            ).format(
                endpoint=token_endpoint,
                code=response.status_code
            )
        )
    return response.json()['access_token']


def validate_token(
    token: str,
    key: str,
    identity_claim: str,
) -> None:
    """Decode a token and check that it carries the identity claim.

    Args:
        token: Encoded token.
        key: Key used to verify the token.
        identity_claim: Name of the claim that must be present.

    Raises:
        ValueError: The token could not be decoded.
    """
    # Decode token with the key supplied by the caller; the algorithm is
    # still read from the application configuration.
    try:
        token_data = decode(
            jwt=token,
            key=key,
            algorithms=get_conf(
                current_app.config,
                'security',
                'jwt',
                'algorithm'
            ),
            verify=True,
        )
    except Exception as e:
        raise ValueError(
            (
                'Authentication token could not be decoded. Original '
                'error message: {type}: {msg}'
            ).format(
                type=type(e).__name__,
                msg=e,
            )
        ) from e

    # Validate claims
    validate_claims(
        token_data=token_data,
        required_claims=[identity_claim],
    )


def __send_task(
    urls: List[str],
    body: Dict,
    timeout: int = 5,
    access_token: Optional[str] = None,
) -> Tuple[str, str]:
    """Send task to TES instance.

    Args:
        urls: Base URLs of the TES instances to try, in order.
        body: TES task request.
        timeout: Timeout handed to the TES HTTP client.
        access_token: Optional token handed to the TES HTTP client.

    Returns:
        Tuple of the task ID returned by the TES instance and the URL of
        the instance that accepted the task.

    Raises:
        ConnectionError: No TES instance accepted the task.
    """
    task = tes.Task(body)  # TODO: implement this properly

    for url in urls:

        # Try to submit task to TES instance
        try:
            cli = tes.HTTPClient(url, timeout=timeout, token=access_token)
            task_id = cli.create_task(task)
            # TODO: fix problem with marshaling

        # Issue warning and try next TES instance if task submission failed
        except Exception as e:
            logger.warning(
                (
                    "Task could not be submitted to TES instance '{url}'. "
                    'Trying next TES instance in list. Original error '
                    "message: {type}: {msg}"
                ).format(
                    url=url,
                    type=type(e).__name__,
                    msg=e,
                )
            )
            continue

        # Return task ID and URL of TES instance
        return (task_id, url)

    # Raise error if no suitable TES instance was found
    raise ConnectionError(
        'Task could not be submitted to any known TES instance.'
    )


def __initiate_state_polling(
    task_id: str,
    run_id: str,
    url: str,
    interval_polling: int = 2,
    timeout_polling: int = 1,
    max_time_polling: Optional[int] = None
) -> None:
    """Initiate polling of TES instance for task state."""
    celery_id = uuid()
    logger.debug(
        (
            "Starting polling of TES task '{task_id}' in "
            "background task '{celery_id}'..."
        ).format(
            task_id=task_id,
            celery_id=celery_id,
        )
    )
    task__poll_task_state.apply_async(
        None,
        {
            'task_id': task_id,
            'run_id': run_id,
            'url': url,
            'interval': interval_polling,
            'timeout': timeout_polling,
        },
        task_id=celery_id,
        soft_time_limit=max_time_polling,
    )
    return None


def __amend_task_id(
    remote_id: str,
    remote_url: str,
    separator: str = '@',  # TODO: add to config
    encoding: str = 'utf-8'  # TODO: add to config
) -> str:
    """Appends the base64-encoded remote URL to the remote task ID."""
    # b64encode returns bytes; decode so it can be joined with the str ID.
    append = base64.b64encode(remote_url.encode(encoding)).decode('ascii')
    return separator.join([remote_id, append])