Esempio n. 1
0
    def __init__(
            self,
            builder,  # type: Builder
            joborder,  # type: JSON
            make_path_mapper,  # type: Callable[..., PathMapper]
            requirements,  # type: List[Dict[Text, Text]]
            hints,  # type: List[Dict[Text, Text]]
            name,  # type: Text
            runtime_context,
            url,
            spec,
            remote_storage_url=None,
            token=None):
        """Initialise the task and create an HTTP client for the TES endpoint.

        ``builder`` through ``name`` are forwarded unchanged to the parent
        constructor. ``url`` and ``token`` are handed to ``tes.HTTPClient``;
        ``spec``, ``remote_storage_url`` and ``token`` are stored on the
        instance as given.
        """
        super(TESTask, self).__init__(
            builder, joborder, make_path_mapper, requirements, hints, name)

        # Caller-supplied values, stored as given.
        self.runtime_context = runtime_context
        self.spec = spec

        # Output bookkeeping; nothing has been produced yet.
        self.outputs = None
        self.inplace_update = False

        # Use the current working directory when the runtime context
        # carries no (or an empty) base directory.
        base_dir = runtime_context.basedir
        if not base_dir:
            base_dir = os.getcwd()
        self.basedir = base_dir
        self.fs_access = StdFsAccess(self.basedir)

        # Remote task bookkeeping; filled in later.
        self.id = None
        self.state = "UNKNOWN"
        self.exit_code = None

        # Polling settings.
        self.poll_interval = 1
        self.poll_retries = 10

        self.client = tes.HTTPClient(url, token=token)
        self.remote_storage_url = remote_storage_url
        self.token = token
Esempio n. 2
0
    def __init__(
            self,
            builder,  # type: Builder
            joborder,  # type: JobOrderType
            make_path_mapper,  # type: Callable[..., PathMapper]
            requirements,  # type: List[Dict[Text, Text]]
            hints,  # type: List[Dict[Text, Text]]
            name,  # type: Text
            runtime_context,
            url,
            spec):
        """Initialise the task and create an HTTP client for the TES endpoint.

        ``builder`` through ``name`` are forwarded unchanged to the parent
        constructor; ``url`` is handed to ``tes.HTTPClient`` and ``spec`` is
        stored on the instance.
        """
        super(TESTask, self).__init__(
            builder, joborder, make_path_mapper, requirements, hints, name)

        # Caller-supplied values, stored as given.
        self.runtime_context = runtime_context
        self.spec = spec

        # Output bookkeeping; nothing has been produced yet.
        self.outputs = None
        self.inplace_update = False

        # Only a missing (None) base directory triggers the fallback to the
        # current working directory; an empty string is kept as is.
        base_dir = runtime_context.basedir
        self.basedir = base_dir if base_dir is not None else os.getcwd()
        self.fs_access = StdFsAccess(self.basedir)

        # Remote task bookkeeping and polling settings.
        self.id = None
        self.docker_workdir = '/var/spool/cwl'
        self.state = "UNKNOWN"
        self.poll_interval = 1
        self.poll_retries = 10

        self.client = tes.HTTPClient(url)
Esempio n. 3
0
def __cancel_tes_tasks(collection: Collection,
                       run_id: str,
                       url: str,
                       timeout: int = 5):
    """Cancel the TES tasks of a run until the run reaches a finished state.

    Repeatedly looks up the TES task IDs recorded for ``run_id``, sends a
    cancel request for every ID not yet handled, waits ``timeout`` seconds
    and then re-reads the run's API state. ``timeout`` is used both as the
    HTTP client timeout and as the pause between rounds.
    """
    client = tes.HTTPClient(url, timeout=timeout)
    handled: List = []
    run_finished = False
    while not run_finished:
        known_ids = db_utils.find_tes_task_ids(
            collection=collection,
            run_id=run_id,
        )
        pending = [tid for tid in known_ids if tid not in handled]
        for tid in pending:
            try:
                client.cancel_task(tid)
            except HTTPError:
                # TODO: handle more robustly: only 400/Bad Request is okay;
                # TODO: other errors (e.g. 500) should be dealt with
                pass
        # IDs are marked as handled even when the cancel request failed.
        handled.extend(pending)
        time.sleep(timeout)
        run_doc = collection.find_one(
            filter={'run_id': run_id},
            projection={'api.state': True, '_id': False},
        )
        run_finished = run_doc['api']['state'] in States.FINISHED
Esempio n. 4
0
 def __init__(self, url, kwargs):
     """Store the options, create the TES client and set up file access.

     ``kwargs`` is a plain mapping of options; its ``"basedir"`` entry, when
     present and not None, becomes the base directory, otherwise the current
     working directory is used.
     """
     self.threads = []
     self.kwargs = kwargs
     self.client = tes.HTTPClient(url)

     base_dir = kwargs.get("basedir")
     self.basedir = os.getcwd() if base_dir is None else base_dir
     self.fs_access = StdFsAccess(self.basedir)
Esempio n. 5
0
 def __init__(self, url, kwargs):
     """Store the options, create the TES client and set up file access.

     ``kwargs`` is a plain mapping of options; its ``"basedir"`` entry, when
     present and not None, becomes the base directory, otherwise the current
     working directory is used.
     """
     super(TESPipeline, self).__init__()
     self.kwargs = kwargs
     self.service = tes.HTTPClient(url)

     base_dir = kwargs.get("basedir")
     self.basedir = os.getcwd() if base_dir is None else base_dir
     self.fs_access = StdFsAccess(self.basedir)
Esempio n. 6
0
    def __init__(
        self,
        workflow,
        dag,
        cores,
        jobname="snakejob.{name}.{jobid}.sh",
        printreason=False,
        quiet=False,
        printshellcmds=False,
        cluster_config=None,
        local_input=None,
        restart_times=None,
        assume_shared_fs=False,
        max_status_checks_per_second=0.5,
        tes_url=None,
        container_image=None,
    ):
        """Set up the TES client and then initialise the base executor.

        Raises ``WorkflowError`` when the ``tes`` package cannot be imported.
        ``cores`` is accepted but not forwarded: ``None`` is passed to the
        base constructor as the third positional argument instead.
        """
        # Imported lazily so that py-tes is only required when this executor
        # is actually instantiated.
        try:
            import tes
        except ImportError:
            raise WorkflowError(
                "Unable to import Python package tes. TES backend requires py-tes to be installed. Please install py-tes, e.g. via Conda or Pip."
            )

        # An explicit image wins; otherwise fall back to get_container_image().
        self.container_image = container_image or get_container_image()
        logger.info(f"Using {self.container_image} for TES jobs.")
        self.container_workdir = "/tmp"
        self.max_status_checks_per_second = max_status_checks_per_second
        self.tes_url = tes_url
        self.tes_client = tes.HTTPClient(url=self.tes_url)

        logger.info(
            "[TES] Job execution on TES: {url}".format(url=self.tes_url))

        # The TES-specific attributes above are set before the base
        # constructor runs.
        super().__init__(
            workflow,
            dag,
            None,
            jobname=jobname,
            printreason=printreason,
            quiet=quiet,
            printshellcmds=printshellcmds,
            cluster_config=cluster_config,
            local_input=local_input,
            restart_times=restart_times,
            assume_shared_fs=assume_shared_fs,
            max_status_checks_per_second=max_status_checks_per_second,
        )
Esempio n. 7
0
File: tes.py Progetto: tmooney/toil
    def __init__(self, config: Config, maxCores: float, maxMemory: int,
                 maxDisk: int) -> None:
        """Connect to the TES server and prepare per-task mounts and ID maps.

        Queries the server's service info during construction, so creating
        an instance performs a request against ``config.tes_endpoint``.
        """
        super().__init__(config, maxCores, maxMemory, maxDisk)
        # Connect to TES, using Funnel-compatible environment variables to fill in credentials if not specified.
        self.tes = tes.HTTPClient(config.tes_endpoint,
                                  user=config.tes_user,
                                  password=config.tes_password,
                                  token=config.tes_bearer_token)

        # Get service info from the TES server and pull out supported storages.
        # We need this so we can tell if the server is likely to be able to
        # mount any of our local files. These are URL bases that the server
        # supports.
        server_info = self.tes.get_service_info()
        logger.debug("Detected TES server info: %s", server_info)
        # Treat a missing/empty storage list as "no storages".
        self.server_storages = server_info.storage or []

        # Define directories to mount for each task, as py-tes Input objects
        self.mounts: List[tes.Input] = []

        if config.jobStore:
            job_store_type, job_store_path = Toil.parseLocator(config.jobStore)
            if job_store_type == 'file':
                # If we have a file job store, we want to mount it at the same path, if we can
                self._mount_local_path_if_possible(job_store_path,
                                                   job_store_path)

        # If we have AWS credentials, we want to mount them in our home directory if we can.
        aws_credentials_path = os.path.join(os.path.expanduser("~"), '.aws')
        if os.path.isdir(aws_credentials_path):
            self._mount_local_path_if_possible(aws_credentials_path,
                                               '/root/.aws')

        # We assign job names based on a numerical job ID. This functionality
        # is managed by the BatchSystemLocalSupport.

        # Here is where we will store the user script resource object if we get one.
        self.user_script: Optional[Resource] = None

        # Get the image to deploy from Toil's configuration
        self.docker_image = applianceSelf()

        # We need a way to map between our batch system ID numbers, and TES task IDs from the server.
        self.bs_id_to_tes_id: Dict[int, str] = {}
        self.tes_id_to_bs_id: Dict[str, int] = {}
Esempio n. 8
0
    def __init__(
        self,
        workflow,
        dag,
        cores,
        jobname="snakejob.{name}.{jobid}.sh",
        printreason=False,
        quiet=False,
        printshellcmds=False,
        latency_wait=3,
        cluster_config=None,
        local_input=None,
        restart_times=None,
        assume_shared_fs=False,
        max_status_checks_per_second=0.5,
        tes_url=None,
        container_image=None,
    ):
        """Set up the TES client, build the job command template and
        initialise the base executor.

        ``cores`` is accepted but not forwarded: ``None`` is passed to the
        base constructor as the third positional argument instead.
        """
        # Imported lazily so that py-tes is only required when this executor
        # is actually instantiated.
        import tes

        # An explicit image wins; otherwise fall back to get_container_image().
        self.container_image = container_image or get_container_image()
        self.container_workdir = "/tmp"
        self.max_status_checks_per_second = max_status_checks_per_second
        self.tes_url = tes_url
        self.tes_client = tes.HTTPClient(url=self.tes_url)

        logger.info(
            "[TES] Job execution on TES: {url}".format(url=self.tes_url))

        # Command template: the parts are joined with a backslash-newline.
        # Only the ``--mode`` value is filled in here; the named placeholders
        # ({envvars}, {target}, {snakefile}, ...) are left in the string.
        # NOTE(review): the template hard-codes ``--latency-wait 10`` while
        # the ``latency_wait`` argument is forwarded to the base class
        # separately -- confirm this is intended.
        exec_job = "\\\n".join((
            "{envvars} ",
            "mkdir /tmp/conda && cd /tmp && ",
            "snakemake {target} ",
            "--snakefile {snakefile} ",
            "--verbose ",
            "--force -j{cores} ",
            "--keep-target-files ",
            "--keep-remote ",
            "--latency-wait 10 ",
            "--attempt 1 ",
            "{use_threads}",
            "{overwrite_config} {rules} ",
            "--nocolor ",
            "--notemp ",
            "--no-hooks ",
            "--nolock ",
            "--mode {} ".format(Mode.cluster),
        ))

        # The TES-specific attributes above are set before the base
        # constructor runs.
        super().__init__(
            workflow,
            dag,
            None,
            jobname=jobname,
            printreason=printreason,
            quiet=quiet,
            printshellcmds=printshellcmds,
            latency_wait=latency_wait,
            cluster_config=cluster_config,
            local_input=local_input,
            restart_times=restart_times,
            exec_job=exec_job,
            assume_shared_fs=assume_shared_fs,
            max_status_checks_per_second=max_status_checks_per_second,
        )
Esempio n. 9
0
def cancel_task(config: Dict, id: str, *args, **kwargs) -> Dict:
    """Cancel a task locally and, if it was forwarded, on its TES instance.

    Looks up the task document by ``id``. Raises ``TaskNotFound`` when no
    document exists and ``Forbidden`` when a ``user_id`` keyword argument is
    given that differs from the document's owner. If the task is in a
    cancelable or undefined state, the local worker is revoked, the remote
    TES task is canceled (HTTP errors are ignored) and the stored state is
    set to ``CANCELED``. Always returns an empty dict.
    """
    collection = get_conf(config, 'database', 'collections', 'tasks')
    wanted_fields = {
        'task_id_tes': True,
        'tes_uri': True,
        'task.state': True,
        'user_id': True,
        'worker_id': True,
        '_id': False,
    }
    document = collection.find_one(
        filter={'task_id': id},
        projection=wanted_fields,
    )

    # Unknown task
    if not document:
        logger.error("Task '{id}' not found.".format(id=id))
        raise TaskNotFound

    # Ownership is only checked when the caller supplied a user ID,
    # i.e. when authorization is enabled
    if 'user_id' in kwargs and document['user_id'] != kwargs['user_id']:
        message = "User '{user_id}' is not allowed to access task '{id}'."
        logger.error(message.format(user_id=kwargs['user_id'], id=id))
        raise Forbidden

    # Nothing to do unless the task can still be canceled
    current_state = document['task']['state']
    if (current_state not in States.CANCELABLE
            and current_state not in States.UNDEFINED):
        return {}

    # Timeout for calls to the remote service
    timeout = get_conf(
        config,
        'api',
        'endpoint_params',
        'timeout_service_calls',
    )

    # Stop the local worker
    worker_id = document['worker_id']
    current_app.control.revoke(worker_id, terminate=True, signal='SIGKILL')

    # Stop the remote task, if one was registered
    tes_uri = document['tes_uri']
    if tes_uri is not None and document['task_id_tes'] is not None:
        cli = tes.HTTPClient(tes_uri, timeout=timeout)
        try:
            cli.cancel_task(document['task_id_tes'])
        except HTTPError:
            # TODO: handle more robustly: only 400/Bad Request is okay;
            # TODO: other errors (e.g. 500) should be dealt with
            pass

    logger.info(
        "Task '{id}' (worker ID '{worker_id}') was canceled.".format(
            id=id,
            worker_id=worker_id,
        )
    )

    # Persist the new state
    set_task_state(
        collection=collection,
        task_id=id,
        worker_id=worker_id,
        state='CANCELED',
    )

    return {}
def _poll_task(
    collection: Collection,
    task_id: str,
    worker_id: str,
    tes_uri: str,
    tes_task_id: str,
    initial_state: str = 'UNKNOWN',
    token: str = None,
    interval: float = 2,
    max_missed_heartbeats: int = 100,
    timeout: float = 1.5,
) -> None:
    """Poll a TES instance for a task's state and mirror changes to the DB.

    Args:
        collection: Database collection passed through to `set_task_state`.
        task_id: Local task identifier.
        worker_id: Identifier of the worker handling the task.
        tes_uri: URL of the TES instance to poll.
        tes_task_id: Task identifier on the TES instance.
        initial_state: State assumed before the first successful poll.
        token: Currently unused by this function.
        interval: Seconds to wait between polls.
        max_missed_heartbeats: Number of consecutive failed polls tolerated
            before polling is abandoned.
        timeout: Timeout in seconds for each request to the TES instance.

    Polling ends when the state is no longer in `States.UNFINISHED`, or
    after `max_missed_heartbeats` consecutive failed polls (logged as an
    error).
    """
    # Log message
    logger.info(
        (
            "Starting polling of TES task '{task_id}' with "
            "worker ID '{worker_id}' at TES '{tes_uri}'..."
        ).format(
            task_id=task_id,
            worker_id=worker_id,
            tes_uri=tes_uri,
        )
    )

    # Initialize states and counters
    state = previous_state = initial_state
    heartbeats_left = max_missed_heartbeats

    # Start polling
    while state in States.UNFINISHED:

        # Try to fetch the task state from the TES instance
        try:
            cli = tes.HTTPClient(tes_uri, timeout=timeout)
            response = cli.get_task(tes_task_id, view='MINIMAL')

        # Issue warning if heartbeat was missed
        except Exception as e:
            heartbeats_left -= 1
            logger.warning(
                (
                    "Missed heartbeat for task '{tes_task_id}' at TES "
                    "'{tes_uri}'. {heartbeats_left} heartbeats left. Original "
                    "error message: {type}: {msg}"
                ).format(
                    tes_task_id=tes_task_id,
                    tes_uri=tes_uri,
                    heartbeats_left=heartbeats_left,
                    type=type(e).__name__,
                    msg=e,
                )
            )

            # Give up once the allowance of consecutive misses is used up
            if heartbeats_left <= 0:
                logger.error(
                    (
                        "Giving up polling of task '{tes_task_id}' at TES "
                        "'{tes_uri}': too many missed heartbeats."
                    ).format(
                        tes_task_id=tes_task_id,
                        tes_uri=tes_uri,
                    )
                )
                return

            # Wait before retrying so a dead endpoint is not hammered
            sleep(interval)
            continue

        # Reset heartbeat counter
        heartbeats_left = max_missed_heartbeats

        # Update state in database if changed
        state = response.state
        if state != previous_state:
            set_task_state(
                collection=collection,
                task_id=task_id,
                worker_id=worker_id,
                state=state,
            )
            # Remember what was written so unchanged states are not rewritten
            previous_state = state

        # Sleep for specified interval
        sleep(interval)
def _send_task(
    tes_uris: List[str],
    request: Dict,
    token: str,
    timeout: float = 5
) -> Tuple[str, str]:
    """Convert a request into a py-tes task and submit it to a TES instance.

    The nested parts of ``request`` are replaced in place by the matching
    ``tes.models`` objects. The URIs in ``tes_uris`` are then tried in order
    and the first successful submission wins. ``token`` is not used here.

    Returns a ``(task_id, tes_uri)`` tuple. Raises ``BadRequest`` when the
    task object cannot be built and ``ConnectionError`` when no instance
    accepts the task.
    """
    # Process/sanitize request for use with py-tes
    timestamp = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
    if 'creation_time' not in request:
        request['creation_time'] = parse_time(timestamp)
    if 'inputs' in request:
        request['inputs'] = [
            tes.models.Input(**item) for item in request['inputs']
        ]
    if 'outputs' in request:
        request['outputs'] = [
            tes.models.Output(**item) for item in request['outputs']
        ]
    if 'resources' in request:
        request['resources'] = tes.models.Resources(**request['resources'])
    if 'executors' in request:
        request['executors'] = [
            tes.models.Executor(**item) for item in request['executors']
        ]
    if 'logs' in request:
        for task_log in request['logs']:
            # All start/end times are stamped with the current time
            task_log['start_time'] = timestamp
            task_log['end_time'] = timestamp
            if 'logs' in task_log:
                for executor_log in task_log['logs']:
                    executor_log['start_time'] = timestamp
                    executor_log['end_time'] = timestamp
                task_log['logs'] = [
                    tes.models.ExecutorLog(**entry)
                    for entry in task_log['logs']
                ]
            if 'outputs' in task_log:
                for file_log in task_log['outputs']:
                    file_log['size_bytes'] = 0
                task_log['outputs'] = [
                    tes.models.OutputFileLog(**entry)
                    for entry in task_log['outputs']
                ]
            if 'system_logs' in task_log:
                task_log['system_logs'] = [
                    tes.models.SystemLog(**entry)
                    for entry in task_log['system_logs']
                ]
        request['logs'] = [
            tes.models.TaskLog(**entry) for entry in request['logs']
        ]

    # Create Task object
    try:
        task = tes.Task(**request)
    except Exception as e:
        error_template = (
            "Task object could not be created. Original error message: "
            "{type}: {msg}"
        )
        logger.error(error_template.format(type=type(e).__name__, msg=e))
        raise BadRequest

    # Try the known TES instances one after the other
    for tes_uri in tes_uris:
        try:
            cli = tes.HTTPClient(tes_uri, timeout=timeout)
            task_id = cli.create_task(task)
        except Exception as e:
            # Submission failed: warn and move on to the next instance
            warning_template = (
                "Task could not be submitted to TES instance '{tes_uri}'. "
                'Trying next TES instance in list. Original error '
                "message: {type}: {msg}"
            )
            logger.warning(
                warning_template.format(
                    tes_uri=tes_uri,
                    type=type(e).__name__,
                    msg=e,
                )
            )
        else:
            # Task ID and URL of the TES instance that accepted the task
            return (task_id, tes_uri)

    # No suitable TES instance was found
    raise ConnectionError(
        'Task could not be submitted to any known TES instance.'
    )
Esempio n. 12
0
 def __attrs_post_init__(self):
     """Create the TES client from ``tes_url``/``timeout`` and clear the ID.

     Presumably invoked by attrs after the generated ``__init__`` (the class
     decorator is not visible here -- confirm).
     """
     # Double-underscore names are name-mangled, i.e. private to this class.
     self.__tes_client = tes.HTTPClient(self.tes_url, timeout=self.timeout)
     self.__id = None
Esempio n. 13
0
def run_workflow(
    config: Dict,
    body: Dict,
    sender: str,
    *args,
    **kwargs
) -> Dict:
    """Relays task to best TES instance; returns universally unique task id.

    Args:
        config: Application configuration, read via `get_conf` and
            `get_conf_type`.
        body: TES task request to be relayed.
        sender: Not used by this function.

    Returns:
        A dict with a single key ``'id'`` holding the amended task ID.

    Raises:
        Forbidden: If authorization is required and no valid access token
            could be obtained.
        InternalServerError: If the task could not be submitted.
    """
    # Get config parameters
    authorization_required = get_conf(
        config,
        'security',
        'authorization_required'
    )
    endpoint_params = get_conf_type(
        config,
        'tes',
        'endpoint_params',
        types=(list),
    )
    security_params = get_conf_type(
        config,
        'security',
        'jwt',
    )
    remote_urls = get_conf_type(
        config,
        'tes',
        'service-list',
        types=(list),
    )

    # Get associated workflow run
    # TODO: get run_id, task_id and user_id
    # NOTE(review): `document` is referenced below but is not defined yet;
    # it has to be provided by the TODO above before this function can run.

    # Set initial task state
    # TODO:

    # Set access token
    if authorization_required:
        try:
            access_token = request_access_token(
                user_id=document['user_id'],
                token_endpoint=endpoint_params['token_endpoint'],
                timeout=endpoint_params['timeout_token_request'],
            )
            validate_token(
                token=access_token,
                key=security_params['public_key'],
                identity_claim=security_params['identity_claim'],
            )
        except Exception as e:
            logger.exception(
                (
                    'Could not get access token from token endpoint '
                    "'{token_endpoint}'. Original error message {type}: {msg}"
                ).format(
                    token_endpoint=endpoint_params['token_endpoint'],
                    type=type(e).__name__,
                    msg=e,
                )
            )
            raise Forbidden
    else:
        access_token = None

    # Order TES instances by priority
    testribute = TEStribute_Interface()
    remote_urls_ordered = testribute.order_endpoint_list(
        tes_json=body,
        endpoints=remote_urls,
        access_token=access_token,
        method=endpoint_params['tes_distribution_method'],
    )

    # Send task to best TES instance
    try:
        remote_id, remote_url = __send_task(
            urls=remote_urls_ordered,
            body=body,
            access_token=access_token,
            timeout=endpoint_params['timeout_tes_submission'],
        )
    except Exception as e:
        # Only the fields used by the template are passed to format()
        logger.exception(
            '{type}: {msg}'.format(
                type=type(e).__name__,
                msg=e,
            )
        )
        raise InternalServerError

    # Poll TES instance for state updates
    __initiate_state_polling(
        task_id=remote_id,
        run_id=document['run_id'],
        url=remote_url,
        interval_polling=endpoint_params['interval_polling'],
        timeout_polling=endpoint_params['timeout_polling'],
        max_time_polling=endpoint_params['max_time_polling'],
    )

    # Generate universally unique ID
    local_id = __amend_task_id(
        remote_id=remote_id,
        remote_url=remote_url,
        separator=endpoint_params['id_separator'],
        encoding=endpoint_params['id_encoding'],
    )

    # Format and return response
    response = {'id': local_id}
    return response


def request_access_token(
    user_id: str,
    token_endpoint: str,
    timeout: int = 5
) -> str:
    """Request an access token for a user from the token endpoint.

    Sends ``user_id`` as form data to ``token_endpoint`` and returns the
    ``access_token`` field of the JSON response. Errors raised by the request
    itself propagate unchanged; any status code other than 200 raises
    ``ConnectionError``.
    """
    response = post(
        token_endpoint,
        data={'user_id': user_id},
        timeout=timeout
    )
    if response.status_code == 200:
        return response.json()['access_token']

    failure_template = (
        "Could not access token endpoint '{endpoint}'. Received "
        "status code '{code}'."
    )
    raise ConnectionError(
        failure_template.format(
            endpoint=token_endpoint,
            code=response.status_code
        )
    )


def validate_token(
    token: str,
    key: str,
    identity_claim: str,
) -> None:
    """Decode a JWT and check that it carries the identity claim.

    Args:
        token: Encoded token to validate.
        key: Key used to verify the token's signature.
        identity_claim: Name of the claim that must be present in the
            decoded token.

    Raises:
        ValueError: If the token cannot be decoded.

    The accepted algorithms are read from the application config
    (``security.jwt.algorithm``). Errors raised by `validate_claims`
    propagate unchanged.
    """
    # Decode token with the key supplied by the caller
    try:
        token_data = decode(
            jwt=token,
            key=key,
            algorithms=get_conf(
                current_app.config,
                'security',
                'jwt',
                'algorithm'
            ),
            verify=True,
        )
    except Exception as e:
        raise ValueError(
            (
                'Authentication token could not be decoded. Original '
                'error message: {type}: {msg}'
            ).format(
                type=type(e).__name__,
                msg=e,
            )
        ) from e

    # Validate claims using the claim name supplied by the caller
    validate_claims(
        token_data=token_data,
        required_claims=[identity_claim],
    )


def __send_task(
    urls: List[str],
    body: Dict,
    timeout: int = 5,
    access_token: Optional[str] = None,
) -> Tuple[str, str]:
    """Send task to TES instance.

    Args:
        urls: TES instance URLs, tried in the given order.
        body: Task request.
        timeout: Timeout in seconds for each submission attempt.
        access_token: Optional bearer token; when given it is passed to the
            TES client as ``token``.

    Returns:
        Tuple of the task ID and the URL of the instance that accepted it.

    Raises:
        ConnectionError: If no instance accepted the task.
    """
    # NOTE(review): `body` is passed positionally; presumably it still needs
    # to be converted into py-tes models -- see the TODO below.
    task = tes.Task(body)       # TODO: implement this properly

    # Only pass a token when one was supplied, so the call is unchanged for
    # callers that do not use authorization
    client_kwargs = {'timeout': timeout}
    if access_token is not None:
        client_kwargs['token'] = access_token

    for url in urls:
        # Try to submit task to TES instance
        try:
            cli = tes.HTTPClient(url, **client_kwargs)
            task_id = cli.create_task(task)
            # TODO: fix problem with marshaling
        # Issue warning and try next TES instance if task submission failed
        except Exception as e:
            logger.warning(
                (
                    "Task could not be submitted to TES instance '{url}'. "
                    'Trying next TES instance in list. Original error '
                    "message: {type}: {msg}"
                ).format(
                    url=url,
                    type=type(e).__name__,
                    msg=e,
                )
            )
            continue
        # Return task ID and URL of TES instance
        return (task_id, url)
    # Log error if no suitable TES instance was found
    raise ConnectionError(
        'Task could not be submitted to any known TES instance.'
    )


def __initiate_state_polling(
    task_id: str,
    run_id: str,
    url: str,
    interval_polling: int = 2,
    timeout_polling: int = 1,
    max_time_polling: Optional[int] = None
) -> None:
    """Queue a background task that polls a TES instance for task state.

    A fresh ID is generated for the background task. ``max_time_polling`` is
    passed on as the task's soft time limit; the remaining arguments become
    the keyword arguments of the polling task.
    """
    celery_id = uuid()
    start_message = (
        "Starting polling of TES task '{task_id}' in "
        "background task '{celery_id}'..."
    )
    logger.debug(start_message.format(task_id=task_id, celery_id=celery_id))

    polling_kwargs = {
        'task_id': task_id,
        'run_id': run_id,
        'url': url,
        'interval': interval_polling,
        'timeout': timeout_polling,
    }
    # No positional arguments; everything is passed by keyword
    task__poll_task_state.apply_async(
        None,
        polling_kwargs,
        task_id=celery_id,
        soft_time_limit=max_time_polling,
    )


def __amend_task_id(
    remote_id: str,
    remote_url: str,
    separator: str = '@',   # TODO: add to config
    encoding: str = 'utf-8'  # TODO: add to config
) -> str:
    """Append the base64-encoded TES URL to the remote task ID.

    Args:
        remote_id: Task ID issued by the TES instance.
        remote_url: URL of the TES instance.
        separator: String placed between the ID and the encoded URL.
        encoding: Text encoding applied to the URL before base64-encoding.

    Returns:
        ``remote_id``, ``separator`` and the base64 text of ``remote_url``
        joined into one string.
    """
    # b64encode returns bytes; the base64 alphabet is pure ASCII, so decode
    # it to text before joining with the (str) task ID.
    encoded_url = base64.b64encode(remote_url.encode(encoding)).decode('ascii')
    return separator.join([remote_id, encoded_url])