Beispiel #1
0
def _register_worker(worker_tag: str) -> int:
    """ Adds a new worker record to the DB and hands back the id assigned to it

    :param worker_tag: the tag to store on the worker record
    :return: the `id_` of the newly added record, converted to int
    """
    with sql.session_scope() as db:
        new_worker = WorkerRecord(tag=worker_tag, last_heartbeat=datetime.now())
        db.add(new_worker)
        # Flush before reading `id_` (presumably so the DB-generated id is populated)
        db.flush()
        worker_id = int(new_worker.id_)
    return worker_id
Beispiel #2
0
def _set_worker_task_to_none(worker_id: int) -> None:
    """ Setting a worker task to None means the worker is not considered to be
    working on any tasks any more. This has no effect on the worker behaviour, its
    purpose is only to stop cleanups from resetting a task if the worker is considered
    non-responsive (dead)

    If no record exists for `worker_id` there is nothing to clear, and the call is a
    no-op.

    :param worker_id: identifier of the worker whose current task reference is cleared
    """
    with sql.session_scope() as session:
        worker = session.query(WorkerRecord).get(worker_id)
        if worker is None:
            # The lookup found no record for this id: without a record there is no
            # task reference left to clear, so do nothing rather than fail with an
            # AttributeError on None.
            return
        worker.working_on_task_id = None
        session.merge(worker)
Beispiel #3
0
def get_task_result(task_id: int) -> Any:
    """ Returns the result of running the task.

    :param task_id: unique identifier for the task, you get this number when you call
        submit().

    :return: either the return value of the function submitted (if status is DONE),
        or the exception raised (if status is ERROR). None is returned when the task
        has no stored results (empty `results_dill`).
    """
    # Local import so this block does not depend on the file-level import list
    import ast

    with session_scope() as session:
        rec = session.query(TaskRecord).get(task_id)
        results = rec.results_dill

    if not results:
        return None

    # The stored text is expected to be the str() of a bytes object (a "b'...'"
    # literal). literal_eval parses exactly that, without evaluating arbitrary
    # expressions the way eval() would.
    # NOTE(security): dill.loads can still execute arbitrary code while
    # deserialising, so the DB contents must be trusted.
    return dill.loads(ast.literal_eval(results))
Beispiel #4
0
def get_task_status(task_id: int) -> TaskStatus:
    """ Looks up the task record and returns its current status as a TaskStatus.

    The status tells which part of the execution journey the task is in.
    Possible journeys:

    PENDING -> RUNNING -> DONE  (use get_task_result() to get the result)

    PENDING -> RUNNING -> ERROR  (use get_task_result() to get the error)

    PENDING -> RUNNING -> PENDING -> RUNNING -> TIMEOUT  (task takes too long or hangs)

    :param task_id: unique identifier for the task
    :return: the TaskStatus built from the value stored on the task record
    """
    with session_scope() as db:
        task_row = db.query(TaskRecord).get(task_id)
        stored_status = task_row.status
        return TaskStatus(stored_status)
Beispiel #5
0
def _hearbeat(worker_id: int) -> bool:
    """ Refreshes the worker's heartbeat timestamp in the Worker table, to signal it
    is still alive. A missing worker record is the signal that the worker should stop.

    :param worker_id: identifier of the worker sending the heartbeat
    :returns: True if the record was found and updated, False if there is no record
        for this worker (execution should stop)
    """
    with sql.session_scope() as db:
        me = db.query(WorkerRecord).get(worker_id)
        still_registered = me is not None
        if still_registered:
            me.last_heartbeat = datetime.now()  # TODO: only if diff > 3 sec
            db.merge(me)
    return still_registered
Beispiel #6
0
def submit(
    func: Callable[..., Any],
    kwargs: Dict[str, Any],
    creator: int,
    requires_tag: str = "",
    n_retries_if_worker_hangs: int = 1,
) -> int:
    """ Stores a new PENDING task, with no owner, holding the function and the
    keyword arguments it should be executed with.

    :param func: the function to run
    :param kwargs: the keyword arguments to pass to the function
    :param creator: a unique identifier for the creator of this task, so that it can
        later be filtered/retrieved (e.g. 'get tasks for this creator')
    :param requires_tag: only workers started with this tag will pick up this task.
        Defaults to '' (any worker)
    :param n_retries_if_worker_hangs: how many times this task should be retried if it
        makes the workers hang. Tasks of non-responsive workers are reassigned to new
        workers; if a task takes too long, or has resource problems, it may be its
        fault that the worker executing it died. This number limits the chances a
        task has to complete before it is marked as TIMEOUT
    :return: a unique identifier for the task, which can later be used to query its
        status or get the results
    """
    # Both payloads are stored as the str() of their dill-serialised bytes
    serialised = {
        "function_dill": str(dill.dumps(func)),
        "kwargs_dill": str(dill.dumps(kwargs)),
    }

    new_task = TaskRecord(
        creator=creator,
        owner=NO_OWNER,
        status=TaskStatus.PENDING.value,
        results_dill="",
        retries=n_retries_if_worker_hangs,
        last_updated=datetime.now(),
        requires_tag=requires_tag,
        **serialised,
    )

    with session_scope() as db:
        db.add(new_task)
        # Flush before reading `id_` (presumably so the DB-generated id is populated)
        db.flush()
        return new_task.id_
Beispiel #7
0
def _claim_task(worker_id: int, worker_tag: str = "") -> Optional[Task]:
    """ Picks one PENDING, unowned task from the DB, marks it as owned by this worker
    and sets it to RUNNING. The worker record is updated to point at the task.

    :param worker_id: identity of the worker that is claiming the task. Both the worker
        record and the task record will be modified, to have references to one another
    :param worker_tag: an arbitrary string. Only tasks that require this tag, or that
        require no tag (''), are candidates to be claimed.

    :return: either the task claimed or None if no suitable (PENDING and correct tag)
        tasks were found
    """
    running = TaskStatus.RUNNING.value
    with sql.session_scope() as db:
        acceptable_tags = [worker_tag, ""]
        candidates = (
            db.query(TaskRecord)
            .with_for_update()
            .filter_by(status=TaskStatus.PENDING.value, owner=NO_OWNER)
            .filter(TaskRecord.requires_tag.in_(acceptable_tags))
        )
        task_row = candidates.first()
        if task_row is None:
            return None

        # Take ownership of the task
        task_row.owner = worker_id
        task_row.status = running
        db.merge(task_row)

        # Copy out the plain values needed to build the Task once the session is done
        task_id, task_creator, task_func, task_kwargs = (
            task_row.id_,
            task_row.creator,
            task_row.function_dill,
            task_row.kwargs_dill,
        )

        # Point the worker record at the task it is now working on
        me = db.query(WorkerRecord).get(worker_id)
        me.working_on_task_id = task_id
        db.merge(me)

    return Task(
        id_=task_id,
        owner=worker_id,
        creator=task_creator,
        status=running,
        func=task_func,
        kwargs=task_kwargs,
        results="",
    )
Beispiel #8
0
def _search_for_dead_workers_and_disown_their_tasks(cleanup_timeout: float) -> None:
    """ Resets the tasks of dead workers, so that another worker can pick them up.

    A disowned task goes back to PENDING with one retry less, or is marked as TIMEOUT
    when it has no retries left.

    :param cleanup_timeout: a worker will be considered dead if it hasnt updated its
        heartbeat on the DB in the last `cleanup_timeout` seconds
    """
    cutoff = datetime.now() - timedelta(seconds=cleanup_timeout)
    with sql.session_scope() as db:
        # Workers whose last heartbeat is older than the cutoff
        stale_workers_query = (
            db.query(WorkerRecord)
            .with_for_update()
            .filter(WorkerRecord.last_heartbeat < cutoff)
        )
        for dead_worker in stale_workers_query.all():
            task_id = dead_worker.working_on_task_id
            if task_id is None:
                # This worker was not working on anything: nothing to reset
                continue

            orphan = db.query(TaskRecord).with_for_update().get(task_id)
            log(
                f"Worker {dead_worker.id_} has not responded in {cleanup_timeout} "
                f"seconds. Its task {task_id} will be disowned, and..."
            )
            orphan.owner = NO_OWNER

            retries = orphan.retries
            if retries != 0:
                # Give the task another chance, consuming one retry
                log(f"...Task status set to PENDING. {retries} retries left")
                orphan.status = TaskStatus.PENDING.value
                orphan.retries = retries - 1
            else:
                log(f"...Task Status set to TIMEOUT. No more retries left")
                orphan.status = TaskStatus.TIMEOUT.value
            db.merge(orphan)

            dead_worker.working_on_task_id = None
Beispiel #9
0
def _save_results(
    task_id: int, results: Any, worker_id: int, status: Optional[TaskStatus] = None
) -> None:
    """ Stores the outcome of a task (either the return value of the function
    executed or the exception raised), then clears the worker's current task.

    Results are only stored when the task is RUNNING and owned by `worker_id`;
    otherwise they are discarded and a message is logged. The worker's task
    reference is cleared in both cases.

    :param task_id: unique identifier for the task. This is created when the task is
        submitted.
    :param results: any object that can be serialised
    :param worker_id: the identifier for the worker attempting to save the results.
        Only the worker that owns a task is allowed to save results for it, to prevent
        multiple workers working on the same task
    :param status: if set, the task status will also be updated to this
    """
    serialised_results = _serialise(results)
    with sql.session_scope() as db:
        task = db.query(TaskRecord).get(task_id)

        is_running = task.status == TaskStatus.RUNNING.value
        is_owner = task.owner == worker_id

        if not is_running:
            log(f"Task {task_id} not RUNNING. Status={task.status}. Results discarded.")
        elif not is_owner:
            log(
                f"Worker {worker_id} running task {task_id}, but task owner is: {task.owner}. Results discarded."
            )
        else:
            # Task is RUNNING and this worker still owns it: store the results
            log(
                f"Saving result for task {task_id}, with size = {len(serialised_results)}"
            )
            task.results_dill = serialised_results
            if status is not None:
                task.status = status.value
            db.merge(task)

        # Whatever happened above, the worker is no longer working on this task
        me = db.query(WorkerRecord).get(worker_id)
        me.working_on_task_id = None
        db.merge(me)