def process_worker_ch(tenant, worker_ch, actor_id, worker_id, actor_ch, ag_client):
    """
    Target for a thread to listen on the worker channel for a message to stop processing.
    :param worker_ch:
    :return:
    """
    global keep_running
    logger.info("Worker subscribing to worker channel...")
    while True:
        try:
            msg = worker_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        logger.debug("Received message in worker channel: {}".format(msg))
        logger.debug("Type(msg)={}".format(type(msg)))
        if type(msg) == dict:
            value = msg.get('value', '')
            if value == 'status':
                # this is a health check; return 'ok' to the reply_to channel.
                logger.debug("received health check. returning 'ok'.")
                ch = msg['reply_to']
                ch.put('ok')
        elif msg == 'stop':
            logger.info("Received stop message, stopping worker...")
            # first, delete an associated client.
            # it's possible this worker was not passed a client,
            # but if it was, we need to delete it before shutting down.
            if ag_client:
                logger.info("Requesting client {} be deleted.".format(ag_client.api_key))
                secret = os.environ.get('_abaco_secret')
                clients_ch = ClientsChannel()
                msg = clients_ch.request_delete_client(tenant=tenant,
                                                       actor_id=actor_id,
                                                       worker_id=worker_id,
                                                       client_id=ag_client.api_key,
                                                       secret=secret)
                if msg['status'] == 'ok':
                    logger.info("Delete request completed successfully.")
                else:
                    logger.error("Error deleting client. Message: {}".format(msg['message']))
            else:
                logger.info("Did not receive client. Not issuing delete. Exiting.")
            try:
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                logger.info("Got WorkerException from delete_worker(). Exception: {}".format(e))
            keep_running = False
            actor_ch.close()
            logger.info("Closing actor channel for actor: {}".format(actor_id))
            logger.info("Worker is now exiting.")
            sys.exit()
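# --- Hedged usage sketch (not part of the source) ---------------------------
# process_worker_ch is written as a thread target, so the worker's startup
# code would launch it on a background thread roughly as below. The tenant,
# channel, id, and client variables are hypothetical placeholders for the
# objects the real worker constructs before subscribing.
import threading

listener = threading.Thread(target=process_worker_ch,
                            args=(tenant, worker_ch, actor_id, worker_id,
                                  actor_ch, ag_client))
listener.daemon = True  # don't block process exit on the listener thread
listener.start()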
def kill_worker(self, actor_id, worker_id):
    try:
        Worker.delete_worker(actor_id, worker_id)
    except WorkerException as e:
        logger.info("Got WorkerException from delete_worker(). "
                    "worker_id: {}; "
                    "Exception: {}".format(worker_id, e))
    except Exception as e:
        logger.error("Got an unexpected exception from delete_worker(). "
                     "worker_id: {}; "
                     "Exception: {}".format(worker_id, e))
def kill_worker(self, actor_id, worker_id):
    logger.debug(f"top of kill_worker: {actor_id}_{worker_id}")
    try:
        Worker.delete_worker(actor_id, worker_id)
        logger.debug(f"worker deleted; {actor_id}_{worker_id}")
    except WorkerException as e:
        logger.info("Got WorkerException from delete_worker(). "
                    "worker_id: {}; "
                    "Exception: {}".format(worker_id, e))
    except Exception as e:
        logger.error("Got an unexpected exception from delete_worker(). "
                     "worker_id: {}; "
                     "Exception: {}".format(worker_id, e))
def process_worker_ch(tenant, worker_ch, actor_id, actor_ch, ag_client):
    """
    Target for a thread to listen on the worker channel for a message to stop processing.
    :param worker_ch:
    :return:
    """
    global keep_running
    print("Worker subscribing to worker channel...")
    while True:
        try:
            msg = worker_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        print("Received message in worker channel: {}".format(msg))
        print("Type(msg)={}".format(type(msg)))
        if type(msg) == dict:
            value = msg.get('value', '')
            if value == 'status':
                # this is a health check; return 'ok' to the reply_to channel.
                ch = msg['reply_to']
                ch.put('ok')
        elif msg == 'stop':
            print("Received stop message, stopping worker...")
            # first, delete an associated client.
            # it's possible this worker was not passed a client,
            # but if it was, we need to delete it before shutting down.
            if ag_client:
                print("Requesting client {} be deleted.".format(ag_client.api_key))
                secret = os.environ.get('_abaco_secret')
                clients_ch = ClientsChannel()
                msg = clients_ch.request_delete_client(tenant=tenant,
                                                       actor_id=actor_id,
                                                       worker_id=worker_ch.name,
                                                       client_id=ag_client.api_key,
                                                       secret=secret)
                if msg['status'] == 'ok':
                    print("Delete request completed successfully.")
                else:
                    print("Error deleting client. Message: {}".format(msg['message']))
            else:
                print("Did not receive client. Not issuing delete. Exiting.")
            try:
                Worker.delete_worker(actor_id, worker_ch.name)
            except WorkerException:
                pass
            keep_running = False
            actor_ch.close()
            sys.exit()
def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor. Pass db_id as the `actor_id` argument."""
    workers = Worker.get_workers(actor_id)
    # @TODO - this code is not thread safe. we need to update the workers state in a transaction:
    for _, worker in workers.items():
        # if there is already a channel, tell the worker to shut itself down
        if 'ch_name' in worker:
            shutdown_worker(worker['ch_name'])
        else:
            # otherwise, just remove from the db:
            try:
                worker_id = worker.get('id')
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                print("Got a WorkerException trying to delete worker with id: {}; "
                      "exception: {}".format(worker_id, e))
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    print("Checking health for actor: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo - we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if the worker is responsive; if not, we will need to manually kill it
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond; removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}; deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
        else:
            print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            print("Infinite ttl configured; leaving worker")
            return
        if worker['status'] == codes.READY and \
           worker['last_execution'] + ttl < time.time():
            # shut down the worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    print("Checking health for actor: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for _, worker in workers.items():
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if the worker is responsive; if not, we will need to manually kill it
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond; removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}; deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
        else:
            print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            print("Infinite ttl configured; leaving worker")
            return
        if worker['status'] == codes.READY and \
           worker['last_execution'] + ttl < time.time():
            # shut down the worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
def hard_delete_worker(actor_id, worker_id, worker_container_id=None, reason_str=None):
    """
    Hard delete of a worker from the db. Will also try to hard remove the worker container,
    if a container id is passed, but does not stop for errors.

    :param actor_id: db_id of the actor.
    :param worker_id: id of the worker.
    :param worker_container_id: Docker container id of the worker container (optional).
    :param reason_str: The reason the worker is being hard deleted (optional; for the logs only).
    :return: None
    """
    logger.error(f"Top of hard_delete_worker for actor_id: {actor_id}; "
                 f"worker_id: {worker_id}; "
                 f"worker_container_id: {worker_container_id}; "
                 f"reason: {reason_str}")
    # hard delete from the worker db --
    try:
        Worker.delete_worker(actor_id, worker_id)
        logger.info(f"worker {worker_id} deleted from store")
    except Exception as e:
        logger.error(f"Got exception trying to delete worker: {worker_id}; exception: {e}")

    # also try to delete the container --
    if worker_container_id:
        try:
            rm_container(worker_container_id)
            logger.info(f"worker {worker_id} container deleted from docker")
        except Exception as e:
            logger.error(f"Got exception trying to delete worker container; worker: {worker_id}; "
                         f"container: {worker_container_id}; exception: {e}")
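# --- Hedged usage sketch (not part of the source) ---------------------------
# A caller holding a worker record might invoke hard_delete_worker as below.
# The `worker` dict and `actor_db_id` value are hypothetical; the 'id' and
# 'cid' fields mirror the keys used by the health-check code in this module.
worker = {'id': 'worker-123', 'cid': 'abc123def456'}
hard_delete_worker(actor_db_id,
                   worker['id'],
                   worker_container_id=worker.get('cid'),
                   reason_str="worker unresponsive to health check")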
def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor. Pass db_id as the `actor_id` argument."""
    logger.debug("shutdown_workers() called for actor: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    # @TODO - this code is not thread safe. we need to update the workers state in a transaction:
    for _, worker in workers.items():
        # if there is already a channel, tell the worker to shut itself down
        if 'ch_name' in worker:
            logger.info("worker had channel: {}. calling shutdown_worker() "
                        "for worker: {}".format(worker['ch_name'], worker.get('id')))
            shutdown_worker(worker['ch_name'])
        else:
            # otherwise, just remove from the db:
            try:
                logger.info("worker: {} did not have a channel. "
                            "Deleting worker.".format(worker.get('id')))
                worker_id = worker.get('id')
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                logger.error("Got a WorkerException trying to delete worker with id: {}; "
                             "exception: {}".format(worker_id, e))
def process(self, cmd):
    """Main spawner method for processing a command from the CommandChannel."""
    logger.info("top of process; cmd: {}".format(cmd))
    actor_id = cmd['actor_id']
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except Exception as e:
        msg = f"Exception in spawner trying to retrieve actor object from store. Aborting. Exception: {e}"
        logger.error(msg)
        return
    worker_id = cmd['worker_id']
    image = cmd['image']
    tenant = cmd['tenant']
    stop_existing = cmd.get('stop_existing', True)
    num_workers = 1
    logger.debug("spawner command params: actor_id: {}; worker_id: {}; image: {}; tenant: {}; "
                 "stop_existing: {}; num_workers: {}".format(actor_id, worker_id, image, tenant,
                                                             stop_existing, num_workers))
    # if the worker was sent a delete request before the spawner received this message to create the worker,
    # the status will be SHUTDOWN_REQUESTED, not REQUESTED. in that case, we simply abort and remove the
    # worker from the collection.
    try:
        logger.debug("spawner checking worker's status for SHUTDOWN_REQUESTED")
        worker = Worker.get_worker(actor_id, worker_id)
        logger.debug(f"spawner got worker; worker: {worker}")
    except Exception as e:
        logger.error(f"spawner got exception trying to retrieve worker. "
                     f"actor_id: {actor_id}; worker_id: {worker_id}; e: {e}")
        return
    status = worker.get('status')
    if not status == REQUESTED:
        logger.debug(f"worker was NOT in REQUESTED status. status: {status}")
        if status == SHUTDOWN_REQUESTED or status == SHUTTING_DOWN or status == ERROR:
            logger.debug(f"worker status was {status}; spawner deleting worker and returning..")
            try:
                Worker.delete_worker(actor_id, worker_id)
                logger.debug("spawner deleted worker because it was SHUTDOWN_REQUESTED.")
                return
            except Exception as e:
                logger.error(f"spawner got exception trying to delete a worker in SHUTDOWN_REQUESTED status. "
                             f"actor_id: {actor_id}; worker_id: {worker_id}; e: {e}")
                return
        else:
            logger.error(f"spawner found worker in unexpected status: {status}. Not processing command and returning.")
            return

    # worker status was REQUESTED; moving on to SPAWNER_SETUP ----
    Worker.update_worker_status(actor_id, worker_id, SPAWNER_SETUP)
    logger.debug("spawner has updated worker status to SPAWNER_SETUP; worker_id: {}".format(worker_id))
    client_id = None
    client_secret = None
    client_access_token = None
    client_refresh_token = None
    api_server = None

    # ---- OAuth client generation for the worker -------
    # check if the tenant and instance are configured for client generation -
    try:
        generate_clients = Config.get('workers', f'{tenant}_generate_clients').lower()
    except:
        logger.debug(f"Did not find a {tenant}_generate_clients config. Looking for a global config.")
        generate_clients = Config.get('workers', 'generate_clients').lower()
    logger.debug(f"final generate_clients: {generate_clients}")
    if generate_clients == "true":
        logger.debug("client generation was configured to be available; now checking the actor's token attr.")
        # updated 1.3.0 -- check whether the actor requires a token:
        if actor.token:
            logger.debug("spawner starting client generation")
            client_id, \
            client_access_token, \
            client_refresh_token, \
            api_server, \
            client_secret = self.client_generation(actor_id, worker_id, tenant)
        else:
            logger.debug("actor's token attribute was False. Not generating client.")

    ch = SpawnerWorkerChannel(worker_id=worker_id)
    logger.debug("spawner attempting to start worker; worker_id: {}".format(worker_id))
    try:
        worker = self.start_worker(
            image,
            tenant,
            actor_id,
            worker_id,
            client_id,
            client_access_token,
            client_refresh_token,
            ch,
            api_server,
            client_secret
        )
    except Exception as e:
        msg = "Spawner got an exception from call to start_worker. Exception: {}".format(e)
        logger.error(msg)
        self.error_out_actor(actor_id, worker_id, msg)
        if client_id:
            self.delete_client(tenant, actor_id, worker_id, client_id, client_secret)
        return
    logger.debug("Returned from start_worker; created new worker: {}".format(worker))
    ch.close()
    logger.debug("Client channel closed")

    if stop_existing:
        logger.info("Stopping existing workers: {}".format(worker_id))
        # TODO - update status to stop_requested
        self.stop_workers(actor_id, [worker_id])
def process_worker_ch(tenant, worker_ch, actor_id, worker_id, actor_ch, ag_client):
    """
    Target for a thread to listen on the worker channel for a message to stop processing.
    :param worker_ch:
    :return:
    """
    global keep_running
    logger.info("Worker subscribing to worker channel...")
    while True:
        msg = worker_ch.get_one()
        logger.debug("Received message in worker channel: {}".format(msg))
        logger.debug("Type(msg)={}".format(type(msg)))
        if type(msg) == dict:
            value = msg.get('value', '')
            if value == 'status':
                # this is a health check; return 'ok' to the reply_to channel.
                logger.debug("received health check. returning 'ok'.")
                ch = msg['reply_to']
                ch.put('ok')
                # @TODO -
                # delete the anonymous channel from this thread, but sleep first to avoid the race condition.
                time.sleep(1.5)
                ch.delete()
                # NOT doing this for now -- deleting the entire anon channel instead (see above).
                # clean up the event queue on this anonymous channel. this should be fixed in channelpy.
                # ch._queue._event_queue
        elif msg == 'stop' or msg == 'stop-no-delete':
            logger.info("Worker with worker_id: {} (actor_id: {}) received stop message, "
                        "stopping worker...".format(worker_id, actor_id))
            # when an actor's image is updated, old workers are deleted while new workers are
            # created. deleting the actor msg channel in this case leads to race conditions
            delete_actor_ch = True
            if msg == 'stop-no-delete':
                logger.info("Got stop-no-delete; will not delete actor_ch.")
                delete_actor_ch = False
            # first, delete an associated client.
            # it's possible this worker was not passed a client,
            # but if it was, we need to delete it before shutting down.
            if ag_client:
                logger.info("Requesting client {} be deleted.".format(ag_client.api_key))
                secret = os.environ.get('_abaco_secret')
                clients_ch = ClientsChannel()
                msg = clients_ch.request_delete_client(tenant=tenant,
                                                       actor_id=actor_id,
                                                       worker_id=worker_id,
                                                       client_id=ag_client.api_key,
                                                       secret=secret)
                if msg['status'] == 'ok':
                    logger.info("Client delete request completed successfully for "
                                "worker_id: {}, client_id: {}.".format(worker_id, ag_client.api_key))
                else:
                    logger.error("Error deleting client for "
                                 "worker_id: {}, client_id: {}. Message: {}".format(worker_id,
                                                                                    ag_client.api_key,
                                                                                    msg['message']))
                clients_ch.close()
            else:
                logger.info("Did not receive client. Not issuing delete. Exiting.")
            try:
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                logger.info("Got WorkerException from delete_worker(). "
                            "worker_id: {}; "
                            "Exception: {}".format(worker_id, e))
            keep_running = False
            # delete associated channels:
            if delete_actor_ch:
                actor_ch.delete()
            worker_ch.delete()
            logger.info("WorkerChannel deleted and ActorMsgChannel closed for actor: {} "
                        "worker_id: {}".format(actor_id, worker_id))
            logger.info("Worker with worker_id: {} is now exiting.".format(worker_id))
            _thread.interrupt_main()
            logger.info("main thread interrupted.")
            os._exit(0)
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    host_id = os.environ.get('SPAWNER_HOST_ID', Config.get('spawner', 'host_id'))
    logger.debug("host_id: {}".format(host_id))
    for worker in workers:
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo - we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not host_id == worker['host_id']:
            continue
        # first check if the worker is responsive; if not, we will need to manually kill it
        logger.info("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(worker_id=worker['id'])
        worker_id = worker.get('id')
        result = None
        try:
            logger.debug("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info("Worker did not respond; removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
                logger.info("worker {} deleted from store".format(worker_id))
            except Exception as e:
                logger.error("Got exception trying to delete worker: {}".format(e))
            # if the put_sync timed out and we removed the worker, we also need to delete the channel;
            # otherwise the un-acked message will remain.
            try:
                ch.delete()
            except Exception as e:
                logger.error("Got exception: {} while trying to delete worker channel "
                             "for worker: {}".format(e, worker_id))
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error("Got an error trying to close the worker channel for dead worker. "
                             "Exception: {}".format(e))
        if result and not result == 'ok':
            logger.error("Worker responded unexpectedly: {}; deleting worker.".format(result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error("Got error removing/deleting worker: {}".format(e))
        else:
            # worker is healthy, so update the last health check time:
            Worker.update_worker_health_time(actor_id, worker_id)
            logger.info("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            return
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = worker.get('last_execution_time', 0)
            # if the worker has made zero executions, use the create_time
            if last_execution == 0:
                last_execution = worker.get('create_time', datetime.datetime.min)
            logger.debug("using last_execution: {}".format(last_execution))
            try:
                assert type(last_execution) == datetime.datetime
            except:
                logger.error("Time received for TTL measurements is not of type datetime.")
                last_execution = datetime.datetime.min
            if last_execution + datetime.timedelta(seconds=ttl) < datetime.datetime.utcnow():
                # shut down the worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(actor_id, worker['id'])
            else:
                logger.info("Still time left for this worker.")
        if worker['status'] == codes.ERROR:
            # shut down the worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(actor_id, worker['id'])
def process_worker_ch(tenant, worker_ch, actor_id, worker_id, actor_ch, ag_client):
    """
    Target for a thread to listen on the worker channel for a message to stop processing.
    :param worker_ch:
    :return:
    """
    global keep_running
    logger.info("Worker subscribing to worker channel...")
    while keep_running:
        msg, msg_obj = worker_ch.get_one()
        # receiving the message is enough to ack it -- resiliency is currently handled in the calling code.
        msg_obj.ack()
        logger.debug("Received message in worker channel: {}".format(msg))
        logger.debug("Type(msg)={}".format(type(msg)))
        if type(msg) == dict:
            value = msg.get('value', '')
            if value == 'status':
                # this is a health check; return 'ok' to the reply_to channel.
                logger.debug("received health check. returning 'ok'.")
                ch = msg['reply_to']
                ch.put('ok')
                # @TODO -
                # delete the anonymous channel from this thread, but sleep first to avoid the race condition.
                time.sleep(1.5)
                ch.delete()
                # NOT doing this for now -- deleting the entire anon channel instead (see above).
                # clean up the event queue on this anonymous channel. this should be fixed in channelpy.
                # ch._queue._event_queue
        elif msg == 'force_quit':
            logger.info("Worker with worker_id: {} (actor_id: {}) received a force_quit message, "
                        "forcing the execution to halt...".format(worker_id, actor_id))
            globals.force_quit = True
        elif msg == 'stop' or msg == 'stop-no-delete':
            logger.info("Worker with worker_id: {} (actor_id: {}) received stop message, "
                        "stopping worker...".format(worker_id, actor_id))
            # set the worker status to SHUTTING_DOWN:
            try:
                Worker.update_worker_status(actor_id, worker_id, SHUTTING_DOWN)
            except Exception as e:
                logger.error(f"worker got exception trying to update status to SHUTTING_DOWN. "
                             f"actor_id: {actor_id}; worker_id: {worker_id}; exception: {e}")
            globals.keep_running = False
            # when an actor's image is updated, old workers are deleted while new workers are
            # created. deleting the actor msg channel in this case leads to race conditions
            delete_actor_ch = True
            if msg == 'stop-no-delete':
                logger.info("Got stop-no-delete; will not delete actor_ch.")
                delete_actor_ch = False
            # if a `stop` was sent, the actor is being deleted, and so we want to immediately shut down processing.
            else:
                globals.force_quit = True
            # first, delete an associated client.
            # it's possible this worker was not passed a client,
            # but if it was, we need to delete it before shutting down.
            if ag_client:
                logger.info("Requesting client {} be deleted.".format(ag_client.api_key))
                secret = os.environ.get('_abaco_secret')
                clients_ch = ClientsChannel()
                msg = clients_ch.request_delete_client(tenant=tenant,
                                                       actor_id=actor_id,
                                                       worker_id=worker_id,
                                                       client_id=ag_client.api_key,
                                                       secret=secret)
                if msg['status'] == 'ok':
                    logger.info("Client delete request completed successfully for "
                                "worker_id: {}, client_id: {}.".format(worker_id, ag_client.api_key))
                else:
                    logger.error("Error deleting client for "
                                 "worker_id: {}, client_id: {}. Message: {}".format(worker_id,
                                                                                    ag_client.api_key,
                                                                                    msg['message']))
                clients_ch.close()
            else:
                logger.info("Did not receive client. Not issuing delete. Exiting.")
            try:
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                logger.info("Got WorkerException from delete_worker(). "
                            "worker_id: {}; "
                            "Exception: {}".format(worker_id, e))
            # delete associated channels:
            # it is possible the actor channel was already deleted, in which case we just keep processing
            if delete_actor_ch:
                try:
                    actor_ch.delete()
                    logger.info("ActorChannel deleted for actor: {} worker_id: {}".format(actor_id, worker_id))
                except Exception as e:
                    logger.info("Got exception deleting ActorChannel for actor: {} "
                                "worker_id: {}; exception: {}".format(actor_id, worker_id, e))
            try:
                worker_ch.delete()
                logger.info("WorkerChannel deleted for actor: {} worker_id: {}".format(actor_id, worker_id))
            except Exception as e:
                logger.info("Got exception deleting WorkerChannel for actor: {} "
                            "worker_id: {}; exception: {}".format(actor_id, worker_id, e))
            logger.info("Worker with worker_id: {} is now exiting.".format(worker_id))
            _thread.interrupt_main()
            logger.info("main thread interrupted, issuing os._exit()...")
            os._exit(0)
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo - we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if the worker is responsive; if not, we will need to manually kill it
        logger.info("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(worker_id=worker['id'])
        worker_id = worker.get('id')
        # initialize result so the timeout path below does not leave it unbound:
        result = None
        try:
            logger.debug("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info("Worker did not respond; removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error("Got exception trying to delete worker: {}".format(e))
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error("Got an error trying to close the worker channel for dead worker. "
                             "Exception: {}".format(e))
        if result and not result == 'ok':
            logger.error("Worker responded unexpectedly: {}; deleting worker.".format(result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error("Got error removing/deleting worker: {}".format(e))
        else:
            # worker is healthy, so update the last health check time:
            Worker.update_worker_health_time(actor_id, worker_id)
            logger.info("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            return
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = int(float(worker.get('last_execution_time', 0)))
            if last_execution + ttl < time.time():
                # shut down the worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(worker['id'])
            else:
                logger.info("Still time left for this worker.")
        elif worker['status'] == codes.ERROR:
            # shut down the worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(worker['id'])
        else:
            logger.debug("Worker not in READY status, will postpone.")
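# --- Hedged sketch of a health-check driver (not part of the source) --------
# A loop like the one below would call check_workers() for every actor.
# get_actor_ids() and the 'worker_ttl' config key are assumptions standing in
# for however the surrounding agent enumerates actors and reads the ttl.
def run_health_checks():
    ttl = int(Config.get('workers', 'worker_ttl'))  # assumed config key
    for actor_id in get_actor_ids():  # hypothetical helper
        check_workers(actor_id, ttl)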