Code example #1
def scale_down(actor_id):
    workers = Worker.get_workers(actor_id)
    logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
    try:
        # if len(workers) == 1:
        #     logger.debug("METRICS only one worker, won't scale down")
        # else:
        while len(workers) > 0:
            logger.debug('METRICS made it STATUS check')
            worker = workers.popitem()[1]
            logger.debug('METRICS SCALE DOWN current worker: {}'.format(
                worker['status']))
            # check status of the worker is ready
            if worker['status'] == 'READY':
                # scale down
                try:
                    shutdown_worker(worker['id'], delete_actor_ch=False)
                    continue
                except Exception as e:
                    logger.debug(
                        'METRICS ERROR shutting down worker: {} - {} - {}'.
                        format(type(e), e, e.args))
                logger.debug('METRICS shut down worker {}'.format(
                    worker['id']))

    except IndexError:
        logger.debug('METRICS only one worker found for actor {}. '
                     'Will not scale down'.format(actor_id))
    except Exception as e:
        logger.debug("METRICS SCALE UP FAILED: {}".format(e))
Code example #2
File: controllers.py  Project: TACC/abaco
 def delete(self, actor_id, ch_name):
     id = Actor.get_dbid(g.tenant, actor_id)
     try:
         worker = Worker.get_worker(id, ch_name)
     except WorkerException as e:
         raise APIException(e.msg, 404)
     shutdown_worker(ch_name)
     return ok(result=None, msg="Worker scheduled to be stopped.")
Code example #3
 def delete(self, actor_id, worker_id):
     id = Actor.get_dbid(g.tenant, actor_id)
     try:
         worker = Worker.get_worker(id, worker_id)
     except WorkerException as e:
         raise ResourceError(e.msg, 404)
     shutdown_worker(worker['ch_name'])
     return ok(result=None, msg="Worker scheduled to be stopped.")
Code example #4
File: controllers.py  Project: ehb54/abaco-1
 def delete(self, actor_id, worker_id):
     logger.debug("top of DELETE /actors/{}/workers/{}.".format(actor_id, worker_id))
     id = Actor.get_dbid(g.tenant, actor_id)
     try:
         worker = Worker.get_worker(id, worker_id)
     except WorkerException as e:
         logger.debug("Did not find worker: {}. actor: {}.".format(worker_id, actor_id))
         raise ResourceError(e.msg, 404)
     logger.info("calling shutdown_worker(). worker: {}. actor: {}.".format(worker_id, actor_id))
     shutdown_worker(worker['id'])
     logger.info("shutdown_worker() called for worker: {}. actor: {}.".format(worker_id, actor_id))
     return ok(result=None, msg="Worker scheduled to be stopped.")
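
The log line at the top of this handler indicates the route is DELETE /actors/{actor_id}/workers/{worker_id}. A hedged client-side sketch of calling it with requests; the base URL and the authorization header format are assumptions that depend on the deployment.

import requests

API_BASE = "https://example.org/actors"        # assumption: deployment-specific base URL
HEADERS = {"Authorization": "Bearer <token>"}  # assumption: auth scheme varies by deployment

def request_worker_shutdown(actor_id, worker_id):
    """Ask the API to schedule a worker shutdown via the DELETE handler above."""
    url = "{}/{}/workers/{}".format(API_BASE, actor_id, worker_id)
    rsp = requests.delete(url, headers=HEADERS)
    rsp.raise_for_status()
    # on success the handler responds with "Worker scheduled to be stopped."
    return rsp.json()
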
Code example #5
File: health.py  Project: mwvaughn/abaco
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    print("Checking health for actors: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
        else:
            print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            print("Infinite ttl configured; leaving worker")
            return
        if worker['status'] == codes.READY and \
            worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
Code example #6
File: health.py  Project: TACC/abaco
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    print("Checking health for actors: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for _, worker in workers.items():
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
        else:
            print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            print("Infinite ttl configured; leaving worker")
            return
        if worker['status'] == codes.READY and \
            worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
Code example #7
File: health.py  Project: shresnis000/abaco
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    host_id = os.environ.get('SPAWNER_HOST_ID',
                             Config.get('spawner', 'host_id'))
    logger.debug("host_id: {}".format(host_id))
    for worker in workers:
        worker_id = worker['id']
        worker_status = worker.get('status')
        # if the worker has only been requested, it will not have a host_id. it is possible
        # the worker will ultimately get scheduled on a different host; however, if there is
        # some issue and the worker is "stuck" in the early phases, we should remove it..
        if 'host_id' not in worker:
            # check for an old create time
            worker_create_t = worker.get('create_time')
            # in versions prior to 1.9, worker create_time was not set until after it was READY
            if not worker_create_t:
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str=
                    'Worker did not have a host_id or create_time field.')
            # if still no host after 5 minutes, delete it
            if worker_create_t < get_current_utc_time() - datetime.timedelta(
                    minutes=5):
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str='Worker did not have a host_id and had '
                    'old create_time field.')

        # ignore workers on different hosts because this health agent cannot interact with the
        # docker daemon responsible for the worker container..
        if not host_id == worker['host_id']:
            continue

        # we need to delete any worker that is in SHUTDOWN REQUESTED or SHUTTING down for too long
        if worker_status == codes.SHUTDOWN_REQUESTED or worker_status == codes.SHUTTING_DOWN:
            worker_last_health_check_time = worker.get(
                'last_health_check_time')
            if not worker_last_health_check_time:
                worker_last_health_check_time = worker.get('create_time')
            if not worker_last_health_check_time:
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str='Worker in SHUTDOWN and no health checks.')
            elif worker_last_health_check_time < get_current_utc_time(
            ) - datetime.timedelta(minutes=5):
                hard_delete_worker(
                    actor_id,
                    worker_id,
                    reason_str='Worker in SHUTDOWN for too long.')

        # check if the worker has not responded to a health check recently; we use a relatively long period
        # (60 minutes) of idle health checks in case there is an issue with sending health checks through rabbitmq.
        # this needs to be watched closely though...
        worker_last_health_check_time = worker.get('last_health_check_time')
        if not worker_last_health_check_time or \
                (worker_last_health_check_time < get_current_utc_time() - datetime.timedelta(minutes=60)):
            hard_delete_worker(
                actor_id,
                worker_id,
                reason_str='Worker has not health checked for too long.')

        # first send worker a health check
        logger.info(f"sending worker {worker_id} a health check")
        ch = WorkerChannel(worker_id=worker_id)
        try:
            logger.debug("Issuing status check to channel: {}".format(
                worker['ch_name']))
            ch.put('status')
        except (channelpy.exceptions.ChannelTimeoutException, Exception) as e:
            logger.error(
                f"Got exception of type {type(e)} trying to send worker {worker_id} a "
                f"health check. e: {e}")
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error(
                    "Got an error trying to close the worker channel for dead worker. Exception: {}"
                    .format(e))

        # now check if the worker has been idle beyond the max worker_ttl configured for this abaco:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            continue
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = worker.get('last_execution_time', 0)
            # if worker has made zero executions, use the create_time
            if last_execution == 0:
                last_execution = worker.get('create_time',
                                            datetime.datetime.min)
            logger.debug("using last_execution: {}".format(last_execution))
            try:
                assert type(last_execution) == datetime.datetime
            except:
                logger.error(
                    "Time received for TTL measurements is not of type datetime."
                )
                last_execution = datetime.datetime.min
            if last_execution + datetime.timedelta(
                    seconds=ttl) < datetime.datetime.utcnow():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(actor_id, worker['id'])
            else:
                logger.info("Still time left for this worker.")

        if worker['status'] == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(actor_id, worker['id'])
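
Code example #7 applies the same staleness test in several places: a stored timestamp is either missing or older than a fixed window. A small helper along the following lines could express that once; it is a sketch that reuses get_current_utc_time() from the snippet and folds the missing-timestamp case into the same answer.

import datetime

def older_than(ts, minutes):
    """True if ts is missing or lies more than `minutes` minutes in the past."""
    if not ts:
        return True
    return ts < get_current_utc_time() - datetime.timedelta(minutes=minutes)

# e.g. the SHUTDOWN check above could then be written as:
#   if older_than(worker.get('last_health_check_time') or worker.get('create_time'), 5):
#       hard_delete_worker(actor_id, worker_id, reason_str='Worker in SHUTDOWN for too long.')
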
Code example #8
File: controllers.py  Project: ehb54/abaco-1
    def check_metrics(self, actor_ids):
        for actor_id in actor_ids:
            logger.debug("TOP OF CHECK METRICS")

            query = {
                'query': 'message_count_for_actor_{}'.format(actor_id.decode("utf-8").replace('-', '_')),
                'time': datetime.datetime.utcnow().isoformat() + "Z"
            }
            r = requests.get(PROMETHEUS_URL + '/api/v1/query', params=query)
            data = json.loads(r.text)['data']['result']

            change_rate = 0
            try:
                previous_data = last_metric[actor_id]
                try:
                    change_rate = int(data[0]['value'][1]) - int(previous_data[0]['value'][1])
                except:
                    logger.debug("Could not calculate change rate.")
            except:
                logger.info("No previous data yet for new actor {}".format(actor_id))

            last_metric.update({actor_id: data})
            # Add a worker if message count reaches a given number
            try:
                logger.debug("METRICS current message count: {}".format(data[0]['value'][1]))
                if int(data[0]['value'][1]) >= 1:
                    tenant, aid = actor_id.decode('utf8').split('_')
                    logger.debug('METRICS Attempting to create a new worker for {}'.format(actor_id))
                    try:
                        # create a worker & add to this actor
                        actor = Actor.from_db(actors_store[actor_id])
                        worker_ids = [Worker.request_worker(tenant=tenant, actor_id=aid)]
                        logger.info("New worker id: {}".format(worker_ids[0]))
                        ch = CommandChannel()
                        ch.put_cmd(actor_id=actor.db_id,
                                   worker_ids=worker_ids,
                                   image=actor.image,
                                   tenant=tenant,
                                   num=1,
                                   stop_existing=False)
                        ch.close()
                        logger.debug('METRICS Added worker successfully for {}'.format(actor_id))
                    except Exception as e:
                        logger.debug("METRICS - SOMETHING BROKE: {} - {} - {}".format(type(e), e, e.args))
                elif int(data[0]['value'][1]) <= 1:
                    logger.debug("METRICS made it to scale down block")
                    # Check the number of workers for this actor before deciding to scale down
                    workers = Worker.get_workers(actor_id)
                    logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
                    try:
                        if len(workers) == 1:
                            logger.debug("METRICS only one worker, won't scale down")
                        else:
                            while len(workers) > 0:
                                logger.debug('METRICS made it STATUS check')
                                worker = workers.popitem()[1]
                                logger.debug('METRICS SCALE DOWN current worker: {}'.format(worker['status']))
                                # check status of the worker is ready
                                if worker['status'] == 'READY':
                                    logger.debug("METRICS I MADE IT")
                                    # scale down
                                    try:
                                        shutdown_worker(worker['id'])
                                        continue
                                    except Exception as e:
                                        logger.debug('METRICS ERROR shutting down worker: {} - {} - {}'.format(type(e), e, e.args))
                                    logger.debug('METRICS shut down worker {}'.format(worker['id']))

                    except IndexError:
                        logger.debug('METRICS only one worker found for actor {}. '
                                     'Will not scale down'.format(actor_id))
                    except Exception as e:
                        logger.debug("METRICS SCALE UP FAILED: {}".format(e))


            except Exception as e:
                logger.debug("METRICS - ANOTHER ERROR: {} - {} - {}".format(type(e), e, e.args))
Code example #9
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    host_id = os.environ.get('SPAWNER_HOST_ID',
                             Config.get('spawner', 'host_id'))
    logger.debug("host_id: {}".format(host_id))
    for worker in workers:
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not host_id == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        logger.info("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(worker_id=worker['id'])
        worker_id = worker.get('id')
        result = None
        try:
            logger.debug("Issuing status check to channel: {}".format(
                worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info(
                "Worker did not respond, removing container and deleting worker."
            )
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
                logger.info("worker {} deleted from store".format(worker_id))
            except Exception as e:
                logger.error(
                    "Got exception trying to delete worker: {}".format(e))
            # if the put_sync timed out and we removed the worker, we also need to delete the channel
            # otherwise the un-acked message will remain.
            try:
                ch.delete()
            except Exception as e:
                logger.error(
                    "Got exception: {} while trying to delete worker channel for worker: {}"
                    .format(e, worker_id))
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error(
                    "Got an error trying to close the worker channel for dead worker. Exception: {}"
                    .format(e))
        if result and not result == 'ok':
            logger.error(
                "Worker responded unexpectedly: {}, deleting worker.".format(
                    result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got error removing/deleting worker: {}".format(e))
        else:
            # worker is healthy so update last health check:
            Worker.update_worker_health_time(actor_id, worker_id)
            logger.info("Worker ok.")

        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            return
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = worker.get('last_execution_time', 0)
            # if worker has made zero executions, use the create_time
            if last_execution == 0:
                last_execution = worker.get('create_time',
                                            datetime.datetime.min)
            logger.debug("using last_execution: {}".format(last_execution))
            try:
                assert type(last_execution) == datetime.datetime
            except:
                logger.error(
                    "Time received for TTL measurements is not of type datetime."
                )
                last_execution = datetime.datetime.min
            if last_execution + datetime.timedelta(
                    seconds=ttl) < datetime.datetime.utcnow():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(actor_id, worker['id'])
            else:
                logger.info("Still time left for this worker.")

        if worker['status'] == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(actor_id, worker['id'])
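
Code example #9 adds an important detail: when put_sync times out, the status message is never acknowledged, so the worker channel itself has to be deleted or the message lingers. A hedged sketch of that probe-and-cleanup step in isolation; probe_worker is a hypothetical wrapper around the WorkerChannel class used above, not a helper from the project.

import channelpy

def probe_worker(worker_id, timeout=5):
    """Return the worker's status reply, or None after cleaning up an unresponsive channel."""
    ch = WorkerChannel(worker_id=worker_id)
    try:
        return ch.put_sync('status', timeout=timeout)
    except channelpy.exceptions.ChannelTimeoutException:
        try:
            # delete the channel so the un-acked status message does not remain
            ch.delete()
        except Exception:
            pass
        return None
    finally:
        try:
            ch.close()
        except Exception:
            pass
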
Code example #10
def scale_down(actor_id, is_sync_actor=False):
    logger.debug(f"top of scale_down for actor_id: {actor_id}")
    workers = Worker.get_workers(actor_id)
    logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
    try:
        while len(workers) > 0:
            logger.debug('METRICS made it STATUS check')
            check_ttl = False
            sync_max_idle_time = 0
            if len(workers) == 1 and is_sync_actor:
                logger.debug(
                    "only one worker, on sync actor. checking worker idle time.."
                )
                try:
                    sync_max_idle_time = int(
                        Config.get('worker', 'sync_max_idle_time'))
                except Exception as e:
                    logger.error(
                        f"Got exception trying to read sync_max_idle_time from config; e:{e}"
                    )
                    sync_max_idle_time = DEFAULT_SYNC_MAX_IDLE_TIME
                check_ttl = True
            worker = workers.popitem()[1]
            if check_ttl:
                try:
                    last_execution = int(
                        float(worker.get('last_execution_time', 0)))
                except Exception as e:
                    logger.error(
                        f"metrics got exception trying to compute last_execution! e: {e}"
                    )
                    last_execution = 0
                # if worker has made zero executions, use the create_time
                if last_execution == 0:
                    last_execution = worker.get('create_time', 0)
                logger.debug("using last_execution: {}".format(last_execution))
                try:
                    last_execution = int(float(last_execution))
                except:
                    logger.error(
                        "Could not cast last_execution {} to int(float()".
                        format(last_execution))
                    last_execution = 0
                if last_execution + sync_max_idle_time < time.time():
                    # shutdown worker
                    logger.info("OK to shut down this worker -- beyond ttl.")
                    # continue onto additional checks below
                else:
                    logger.info(
                        "Autoscaler not shuting down this worker - still time left."
                    )
                    break

            logger.debug('METRICS SCALE DOWN current worker: {}'.format(
                worker['status']))
            # check status of the worker is ready
            if worker['status'] == 'READY':
                # scale down
                try:
                    shutdown_worker(actor_id,
                                    worker['id'],
                                    delete_actor_ch=False)
                    continue
                except Exception as e:
                    logger.debug(
                        'METRICS ERROR shutting down worker: {} - {} - {}'.
                        format(type(e), e, e.args))
                logger.debug('METRICS shut down worker {}'.format(
                    worker['id']))

    except IndexError:
        logger.debug('METRICS only one worker found for actor {}. '
                     'Will not scale down'.format(actor_id))
    except Exception as e:
        logger.debug("METRICS SCALE UP FAILED: {}".format(e))
Code example #11
File: metrics_utils.py  Project: TACC-Cloud/abaco
def scale_down(actor_id, is_sync_actor=False):
    """
    This function determines whether an actor's worker pool should be scaled down and if so,
    initiates the scaling down.
    :param actor_id: the actor_id
    :param is_sync_actor: whether or not the actor has the SYNC hint.
    :return:
    """
    logger.debug(f"top of scale_down for actor_id: {actor_id}")
    # we retrieve the current workers again as we will need the entire worker objects (not just the number).
    workers = Worker.get_workers(actor_id)
    logger.debug(f'scale_down number of workers: {len(workers)}')
    try:
        # iterate through all the actor's workers and determine if they should be shut down.
        while len(workers) > 0:
            # whether to check the TTL for this worker; we only check TTL for SYNC actors; for non-sync,
            # workers are immediately shut down when the actor has no messages.
            check_ttl = False
            sync_max_idle_time = 0
            if len(workers) == 1 and is_sync_actor:
                logger.debug(
                    "only one worker, on sync actor. checking worker idle time.."
                )
                try:
                    sync_max_idle_time = int(
                        Config.get('workers', 'sync_max_idle_time'))
                except Exception as e:
                    logger.error(
                        f"Got exception trying to read sync_max_idle_time from config; e:{e}"
                    )
                    sync_max_idle_time = DEFAULT_SYNC_MAX_IDLE_TIME
                check_ttl = True
            worker = workers.pop()
            logger.debug(f"check_ttl: {check_ttl} for worker: {worker}")
            if check_ttl:
                try:
                    last_execution = int(
                        float(worker.get('last_execution_time', 0)))
                except Exception as e:
                    logger.error(
                        f"metrics got exception trying to compute last_execution! e: {e}"
                    )
                    last_execution = 0
                # if worker has made zero executions, use the create_time
                if last_execution == 0:
                    last_execution = worker.get('create_time', 0)
                logger.debug("using last_execution: {}".format(last_execution))
                try:
                    last_execution = int(float(last_execution))
                except:
                    logger.error(
                        "Could not cast last_execution {} to int(float()".
                        format(last_execution))
                    last_execution = 0
                if last_execution + sync_max_idle_time < time.time():
                    # shutdown worker
                    logger.info("OK to shut down this worker -- beyond ttl.")
                    # continue onto additional checks below
                else:
                    logger.info(
                        "Autoscaler not shuting down this worker - still time left."
                    )
                    continue

            logger.debug('based on TTL, worker could be scaled down.')
            # check status of the worker is ready
            if worker['status'] == 'READY':
                # scale down
                logger.debug(
                    'worker was in READY status; attempting shutdown.')
                try:
                    shutdown_worker(actor_id,
                                    worker['id'],
                                    delete_actor_ch=False)
                    logger.debug('sent worker shutdown message.')
                    continue
                except Exception as e:
                    logger.debug(
                        'METRICS ERROR shutting down worker: {} - {} - {}'.
                        format(type(e), e, e.args))
                logger.debug('METRICS shut down worker {}'.format(
                    worker['id']))

    except IndexError:
        logger.debug('METRICS only one worker found for actor {}. '
                     'Will not scale down'.format(actor_id))
    except Exception as e:
        logger.debug("METRICS SCALE UP FAILED: {}".format(e))
Code example #12
File: health.py  Project: jlooney/abaco
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor."""
    logger.info("Checking health for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception trying to retrieve workers: {}".format(e))
        return None
    logger.debug("workers: {}".format(workers))
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        logger.info("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(worker_id=worker['id'])
        worker_id = worker.get('id')
        result = None
        try:
            logger.debug("Issuing status check to channel: {}".format(
                worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            logger.info(
                "Worker did not respond, removing container and deleting worker."
            )
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            try:
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got exception trying to delete worker: {}".format(e))
        finally:
            try:
                ch.close()
            except Exception as e:
                logger.error(
                    "Got an error trying to close the worker channel for dead worker. Exception: {}"
                    .format(e))
        if result and not result == 'ok':
            logger.error(
                "Worker responded unexpectedly: {}, deleting worker.".format(
                    result))
            try:
                rm_container(worker['cid'])
                Worker.delete_worker(actor_id, worker_id)
            except Exception as e:
                logger.error(
                    "Got error removing/deleting worker: {}".format(e))
        else:
            # worker is healthy so update last health check:
            Worker.update_worker_health_time(actor_id, worker_id)
            logger.info("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life
            logger.info("Infinite ttl configured; leaving worker")
            return
        # we don't shut down workers that are currently running:
        if not worker['status'] == codes.BUSY:
            last_execution = int(float(worker.get('last_execution_time', 0)))
            if last_execution + ttl < time.time():
                # shutdown worker
                logger.info("Shutting down worker beyond ttl.")
                shutdown_worker(worker['id'])
            else:
                logger.info("Still time left for this worker.")
        elif worker['status'] == codes.ERROR:
            # shutdown worker
            logger.info("Shutting down worker in error status.")
            shutdown_worker(worker['id'])
        else:
            logger.debug("Worker not in READY status, will postpone.")