def main():
    """
    Run the abaco health checks: ensure a spawner container is running
    (restarting one if not), then check the workers of every actor against
    the configured worker TTL.
    """
    logger.info("Running abaco health checks. Now: {}".format(time.time()))
    try:
        ttl = Config.get('workers', 'worker_ttl')
    except Exception as e:
        logger.error(
            "Could not get worker_ttl config. Exception: {}".format(e))
        # bind a fallback so the int(ttl) conversion below cannot NameError
        ttl = -1
    if not container_running(name='spawner*'):
        logger.critical("No spawners running! Launching new spawner..")
        command = 'python3 -u /actors/spawner.py'
        # check logging strategy to determine log file name:
        if get_log_file_strategy() == 'split':
            log_file = 'spawner.log'
        else:
            log_file = 'service.log'
        try:
            run_container_with_docker(AE_IMAGE,
                                      command,
                                      name='abaco_spawner_0',
                                      environment={'AE_IMAGE': AE_IMAGE},
                                      log_file=log_file)
        except Exception as e:
            logger.critical(
                "Could not restart spawner. Exception: {}".format(e))
    try:
        ttl = int(ttl)
    except Exception as e:
        logger.error("Invalid ttl config: {}. Setting to -1.".format(e))
        # -1 is treated as "no TTL" by the worker checks
        ttl = -1
    ids = get_actor_ids()
    logger.info("Found {} actor(s). Now checking status.".format(len(ids)))
    for actor_id in ids:
        check_workers(actor_id, ttl)
def run_worker(image, ch_name, worker_id):
    """ Run an actor executor worker with a given channel and image.
    :return:
    """
    logger.debug("top of run_worker()")
    command = 'python3 -u /actors/worker.py'
    logger.debug(
        "docker_utils running worker. image:{}, command:{}, chan:{}".format(
            image, command, ch_name))
    # determine what log file to use
    log_file = 'worker.log' if get_log_file_strategy() == 'split' else 'abaco.log'
    # environment handed to the worker container; the secret comes from this
    # process's own environment.
    worker_env = {
        'ch_name': ch_name,
        'image': image,
        'worker_id': worker_id,
        '_abaco_secret': os.environ.get('_abaco_secret'),
    }
    # don't catch errors -- if we get an error trying to run a worker, let it bubble up.
    container = run_container_with_docker(image=AE_IMAGE,
                                          command=command,
                                          environment=worker_env,
                                          log_file=log_file)
    # TODO - determines worker structure; should be placed in a proper DAO class.
    logger.info(
        "worker container running. worker_id: {}. container: {}".format(
            worker_id, container))
    return {
        'image': image,
        # @todo - location will need to change to support swarm or cluster
        'location': dd,
        'id': worker_id,
        'cid': container.get('Id'),
        'ch_name': ch_name,
        'status': BUSY,
        'host_id': host_id,
        'host_ip': host_ip,
        'last_execution': 0,
    }
def start_spawner(queue, idx='0'):
    """ Start a spawner on this host listening to a queue, `queue`.
    :param queue: (str) - the queue the spawner should listen to.
    :param idx: (str) - the index to use as a suffix to the spawner container name.
    :return:
    """
    command = 'python3 -u /actors/spawner.py'
    name = 'healthg_{}_spawner_{}'.format(queue, idx)
    try:
        environment = dict(os.environ)
    except Exception as e:
        environment = {}
        logger.error(
            "Unable to convert environment to dict; exception: {}".format(e))
    environment.update({
        'AE_IMAGE': AE_IMAGE.split(':')[0],
        'queue': queue,
    })
    if '_abaco_secret' not in environment:
        msg = 'Error in health process trying to start spawner. Did not find an _abaco_secret. Aborting'
        logger.critical(msg)
        # NOTE: the original bare `raise` had no active exception, so Python
        # itself raised RuntimeError and the message was lost. Raise the same
        # type explicitly, carrying the message.
        raise RuntimeError(msg)
    # check logging strategy to determine log file name:
    log_file = 'abaco.log'
    if get_log_file_strategy() == 'split':
        log_file = 'spawner.log'
    try:
        run_container_with_docker(AE_IMAGE,
                                  command,
                                  name=name,
                                  environment=environment,
                                  mounts=[],
                                  log_file=log_file)
    except Exception as e:
        logger.critical(
            "Could not restart spawner for queue {}. Exception: {}".format(
                queue, e))
def run_container_with_docker(image,
                              command,
                              name=None,
                              environment=None,
                              mounts=None,
                              log_file=None,
                              auto_remove=False,
                              client_id=None,
                              client_access_token=None,
                              client_refresh_token=None,
                              actor_id=None,
                              tenant=None,
                              api_server=None,
                              client_secret=None):
    """
    Run a container with docker mounted in it.
    Note: this function always mounts the abaco conf file so it should not be used by execute_actor().

    :param image: the docker image to run.
    :param command: command to run inside the container.
    :param name: optional container name.
    :param environment: optional dict of environment variables for the container.
    :param mounts: optional list of dicts with keys 'host_path', 'container_path', 'format'.
    :param log_file: log file name; derived from the log strategy when not passed.
    :param auto_remove: whether docker should remove the container on exit.
    :returns: the container dict returned by docker's create_container.
    :raises DockerError: when config is missing or the container cannot be started.
    """
    logger.debug("top of run_container_with_docker().")
    # Use None sentinels for the mutable defaults: this function mutates
    # `environment` below, so a shared {} default would leak keys across calls.
    if environment is None:
        environment = {}
    if mounts is None:
        mounts = []
    cli = docker.APIClient(base_url=dd, version="auto")
    # bind the docker socket as r/w since this container gets docker.
    volumes = ['/var/run/docker.sock']
    binds = {
        '/var/run/docker.sock': {
            'bind': '/var/run/docker.sock',
            'ro': False
        }
    }
    # add a bind key and dictionary as well as a volume for each mount
    for m in mounts:
        binds[m.get('host_path')] = {
            'bind': m.get('container_path'),
            'ro': m.get('format') == 'ro'
        }
        volumes.append(m.get('host_path'))
    # mount the abaco conf file. first we look for the environment variable, falling back to the value in Config.
    try:
        abaco_conf_host_path = os.environ.get('abaco_conf_host_path')
        if not abaco_conf_host_path:
            abaco_conf_host_path = Config.get('spawner', 'abaco_conf_host_path')
        logger.debug("docker_utils using abaco_conf_host_path={}".format(
            abaco_conf_host_path))
        # mount config file at the root of the container as r/o
        volumes.append('/service.conf')
        binds[abaco_conf_host_path] = {'bind': '/service.conf', 'ro': True}
    except configparser.NoOptionError as e:
        # if we're here, it's bad. we don't have a config file. better to cut and run,
        msg = "Did not find the abaco_conf_host_path in Config. Exception: {}".format(e)
        logger.error(msg)
        raise DockerError(msg)
    # also add it to the environment if not already there
    if 'abaco_conf_host_path' not in environment:
        environment['abaco_conf_host_path'] = abaco_conf_host_path
    if 'client_id' not in environment:
        environment['client_id'] = client_id
    if 'client_access_token' not in environment:
        environment['client_access_token'] = client_access_token
    if 'actor_id' not in environment:
        environment['actor_id'] = actor_id
    if 'tenant' not in environment:
        environment['tenant'] = tenant
    if 'api_server' not in environment:
        environment['api_server'] = api_server
    if 'client_secret' not in environment:
        environment['client_secret'] = client_secret
    if 'client_refresh_token' not in environment:
        environment['client_refresh_token'] = client_refresh_token
    # if not passed, determine what log file to use
    if not log_file:
        if get_log_file_strategy() == 'split':
            log_file = 'worker.log'
        else:
            log_file = 'abaco.log'
    # mount the logs file.
    volumes.append('/var/log/service.log')
    # first check to see if the logs directory config was set:
    try:
        logs_host_dir = Config.get('logs', 'host_dir')
    except (configparser.NoSectionError, configparser.NoOptionError):
        # if the directory is not configured, default it to abaco_conf_host_path
        logs_host_dir = os.path.dirname(abaco_conf_host_path)
    binds['{}/{}'.format(logs_host_dir, log_file)] = {
        'bind': '/var/log/service.log',
        'rw': True
    }
    host_config = cli.create_host_config(binds=binds, auto_remove=auto_remove)
    logger.debug("binds: {}".format(binds))
    # add the container to a specific docker network, if configured
    netconf = None
    try:
        docker_network = Config.get('spawner', 'docker_network')
    except Exception:
        docker_network = None
    if docker_network:
        netconf = cli.create_networking_config(
            {docker_network: cli.create_endpoint_config()})
    # create and start the container
    try:
        container = cli.create_container(image=image,
                                         environment=environment,
                                         volumes=volumes,
                                         host_config=host_config,
                                         command=command,
                                         name=name,
                                         networking_config=netconf)
        cli.start(container=container.get('Id'))
        logger.debug('container successfully started')
    except Exception as e:
        msg = "Got exception trying to run container from image: {}. Exception: {}".format(
            image, e)
        logger.info(msg)
        raise DockerError(msg)
    logger.info("container started successfully: {}".format(container))
    return container
def run_worker(image, worker_id):
    """ Run an actor executor worker with a given image.
    :param image: the actor's image (passed to the worker via its environment).
    :param worker_id: unique id for this worker; also used to name its fifo/socket dirs.
    :return: dict describing the worker (location, container id, status, host info).
    """
    logger.debug("top of run_worker()")
    command = 'python3 -u /actors/worker.py'
    logger.debug("docker_utils running worker. image:{}, command:{}".format(
        image, command))
    # determine what log file to use
    if get_log_file_strategy() == 'split':
        log_file = 'worker.log'
    else:
        log_file = 'abaco.log'
    # mount the directory on the host for creating fifos
    try:
        fifo_host_path_dir = Config.get('workers', 'fifo_host_path_dir')
        logger.info("Using fifo_host_path_dir: {}".format(fifo_host_path_dir))
    except (configparser.NoSectionError, configparser.NoOptionError) as e:
        logger.error("Got exception trying to look up fifo_host_path_dir. Setting to None. Exception: {}".format(e))
        fifo_host_path_dir = None
    if fifo_host_path_dir:
        mounts = [{'host_path': os.path.join(fifo_host_path_dir, worker_id),
                   'container_path': os.path.join(fifo_host_path_dir, worker_id),
                   'format': 'rw'}]
    else:
        mounts = []
    # mount the directory on the host for creating result sockets
    try:
        socket_host_path_dir = Config.get('workers', 'socket_host_path_dir')
        logger.info("Using socket_host_path_dir: {}".format(socket_host_path_dir))
    except (configparser.NoSectionError, configparser.NoOptionError) as e:
        # fixed copy-paste: this branch is about socket_host_path_dir, not fifo_host_path_dir
        logger.error("Got exception trying to look up socket_host_path_dir. Setting to None. Exception: {}".format(e))
        socket_host_path_dir = None
    if socket_host_path_dir:
        mounts.append({'host_path': os.path.join(socket_host_path_dir, worker_id),
                       'container_path': os.path.join(socket_host_path_dir, worker_id),
                       'format': 'rw'})
    logger.info("Final fifo_host_path_dir: {}; socket_host_path_dir: {}".format(fifo_host_path_dir, socket_host_path_dir))
    try:
        auto_remove = Config.get('workers', 'auto_remove')
    except (configparser.NoSectionError, configparser.NoOptionError) as e:
        logger.debug("no auto_remove in the workers stanza.")
        auto_remove = True
    # normalize config value to a real bool: config strings other than 'false'
    # mean True; any non-True, non-string value means False.
    if hasattr(auto_remove, 'lower'):
        auto_remove = auto_remove.lower() != 'false'
    elif auto_remove != True:
        auto_remove = False
    container = run_container_with_docker(image=AE_IMAGE,
                                          command=command,
                                          environment={'image': image,
                                                       'worker_id': worker_id,
                                                       '_abaco_secret': os.environ.get('_abaco_secret')},
                                          mounts=mounts,
                                          log_file=log_file,
                                          auto_remove=auto_remove)
    # don't catch errors -- if we get an error trying to run a worker, let it bubble up.
    # TODO - determines worker structure; should be placed in a proper DAO class.
    logger.info("worker container running. worker_id: {}. container: {}".format(worker_id, container))
    return {'image': image,
            # @todo - location will need to change to support swarm or cluster
            'location': dd,
            'id': worker_id,
            'cid': container.get('Id'),
            'status': BUSY,
            'host_id': host_id,
            'host_ip': host_ip,
            'last_execution_time': 0,
            'last_health_check_time': get_current_utc_time()}