def swarm_events_thread(args):
    """Listen for Swarm events forever, passing each one to ``main_callback``.

    A single ``SwarmClient`` is built from ``args``. Whenever the listener
    returns or raises, any exception is logged and the listener is started
    again after a short pause.
    """
    import time  # local import: the original block did not depend on ``time``

    swarm = SwarmClient(args)
    while True:
        try:
            swarm.event_listener(main_callback)
        except Exception:
            log.exception('Something bad happened')
        # Back off before reconnecting so that a persistent failure does not
        # become a busy loop that floods the log.
        time.sleep(1)
def ip_address(self):
    """Return the service IP address on the configured overlay network.

    The address is fetched from Swarm on every call instead of being stored,
    because it can change outside our control. An empty dict is returned when
    the service's Docker status is not the started status.
    """
    if self.docker_status == self.DOCKER_START_STATUS:
        container_info = SwarmClient(get_conf()).inspect_container(self.docker_id)
        addresses = container_info['ip_address']
        return addresses[get_conf().overlay_network_name]
    return {}
def guest_check_thread(args):
    """Periodically handle exited Zoe containers and run the guest check.

    On every iteration the containers labelled with this deployment's name
    are listed. For each one whose status contains ``'Exited'``,
    ``container_died`` is called with its ``zoe.service.id`` label; if that
    raises ``ZoeAPIException`` the container is deleted from Swarm. Afterwards
    ``check_guests`` runs, and the thread sleeps for the configured
    ``loop_time``.

    Any exception raised during an iteration is logged and the loop carries
    on with the next iteration.
    """
    swarm = SwarmClient(args)

    while True:
        try:
            zoe_containers = swarm.list({'zoe.deployment_name': get_conf().deployment_name})
            for c in zoe_containers:
                if 'Exited' not in c['status']:
                    continue
                zoe_id = c['labels']['zoe.service.id']
                try:
                    container_died(zoe_id)
                except ZoeAPIException:
                    log.warning('Container ' + c['name'] + ' has died, but Zoe does not know anything about it, deleting')
                    swarm.terminate_container(c['id'], delete=True)

            check_guests(swarm)
        except Exception:
            log.exception('Something bad happened')

        # Sleep outside the try block: a failing iteration must also wait,
        # otherwise a persistent error (e.g. Swarm unreachable) makes the
        # loop retry immediately and spin.
        time.sleep(get_conf().loop_time)
def loop(self):
    """Serve requests received on the ZeroMQ socket, forever.

    Each incoming JSON message must carry a ``command`` key. Supported
    commands are ``execution_start``, ``execution_terminate``,
    ``execution_delete`` (all of which read ``exec_id``) and
    ``service_inspect`` (which reads ``service_id``). Any other command is
    logged and answered with an error reply.

    Every command is expected to send a reply. ``self.debug_has_replied`` is
    reset before each message and is presumably set by ``_reply_ok`` /
    ``_reply_error``; if it is still false afterwards an error reply is sent
    and the loop aborts.

    Raises:
        ZoeException: a command was handled without sending a reply.
    """
    assert isinstance(config.singletons['sql_manager'], zoe_lib.sql_manager.SQLManager)
    while True:
        message = self.zmq_s.recv_json()
        self.debug_has_replied = False
        start_time = time.time()
        command = message['command']

        if command == 'execution_start':
            exec_id = message['exec_id']
            execution = config.singletons['sql_manager'].execution_list(id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(exec_id))
            else:
                execution.set_scheduled()
                # Reply before submitting so the client is not kept waiting
                # on the execution manager.
                self._reply_ok()
                zoe_master.execution_manager.execution_submit(execution)
        elif command == 'execution_terminate':
            exec_id = message['exec_id']
            execution = config.singletons['sql_manager'].execution_list(id=exec_id, only_one=True)
            if execution is None:
                self._reply_error('Execution ID {} not found'.format(exec_id))
            else:
                execution.set_cleaning_up()
                self._reply_ok()
                zoe_master.execution_manager.execution_terminate(execution)
        elif command == 'execution_delete':
            exec_id = message['exec_id']
            execution = config.singletons['sql_manager'].execution_list(id=exec_id, only_one=True)
            # Deleting an unknown execution is not an error: reply OK anyway.
            if execution is not None:
                zoe_master.execution_manager.execution_delete(execution)
            self._reply_ok()
        elif command == 'service_inspect':
            service_id = message['service_id']
            service = config.singletons['sql_manager'].service_list(id=service_id, only_one=True)
            if service is None:
                self._reply_error('no such service')
            else:
                swarm = SwarmClient(config.get_conf())
                info = swarm.inspect_container(service.docker_id)
                self._reply_ok(info)
        else:
            log.error('Unknown command: {}'.format(command))
            self._reply_error('unknown command')

        if not self.debug_has_replied:
            self._reply_error('bug')
            # The placeholder was previously left unformatted, so the
            # exception never named the offending command.
            raise ZoeException('BUG: command {} does not fill a reply'.format(command))

        config.singletons['metric'].metric_api_call(start_time, command)
def service_logs(self, uid, role, service_id, stream=True):
    """Fetch the logs of a service's container from Swarm.

    The caller must either own the service (``uid`` matches its ``user_id``)
    or have the ``'admin'`` role.

    Raises:
        ZoeNotFoundException: the service does not exist, or it has no
            Docker container ID.
        ZoeAuthException: the caller is neither the owner nor an admin.
    """
    service = self.sql.service_list(id=service_id, only_one=True)
    if service is None:
        raise zoe_api.exceptions.ZoeNotFoundException('No such service')

    allowed = service.user_id == uid or role == 'admin'
    if not allowed:
        raise zoe_api.exceptions.ZoeAuthException()

    container_id = service.docker_id
    if container_id is None:
        raise zoe_api.exceptions.ZoeNotFoundException('Container is not running')

    return SwarmClient(get_conf()).logs(container_id, stream)
def terminate_execution(execution: Execution) -> None:
    """Remove the containers of every service in ``execution``.

    The execution is marked as cleaning up first and as terminated at the
    end. Services without a Docker container ID are left untouched.
    """
    execution.set_cleaning_up()
    client = SwarmClient(get_conf())

    for svc in execution.services:
        assert isinstance(svc, Service)
        if svc.docker_id is None:
            # No container is recorded for this service, nothing to remove.
            continue
        svc.set_terminating()
        client.terminate_container(svc.docker_id, delete=True)
        svc.set_inactive()
        log.debug('Service {} terminated'.format(svc.name))

    execution.set_terminated()
def run(self):
    """The thread loop.

    Creates a Swarm client and keeps its event listener running, handing
    every event to ``self._event_cb``. Whenever the listener stops, whether
    by returning or by raising, any exception is logged and the listener is
    restarted after a one-second pause.
    """
    log.info("Monitor thread started")
    swarm = SwarmClient(get_conf())
    while True:
        try:
            swarm.event_listener(self._event_cb)
        except Exception:
            # Deliberately not a bare ``except``: SystemExit and
            # KeyboardInterrupt must not be swallowed by the retry loop.
            log.exception('Exception in monitor thread')
        time.sleep(1)  # Usually we got disconnected, so wait a bit before retrying
def terminate_execution(execution: Execution) -> None:
    """Terminate an execution, deleting each of its containers from Swarm.

    State transitions: the execution goes to "cleaning up" before any
    container is touched and to "terminated" once all services have been
    processed. Each service that has a Docker container ID goes through
    "terminating" and ends up "inactive".
    """
    execution.set_cleaning_up()
    swarm_client = SwarmClient(get_conf())

    for current in execution.services:
        assert isinstance(current, Service)
        container_id = current.docker_id
        if container_id is None:
            continue  # this service has no container to delete
        current.set_terminating()
        swarm_client.terminate_container(container_id, delete=True)
        current.set_inactive()
        log.debug('Service {} terminated'.format(current.name))

    execution.set_terminated()
def main():
    """The main entrypoint function.

    Loads the configuration, validates and digests the ZApp description read
    from ``args.jsonfile``, starts its containers, prints the last ten log
    lines of every service and then waits for CTRL-C before terminating the
    execution.
    """
    conf = load_configuration()
    config.load_configuration(conf)
    args = config.get_conf()

    # The format is the same either way; only the level depends on --debug.
    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level, format=LOG_FORMAT)

    logging.getLogger('kazoo').setLevel(logging.WARNING)
    logging.getLogger('requests').setLevel(logging.WARNING)
    logging.getLogger('urllib3').setLevel(logging.WARNING)
    logging.getLogger('docker').setLevel(logging.INFO)
    logging.getLogger("tornado").setLevel(logging.DEBUG)

    state = FakeSQLManager()

    zapp_description = json.load(args.jsonfile)

    print('Validating zapp description...')
    zoe_lib.applications.app_validate(zapp_description)

    exec_id = state.execution_new('test', 'fake_user', zapp_description)
    e = state.execution_list(only_one=True, id=exec_id)
    _digest_application_description(state, e)

    print('Zapp digested, starting containers...')
    execution_to_containers(e)

    print('Giving the containers a few seconds to start...')
    time.sleep(5)

    swarm = SwarmClient(args)
    for service in e.services:
        print("Service {}, docker ID: {}".format(service.name, service.docker_id))
        # stream=False: fetch the log output as one bytes object.
        logs = swarm.logs(service.docker_id, False)
        logs = logs.decode('utf-8').split('\n')
        for log_line in logs[-10:]:
            print(log_line)

    print("Execution has been started, press CTRL-C to terminate it")
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass

    print('Terminating...')
    terminate_execution(e)
def _spawn_service(execution: Execution, service: Service, env_subst_dict: dict):
    """Translate a service description into container options and start it.

    ``env_subst_dict`` provides the values substituted into the environment
    variable templates and into the command template.

    Raises:
        ZoeStartExecutionFatalException: an environment template refers to an
            unknown variable, the Swarm client cannot be created, or
            connecting the container to an extra network fails.
        ZoeStartExecutionRetryException: spawning the container raised a
            ``ZoeException``.
    """
    options = DockerContainerOptions()
    spec = service.description

    options.gelf_log_address = get_conf().gelf_address
    options.name = service.dns_name
    options.set_memory_limit(spec['required_resources']['memory'])
    options.network_name = get_conf().overlay_network_name

    options.labels = {
        'zoe.execution.name': execution.name,
        'zoe.execution.id': str(execution.id),
        'zoe.service.name': service.name,
        'zoe.service.id': str(service.id),
        'zoe.owner': execution.user_id,
        'zoe.deployment_name': get_conf().deployment_name,
        'zoe.type': 'app_service'
    }
    is_monitor = spec['monitor']
    options.labels['zoe.monitor'] = 'true' if is_monitor else 'false'
    options.restart = not is_monitor  # Monitor containers should not restart

    # Environment values are templates, filled in from env_subst_dict.
    for var_name, template in spec['environment']:
        try:
            rendered = template.format(**env_subst_dict)
        except KeyError:
            raise ZoeStartExecutionFatalException("unknown variable in expression {}".format(template))
        options.add_env_variable(var_name, rendered)

    # FIXME UDP ports?
    options.ports.extend(port['port_number'] for port in spec['ports'] if port['expose'])

    if 'volumes' in spec:
        for host_path, mount_point, readonly in spec['volumes']:
            options.add_volume_bind(host_path, mount_point, readonly)

    for workspace in singletons['workspace_managers']:
        assert isinstance(workspace, zoe_master.workspace.base.ZoeWorkspaceBase)
        if not workspace.can_be_attached():
            continue
        options.add_volume_bind(workspace.get_path(execution.user_id), workspace.get_mountpoint(), False)

    # The same dictionary is used for templates in the command
    if 'command' in spec:
        options.set_command(spec['command'].format(**env_subst_dict))

    try:
        swarm = SwarmClient(get_conf())
    except Exception as err:
        raise ZoeStartExecutionFatalException(str(err))

    try:
        cont_info = swarm.spawn_container(spec['docker_image'], options)
    except ZoeException as err:
        raise ZoeStartExecutionRetryException(str(err))

    service.set_active(cont_info["docker_id"])

    if 'networks' in spec:
        try:
            for network in spec['networks']:
                swarm.connect_to_network(service.docker_id, network)
        except ZoeException as err:
            raise ZoeStartExecutionFatalException(str(err))
def __init__(self):
    """Initialise the parent class as the 'stats' daemon and create a Swarm client."""
    super().__init__(name='stats', daemon=True)
    self._swarm_stats = None
    conf = get_conf()
    self.swarm = SwarmClient(conf)
def _spawn_service(execution: Execution, service: Service, env_subst_dict: dict):
    """Assemble the Docker options for ``service`` and spawn its container.

    Environment variables are filled in by ``_gen_environment``; the command
    template, when present, is formatted with ``env_subst_dict``.

    Raises:
        ZoeStartExecutionFatalException: the Swarm client cannot be created
            or connecting the container to an extra network fails.
        ZoeStartExecutionRetryException: spawning the container raised
            ``ZoeException`` or ``ZoeLibException``.
    """
    container_opts = DockerContainerOptions()
    descr = service.description

    container_opts.gelf_log_address = get_conf().gelf_address
    container_opts.name = service.dns_name
    container_opts.set_memory_limit(descr['required_resources']['memory'])
    container_opts.network_name = get_conf().overlay_network_name

    container_opts.labels = {
        'zoe.execution.name': execution.name,
        'zoe.execution.id': str(execution.id),
        'zoe.service.name': service.name,
        'zoe.service.id': str(service.id),
        'zoe.owner': execution.user_id,
        'zoe.deployment_name': get_conf().deployment_name,
        'zoe.type': 'app_service'
    }
    monitor_flag = descr['monitor']
    container_opts.labels['zoe.monitor'] = 'true' if monitor_flag else 'false'
    # Monitor containers should not restart
    container_opts.restart = not monitor_flag

    _gen_environment(service, env_subst_dict, container_opts)

    # FIXME UDP ports?
    for port in descr['ports']:
        if not port['expose']:
            continue
        container_opts.ports.append(port['port_number'])

    if 'volumes' in descr:
        for host_path, mount_point, readonly in descr['volumes']:
            container_opts.add_volume_bind(host_path, mount_point, readonly)

    if 'constraints' in descr:
        for constraint in descr['constraints']:
            container_opts.add_constraint(constraint)

    workspace = ZoeFSWorkspace()
    if workspace.can_be_attached():
        workspace_path = workspace.get_path(execution.user_id)
        container_opts.add_volume_bind(workspace_path, workspace.get_mountpoint(), False)

    # The same dictionary is used for templates in the command
    if 'command' in descr:
        command = descr['command'].format(**env_subst_dict)
        container_opts.set_command(command)

    try:
        swarm = SwarmClient(get_conf())
    except Exception as err:
        raise ZoeStartExecutionFatalException(str(err))

    try:
        cont_info = swarm.spawn_container(descr['docker_image'], container_opts)
    except (ZoeException, ZoeLibException) as err:
        # Both exception families mean the spawn may succeed on a retry.
        raise ZoeStartExecutionRetryException(str(err))

    service.set_active(cont_info["docker_id"])

    if 'networks' in descr:
        try:
            for network in descr['networks']:
                swarm.connect_to_network(service.docker_id, network)
        except ZoeException as err:
            raise ZoeStartExecutionFatalException(str(err))