def _clean_up_kv_store():
    """Remove kv-store keys whose container is no longer listed.

    Returns immediately while the current time is below the module-level
    ``next_kv_clean_up_timestamp``. Otherwise the next clean-up time is
    rescheduled, the container ids reported by
    ``armada_api.get_json('list')`` are collected, and every key under
    ``start_timestamp/`` and ``single_active_instance/`` whose container id
    is not among them is removed.
    """
    global next_kv_clean_up_timestamp

    if time.time() < next_kv_clean_up_timestamp:
        return

    get_logger().info('Cleaning up kv-store:')
    next_kv_clean_up_timestamp = get_next_kv_clean_up_timestamp()

    known_ids = set()
    for listed in armada_api.get_json('list'):
        known_ids.add(listed.get('container_id'))

    # (key prefix, how to turn the last key segment into a container id)
    scanned_prefixes = (
        ('start_timestamp/', lambda tail: tail),
        ('single_active_instance/', lambda tail: tail.split(':')[0]),
    )
    for prefix, id_from_tail in scanned_prefixes:
        for candidate in kv.kv_list(prefix) or []:
            owner_id = id_from_tail(candidate.split('/')[-1])
            if owner_id in known_ids:
                continue
            get_logger().info('Removing key: {}'.format(candidate))
            kv.kv_remove(candidate)

    get_logger().info('Finished cleaning up kv-store.')
def _get_inactive_services_list(filter_microservice_name, filter_env, filter_app_id):
    """List the service instances stored under ``service/`` in the kv-store.

    :param filter_microservice_name: optional fnmatch pattern applied to
        service names; falsy means "all names".
    :param filter_env: required 'env' tag, or None to accept any.
    :param filter_app_id: required 'app_id' tag, or None to accept any.
    :return: list of dicts with the keys 'name', 'status', 'address',
        'microservice_id', 'container_id', 'tags' and 'start_timestamp'.
        'address' and 'microservice_id' are always 'n/a'.
    """
    all_keys = kv.kv_list("service/")
    if not all_keys:
        return []

    names = set(key.split('/')[1] for key in all_keys)
    if filter_microservice_name:
        names = fnmatch.filter(names, filter_microservice_name)

    not_available = 'n/a'
    found = []
    for name in names:
        instance_keys = kv.kv_list('service/{}/'.format(name))
        if instance_keys is None:
            continue
        for instance_key in instance_keys:
            stored = kv.kv_get(instance_key)
            name_value = stored['ServiceName']
            status_value = stored['Status']
            container_id = stored['container_id'] if 'container_id' in stored else not_available
            started_at = stored['start_timestamp']

            # Only non-empty parameters become tags.
            params = stored['params']
            tags = {}
            if params['microservice_env']:
                tags['env'] = params['microservice_env']
            if params['microservice_app_id']:
                tags['app_id'] = params['microservice_app_id']

            if filter_env is not None and filter_env != tags.get('env'):
                continue
            if filter_app_id is not None and filter_app_id != tags.get('app_id'):
                continue

            found.append({
                'name': name_value,
                'status': status_value,
                'address': not_available,
                'microservice_id': not_available,
                'container_id': container_id,
                'tags': tags,
                'start_timestamp': started_at,
            })
    return found
def _stop_service(self, container_id):
    """Stop a container and remove its kv-store entry on this ship.

    If the kv-store entry for the container has status 'crashed' or
    'not-recovered', only that entry is removed. Otherwise the in-container
    supervisor processes are stopped, the services are deregistered and
    stopping the container through the docker API is attempted up to three
    times.

    :param container_id: id of the container to stop.
    :raises Exception: the last error raised while stopping, or a
        RuntimeError when no attempt raised, if the container is still
        running after all attempts.
    """
    ship = get_ship_name()
    key = None
    service_dict = None
    service_list = kv_list('ships/{}/service/'.format(ship))
    if service_list:
        matching_keys = fnmatch.filter(service_list, '*/{}'.format(container_id))
        if matching_keys:
            key = matching_keys[0]
            service_dict = kv_get(key)

    if service_dict and service_dict['Status'] in ['crashed', 'not-recovered']:
        kv_remove(key)
        return

    run_command_in_container('supervisorctl stop armada_agent', container_id)
    # TODO: Compatibility with old microservice images. Should be removed in future armada version.
    run_command_in_container('supervisorctl stop register_in_service_discovery', container_id)

    docker_api = docker_client.api()
    last_exception = None
    try:
        deregister_services(container_id)
    except Exception:
        # Best effort: a failed deregistration must not prevent the stop.
        traceback.print_exc()

    for _ in range(3):
        try:
            docker_api.stop(container_id)
        except Exception as e:
            last_exception = e
            traceback.print_exc()
        if not is_container_running(container_id):
            # Drop the kv entry only once the container is confirmed stopped,
            # and only if an entry was actually found.
            if key:
                try:
                    kv_remove(key)
                except Exception:
                    traceback.print_exc()
            break

    if is_container_running(container_id):
        get_logger().error('Could not stop container: {}'.format(container_id))
        if last_exception is None:
            # docker stop never raised, so there is nothing to re-raise.
            raise RuntimeError('Could not stop container: {}'.format(container_id))
        raise last_exception
def main():
    """Save the parameters of this ship's containers to the kv store and to a file.

    Reads every entry under ``ships/<ship>/service/``. When there is
    something to save it is first written to the kv store (a failure there
    is only logged) and then, unless recovery is still in progress and
    ``--force`` was not given, to ``args.saved_containers_path``.
    Any other failure is logged and the process exits with status 1.
    """
    setup_sentry()
    args = _parse_args()
    saved_containers_path = args.saved_containers_path
    try:
        wait_for_consul_ready()
        ship = get_ship_name()
        container_keys = kv.kv_list('ships/{}/service/'.format(ship))
        containers_parameters_dict = {
            container_key: kv.kv_get(container_key)
            for container_key in container_keys or []
        }

        if not containers_parameters_dict:
            get_logger().info('Aborted saving container because of errors.')
            return

        try:
            _save_containers_parameters_list_in_kv_store(containers_parameters_dict)
            get_logger().info('Containers have been saved to kv store.')
        except Exception as kv_error:
            get_logger().exception(kv_error)

        # The file is only written once recovery has finished, unless forced.
        if not (args.force or _is_recovery_completed()):
            get_logger().warning('Recovery is not completed. Aborting saving running containers.')
            return

        _save_containers_parameters_list_in_file(containers_parameters_dict, saved_containers_path)
        get_logger().info('Containers have been saved to {}.'.format(saved_containers_path))
    except Exception as e:
        get_logger().exception(e)
        sys.exit(1)
def _deregister_not_running_services():
    """Mark services of non-running containers as crashed and deregister them.

    Two passes are made: first over the locally registered services, then
    over the kv-store entries under ``ships/<ship>/service/``. The ship is
    identified by its name, falling back to its IP if the name lookup fails.
    """
    try:
        ship = get_ship_name()
    except Exception:
        # Narrowed from a bare ``except`` so SystemExit/KeyboardInterrupt
        # are no longer swallowed by the fallback.
        ship = get_ship_ip()

    services = _get_local_services()
    running_containers_ids = _get_running_container_ids()

    for service_id in services.keys():
        container_id, is_subservice = _get_container_id_with_subservice(service_id)
        if container_id in running_containers_ids:
            continue
        # The status is recorded for main services only; subservices are
        # just deregistered.
        if not is_subservice:
            name = services[service_id]['Service']
            kv.update_container_status('crashed', ship=ship, name=name,
                                       container_id=container_id)
        deregister_services(container_id)

    services_keys = kv.kv_list('ships/{}/service/'.format(ship)) or []
    for service_key in services_keys:
        container_id = service_key.split('/')[-1]
        if container_id not in running_containers_ids:
            kv.update_container_status('crashed', key=service_key)
            deregister_services(container_id)
def deregister_not_running_services():
    """Record local services without a running container as crashed and deregister them.

    For every local service id other than 'consul' whose container id (the
    part before ':') is not among the running containers, a new
    ``service/<name>/<index>`` entry with status 'crashed' is saved and the
    container's services are deregistered.
    """
    services_ids = get_local_services_ids()
    containers_ids = get_running_container_ids()
    for service_id in services_ids:
        if service_id == 'consul':
            continue
        container_id = service_id.split(':')[0]
        if container_id in containers_ids:
            continue

        name = consul_query('agent/services')[service_id]['Service']
        params = get_container_parameters(container_id)

        # Take the highest existing index rather than the last listed key:
        # if keys are listed in string order, '.../9' sorts after '.../10'
        # and the last key would yield an index that is already in use.
        existing_keys = kv.kv_list('service/{}/'.format(name))
        if existing_keys:
            kv_index = max(int(key.split('/')[2]) for key in existing_keys) + 1
        else:
            kv_index = 0

        kv.save_service(name, kv_index, 'crashed', params, container_id)
        deregister_services(container_id)
def _get_services_list(filter_microservice_name, filter_env, filter_app_id, filter_local):
    """Collect services from the ``containers_parameters_list/`` kv entries.

    :param filter_microservice_name: optional fnmatch pattern for the
        service-name segment of keys shaped ``ships/*/service/<name>/*``.
    :param filter_env: required 'env' tag, or None to accept any.
    :param filter_app_id: required 'app_id' tag, or None to accept any.
    :param filter_local: when truthy only this ship's entry is read.
    :return: dict mapping each 'ServiceID' to a dict with the keys 'name',
        'status', 'address', 'microservice_id', 'container_id', 'tags' and
        'start_timestamp'. 'address' is always 'n/a'.
    """
    if filter_local:
        ship_keys = ['containers_parameters_list/{}'.format(get_ship_name())]
    else:
        ship_keys = kv.kv_list('containers_parameters_list/')
    if not ship_keys:
        return {}

    # Merge the per-ship dictionaries into one mapping.
    services_dict = {}
    for ship_key in ship_keys:
        per_ship = kv.kv_get(ship_key)
        if per_ship:
            services_dict.update(per_ship)

    service_keys = services_dict.keys()
    if not service_keys:
        return {}
    if filter_microservice_name:
        service_keys = fnmatch.filter(
            service_keys, 'ships/*/service/{}/*'.format(filter_microservice_name))

    collected = {}
    for service_key in service_keys:
        entry = services_dict[service_key]
        name_value = entry['ServiceName']
        status_value = entry['Status']
        microservice_id = entry['ServiceID']
        container_id = entry['container_id']
        started_at = entry['start_timestamp']

        # Only non-empty parameters become tags.
        params = entry['params']
        tags = {}
        if params['microservice_env']:
            tags['env'] = params['microservice_env']
        if params['microservice_app_id']:
            tags['app_id'] = params['microservice_app_id']

        if filter_env is not None and filter_env != tags.get('env'):
            continue
        if filter_app_id is not None and filter_app_id != tags.get('app_id'):
            continue

        collected[microservice_id] = {
            'name': name_value,
            'status': status_value,
            'address': 'n/a',
            'microservice_id': microservice_id,
            'container_id': container_id,
            'tags': tags,
            'start_timestamp': started_at,
        }
    return collected
def _get_local_running_containers():
    """Return the non-empty 'params' of every kv entry under this ship's service prefix."""
    prefix = 'ships/{}/service/'.format(get_ship_name())
    stored_params = (kv.kv_get(entry_key)['params'] for entry_key in kv.kv_list(prefix) or [])
    return [params for params in stored_params if params]
def _get_inactive_services_list(filter_microservice_name, filter_env, filter_app_id):
    """Return the service instances recorded under ``service/`` in the kv-store.

    :param filter_microservice_name: optional fnmatch pattern applied to
        service names; falsy means no name filtering.
    :param filter_env: 'env' tag an instance must carry, or None for any.
    :param filter_app_id: 'app_id' tag an instance must carry, or None for any.
    :return: list of dicts with the keys 'name', 'status', 'address',
        'microservice_id', 'container_id', 'tags' and 'start_timestamp';
        'address' and 'microservice_id' are always 'n/a'.
    """
    not_available = 'n/a'

    def _accepts(wanted, actual):
        # A filter of None accepts everything.
        return wanted is None or wanted == actual

    collected = []
    service_keys = kv.kv_list("service/")
    if not service_keys:
        return collected

    service_names = set(service_key.split('/')[1] for service_key in service_keys)
    if filter_microservice_name:
        service_names = fnmatch.filter(service_names, filter_microservice_name)

    for service_name in service_names:
        instance_keys = kv.kv_list('service/{}/'.format(service_name))
        if instance_keys is None:
            continue
        for instance_key in instance_keys:
            record = kv.kv_get(instance_key)
            row = {
                'name': record['ServiceName'],
                'status': record['Status'],
                'address': not_available,
                'microservice_id': not_available,
                'container_id': record['container_id'] if 'container_id' in record else not_available,
                'start_timestamp': record['start_timestamp'],
            }

            # Tags are built from the non-empty env / app_id parameters.
            tags = {}
            for tag_name, param_name in (('env', 'microservice_env'),
                                         ('app_id', 'microservice_app_id')):
                if record['params'][param_name]:
                    tags[tag_name] = record['params'][param_name]
            row['tags'] = tags

            if _accepts(filter_env, tags.get('env')) and _accepts(filter_app_id, tags.get('app_id')):
                collected.append(row)
    return collected
def _get_crashed_services():
    """Return this ship's service keys whose stored status is 'crashed' or 'not-recovered'."""
    service_keys = kv.kv_list('ships/{}/service/'.format(get_ship_name()))
    if not service_keys:
        return []
    failed_statuses = ('crashed', 'not-recovered')
    return [
        service_key
        for service_key in service_keys
        if kv.kv_get(service_key)['Status'] in failed_statuses
    ]
def _load_containers_to_kv_store(saved_containers_path):
    """Load saved container parameters from a file into the kv-store.

    Waits for consul, registers the services already running at startup,
    then loads the saved parameters, which may be stored either as a dict
    or in another (list) form. Failures are printed and logged, not raised.

    :param saved_containers_path: path of the file with saved parameters.
    """
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        containers_saved_in_kv = kv.kv_list('ships/{}/service/'.format(ship))
        saved_containers = _load_saved_containers_parameters_list(saved_containers_path)
        _add_running_services_at_startup(containers_saved_in_kv, ship)
        if isinstance(saved_containers, dict):
            _load_from_dict(saved_containers, containers_saved_in_kv, ship)
        else:
            _load_from_list(saved_containers, ship)
    except Exception:
        # Narrowed from a bare ``except`` so SystemExit/KeyboardInterrupt
        # still propagate; ordinary errors stay best-effort.
        traceback.print_exc()
        get_logger().error('Unable to load from {}.'.format(saved_containers_path))
def _get_restart_parameters(self, container_id):
    """Return the parameters needed to restart a container, or None.

    The parameters are read from the container's
    ``RESTART_CONTAINER_PARAMETERS`` environment variable (base64-encoded
    JSON). If docker reports the container as not found, the 'params' of
    the kv-store entry under ``ships/`` whose last key segment equals the
    container id are returned instead.

    :param container_id: id of the container to look up.
    :return: the decoded parameters, or None when nothing was found.
    """
    try:
        docker_api = docker_client.api()
        docker_inspect = docker_api.inspect_container(container_id)
        for env_var in docker_inspect['Config']['Env']:
            # Pad with '' so that a variable without '=' still unpacks.
            env_key, env_value = (env_var.strip('"').split('=', 1) + [''])[:2]
            if env_key == 'RESTART_CONTAINER_PARAMETERS':
                return json.loads(base64.b64decode(env_value))
    except NotFound:
        # Other callers guard kv_list results with ``or []``; do the same
        # so an empty result is not iterated as None.
        service_list = kv_list('ships/') or []
        for service in service_list:
            if service.split('/')[-1] == container_id:
                return kv_get(service).get('params')
    return None
def recover_saved_containers_from_parameters(saved_containers):
    """Load saved container parameters into the kv-store and start recovery.

    :param saved_containers: saved parameters, either a dict or another
        (list) form; each is loaded by its own helper.
    :return: whatever ``recover_containers_from_kv_store()`` returns. It is
        called even when loading the saved parameters failed.
    """
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        containers_saved_in_kv = kv.kv_list('ships/{}/service/'.format(ship))
        if isinstance(saved_containers, dict):
            _load_from_dict(saved_containers, containers_saved_in_kv, ship)
        else:
            _load_from_list(saved_containers, ship)
    except Exception:
        # Narrowed from a bare ``except`` so SystemExit/KeyboardInterrupt
        # still propagate; loading remains best-effort.
        traceback.print_exc()

    containers_to_be_recovered = recover_containers_from_kv_store()
    return containers_to_be_recovered
def get_list():
    """Return every dockyard alias as a list of dicts, sorted by kv key.

    Each dict holds 'name' and 'is_default' (whether the name equals the
    value stored at ``dockyard/default``) plus the fields returned by
    ``get_alias(name)``. The original comment lists those fields as
    'address' and, optionally, 'user' and 'password'.
    """
    default_alias = kv.kv_get('dockyard/default')
    aliases_key = 'dockyard/aliases/'
    prefix_length = len(aliases_key)

    rows = []
    for full_key in sorted(kv.kv_list(aliases_key) or []):
        alias_name = full_key[prefix_length:]
        row = dict(name=alias_name, is_default=(default_alias == alias_name))
        row.update(get_alias(alias_name))
        rows.append(row)
    return rows
def _clean_up_kv_store():
    """Purge kv-store keys that refer to containers missing from the service list.

    Nothing happens before the module-level ``next_kv_clean_up_timestamp``
    is reached. After that the timestamp is rescheduled and keys under
    ``start_timestamp/`` and ``single_active_instance/`` are removed when
    their container id is not reported by ``armada_api.get_json('list')``.
    """
    global next_kv_clean_up_timestamp

    if not time.time() >= next_kv_clean_up_timestamp:
        return

    get_logger().info('Cleaning up kv-store:')
    next_kv_clean_up_timestamp = get_next_kv_clean_up_timestamp()

    listed_services = armada_api.get_json('list')
    valid_container_ids = set(listed.get('container_id') for listed in listed_services)

    def _purge(prefix, container_id_of):
        # Remove every key under prefix whose container is not listed.
        for stored_key in kv.kv_list(prefix) or []:
            if container_id_of(stored_key) not in valid_container_ids:
                get_logger().info('Removing key: {}'.format(stored_key))
                kv.kv_remove(stored_key)

    _purge('start_timestamp/', lambda k: k.split('/')[-1])
    # The last segment here has the form '<container_id>:<...>'.
    _purge('single_active_instance/', lambda k: k.split('/')[-1].split(':')[0])

    get_logger().info('Finished cleaning up kv-store.')
def POST(self):
    """Join this single-ship armada to the armada reachable at the posted 'host'.

    The local ``ships/<ship>/service/`` entries are read up front and
    written back after consul has been restarted with the new settings.

    :return: ``status_ok()`` on success; ``status_error(...)`` when the
        'host' parameter is invalid, the armada has more than one ship, the
        remote datacenter cannot be read, or the consul restart times out.
    """
    consul_host, error = self.get_post_parameter('host')
    if error:
        return self.status_error(error)

    ship = get_ship_name()
    # Snapshot of the local services, restored once consul is back up.
    local_services = kv.kv_list('ships/{}/service/'.format(ship)) or []
    local_services_data = {key: kv.kv_get(key) for key in local_services}

    armada_size = _get_armada_size()
    if armada_size > 1:
        return self.status_error(
            'Currently only single ship armadas can join the others. '
            'Your armada has size: {0}.'.format(armada_size))

    try:
        agent_self_dict = consul_query(
            'agent/self', consul_address='{0}:8500'.format(consul_host))
        datacenter = agent_self_dict['Config']['Datacenter']
    except Exception:
        # Narrowed from a bare ``except`` so SystemExit/KeyboardInterrupt
        # are not reported as a datacenter lookup failure.
        return self.status_error(
            'Could not read remote host datacenter address.')

    current_consul_mode = _get_current_consul_mode()
    if current_consul_mode == consul_config.ConsulMode.BOOTSTRAP:
        override_runtime_settings(
            consul_mode=consul_config.ConsulMode.CLIENT,
            ship_ips=[consul_host],
            datacenter=datacenter)
    else:
        override_runtime_settings(
            ship_ips=[consul_host] + get_other_ship_ips(),
            datacenter=datacenter)

    if not _restart_consul():
        return self.status_error('Waiting for armada restart timed out.')

    supervisor_server = xmlrpclib.Server('http://localhost:9001/RPC2')
    hermes_init_output = supervisor_server.supervisor.startProcessGroup(
        'hermes_init')
    get_logger().info('hermes_init start: {}'.format(hermes_init_output))
    set_ship_name(ship)
    for key, data in local_services_data.items():
        kv.kv_set(key, data)
    return self.status_ok()
def set_ship_name(new_name):
    """Rename this ship in the kv-store and in the consul configuration.

    Moves every ``ships/<old_name>/service/...`` entry to the new name,
    updates the name/ip mappings, rewrites ``ships/<old_name>/`` to
    ``ships/<new_name>/`` in ``/etc/consul.config``, reloads consul and
    removes ``containers_parameters_list/<old_name>``.

    :param new_name: the new ship name.
    """
    # Local import so this block does not depend on the file's import list.
    import subprocess

    ship_ip = get_ship_ip()
    old_name = get_ship_name(ship_ip)
    saved_containers = kv.kv_list('ships/{}/service/'.format(old_name))
    if saved_containers:
        for container in saved_containers:
            service_name, container_id = container.split('/')[-2:]
            new_key = 'ships/{}/service/{}/{}'.format(new_name, service_name, container_id)
            container_dict = kv.kv_get(container)
            kv.kv_set(new_key, container_dict)
            # When the name is unchanged new_key equals the old key, and
            # removing it would delete the entry that was just written.
            if new_key != container:
                kv.kv_remove(container)
    kv.kv_set('ships/{}/name'.format(ship_ip), new_name)
    kv.kv_set('ships/{}/ip'.format(new_name), ship_ip)

    # Commands are passed as argument lists, so ship names are never
    # interpreted by a shell.
    # NOTE(review): the names are still interpolated into the sed
    # expression; a name containing '|' or regex metacharacters would
    # change its meaning - confirm names are validated by the caller.
    sed_expression = 's|ships/{}/|ships/{}/|'.format(old_name, new_name)
    commands = (
        ['sed', '-i', sed_expression, '/etc/consul.config'],
        ['/usr/local/bin/consul', 'reload'],
    )
    for command in commands:
        try:
            # check_call raises on a non-zero exit status, so failures are
            # reported instead of being silently ignored.
            subprocess.check_call(command)
        except Exception:
            # Best effort: keep going so the kv-store clean-up still runs.
            traceback.print_exc()

    kv.kv_remove('containers_parameters_list/{}'.format(old_name))
def set_ship_name(new_name):
    """Rename this ship in the kv-store and in the consul configuration.

    Moves every ``ships/<old_name>/service/...`` entry to the new name,
    updates the name/ip mappings, rewrites ``ships/<old_name>/`` to
    ``ships/<new_name>/`` in ``/etc/consul.config``, reloads consul and
    removes ``containers_parameters_list/<old_name>``.

    :param new_name: the new ship name.
    """
    # Local import so this block does not depend on the file's import list.
    import subprocess

    ship_ip = get_ship_ip()
    old_name = get_ship_name(ship_ip)
    saved_containers = kv.kv_list('ships/{}/service/'.format(old_name))
    if saved_containers:
        for container in saved_containers:
            service_name, container_id = container.split('/')[-2:]
            new_key = 'ships/{}/service/{}/{}'.format(new_name, service_name, container_id)
            container_dict = kv.kv_get(container)
            kv.kv_set(new_key, container_dict)
            # When the name is unchanged new_key equals the old key, and
            # removing it would delete the entry that was just written.
            if new_key != container:
                kv.kv_remove(container)
    kv.kv_set('ships/{}/name'.format(ship_ip), new_name)
    kv.kv_set('ships/{}/ip'.format(new_name), ship_ip)

    # Commands are passed as argument lists, so ship names are never
    # interpreted by a shell.
    # NOTE(review): the names are still interpolated into the sed
    # expression; a name containing '|' or regex metacharacters would
    # change its meaning - confirm names are validated by the caller.
    sed_expression = 's|ships/{}/|ships/{}/|'.format(old_name, new_name)
    commands = (
        ['sed', '-i', sed_expression, '/etc/consul.config'],
        ['/usr/local/bin/consul', 'reload'],
    )
    for command in commands:
        try:
            # check_call raises on a non-zero exit status, so failures are
            # logged instead of being silently ignored.
            subprocess.check_call(command)
        except Exception as e:
            # Best effort: keep going so the kv-store clean-up still runs.
            get_logger().exception(e)

    kv.kv_remove('containers_parameters_list/{}'.format(old_name))
def _stop_service(self, container_id):
    """Stop a container and remove its kv-store entry on this ship.

    If the container is not running, its kv entry (when one exists) is
    removed and its services are deregistered. Otherwise the in-container
    supervisor processes are stopped, the services are deregistered and
    stopping through the docker API is attempted up to three times; the kv
    entry is removed once the container is confirmed stopped.

    :param container_id: id of the container to stop.
    :raises Exception: the last error raised by the docker stop call, or a
        RuntimeError when no attempt raised, if the container is still
        running after all attempts.
    """
    ship = get_ship_name()
    service_list = kv_list('ships/{}/service/'.format(ship))
    try:
        key = fnmatch.filter(service_list, '*/{}'.format(container_id))[0]
    except (IndexError, TypeError):
        # No matching entry, or the listing returned nothing iterable.
        key = None

    if not is_container_running(container_id):
        if key:
            kv_remove(key)
        try:
            deregister_services(container_id)
        except Exception as e:
            get_logger().exception(e)
        return

    run_command_in_container('supervisorctl stop armada_agent', container_id)
    # TODO: Compatibility with old microservice images. Should be removed in future armada version.
    run_command_in_container(
        'supervisorctl stop register_in_service_discovery', container_id)

    docker_api = docker_client.api()
    last_exception = None
    try:
        deregister_services(container_id)
    except Exception as e:
        get_logger().exception(e)

    for _ in range(3):
        try:
            docker_api.stop(container_id)
        except Exception as e:
            get_logger().debug(e, exc_info=True)
            last_exception = e
        if not is_container_running(container_id):
            if key:
                kv_remove(key)
            break

    if is_container_running(container_id):
        get_logger().error('Could not stop container: %s', container_id)
        if last_exception is None:
            # docker stop never raised, so there is nothing to re-raise;
            # ``raise None`` would itself fail with a TypeError.
            raise RuntimeError('Could not stop container: {}'.format(container_id))
        raise last_exception
def _add_running_services_at_startup():
    """Save running services that are missing from this ship's kv entries.

    Waits for consul, lists the keys under ``ships/<ship>/service/``,
    sleeps 10 seconds and then queries the agent's services. Every service
    except 'consul', 'armada' and ids containing ':' that has no kv key yet
    is saved with status 'started'. Failures are logged, not raised.
    """
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        containers_saved_in_kv = kv.kv_list('ships/{}/service/'.format(ship))
        # A set gives O(1) membership tests and treats None as empty.
        saved_keys = set(containers_saved_in_kv or [])
        sleep(10)
        all_services = consul_query('agent/services')
        # pop() instead of del: a missing 'consul' entry must not abort
        # the whole run with a KeyError.
        all_services.pop('consul', None)
        for service_id, service_dict in all_services.items():
            if ':' in service_id:
                continue
            if service_dict['Service'] == 'armada':
                continue
            key = 'ships/{}/service/{}/{}'.format(ship, service_dict['Service'], service_id)
            if key not in saved_keys:
                kv.save_container(ship, service_id, 'started')
                get_logger().info('Added running service: {}'.format(service_id))
    except Exception:
        # Narrowed from a bare ``except`` so SystemExit/KeyboardInterrupt
        # still propagate.
        get_logger().exception('Unable to add running services.')
def _stop_service(self, container_id):
    """Stop a container and remove its kv-store entry on this ship.

    If the container is not running, its kv entry (when one exists) is
    removed and its services are deregistered. Otherwise the in-container
    supervisor processes are stopped, the services are deregistered and
    stopping through the docker API is attempted up to three times; the kv
    entry is removed once the container is confirmed stopped.

    :param container_id: id of the container to stop.
    :raises Exception: the last error raised by the docker stop call, or a
        RuntimeError when no attempt raised, if the container is still
        running after all attempts.
    """
    ship = get_ship_name()
    service_list = kv_list("ships/{}/service/".format(ship))
    try:
        key = fnmatch.filter(service_list, "*/{}".format(container_id))[0]
    except (IndexError, TypeError):
        # No matching entry, or the listing returned nothing iterable.
        key = None

    if not is_container_running(container_id):
        if key:
            kv_remove(key)
        try:
            deregister_services(container_id)
        except Exception as e:
            get_logger().exception(e)
        return

    run_command_in_container("supervisorctl stop armada_agent", container_id)
    # TODO: Compatibility with old microservice images. Should be removed in future armada version.
    run_command_in_container("supervisorctl stop register_in_service_discovery", container_id)

    docker_api = docker_client.api()
    last_exception = None
    try:
        deregister_services(container_id)
    except Exception as e:
        get_logger().exception(e)

    for _ in range(3):
        try:
            docker_api.stop(container_id)
        except Exception as e:
            get_logger().debug(e, exc_info=True)
            last_exception = e
        if not is_container_running(container_id):
            if key:
                kv_remove(key)
            break

    if is_container_running(container_id):
        get_logger().error("Could not stop container: %s", container_id)
        if last_exception is None:
            # docker stop never raised, so there is nothing to re-raise;
            # ``raise None`` would itself fail with a TypeError.
            raise RuntimeError("Could not stop container: {}".format(container_id))
        raise last_exception
def _stop_service(self, container_id):
    """Stop a container and remove its kv-store entry on this ship.

    If the kv-store entry for the container has status 'crashed' or
    'not-recovered', only that entry is removed. Otherwise the in-container
    supervisor processes are stopped, the services are deregistered and
    stopping the container through the docker API is attempted up to three
    times.

    :param container_id: id of the container to stop.
    :raises Exception: the last error raised while stopping, or a
        RuntimeError when no attempt raised, if the container is still
        running after all attempts.
    """
    ship = get_ship_name()
    key = None
    service_dict = None
    service_list = kv_list('ships/{}/service/'.format(ship))
    if service_list:
        matching_keys = fnmatch.filter(service_list, '*/{}'.format(container_id))
        if matching_keys:
            key = matching_keys[0]
            service_dict = kv_get(key)

    if service_dict and service_dict['Status'] in ['crashed', 'not-recovered']:
        kv_remove(key)
        return

    run_command_in_container('supervisorctl stop armada_agent', container_id)
    # TODO: Compatibility with old microservice images. Should be removed in future armada version.
    run_command_in_container(
        'supervisorctl stop register_in_service_discovery', container_id)

    docker_api = docker_client.api()
    last_exception = None
    try:
        deregister_services(container_id)
    except Exception:
        # Best effort: a failed deregistration must not prevent the stop.
        traceback.print_exc()

    for _ in range(3):
        try:
            docker_api.stop(container_id)
        except Exception as e:
            last_exception = e
            traceback.print_exc()
        if not is_container_running(container_id):
            # Drop the kv entry only once the container is confirmed stopped,
            # and only if an entry was actually found.
            if key:
                try:
                    kv_remove(key)
                except Exception:
                    traceback.print_exc()
            break

    if is_container_running(container_id):
        get_logger().error(
            'Could not stop container: {}'.format(container_id))
        if last_exception is None:
            # docker stop never raised, so there is nothing to re-raise.
            raise RuntimeError('Could not stop container: {}'.format(container_id))
        raise last_exception
def _add_running_services_at_startup():
    """Save running services that are missing from this ship's kv entries.

    Waits for consul, lists the keys under ``ships/<ship>/service/``,
    sleeps 10 seconds and then queries the agent's services. Every service
    except 'consul', 'armada' and ids containing ':' that has no kv key yet
    is saved with status 'started'. Failures are logged, not raised.
    """
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        containers_saved_in_kv = kv.kv_list('ships/{}/service/'.format(ship))
        # A set gives O(1) membership tests and treats None as empty.
        saved_keys = set(containers_saved_in_kv or [])
        sleep(10)
        all_services = consul_query('agent/services')
        # pop() instead of del: a missing 'consul' entry must not abort
        # the whole run with a KeyError.
        all_services.pop('consul', None)
        for service_id, service_dict in all_services.items():
            if ':' in service_id:
                continue
            if service_dict['Service'] == 'armada':
                continue
            key = 'ships/{}/service/{}/{}'.format(
                ship, service_dict['Service'], service_id)
            if key not in saved_keys:
                kv.save_container(ship, service_id, 'started')
                get_logger().info(
                    'Added running service: {}'.format(service_id))
    except Exception:
        # Narrowed from a bare ``except`` so SystemExit/KeyboardInterrupt
        # still propagate.
        get_logger().exception('Unable to add running services.')
def _deregister_not_running_services():
    """Mark services of non-running containers as crashed and deregister them.

    Two passes are made: first over the locally registered services, then
    over the kv-store entries under ``ships/<ship>/service/``. The ship is
    identified by its name, falling back to its IP if the name lookup fails.
    """
    try:
        ship = get_ship_name()
    except Exception:
        # Narrowed from a bare ``except`` so SystemExit/KeyboardInterrupt
        # are no longer swallowed by the fallback.
        ship = get_ship_ip()

    services = _get_local_services()
    running_containers_ids = _get_running_container_ids()

    for service_id in services.keys():
        container_id, is_subservice = _get_container_id_with_subservice(service_id)
        if container_id in running_containers_ids:
            continue
        # The status is recorded for main services only; subservices are
        # just deregistered.
        if not is_subservice:
            name = services[service_id]['Service']
            kv.update_container_status('crashed', ship=ship, name=name, container_id=container_id)
        deregister_services(container_id)

    services_keys = kv.kv_list('ships/{}/service/'.format(ship)) or []
    for service_key in services_keys:
        container_id = service_key.split('/')[-1]
        if container_id not in running_containers_ids:
            kv.update_container_status('crashed', key=service_key)
            deregister_services(container_id)
def POST(self):
    """Join this single-ship armada to the armada reachable at the posted 'host'.

    The local ``ships/<ship>/service/`` entries are read up front and
    written back after consul has been restarted with the new settings.

    :return: ``status_ok()`` on success; ``status_error(...)`` when the
        'host' parameter is invalid, the armada has more than one ship, the
        remote datacenter cannot be read, or the consul restart times out.
    """
    consul_host, error = self.get_post_parameter('host')
    if error:
        return self.status_error(error)

    ship = get_ship_name()
    # Snapshot of the local services, restored once consul is back up.
    local_services = kv.kv_list('ships/{}/service/'.format(ship)) or []
    local_services_data = {key: kv.kv_get(key) for key in local_services}

    armada_size = _get_armada_size()
    if armada_size > 1:
        return self.status_error('Currently only single ship armadas can join the others. '
                                 'Your armada has size: {0}.'.format(armada_size))

    try:
        agent_self_dict = consul_query('agent/self', consul_address='{0}:8500'.format(consul_host))
        datacenter = agent_self_dict['Config']['Datacenter']
    except Exception:
        # Narrowed from a bare ``except`` so SystemExit/KeyboardInterrupt
        # are not reported as a datacenter lookup failure.
        return self.status_error('Could not read remote host datacenter address.')

    current_consul_mode = _get_current_consul_mode()
    if current_consul_mode == consul_config.ConsulMode.BOOTSTRAP:
        override_runtime_settings(consul_mode=consul_config.ConsulMode.CLIENT,
                                  ship_ips=[consul_host],
                                  datacenter=datacenter)
    else:
        override_runtime_settings(ship_ips=[consul_host] + get_other_ship_ips(),
                                  datacenter=datacenter)

    if not _restart_consul():
        return self.status_error('Waiting for armada restart timed out.')

    supervisor_server = xmlrpclib.Server('http://localhost:9001/RPC2')
    hermes_init_output = supervisor_server.supervisor.startProcessGroup('hermes_init')
    get_logger().info('hermes_init start: {}'.format(hermes_init_output))
    set_ship_name(ship)
    for key, data in local_services_data.items():
        kv.kv_set(key, data)
    return self.status_ok()
def get_matched_containers(microservice_name_or_container_id_prefix):
    """Find containers matching a microservice name or a container id prefix.

    Instances are looked up in the consul catalog and in the kv-store
    entries under ``services/``. A kv entry with status 'started' is
    skipped when its container is already present in the catalog.

    :param microservice_name_or_container_id_prefix: exact microservice
        name, or the beginning of a container id.
    :return: list of matching instances, name matches first.
    :raises ArmadaCommandException: when matches exist both by name and by
        id, when several ids match a prefix shorter than 12 characters, or
        when nothing matches.
    """
    wanted = microservice_name_or_container_id_prefix
    by_name = []
    by_id = []

    # Pass 1: instances registered in the consul catalog.
    catalog_names = list(consul_query('catalog/services').keys())
    for catalog_name in catalog_names:
        query = 'catalog/service/{service_name}'.format(service_name=catalog_name)
        try:
            instances = consul_query(query)
        except Exception as e:
            print_err(
                'WARNING: query "{query}" failed ({exception_class}: {exception})'.format(
                    query=query, exception_class=type(e).__name__, exception=e))
            instances = []
        for instance in instances:
            container_id = instance['ServiceID'].split(':')[0]
            if wanted == instance['ServiceName']:
                by_name.append(instance)
            # Ids containing ':' are never matched by container id.
            if container_id.startswith(wanted) and ":" not in instance['ServiceID']:
                by_id.append(instance)

    # Pass 2: entries stored in the kv-store.
    for stored_key in kv.kv_list('services/') or []:
        service_dict = kv.kv_get(stored_key)
        container_id = service_dict['container_id']
        service_name = service_dict['ServiceName']
        if service_dict['Status'] == 'started':
            try:
                instances = consul_query('catalog/service/{}'.format(service_name))
                registered_ids = [i['ServiceID'].split(':')[0] for i in instances]
                if container_id in registered_ids:
                    # Already collected from the catalog in pass 1.
                    continue
            except Exception as e:
                logging.exception(e)
        if wanted == service_name:
            by_name.append(service_dict)
        if container_id.startswith(wanted) and ":" not in service_dict['ServiceID']:
            by_id.append(service_dict)

    by_name_count = len(by_name)
    by_id_count = len(by_id)
    if by_name_count and by_id_count:
        raise ArmadaCommandException(
            'Found matching containers with both microservice name ({matched_containers_by_name_count}) '
            'and container_id ({matched_containers_by_id_count}). \n'
            'Please provide more specific criteria.'.format(
                matched_containers_by_name_count=by_name_count,
                matched_containers_by_id_count=by_id_count))
    if by_id_count > 1 and len(wanted) < 12:
        raise ArmadaCommandException(
            'There are too many ({matched_containers_by_id_count}) matching containers. '
            'Please provide more specific container_id.'.format(
                matched_containers_by_id_count=by_id_count))

    matched_containers = by_name + by_id
    if len(matched_containers) == 0:
        raise ArmadaCommandException(
            'There are no running containers with microservice: '
            '{microservice_name_or_container_id_prefix}'.format(
                microservice_name_or_container_id_prefix=wanted))
    return matched_containers
def _mark_service_as_crashed(container_id, service_name):
    """Save a new 'crashed' entry for the service under ``service/<name>/<index>``.

    :param container_id: id of the container that is no longer running.
    :param service_name: name of the service the container belonged to.
    """
    params = get_container_parameters(container_id)

    # Take the highest existing index rather than the last listed key:
    # if keys are listed in string order, '.../9' sorts after '.../10' and
    # the last key would yield an index that is already in use. A single
    # listing also avoids the two calls disagreeing with each other.
    existing_keys = kv.kv_list('service/{}/'.format(service_name))
    if existing_keys:
        kv_index = max(int(key.split('/')[2]) for key in existing_keys) + 1
    else:
        kv_index = 0

    kv.save_service(service_name, kv_index, 'crashed', params, container_id)
def get_services_by_ship(ship=None):
    """Return the kv keys under 'services', narrowed to one ship when given.

    :param ship: optional ship name appended to the key as 'services/<ship>'.
    :return: the listed keys, or an empty list when there are none.
    """
    root_key = 'services'
    lookup_key = '{}/{}'.format(root_key, ship) if ship else root_key
    listed = kv_list(lookup_key)
    return listed if listed else []