def _get_runtime_settings():
    """Read runtime settings (plus optional override file) and derive consul startup info.

    Returns:
        tuple: (consul_mode, ship_ips, datacenter, ship_name).
    """
    try:
        # Snapshot the settings file before reading so the original is preserved.
        shutil.copy(consul_config.RUNTIME_SETTINGS_PATH,
                    consul_config.ORIGINAL_RUNTIME_SETTINGS_PATH)
        with open(consul_config.ORIGINAL_RUNTIME_SETTINGS_PATH) as runtime_settings_json:
            runtime_settings = json.load(runtime_settings_json)
    except Exception as e:
        get_logger().exception(e)
        runtime_settings = {}
    try:
        # The override file, when present, takes precedence over the base settings.
        if os.path.isfile(consul_config.OVERRIDE_RUNTIME_SETTINGS_PATH):
            with open(consul_config.OVERRIDE_RUNTIME_SETTINGS_PATH) as runtime_settings_json:
                runtime_settings.update(json.load(runtime_settings_json))
    except Exception as e:
        get_logger().exception(e)
    ship_ips = runtime_settings.get('ships', [])
    consul_mode = consul_config.ConsulMode.BOOTSTRAP
    # A commander that already knows other ships runs as a full server.
    # (Simplified from the redundant `ship_ips and len(ship_ips) > 0`.)
    if runtime_settings.get('is_commander') is True and ship_ips:
        consul_mode = consul_config.ConsulMode.SERVER
    if runtime_settings.get('is_commander') is False:
        consul_mode = consul_config.ConsulMode.CLIENT
    # Fall back to a random datacenter name when none is configured.
    datacenter = runtime_settings.get('datacenter') or 'dc-' + str(random.randrange(1000000))
    ship_name = runtime_settings.get('name')
    return consul_mode, ship_ips, datacenter, ship_name
def status_exception(self, message, exception):
    """Log *exception* and return a JSON-typed API error response describing it."""
    get_logger().exception(exception)
    exception_name = type(exception).__name__
    error_msg = "API exception: {0}. {1} - {2}".format(message, exception_name, str(exception))
    web.header('Content-Type', 'application/json')
    return _create_response_with_error(error_msg)
def _get_armada_size():
    """Return the number of nodes in the consul catalog, or 0 on any failure."""
    try:
        return len(consul_query('catalog/nodes'))
    except Exception as e:
        get_logger().exception(e)
        return 0
def _parse_single_ship(services_dict, filter_microservice_name, filter_env, filter_app_id): try: services_list = list(services_dict) except AttributeError: services_list = None result = {} if not services_list: return result if filter_microservice_name: services_list = fnmatch.filter( services_list, 'services/*/{}/*'.format(filter_microservice_name)) for service in services_list: service_dict = services_dict[service] microservice_name = service_dict['ServiceName'] microservice_status = service_dict['Status'] microservice_id = service_dict['ServiceID'] container_id = service_dict['container_id'] microservice_start_timestamp = service_dict['start_timestamp'] single_active_instance = service_dict.get('single_active_instance', False) microservice_version = service_dict.get('microservice_version') not_available = 'n/a' microservice_tags_dict = {} try: if service_dict['params']['microservice_env']: microservice_tags_dict['env'] = service_dict['params'][ 'microservice_env'] if service_dict['params']['microservice_app_id']: microservice_tags_dict['app_id'] = service_dict['params'][ 'microservice_app_id'] except KeyError as e: get_logger().warning(repr(e)) matches_env = (filter_env is None) or ( filter_env == microservice_tags_dict.get('env')) matches_app_id = (filter_app_id is None) or ( filter_app_id == microservice_tags_dict.get('app_id')) if matches_env and matches_app_id: microservice_dict = { 'name': microservice_name, 'status': microservice_status, 'address': not_available, 'microservice_id': microservice_id, 'container_id': container_id, 'tags': microservice_tags_dict, 'start_timestamp': microservice_start_timestamp, 'single_active_instance': single_active_instance, } if microservice_version: microservice_dict[ 'microservice_version'] = microservice_version result[microservice_id] = microservice_dict return result
def recover_containers_from_kv_store():
    """Retry recovery of crashed services recorded in the kv store.

    Marks candidates 'recovering', retries up to RECOVERY_RETRY_LIMIT times,
    and marks anything still failing 'not-recovered'.

    Returns:
        list: keys of services that could not be recovered.
    """
    services_to_be_recovered = _get_crashed_services()
    for service_key in services_to_be_recovered:
        kv.update_container_status('recovering', key=service_key)
    attempts = 0
    while services_to_be_recovered and attempts < RECOVERY_RETRY_LIMIT:
        get_logger().info("Recovering containers: %s", json.dumps(services_to_be_recovered))
        still_failing = []
        for service_key in services_to_be_recovered:
            parameters = kv.kv_get(service_key)['params']
            if _recover_container(parameters):
                # Successful recovery: drop the crashed-service entry.
                kv.kv_remove(service_key)
            else:
                still_failing.append(service_key)
        sleep(DELAY_BETWEEN_RECOVER_RETRY_SECONDS)
        services_to_be_recovered = still_failing
        attempts += 1
    for service_key in services_to_be_recovered:
        kv.update_container_status('not-recovered', key=service_key)
    return services_to_be_recovered
def POST(self):
    """Join this (single-ship) armada to the armada at POST parameter 'host'."""
    consul_host, error = self.get_post_parameter('host')
    if error:
        return self.status_error(error)
    armada_size = _get_armada_size()
    if armada_size > 1:
        return self.status_error('Currently only single ship armadas can join the others. '
                                 'Your armada has size: {0}.'.format(armada_size))
    try:
        agent_self_dict = consul_query('agent/self',
                                       consul_address='{0}:8500'.format(consul_host))
        datacenter = agent_self_dict['Config']['Datacenter']
    except Exception as e:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # propagate, and the failure cause is now logged.
        get_logger().exception(e)
        return self.status_error('Could not read remote host datacenter address.')
    current_consul_mode = _get_current_consul_mode()
    if current_consul_mode == consul_config.ConsulMode.BOOTSTRAP:
        # First join ever: become a client of the remote armada.
        override_runtime_settings(consul_mode=consul_config.ConsulMode.CLIENT,
                                  ship_ips=[consul_host],
                                  datacenter=datacenter)
    else:
        override_runtime_settings(ship_ips=[consul_host] + get_other_ship_ips(),
                                  datacenter=datacenter)
    if _restart_consul():
        supervisor_server = xmlrpclib.Server('http://localhost:9001/RPC2')
        hermes_init_output = supervisor_server.supervisor.startProcessGroup('hermes_init')
        get_logger().info('hermes_init start: {}'.format(hermes_init_output))
        return self.status_ok()
    return self.status_error('Waiting for armada restart timed out.')
def main():
    """Save parameters of locally running containers to a file and the kv store."""
    args = _parse_args()
    if not args.force and not _is_recovery_completed():
        get_logger().warning('Recovery is not completed. Aborting saving running containers.')
        return
    saved_containers_path = args.saved_containers_path
    try:
        wait_for_consul_ready()
        containers_ids = get_local_containers_ids()
        containers_parameters_list = []
        errors_count = 0
        for container_id in containers_ids:
            try:
                container_parameters = get_container_parameters(container_id)
                if container_parameters:
                    containers_parameters_list.append(container_parameters)
            except Exception:
                # Narrowed from a bare `except:` so ^C is not swallowed per-container.
                errors_count += 1
                get_logger().error('ERROR on getting container parameters for {}:'.format(container_id))
                traceback.print_exc()
        containers_parameters_list.sort()
        # Don't overwrite saved containers' list if it would become empty because of errors.
        if containers_parameters_list or not errors_count:
            _save_containers_parameters_list_in_file(containers_parameters_list, saved_containers_path)
            get_logger().info('Containers have been saved to {}.'.format(saved_containers_path))
            try:
                _save_containers_parameters_list_in_kv_store(containers_parameters_list)
                get_logger().info('Containers have been saved to kv store.')
            except Exception:
                # Best-effort: file save succeeded, kv-store save failure is only reported.
                traceback.print_exc()
        else:
            get_logger().info('Aborted saving container because of errors.')
    except Exception:
        traceback.print_exc()
        sys.exit(1)
def _load_from_list(saved_containers, ship_name, ship_ip):
    """Register saved containers that are not currently running as 'crashed'."""
    wait_for_consul_ready()
    running_containers = _get_local_running_containers()
    missing_containers = _multiset_difference(saved_containers, running_containers)
    for params in missing_containers:
        get_logger().info('Added service: {}'.format(params))
        save_container(ship_name, _generate_id(), 'crashed', params=params, ship_ip=ship_ip)
def _load_from_list(saved_containers, ship):
    """Register saved containers that are not currently running as 'crashed'."""
    wait_for_consul_ready()
    running_containers = _get_local_running_containers()
    missing_containers = _multiset_difference(saved_containers, running_containers)
    for params in missing_containers:
        get_logger().info('Added service: {}'.format(params))
        kv.save_container(ship, _generate_id(), 'crashed', params=params)
def _add_running_services_at_startup():
    """Make sure every locally running service is registered in the kv store."""
    wait_for_consul_ready()
    try:
        ship_ip, ship_name = get_ship_ip_and_name()
        containers_saved_in_kv = get_local_services_from_kv_store()
        sleep(10)
        all_services = consul_query('agent/services')
        # The consul agent itself is not an armada service.
        all_services.pop('consul', None)
        for service_id, service_dict in six.iteritems(all_services):
            if ':' in service_id or service_dict['Service'] == 'armada':
                continue
            key = create_consul_services_key(ship_name, service_dict['Service'], service_id)
            if not containers_saved_in_kv or key not in containers_saved_in_kv:
                save_container(ship_name, service_id, 'started', ship_ip=ship_ip)
                get_logger().info('Added running service: {}'.format(service_id))
    except Exception:
        get_logger().exception('Unable to add running services.')
def _load_containers_to_kv_store(saved_containers_path):
    """Load saved container parameters from *saved_containers_path* into the kv store."""
    wait_for_consul_ready()
    try:
        ship_ip, ship_name = get_ship_ip_and_name()
        saved_containers = _load_saved_containers_parameters(saved_containers_path)
        _load_from_dict(saved_containers, ship_name, ship_ip)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        get_logger().exception('Unable to load from %s', saved_containers_path)
def _is_recovery_completed():
    """Return True iff the recovery-completed marker file exists and contains '1'."""
    try:
        if not os.path.isfile(RECOVERY_COMPLETED_PATH):
            return False
        with open(RECOVERY_COMPLETED_PATH) as marker_file:
            return marker_file.read() == '1'
    except Exception as e:
        get_logger().exception(e)
        return False
def get_get_parameter(self, req, parameter_name):
    """Extract a single GET parameter from *req*.

    Returns:
        tuple: (value, None) on success, or (None, error_message) when the
        parameter is absent or the request cannot be parsed.
    """
    try:
        # NOTE(review): req.get_parameter appears to return a mapping that is
        # then indexed by the same name — confirm against the request class.
        get_data = req.get_parameter(parameter_name)
        result = get_data[parameter_name]
    except Exception as e:
        get_logger().debug(e, exc_info=True)
        return None, "Invalid input data - no parameter '{0}'.".format(
            parameter_name)
    return result, None
def _create_service(self, image_path=None, microservice_name=None, microservice_env=None,
                    microservice_app_id=None, dockyard_user=None, dockyard_password=None,
                    ports=None, environment=None, volumes=None, run_command=None,
                    resource_limits=None, configs=None, **kwargs):
    """Create (but not start) a microservice container from API parameters.

    Returns:
        str: the long docker container id.

    Raises:
        ValueError: when image_path or run_command is missing.
    """
    # Check required fields in received JSON:
    if not image_path:
        raise ValueError('Field image_path cannot be empty.')
    if not run_command:
        raise ValueError('Field run_command cannot be empty.')
    if kwargs:
        # Unknown keys are tolerated but surfaced for debugging.
        get_logger().warning('JSON data sent to API contains unrecognized keys: {}'.format(list(kwargs.keys())))
    # Set default values:
    environment = environment or {}
    ports = ports or {}
    volumes = volumes or {}
    resource_limits = resource_limits or {}
    configs = configs or []
    image_name = split_image_path(image_path)[1]
    # Explicit arguments win over MICROSERVICE_* environment variables.
    microservice_name = microservice_name or environment.get('MICROSERVICE_NAME') or image_name
    microservice_env = microservice_env or environment.get('MICROSERVICE_ENV')
    microservice_app_id = microservice_app_id or environment.get('MICROSERVICE_APP_ID')
    # Update environment variables with armada-specific values:
    restart_parameters = {
        'image_path': image_path,
        'microservice_name': microservice_name,
        'microservice_env': microservice_env,
        'microservice_app_id': microservice_app_id,
        'dockyard_user': dockyard_user,
        'dockyard_password': dockyard_password,
        'ports': ports,
        'environment': environment,
        'volumes': volumes,
        'run_command': run_command,
        'resource_limits': resource_limits,
        'configs': configs,
    }
    # NOTE(review): b64encode on str arguments implies Python 2 here — on
    # Python 3 these would need encoding to bytes first; confirm before porting.
    environment['RESTART_CONTAINER_PARAMETERS'] = base64.b64encode(json.dumps(restart_parameters, sort_keys=True))
    environment['ARMADA_RUN_COMMAND'] = base64.b64encode(run_command)
    environment['MICROSERVICE_NAME'] = microservice_name
    if microservice_env:
        environment['MICROSERVICE_ENV'] = microservice_env
    if microservice_app_id:
        environment['MICROSERVICE_APP_ID'] = microservice_app_id
    config_path, hermes_volumes = process_hermes(microservice_name, image_name, microservice_env,
                                                 microservice_app_id, configs)
    if config_path:
        environment['CONFIG_PATH'] = config_path
    # Always mount the docker socket so the container can talk to the daemon.
    volumes[docker_client.DOCKER_SOCKET_PATH] = docker_client.DOCKER_SOCKET_PATH
    volumes.update(hermes_volumes or {})
    long_container_id = self._create_container(
        image_path, microservice_name, ports, environment, volumes,
        dockyard_user, dockyard_password, resource_limits)
    return long_container_id
def recover_saved_containers_from_parameters(saved_containers):
    """Merge *saved_containers* into the kv store, then run kv-store recovery."""
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        _load_from_dict(saved_containers, ship)
    except Exception as e:
        get_logger().exception(e)
    return recover_containers_from_kv_store()
def _check_if_we_should_recover(saved_containers_path):
    """Return True when the docker daemon restarted after the containers file was saved."""
    try:
        docker_start = int(os.environ.get('DOCKER_START_TIMESTAMP'))
        saved_mtime = int(os.path.getmtime(saved_containers_path))
        if docker_start > saved_mtime:
            get_logger().info('Docker daemon restart detected.')
            return True
        get_logger().info('No need to recover.')
        return False
    except Exception:
        # Narrowed from a bare `except:`; a missing env var or file still
        # means "don't recover", but ^C is no longer swallowed.
        return False
def _get_services_list(filter_microservice_name, filter_env, filter_app_id, filter_local):
    """Collect service info from kv 'containers_parameters_list/<ship>' entries.

    Returns {microservice_id: info_dict}, filtered by microservice name
    pattern and env / app_id tags (None filters match everything). With
    filter_local=True only this ship is queried.
    """
    if filter_local:
        ship_list = [get_ship_name()]
    else:
        ship_list = get_ship_names()
    services_dict = {}
    if not ship_list:
        return {}
    for ship in ship_list:
        containers = kv.kv_get('containers_parameters_list/{}'.format(ship))
        # Ignore missing or malformed (non-dict) per-ship entries.
        if containers and isinstance(containers, dict):
            services_dict.update(containers)
    services_list = services_dict.keys()
    result = {}
    if not services_list:
        return result
    if filter_microservice_name:
        # NOTE(review): keys are assumed to look like
        # 'ships/<ship>/service/<name>/<id>' — confirm against the kv writer.
        services_list = fnmatch.filter(services_list, 'ships/*/service/{}/*'.format(filter_microservice_name))
    for service in services_list:
        service_dict = services_dict[service]
        microservice_name = service_dict['ServiceName']
        microservice_status = service_dict['Status']
        microservice_id = service_dict['ServiceID']
        container_id = service_dict['container_id']
        microservice_start_timestamp = service_dict['start_timestamp']
        not_available = 'n/a'
        microservice_tags_dict = {}
        try:
            if service_dict['params']['microservice_env']:
                microservice_tags_dict['env'] = service_dict['params']['microservice_env']
            if service_dict['params']['microservice_app_id']:
                microservice_tags_dict['app_id'] = service_dict['params']['microservice_app_id']
        except KeyError as e:
            # Missing params keys just mean the service carries no tags.
            get_logger().warning(repr(e))
        matches_env = (filter_env is None) or (filter_env == microservice_tags_dict.get('env'))
        matches_app_id = (filter_app_id is None) or (filter_app_id == microservice_tags_dict.get('app_id'))
        if matches_env and matches_app_id:
            microservice_dict = {
                'name': microservice_name,
                'status': microservice_status,
                'address': not_available,
                'microservice_id': microservice_id,
                'container_id': container_id,
                'tags': microservice_tags_dict,
                'start_timestamp': microservice_start_timestamp,
            }
            result[microservice_id] = microservice_dict
    return result
def _wait_for_armada_start():
    """Poll the local armada health endpoint for up to 30s; log an error on timeout."""
    timeout_expiration = time.time() + 30
    while time.time() < timeout_expiration:
        time.sleep(1)
        try:
            health_status = requests.get('http://localhost/health').text
            if health_status == 'ok':
                return
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt can break
            # out of the polling loop; connection errors are still retried.
            pass
    get_logger().error('Could not connect to armada.')
def _load_containers_to_kv_store(saved_containers_path):
    """Load saved containers (dict or legacy list format) into the kv store."""
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        saved_containers = _load_saved_containers_parameters_list(saved_containers_path)
        if isinstance(saved_containers, dict):
            _load_from_dict(saved_containers, ship)
        else:
            # Legacy format: a plain list of container parameters.
            _load_from_list(saved_containers, ship)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        get_logger().exception('Unable to load from %s', saved_containers_path)
def _check_if_we_should_recover(saved_containers_path):
    """Return True when the docker daemon restarted after the containers file was saved."""
    try:
        docker_start = int(os.environ.get('DOCKER_START_TIMESTAMP'))
        saved_mtime = int(os.path.getmtime(saved_containers_path))
        if docker_start > saved_mtime:
            get_logger().info('Docker daemon restart detected.')
            return True
        get_logger().info('No need to recover.')
        return False
    except Exception:
        # Narrowed from a bare `except:`; a missing env var or file still
        # means "don't recover", but ^C is no longer swallowed.
        return False
def get_armada_version(address):
    """Return the armada version reported by *address*, or 'error' on any failure."""
    url = "http://{address}/version".format(address=address)
    try:
        result = requests.get(url, timeout=0.5)
        result.raise_for_status()
        # The endpoint may append extra info; the version is the first token.
        return result.text.split()[0]
    except Exception as e:
        get_logger().exception(e)
        return "error"
def _recover_saved_containers_from_path(saved_containers_path):
    """Run kv-store recovery; return True when every container was recovered."""
    wait_for_consul_ready()
    try:
        not_recovered = recover_containers_from_kv_store()
        if not_recovered:
            get_logger().error('Following containers were not recovered: %s', not_recovered)
            return False
        return True
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        get_logger().exception('Unable to recover from %s.', saved_containers_path)
        return False
def _load_containers_to_kv_store(saved_containers_path):
    """Load saved containers (dict or legacy list format) into the kv store."""
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        saved_containers = _load_saved_containers_parameters_list(
            saved_containers_path)
        if isinstance(saved_containers, dict):
            _load_from_dict(saved_containers, ship)
        else:
            # Legacy format: a plain list of container parameters.
            _load_from_list(saved_containers, ship)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        get_logger().exception('Unable to load from %s', saved_containers_path)
def recover_saved_containers_from_parameters(saved_containers):
    """Merge *saved_containers* (dict or legacy list) into the kv store, then recover."""
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        loader = _load_from_dict if isinstance(saved_containers, dict) else _load_from_list
        loader(saved_containers, ship)
    except Exception as e:
        get_logger().exception(e)
    return recover_containers_from_kv_store()
def _fetch_hermes_from_couriers(courier_addresses):
    """Ask each courier to push hermes config to this container over ssh.

    Failures per courier are logged and do not abort the remaining couriers.
    """
    my_ssh_address = get_container_ssh_address(socket.gethostname())
    for courier_address in courier_addresses:
        # Explicit format argument instead of the fragile format(**locals()).
        courier_url = 'http://{courier_address}/update_hermes'.format(
            courier_address=courier_address)
        try:
            payload = {'ssh': my_ssh_address, 'path': HERMES_DIRECTORY}
            response = requests.post(courier_url, json.dumps(payload))
            response.raise_for_status()
            if response.text.strip() != 'ok':
                raise Exception('Error response from courier:\n{}'.format(response.text))
        except Exception as e:
            get_logger().error('Fetching all sources from courier %s failed:', courier_address)
            get_logger().exception(e)
def _recover_saved_containers_from_path(saved_containers_path):
    """Run kv-store recovery; return True when every container was recovered."""
    wait_for_consul_ready()
    try:
        not_recovered = recover_containers_from_kv_store()
        if not_recovered:
            get_logger().error('Following containers were not recovered: {}'.format(not_recovered))
            return False
        return True
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        traceback.print_exc()
        get_logger().error('Unable to recover from {}.'.format(saved_containers_path))
        return False
def get_other_ship_ips():
    """Return IPs of every other consul node, excluding this ship; [] on error."""
    try:
        catalog_nodes_dict = consul_query('catalog/nodes')
        ship_ips = [node['Address'] for node in catalog_nodes_dict]
        my_ship_ip = get_ship_ip()
        if my_ship_ip in ship_ips:
            ship_ips.remove(my_ship_ip)
        return ship_ips
    except Exception as e:
        get_logger().exception(e)
        return []
def _recover_saved_containers_from_path(saved_containers_path):
    """Load saved containers from file and recover them; True iff all recovered."""
    try:
        saved_containers = _load_saved_containers_parameters_list(saved_containers_path)
        not_recovered = recover_saved_containers(saved_containers)
        if not_recovered:
            get_logger().error('Following containers were not recovered: {}'.format(not_recovered))
            return False
        return True
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        traceback.print_exc()
        get_logger().error('Unable to recover from {}.'.format(saved_containers_path))
        return False
def _load_containers_to_kv_store(saved_containers_path):
    """Load saved containers (dict or legacy list format) into the kv store."""
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        saved_containers = _load_saved_containers_parameters_list(
            saved_containers_path)
        if isinstance(saved_containers, dict):
            _load_from_dict(saved_containers, ship)
        else:
            # Legacy format: a plain list of container parameters.
            _load_from_list(saved_containers, ship)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        traceback.print_exc()
        get_logger().error(
            'Unable to load from {}.'.format(saved_containers_path))
def _load_containers_to_kv_store(saved_containers_path):
    """Sync running services and saved containers into this ship's kv store."""
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        containers_saved_in_kv = kv.kv_list('ships/{}/service/'.format(ship))
        saved_containers = _load_saved_containers_parameters_list(saved_containers_path)
        _add_running_services_at_startup(containers_saved_in_kv, ship)
        if isinstance(saved_containers, dict):
            _load_from_dict(saved_containers, containers_saved_in_kv, ship)
        else:
            # Legacy format: a plain list of container parameters.
            _load_from_list(saved_containers, ship)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        traceback.print_exc()
        get_logger().error('Unable to load from {}.'.format(saved_containers_path))
def _clean_up_kv_store():
    """Periodically drop kv keys that reference containers absent from the armada list."""
    global next_kv_clean_up_timestamp
    if time.time() < next_kv_clean_up_timestamp:
        # Not yet due; cleanup runs at most once per interval.
        return
    get_logger().info('Cleaning up kv-store:')
    next_kv_clean_up_timestamp = get_next_kv_clean_up_timestamp()
    services = armada_api.get_json('list')
    valid_container_ids = {service.get('container_id') for service in services}

    def _remove_stale_keys(prefix, extract_container_id):
        # Drop every key under *prefix* whose container id is no longer valid.
        for key in kv.kv_list(prefix) or []:
            if extract_container_id(key) not in valid_container_ids:
                get_logger().info('Removing key: {}'.format(key))
                kv.kv_remove(key)

    _remove_stale_keys('start_timestamp/',
                       lambda key: key.split('/')[-1])
    _remove_stale_keys('single_active_instance/',
                       lambda key: key.split('/')[-1].split(':')[0])
    get_logger().info('Finished cleaning up kv-store.')
def main():
    """Persist all local kv-store container entries to the saved-containers file."""
    setup_sentry()
    args = _parse_args()
    saved_containers_path = args.saved_containers_path
    if not args.force and not _is_recovery_completed():
        get_logger().info(
            'Recovery is not completed. Aborting saving running containers.')
        return
    try:
        wait_for_consul_ready()
        containers_parameters_dict = {
            container: kv.kv_get(container)
            for container in get_local_services()
        }
        if not containers_parameters_dict:
            get_logger().info(
                'Aborted saving container because list is empty.')
            return
        _save_containers_parameters_list_in_file(containers_parameters_dict,
                                                 saved_containers_path)
        get_logger().info(
            'Containers have been saved to {}.'.format(saved_containers_path))
    except Exception as e:
        get_logger().exception(e)
        sys.exit(1)
def main():
    """Persist this ship's kv 'services/' subtree to the saved-containers file."""
    setup_sentry()
    args = _parse_args()
    saved_containers_path = args.saved_containers_path
    if not args.force and not _is_recovery_completed():
        get_logger().info(
            'Recovery is not completed. Aborting saving running containers.')
        return
    try:
        wait_for_consul_ready()
        services_key = 'services/{}'.format(get_ship_name())
        containers_parameters = kv.kv_get_recurse(services_key, strip_keys=False)
        if not containers_parameters:
            get_logger().info(
                'Aborted saving container because list is empty.')
            return
        _save_containers_parameters_list_in_file(containers_parameters,
                                                 saved_containers_path)
        get_logger().info(
            'Containers have been saved to {}.'.format(saved_containers_path))
    except Exception as e:
        get_logger().exception(e)
        sys.exit(1)
def main():
    """Save this ship's kv container entries to file and back into the kv store."""
    args = _parse_args()
    if not args.force and not _is_recovery_completed():
        get_logger().warning('Recovery is not completed. Aborting saving running containers.')
        return
    saved_containers_path = args.saved_containers_path
    try:
        wait_for_consul_ready()
        ship = get_ship_name()
        saved_containers = kv.kv_list('ships/{}/service/'.format(ship))
        containers_parameters_dict = {}
        if saved_containers:
            for container in saved_containers:
                containers_parameters_dict[container] = kv.kv_get(container)
        if containers_parameters_dict:
            _save_containers_parameters_list_in_file(containers_parameters_dict, saved_containers_path)
            get_logger().info('Containers have been saved to {}.'.format(saved_containers_path))
            try:
                _save_containers_parameters_list_in_kv_store(containers_parameters_dict)
                get_logger().info('Containers have been saved to kv store.')
            except Exception:
                # Best-effort: file save succeeded, kv-store failure is only reported.
                traceback.print_exc()
        else:
            get_logger().info('Aborted saving container because of errors.')
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        traceback.print_exc()
        sys.exit(1)
def _parse_single_ship(services_dict, filter_microservice_name, filter_env, filter_app_id):
    """Build {microservice_id: info_dict} from one ship's services mapping.

    Services are filtered by microservice name pattern and by env / app_id
    tags (None filters match everything). Returns {} for empty or None input.
    """
    try:
        services_list = services_dict.keys()
    except AttributeError:
        # services_dict may be None when the ship has no stored services.
        services_list = None
    result = {}
    if not services_list:
        return result
    if filter_microservice_name:
        services_list = fnmatch.filter(services_list, 'ships/*/service/{}/*'.format(filter_microservice_name))
    for service in services_list:
        service_dict = services_dict[service]
        microservice_name = service_dict['ServiceName']
        microservice_status = service_dict['Status']
        microservice_id = service_dict['ServiceID']
        container_id = service_dict['container_id']
        microservice_start_timestamp = service_dict['start_timestamp']
        single_active_instance = service_dict.get('single_active_instance', False)
        not_available = 'n/a'
        microservice_tags_dict = {}
        try:
            if service_dict['params']['microservice_env']:
                microservice_tags_dict['env'] = service_dict['params']['microservice_env']
            if service_dict['params']['microservice_app_id']:
                microservice_tags_dict['app_id'] = service_dict['params']['microservice_app_id']
        except KeyError as e:
            # Missing params keys just mean the service carries no tags.
            get_logger().warning(repr(e))
        matches_env = (filter_env is None) or (filter_env == microservice_tags_dict.get('env'))
        matches_app_id = (filter_app_id is None) or (filter_app_id == microservice_tags_dict.get('app_id'))
        if matches_env and matches_app_id:
            microservice_dict = {
                'name': microservice_name,
                'status': microservice_status,
                'address': not_available,
                'microservice_id': microservice_id,
                'container_id': container_id,
                'tags': microservice_tags_dict,
                'start_timestamp': microservice_start_timestamp,
                'single_active_instance': single_active_instance,
            }
            result[microservice_id] = microservice_dict
    return result
def main():
    """Recover containers from the kv store, always writing the completion marker."""
    setup_sentry()
    try:
        args = _parse_args()
        _add_running_services_at_startup()
        if args.force or _check_if_we_should_recover(args.saved_containers_path):
            _load_containers_to_kv_store(args.saved_containers_path)
        # Recovery from the kv store runs regardless of whether the saved
        # file was (re)loaded above.
        not_recovered = recover_containers_from_kv_store()
        if not_recovered:
            get_logger().error("Containers not recovered: %s", json.dumps(not_recovered))
            sys.exit(1)
        get_logger().info("All containers recovered :)")
    finally:
        # Marker later read back by _is_recovery_completed(); written even on
        # failure or sys.exit so the save job is not blocked forever.
        with open(RECOVERY_COMPLETED_PATH, 'w') as recovery_completed_file:
            recovery_completed_file.write('1')
def on_post(self, req, resp):
    """Join this single-ship armada to the armada at POST parameter 'host'.

    Snapshots local service kv data before the consul restart and restores
    it (plus the ship name) afterwards.
    """
    consul_host, error = self.get_post_parameter(req, 'host')
    if error:
        return self.status_error(resp, error)
    ship = get_ship_name()
    # Snapshot local service data; the local kv contents are re-seeded after
    # consul restarts as part of the joined armada.
    local_services_data = {
        key: kv.kv_get(key)
        for key in get_local_services_from_kv_store()
    }
    armada_size = _get_armada_size()
    if armada_size > 1:
        return self.status_error(
            resp, 'Currently only single ship armadas can join the others. '
            'Your armada has size: {0}.'.format(armada_size))
    try:
        agent_self_dict = consul_query(
            'agent/self', consul_address='{0}:8500'.format(consul_host))
        datacenter = agent_self_dict['Config']['Datacenter']
    except Exception as e:
        get_logger().exception(e)
        return self.status_error(
            resp, 'Could not read remote host datacenter address.')
    current_consul_mode = _get_current_consul_mode()
    if current_consul_mode == consul_config.ConsulMode.BOOTSTRAP:
        # First join ever: become a client of the remote armada.
        override_runtime_settings(
            consul_mode=consul_config.ConsulMode.CLIENT,
            ship_ips=[consul_host],
            datacenter=datacenter)
    else:
        override_runtime_settings(ship_ips=[consul_host] + get_other_ship_ips(),
                                  datacenter=datacenter)
    if _restart_consul():
        supervisor_server = xmlrpc.client.Server(
            'http://localhost:9001/RPC2')
        hermes_init_output = supervisor_server.supervisor.startProcessGroup(
            'hermes_init')
        get_logger().info('hermes_init start: %s', hermes_init_output)
        set_ship_name(ship)
        # Restore the snapshotted service entries into the kv store.
        for key, data in six.iteritems(local_services_data):
            kv.kv_set(key, data)
        return self.status_ok(resp)
    return self.status_error(resp, 'Waiting for armada restart timed out.')
def _recover_saved_containers_from_path(saved_containers_path):
    """Run kv-store recovery; return True when every container was recovered."""
    wait_for_consul_ready()
    try:
        not_recovered = recover_containers_from_kv_store()
        if not_recovered:
            get_logger().error(
                'Following containers were not recovered: {}'.format(
                    not_recovered))
            return False
        return True
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        traceback.print_exc()
        get_logger().error(
            'Unable to recover from {}.'.format(saved_containers_path))
        return False
def recover_saved_containers(saved_containers):
    """Retry starting saved containers that are not running.

    Returns:
        list: container parameter sets still not running after all retries.
    """
    wait_for_consul_ready()
    running = _get_local_running_containers()
    to_recover = _multiset_difference(saved_containers, running)
    attempts = 0
    while to_recover and attempts < RECOVERY_RETRY_LIMIT:
        get_logger().info("Recovering containers: {}".format(json.dumps(to_recover)))
        failed = [params for params in to_recover if not _recover_container(params)]
        sleep(DELAY_BETWEEN_RECOVER_RETRY_SECONDS)
        # Re-check what is actually running before the next attempt.
        running = _get_local_running_containers()
        to_recover = _multiset_difference(failed, running)
        attempts += 1
    return to_recover
def main():
    """Write the consul config derived from runtime settings and exec the consul agent."""
    setup_sentry()
    consul_mode, ship_ips, datacenter, ship_name = _get_runtime_settings()
    ship_external_ip = get_external_ip()
    if ship_name is None:
        ship_name = ship_external_ip
    config_content = consul_config.get_consul_config(
        consul_mode, ship_ips, datacenter, ship_external_ip, ship_name)
    with open(consul_config.CONFIG_PATH, 'w') as config_file:
        config_file.write(config_content)
    command = '/usr/local/bin/consul agent -config-file {config_path}'.format(
        config_path=consul_config.CONFIG_PATH)
    get_logger().info('RUNNING: %s', command)
    args = command.split()
    # Replace this process with the consul agent so signals reach it directly.
    os.execv(args[0], args)
def _recover_saved_containers_from_path(saved_containers_path):
    """Load saved containers from file and recover them; True iff all recovered."""
    try:
        saved_containers = _load_saved_containers_parameters_list(
            saved_containers_path)
        not_recovered = recover_saved_containers(saved_containers)
        if not_recovered:
            get_logger().error(
                'Following containers were not recovered: {}'.format(
                    not_recovered))
            return False
        return True
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        traceback.print_exc()
        get_logger().error(
            'Unable to recover from {}.'.format(saved_containers_path))
        return False
def _stop_service(self, container_id, force=False):
    """Stop *container_id* and clean up its kv keys and consul registrations.

    With force=True keys are matched across the whole ship's service list
    rather than only the local kv-store services.
    """
    if force:
        service_list = get_services_by_ship()
    else:
        service_list = get_local_services_from_kv_store()
    try:
        keys = fnmatch.filter(service_list, '*/{}'.format(container_id))
    except (IndexError, TypeError) as e:
        get_logger().exception(e)
        keys = []
    if not is_container_running(container_id):
        # Container already stopped: only remove bookkeeping and deregister.
        for key in keys:
            kv_remove(key)
        try:
            deregister_services(container_id)
        except Exception as e:
            get_logger().exception(e)
    else:
        # Stop the in-container agent first so it doesn't re-register the service.
        run_command_in_container('supervisorctl stop armada_agent', container_id)
        trigger_hook('pre-stop', container_id)
        docker_api = docker_client.api()
        last_exception = None
        try:
            deregister_services(container_id)
        except Exception as e:
            get_logger().exception(e)
        # docker stop can fail transiently; retry up to three times.
        for i in range(3):
            try:
                docker_api.stop(container_id)
            except Exception as e:
                get_logger().debug(e, exc_info=True)
                last_exception = e
            if not is_container_running(container_id):
                for key in keys:
                    kv_remove(key)
                break
        if is_container_running(container_id):
            get_logger().error('Could not stop container: %s', container_id)
            raise last_exception
def _add_running_services_at_startup():
    """Register locally running services that are missing from the kv store."""
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        containers_saved_in_kv = kv.kv_list('ships/{}/service/'.format(ship))
        sleep(10)
        all_services = consul_query('agent/services')
        # Fix: the unguarded `del all_services['consul']` raised KeyError when
        # the entry was absent, aborting registration of every service.
        if 'consul' in all_services:
            del all_services['consul']
        for service_id, service_dict in all_services.items():
            if ':' in service_id:
                continue
            if service_dict['Service'] == 'armada':
                continue
            key = 'ships/{}/service/{}/{}'.format(ship, service_dict['Service'], service_id)
            if not containers_saved_in_kv or key not in containers_saved_in_kv:
                kv.save_container(ship, service_id, 'started')
                get_logger().info('Added running service: {}'.format(service_id))
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        get_logger().exception('Unable to add running services.')
def on_get(self, req, resp, microservice_id):
    """Respond with the port mapping of *microservice_id*, or a JSON error."""
    if not exists_service(microservice_id):
        resp.status = falcon.HTTP_404
        resp.json = {
            'error': 'Could not find service "{microservice_id}"'.format(
                microservice_id=microservice_id),
            'error_id': 'SERVICE_NOT_FOUND',
        }
        return
    try:
        # The container id is the part of the service id before the colon.
        container_id = microservice_id.split(':')[0]
        resp.json = get_container_ports_mapping(container_id)
    except Exception as e:
        get_logger().exception(e)
        resp.json = {'error': 'Could not get ports: {}'.format(repr(e))}
        resp.status = falcon.HTTP_500
def _login_to_dockyard(self, docker_api, dockyard_address, dockyard_user, dockyard_password):
    """Log in to the dockyard, tolerating docker-py registry-format changes.

    Raises the first login exception when every endpoint variant fails.
    """
    if not (dockyard_user and dockyard_password):
        return
    # Workaround for abrupt changes in docker-py library: try several
    # registry endpoint spellings until one succeeds.
    login_exceptions = []
    registry_endpoints = (
        'https://{0}/v1/'.format(dockyard_address),
        'https://{0}'.format(dockyard_address),
        dockyard_address,
    )
    for registry_endpoint in registry_endpoints:
        try:
            docker_api.login(dockyard_user, dockyard_password, registry=registry_endpoint)
            return
        except Exception as e:
            get_logger().debug(e)
            login_exceptions.append(e)
    raise login_exceptions[0]
def _recover_container(container_parameters):
    """POST the saved parameters to the 'run' API; True when armada reports ok."""
    get_logger().info('Recovering: {}...\n'.format(json.dumps(container_parameters)))
    recovery_result = armada_api.post('run', container_parameters)
    succeeded = recovery_result.get('status') == 'ok'
    if succeeded:
        get_logger().info('Recovered container: {}'.format(json.dumps(recovery_result)))
    else:
        get_logger().error('Could not recover container: {}'.format(json.dumps(recovery_result)))
    return succeeded
def recover_saved_containers(saved_containers):
    """Re-run saved containers that are not currently running on this ship.

    Retries up to RECOVERY_RETRY_LIMIT times.  While attempts are in
    flight each instance is marked 'recovering' in the kv-store; after
    the final failed attempt it is marked 'not-recovered'.  Returns the
    list of container parameter dicts that could not be recovered.
    """
    wait_for_consul_ready()
    running_containers = _get_local_running_containers()
    # Multiset difference: a container saved twice but running once still
    # needs one recovery.
    containers_to_be_recovered = _multiset_difference(saved_containers, running_containers)
    recovery_retry_count = 0
    while containers_to_be_recovered and recovery_retry_count < RECOVERY_RETRY_LIMIT:
        get_logger().info("Recovering containers: {}".format(json.dumps(containers_to_be_recovered)))
        containers_not_recovered = []
        # Serialize each parameter dict (sorted keys) so identical
        # configurations compare equal; Counter groups the duplicates.
        counter_to_be_recovered = Counter(json.dumps(x, sort_keys=True) for x in containers_to_be_recovered)
        to_be_recovered = []
        for container_parameters in counter_to_be_recovered.elements():
            # Assign a per-configuration instance index (0, 1, 2, ...).
            # Counter.elements() yields duplicates consecutively, so only
            # the previous entry needs to be inspected.
            try:
                if to_be_recovered[-1][0] == container_parameters:
                    index = to_be_recovered[-1][1] + 1
                else:
                    index = 0
            except IndexError:
                # First element - nothing to compare against yet.
                index = 0
            to_be_recovered.append((container_parameters, index))
            name = json.loads(container_parameters)['microservice_name']
            kv.save_service(name, index, 'recovering', json.loads(container_parameters))
        for container_parameters, index in to_be_recovered:
            container_parameters = json.loads(container_parameters)
            name = container_parameters['microservice_name']
            if not _recover_container(container_parameters):
                containers_not_recovered.append(container_parameters)
                if recovery_retry_count == (RECOVERY_RETRY_LIMIT - 1):
                    # Last attempt failed - record the permanent failure.
                    kv.save_service(name, index, 'not-recovered', json.loads(container_parameters))
            else:
                # Recovered: drop the transient 'recovering' marker.
                kv.kv_remove('service/{}/{}'.format(name, index))
        sleep(DELAY_BETWEEN_RECOVER_RETRY_SECONDS)
        # Re-check what is actually running before the next retry round.
        running_containers = _get_local_running_containers()
        containers_to_be_recovered = _multiset_difference(containers_not_recovered, running_containers)
        recovery_retry_count += 1
    return containers_to_be_recovered
def _stop_service(self, container_id):
    """Stop a container and clean up its kv-store and consul registrations.

    If the container is already stopped only bookkeeping is cleaned up.
    Otherwise the in-container agents are stopped first, services are
    deregistered, and up to three docker stop attempts are made; the last
    docker exception is re-raised if the container is still running.
    """
    ship = get_ship_name()
    service_list = kv_list("ships/{}/service/".format(ship))
    try:
        # kv key for this container, e.g. ships/<ship>/service/<name>/<id>.
        key = fnmatch.filter(service_list, "*/{}".format(container_id))[0]
    except (IndexError, TypeError):
        # No matching key, or kv_list returned None.
        key = None
    if not is_container_running(container_id):
        # Already stopped: just remove bookkeeping entries.
        if key:
            kv_remove(key)
        try:
            deregister_services(container_id)
        except Exception as e:
            get_logger().exception(e)
    else:
        # Stop the in-container agent first so it doesn't re-register the
        # service while the container is shutting down.
        run_command_in_container("supervisorctl stop armada_agent", container_id)
        # TODO: Compatibility with old microservice images. Should be removed in future armada version.
        run_command_in_container("supervisorctl stop register_in_service_discovery", container_id)
        docker_api = docker_client.api()
        last_exception = None
        try:
            deregister_services(container_id)
        except Exception as e:
            get_logger().exception(e)
        # docker stop can fail transiently; retry up to three times.
        for i in range(3):
            try:
                docker_api.stop(container_id)
            except Exception as e:
                get_logger().debug(e, exc_info=True)
                last_exception = e
            if not is_container_running(container_id):
                if key:
                    kv_remove(key)
                break
        if is_container_running(container_id):
            get_logger().error("Could not stop container: %s", container_id)
            raise last_exception
def _clean_up_kv_store():
    """Drop kv-store keys that refer to containers no longer listed.

    Rate-limited via the module-level next_kv_clean_up_timestamp: calls
    before that timestamp return immediately.
    """
    global next_kv_clean_up_timestamp
    if time.time() < next_kv_clean_up_timestamp:
        return
    get_logger().info('Cleaning up kv-store:')
    next_kv_clean_up_timestamp = get_next_kv_clean_up_timestamp()
    valid_container_ids = {
        service.get('container_id') for service in armada_api.get_json('list')
    }
    # start_timestamp/<container_id> keys.
    for key in (kv.kv_list('start_timestamp/') or []):
        if key.split('/')[-1] not in valid_container_ids:
            get_logger().info('Removing key: {}'.format(key))
            kv.kv_remove(key)
    # single_active_instance/<container_id>:<suffix> keys.
    for key in (kv.kv_list('single_active_instance/') or []):
        if key.split('/')[-1].split(':')[0] not in valid_container_ids:
            get_logger().info('Removing key: {}'.format(key))
            kv.kv_remove(key)
    get_logger().info('Finished cleaning up kv-store.')
def _get_courier_addresses():
    """Discover courier service addresses via consul.

    Polls once per second for up to 30 seconds and returns the set of
    addresses found (possibly empty).  Logs an error when the last
    discovery attempt raised, or an info message when no couriers were
    found at all.
    """
    addresses = set()
    found_running = False
    deadline = time.time() + 30
    pending_error = None
    while time.time() < deadline:
        time.sleep(1)
        try:
            addresses = _consul_discover('courier')
        except Exception as e:
            pending_error = e
            continue
        pending_error = None
        if addresses:
            found_running = True
            break
    if pending_error is not None:
        get_logger().error('Could not determine if courier is running:')
        get_logger().exception(pending_error)
    elif not found_running:
        get_logger().info('No running couriers found.')
    return addresses
def status_exception(self, message, exception):
    """Log *exception* and return a JSON API-error response describing it."""
    get_logger().exception(exception)
    exception_name = type(exception).__name__
    error_msg = "API exception: {0}. {1} - {2}".format(
        message, exception_name, str(exception))
    web.header('Content-Type', 'application/json')
    return _create_response_with_error(error_msg)
def status_error(self, message=None):
    """Log an API error *message* and return a JSON error response."""
    get_logger().error('API error: %s', message)
    web.header('Content-Type', 'application/json')
    return _create_response_with_error(message)