def command_diagnose(args):
    """Run the diagnose (or logs) script in a microservice container.

    The script is piped into ``armada ssh -i <microservice_name>``. When
    that command exits with a non-zero code and exactly one container
    matches the name, the container's restart parameters are printed for
    the 'recovering', 'crashed' and 'not-recovered' statuses; for the
    latter two the docker logs of the container are printed as well.
    """
    microservice_name = args.microservice_name
    script = "logs.sh" if args.logs else "diagnose.sh"
    diagnostic_command = (
        "armada ssh -i {microservice_name} "
        "bash < /opt/armada/armada_command/diagnostic_scripts/{script}"
    ).format(microservice_name=microservice_name, script=script)

    if subprocess.call(diagnostic_command, shell=True) == 0:
        return

    instances = get_matched_containers(microservice_name)
    if instances is None or len(instances) != 1:
        return

    instance = instances[0]
    status = instance['Status']
    is_recovering = status == 'recovering'
    is_dead = not is_recovering and status in ['crashed', 'not-recovered']
    if not (is_recovering or is_dead):
        return

    # Both groups of statuses start with the same parameters dump.
    params = instance['params']
    print('RESTART_CONTAINER_PARAMETERS:')
    print(json.dumps(params, indent=4, sort_keys=True))

    if is_dead:
        print('')
        container_id = instance['container_id']
        print('Docker logs of container_id: {}'.format(container_id))
        subprocess.call("docker logs {}".format(container_id), shell=True)
def command_diagnose(args):
    """Run the diagnose (or logs) script in a microservice container.

    The script is piped into ``armada ssh -i <microservice_name>``. When
    that command exits with a non-zero code and exactly one container
    matches the name, the container's restart parameters are printed for
    the 'recovering', 'crashed' and 'not-recovered' statuses; for the
    latter two the docker logs of the container are printed as well.
    """
    microservice_name = args.microservice_name
    script = "logs.sh" if args.logs else "diagnose.sh"
    diagnostic_command = (
        "armada ssh -i {microservice_name} "
        "bash < /opt/armada/armada_command/diagnostic_scripts/{script}"
    ).format(microservice_name=microservice_name, script=script)

    if subprocess.call(diagnostic_command, shell=True) == 0:
        return

    instances = get_matched_containers(microservice_name)
    if instances is None or len(instances) != 1:
        return

    instance = instances[0]
    status = instance['Status']
    is_recovering = status == 'recovering'
    is_dead = not is_recovering and status in ['crashed', 'not-recovered']
    if not (is_recovering or is_dead):
        return

    # Both groups of statuses start with the same parameters dump.
    params = instance['params']
    print('RESTART_CONTAINER_PARAMETERS:')
    print(json.dumps(params, indent=4, sort_keys=True))

    if is_dead:
        print('')
        container_id = instance['container_id']
        print('Docker logs of container_id: {}'.format(container_id))
        subprocess.call("docker logs {}".format(container_id), shell=True)
def _recover_container(container_parameters):
    """Post the given parameters to the armada API 'run' endpoint.

    Returns True when the API response carries status 'ok', False
    otherwise; the outcome is logged either way.
    """
    get_logger().info('Recovering: %s ...\n', json.dumps(container_parameters))
    recovery_result = armada_api.post('run', container_parameters)

    if recovery_result.get('status') != 'ok':
        get_logger().error('Could not recover container: %s',
                           json.dumps(recovery_result))
        return False

    get_logger().info('Recovered container: %s', json.dumps(recovery_result))
    return True
def _recover_container(container_parameters):
    """Post the given parameters to the armada API 'run' endpoint.

    Returns True when the API response carries status 'ok', False
    otherwise; the outcome is logged either way.
    """
    get_logger().info('Recovering: %s ...\n', json.dumps(container_parameters))
    recovery_result = armada_api.post('run', container_parameters)

    if recovery_result.get('status') != 'ok':
        get_logger().error('Could not recover container: %s',
                           json.dumps(recovery_result))
        return False

    get_logger().info('Recovered container: %s', json.dumps(recovery_result))
    return True
def recover_containers_from_kv_store():
    """Try to re-run every crashed service recorded in the kv store.

    Each crashed service is first marked 'recovering'. Up to
    RECOVERY_RETRY_LIMIT passes are made; a service whose recovery
    succeeds is removed from the kv store, the others are retried on the
    next pass. Services still failing after the last pass are marked
    'not-recovered'.

    Returns the list of services that could not be recovered (empty when
    everything was recovered or nothing had crashed).
    """
    services_to_be_recovered = _get_crashed_services()
    for service in services_to_be_recovered:
        kv.update_container_status('recovering', key=service)

    recovery_retry_count = 0
    while services_to_be_recovered and recovery_retry_count < RECOVERY_RETRY_LIMIT:
        get_logger().info("Recovering containers: %s",
                          json.dumps(services_to_be_recovered))
        services_not_recovered = []
        for service in services_to_be_recovered:
            service_parameters = kv.kv_get(service)['params']
            if _recover_container(service_parameters):
                kv.kv_remove(service)
            else:
                services_not_recovered.append(service)

        # Wait only when there is something left to retry; sleeping after a
        # fully successful pass just delays the caller for no benefit.
        if services_not_recovered:
            sleep(DELAY_BETWEEN_RECOVER_RETRY_SECONDS)

        services_to_be_recovered = services_not_recovered
        recovery_retry_count += 1

    for service in services_to_be_recovered:
        kv.update_container_status('not-recovered', key=service)
    return services_to_be_recovered
def recover_containers_from_kv_store():
    """Try to re-run every crashed service recorded in the kv store.

    Crashed services are marked 'recovering', then retried for up to
    RECOVERY_RETRY_LIMIT passes. Between passes that left failures the
    function sleeps for START_DELAY_BETWEEN_RECOVER_RETRY doubled on each
    pass, capped at MAX_DELAY_BETWEEN_RECOVER_RETRY. Services that never
    recover are marked 'not-recovered' and returned.
    """
    pending = _get_crashed_services()
    for service in pending:
        update_container_status('recovering', key=service)

    attempt = 0
    while pending and attempt < RECOVERY_RETRY_LIMIT:
        get_logger().info("Recovering containers: %s", json.dumps(pending))

        failed = []
        for service in pending:
            parameters = kv.kv_get(service)['params']
            if _recover_container(parameters):
                kv.kv_remove(service)
            else:
                failed.append(service)

        if failed:
            backoff = START_DELAY_BETWEEN_RECOVER_RETRY * 2**attempt
            sleep(min(backoff, MAX_DELAY_BETWEEN_RECOVER_RETRY))

        pending = failed
        attempt += 1

    for service in pending:
        update_container_status('not-recovered', key=service)
    return pending
def get_consul_config(consul_mode, ship_ips, datacenter, ship_external_ip, ship_name):
    """Build the Consul agent configuration and return it as a JSON string.

    The agent runs as a server unless ``consul_mode`` is
    ConsulMode.CLIENT; in ConsulMode.BOOTSTRAP 'bootstrap_expect' is set
    to 1. Three watches are registered whose handlers are shell commands
    running armada_backend modules.
    """
    env_pythonpath = 'PYTHONPATH=/opt/armada-docker:$PYTHONPATH'
    save_runtime_settings_cmd = (
        '{env_pythonpath} python -m armada_backend.runtime_settings'
    ).format(env_pythonpath=env_pythonpath)
    save_running_containers_cmd = (
        '{env_pythonpath} python -m armada_backend.save_running_containers '
        '{running_containers_parameters_path} '
        '>> /tmp/save_running_containers.out 2>&1'
    ).format(
        env_pythonpath=env_pythonpath,
        running_containers_parameters_path=RUNNING_CONTAINERS_PARAMETERS_PATH,
    )

    data_dir = '/var/opt/consul-{datacenter}-{consul_mode}'.format(
        datacenter=datacenter, consul_mode=consul_mode)

    config = {
        'server': consul_mode != ConsulMode.CLIENT,
        'start_join': ship_ips,
        'datacenter': str(datacenter),
        'node_name': 'ship-{0}'.format(ship_external_ip),
        'advertise_addr': str(ship_external_ip),
        'client_addr': '0.0.0.0',
        'data_dir': data_dir,
        'leave_on_terminate': True,
        'performance': {'raft_multiplier': 1},
        'watches': [
            {'type': 'keyprefix', 'prefix': 'dockyard/',
             'handler': save_runtime_settings_cmd},
            {'type': 'nodes', 'handler': save_runtime_settings_cmd},
            {'type': 'keyprefix', 'prefix': 'ships/{}/'.format(ship_name),
             'handler': save_running_containers_cmd},
        ],
    }
    if consul_mode == ConsulMode.BOOTSTRAP:
        config['bootstrap_expect'] = 1

    return json.dumps(config, sort_keys=True, indent=4)
def GET(self, image_name):
    """Return the docker image listing for ``image_name`` as an ok response.

    The listing is JSON-encoded and placed under the 'image_info' key.
    Any exception is turned into an error response via
    ``self.status_exception``.
    """
    try:
        images = docker_client.api().images(image_name)
        image_info = json.dumps(images)
        result = {'image_info': '{image_info}'.format(image_info=image_info)}
        return self.status_ok(result)
    except Exception as e:
        return self.status_exception("Cannot get info about image.", e)
def GET(self, image_name):
    """Return the docker image listing for ``image_name`` as an ok response.

    The listing is JSON-encoded and placed under the 'image_info' key.
    Any exception is turned into an error response via
    ``self.status_exception``.
    """
    try:
        images = docker_client.api().images(image_name)
        image_info = json.dumps(images)
        result = {'image_info': '{image_info}'.format(image_info=image_info)}
        return self.status_ok(result)
    except Exception as e:
        return self.status_exception("Cannot get info about image.", e)
def _restart_consul():
    """Deregister this node from the Consul catalog and make the agent leave.

    The node name is read from the local agent ('agent/self'), the node
    is deregistered through 'catalog/deregister', and ``consul leave`` is
    executed. Returns the result of ``wait_for_consul_ready()``.
    """
    # Services will be registered again by their script 'register_in_service_discovery'.
    agent_self_dict = consul_query('agent/self')
    node_name = agent_self_dict['Config']['NodeName']
    # consul_put() JSON-encodes `data` itself, so pass the plain dict.
    # Encoding it here as well would send a JSON *string* instead of the
    # JSON object with the 'Node' key.
    consul_put('catalog/deregister', data={'Node': node_name})
    check_call(['consul', 'leave'])
    return wait_for_consul_ready()
def _save_runtime_settings():
    """Write the current ship settings as JSON to RUNTIME_SETTINGS_PATH."""
    consul_settings = dict(
        is_commander=is_ship_commander(),
        name=get_ship_name(),
        ships=get_other_ship_ips(),
        datacenter=get_current_datacenter(),
        dockyards=alias.get_list(),
    )
    with open(consul_config.RUNTIME_SETTINGS_PATH, 'w') as settings_file:
        serialized = json.dumps(consul_settings, sort_keys=True, indent=4)
        settings_file.write(serialized)
def _save_runtime_settings():
    """Write the current ship settings as JSON to RUNTIME_SETTINGS_PATH."""
    consul_settings = dict(
        is_commander=is_ship_commander(),
        name=get_ship_name(),
        ships=get_other_ship_ips(),
        datacenter=get_current_datacenter(),
        dockyards=alias.get_list(),
    )
    with open(consul_config.RUNTIME_SETTINGS_PATH, 'w') as settings_file:
        serialized = json.dumps(consul_settings, sort_keys=True, indent=4)
        settings_file.write(serialized)
def print_result_from_armada_api(result):
    """Print an armada API result and exit with code 1 on failure.

    For status 'ok' the remaining keys (if any) are printed as JSON.
    For status 'error' the 'error' field is printed through print_err;
    for any other status the whole result is. Both failure cases end
    with ``sys.exit(1)``.
    """
    status = result['status']
    if status == 'ok':
        payload = dict(result)
        payload.pop('status')
        if payload:
            print(json.dumps(payload))
        return

    message = result.get('error') if status == 'error' else result
    print_err(message)
    sys.exit(1)
def print_result_from_armada_api(result):
    """Print an armada API result and exit with code 1 on failure.

    For status 'ok' the remaining keys (if any) are printed as JSON.
    For status 'error' the 'error' field is printed through print_err;
    for any other status the whole result is. Both failure cases end
    with ``sys.exit(1)``.
    """
    status = result['status']
    if status == 'ok':
        payload = dict(result)
        payload.pop('status')
        if payload:
            print(json.dumps(payload))
        return

    message = result.get('error') if status == 'error' else result
    print_err(message)
    sys.exit(1)
def _fetch_hermes_from_couriers(courier_addresses):
    """Ask every courier to push hermes data to this container.

    Each courier gets a POST to its '/update_hermes' endpoint with this
    container's ssh address and HERMES_DIRECTORY. A failing courier (HTTP
    error or a response body other than 'ok') is logged and the loop
    continues with the next one.
    """
    my_ssh_address = get_container_ssh_address(socket.gethostname())
    for address in courier_addresses:
        courier_url = 'http://{courier_address}/update_hermes'.format(
            courier_address=address)
        try:
            request_body = json.dumps(
                {'ssh': my_ssh_address, 'path': HERMES_DIRECTORY})
            response = requests.post(courier_url, request_body)
            response.raise_for_status()
            answer = response.text.strip()
            if answer != 'ok':
                raise Exception(
                    'Error response from courier:\n{}'.format(response.text))
        except Exception as e:
            logger = get_logger()
            logger.error('Fetching all sources from courier %s failed:', address)
            get_logger().exception(e)
def override_runtime_settings(consul_mode=None, ship_name=None, ship_ips=None, datacenter=None):
    """Write the given overrides as JSON to OVERRIDE_RUNTIME_SETTINGS_PATH.

    Only arguments that are not None end up in the file. ``consul_mode``
    is stored as the boolean 'is_commander' (True unless the mode is
    ConsulMode.CLIENT).
    """
    consul_settings = {}
    if consul_mode is not None:
        is_commander = consul_mode != consul_config.ConsulMode.CLIENT
        consul_settings['is_commander'] = is_commander

    optional_values = (
        ('name', ship_name),
        ('ships', ship_ips),
        ('datacenter', datacenter),
    )
    for key, value in optional_values:
        if value is not None:
            consul_settings[key] = value

    with open(consul_config.OVERRIDE_RUNTIME_SETTINGS_PATH, 'w') as settings_file:
        settings_file.write(
            json.dumps(consul_settings, sort_keys=True, indent=4))
def override_runtime_settings(consul_mode=None, ship_name=None, ship_ips=None, datacenter=None):
    """Write the given overrides as JSON to OVERRIDE_RUNTIME_SETTINGS_PATH.

    Only arguments that are not None end up in the file. ``consul_mode``
    is stored as the boolean 'is_commander' (True unless the mode is
    ConsulMode.CLIENT).
    """
    consul_settings = {}
    if consul_mode is not None:
        is_commander = consul_mode != consul_config.ConsulMode.CLIENT
        consul_settings['is_commander'] = is_commander

    optional_values = (
        ('name', ship_name),
        ('ships', ship_ips),
        ('datacenter', datacenter),
    )
    for key, value in optional_values:
        if value is not None:
            consul_settings[key] = value

    with open(consul_config.OVERRIDE_RUNTIME_SETTINGS_PATH, 'w') as settings_file:
        settings_file.write(
            json.dumps(consul_settings, sort_keys=True, indent=4))
def on_get(self, req, resp, image_name_or_address, image_name=None):
    """Respond with the docker image listing for a local armada image.

    With a single path component it is treated as the image name and no
    dockyard address is used; with two, the first is the dockyard
    address and the second the image name. The JSON-encoded listing is
    returned under the 'image_info' key; any exception becomes an error
    response via ``self.status_exception``.
    """
    if image_name is None:
        dockyard_address, image_name = None, image_name_or_address
    else:
        dockyard_address = image_name_or_address
    image = LocalArmadaImage(dockyard_address, image_name)

    try:
        images = docker_client.api().images(image.image_path)
        image_info = json.dumps(images)
        result = {'image_info': '{image_info}'.format(image_info=image_info)}
        return self.status_ok(resp, result)
    except Exception as e:
        return self.status_exception(resp, "Cannot get info about image.", e)
def main():
    """Entry point: recover saved containers if forced or required.

    Exits with code 1 when some containers could not be recovered.
    Whatever happens inside the try block, '1' is written to
    RECOVERY_COMPLETED_PATH afterwards.
    """
    setup_sentry()
    try:
        args = _parse_args()
        _add_running_services_at_startup()

        should_recover = args.force or _check_if_we_should_recover(
            args.saved_containers_path)
        if not should_recover:
            return

        _load_containers_to_kv_store(args.saved_containers_path)
        not_recovered = recover_containers_from_kv_store()
        if not_recovered:
            get_logger().error("Containers not recovered: %s",
                               json.dumps(not_recovered))
            sys.exit(1)
        get_logger().info("All containers recovered :)")
    finally:
        # Runs on normal return, on sys.exit and on any other exception.
        with open(RECOVERY_COMPLETED_PATH, 'w') as completed_marker:
            completed_marker.write('1')
def get_consul_config(consul_mode, ship_ips, datacenter, ship_external_ip, ship_name):
    """Build the Consul agent configuration and return it as a JSON string.

    The agent runs as a server unless ``consul_mode`` is
    ConsulMode.CLIENT; in ConsulMode.BOOTSTRAP 'bootstrap_expect' is set
    to 1. Three watches are registered whose handlers are shell commands
    running armada_backend modules.
    """
    env_pythonpath = 'PYTHONPATH=/opt/armada-docker:$PYTHONPATH'
    save_runtime_settings_cmd = (
        '{env_pythonpath} python -m armada_backend.runtime_settings'
    ).format(env_pythonpath=env_pythonpath)
    save_running_containers_cmd = (
        '{env_pythonpath} python -m armada_backend.save_running_containers '
        '{running_containers_parameters_path} '
        '>> /tmp/save_running_containers.out 2>&1'
    ).format(
        env_pythonpath=env_pythonpath,
        running_containers_parameters_path=RUNNING_CONTAINERS_PARAMETERS_PATH,
    )

    data_dir = '/var/opt/consul-{datacenter}-{consul_mode}'.format(
        datacenter=datacenter, consul_mode=consul_mode)

    config = {
        'server': consul_mode != ConsulMode.CLIENT,
        'start_join': ship_ips,
        'datacenter': str(datacenter),
        'node_name': 'ship-{0}'.format(ship_external_ip),
        'advertise_addr': str(ship_external_ip),
        'client_addr': '0.0.0.0',
        'data_dir': data_dir,
        'leave_on_terminate': True,
        'performance': {'raft_multiplier': 1},
        'watches': [
            {'type': 'keyprefix', 'prefix': 'dockyard/',
             'handler': save_runtime_settings_cmd},
            {'type': 'nodes', 'handler': save_runtime_settings_cmd},
            {'type': 'keyprefix', 'prefix': 'ships/{}/'.format(ship_name),
             'handler': save_running_containers_cmd},
        ],
    }
    if consul_mode == ConsulMode.BOOTSTRAP:
        config['bootstrap_expect'] = 1

    return json.dumps(config, sort_keys=True, indent=4)
def _create_service(self,
                    image_path=None,
                    microservice_name=None,
                    microservice_env=None,
                    microservice_app_id=None,
                    dockyard_user=None,
                    dockyard_password=None,
                    ports=None,
                    environment=None,
                    volumes=None,
                    run_command=None,
                    resource_limits=None,
                    configs=None,
                    **kwargs):
    """Prepare the environment and volumes for a service and create its container.

    ``image_path`` and ``run_command`` are required; a ValueError is
    raised when either is empty. Unrecognized keyword arguments are only
    logged as a warning. Missing microservice name/env/app id fall back
    to the matching MICROSERVICE_* entries of ``environment`` (the name
    finally falls back to the image name).

    Note that ``environment`` and ``volumes`` passed by the caller are
    modified in place.

    Returns whatever ``self._create_container`` returns (stored here as
    ``long_container_id``).
    """
    # Check required fields in received JSON:
    if not image_path:
        raise ValueError('Field image_path cannot be empty.')
    if not run_command:
        raise ValueError('Field run_command cannot be empty.')
    if kwargs:
        get_logger().warning(
            'JSON data sent to API contains unrecognized keys: %s',
            list(kwargs.keys()))

    # Set default values:
    environment = environment or {}
    ports = ports or {}
    volumes = volumes or {}
    resource_limits = resource_limits or {}
    configs = configs or []
    image_name = split_image_path(image_path)[1]
    microservice_name = microservice_name or environment.get(
        'MICROSERVICE_NAME') or image_name
    microservice_env = microservice_env or environment.get(
        'MICROSERVICE_ENV')
    microservice_app_id = microservice_app_id or environment.get(
        'MICROSERVICE_APP_ID')

    # Update environment variables with armada-specific values:
    # restart_parameters holds a reference to `environment` itself (not a
    # copy), so the keys added to `environment` below, before the
    # json.dumps call, are part of what gets serialized.
    restart_parameters = {
        'image_path': image_path,
        'microservice_name': microservice_name,
        'microservice_env': microservice_env,
        'microservice_app_id': microservice_app_id,
        'dockyard_user': dockyard_user,
        'dockyard_password': dockyard_password,
        'ports': ports,
        'environment': environment,
        'volumes': volumes,
        'run_command': run_command,
        'resource_limits': resource_limits,
        'configs': configs,
    }
    dev = environment.get('ARMADA_DEVELOP')
    if dev:
        # In develop mode keep only the part of image_path after the first '/'.
        restart_parameters['image_path'] = image_path.split('/', 1)[-1]
    # NOTE(review): on Python 3 base64.b64encode returns bytes; since
    # `environment` is serialized below as part of restart_parameters,
    # json.dumps would then receive a bytes value — confirm the target
    # interpreter / whether a .decode() is needed here.
    environment['ARMADA_RUN_COMMAND'] = base64.b64encode(
        run_command.encode())
    environment['IMAGE_NAME'] = image_name
    environment['MICROSERVICE_NAME'] = microservice_name
    environment['RESTART_CONTAINER_PARAMETERS'] = base64.b64encode(
        json.dumps(restart_parameters, sort_keys=True).encode())
    if microservice_env:
        environment['MICROSERVICE_ENV'] = microservice_env
    if microservice_app_id:
        environment['MICROSERVICE_APP_ID'] = microservice_app_id
    config_path, hermes_volumes = process_hermes(microservice_name,
                                                 image_name,
                                                 microservice_env,
                                                 microservice_app_id,
                                                 configs)
    if config_path:
        environment['CONFIG_PATH'] = config_path
    # Mount the docker socket at the same path inside the container.
    volumes[docker_client.DOCKER_SOCKET_PATH] = docker_client.DOCKER_SOCKET_PATH
    volumes.update(hermes_volumes or {})
    long_container_id = self._create_container(image_path, ports,
                                               environment, volumes,
                                               dockyard_user,
                                               dockyard_password,
                                               resource_limits)
    return long_container_id
def _multiset_difference(a, b):
    """Return the items of ``a`` left after removing the items of ``b``.

    Items are compared by their JSON form with sorted keys, and
    multiplicity is respected: each occurrence in ``b`` cancels one
    occurrence in ``a``. The returned items are decoded back from JSON.
    """
    def _canonical_counts(items):
        return Counter(json.dumps(item, sort_keys=True) for item in items)

    leftover = _canonical_counts(a) - _canonical_counts(b)
    return [json.loads(encoded) for encoded in leftover.elements()]
def _multiset_difference(a, b):
    """Return the items of ``a`` left after removing the items of ``b``.

    Items are compared by their JSON form with sorted keys, and
    multiplicity is respected: each occurrence in ``b`` cancels one
    occurrence in ``a``. The returned items are decoded back from JSON.
    """
    def _canonical_counts(items):
        return Counter(json.dumps(item, sort_keys=True) for item in items)

    leftover = _canonical_counts(a) - _canonical_counts(b)
    return [json.loads(encoded) for encoded in leftover.elements()]
def status_ok(self, extra_result=None):
    """Return a JSON response body with status 'ok'.

    ``extra_result`` (optional mapping) supplies additional fields for
    the response. It is copied first, so the caller's dict is not
    modified by adding the 'status' key. The Content-Type header is set
    to application/json.
    """
    result = dict(extra_result or {})
    result['status'] = 'ok'
    web.header('Content-Type', 'application/json')
    return json.dumps(result, indent=4, sort_keys=True)
def consul_put(query, data=None, consul_address=None):
    """Send a PUT request to Consul with ``data`` JSON-encoded as the body.

    A falsy ``data`` is sent as an empty JSON object. Returns the
    ``requests`` response; the call is bounded by
    CONSUL_TIMEOUT_IN_SECONDS.
    """
    payload = data if data else {}
    url = __get_consul_url(query, consul_address)
    body = json.dumps(payload)
    return requests.put(url, data=body, timeout=CONSUL_TIMEOUT_IN_SECONDS)
def _create_response_with_error(error_msg=None):
    """Return a JSON error response; a falsy message becomes ''."""
    response = {'status': 'error'}
    response['error'] = error_msg if error_msg else ''
    return json.dumps(response)
def consul_put(query, data, consul_address=None):
    """Send a PUT request to Consul with ``data`` JSON-encoded as the body.

    Returns the ``requests`` response; the call is bounded by
    CONSUL_TIMEOUT_IN_SECONDS.
    """
    url = __get_consul_url(query, consul_address)
    body = json.dumps(data)
    return requests.put(url, data=body, timeout=CONSUL_TIMEOUT_IN_SECONDS)