Example #1
0
    def POST(self):
        """Join this ship to the armada that the posted ``host`` belongs to.

        Reads the ``host`` POST parameter, asks the Consul agent at
        ``<host>:8500`` for its datacenter, overrides the runtime settings with
        the new ship IPs and datacenter, and restarts Consul. When the restart
        succeeds, the ``hermes_init`` supervisor process group is started too.

        Returns:
            ``self.status_ok()`` on success, otherwise a ``self.status_error(...)``
            response describing the failure (bad parameter, armada larger than
            one ship, unreachable remote host or restart timeout).
        """
        consul_host, error = self.get_post_parameter('host')
        if error:
            return self.status_error(error)

        armada_size = _get_armada_size()
        if armada_size > 1:
            return self.status_error('Currently only single ship armadas can join the others. '
                                     'Your armada has size: {0}.'.format(armada_size))

        try:
            agent_self_dict = consul_query('agent/self', consul_address='{0}:8500'.format(consul_host))
            datacenter = agent_self_dict['Config']['Datacenter']
        except Exception as e:
            # Catch Exception instead of using a bare ``except`` so SystemExit and
            # KeyboardInterrupt are not swallowed, and keep the cause in the log.
            get_logger().exception(e)
            return self.status_error('Could not read remote host datacenter address.')

        current_consul_mode = _get_current_consul_mode()
        if current_consul_mode == consul_config.ConsulMode.BOOTSTRAP:
            # A bootstrap ship becomes a client of the armada it joins.
            override_runtime_settings(consul_mode=consul_config.ConsulMode.CLIENT,
                                      ship_ips=[consul_host],
                                      datacenter=datacenter)
        else:
            override_runtime_settings(ship_ips=[consul_host] + get_other_ship_ips(),
                                      datacenter=datacenter)

        if _restart_consul():
            supervisor_server = xmlrpclib.Server('http://localhost:9001/RPC2')
            hermes_init_output = supervisor_server.supervisor.startProcessGroup('hermes_init')
            get_logger().info('hermes_init start: {}'.format(hermes_init_output))
            return self.status_ok()
        return self.status_error('Waiting for armada restart timed out.')
def _load_from_list(saved_containers, ship):
    """Store saved containers that are not running locally in the kv store as 'crashed'.

    Waits for Consul, takes the multiset difference between ``saved_containers``
    and the locally running containers, and saves each remaining parameter set
    under a freshly generated id for ``ship``.
    """
    wait_for_consul_ready()
    currently_running = _get_local_running_containers()
    for missing_parameters in _multiset_difference(saved_containers, currently_running):
        get_logger().info('Added service: {}'.format(missing_parameters))
        kv.save_container(ship, _generate_id(), 'crashed', params=missing_parameters)
def recover_containers_from_kv_store():
    """Try to recover every crashed service recorded in the kv store.

    Each crashed service is first marked 'recovering'. Recovery is then retried
    up to ``RECOVERY_RETRY_LIMIT`` rounds, sleeping
    ``DELAY_BETWEEN_RECOVER_RETRY_SECONDS`` after every round. Recovered
    services are removed from the kv store; the ones still failing at the end
    are marked 'not-recovered'.

    Returns:
        The list of service keys that could not be recovered.
    """
    pending_services = _get_crashed_services()
    for service_key in pending_services:
        kv.update_container_status('recovering', key=service_key)

    rounds_done = 0
    while pending_services and rounds_done < RECOVERY_RETRY_LIMIT:
        get_logger().info("Recovering containers: %s", json.dumps(pending_services))

        still_failing = []
        for service_key in pending_services:
            if _recover_container(kv.kv_get(service_key)['params']):
                kv.kv_remove(service_key)
            else:
                still_failing.append(service_key)

        sleep(DELAY_BETWEEN_RECOVER_RETRY_SECONDS)
        pending_services = still_failing
        rounds_done += 1

    for service_key in pending_services:
        kv.update_container_status('not-recovered', key=service_key)
    return pending_services
def main():
    """Save the parameters of locally running containers to a file and to the kv store.

    Does nothing (besides logging a warning) when recovery has not completed
    and ``--force`` style ``args.force`` is not set. Containers whose parameters
    cannot be read are counted as errors and skipped. The saved list is only
    overwritten when it would be non-empty or when no errors occurred.

    Exits the process with status 1 when an unexpected error occurs.
    """
    args = _parse_args()
    if not args.force and not _is_recovery_completed():
        get_logger().warning('Recovery is not completed. Aborting saving running containers.')
        return
    saved_containers_path = args.saved_containers_path
    try:
        wait_for_consul_ready()
        containers_ids = get_local_containers_ids()
        containers_parameters_list = []
        errors_count = 0
        for container_id in containers_ids:
            try:
                container_parameters = get_container_parameters(container_id)
                if container_parameters:
                    containers_parameters_list.append(container_parameters)
            except Exception:
                # Only real errors count as a failed container; a bare ``except``
                # would also turn Ctrl-C / SystemExit into "one more error".
                errors_count += 1
                get_logger().error('ERROR on getting container parameters for {}:'.format(container_id))
                traceback.print_exc()
        containers_parameters_list.sort()
        # Don't overwrite saved containers' list if it would become empty because of errors.
        if containers_parameters_list or not errors_count:
            _save_containers_parameters_list_in_file(containers_parameters_list, saved_containers_path)
            get_logger().info('Containers have been saved to {}.'.format(saved_containers_path))
            try:
                _save_containers_parameters_list_in_kv_store(containers_parameters_list)
                get_logger().info('Containers have been saved to kv store.')
            except Exception:
                # Saving to the kv store is best-effort; the file copy already exists.
                traceback.print_exc()
        else:
            get_logger().info('Aborted saving container because of errors.')
    except Exception:
        traceback.print_exc()
        sys.exit(1)
Example #5
0
    def _create_service(self, image_path=None, microservice_name=None, microservice_env=None, microservice_app_id=None,
                        dockyard_user=None, dockyard_password=None, ports=None, environment=None, volumes=None,
                        run_command=None, resource_limits=None, configs=None, **kwargs):
        """Prepare the environment and volumes for a service and create its container.

        Missing optional fields default to empty containers; the microservice
        name/env/app_id fall back to the matching ``MICROSERVICE_*`` entries of
        ``environment`` (and the name finally to the image name). The parameters
        needed to restart the container are stored, base64-encoded, in the
        container environment. Unrecognized keyword arguments are only logged.

        Returns:
            The value returned by ``self._create_container(...)``.

        Raises:
            ValueError: if ``image_path`` or ``run_command`` is empty.
        """
        # Required fields of the received JSON.
        if not image_path:
            raise ValueError('Field image_path cannot be empty.')
        if not run_command:
            raise ValueError('Field run_command cannot be empty.')

        if kwargs:
            unrecognized_keys = list(kwargs.keys())
            get_logger().warning('JSON data sent to API contains unrecognized keys: {}'.format(unrecognized_keys))

        # Defaults for the optional fields.
        if not environment:
            environment = {}
        if not ports:
            ports = {}
        if not volumes:
            volumes = {}
        if not resource_limits:
            resource_limits = {}
        if not configs:
            configs = []
        image_name = split_image_path(image_path)[1]
        microservice_name = microservice_name or environment.get('MICROSERVICE_NAME') or image_name
        microservice_env = microservice_env or environment.get('MICROSERVICE_ENV')
        microservice_app_id = microservice_app_id or environment.get('MICROSERVICE_APP_ID')

        # Snapshot of the run parameters, serialized before armada-specific
        # variables are added to the environment below.
        restart_parameters = dict(
            image_path=image_path,
            microservice_name=microservice_name,
            microservice_env=microservice_env,
            microservice_app_id=microservice_app_id,
            dockyard_user=dockyard_user,
            dockyard_password=dockyard_password,
            ports=ports,
            environment=environment,
            volumes=volumes,
            run_command=run_command,
            resource_limits=resource_limits,
            configs=configs,
        )
        serialized_restart_parameters = json.dumps(restart_parameters, sort_keys=True)
        environment['RESTART_CONTAINER_PARAMETERS'] = base64.b64encode(serialized_restart_parameters)
        environment['ARMADA_RUN_COMMAND'] = base64.b64encode(run_command)
        environment['MICROSERVICE_NAME'] = microservice_name
        if microservice_env:
            environment['MICROSERVICE_ENV'] = microservice_env
        if microservice_app_id:
            environment['MICROSERVICE_APP_ID'] = microservice_app_id

        config_path, hermes_volumes = process_hermes(microservice_name, image_name, microservice_env,
                                                     microservice_app_id, configs)
        if config_path:
            environment['CONFIG_PATH'] = config_path

        # The Docker socket is always mounted into the container.
        volumes[docker_client.DOCKER_SOCKET_PATH] = docker_client.DOCKER_SOCKET_PATH
        if hermes_volumes:
            volumes.update(hermes_volumes)

        return self._create_container(
            image_path, microservice_name, ports, environment, volumes,
            dockyard_user, dockyard_password, resource_limits)
def _check_if_we_should_recover(saved_containers_path):
    """Tell whether the Docker daemon started after the containers were last saved.

    Compares the ``DOCKER_START_TIMESTAMP`` environment variable with the
    modification time of ``saved_containers_path`` (both truncated to integers).

    Returns:
        True when the daemon start timestamp is newer than the file. False
        otherwise, including when the variable is unset or not an integer, or
        when the file's modification time cannot be read.
    """
    try:
        docker_start_timestamp = os.environ.get('DOCKER_START_TIMESTAMP')
        if docker_start_timestamp is None:
            # Handle the unset variable explicitly instead of relying on
            # int(None) raising inside a catch-all handler.
            return False
        if int(docker_start_timestamp) > int(os.path.getmtime(saved_containers_path)):
            get_logger().info('Docker daemon restart detected.')
            return True
        get_logger().info('No need to recover.')
        return False
    except Exception:
        # Best-effort check: any failure means "do not recover". Unlike a bare
        # ``except``, this lets SystemExit and KeyboardInterrupt propagate.
        return False
Example #7
0
def _get_services_list(filter_microservice_name, filter_env, filter_app_id, filter_local):
    """Collect service summaries from the per-ship container lists in the kv store.

    Args:
        filter_microservice_name: Optional fnmatch pattern matched against the
            name segment of keys shaped like ``ships/*/service/<name>/*``.
        filter_env: When not None, keep only services whose 'env' tag equals it.
        filter_app_id: When not None, keep only services whose 'app_id' tag equals it.
        filter_local: When truthy, look only at the current ship.

    Returns:
        A dict mapping each service's ``ServiceID`` to a summary dict with the
        keys name, status, address (always 'n/a'), microservice_id,
        container_id, tags and start_timestamp.
    """
    ship_names = [get_ship_name()] if filter_local else get_ship_names()
    if not ship_names:
        return {}

    all_services = {}
    for ship_name in ship_names:
        ship_containers = kv.kv_get('containers_parameters_list/{}'.format(ship_name))
        if ship_containers and isinstance(ship_containers, dict):
            all_services.update(ship_containers)
    if not all_services:
        return {}

    service_keys = all_services.keys()
    if filter_microservice_name:
        service_keys = fnmatch.filter(service_keys, 'ships/*/service/{}/*'.format(filter_microservice_name))

    summaries = {}
    for service_key in service_keys:
        entry = all_services[service_key]
        name = entry['ServiceName']
        status = entry['Status']
        service_id = entry['ServiceID']
        container_id = entry['container_id']
        start_timestamp = entry['start_timestamp']

        # Tags are optional: a missing key is logged and whatever was collected
        # before the failure is kept.
        tags = {}
        try:
            for tag_name, param_key in (('env', 'microservice_env'), ('app_id', 'microservice_app_id')):
                tag_value = entry['params'][param_key]
                if tag_value:
                    tags[tag_name] = tag_value
        except KeyError as e:
            get_logger().warning(repr(e))

        env_ok = filter_env is None or filter_env == tags.get('env')
        app_id_ok = filter_app_id is None or filter_app_id == tags.get('app_id')
        if not (env_ok and app_id_ok):
            continue

        summaries[service_id] = {
            'name': name,
            'status': status,
            'address': 'n/a',
            'microservice_id': service_id,
            'container_id': container_id,
            'tags': tags,
            'start_timestamp': start_timestamp,
        }
    return summaries
Example #8
0
def _wait_for_armada_start():
    """Poll the local armada health endpoint until it answers 'ok'.

    Polls ``http://localhost/health`` once per second for up to about 30
    seconds. Returns as soon as the response body is exactly 'ok'; if that
    never happens before the deadline, an error is logged.
    """
    timeout_expiration = time.time() + 30
    while time.time() < timeout_expiration:
        time.sleep(1)
        try:
            # Without a timeout a hanging connection would block forever and
            # defeat the overall deadline.
            health_status = requests.get('http://localhost/health', timeout=5).text
            if health_status == 'ok':
                return
        except Exception as e:
            # Armada not answering yet is expected while it starts; keep polling.
            get_logger().debug(e)
    get_logger().error('Could not connect to armada.')
def _load_containers_to_kv_store(saved_containers_path):
    """Load the saved containers file into the kv store for the current ship.

    The saved data may be a dict or a list; each shape is handled by its own
    loader. Failures are logged with a traceback and not propagated.
    """
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        saved_containers = _load_saved_containers_parameters_list(saved_containers_path)
        if isinstance(saved_containers, dict):
            _load_from_dict(saved_containers, ship)
        else:
            _load_from_list(saved_containers, ship)
    except Exception:
        # Best-effort load; ``except Exception`` keeps SystemExit and
        # KeyboardInterrupt from being swallowed as a bare ``except`` would.
        get_logger().exception('Unable to load from %s', saved_containers_path)
Example #10
0
def _recover_saved_containers_from_path(saved_containers_path):
    """Recover the crashed containers recorded in the kv store.

    ``saved_containers_path`` is only used in the error log message here; the
    containers to recover come from ``recover_containers_from_kv_store()``.

    Returns:
        True when nothing was left unrecovered, False when some containers were
        not recovered or recovery raised an exception.
    """
    wait_for_consul_ready()
    try:
        not_recovered = recover_containers_from_kv_store()
        if not_recovered:
            get_logger().error('Following containers were not recovered: %s', not_recovered)
            return False
        else:
            return True
    except Exception:
        # Narrower than a bare ``except``: SystemExit/KeyboardInterrupt propagate.
        get_logger().exception('Unable to recover from %s.', saved_containers_path)
    return False
def _recover_saved_containers_from_path(saved_containers_path):
    """Recover the crashed containers recorded in the kv store.

    ``saved_containers_path`` is only used in the error log message here; the
    containers to recover come from ``recover_containers_from_kv_store()``.

    Returns:
        True when nothing was left unrecovered, False when some containers were
        not recovered or recovery raised an exception.
    """
    wait_for_consul_ready()
    try:
        not_recovered = recover_containers_from_kv_store()
        if not_recovered:
            get_logger().error('Following containers were not recovered: {}'.format(not_recovered))
            return False
        else:
            return True
    except Exception:
        # Narrower than a bare ``except``: SystemExit/KeyboardInterrupt propagate.
        traceback.print_exc()
        get_logger().error('Unable to recover from {}.'.format(saved_containers_path))
    return False
def _recover_saved_containers_from_path(saved_containers_path):
    """Load the saved containers from ``saved_containers_path`` and recover them.

    Returns:
        True when nothing was left unrecovered, False when some containers were
        not recovered or loading/recovery raised an exception.
    """
    try:
        saved_containers = _load_saved_containers_parameters_list(saved_containers_path)
        not_recovered = recover_saved_containers(saved_containers)
        if not_recovered:
            get_logger().error('Following containers were not recovered: {}'.format(not_recovered))
            return False
        else:
            return True
    except Exception:
        # Narrower than a bare ``except``: SystemExit/KeyboardInterrupt propagate.
        traceback.print_exc()
        get_logger().error('Unable to recover from {}.'.format(saved_containers_path))
    return False
Example #13
0
def _fetch_hermes_from_couriers(courier_addresses):
    """Ask every courier to push its hermes data to this container.

    Posts this container's ssh address and ``HERMES_DIRECTORY`` as JSON to
    ``/update_hermes`` on each courier. A failing courier (HTTP error or a
    response body other than 'ok') is logged and the remaining couriers are
    still contacted.
    """
    own_ssh_address = get_container_ssh_address(socket.gethostname())
    for courier_address in courier_addresses:
        update_url = 'http://{courier_address}/update_hermes'.format(courier_address=courier_address)
        try:
            request_body = json.dumps({'ssh': own_ssh_address, 'path': HERMES_DIRECTORY})
            reply = requests.post(update_url, request_body)
            reply.raise_for_status()
            if reply.text.strip() != 'ok':
                raise Exception('Error response from courier:\n{}'.format(reply.text))
        except Exception as e:
            logger = get_logger()
            logger.error('Fetching all sources from courier %s failed:', courier_address)
            get_logger().exception(e)
Example #14
0
def recover_saved_containers_from_parameters(saved_containers):
    """Load the given saved containers into the kv store, then recover from it.

    ``saved_containers`` may be a dict or a list; each shape has its own
    loader. A failure while loading is logged and recovery from the kv store is
    attempted regardless.

    Returns:
        Whatever ``recover_containers_from_kv_store()`` returns.
    """
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        load = _load_from_dict if isinstance(saved_containers, dict) else _load_from_list
        load(saved_containers, ship)
    except Exception as e:
        get_logger().exception(e)

    return recover_containers_from_kv_store()
def _load_containers_to_kv_store(saved_containers_path):
    """Sync the kv store with running services and the saved containers file.

    Lists the keys already stored for the current ship, registers running
    services, then loads the saved containers (dict or list shaped) from
    ``saved_containers_path``. Failures are reported and not propagated.
    """
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        containers_saved_in_kv = kv.kv_list('ships/{}/service/'.format(ship))
        saved_containers = _load_saved_containers_parameters_list(saved_containers_path)
        _add_running_services_at_startup(containers_saved_in_kv, ship)
        if isinstance(saved_containers, dict):
            _load_from_dict(saved_containers, containers_saved_in_kv, ship)
        else:
            _load_from_list(saved_containers, ship)
    except Exception:
        # Best-effort load; ``except Exception`` keeps SystemExit and
        # KeyboardInterrupt from being swallowed as a bare ``except`` would.
        traceback.print_exc()
        get_logger().error('Unable to load from {}.'.format(saved_containers_path))
def main():
    """Dump the current ship's kv-store container entries to a file and the kv store.

    Does nothing (besides logging a warning) when recovery has not completed
    and ``args.force`` is not set. Every key under ``ships/<ship>/service/`` is
    read and collected into a dict; the dict is saved only when it is
    non-empty.

    Exits the process with status 1 when an unexpected error occurs.
    """
    args = _parse_args()
    if not args.force and not _is_recovery_completed():
        get_logger().warning('Recovery is not completed. Aborting saving running containers.')
        return
    saved_containers_path = args.saved_containers_path
    try:
        wait_for_consul_ready()
        ship = get_ship_name()
        saved_containers = kv.kv_list('ships/{}/service/'.format(ship))
        containers_parameters_dict = {}
        if saved_containers:
            for container in saved_containers:
                container_dict = kv.kv_get(container)
                containers_parameters_dict[container] = container_dict

        if containers_parameters_dict:
            _save_containers_parameters_list_in_file(containers_parameters_dict, saved_containers_path)
            get_logger().info('Containers have been saved to {}.'.format(saved_containers_path))
            try:
                _save_containers_parameters_list_in_kv_store(containers_parameters_dict)
                get_logger().info('Containers have been saved to kv store.')
            except Exception:
                # Saving to the kv store is best-effort; the file copy already exists.
                traceback.print_exc()
        else:
            # Reached when no container entries were found for this ship.
            get_logger().info('Aborted saving container because of errors.')
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit are not converted into exit status 1.
        traceback.print_exc()
        sys.exit(1)
Example #17
0
def _parse_single_ship(services_dict, filter_microservice_name, filter_env, filter_app_id):
    """Turn one ship's service entries into summary dicts, applying the filters.

    Args:
        services_dict: Mapping of service key to service description. An object
            without a ``keys`` method (for example None) gives an empty result.
        filter_microservice_name: Optional fnmatch pattern matched against the
            name segment of keys shaped like ``ships/*/service/<name>/*``.
        filter_env: When not None, keep only services whose 'env' tag equals it.
        filter_app_id: When not None, keep only services whose 'app_id' tag equals it.

    Returns:
        A dict mapping each service's ``ServiceID`` to a summary dict with the
        keys name, status, address (always 'n/a'), microservice_id,
        container_id, tags, start_timestamp and single_active_instance.
    """
    try:
        service_keys = services_dict.keys()
    except AttributeError:
        return {}
    if not service_keys:
        return {}

    if filter_microservice_name:
        service_keys = fnmatch.filter(service_keys, 'ships/*/service/{}/*'.format(filter_microservice_name))

    summaries = {}
    for service_key in service_keys:
        entry = services_dict[service_key]
        name = entry['ServiceName']
        status = entry['Status']
        service_id = entry['ServiceID']
        container_id = entry['container_id']
        start_timestamp = entry['start_timestamp']
        single_active_instance = entry.get('single_active_instance', False)

        # Tags are optional: a missing key is logged and whatever was collected
        # before the failure is kept.
        tags = {}
        try:
            for tag_name, param_key in (('env', 'microservice_env'), ('app_id', 'microservice_app_id')):
                tag_value = entry['params'][param_key]
                if tag_value:
                    tags[tag_name] = tag_value
        except KeyError as e:
            get_logger().warning(repr(e))

        env_ok = filter_env is None or filter_env == tags.get('env')
        app_id_ok = filter_app_id is None or filter_app_id == tags.get('app_id')
        if not (env_ok and app_id_ok):
            continue

        summaries[service_id] = {
            'name': name,
            'status': status,
            'address': 'n/a',
            'microservice_id': service_id,
            'container_id': container_id,
            'tags': tags,
            'start_timestamp': start_timestamp,
            'single_active_instance': single_active_instance,
        }

    return summaries
def recover_saved_containers(saved_containers):
    """Recover the saved containers that are not running locally.

    The containers to recover are the multiset difference between
    ``saved_containers`` and the locally running ones. Recovery is retried up
    to ``RECOVERY_RETRY_LIMIT`` rounds with a
    ``DELAY_BETWEEN_RECOVER_RETRY_SECONDS`` pause after each round; after every
    round, failed containers that turn out to be running are dropped.

    Returns:
        The container parameter sets that are still not recovered.
    """
    wait_for_consul_ready()
    pending = _multiset_difference(saved_containers, _get_local_running_containers())
    rounds_done = 0
    while pending and rounds_done < RECOVERY_RETRY_LIMIT:
        get_logger().info("Recovering containers: {}".format(json.dumps(pending)))
        failed = [parameters for parameters in pending if not _recover_container(parameters)]
        sleep(DELAY_BETWEEN_RECOVER_RETRY_SECONDS)
        pending = _multiset_difference(failed, _get_local_running_containers())
        rounds_done += 1
    return pending
Example #19
0
def main():
    """Write the Consul configuration file and replace this process with the Consul agent.

    The ship name falls back to the ship's external IP when the runtime
    settings do not provide one. ``os.execv`` does not return on success.
    """
    setup_sentry()
    consul_mode, ship_ips, datacenter, ship_name = _get_runtime_settings()
    external_ip = get_external_ip()
    ship_name = external_ip if ship_name is None else ship_name

    config_text = consul_config.get_consul_config(consul_mode, ship_ips, datacenter, external_ip, ship_name)
    with open(consul_config.CONFIG_PATH, 'w') as config_file:
        config_file.write(config_text)

    command = '/usr/local/bin/consul agent -config-file {config_path}'.format(config_path=consul_config.CONFIG_PATH)
    get_logger().info('RUNNING: %s', command)

    argv = command.split()
    os.execv(argv[0], argv)
Example #20
0
def _add_running_services_at_startup():
    """Register services known to the local Consul agent in the kv store as 'started'.

    After waiting for Consul (plus a fixed 10 second pause), every agent
    service is saved for the current ship unless its key is already in the kv
    store. The 'consul' and 'armada' services and ids containing ':' are
    skipped. Failures are logged and not propagated.
    """
    wait_for_consul_ready()
    try:
        ship = get_ship_name()
        containers_saved_in_kv = kv.kv_list('ships/{}/service/'.format(ship))
        sleep(10)
        all_services = consul_query('agent/services')
        # The agent does not necessarily list a 'consul' entry; ``del`` raised
        # KeyError in that case and aborted the whole registration.
        all_services.pop('consul', None)
        for service_id, service_dict in all_services.items():
            # NOTE(review): ids with ':' look like sub-service entries - confirm.
            if ':' in service_id:
                continue
            if service_dict['Service'] == 'armada':
                continue
            key = 'ships/{}/service/{}/{}'.format(ship, service_dict['Service'], service_id)
            if not containers_saved_in_kv or key not in containers_saved_in_kv:
                kv.save_container(ship, service_id, 'started')
                get_logger().info('Added running service: {}'.format(service_id))
    except Exception:
        # Narrower than a bare ``except``: SystemExit/KeyboardInterrupt propagate.
        get_logger().exception('Unable to add running services.')
Example #21
0
    def _login_to_dockyard(self, docker_api, dockyard_address, dockyard_user, dockyard_password):
        """Log in to the dockyard registry when both user and password are given.

        Several spellings of the registry endpoint are tried in turn (a
        workaround for abrupt changes in the docker-py library); the first
        successful login wins.

        Raises:
            The exception from the first attempted endpoint when every
            endpoint fails.
        """
        if not (dockyard_user and dockyard_password):
            return

        candidate_endpoints = (
            'https://{0}/v1/'.format(dockyard_address),
            'https://{0}'.format(dockyard_address),
            dockyard_address,
        )
        failures = []
        for endpoint in candidate_endpoints:
            try:
                docker_api.login(dockyard_user, dockyard_password, registry=endpoint)
            except Exception as e:
                get_logger().debug(e)
                failures.append(e)
            else:
                return

        raise failures[0]
def _recover_container(container_parameters):
    """Ask the armada API to run a container with the given parameters.

    Returns:
        True when the API response has status 'ok', False otherwise. The
        response is logged in both cases.
    """
    get_logger().info('Recovering: {}...\n'.format(json.dumps(container_parameters)))
    api_response = armada_api.post('run', container_parameters)
    if api_response.get('status') != 'ok':
        get_logger().error('Could not recover container: {}'.format(json.dumps(api_response)))
        return False
    get_logger().info('Recovered container: {}'.format(json.dumps(api_response)))
    return True
def recover_saved_containers(saved_containers):
    """Recover the saved containers that are not running locally, tracking progress in the kv store.

    The containers to recover are the multiset difference between
    ``saved_containers`` and the locally running ones. In every retry round
    (at most ``RECOVERY_RETRY_LIMIT``), each container is first saved in the kv
    store as 'recovering' under its microservice name and a running index that
    tells identical parameter sets apart. Recovered containers have their
    ``service/<name>/<index>`` key removed; on the last round, containers that
    still fail are saved as 'not-recovered'.

    Returns:
        The container parameter sets that are still not recovered.
    """
    wait_for_consul_ready()
    running_containers = _get_local_running_containers()
    containers_to_be_recovered = _multiset_difference(saved_containers, running_containers)
    recovery_retry_count = 0
    while containers_to_be_recovered and recovery_retry_count < RECOVERY_RETRY_LIMIT:
        get_logger().info("Recovering containers: {}".format(json.dumps(containers_to_be_recovered)))
        containers_not_recovered = []

        # Group identical parameter sets by their canonical JSON form and give
        # each duplicate its own index (0, 1, ...).
        counter_to_be_recovered = Counter(json.dumps(x, sort_keys=True) for x in containers_to_be_recovered)
        to_be_recovered = []
        for serialized_parameters, count in counter_to_be_recovered.items():
            name = json.loads(serialized_parameters)['microservice_name']
            for index in range(count):
                to_be_recovered.append((serialized_parameters, index))
                kv.save_service(name, index, 'recovering', json.loads(serialized_parameters))

        is_last_attempt = recovery_retry_count == (RECOVERY_RETRY_LIMIT - 1)
        for serialized_parameters, index in to_be_recovered:
            container_parameters = json.loads(serialized_parameters)
            name = container_parameters['microservice_name']
            if _recover_container(container_parameters):
                kv.kv_remove('service/{}/{}'.format(name, index))
            else:
                containers_not_recovered.append(container_parameters)
                if is_last_attempt:
                    # Pass the already decoded dict: decoding it a second time
                    # with json.loads() raised TypeError.
                    kv.save_service(name, index, 'not-recovered', container_parameters)
        sleep(DELAY_BETWEEN_RECOVER_RETRY_SECONDS)
        running_containers = _get_local_running_containers()
        containers_to_be_recovered = _multiset_difference(containers_not_recovered, running_containers)
        recovery_retry_count += 1

    return containers_to_be_recovered
Example #24
0
    def _stop_service(self, container_id):
        """Stop the container, deregister its services and drop its kv-store key.

        When the container is not running, only the kv-store key (if one
        matches ``*/<container_id>`` for the current ship) is removed and the
        services are deregistered. Otherwise the in-container agents are
        stopped, the services are deregistered and ``docker stop`` is attempted
        up to three times.

        Raises:
            The last exception raised by ``docker_api.stop`` when the container
            is still running after all attempts, or RuntimeError when no
            attempt raised but the container kept running.
        """
        ship = get_ship_name()
        service_list = kv_list("ships/{}/service/".format(ship))
        try:
            key = fnmatch.filter(service_list, "*/{}".format(container_id))[0]
        except (IndexError, TypeError):
            # No matching key, or kv_list() returned a non-iterable (e.g. None).
            key = None

        if not is_container_running(container_id):
            if key:
                kv_remove(key)
            try:
                deregister_services(container_id)
            except Exception as e:
                get_logger().exception(e)
        else:
            run_command_in_container("supervisorctl stop armada_agent", container_id)

            # TODO: Compatibility with old microservice images. Should be removed in future armada version.
            run_command_in_container("supervisorctl stop register_in_service_discovery", container_id)

            docker_api = docker_client.api()
            last_exception = None
            try:
                deregister_services(container_id)
            except Exception as e:
                get_logger().exception(e)
            for _ in range(3):
                try:
                    docker_api.stop(container_id)
                except Exception as e:
                    get_logger().debug(e, exc_info=True)
                    last_exception = e
                if not is_container_running(container_id):
                    if key:
                        kv_remove(key)
                    break
            if is_container_running(container_id):
                get_logger().error("Could not stop container: %s", container_id)
                if last_exception is None:
                    # stop() never raised, so there is nothing to re-raise;
                    # ``raise None`` would fail with an unrelated TypeError.
                    raise RuntimeError("Could not stop container: {}".format(container_id))
                raise last_exception
Example #25
0
def _clean_up_kv_store():
    """Remove kv-store keys that belong to containers armada no longer lists.

    Runs only once the module-level ``next_kv_clean_up_timestamp`` has been
    reached, and schedules the next run before doing any work. Keys under
    ``start_timestamp/`` and ``single_active_instance/`` are removed when the
    container id in their last path segment is not among the ids returned by
    the armada 'list' API.
    """
    global next_kv_clean_up_timestamp
    if time.time() < next_kv_clean_up_timestamp:
        return
    get_logger().info('Cleaning up kv-store:')
    next_kv_clean_up_timestamp = get_next_kv_clean_up_timestamp()

    valid_container_ids = {service.get('container_id') for service in armada_api.get_json('list')}

    # (key prefix, whether the last key segment carries a ':<suffix>' after the id)
    key_families = (
        ('start_timestamp/', False),
        ('single_active_instance/', True),
    )
    for prefix, id_has_suffix in key_families:
        for key in kv.kv_list(prefix) or []:
            container_id = key.rsplit('/', 1)[-1]
            if id_has_suffix:
                container_id = container_id.partition(':')[0]
            if container_id in valid_container_ids:
                continue
            get_logger().info('Removing key: {}'.format(key))
            kv.kv_remove(key)
    get_logger().info('Finished cleaning up kv-store.')
Example #26
0
def _get_courier_addresses():
    """Discover courier addresses, polling once per second for up to about 30 seconds.

    Polling stops as soon as discovery returns a non-empty result. If the most
    recent discovery attempt raised, the error is logged; if discovery worked
    but found nothing, that is logged as information.

    Returns:
        The result of the last successful discovery, or an empty set when no
        attempt succeeded.
    """
    found_addresses = set()
    discovery_error = None

    deadline = time.time() + 30
    while time.time() < deadline:
        time.sleep(1)
        try:
            found_addresses = _consul_discover('courier')
        except Exception as e:
            discovery_error = e
            continue
        discovery_error = None
        if found_addresses:
            break

    if discovery_error is not None:
        get_logger().error('Could not determine if courier is running:')
        get_logger().exception(discovery_error)
    elif not found_addresses:
        get_logger().info('No running couriers found.')
    return found_addresses
Example #27
0
 def status_exception(self, message, exception):
     """Log ``exception`` and build an error response naming its type and text.

     Sets the response Content-Type header to application/json and returns the
     result of ``_create_response_with_error``.
     """
     get_logger().exception(exception)
     exception_name = type(exception).__name__
     exception_text = str(exception)
     error_msg = "API exception: {0}. {1} - {2}".format(message, exception_name, exception_text)
     web.header('Content-Type', 'application/json')
     return _create_response_with_error(error_msg)
Example #28
0
 def status_error(self, message=None):
     """Log ``message`` as an API error and build the matching error response.

     Sets the response Content-Type header to application/json and returns the
     result of ``_create_response_with_error``.
     """
     get_logger().error('API error: %s', message)
     web.header('Content-Type', 'application/json')
     error_response = _create_response_with_error(message)
     return error_response