def __init__(self, kernel_manager, proxy_config):
        """Initialize the proxy and build its YARN ResourceManager client.

        :param kernel_manager: kernel manager owning this proxy; its
            ``parent.parent`` object supplies the fallback endpoint settings and
            its ``shutdown_wait_time`` may be raised here.
        :param proxy_config: mapping whose 'yarn_endpoint' and
            'yarn_endpoint_security_enabled' entries, when present, take
            precedence over the fallback settings.
        """
        super(YarnClusterProcessProxy, self).__init__(kernel_manager, proxy_config)
        self.application_id = None

        fallback_settings = kernel_manager.parent.parent
        self.yarn_endpoint = proxy_config.get(
            'yarn_endpoint', fallback_settings.yarn_endpoint)
        self.yarn_endpoint_security_enabled = proxy_config.get(
            'yarn_endpoint_security_enabled',
            fallback_settings.yarn_endpoint_security_enabled)

        parsed_endpoint = urlparse(self.yarn_endpoint)
        rm_options = {
            'address': parsed_endpoint.hostname,
            'port': parsed_endpoint.port,
        }
        # Kerberos is requested only for the literal boolean True; any other
        # value (including truthy non-booleans) leaves the option unset.
        if self.yarn_endpoint_security_enabled is True:
            rm_options['kerberos_enabled'] = self.yarn_endpoint_security_enabled
        self.resource_mgr = ResourceManager(**rm_options)

        # YARN applications tend to need longer to shut down than the kernel
        # manager's current wait time allows, so raise it to the YARN-specific
        # value (yarn_shutdown_wait_time, defined outside this block).  A wait
        # time that is already long enough is left untouched.
        if kernel_manager.shutdown_wait_time < yarn_shutdown_wait_time:
            kernel_manager.shutdown_wait_time = yarn_shutdown_wait_time
            adjusted_message = "{class_name} shutdown wait time adjusted to {wait_time} seconds.".format(
                class_name=type(self).__name__,
                wait_time=kernel_manager.shutdown_wait_time)
            self.log.debug(adjusted_message)
    def _initialize_resource_manager(self, **kwargs):
        """Create the YARN ResourceManager client used for this kernel's lifecycle.

        Stores the client on ``self.resource_mgr`` and the endpoint it reports as
        active on ``self.rm_addr``.

        :param kwargs: forwarded to ``KernelSessionManager.get_kernel_username`` when
            SimpleAuth is used.
        """
        # The alternate endpoint is only considered when a primary one is set.
        if self.yarn_endpoint:
            endpoints = [self.yarn_endpoint]
            if self.alt_yarn_endpoint:
                endpoints.append(self.alt_yarn_endpoint)
        else:
            endpoints = None

        if self.yarn_endpoint_security_enabled:
            # Imported lazily so the dependency is only needed when security is on.
            from requests_kerberos import HTTPKerberosAuth
            auth = HTTPKerberosAuth()
        else:
            # When the installed yarn-api-client provides SimpleAuth, use it so that
            # requests against the YARN API carry the kernel's username (useful when
            # anonymous access is not allowed).  Older versions lack the module, in
            # which case no auth object is passed.
            try:
                from yarn_api_client.auth import SimpleAuth
                kernel_username = KernelSessionManager.get_kernel_username(**kwargs)
                auth = SimpleAuth(kernel_username)
                self.log.debug(
                    f"Using SimpleAuth with '{kernel_username}' against endpoints: {endpoints}"
                )
            except ImportError:
                auth = None

        # cert_path is defined outside this block and passed through as ``verify``.
        self.resource_mgr = ResourceManager(
            service_endpoints=endpoints, auth=auth, verify=cert_path)
        self.rm_addr = self.resource_mgr.get_active_endpoint()
Esempio n. 3
0
    def __init__(self, **kwargs):
        """Initialize YARN bookkeeping and connect to the ResourceManager.

        :param kwargs: passed unchanged to the parent initializer.
        """
        super().__init__(**kwargs)

        # Nothing is known about the YARN application until it has been submitted.
        self.application_id = None
        self.last_known_state = None
        self.candidate_queue = None
        self.candidate_partition = None

        # The alternate endpoint is only considered when a primary one is set.
        if self.yarn_endpoint:
            endpoints = [self.yarn_endpoint]
            if self.alt_yarn_endpoint:
                endpoints.append(self.alt_yarn_endpoint)
        else:
            endpoints = None

        if self.yarn_endpoint_security_enabled:
            # Imported lazily so the dependency is only needed when security is on.
            from requests_kerberos import HTTPKerberosAuth
            auth = HTTPKerberosAuth()
        else:
            auth = None

        # cert_path is defined outside this block and passed through as ``verify``.
        self.resource_mgr = ResourceManager(
            service_endpoints=endpoints, auth=auth, verify=cert_path)
        self.rm_addr = self.resource_mgr.get_active_endpoint()

        # When the YARN resource check is enabled and resources are not available
        # right away, 20% of the launch timeout is spent waiting and retrying at a
        # fixed interval before the launch is declared infeasible.
        self.yarn_resource_check_wait_time = 0.20 * self.launch_timeout
Esempio n. 4
0
 def __init__(self, kernel_manager, proxy_config):
     """Initialize the proxy and build a ResourceManager client for the YARN host.

     :param kernel_manager: kernel manager owning this proxy; its
         ``parent.parent.yarn_endpoint`` is the fallback endpoint.
     :param proxy_config: mapping whose 'yarn_endpoint' entry, when present,
         takes precedence over the fallback endpoint.
     """
     super(YarnClusterProcessProxy, self).__init__(kernel_manager, proxy_config)
     self.application_id = None
     self.yarn_endpoint = proxy_config.get(
         'yarn_endpoint',
         kernel_manager.parent.parent.yarn_endpoint)

     # Only the host portion of the endpoint URL is handed to the client.
     endpoint_host = urlparse(self.yarn_endpoint).hostname
     self.resource_mgr = ResourceManager(address=endpoint_host)
Esempio n. 5
0
    def __init__(self, kernel_manager, proxy_config):
        """Initialize proxy state and connect to the YARN ResourceManager.

        :param kernel_manager: kernel manager owning this proxy; supplies the
            fallback endpoint settings and has its ``shutdown_wait_time`` raised
            here when it is too short for YARN.
        :param proxy_config: mapping whose 'yarn_endpoint', 'alt_yarn_endpoint' and
            'yarn_endpoint_security_enabled' entries, when present, take precedence
            over the kernel manager's attributes of the same name.
        """
        super(YarnClusterProcessProxy, self).__init__(kernel_manager, proxy_config)

        # Nothing is known about the application or the local launcher process yet.
        self.application_id = None
        self.last_known_state = None
        self.candidate_queue = None
        self.candidate_partition = None
        self.local_proc = None
        self.pid = None
        self.ip = None

        # Each setting comes from proxy_config when present, otherwise from the
        # attribute of the same name on the kernel manager.
        for setting in ('yarn_endpoint', 'alt_yarn_endpoint', 'yarn_endpoint_security_enabled'):
            setattr(self, setting, proxy_config.get(setting, getattr(kernel_manager, setting)))

        # The alternate endpoint is only considered when a primary one is set.
        if self.yarn_endpoint:
            endpoints = [self.yarn_endpoint]
            if self.alt_yarn_endpoint:
                endpoints.append(self.alt_yarn_endpoint)
        else:
            endpoints = None

        if self.yarn_endpoint_security_enabled:
            # Imported lazily so the dependency is only needed when security is on.
            from requests_kerberos import HTTPKerberosAuth
            auth = HTTPKerberosAuth()
        else:
            auth = None

        # cert_path is defined outside this block and passed through as ``verify``.
        self.resource_mgr = ResourceManager(
            service_endpoints=endpoints, auth=auth, verify=cert_path)
        self.rm_addr = self.resource_mgr.get_active_endpoint()

        # YARN applications tend to need longer to shut down than the kernel
        # manager's current wait time allows, so raise it to the YARN-specific
        # value (yarn_shutdown_wait_time, defined outside this block).  A wait
        # time that is already long enough is left untouched.
        if kernel_manager.shutdown_wait_time < yarn_shutdown_wait_time:
            kernel_manager.shutdown_wait_time = yarn_shutdown_wait_time
            adjusted_message = "{class_name} shutdown wait time adjusted to {wait_time} seconds.".format(
                class_name=type(self).__name__,
                wait_time=kernel_manager.shutdown_wait_time)
            self.log.debug(adjusted_message)

        # When the YARN resource check is enabled and resources are not available
        # right away, 20% of the kernel launch timeout is spent waiting and retrying
        # at a fixed interval before the launch is declared infeasible.
        self.yarn_resource_check_wait_time = 0.20 * self.kernel_launch_timeout
    def setUpClass(self):
        """Prepare a ResourceManager client when YARN_ENDPOINT names a usable endpoint.

        ``self.configured`` ends up True only when the YARN_ENDPOINT environment
        variable is set and parses to a URL with both a hostname and a port; in that
        case ``self.resourceManager`` is created for that host and port.
        """
        self.configured = False

        yarn_endpoint = os.getenv('YARN_ENDPOINT')
        if not yarn_endpoint:
            return

        endpoint_uri = urlparse(yarn_endpoint)
        # Both parts are required; the port is only inspected when a hostname exists.
        if not (endpoint_uri.hostname and endpoint_uri.port):
            return

        self.configured = True
        self.resourceManager = ResourceManager(endpoint_uri.hostname,
                                               endpoint_uri.port)
Esempio n. 7
0
    def __init__(self, kernel_manager, proxy_config):
        """Initialize the proxy and build a ResourceManager client with optional failover.

        :param kernel_manager: kernel manager owning this proxy; its
            ``parent.parent`` object supplies the fallback endpoint settings and
            its ``shutdown_wait_time`` may be raised here.
        :param proxy_config: mapping whose 'yarn_endpoint', 'alt_yarn_endpoint' and
            'yarn_endpoint_security_enabled' entries, when present, take precedence
            over the fallback settings.
        """
        super(YarnClusterProcessProxy, self).__init__(kernel_manager, proxy_config)
        self.application_id = None
        self.rm_addr = None

        self.yarn_endpoint = proxy_config.get(
            'yarn_endpoint',
            kernel_manager.parent.parent.yarn_endpoint)
        self.alt_yarn_endpoint = proxy_config.get(
            'alt_yarn_endpoint',
            kernel_manager.parent.parent.alt_yarn_endpoint)
        self.yarn_endpoint_security_enabled = proxy_config.get(
            'yarn_endpoint_security_enabled',
            kernel_manager.parent.parent.yarn_endpoint_security_enabled)

        primary_host = primary_port = None
        alternate_host = alternate_port = None
        if self.yarn_endpoint:
            primary_url = urlparse(self.yarn_endpoint)
            primary_host = primary_url.hostname
            primary_port = primary_url.port
            # The alternate endpoint is only considered when a primary one is set.
            if self.alt_yarn_endpoint:
                alternate_url = urlparse(self.alt_yarn_endpoint)
                alternate_host = alternate_url.hostname
                alternate_port = alternate_url.port

        self.resource_mgr = ResourceManager(
            address=primary_host,
            port=primary_port,
            alt_address=alternate_host,
            alt_port=alternate_port,
            kerberos_enabled=self.yarn_endpoint_security_enabled)

        # Record which host:port the client reports as active.
        active_host, active_port = self.resource_mgr.get_active_host_port()
        self.rm_addr = active_host + ':' + str(active_port)

        # YARN applications tend to need longer to shut down than the kernel
        # manager's current wait time allows, so raise it to the YARN-specific
        # value (yarn_shutdown_wait_time, defined outside this block).  A wait
        # time that is already long enough is left untouched.
        if kernel_manager.shutdown_wait_time < yarn_shutdown_wait_time:
            kernel_manager.shutdown_wait_time = yarn_shutdown_wait_time
            adjusted_message = "{class_name} shutdown wait time adjusted to {wait_time} seconds.".format(
                class_name=type(self).__name__,
                wait_time=kernel_manager.shutdown_wait_time)
            self.log.debug(adjusted_message)

        # When the YARN resource check is enabled and resources are not available
        # right away, 20% of the kernel launch timeout is spent waiting and retrying
        # at a fixed interval before the launch is declared infeasible.
        self.yarn_resource_check_wait_time = 0.20 * self.kernel_launch_timeout
Esempio n. 8
0
    def __init__(self, kernel_manager, lifecycle_config):
        """Initialize the lifecycle manager and connect to the YARN ResourceManager.

        :param kernel_manager: kernel manager owning this lifecycle manager; its
            ``provider_config`` mapping supplies the fallback endpoint settings and
            its ``shutdown_wait_time`` may be raised here.
        :param lifecycle_config: mapping whose 'yarn_endpoint', 'alt_yarn_endpoint'
            and 'yarn_endpoint_security_enabled' entries, when present, take
            precedence over the provider configuration.
        """
        super(YarnKernelLifecycleManager, self).__init__(kernel_manager, lifecycle_config)
        self.application_id = None
        self.rm_addr = None

        # A value in lifecycle_config wins whenever its key is present (even when
        # that value is None); the provider-level value is consulted only when the
        # key is absent.  Configurations that want the local endpoint should
        # simply leave these settings out.
        provider_config = kernel_manager.provider_config
        self.yarn_endpoint = lifecycle_config.get(
            'yarn_endpoint', provider_config.get('yarn_endpoint'))
        self.alt_yarn_endpoint = lifecycle_config.get(
            'alt_yarn_endpoint', provider_config.get('alt_yarn_endpoint'))
        self.yarn_endpoint_security_enabled = lifecycle_config.get(
            'yarn_endpoint_security_enabled',
            provider_config.get('yarn_endpoint_security_enabled', False))

        # The alternate endpoint is only considered when a primary one is set.
        if self.yarn_endpoint:
            endpoints = [self.yarn_endpoint]
            if self.alt_yarn_endpoint:
                endpoints.append(self.alt_yarn_endpoint)
        else:
            endpoints = None

        if self.yarn_endpoint_security_enabled:
            # Imported lazily so the dependency is only needed when security is on.
            from requests_kerberos import HTTPKerberosAuth
            auth = HTTPKerberosAuth()
        else:
            auth = None

        self.resource_mgr = ResourceManager(service_endpoints=endpoints, auth=auth)
        self.rm_addr = self.resource_mgr.get_active_endpoint()

        # TODO - fix wait time - should just add member to k-m.
        # YARN applications tend to need longer to shut down than the kernel
        # manager's current wait time allows, so raise it to the YARN-specific
        # value (yarn_shutdown_wait_time, defined outside this block).  A wait
        # time that is already long enough is left untouched.
        if kernel_manager.shutdown_wait_time < yarn_shutdown_wait_time:
            kernel_manager.shutdown_wait_time = yarn_shutdown_wait_time
            adjusted_message = "{class_name} shutdown wait time adjusted to {wait_time} seconds.".format(
                class_name=type(self).__name__,
                wait_time=kernel_manager.shutdown_wait_time)
            self.log.debug(adjusted_message)
Esempio n. 9
0
 def __init__(self, kernel_manager, proxy_config):
     """Initialize the proxy and build a ResourceManager client for the YARN host.

     :param kernel_manager: kernel manager owning this proxy; its
         ``parent.parent`` object supplies the fallback endpoint settings.
     :param proxy_config: mapping whose 'yarn_endpoint' and
         'yarn_endpoint_security_enabled' entries, when present, take
         precedence over the fallback settings.
     """
     super(YarnClusterProcessProxy, self).__init__(kernel_manager, proxy_config)
     self.application_id = None

     fallback_settings = kernel_manager.parent.parent
     self.yarn_endpoint = proxy_config.get(
         'yarn_endpoint', fallback_settings.yarn_endpoint)
     self.yarn_endpoint_security_enabled = proxy_config.get(
         'yarn_endpoint_security_enabled',
         fallback_settings.yarn_endpoint_security_enabled)

     # Only the host portion of the endpoint URL is handed to the client.
     rm_options = {'address': urlparse(self.yarn_endpoint).hostname}
     # Kerberos is requested only for the literal boolean True.
     if self.yarn_endpoint_security_enabled is True:
         rm_options['kerberos_enabled'] = self.yarn_endpoint_security_enabled
     self.resource_mgr = ResourceManager(**rm_options)
Esempio n. 10
0
 def __init__(self, kernel_manager, proxy_config):
     """Initialize the proxy and build a ResourceManager client for the YARN host.

     :param kernel_manager: kernel manager owning this proxy; its
         ``parent.parent`` object supplies the fallback endpoint settings.
     :param proxy_config: mapping whose 'yarn_endpoint' and
         'yarn_endpoint_security_enabled' entries, when present, take
         precedence over the fallback settings.
     """
     super(YarnClusterProcessProxy, self).__init__(kernel_manager, proxy_config)
     self.application_id = None

     fallback_settings = kernel_manager.parent.parent
     self.yarn_endpoint = proxy_config.get(
         'yarn_endpoint', fallback_settings.yarn_endpoint)
     self.yarn_endpoint_security_enabled = proxy_config.get(
         'yarn_endpoint_security_enabled',
         fallback_settings.yarn_endpoint_security_enabled)

     # Only the host portion of the endpoint URL is handed to the client.
     rm_options = {'address': urlparse(self.yarn_endpoint).hostname}
     # Kerberos is requested only for the literal boolean True.
     if self.yarn_endpoint_security_enabled is True:
         rm_options['kerberos_enabled'] = self.yarn_endpoint_security_enabled
     self.resource_mgr = ResourceManager(**rm_options)
Esempio n. 11
0
    def __init__(self, kernel_manager, proxy_config):
        """Initialize the proxy and build its YARN ResourceManager client.

        :param kernel_manager: kernel manager owning this proxy; its
            ``parent.parent`` object supplies the fallback endpoint settings and
            its ``shutdown_wait_time`` may be raised here.
        :param proxy_config: mapping whose 'yarn_endpoint' and
            'yarn_endpoint_security_enabled' entries, when present, take
            precedence over the fallback settings.
        """
        super(YarnClusterProcessProxy, self).__init__(kernel_manager, proxy_config)
        self.application_id = None
        self.rm_addr = None

        fallback_settings = kernel_manager.parent.parent
        self.yarn_endpoint = proxy_config.get(
            'yarn_endpoint', fallback_settings.yarn_endpoint)
        self.yarn_endpoint_security_enabled = proxy_config.get(
            'yarn_endpoint_security_enabled',
            fallback_settings.yarn_endpoint_security_enabled)

        # With no endpoint configured, host and port stay None and are passed
        # to the client as such.
        yarn_master = None
        yarn_port = None
        if self.yarn_endpoint:
            parsed_endpoint = urlparse(self.yarn_endpoint)
            yarn_master, yarn_port = parsed_endpoint.hostname, parsed_endpoint.port

        self.resource_mgr = ResourceManager(
            address=yarn_master,
            port=yarn_port,
            kerberos_enabled=self.yarn_endpoint_security_enabled)

        # Temporary until yarn-api-client can report host/port itself when no
        # explicit master is given.
        if yarn_master is not None:
            self.rm_addr = yarn_master + ':' + str(yarn_port)
        else:
            self.rm_addr = '(see yarn-site.xml)'

        # YARN applications tend to need longer to shut down than the kernel
        # manager's current wait time allows, so raise it to the YARN-specific
        # value (yarn_shutdown_wait_time, defined outside this block).  A wait
        # time that is already long enough is left untouched.
        if kernel_manager.shutdown_wait_time < yarn_shutdown_wait_time:
            kernel_manager.shutdown_wait_time = yarn_shutdown_wait_time
            adjusted_message = "{class_name} shutdown wait time adjusted to {wait_time} seconds.".format(
                class_name=type(self).__name__,
                wait_time=kernel_manager.shutdown_wait_time)
            self.log.debug(adjusted_message)
Esempio n. 12
0
def getAppElapsedTime():
    """Return ``(application id, elapsed time)`` pairs for the YARN applications.

    The applications are fetched through ``getYarnApplicationsData`` using a
    ResourceManager client for localhost:8088.  An empty list is returned when no
    data or no applications come back; a missing key stops processing and whatever
    was collected up to that point is returned.
    """
    elapsed_by_app = []
    resource_manager = ResourceManager(address='localhost', port=8088)
    payload = getYarnApplicationsData(resource_manager)
    if not payload:
        return elapsed_by_app

    try:
        app_section = payload['apps']
        if app_section:
            for entry in app_section['app']:
                elapsed_by_app.append((entry['id'], entry['elapsedTime']))
    except KeyError:
        # Best effort: an unexpected payload shape just ends the collection.
        pass

    return elapsed_by_app
Esempio n. 13
0
def getYarnApps():
    """Return a ``YarnApp`` record for each YARN application.

    The applications are fetched through ``getYarnApplicationsData`` using a
    ResourceManager client for localhost:8088.  Each record holds the id, name,
    the part of the tracking URL after its second ':' separator, the state and the
    elapsed time.  An empty list is returned when no data or no applications come
    back; a missing key stops processing and whatever was collected up to that
    point is returned.
    """
    collected = []
    resource_manager = ResourceManager(address='localhost', port=8088)
    payload = getYarnApplicationsData(resource_manager)
    if not payload:
        return collected

    try:
        app_section = payload['apps']
        if app_section:
            for entry in app_section['app']:
                tracking_url = entry['trackingUrl']
                # Third ':'-separated piece, i.e. what follows "scheme://host:".
                port_path = tracking_url.split(':')[2]
                record = YarnApp._make((entry['id'], entry['name'], port_path,
                                        entry['state'], entry['elapsedTime']))
                collected.append(record)
    except KeyError:
        # Best effort: an unexpected payload shape just ends the collection.
        pass

    return collected
Esempio n. 14
0
class YarnClusterProcessProxy(RemoteProcessProxy):
    """Process proxy that manages a kernel running as a YARN cluster application.

    The kernel is started through a local launcher process; afterwards its lifecycle
    (polling, killing, startup confirmation) is driven through the YARN
    ResourceManager REST client held in ``self.resource_mgr``.  The kernel ID is used
    as (part of) the YARN application name to locate the application.
    """
    # States in which the application is treated as alive.
    initial_states = {'NEW', 'SUBMITTED', 'ACCEPTED', 'RUNNING'}
    # States in which the application is treated as gone.
    final_states = {'FINISHED', 'KILLED'}  # Don't include FAILED state

    def __init__(self, kernel_manager, proxy_config):
        """Initialize the proxy and build its ResourceManager client.

        :param kernel_manager: kernel manager owning this proxy; its
            ``parent.parent`` object supplies the fallback endpoint settings.
        :param proxy_config: mapping whose 'yarn_endpoint' and
            'yarn_endpoint_security_enabled' entries, when present, take
            precedence over the fallback settings.
        """
        super(YarnClusterProcessProxy, self).__init__(kernel_manager, proxy_config)
        self.application_id = None
        self.yarn_endpoint \
            = proxy_config.get('yarn_endpoint',
                               kernel_manager.parent.parent.yarn_endpoint)
        self.yarn_endpoint_security_enabled \
            = proxy_config.get('yarn_endpoint_security_enabled',
                               kernel_manager.parent.parent.yarn_endpoint_security_enabled)
        # Only the host portion of the endpoint URL is handed to the client.
        yarn_master = urlparse(self.yarn_endpoint).hostname
        # Kerberos is requested only for the literal boolean True.
        if self.yarn_endpoint_security_enabled is True:
            self.resource_mgr = ResourceManager(address=yarn_master,
                                                kerberos_enabled=self.yarn_endpoint_security_enabled)
        else:
            self.resource_mgr = ResourceManager(address=yarn_master)

    def launch_process(self, kernel_cmd, **kw):
        """Launch the kernel through its local launcher and wait for the YARN app to start.

        :param kernel_cmd: command used to launch the kernel locally.
        :param kw: keyword arguments forwarded to the parent implementation,
            ``launch_kernel`` and ``confirm_remote_startup``.
        :return: this proxy instance.
        """
        super(YarnClusterProcessProxy, self).launch_process(kernel_cmd, **kw)

        # launch the local run.sh - which is configured for yarn-cluster...
        self.local_proc = launch_kernel(kernel_cmd, **kw)
        self.pid = self.local_proc.pid
        self.ip = local_ip

        self.log.debug("Yarn cluster kernel launched using YARN endpoint: {}, pid: {}, Kernel ID: {}, cmd: '{}'"
                       .format(self.yarn_endpoint, self.local_proc.pid, self.kernel_id, kernel_cmd))
        self.confirm_remote_startup(kernel_cmd, **kw)

        return self

    def poll(self):
        """Report whether the YARN application is still considered alive.

        A freshly submitted application takes a while to be accepted, so every state
        in ``initial_states`` (not just RUNNING) counts as alive.

        :return: None if the application ID is available and its state is one of
            ``initial_states``; False otherwise (including when the state query fails).
        """
        result = False

        if self.get_application_id():
            state = self.query_app_state_by_id(self.application_id)
            if state in YarnClusterProcessProxy.initial_states:
                result = None

        # Logging here would produce output on every poll, so nothing is logged.
        return result

    def send_signal(self, signum):
        """Dispatch a signal: 0 polls, SIGKILL kills, anything else goes to the parent.

        :param signum: signal number to deliver.
        :return: the result of ``poll()``, ``kill()`` or the parent's ``send_signal``.
        """
        self.log.debug("YarnClusterProcessProxy.send_signal {}".format(signum))
        if signum == 0:
            return self.poll()
        elif signum == signal.SIGKILL:
            return self.kill()
        else:
            # Yarn api doesn't support the equivalent to interrupts, so take our chances
            # via a remote signal.  Note that this condition cannot check against the
            # signum value because alternate interrupt signals might be in play.
            return super(YarnClusterProcessProxy, self).send_signal(signum)

    def kill(self):
        """Kill the YARN application and wait for it to reach a final state.

        After requesting termination, the state is re-queried every ``poll_interval``
        seconds for at most ``max_poll_attempts`` additional attempts.  The parent's
        ``kill`` is always invoked afterwards.

        :return: None if the application existed and reached one of ``final_states``;
            False otherwise.
        """
        state = None
        result = False
        if self.get_application_id():
            resp = self.kill_app_by_id(self.application_id)
            self.log.debug(
                "YarnClusterProcessProxy.kill: kill_app_by_id({}) response: {}, confirming app state is not RUNNING"
                    .format(self.application_id, resp))

            i = 1
            state = self.query_app_state_by_id(self.application_id)
            while state not in YarnClusterProcessProxy.final_states and i <= max_poll_attempts:
                time.sleep(poll_interval)
                state = self.query_app_state_by_id(self.application_id)
                i = i + 1

            if state in YarnClusterProcessProxy.final_states:
                result = None

        super(YarnClusterProcessProxy, self).kill()

        self.log.debug("YarnClusterProcessProxy.kill, application ID: {}, kernel ID: {}, state: {}"
                       .format(self.application_id, self.kernel_id, state))
        return result

    def cleanup(self):
        """Reap the local launcher process, forget the application ID, then defer to the parent."""
        # we might have a defunct process (if using waitAppCompletion = false) - so poll, kill, wait when we have
        # a local_proc.
        if self.local_proc:
            self.log.debug("YarnClusterProcessProxy.cleanup: Clearing possible defunct process, pid={}...".
                           format(self.local_proc.pid))
            if super(YarnClusterProcessProxy, self).poll():
                super(YarnClusterProcessProxy, self).kill()
            super(YarnClusterProcessProxy, self).wait()
            self.local_proc = None

        # reset application id to force new query - handles kernel restarts/interrupts
        self.application_id = None

        # for cleanup, we should call the superclass last
        super(YarnClusterProcessProxy, self).cleanup()

    def confirm_remote_startup(self, kernel_cmd, **kw):
        """Block until the YARN application has started and connection info was received.

        Loops (sleeping via ``handle_timeout``, which also enforces the launch timeout)
        until ``receive_connection_info`` reports success.  If the application is found
        in a final state (FINISHED, KILLED) during startup, an error is raised through
        ``log_and_raise`` so callers do not believe they are talking to a valid kernel.

        :param kernel_cmd: the launch command (not used by this method).
        :param kw: additional launch keyword arguments (not used by this method).
        """
        self.start_time = RemoteProcessProxy.get_current_time()
        i = 0
        ready_to_connect = False  # we're ready to connect when we have a connection file to use
        while not ready_to_connect:
            i += 1
            self.handle_timeout()

            if self.get_application_id(True):
                # Once we have an application ID, start monitoring state, obtain assigned host and get connection info
                app_state = self.get_application_state()

                if app_state in YarnClusterProcessProxy.final_states:
                    error_message = "KernelID: '{}', ApplicationID: '{}' unexpectedly found in " \
                                    "state '{}' during kernel startup!".\
                        format(self.kernel_id, self.application_id, app_state)
                    self.log_and_raise(http_status_code=500, reason=error_message)

                self.log.debug("{}: State: '{}', Host: '{}', KernelID: '{}', ApplicationID: '{}'".
                               format(i, app_state, self.assigned_host, self.kernel_id, self.application_id))

                if self.assigned_host != '':
                    ready_to_connect = self.receive_connection_info()
            else:
                self.detect_launch_failure()

    def get_application_state(self):
        """Return the current application state and capture the assigned host once known.

        Uses the already-obtained ``self.application_id``.  The first time the
        application reports an ``amHostHttpAddress``, its host part is stored in
        ``self.assigned_host`` and resolved into ``self.assigned_ip``; afterwards the
        host is no longer updated.

        :return: the application's state string, or None if it could not be determined.
        """
        app_state = None
        app = self.query_app_by_id(self.application_id)

        if app:
            if app.get('state'):
                app_state = app.get('state')
            if self.assigned_host == '' and app.get('amHostHttpAddress'):
                self.assigned_host = app.get('amHostHttpAddress').split(':')[0]
                # Set the kernel manager ip to the actual host where the application landed.
                self.assigned_ip = socket.gethostbyname(self.assigned_host)
        return app_state

    def handle_timeout(self):
        """Sleep one poll interval, then abort the launch if the launch timeout has elapsed.

        On timeout the application is killed and an error is raised through
        ``log_and_raise``: 503 when an application exists but is not RUNNING,
        500 otherwise.
        """
        time.sleep(poll_interval)
        time_interval = RemoteProcessProxy.get_time_diff(self.start_time, RemoteProcessProxy.get_current_time())

        if time_interval > self.kernel_launch_timeout:
            reason = "Application ID is None. Failed to submit a new application to YARN within {} seconds.  " \
                     "Check Enterprise Gateway log for more information.". \
                format(self.kernel_launch_timeout)
            error_http_code = 500
            if self.get_application_id(True):
                if self.query_app_state_by_id(self.application_id) != "RUNNING":
                    reason = "YARN resources unavailable after {} seconds for app {}, launch timeout: {}!  "\
                        "Check YARN configuration.".format(time_interval, self.application_id, self.kernel_launch_timeout)
                    error_http_code = 503
                else:
                    reason = "App {} is RUNNING, but waited too long ({} secs) to get connection file.  " \
                        "Check YARN logs for more information.".format(self.application_id, self.kernel_launch_timeout)
            self.kill()
            timeout_message = "KernelID: '{}' launch timeout due to: {}".format(self.kernel_id, reason)
            self.log_and_raise(http_status_code=error_http_code, reason=timeout_message)

    def get_application_id(self, ignore_final_states=False):
        """Return the kernel's YARN application ID, looking it up by name if not yet known.

        :param ignore_final_states: when True, an application found in one of
            ``final_states`` is not adopted as this kernel's application.
        :return: the application ID, or None if it is not (yet) available.
        """
        if not self.application_id:
            app = self.query_app_by_name(self.kernel_id)
            state_condition = True
            if isinstance(app, dict) and ignore_final_states:
                state_condition = app.get('state') not in YarnClusterProcessProxy.final_states

            if isinstance(app, dict) and len(app.get('id', '')) > 0 and state_condition:
                self.application_id = app['id']
                time_interval = RemoteProcessProxy.get_time_diff(self.start_time, RemoteProcessProxy.get_current_time())
                self.log.info("ApplicationID: '{}' assigned for KernelID: '{}', state: {}, {} seconds after starting."
                              .format(app['id'], self.kernel_id, app.get('state'), time_interval))
            else:
                self.log.debug("ApplicationID not yet assigned for KernelID: '{}' - retrying...".format(self.kernel_id))
        return self.application_id

    def get_process_info(self):
        """Return the parent's process info extended with this kernel's application ID."""
        process_info = super(YarnClusterProcessProxy, self).get_process_info()
        process_info.update({'application_id': self.application_id})
        return process_info

    def load_process_info(self, process_info):
        """Restore state from ``process_info``, including the 'application_id' entry."""
        super(YarnClusterProcessProxy, self).load_process_info(process_info)
        self.application_id = process_info['application_id']

    def query_app_by_name(self, kernel_id):
        """Retrieve the application whose name contains ``kernel_id``.

        The query is restricted to applications started at or after ``self.start_time``.
        If a kernel restarts with the same kernel ID, several applications may match;
        the one with the greatest application ID is returned, assuming YARN assigns
        increasing IDs.  Query failures are logged and treated as "no application".

        :param kernel_id: the kernel ID used as the unique application name.
        :return: the JSON object (dict) of the matching application, or None.
        """
        top_most_app_id = ''
        target_app = None
        data = None
        try:
            data = self.resource_mgr.cluster_applications(started_time_begin=str(self.start_time)).data
        except socket.error as sock_err:
            if sock_err.errno == errno.ECONNREFUSED:
                self.log.warning("YARN end-point: '{}' refused the connection.  Is the resource manager running?".
                                 format(self.yarn_endpoint))
            else:
                self.log.warning("Query for kernel ID '{}' failed with exception: {} - '{}'.  Continuing...".
                                 format(kernel_id, type(sock_err), sock_err))
        except Exception as e:
            self.log.warning("Query for kernel ID '{}' failed with exception: {} - '{}'.  Continuing...".
                             format(kernel_id, type(e), e))

        if isinstance(data, dict) and isinstance(data.get("apps"), dict) and 'app' in data.get("apps"):
            for app in data['apps']['app']:
                # Treat a missing/null name or id as empty so the comparisons below
                # cannot fail on None (str/None ordering raises on Python 3).
                app_name = app.get('name') or ''
                app_id = app.get('id') or ''
                if kernel_id in app_name and app_id > top_most_app_id:
                    target_app = app
                    top_most_app_id = app_id
        return target_app

    def query_app_by_id(self, app_id):
        """Retrieve an application by application ID.

        :param app_id: the YARN application ID.
        :return: the JSON object (dict) of the application, or None if the query
            failed or the response has no 'app' entry.
        """
        data = None
        try:
            data = self.resource_mgr.cluster_application(application_id=app_id).data
        except Exception as e:
            self.log.warning("Query for application ID '{}' failed with exception: '{}'.  Continuing...".
                             format(app_id, e))
        if isinstance(data, dict) and 'app' in data:
            return data['app']
        return None

    def query_app_state_by_id(self, app_id):
        """Return the state of an application.

        :param app_id: the YARN application ID.
        :return: the application's state string, or None if the query failed or the
            response carried no state.
        """
        response = None
        try:
            response = self.resource_mgr.cluster_application_state(application_id=app_id)
        except Exception as e:
            self.log.warning("Query for application '{}' state failed with exception: '{}'.  Continuing...".
                             format(app_id, e))

        # The failure above is deliberately non-fatal, so do not dereference a
        # missing response; callers treat None as "not in any known state".
        data = getattr(response, 'data', None)
        if isinstance(data, dict):
            return data.get('state')
        return None

    def kill_app_by_id(self, app_id):
        """Kill an application. If the app's state is FINISHED or FAILED, it won't be changed to KILLED.

        :param app_id: the YARN application ID.
        :return: the response of the kill request, or None if the request failed.
        """

        response = None
        try:
            response = self.resource_mgr.cluster_application_kill(application_id=app_id)
        except Exception as e:
            self.log.warning("Termination of application '{}' failed with exception: '{}'.  Continuing...".
                             format(app_id, e))

        return response
Esempio n. 15
0
class YarnClusterProcessProxy(RemoteProcessProxy):
    initial_states = {'NEW', 'SUBMITTED', 'ACCEPTED', 'RUNNING'}
    final_states = {'FINISHED', 'KILLED'}  # Don't include FAILED state

    def __init__(self, kernel_manager):
        """Set up the proxy and the YARN resource manager client for this kernel.

        :param kernel_manager: the kernel manager owning this proxy; the YARN endpoint is read from
                               its parent's parent (``kernel_manager.parent.parent.yarn_endpoint``)
        """
        super(YarnClusterProcessProxy, self).__init__(kernel_manager)
        self.application_id = None
        self.yarn_endpoint = kernel_manager.parent.parent.yarn_endpoint  # from command line or env
        yarn_url = urlparse(self.yarn_endpoint)

        # Honor a port given in the endpoint so the client talks to the same address the endpoint
        # names; when the endpoint has no port the client's own default still applies.
        rm_args = {'address': yarn_url.hostname}
        if yarn_url.port:
            rm_args['port'] = yarn_url.port
        self.resource_mgr = ResourceManager(**rm_args)

    def launch_process(self, kernel_cmd, **kw):
        """Start the kernel's local launcher and wait until the YARN application is reachable.

        The base class launch step runs first, then the launch command (the local run.sh, configured
        for yarn-cluster) is started and its pid recorded.  Finally the method blocks in
        confirm_remote_startup(), which polls YARN until connection information is available or the
        launch times out with an HTTP error.

        :param kernel_cmd: the command used to launch the kernel
        :param kw: keyword arguments forwarded to the base class and to launch_kernel
        :return: this proxy instance
        """
        super(YarnClusterProcessProxy, self).launch_process(kernel_cmd, **kw)

        # The launcher itself runs locally; only the kernel lands in the cluster.
        self.local_proc = launch_kernel(kernel_cmd, **kw)
        self.pid = self.local_proc.pid
        self.ip = local_ip

        launch_summary = "Yarn cluster kernel launched using YARN endpoint: {}, pid: {}, Kernel ID: {}, cmd: '{}'"
        self.log.debug(launch_summary.format(self.yarn_endpoint, self.local_proc.pid, self.kernel_id, kernel_cmd))

        self.confirm_remote_startup(kernel_cmd, **kw)
        return self

    def poll(self):
        """Submitting a new kernel/app to YARN will take a while to be ACCEPTED.
        Thus application ID will probably not be available immediately for poll.
        So will regard the application as RUNNING when application ID still in ACCEPTED or SUBMITTED state.

        :return: None if the application's ID is available and state is ACCEPTED/SUBMITTED/RUNNING. Otherwise False. 
        """
        result = False

        if self.get_application_id():
            state = self.query_app_state_by_id(self.application_id)
            if state in YarnClusterProcessProxy.initial_states:
                result = None

        # The following produces too much output (every 3 seconds by default), so commented-out at this time.
        # self.log.debug("YarnProcessProxy.poll, application ID: {}, kernel ID: {}, state: {}".
        #               format(self.application_id, self.kernel_id, state))
        return result

    def send_signal(self, signum):
        """Dispatch a signal request for the kernel.

        Signal 0 is answered by poll(), SIGKILL by kill(); anything else is handed to the base class.

        :param signum: the signal number to deliver
        :return: the result of poll(), kill() or the base class send_signal()
        """
        self.log.debug("YarnClusterProcessProxy.send_signal {}".format(signum))

        if signum == 0:
            return self.poll()
        if signum == signal.SIGKILL:
            return self.kill()

        # YARN offers no interrupt equivalent, so fall back to a remote signal.  The value is not
        # compared against a specific interrupt signal because alternate interrupt signals may be used.
        return super(YarnClusterProcessProxy, self).send_signal(signum)

    def kill(self):
        """Terminate the kernel's YARN application and then run the base class kill.

        After requesting the kill, the application state is re-queried (sleeping poll_interval
        between attempts, up to max_poll_attempts) until it reaches one of final_states.

        :return: None if the application existed and reached a final state, False otherwise.
        """
        terminal_states = YarnClusterProcessProxy.final_states
        state = None
        result = False

        if self.get_application_id():
            kill_response = self.kill_app_by_id(self.application_id)
            self.log.debug(
                "YarnClusterProcessProxy.kill: kill_app_by_id({}) response: {}, confirming app state is not RUNNING"
                .format(self.application_id, kill_response))

            attempt = 1
            state = self.query_app_state_by_id(self.application_id)
            while state not in terminal_states and attempt <= max_poll_attempts:
                time.sleep(poll_interval)
                state = self.query_app_state_by_id(self.application_id)
                attempt += 1

            if state in terminal_states:
                result = None

        # The base class kill always runs, whatever YARN reported.
        super(YarnClusterProcessProxy, self).kill()

        summary = "YarnClusterProcessProxy.kill, application ID: {}, kernel ID: {}, state: {}"
        self.log.debug(summary.format(self.application_id, self.kernel_id, state))
        return result

    def cleanup(self):
        """Reap the local launcher process, forget the application ID and run base class cleanup."""
        base = super(YarnClusterProcessProxy, self)

        # The local launcher may linger as a defunct process (e.g. waitAppCompletion = false), so
        # poll it, kill it if the poll result is truthy, and wait on it.
        if self.local_proc:
            notice = "YarnClusterProcessProxy.cleanup: Clearing possible defunct process, pid={}..."
            self.log.debug(notice.format(self.local_proc.pid))
            if base.poll():
                base.kill()
            base.wait()
            self.local_proc = None

        # Dropping the application ID forces a fresh lookup - needed for kernel restarts/interrupts.
        self.application_id = None

        # The superclass cleanup must come last.
        base.cleanup()

    def confirm_remote_startup(self, kernel_cmd, **kw):
        """Block until the YARN application has started and connection info has been received.

        Each iteration first runs handle_timeout() (which sleeps and raises once the launch timeout is
        exceeded), then looks up the application ID, its state and its assigned host.  The loop ends
        when receive_connection_info() reports that connection information is available.

        :param kernel_cmd: the kernel launch command (not used by this method)
        :param kw: launch keyword arguments (not used by this method)
        :raises tornado.web.HTTPError: 500 if the application is found in a final state
            (FINISHED, KILLED) during startup - otherwise the rest of the gateway would believe it is
            talking to a valid kernel.
        """
        self.start_time = RemoteProcessProxy.get_current_time()
        i = 0
        ready_to_connect = False  # we're ready to connect when we have a connection file to use
        while not ready_to_connect:
            i += 1
            self.handle_timeout()

            if self.get_application_id(True):
                # Once we have an application ID, start monitoring state, obtain assigned host and get connection info
                app_state = self.get_application_state()

                if app_state in YarnClusterProcessProxy.final_states:
                    # Note the trailing space: the two literals are concatenated into one message.
                    raise tornado.web.HTTPError(500, "KernelID: '{}', ApplicationID: '{}' unexpectedly found in "
                                                     "state '{}' during kernel startup!".format(self.kernel_id,
                                                                                                self.application_id,
                                                                                                app_state))

                self.log.debug("{}: State: '{}', Host: '{}', KernelID: '{}', ApplicationID: '{}'".
                               format(i, app_state, self.assigned_host, self.kernel_id, self.application_id))

                if self.assigned_host != '':
                    ready_to_connect = self.receive_connection_info()

    def get_application_state(self):
        """Fetch the state of the already-identified application and capture its host once.

        The assigned host (and its resolved IP) is taken from the application's 'amHostHttpAddress'
        the first time it is reported; once set it is no longer updated.

        :return: the application's state string, or None if the application or its state is unavailable
        """
        app = self.query_app_by_id(self.application_id)
        if not app:
            return None

        app_state = app.get('state') or None

        if self.assigned_host == '':
            am_address = app.get('amHostHttpAddress')
            if am_address:
                self.assigned_host = am_address.split(':')[0]
                # Record the IP of the host where the application actually landed.
                self.assigned_ip = gethostbyname(self.assigned_host)
        return app_state

    def handle_timeout(self):
        """Sleep one poll interval, then abort the launch if kernel_launch_timeout has been exceeded.

        On timeout the kernel is killed, the reason is logged as an error, and an HTTP error is raised.

        :raises tornado.web.HTTPError: 503 when an application exists but is not RUNNING,
            500 when no application ID was obtained or the app is RUNNING without connection info.
        """
        time.sleep(poll_interval)
        elapsed = RemoteProcessProxy.get_time_diff(self.start_time, RemoteProcessProxy.get_current_time())
        if not elapsed > self.kernel_launch_timeout:
            return

        error_http_code = 500
        if not self.get_application_id(True):
            reason = "Application ID is None. Failed to submit a new application to YARN within {} seconds." \
                .format(self.kernel_launch_timeout)
        elif self.query_app_state_by_id(self.application_id) != "RUNNING":
            reason = "YARN resources unavailable after {} seconds for app {}, launch timeout: {}!" \
                .format(elapsed, self.application_id, self.kernel_launch_timeout)
            error_http_code = 503
        else:
            reason = "App {} is RUNNING, but waited too long ({} secs) to get connection file" \
                .format(self.application_id, self.kernel_launch_timeout)

        self.kill()
        timeout_message = "KernelID: '{}' launch timeout due to: {}".format(self.kernel_id, reason)
        self.log.error(timeout_message)
        raise tornado.web.HTTPError(error_http_code, timeout_message)

    def get_application_id(self, ignore_final_states=False):
        """Return the kernel's YARN application ID, discovering it by kernel ID when not yet known.

        :param ignore_final_states: when looking the application up from scratch, skip one that is
                                    already in a final state
        :return: the application ID, or None if it is not (yet) available
        """
        if self.application_id:
            return self.application_id

        app = self.query_app_by_name(self.kernel_id)
        usable = type(app) is dict and len(app.get('id', '')) > 0
        if usable and ignore_final_states:
            usable = app.get('state') not in YarnClusterProcessProxy.final_states

        if usable:
            self.application_id = app['id']
            elapsed = RemoteProcessProxy.get_time_diff(self.start_time, RemoteProcessProxy.get_current_time())
            assigned = "ApplicationID: '{}' assigned for KernelID: '{}', state: {}, {} seconds after starting."
            self.log.info(assigned.format(app['id'], self.kernel_id, app.get('state'), elapsed))
        else:
            self.log.debug("ApplicationID not yet assigned for KernelID: '{}' - retrying...".format(self.kernel_id))
        return self.application_id

    def get_process_info(self):
        """Return the base class process info extended with this kernel's 'application_id'."""
        info = super(YarnClusterProcessProxy, self).get_process_info()
        info.update(application_id=self.application_id)
        return info

    def load_process_info(self, process_info):
        """Restore state from process_info: base class fields first, then the 'application_id' entry.

        :param process_info: mapping previously produced by get_process_info()
        """
        super(YarnClusterProcessProxy, self).load_process_info(process_info)
        restored_app_id = process_info['application_id']
        self.application_id = restored_app_id

    def query_app_by_name(self, kernel_id):
        """Retrieve application by using kernel_id as the unique app name.

        The query is limited to applications started at or after self.start_time.  After submission
        it may take a while for YARN to accept the app and generate its application ID, so finding
        nothing is normal.  If a kernel restarts with the same kernel id, several applications match;
        the one with the highest application ID is returned, assuming YARN increments IDs.

        A failing query is logged as a warning and treated as "nothing found" so that polling loops
        can retry instead of aborting.

        :param kernel_id: as the unique app name for query
        :return: The JSON object of the matching application, or None.
        """
        try:
            data = self.resource_mgr.cluster_applications(started_time_begin=str(self.start_time)).data
        except Exception as e:
            self.log.warning("Query for kernel ID '{}' failed with exception: '{}'.  Continuing...".
                             format(kernel_id, e))
            return None

        apps = data.get('apps') if isinstance(data, dict) else None
        candidates = apps.get('app') if isinstance(apps, dict) else None

        target_app = None
        # Order IDs by (length, text): the sequence part is zero-padded only to four digits, so plain
        # string comparison would rank 'application_X_9999' above 'application_X_10000'.
        top_most_key = (0, '')
        for app in candidates or []:
            if kernel_id not in (app.get('name') or ''):
                continue
            app_id = app.get('id') or ''  # entries without an ID can never be selected
            key = (len(app_id), app_id)
            if key > top_most_key:
                target_app, top_most_key = app, key
        return target_app

    def query_app_by_id(self, app_id):
        """Retrieve an application by application ID.

        A failing query is logged as a warning and reported as None, so that the startup and poll
        loops that call this can retry rather than abort.

        :param app_id: the YARN application ID to look up
        :return: The JSON object of an application, or None if unavailable.
        """
        try:
            data = self.resource_mgr.cluster_application(application_id=app_id).data
        except Exception as e:
            self.log.warning("Query for application ID '{}' failed with exception: '{}'.  Continuing...".
                             format(app_id, e))
            return None

        if isinstance(data, dict) and 'app' in data:
            return data['app']
        return None

    def query_app_state_by_id(self, app_id):
        """Return the state of an application.

        The state is fetched with curl from '<yarn_endpoint>/apps/<app_id>/state'.

        :param app_id: the YARN application ID to query
        :return: the 'state' value of the JSON response, or None when curl cannot be run, produces no
                 output, or the output is not a JSON object
        """
        url = '%s/apps/%s/state' % (self.yarn_endpoint, app_id)
        cmd = ['curl', '-X', 'GET', url]
        try:
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
            output, _ = process.communicate()
            # Decode explicitly so the bytes from the pipe parse on every supported interpreter.
            payload = json.loads(output.decode('utf-8')) if output else None
        except (OSError, ValueError) as e:
            # OSError: curl could not be started; ValueError: output was not valid UTF-8/JSON
            # (e.g. an HTML error page).  Neither should crash the poll/kill loops.
            self.log.warning("Query for application '{}' state failed with exception: '{}'.  Continuing...".
                             format(app_id, e))
            return None
        return payload.get('state') if isinstance(payload, dict) else None

    def kill_app_by_id(self, app_id):
        """Kill an application. If the app's state is FINISHED or FAILED, it won't be changed to KILLED.

        The request is a curl PUT of '{"state": "KILLED"}' to '<yarn_endpoint>/apps/<app_id>/state'.
        TODO: extend the yarn_api_client to support cluster_application_kill with PUT, e.g.:
            YarnProcessProxy.resource_mgr.cluster_application_kill(application_id=app_id)

        :param app_id: the YARN application ID to terminate
        :return: The JSON response of killing the application, or None when curl cannot be run,
                 produces no output, or the output is not valid JSON.
        """
        header = "Content-Type: application/json"
        data = '{"state": "KILLED"}'
        url = '%s/apps/%s/state' % (self.yarn_endpoint, app_id)
        cmd = ['curl', '-X', 'PUT', '-H', header, '-d', data, url]
        try:
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
            output, _ = process.communicate()
            # Decode explicitly so the bytes from the pipe parse on every supported interpreter.
            return json.loads(output.decode('utf-8')) if output else None
        except (OSError, ValueError) as e:
            # A failed or unparsable kill response must not stop the caller from confirming the
            # application state and finishing its own cleanup.
            self.log.warning("Termination of application '{}' failed with exception: '{}'.  Continuing...".
                             format(app_id, e))
            return None
Esempio n. 16
0
 def __init__(self, kernel_manager):
     """Set up the proxy and the YARN resource manager client for this kernel.

     :param kernel_manager: the kernel manager owning this proxy; the YARN endpoint is read from
                            ``kernel_manager.parent.parent.yarn_endpoint``
     """
     super(YarnClusterProcessProxy, self).__init__(kernel_manager)
     self.application_id = None
     self.yarn_endpoint = kernel_manager.parent.parent.yarn_endpoint  # from command line or env
     yarn_url = urlparse(self.yarn_endpoint)

     # Honor a port given in the endpoint; without one the client's own default still applies.
     rm_args = {'address': yarn_url.hostname}
     if yarn_url.port:
         rm_args['port'] = yarn_url.port
     self.resource_mgr = ResourceManager(**rm_args)
 def setUp(self):
     """Create the ResourceManager instance used by the tests, addressed at 'localhost'."""
     resource_manager = ResourceManager('localhost')
     self.rm = resource_manager
Esempio n. 18
0
class YarnClusterProcessProxy(RemoteProcessProxy):
    """Kernel lifecycle management for YARN clusters."""
    initial_states = {'NEW', 'SUBMITTED', 'ACCEPTED', 'RUNNING'}
    final_states = {'FINISHED', 'KILLED'}  # Don't include FAILED state

    def __init__(self, kernel_manager, proxy_config):
        """Configure the proxy and its YARN resource manager client.

        Endpoint settings come from proxy_config, falling back to the attributes of
        kernel_manager.parent.parent.  The alternate endpoint is only considered when a primary
        endpoint is set.

        :param kernel_manager: the kernel manager owning this proxy
        :param proxy_config: per-kernel process proxy configuration (a mapping)
        """
        super(YarnClusterProcessProxy, self).__init__(kernel_manager, proxy_config)
        self.application_id = None
        self.rm_addr = None

        self.yarn_endpoint = proxy_config.get('yarn_endpoint',
                                              kernel_manager.parent.parent.yarn_endpoint)
        self.alt_yarn_endpoint = proxy_config.get('alt_yarn_endpoint',
                                                  kernel_manager.parent.parent.alt_yarn_endpoint)
        self.yarn_endpoint_security_enabled = proxy_config.get(
            'yarn_endpoint_security_enabled',
            kernel_manager.parent.parent.yarn_endpoint_security_enabled)

        rm_settings = dict(address=None, port=None, alt_address=None, alt_port=None)
        if self.yarn_endpoint:
            primary = urlparse(self.yarn_endpoint)
            rm_settings.update(address=primary.hostname, port=primary.port)
            # Only check alternate if "primary" is set.
            if self.alt_yarn_endpoint:
                alternate = urlparse(self.alt_yarn_endpoint)
                rm_settings.update(alt_address=alternate.hostname, alt_port=alternate.port)

        self.resource_mgr = ResourceManager(kerberos_enabled=self.yarn_endpoint_security_enabled,
                                            **rm_settings)

        active_host, active_port = self.resource_mgr.get_active_host_port()
        self.rm_addr = active_host + ':' + str(active_port)

        # YARN applications tend to need longer than the default shutdown wait time, so raise it to
        # yarn_shutdown_wait_time - but only when the current value is shorter than that.
        if kernel_manager.shutdown_wait_time < yarn_shutdown_wait_time:
            kernel_manager.shutdown_wait_time = yarn_shutdown_wait_time
            adjusted = "{class_name} shutdown wait time adjusted to {wait_time} seconds."
            self.log.debug(adjusted.format(class_name=type(self).__name__,
                                           wait_time=kernel_manager.shutdown_wait_time))

        # When the YARN resource check finds no capacity right away, up to 20% of
        # kernel_launch_timeout is spent retrying before the launch is declared infeasible.
        self.yarn_resource_check_wait_time = 0.20 * self.kernel_launch_timeout

    def launch_process(self, kernel_cmd, **kwargs):
        """Launches the specified process within a YARN cluster environment.

        The YARN queue availability check runs first; if it raises, the kernel launch is not
        attempted.  Otherwise the base class launch step runs, the local launch command is started,
        and the method blocks in confirm_remote_startup() until the kernel is reachable.

        :param kernel_cmd: the command used to launch the kernel
        :param kwargs: launch keyword arguments; 'env' is consulted by the availability check
        :return: this proxy instance
        """
        # checks to see if the queue resource is available
        # if not kernel startup is not tried
        self.confirm_yarn_queue_availability(**kwargs)
        super(YarnClusterProcessProxy, self).launch_process(kernel_cmd, **kwargs)

        # launch the local run.sh - which is configured for yarn-cluster...
        self.local_proc = launch_kernel(kernel_cmd, **kwargs)
        self.pid = self.local_proc.pid
        self.ip = local_ip

        self.log.debug("Yarn cluster kernel launched using YARN RM address: {}, pid: {}, Kernel ID: {}, cmd: '{}'"
                       .format(self.rm_addr, self.local_proc.pid, self.kernel_id, kernel_cmd))
        self.confirm_remote_startup()

        return self

    """Submitting a job to a YARN queue and then polling until it reaches the RUNNING state
        can leave orphaned jobs behind in some scenarios.
        To avoid that, kernel_launch_timeout is divided into two parts:
        if the queue is unavailable, at most 20% of the timeout is spent polling the queue periodically;
        once the queue becomes available, whatever remains of the timeout (at least 80%) is left
        for the kernel launch itself."""

    def confirm_yarn_queue_availability(self, **kwargs):
        """Check, before launching, that YARN can plausibly satisfy the kernel's resource request.

        This algorithm is subject to change.  Both checks are optional and driven by the kernel's
        'env' entries:

        1. If KERNEL_EXECUTOR_MEMORY and KERNEL_DRIVER_MEMORY are both specified, the larger of the
           two must fit within the container memory reported by the resource manager.
        2. If KERNEL_QUEUE and KERNEL_NODE_LABEL are both specified, the matching queue partition must
           have used capacity within YARN_PARTITION_THRESHOLD (default 95.0).  If not, availability is
           re-checked every poll interval for up to yarn_resource_check_wait_time seconds.

        The time spent waiting for the queue is subtracted from kernel_launch_timeout.

        :param kwargs: launch keyword arguments; only 'env' is consulted
        :return: None
        :raises: whatever log_and_raise() raises (called with HTTP status 500) when the memory request
            does not fit or the queue stays unavailable
        """
        env_dict = kwargs.get('env') or {}  # tolerate an explicit env=None

        executor_memory = int(env_dict.get('KERNEL_EXECUTOR_MEMORY', 0))
        driver_memory = int(env_dict.get('KERNEL_DRIVER_MEMORY', 0))

        if executor_memory * driver_memory > 0:
            container_memory = self.resource_mgr.cluster_node_container_memory()
            if max(executor_memory, driver_memory) > container_memory:
                self.log_and_raise(http_status_code=500,
                                   reason="Container Memory not sufficient for a executor/driver allocation")

        candidate_queue_name = env_dict.get('KERNEL_QUEUE', None)
        node_label = env_dict.get('KERNEL_NODE_LABEL', None)
        partition_availability_threshold = float(env_dict.get('YARN_PARTITION_THRESHOLD', 95.0))

        if candidate_queue_name is None or node_label is None:
            return

        # The resources may or may not be available right now; waiting may free them up, so start
        # timing the availability check from here.
        self.start_time = RemoteProcessProxy.get_current_time()
        self.candidate_queue = self.resource_mgr.cluster_scheduler_queue(candidate_queue_name)

        if self.candidate_queue is None:
            self.log.warning("Queue: {} not found in cluster. "
                             "Availability check will not be performed".format(candidate_queue_name))
            return

        self.candidate_partition = self.resource_mgr.cluster_queue_partition(self.candidate_queue, node_label)

        if self.candidate_partition is None:
            self.log.debug("Partition: {} not found in {} queue. "
                           "Availability check will not be performed".format(node_label, candidate_queue_name))
            return

        self.log.debug("Checking endpoint: {} if partition: {} "
                       "has used capacity <= {}%".format(self.yarn_endpoint,
                                                         self.candidate_partition, partition_availability_threshold))

        yarn_available = self.resource_mgr.cluster_scheduler_queue_availability(self.candidate_partition,
                                                                                partition_availability_threshold)
        if not yarn_available:
            # yarn_resource_check_wait_time is expressed in seconds (a fraction of kernel_launch_timeout).
            self.log.debug(
                "Retrying for {} seconds since resources are not available".format(
                    self.yarn_resource_check_wait_time))
            while not yarn_available:
                # Sleeps one poll interval and raises once the wait budget is exhausted.
                self.handle_yarn_queue_timeout()
                yarn_available = self.resource_mgr.cluster_scheduler_queue_availability(
                    self.candidate_partition, partition_availability_threshold)

        # subtracting the total amount of time spent for polling for queue availability
        self.kernel_launch_timeout -= RemoteProcessProxy.get_time_diff(self.start_time,
                                                                       RemoteProcessProxy.get_current_time())

    def handle_yarn_queue_timeout(self):
        """Sleep one poll interval, then give up if the queue availability wait has run out.

        When more than yarn_resource_check_wait_time seconds have passed since start_time,
        log_and_raise() is called with HTTP status 500.
        """
        time.sleep(poll_interval)
        waited = RemoteProcessProxy.get_time_diff(self.start_time, RemoteProcessProxy.get_current_time())
        if not waited > self.yarn_resource_check_wait_time:
            return

        unavailable = "Yarn Compute Resource is unavailable after {} seconds"
        self.log_and_raise(http_status_code=500,
                           reason=unavailable.format(self.yarn_resource_check_wait_time))

    def poll(self):
        """Submitting a new kernel/app to YARN will take a while to be ACCEPTED.
        Thus application ID will probably not be available immediately for poll.
        So will regard the application as RUNNING when application ID still in ACCEPTED or SUBMITTED state.

        :return: None if the application's ID is available and state is ACCEPTED/SUBMITTED/RUNNING. Otherwise False.
        """
        result = False

        if self._get_application_id():
            state = self._query_app_state_by_id(self.application_id)
            if state in YarnClusterProcessProxy.initial_states:
                result = None

        # The following produces too much output (every 3 seconds by default), so commented-out at this time.
        # self.log.debug("YarnProcessProxy.poll, application ID: {}, kernel ID: {}, state: {}".
        #               format(self.application_id, self.kernel_id, state))
        return result

    def send_signal(self, signum):
        """Dispatch a signal request for the kernel.

        Signal 0 is answered by poll(), SIGKILL by kill(); anything else is handed to the base class.

        :param signum: the signal number to deliver
        :return: the result of poll(), kill() or the base class send_signal()
        """
        if signum == 0:
            return self.poll()
        if signum == signal.SIGKILL:
            return self.kill()

        # YARN offers no interrupt equivalent, so fall back to a remote signal.  The value is not
        # compared against a specific interrupt signal because alternate interrupt signals may be used.
        return super(YarnClusterProcessProxy, self).send_signal(signum)

    def kill(self):
        """Terminate the kernel's YARN application, falling back to the base class kill.

        After requesting the kill, the application state is re-queried (sleeping poll_interval
        between attempts, up to max_poll_attempts) until it reaches one of final_states.  If that
        never happens, or no application ID is known, the base class kill is used and its result
        returned.

        :return: None if the application reached a final state, otherwise the base class kill result.
        """
        terminal_states = YarnClusterProcessProxy.final_states
        state = None
        result = False

        if self._get_application_id():
            self._kill_app_by_id(self.application_id)

            # Confirm the application moved to a final state (most likely KILLED).
            attempt = 1
            state = self._query_app_state_by_id(self.application_id)
            while state not in terminal_states and attempt <= max_poll_attempts:
                time.sleep(poll_interval)
                state = self._query_app_state_by_id(self.application_id)
                attempt += 1

            if state in terminal_states:
                result = None

        if result is False:
            # YARN could not confirm termination, so try a remote signal instead.
            result = super(YarnClusterProcessProxy, self).kill()

        summary = "YarnClusterProcessProxy.kill, application ID: {}, kernel ID: {}, state: {}, result: {}"
        self.log.debug(summary.format(self.application_id, self.kernel_id, state, result))
        return result

    def cleanup(self):
        """Reap the local launcher process, forget the application ID and run base class cleanup."""
        base = super(YarnClusterProcessProxy, self)

        # The local launcher may linger as a defunct process (e.g. waitAppCompletion = false), so
        # poll it, kill it if the poll result is truthy, and wait on it.
        if self.local_proc:
            notice = "YarnClusterProcessProxy.cleanup: Clearing possible defunct process, pid={}..."
            self.log.debug(notice.format(self.local_proc.pid))
            if base.poll():
                base.kill()
            base.wait()
            self.local_proc = None

        # Dropping the application ID forces a fresh lookup - needed for kernel restarts/interrupts.
        self.application_id = None

        # The superclass cleanup must come last.
        base.cleanup()

    def confirm_remote_startup(self):
        """Block until the YARN application has started and connection info has been received.

        Each pass runs handle_timeout() first, then either inspects the application (once its ID is
        known) or checks for a launch failure.  An application found in a final state (FINISHED,
        KILLED) during startup is reported through log_and_raise() with status 500, since the rest of
        the gateway would otherwise believe it is talking to a valid kernel.
        """
        self.start_time = RemoteProcessProxy.get_current_time()
        attempt = 0
        connected = False  # becomes true once connection info has been received
        while not connected:
            attempt += 1
            self.handle_timeout()

            if not self._get_application_id(True):
                self.detect_launch_failure()
                continue

            # With an application ID in hand: monitor state, learn the assigned host, fetch connection info.
            app_state = self._get_application_state()

            if app_state in YarnClusterProcessProxy.final_states:
                error_message = "KernelID: '{}', ApplicationID: '{}' unexpectedly found in state '{}'" \
                                " during kernel startup!".format(self.kernel_id, self.application_id, app_state)
                self.log_and_raise(http_status_code=500, reason=error_message)

            progress = "{}: State: '{}', Host: '{}', KernelID: '{}', ApplicationID: '{}'"
            self.log.debug(progress.format(attempt, app_state, self.assigned_host,
                                           self.kernel_id, self.application_id))

            if self.assigned_host != '':
                connected = self.receive_connection_info()

    def _get_application_state(self):
        # Gets the current application state using the application_id already obtained.  Once the assigned host
        # has been identified, it is nolonger accessed.
        app_state = None
        app = self._query_app_by_id(self.application_id)

        if app:
            if app.get('state'):
                app_state = app.get('state')
            if self.assigned_host == '' and app.get('amHostHttpAddress'):
                self.assigned_host = app.get('amHostHttpAddress').split(':')[0]
                # Set the kernel manager ip to the actual host where the application landed.
                self.assigned_ip = socket.gethostbyname(self.assigned_host)
        return app_state

    def handle_timeout(self):
        """Sleep one poll interval, then abort the launch if kernel_launch_timeout has been exceeded.

        On timeout the kernel is killed and log_and_raise() is called: with status 503 when an
        application exists but is not RUNNING, otherwise with status 500.
        """
        time.sleep(poll_interval)
        elapsed = RemoteProcessProxy.get_time_diff(self.start_time, RemoteProcessProxy.get_current_time())
        if not elapsed > self.kernel_launch_timeout:
            return

        error_http_code = 500
        if not self._get_application_id(True):
            reason = ("Application ID is None. Failed to submit a new application to YARN within {} seconds.  "
                      "Check Enterprise Gateway log for more information.").format(self.kernel_launch_timeout)
        elif self._query_app_state_by_id(self.application_id) != "RUNNING":
            reason = ("YARN resources unavailable after {} seconds for app {}, launch timeout: {}!  "
                      "Check YARN configuration.").format(elapsed, self.application_id,
                                                          self.kernel_launch_timeout)
            error_http_code = 503
        else:
            reason = ("App {} is RUNNING, but waited too long ({} secs) to get connection file.  "
                      "Check YARN logs for more information.").format(self.application_id,
                                                                      self.kernel_launch_timeout)

        self.kill()
        timeout_message = "KernelID: '{}' launch timeout due to: {}".format(self.kernel_id, reason)
        self.log_and_raise(http_status_code=error_http_code, reason=timeout_message)

    def _get_application_id(self, ignore_final_states=False):
        # Return the kernel's YARN application ID if available, otherwise None.  If we're obtaining application_id
        # from scratch, do not consider kernels in final states.
        if not self.application_id:
            app = self._query_app_by_name(self.kernel_id)
            state_condition = True
            if type(app) is dict and ignore_final_states:
                state_condition = app.get('state') not in YarnClusterProcessProxy.final_states

            if type(app) is dict and len(app.get('id', '')) > 0 and state_condition:
                self.application_id = app['id']
                time_interval = RemoteProcessProxy.get_time_diff(self.start_time, RemoteProcessProxy.get_current_time())
                self.log.info("ApplicationID: '{}' assigned for KernelID: '{}', state: {}, {} seconds after starting."
                              .format(app['id'], self.kernel_id, app.get('state'), time_interval))
            else:
                self.log.debug("ApplicationID not yet assigned for KernelID: '{}' - retrying...".format(self.kernel_id))
        return self.application_id

    def get_process_info(self):
        """Return the base class persistence info extended with this kernel's 'application_id'."""
        info = super(YarnClusterProcessProxy, self).get_process_info()
        info.update(application_id=self.application_id)
        return info

    def load_process_info(self, process_info):
        """Restore persisted state: the base information first, then the YARN application ID.

        :param process_info: mapping that must contain an 'application_id' entry.
        """
        super(YarnClusterProcessProxy, self).load_process_info(process_info)
        persisted_app_id = process_info['application_id']
        self.application_id = persisted_app_id

    def _query_app_by_name(self, kernel_id):
        """Look up the YARN application whose name contains the given kernel ID.

        Applications are requested with ``started_time_begin`` set to this instance's
        start time.  Query failures are logged as warnings and treated as "not found".
        If several applications match (e.g. a restarted kernel reusing its ID as the
        app name), the one with the greatest application ID is returned.

        NOTE(review): IDs are compared as plain strings, i.e. lexicographically - confirm
        that this matches YARN's ID ordering for the clusters in use.

        :param kernel_id: as the unique app name for query
        :return: The JSON object of the matching application, or None.
        """
        data = None
        try:
            response = self.resource_mgr.cluster_applications(started_time_begin=str(self.start_time))
            data = response.data
        except socket.error as sock_err:
            if sock_err.errno != errno.ECONNREFUSED:
                self.log.warning("Query for kernel ID '{}' failed with exception: {} - '{}'.  Continuing...".
                                 format(kernel_id, type(sock_err), sock_err))
            else:
                self.log.warning("YARN RM address: '{}' refused the connection.  Is the resource manager running?".
                                 format(self.rm_addr))
        except Exception as e:
            self.log.warning("Query for kernel ID '{}' failed with exception: {} - '{}'.  Continuing...".
                             format(kernel_id, type(e), e))

        best_match = None
        highest_app_id = ''
        if type(data) is dict and type(data.get("apps")) is dict and 'app' in data.get("apps"):
            for candidate in data['apps']['app']:
                # Skip applications whose name does not contain the kernel ID.
                if candidate.get('name', '').find(kernel_id) < 0:
                    continue
                if candidate.get('id') > highest_app_id:
                    best_match = candidate
                    highest_app_id = candidate.get('id')
        return best_match

    def _query_app_by_id(self, app_id):
        """Fetch a single application from the resource manager by its application ID.

        A failing query is logged as a warning and treated as "not found".

        :param app_id: the application ID to query
        :return: The JSON object of the application, or None when unavailable.
        """
        data = None
        try:
            data = self.resource_mgr.cluster_application(application_id=app_id).data
        except Exception as e:
            self.log.warning("Query for application ID '{}' failed with exception: '{}'.  Continuing...".
                             format(app_id, e))

        has_app = type(data) is dict and 'app' in data
        return data['app'] if has_app else None

    def _query_app_state_by_id(self, app_id):
        """Return the state of an application.

        A failing query is logged as a warning; in that case None is returned instead
        of dereferencing the missing response, so callers can keep polling.

        :param app_id: the application ID to query
        :return: the application's state string, or None when the query failed.
        """
        response = None
        try:
            response = self.resource_mgr.cluster_application_state(application_id=app_id)
        except Exception as e:
            self.log.warning("Query for application '{}' state failed with exception: '{}'.  Continuing...".
                             format(app_id, e))

        # The warning above promises to continue, so a failed query must not raise here.
        if response is None:
            return None
        return response.data['state']

    def _kill_app_by_id(self, app_id):
        """Ask the resource manager to kill an application.

        If the app's state is FINISHED or FAILED, it won't be changed to KILLED.
        A failing request is logged as a warning and yields None.

        :param app_id: the application ID to kill
        :return: The response of the kill request, or None when the request failed.
        """
        try:
            return self.resource_mgr.cluster_application_kill(application_id=app_id)
        except Exception as e:
            self.log.warning("Termination of application '{}' failed with exception: '{}'.  Continuing...".
                             format(app_id, e))
            return None
class ResourceManagerTestCase(TestCase):
    """
    Integration test that, given a provided YARN ENDPOINT,
    executes some real scenario tests against that server.

    Note that, if no usable YARN ENDPOINT is provided (the YARN_ENDPOINT
    environment variable is unset or lacks a host or port), the tests
    are reported as skipped.
    """
    @classmethod
    def setUpClass(cls):
        """Build one shared ResourceManager client from the YARN_ENDPOINT environment variable."""
        cls.configured = False
        yarn_endpoint = os.getenv('YARN_ENDPOINT')
        if not yarn_endpoint:
            return

        yarn_endpoint_uri = urlparse(yarn_endpoint)
        if yarn_endpoint_uri.hostname and yarn_endpoint_uri.port:
            cls.configured = True
            cls.resource_manager = ResourceManager([
                yarn_endpoint_uri.hostname + ":" +
                str(yarn_endpoint_uri.port)
            ])

    def setUp(self):
        # Report unconfigured runs as skipped instead of letting every test pass vacuously.
        if not self.configured:
            self.skipTest('YARN_ENDPOINT is not set to a URL with a host and port')

    def test_cluster_information(self):
        info = self.resource_manager.cluster_information()
        pprint(info.data)
        self.assertEqual(info.data['clusterInfo']['state'], 'STARTED')

    def test_cluster_metrics(self):
        metrics = self.resource_manager.cluster_metrics()
        pprint(metrics.data)
        self.assertGreater(metrics.data['clusterMetrics']['activeNodes'],
                           0)
        self.assertIsNotNone(metrics.data['clusterMetrics']['totalNodes'])

    def test_cluster_scheduler(self):
        scheduler = self.resource_manager.cluster_scheduler()
        pprint(scheduler.data)
        self.assertIsNotNone(scheduler.data['scheduler']['schedulerInfo'])

    def test_cluster_applications(self):
        apps = self.resource_manager.cluster_applications()
        pprint(apps.data)
        self.assertIsNotNone(apps.data['apps'])

    def test_cluster_application_state(self):
        apps = self.resource_manager.cluster_applications()
        # The state of an application can only be queried if the cluster reports one.
        app_list = (apps.data.get('apps') or {}).get('app') or []
        if not app_list:
            self.skipTest('no applications available on the cluster')

        appid = app_list[0]['id']
        print(appid)
        response = self.resource_manager.cluster_application_state(appid)
        pprint(response.data)
        pprint(response.data['state'])
        # Assert on the state response itself, not on the application listing.
        self.assertIsNotNone(response.data['state'])

    def test_cluster_application_statistics(self):
        appstats = self.resource_manager.cluster_application_statistics()
        pprint(appstats.data)
        self.assertIsNotNone(appstats.data['appStatInfo'])

    def test_cluster_nodes(self):
        nodes = self.resource_manager.cluster_nodes()
        pprint(nodes.data)
        self.assertIsNotNone(nodes.data['nodes'])

        running_nodes = self.resource_manager.cluster_nodes(
            state='RUNNING', healthy='true')
        pprint(running_nodes.data)
        # Check the filtered response; the unfiltered one was already verified above.
        self.assertIsNotNone(running_nodes.data['nodes'])
class ResourceManagerTestCase(TestCase):
    """Unit tests asserting the REST path and arguments each ResourceManager call hands to a mocked request.

    NOTE(review): every test method takes a ``request_mock`` argument, so a class-level
    ``patch`` decorator (not visible in this chunk) is presumably expected - confirm.
    """
    def setUp(self):
        # Fresh client per test, created with 'localhost' as its only argument.
        self.rm = ResourceManager('localhost')

    @patch('yarn_api_client.resource_manager.get_resource_manager_host_port')
    def test__init__(self, get_config_mock, request_mock):
        # Constructing without arguments must consult get_resource_manager_host_port().
        get_config_mock.return_value = (None, None)
        ResourceManager()
        get_config_mock.assert_called_with()

    def test_cluster_information(self, request_mock):
        self.rm.cluster_information()
        request_mock.assert_called_with('/ws/v1/cluster/info')

    def test_cluster_metrics(self, request_mock):
        self.rm.cluster_metrics()
        request_mock.assert_called_with('/ws/v1/cluster/metrics')

    def test_cluster_scheduler(self, request_mock):
        self.rm.cluster_scheduler()
        request_mock.assert_called_with('/ws/v1/cluster/scheduler')

    def test_cluster_applications(self, request_mock):
        # No filters: only the path is passed on.
        self.rm.cluster_applications()
        request_mock.assert_called_with('/ws/v1/cluster/apps')

        # snake_case keyword arguments are expected to arrive as camelCase keywords.
        self.rm.cluster_applications(state='KILLED', final_status='FAILED',
                                     user='******', queue='low', limit=10,
                                     started_time_begin=1, started_time_end=2,
                                     finished_time_begin=3, finished_time_end=4)
        request_mock.assert_called_with('/ws/v1/cluster/apps', state='KILLED',
                                        finalStatus='FAILED', user='******',
                                        queue='low', limit=10,
                                        startedTimeBegin=1, startedTimeEnd=2,
                                        finishedTimeBegin=3, finishedTimeEnd=4)

        # Unknown state / final status values must be rejected.
        with self.assertRaises(IllegalArgumentError):
            self.rm.cluster_applications(state='ololo')

        with self.assertRaises(IllegalArgumentError):
            self.rm.cluster_applications(final_status='ololo')

    def test_cluster_application_statistics(self, request_mock):
        self.rm.cluster_application_statistics()
        request_mock.assert_called_with('/ws/v1/cluster/appstatistics')
        # TODO: test arguments

    def test_cluster_application(self, request_mock):
        self.rm.cluster_application('app_1')
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1')

    def test_cluster_application_attempts(self, request_mock):
        self.rm.cluster_application_attempts('app_1')
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1/appattempts')

    def test_cluster_nodes(self, request_mock):
        # Without filters an empty params dict is expected.
        self.rm.cluster_nodes()
        request_mock.assert_called_with('/ws/v1/cluster/nodes', params={})

        self.rm.cluster_nodes(state='NEW', healthy='true')
        request_mock.assert_called_with('/ws/v1/cluster/nodes',
                                        params={"state": 'NEW', "healthy": 'true'})

        # An unrecognized 'healthy' value must be rejected.
        with self.assertRaises(IllegalArgumentError):
            self.rm.cluster_nodes(state='NEW', healthy='ololo')

    def test_cluster_node(self, request_mock):
        self.rm.cluster_node('node_1')
        request_mock.assert_called_with('/ws/v1/cluster/nodes/node_1')

    # TODO
    # def test_cluster_submit_application(self, request_mock):
    #     self.rm.cluster_submit_application()
    #     request_mock.assert_called_with('/ws/v1/cluster/apps')

    def test_cluster_new_application(self, request_mock):
        self.rm.cluster_new_application()
        request_mock.assert_called_with('/ws/v1/cluster/apps/new-application', 'POST')

    def test_cluster_get_application_queue(self, request_mock):
        self.rm.cluster_get_application_queue('app_1')
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1/queue')

    def test_cluster_change_application_queue(self, request_mock):
        self.rm.cluster_change_application_queue('app_1', 'queue_1')
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1/queue', 'PUT', data={"queue": 'queue_1'})

    def test_cluster_get_application_priority(self, request_mock):
        self.rm.cluster_get_application_priority('app_1')
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1/priority')

    def test_cluster_change_application_priority(self, request_mock):
        self.rm.cluster_change_application_priority('app_1', 'priority_1')
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1/priority', 'PUT', data={"priority": 'priority_1'})
class ResourceManagerTestCase(TestCase):
    """Unit tests asserting the REST path and arguments each ResourceManager call hands to a mocked request.

    Here the application queries are expected to pass their filters through a ``params`` dict.

    NOTE(review): every test method takes a ``request_mock`` argument, so a class-level
    ``patch`` decorator (not visible in this chunk) is presumably expected - confirm.
    """
    def setUp(self):
        # Fresh client per test, created with 'localhost' as its only argument.
        self.rm = ResourceManager('localhost')

    @patch('yarn_api_client.resource_manager.get_resource_manager_host_port')
    def test__init__(self, get_config_mock, request_mock):
        # Constructing without arguments must consult get_resource_manager_host_port().
        get_config_mock.return_value = (None, None)
        ResourceManager()
        get_config_mock.assert_called_with()

    def test_cluster_information(self, request_mock):
        self.rm.cluster_information()
        request_mock.assert_called_with('/ws/v1/cluster/info')

    def test_cluster_metrics(self, request_mock):
        self.rm.cluster_metrics()
        request_mock.assert_called_with('/ws/v1/cluster/metrics')

    def test_cluster_scheduler(self, request_mock):
        self.rm.cluster_scheduler()
        request_mock.assert_called_with('/ws/v1/cluster/scheduler')

    def test_cluster_applications(self, request_mock):
        # Without filters an empty params dict is expected.
        self.rm.cluster_applications()
        request_mock.assert_called_with('/ws/v1/cluster/apps', params={})

        # snake_case keyword arguments are expected to arrive as camelCase params keys.
        self.rm.cluster_applications(state='KILLED',
                                     final_status='FAILED',
                                     user='******',
                                     queue='low',
                                     limit=10,
                                     started_time_begin=1,
                                     started_time_end=2,
                                     finished_time_begin=3,
                                     finished_time_end=4)
        request_mock.assert_called_with('/ws/v1/cluster/apps',
                                        params={
                                            'state': 'KILLED',
                                            'finalStatus': 'FAILED',
                                            'user': '******',
                                            'queue': 'low',
                                            'limit': 10,
                                            'startedTimeBegin': 1,
                                            'startedTimeEnd': 2,
                                            'finishedTimeBegin': 3,
                                            'finishedTimeEnd': 4
                                        })

        # Unknown state / final status values must be rejected.
        with self.assertRaises(IllegalArgumentError):
            self.rm.cluster_applications(state='ololo')

        with self.assertRaises(IllegalArgumentError):
            self.rm.cluster_applications(final_status='ololo')

    def test_cluster_application_statistics(self, request_mock):
        self.rm.cluster_application_statistics()
        request_mock.assert_called_with('/ws/v1/cluster/appstatistics',
                                        params={})
        # TODO: test arguments

    def test_cluster_application(self, request_mock):
        self.rm.cluster_application('app_1')
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1')

    def test_cluster_application_attempts(self, request_mock):
        self.rm.cluster_application_attempts('app_1')
        request_mock.assert_called_with(
            '/ws/v1/cluster/apps/app_1/appattempts')

    def test_cluster_nodes(self, request_mock):
        # Without filters an empty params dict is expected.
        self.rm.cluster_nodes()
        request_mock.assert_called_with('/ws/v1/cluster/nodes', params={})

        self.rm.cluster_nodes(state='NEW', healthy='true')
        request_mock.assert_called_with('/ws/v1/cluster/nodes',
                                        params={
                                            "state": 'NEW',
                                            "healthy": 'true'
                                        })

        # An unrecognized 'healthy' value must be rejected.
        with self.assertRaises(IllegalArgumentError):
            self.rm.cluster_nodes(state='NEW', healthy='ololo')

    def test_cluster_node(self, request_mock):
        self.rm.cluster_node('node_1')
        request_mock.assert_called_with('/ws/v1/cluster/nodes/node_1')

    # TODO
    # def test_cluster_submit_application(self, request_mock):
    #     self.rm.cluster_submit_application()
    #     request_mock.assert_called_with('/ws/v1/cluster/apps')

    def test_cluster_new_application(self, request_mock):
        self.rm.cluster_new_application()
        request_mock.assert_called_with('/ws/v1/cluster/apps/new-application',
                                        'POST')

    def test_cluster_get_application_queue(self, request_mock):
        self.rm.cluster_get_application_queue('app_1')
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1/queue')

    def test_cluster_change_application_queue(self, request_mock):
        self.rm.cluster_change_application_queue('app_1', 'queue_1')
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1/queue',
                                        'PUT',
                                        data={"queue": 'queue_1'})

    def test_cluster_get_application_priority(self, request_mock):
        self.rm.cluster_get_application_priority('app_1')
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1/priority')

    def test_cluster_change_application_priority(self, request_mock):
        self.rm.cluster_change_application_priority('app_1', 'priority_1')
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1/priority',
                                        'PUT',
                                        data={"priority": 'priority_1'})
 def test__init__(self, get_config_mock, request_mock):
     # Constructing ResourceManager without arguments must call the mocked config lookup
     # with no arguments.
     # NOTE(review): stand-alone fragment - the patch decorators supplying both mocks are
     # not visible here; confirm against the enclosing test class.
     get_config_mock.return_value = (None, None)
     ResourceManager()
     get_config_mock.assert_called_with()
 def setUp(self):
     # Create the ResourceManager under test, passing 'localhost' as its only argument.
     self.rm = ResourceManager('localhost')
Esempio n. 24
0
class YarnProvisioner(RemoteProvisionerBase):
    """
    Kernel lifecycle management for YARN clusters.
    """

    # Name of the environment variable read for the default value of `yarn_endpoint`.
    yarn_endpoint_env = 'RP_YARN_ENDPOINT'
    # Configurable URL of the primary YARN Resource Manager; None is allowed.
    yarn_endpoint = Unicode(
        None,
        config=True,
        allow_none=True,
        help=
        """The http url specifying the YARN Resource Manager. Note: If this value is NOT set,
                            the YARN library will use the files within the local HADOOP_CONFIG_DIR to determine the
                            active resource manager. (RP_YARN_ENDPOINT env var)"""
    )

    @default('yarn_endpoint')
    def yarn_endpoint_default(self):
        """Default `yarn_endpoint` to the environment variable named by `yarn_endpoint_env` (None if unset)."""
        return os.environ.get(self.yarn_endpoint_env)

    # Alt Yarn endpoint
    # Name of the environment variable read for the default value of `alt_yarn_endpoint`.
    alt_yarn_endpoint_env = 'RP_ALT_YARN_ENDPOINT'
    # Configurable URL of the alternate YARN Resource Manager; None is allowed.
    alt_yarn_endpoint = Unicode(
        None,
        config=True,
        allow_none=True,
        help=
        """The http url specifying the alternate YARN Resource Manager.  This value should
                                be set when YARN Resource Managers are configured for high availability.  Note: If both
                                YARN endpoints are NOT set, the YARN library will use the files within the local
                                HADOOP_CONFIG_DIR to determine the active resource manager.
                                (RP_ALT_YARN_ENDPOINT env var)""")

    @default('alt_yarn_endpoint')
    def alt_yarn_endpoint_default(self):
        """Default `alt_yarn_endpoint` to the environment variable named by `alt_yarn_endpoint_env` (None if unset)."""
        return os.environ.get(self.alt_yarn_endpoint_env)

    # Name of the environment variable read for the default of `yarn_endpoint_security_enabled`.
    yarn_endpoint_security_enabled_env = 'RP_YARN_ENDPOINT_SECURITY_ENABLED'
    # Value used when that environment variable is not set.
    yarn_endpoint_security_enabled_default_value = False
    # Configurable flag; when true, __init__ sets up Kerberos authentication for the YARN client.
    yarn_endpoint_security_enabled = Bool(
        yarn_endpoint_security_enabled_default_value,
        config=True,
        help="""Is YARN Kerberos/SPNEGO Security enabled (True/False).
                                          (RP_YARN_ENDPOINT_SECURITY_ENABLED env var)"""
    )

    @default('yarn_endpoint_security_enabled')
    def yarn_endpoint_security_enabled_default(self):
        """Derive the default of `yarn_endpoint_security_enabled` from the environment.

        The environment variable named by `yarn_endpoint_security_enabled_env` is read as
        text.  When it is unset, `yarn_endpoint_security_enabled_default_value` applies.
        Otherwise the explicit "off" spellings ('', '0', 'false', 'no', 'off'; compared
        case-insensitively after stripping whitespace) disable security, and any other
        value enables it.

        :return: True if YARN endpoint security should be enabled by default, else False.
        """
        raw_value = os.getenv(self.yarn_endpoint_security_enabled_env)
        if raw_value is None:
            return bool(self.yarn_endpoint_security_enabled_default_value)

        # bool() on a string is True for anything non-empty - including "false" - so
        # the text has to be interpreted explicitly.
        return raw_value.strip().lower() not in ('', '0', 'false', 'no', 'off')

    # Application states for which poll() reports the kernel as still alive.
    initial_states = {'NEW', 'SUBMITTED', 'ACCEPTED', 'RUNNING'}
    # Application states that end the wait in shutdown_application() and abort startup confirmation.
    final_states = {'FINISHED', 'KILLED', 'FAILED'}

    def __init__(self, **kwargs):
        """Initialize per-kernel YARN state and create the ResourceManager client.

        :param kwargs: forwarded unchanged to the superclass initializer.
        """
        super().__init__(**kwargs)

        self.application_id = None
        self.last_known_state = None
        self.candidate_queue = None
        self.candidate_partition = None

        # The alternate endpoint is only honored when the "primary" one is set.
        if self.yarn_endpoint:
            endpoints = [self.yarn_endpoint]
            if self.alt_yarn_endpoint:
                endpoints.append(self.alt_yarn_endpoint)
        else:
            endpoints = None

        # The Kerberos dependency is imported only when security is actually enabled.
        if self.yarn_endpoint_security_enabled:
            from requests_kerberos import HTTPKerberosAuth
            auth = HTTPKerberosAuth()
        else:
            auth = None

        self.resource_mgr = ResourceManager(service_endpoints=endpoints,
                                            auth=auth,
                                            verify=cert_path)
        self.rm_addr = self.resource_mgr.get_active_endpoint()

        # Up to 20% of the launch timeout is spent waiting (and retrying at a fixed
        # interval) for YARN resources before a launch is declared not feasible.
        self.yarn_resource_check_wait_time = 0.20 * self.launch_timeout

    async def pre_launch(self, **kwargs: Any) -> Dict[str, Any]:
        """Prepare a launch within a YARN cluster environment.

        Resets the per-launch YARN state, lets the superclass prepare the launch
        arguments, then checks queue availability before the launch is attempted.

        :return: the keyword arguments produced by the superclass.
        """
        self.application_id = self.last_known_state = None
        self.candidate_queue = self.candidate_partition = None

        launch_kwargs = await super().pre_launch(**kwargs)

        # If the queue resource is not available, kernel startup is not attempted.
        self.confirm_yarn_queue_availability(**launch_kwargs)
        return launch_kwargs

    def log_kernel_launch(self, cmd: List[str]) -> None:
        """Log, at info level, the resource manager address, local pid, kernel ID and launch command."""
        message = (
            f"{self.__class__.__name__}: kernel launched. YARN RM: {self.rm_addr}, "
            f"pid: {self.local_proc.pid}, Kernel ID: {self.kernel_id}, cmd: '{cmd}'"
        )
        self.log.info(message)

    def get_shutdown_wait_time(self,
                               recommended: Optional[float] = 5.0) -> float:
        """Return the time allowed for a complete shutdown.

        YARN applications tend to take longer than the default 5 second wait time, so a
        recommendation shorter than the module's YARN shutdown wait time is raised to
        that value; a longer recommendation is returned untouched.

        :param recommended: typically what is configured in the kernel manager.
        :return: the wait time to use.
        """
        if not recommended < yarn_shutdown_wait_time:
            return recommended

        recommended = yarn_shutdown_wait_time
        self.log.debug(
            f"{type(self).__name__} shutdown wait time adjusted to {recommended} seconds."
        )
        return recommended

    def confirm_yarn_queue_availability(self, **kwargs: Dict[str,
                                                             Any]) -> None:
        """
        Confirm, before launching, that YARN can accommodate the kernel's resource requests.

        Submitting jobs to a YARN queue and then checking until they reach the running state
        can leave orphan jobs behind, so availability is checked up front instead.

        Both checks are optional and driven by entries of ``kwargs['env']``:

        1. If KERNEL_EXECUTOR_MEMORY and KERNEL_DRIVER_MEMORY are both non-zero, the larger
           of the two must not exceed the container memory reported by the resource manager.
        2. If KERNEL_QUEUE and KERNEL_NODE_LABEL are both given, the matching queue
           partition is polled until its availability check (threshold taken from
           YARN_PARTITION_THRESHOLD, default 95.0) succeeds.  Polling is bounded by
           ``yarn_resource_check_wait_time`` (20% of the launch timeout); the time spent
           here is then subtracted from ``launch_timeout``.

        A queue or partition that cannot be found is logged and the check is skipped.

        NOTE(review): polling sleeps synchronously (see handle_yarn_queue_timeout) although
        this method is called from the async pre_launch - confirm this is acceptable.

        :param kwargs: launch keyword arguments; only the 'env' mapping is read.
        :return: None.  Failures are reported through ``log_and_raise``.
        """
        env_dict = kwargs.get('env', {})

        executor_memory = int(env_dict.get('KERNEL_EXECUTOR_MEMORY', 0))
        driver_memory = int(env_dict.get('KERNEL_DRIVER_MEMORY', 0))

        # Only when both values are provided (non-zero) is the container size checked.
        if executor_memory * driver_memory > 0:
            container_memory = self.resource_mgr.cluster_node_container_memory(
            )
            if max(executor_memory, driver_memory) > container_memory:
                self.log_and_raise(
                    ValueError(
                        "Container Memory not sufficient for a executor/driver allocation"
                    ))

        candidate_queue_name = (env_dict.get('KERNEL_QUEUE', None))
        node_label = env_dict.get('KERNEL_NODE_LABEL', None)
        partition_availability_threshold = float(
            env_dict.get('YARN_PARTITION_THRESHOLD', 95.0))

        if candidate_queue_name is None or node_label is None:
            return

        # The resources may or may not be available right now; waiting might make them
        # available, so start the clock for the bounded polling below.
        self.start_time = RemoteProvisionerBase.get_current_time()
        self.candidate_queue = self.resource_mgr.cluster_scheduler_queue(
            candidate_queue_name)

        if self.candidate_queue is None:
            self.log.warning(
                f"Queue: {candidate_queue_name} not found in cluster.  "
                "Availability check will not be performed")
            return

        self.candidate_partition = self.resource_mgr.cluster_queue_partition(
            self.candidate_queue, node_label)

        if self.candidate_partition is None:
            self.log.debug(
                f"Partition: {node_label} not found in {candidate_queue_name} queue.  "
                "Availability check will not be performed")
            return

        self.log.debug(
            f"Checking endpoint: {self.yarn_endpoint} if partition: {self.candidate_partition} "
            f"has used capacity <= {partition_availability_threshold}%")

        yarn_available = self.resource_mgr.cluster_scheduler_queue_availability(
            self.candidate_partition, partition_availability_threshold)
        if not yarn_available:
            # yarn_resource_check_wait_time is a fraction of launch_timeout, i.e. seconds.
            self.log.debug(
                f"Retrying for {self.yarn_resource_check_wait_time} seconds since resources are not available"
            )
            while not yarn_available:
                # Sleeps one poll interval and raises once the wait budget is exhausted.
                self.handle_yarn_queue_timeout()
                yarn_available = self.resource_mgr.cluster_scheduler_queue_availability(
                    self.candidate_partition, partition_availability_threshold)

        # subtracting the total amount of time spent for polling for queue availability
        self.launch_timeout -= RemoteProvisionerBase.get_time_diff(
            self.start_time)

    def handle_yarn_queue_timeout(self) -> None:
        """Sleep for one poll interval, then fail if the resource-check wait time has been exceeded.

        The elapsed time is measured from ``self.start_time``; on expiry a TimeoutError
        is handed to ``log_and_raise``.
        """
        time.sleep(poll_interval)

        elapsed = RemoteProvisionerBase.get_time_diff(self.start_time)
        if elapsed > self.yarn_resource_check_wait_time:
            self.log_and_raise(TimeoutError(
                f"Yarn Compute Resource is unavailable after {self.yarn_resource_check_wait_time} seconds"
            ))

    @property
    def has_process(self) -> bool:
        """True when either a local process or a YARN application ID is currently tracked."""
        if self.local_proc is not None:
            return True
        return self.application_id is not None

    async def poll(self) -> Optional[int]:
        """Report whether the YARN application backing this kernel is still alive.

        Submitting a new kernel/app to YARN takes a while to be ACCEPTED, so the
        application ID will probably not be available immediately.  Once an ID is known,
        any of the initial states counts as alive.

        :return: None if the application's ID is available and its state is one of the
            initial states. Otherwise 0.
        """
        if not self._get_application_id():
            return 0

        state = self._query_app_state_by_id(self.application_id)
        # No debug logging here on purpose: poll runs frequently and would flood the log.
        return None if state in YarnProvisioner.initial_states else 0

    async def send_signal(self, signum: int) -> None:
        """Dispatch a signal: 0 is treated as a poll, SIGKILL as a kill, anything else goes to the superclass.

        :param signum: the signal number to deliver
        :return: whatever the selected handler returns
        """
        if signum == 0:
            return await self.poll()

        if signum == signal.SIGKILL:
            return await self.kill()

        # Yarn api doesn't support the equivalent to interrupts, so take our chances
        # via a remote signal.  Note that this condition cannot check against the
        # signum value because alternate interrupt signals might be in play.
        return await super().send_signal(signum)

    async def kill(self, restart: bool = False) -> None:
        """Kill a kernel, first through YARN and, failing that, through a remote SIGKILL.

        :param restart: not used by this method.
        :return: None if the application reached a final state; otherwise the result of
            the superclass's signal delivery.
        """
        if self._get_application_id():
            result, state = await self.shutdown_application()
        else:
            result, state = False, None

        if result is False:
            # We couldn't terminate via Yarn, try remote signal.
            # Must use super here, else infinite
            result = await super().send_signal(signal.SIGKILL)

        self.log.debug(
            f"YarnProvisioner.kill, application ID: {self.application_id}, "
            f"kernel ID: {self.kernel_id}, state: {state}, result: {result}")
        return result

    async def terminate(self, restart: bool = False) -> None:
        """Terminate a kernel through YARN only.

        Similar to kill except that the follow-on kill step is not taken if termination is not confirmed.

        :param restart: not used by this method.
        :return: None if the application reached a final state, False otherwise.
        """
        if self._get_application_id():
            result, state = await self.shutdown_application()
        else:
            result, state = False, None

        self.log.debug(
            f"YarnProvisioner.terminate, application ID: {self.application_id}, "
            f"kernel ID: {self.kernel_id}, state: {state}, result: {result}")
        return result

    async def shutdown_application(self) -> Tuple[Optional[bool], str]:
        """Ask YARN to kill the application, then wait for it to reach a final state.

        The state is re-queried once per poll interval, for a bounded number of attempts.

        :return: a ``(result, state)`` pair where result is None if a final state was
            confirmed and False otherwise, and state is the last state observed.
        """
        self._kill_app_by_id(self.application_id)

        # Check that state has moved to a final state (most likely KILLED)
        state = self._query_app_state_by_id(self.application_id)
        attempt = 1
        while state not in YarnProvisioner.final_states and attempt <= max_poll_attempts:
            await asyncio.sleep(poll_interval)
            state = self._query_app_state_by_id(self.application_id)
            attempt += 1

        confirmed = state in YarnProvisioner.final_states
        return (None if confirmed else False), state

    async def cleanup(self, restart: bool = False):
        """Release the local launcher process, forget the application ID, then run the superclass cleanup.

        :param restart: forwarded to the superclass cleanup.
        """
        # we might have a defunct process (if using waitAppCompletion = false) - so poll, kill, wait when we have
        # a local_proc.
        proc = self.local_proc
        if proc:
            self.log.debug(
                f"YarnProvisioner.cleanup: Clearing possible defunct "
                f"process, pid={proc.pid}...")

            # NOTE(review): assuming local_proc behaves like subprocess.Popen, poll() is
            # truthy only for a non-zero exit code, so a still-running process is not
            # killed here and wait() blocks until it exits - confirm this is intended.
            if proc.poll():
                proc.kill()
            proc.wait()
            self.local_proc = None

        # reset application id to force new query - handles kernel restarts/interrupts
        self.application_id = None

        # for cleanup, we should call the superclass last
        await super().cleanup(restart=restart)

    async def confirm_remote_startup(self):
        """
        Wait until the YARN application has started and its connection info was received.

        Each iteration first enforces the launch timeout.  While no application ID is
        known, launch failures are checked for.  Once an ID is known, a post-RUNNING
        state (FINISHED, KILLED, FAILED) is reported through ``log_and_raise`` - otherwise
        the rest of the gateway would believe it is talking to a valid kernel - and
        connection info is requested as soon as a host has been assigned.

        Note: This is a complete override of the superclass method.
        """
        self.start_time = RemoteProvisionerBase.get_current_time()
        attempt = 0
        connected = False  # we're ready to connect when we have a connection file to use
        while not connected:
            attempt += 1
            await self.handle_launch_timeout()

            if not self._get_application_id(True):
                self.detect_launch_failure()
                continue

            # Once we have an application ID, start monitoring state, obtain assigned host and get connection info
            app_state = self._get_application_state()

            if app_state in YarnProvisioner.final_states:
                error_message = f"KernelID: '{self.kernel_id}', ApplicationID: '{self.application_id}' " \
                                f"unexpectedly found in state '{app_state}' during kernel startup!"
                self.log_and_raise(RuntimeError(error_message))

            self.log.debug(
                f"{attempt}: State: '{app_state}', Host: '{self.assigned_host}', "
                f"KernelID: '{self.kernel_id}', ApplicationID: '{self.application_id}'"
            )

            if self.assigned_host != '':
                connected = await self.receive_connection_info()

    async def handle_launch_timeout(self):
        """
        Sleeps one poll interval, then fails the launch if the launch timeout has been exceeded.

        When the elapsed time since ``self.start_time`` exceeds ``self.launch_timeout`` the kernel is
        killed and a ``TimeoutError`` is handed to ``log_and_raise``.  The reason in the message
        depends on how far the launch got: no application ID, an application that is not RUNNING,
        or a RUNNING application that never produced connection information.

        Note: This is a complete override of the superclass method.
        """
        await asyncio.sleep(poll_interval)
        elapsed = RemoteProvisionerBase.get_time_diff(self.start_time)

        if not elapsed > self.launch_timeout:
            return  # still inside the allowed launch window

        if not self._get_application_id(True):
            reason = (f"Application ID is None. Failed to submit a new application to YARN within "
                      f"{self.launch_timeout} seconds.  Check server log for more information.")
        elif self._query_app_state_by_id(self.application_id) != "RUNNING":
            reason = (f"YARN resources unavailable after {elapsed} seconds for "
                      f"app {self.application_id}, launch timeout: {self.launch_timeout}!  "
                      "Check YARN configuration.")
        else:
            reason = (f"App {self.application_id} is RUNNING, but waited too long "
                      f"({self.launch_timeout} secs) to get connection file.  "
                      f"Check YARN logs for more information.")

        # Tear the kernel down before reporting the timeout.
        await self.kill()
        self.log_and_raise(TimeoutError(
            f"KernelID: '{self.kernel_id}' launch timeout due to: {reason}"))

    async def get_provisioner_info(self) -> Dict:
        """Return the superclass provisioner info extended with this kernel's YARN ``application_id``."""
        info = await super().get_provisioner_info()
        info['application_id'] = self.application_id
        return info

    async def load_provisioner_info(self, provisioner_info: Dict) -> None:
        """Loads the base information necessary for kernel persistence relative to YARN clusters.

        The superclass loads its own fields first; the YARN application ID is then restored from
        the required ``'application_id'`` entry (a missing key raises ``KeyError``).

        :param provisioner_info: dictionary in the shape produced by ``get_provisioner_info``.
        """
        await super().load_provisioner_info(provisioner_info)
        self.application_id = provisioner_info['application_id']

    def _get_application_state(self) -> str:
        # Gets the current application state using the application_id already obtained.  Once the assigned host
        # has been identified, 'amHostHttpAddress' is nolonger accessed.
        app_state = self.last_known_state
        app = self._query_app_by_id(self.application_id)
        if app:
            if app.get('state'):
                app_state = app.get('state')
                self.last_known_state = app_state

            if self.assigned_host == '' and app.get('amHostHttpAddress'):
                self.assigned_host = app.get('amHostHttpAddress').split(':')[0]
                # Set the kernel manager ip to the actual host where the application landed.
                self.assigned_ip = socket.gethostbyname(self.assigned_host)

        return app_state

    def _get_application_id(self, ignore_final_states: bool = False) -> str:
        # Return the kernel's YARN application ID if available, otherwise None.  If we're obtaining application_id
        # from scratch, do not consider kernels in final states.
        if not self.application_id:
            app = self._query_app_by_name(self.kernel_id)
            state_condition = True
            if type(app) is dict:
                state = app.get('state')
                self.last_known_state = state

                if ignore_final_states:
                    state_condition = state not in YarnProvisioner.final_states

                if len(app.get('id', '')) > 0 and state_condition:
                    self.application_id = app['id']
                    time_interval = RemoteProvisionerBase.get_time_diff(
                        self.start_time)
                    self.log.info(
                        f"ApplicationID: '{app['id']}' assigned for KernelID: '{self.kernel_id}', "
                        f"state: {state}, {time_interval} seconds after starting."
                    )
            if not self.application_id:
                self.log.debug(
                    f"ApplicationID not yet assigned for KernelID: '{self.kernel_id}' - retrying..."
                )
        return self.application_id

    def _query_app_by_name(self, kernel_id: str) -> dict:
        """Retrieve application by using kernel_id as the unique app name.
        With the started_time_begin as a parameter to filter applications started earlier than the target one from YARN.
        When submit a new app, it may take a while for YARN to accept and run and generate the application ID.
        Note: if a kernel restarts with the same kernel id as app name, multiple applications will be returned.
        For now, the app/kernel with the top most application ID will be returned as the target app, assuming the app
        ID will be incremented automatically on the YARN side.

        :param kernel_id: as the unique app name for query
        :return: The JSON object of an application.
        """
        top_most_app_id = ''
        target_app = None
        try:
            response = self.resource_mgr.cluster_applications(
                started_time_begin=str(self.start_time))
        except socket.error as sock_err:
            if sock_err.errno == errno.ECONNREFUSED:
                self.log.warning(
                    f"YARN RM address: '{self.rm_addr}' refused the connection.  "
                    f"Is the resource manager running?")
            else:
                self.log.warning(
                    f"Query for kernel ID '{kernel_id}' failed with exception: "
                    f"{type(sock_err)} - '{sock_err}'.  Continuing...")
        except Exception as e:
            self.log.warning(
                f"Query for kernel ID '{kernel_id}' failed with exception: "
                f"{type(e)} - '{e}'.  Continuing...")
        else:
            data = response.data
            if type(data) is dict and type(
                    data.get("apps")) is dict and 'app' in data.get("apps"):
                for app in data['apps']['app']:
                    if app.get('name', '').find(kernel_id) >= 0 and app.get(
                            'id') > top_most_app_id:
                        target_app = app
                        top_most_app_id = app.get('id')
        return target_app

    def _query_app_by_id(self, app_id: str) -> dict:
        """Retrieve an application by application ID.

        :param app_id
        :return: The JSON object of an application.
        """
        app = None
        try:
            response = self.resource_mgr.cluster_application(
                application_id=app_id)
        except Exception as e:
            self.log.warning(
                f"Query for application ID '{app_id}' failed with exception: '{e}'.  Continuing..."
            )
        else:
            data = response.data
            if type(data) is dict and 'app' in data:
                app = data['app']

        return app

    def _query_app_state_by_id(self, app_id: str) -> str:
        """Return the state of an application. If a failure occurs, the last known state is returned.

        :param app_id:
        :return: application state (str)
        """
        state = self.last_known_state
        try:
            response = self.resource_mgr.cluster_application_state(
                application_id=app_id)
        except Exception as e:
            self.log.warning(
                f"Query for application '{app_id}' state failed with exception: '{e}'.  "
                f"Continuing with last known state = '{state}'...")
        else:
            state = response.data['state']
            self.last_known_state = state

        return state

    def _kill_app_by_id(self, app_id: str) -> dict:
        """Kill an application. If the app's state is FINISHED or FAILED, it won't be changed to KILLED.

        :param app_id
        :return: The JSON response of killing the application.
        """

        response = {}
        try:
            response = self.resource_mgr.cluster_application_kill(
                application_id=app_id)
        except Exception as e:
            self.log.warning(
                f"Termination of application '{app_id}' failed with exception: '{e}'.  Continuing..."
            )
        return response
 def test__init__(self, get_config_mock, request_mock):
     """Check that a ResourceManager built without arguments resolves its endpoint through get_config_mock.

     NOTE(review): this fragment carries no decorators; get_config_mock and request_mock are presumably
     injected by patch decorators that are not visible here - confirm against the full test class.
     """
     # The mocked lookup hands back an "https" endpoint ...
     get_config_mock.return_value = "https:localhost"
     rm = ResourceManager()
     # ... must have been invoked with exactly these positional arguments ...
     get_config_mock.assert_called_with(30, None, True)
     # ... and the resulting service URI must report HTTPS.
     self.assertEqual(rm.service_uri.is_https, True)
 def setUp(self, check_is_active_rm_mock):
     """Create the ResourceManager under test for the single address 'localhost'.

     NOTE(review): check_is_active_rm_mock is presumably injected by a patch decorator that is not
     visible on this fragment - confirm.
     """
     # Make the (mocked) active-RM check succeed before the manager is constructed.
     check_is_active_rm_mock.return_value = True
     self.rm = ResourceManager(['localhost'])
Esempio n. 27
0
class YarnClusterProcessProxy(RemoteProcessProxy):
    """Kernel lifecycle management for YARN clusters.

    The kernel is launched through a local process and then tracked as a YARN application whose
    name contains the kernel ID.  Application lookups, state queries and kills go through
    ``self.resource_mgr``; failures of those calls are logged as warnings and not raised.
    """
    # States in which poll() reports the application as alive.
    initial_states = {'NEW', 'SUBMITTED', 'ACCEPTED', 'RUNNING'}
    final_states = {'FINISHED', 'KILLED'}  # Don't include FAILED state

    def __init__(self, kernel_manager, proxy_config):
        """Read the YARN endpoint settings and create the resource manager client.

        :param kernel_manager: kernel manager owning this proxy; its ``parent.parent`` supplies the
            default ``yarn_endpoint`` and ``yarn_endpoint_security_enabled`` values.
        :param proxy_config: mapping whose 'yarn_endpoint' and 'yarn_endpoint_security_enabled'
            entries, when present, override those defaults.
        """
        super(YarnClusterProcessProxy, self).__init__(kernel_manager,
                                                      proxy_config)
        self.application_id = None
        self.yarn_endpoint \
            = proxy_config.get('yarn_endpoint',
                               kernel_manager.parent.parent.yarn_endpoint)
        self.yarn_endpoint_security_enabled \
            = proxy_config.get('yarn_endpoint_security_enabled',
                               kernel_manager.parent.parent.yarn_endpoint_security_enabled)
        # Only the hostname portion of the endpoint URL is handed to the client.
        yarn_master = urlparse(self.yarn_endpoint).hostname
        if self.yarn_endpoint_security_enabled is True:
            self.resource_mgr = ResourceManager(
                address=yarn_master,
                kerberos_enabled=self.yarn_endpoint_security_enabled)
        else:
            self.resource_mgr = ResourceManager(address=yarn_master)

    def launch_process(self, kernel_cmd, **kwargs):
        """Launches the specified process within a YARN cluster environment.

        :param kernel_cmd: command used to launch the kernel
        :param kwargs: passed through to the superclass and to ``launch_kernel``
        :return: this proxy, once ``confirm_remote_startup`` has returned
        """
        super(YarnClusterProcessProxy,
              self).launch_process(kernel_cmd, **kwargs)

        # launch the local run.sh - which is configured for yarn-cluster...
        self.local_proc = launch_kernel(kernel_cmd, **kwargs)
        self.pid = self.local_proc.pid
        self.ip = local_ip

        self.log.debug(
            "Yarn cluster kernel launched using YARN endpoint: {}, pid: {}, Kernel ID: {}, cmd: '{}'"
            .format(self.yarn_endpoint, self.local_proc.pid, self.kernel_id,
                    kernel_cmd))
        self.confirm_remote_startup()

        return self

    def poll(self):
        """Submitting a new kernel/app to YARN will take a while to be ACCEPTED.
        Thus application ID will probably not be available immediately for poll.
        So will regard the application as RUNNING when application ID still in ACCEPTED or SUBMITTED state.

        :return: None if the application's ID is available and its state is one of ``initial_states``
            (NEW/SUBMITTED/ACCEPTED/RUNNING). Otherwise False.
        """
        result = False

        if self._get_application_id():
            state = self._query_app_state_by_id(self.application_id)
            if state in YarnClusterProcessProxy.initial_states:
                result = None

        # The following produces too much output (every 3 seconds by default), so commented-out at this time.
        # self.log.debug("YarnProcessProxy.poll, application ID: {}, kernel ID: {}, state: {}".
        #               format(self.application_id, self.kernel_id, state))
        return result

    def send_signal(self, signum):
        """Currently only support 0 as poll and other as kill.

        :param signum: 0 is answered with ``poll()``, ``signal.SIGKILL`` with ``kill()``; any other
            value is delegated to the superclass.
        :return: the result of whichever of those calls handled the signal.
        """
        self.log.debug("YarnClusterProcessProxy.send_signal {}".format(signum))
        if signum == 0:
            return self.poll()
        elif signum == signal.SIGKILL:
            return self.kill()
        else:
            # Yarn api doesn't support the equivalent to interrupts, so take our chances
            # via a remote signal.  Note that this condition cannot check against the
            # signum value because alternate interrupt signals might be in play.
            return super(YarnClusterProcessProxy, self).send_signal(signum)

    def kill(self):
        """Kill a kernel.

        The kill request is followed by up to ``max_poll_attempts`` state re-queries, sleeping
        ``poll_interval`` seconds before each, until a state in ``final_states`` is seen.  The
        superclass ``kill`` is always invoked afterwards.

        :return: None if the application existed and reached a final state, False otherwise.
        """
        state = None
        result = False
        if self._get_application_id():
            resp = self._kill_app_by_id(self.application_id)
            self.log.debug(
                "YarnClusterProcessProxy.kill: kill_app_by_id({}) response: {}, confirming app state is not RUNNING"
                .format(self.application_id, resp))

            i = 1
            state = self._query_app_state_by_id(self.application_id)
            while state not in YarnClusterProcessProxy.final_states and i <= max_poll_attempts:
                time.sleep(poll_interval)
                state = self._query_app_state_by_id(self.application_id)
                i = i + 1

            if state in YarnClusterProcessProxy.final_states:
                result = None

        super(YarnClusterProcessProxy, self).kill()

        self.log.debug(
            "YarnClusterProcessProxy.kill, application ID: {}, kernel ID: {}, state: {}"
            .format(self.application_id, self.kernel_id, state))
        return result

    def cleanup(self):
        """Clear a possibly defunct local process, forget the application ID, then run superclass cleanup."""
        # we might have a defunct process (if using waitAppCompletion = false) - so poll, kill, wait when we have
        # a local_proc.
        if self.local_proc:
            self.log.debug(
                "YarnClusterProcessProxy.cleanup: Clearing possible defunct process, pid={}..."
                .format(self.local_proc.pid))
            # The superclass implementations are called explicitly so that this class's YARN-level
            # poll()/kill() overrides are bypassed here.
            if super(YarnClusterProcessProxy, self).poll():
                super(YarnClusterProcessProxy, self).kill()
            super(YarnClusterProcessProxy, self).wait()
            self.local_proc = None

        # reset application id to force new query - handles kernel restarts/interrupts
        self.application_id = None

        # for cleanup, we should call the superclass last
        super(YarnClusterProcessProxy, self).cleanup()

    def confirm_remote_startup(self):
        """ Confirms the yarn application is in a started state before returning.  Should post-RUNNING states be
            unexpectedly encountered (FINISHED, KILLED) then we must throw, otherwise the rest of the gateway will
            believe its talking to a valid kernel.
        """
        self.start_time = RemoteProcessProxy.get_current_time()
        i = 0
        ready_to_connect = False  # we're ready to connect when we have a connection file to use
        while not ready_to_connect:
            i += 1
            self.handle_timeout()

            if self._get_application_id(True):
                # Once we have an application ID, start monitoring state, obtain assigned host and get connection info
                app_state = self._get_application_state()

                if app_state in YarnClusterProcessProxy.final_states:
                    error_message = "KernelID: '{}', ApplicationID: '{}' unexpectedly found in " \
                                                     "state '{}' during kernel startup!".\
                                    format(self.kernel_id, self.application_id, app_state)
                    self.log_and_raise(http_status_code=500,
                                       reason=error_message)

                self.log.debug(
                    "{}: State: '{}', Host: '{}', KernelID: '{}', ApplicationID: '{}'"
                    .format(i, app_state, self.assigned_host, self.kernel_id,
                            self.application_id))

                if self.assigned_host != '':
                    ready_to_connect = self.receive_connection_info()
            else:
                self.detect_launch_failure()

    def _get_application_state(self):
        """Return the current application state (None if unavailable) for the known application ID.

        Once the assigned host has been identified, 'amHostHttpAddress' is no longer accessed.
        """
        app_state = None
        app = self._query_app_by_id(self.application_id)

        if app:
            if app.get('state'):
                app_state = app.get('state')
            if self.assigned_host == '' and app.get('amHostHttpAddress'):
                self.assigned_host = app.get('amHostHttpAddress').split(':')[0]
                # Set the kernel manager ip to the actual host where the application landed.
                self.assigned_ip = socket.gethostbyname(self.assigned_host)
        return app_state

    def handle_timeout(self):
        """Checks to see if the kernel launch timeout has been exceeded while awaiting connection info.

        Sleeps ``poll_interval`` seconds first.  On timeout the kernel is killed and the failure is
        reported through ``log_and_raise``: status 503 when an application exists but is not
        RUNNING, status 500 otherwise.
        """
        time.sleep(poll_interval)
        time_interval = RemoteProcessProxy.get_time_diff(
            self.start_time, RemoteProcessProxy.get_current_time())

        if time_interval > self.kernel_launch_timeout:
            reason = "Application ID is None. Failed to submit a new application to YARN within {} seconds.  " \
                     "Check Enterprise Gateway log for more information.". \
                format(self.kernel_launch_timeout)
            error_http_code = 500
            if self._get_application_id(True):
                if self._query_app_state_by_id(
                        self.application_id) != "RUNNING":
                    reason = "YARN resources unavailable after {} seconds for app {}, launch timeout: {}!  "\
                        "Check YARN configuration.".format(time_interval, self.application_id,
                                                           self.kernel_launch_timeout)
                    error_http_code = 503
                else:
                    reason = "App {} is RUNNING, but waited too long ({} secs) to get connection file.  " \
                        "Check YARN logs for more information.".format(self.application_id, self.kernel_launch_timeout)
            self.kill()
            timeout_message = "KernelID: '{}' launch timeout due to: {}".format(
                self.kernel_id, reason)
            self.log_and_raise(http_status_code=error_http_code,
                               reason=timeout_message)

    def _get_application_id(self, ignore_final_states=False):
        """Return the kernel's YARN application ID if available, otherwise None.

        :param ignore_final_states: when True (obtaining the ID from scratch), an application found
            in one of ``final_states`` is not adopted.
        """
        if not self.application_id:
            app = self._query_app_by_name(self.kernel_id)
            state_condition = True
            if type(app) is dict and ignore_final_states:
                state_condition = app.get(
                    'state') not in YarnClusterProcessProxy.final_states

            if type(app) is dict and len(app.get('id',
                                                 '')) > 0 and state_condition:
                self.application_id = app['id']
                time_interval = RemoteProcessProxy.get_time_diff(
                    self.start_time, RemoteProcessProxy.get_current_time())
                self.log.info(
                    "ApplicationID: '{}' assigned for KernelID: '{}', state: {}, {} seconds after starting."
                    .format(app['id'], self.kernel_id, app.get('state'),
                            time_interval))
            else:
                self.log.debug(
                    "ApplicationID not yet assigned for KernelID: '{}' - retrying..."
                    .format(self.kernel_id))
        return self.application_id

    def get_process_info(self):
        """Captures the base information necessary for kernel persistence relative to YARN clusters."""
        process_info = super(YarnClusterProcessProxy, self).get_process_info()
        process_info.update({'application_id': self.application_id})
        return process_info

    def load_process_info(self, process_info):
        """Loads the base information necessary for kernel persistence relative to YARN clusters."""
        super(YarnClusterProcessProxy, self).load_process_info(process_info)
        self.application_id = process_info['application_id']

    def _query_app_by_name(self, kernel_id):
        """Retrieve application by using kernel_id as the unique app name.
        With the started_time_begin as a parameter to filter applications started earlier than the target one from YARN.
        When submit a new app, it may take a while for YARN to accept and run and generate the application ID.
        Note: if a kernel restarts with the same kernel id as app name, multiple applications will be returned.
        For now, the app/kernel with the top most application ID will be returned as the target app, assuming the app
        ID will be incremented automatically on the YARN side.

        :param kernel_id: as the unique app name for query
        :return: The JSON object of an application, or None if the query failed or nothing matched.
        """
        top_most_app_id = ''
        target_app = None
        data = None
        try:
            data = self.resource_mgr.cluster_applications(
                started_time_begin=str(self.start_time)).data
        except socket.error as sock_err:
            if sock_err.errno == errno.ECONNREFUSED:
                self.log.warning(
                    "YARN end-point: '{}' refused the connection.  Is the resource manager running?"
                    .format(self.yarn_endpoint))
            else:
                self.log.warning(
                    "Query for kernel ID '{}' failed with exception: {} - '{}'.  Continuing..."
                    .format(kernel_id, type(sock_err), sock_err))
        except Exception as e:
            self.log.warning(
                "Query for kernel ID '{}' failed with exception: {} - '{}'.  Continuing..."
                .format(kernel_id, type(e), e))

        if type(data) is dict and type(
                data.get("apps")) is dict and 'app' in data.get("apps"):
            for app in data['apps']['app']:
                # An entry without an 'id' is compared as '' - comparing None with a str raises
                # TypeError on Python 3 and would abort the whole lookup.
                app_id = app.get('id') or ''
                if app.get('name', '').find(kernel_id) >= 0 and app_id > top_most_app_id:
                    target_app = app
                    top_most_app_id = app_id
        return target_app

    def _query_app_by_id(self, app_id):
        """Retrieve an application by application ID.

        :param app_id: the YARN application ID to query
        :return: The JSON object of an application, or None if the query failed or the response has
            no 'app' entry.
        """
        data = None
        try:
            data = self.resource_mgr.cluster_application(
                application_id=app_id).data
        except Exception as e:
            self.log.warning(
                "Query for application ID '{}' failed with exception: '{}'.  Continuing..."
                .format(app_id, e))
        if type(data) is dict and 'app' in data:
            return data['app']
        return None

    def _query_app_state_by_id(self, app_id):
        """Return the state of an application.

        :param app_id: the YARN application ID to query
        :return: the application state, or None when the query raised (the failure is logged).
        """
        try:
            response = self.resource_mgr.cluster_application_state(
                application_id=app_id)
        except Exception as e:
            self.log.warning(
                "Query for application '{}' state failed with exception: '{}'.  Continuing..."
                .format(app_id, e))
            # There is no response to read a state from; None is in neither initial_states nor
            # final_states, so callers treat it as "state unknown" and carry on.
            return None

        return response.data['state']

    def _kill_app_by_id(self, app_id):
        """Kill an application. If the app's state is FINISHED or FAILED, it won't be changed to KILLED.

        :param app_id: the YARN application ID to kill
        :return: The JSON response of killing the application, or None if the request raised.
        """

        response = None
        try:
            response = self.resource_mgr.cluster_application_kill(
                application_id=app_id)
        except Exception as e:
            self.log.warning(
                "Termination of application '{}' failed with exception: '{}'.  Continuing..."
                .format(app_id, e))

        return response
class ResourceManagerTestCase(TestCase):
    @patch('yarn_api_client.resource_manager.check_is_active_rm')
    def setUp(self, check_is_active_rm_mock):
        """Create the ResourceManager under test for the single address 'localhost'.

        ``check_is_active_rm`` is patched to report success so the manager can be constructed.
        """
        check_is_active_rm_mock.return_value = True
        self.rm = ResourceManager(['localhost'])

    @patch('yarn_api_client.resource_manager.get_resource_manager_endpoint')
    def test__init__(self, get_config_mock, request_mock):
        """A ResourceManager built without arguments resolves its endpoint via the patched lookup.

        The lookup must be called with ``(30, None, True)`` and, for the mocked "https" endpoint,
        the resulting service URI must report HTTPS.
        """
        get_config_mock.return_value = "https:localhost"

        manager = ResourceManager()

        get_config_mock.assert_called_with(30, None, True)
        self.assertEqual(manager.service_uri.is_https, True)

    def test_cluster_information(self, request_mock):
        """cluster_information() must request the cluster info endpoint."""
        expected_path = '/ws/v1/cluster/info'
        self.rm.cluster_information()
        request_mock.assert_called_with(expected_path)

    def test_cluster_metrics(self, request_mock):
        """cluster_metrics() must request the cluster metrics endpoint."""
        expected_path = '/ws/v1/cluster/metrics'
        self.rm.cluster_metrics()
        request_mock.assert_called_with(expected_path)

    def test_cluster_scheduler(self, request_mock):
        """cluster_scheduler() must request the cluster scheduler endpoint."""
        expected_path = '/ws/v1/cluster/scheduler'
        self.rm.cluster_scheduler()
        request_mock.assert_called_with(expected_path)

    def test_cluster_applications(self, request_mock):
        """Exercise cluster_applications() with no filters, all filters, and invalid filter values."""
        apps_path = '/ws/v1/cluster/apps'

        # Without filters an empty params dict is still passed along.
        self.rm.cluster_applications()
        request_mock.assert_called_with(apps_path, params={})

        # With every filter supplied, the snake_case keyword arguments must show up under their
        # camelCase query-parameter names, and the (single-element) list values as plain strings.
        filters = {
            'state': 'KILLED',
            'states': ['KILLED'],
            'final_status': 'FAILED',
            'user': '******',
            'queue': 'low',
            'limit': 10,
            'started_time_begin': 1,
            'started_time_end': 2,
            'finished_time_begin': 3,
            'finished_time_end': 4,
            'application_types': ['YARN'],
            'application_tags': ['apptag'],
            'de_selects': ['resouceRequests'],
        }
        expected_params = {
            'state': 'KILLED',
            'states': 'KILLED',
            'finalStatus': 'FAILED',
            'user': '******',
            'queue': 'low',
            'limit': 10,
            'startedTimeBegin': 1,
            'startedTimeEnd': 2,
            'finishedTimeBegin': 3,
            'finishedTimeEnd': 4,
            'applicationTypes': 'YARN',
            'applicationTags': 'apptag',
            'deSelects': 'resouceRequests',
        }
        self.rm.cluster_applications(**filters)
        request_mock.assert_called_with(apps_path, params=expected_params)

        # Unknown state and final-status values must be rejected.
        self.assertRaises(IllegalArgumentError,
                          self.rm.cluster_applications, states=['ololo'])
        self.assertRaises(IllegalArgumentError,
                          self.rm.cluster_applications, final_status='ololo')

    def test_cluster_application_statistics(self, request_mock):
        """cluster_application_statistics() without arguments must send empty params."""
        expected_path = '/ws/v1/cluster/appstatistics'
        self.rm.cluster_application_statistics()
        request_mock.assert_called_with(expected_path, params={})
        # TODO: test arguments

    def test_cluster_application(self, request_mock):
        """cluster_application() must request the endpoint of the given application."""
        expected_path = '/ws/v1/cluster/apps/app_1'
        self.rm.cluster_application('app_1')
        request_mock.assert_called_with(expected_path)

    def test_cluster_application_attempts(self, request_mock):
        """cluster_application_attempts() must request the application's attempts endpoint."""
        expected_path = '/ws/v1/cluster/apps/app_1/appattempts'
        self.rm.cluster_application_attempts('app_1')
        request_mock.assert_called_with(expected_path)

    def test_cluster_application_attempt_info(self, request_mock):
        """cluster_application_attempt_info() must request the endpoint of the given attempt."""
        expected_path = '/ws/v1/cluster/apps/app_1/appattempts/attempt_1'
        self.rm.cluster_application_attempt_info('app_1', 'attempt_1')
        request_mock.assert_called_with(expected_path)

    def test_cluster_application_attempt_containers(self, request_mock):
        """cluster_application_attempt_containers() must request the attempt's containers endpoint."""
        expected_path = '/ws/v1/cluster/apps/app_1/appattempts/attempt_1/containers'
        self.rm.cluster_application_attempt_containers('app_1', 'attempt_1')
        request_mock.assert_called_with(expected_path)

    def test_cluster_application_attempt_container_info(self, request_mock):
        """cluster_application_attempt_container_info() must request the given container's endpoint."""
        expected_path = '/ws/v1/cluster/apps/app_1/appattempts/attempt_1/containers/container_1'
        self.rm.cluster_application_attempt_container_info('app_1', 'attempt_1', 'container_1')
        request_mock.assert_called_with(expected_path)

    def test_cluster_application_state(self, request_mock):
        """cluster_application_state() must request the application's state endpoint."""
        expected_path = '/ws/v1/cluster/apps/app_1/state'
        self.rm.cluster_application_state('app_1')
        request_mock.assert_called_with(expected_path)

    def test_cluster_application_kill(self, request_mock):
        """cluster_application_kill() must PUT the KILLED state to the application's state endpoint."""
        expected_path = '/ws/v1/cluster/apps/app_1/state'
        expected_body = {"state": 'KILLED'}
        self.rm.cluster_application_kill('app_1')
        request_mock.assert_called_with(expected_path, 'PUT', data=expected_body)

    def test_cluster_nodes(self, request_mock):
        """Exercise cluster_nodes() with no filter, a valid states filter, and an invalid one."""
        nodes_path = '/ws/v1/cluster/nodes'

        # No filter: empty params.
        self.rm.cluster_nodes()
        request_mock.assert_called_with(nodes_path, params={})

        # A single-element states list is sent as a plain string.
        self.rm.cluster_nodes(states=['NEW'])
        request_mock.assert_called_with(nodes_path, params={"states": 'NEW'})

        # Unknown node states must be rejected.
        self.assertRaises(IllegalArgumentError, self.rm.cluster_nodes, states=['ololo'])

    def test_cluster_node(self, request_mock):
        """cluster_node() must request the endpoint of the given node."""
        expected_path = '/ws/v1/cluster/nodes/node_1'
        self.rm.cluster_node('node_1')
        request_mock.assert_called_with(expected_path)

    def test_cluster_submit_application(self, request_mock):
        """cluster_submit_application() must POST the submission payload to the apps endpoint."""
        expected_path = '/ws/v1/cluster/apps'
        self.rm.cluster_submit_application({"application-name": "dummy_application"})
        request_mock.assert_called_with(expected_path,
                                        'POST',
                                        data={"application-name": "dummy_application"})

    def test_cluster_new_application(self, request_mock):
        """cluster_new_application() must POST to the new-application endpoint."""
        expected_path = '/ws/v1/cluster/apps/new-application'
        self.rm.cluster_new_application()
        request_mock.assert_called_with(expected_path, 'POST')

    def test_cluster_get_application_queue(self, request_mock):
        """cluster_get_application_queue() must request the application's queue endpoint."""
        expected_path = '/ws/v1/cluster/apps/app_1/queue'
        self.rm.cluster_get_application_queue('app_1')
        request_mock.assert_called_with(expected_path)

    def test_cluster_change_application_queue(self, request_mock):
        """cluster_change_application_queue() must PUT the new queue to the application's queue endpoint."""
        expected_path = '/ws/v1/cluster/apps/app_1/queue'
        expected_body = {"queue": 'queue_1'}
        self.rm.cluster_change_application_queue('app_1', 'queue_1')
        request_mock.assert_called_with(expected_path, 'PUT', data=expected_body)

    def test_cluster_get_application_priority(self, request_mock):
        # Reading an application's priority is checked to request the
        # application's "priority" resource.
        priority_path = '/ws/v1/cluster/apps/app_1/priority'
        self.rm.cluster_get_application_priority('app_1')
        request_mock.assert_called_with(priority_path)

    def test_cluster_change_application_priority(self, request_mock):
        # Changing the priority is checked to PUT the new value to the
        # application's "priority" resource.
        app_id, new_priority = 'app_1', 'priority_1'
        self.rm.cluster_change_application_priority(app_id, new_priority)
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1/priority',
                                        'PUT',
                                        data={"priority": new_priority})

    @patch('yarn_api_client.hadoop_conf.parse')
    def test_cluster_node_container_memory(self, parse_mock, request_mock):
        # With yarn_api_client.hadoop_conf.parse patched to return 1024,
        # cluster_node_container_memory() is checked to return that value.
        configured_memory = 1024
        parse_mock.return_value = configured_memory
        reported_memory = self.rm.cluster_node_container_memory()
        self.assertEqual(reported_memory, configured_memory)

    # TODO: restore the disabled cluster_scheduler_queue test below.
    # def test_cluster_scheduler_queue(self, request_mock):
    #     class ResponseMock():
    #         def __init__(self, status, data):
    #             self.status = status
    #             self.data = data

    #     request_mock.return_value = ResponseMock(
    #         'OK',
    #         {
    #             'scheduler': {
    #                 'schedulerInfo': {
    #                     "queues": {
    #                         "queue": [
    #                             {
    #                                 'queueName': 'queue_1',
    #                                 'queues': {
    #                                     'queue': [
    #                                         {
    #                                             "queueName": 'queue_2',
    #                                             'queues': {
    #                                                 'queue': [
    #                                                     {
    #                                                         'queueName': 'queue_3'
    #                                                     }
    #                                                 ]
    #                                             }
    #                                         }
    #                                     ]
    #                                 }
    #                             }
    #                         ]
    #                     }
    #                 }
    #             }
    #         }
    #     )
    #     value = self.rm.cluster_scheduler_queue('queue_1')
    #     self.assertIsNotNone(value)

    #     request_mock.return_value = ResponseMock(
    #         'OK',
    #         {
    #             'scheduler': {
    #                 'schedulerInfo': {
    #                     'queueName': 'queue_1'
    #                 }
    #             }
    #         }
    #     )
    #     value = self.rm.cluster_scheduler_queue('queue_2')
    #     self.assertIsNone(value)

    def test_cluster_scheduler_queue_availability(self, request_mock):
        # With 70 as the second argument, an absoluteUsedCapacity of 90 must
        # give False and one of 50 must give True.
        # NOTE(review): 70 is presumably a capacity threshold -- confirm
        # against ResourceManager.cluster_scheduler_queue_availability.
        bound = 70
        is_available = self.rm.cluster_scheduler_queue_availability

        above = is_available({'absoluteUsedCapacity': 90}, bound)
        self.assertEqual(above, False)

        below = is_available({'absoluteUsedCapacity': 50}, bound)
        self.assertEqual(below, True)

    def test_cluster_queue_partition(self, request_mock):
        def queue_with_partitions():
            # A fresh queue description for every call, listing the
            # partitions 'label_1' and 'label_2'.
            return {
                'capacities': {
                    'queueCapacitiesByPartition': [
                        {'partitionName': label}
                        for label in ('label_1', 'label_2')
                    ]
                },
            }

        # A partition name listed in the queue gives a non-None result.
        present = self.rm.cluster_queue_partition(
            queue_with_partitions(), 'label_1')
        self.assertIsNotNone(present)

        # A partition name that is not listed gives None.
        absent = self.rm.cluster_queue_partition(
            queue_with_partitions(), 'label_3')
        self.assertIsNone(absent)

    def test_cluster_reservations(self, request_mock):
        # The five positional arguments are checked to show up, in order,
        # under the keys of the params mapping sent to the reservation list
        # resource.
        self.rm.cluster_reservations('queue_1', 'reservation_1', 0, 5, True)
        expected_params = {
            "queue": "queue_1",
            "reservation-id": "reservation_1",
            "start-time": 0,
            "end-time": 5,
            "include-resource-allocations": True,
        }
        request_mock.assert_called_with('/ws/v1/cluster/reservation/list',
                                        params=expected_params)

    def test_cluster_new_delegation_token(self, request_mock):
        # Creating a token is checked to POST the renewer name to the
        # delegation-token resource.
        renewer = 'renewer_1'
        self.rm.cluster_new_delegation_token(renewer)
        request_mock.assert_called_with('/ws/v1/cluster/delegation-token',
                                        'POST', data={"renewer": renewer})

    def test_cluster_renew_delegation_token(self, request_mock):
        # Renewal is checked to POST to the "expiration" sub-resource with
        # the token carried in the Hadoop-YARN-RM-Delegation-Token header.
        token = 'delegation_token_1'
        self.rm.cluster_renew_delegation_token(token)
        expected_headers = {"Hadoop-YARN-RM-Delegation-Token": token}
        request_mock.assert_called_with(
            '/ws/v1/cluster/delegation-token/expiration', 'POST',
            headers=expected_headers)

    def test_cluster_cancel_delegation_token(self, request_mock):
        # Cancellation is checked to DELETE the delegation-token resource
        # with the token carried in the Hadoop-YARN-RM-Delegation-Token
        # header.
        token = 'delegation_token_1'
        self.rm.cluster_cancel_delegation_token(token)
        expected_headers = {"Hadoop-YARN-RM-Delegation-Token": token}
        request_mock.assert_called_with(
            '/ws/v1/cluster/delegation-token', 'DELETE',
            headers=expected_headers)

    def test_cluster_new_reservation(self, request_mock):
        # Requesting a new reservation is checked to POST to the
        # "new-reservation" resource with no further arguments.
        expected_args = ('/ws/v1/cluster/reservation/new-reservation', 'POST')
        self.rm.cluster_new_reservation()
        request_mock.assert_called_with(*expected_args)

    def test_cluster_submit_reservation(self, request_mock):
        # The reservation document is checked to be POSTed to the "submit"
        # resource as the request data.  The expectation uses its own
        # literal so it does not alias the submitted object.
        reservation = {'reservation-id': 'reservation_1'}
        self.rm.cluster_submit_reservation(reservation)
        request_mock.assert_called_with(
            '/ws/v1/cluster/reservation/submit', 'POST',
            data={'reservation-id': 'reservation_1'})

    def test_cluster_update_reservation(self, request_mock):
        # The reservation document is checked to be POSTed to the "update"
        # resource as the request data.  The expectation uses its own
        # literal so it does not alias the submitted object.
        reservation = {'reservation-id': 'reservation_1'}
        self.rm.cluster_update_reservation(reservation)
        request_mock.assert_called_with(
            '/ws/v1/cluster/reservation/update', 'POST',
            data={'reservation-id': 'reservation_1'})

    def test_cluster_delete_reservation(self, request_mock):
        # Deleting by id is checked to POST a document carrying that id
        # under 'reservation-id' to the "delete" resource.
        reservation_id = 'reservation_1'
        self.rm.cluster_delete_reservation(reservation_id)
        request_mock.assert_called_with(
            '/ws/v1/cluster/reservation/delete', 'POST',
            data={'reservation-id': reservation_id})

    def test_cluster_application_timeouts(self, request_mock):
        # Listing an application's timeouts is checked to request the
        # application's "timeouts" resource.
        timeouts_path = '/ws/v1/cluster/apps/app_1/timeouts'
        self.rm.cluster_application_timeouts('app_1')
        request_mock.assert_called_with(timeouts_path)

    def test_cluster_application_timeout(self, request_mock):
        # Fetching one timeout is checked to append the timeout type to the
        # application's "timeouts" path.
        timeout_path = '/ws/v1/cluster/apps/app_1/timeouts/LIFETIME'
        self.rm.cluster_application_timeout('app_1', 'LIFETIME')
        request_mock.assert_called_with(timeout_path)

    def test_cluster_update_application_timeout(self, request_mock):
        # The timeout type and expiry time are checked to be wrapped in a
        # 'timeout' object and PUT to the application's "timeout" resource.
        expiry = '2016-12-05T22:51:00.104+0530'
        self.rm.cluster_update_application_timeout('app_1', 'LIFETIME', expiry)
        expected_body = {
            'timeout': {
                'type': 'LIFETIME',
                'expiryTime': expiry,
            },
        }
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1/timeout',
                                        'PUT', data=expected_body)

    def test_cluster_scheduler_conf_mutation(self, request_mock):
        # The call without arguments is checked to request the
        # scheduler-conf resource.
        conf_path = '/ws/v1/cluster/scheduler-conf'
        self.rm.cluster_scheduler_conf_mutation()
        request_mock.assert_called_with(conf_path)

    def test_cluster_modify_scheduler_conf_mutation(self, request_mock):
        def mutation():
            # A new object per call, so the expected value never aliases
            # the object that was handed to the method under test.
            return {'queue-name': 'queue_1', 'params': {'test': 'test'}}

        # The mutation document is checked to be PUT to the scheduler-conf
        # resource as the request data.
        self.rm.cluster_modify_scheduler_conf_mutation(mutation())
        request_mock.assert_called_with('/ws/v1/cluster/scheduler-conf',
                                        'PUT', data=mutation())
@patch('yarn_api_client.resource_manager.ResourceManager.request')
class ResourceManagerTestCase(TestCase):
    """Check the arguments ResourceManager passes to its ``request`` call.

    Every test method receives ``request_mock``, the mock installed by the
    class-level ``patch`` above.  Without that decorator the methods would
    be invoked with a missing positional argument.

    NOTE(review): the patch target is inferred from the ``request_mock``
    parameter name and from the module path used in ``test__init__`` --
    confirm it against yarn_api_client.
    """

    def setUp(self):
        # Each test gets its own client, built with 'localhost' as the only
        # argument.
        self.rm = ResourceManager('localhost')

    @patch('yarn_api_client.resource_manager.get_resource_manager_host_port')
    def test__init__(self, get_config_mock, request_mock):
        # Constructing the client with no arguments must call
        # get_resource_manager_host_port() with no arguments.  The method
        # level mock is passed first, the class-level one second.
        get_config_mock.return_value = (None, None)
        ResourceManager()
        get_config_mock.assert_called_with()

    def test_cluster_information(self, request_mock):
        # Expects a request for the cluster "info" resource.
        self.rm.cluster_information()
        request_mock.assert_called_with('/ws/v1/cluster/info')

    def test_cluster_metrics(self, request_mock):
        # Expects a request for the cluster "metrics" resource.
        self.rm.cluster_metrics()
        request_mock.assert_called_with('/ws/v1/cluster/metrics')

    def test_cluster_scheduler(self, request_mock):
        # Expects a request for the cluster "scheduler" resource.
        self.rm.cluster_scheduler()
        request_mock.assert_called_with('/ws/v1/cluster/scheduler')

    def test_cluster_applications(self, request_mock):
        # Without filters only the apps path is passed on.
        self.rm.cluster_applications()
        request_mock.assert_called_with('/ws/v1/cluster/apps')

        # Each snake_case filter is expected to be forwarded as a keyword
        # argument under its camelCase name.
        self.rm.cluster_applications(state='KILLED', final_status='FAILED',
                                     user='******', queue='low', limit=10,
                                     started_time_begin=1, started_time_end=2,
                                     finished_time_begin=3, finished_time_end=4)
        request_mock.assert_called_with('/ws/v1/cluster/apps', state='KILLED',
                                        finalStatus='FAILED', user='******',
                                        queue='low', limit=10,
                                        startedTimeBegin=1, startedTimeEnd=2,
                                        finishedTimeBegin=3, finishedTimeEnd=4)

        # The values 'ololo' for state and for final_status must each be
        # refused with IllegalArgumentError.
        with self.assertRaises(IllegalArgumentError):
            self.rm.cluster_applications(state='ololo')

        with self.assertRaises(IllegalArgumentError):
            self.rm.cluster_applications(final_status='ololo')

    def test_cluster_application_statistics(self, request_mock):
        # Only the no-argument form is covered here.
        self.rm.cluster_application_statistics()
        request_mock.assert_called_with('/ws/v1/cluster/appstatistics')
        # TODO: test arguments

    def test_cluster_application(self, request_mock):
        # Expects a request for the resource of the given application id.
        self.rm.cluster_application('app_1')
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1')

    def test_cluster_application_attempts(self, request_mock):
        # Expects a request for the application's "appattempts" resource.
        self.rm.cluster_application_attempts('app_1')
        request_mock.assert_called_with('/ws/v1/cluster/apps/app_1/appattempts')

    def test_cluster_nodes(self, request_mock):
        # Without filters only the nodes path is passed on.
        self.rm.cluster_nodes()
        request_mock.assert_called_with('/ws/v1/cluster/nodes')

        # The state and healthy filters are expected to be forwarded
        # unchanged as keyword arguments.
        self.rm.cluster_nodes(state='NEW', healthy='true')
        request_mock.assert_called_with('/ws/v1/cluster/nodes',
                                        state='NEW', healthy='true')

        # The healthy value 'ololo' must be refused with
        # IllegalArgumentError.
        with self.assertRaises(IllegalArgumentError):
            self.rm.cluster_nodes(state='NEW', healthy='ololo')

    def test_cluster_node(self, request_mock):
        # Expects a request for the resource of the given node id.
        self.rm.cluster_node('node_1')
        request_mock.assert_called_with('/ws/v1/cluster/nodes/node_1')