Example #1
0
class LivySession(ObjectWithGuid):
    """Client-side handle for one Livy interactive session.

    All communication with the server goes through ``http_client``; the
    object mirrors the session's id, kind, state and log text locally and
    reports lifecycle events through ``spark_events``.
    """

    def __init__(self,
                 http_client,
                 properties,
                 ipython_display,
                 session_id=-1,
                 spark_events=None,
                 heartbeat_timeout=0,
                 heartbeat_thread=None):
        """Create the local session object; no request is sent here.

        Parameters:
            http_client : client used for every session request.
            properties : dict of session properties. It must contain
                ``constants.LIVY_KIND_PARAM``. NOTE: the dict is modified in
                place - the heartbeat timeout entry is written when
                ``heartbeat_timeout > 0`` and removed otherwise.
            ipython_display : display object used for user-facing output.
            session_id : id of an already existing session, or -1 (default)
                when the session has not been started yet.
            spark_events : event emitter; a ``SparkEvents()`` is created when
                omitted.
            heartbeat_timeout : a value > 0 enables the heartbeat thread.
            heartbeat_thread : thread object to use instead of building a
                ``_HeartbeatThread``.

        Raises:
            AssertionError : the kind property is missing or the configured
                wait-for-idle timeout is not positive.
            BadUserDataException : the session kind is not supported.
        """
        super(LivySession, self).__init__()
        assert constants.LIVY_KIND_PARAM in properties
        kind = properties[constants.LIVY_KIND_PARAM]

        # The heartbeat timeout is only sent to the server when heartbeating
        # is enabled; a stale entry in the caller's properties is dropped.
        should_heartbeat = heartbeat_timeout > 0
        if should_heartbeat:
            properties[
                constants.LIVY_HEARTBEAT_TIMEOUT_PARAM] = heartbeat_timeout
        else:
            properties.pop(constants.LIVY_HEARTBEAT_TIMEOUT_PARAM, None)

        self.properties = properties
        self.ipython_display = ipython_display
        self._should_heartbeat = should_heartbeat
        self._user_passed_heartbeat_thread = heartbeat_thread

        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

        # Sleep schedule used while polling the session state.
        self._policy = ConfigurableRetryPolicy(
            retry_seconds_to_sleep_list=[0.2, 0.5, 0.5, 1, 1, 2],
            max_retries=5000)
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        assert wait_for_idle_timeout_seconds > 0

        self.logger = SparkLog(u"LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise BadUserDataException(
                u"Session of kind '{}' not supported. Session must be of kinds {}."
                .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        self._app_id = None
        self._user = None
        self._logs = u""
        self._http_client = http_client
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds
        self._printed_resource_warning = False

        self.kind = kind
        self.id = session_id
        self.session_info = u""

        self._heartbeat_thread = None
        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
        else:
            # An existing session id was supplied: treat the session as busy
            # until the next status refresh and begin heartbeating right away.
            self.status = constants.BUSY_SESSION_STATUS
            self._start_heartbeat_thread()

    def __str__(self):
        """Return a multi-line summary of the session.

        Note that building the summary queries the server for the app id and
        the app info.
        """
        return u"Session id: {}\tYARN id: {}\tKind: {}\tState: {}\n\tSpark UI: {}\n\tDriver Log: {}"\
            .format(self.id, self.get_app_id(), self.kind, self.status, self.get_spark_ui_url(), self.get_driver_log_url())

    def start(self):
        """Start the session against actual livy server.

        Posts the session, waits until it is idle, shows the session table
        and probes which SQL entry point ('spark' or 'sqlContext') exists.
        A creation-end event is emitted on both success and failure.

        Raises:
            LivyClientTimeoutException : the session did not become idle
                within ``conf.livy_session_startup_timeout_seconds()``.
            SqlContextNotFoundException : neither probe command succeeded.
            Any other exception raised along the way is re-raised after the
            failure event has been emitted.
        """
        self._spark_events.emit_session_creation_start_event(
            self.guid, self.kind)
        self._printed_resource_warning = False

        try:
            r = self._http_client.post_session(self.properties)
            self.id = r[u"id"]
            self.status = str(r[u"state"])

            self.ipython_display.writeln(u"Starting Spark application")

            # Start heartbeat thread to keep Livy interactive session alive.
            self._start_heartbeat_thread()

            # We wait for livy_session_startup_timeout_seconds() for the session to start up.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                raise LivyClientTimeoutException(
                    u"Session {} did not start up in {} seconds.".format(
                        self.id, conf.livy_session_startup_timeout_seconds()))

            html = get_sessions_info_html([self], self.id)
            self.ipython_display.html(html)

            # Prefer 'spark'; fall back to 'sqlContext' when that fails.
            command = Command("spark")
            (success, out, _) = command.execute(self)

            if success:
                self.ipython_display.writeln(
                    u"SparkSession available as 'spark'.")
                self.sql_context_variable_name = "spark"
            else:
                command = Command("sqlContext")
                (success, out, _) = command.execute(self)
                if not success:
                    raise SqlContextNotFoundException(
                        u"Neither SparkSession nor HiveContext/SqlContext is available."
                    )

                self.ipython_display.writeln(
                    u"SparkContext available as 'sc'.")
                if "hive" in out.lower():
                    self.ipython_display.writeln(
                        u"HiveContext available as 'sqlContext'.")
                else:
                    self.ipython_display.writeln(
                        u"SqlContext available as 'sqlContext'.")
                self.sql_context_variable_name = "sqlContext"
        except Exception as e:
            self._spark_events.emit_session_creation_end_event(
                self.guid, self.kind, self.id, self.status, False,
                e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(
                self.guid, self.kind, self.id, self.status, True, "", "")

    def get_app_id(self):
        """Return the session's "appId", querying the server until one is known.

        The value is cached once it is not None.
        """
        if self._app_id is None:
            self._app_id = self._http_client.get_session(self.id).get("appId")
        return self._app_id

    def get_app_info(self):
        """Return the session's "appInfo" from the server, or {} when absent."""
        appInfo = self._http_client.get_session(self.id).get("appInfo")
        return appInfo if appInfo is not None else {}

    def get_app_info_member(self, member_name):
        """Return one entry of the app info, or None when it is missing."""
        return self.get_app_info().get(member_name)

    def get_driver_log_url(self):
        """Return the "driverLogUrl" app info entry (None when missing)."""
        return self.get_app_info_member("driverLogUrl")

    def get_logs(self):
        """Fetch all session log lines, store them joined by newlines, return them."""
        log_array = self._http_client.get_all_session_logs(self.id)[u'log']
        self._logs = "\n".join(log_array)
        return self._logs

    def get_spark_ui_url(self):
        """Return the "sparkUiUrl" app info entry (None when missing)."""
        return self.get_app_info_member("sparkUiUrl")

    def get_user(self):
        """Return the session's "proxyUser" entry, else its "owner" entry.

        The value is cached once it is not None.
        """
        if self._user is None:
            session = self._http_client.get_session(self.id)
            self._user = session.get("proxyUser", session.get("owner"))
        return self._user

    @property
    def http_client(self):
        """The http client this session sends its requests through."""
        return self._http_client

    @property
    def endpoint(self):
        """The endpoint of the underlying http client."""
        return self._http_client.endpoint

    @staticmethod
    def is_final_status(status):
        """Return True when ``status`` is one of ``constants.FINAL_STATUS``."""
        return status in constants.FINAL_STATUS

    def delete(self):
        """Delete the session on the server and stop heartbeating.

        A session that was never started is not deleted; an error is shown to
        the user instead (the deletion-end event is still reported as
        successful in that case). Exceptions are re-raised after the failure
        event has been emitted.
        """
        session_id = self.id
        self._spark_events.emit_session_deletion_start_event(
            self.guid, self.kind, session_id, self.status)

        try:
            self.logger.debug(u"Deleting session '{}'".format(session_id))

            if self.status != constants.NOT_STARTED_SESSION_STATUS:
                self._http_client.delete_session(session_id)
                self._stop_heartbeat_thread()
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
            else:
                self.ipython_display.send_error(
                    u"Cannot delete session {} that is in state '{}'.".format(
                        session_id, self.status))

        except Exception as e:
            self._spark_events.emit_session_deletion_end_event(
                self.guid, self.kind, session_id, self.status, False,
                e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(
                self.guid, self.kind, session_id, self.status, True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile.

        The time budget covers everything that happens inside this call,
        including the status requests themselves, not only the sleeps between
        them. The status is always refreshed at least once, even when the
        budget is zero or negative.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.
                Defaults to the configured wait-for-idle timeout.

        Raises:
            LivyUnexpectedStatusException : the session reached a final status.
            LivyClientTimeoutException : the session was still not idle when
                the time budget ran out.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        # Fix the deadline up front so that slow status requests count
        # against the budget as well.
        deadline = time() + seconds_to_wait

        retries = 1
        while True:
            self.refresh_status_and_info()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = u"Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException(u'{} See logs:\n{}'.format(
                    error, self.get_logs()))

            if time() >= deadline:
                error = u"Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            # Show the resource-limit hint at most once per start()/wait cycle.
            if constants.YARN_RESOURCE_LIMIT_MSG in self.session_info and \
                not self._printed_resource_warning:
                self.ipython_display.send_error(constants.RESOURCE_LIMIT_WARNING\
                                                .format(conf.resource_limit_mitigation_suggestion()))
                self._printed_resource_warning = True

            sleep_time = self._policy.seconds_to_sleep(retries)
            retries += 1

            self.logger.debug(
                u"Session {} in state {}. Sleeping {} seconds.".format(
                    self.id, self.status, sleep_time))
            sleep(sleep_time)

    def sleep(self, retries):
        """Sleep for the duration the polling policy assigns to ``retries``."""
        sleep(self._policy.seconds_to_sleep(retries))

    def refresh_status_and_info(self):
        """Refresh the status and the log text with a single server call.

        On success ``self.status`` and ``self.session_info`` are updated;
        nothing is returned.

        Raises:
            LivyUnexpectedStatusException : the reported state is not one of
                ``constants.POSSIBLE_SESSION_STATUS`` (nothing is updated).
        """
        response = self._http_client.get_session(self.id)
        status = response[u'state']
        log_array = response[u'log']

        if status not in constants.POSSIBLE_SESSION_STATUS:
            raise LivyUnexpectedStatusException(
                u"Status '{}' not supported by session.".format(status))

        self.status = status
        self.session_info = u"\n".join(log_array)

    def _start_heartbeat_thread(self):
        """Start the heartbeat thread once, if heartbeating is enabled.

        Uses the thread handed to the constructor when there is one, otherwise
        builds a ``_HeartbeatThread``. The thread is marked as daemon.
        """
        if not self._should_heartbeat or self._heartbeat_thread is not None:
            return

        if self._user_passed_heartbeat_thread is None:
            # The intervals are only needed when we build the thread ourselves.
            refresh_seconds = conf.heartbeat_refresh_seconds()
            retry_seconds = conf.heartbeat_retry_seconds()
            self._heartbeat_thread = _HeartbeatThread(
                self, refresh_seconds, retry_seconds)
        else:
            self._heartbeat_thread = self._user_passed_heartbeat_thread

        self._heartbeat_thread.daemon = True
        self._heartbeat_thread.start()

    def _stop_heartbeat_thread(self):
        """Stop and forget the heartbeat thread, if one is running."""
        if self._heartbeat_thread is not None:
            self._heartbeat_thread.stop()
            self._heartbeat_thread = None

    def get_row_html(self, current_session_id):
        """Return this session as one HTML table row.

        The last cell holds a check mark when ``current_session_id`` equals
        this session's id and is empty otherwise.
        """
        is_current = (current_session_id is not None
                      and current_session_id == self.id)
        return u"""<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>""".format(
            self.id, self.get_app_id(), self.kind, self.status,
            self.get_html_link(u'Link', self.get_spark_ui_url()),
            self.get_html_link(u'Link', self.get_driver_log_url()),
            self.get_user(), u"\u2714" if is_current else u"")

    @staticmethod
    def get_html_link(text, url):
        """Return an anchor opening ``url`` in a new tab, or u"" when url is None."""
        if url is None:
            return u""
        return u"""<a target="_blank" href="{1}">{0}</a>""".format(text, url)
Example #2
0
class ReliableHttpClient(object):
    """Http client that is reliable in its requests. Uses requests library.

    Requests that fail, or that answer with a status code the caller did not
    accept, are repeated for as long as the retry policy allows.
    """

    def __init__(self, endpoint, headers, retry_policy):
        """Store the collaborators and read the SSL verification setting.

        Parameters:
            endpoint : object providing ``url``, ``authenticate``,
                ``username`` and ``password``.
            headers : headers sent with every request.
            retry_policy : object providing ``should_retry(status, error,
                retry_count)`` and ``seconds_to_sleep(retry_count)``.
        """
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy
        self.logger = SparkLog(u"ReliableHttpClient")

        self.verify_ssl = not conf.ignore_ssl_errors()
        if not self.verify_ssl:
            self.logger.debug(
                u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks."
            )
            requests.packages.urllib3.disable_warnings()

    def compose_url(self, relative_url):
        """Join the endpoint url and ``relative_url`` with exactly one slash.

        Leading and trailing slashes of ``relative_url`` are dropped, and so
        is a trailing slash of the endpoint url, so no '//' can appear at the
        seam.
        """
        r_u = "/{}".format(relative_url.strip(u"/"))
        return self._endpoint.url.rstrip(u"/") + r_u

    def get(self, relative_url, accepted_status_codes):
        """Sends a get request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.get)

    def post(self, relative_url, accepted_status_codes, data):
        """Sends a post request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.post, data)

    def delete(self, relative_url, accepted_status_codes):
        """Sends a delete request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.delete)

    def _send_request(self,
                      relative_url,
                      accepted_status_codes,
                      function,
                      data=None):
        """Resolve ``relative_url`` and send the request, starting at retry 0."""
        return self._send_request_helper(self.compose_url(relative_url),
                                         accepted_status_codes, function, data,
                                         0)

    def _send_request_helper(self, url, accepted_status_codes, function, data,
                             retry_count):
        """Call ``function`` on ``url`` until it succeeds or retries run out.

        Parameters:
            url : absolute url to request.
            accepted_status_codes : status codes that count as success.
            function : callable invoked as ``function(url, **kwargs)``.
            data : payload sent JSON-encoded, or None for no body.
            retry_count : number of retries already performed.

        Returns:
            The response whose status code is accepted.

        Raises:
            HttpClientException : the retry policy refused another attempt.
        """
        # The keyword arguments are identical for every attempt, so they are
        # built (and the payload serialized) once, outside the retry loop.
        request_kwargs = {"headers": self._headers, "verify": self.verify_ssl}
        if self._endpoint.authenticate:
            request_kwargs["auth"] = (self._endpoint.username,
                                      self._endpoint.password)
        if data is not None:
            request_kwargs["data"] = json.dumps(data)

        while True:
            failure = None
            try:
                r = function(url, **request_kwargs)
            except requests.exceptions.RequestException as e:
                # Keep the exception: the name bound by 'as' does not outlive
                # the except block on Python 3.
                failure = e
                r = None
                status = None

                self.logger.error(u"Request to '{}' failed with '{}'".format(
                    url, e))
            else:
                status = r.status_code
            error = failure is not None

            if not error and status in accepted_status_codes:
                return r

            if self._retry_policy.should_retry(status, error, retry_count):
                sleep(self._retry_policy.seconds_to_sleep(retry_count))
                retry_count += 1
                continue

            # Report the underlying request exception rather than a bare
            # 'True' so the failure can be diagnosed from the message.
            raise HttpClientException(
                u"Invalid status code '{}' or error '{}' from {}".
                format(status, failure if error else False, url))
class ReliableHttpClient(object):
    """Http client that is reliable in its requests. Uses requests library.

    Requests that fail, or that answer with a status code the caller did not
    accept, are repeated for as long as the retry policy allows.
    """

    def __init__(self, endpoint, headers, retry_policy):
        """Store the collaborators and prepare the authentication object.

        Parameters:
            endpoint : object providing ``url`` and ``auth`` (plus
                ``username`` and ``password`` for basic auth).
            headers : headers sent with every request.
            retry_policy : object providing ``should_retry(status, error,
                retry_count)`` and ``seconds_to_sleep(retry_count)``.

        Raises:
            BadUserConfigurationException : ``endpoint.auth`` is none of the
                supported auth kinds.
        """
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy

        # Always define _auth so the attribute exists for NO_AUTH endpoints too.
        self._auth = None
        auth_kind = self._endpoint.auth
        if auth_kind == constants.AUTH_KERBEROS:
            self._auth = HTTPKerberosAuth(mutual_authentication=REQUIRED)
        elif auth_kind == constants.AUTH_BASIC:
            self._auth = (self._endpoint.username, self._endpoint.password)
        elif auth_kind != constants.NO_AUTH:
            raise BadUserConfigurationException(u"Unsupported auth %s" %
                                                auth_kind)

        self.logger = SparkLog(u"ReliableHttpClient")

        self.verify_ssl = not conf.ignore_ssl_errors()
        if not self.verify_ssl:
            self.logger.debug(
                u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks."
            )
            requests.packages.urllib3.disable_warnings()

    def get_headers(self):
        """Return the headers sent with every request."""
        return self._headers

    def compose_url(self, relative_url):
        """Join the endpoint url and ``relative_url`` with exactly one slash.

        Leading and trailing slashes of ``relative_url`` are dropped, and so
        is a trailing slash of the endpoint url, so no '//' can appear at the
        seam.
        """
        r_u = "/{}".format(relative_url.strip(u"/"))
        return self._endpoint.url.rstrip(u"/") + r_u

    def get(self, relative_url, accepted_status_codes):
        """Sends a get request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.get)

    def post(self, relative_url, accepted_status_codes, data):
        """Sends a post request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.post, data)

    def delete(self, relative_url, accepted_status_codes):
        """Sends a delete request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.delete)

    def _send_request(self,
                      relative_url,
                      accepted_status_codes,
                      function,
                      data=None):
        """Resolve ``relative_url`` and send the request, starting at retry 0."""
        return self._send_request_helper(self.compose_url(relative_url),
                                         accepted_status_codes, function, data,
                                         0)

    def _send_request_helper(self, url, accepted_status_codes, function, data,
                             retry_count):
        """Call ``function`` on ``url`` until it succeeds or retries run out.

        Parameters:
            url : absolute url to request.
            accepted_status_codes : status codes that count as success.
            function : callable invoked as ``function(url, **kwargs)``.
            data : payload sent JSON-encoded, or None for no body.
            retry_count : number of retries already performed.

        Returns:
            The response whose status code is accepted.

        Raises:
            HttpClientException : the retry policy refused another attempt,
                either after a request error or after an unaccepted status.
        """
        # The keyword arguments are identical for every attempt, so they are
        # built (and the payload serialized) once, outside the retry loop.
        request_kwargs = {"headers": self._headers, "verify": self.verify_ssl}
        if self._endpoint.auth != constants.NO_AUTH:
            request_kwargs["auth"] = self._auth
        if data is not None:
            request_kwargs["data"] = json.dumps(data)

        while True:
            try:
                r = function(url, **request_kwargs)
            except requests.exceptions.RequestException as e:
                error = True
                r = None
                status = None
                text = None

                self.logger.error(u"Request to '{}' failed with '{}'".format(
                    url, e))
            else:
                error = False
                status = r.status_code
                text = r.text

            if not error and status in accepted_status_codes:
                return r

            if self._retry_policy.should_retry(status, error, retry_count):
                sleep(self._retry_policy.seconds_to_sleep(retry_count))
                retry_count += 1
                continue

            if error:
                raise HttpClientException(
                    u"Error sending http request and maximum retry encountered."
                )
            raise HttpClientException(
                u"Invalid status code '{}' from {} with error payload: {}"
                .format(status, url, text))
Example #4
0
class SparkKernelBase(IPythonKernel):
    """IPython-based kernel that routes user cells through the Spark magics.

    On construction (unless ``testing=True`` is passed) it loads the
    sparkmagic kernel magics, switches the session language and, when
    configured, registers the automatic dataframe visualization.
    """

    def __init__(self,
                 implementation,
                 implementation_version,
                 language,
                 language_version,
                 language_info,
                 session_language,
                 user_code_parser=None,
                 **kwargs):
        """Set the kernel metadata and run the startup cells.

        Parameters:
            implementation, implementation_version, language,
            language_version, language_info : kernel metadata attributes.
            session_language : language passed to the change-language magic.
            user_code_parser : object providing ``get_code_to_run(code)``;
                a ``UserCodeParser()`` is created when omitted.
            **kwargs : forwarded to the parent constructor. ``testing=True``
                skips the startup cells.
        """
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.session_language = session_language

        super(SparkKernelBase, self).__init__(**kwargs)

        self.logger = SparkLog(u"{}_jupyter_kernel".format(
            self.session_language))
        self._fatal_error = None
        self.ipython_display = IpythonDisplay()

        if user_code_parser is None:
            self.user_code_parser = UserCodeParser()
        else:
            self.user_code_parser = user_code_parser

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if not kwargs.get("testing", False):
            self._load_magics_extension()
            self._change_language()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def do_execute(self,
                   code,
                   silent,
                   store_history=True,
                   user_expressions=None,
                   allow_stdin=False):
        """Execute a user cell.

        When a fatal error has been queued it is shown again instead of
        running the cell. The work is wrapped with
        ``wrap_unexpected_exceptions``, with ``_complete_cell`` as fallback.
        """
        def f(self):
            if self._fatal_error is not None:
                return self._repeat_fatal_error()

            return self._do_execute(code, silent, store_history,
                                    user_expressions, allow_stdin)

        return wrap_unexpected_exceptions(f, self._complete_cell)(self)

    def do_shutdown(self, restart):
        """Shut the kernel down through the parent implementation."""
        # Cleanup
        # self._delete_session()
        # NOTE(review): session deletion on shutdown is commented out -
        # confirm whether sessions are meant to outlive the kernel.

        return self._do_shutdown_ipykernel(restart)

    def _do_execute(self, code, silent, store_history, user_expressions,
                    allow_stdin):
        """Translate the user's code with the code parser and execute it."""
        code_to_run = self.user_code_parser.get_code_to_run(code)

        res = self._execute_cell(code_to_run, silent, store_history,
                                 user_expressions, allow_stdin)

        return res

    def _load_magics_extension(self):
        """Load the sparkmagic kernel magics; abort with a fatal error on failure."""
        register_magics_code = "%load_ext sparkmagic.kernels"
        self._execute_cell(
            register_magics_code,
            True,
            False,
            shutdown_if_error=True,
            log_if_error="Failed to load the Spark kernels magics library.")
        self.logger.debug("Loaded magics.")

    def _change_language(self):
        """Switch the session language; abort with a fatal error on failure."""
        register_magics_code = "%%_do_not_call_change_language -l {}\n ".format(
            self.session_language)
        self._execute_cell(
            register_magics_code,
            True,
            False,
            shutdown_if_error=True,
            log_if_error="Failed to change language to {}.".format(
                self.session_language))
        self.logger.debug("Changed language.")

    def _register_auto_viz(self):
        """Register the autoviz display formatter for pandas dataframes.

        Also points autovizwidget's "events_handler" setting at the Spark
        events handler. Aborts with a fatal error on failure.
        """
        from sparkmagic.utils.sparkevents import get_spark_events_handler
        import autovizwidget.utils.configuration as c

        handler = get_spark_events_handler()
        c.override("events_handler", handler)

        register_auto_viz_code = """from autovizwidget.widget.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(
            register_auto_viz_code,
            True,
            False,
            shutdown_if_error=True,
            log_if_error="Failed to register auto viz for notebook.")
        self.logger.debug("Registered auto viz.")

    def _delete_session(self):
        """Run the delete-session magic silently, without storing history."""
        code = "%%_do_not_call_delete_session\n "
        self._execute_cell_for_user(code, True, False)

    def _execute_cell(self,
                      code,
                      silent,
                      store_history=True,
                      user_expressions=None,
                      allow_stdin=False,
                      shutdown_if_error=False,
                      log_if_error=None):
        """Execute ``code`` and optionally turn an error reply into a fatal error.

        Parameters:
            code, silent, store_history, user_expressions, allow_stdin :
                passed through to the parent ``do_execute``.
            shutdown_if_error : when True and the reply status is "error",
                a fatal error is queued and shown, and the reply of the
                completing no-op cell is returned.
            log_if_error : optional text put in front of the exception
                details in the fatal error message.

        Returns:
            The reply content of the executed cell, or the reply of the
            completing cell after a fatal error.
        """
        reply_content = self._execute_cell_for_user(code, silent,
                                                    store_history,
                                                    user_expressions,
                                                    allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            # Abort whether or not the caller supplied a message; without one
            # only the exception details are reported.
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(
                    log_if_error, error_from_reply)
            else:
                message = "Exception details:\n\t\"{}\"".format(
                    error_from_reply)
            return self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self,
                               code,
                               silent,
                               store_history=True,
                               user_expressions=None,
                               allow_stdin=False):
        """Execute ``code`` through the parent kernel's ``do_execute``."""
        return super(SparkKernelBase,
                     self).do_execute(code, silent, store_history,
                                      user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        """Delegate shutdown to the parent kernel."""
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _complete_cell(self):
        """A method that runs a cell with no effect. Call this and return the value it
        returns when there's some sort of error preventing the user's cell from executing; this
        will register the cell from the Jupyter UI as being completed."""
        return self._execute_cell("None", False, True, None, False)

    def _show_user_error(self, message):
        """Log ``message`` as an error and display it to the user."""
        self.logger.error(message)
        self.ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be shown when the next cell is executed; does not
        raise an error immediately. We use this for errors that happen on kernel startup,
        since IPython crashes if we throw an exception in the __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and shows it immediately."""
        self._queue_fatal_error(message)
        return self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Logs and displays the queued fatal error, then completes the cell.

        No exception is raised; the reply of the completing cell is returned.
        """
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self.logger.error(error)
        self.ipython_display.send_error(error)
        return self._complete_cell()
Example #5
0
class LivySession(ObjectWithGuid):
    """Client-side handle for a single Livy session.

    Keeps the session's id, kind and last known status, and talks to the Livy
    server through the supplied HTTP client. Session creation and deletion are
    reported through the spark_events object.
    """

    def __init__(self, http_client, properties, ipython_display,
                 session_id=-1, sql_created=None, spark_events=None):
        """Build a session object; no request is sent to the server here.

        Parameters:
            http_client : client used for every call to the Livy server.
            properties : dict of session properties; must contain the key u"kind".
            ipython_display : object used to write messages and errors for the user.
            session_id : id of an already existing session, or -1 for a session
                that has not been started yet.
            sql_created : whether the sqlContext already exists on the session.
                Passing True is only accepted together with a session id.
            spark_events : event emitter; a new SparkEvents() is used when None.

        Raises:
            BadUserDataException : if sql_created is True without a session id,
                or if the kind is not in constants.SESSION_KINDS_SUPPORTED.
        """
        super(LivySession, self).__init__()
        assert u"kind" in list(properties.keys())
        kind = properties[u"kind"]
        self.properties = properties
        self.ipython_display = ipython_display

        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        # Configuration sanity checks; note that asserts are skipped under -O.
        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert wait_for_idle_timeout_seconds > 0
        if session_id == -1 and sql_created is True:
            raise BadUserDataException(u"Cannot indicate sql state without session id.")

        self.logger = SparkLog(u"LivySession")

        # The kind is compared case-insensitively and stored lower-cased.
        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise BadUserDataException(u"Session of kind '{}' not supported. Session must be of kinds {}."
                                       .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        if session_id == -1:
            # No server-side session yet, so no sqlContext can exist either.
            self.status = constants.NOT_STARTED_SESSION_STATUS
            sql_created = False
        else:
            # Existing session: start from "busy" until the status is refreshed.
            self.status = constants.BUSY_SESSION_STATUS

        self._app_id = None
        self._logs = u""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

        self.kind = kind
        self.id = session_id
        self.created_sql_context = sql_created

    def __str__(self):
        """Return a one-session summary; building it queries the server for app id and URLs."""
        return u"Session id: {}\tYARN id: {}\tKind: {}\tState: {}\n\tSpark UI: {}\n\tDriver Log: {}"\
            .format(self.id, self.get_app_id(), self.kind, self.status, self.get_spark_ui_url(), self.get_driver_log_url())

    def start(self, create_sql_context=True):
        """Start the session against actual livy server.

        Posts the session properties, records the returned id and state, waits
        for the session to become idle and, if requested, creates the
        sqlContext. A creation start event is emitted first and a creation end
        event afterwards, flagged as failed when any exception occurs.

        Parameters:
            create_sql_context : when True, create the sqlContext once idle.

        Raises:
            LivyClientTimeoutException : if the session is not idle within
                conf.livy_session_startup_timeout_seconds().
            Any other exception raised along the way is re-raised unchanged.
        """
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)

        try:
            r = self._http_client.post_session(self.properties)
            self.id = r[u"id"]
            self.status = str(r[u"state"])

            self.ipython_display.writeln(u"Creating SparkContext as 'sc'")
            # We wait for livy_session_startup_timeout_seconds() for the session to start up.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                # Replace the generic timeout text with a startup-specific one.
                raise LivyClientTimeoutException(u"Session {} did not start up in {} seconds."
                                                 .format(self.id, conf.livy_session_startup_timeout_seconds()))

            if create_sql_context:
                self.create_sql_context()
        except Exception as e:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status,
                                                               False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status, True, "", "")

    def create_sql_context(self):
        """Create a sqlContext object on the session. Object will be accessible via variable 'sqlContext'.

        Does nothing when the context was already created.

        Raises:
            LivyClientTimeoutException : if executing the creation command times out.
            FailedToCreateSqlContextException : if the command reports failure.
        """
        if self.created_sql_context:
            return
        self.logger.debug(u"Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln(u"Creating HiveContext as 'sqlContext'")
        command = self._get_sql_context_creation_command()
        try:
            (success, out) = command.execute(self)
        except LivyClientTimeoutException:
            raise LivyClientTimeoutException(u"Failed to create the SqlContext in time. Timed out after {} seconds."
                                             .format(self._wait_for_idle_timeout_seconds))
        if success:
            self.created_sql_context = True
        else:
            raise FailedToCreateSqlContextException(u"Failed to create the SqlContext.\nError: '{}'".format(out))

    def get_app_id(self):
        """Return the session's "appId", asking the server until a non-None value has been cached."""
        if self._app_id is None:
            self._app_id = self._http_client.get_session(self.id).get("appId")
        return self._app_id

    def get_app_info(self):
        """Return the session's "appInfo" entry from the server, or {} when it is missing or None."""
        appInfo = self._http_client.get_session(self.id).get("appInfo")
        return appInfo if appInfo is not None else {}

    def get_app_info_member(self, member_name):
        """Return one member of the app info, or None when it is not present."""
        return self.get_app_info().get(member_name)

    def get_driver_log_url(self):
        """Return the "driverLogUrl" member of the app info (None when absent)."""
        return self.get_app_info_member("driverLogUrl")

    def get_logs(self):
        """Fetch all session logs, join the lines with newlines, remember and return the text."""
        log_array = self._http_client.get_all_session_logs(self.id)[u'log']
        self._logs = "\n".join(log_array)
        return self._logs

    def get_spark_ui_url(self):
        """Return the "sparkUiUrl" member of the app info (None when absent)."""
        return self.get_app_info_member("sparkUiUrl")

    @property
    def http_client(self):
        """The HTTP client this session uses to reach the server."""
        return self._http_client

    @staticmethod
    def is_final_status(status):
        """Return True when status is one of constants.FINAL_STATUS."""
        return status in constants.FINAL_STATUS

    def delete(self):
        """Delete the session on the server and mark this object as dead.

        A session that was never started is not deleted; an error is sent to
        the display instead. Deletion start and end events are emitted; the end
        event is flagged as failed only when an exception is raised, which is
        then re-raised.
        """
        # Keep the id locally: self.id is reset to -1 on successful deletion.
        session_id = self.id
        self._spark_events.emit_session_deletion_start_event(self.guid, self.kind, session_id, self.status)

        try:
            self.logger.debug(u"Deleting session '{}'".format(session_id))

            if self.status != constants.NOT_STARTED_SESSION_STATUS:
                self._http_client.delete_session(session_id)
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
            else:
                self.ipython_display.send_error(u"Cannot delete session {} that is in state '{}'."
                                                .format(session_id, self.status))
        except Exception as e:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, session_id, self.status, False,
                                                               e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, session_id, self.status, True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile. Calls done every status_sleep_seconds as
        indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.
                Defaults to the configured wait-for-idle timeout when None.

        Raises:
            LivyUnexpectedStatusException : if the session reaches a final status.
            LivyClientTimeoutException : if the time budget runs out first.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        while True:
            # The status is always refreshed at least once, even with a zero budget.
            self.refresh_status()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = u"Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException(u'{} See logs:\n{}'.format(error, self.get_logs()))

            if seconds_to_wait <= 0.0:
                error = u"Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            # Charge the measured duration of the logging and sleep against the budget.
            start_time = time()
            self.logger.debug(u"Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            seconds_to_wait -= time() - start_time

    def sleep(self):
        """Sleep for the configured statement sleep interval."""
        sleep(self._statement_sleep_seconds)

    def refresh_status(self):
        """Fetch the session state from the server, store it in self.status and return it.

        Raises:
            LivyUnexpectedStatusException : if the state is not in
                constants.POSSIBLE_SESSION_STATUS.
        """
        status = self._http_client.get_session(self.id)[u'state']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
        else:
            raise LivyUnexpectedStatusException(u"Status '{}' not supported by session.".format(status))

        return self.status

    def _get_sql_context_creation_command(self):
        """Return the Command that creates a HiveContext named 'sqlContext' for this session's kind.

        Raises:
            BadUserDataException : if the kind is not spark, pyspark or sparkr.
        """
        if self.kind == constants.SESSION_KIND_SPARK:
            sql_context_command = u"val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_PYSPARK:
            sql_context_command = u"from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_SPARKR:
            sql_context_command = u"sqlContext <- sparkRHive.init(sc)"
        else:
            raise BadUserDataException(u"Do not know how to create HiveContext in session of kind {}.".format(self.kind))

        return Command(sql_context_command)
Example #6
0
class ReconnectHandler(IPythonHandler):
    """Request handler whose POST points a notebook's kernel at another endpoint.

    The kernel attached to the notebook at the requested path is found,
    created or restarted, and the endpoint-change magic is executed in it.
    """

    logger = None

    @web.authenticated
    @gen.coroutine
    def post(self):
        """Handle a reconnect request.

        The request body must be JSON with 'path', 'username', 'password' and
        'endpoint'; 'auth' and 'kernelname' are optional. Responds with 400 on
        invalid JSON or a missing argument; otherwise with 200 or 500 and a JSON
        body holding 'success' and 'error'. A cluster change event is emitted
        for every outcome.
        """
        self.logger = SparkLog(u"ReconnectHandler")

        events = self._get_spark_events()

        try:
            payload = json_decode(self.request.body)
        except ValueError:
            self.set_status(400)
            invalid_json = "Invalid JSON in request body."
            self.logger.error(invalid_json)
            self.finish(invalid_json)
            events.emit_cluster_change_event(None, 400, False, invalid_json)
            return

        # Stays None in the failure event if a required argument is missing
        # before 'endpoint' itself has been read.
        endpoint = None
        try:
            path = self._get_argument_or_raise(payload, 'path')
            username = self._get_argument_or_raise(payload, 'username')
            password = self._get_argument_or_raise(payload, 'password')
            endpoint = self._get_argument_or_raise(payload, 'endpoint')
            auth = self._get_argument_if_exists(payload, 'auth')
            if auth is None:
                # No explicit auth type: infer it from whether credentials were given.
                no_credentials = username == '' and password == ''
                auth = constants.NO_AUTH if no_credentials else constants.AUTH_BASIC
        except MissingArgumentError as e:
            reason = str(e)
            self.set_status(400)
            self.finish(reason)
            self.logger.error(reason)
            events.emit_cluster_change_event(endpoint, 400, False, reason)
            return

        kernel_name = self._get_kernel_name(payload)

        # Reuse (restart) the notebook's kernel, or start a new one when there
        # is none or it is of a different kind.
        kernel_manager = yield self._get_kernel_manager(path, kernel_name)

        # Run the endpoint-change magic in the kernel.
        # NOTE(review): the values are interpolated into the magic line unquoted.
        client = kernel_manager.client()
        magic_name = KernelMagics._do_not_call_change_endpoint.__name__
        code = '%{} -s {} -u {} -p {} -t {}'.format(magic_name, endpoint, username, password, auth)
        response_id = client.execute(code, silent=False, store_history=False)
        reply = client.get_shell_msg(response_id)

        # Translate the kernel reply into an HTTP status.
        succeeded = self._msg_successful(reply)
        error = self._msg_error(reply)
        if not succeeded:
            self.logger.error(u"Code to reconnect errored out: {}".format(error))
        status_code = 200 if succeeded else 500

        # Answer the request and report the outcome.
        self.set_status(status_code)
        self.finish(json.dumps({"success": succeeded, "error": error}, sort_keys=True))
        events.emit_cluster_change_event(endpoint, status_code, succeeded, error)

    def _get_kernel_name(self, data):
        """Return the 'kernelname' from the request data, or the configured default when absent."""
        requested = self._get_argument_if_exists(data, 'kernelname')
        self.logger.debug("Kernel name is {}".format(requested))
        if requested is not None:
            return requested

        default_name = conf.server_extension_default_kernel_name()
        self.logger.debug("Defaulting to kernel name {}".format(default_name))
        return default_name

    def _get_argument_if_exists(self, data, key):
        """Return data's value for key, or None when the key is missing."""
        return data.get(key)

    def _get_argument_or_raise(self, data, key):
        """Return data[key]; a missing key is reported as MissingArgumentError(key)."""
        try:
            value = data[key]
        except KeyError:
            raise MissingArgumentError(key)
        return value

    @gen.coroutine
    def _get_kernel_manager(self, path, kernel_name):
        """Coroutine yielding the kernel manager to use for the notebook at path.

        A new session is created when no session exists for the path, or when
        the existing one runs a different kernel (that session is deleted
        first). Otherwise the existing kernel is restarted and reused.
        """
        # First session attached to this notebook path, if any.
        match = next((candidate for candidate in self.session_manager.list_sessions()
                      if candidate['notebook']['path'] == path), None)

        kernel_id = None
        if match is not None:
            session_id = match['id']
            kernel_id = match['kernel']['id']
            existing_kernel_name = match['kernel']['name']

        if kernel_id is None:
            self.logger.debug(u"Kernel not found. Starting a new kernel.")
            manager = yield self._get_kernel_manager_new_session(path, kernel_name)
        elif existing_kernel_name != kernel_name:
            self.logger.debug(u"Existing kernel name '{}' does not match requested '{}'. Starting a new kernel.".format(existing_kernel_name, kernel_name))
            self._delete_session(session_id)
            manager = yield self._get_kernel_manager_new_session(path, kernel_name)
        else:
            self.logger.debug(u"Kernel found. Restarting kernel.")
            manager = self.kernel_manager.get_kernel(kernel_id)
            manager.restart_kernel()

        raise gen.Return(manager)

    @gen.coroutine
    def _get_kernel_manager_new_session(self, path, kernel_name):
        """Coroutine that creates a notebook session and yields its kernel manager."""
        model = yield self.session_manager.create_session(kernel_name=kernel_name, path=path, type="notebook")
        new_kernel_id = model["kernel"]["id"]
        self.logger.debug("Kernel created with id {}".format(str(new_kernel_id)))
        raise gen.Return(self.kernel_manager.get_kernel(new_kernel_id))

    def _delete_session(self, session_id):
        """Delete the given session through the session manager."""
        self.session_manager.delete_session(session_id)

    def _msg_status(self, msg):
        """Return the 'status' field of a kernel message's content."""
        content = msg['content']
        return content['status']

    def _msg_successful(self, msg):
        """Return True when the message status is 'ok'."""
        return 'ok' == self._msg_status(msg)

    def _msg_error(self, msg):
        """Return 'ename:\\nevalue' for a message with status 'error', else None."""
        if self._msg_status(msg) == 'error':
            content = msg['content']
            return u'{}:\n{}'.format(content['ename'], content['evalue'])
        return None

    def _get_spark_events(self):
        """Return the handler's 'spark_events' attribute if set, else a new SparkEvents()."""
        attached = getattr(self, 'spark_events', None)
        return SparkEvents() if attached is None else attached
class ReliableHttpClient(object):
    """HTTP client on top of the requests library that retries according to a retry policy."""

    def __init__(self, endpoint, headers, retry_policy):
        """Remember the endpoint, headers and retry policy, and decide on SSL verification.

        SSL certificate verification is turned off, and urllib3 warnings are
        silenced, when the configuration asks to ignore SSL errors.
        """
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy
        self.logger = SparkLog(u"ReliableHttpClient")

        ignore_ssl_errors = conf.ignore_ssl_errors()
        self.verify_ssl = not ignore_ssl_errors
        if ignore_ssl_errors:
            self.logger.debug(u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
            requests.packages.urllib3.disable_warnings()

    def compose_url(self, relative_url):
        """Return the endpoint URL followed by '/' and relative_url without its outer slashes."""
        trimmed = relative_url.strip(u"/")
        suffix = "/{}".format(trimmed)
        return self._endpoint.url + suffix

    def get(self, relative_url, accepted_status_codes):
        """Send a GET request and return the response."""
        return self._send_request(relative_url, accepted_status_codes, requests.get)

    def post(self, relative_url, accepted_status_codes, data):
        """Send a POST request with JSON-encoded data and return the response."""
        return self._send_request(relative_url, accepted_status_codes, requests.post, data)

    def delete(self, relative_url, accepted_status_codes):
        """Send a DELETE request and return the response."""
        return self._send_request(relative_url, accepted_status_codes, requests.delete)

    def _send_request(self, relative_url, accepted_status_codes, function, data=None):
        """Resolve relative_url against the endpoint and send the request, starting at retry 0."""
        full_url = self.compose_url(relative_url)
        return self._send_request_helper(full_url, accepted_status_codes, function, data, 0)

    def _send_request_helper(self, url, accepted_status_codes, function, data, retry_count):
        """Call function(url, ...) until it yields an accepted status code.

        Credentials are sent only when the endpoint asks for authentication,
        and a body only when data is not None. A requests exception or an
        unaccepted status is retried for as long as the retry policy allows,
        sleeping between attempts.

        Raises:
            HttpClientException : when the retry policy gives up.
        """
        while True:
            try:
                request_kwargs = {"headers": self._headers, "verify": self.verify_ssl}
                if self._endpoint.authenticate:
                    request_kwargs["auth"] = (self._endpoint.username, self._endpoint.password)
                if data is not None:
                    request_kwargs["data"] = json.dumps(data)
                response = function(url, **request_kwargs)
            except requests.exceptions.RequestException as e:
                response, status, error = None, None, True
                self.logger.error(u"Request to '{}' failed with '{}'".format(url, e))
            else:
                status, error = response.status_code, False

            if not error and status in accepted_status_codes:
                return response

            if not self._retry_policy.should_retry(status, error, retry_count):
                raise HttpClientException(u"Invalid status code '{}' or error '{}' from {}"
                                          .format(status, error, url))

            sleep(self._retry_policy.seconds_to_sleep(retry_count))
            retry_count += 1
Example #8
0
class LivySession(ObjectWithGuid):
    """Client-side handle for a single Livy session, with an optional heartbeat thread.

    Keeps the session's id, kind and last known status, and talks to the Livy
    server through the supplied HTTP client. Session creation and deletion are
    reported through the spark_events object.
    """

    def __init__(self,
                 http_client,
                 properties,
                 ipython_display,
                 session_id=-1,
                 sql_created=None,
                 spark_events=None,
                 should_heartbeat=False,
                 heartbeat_thread=None):
        """Build a session object; no session is created on the server here.

        Parameters:
            http_client : client used for every call to the Livy server.
            properties : dict of session properties; must contain the key u"kind".
            ipython_display : object used to write messages and errors for the user.
            session_id : id of an already existing session, or -1 for a session
                that has not been started yet.
            sql_created : whether the sqlContext already exists on the session.
                Passing True is only accepted together with a session id.
            spark_events : event emitter; a new SparkEvents() is used when None.
            should_heartbeat : when True, a heartbeat thread is started once the
                session has an id.
            heartbeat_thread : thread object to use instead of creating a
                _HeartbeatThread.

        Raises:
            BadUserDataException : if sql_created is True without a session id,
                or if the kind is not in constants.SESSION_KINDS_SUPPORTED.
        """
        super(LivySession, self).__init__()
        assert u"kind" in list(properties.keys())
        kind = properties[u"kind"]
        self.properties = properties
        self.ipython_display = ipython_display
        self._should_heartbeat = should_heartbeat
        self._user_passed_heartbeat_thread = heartbeat_thread

        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        # Configuration sanity checks; note that asserts are skipped under -O.
        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert wait_for_idle_timeout_seconds > 0
        if session_id == -1 and sql_created is True:
            raise BadUserDataException(
                u"Cannot indicate sql state without session id.")

        self.logger = SparkLog(u"LivySession")

        # The kind is compared case-insensitively and stored lower-cased.
        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise BadUserDataException(
                u"Session of kind '{}' not supported. Session must be of kinds {}."
                .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        self._app_id = None
        self._logs = u""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

        self.kind = kind
        self.id = session_id
        self.created_sql_context = sql_created

        self._heartbeat_thread = None
        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
            # A session that does not exist yet cannot have a sqlContext. Set the
            # attribute itself: rebinding the local `sql_created` here would be a
            # dead store, because the attribute was already assigned above.
            self.created_sql_context = False
        else:
            # Existing session: start from "busy" until the status is refreshed,
            # and begin heartbeating right away if requested.
            self.status = constants.BUSY_SESSION_STATUS
            self._start_heartbeat_thread()

    def __str__(self):
        """Return a one-session summary; building it queries the server for app id and URLs."""
        return u"Session id: {}\tYARN id: {}\tKind: {}\tState: {}\n\tSpark UI: {}\n\tDriver Log: {}"\
            .format(self.id, self.get_app_id(), self.kind, self.status, self.get_spark_ui_url(), self.get_driver_log_url())

    def start(self, create_sql_context=True):
        """Start the session against actual livy server.

        Posts the session properties, records the returned id and state, starts
        the heartbeat thread (if enabled), waits for the session to become
        idle, displays the session info table and, if requested, creates the
        sqlContext. A creation start event is emitted first and a creation end
        event afterwards, flagged as failed when any exception occurs.

        Parameters:
            create_sql_context : when True, create the sqlContext once idle.

        Raises:
            LivyClientTimeoutException : if the session is not idle within
                conf.livy_session_startup_timeout_seconds().
            Any other exception raised along the way is re-raised unchanged.
        """
        self._spark_events.emit_session_creation_start_event(
            self.guid, self.kind)

        try:
            r = self._http_client.post_session(self.properties)
            self.id = r[u"id"]
            self.status = str(r[u"state"])

            self.ipython_display.writeln(u"Creating SparkContext as 'sc'")

            # Start heartbeat thread to keep Livy interactive session alive.
            self._start_heartbeat_thread()

            # We wait for livy_session_startup_timeout_seconds() for the session to start up.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                # Replace the generic timeout text with a startup-specific one.
                raise LivyClientTimeoutException(
                    u"Session {} did not start up in {} seconds.".format(
                        self.id, conf.livy_session_startup_timeout_seconds()))

            html = get_sessions_info_html([self], self.id)
            self.ipython_display.html(html)

            if create_sql_context:
                self.create_sql_context()
        except Exception as e:
            self._spark_events.emit_session_creation_end_event(
                self.guid, self.kind, self.id, self.status, False,
                e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(
                self.guid, self.kind, self.id, self.status, True, "", "")

    def create_sql_context(self):
        """Create a sqlContext object on the session. Object will be accessible via variable 'sqlContext'.

        Does nothing when the context was already created.

        Raises:
            LivyClientTimeoutException : if executing the creation command times out.
            FailedToCreateSqlContextException : if the command reports failure.
        """
        if self.created_sql_context:
            return
        self.logger.debug(u"Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln(u"Creating HiveContext as 'sqlContext'")
        command = self._get_sql_context_creation_command()
        try:
            (success, out) = command.execute(self)
        except LivyClientTimeoutException:
            raise LivyClientTimeoutException(
                u"Failed to create the SqlContext in time. Timed out after {} seconds."
                .format(self._wait_for_idle_timeout_seconds))
        if success:
            self.ipython_display.writeln(
                u"SparkContext and HiveContext created. Executing user code ..."
            )
            self.created_sql_context = True
        else:
            raise FailedToCreateSqlContextException(
                u"Failed to create the SqlContext.\nError: '{}'".format(out))

    def get_app_id(self):
        """Return the session's "appId", asking the server until a non-None value has been cached."""
        if self._app_id is None:
            self._app_id = self._http_client.get_session(self.id).get("appId")
        return self._app_id

    def get_app_info(self):
        """Return the session's "appInfo" entry from the server, or {} when it is missing or None."""
        appInfo = self._http_client.get_session(self.id).get("appInfo")
        return appInfo if appInfo is not None else {}

    def get_app_info_member(self, member_name):
        """Return one member of the app info, or None when it is not present."""
        return self.get_app_info().get(member_name)

    def get_driver_log_url(self):
        """Return the "driverLogUrl" member of the app info (None when absent)."""
        return self.get_app_info_member("driverLogUrl")

    def get_logs(self):
        """Fetch all session logs, join the lines with newlines, remember and return the text."""
        log_array = self._http_client.get_all_session_logs(self.id)[u'log']
        self._logs = "\n".join(log_array)
        return self._logs

    def get_spark_ui_url(self):
        """Return the "sparkUiUrl" member of the app info (None when absent)."""
        return self.get_app_info_member("sparkUiUrl")

    @property
    def http_client(self):
        """The HTTP client this session uses to reach the server."""
        return self._http_client

    @staticmethod
    def is_final_status(status):
        """Return True when status is one of constants.FINAL_STATUS."""
        return status in constants.FINAL_STATUS

    def delete(self):
        """Delete the session on the server, stop heartbeating and mark this object as dead.

        A session that was never started is not deleted; an error is sent to
        the display instead. Deletion start and end events are emitted; the end
        event is flagged as failed only when an exception is raised, which is
        then re-raised.
        """
        # Keep the id locally: self.id is reset to -1 on successful deletion.
        session_id = self.id
        self._spark_events.emit_session_deletion_start_event(
            self.guid, self.kind, session_id, self.status)

        try:
            self.logger.debug(u"Deleting session '{}'".format(session_id))

            if self.status != constants.NOT_STARTED_SESSION_STATUS:
                self._http_client.delete_session(session_id)
                self._stop_heartbeat_thread()
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
            else:
                self.ipython_display.send_error(
                    u"Cannot delete session {} that is in state '{}'.".format(
                        session_id, self.status))

        except Exception as e:
            self._spark_events.emit_session_deletion_end_event(
                self.guid, self.kind, session_id, self.status, False,
                e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(
                self.guid, self.kind, session_id, self.status, True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile. Calls done every status_sleep_seconds as
        indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.
                Defaults to the configured wait-for-idle timeout when None.

        Raises:
            LivyUnexpectedStatusException : if the session reaches a final status.
            LivyClientTimeoutException : if the time budget runs out first.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        while True:
            # The status is always refreshed at least once, even with a zero budget.
            self.refresh_status()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = u"Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException(u'{} See logs:\n{}'.format(
                    error, self.get_logs()))

            if seconds_to_wait <= 0.0:
                error = u"Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            # Charge the measured duration of the logging and sleep against the budget.
            start_time = time()
            self.logger.debug(
                u"Session {} in state {}. Sleeping {} seconds.".format(
                    self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            seconds_to_wait -= time() - start_time

    def sleep(self):
        """Sleep for the configured statement sleep interval."""
        sleep(self._statement_sleep_seconds)

    def refresh_status(self):
        """Fetch the session state from the server, store it in self.status and return it.

        Raises:
            LivyUnexpectedStatusException : if the state is not in
                constants.POSSIBLE_SESSION_STATUS.
        """
        status = self._http_client.get_session(self.id)[u'state']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
        else:
            raise LivyUnexpectedStatusException(
                u"Status '{}' not supported by session.".format(status))

        return self.status

    def _get_sql_context_creation_command(self):
        """Return the Command that creates a HiveContext named 'sqlContext' for this session's kind.

        Raises:
            BadUserDataException : if the kind is not spark, pyspark or sparkr.
        """
        if self.kind == constants.SESSION_KIND_SPARK:
            sql_context_command = u"val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_PYSPARK:
            sql_context_command = u"from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_SPARKR:
            sql_context_command = u"sqlContext <- sparkRHive.init(sc)"
        else:
            raise BadUserDataException(
                u"Do not know how to create HiveContext in session of kind {}."
                .format(self.kind))

        return Command(sql_context_command)

    def _start_heartbeat_thread(self):
        """Start the heartbeat thread if heartbeating is enabled and none is running yet.

        The thread passed to the constructor is used when there is one;
        otherwise a _HeartbeatThread is created from the configured refresh and
        retry intervals. The thread is marked as a daemon before it is started.
        """
        if self._should_heartbeat and self._heartbeat_thread is None:
            refresh_seconds = conf.heartbeat_refresh_seconds()
            retry_seconds = conf.heartbeat_retry_seconds()

            if self._user_passed_heartbeat_thread is None:
                self._heartbeat_thread = _HeartbeatThread(
                    self, refresh_seconds, retry_seconds)
            else:
                self._heartbeat_thread = self._user_passed_heartbeat_thread

            self._heartbeat_thread.daemon = True
            self._heartbeat_thread.start()

    def _stop_heartbeat_thread(self):
        """Stop the heartbeat thread, if any, and forget it."""
        if self._heartbeat_thread is not None:
            self._heartbeat_thread.stop()
            self._heartbeat_thread = None

    def get_row_html(self, current_session_id):
        """Return an HTML table row describing this session.

        The columns are id, app id, kind, status, Spark UI link, driver log
        link and a check mark that is shown only when current_session_id equals
        this session's id.
        """
        return u"""<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td></tr>""".format(
            self.id, self.get_app_id(), self.kind, self.status,
            self.get_html_link(u'Link', self.get_spark_ui_url()),
            self.get_html_link(u'Link', self.get_driver_log_url()),
            u"" if current_session_id is None or current_session_id != self.id
            else u"\u2714")

    @staticmethod
    def get_html_link(text, url):
        """Return an anchor tag opening url in a new tab, or u"" when url is None."""
        if url is not None:
            return u"""<a target="_blank" href="{1}">{0}</a>""".format(
                text, url)
        else:
            return u""
class LivySession(ObjectWithGuid):
    """Client-side representation of one Livy interactive session.

    Keeps the session id, kind and last known status, talks to the server
    through the supplied HTTP client, reports progress on the IPython display
    and emits creation/deletion events. A heartbeat thread is kept running
    while the session is alive when a positive heartbeat timeout was given.
    """

    def __init__(self, http_client, properties, ipython_display,
                 session_id=-1, spark_events=None,
                 heartbeat_timeout=0, heartbeat_thread=None):
        """Validate the configuration and set up the session state.

        Parameters:
            http_client : client used for every call to the server.
            properties : dict of session properties; must contain
                constants.LIVY_KIND_PARAM. It is modified in place: the
                heartbeat timeout entry is set when heartbeat_timeout > 0
                and removed otherwise.
            ipython_display : display used for user-facing output and errors.
            session_id : id of an already existing session, or -1 (default)
                for a session that still has to be started.
            spark_events : event emitter; a SparkEvents() is created when None.
            heartbeat_timeout : a value > 0 enables the heartbeat thread.
            heartbeat_thread : thread to use for heartbeating instead of a
                newly created _HeartbeatThread.

        Raises:
            AssertionError : if the kind property is missing or one of the
                configured sleep/timeout values is not positive.
            BadUserDataException : if the session kind is not supported.
        """
        super(LivySession, self).__init__()
        assert constants.LIVY_KIND_PARAM in properties
        kind = properties[constants.LIVY_KIND_PARAM]

        should_heartbeat = False
        if heartbeat_timeout > 0:
            should_heartbeat = True
            properties[constants.LIVY_HEARTBEAT_TIMEOUT_PARAM] = heartbeat_timeout
        elif constants.LIVY_HEARTBEAT_TIMEOUT_PARAM in properties:
            # Heartbeating is off: drop the entry so it is not part of the
            # properties posted by start().
            properties.pop(constants.LIVY_HEARTBEAT_TIMEOUT_PARAM)

        self.properties = properties
        self.ipython_display = ipython_display
        self._should_heartbeat = should_heartbeat
        self._user_passed_heartbeat_thread = heartbeat_thread

        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert wait_for_idle_timeout_seconds > 0

        self.logger = SparkLog(u"LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise BadUserDataException(u"Session of kind '{}' not supported. Session must be of kinds {}."
                                       .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        self._app_id = None
        self._logs = u""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds
        self._printed_resource_warning = False

        self.kind = kind
        self.id = session_id
        self.session_info = u""

        self._heartbeat_thread = None
        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
        else:
            # An id was supplied, so the session is treated as already
            # existing: mark it busy and start heartbeating right away.
            self.status = constants.BUSY_SESSION_STATUS
            self._start_heartbeat_thread()

    def __str__(self):
        """Return a multi-line summary: id, YARN id, kind, state and URLs."""
        return u"Session id: {}\tYARN id: {}\tKind: {}\tState: {}\n\tSpark UI: {}\n\tDriver Log: {}"\
            .format(self.id, self.get_app_id(), self.kind, self.status, self.get_spark_ui_url(), self.get_driver_log_url())

    def start(self):
        """Start the session against actual livy server.

        Posts the session properties, starts the heartbeat thread, waits for
        the session to become idle and then probes which SQL entry point
        exists by executing ``spark`` and, failing that, ``sqlContext``; the
        name found is stored in ``sql_context_variable_name``. A creation
        start event is emitted first, and a creation end event is emitted
        whether or not start-up succeeds.

        Raises:
            LivyClientTimeoutException : if the session is not idle within
                conf.livy_session_startup_timeout_seconds().
            SqlContextNotFoundException : if neither probe succeeds.
        """
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)
        self._printed_resource_warning = False

        try:
            r = self._http_client.post_session(self.properties)
            self.id = r[u"id"]
            self.status = str(r[u"state"])

            self.ipython_display.writeln(u"Starting Spark application")

            # Start heartbeat thread to keep Livy interactive session alive.
            self._start_heartbeat_thread()

            # We wait for livy_session_startup_timeout_seconds() for the session to start up.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                raise LivyClientTimeoutException(u"Session {} did not start up in {} seconds."
                                                 .format(self.id, conf.livy_session_startup_timeout_seconds()))

            html = get_sessions_info_html([self], self.id)
            self.ipython_display.html(html)

            command = Command("spark")
            (success, out) = command.execute(self)

            if success:
                self.ipython_display.writeln(u"SparkSession available as 'spark'.")
                self.sql_context_variable_name = "spark"
            else:
                # No 'spark' variable: fall back to probing 'sqlContext'.
                command = Command("sqlContext")
                (success, out) = command.execute(self)
                if success:
                    self.ipython_display.writeln(u"SparkContext available as 'sc'.")
                    if "hive" in out.lower():
                        self.ipython_display.writeln(u"HiveContext available as 'sqlContext'.")
                    else:
                        self.ipython_display.writeln(u"SqlContext available as 'sqlContext'.")
                    self.sql_context_variable_name = "sqlContext"
                else:
                    raise SqlContextNotFoundException(u"Neither SparkSession nor HiveContext/SqlContext is available.")
        except Exception as e:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status,
                                                               False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status, True, "", "")

    def get_app_id(self):
        """Return the "appId" of the session, querying the server until a non-None value is cached."""
        if self._app_id is None:
            self._app_id = self._http_client.get_session(self.id).get("appId")
        return self._app_id

    def get_app_info(self):
        """Return the "appInfo" entry of the session as fetched from the server, or {} when it is absent."""
        appInfo = self._http_client.get_session(self.id).get("appInfo")
        return appInfo if appInfo is not None else {}

    def get_app_info_member(self, member_name):
        """Return one member of the app info, or None when it is missing."""
        return self.get_app_info().get(member_name)

    def get_driver_log_url(self):
        """Return the "driverLogUrl" member of the app info (None if missing)."""
        return self.get_app_info_member("driverLogUrl")

    def get_logs(self):
        """Fetch all session logs, store them joined by newlines and return them."""
        log_array = self._http_client.get_all_session_logs(self.id)[u'log']
        self._logs = "\n".join(log_array)
        return self._logs

    def get_spark_ui_url(self):
        """Return the "sparkUiUrl" member of the app info (None if missing)."""
        return self.get_app_info_member("sparkUiUrl")

    @property
    def http_client(self):
        """The HTTP client this session uses to reach the server."""
        return self._http_client

    @staticmethod
    def is_final_status(status):
        """Return True when ``status`` is one of constants.FINAL_STATUS."""
        return status in constants.FINAL_STATUS

    def delete(self):
        """Delete the session on the server and reset the local state.

        A session that was never started is not deleted; an error is sent to
        the display instead. Otherwise the session is deleted through the
        HTTP client, the heartbeat thread is stopped, the status becomes
        constants.DEAD_SESSION_STATUS and the id is reset to -1. Deletion
        start and end events are emitted around the operation; any exception
        is reported in the end event and re-raised.
        """
        session_id = self.id
        self._spark_events.emit_session_deletion_start_event(self.guid, self.kind, session_id, self.status)

        try:
            self.logger.debug(u"Deleting session '{}'".format(session_id))

            if self.status != constants.NOT_STARTED_SESSION_STATUS:
                self._http_client.delete_session(session_id)
                self._stop_heartbeat_thread()
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
            else:
                self.ipython_display.send_error(u"Cannot delete session {} that is in state '{}'."
                                                .format(session_id, self.status))

        except Exception as e:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, session_id, self.status, False,
                                                               e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, session_id, self.status, True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile. Calls done every status_sleep_seconds as
        indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up;
                defaults to the configured wait-for-idle timeout.

        Raises:
            LivyUnexpectedStatusException : if the session reaches a final
                status; the message includes the session logs.
            LivyClientTimeoutException : if the time budget is used up
                before the session becomes idle.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        while True:
            self.refresh_status_and_info()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = u"Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException(u'{} See logs:\n{}'.format(error, self.get_logs()))

            if seconds_to_wait <= 0.0:
                error = u"Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            # Show the resource-limit warning at most once per start().
            if constants.YARN_RESOURCE_LIMIT_MSG in self.session_info and \
                    not self._printed_resource_warning:
                self.ipython_display.send_error(constants.RESOURCE_LIMIT_WARNING
                                                .format(conf.resource_limit_mitigation_suggestion()))
                self._printed_resource_warning = True

            # Charge the time actually slept against the remaining budget.
            start_time = time()
            self.logger.debug(u"Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            seconds_to_wait -= time() - start_time

    def sleep(self):
        """Sleep for the configured statement sleep interval."""
        sleep(self._statement_sleep_seconds)

    def refresh_status_and_info(self):
        """Refresh the status and the session log lines in a single server call.

        The results are stored in ``self.status`` and ``self.session_info``
        (log lines joined by newlines); nothing is returned.

        Raises:
            LivyUnexpectedStatusException : if the reported state is not in
                constants.POSSIBLE_SESSION_STATUS. The stored status and
                info are left unchanged in that case.
        """
        response = self._http_client.get_session(self.id)
        status = response[u'state']
        log_array = response[u'log']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
            self.session_info = u"\n".join(log_array)
        else:
            raise LivyUnexpectedStatusException(u"Status '{}' not supported by session.".format(status))

    def _start_heartbeat_thread(self):
        """Start the heartbeat thread if heartbeating is enabled and none is running.

        The thread passed to the constructor is used when there is one;
        otherwise a _HeartbeatThread is created from the configured refresh
        and retry intervals. The thread is marked as a daemon before start.
        """
        if self._should_heartbeat and self._heartbeat_thread is None:
            refresh_seconds = conf.heartbeat_refresh_seconds()
            retry_seconds = conf.heartbeat_retry_seconds()

            if self._user_passed_heartbeat_thread is None:
                self._heartbeat_thread = _HeartbeatThread(self, refresh_seconds, retry_seconds)
            else:
                self._heartbeat_thread = self._user_passed_heartbeat_thread

            self._heartbeat_thread.daemon = True
            self._heartbeat_thread.start()

    def _stop_heartbeat_thread(self):
        """Stop the heartbeat thread, if one is attached, and forget it."""
        if self._heartbeat_thread is not None:
            self._heartbeat_thread.stop()
            self._heartbeat_thread = None

    def get_row_html(self, current_session_id):
        """Render this session as one ``<tr>`` row of the sessions table.

        Parameters:
            current_session_id : id of the session currently in use, or None.
                The last cell holds a check mark only when it equals this
                session's id.
        """
        return u"""<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td></tr>""".format(
            self.id, self.get_app_id(), self.kind, self.status,
            self.get_html_link(u'Link', self.get_spark_ui_url()), self.get_html_link(u'Link', self.get_driver_log_url()),
            u"" if current_session_id is None or current_session_id != self.id else u"\u2714"
        )

    @staticmethod
    def get_html_link(text, url):
        """Return an anchor opening ``url`` in a new tab, or u"" when ``url`` is None."""
        if url is not None:
            return u"""<a target="_blank" href="{1}">{0}</a>""".format(text, url)
        else:
            return u""
class SessionManager(object):
    """In-memory registry mapping session names to session objects.

    Sessions are added under a unique name, looked up by name (or by id and
    endpoint) and deleted through their own ``delete()`` method. When enabled
    in the configuration, all registered sessions are cleaned up at
    interpreter exit.
    """

    def __init__(self, ipython_display):
        """Create an empty registry and register the exit hook if configured.

        Parameters:
            ipython_display : display used for user-facing messages.
        """
        self.logger = SparkLog(u"SessionManager")
        self.ipython_display = ipython_display

        self._sessions = dict()

        self._register_cleanup_on_exit()

    @property
    def sessions(self):
        """The underlying name -> session dict (not a copy)."""
        return self._sessions

    def get_sessions_list(self):
        """Return the registered session names as a new list."""
        return list(self._sessions.keys())

    def get_sessions_info(self):
        """Return one "Name: <name>\\t<str(session)>" line per registered session."""
        return [u"Name: {}\t{}".format(name, str(session)) for name, session in self._sessions.items()]

    def add_session(self, name, session):
        """Register ``session`` under ``name``.

        Raises:
            SessionManagementException : if the name is already taken.
        """
        if name in self._sessions:
            raise SessionManagementException(u"Session with name '{}' already exists. Please delete the session"
                                             u" first if you intend to replace it.".format(name))

        self._sessions[name] = session

    def get_any_session(self):
        """Return the only registered session.

        Raises:
            SessionManagementException : if there is no session at all, or
                more than one so the choice would be ambiguous.
        """
        number_of_sessions = len(self._sessions)
        if number_of_sessions == 1:
            key = self.get_sessions_list()[0]
            return self._sessions[key]
        elif number_of_sessions == 0:
            raise SessionManagementException(u"You need to have at least 1 client created to execute commands.")
        else:
            raise SessionManagementException(u"Please specify the client to use. Possible sessions are {}".format(
                self.get_sessions_list()))

    def get_session(self, name):
        """Return the session registered under ``name``.

        Raises:
            SessionManagementException : if no such session exists.
        """
        if name in self._sessions:
            return self._sessions[name]
        raise SessionManagementException(u"Could not find '{}' session in list of saved sessions. Possible sessions are {}".format(
            name, self.get_sessions_list()))

    def get_session_id_for_client(self, name):
        """Return the id of the session registered under ``name``, or None if there is none."""
        # Test the dict directly instead of building a list of all names.
        if name in self._sessions:
            return self._sessions[name].id
        return None

    def get_session_name_by_id_endpoint(self, id, endpoint):
        """Return the name of the session with the given id and endpoint, or None.

        ``id`` is converted with int() before it is compared to each
        session's id.
        """
        for (name, session) in self._sessions.items():
            if session.id == int(id) and session.endpoint == endpoint:
                return name
        return None

    def delete_client(self, name):
        """Delete the session registered under ``name`` and drop it from the registry.

        Raises:
            SessionManagementException : if no such session exists.
        """
        self._remove_session(name)

    def clean_up_all(self):
        """Delete and unregister every session; stops at the first deletion that raises."""
        # Iterate over a snapshot of the names: entries are removed as we go.
        for name in self.get_sessions_list():
            self._remove_session(name)

    def _remove_session(self, name):
        """Delete the named session, then remove it from the registry.

        The entry is kept when ``delete()`` raises.

        Raises:
            SessionManagementException : if no such session exists.
        """
        if name in self._sessions:
            self._sessions[name].delete()
            del self._sessions[name]
        else:
            raise SessionManagementException(u"Could not find '{}' session in list of saved sessions. Possible sessions are {}"
                                             .format(name, self.get_sessions_list()))

    def _register_cleanup_on_exit(self):
        """
        Stop the livy sessions before python process exits for any reason (if enabled in conf)
        """
        if conf.cleanup_all_sessions_on_exit():
            def cleanup_spark_sessions():
                # Best effort: a failure during exit is logged, never raised.
                try:
                    self.clean_up_all()
                except Exception as e:
                    self.logger.error(u"Error cleaning up sessions on exit: {}".format(e))
            atexit.register(cleanup_spark_sessions)
            self.ipython_display.writeln(u"Cleaning up livy sessions on exit is enabled")
Example #11
0
class SparkKernelBase(IPythonKernel):
    """IPython kernel base that prepares the sparkmagic kernel magics.

    At start-up it loads the ``sparkmagic.kernels`` extension, switches the
    session language and optionally registers the auto-viz DataFrame
    formatter. User cells are passed through the user code parser before
    being handed to the parent kernel.
    """

    def __init__(self, implementation, implementation_version, language, language_version, language_info,
                 session_language, user_code_parser=None, **kwargs):
        """Store the kernel metadata and run the start-up cells.

        The start-up cells are skipped when ``kwargs`` contains a truthy
        ``testing`` entry. A ``UserCodeParser()`` is used when no
        ``user_code_parser`` is supplied.
        """
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.session_language = session_language

        super(SparkKernelBase, self).__init__(**kwargs)

        self.logger = SparkLog(u"{}_jupyter_kernel".format(self.session_language))
        self._fatal_error = None
        self.ipython_display = IpythonDisplay()

        self.user_code_parser = UserCodeParser() if user_code_parser is None else user_code_parser

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if kwargs.get("testing", False):
            return

        self._load_magics_extension()
        self._change_language()
        if conf.use_auto_viz():
            self._register_auto_viz()

    def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        """Execute a user cell, replaying a queued fatal error instead if there is one.

        The work is wrapped with ``wrap_unexpected_exceptions``, with
        ``_complete_cell`` passed as its second argument.
        """
        def f(self):
            if self._fatal_error is None:
                return self._do_execute(code, silent, store_history, user_expressions, allow_stdin)

            return self._repeat_fatal_error()

        guarded = wrap_unexpected_exceptions(f, self._complete_cell)
        return guarded(self)

    def do_shutdown(self, restart):
        """Delete the session, then let the parent kernel shut down."""
        # Cleanup
        self._delete_session()

        return self._do_shutdown_ipykernel(restart)

    def _do_execute(self, code, silent, store_history, user_expressions, allow_stdin):
        """Run ``code`` after it has been rewritten by the user code parser."""
        parsed_code = self.user_code_parser.get_code_to_run(code)
        return self._execute_cell(parsed_code, silent, store_history, user_expressions, allow_stdin)

    def _load_magics_extension(self):
        """Load the ``sparkmagic.kernels`` extension; a failure becomes a fatal error."""
        load_ext_code = "%load_ext sparkmagic.kernels"
        self._execute_cell(load_ext_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark kernels magics library.")
        self.logger.debug("Loaded magics.")

    def _change_language(self):
        """Run the internal change-language magic for ``session_language``; a failure becomes a fatal error."""
        change_language_code = "%%_do_not_call_change_language -l {}\n ".format(self.session_language)
        failure_message = "Failed to change language to {}.".format(self.session_language)
        self._execute_cell(change_language_code, True, False, shutdown_if_error=True,
                           log_if_error=failure_message)
        self.logger.debug("Changed language.")

    def _register_auto_viz(self):
        """Install the events handler override and register ``display_dataframe`` for pandas DataFrames."""
        from sparkmagic.utils.sparkevents import get_spark_events_handler
        import autovizwidget.utils.configuration as c

        c.override("events_handler", get_spark_events_handler())

        register_auto_viz_code = """from autovizwidget.widget.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(register_auto_viz_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to register auto viz for notebook.")
        self.logger.debug("Registered auto viz.")

    def _delete_session(self):
        """Run the internal delete-session magic silently, without storing history."""
        self._execute_cell_for_user("%%_do_not_call_delete_session\n ", True, False)

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False,
                      shutdown_if_error=False, log_if_error=None):
        """Execute ``code`` and optionally turn an error reply into a fatal error.

        The reply is returned unchanged unless ``shutdown_if_error`` is set,
        the reply status is "error" and ``log_if_error`` is given; in that
        case a fatal error built from ``log_if_error`` and the reply's
        "evalue" is queued and reported, and the result of that is returned.
        """
        reply = self._execute_cell_for_user(code, silent, store_history, user_expressions, allow_stdin)

        if not (shutdown_if_error and reply[u"status"] == u"error"):
            return reply

        error_value = reply[u"evalue"]
        if log_if_error is None:
            return reply

        message = "{}\nException details:\n\t\"{}\"".format(log_if_error, error_value)
        return self._abort_with_fatal_error(message)

    def _execute_cell_for_user(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        """Delegate directly to the parent kernel's ``do_execute``."""
        parent = super(SparkKernelBase, self)
        return parent.do_execute(code, silent, store_history, user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        """Delegate directly to the parent kernel's ``do_shutdown``."""
        parent = super(SparkKernelBase, self)
        return parent.do_shutdown(restart)

    def _complete_cell(self):
        """A method that runs a cell with no effect. Call this and return the value it
        returns when there's some sort of error preventing the user's cell from executing; this
        will register the cell from the Jupyter UI as being completed."""
        return self._execute_cell("None", False, True, None, False)

    def _show_user_error(self, message):
        """Log ``message`` as an error and send it to the display."""
        self.logger.error(message)
        self.ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed; does not
        raise an error immediately. We use this for errors that happen on kernel startup,
        since IPython crashes if we throw an exception in the __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        return self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        formatted = conf.fatal_error_suggestion().format(self._fatal_error)
        self.logger.error(formatted)
        self.ipython_display.send_error(formatted)
        return self._complete_cell()
class SparkKernelBase(IPythonKernel):
    """IPython kernel base that prepares sparkmagic and a Livy session at start-up.

    On construction it loads the ``sparkmagic.kernels`` and
    ``sparkmagic.magics`` extensions, switches the session language,
    initialises a Livy session through the kernel magics and optionally
    registers the auto-viz DataFrame formatter. It also offers a SQL
    statement filter, Livy-backed code completion and an apply-request hook.
    """

    def __init__(self, implementation, implementation_version, language, language_version, language_info,
                 session_language, user_code_parser=None, **kwargs):
        """Store the kernel metadata and run the start-up cells.

        The start-up cells are skipped when ``kwargs`` contains a truthy
        ``testing`` entry. A ``UserCodeParser()`` is used when no
        ``user_code_parser`` is supplied.
        """
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.session_language = session_language

        super(SparkKernelBase, self).__init__(**kwargs)

        self.logger = SparkLog(u"{}_jupyter_kernel".format(self.session_language))
        self._fatal_error = None
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)
        if user_code_parser is None:
            self.user_code_parser = UserCodeParser()
        else:
            self.user_code_parser = user_code_parser

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if not kwargs.get("testing", False):
            self._load_magics_extension()
            self._change_language()
            # Initialise sparkmagic.magics and the Livy session at kernel start-up.
            self._load_spark_magics_extension()
            self._init_livy_session()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def _is_sql_filter(self, code):
        """Tell whether ``code`` contains a statement blocked in SQL-restricted mode.

        Only active when ``conf.is_sql_restrict()`` is true. The check is
        case-insensitive and flags ``show databases`` and ``use <name>``
        wherever they appear as separate words in the cell.

        Returns:
            True when the cell must not be executed, False otherwise.
        """
        if not conf.is_sql_restrict():
            return False

        lowered = code.lower()

        # The word boundary keeps "show"/"use" from matching inside longer
        # words (e.g. "because x", "house prices").
        if re.search(r'\bshow\s+databases', lowered):
            return True

        if re.search(r'\buse\s+', lowered):
            return True

        return False

    def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        """Execute a user cell unless it is filtered or a fatal error is queued.

        A cell rejected by ``_is_sql_filter`` only produces a message on the
        display and is completed with a no-op cell. The work is wrapped with
        ``wrap_unexpected_exceptions``, with ``_complete_cell`` passed as its
        second argument.
        """
        def f(self):
            if self._is_sql_filter(code):
                self.ipython_display.write("已为您选择好专属数据库, 直接使用show tables 试试看")
                return self._complete_cell()

            if self._fatal_error is not None:
                return self._repeat_fatal_error()

            return self._do_execute(code, silent, store_history, user_expressions, allow_stdin)
        return wrap_unexpected_exceptions(f, self._complete_cell)(self)

    def do_shutdown(self, restart):
        """Delete the session, then let the parent kernel shut down."""
        # Cleanup
        self._delete_session()

        return self._do_shutdown_ipykernel(restart)

    def _do_execute(self, code, silent, store_history, user_expressions, allow_stdin):
        """Run ``code`` after it has been rewritten by the user code parser."""
        code_to_run = self.user_code_parser.get_code_to_run(code)

        res = self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)

        return res

    def _load_magics_extension(self):
        """Load the ``sparkmagic.kernels`` extension; a failure becomes a fatal error."""
        register_magics_code = "%load_ext sparkmagic.kernels"
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark kernels magics library.")
        self.logger.debug("Loaded magics.")

    def _load_spark_magics_extension(self):
        """Initialise the Spark magics, equivalent to running ``%load_ext sparkmagic.magics``.

        A failure becomes a fatal error.
        """
        register_spark_magics_code = "%load_ext sparkmagic.magics"
        self._execute_cell(register_spark_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark Magics library.")
        self.logger.debug("Loaded sparkmagic.magics")

    def _change_language(self):
        """Run the internal change-language magic for ``session_language``; a failure becomes a fatal error."""
        register_magics_code = "%%_do_not_call_change_language -l {}\n ".format(self.session_language)
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to change language to {}.".format(self.session_language))
        self.logger.debug("Changed language.")

    def _init_livy_session(self):
        """Initialise the Livy session through the kernel magics.

        Session initialisation is deliberately not carried out by this class
        itself; it is delegated to the kernel magics by running the internal
        init-session magic. A failure becomes a fatal error.
        """
        register_magics_code = "%%_do_not_call_init_livy_session -i {}\n ".format(self.session_language)
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to init livy session: {}.".format(self.session_language))
        self.logger.debug("Init livy session.")

    def _register_auto_viz(self):
        """Install the events handler override and register ``display_dataframe`` for pandas DataFrames."""
        from sparkmagic.utils.sparkevents import get_spark_events_handler
        import autovizwidget.utils.configuration as c

        handler = get_spark_events_handler()
        c.override("events_handler", handler)

        register_auto_viz_code = """from autovizwidget.widget.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(register_auto_viz_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to register auto viz for notebook.")
        self.logger.debug("Registered auto viz.")

    def _delete_session(self):
        """Run the internal delete-session magic silently, without storing history."""
        code = "%%_do_not_call_delete_session\n "
        self._execute_cell_for_user(code, True, False)

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False,
                      shutdown_if_error=False, log_if_error=None):
        """Execute ``code`` and optionally turn an error reply into a fatal error.

        The reply is returned unchanged unless ``shutdown_if_error`` is set,
        the reply status is "error" and ``log_if_error`` is given; in that
        case a fatal error built from ``log_if_error`` and the reply's
        "evalue" is queued and reported, and the result of that is returned.
        """
        reply_content = self._execute_cell_for_user(code, silent, store_history, user_expressions, allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(log_if_error, error_from_reply)
                return self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        """Delegate directly to the parent kernel's ``do_execute``."""
        return super(SparkKernelBase, self).do_execute(code, silent, store_history, user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        """Delegate directly to the parent kernel's ``do_shutdown``."""
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _complete_cell(self):
        """A method that runs a cell with no effect. Call this and return the value it
        returns when there's some sort of error preventing the user's cell from executing; this
        will register the cell from the Jupyter UI as being completed."""
        return self._execute_cell("None", False, True, None, False)

    def _show_user_error(self, message):
        """Log ``message`` as an error and send it to the display."""
        self.logger.error(message)
        self.ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed; does not
        raise an error immediately. We use this for errors that happen on kernel startup,
        since IPython crashes if we throw an exception in the __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        return self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self.logger.error(error)
        self.ipython_display.send_error(error)
        return self._complete_cell()

    @gen.coroutine
    def complete_request(self, stream, ident, parent):
        """Handle a completion request message and send the ``complete_reply``.

        Reads ``code`` and ``cursor_pos`` from the message content, awaits
        ``do_complete`` and sends the JSON-cleaned result on ``stream``.
        """
        content = parent['content']
        code = content['code']
        cursor_pos = content['cursor_pos']

        matches = yield gen.maybe_future(self.do_complete(code, cursor_pos))
        matches = json_clean(matches)
        self.session.send(stream, 'complete_reply', matches, parent, ident)

    def _experimental_do_complete(self, code, cursor_pos):
        """
        Experimental completions from IPython, using livy completion.

        Looks up the Livy session belonging to this kernel by name and, when
        one is found, asks the server for completion candidates for the line
        under the cursor.

        Parameters:
            code : text of the cell; surrounding whitespace is stripped.
            cursor_pos : cursor offset in the cell, or None for end of text.

        Returns:
            A completion reply dict with ``matches``, ``cursor_start``,
            ``cursor_end``, ``metadata`` and ``status``.
        """
        # NOTE(review): cursor_pos refers to the unstripped text; when the
        # cell has leading or trailing whitespace the offset may no longer
        # line up with `code` - confirm against the frontend's behaviour.
        code = code.strip()
        if cursor_pos is None:
            cursor_pos = len(code)

        matches = []
        with provisionalcompleter():
            session_name = self.spark_controller.generate_livy_session_name(id(self))

            endpoint = build_endpoint(self.session_language)
            session_info_list = self.spark_controller.get_all_sessions_endpoint(endpoint)
            session_id = None
            for session in session_info_list:
                if session.session_name == session_name:
                    session_id = session.id

            # Compare against None explicitly: 0 is a valid session id and
            # must not be mistaken for "no session found".
            if session_id is not None:
                # Only complete the cursor_line
                cursor_line, cursor_column = position_to_cursor(code, cursor_pos)
                lines = code.split("\n")
                completion_line = lines[cursor_line]
                before_lines = lines[:cursor_line]
                if len(lines) > 1 and cursor_line > 0:
                    # Offset within the cursor line: drop the preceding
                    # lines plus the newline that ends them.
                    real_cursor_pos = cursor_pos - len("\n".join(before_lines)) - 1
                else:
                    real_cursor_pos = cursor_pos

                http_client = self.spark_controller._http_client(endpoint)
                kind = conf.get_livy_kind(self.session_language)
                res_completions = http_client.post_completion(session_id, kind, completion_line, real_cursor_pos)
                matches = res_completions.get("candidates", [])

        if matches:
            s = self.__get_cursor_start(code, cursor_pos, matches[0])
        else:
            s = cursor_pos

        res = {
            'matches': matches,
            'cursor_end': cursor_pos,
            'cursor_start': s,
            'metadata': {},
            'status': 'ok'
        }
        return res

    def __get_cursor_start(self, code, cursor_pos, match):
        """Return the offset where the text being completed starts.

        Walks backwards from the cursor over word characters, collecting at
        most ``len(match)`` of them, and returns the position at which the
        collected text is a prefix of ``match``. Falls back to ``cursor_pos``
        when no such prefix is found.
        """
        before_code = code[:cursor_pos]
        before_code_rev = before_code[::-1]
        bucket = []
        for c in before_code_rev:
            if len(bucket) >= len(match):
                break

            if re.match(r"\w", c):
                bucket.insert(0, c)
            else:
                break

            # Only a character equal to the first character of the match
            # can be the start of the typed prefix.
            if c == match[0]:
                bucket_len = len(bucket)
                completion_match_prefix = "".join(bucket)
                if completion_match_prefix == match[:bucket_len]:
                    return cursor_pos - bucket_len

        return cursor_pos

    def do_apply(self, content, bufs, msg_id, reply_metadata):
        """Handle an apply request by delegating to ``ApplyRequestHandler``.

        Returns:
            A ``(reply_content, result_buf)`` tuple; ``result_buf`` is
            always an empty list.
        """
        from sparkmagic.messages_api.apply_request import ApplyRequestHandler
        result_buf = []
        # NOTE(review): `dispath_request` looks like a misspelling of
        # "dispatch"; it has to match the handler's method name - confirm
        # there before renaming.
        reply_content = ApplyRequestHandler(self).dispath_request(content)
        return reply_content, result_buf