class LivySession(ObjectWithGuid):
    def __init__(self, http_client, properties, ipython_display,
                 session_id=-1, spark_events=None,
                 heartbeat_timeout=0, heartbeat_thread=None):
        super(LivySession, self).__init__()
        assert constants.LIVY_KIND_PARAM in list(properties.keys())
        kind = properties[constants.LIVY_KIND_PARAM]

        should_heartbeat = False
        if heartbeat_timeout > 0:
            should_heartbeat = True
            properties[constants.LIVY_HEARTBEAT_TIMEOUT_PARAM] = heartbeat_timeout
        elif constants.LIVY_HEARTBEAT_TIMEOUT_PARAM in list(properties.keys()):
            properties.pop(constants.LIVY_HEARTBEAT_TIMEOUT_PARAM)

        self.properties = properties
        self.ipython_display = ipython_display
        self._should_heartbeat = should_heartbeat
        self._user_passed_heartbeat_thread = heartbeat_thread

        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

        self._policy = ConfigurableRetryPolicy(
            retry_seconds_to_sleep_list=[0.2, 0.5, 0.5, 1, 1, 2],
            max_retries=5000)
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()
        assert wait_for_idle_timeout_seconds > 0

        self.logger = SparkLog(u"LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise BadUserDataException(
                u"Session of kind '{}' not supported. Session must be of kinds {}."
                .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        self._app_id = None
        self._user = None
        self._logs = u""
        self._http_client = http_client
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds
        self._printed_resource_warning = False

        self.kind = kind
        self.id = session_id
        self.session_info = u""

        self._heartbeat_thread = None
        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
        else:
            self.status = constants.BUSY_SESSION_STATUS
            self._start_heartbeat_thread()

    def __str__(self):
        return u"Session id: {}\tYARN id: {}\tKind: {}\tState: {}\n\tSpark UI: {}\n\tDriver Log: {}"\
            .format(self.id, self.get_app_id(), self.kind, self.status,
                    self.get_spark_ui_url(), self.get_driver_log_url())

    def start(self):
        """Start the session against actual livy server."""
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)
        self._printed_resource_warning = False

        try:
            r = self._http_client.post_session(self.properties)
            self.id = r[u"id"]
            self.status = str(r[u"state"])

            self.ipython_display.writeln(u"Starting Spark application")

            # Start heartbeat thread to keep Livy interactive session alive.
            self._start_heartbeat_thread()

            # We wait for livy_session_startup_timeout_seconds() for the session to start up.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                raise LivyClientTimeoutException(
                    u"Session {} did not start up in {} seconds."
                    .format(self.id, conf.livy_session_startup_timeout_seconds()))

            html = get_sessions_info_html([self], self.id)
            self.ipython_display.html(html)

            command = Command("spark")
            success, out, mimetype = command.execute(self)
            if success:
                self.ipython_display.writeln(u"SparkSession available as 'spark'.")
                self.sql_context_variable_name = "spark"
            else:
                command = Command("sqlContext")
                success, out, mimetype = command.execute(self)
                if success:
                    self.ipython_display.writeln(u"SparkContext available as 'sc'.")
                    if "hive" in out.lower():
                        self.ipython_display.writeln(u"HiveContext available as 'sqlContext'.")
                    else:
                        self.ipython_display.writeln(u"SqlContext available as 'sqlContext'.")
                    self.sql_context_variable_name = "sqlContext"
                else:
                    raise SqlContextNotFoundException(
                        u"Neither SparkSession nor HiveContext/SqlContext is available.")
        except Exception as e:
            self._spark_events.emit_session_creation_end_event(
                self.guid, self.kind, self.id, self.status,
                False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(
                self.guid, self.kind, self.id, self.status, True, "", "")

    def get_app_id(self):
        if self._app_id is None:
            self._app_id = self._http_client.get_session(self.id).get("appId")
        return self._app_id

    def get_app_info(self):
        appInfo = self._http_client.get_session(self.id).get("appInfo")
        return appInfo if appInfo is not None else {}

    def get_app_info_member(self, member_name):
        return self.get_app_info().get(member_name)

    def get_driver_log_url(self):
        return self.get_app_info_member("driverLogUrl")

    def get_logs(self):
        log_array = self._http_client.get_all_session_logs(self.id)[u'log']
        self._logs = "\n".join(log_array)
        return self._logs

    def get_spark_ui_url(self):
        return self.get_app_info_member("sparkUiUrl")

    def get_user(self):
        if self._user is None:
            session = self._http_client.get_session(self.id)
            self._user = session.get("proxyUser", session.get("owner"))
        return self._user

    @property
    def http_client(self):
        return self._http_client

    @property
    def endpoint(self):
        return self._http_client.endpoint

    @staticmethod
    def is_final_status(status):
        return status in constants.FINAL_STATUS

    def delete(self):
        session_id = self.id
        self._spark_events.emit_session_deletion_start_event(
            self.guid, self.kind, session_id, self.status)

        try:
            self.logger.debug(u"Deleting session '{}'".format(session_id))

            if self.status != constants.NOT_STARTED_SESSION_STATUS:
                self._http_client.delete_session(session_id)
                self._stop_heartbeat_thread()
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
            else:
                self.ipython_display.send_error(
                    u"Cannot delete session {} that is in state '{}'."
                    .format(session_id, self.status))
        except Exception as e:
            self._spark_events.emit_session_deletion_end_event(
                self.guid, self.kind, session_id, self.status,
                False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(
                self.guid, self.kind, session_id, self.status, True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        retries = 1
        while True:
            self.refresh_status_and_info()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = u"Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException(
                    u'{} See logs:\n{}'.format(error, self.get_logs()))

            if seconds_to_wait <= 0.0:
                error = u"Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            if constants.YARN_RESOURCE_LIMIT_MSG in self.session_info and \
                    not self._printed_resource_warning:
                self.ipython_display.send_error(
                    constants.RESOURCE_LIMIT_WARNING
                    .format(conf.resource_limit_mitigation_suggestion()))
                self._printed_resource_warning = True

            start_time = time()
            sleep_time = self._policy.seconds_to_sleep(retries)
            retries += 1

            self.logger.debug(u"Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, self.status, sleep_time))
            sleep(sleep_time)
            seconds_to_wait -= time() - start_time

    def sleep(self, retries):
        sleep(self._policy.seconds_to_sleep(retries))

    # This function will refresh the status and get the logs in a single call.
    # Only the status will be returned as the return value.
    def refresh_status_and_info(self):
        response = self._http_client.get_session(self.id)
        status = response[u'state']
        log_array = response[u'log']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
            self.session_info = u"\n".join(log_array)
        else:
            raise LivyUnexpectedStatusException(
                u"Status '{}' not supported by session.".format(status))

    def _start_heartbeat_thread(self):
        if self._should_heartbeat and self._heartbeat_thread is None:
            refresh_seconds = conf.heartbeat_refresh_seconds()
            retry_seconds = conf.heartbeat_retry_seconds()

            if self._user_passed_heartbeat_thread is None:
                self._heartbeat_thread = _HeartbeatThread(
                    self, refresh_seconds, retry_seconds)
            else:
                self._heartbeat_thread = self._user_passed_heartbeat_thread

            self._heartbeat_thread.daemon = True
            self._heartbeat_thread.start()

    def _stop_heartbeat_thread(self):
        if self._heartbeat_thread is not None:
            self._heartbeat_thread.stop()
            self._heartbeat_thread = None

    def get_row_html(self, current_session_id):
        return u"""<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>""".format(
            self.id, self.get_app_id(), self.kind, self.status,
            self.get_html_link(u'Link', self.get_spark_ui_url()),
            self.get_html_link(u'Link', self.get_driver_log_url()),
            self.get_user(),
            u"" if current_session_id is None or current_session_id != self.id else u"\u2714")

    @staticmethod
    def get_html_link(text, url):
        if url is not None:
            return u"""<a target="_blank" href="{1}">{0}</a>""".format(text, url)
        else:
            return u""
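
# ----------------------------------------------------------------------------
# Usage sketch (illustrative; not part of the original module). Shows how a
# LivySession like the one above is typically constructed and driven. The
# Endpoint and LivyReliableHttpClient imports are assumptions based on
# sparkmagic's companion modules, and the Endpoint constructor signature
# varies across sparkmagic versions; adapt the URL and properties to your
# own Livy server.
def _example_start_livy_session():
    import sparkmagic.utils.constants as constants
    from sparkmagic.livyclientlib.endpoint import Endpoint
    from sparkmagic.livyclientlib.livyreliablehttpclient import LivyReliableHttpClient
    from sparkmagic.utils.ipythondisplay import IpythonDisplay

    endpoint = Endpoint("http://localhost:8998", constants.NO_AUTH)  # hypothetical local Livy
    http_client = LivyReliableHttpClient.from_endpoint(endpoint)
    properties = {constants.LIVY_KIND_PARAM: "pyspark"}

    session = LivySession(http_client, properties, IpythonDisplay(),
                          heartbeat_timeout=60)
    session.start()    # POSTs to /sessions, starts the heartbeat, waits for idle
    print(session)     # session id, YARN id, kind, state, Spark UI / driver log URLs
    session.delete()   # deletes the Livy session and stops the heartbeat thread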
class ReliableHttpClient(object):
    """Http client that is reliable in its requests. Uses requests library."""

    def __init__(self, endpoint, headers, retry_policy):
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy

        self.logger = SparkLog(u"ReliableHttpClient")

        self.verify_ssl = not conf.ignore_ssl_errors()
        if not self.verify_ssl:
            self.logger.debug(
                u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
            requests.packages.urllib3.disable_warnings()

    def compose_url(self, relative_url):
        r_u = "/{}".format(relative_url.rstrip(u"/").lstrip(u"/"))
        return self._endpoint.url + r_u

    def get(self, relative_url, accepted_status_codes):
        """Sends a get request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.get)

    def post(self, relative_url, accepted_status_codes, data):
        """Sends a post request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.post, data)

    def delete(self, relative_url, accepted_status_codes):
        """Sends a delete request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.delete)

    def _send_request(self, relative_url, accepted_status_codes, function, data=None):
        return self._send_request_helper(
            self.compose_url(relative_url), accepted_status_codes, function, data, 0)

    def _send_request_helper(self, url, accepted_status_codes, function, data, retry_count):
        while True:
            try:
                if not self._endpoint.authenticate:
                    if data is None:
                        r = function(url, headers=self._headers, verify=self.verify_ssl)
                    else:
                        r = function(url, headers=self._headers,
                                     data=json.dumps(data), verify=self.verify_ssl)
                else:
                    if data is None:
                        r = function(url, headers=self._headers,
                                     auth=(self._endpoint.username, self._endpoint.password),
                                     verify=self.verify_ssl)
                    else:
                        r = function(url, headers=self._headers,
                                     auth=(self._endpoint.username, self._endpoint.password),
                                     data=json.dumps(data), verify=self.verify_ssl)
            except requests.exceptions.RequestException as e:
                error = True
                r = None
                status = None

                self.logger.error(u"Request to '{}' failed with '{}'".format(url, e))
            else:
                error = False
                status = r.status_code

            if error or status not in accepted_status_codes:
                if self._retry_policy.should_retry(status, error, retry_count):
                    sleep(self._retry_policy.seconds_to_sleep(retry_count))
                    retry_count += 1
                    continue
                else:
                    raise HttpClientException(
                        u"Invalid status code '{}' or error '{}' from {}"
                        .format(status, error, url))
            return r
class ReliableHttpClient(object):
    """Http client that is reliable in its requests. Uses requests library."""

    def __init__(self, endpoint, headers, retry_policy):
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy
        if self._endpoint.auth == constants.AUTH_KERBEROS:
            self._auth = HTTPKerberosAuth(mutual_authentication=REQUIRED)
        elif self._endpoint.auth == constants.AUTH_BASIC:
            self._auth = (self._endpoint.username, self._endpoint.password)
        elif self._endpoint.auth != constants.NO_AUTH:
            raise BadUserConfigurationException(
                u"Unsupported auth %s" % self._endpoint.auth)

        self.logger = SparkLog(u"ReliableHttpClient")

        self.verify_ssl = not conf.ignore_ssl_errors()
        if not self.verify_ssl:
            self.logger.debug(
                u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
            requests.packages.urllib3.disable_warnings()

    def get_headers(self):
        return self._headers

    def compose_url(self, relative_url):
        r_u = "/{}".format(relative_url.rstrip(u"/").lstrip(u"/"))
        return self._endpoint.url + r_u

    def get(self, relative_url, accepted_status_codes):
        """Sends a get request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.get)

    def post(self, relative_url, accepted_status_codes, data):
        """Sends a post request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.post, data)

    def delete(self, relative_url, accepted_status_codes):
        """Sends a delete request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.delete)

    def _send_request(self, relative_url, accepted_status_codes, function, data=None):
        return self._send_request_helper(
            self.compose_url(relative_url), accepted_status_codes, function, data, 0)

    def _send_request_helper(self, url, accepted_status_codes, function, data, retry_count):
        while True:
            try:
                if self._endpoint.auth == constants.NO_AUTH:
                    if data is None:
                        r = function(url, headers=self._headers, verify=self.verify_ssl)
                    else:
                        r = function(url, headers=self._headers,
                                     data=json.dumps(data), verify=self.verify_ssl)
                else:
                    if data is None:
                        r = function(url, headers=self._headers, auth=self._auth,
                                     verify=self.verify_ssl)
                    else:
                        r = function(url, headers=self._headers, auth=self._auth,
                                     data=json.dumps(data), verify=self.verify_ssl)
            except requests.exceptions.RequestException as e:
                error = True
                r = None
                status = None
                text = None

                self.logger.error(u"Request to '{}' failed with '{}'".format(url, e))
            else:
                error = False
                status = r.status_code
                text = r.text

            if error or status not in accepted_status_codes:
                if self._retry_policy.should_retry(status, error, retry_count):
                    sleep(self._retry_policy.seconds_to_sleep(retry_count))
                    retry_count += 1
                    continue

                if error:
                    raise HttpClientException(
                        u"Error sending http request and maximum retry encountered.")
                else:
                    raise HttpClientException(
                        u"Invalid status code '{}' from {} with error payload: {}"
                        .format(status, url, text))
            return r
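
# ----------------------------------------------------------------------------
# Usage sketch (illustrative; not part of the original module). The client only
# needs an endpoint object exposing .url, .auth, .username and .password, plus
# a retry policy with should_retry() and seconds_to_sleep(). LinearRetryPolicy
# is an assumption drawn from sparkmagic's companion modules; _StubEndpoint is
# a hypothetical stand-in for the real Endpoint class.
def _example_http_get_sessions():
    import sparkmagic.utils.constants as constants
    from sparkmagic.livyclientlib.linearretrypolicy import LinearRetryPolicy

    class _StubEndpoint(object):
        """Minimal stand-in for sparkmagic's Endpoint class."""
        url = "http://localhost:8998"
        auth = constants.NO_AUTH
        username = ""
        password = ""

    client = ReliableHttpClient(_StubEndpoint(),
                                {"Content-Type": "application/json"},
                                LinearRetryPolicy(seconds_to_sleep=5, max_retries=5))
    # GET http://localhost:8998/sessions; any status other than 200 is retried
    # per the policy and eventually raises HttpClientException.
    return client.get("/sessions", [200]).json()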
class SparkKernelBase(IPythonKernel):
    def __init__(self, implementation, implementation_version, language,
                 language_version, language_info, session_language,
                 user_code_parser=None, **kwargs):
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.session_language = session_language

        super(SparkKernelBase, self).__init__(**kwargs)

        self.logger = SparkLog(u"{}_jupyter_kernel".format(self.session_language))
        self._fatal_error = None
        self.ipython_display = IpythonDisplay()

        if user_code_parser is None:
            self.user_code_parser = UserCodeParser()
        else:
            self.user_code_parser = user_code_parser

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if not kwargs.get("testing", False):
            self._load_magics_extension()
            self._change_language()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def do_execute(self, code, silent, store_history=True,
                   user_expressions=None, allow_stdin=False):
        def f(self):
            if self._fatal_error is not None:
                return self._repeat_fatal_error()
            return self._do_execute(code, silent, store_history,
                                    user_expressions, allow_stdin)
        return wrap_unexpected_exceptions(f, self._complete_cell)(self)

    def do_shutdown(self, restart):
        # Cleanup
        # self._delete_session()
        return self._do_shutdown_ipykernel(restart)

    def _do_execute(self, code, silent, store_history, user_expressions, allow_stdin):
        code_to_run = self.user_code_parser.get_code_to_run(code)
        res = self._execute_cell(code_to_run, silent, store_history,
                                 user_expressions, allow_stdin)
        return res

    def _load_magics_extension(self):
        register_magics_code = "%load_ext sparkmagic.kernels"
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark kernels magics library.")
        self.logger.debug("Loaded magics.")

    def _change_language(self):
        register_magics_code = "%%_do_not_call_change_language -l {}\n ".format(
            self.session_language)
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to change language to {}.".format(
                               self.session_language))
        self.logger.debug("Changed language.")

    def _register_auto_viz(self):
        from sparkmagic.utils.sparkevents import get_spark_events_handler
        import autovizwidget.utils.configuration as c

        handler = get_spark_events_handler()
        c.override("events_handler", handler)

        register_auto_viz_code = """from autovizwidget.widget.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(register_auto_viz_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to register auto viz for notebook.")
        self.logger.debug("Registered auto viz.")

    def _delete_session(self):
        code = "%%_do_not_call_delete_session\n "
        self._execute_cell_for_user(code, True, False)

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None,
                      allow_stdin=False, shutdown_if_error=False, log_if_error=None):
        reply_content = self._execute_cell_for_user(code, silent, store_history,
                                                    user_expressions, allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(
                    log_if_error, error_from_reply)
                return self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self, code, silent, store_history=True,
                               user_expressions=None, allow_stdin=False):
        return super(SparkKernelBase, self).do_execute(
            code, silent, store_history, user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _complete_cell(self):
        """A method that runs a cell with no effect. Call this and return the value
        it returns when there's some sort of error preventing the user's cell from
        executing; this will register the cell from the Jupyter UI as being
        completed."""
        return self._execute_cell("None", False, True, None, False)

    def _show_user_error(self, message):
        self.logger.error(message)
        self.ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed;
        does not raise an error immediately. We use this for errors that happen
        on kernel startup, since IPython crashes if we throw an exception in the
        __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        return self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self.logger.error(error)
        self.ipython_display.send_error(error)
        return self._complete_cell()
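
# ----------------------------------------------------------------------------
# Subclassing sketch (illustrative; not part of the original module). Concrete
# kernels supply Jupyter's kernel metadata and the session language; the
# values below mirror the shape of sparkmagic's PySpark kernel but are
# assumptions, not the shipped definitions.
class ExamplePySparkKernel(SparkKernelBase):
    def __init__(self, **kwargs):
        implementation = "PySpark"
        implementation_version = "1.0"
        language = "no-op"
        language_version = "0.1"
        language_info = {
            "name": "pyspark",
            "mimetype": "text/x-python",
            "codemirror_mode": {"name": "python", "version": 3},
            "pygments_lexer": "python3",
        }
        session_language = "python"

        super(ExamplePySparkKernel, self).__init__(
            implementation, implementation_version, language, language_version,
            language_info, session_language, **kwargs)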
class LivySession(ObjectWithGuid):
    def __init__(self, http_client, properties, ipython_display,
                 session_id=-1, sql_created=None, spark_events=None):
        super(LivySession, self).__init__()
        assert u"kind" in list(properties.keys())
        kind = properties[u"kind"]
        self.properties = properties
        self.ipython_display = ipython_display

        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert wait_for_idle_timeout_seconds > 0
        if session_id == -1 and sql_created is True:
            raise BadUserDataException(u"Cannot indicate sql state without session id.")

        self.logger = SparkLog(u"LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise BadUserDataException(
                u"Session of kind '{}' not supported. Session must be of kinds {}."
                .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
            sql_created = False
        else:
            self.status = constants.BUSY_SESSION_STATUS

        self._app_id = None
        self._logs = u""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

        self.kind = kind
        self.id = session_id
        self.created_sql_context = sql_created

    def __str__(self):
        return u"Session id: {}\tYARN id: {}\tKind: {}\tState: {}\n\tSpark UI: {}\n\tDriver Log: {}"\
            .format(self.id, self.get_app_id(), self.kind, self.status,
                    self.get_spark_ui_url(), self.get_driver_log_url())

    def start(self, create_sql_context=True):
        """Start the session against actual livy server."""
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)

        try:
            r = self._http_client.post_session(self.properties)
            self.id = r[u"id"]
            self.status = str(r[u"state"])

            self.ipython_display.writeln(u"Creating SparkContext as 'sc'")

            # We wait for livy_session_startup_timeout_seconds() for the session to start up.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                raise LivyClientTimeoutException(
                    u"Session {} did not start up in {} seconds."
                    .format(self.id, conf.livy_session_startup_timeout_seconds()))

            if create_sql_context:
                self.create_sql_context()
        except Exception as e:
            self._spark_events.emit_session_creation_end_event(
                self.guid, self.kind, self.id, self.status,
                False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(
                self.guid, self.kind, self.id, self.status, True, "", "")

    def create_sql_context(self):
        """Create a sqlContext object on the session.

        Object will be accessible via variable 'sqlContext'."""
        if self.created_sql_context:
            return
        self.logger.debug(u"Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln(u"Creating HiveContext as 'sqlContext'")
        command = self._get_sql_context_creation_command()
        try:
            (success, out) = command.execute(self)
        except LivyClientTimeoutException:
            raise LivyClientTimeoutException(
                u"Failed to create the SqlContext in time. Timed out after {} seconds."
                .format(self._wait_for_idle_timeout_seconds))

        if success:
            self.created_sql_context = True
        else:
            raise FailedToCreateSqlContextException(
                u"Failed to create the SqlContext.\nError: '{}'".format(out))

    def get_app_id(self):
        if self._app_id is None:
            self._app_id = self._http_client.get_session(self.id).get("appId")
        return self._app_id

    def get_app_info(self):
        appInfo = self._http_client.get_session(self.id).get("appInfo")
        return appInfo if appInfo is not None else {}

    def get_app_info_member(self, member_name):
        return self.get_app_info().get(member_name)

    def get_driver_log_url(self):
        return self.get_app_info_member("driverLogUrl")

    def get_logs(self):
        log_array = self._http_client.get_all_session_logs(self.id)[u'log']
        self._logs = "\n".join(log_array)
        return self._logs

    def get_spark_ui_url(self):
        return self.get_app_info_member("sparkUiUrl")

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        return status in constants.FINAL_STATUS

    def delete(self):
        session_id = self.id
        self._spark_events.emit_session_deletion_start_event(
            self.guid, self.kind, session_id, self.status)

        try:
            self.logger.debug(u"Deleting session '{}'".format(session_id))

            if self.status != constants.NOT_STARTED_SESSION_STATUS:
                self._http_client.delete_session(session_id)
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
            else:
                self.ipython_display.send_error(
                    u"Cannot delete session {} that is in state '{}'."
                    .format(session_id, self.status))
        except Exception as e:
            self._spark_events.emit_session_deletion_end_event(
                self.guid, self.kind, session_id, self.status,
                False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(
                self.guid, self.kind, session_id, self.status, True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile.

        Calls done every status_sleep_seconds as indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        while True:
            self.refresh_status()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = u"Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException(
                    u'{} See logs:\n{}'.format(error, self.get_logs()))

            if seconds_to_wait <= 0.0:
                error = u"Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            start_time = time()
            self.logger.debug(u"Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            seconds_to_wait -= time() - start_time

    def sleep(self):
        sleep(self._statement_sleep_seconds)

    def refresh_status(self):
        status = self._http_client.get_session(self.id)[u'state']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
        else:
            raise LivyUnexpectedStatusException(
                u"Status '{}' not supported by session.".format(status))

        return self.status

    def _get_sql_context_creation_command(self):
        if self.kind == constants.SESSION_KIND_SPARK:
            sql_context_command = u"val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_PYSPARK:
            sql_context_command = u"from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_SPARKR:
            sql_context_command = u"sqlContext <- sparkRHive.init(sc)"
        else:
            raise BadUserDataException(
                u"Do not know how to create HiveContext in session of kind {}."
                .format(self.kind))

        return Command(sql_context_command)
class ReconnectHandler(IPythonHandler):
    logger = None

    @web.authenticated
    @gen.coroutine
    def post(self):
        self.logger = SparkLog(u"ReconnectHandler")

        spark_events = self._get_spark_events()

        try:
            data = json_decode(self.request.body)
        except ValueError as e:
            self.set_status(400)
            msg = "Invalid JSON in request body."
            self.logger.error(msg)
            self.finish(msg)
            spark_events.emit_cluster_change_event(None, 400, False, msg)
            return

        endpoint = None
        try:
            path = self._get_argument_or_raise(data, 'path')
            username = self._get_argument_or_raise(data, 'username')
            password = self._get_argument_or_raise(data, 'password')
            endpoint = self._get_argument_or_raise(data, 'endpoint')
            auth = self._get_argument_if_exists(data, 'auth')
            if auth is None:
                if username == '' and password == '':
                    auth = constants.NO_AUTH
                else:
                    auth = constants.AUTH_BASIC
        except MissingArgumentError as e:
            self.set_status(400)
            self.finish(str(e))
            self.logger.error(str(e))
            spark_events.emit_cluster_change_event(endpoint, 400, False, str(e))
            return

        kernel_name = self._get_kernel_name(data)

        # Get kernel manager, create a new kernel if none exists or restart the
        # existing one when applicable.
        kernel_manager = yield self._get_kernel_manager(path, kernel_name)

        # Execute code
        client = kernel_manager.client()
        code = '%{} -s {} -u {} -p {} -t {}'.format(
            KernelMagics._do_not_call_change_endpoint.__name__,
            endpoint, username, password, auth)
        response_id = client.execute(code, silent=False, store_history=False)
        msg = client.get_shell_msg(response_id)

        # Get execution info
        successful_message = self._msg_successful(msg)
        error = self._msg_error(msg)
        if successful_message:
            status_code = 200
        else:
            self.logger.error(u"Code to reconnect errored out: {}".format(error))
            status_code = 500

        # Post execution info
        self.set_status(status_code)
        self.finish(json.dumps(dict(success=successful_message, error=error),
                               sort_keys=True))
        spark_events.emit_cluster_change_event(endpoint, status_code,
                                               successful_message, error)

    def _get_kernel_name(self, data):
        kernel_name = self._get_argument_if_exists(data, 'kernelname')
        self.logger.debug("Kernel name is {}".format(kernel_name))
        if kernel_name is None:
            kernel_name = conf.server_extension_default_kernel_name()
            self.logger.debug("Defaulting to kernel name {}".format(kernel_name))
        return kernel_name

    def _get_argument_if_exists(self, data, key):
        return data.get(key)

    def _get_argument_or_raise(self, data, key):
        try:
            return data[key]
        except KeyError:
            raise MissingArgumentError(key)

    @gen.coroutine
    def _get_kernel_manager(self, path, kernel_name):
        sessions = self.session_manager.list_sessions()

        kernel_id = None
        for session in sessions:
            if session['notebook']['path'] == path:
                session_id = session['id']
                kernel_id = session['kernel']['id']
                existing_kernel_name = session['kernel']['name']
                break

        if kernel_id is None:
            self.logger.debug(u"Kernel not found. Starting a new kernel.")
            k_m = yield self._get_kernel_manager_new_session(path, kernel_name)
        elif existing_kernel_name != kernel_name:
            self.logger.debug(
                u"Existing kernel name '{}' does not match requested '{}'. Starting a new kernel."
                .format(existing_kernel_name, kernel_name))
            self._delete_session(session_id)
            k_m = yield self._get_kernel_manager_new_session(path, kernel_name)
        else:
            self.logger.debug(u"Kernel found. Restarting kernel.")
            k_m = self.kernel_manager.get_kernel(kernel_id)
            k_m.restart_kernel()
        raise gen.Return(k_m)

    @gen.coroutine
    def _get_kernel_manager_new_session(self, path, kernel_name):
        model_future = self.session_manager.create_session(
            kernel_name=kernel_name, path=path, type="notebook")
        model = yield model_future
        kernel_id = model["kernel"]["id"]
        self.logger.debug("Kernel created with id {}".format(str(kernel_id)))
        k_m = self.kernel_manager.get_kernel(kernel_id)
        raise gen.Return(k_m)

    def _delete_session(self, session_id):
        self.session_manager.delete_session(session_id)

    def _msg_status(self, msg):
        return msg['content']['status']

    def _msg_successful(self, msg):
        return self._msg_status(msg) == 'ok'

    def _msg_error(self, msg):
        if self._msg_status(msg) != 'error':
            return None
        return u'{}:\n{}'.format(msg['content']['ename'], msg['content']['evalue'])

    def _get_spark_events(self):
        spark_events = getattr(self, 'spark_events', None)
        if spark_events is None:
            return SparkEvents()
        return spark_events
class LivySession(ObjectWithGuid):
    def __init__(self, http_client, properties, ipython_display,
                 session_id=-1, sql_created=None, spark_events=None,
                 should_heartbeat=False, heartbeat_thread=None):
        super(LivySession, self).__init__()
        assert u"kind" in list(properties.keys())
        kind = properties[u"kind"]
        self.properties = properties
        self.ipython_display = ipython_display
        self._should_heartbeat = should_heartbeat
        self._user_passed_heartbeat_thread = heartbeat_thread

        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert wait_for_idle_timeout_seconds > 0
        if session_id == -1 and sql_created is True:
            raise BadUserDataException(
                u"Cannot indicate sql state without session id.")

        self.logger = SparkLog(u"LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise BadUserDataException(
                u"Session of kind '{}' not supported. Session must be of kinds {}."
                .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        self._app_id = None
        self._logs = u""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

        self.kind = kind
        self.id = session_id
        self.created_sql_context = sql_created

        self._heartbeat_thread = None
        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
            self.created_sql_context = False
        else:
            self.status = constants.BUSY_SESSION_STATUS
            self._start_heartbeat_thread()

    def __str__(self):
        return u"Session id: {}\tYARN id: {}\tKind: {}\tState: {}\n\tSpark UI: {}\n\tDriver Log: {}"\
            .format(self.id, self.get_app_id(), self.kind, self.status,
                    self.get_spark_ui_url(), self.get_driver_log_url())

    def start(self, create_sql_context=True):
        """Start the session against actual livy server."""
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)

        try:
            r = self._http_client.post_session(self.properties)
            self.id = r[u"id"]
            self.status = str(r[u"state"])

            self.ipython_display.writeln(u"Creating SparkContext as 'sc'")

            # Start heartbeat thread to keep Livy interactive session alive.
            self._start_heartbeat_thread()

            # We wait for livy_session_startup_timeout_seconds() for the session to start up.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                raise LivyClientTimeoutException(
                    u"Session {} did not start up in {} seconds."
                    .format(self.id, conf.livy_session_startup_timeout_seconds()))

            html = get_sessions_info_html([self], self.id)
            self.ipython_display.html(html)

            if create_sql_context:
                self.create_sql_context()
        except Exception as e:
            self._spark_events.emit_session_creation_end_event(
                self.guid, self.kind, self.id, self.status,
                False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(
                self.guid, self.kind, self.id, self.status, True, "", "")

    def create_sql_context(self):
        """Create a sqlContext object on the session.

        Object will be accessible via variable 'sqlContext'."""
        if self.created_sql_context:
            return
        self.logger.debug(u"Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln(u"Creating HiveContext as 'sqlContext'")
        command = self._get_sql_context_creation_command()
        try:
            (success, out) = command.execute(self)
        except LivyClientTimeoutException:
            raise LivyClientTimeoutException(
                u"Failed to create the SqlContext in time. Timed out after {} seconds."
                .format(self._wait_for_idle_timeout_seconds))

        if success:
            self.ipython_display.writeln(
                u"SparkContext and HiveContext created. Executing user code ...")
            self.created_sql_context = True
        else:
            raise FailedToCreateSqlContextException(
                u"Failed to create the SqlContext.\nError: '{}'".format(out))

    def get_app_id(self):
        if self._app_id is None:
            self._app_id = self._http_client.get_session(self.id).get("appId")
        return self._app_id

    def get_app_info(self):
        appInfo = self._http_client.get_session(self.id).get("appInfo")
        return appInfo if appInfo is not None else {}

    def get_app_info_member(self, member_name):
        return self.get_app_info().get(member_name)

    def get_driver_log_url(self):
        return self.get_app_info_member("driverLogUrl")

    def get_logs(self):
        log_array = self._http_client.get_all_session_logs(self.id)[u'log']
        self._logs = "\n".join(log_array)
        return self._logs

    def get_spark_ui_url(self):
        return self.get_app_info_member("sparkUiUrl")

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        return status in constants.FINAL_STATUS

    def delete(self):
        session_id = self.id
        self._spark_events.emit_session_deletion_start_event(
            self.guid, self.kind, session_id, self.status)

        try:
            self.logger.debug(u"Deleting session '{}'".format(session_id))

            if self.status != constants.NOT_STARTED_SESSION_STATUS:
                self._http_client.delete_session(session_id)
                self._stop_heartbeat_thread()
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
            else:
                self.ipython_display.send_error(
                    u"Cannot delete session {} that is in state '{}'."
                    .format(session_id, self.status))
        except Exception as e:
            self._spark_events.emit_session_deletion_end_event(
                self.guid, self.kind, session_id, self.status,
                False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(
                self.guid, self.kind, session_id, self.status, True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile.

        Calls done every status_sleep_seconds as indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        while True:
            self.refresh_status()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = u"Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException(
                    u'{} See logs:\n{}'.format(error, self.get_logs()))

            if seconds_to_wait <= 0.0:
                error = u"Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            start_time = time()
            self.logger.debug(u"Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            seconds_to_wait -= time() - start_time

    def sleep(self):
        sleep(self._statement_sleep_seconds)

    def refresh_status(self):
        status = self._http_client.get_session(self.id)[u'state']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
        else:
            raise LivyUnexpectedStatusException(
                u"Status '{}' not supported by session.".format(status))

        return self.status

    def _get_sql_context_creation_command(self):
        if self.kind == constants.SESSION_KIND_SPARK:
            sql_context_command = u"val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_PYSPARK:
            sql_context_command = u"from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_SPARKR:
            sql_context_command = u"sqlContext <- sparkRHive.init(sc)"
        else:
            raise BadUserDataException(
                u"Do not know how to create HiveContext in session of kind {}."
                .format(self.kind))

        return Command(sql_context_command)

    def _start_heartbeat_thread(self):
        if self._should_heartbeat and self._heartbeat_thread is None:
            refresh_seconds = conf.heartbeat_refresh_seconds()
            retry_seconds = conf.heartbeat_retry_seconds()

            if self._user_passed_heartbeat_thread is None:
                self._heartbeat_thread = _HeartbeatThread(
                    self, refresh_seconds, retry_seconds)
            else:
                self._heartbeat_thread = self._user_passed_heartbeat_thread

            self._heartbeat_thread.daemon = True
            self._heartbeat_thread.start()

    def _stop_heartbeat_thread(self):
        if self._heartbeat_thread is not None:
            self._heartbeat_thread.stop()
            self._heartbeat_thread = None

    def get_row_html(self, current_session_id):
        return u"""<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td></tr>""".format(
            self.id, self.get_app_id(), self.kind, self.status,
            self.get_html_link(u'Link', self.get_spark_ui_url()),
            self.get_html_link(u'Link', self.get_driver_log_url()),
            u"" if current_session_id is None or current_session_id != self.id else u"\u2714")

    @staticmethod
    def get_html_link(text, url):
        if url is not None:
            return u"""<a target="_blank" href="{1}">{0}</a>""".format(text, url)
        else:
            return u""
class LivySession(ObjectWithGuid):
    def __init__(self, http_client, properties, ipython_display,
                 session_id=-1, spark_events=None,
                 heartbeat_timeout=0, heartbeat_thread=None):
        super(LivySession, self).__init__()
        assert constants.LIVY_KIND_PARAM in list(properties.keys())
        kind = properties[constants.LIVY_KIND_PARAM]

        should_heartbeat = False
        if heartbeat_timeout > 0:
            should_heartbeat = True
            properties[constants.LIVY_HEARTBEAT_TIMEOUT_PARAM] = heartbeat_timeout
        elif constants.LIVY_HEARTBEAT_TIMEOUT_PARAM in list(properties.keys()):
            properties.pop(constants.LIVY_HEARTBEAT_TIMEOUT_PARAM)

        self.properties = properties
        self.ipython_display = ipython_display
        self._should_heartbeat = should_heartbeat
        self._user_passed_heartbeat_thread = heartbeat_thread

        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert wait_for_idle_timeout_seconds > 0

        self.logger = SparkLog(u"LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise BadUserDataException(
                u"Session of kind '{}' not supported. Session must be of kinds {}."
                .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        self._app_id = None
        self._logs = u""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds
        self._printed_resource_warning = False

        self.kind = kind
        self.id = session_id
        self.session_info = u""

        self._heartbeat_thread = None
        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
        else:
            self.status = constants.BUSY_SESSION_STATUS
            self._start_heartbeat_thread()

    def __str__(self):
        return u"Session id: {}\tYARN id: {}\tKind: {}\tState: {}\n\tSpark UI: {}\n\tDriver Log: {}"\
            .format(self.id, self.get_app_id(), self.kind, self.status,
                    self.get_spark_ui_url(), self.get_driver_log_url())

    def start(self):
        """Start the session against actual livy server."""
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)
        self._printed_resource_warning = False

        try:
            r = self._http_client.post_session(self.properties)
            self.id = r[u"id"]
            self.status = str(r[u"state"])

            self.ipython_display.writeln(u"Starting Spark application")

            # Start heartbeat thread to keep Livy interactive session alive.
            self._start_heartbeat_thread()

            # We wait for livy_session_startup_timeout_seconds() for the session to start up.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                raise LivyClientTimeoutException(
                    u"Session {} did not start up in {} seconds."
                    .format(self.id, conf.livy_session_startup_timeout_seconds()))

            html = get_sessions_info_html([self], self.id)
            self.ipython_display.html(html)

            command = Command("spark")
            (success, out) = command.execute(self)
            if success:
                self.ipython_display.writeln(u"SparkSession available as 'spark'.")
                self.sql_context_variable_name = "spark"
            else:
                command = Command("sqlContext")
                (success, out) = command.execute(self)
                if success:
                    self.ipython_display.writeln(u"SparkContext available as 'sc'.")
                    if "hive" in out.lower():
                        self.ipython_display.writeln(u"HiveContext available as 'sqlContext'.")
                    else:
                        self.ipython_display.writeln(u"SqlContext available as 'sqlContext'.")
                    self.sql_context_variable_name = "sqlContext"
                else:
                    raise SqlContextNotFoundException(
                        u"Neither SparkSession nor HiveContext/SqlContext is available.")
        except Exception as e:
            self._spark_events.emit_session_creation_end_event(
                self.guid, self.kind, self.id, self.status,
                False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(
                self.guid, self.kind, self.id, self.status, True, "", "")

    def get_app_id(self):
        if self._app_id is None:
            self._app_id = self._http_client.get_session(self.id).get("appId")
        return self._app_id

    def get_app_info(self):
        appInfo = self._http_client.get_session(self.id).get("appInfo")
        return appInfo if appInfo is not None else {}

    def get_app_info_member(self, member_name):
        return self.get_app_info().get(member_name)

    def get_driver_log_url(self):
        return self.get_app_info_member("driverLogUrl")

    def get_logs(self):
        log_array = self._http_client.get_all_session_logs(self.id)[u'log']
        self._logs = "\n".join(log_array)
        return self._logs

    def get_spark_ui_url(self):
        return self.get_app_info_member("sparkUiUrl")

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        return status in constants.FINAL_STATUS

    def delete(self):
        session_id = self.id
        self._spark_events.emit_session_deletion_start_event(
            self.guid, self.kind, session_id, self.status)

        try:
            self.logger.debug(u"Deleting session '{}'".format(session_id))

            if self.status != constants.NOT_STARTED_SESSION_STATUS:
                self._http_client.delete_session(session_id)
                self._stop_heartbeat_thread()
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
            else:
                self.ipython_display.send_error(
                    u"Cannot delete session {} that is in state '{}'."
                    .format(session_id, self.status))
        except Exception as e:
            self._spark_events.emit_session_deletion_end_event(
                self.guid, self.kind, session_id, self.status,
                False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(
                self.guid, self.kind, session_id, self.status, True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile.

        Calls done every status_sleep_seconds as indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        while True:
            self.refresh_status_and_info()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = u"Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException(
                    u'{} See logs:\n{}'.format(error, self.get_logs()))

            if seconds_to_wait <= 0.0:
                error = u"Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            if constants.YARN_RESOURCE_LIMIT_MSG in self.session_info and \
                    not self._printed_resource_warning:
                self.ipython_display.send_error(
                    constants.RESOURCE_LIMIT_WARNING
                    .format(conf.resource_limit_mitigation_suggestion()))
                self._printed_resource_warning = True

            start_time = time()
            self.logger.debug(u"Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            seconds_to_wait -= time() - start_time

    def sleep(self):
        sleep(self._statement_sleep_seconds)

    # This function will refresh the status and get the logs in a single call.
    # Only the status will be returned as the return value.
    def refresh_status_and_info(self):
        response = self._http_client.get_session(self.id)
        status = response[u'state']
        log_array = response[u'log']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
            self.session_info = u"\n".join(log_array)
        else:
            raise LivyUnexpectedStatusException(
                u"Status '{}' not supported by session.".format(status))

    def _start_heartbeat_thread(self):
        if self._should_heartbeat and self._heartbeat_thread is None:
            refresh_seconds = conf.heartbeat_refresh_seconds()
            retry_seconds = conf.heartbeat_retry_seconds()

            if self._user_passed_heartbeat_thread is None:
                self._heartbeat_thread = _HeartbeatThread(
                    self, refresh_seconds, retry_seconds)
            else:
                self._heartbeat_thread = self._user_passed_heartbeat_thread

            self._heartbeat_thread.daemon = True
            self._heartbeat_thread.start()

    def _stop_heartbeat_thread(self):
        if self._heartbeat_thread is not None:
            self._heartbeat_thread.stop()
            self._heartbeat_thread = None

    def get_row_html(self, current_session_id):
        return u"""<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td></tr>""".format(
            self.id, self.get_app_id(), self.kind, self.status,
            self.get_html_link(u'Link', self.get_spark_ui_url()),
            self.get_html_link(u'Link', self.get_driver_log_url()),
            u"" if current_session_id is None or current_session_id != self.id else u"\u2714")

    @staticmethod
    def get_html_link(text, url):
        if url is not None:
            return u"""<a target="_blank" href="{1}">{0}</a>""".format(text, url)
        else:
            return u""
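
# ----------------------------------------------------------------------------
# Sketch of the heartbeat collaborator (illustrative; not the library's actual
# _HeartbeatThread). The sessions above only rely on the interface used in
# _start_heartbeat_thread/_stop_heartbeat_thread: a daemon-able thread with
# start() and stop() that periodically refreshes the session so Livy's
# heartbeat timeout never expires. Names and backoff behavior here are
# assumptions.
import threading


class ExampleHeartbeatThread(threading.Thread):
    def __init__(self, livy_session, refresh_seconds, retry_seconds):
        super(ExampleHeartbeatThread, self).__init__()
        self.livy_session = livy_session
        self.refresh_seconds = refresh_seconds
        self.retry_seconds = retry_seconds
        self._stop_event = threading.Event()

    def run(self):
        while not self._stop_event.is_set():
            try:
                # Touching the session counts as a heartbeat on the Livy side.
                self.livy_session.refresh_status_and_info()
                self._stop_event.wait(self.refresh_seconds)
            except Exception:
                # On transient errors, back off for retry_seconds and try again.
                self._stop_event.wait(self.retry_seconds)

    def stop(self):
        self._stop_event.set()
        self.join()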
class SessionManager(object):
    def __init__(self, ipython_display):
        self.logger = SparkLog(u"SessionManager")
        self.ipython_display = ipython_display

        self._sessions = dict()

        self._register_cleanup_on_exit()

    @property
    def sessions(self):
        return self._sessions

    def get_sessions_list(self):
        return list(self._sessions.keys())

    def get_sessions_info(self):
        return [u"Name: {}\t{}".format(k, str(self._sessions[k]))
                for k in list(self._sessions.keys())]

    def add_session(self, name, session):
        if name in self._sessions:
            raise SessionManagementException(
                u"Session with name '{}' already exists. Please delete the session"
                u" first if you intend to replace it.".format(name))

        self._sessions[name] = session

    def get_any_session(self):
        number_of_sessions = len(self._sessions)
        if number_of_sessions == 1:
            key = self.get_sessions_list()[0]
            return self._sessions[key]
        elif number_of_sessions == 0:
            raise SessionManagementException(
                u"You need to have at least 1 client created to execute commands.")
        else:
            raise SessionManagementException(
                u"Please specify the client to use. Possible sessions are {}"
                .format(self.get_sessions_list()))

    def get_session(self, name):
        if name in self._sessions:
            return self._sessions[name]
        raise SessionManagementException(
            u"Could not find '{}' session in list of saved sessions. Possible sessions are {}"
            .format(name, self.get_sessions_list()))

    def get_session_id_for_client(self, name):
        if name in self.get_sessions_list():
            return self._sessions[name].id
        return None

    def get_session_name_by_id_endpoint(self, id, endpoint):
        for (name, session) in self._sessions.items():
            if session.id == int(id) and session.endpoint == endpoint:
                return name
        return None

    def delete_client(self, name):
        self._remove_session(name)

    def clean_up_all(self):
        for name in self.get_sessions_list():
            self._remove_session(name)

    def _remove_session(self, name):
        if name in self.get_sessions_list():
            self._sessions[name].delete()
            del self._sessions[name]
        else:
            raise SessionManagementException(
                u"Could not find '{}' session in list of saved sessions. Possible sessions are {}"
                .format(name, self.get_sessions_list()))

    def _register_cleanup_on_exit(self):
        """Stop the livy sessions before the python process exits for any reason
        (if enabled in conf)."""
        if conf.cleanup_all_sessions_on_exit():
            def cleanup_spark_sessions():
                try:
                    self.clean_up_all()
                except Exception as e:
                    self.logger.error(
                        u"Error cleaning up sessions on exit: {}".format(e))

            atexit.register(cleanup_spark_sessions)
            self.ipython_display.writeln(u"Cleaning up livy sessions on exit is enabled")
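
# ----------------------------------------------------------------------------
# Usage sketch (illustrative; not part of the original module). SessionManager
# keys LivySession objects by user-chosen names; get_any_session() only
# succeeds when exactly one session exists. The 'session' argument below is
# any started LivySession as defined earlier in this file, and
# 'ipython_display' is the same display helper the manager itself uses.
def _example_manage_sessions(ipython_display, session):
    manager = SessionManager(ipython_display)
    manager.add_session("my_session", session)

    print(manager.get_sessions_info())          # ["Name: my_session\tSession id: ..."]
    assert manager.get_any_session() is session  # unambiguous: only one session

    manager.delete_client("my_session")          # also deletes the Livy session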
class SparkKernelBase(IPythonKernel):
    def __init__(self, implementation, implementation_version, language,
                 language_version, language_info, session_language,
                 user_code_parser=None, **kwargs):
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.session_language = session_language

        super(SparkKernelBase, self).__init__(**kwargs)

        self.logger = SparkLog(u"{}_jupyter_kernel".format(self.session_language))
        self._fatal_error = None
        self.ipython_display = IpythonDisplay()

        if user_code_parser is None:
            self.user_code_parser = UserCodeParser()
        else:
            self.user_code_parser = user_code_parser

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if not kwargs.get("testing", False):
            self._load_magics_extension()
            self._change_language()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def do_execute(self, code, silent, store_history=True,
                   user_expressions=None, allow_stdin=False):
        def f(self):
            if self._fatal_error is not None:
                return self._repeat_fatal_error()
            return self._do_execute(code, silent, store_history,
                                    user_expressions, allow_stdin)
        return wrap_unexpected_exceptions(f, self._complete_cell)(self)

    def do_shutdown(self, restart):
        # Cleanup
        self._delete_session()

        return self._do_shutdown_ipykernel(restart)

    def _do_execute(self, code, silent, store_history, user_expressions, allow_stdin):
        code_to_run = self.user_code_parser.get_code_to_run(code)
        res = self._execute_cell(code_to_run, silent, store_history,
                                 user_expressions, allow_stdin)
        return res

    def _load_magics_extension(self):
        register_magics_code = "%load_ext sparkmagic.kernels"
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark kernels magics library.")
        self.logger.debug("Loaded magics.")

    def _change_language(self):
        register_magics_code = "%%_do_not_call_change_language -l {}\n ".format(
            self.session_language)
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to change language to {}.".format(
                               self.session_language))
        self.logger.debug("Changed language.")

    def _register_auto_viz(self):
        from sparkmagic.utils.sparkevents import get_spark_events_handler
        import autovizwidget.utils.configuration as c

        handler = get_spark_events_handler()
        c.override("events_handler", handler)

        register_auto_viz_code = """from autovizwidget.widget.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(register_auto_viz_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to register auto viz for notebook.")
        self.logger.debug("Registered auto viz.")

    def _delete_session(self):
        code = "%%_do_not_call_delete_session\n "
        self._execute_cell_for_user(code, True, False)

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None,
                      allow_stdin=False, shutdown_if_error=False, log_if_error=None):
        reply_content = self._execute_cell_for_user(code, silent, store_history,
                                                    user_expressions, allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(
                    log_if_error, error_from_reply)
                return self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self, code, silent, store_history=True,
                               user_expressions=None, allow_stdin=False):
        return super(SparkKernelBase, self).do_execute(
            code, silent, store_history, user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _complete_cell(self):
        """A method that runs a cell with no effect. Call this and return the value
        it returns when there's some sort of error preventing the user's cell from
        executing; this will register the cell from the Jupyter UI as being
        completed."""
        return self._execute_cell("None", False, True, None, False)

    def _show_user_error(self, message):
        self.logger.error(message)
        self.ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed;
        does not raise an error immediately. We use this for errors that happen
        on kernel startup, since IPython crashes if we throw an exception in the
        __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        return self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self.logger.error(error)
        self.ipython_display.send_error(error)
        return self._complete_cell()
# Extended variant of SparkKernelBase from the same fork: adds a
# SparkController, SQL statement filtering, automatic Livy session
# initialization at startup, and Livy-backed code completion.
import re

from tornado import gen
from ipykernel.jsonutil import json_clean
from IPython.core.completer import provisionalcompleter, position_to_cursor

from sparkmagic.livyclientlib.sparkcontroller import SparkController
# build_endpoint is a fork-local helper; this import path is an assumption.
from sparkmagic.utils.endpoint import build_endpoint


class SparkKernelBase(IPythonKernel):
    def __init__(self, implementation, implementation_version, language, language_version,
                 language_info, session_language, user_code_parser=None, **kwargs):
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.session_language = session_language

        super(SparkKernelBase, self).__init__(**kwargs)

        self.logger = SparkLog(u"{}_jupyter_kernel".format(self.session_language))
        self._fatal_error = None
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        if user_code_parser is None:
            self.user_code_parser = UserCodeParser()
        else:
            self.user_code_parser = user_code_parser

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if not kwargs.get("testing", False):
            self._load_magics_extension()
            self._change_language()
            # Initialize sparkmagic.magics and the Livy session at kernel startup.
            self._load_spark_magics_extension()
            self._init_livy_session()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def _is_sql_filter(self, code):
        # When SQL restriction is enabled, block 'show databases' and 'use <db>'
        # statements so users stay in the database assigned to them.
        if conf.is_sql_restrict():
            if re.search(r'\s*show\s+databases', code.lower()):
                return True
            if re.search(r'\s*use\s+', code.lower()):
                return True
        return False

    def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        def f(self):
            if self._is_sql_filter(code):
                self.ipython_display.write(
                    u"A dedicated database has already been selected for you; try 'show tables' directly.")
                return self._complete_cell()
            if self._fatal_error is not None:
                return self._repeat_fatal_error()
            return self._do_execute(code, silent, store_history, user_expressions, allow_stdin)
        return wrap_unexpected_exceptions(f, self._complete_cell)(self)

    def do_shutdown(self, restart):
        # Cleanup
        self._delete_session()
        return self._do_shutdown_ipykernel(restart)

    def _do_execute(self, code, silent, store_history, user_expressions, allow_stdin):
        code_to_run = self.user_code_parser.get_code_to_run(code)
        res = self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)
        return res

    def _load_magics_extension(self):
        register_magics_code = "%load_ext sparkmagic.kernels"
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark kernels magics library.")
        self.logger.debug("Loaded magics.")

    def _load_spark_magics_extension(self):
        """Load sparkmagic.magics, equivalent to running %load_ext sparkmagic.magics."""
        register_spark_magics_code = "%load_ext sparkmagic.magics"
        self._execute_cell(register_spark_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark Magics library.")
        self.logger.debug("Loaded sparkmagic.magics")

    def _change_language(self):
        register_magics_code = "%%_do_not_call_change_language -l {}\n ".format(self.session_language)
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to change language to {}.".format(self.session_language))
        self.logger.debug("Changed language.")

    def _init_livy_session(self):
        """Session initialization should not happen directly in this class;
        it is delegated to the kernel magics instead."""
        register_magics_code = "%%_do_not_call_init_livy_session -i {}\n ".format(self.session_language)
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to init livy session: {}.".format(self.session_language))
        self.logger.debug("Init livy session.")

    def _register_auto_viz(self):
        from sparkmagic.utils.sparkevents import get_spark_events_handler
        import autovizwidget.utils.configuration as c

        handler = get_spark_events_handler()
        c.override("events_handler", handler)
        register_auto_viz_code = """from autovizwidget.widget.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(register_auto_viz_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to register auto viz for notebook.")
        self.logger.debug("Registered auto viz.")

    def _delete_session(self):
        code = "%%_do_not_call_delete_session\n "
        self._execute_cell_for_user(code, True, False)

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None,
                      allow_stdin=False, shutdown_if_error=False, log_if_error=None):
        reply_content = self._execute_cell_for_user(code, silent, store_history, user_expressions, allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(log_if_error, error_from_reply)
                return self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        return super(SparkKernelBase, self).do_execute(code, silent, store_history, user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _complete_cell(self):
        """A method that runs a cell with no effect. Call this and return the value it
        returns when there's some sort of error preventing the user's cell from executing;
        this will register the cell from the Jupyter UI as being completed."""
        return self._execute_cell("None", False, True, None, False)

    def _show_user_error(self, message):
        self.logger.error(message)
        self.ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed;
        does not raise an error immediately. We use this for errors that happen
        on kernel startup, since IPython crashes if we throw an exception in
        the __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        return self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self.logger.error(error)
        self.ipython_display.send_error(error)
        return self._complete_cell()

    @gen.coroutine
    def complete_request(self, stream, ident, parent):
        content = parent['content']
        code = content['code']
        cursor_pos = content['cursor_pos']

        matches = yield gen.maybe_future(self.do_complete(code, cursor_pos))
        matches = json_clean(matches)
        completion_msg = self.session.send(stream, 'complete_reply', matches, parent, ident)
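    # Sketch of the completion flow below, under the assumptions that
    # post_completion() wraps Livy's POST /sessions/{id}/completion endpoint
    # and that the session was registered under the generated session name.
    # For a two-line cell such as code = "a = 1\ndf.gro" with cursor_pos = 12:
    #   position_to_cursor(code, 12) -> (cursor_line=1, cursor_column=6)
    #   completion_line = "df.gro"
    #   real_cursor_pos = 12 - len("a = 1") - 1 = 6
    # so only the cursor's line and a line-relative cursor are sent to Livy,
    # and the returned candidates (e.g. "groupBy") become the matches.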
""" code = code.strip() if cursor_pos is None: cursor_pos = len(code) matches = [] with provisionalcompleter(): session_name = self.spark_controller.generate_livy_session_name(id(self)) endpoint = build_endpoint(self.session_language) session_info_list = self.spark_controller.get_all_sessions_endpoint(endpoint) session_id = None for session in session_info_list: if session.session_name == session_name: session_id = session.id if session_id: # Only complete the cursor_line cursor_line, cursor_column = position_to_cursor(code, cursor_pos) lines = code.split("\n") completion_line = lines[cursor_line] before_lines = lines[:cursor_line] if len(lines) > 1 and cursor_line > 0: real_cursor_pos = cursor_pos - len("\n".join(before_lines)) - 1 else: real_cursor_pos = cursor_pos http_client = self.spark_controller._http_client(endpoint) kind = conf.get_livy_kind(self.session_language) res_completions = http_client.post_completion(session_id, kind, completion_line, real_cursor_pos) matches = res_completions.get("candidates", []) if matches: s = self.__get_cursor_start(code, cursor_pos, matches[0]) else: s = cursor_pos res = { 'matches': matches, 'cursor_end': cursor_pos, 'cursor_start': s, 'metadata': {}, 'status': 'ok' } return res def __get_cursor_start(self, code, cursor_pos, match): before_code = code[:cursor_pos] before_code_rev = before_code[::-1] bucket = [] for c in before_code_rev: if len(bucket) >= len(match): break if re.match(r"\w", c): bucket.insert(0, c) else: break if c == match[0]: bucket_len = len(bucket) completion_match_prefix = "".join(bucket) if completion_match_prefix == match[:bucket_len]: return cursor_pos - bucket_len return cursor_pos def do_apply(self, content, bufs, msg_id, reply_metadata): from sparkmagic.messages_api.apply_request import ApplyRequestHandler result_buf = [] reply_content = ApplyRequestHandler(self).dispath_request(content) return reply_content, result_buf