def __init__(self, endpoint, headers, retry_policy):
    """HTTP client for a Livy endpoint with retries and configurable auth.

    Supports Kerberos (with configurable mutual authentication and an
    optional hostname override), basic auth, or no auth; any other auth
    value is rejected.
    """
    self._endpoint = endpoint
    self._headers = headers
    self._retry_policy = retry_policy

    auth_mode = self._endpoint.auth
    if auth_mode == constants.AUTH_KERBEROS:
        # Translate the configured mutual-auth constant into the
        # requests_kerberos value; unrecognized settings fall back to REQUIRED,
        # matching the most conservative option.
        mutual_auth_for_setting = {
            constants.AUTH_KERBEROS_MUTUAL_REQ: REQUIRED,
            constants.AUTH_KERBEROS_MUTUAL_OPT: OPTIONAL,
            constants.AUTH_KERBEROS_MUTUAL_DIS: DISABLED,
        }
        mutual_auth = mutual_auth_for_setting.get(
            self._endpoint.krb_mutual_auth, REQUIRED)
        # An empty override string means "no override".
        hostname_override = (None if self._endpoint.krb_host_override == ""
                             else self._endpoint.krb_host_override)
        self._auth = HTTPKerberosAuth(mutual_authentication=mutual_auth,
                                      hostname_override=hostname_override)
    elif auth_mode == constants.AUTH_BASIC:
        self._auth = (self._endpoint.username, self._endpoint.password)
    elif auth_mode != constants.NO_AUTH:
        raise BadUserConfigurationException(u"Unsupported auth %s" % self._endpoint.auth)

    self.logger = SparkLog(u"ReliableHttpClient")

    # Honor the user's "ignore SSL errors" configuration flag.
    self.verify_ssl = not conf.ignore_ssl_errors()
    if not self.verify_ssl:
        self.logger.debug(u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
        requests.packages.urllib3.disable_warnings()
def __init__(self, implementation, implementation_version, language, language_version,
             language_info, session_language, user_code_parser=None, **kwargs):
    """Set up the base Spark kernel.

    Records the Jupyter-required kernel metadata, wires up logging and
    display helpers, and — unless constructed with testing=True — loads the
    sparkmagic magics into the kernel.
    """
    # Required by Jupyter - Override
    self.implementation = implementation
    self.implementation_version = implementation_version
    self.language = language
    self.language_version = language_version
    self.language_info = language_info

    # Override
    self.session_language = session_language

    super(SparkKernelBase, self).__init__(**kwargs)

    self.logger = SparkLog(u"{}_jupyter_kernel".format(self.session_language))
    # Queued startup error, surfaced later instead of raising in __init__.
    self._fatal_error = None
    self.ipython_display = IpythonDisplay()

    # Fall back to the default parser when the caller does not supply one.
    if user_code_parser is None:
        self.user_code_parser = UserCodeParser()
    else:
        self.user_code_parser = user_code_parser

    # Disable warnings for test env in HDI
    requests.packages.urllib3.disable_warnings()

    # The "testing" kwarg lets unit tests construct the kernel without
    # loading magics or touching the IPython shell.
    if not kwargs.get("testing", False):
        self._load_magics_extension()
        self._change_language()
        if conf.use_auto_viz():
            self._register_auto_viz()
def __init__(self, endpoint, headers, retry_policy):
    """HTTP client for a Livy endpoint with retries and configurable auth.

    Supports Kerberos (settings from user configuration), basic auth,
    Google credentials, or no auth; any other auth value is rejected.
    """
    self._endpoint = endpoint
    self._headers = headers
    self._retry_policy = retry_policy
    if self._endpoint.auth == constants.AUTH_KERBEROS:
        self._auth = HTTPKerberosAuth(**conf.kerberos_auth_configuration())
    elif self._endpoint.auth == constants.AUTH_BASIC:
        self._auth = (self._endpoint.username, self._endpoint.password)
    elif self._endpoint.auth == constants.GOOGLE_AUTH:
        # The credentials object itself is stored as the auth; the previously
        # constructed AuthorizedSession wrapper was never used anywhere, so it
        # has been removed (it only allocated a second, dead requests session).
        self._auth = conf.google_auth_credentials()
    elif self._endpoint.auth != constants.NO_AUTH:
        raise BadUserConfigurationException(u"Unsupported auth %s" % self._endpoint.auth)
    self._session = requests.Session()
    self.logger = SparkLog(u"ReliableHttpClient")
    # Honor the user's "ignore SSL errors" configuration flag.
    self.verify_ssl = not conf.ignore_ssl_errors()
    if not self.verify_ssl:
        self.logger.debug(
            u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks."
        )
        requests.packages.urllib3.disable_warnings()
def __init__(self, ipython_display):
    """Track named Livy sessions and register cleanup to run at exit."""
    self.logger = SparkLog(u"SessionManager")
    self.ipython_display = ipython_display
    # Maps session name -> session object.
    self._sessions = dict()
    self._register_cleanup_on_exit()
def __init__(self, ipython_display):
    """Coordinate sessions and HTTP clients for Spark/Livy operations."""
    self.logger = SparkLog(u"SparkController")
    self.ipython_display = ipython_display
    self.session_manager = SessionManager(ipython_display)
    # this is to reuse the already created http clients
    # since the reliablehttpclient uses requests session
    self._http_clients = {}
def __init__(self, http_client, properties, ipython_display, session_id=-1,
             spark_events=None, heartbeat_timeout=0, heartbeat_thread=None):
    """Represent one Livy session.

    session_id == -1 means "not yet created on the server"; any other id
    attaches to an existing (busy) session and starts the heartbeat thread
    immediately. A positive heartbeat_timeout enables heartbeating and is
    forwarded to Livy via the session properties.
    """
    super(LivySession, self).__init__()
    assert constants.LIVY_KIND_PARAM in list(properties.keys())
    kind = properties[constants.LIVY_KIND_PARAM]

    should_heartbeat = False
    if heartbeat_timeout > 0:
        should_heartbeat = True
        # Note: mutates the caller-supplied properties dict in place.
        properties[constants.LIVY_HEARTBEAT_TIMEOUT_PARAM] = heartbeat_timeout
    elif constants.LIVY_HEARTBEAT_TIMEOUT_PARAM in list(properties.keys()):
        # Heartbeating disabled: strip any stale timeout from the properties.
        properties.pop(constants.LIVY_HEARTBEAT_TIMEOUT_PARAM)

    self.properties = properties
    self.ipython_display = ipython_display
    self._should_heartbeat = should_heartbeat
    self._user_passed_heartbeat_thread = heartbeat_thread

    if spark_events is None:
        spark_events = SparkEvents()
    self._spark_events = spark_events

    # Back-off schedule for polling the session status.
    self._policy = ConfigurableRetryPolicy(
        retry_seconds_to_sleep_list=[0.2, 0.5, 0.5, 1, 1, 2], max_retries=5000)
    wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()
    assert wait_for_idle_timeout_seconds > 0

    self.logger = SparkLog(u"LivySession")

    kind = kind.lower()
    if kind not in constants.SESSION_KINDS_SUPPORTED:
        raise BadUserDataException(
            u"Session of kind '{}' not supported. Session must be of kinds {}."
            .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

    self._app_id = None
    self._logs = u""
    self._http_client = http_client
    self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds
    self._printed_resource_warning = False
    self.kind = kind
    self.id = session_id
    self.session_info = u""
    self._heartbeat_thread = None
    if session_id == -1:
        self.status = constants.NOT_STARTED_SESSION_STATUS
    else:
        # Attaching to an existing session: assume busy until proven idle.
        self.status = constants.BUSY_SESSION_STATUS
        self._start_heartbeat_thread()
def __init__(self, code, spark_events=None):
    """A single code statement destined for a Livy session.

    The code is dedented so that indented notebook-cell bodies arrive at
    Livy as valid top-level code.
    """
    super(Command, self).__init__()
    self.code = textwrap.dedent(code)
    self.logger = SparkLog(u"Command")
    # Use the default event emitter unless the caller injected one.
    self._spark_events = SparkEvents() if spark_events is None else spark_events
class Command(ObjectWithGuid):
    """A code statement submitted to a Livy session, polled to completion."""

    def __init__(self, code, spark_events=None):
        super(Command, self).__init__()
        # Dedent so indented cell bodies are submitted as valid top-level code.
        self.code = textwrap.dedent(code)
        self.logger = SparkLog(u"Command")
        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

    def __eq__(self, other):
        # Equality is by code text only; GUID and events are ignored.
        return self.code == other.code

    def __ne__(self, other):
        return not self == other

    def execute(self, session):
        """Run this command on `session` and return (success, text) output.

        Emits start/end telemetry events; on any exception the end event is
        emitted with the failure details and the exception is re-raised.
        """
        self._spark_events.emit_statement_execution_start_event(session.guid, session.kind,
                                                                session.id, self.guid)
        statement_id = -1
        try:
            session.wait_for_idle()
            data = {u"code": self.code}
            response = session.http_client.post_statement(session.id, data)
            statement_id = response[u'id']
            output = self._get_statement_output(session, statement_id)
        except Exception as e:
            self._spark_events.emit_statement_execution_end_event(session.guid, session.kind,
                                                                  session.id, self.guid,
                                                                  statement_id, False,
                                                                  e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_statement_execution_end_event(session.guid, session.kind,
                                                                  session.id, self.guid,
                                                                  statement_id, True, "", "")
            return output

    def _get_statement_output(self, session, statement_id):
        """Poll Livy until the statement reaches a final state.

        Returns (True, text) on success, (False, error text) on statement
        error; raises LivyUnexpectedStatusException for unknown statuses.
        Sleeps with an increasing retry counter between polls.
        """
        retries = 1
        while True:
            statement = session.http_client.get_statement(session.id, statement_id)
            status = statement[u"state"].lower()
            self.logger.debug(u"Status of statement {} is {}.".format(statement_id, status))
            if status not in FINAL_STATEMENT_STATUS:
                session.sleep(retries)
                retries += 1
            else:
                statement_output = statement[u"output"]
                # Livy can return a null output for a completed statement.
                if statement_output is None:
                    return (True, u"")
                if statement_output[u"status"] == u"ok":
                    return (True, statement_output[u"data"][u"text/plain"])
                elif statement_output[u"status"] == u"error":
                    return (False,
                            statement_output[u"evalue"] + u"\n" +
                            u"".join(statement_output[u"traceback"]))
                else:
                    raise LivyUnexpectedStatusException(u"Unknown output status from Livy: '{}'"
                                                        .format(statement_output[u"status"]))
class Command(ObjectWithGuid):
    """A code statement submitted to a Livy session, polled to completion."""

    def __init__(self, code, spark_events=None):
        super(Command, self).__init__()
        # Dedent so indented cell bodies are submitted as valid top-level code.
        self.code = textwrap.dedent(code)
        self.logger = SparkLog(u"Command")
        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

    def __eq__(self, other):
        # Equality is by code text only; GUID and events are ignored.
        return self.code == other.code

    def __ne__(self, other):
        return not self == other

    def execute(self, session):
        """Run this command on `session` and return (success, text) output.

        Emits start/end telemetry events; on any exception the end event is
        emitted with the failure details and the exception is re-raised.
        """
        self._spark_events.emit_statement_execution_start_event(session.guid, session.kind,
                                                                session.id, self.guid)
        statement_id = -1
        try:
            session.wait_for_idle()
            data = {u"code": self.code}
            response = session.http_client.post_statement(session.id, data)
            statement_id = response[u'id']
            output = self._get_statement_output(session, statement_id)
        except Exception as e:
            self._spark_events.emit_statement_execution_end_event(session.guid, session.kind,
                                                                  session.id, self.guid,
                                                                  statement_id, False,
                                                                  e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_statement_execution_end_event(session.guid, session.kind,
                                                                  session.id, self.guid,
                                                                  statement_id, True, "", "")
            return output

    def _get_statement_output(self, session, statement_id):
        """Poll Livy while the statement is running, then return its output.

        Returns (True, text) on success, (False, error text) on statement
        error; raises LivyUnexpectedStatusException for unknown statuses.
        """
        statement_running = True
        out = u""
        while statement_running:
            statement = session.http_client.get_statement(session.id, statement_id)
            status = statement[u"state"]
            self.logger.debug(u"Status of statement {} is {}.".format(statement_id, status))
            # NOTE(review): only "running" is treated as non-final here; Livy
            # also reports a "waiting" state for queued statements — confirm
            # whether it needs the same treatment.
            if status == u"running":
                session.sleep()
            else:
                statement_running = False
                statement_output = statement[u"output"]
                # Fix: Livy can return a null "output" for a completed
                # statement; previously this crashed with a TypeError when
                # subscripting None. Treat it as success with empty output.
                if statement_output is None:
                    out = (True, u"")
                elif statement_output[u"status"] == u"ok":
                    out = (True, statement_output[u"data"][u"text/plain"])
                elif statement_output[u"status"] == u"error":
                    out = (False,
                           statement_output[u"evalue"] + u"\n" +
                           u"".join(statement_output[u"traceback"]))
                else:
                    raise LivyUnexpectedStatusException(u"Unknown output status from Livy: '{}'"
                                                        .format(statement_output[u"status"]))
        return out
def __init__(self, http_client, properties, ipython_display, session_id=-1,
             sql_created=None, spark_events=None, should_heartbeat=False,
             heartbeat_thread=None):
    """Represent one Livy session.

    session_id == -1 means "not yet created on the server"; any other id
    attaches to an existing (busy) session and starts the heartbeat thread
    immediately. sql_created records whether a SQL context exists and is
    only meaningful together with an existing session id.
    """
    super(LivySession, self).__init__()
    assert u"kind" in list(properties.keys())
    kind = properties[u"kind"]
    self.properties = properties
    self.ipython_display = ipython_display
    self._should_heartbeat = should_heartbeat
    self._user_passed_heartbeat_thread = heartbeat_thread

    if spark_events is None:
        spark_events = SparkEvents()
    self._spark_events = spark_events

    # Polling intervals and idle-wait timeout come from user configuration.
    status_sleep_seconds = conf.status_sleep_seconds()
    statement_sleep_seconds = conf.statement_sleep_seconds()
    wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

    assert status_sleep_seconds > 0
    assert statement_sleep_seconds > 0
    assert wait_for_idle_timeout_seconds > 0
    if session_id == -1 and sql_created is True:
        raise BadUserDataException(
            u"Cannot indicate sql state without session id.")

    self.logger = SparkLog(u"LivySession")

    kind = kind.lower()
    if kind not in constants.SESSION_KINDS_SUPPORTED:
        raise BadUserDataException(
            u"Session of kind '{}' not supported. Session must be of kinds {}."
            .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

    self._app_id = None
    self._logs = u""
    self._http_client = http_client
    self._status_sleep_seconds = status_sleep_seconds
    self._statement_sleep_seconds = statement_sleep_seconds
    self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

    self.kind = kind
    self.id = session_id
    self.created_sql_context = sql_created
    self._heartbeat_thread = None
    if session_id == -1:
        self.status = constants.NOT_STARTED_SESSION_STATUS
        sql_created = False
    else:
        # Attaching to an existing session: assume busy until proven idle.
        self.status = constants.BUSY_SESSION_STATUS
        self._start_heartbeat_thread()
def post(self):
    """Handle a reconnect request against a (possibly new) kernel.

    Parses the JSON body, resolves/starts the kernel for the notebook path,
    runs the change-endpoint magic in it, and reports the outcome as JSON.
    NOTE(review): contains `yield`, so this presumably runs under tornado's
    gen.coroutine — confirm the decorator at the definition site.
    """
    self.logger = SparkLog(u"ReconnectHandler")
    spark_events = self._get_spark_events()

    try:
        data = json_decode(self.request.body)
    except ValueError as e:
        self.set_status(400)
        msg = "Invalid JSON in request body."
        self.logger.error(msg)
        self.finish(msg)
        spark_events.emit_cluster_change_event(None, 400, False, msg)
        return

    endpoint = None
    try:
        path = self._get_argument_or_raise(data, 'path')
        username = self._get_argument_or_raise(data, 'username')
        password = self._get_argument_or_raise(data, 'password')
        endpoint = self._get_argument_or_raise(data, 'endpoint')
        auth = self._get_argument_if_exists(data, 'auth')
        # Infer the auth mode when not provided: empty credentials mean
        # no auth, otherwise basic auth.
        if auth is None:
            if username == '' and password == '':
                auth = constants.NO_AUTH
            else:
                auth = constants.AUTH_BASIC
    except MissingArgumentError as e:
        self.set_status(400)
        self.finish(str(e))
        self.logger.error(str(e))
        spark_events.emit_cluster_change_event(endpoint, 400, False, str(e))
        return

    kernel_name = self._get_kernel_name(data)

    # Get kernel manager, create a new kernel if none exists or restart the existing one when applicable
    kernel_manager = yield self._get_kernel_manager(path, kernel_name)

    # Execute code
    client = kernel_manager.client()
    # NOTE(review): username/password/endpoint are interpolated unquoted into
    # the magic line — values containing spaces would break argument parsing;
    # confirm upstream validation.
    code = '%{} -s {} -u {} -p {} -t {}'.format(KernelMagics._do_not_call_change_endpoint.__name__,
                                                endpoint, username, password, auth)
    response_id = client.execute(code, silent=False, store_history=False)
    msg = client.get_shell_msg(response_id)

    # Get execution info
    successful_message = self._msg_successful(msg)
    error = self._msg_error(msg)
    if successful_message:
        status_code = 200
    else:
        self.logger.error(u"Code to reconnect errored out: {}".format(error))
        status_code = 500

    # Post execution info
    self.set_status(status_code)
    self.finish(json.dumps(dict(success=successful_message, error=error), sort_keys=True))
    spark_events.emit_cluster_change_event(endpoint, status_code, successful_message, error)
class SparkMagicBase(Magics):
    """Shared plumbing for the sparkmagic IPython magics."""

    def __init__(self, shell, data=None, spark_events=None):
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)

        self.logger = SparkLog(u"SparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        self.logger.debug("Initialized spark magics.")

        if spark_events is None:
            spark_events = SparkEvents()
        spark_events.emit_library_loaded_event()

    def execute_spark(self, cell, output_var, samplemethod, maxrows, samplefraction, session_name):
        """Run `cell` on the named session; optionally store a sampled
        dataframe of `output_var` into the user namespace."""
        (success, out) = self.spark_controller.run_command(Command(cell), session_name)
        if not success:
            self.ipython_display.send_error(out)
        else:
            self.ipython_display.write(out)
            if output_var is not None:
                spark_store_command = self._spark_store_command(
                    output_var, samplemethod, maxrows, samplefraction)
                # NOTE(review): unlike the Command call above, this result is
                # stored without unpacking — presumably SparkStoreCommand's
                # execute returns the dataframe directly; confirm.
                df = self.spark_controller.run_command(spark_store_command, session_name)
                self.shell.user_ns[output_var] = df

    @staticmethod
    def _spark_store_command(output_var, samplemethod, maxrows, samplefraction):
        return SparkStoreCommand(output_var, samplemethod, maxrows, samplefraction)

    def execute_sqlquery(self, cell, samplemethod, maxrows, samplefraction,
                         session, output_var, quiet):
        """Run a SQL query on the session; optionally bind the result to
        `output_var` in the user namespace and/or suppress the return."""
        sqlquery = self._sqlquery(cell, samplemethod, maxrows, samplefraction)
        df = self.spark_controller.run_sqlquery(sqlquery, session)
        if output_var is not None:
            self.shell.user_ns[output_var] = df
        if quiet:
            return None
        else:
            return df

    @staticmethod
    def _sqlquery(cell, samplemethod, maxrows, samplefraction):
        return SQLQuery(cell, samplemethod, maxrows, samplefraction)

    def _print_endpoint_info(self, info_sessions, current_session_id):
        # Render the session table (sorted by id) or a placeholder message.
        if info_sessions:
            info_sessions = sorted(info_sessions, key=lambda s: s.id)
            html = get_sessions_info_html(info_sessions, current_session_id)
            self.ipython_display.html(html)
        else:
            self.ipython_display.html(u'No active sessions.')
class SparkMagicBase(Magics):
    """Shared plumbing for the sparkmagic IPython magics (inline-HTML table)."""

    def __init__(self, shell, data=None, spark_events=None):
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)

        self.logger = SparkLog(u"SparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        self.logger.debug("Initialized spark magics.")

        if spark_events is None:
            spark_events = SparkEvents()
        spark_events.emit_library_loaded_event()

    def execute_sqlquery(self, cell, samplemethod, maxrows, samplefraction,
                         session, output_var, quiet):
        """Run a SQL query on the session; optionally bind the result to
        `output_var` in the user namespace and/or suppress the return."""
        sqlquery = self._sqlquery(cell, samplemethod, maxrows, samplefraction)
        df = self.spark_controller.run_sqlquery(sqlquery, session)
        if output_var is not None:
            self.shell.user_ns[output_var] = df
        if quiet:
            return None
        else:
            return df

    @staticmethod
    def _sqlquery(cell, samplemethod, maxrows, samplefraction):
        return SQLQuery(cell, samplemethod, maxrows, samplefraction)

    def _print_endpoint_info(self, info_sessions, current_session_id):
        # Render the session table (sorted by id) or a placeholder message.
        if info_sessions:
            info_sessions = sorted(info_sessions, key=lambda s: s.id)
            html = u"""<table>
<tr><th>ID</th><th>YARN Application ID</th><th>Kind</th><th>State</th><th>Spark UI</th><th>Driver log</th><th>Current session?</th></tr>""" + \
                u"".join([SparkMagicBase._session_row_html(session, current_session_id)
                          for session in info_sessions]) + \
                u"</table>"
            self.ipython_display.html(html)
        else:
            self.ipython_display.html(u'No active sessions.')

    @staticmethod
    def _session_row_html(session, current_session_id):
        # One <tr> per session; a check mark flags the current session.
        return u"""<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td></tr>""".format(
            session.id, session.get_app_id(), session.kind, session.status,
            SparkMagicBase._link(u'Link', session.get_spark_ui_url()),
            SparkMagicBase._link(u'Link', session.get_driver_log_url()),
            u"" if current_session_id is None or current_session_id != session.id else u"\u2714")

    @staticmethod
    def _link(text, url):
        # Emit an anchor only when a URL is available.
        if url is not None:
            return u"""<a target="_blank" href="{1}">{0}</a>""".format(
                text, url)
        else:
            return u""
class SparkMagicBase(Magics):
    """Shared plumbing for the sparkmagic IPython magics (inline-HTML table)."""

    def __init__(self, shell, data=None, spark_events=None):
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)

        self.logger = SparkLog(u"SparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        self.logger.debug("Initialized spark magics.")

        if spark_events is None:
            spark_events = SparkEvents()
        spark_events.emit_library_loaded_event()

    def execute_sqlquery(self, cell, samplemethod, maxrows, samplefraction,
                         session, output_var, quiet):
        """Run a SQL query on the session; optionally bind the result to
        `output_var` in the user namespace and/or suppress the return."""
        sqlquery = self._sqlquery(cell, samplemethod, maxrows, samplefraction)
        df = self.spark_controller.run_sqlquery(sqlquery, session)
        if output_var is not None:
            self.shell.user_ns[output_var] = df
        if quiet:
            return None
        else:
            return df

    @staticmethod
    def _sqlquery(cell, samplemethod, maxrows, samplefraction):
        return SQLQuery(cell, samplemethod, maxrows, samplefraction)

    def _print_endpoint_info(self, info_sessions, current_session_id):
        # Render the session table (sorted by id) or a placeholder message.
        if info_sessions:
            info_sessions = sorted(info_sessions, key=lambda s: s.id)
            html = u"""<table>
<tr><th>ID</th><th>YARN Application ID</th><th>Kind</th><th>State</th><th>Spark UI</th><th>Driver log</th><th>Current session?</th></tr>""" + \
                u"".join([SparkMagicBase._session_row_html(session, current_session_id)
                          for session in info_sessions]) + \
                u"</table>"
            self.ipython_display.html(html)
        else:
            self.ipython_display.html(u'No active sessions.')

    @staticmethod
    def _session_row_html(session, current_session_id):
        # One <tr> per session; a check mark flags the current session.
        return u"""<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td></tr>""".format(
            session.id, session.get_app_id(), session.kind, session.status,
            SparkMagicBase._link(u'Link', session.get_spark_ui_url()),
            SparkMagicBase._link(u'Link', session.get_driver_log_url()),
            u"" if current_session_id is None or current_session_id != session.id else u"\u2714"
        )

    @staticmethod
    def _link(text, url):
        # Emit an anchor only when a URL is available. The anchor literal
        # spans two source lines (newline inside the triple-quoted string).
        if url is not None:
            return u"""<a target="_blank"
href="{1}">{0}</a>""".format(text, url)
        else:
            return u""
def __init__(self, endpoint, headers, retry_policy):
    """HTTP client that takes its requests auth object as-is from the endpoint."""
    self._endpoint = endpoint
    self._headers = headers
    self._retry_policy = retry_policy
    # The endpoint already carries a ready-to-use auth value for requests.
    self._auth = endpoint.auth
    self._session = requests.Session()
    self.logger = SparkLog(u"ReliableHttpClient")

    # Honor the user's "ignore SSL errors" configuration flag.
    self.verify_ssl = not conf.ignore_ssl_errors()
    if self.verify_ssl:
        return
    self.logger.debug(u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
    requests.packages.urllib3.disable_warnings()
def __init__(self, shell, data=None, spark_events=None):
    """Wire up logging, display, and the Spark controller for the magics."""
    # You must call the parent constructor
    super(SparkMagicBase, self).__init__(shell)

    self.logger = SparkLog(u"SparkMagics")
    self.ipython_display = IpythonDisplay()
    self.spark_controller = SparkController(self.ipython_display)
    self.logger.debug("Initialized spark magics.")

    # Emit the library-loaded telemetry event via the injected emitter,
    # falling back to a fresh one.
    events = spark_events if spark_events is not None else SparkEvents()
    events.emit_library_loaded_event()
def __init__(self, spark_controller, ipywidget_factory, ipython_display, endpoints, refresh_method):
    """Widget listing the configured endpoints, one child widget each."""
    # This is nested
    super(ManageEndpointWidget, self).__init__(spark_controller, ipywidget_factory, ipython_display, True)
    self.logger = SparkLog("ManageEndpointWidget")
    self.endpoints = endpoints
    self.refresh_method = refresh_method

    self.children = self.get_existing_endpoint_widgets()
    # Give each child a back-reference so it can trigger a refresh.
    for child in self.children:
        child.parent_widget = self
def __init__(self, endpoint, headers, retry_policy):
    """HTTP client with retries; Kerberos or basic auth from the endpoint.

    Fix: self._auth is now always initialized. Previously, an auth_type
    other than Kerberos/Basic left the attribute unset, which caused an
    AttributeError later when the client issued a request; None (no auth)
    is the safe default for those cases.
    """
    self._endpoint = endpoint
    self._headers = headers
    self._retry_policy = retry_policy
    self._auth = None
    if self._endpoint.auth_type == constants.AUTH_KERBEROS:
        # OPTIONAL mutual auth: tolerate servers that do not authenticate back.
        self._auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL)
    elif self._endpoint.auth_type == constants.AUTH_BASIC:
        self._auth = (self._endpoint.username, self._endpoint.password)
    self.logger = SparkLog(u"ReliableHttpClient")
    # Honor the user's "ignore SSL errors" configuration flag.
    self.verify_ssl = not conf.ignore_ssl_errors()
    if not self.verify_ssl:
        self.logger.debug(
            u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks."
        )
        requests.packages.urllib3.disable_warnings()
def __init__(self, http_client, properties, ipython_display, session_id=-1,
             spark_events=None, heartbeat_timeout=0, heartbeat_thread=None):
    """Represent one Livy session.

    session_id == -1 means "not yet created on the server"; any other id
    attaches to an existing (busy) session and starts the heartbeat thread
    immediately. A positive heartbeat_timeout enables heartbeating and is
    forwarded to Livy via the session properties.
    """
    super(LivySession, self).__init__()
    assert constants.LIVY_KIND_PARAM in list(properties.keys())
    kind = properties[constants.LIVY_KIND_PARAM]

    should_heartbeat = False
    if heartbeat_timeout > 0:
        should_heartbeat = True
        # Note: mutates the caller-supplied properties dict in place.
        properties[constants.LIVY_HEARTBEAT_TIMEOUT_PARAM] = heartbeat_timeout
    elif constants.LIVY_HEARTBEAT_TIMEOUT_PARAM in list(properties.keys()):
        # Heartbeating disabled: strip any stale timeout from the properties.
        properties.pop(constants.LIVY_HEARTBEAT_TIMEOUT_PARAM)

    self.properties = properties
    self.ipython_display = ipython_display
    self._should_heartbeat = should_heartbeat
    self._user_passed_heartbeat_thread = heartbeat_thread

    if spark_events is None:
        spark_events = SparkEvents()
    self._spark_events = spark_events

    # Polling intervals and idle-wait timeout come from user configuration.
    status_sleep_seconds = conf.status_sleep_seconds()
    statement_sleep_seconds = conf.statement_sleep_seconds()
    wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

    assert status_sleep_seconds > 0
    assert statement_sleep_seconds > 0
    assert wait_for_idle_timeout_seconds > 0

    self.logger = SparkLog(u"LivySession")

    kind = kind.lower()
    if kind not in constants.SESSION_KINDS_SUPPORTED:
        raise BadUserDataException(u"Session of kind '{}' not supported. Session must be of kinds {}."
                                   .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

    self._app_id = None
    self._logs = u""
    self._http_client = http_client
    self._status_sleep_seconds = status_sleep_seconds
    self._statement_sleep_seconds = statement_sleep_seconds
    self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds
    self._printed_resource_warning = False

    self.kind = kind
    self.id = session_id
    self.session_info = u""
    self._heartbeat_thread = None
    if session_id == -1:
        self.status = constants.NOT_STARTED_SESSION_STATUS
    else:
        # Attaching to an existing session: assume busy until proven idle.
        self.status = constants.BUSY_SESSION_STATUS
        self._start_heartbeat_thread()
def __init__(self, endpoint, headers, retry_policy):
    """Minimal HTTP client wrapper: endpoint, headers, and retry policy."""
    self._endpoint = endpoint
    self._headers = headers
    self._retry_policy = retry_policy
    self.logger = SparkLog(u"ReliableHttpClient")

    # Honor the user's "ignore SSL errors" configuration flag.
    self.verify_ssl = not conf.ignore_ssl_errors()
    if self.verify_ssl:
        return
    # Warn in the debug log, then silence urllib3's per-request warnings.
    self.logger.debug(u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
    requests.packages.urllib3.disable_warnings()
def __init__(self, endpoint, headers, retry_policy):
    """HTTP client authenticating per the endpoint's configured scheme.

    Kerberos settings come from user configuration; basic auth uses the
    endpoint credentials; anything other than no-auth is rejected.
    """
    self._endpoint = endpoint
    self._headers = headers
    self._retry_policy = retry_policy

    auth_scheme = endpoint.auth
    if auth_scheme == constants.AUTH_KERBEROS:
        self._auth = HTTPKerberosAuth(**conf.kerberos_auth_configuration())
    elif auth_scheme == constants.AUTH_BASIC:
        self._auth = (endpoint.username, endpoint.password)
    elif auth_scheme != constants.NO_AUTH:
        raise BadUserConfigurationException(u"Unsupported auth %s" % auth_scheme)

    self._session = requests.Session()
    self.logger = SparkLog(u"ReliableHttpClient")

    # Honor the user's "ignore SSL errors" configuration flag.
    self.verify_ssl = not conf.ignore_ssl_errors()
    if self.verify_ssl:
        return
    self.logger.debug(u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
    requests.packages.urllib3.disable_warnings()
def __init__(self, http_client, properties, ipython_display, session_id=-1,
             sql_created=None, spark_events=None, should_heartbeat=False,
             heartbeat_thread=None):
    """Represent one Livy session.

    session_id == -1 means "not yet created on the server"; any other id
    attaches to an existing (busy) session and starts the heartbeat thread
    immediately. sql_created records whether a SQL context exists and is
    only meaningful together with an existing session id.
    """
    super(LivySession, self).__init__()
    assert u"kind" in list(properties.keys())
    kind = properties[u"kind"]
    self.properties = properties
    self.ipython_display = ipython_display
    self._should_heartbeat = should_heartbeat
    self._user_passed_heartbeat_thread = heartbeat_thread

    if spark_events is None:
        spark_events = SparkEvents()
    self._spark_events = spark_events

    # Polling intervals and idle-wait timeout come from user configuration.
    status_sleep_seconds = conf.status_sleep_seconds()
    statement_sleep_seconds = conf.statement_sleep_seconds()
    wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

    assert status_sleep_seconds > 0
    assert statement_sleep_seconds > 0
    assert wait_for_idle_timeout_seconds > 0
    if session_id == -1 and sql_created is True:
        raise BadUserDataException(u"Cannot indicate sql state without session id.")

    self.logger = SparkLog(u"LivySession")

    kind = kind.lower()
    if kind not in constants.SESSION_KINDS_SUPPORTED:
        raise BadUserDataException(u"Session of kind '{}' not supported. Session must be of kinds {}."
                                   .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

    self._app_id = None
    self._logs = u""
    self._http_client = http_client
    self._status_sleep_seconds = status_sleep_seconds
    self._statement_sleep_seconds = statement_sleep_seconds
    self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

    self.kind = kind
    self.id = session_id
    self.created_sql_context = sql_created
    self._heartbeat_thread = None
    if session_id == -1:
        self.status = constants.NOT_STARTED_SESSION_STATUS
        sql_created = False
    else:
        # Attaching to an existing session: assume busy until proven idle.
        self.status = constants.BUSY_SESSION_STATUS
        self._start_heartbeat_thread()
class SparkMagicBase(Magics):
    """Shared plumbing for the sparkmagic IPython magics."""

    def __init__(self, shell, data=None, spark_events=None):
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)

        self.logger = SparkLog(u"SparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        self.logger.debug("Initialized spark magics.")

        if spark_events is None:
            spark_events = SparkEvents()
        spark_events.emit_library_loaded_event()

    def execute_sqlquery(self, cell, samplemethod, maxrows, samplefraction,
                         session, output_var, quiet):
        """Run a SQL query on the session; optionally bind the result to
        `output_var` in the user namespace and/or suppress the return."""
        sqlquery = self._sqlquery(cell, samplemethod, maxrows, samplefraction)
        df = self.spark_controller.run_sqlquery(sqlquery, session)
        if output_var is not None:
            self.shell.user_ns[output_var] = df
        if quiet:
            return None
        else:
            return df

    @staticmethod
    def _sqlquery(cell, samplemethod, maxrows, samplefraction):
        return SQLQuery(cell, samplemethod, maxrows, samplefraction)

    def _print_endpoint_info(self, info_sessions, current_session_id):
        # Render the session table (sorted by id) or a placeholder message.
        if info_sessions:
            info_sessions = sorted(info_sessions, key=lambda s: s.id)
            html = get_sessions_info_html(info_sessions, current_session_id)
            self.ipython_display.html(html)
        else:
            self.ipython_display.html(u'No active sessions.')
class SparkKernelBase(IPythonKernel):
    """Base class for the sparkmagic Jupyter kernels.

    Routes user cells through the magics machinery and handles fatal
    startup errors by queuing them (IPython crashes if __init__ raises)
    and replaying them when the next cell executes.
    """

    def __init__(self, implementation, implementation_version, language, language_version,
                 language_info, session_language, user_code_parser=None, **kwargs):
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.session_language = session_language

        super(SparkKernelBase, self).__init__(**kwargs)

        self.logger = SparkLog(u"{}_jupyter_kernel".format(self.session_language))
        # Queued startup error, surfaced on the next cell execution.
        self._fatal_error = None
        self.ipython_display = IpythonDisplay()

        # Fall back to the default parser when the caller does not supply one.
        if user_code_parser is None:
            self.user_code_parser = UserCodeParser()
        else:
            self.user_code_parser = user_code_parser

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        # The "testing" kwarg lets unit tests skip magic loading entirely.
        if not kwargs.get("testing", False):
            self._load_magics_extension()
            self._change_language()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        # Replay any queued fatal error instead of running the user's code;
        # otherwise run it wrapped so unexpected exceptions complete the cell.
        def f(self):
            if self._fatal_error is not None:
                return self._repeat_fatal_error()
            return self._do_execute(code, silent, store_history, user_expressions, allow_stdin)
        return wrap_unexpected_exceptions(f, self._complete_cell)(self)

    def do_shutdown(self, restart):
        # Cleanup
        self._delete_session()
        return self._do_shutdown_ipykernel(restart)

    def _do_execute(self, code, silent, store_history, user_expressions, allow_stdin):
        # Let the parser rewrite the raw cell (e.g. into magics) before running.
        code_to_run = self.user_code_parser.get_code_to_run(code)
        res = self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)
        return res

    def _load_magics_extension(self):
        register_magics_code = "%load_ext sparkmagic.kernels"
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark kernels magics library.")
        self.logger.debug("Loaded magics.")

    def _change_language(self):
        register_magics_code = "%%_do_not_call_change_language -l {}\n ".format(self.session_language)
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to change language to {}.".format(self.session_language))
        self.logger.debug("Changed language.")

    def _register_auto_viz(self):
        # Route autovizwidget's events through the sparkmagic events handler,
        # then install the dataframe display formatter in the user's IPython.
        from sparkmagic.utils.sparkevents import get_spark_events_handler
        import autovizwidget.utils.configuration as c

        handler = get_spark_events_handler()
        c.override("events_handler", handler)
        register_auto_viz_code = """from autovizwidget.widget.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(register_auto_viz_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to register auto viz for notebook.")
        self.logger.debug("Registered auto viz.")

    def _delete_session(self):
        code = "%%_do_not_call_delete_session\n "
        self._execute_cell_for_user(code, True, False)

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None,
                      allow_stdin=False, shutdown_if_error=False, log_if_error=None):
        """Execute a cell; optionally convert an error reply into a fatal error."""
        reply_content = self._execute_cell_for_user(code, silent, store_history,
                                                    user_expressions, allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(log_if_error, error_from_reply)
                return self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self, code, silent, store_history=True,
                               user_expressions=None, allow_stdin=False):
        # Bypass our own do_execute override and run through IPythonKernel.
        return super(SparkKernelBase, self).do_execute(code, silent, store_history,
                                                       user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _complete_cell(self):
        """Run a cell with no effect.

        Call this and return its value when an error prevents the user's
        cell from executing; it marks the cell as completed in the Jupyter UI.
        """
        return self._execute_cell("None", False, True, None, False)

    def _show_user_error(self, message):
        self.logger.error(message)
        self.ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queue a fatal error to be thrown when the next cell executes.

        Does not raise immediately — used for kernel-startup errors, since
        IPython crashes if __init__ throws.
        """
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queue a fatal error and throw it immediately."""
        self._queue_fatal_error(message)
        return self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throw an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self.logger.error(error)
        self.ipython_display.send_error(error)
        return self._complete_cell()
class ReconnectHandler(IPythonHandler):
    # Tornado handler that reconnects a notebook's kernel to a (possibly new)
    # Livy endpoint by running the change-endpoint magic inside that kernel.
    logger = None

    @web.authenticated
    @gen.coroutine
    def post(self):
        # Flow: parse JSON body -> find/create/restart the notebook's kernel ->
        # execute the change-endpoint magic -> report success/error + emit event.
        self.logger = SparkLog(u"ReconnectHandler")
        spark_events = self._get_spark_events()

        try:
            data = json_decode(self.request.body)
        except ValueError as e:
            self.set_status(400)
            msg = "Invalid JSON in request body."
            self.logger.error(msg)
            self.finish(msg)
            spark_events.emit_cluster_change_event(None, 400, False, msg)
            return

        endpoint = None
        try:
            path = self._get_argument_or_raise(data, 'path')
            username = self._get_argument_or_raise(data, 'username')
            password = self._get_argument_or_raise(data, 'password')
            endpoint = self._get_argument_or_raise(data, 'endpoint')
            auth = self._get_argument_if_exists(data, 'auth')
            # Infer the auth mechanism when the caller did not specify one:
            # empty credentials mean no auth, otherwise basic auth.
            if auth is None:
                if username == '' and password == '':
                    auth = constants.NO_AUTH
                else:
                    auth = constants.AUTH_BASIC
        except MissingArgumentError as e:
            self.set_status(400)
            self.finish(str(e))
            self.logger.error(str(e))
            spark_events.emit_cluster_change_event(endpoint, 400, False, str(e))
            return

        kernel_name = self._get_kernel_name(data)

        # Get kernel manager, create a new kernel if none exists or restart the
        # existing one when applicable.
        kernel_manager = yield self._get_kernel_manager(path, kernel_name)

        # Execute code
        client = kernel_manager.client()
        code = '%{} -s {} -u {} -p {} -t {}'.format(KernelMagics._do_not_call_change_endpoint.__name__, endpoint, username, password, auth)
        response_id = client.execute(code, silent=False, store_history=False)
        msg = client.get_shell_msg(response_id)

        # Get execution info
        successful_message = self._msg_successful(msg)
        error = self._msg_error(msg)
        if successful_message:
            status_code = 200
        else:
            self.logger.error(u"Code to reconnect errored out: {}".format(error))
            status_code = 500

        # Post execution info
        self.set_status(status_code)
        self.finish(json.dumps(dict(success=successful_message, error=error), sort_keys=True))
        spark_events.emit_cluster_change_event(endpoint, status_code, successful_message, error)

    def _get_kernel_name(self, data):
        # Fall back to the configured default kernel when none was requested.
        kernel_name = self._get_argument_if_exists(data, 'kernelname')
        self.logger.debug("Kernel name is {}".format(kernel_name))
        if kernel_name is None:
            kernel_name = conf.server_extension_default_kernel_name()
            self.logger.debug("Defaulting to kernel name {}".format(kernel_name))
        return kernel_name

    def _get_argument_if_exists(self, data, key):
        # Optional argument: None when absent.
        return data.get(key)

    def _get_argument_or_raise(self, data, key):
        # Required argument: MissingArgumentError when absent.
        try:
            return data[key]
        except KeyError:
            raise MissingArgumentError(key)

    @gen.coroutine
    def _get_kernel_manager(self, path, kernel_name):
        # Find an existing Jupyter session for this notebook path. Reuse its
        # kernel (restarting it), replace it when the kernel name differs, or
        # start a brand-new session when none exists.
        sessions = self.session_manager.list_sessions()
        kernel_id = None
        for session in sessions:
            if session['notebook']['path'] == path:
                session_id = session['id']
                kernel_id = session['kernel']['id']
                existing_kernel_name = session['kernel']['name']
                break

        if kernel_id is None:
            self.logger.debug(u"Kernel not found. Starting a new kernel.")
            k_m = yield self._get_kernel_manager_new_session(path, kernel_name)
        elif existing_kernel_name != kernel_name:
            self.logger.debug(u"Existing kernel name '{}' does not match requested '{}'. Starting a new kernel.".format(existing_kernel_name, kernel_name))
            self._delete_session(session_id)
            k_m = yield self._get_kernel_manager_new_session(path, kernel_name)
        else:
            self.logger.debug(u"Kernel found. Restarting kernel.")
            k_m = self.kernel_manager.get_kernel(kernel_id)
            k_m.restart_kernel()
        raise gen.Return(k_m)

    @gen.coroutine
    def _get_kernel_manager_new_session(self, path, kernel_name):
        # Create a fresh notebook session and return its kernel manager.
        model_future = self.session_manager.create_session(kernel_name=kernel_name, path=path, type="notebook")
        model = yield model_future
        kernel_id = model["kernel"]["id"]
        self.logger.debug("Kernel created with id {}".format(str(kernel_id)))
        k_m = self.kernel_manager.get_kernel(kernel_id)
        raise gen.Return(k_m)

    def _delete_session(self, session_id):
        self.session_manager.delete_session(session_id)

    def _msg_status(self, msg):
        return msg['content']['status']

    def _msg_successful(self, msg):
        return self._msg_status(msg) == 'ok'

    def _msg_error(self, msg):
        # None unless the kernel reply status is 'error'.
        if self._msg_status(msg) != 'error':
            return None
        return u'{}:\n{}'.format(msg['content']['ename'], msg['content']['evalue'])

    def _get_spark_events(self):
        # Allows tests to inject a 'spark_events' attribute; defaults otherwise.
        spark_events = getattr(self, 'spark_events', None)
        if spark_events is None:
            return SparkEvents()
        return spark_events
class LivySession(ObjectWithGuid):
    """Represents one interactive Livy session and wraps its REST lifecycle
    (create, wait-for-idle, execute, delete) plus optional keep-alive heartbeat."""

    def __init__(self, http_client, properties, ipython_display,
                 session_id=-1, sql_created=None, spark_events=None,
                 should_heartbeat=False, heartbeat_thread=None):
        super(LivySession, self).__init__()
        # 'kind' (spark/pyspark/sparkr) is mandatory in the session properties.
        assert u"kind" in list(properties.keys())
        kind = properties[u"kind"]
        self.properties = properties
        self.ipython_display = ipython_display
        self._should_heartbeat = should_heartbeat
        self._user_passed_heartbeat_thread = heartbeat_thread

        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert wait_for_idle_timeout_seconds > 0
        # A sql context can only pre-exist for an already-created session.
        if session_id == -1 and sql_created is True:
            raise BadUserDataException(u"Cannot indicate sql state without session id.")

        self.logger = SparkLog(u"LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise BadUserDataException(u"Session of kind '{}' not supported. Session must be of kinds {}.".format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        self._app_id = None          # lazily cached YARN application id
        self._logs = u""             # last fetched session log text
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

        self.kind = kind
        self.id = session_id
        self.created_sql_context = sql_created
        self._heartbeat_thread = None
        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
            # NOTE(review): this rebinds the local 'sql_created' only;
            # self.created_sql_context was already assigned above — confirm intent.
            sql_created = False
        else:
            self.status = constants.BUSY_SESSION_STATUS
            self._start_heartbeat_thread()

    def __str__(self):
        return u"Session id: {}\tYARN id: {}\tKind: {}\tState: {}\n\tSpark UI: {}\n\tDriver Log: {}"\
            .format(self.id, self.get_app_id(), self.kind, self.status,
                    self.get_spark_ui_url(), self.get_driver_log_url())

    def start(self, create_sql_context=True):
        """Start the session against actual livy server."""
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)

        try:
            r = self._http_client.post_session(self.properties)
            self.id = r[u"id"]
            self.status = str(r[u"state"])

            self.ipython_display.writeln(u"Creating SparkContext as 'sc'")
            # Start heartbeat thread to keep Livy interactive session alive.
            self._start_heartbeat_thread()

            # We wait for livy_session_startup_timeout_seconds() for the session to start up.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                raise LivyClientTimeoutException(u"Session {} did not start up in {} seconds.".format(self.id, conf.livy_session_startup_timeout_seconds()))

            html = get_sessions_info_html([self], self.id)
            self.ipython_display.html(html)

            if create_sql_context:
                self.create_sql_context()
        except Exception as e:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status, False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status, True, "", "")

    def create_sql_context(self):
        """Create a sqlContext object on the session. Object will be accessible via variable 'sqlContext'."""
        if self.created_sql_context:
            return
        self.logger.debug(u"Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln(u"Creating HiveContext as 'sqlContext'")
        command = self._get_sql_context_creation_command()
        try:
            (success, out) = command.execute(self)
        except LivyClientTimeoutException:
            raise LivyClientTimeoutException(u"Failed to create the SqlContext in time. Timed out after {} seconds.".format(self._wait_for_idle_timeout_seconds))
        if success:
            self.ipython_display.writeln(u"SparkContext and HiveContext created. Executing user code ...")
            self.created_sql_context = True
        else:
            raise FailedToCreateSqlContextException(u"Failed to create the SqlContext.\nError: '{}'".format(out))

    def get_app_id(self):
        # Lazily fetch and cache the YARN application id from Livy.
        if self._app_id is None:
            self._app_id = self._http_client.get_session(self.id).get("appId")
        return self._app_id

    def get_app_info(self):
        # Livy may return null appInfo; normalize to an empty dict.
        appInfo = self._http_client.get_session(self.id).get("appInfo")
        return appInfo if appInfo is not None else {}

    def get_app_info_member(self, member_name):
        return self.get_app_info().get(member_name)

    def get_driver_log_url(self):
        return self.get_app_info_member("driverLogUrl")

    def get_logs(self):
        log_array = self._http_client.get_all_session_logs(self.id)[u'log']
        self._logs = "\n".join(log_array)
        return self._logs

    def get_spark_ui_url(self):
        return self.get_app_info_member("sparkUiUrl")

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        return status in constants.FINAL_STATUS

    def delete(self):
        # Delete the remote Livy session (if started) and mark this object dead.
        session_id = self.id
        self._spark_events.emit_session_deletion_start_event(self.guid, self.kind, session_id, self.status)

        try:
            self.logger.debug(u"Deleting session '{}'".format(session_id))

            if self.status != constants.NOT_STARTED_SESSION_STATUS:
                self._http_client.delete_session(session_id)
                self._stop_heartbeat_thread()
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
            else:
                self.ipython_display.send_error(u"Cannot delete session {} that is in state '{}'.".format(session_id, self.status))
        except Exception as e:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, session_id, self.status, False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, session_id, self.status, True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile.

        Calls done every status_sleep_seconds as indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        while True:
            self.refresh_status()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = u"Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException(u'{} See logs:\n{}'.format(error, self.get_logs()))

            if seconds_to_wait <= 0.0:
                error = u"Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            # Subtract actual elapsed time (not just the sleep interval) so
            # slow HTTP calls also count against the timeout budget.
            start_time = time()
            self.logger.debug(u"Session {} in state {}. Sleeping {} seconds.".format(self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            seconds_to_wait -= time() - start_time

    def sleep(self):
        # NOTE(review): Command._get_statement_output in this file calls
        # session.sleep(retries) with one argument, which this zero-argument
        # signature cannot accept — confirm which side is intended.
        sleep(self._statement_sleep_seconds)

    def refresh_status(self):
        # Pull the latest state from Livy; reject states we do not know about.
        status = self._http_client.get_session(self.id)[u'state']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
        else:
            raise LivyUnexpectedStatusException(u"Status '{}' not supported by session.".format(status))

        return self.status

    def _get_sql_context_creation_command(self):
        # Pick the language-appropriate snippet for creating a HiveContext.
        if self.kind == constants.SESSION_KIND_SPARK:
            sql_context_command = u"val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_PYSPARK:
            sql_context_command = u"from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_SPARKR:
            sql_context_command = u"sqlContext <- sparkRHive.init(sc)"
        else:
            raise BadUserDataException(u"Do not know how to create HiveContext in session of kind {}.".format(self.kind))

        return Command(sql_context_command)

    def _start_heartbeat_thread(self):
        # Start (at most one) daemon thread that keeps the Livy session alive.
        if self._should_heartbeat and self._heartbeat_thread is None:
            refresh_seconds = conf.heartbeat_refresh_seconds()
            retry_seconds = conf.heartbeat_retry_seconds()

            if self._user_passed_heartbeat_thread is None:
                self._heartbeat_thread = _HeartbeatThread(self, refresh_seconds, retry_seconds)
            else:
                self._heartbeat_thread = self._user_passed_heartbeat_thread

            self._heartbeat_thread.daemon = True
            self._heartbeat_thread.start()

    def _stop_heartbeat_thread(self):
        if self._heartbeat_thread is not None:
            self._heartbeat_thread.stop()
            self._heartbeat_thread = None

    def get_row_html(self, current_session_id):
        # One <tr> for the sessions-info table; the last cell is a checkmark
        # when this row is the current session.
        return u"""<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td></tr>""".format(
            self.id, self.get_app_id(), self.kind, self.status,
            self.get_html_link(u'Link', self.get_spark_ui_url()),
            self.get_html_link(u'Link', self.get_driver_log_url()),
            u"" if current_session_id is None or current_session_id != self.id else u"\u2714")

    @staticmethod
    def get_html_link(text, url):
        # Empty string (not a dead link) when the URL is unavailable.
        if url is not None:
            return u"""<a target="_blank" href="{1}">{0}</a>""".format(text, url)
        else:
            return u""
class Command(ObjectWithGuid):
    """A single block of code to be executed as one Livy statement.

    Two Commands compare equal iff their (dedented) code is equal.
    """

    def __init__(self, code, spark_events=None):
        super(Command, self).__init__()
        # Dedent so indented notebook snippets are valid top-level code.
        self.code = textwrap.dedent(code)
        self.logger = SparkLog(u"Command")
        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

    def __repr__(self):
        return "Command({}, ...)".format(repr(self.code))

    def __eq__(self, other):
        # Robustness fix: comparing against a non-Command previously raised
        # AttributeError (no '.code'); it now simply reports inequality.
        return isinstance(other, Command) and self.code == other.code

    def __ne__(self, other):
        return not self == other

    def execute(self, session):
        """Execute this command on the given LivySession.

        Waits for the session to become idle, posts the statement, then polls
        it to completion. Emits statement-execution start/end events and
        re-raises any failure after recording it.

        Returns the (success, output) pair from _get_statement_output.
        """
        self._spark_events.emit_statement_execution_start_event(session.guid, session.kind, session.id, self.guid)
        statement_id = -1
        try:
            session.wait_for_idle()
            data = {u"code": self.code}
            response = session.http_client.post_statement(session.id, data)
            statement_id = response[u'id']
            output = self._get_statement_output(session, statement_id)
        except Exception as e:
            self._spark_events.emit_statement_execution_end_event(session.guid, session.kind, session.id, self.guid, statement_id, False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_statement_execution_end_event(session.guid, session.kind, session.id, self.guid, statement_id, True, "", "")
            return output

    def _get_statement_output(self, session, statement_id):
        """Poll Livy until the statement finishes; return (success, output).

        Output is a unicode string for text results, an IPython Image for
        'image/png' payloads, or the error text on failure. A progress bar is
        shown and updated while polling.
        """
        retries = 1
        progress = FloatProgress(value=0.0, min=0, max=1.0, step=0.01,
                                 description='Progress:', bar_style='info',
                                 orientation='horizontal',
                                 layout=Layout(width='50%', height='25px'))
        session.ipython_display.display(progress)

        while True:
            statement = session.http_client.get_statement(session.id, statement_id)
            status = statement[u"state"].lower()

            self.logger.debug(u"Status of statement {} is {}.".format(statement_id, status))

            if status not in FINAL_STATEMENT_STATUS:
                progress.value = statement.get('progress', 0.0)
                # Bug fix: LivySession.sleep() takes no arguments in this file;
                # passing the retry count raised TypeError on every poll.
                session.sleep()
                retries += 1
            else:
                statement_output = statement[u"output"]
                progress.close()

                if statement_output is None:
                    return (True, u"")

                if statement_output[u"status"] == u"ok":
                    data = statement_output[u"data"]
                    png_encoded = data.get("image/png")
                    if png_encoded:
                        image = Image(base64.b64decode(png_encoded))
                        return (True, image)
                    else:
                        return (True, statement_output[u"data"][u"text/plain"])
                elif statement_output[u"status"] == u"error":
                    return (False,
                            statement_output[u"evalue"] + u"\n" + u"".join(statement_output[u"traceback"]))
                else:
                    raise LivyUnexpectedStatusException(u"Unknown output status from Livy: '{}'".format(statement_output[u"status"]))
class SparkKernelBase(IPythonKernel):
    # Base class for the language-specific sparkmagic Jupyter kernels. All user
    # code is executed by delegating to the wrapped IPython kernel, after the
    # sparkmagic magics have been loaded and configured.

    def __init__(self, implementation, implementation_version, language,
                 language_version, language_info, session_language,
                 user_code_parser=None, **kwargs):
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.session_language = session_language

        super(SparkKernelBase, self).__init__(**kwargs)

        self.logger = SparkLog(u"{}_jupyter_kernel".format(self.session_language))
        self._fatal_error = None
        self.ipython_display = IpythonDisplay()

        if user_code_parser is None:
            self.user_code_parser = UserCodeParser()
        else:
            self.user_code_parser = user_code_parser

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        # Skip magic/viz setup entirely when constructed with testing=True.
        if not kwargs.get("testing", False):
            self._load_magics_extension()
            self._change_language()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        # Replay any queued startup error instead of running user code;
        # wrap_unexpected_exceptions converts unexpected failures into a
        # completed (not crashed) cell.
        def f(self):
            if self._fatal_error is not None:
                return self._repeat_fatal_error()
            return self._do_execute(code, silent, store_history, user_expressions, allow_stdin)
        return wrap_unexpected_exceptions(f, self._complete_cell)(self)

    def do_shutdown(self, restart):
        # Cleanup (session deletion intentionally left disabled in the source)
        # self._delete_session()
        return self._do_shutdown_ipykernel(restart)

    def _do_execute(self, code, silent, store_history, user_expressions, allow_stdin):
        # Parse user code through the configured parser, then execute it.
        code_to_run = self.user_code_parser.get_code_to_run(code)
        res = self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)
        return res

    def _load_magics_extension(self):
        register_magics_code = "%load_ext sparkmagic.kernels"
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark kernels magics library.")
        self.logger.debug("Loaded magics.")

    def _change_language(self):
        # Tell the magics library which language this kernel speaks.
        register_magics_code = "%%_do_not_call_change_language -l {}\n ".format(self.session_language)
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to change language to {}.".format(self.session_language))
        self.logger.debug("Changed language.")

    def _register_auto_viz(self):
        # Route autovizwidget events through the sparkmagic events handler and
        # install the DataFrame auto-visualization hook in the IPython shell.
        from sparkmagic.utils.sparkevents import get_spark_events_handler
        import autovizwidget.utils.configuration as c

        handler = get_spark_events_handler()
        c.override("events_handler", handler)
        register_auto_viz_code = """from autovizwidget.widget.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(register_auto_viz_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to register auto viz for notebook.")
        self.logger.debug("Registered auto viz.")

    def _delete_session(self):
        # Tear down the Livy session via the hidden delete-session magic.
        code = "%%_do_not_call_delete_session\n "
        self._execute_cell_for_user(code, True, False)

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None,
                      allow_stdin=False, shutdown_if_error=False, log_if_error=None):
        # Run a cell through the user execution path; when shutdown_if_error is
        # set and a message is provided, convert an error reply into a fatal
        # kernel error.
        reply_content = self._execute_cell_for_user(code, silent, store_history,
                                                    user_expressions, allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(log_if_error, error_from_reply)
                return self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self, code, silent, store_history=True,
                               user_expressions=None, allow_stdin=False):
        # Delegate to the real IPython kernel implementation.
        return super(SparkKernelBase, self).do_execute(code, silent, store_history,
                                                       user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _complete_cell(self):
        """A method that runs a cell with no effect.

        Call this and return the value it returns when there's some sort of
        error preventing the user's cell from executing; this will register the
        cell from the Jupyter UI as being completed."""
        return self._execute_cell("None", False, True, None, False)

    def _show_user_error(self, message):
        # Log and surface an error to the notebook UI.
        self.logger.error(message)
        self.ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed;
        does not raise an error immediately. We use this for errors that happen
        on kernel startup, since IPython crashes if we throw an exception in the
        __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        return self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self.logger.error(error)
        self.ipython_display.send_error(error)
        return self._complete_cell()
class ReliableHttpClient(object):
    """Http client that is reliable in its requests. Uses requests library.

    Retries failed requests according to the supplied retry_policy and supports
    no-auth, basic, and Kerberos authentication as configured on the endpoint.
    """

    def __init__(self, endpoint, headers, retry_policy):
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy
        if self._endpoint.auth == constants.AUTH_KERBEROS:
            # Consistency fix: honor the endpoint's mutual-auth setting and
            # hostname override instead of hardcoding REQUIRED, matching the
            # other ReliableHttpClient definition earlier in this file.
            if self._endpoint.krb_mutual_auth == constants.AUTH_KERBEROS_MUTUAL_REQ:
                mutual_auth = REQUIRED
            elif self._endpoint.krb_mutual_auth == constants.AUTH_KERBEROS_MUTUAL_OPT:
                mutual_auth = OPTIONAL
            elif self._endpoint.krb_mutual_auth == constants.AUTH_KERBEROS_MUTUAL_DIS:
                mutual_auth = DISABLED
            else:
                # Unknown setting: fall back to the safest option.
                mutual_auth = REQUIRED
            if self._endpoint.krb_host_override == "":
                hostname_override = None
            else:
                hostname_override = self._endpoint.krb_host_override
            self._auth = HTTPKerberosAuth(mutual_authentication=mutual_auth,
                                          hostname_override=hostname_override)
        elif self._endpoint.auth == constants.AUTH_BASIC:
            self._auth = (self._endpoint.username, self._endpoint.password)
        elif self._endpoint.auth != constants.NO_AUTH:
            raise BadUserConfigurationException(u"Unsupported auth %s" % self._endpoint.auth)
        self.logger = SparkLog(u"ReliableHttpClient")

        self.verify_ssl = not conf.ignore_ssl_errors()
        if not self.verify_ssl:
            self.logger.debug(u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
            requests.packages.urllib3.disable_warnings()

    def get_headers(self):
        return self._headers

    def compose_url(self, relative_url):
        # Normalize to exactly one slash between the base url and the path.
        r_u = "/{}".format(relative_url.rstrip(u"/").lstrip(u"/"))
        return self._endpoint.url + r_u

    def get(self, relative_url, accepted_status_codes):
        """Sends a get request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.get)

    def post(self, relative_url, accepted_status_codes, data):
        """Sends a post request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.post, data)

    def delete(self, relative_url, accepted_status_codes):
        """Sends a delete request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.delete)

    def _send_request(self, relative_url, accepted_status_codes, function, data=None):
        return self._send_request_helper(self.compose_url(relative_url), accepted_status_codes, function, data, 0)

    def _send_request_helper(self, url, accepted_status_codes, function, data, retry_count):
        """Issue one request, retrying per retry_policy.

        Retries on transport errors and on status codes outside
        accepted_status_codes; raises HttpClientException once the retry policy
        gives up.
        """
        while True:
            try:
                if self._endpoint.auth == constants.NO_AUTH:
                    if data is None:
                        r = function(url, headers=self._headers, verify=self.verify_ssl)
                    else:
                        r = function(url, headers=self._headers, data=json.dumps(data), verify=self.verify_ssl)
                else:
                    if data is None:
                        r = function(url, headers=self._headers, auth=self._auth, verify=self.verify_ssl)
                    else:
                        r = function(url, headers=self._headers, auth=self._auth, data=json.dumps(data), verify=self.verify_ssl)
            except requests.exceptions.RequestException as e:
                error = True
                r = None
                status = None
                text = None
                self.logger.error(u"Request to '{}' failed with '{}'".format(url, e))
            else:
                error = False
                status = r.status_code
                text = r.text

            if error or status not in accepted_status_codes:
                if self._retry_policy.should_retry(status, error, retry_count):
                    sleep(self._retry_policy.seconds_to_sleep(retry_count))
                    retry_count += 1
                    continue

                if error:
                    raise HttpClientException(u"Error sending http request and maximum retry encountered.")
                else:
                    raise HttpClientException(u"Invalid status code '{}' from {} with error payload: {}".format(status, url, text))
            return r
class SparkController(object):
    """Facade used by the magics: routes calls to the right LivySession via a
    SessionManager and builds HTTP clients for Livy endpoints."""

    def __init__(self, ipython_display):
        self.logger = SparkLog(u"SparkController")
        self.ipython_display = ipython_display
        self.session_manager = SessionManager()

    def get_app_id(self, client_name=None):
        return self.get_session_by_name_or_default(client_name).get_app_id()

    def get_driver_log_url(self, client_name=None):
        return self.get_session_by_name_or_default(client_name).get_driver_log_url()

    def get_logs(self, client_name=None):
        return self.get_session_by_name_or_default(client_name).get_logs()

    def get_spark_ui_url(self, client_name=None):
        return self.get_session_by_name_or_default(client_name).get_spark_ui_url()

    def run_command(self, command, client_name=None):
        target = self.get_session_by_name_or_default(client_name)
        return command.execute(target)

    def run_sqlquery(self, sqlquery, client_name=None):
        target = self.get_session_by_name_or_default(client_name)
        return sqlquery.execute(target)

    def get_all_sessions_endpoint(self, endpoint):
        # Build a LivySession object for each session Livy reports, then
        # refresh every one so statuses are current.
        client = self._http_client(endpoint)
        found = [
            self._livy_session(client, {u"kind": info[u"kind"]},
                               self.ipython_display, info[u"id"])
            for info in client.get_sessions()[u"sessions"]
        ]
        for livy_session in found:
            livy_session.refresh_status()
        return found

    def get_all_sessions_endpoint_info(self, endpoint):
        return [str(livy_session) for livy_session in self.get_all_sessions_endpoint(endpoint)]

    def cleanup(self):
        self.session_manager.clean_up_all()

    def cleanup_endpoint(self, endpoint):
        for livy_session in self.get_all_sessions_endpoint(endpoint):
            livy_session.delete()

    def delete_session_by_name(self, name):
        self.session_manager.delete_client(name)

    def delete_session_by_id(self, endpoint, session_id):
        # Two separate clients are created here, mirroring the original
        # behavior: one for the lookup, one handed to the session object.
        lookup_client = self._http_client(endpoint)
        response = lookup_client.get_session(session_id)
        delete_client = self._http_client(endpoint)
        target = self._livy_session(delete_client, {u"kind": response[u"kind"]},
                                    self.ipython_display, session_id, False)
        target.delete()

    def add_session(self, name, endpoint, skip_if_exists, properties):
        if skip_if_exists and (name in self.session_manager.get_sessions_list()):
            self.logger.debug(u"Skipping {} because it already exists in list of sessions.".format(name))
            return
        client = self._http_client(endpoint)
        new_session = self._livy_session(client, properties, self.ipython_display)
        self.session_manager.add_session(name, new_session)
        new_session.start()

    def get_session_id_for_client(self, name):
        return self.session_manager.get_session_id_for_client(name)

    def get_client_keys(self):
        return self.session_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        return self.session_manager.get_sessions_info()

    def get_session_by_name_or_default(self, client_name):
        # No name -> whatever session the manager considers current/any;
        # otherwise look up case-insensitively.
        if client_name is None:
            return self.session_manager.get_any_session()
        return self.session_manager.get_session(client_name.lower())

    def get_managed_clients(self):
        return self.session_manager.sessions

    @staticmethod
    def _livy_session(http_client, properties, ipython_display, session_id=-1, sql_created=None):
        return LivySession(http_client, properties, ipython_display, session_id, sql_created)

    @staticmethod
    def _http_client(endpoint):
        return LivyReliableHttpClient.from_endpoint(endpoint)
class SparkMagicBase(Magics):
    # Shared base for the sparkmagic IPython magics: send-to-spark, %%spark
    # cell execution, and %%sql queries.

    # Variable types accepted by do_send_to_spark.
    _STRING_VAR_TYPE = 'str'
    _PANDAS_DATAFRAME_VAR_TYPE = 'df'
    _ALLOWED_LOCAL_TO_SPARK_TYPES = [_STRING_VAR_TYPE, _PANDAS_DATAFRAME_VAR_TYPE]

    def __init__(self, shell, data=None, spark_events=None):
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)

        self.logger = SparkLog(u"SparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        self.logger.debug(u'Initialized spark magics.')

        if spark_events is None:
            spark_events = SparkEvents()
        spark_events.emit_library_loaded_event()

    def do_send_to_spark(self, cell, input_variable_name, var_type,
                         output_variable_name, max_rows, session_name):
        # Push a local notebook variable (str or pandas DataFrame) into Spark
        # under output_variable_name (defaults to the input name).
        try:
            input_variable_value = self.shell.user_ns[input_variable_name]
        except KeyError:
            raise BadUserDataException(u'Variable named {} not found.'.format(input_variable_name))
        if input_variable_value is None:
            raise BadUserDataException(u'Value of {} is None!'.format(input_variable_name))

        if not output_variable_name:
            output_variable_name = input_variable_name

        if not max_rows:
            max_rows = conf.default_maxrows()

        input_variable_type = var_type.lower()
        if input_variable_type == self._STRING_VAR_TYPE:
            command = SendStringToSparkCommand(input_variable_name, input_variable_value, output_variable_name)
        elif input_variable_type == self._PANDAS_DATAFRAME_VAR_TYPE:
            command = SendPandasDfToSparkCommand(input_variable_name, input_variable_value, output_variable_name, max_rows)
        else:
            raise BadUserDataException(u'Invalid or incorrect -t type. Available are: [{}]'.format(u','.join(self._ALLOWED_LOCAL_TO_SPARK_TYPES)))

        (success, result, mime_type) = self.spark_controller.run_command(command, None)
        if not success:
            self.ipython_display.send_error(result)
        else:
            self.ipython_display.write(u'Successfully passed \'{}\' as \'{}\' to Spark'
                                       u' kernel'.format(input_variable_name, output_variable_name))

    def execute_spark(self, cell, output_var, samplemethod, maxrows, samplefraction,
                      session_name, coerce, output_handler=None):
        # Run a cell on the Livy session, route the output through the handler
        # (html/text/default), and optionally store a sampled dataframe of
        # output_var back into the notebook namespace.
        output_handler = output_handler or SparkOutputHandler(
            html=self.ipython_display.html,
            text=self.ipython_display.write,
            default=self.ipython_display.display)

        (success, out, mimetype) = self.spark_controller.run_command(Command(cell), session_name)
        if not success:
            if conf.shutdown_session_on_spark_statement_errors():
                self.spark_controller.cleanup()
            raise SparkStatementException(out)
        else:
            if isinstance(out, string_types):
                if mimetype == MIMETYPE_TEXT_HTML:
                    output_handler.html(out)
                else:
                    output_handler.text(out)
            else:
                output_handler.default(out)
            if output_var is not None:
                spark_store_command = self._spark_store_command(output_var, samplemethod, maxrows, samplefraction, coerce)
                df = self.spark_controller.run_command(spark_store_command, session_name)
                self.shell.user_ns[output_var] = df

    @staticmethod
    def _spark_store_command(output_var, samplemethod, maxrows, samplefraction, coerce):
        return SparkStoreCommand(output_var, samplemethod, maxrows, samplefraction, coerce=coerce)

    def execute_sqlquery(self, cell, samplemethod, maxrows, samplefraction,
                         session, output_var, quiet, coerce):
        # Run a SQL query; optionally bind the resulting dataframe to
        # output_var and/or suppress returning it (quiet).
        sqlquery = self._sqlquery(cell, samplemethod, maxrows, samplefraction, coerce)
        df = self.spark_controller.run_sqlquery(sqlquery, session)
        if output_var is not None:
            self.shell.user_ns[output_var] = df
        if quiet:
            return None
        else:
            return df

    @staticmethod
    def _sqlquery(cell, samplemethod, maxrows, samplefraction, coerce):
        return SQLQuery(cell, samplemethod, maxrows, samplefraction, coerce=coerce)

    def _print_endpoint_info(self, info_sessions, current_session_id):
        # Render an HTML table of the sessions, sorted by session id.
        if info_sessions:
            info_sessions = sorted(info_sessions, key=lambda s: s.id)
            html = get_sessions_info_html(info_sessions, current_session_id)
            self.ipython_display.html(html)
        else:
            self.ipython_display.html(u'No active sessions.')
class SparkController(object):
    """Coordinates Livy sessions: creation, lookup, command execution, cleanup.

    Delegates session bookkeeping to a SessionManager and remote access to
    LivyReliableHttpClient instances built per endpoint.
    """

    def __init__(self, ipython_display):
        self.logger = SparkLog(u"SparkController")
        self.ipython_display = ipython_display
        self.session_manager = SessionManager()

    def get_app_id(self, client_name=None):
        """Return the YARN application id of the named (or default) session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_app_id()

    def get_driver_log_url(self, client_name=None):
        """Return the driver log URL of the named (or default) session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_driver_log_url()

    def get_logs(self, client_name=None):
        """Return the Livy logs of the named (or default) session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_logs()

    def get_spark_ui_url(self, client_name=None):
        """Return the Spark UI URL of the named (or default) session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_spark_ui_url()

    def run_command(self, command, client_name=None):
        """Execute a Command against the named (or default) session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return command.execute(session_to_use)

    def run_sqlquery(self, sqlquery, client_name=None):
        """Execute a SQLQuery against the named (or default) session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return sqlquery.execute(session_to_use)

    def get_all_sessions_endpoint(self, endpoint):
        """List every session alive on the endpoint, with refreshed status."""
        http_client = self._http_client(endpoint)
        sessions = http_client.get_sessions()[u"sessions"]
        session_list = [
            self._livy_session(http_client, {u"kind": s[u"kind"]},
                               self.ipython_display, s[u"id"])
            for s in sessions
        ]
        for s in session_list:
            s.refresh_status()
        return session_list

    def get_all_sessions_endpoint_info(self, endpoint):
        """Return printable descriptions of every session on the endpoint."""
        sessions = self.get_all_sessions_endpoint(endpoint)
        return [str(s) for s in sessions]

    def cleanup(self):
        """Delete every session this controller manages."""
        self.session_manager.clean_up_all()

    def cleanup_endpoint(self, endpoint):
        """Delete every session on the endpoint, managed or not."""
        for session in self.get_all_sessions_endpoint(endpoint):
            session.delete()

    def delete_session_by_name(self, name):
        """Delete the managed session registered under `name`."""
        self.session_manager.delete_client(name)

    def delete_session_by_id(self, endpoint, session_id):
        """Delete an arbitrary session on the endpoint by its Livy id."""
        # Fix: the original built a second, identical http client here and
        # threw the first away; one client serves both the lookup and delete.
        http_client = self._http_client(endpoint)
        response = http_client.get_session(session_id)
        session = self._livy_session(http_client, {u"kind": response[u"kind"]},
                                     self.ipython_display, session_id, False)
        session.delete()

    def add_session(self, name, endpoint, skip_if_exists, properties):
        """Create and register a new session, optionally skipping duplicates."""
        if skip_if_exists and (name in self.session_manager.get_sessions_list()):
            self.logger.debug(
                u"Skipping {} because it already exists in list of sessions.".
                format(name))
            return
        http_client = self._http_client(endpoint)
        session = self._livy_session(http_client, properties, self.ipython_display)
        self.session_manager.add_session(name, session)
        session.start()

    def get_session_id_for_client(self, name):
        """Return the Livy id of the managed session named `name`."""
        return self.session_manager.get_session_id_for_client(name)

    def get_client_keys(self):
        """Return the names of all managed sessions."""
        return self.session_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        """Return a human-readable summary of managed sessions."""
        return self.session_manager.get_sessions_info()

    def get_session_by_name_or_default(self, client_name):
        """Resolve a session by (case-insensitive) name, or pick any session."""
        if client_name is None:
            return self.session_manager.get_any_session()
        else:
            client_name = client_name.lower()
            return self.session_manager.get_session(client_name)

    def get_managed_clients(self):
        """Return the underlying name -> session mapping."""
        return self.session_manager.sessions

    @staticmethod
    def _livy_session(http_client, properties, ipython_display,
                      session_id=-1, sql_created=None):
        # Thin factory: lets tests substitute the LivySession construction.
        return LivySession(http_client, properties, ipython_display,
                           session_id, sql_created)

    @staticmethod
    def _http_client(endpoint):
        # Thin factory for the per-endpoint reliable HTTP client.
        return LivyReliableHttpClient.from_endpoint(endpoint)
class SparkController(object):
    """Coordinates Livy sessions: creation, lookup, command execution, cleanup.

    Unlike the earlier revision, this one caches one reliable HTTP client per
    endpoint (the client holds a requests Session) and resolves session names
    case-sensitively.
    """

    def __init__(self, ipython_display):
        self.logger = SparkLog(u"SparkController")
        self.ipython_display = ipython_display
        self.session_manager = SessionManager(ipython_display)
        # this is to reuse the already created http clients
        # since the reliablehttpclient uses requests session
        self._http_clients = {}

    def get_app_id(self, client_name=None):
        """Return the YARN application id of the named (or default) session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_app_id()

    def get_driver_log_url(self, client_name=None):
        """Return the driver log URL of the named (or default) session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_driver_log_url()

    def get_logs(self, client_name=None):
        """Return the Livy logs of the named (or default) session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_logs()

    def get_spark_ui_url(self, client_name=None):
        """Return the Spark UI URL of the named (or default) session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_spark_ui_url()

    def run_command(self, command, client_name=None):
        """Execute a Command against the named (or default) session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return command.execute(session_to_use)

    def run_sqlquery(self, sqlquery, client_name=None):
        """Execute a SQLQuery against the named (or default) session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return sqlquery.execute(session_to_use)

    def get_all_sessions_endpoint(self, endpoint):
        """List every session alive on the endpoint, with refreshed status/info."""
        http_client = self._http_client(endpoint)
        sessions = http_client.get_sessions()[u"sessions"]
        session_list = [self._livy_session(http_client,
                                           {constants.LIVY_KIND_PARAM: s[constants.LIVY_KIND_PARAM]},
                                           self.ipython_display, s[u"id"])
                        for s in sessions]
        for s in session_list:
            s.refresh_status_and_info()
        return session_list

    def get_all_sessions_endpoint_info(self, endpoint):
        """Return printable descriptions of every session on the endpoint."""
        sessions = self.get_all_sessions_endpoint(endpoint)
        return [str(s) for s in sessions]

    def cleanup(self):
        """Delete every session this controller manages."""
        self.session_manager.clean_up_all()

    def cleanup_endpoint(self, endpoint):
        """Delete every session on the endpoint, managed or not."""
        for session in self.get_all_sessions_endpoint(endpoint):
            session.delete()

    def delete_session_by_name(self, name):
        """Delete the managed session registered under `name`."""
        self.session_manager.delete_client(name)

    def delete_session_by_id(self, endpoint, session_id):
        """Delete a session by Livy id, via the manager when it owns it."""
        name = self.session_manager.get_session_name_by_id_endpoint(session_id, endpoint)
        if name in self.session_manager.get_sessions_list():
            self.delete_session_by_name(name)
        else:
            # Fix: the original fetched the (cached) http client twice in a
            # row; a single lookup serves both the GET and the delete.
            http_client = self._http_client(endpoint)
            response = http_client.get_session(session_id)
            session = self._livy_session(http_client,
                                         {constants.LIVY_KIND_PARAM: response[constants.LIVY_KIND_PARAM]},
                                         self.ipython_display, session_id)
            session.delete()

    def add_session(self, name, endpoint, skip_if_exists, properties):
        """Create and register a new session, optionally skipping duplicates."""
        if skip_if_exists and (name in self.session_manager.get_sessions_list()):
            self.logger.debug(u"Skipping {} because it already exists in list of sessions.".format(name))
            return
        http_client = self._http_client(endpoint)
        session = self._livy_session(http_client, properties, self.ipython_display)
        self.session_manager.add_session(name, session)
        session.start()

    def get_session_id_for_client(self, name):
        """Return the Livy id of the managed session named `name`."""
        return self.session_manager.get_session_id_for_client(name)

    def get_client_keys(self):
        """Return the names of all managed sessions."""
        return self.session_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        """Return a human-readable summary of managed sessions."""
        return self.session_manager.get_sessions_info()

    def get_session_by_name_or_default(self, client_name):
        """Resolve a session by exact name, or pick any session when None."""
        if client_name is None:
            return self.session_manager.get_any_session()
        else:
            return self.session_manager.get_session(client_name)

    def get_managed_clients(self):
        """Return the underlying name -> session mapping."""
        return self.session_manager.sessions

    @staticmethod
    def _livy_session(http_client, properties, ipython_display, session_id=-1):
        # Thin factory; heartbeat timeout comes from configuration.
        return LivySession(http_client, properties, ipython_display, session_id,
                           heartbeat_timeout=conf.livy_server_heartbeat_timeout_seconds())

    def _http_client(self, endpoint):
        """Return the cached client for `endpoint`, building it on first use.

        NOTE: endpoint objects are used as dict keys, so they must be hashable.
        """
        if endpoint not in self._http_clients:
            self._http_clients[endpoint] = LivyReliableHttpClient.from_endpoint(endpoint)
        return self._http_clients[endpoint]
class LivySession(ObjectWithGuid):
    """Client-side handle for one Livy interactive session.

    Tracks the remote session's id and status, polls it at a fixed sleep
    cadence, and optionally keeps it alive with a background heartbeat thread.
    """

    def __init__(self, http_client, properties, ipython_display, session_id=-1,
                 spark_events=None, heartbeat_timeout=0, heartbeat_thread=None):
        # session_id == -1 means the session has not been created on the
        # server yet; any other id attaches to an existing remote session.
        super(LivySession, self).__init__()
        assert constants.LIVY_KIND_PARAM in list(properties.keys())
        kind = properties[constants.LIVY_KIND_PARAM]
        should_heartbeat = False
        if heartbeat_timeout > 0:
            # Forward the timeout so Livy knows when to reap an idle session.
            should_heartbeat = True
            properties[constants.LIVY_HEARTBEAT_TIMEOUT_PARAM] = heartbeat_timeout
        elif constants.LIVY_HEARTBEAT_TIMEOUT_PARAM in list(properties.keys()):
            # No heartbeating requested: drop any stale timeout from properties.
            properties.pop(constants.LIVY_HEARTBEAT_TIMEOUT_PARAM)
        self.properties = properties
        self.ipython_display = ipython_display
        self._should_heartbeat = should_heartbeat
        # A caller-supplied thread (tests) wins over the default _HeartbeatThread.
        self._user_passed_heartbeat_thread = heartbeat_thread
        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events
        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()
        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert wait_for_idle_timeout_seconds > 0
        self.logger = SparkLog(u"LivySession")
        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise BadUserDataException(u"Session of kind '{}' not supported. Session must be of kinds {}."
                                       .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))
        self._app_id = None
        self._logs = u""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds
        self._printed_resource_warning = False
        self.kind = kind
        self.id = session_id
        self.session_info = u""
        self._heartbeat_thread = None
        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
        else:
            # Attaching to an existing session: assume busy and start
            # heartbeating immediately to keep it alive.
            self.status = constants.BUSY_SESSION_STATUS
            self._start_heartbeat_thread()

    def __str__(self):
        # NOTE: formatting triggers HTTP calls (app id, UI/log URLs).
        return u"Session id: {}\tYARN id: {}\tKind: {}\tState: {}\n\tSpark UI: {}\n\tDriver Log: {}"\
            .format(self.id, self.get_app_id(), self.kind, self.status,
                    self.get_spark_ui_url(), self.get_driver_log_url())

    def start(self):
        """Start the session against actual livy server.

        Posts the session, waits for it to reach idle, then probes for a
        'spark' SparkSession and falls back to 'sqlContext'. Emits start/end
        telemetry events; on any failure the end event records the exception
        before it is re-raised.
        """
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)
        self._printed_resource_warning = False
        try:
            r = self._http_client.post_session(self.properties)
            self.id = r[u"id"]
            self.status = str(r[u"state"])
            self.ipython_display.writeln(u"Starting Spark application")
            # Start heartbeat thread to keep Livy interactive session alive.
            self._start_heartbeat_thread()
            # We wait for livy_session_startup_timeout_seconds() for the session to start up.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                raise LivyClientTimeoutException(u"Session {} did not start up in {} seconds."
                                                 .format(self.id, conf.livy_session_startup_timeout_seconds()))
            html = get_sessions_info_html([self], self.id)
            self.ipython_display.html(html)
            # Probe which SQL entry point the remote Spark exposes.
            command = Command("spark")
            (success, out) = command.execute(self)
            if success:
                self.ipython_display.writeln(u"SparkSession available as 'spark'.")
                self.sql_context_variable_name = "spark"
            else:
                command = Command("sqlContext")
                (success, out) = command.execute(self)
                if success:
                    self.ipython_display.writeln(u"SparkContext available as 'sc'.")
                    if ("hive" in out.lower()):
                        self.ipython_display.writeln(u"HiveContext available as 'sqlContext'.")
                    else:
                        self.ipython_display.writeln(u"SqlContext available as 'sqlContext'.")
                    self.sql_context_variable_name = "sqlContext"
                else:
                    raise SqlContextNotFoundException(u"Neither SparkSession nor HiveContext/SqlContext is available.")
        except Exception as e:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status,
                                                               False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status,
                                                               True, "", "")

    def get_app_id(self):
        """Return the YARN application id, fetching and caching it once."""
        if self._app_id is None:
            self._app_id = self._http_client.get_session(self.id).get("appId")
        return self._app_id

    def get_app_info(self):
        """Return the session's appInfo dict, or {} when the server sends none."""
        appInfo = self._http_client.get_session(self.id).get("appInfo")
        return appInfo if appInfo is not None else {}

    def get_app_info_member(self, member_name):
        """Return a single appInfo field, or None when absent."""
        return self.get_app_info().get(member_name)

    def get_driver_log_url(self):
        """Return the driver log URL reported by Livy, if any."""
        return self.get_app_info_member("driverLogUrl")

    def get_logs(self):
        """Fetch and cache the full session log as one newline-joined string."""
        log_array = self._http_client.get_all_session_logs(self.id)[u'log']
        self._logs = "\n".join(log_array)
        return self._logs

    def get_spark_ui_url(self):
        """Return the Spark UI URL reported by Livy, if any."""
        return self.get_app_info_member("sparkUiUrl")

    @property
    def http_client(self):
        # Read-only access to the underlying reliable HTTP client.
        return self._http_client

    @staticmethod
    def is_final_status(status):
        """Return True when `status` is terminal (no further transitions)."""
        return status in constants.FINAL_STATUS

    def delete(self):
        """Delete the remote session and stop heartbeating; emits telemetry."""
        session_id = self.id
        self._spark_events.emit_session_deletion_start_event(self.guid, self.kind,
                                                             session_id, self.status)
        try:
            self.logger.debug(u"Deleting session '{}'".format(session_id))
            if self.status != constants.NOT_STARTED_SESSION_STATUS:
                self._http_client.delete_session(session_id)
                self._stop_heartbeat_thread()
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
            else:
                # Never-started sessions have nothing to delete server-side.
                self.ipython_display.send_error(u"Cannot delete session {} that is in state '{}'."
                                                .format(session_id, self.status))
        except Exception as e:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, session_id, self.status,
                                                               False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, session_id, self.status,
                                                               True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile.

        Calls done every status_sleep_seconds as indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.

        Raises LivyUnexpectedStatusException on a terminal status and
        LivyClientTimeoutException when the budget is exhausted.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        while True:
            self.refresh_status_and_info()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = u"Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException(u'{} See logs:\n{}'.format(error, self.get_logs()))

            if seconds_to_wait <= 0.0:
                error = u"Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            # Surface the YARN "no resources" warning once per wait.
            if constants.YARN_RESOURCE_LIMIT_MSG in self.session_info and \
                    not self._printed_resource_warning:
                self.ipython_display.send_error(constants.RESOURCE_LIMIT_WARNING\
                                                .format(conf.resource_limit_mitigation_suggestion()))
                self._printed_resource_warning = True

            start_time = time()
            self.logger.debug(u"Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            # Charge the elapsed wall time (not just the sleep) to the budget.
            seconds_to_wait -= time() - start_time

    def sleep(self):
        """Pause between statement polls, per configured cadence."""
        sleep(self._statement_sleep_seconds)

    # This function will refresh the status and get the logs in a single call.
    # Both are cached on the instance; nothing is returned.
    def refresh_status_and_info(self):
        response = self._http_client.get_session(self.id)
        status = response[u'state']
        log_array = response[u'log']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
            self.session_info = u"\n".join(log_array)
        else:
            raise LivyUnexpectedStatusException(u"Status '{}' not supported by session.".format(status))

    def _start_heartbeat_thread(self):
        """Launch the daemon heartbeat thread once, if heartbeating is enabled."""
        if self._should_heartbeat and self._heartbeat_thread is None:
            refresh_seconds = conf.heartbeat_refresh_seconds()
            retry_seconds = conf.heartbeat_retry_seconds()

            if self._user_passed_heartbeat_thread is None:
                self._heartbeat_thread = _HeartbeatThread(self, refresh_seconds, retry_seconds)
            else:
                self._heartbeat_thread = self._user_passed_heartbeat_thread

            # Daemon so a lingering heartbeat never blocks interpreter exit.
            self._heartbeat_thread.daemon = True
            self._heartbeat_thread.start()

    def _stop_heartbeat_thread(self):
        """Stop and forget the heartbeat thread, if one is running."""
        if self._heartbeat_thread is not None:
            self._heartbeat_thread.stop()
            self._heartbeat_thread = None

    def get_row_html(self, current_session_id):
        """Render this session as one HTML table row (checkmark on current)."""
        return u"""<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td></tr>""".format(
            self.id, self.get_app_id(), self.kind, self.status,
            self.get_html_link(u'Link', self.get_spark_ui_url()),
            self.get_html_link(u'Link', self.get_driver_log_url()),
            u"" if current_session_id is None or current_session_id != self.id else u"\u2714"
        )

    @staticmethod
    def get_html_link(text, url):
        """Return an anchor tag for `url`, or empty string when url is None."""
        if url is not None:
            return u"""<a target="_blank" href="{1}">{0}</a>""".format(text, url)
        else:
            return u""
class LivySession(ObjectWithGuid):
    """Client-side handle for one Livy interactive session (HiveContext era).

    This revision tracks whether a sqlContext has been created on the remote
    session via the `created_sql_context` flag; no heartbeating.
    """

    def __init__(self, http_client, properties, ipython_display, session_id=-1,
                 sql_created=None, spark_events=None):
        # session_id == -1 means not yet created server-side; sql_created may
        # only be claimed for an already-existing session.
        super(LivySession, self).__init__()
        assert u"kind" in list(properties.keys())
        kind = properties[u"kind"]
        self.properties = properties
        self.ipython_display = ipython_display
        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert wait_for_idle_timeout_seconds > 0
        if session_id == -1 and sql_created is True:
            raise BadUserDataException(u"Cannot indicate sql state without session id.")

        self.logger = SparkLog(u"LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise BadUserDataException(u"Session of kind '{}' not supported. Session must be of kinds {}."
                                       .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
            sql_created = False
        else:
            self.status = constants.BUSY_SESSION_STATUS

        self._app_id = None
        self._logs = u""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

        self.kind = kind
        self.id = session_id
        self.created_sql_context = sql_created

    def __str__(self):
        # NOTE: formatting triggers HTTP calls (app id, UI/log URLs).
        return u"Session id: {}\tYARN id: {}\tKind: {}\tState: {}\n\tSpark UI: {}\n\tDriver Log: {}"\
            .format(self.id, self.get_app_id(), self.kind, self.status,
                    self.get_spark_ui_url(), self.get_driver_log_url())

    def start(self, create_sql_context=True):
        """Start the session against actual livy server.

        Optionally creates the sqlContext once the session reaches idle.
        Emits start/end telemetry events; on failure the end event records the
        exception before it is re-raised.
        """
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)

        try:
            r = self._http_client.post_session(self.properties)
            self.id = r[u"id"]
            self.status = str(r[u"state"])

            self.ipython_display.writeln(u"Creating SparkContext as 'sc'")
            # We wait for livy_session_startup_timeout_seconds() for the session to start up.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                raise LivyClientTimeoutException(u"Session {} did not start up in {} seconds."
                                                 .format(self.id, conf.livy_session_startup_timeout_seconds()))

            if create_sql_context:
                self.create_sql_context()
        except Exception as e:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status,
                                                               False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status,
                                                               True, "", "")

    def create_sql_context(self):
        """Create a sqlContext object on the session.

        Object will be accessible via variable 'sqlContext'. Idempotent: a
        second call is a no-op once `created_sql_context` is set.
        """
        if self.created_sql_context:
            return
        self.logger.debug(u"Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln(u"Creating HiveContext as 'sqlContext'")
        command = self._get_sql_context_creation_command()
        try:
            (success, out) = command.execute(self)
        except LivyClientTimeoutException:
            raise LivyClientTimeoutException(u"Failed to create the SqlContext in time. Timed out after {} seconds."
                                             .format(self._wait_for_idle_timeout_seconds))
        if success:
            self.created_sql_context = True
        else:
            raise FailedToCreateSqlContextException(u"Failed to create the SqlContext.\nError: '{}'".format(out))

    def get_app_id(self):
        """Return the YARN application id, fetching and caching it once."""
        if self._app_id is None:
            self._app_id = self._http_client.get_session(self.id).get("appId")
        return self._app_id

    def get_app_info(self):
        """Return the session's appInfo dict, or {} when the server sends none."""
        appInfo = self._http_client.get_session(self.id).get("appInfo")
        return appInfo if appInfo is not None else {}

    def get_app_info_member(self, member_name):
        """Return a single appInfo field, or None when absent."""
        return self.get_app_info().get(member_name)

    def get_driver_log_url(self):
        """Return the driver log URL reported by Livy, if any."""
        return self.get_app_info_member("driverLogUrl")

    def get_logs(self):
        """Fetch and cache the full session log as one newline-joined string."""
        log_array = self._http_client.get_all_session_logs(self.id)[u'log']
        self._logs = "\n".join(log_array)
        return self._logs

    def get_spark_ui_url(self):
        """Return the Spark UI URL reported by Livy, if any."""
        return self.get_app_info_member("sparkUiUrl")

    @property
    def http_client(self):
        # Read-only access to the underlying reliable HTTP client.
        return self._http_client

    @staticmethod
    def is_final_status(status):
        """Return True when `status` is terminal (no further transitions)."""
        return status in constants.FINAL_STATUS

    def delete(self):
        """Delete the remote session; emits telemetry around the attempt."""
        session_id = self.id
        self._spark_events.emit_session_deletion_start_event(self.guid, self.kind, session_id, self.status)

        try:
            self.logger.debug(u"Deleting session '{}'".format(session_id))

            if self.status != constants.NOT_STARTED_SESSION_STATUS:
                self._http_client.delete_session(session_id)
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
            else:
                # Never-started sessions have nothing to delete server-side.
                self.ipython_display.send_error(u"Cannot delete session {} that is in state '{}'."
                                                .format(session_id, self.status))
        except Exception as e:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, session_id, self.status,
                                                               False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, session_id, self.status,
                                                               True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile.

        Calls done every status_sleep_seconds as indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.

        Raises LivyUnexpectedStatusException on a terminal status and
        LivyClientTimeoutException when the budget is exhausted.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        while True:
            self.refresh_status()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = u"Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException(u'{} See logs:\n{}'.format(error, self.get_logs()))

            if seconds_to_wait <= 0.0:
                error = u"Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            start_time = time()
            self.logger.debug(u"Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            # Charge the elapsed wall time (not just the sleep) to the budget.
            seconds_to_wait -= time() - start_time

    def sleep(self):
        """Pause between statement polls, per configured cadence."""
        sleep(self._statement_sleep_seconds)

    def refresh_status(self):
        """Re-fetch the remote state, cache it, and return it."""
        status = self._http_client.get_session(self.id)[u'state']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
        else:
            raise LivyUnexpectedStatusException(u"Status '{}' not supported by session.".format(status))

        return self.status

    def _get_sql_context_creation_command(self):
        """Return the kind-specific snippet that creates the HiveContext."""
        if self.kind == constants.SESSION_KIND_SPARK:
            sql_context_command = u"val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_PYSPARK:
            sql_context_command = u"from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_SPARKR:
            sql_context_command = u"sqlContext <- sparkRHive.init(sc)"
        else:
            raise BadUserDataException(u"Do not know how to create HiveContext in session of kind {}.".format(self.kind))

        return Command(sql_context_command)
def __init__(self, ipython_display):
    """Wire up the display surface, logging, and the session registry."""
    self.ipython_display = ipython_display
    self.logger = SparkLog(u"SparkController")
    self.session_manager = SessionManager()
class LivySession(ObjectWithGuid):
    """Client-side handle for one Livy interactive session.

    This revision polls with a ConfigurableRetryPolicy backoff schedule
    (instead of a fixed sleep), exposes the session's user and endpoint, and
    keeps the session alive with an optional heartbeat thread.
    """

    def __init__(self, http_client, properties, ipython_display,
                 session_id=-1, spark_events=None,
                 heartbeat_timeout=0, heartbeat_thread=None):
        # session_id == -1 means the session has not been created on the
        # server yet; any other id attaches to an existing remote session.
        super(LivySession, self).__init__()
        assert constants.LIVY_KIND_PARAM in list(properties.keys())
        kind = properties[constants.LIVY_KIND_PARAM]

        should_heartbeat = False
        if heartbeat_timeout > 0:
            # Forward the timeout so Livy knows when to reap an idle session.
            should_heartbeat = True
            properties[
                constants.LIVY_HEARTBEAT_TIMEOUT_PARAM] = heartbeat_timeout
        elif constants.LIVY_HEARTBEAT_TIMEOUT_PARAM in list(properties.keys()):
            # No heartbeating requested: drop any stale timeout from properties.
            properties.pop(constants.LIVY_HEARTBEAT_TIMEOUT_PARAM)

        self.properties = properties
        self.ipython_display = ipython_display
        self._should_heartbeat = should_heartbeat
        # A caller-supplied thread (tests) wins over the default _HeartbeatThread.
        self._user_passed_heartbeat_thread = heartbeat_thread

        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

        # Backoff schedule for status polling: quick at first, then ~2s steps.
        self._policy = ConfigurableRetryPolicy(
            retry_seconds_to_sleep_list=[0.2, 0.5, 0.5, 1, 1, 2],
            max_retries=5000)
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        assert wait_for_idle_timeout_seconds > 0

        self.logger = SparkLog(u"LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise BadUserDataException(
                u"Session of kind '{}' not supported. Session must be of kinds {}."
                .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        self._app_id = None
        self._user = None
        self._logs = u""
        self._http_client = http_client
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds
        self._printed_resource_warning = False

        self.kind = kind
        self.id = session_id
        self.session_info = u""

        self._heartbeat_thread = None
        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
        else:
            # Attaching to an existing session: assume busy and start
            # heartbeating immediately to keep it alive.
            self.status = constants.BUSY_SESSION_STATUS
            self._start_heartbeat_thread()

    def __str__(self):
        # NOTE: formatting triggers HTTP calls (app id, UI/log URLs).
        return u"Session id: {}\tYARN id: {}\tKind: {}\tState: {}\n\tSpark UI: {}\n\tDriver Log: {}"\
            .format(self.id, self.get_app_id(), self.kind, self.status,
                    self.get_spark_ui_url(), self.get_driver_log_url())

    def start(self):
        """Start the session against actual livy server.

        Posts the session, waits for it to reach idle, then probes for a
        'spark' SparkSession and falls back to 'sqlContext'. Emits start/end
        telemetry events; on any failure the end event records the exception
        before it is re-raised.
        """
        self._spark_events.emit_session_creation_start_event(
            self.guid, self.kind)
        self._printed_resource_warning = False

        try:
            r = self._http_client.post_session(self.properties)
            self.id = r[u"id"]
            self.status = str(r[u"state"])

            self.ipython_display.writeln(u"Starting Spark application")

            # Start heartbeat thread to keep Livy interactive session alive.
            self._start_heartbeat_thread()

            # We wait for livy_session_startup_timeout_seconds() for the session to start up.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                raise LivyClientTimeoutException(
                    u"Session {} did not start up in {} seconds.".format(
                        self.id, conf.livy_session_startup_timeout_seconds()))

            html = get_sessions_info_html([self], self.id)
            self.ipython_display.html(html)

            # Probe which SQL entry point the remote Spark exposes.
            command = Command("spark")
            (success, out, mimetype) = command.execute(self)
            if success:
                self.ipython_display.writeln(
                    u"SparkSession available as 'spark'.")
                self.sql_context_variable_name = "spark"
            else:
                command = Command("sqlContext")
                (success, out, mimetype) = command.execute(self)
                if success:
                    self.ipython_display.writeln(
                        u"SparkContext available as 'sc'.")
                    if ("hive" in out.lower()):
                        self.ipython_display.writeln(
                            u"HiveContext available as 'sqlContext'.")
                    else:
                        self.ipython_display.writeln(
                            u"SqlContext available as 'sqlContext'.")
                    self.sql_context_variable_name = "sqlContext"
                else:
                    raise SqlContextNotFoundException(
                        u"Neither SparkSession nor HiveContext/SqlContext is available."
                    )
        except Exception as e:
            self._spark_events.emit_session_creation_end_event(
                self.guid, self.kind, self.id, self.status, False,
                e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(
                self.guid, self.kind, self.id, self.status, True, "", "")

    def get_app_id(self):
        """Return the YARN application id, fetching and caching it once."""
        if self._app_id is None:
            self._app_id = self._http_client.get_session(self.id).get("appId")
        return self._app_id

    def get_app_info(self):
        """Return the session's appInfo dict, or {} when the server sends none."""
        appInfo = self._http_client.get_session(self.id).get("appInfo")
        return appInfo if appInfo is not None else {}

    def get_app_info_member(self, member_name):
        """Return a single appInfo field, or None when absent."""
        return self.get_app_info().get(member_name)

    def get_driver_log_url(self):
        """Return the driver log URL reported by Livy, if any."""
        return self.get_app_info_member("driverLogUrl")

    def get_logs(self):
        """Fetch and cache the full session log as one newline-joined string."""
        log_array = self._http_client.get_all_session_logs(self.id)[u'log']
        self._logs = "\n".join(log_array)
        return self._logs

    def get_spark_ui_url(self):
        """Return the Spark UI URL reported by Livy, if any."""
        return self.get_app_info_member("sparkUiUrl")

    def get_user(self):
        """Return the session owner (proxyUser, else owner), cached once."""
        if self._user is None:
            session = self._http_client.get_session(self.id)
            self._user = session.get("proxyUser", session.get("owner"))
        return self._user

    @property
    def http_client(self):
        # Read-only access to the underlying reliable HTTP client.
        return self._http_client

    @property
    def endpoint(self):
        # The endpoint this session talks to, as held by the HTTP client.
        return self._http_client.endpoint

    @staticmethod
    def is_final_status(status):
        """Return True when `status` is terminal (no further transitions)."""
        return status in constants.FINAL_STATUS

    def delete(self):
        """Delete the remote session and stop heartbeating; emits telemetry."""
        session_id = self.id
        self._spark_events.emit_session_deletion_start_event(
            self.guid, self.kind, session_id, self.status)

        try:
            self.logger.debug(u"Deleting session '{}'".format(session_id))

            if self.status != constants.NOT_STARTED_SESSION_STATUS:
                self._http_client.delete_session(session_id)
                self._stop_heartbeat_thread()
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
            else:
                # Never-started sessions have nothing to delete server-side.
                self.ipython_display.send_error(
                    u"Cannot delete session {} that is in state '{}'.".format(
                        session_id, self.status))

        except Exception as e:
            self._spark_events.emit_session_deletion_end_event(
                self.guid, self.kind, session_id, self.status, False,
                e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(
                self.guid, self.kind, session_id, self.status, True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile.

        Sleep durations follow the configured retry policy's backoff schedule.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.

        Raises LivyUnexpectedStatusException on a terminal status and
        LivyClientTimeoutException when the budget is exhausted.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        retries = 1
        while True:
            self.refresh_status_and_info()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = u"Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException(u'{} See logs:\n{}'.format(
                    error, self.get_logs()))

            if seconds_to_wait <= 0.0:
                error = u"Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            # Surface the YARN "no resources" warning once per wait.
            if constants.YARN_RESOURCE_LIMIT_MSG in self.session_info and \
                    not self._printed_resource_warning:
                self.ipython_display.send_error(constants.RESOURCE_LIMIT_WARNING\
                                                .format(conf.resource_limit_mitigation_suggestion()))
                self._printed_resource_warning = True

            start_time = time()
            sleep_time = self._policy.seconds_to_sleep(retries)
            retries += 1

            self.logger.debug(
                u"Session {} in state {}. Sleeping {} seconds.".format(
                    self.id, self.status, sleep_time))
            sleep(sleep_time)
            # Charge the elapsed wall time (not just the sleep) to the budget.
            seconds_to_wait -= time() - start_time

    def sleep(self, retries):
        """Pause between statement polls per the backoff schedule."""
        sleep(self._policy.seconds_to_sleep(retries))

    # This function will refresh the status and get the logs in a single call.
    # Both are cached on the instance; nothing is returned.
    def refresh_status_and_info(self):
        response = self._http_client.get_session(self.id)
        status = response[u'state']
        log_array = response[u'log']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
            self.session_info = u"\n".join(log_array)
        else:
            raise LivyUnexpectedStatusException(
                u"Status '{}' not supported by session.".format(status))

    def _start_heartbeat_thread(self):
        """Launch the daemon heartbeat thread once, if heartbeating is enabled."""
        if self._should_heartbeat and self._heartbeat_thread is None:
            refresh_seconds = conf.heartbeat_refresh_seconds()
            retry_seconds = conf.heartbeat_retry_seconds()

            if self._user_passed_heartbeat_thread is None:
                self._heartbeat_thread = _HeartbeatThread(
                    self, refresh_seconds, retry_seconds)
            else:
                self._heartbeat_thread = self._user_passed_heartbeat_thread

            # Daemon so a lingering heartbeat never blocks interpreter exit.
            self._heartbeat_thread.daemon = True
            self._heartbeat_thread.start()

    def _stop_heartbeat_thread(self):
        """Stop and forget the heartbeat thread, if one is running."""
        if self._heartbeat_thread is not None:
            self._heartbeat_thread.stop()
            self._heartbeat_thread = None

    def get_row_html(self, current_session_id):
        """Render this session as one HTML table row (checkmark on current)."""
        return u"""<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>""".format(
            self.id, self.get_app_id(), self.kind, self.status,
            self.get_html_link(u'Link', self.get_spark_ui_url()),
            self.get_html_link(u'Link', self.get_driver_log_url()),
            self.get_user(), u""
            if current_session_id is None or current_session_id != self.id
            else u"\u2714")

    @staticmethod
    def get_html_link(text, url):
        """Return an anchor tag for `url`, or empty string when url is None."""
        if url is not None:
            return u"""<a target="_blank" href="{1}">{0}</a>""".format(
                text, url)
        else:
            return u""
def __init__(self):
    """Create an empty session registry with its own logger."""
    self._sessions = {}
    self.logger = SparkLog(u"SessionManager")
class ReliableHttpClient(object):
    """Http client that is reliable in its requests. Uses requests library.

    Every request goes through ``_send_request_helper``, which retries on
    transport errors and on unexpected status codes according to the
    injected retry policy.
    """

    def __init__(self, endpoint, headers, retry_policy):
        """
        Parameters:
            endpoint : project Endpoint object exposing ``url``,
                ``authenticate``, ``username`` and ``password``.
            headers : dict of headers sent with every request.
            retry_policy : object deciding whether and how long to retry.
        """
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy
        self.logger = SparkLog(u"ReliableHttpClient")
        self.verify_ssl = not conf.ignore_ssl_errors()
        if not self.verify_ssl:
            self.logger.debug(
                u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks."
            )
            requests.packages.urllib3.disable_warnings()

    def compose_url(self, relative_url):
        """Join ``relative_url`` onto the endpoint URL with exactly one slash."""
        r_u = "/{}".format(relative_url.rstrip(u"/").lstrip(u"/"))
        return self._endpoint.url + r_u

    def get(self, relative_url, accepted_status_codes):
        """Sends a get request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.get)

    def post(self, relative_url, accepted_status_codes, data):
        """Sends a post request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.post, data)

    def delete(self, relative_url, accepted_status_codes):
        """Sends a delete request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.delete)

    def _send_request(self, relative_url, accepted_status_codes, function,
                      data=None):
        return self._send_request_helper(
            self.compose_url(relative_url), accepted_status_codes, function,
            data, 0)

    def _send_request_helper(self, url, accepted_status_codes, function, data,
                             retry_count):
        """Issue the request, retrying per the retry policy.

        Raises:
            HttpClientException : retries exhausted, or the final status
                code is not in ``accepted_status_codes``.
        """
        while True:
            # Build the keyword arguments once instead of duplicating the
            # four requests.* call sites for every auth/data combination.
            kwargs = {"headers": self._headers, "verify": self.verify_ssl}
            if self._endpoint.authenticate:
                kwargs["auth"] = (self._endpoint.username,
                                  self._endpoint.password)
            if data is not None:
                kwargs["data"] = json.dumps(data)
            try:
                r = function(url, **kwargs)
            except requests.exceptions.RequestException as e:
                error = True
                r = None
                status = None
                self.logger.error(u"Request to '{}' failed with '{}'".format(
                    url, e))
            else:
                error = False
                status = r.status_code
            if error or status not in accepted_status_codes:
                if self._retry_policy.should_retry(status, error, retry_count):
                    sleep(self._retry_policy.seconds_to_sleep(retry_count))
                    retry_count += 1
                    continue
                raise HttpClientException(
                    u"Invalid status code '{}' or error '{}' from {}".format(
                        status, error, url))
            return r
# NOTE(review): this class duplicates an earlier ReliableHttpClient
# definition in this file; at import time this later definition wins.
class ReliableHttpClient(object):
    """Http client that is reliable in its requests. Uses requests library."""

    def __init__(self, endpoint, headers, retry_policy):
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy
        self.logger = SparkLog(u"ReliableHttpClient")
        self.verify_ssl = not conf.ignore_ssl_errors()
        if not self.verify_ssl:
            self.logger.debug(u"ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
            requests.packages.urllib3.disable_warnings()

    def compose_url(self, relative_url):
        # Strip slashes on both ends of relative_url, then join with one "/".
        r_u = "/{}".format(relative_url.rstrip(u"/").lstrip(u"/"))
        return self._endpoint.url + r_u

    def get(self, relative_url, accepted_status_codes):
        """Sends a get request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.get)

    def post(self, relative_url, accepted_status_codes, data):
        """Sends a post request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.post, data)

    def delete(self, relative_url, accepted_status_codes):
        """Sends a delete request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.delete)

    def _send_request(self, relative_url, accepted_status_codes, function,
                      data=None):
        # Entry point: resolve the full URL and start at retry_count 0.
        return self._send_request_helper(
            self.compose_url(relative_url), accepted_status_codes, function,
            data, 0)

    def _send_request_helper(self, url, accepted_status_codes, function, data,
                             retry_count):
        # Retry loop: issue the request, classify the outcome, and either
        # return, retry (per the policy), or raise HttpClientException.
        while True:
            try:
                # Four call shapes: with/without basic auth crossed with
                # with/without a JSON-encoded body.
                if not self._endpoint.authenticate:
                    if data is None:
                        r = function(url, headers=self._headers,
                                     verify=self.verify_ssl)
                    else:
                        r = function(url, headers=self._headers,
                                     data=json.dumps(data),
                                     verify=self.verify_ssl)
                else:
                    if data is None:
                        r = function(url, headers=self._headers,
                                     auth=(self._endpoint.username,
                                           self._endpoint.password),
                                     verify=self.verify_ssl)
                    else:
                        r = function(url, headers=self._headers,
                                     auth=(self._endpoint.username,
                                           self._endpoint.password),
                                     data=json.dumps(data),
                                     verify=self.verify_ssl)
            except requests.exceptions.RequestException as e:
                # Transport-level failure: no response object, no status.
                error = True
                r = None
                status = None
                self.logger.error(u"Request to '{}' failed with '{}'".format(url, e))
            else:
                error = False
                status = r.status_code
            if error or status not in accepted_status_codes:
                if self._retry_policy.should_retry(status, error, retry_count):
                    sleep(self._retry_policy.seconds_to_sleep(retry_count))
                    retry_count += 1
                    continue
                else:
                    raise HttpClientException(u"Invalid status code '{}' or error '{}' from {}"
                                              .format(status, error, url))
            return r
class SparkMagicBase(Magics):
    """Base class for the spark magics: wires up logging, IPython display
    and the SparkController, and routes cell contents to Livy sessions."""

    def __init__(self, shell, data=None, spark_events=None):
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)
        self.logger = SparkLog(u"SparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)
        self.logger.debug("Initialized spark magics.")
        # spark_events is only used to emit the load event; it is not stored.
        if spark_events is None:
            spark_events = SparkEvents()
        spark_events.emit_library_loaded_event()

    def execute_final(self, cell, output_var, samplemethod, maxrows,
                      samplefraction, session_name, coerce):
        """Run ``cell`` in the named session; on success optionally store
        the result of a spark-store command under ``output_var`` in the
        user namespace."""
        (success, out) = self.spark_controller.run_command(Command(cell),
                                                           session_name)
        if not success:
            self.ipython_display.send_error(out)
        else:
            self.ipython_display.write(out)
            if output_var is not None:
                spark_store_command = self._spark_store_command(
                    output_var, samplemethod, maxrows, samplefraction, coerce)
                df = self.spark_controller.run_command(spark_store_command,
                                                       session_name)
                self.shell.user_ns[output_var] = df

    def execute_spark(self, cell, output_var, samplemethod, maxrows,
                      samplefraction, session_name, coerce):
        """Run ``cell``; maggy ``.lagom`` cells get a heartbeating Client
        wrapped around the execution.

        Raises:
            RuntimeError : the cell aliases the disallowed lagom import.
        """
        if "lagom as" in cell:
            self.ipython_display.send_error(
                "You are not allowed to do the following: 'import maggy.experiment.lagom as ...'. Please, just use 'import maggy.experiment as experiment' (or something else)"
            )
            # BUGFIX: was a bare `raise` with no active exception, which
            # itself failed with "RuntimeError: No active exception to
            # re-raise". Raise an explicit RuntimeError instead.
            raise RuntimeError("Disallowed 'lagom as' import in cell.")
        elif ".lagom" in cell:
            # BUGFIX: was `self.session_name`, which is never set on this
            # object; the method parameter `session_name` is what is meant.
            client = Client(self.spark_controller, session_name, 5,
                            self.ipython_display)
            # The original `except: raise` clause was a no-op and was
            # removed; try/finally alone preserves the behavior.
            try:
                client.start_heartbeat()
                if DEBUG:
                    self.ipython_display.writeln("Started heartbeating...")
                self.execute_final(cell, output_var, samplemethod, maxrows,
                                   samplefraction, session_name, coerce)
            finally:
                # 4. Kill thread before leaving current scope
                client.stop()
                try:
                    client.close()
                except Exception:
                    # Best effort: the socket may already have been closed
                    # by the maggy server.
                    if DEBUG:
                        print("Socket already closed by maggy server.")
        else:
            self.execute_final(cell, output_var, samplemethod, maxrows,
                               samplefraction, session_name, coerce)

    @staticmethod
    def _spark_store_command(output_var, samplemethod, maxrows, samplefraction,
                             coerce):
        # Factory for the command that pulls a dataframe sample client-side.
        return SparkStoreCommand(output_var, samplemethod, maxrows,
                                 samplefraction, coerce=coerce)

    def execute_sqlquery(self, cell, samplemethod, maxrows, samplefraction,
                         session, output_var, quiet, coerce):
        """Run a SQL query in ``session``; optionally store the resulting
        dataframe under ``output_var`` and return it unless ``quiet``."""
        sqlquery = self._sqlquery(cell, samplemethod, maxrows, samplefraction,
                                  coerce)
        df = self.spark_controller.run_sqlquery(sqlquery, session)
        if output_var is not None:
            self.shell.user_ns[output_var] = df
        if quiet:
            return None
        else:
            return df

    @staticmethod
    def _sqlquery(cell, samplemethod, maxrows, samplefraction, coerce):
        # Factory for the SQL query wrapper around the cell text.
        return SQLQuery(cell, samplemethod, maxrows, samplefraction,
                        coerce=coerce)

    def _print_endpoint_info(self, info_sessions, current_session_id):
        """Render the known sessions as an HTML table, sorted by id."""
        if info_sessions:
            info_sessions = sorted(info_sessions, key=lambda s: s.id)
            html = get_sessions_info_html(info_sessions, current_session_id)
            self.ipython_display.html(html)
        else:
            self.ipython_display.html(u'No active sessions.')