def __init__(self, reader_writer):
    """Create the serializer around a reader/writer object.

    ``reader_writer`` must not be None; this is checked with an assertion.
    """
    assert reader_writer is not None

    self.logger = Log("ClientManagerStateSerializer")
    # Display object handed to the sessions this serializer creates.
    self._ipython_display = IpythonDisplay()
    self._reader_writer = reader_writer
def __init__(self, shell, data=None):
    """Initialise the magics with a logger, a display and a Spark controller.

    When ``conf.serialize()`` is truthy the controller is replaced by one
    that serializes to ``state.json`` under the magics home path. A KeyError
    raised while reading that configuration is logged and the controller
    created so far is kept.
    """
    # You must call the parent constructor
    super(SparkMagicBase, self).__init__(shell)

    self.logger = Log("SparkMagics")
    self.ipython_display = IpythonDisplay()
    # Controller without serialization; replaced below when serialization is on.
    self.spark_controller = SparkController(self.ipython_display)

    try:
        if not conf.serialize():
            self.logger.debug("Serialization NOT enabled.")
        else:
            self.logger.debug("Serialization enabled.")

            self.magics_home_path = get_magics_home_path()
            state_file = join_paths(self.magics_home_path, "state.json")
            self.logger.debug("Will serialize to {}.".format(state_file))

            self.spark_controller = SparkController(self.ipython_display,
                                                    serialize_path=state_file)
    except KeyError:
        self.logger.error("Could not read env vars for serialization.")

    self.logger.debug("Initialized spark magics.")
def __init__(self, code, spark_events=None):
    """Store the dedented code and the events emitter for this command.

    A new ``SparkEvents`` instance is created when ``spark_events`` is None.
    """
    super(Command, self).__init__()
    # Strip common leading whitespace so indented cell code is stored flush left.
    self.code = textwrap.dedent(code)
    self.logger = Log("Command")
    self._spark_events = SparkEvents() if spark_events is None else spark_events
class Command(ObjectWithGuid):
    """A piece of code that is posted to a session as a Livy statement."""

    def __init__(self, code, spark_events=None):
        """Store the dedented code and the events emitter.

        A new ``SparkEvents`` instance is created when ``spark_events`` is None.
        """
        super(Command, self).__init__()
        self.code = textwrap.dedent(code)
        self.logger = Log(u"Command")
        self._spark_events = SparkEvents() if spark_events is None else spark_events

    def __eq__(self, other):
        """Two commands are equal when their code is equal."""
        return self.code == other.code

    def __ne__(self, other):
        """Negation of ``==``."""
        return not (self == other)

    def execute(self, session):
        """Run the code on ``session`` and return the statement output.

        A start event is emitted first. An end event is emitted on both the
        success and the failure path; on failure the exception is re-raised
        after the event has been emitted.
        """
        self._spark_events.emit_statement_execution_start_event(session.guid, session.kind,
                                                                session.id, self.guid)
        # Stays -1 in the end event when the failure happens before an id is assigned.
        statement_id = -1
        try:
            session.wait_for_idle()
            posted = session.http_client.post_statement(session.id, {u"code": self.code})
            statement_id = posted[u'id']
            output = self._get_statement_output(session, statement_id)
        except Exception as error:
            self._spark_events.emit_statement_execution_end_event(session.guid, session.kind,
                                                                  session.id, self.guid,
                                                                  statement_id, False,
                                                                  error.__class__.__name__,
                                                                  str(error))
            raise
        else:
            self._spark_events.emit_statement_execution_end_event(session.guid, session.kind,
                                                                  session.id, self.guid,
                                                                  statement_id, True, "", "")
            return output

    def _get_statement_output(self, session, statement_id):
        """Poll the statement until it stops running and return its result.

        Returns ``(True, text)`` when the output status is "ok" and
        ``(False, error_text)`` when it is "error". Any other output status
        raises LivyUnexpectedStatusException.
        """
        while True:
            statement = session.http_client.get_statement(session.id, statement_id)
            state = statement[u"state"]

            self.logger.debug(u"Status of statement {} is {}.".format(statement_id, state))

            if state != u"running":
                break
            session.sleep()

        result = statement[u"output"]
        result_status = result[u"status"]

        if result_status == u"ok":
            return (True, result[u"data"][u"text/plain"])
        if result_status == u"error":
            return (False, result[u"evalue"] + u"\n" + u"".join(result[u"traceback"]))

        raise LivyUnexpectedStatusException(u"Unknown output status from Livy: '{}'"
                                            .format(result_status))
class EventsHandler:
    """Handles events by writing them to a logger."""

    def __init__(self):
        self.logger = Log("EventsHandler")

    def handle_event(self, kwargs_list):
        """
        Log the event as one comma-separated line of "key: value" entries.

        ``kwargs_list`` is iterated as (key, value) pairs.
        """
        entries = []
        for key, arg in kwargs_list:
            entries.append("{}: {}".format(key, arg))

        self.logger.info(",".join(entries))
def __init__(self, ipython_display, serialize_path=None):
    """Create the controller and its client manager.

    When ``serialize_path`` is given, the client manager is built with a
    state serializer that reads and writes that path; otherwise it is built
    without one.
    """
    self.logger = Log("SparkController")
    self.ipython_display = ipython_display

    if serialize_path is None:
        self.client_manager = ClientManager()
        return

    reader_writer = FileSystemReaderWriter(serialize_path)
    self.client_manager = ClientManager(ClientManagerStateSerializer(reader_writer))
def __init__(self, endpoint, headers, retry_policy):
    """Store the endpoint, headers and retry policy and read the SSL setting.

    When the configuration asks to ignore SSL errors, certificate
    verification is turned off, a message is logged and urllib3 warnings
    are disabled.
    """
    self._endpoint = endpoint
    self._headers = headers
    self._retry_policy = retry_policy
    self.logger = Log("ReliableHttpClient")

    ignore_ssl_errors = conf.ignore_ssl_errors()
    self.verify_ssl = not ignore_ssl_errors
    if ignore_ssl_errors:
        self.logger.debug(
            "ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks."
        )
        requests.packages.urllib3.disable_warnings()
def __init__(self, shell, data=None, spark_events=None):
    """Initialise the magics and emit the "library loaded" event.

    A new ``SparkEvents`` instance is used when ``spark_events`` is None.
    """
    # You must call the parent constructor
    super(SparkMagicBase, self).__init__(shell)

    self.logger = Log("SparkMagics")
    self.ipython_display = IpythonDisplay()
    self.spark_controller = SparkController(self.ipython_display)

    self.logger.debug("Initialized spark magics.")

    events = SparkEvents() if spark_events is None else spark_events
    events.emit_library_loaded_event()
class SparkMagicBase(Magics):
    """Base class holding the state and helpers shared by the Spark magics."""

    def __init__(self, shell, data=None):
        """Initialise the logger, the display and the Spark controller.

        When ``conf.serialize()`` is truthy the controller is replaced by one
        that serializes to ``state.json`` under the magics home path. A
        KeyError raised while reading that configuration is logged and the
        controller created so far is kept.
        """
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)

        self.logger = Log("SparkMagics")
        self.ipython_display = IpythonDisplay()
        # Controller without serialization; replaced below when serialization is on.
        self.spark_controller = SparkController(self.ipython_display)

        try:
            if not conf.serialize():
                self.logger.debug("Serialization NOT enabled.")
            else:
                self.logger.debug("Serialization enabled.")

                self.magics_home_path = get_magics_home_path()
                state_file = join_paths(self.magics_home_path, "state.json")
                self.logger.debug(
                    "Will serialize to {}.".format(state_file))

                self.spark_controller = SparkController(
                    self.ipython_display, serialize_path=state_file)
        except KeyError:
            self.logger.error("Could not read env vars for serialization.")

        self.logger.debug("Initialized spark magics.")

    def execute_sqlquery(self, sqlquery, session, output_var, quiet):
        """Run a SQL query on ``session`` and return the resulting dataframe.

        The dataframe is also stored in the user namespace under
        ``output_var`` when that name is given. Returns None when ``quiet``
        is truthy, or when parsing the result fails (the error is sent to
        the display).
        """
        try:
            df = self.spark_controller.run_cell_sql(sqlquery, session)
            if output_var is not None:
                self.shell.user_ns[output_var] = df
            return None if quiet else df
        except DataFrameParseException as e:
            self.ipython_display.send_error(e.out)
            return None

    @staticmethod
    def print_endpoint_info(info_sessions):
        """Print the given session descriptions, one per line."""
        session_lines = "\n".join([" {}".format(i) for i in info_sessions])
        print("""Info for endpoint: Sessions: {} """.format(session_lines))
def __init__(self, shell, data=None):
    """Set up logging, display and the Spark controller for the magics.

    The controller is first created without serialization. If
    ``conf.serialize()`` is truthy it is replaced by a controller that
    serializes to ``state.json`` in the magics home directory. A KeyError
    raised in that step is logged as an error and otherwise ignored.
    """
    # You must call the parent constructor
    super(SparkMagicBase, self).__init__(shell)

    self.logger = Log("SparkMagics")
    self.ipython_display = IpythonDisplay()
    self.spark_controller = SparkController(self.ipython_display)

    try:
        serialization_on = conf.serialize()
        if not serialization_on:
            self.logger.debug("Serialization NOT enabled.")
        else:
            self.logger.debug("Serialization enabled.")

            self.magics_home_path = get_magics_home_path()
            target = join_paths(self.magics_home_path, "state.json")
            self.logger.debug("Will serialize to {}.".format(target))

            self.spark_controller = SparkController(self.ipython_display, serialize_path=target)
    except KeyError:
        self.logger.error("Could not read env vars for serialization.")

    self.logger.debug("Initialized spark magics.")
def __init__(self, implementation, implementation_version, language, language_version, language_info,
             session_language, user_code_parser=None, **kwargs):
    """Initialise the kernel wrapper.

    The Jupyter-facing attributes are assigned before the parent constructor
    runs. Unless ``kwargs`` contains a truthy ``"testing"`` entry, the
    magics extension is loaded, the language is changed and, if configured,
    auto visualisation is registered.

    :param session_language: language of the session; also used as the
        prefix of the logger name.
    :param user_code_parser: parser applied to user code; a new
        ``UserCodeParser`` is created when None.
    :param kwargs: forwarded to the parent kernel constructor.
    """
    # Required by Jupyter - Override
    self.implementation = implementation
    self.implementation_version = implementation_version
    self.language = language
    self.language_version = language_version
    self.language_info = language_info

    # Override
    self.session_language = session_language

    super(SparkKernelBase, self).__init__(**kwargs)

    # The logger name needs a "{}" placeholder; without it str.format()
    # silently drops the session language.
    self.logger = Log("{}_jupyter_kernel".format(self.session_language))
    self._fatal_error = None
    self.ipython_display = IpythonDisplay()

    if user_code_parser is None:
        self.user_code_parser = UserCodeParser()
    else:
        self.user_code_parser = user_code_parser

    # Disable warnings for test env in HDI
    requests.packages.urllib3.disable_warnings()

    if not kwargs.get("testing", False):
        self._load_magics_extension()
        self._change_language()
        if conf.use_auto_viz():
            self._register_auto_viz()
def __init__(self, code, spark_events=None):
    """Keep the dedented code and the events emitter for this command.

    ``spark_events`` defaults to a newly created ``SparkEvents`` instance.
    """
    super(Command, self).__init__()
    self.code = textwrap.dedent(code)
    self.logger = Log(u"Command")
    self._spark_events = SparkEvents() if spark_events is None else spark_events
class SparkMagicBase(Magics):
    """Common state and helpers for the Spark magics."""

    def __init__(self, shell, data=None):
        """Set up logging, display and the Spark controller.

        The controller is first created without serialization. If
        ``conf.serialize()`` is truthy it is replaced by a controller that
        serializes to ``state.json`` in the magics home directory. A
        KeyError raised in that step is logged as an error and otherwise
        ignored.
        """
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)

        self.logger = Log("SparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        try:
            serialization_on = conf.serialize()
            if not serialization_on:
                self.logger.debug("Serialization NOT enabled.")
            else:
                self.logger.debug("Serialization enabled.")

                self.magics_home_path = get_magics_home_path()
                target = join_paths(self.magics_home_path, "state.json")
                self.logger.debug("Will serialize to {}.".format(target))

                self.spark_controller = SparkController(self.ipython_display, serialize_path=target)
        except KeyError:
            self.logger.error("Could not read env vars for serialization.")

        self.logger.debug("Initialized spark magics.")

    def execute_sqlquery(self, sqlquery, session, output_var, quiet):
        """Execute ``sqlquery`` on ``session``.

        Stores the dataframe in the user namespace as ``output_var`` when a
        name is given. Returns the dataframe, or None when ``quiet`` is
        truthy or a DataFrameParseException was reported to the display.
        """
        try:
            result = self.spark_controller.run_cell_sql(sqlquery, session)
            if output_var is not None:
                self.shell.user_ns[output_var] = result
            if not quiet:
                return result
            return None
        except DataFrameParseException as e:
            self.ipython_display.send_error(e.out)
            return None

    @staticmethod
    def print_endpoint_info(info_sessions):
        """Print one line per entry of ``info_sessions`` under a header."""
        formatted = []
        for i in info_sessions:
            formatted.append(" {}".format(i))
        print("""Info for endpoint: Sessions: {} """.format("\n".join(formatted)))
def __init__(self, client_factory, reader_writer):
    """Create the serializer from a client factory and a reader/writer.

    Both arguments must not be None; this is checked with assertions.
    """
    assert client_factory is not None
    assert reader_writer is not None

    self.logger = Log("ClientManagerStateSerializer")

    self._reader_writer = reader_writer
    self._client_factory = client_factory
def __init__(self, http_client, properties, ipython_display,
             session_id=-1, sql_created=None, spark_events=None):
    """Validate the arguments and configuration and set up the session fields.

    ``properties`` must contain a "kind" entry. A ``session_id`` of -1 marks
    a session that has not been started yet.

    Raises BadUserDataException when ``sql_created`` is True without a
    session id, or when the (lower-cased) kind is not supported.
    """
    super(LivySession, self).__init__()

    assert "kind" in properties.keys()
    kind = properties["kind"]
    self.properties = properties
    self.ipython_display = ipython_display
    self._spark_events = SparkEvents() if spark_events is None else spark_events

    # Sleep intervals and the idle timeout come from configuration and must be positive.
    status_sleep_seconds = conf.status_sleep_seconds()
    statement_sleep_seconds = conf.statement_sleep_seconds()
    wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

    assert status_sleep_seconds > 0
    assert statement_sleep_seconds > 0
    assert wait_for_idle_timeout_seconds > 0

    not_started = session_id == -1
    if not_started and sql_created is True:
        raise BadUserDataException("Cannot indicate sql state without session id.")

    self.logger = Log("LivySession")

    kind = kind.lower()
    if kind not in constants.SESSION_KINDS_SUPPORTED:
        raise BadUserDataException("Session of kind '{}' not supported. Session must be of kinds {}."
                                   .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

    if not_started:
        self.status = constants.NOT_STARTED_SESSION_STATUS
        # A session that does not exist yet cannot have a SQL context.
        sql_created = False
    else:
        self.status = constants.BUSY_SESSION_STATUS

    self._logs = ""
    self._http_client = http_client
    self._status_sleep_seconds = status_sleep_seconds
    self._statement_sleep_seconds = statement_sleep_seconds
    self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

    self.kind = kind
    self.id = session_id
    self.created_sql_context = sql_created
def __init__(self, ipython_display, serialize_path=None):
    """Create the controller, its client factory and its client manager.

    With a ``serialize_path`` the client manager receives a state serializer
    built from the client factory and a file-system reader/writer for that
    path; without one it is created with no serializer.
    """
    self.logger = Log("SparkController")
    self.ipython_display = ipython_display
    self.client_factory = LivyClientFactory()

    if serialize_path is None:
        self.client_manager = ClientManager()
        return

    reader_writer = FileSystemReaderWriter(serialize_path)
    state_serializer = ClientManagerStateSerializer(self.client_factory, reader_writer)
    self.client_manager = ClientManager(state_serializer)
class Command(ObjectWithGuid):
    """A piece of code that is posted to a session as a statement."""

    def __init__(self, code):
        """Store the dedented code and create the logger."""
        super(Command, self).__init__()
        self.code = textwrap.dedent(code)
        self.logger = Log("Command")

    def __eq__(self, other):
        """Two commands are equal when their code is equal."""
        return self.code == other.code

    def __ne__(self, other):
        """Negation of ``==``."""
        return not (self == other)

    def execute(self, session):
        """Wait for the session to be idle, post the code and return its output."""
        session.wait_for_idle()
        posted = session.http_client.post_statement(session.id, {"code": self.code})
        return self._get_statement_output(session, posted['id'])

    def _get_statement_output(self, session, statement_id):
        """Poll the statement until it stops running and return its result.

        Returns ``(True, text)`` for an "ok" output and
        ``(False, error_text)`` for an "error" output. Raises ValueError for
        any other output status.
        """
        while True:
            statement = session.http_client.get_statement(session.id, statement_id)
            state = statement["state"]

            self.logger.debug("Status of statement {} is {}.".format(statement_id, state))

            if state != "running":
                break
            session.sleep()

        result = statement["output"]
        result_status = result["status"]

        if result_status == "ok":
            return (True, result["data"]["text/plain"])
        if result_status == "error":
            return (False, result["evalue"] + "\n" + "".join(result["traceback"]))

        raise ValueError("Unknown output status: '{}'".format(result_status))
def __init__(self, endpoint, headers, retry_policy):
    """Keep the endpoint, headers and retry policy and configure SSL checks.

    ``verify_ssl`` is the negation of ``conf.ignore_ssl_errors()``. When
    verification is off, a message is logged and urllib3 warnings are
    disabled.
    """
    self._endpoint = endpoint
    self._headers = headers
    self._retry_policy = retry_policy
    self.logger = Log("ReliableHttpClient")

    self.verify_ssl = not conf.ignore_ssl_errors()
    if self.verify_ssl:
        return

    self.logger.debug("ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
    requests.packages.urllib3.disable_warnings()
def __init__(self, serializer=None):
    """Create the manager and, with a serializer, restore the saved clients.

    When a serializer is given, the periodic-serialization settings are read
    from configuration, every (name, client) pair it deserializes is added,
    and the serialize timer is started if configured.
    """
    has_serializer = serializer is not None
    if has_serializer:
        serialize_periodically = conf.serialize_periodically()
        serialize_period = conf.serialize_period_seconds()
    else:
        serialize_periodically = False
        serialize_period = 3

    self.logger = Log("ClientManager")

    self._livy_clients = dict()
    self._serializer = serializer
    self._serialize_timer = None

    if not has_serializer:
        return

    for name, client in self._serializer.deserialize_state():
        self.add_client(name, client)

    if serialize_periodically:
        self._serialize_state_periodically(serialize_period)
def __init__(self, url, headers, username, password, retry_policy):
    """Store the connection details and read the SSL verification setting.

    :param url: base URL; trailing slashes are stripped.
    :param headers: headers kept for later requests.
    :param username: user name; an empty string together with an empty
        password sets the ``_do_not_authenticate`` flag.
    :param password: password, see ``username``.
    :param retry_policy: retry policy kept for later requests.
    """
    self._url = url.rstrip("/")
    self._headers = headers
    self._username = username
    self._password = password
    self._retry_policy = retry_policy
    self.logger = Log("ReliableHttpClient")

    self._do_not_authenticate = self._username == "" and self._password == ""

    self.verify_ssl = not conf.ignore_ssl_errors()
    # Warn only when verification is actually off; the check used to be
    # inverted and logged the message when SSL errors were NOT ignored.
    if not self.verify_ssl:
        self.logger.debug("ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
def __init__(self, http_client, properties, ipython_display, session_id="-1", sql_created=None):
    """Validate the arguments and configuration and build the session state.

    ``properties`` must contain a "kind" entry. A ``session_id`` of "-1"
    marks a session that has not been started yet.

    Raises ValueError when ``sql_created`` is True without a session id, or
    when the (lower-cased) kind is not supported.
    """
    assert "kind" in properties.keys()
    kind = properties["kind"]
    self.properties = properties
    self.ipython_display = ipython_display

    # Sleep intervals and the SQL context timeout come from configuration
    # and must be positive.
    status_sleep_seconds = conf.status_sleep_seconds()
    statement_sleep_seconds = conf.statement_sleep_seconds()
    create_sql_context_timeout_seconds = conf.create_sql_context_timeout_seconds()

    assert status_sleep_seconds > 0
    assert statement_sleep_seconds > 0
    assert create_sql_context_timeout_seconds > 0

    not_started = session_id == "-1"
    if not_started and sql_created is True:
        raise ValueError("Cannot indicate sql state without session id.")

    self.logger = Log("LivySession")

    kind = kind.lower()
    if kind not in constants.SESSION_KINDS_SUPPORTED:
        raise ValueError("Session of kind '{}' not supported. Session must be of kinds {}."
                         .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

    if not_started:
        self.status = constants.NOT_STARTED_SESSION_STATUS
        # A session that does not exist yet cannot have a SQL context.
        sql_created = False
    else:
        self.status = constants.BUSY_SESSION_STATUS

    self._logs = ""
    self._http_client = http_client
    self._status_sleep_seconds = status_sleep_seconds
    self._statement_sleep_seconds = statement_sleep_seconds
    self._create_sql_context_timeout_seconds = create_sql_context_timeout_seconds

    self._state = LivySessionState(session_id, self._http_client.connection_string,
                                   kind, sql_created)
class SparkMagicBase(Magics):
    """Common state and helpers for the Spark magics."""

    def __init__(self, shell, data=None, spark_events=None):
        """Initialise the magics and emit the "library loaded" event.

        A new ``SparkEvents`` instance is used when ``spark_events`` is None.
        """
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)

        self.logger = Log("SparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        self.logger.debug("Initialized spark magics.")

        events = SparkEvents() if spark_events is None else spark_events
        events.emit_library_loaded_event()

    def execute_sqlquery(self, cell, samplemethod, maxrows, samplefraction,
                         session, output_var, quiet):
        """Build a SQL query from the cell and run it on ``session``.

        The dataframe is stored in the user namespace as ``output_var`` when
        a name is given. Returns the dataframe, or None when ``quiet`` is
        truthy.
        """
        query = self._sqlquery(cell, samplemethod, maxrows, samplefraction)
        result = self.spark_controller.run_sqlquery(query, session)
        if output_var is not None:
            self.shell.user_ns[output_var] = result
        return None if quiet else result

    @staticmethod
    def _sqlquery(cell, samplemethod, maxrows, samplefraction):
        """Create the SQLQuery object for the given cell and sampling options."""
        return SQLQuery(cell, samplemethod, maxrows, samplefraction)

    @staticmethod
    def print_endpoint_info(info_sessions):
        """Print one line per entry of ``info_sessions`` under a header."""
        formatted = []
        for i in info_sessions:
            formatted.append(" {}".format(i))
        print("""Info for endpoint: Sessions: {} """.format("\n".join(formatted)))
def __init__(self, http_client, properties, ipython_display,
             session_id=-1, sql_created=None, spark_events=None):
    """Check the arguments and configuration, then initialise the session.

    ``properties`` must contain a "kind" entry. A ``session_id`` of -1 marks
    a session that has not been started yet.

    Raises BadUserDataException when ``sql_created`` is True without a
    session id, or when the (lower-cased) kind is not supported.
    """
    super(LivySession, self).__init__()

    assert u"kind" in properties.keys()
    kind = properties[u"kind"]
    self.properties = properties
    self.ipython_display = ipython_display
    self._spark_events = SparkEvents() if spark_events is None else spark_events

    # Sleep intervals and the idle timeout come from configuration and must be positive.
    status_sleep_seconds = conf.status_sleep_seconds()
    statement_sleep_seconds = conf.statement_sleep_seconds()
    wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

    assert status_sleep_seconds > 0
    assert statement_sleep_seconds > 0
    assert wait_for_idle_timeout_seconds > 0

    is_new_session = session_id == -1
    if is_new_session and sql_created is True:
        raise BadUserDataException(u"Cannot indicate sql state without session id.")

    self.logger = Log(u"LivySession")

    kind = kind.lower()
    if kind not in constants.SESSION_KINDS_SUPPORTED:
        raise BadUserDataException(u"Session of kind '{}' not supported. Session must be of kinds {}."
                                   .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

    if is_new_session:
        self.status = constants.NOT_STARTED_SESSION_STATUS
        # A session that does not exist yet cannot have a SQL context.
        sql_created = False
    else:
        self.status = constants.BUSY_SESSION_STATUS

    self._app_id = None
    self._logs = u""
    self._http_client = http_client
    self._status_sleep_seconds = status_sleep_seconds
    self._statement_sleep_seconds = statement_sleep_seconds
    self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

    self.kind = kind
    self.id = session_id
    self.created_sql_context = sql_created
def __init__(self, implementation, implementation_version, language, language_version, language_info,
             kernel_conf_name, session_language, client_name, **kwargs):
    """Initialise the kernel wrapper.

    The Jupyter-facing attributes are assigned before the parent constructor
    runs. Unless ``kwargs`` contains a truthy ``"testing"`` entry, the
    configuration is read, the connection string is built, the magics
    extension is loaded and, if configured, auto visualisation is registered.
    """
    # Required by Jupyter - Override
    self.implementation = implementation
    self.implementation_version = implementation_version
    self.language = language
    self.language_version = language_version
    self.language_info = language_info

    # Override
    self.kernel_conf_name = kernel_conf_name
    self.session_language = session_language
    self.client_name = client_name

    super(SparkKernelBase, self).__init__(**kwargs)

    self._logger = Log(self.client_name)
    self._session_started = False
    self._fatal_error = None
    self._ipython_display = IpythonDisplay()
    self.user_command_parser = UserCommandParser()

    # Disable warnings for test env in HDI
    requests.packages.urllib3.disable_warnings()

    if kwargs.get("testing", False):
        return

    configuration = self._get_configuration()
    if not configuration:
        # _get_configuration() sets the error for us so we can just return now.
        # The kernel is not in a good state and all do_execute calls will
        # fail with the fatal error.
        return

    username, password, url = configuration
    self.connection_string = get_connection_string(url, username, password)
    self._load_magics_extension()
    if conf.use_auto_viz():
        self._register_auto_viz()
def __init__(self, ipython_display, http_client, session_id, sql_created, properties):
    """Validate the arguments and configuration and build the session state.

    ``properties`` must contain a "kind" entry. A ``session_id`` of "-1"
    marks a session that has not been started yet.

    Raises ValueError when ``sql_created`` is True without a session id, or
    when the (lower-cased) kind is not supported.
    """
    assert "kind" in properties.keys()
    kind = properties["kind"]
    self.properties = properties
    self.ipython_display = ipython_display

    # Sleep intervals and the SQL context timeout come from configuration
    # and must be positive.
    status_sleep_seconds = conf.status_sleep_seconds()
    statement_sleep_seconds = conf.statement_sleep_seconds()
    create_sql_context_timeout_seconds = conf.create_sql_context_timeout_seconds()

    assert status_sleep_seconds > 0
    assert statement_sleep_seconds > 0
    assert create_sql_context_timeout_seconds > 0

    is_new_session = session_id == "-1"
    if is_new_session and sql_created is True:
        raise ValueError("Cannot indicate sql state without session id.")

    self.logger = Log("LivySession")

    kind = kind.lower()
    if kind not in Constants.session_kinds_supported:
        raise ValueError("Session of kind '{}' not supported. Session must be of kinds {}."
                         .format(kind, ", ".join(Constants.session_kinds_supported)))

    if is_new_session:
        self._status = Constants.not_started_session_status
        # A session that does not exist yet cannot have a SQL context.
        sql_created = False
    else:
        self._status = Constants.busy_session_status

    self._logs = ""
    self._http_client = http_client
    self._status_sleep_seconds = status_sleep_seconds
    self._statement_sleep_seconds = statement_sleep_seconds
    self._create_sql_context_timeout_seconds = create_sql_context_timeout_seconds

    self._state = LivySessionState(session_id, http_client.connection_string, kind, sql_created)
class SparkController(object):
    """Entry point for running code on, listing and deleting Livy sessions."""

    def __init__(self, ipython_display, serialize_path=None):
        """Create the controller and its client manager.

        With a ``serialize_path`` the client manager gets a state serializer
        that reads and writes that path; otherwise it has no serializer.
        """
        self.logger = Log("SparkController")
        self.ipython_display = ipython_display

        if serialize_path is None:
            self.client_manager = ClientManager()
        else:
            reader_writer = FileSystemReaderWriter(serialize_path)
            self.client_manager = ClientManager(
                ClientManagerStateSerializer(reader_writer))

    def get_logs(self, client_name=None):
        """Return the logs of the named client, or of the default client."""
        return self.get_client_by_name_or_default(client_name).get_logs()

    def run_cell(self, cell, client_name=None):
        """Execute ``cell`` on the named client, or on the default client."""
        return self.get_client_by_name_or_default(client_name).execute(cell)

    def run_cell_sql(self, sqlquery, client_name=None):
        """Execute ``sqlquery`` on the named client, or on the default client."""
        return self.get_client_by_name_or_default(client_name).execute_sql(sqlquery)

    def get_all_sessions_endpoint(self, connection_string):
        """Return a session object, with refreshed status, per session on the endpoint."""
        http_client = self._http_client_from_connection_string(
            connection_string)
        response = http_client.get("/sessions", [200])

        session_list = []
        for info in response.json()["sessions"]:
            session_list.append(
                self._create_livy_session(connection_string,
                                          {"kind": info["kind"]},
                                          self.ipython_display, info["id"]))

        # Statuses are refreshed only after every session object has been created.
        for session in session_list:
            session._refresh_status()
        return session_list

    def get_all_sessions_endpoint_info(self, connection_string):
        """Return the string form of every session on the endpoint."""
        return [str(session)
                for session in self.get_all_sessions_endpoint(connection_string)]

    def cleanup(self):
        """Clean up every client held by the client manager."""
        self.client_manager.clean_up_all()

    def cleanup_endpoint(self, connection_string):
        """Delete every session found on the endpoint."""
        for session in self.get_all_sessions_endpoint(connection_string):
            session.delete()

    def delete_session_by_name(self, name):
        """Delete the managed client called ``name``."""
        self.client_manager.delete_client(name)

    def delete_session_by_id(self, connection_string, session_id):
        """Delete the endpoint session with ``session_id``; a 404 is ignored."""
        http_client = self._http_client_from_connection_string(
            connection_string)
        response = http_client.get("/sessions/{}".format(session_id), [200, 404])
        if response.status_code == 404:
            return

        session = self._create_livy_session(connection_string,
                                            {"kind": response.json()["kind"]},
                                            self.ipython_display, session_id,
                                            False)
        session.delete()

    def add_session(self, name, connection_string, skip_if_exists, properties):
        """Start a new session and register its client under ``name``.

        Nothing is done when ``skip_if_exists`` is truthy and ``name`` is
        already a managed session.
        """
        already_managed = skip_if_exists and (
            name in self.client_manager.get_sessions_list())
        if already_managed:
            self.logger.debug(
                "Skipping {} because it already exists in list of sessions.".
                format(name))
            return

        session = self._create_livy_session(connection_string, properties,
                                            self.ipython_display)
        session.start()
        livy_client = self._create_livy_client(session)
        self.client_manager.add_client(name, livy_client)
        livy_client.start()

    def get_session_id_for_client(self, name):
        """Return the session id the client manager reports for ``name``."""
        return self.client_manager.get_session_id_for_client(name)

    def get_client_keys(self):
        """Return the names of the managed sessions."""
        return self.client_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        """Return the client manager's description of its sessions."""
        return self.client_manager.get_sessions_info()

    def get_client_by_name_or_default(self, client_name):
        """Return the client for the lower-cased name, or any client when None."""
        if client_name is None:
            return self.client_manager.get_any_client()
        return self.client_manager.get_client(client_name.lower())

    def get_managed_clients(self):
        """Return the client manager's clients."""
        return self.client_manager.livy_clients

    @staticmethod
    def _create_livy_session(*args, **kwargs):
        return LivySession.from_connection_string(*args, **kwargs)

    @staticmethod
    def _http_client_from_connection_string(connection_string):
        return LivyReliableHttpClient.from_connection_string(connection_string)

    @staticmethod
    def _create_livy_client(session):
        return LivyClient(session)
def __init__(self, code):
    """Store the dedented code and create the logger for this command."""
    super(Command, self).__init__()
    # Strip common leading whitespace so indented cell code is stored flush left.
    dedented = textwrap.dedent(code)
    self.code = dedented
    self.logger = Log("Command")
class ClientManager(object):
    """Livy client manager"""

    def __init__(self, serializer=None):
        """Create the manager and, with a serializer, restore the saved clients.

        When a serializer is given, the periodic-serialization settings are
        read from configuration, every (name, client) pair it deserializes
        is added, and the serialize timer is started if configured.
        """
        has_serializer = serializer is not None
        if has_serializer:
            serialize_periodically = conf.serialize_periodically()
            serialize_period = conf.serialize_period_seconds()
        else:
            serialize_periodically = False
            serialize_period = 3

        self.logger = Log("ClientManager")

        self._livy_clients = dict()
        self._serializer = serializer
        self._serialize_timer = None

        if not has_serializer:
            return

        for name, client in self._serializer.deserialize_state():
            self.add_client(name, client)

        if serialize_periodically:
            self._serialize_state_periodically(serialize_period)

    def _serialize_state_periodically(self, serialize_period):
        """Start a timer that calls ``_serialize_state`` after ``serialize_period``."""
        self.logger.debug("Starting state serialize timer.")

        timer = Timer(serialize_period, self._serialize_state)
        self._serialize_timer = timer
        timer.start()

    def _serialize_state(self):
        """Hand the current clients to the serializer."""
        self._serializer.serialize_state(self._livy_clients)

    @property
    def livy_clients(self):
        """The dictionary of clients keyed by session name."""
        return self._livy_clients

    def get_sessions_list(self):
        """Return the session names as a new list."""
        return list(self._livy_clients.keys())

    def get_sessions_info(self):
        """Return one "Name: <name>\\t<client>" string per session."""
        info = []
        for name in list(self._livy_clients.keys()):
            info.append("Name: {}\t{}".format(name, str(self._livy_clients[name])))
        return info

    def add_client(self, name, livy_client):
        """Register ``livy_client`` under ``name``.

        Raises ValueError when the name is already taken.
        """
        if name in self.get_sessions_list():
            raise ValueError(
                "Session with name '{}' already exists. Please delete the session"
                " first if you intend to replace it.".format(name))

        self._livy_clients[name] = livy_client

    def get_any_client(self):
        """Return the only client.

        Raises AssertionError when there is no client or more than one.
        """
        number_of_sessions = len(self._livy_clients)

        if number_of_sessions == 0:
            raise AssertionError(
                "You need to have at least 1 client created to execute commands."
            )
        if number_of_sessions != 1:
            raise AssertionError(
                "Please specify the client to use. Possible sessions are {}".
                format(self.get_sessions_list()))

        only_name = self.get_sessions_list()[0]
        return self._livy_clients[only_name]

    def get_client(self, name):
        """Return the client registered under ``name``.

        Raises ValueError when the name is unknown.
        """
        if name not in self.get_sessions_list():
            raise ValueError(
                "Could not find '{}' session in list of saved sessions. Possible sessions are {}"
                .format(name, self.get_sessions_list()))
        return self._livy_clients[name]

    def get_session_id_for_client(self, name):
        """Return the session id of the named client, or None when unknown."""
        if name not in self.get_sessions_list():
            return None
        return self._livy_clients[name].session_id

    def delete_client(self, name):
        """Close and forget the client registered under ``name``."""
        self._remove_session(name)

    def clean_up_all(self):
        """Close and forget every client, then serialize if a serializer is set."""
        # get_sessions_list() returns a copy, so removing while looping is safe.
        for name in self.get_sessions_list():
            self._remove_session(name)

        if self._serializer is not None:
            self._serialize_state()

    def _remove_session(self, name):
        """Close the named client's session and drop it.

        Raises ValueError when the name is unknown.
        """
        if name not in self.get_sessions_list():
            raise ValueError(
                "Could not find '{}' session in list of saved sessions. Possible sessions are {}"
                .format(name, self.get_sessions_list()))

        self._livy_clients[name].close_session()
        del self._livy_clients[name]
class ReliableHttpClient(object):
    """Http client that is reliable in its requests. Uses requests library."""

    def __init__(self, endpoint, headers, retry_policy):
        """Keep the endpoint, headers and retry policy and configure SSL checks.

        ``verify_ssl`` is the negation of ``conf.ignore_ssl_errors()``. When
        verification is off, a message is logged and urllib3 warnings are
        disabled.
        """
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy
        self.logger = Log("ReliableHttpClient")

        self.verify_ssl = not conf.ignore_ssl_errors()
        if self.verify_ssl:
            return

        self.logger.debug("ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
        requests.packages.urllib3.disable_warnings()

    def compose_url(self, relative_url):
        """Join the endpoint URL and ``relative_url`` with exactly one slash."""
        trimmed = relative_url.rstrip("/").lstrip("/")
        return self._endpoint.url + "/{}".format(trimmed)

    def get(self, relative_url, accepted_status_codes):
        """Sends a get request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.get)

    def post(self, relative_url, accepted_status_codes, data):
        """Sends a post request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.post, data)

    def delete(self, relative_url, accepted_status_codes):
        """Sends a delete request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.delete)

    def _send_request(self, relative_url, accepted_status_codes, function, data=None):
        """Send the request to the composed URL, starting with a retry count of 0."""
        full_url = self.compose_url(relative_url)
        return self._send_request_helper(full_url, accepted_status_codes, function, data, 0)

    def _send_request_helper(self, url, accepted_status_codes, function, data, retry_count):
        """Call ``function`` until an accepted status code comes back.

        After a request exception or an unaccepted status code the retry
        policy decides whether to sleep and try again. When it declines,
        HttpClientException is raised.
        """
        while True:
            try:
                request_args = {"headers": self._headers, "verify": self.verify_ssl}
                if self._endpoint.authenticate:
                    request_args["auth"] = (self._endpoint.username, self._endpoint.password)
                if data is not None:
                    request_args["data"] = json.dumps(data)

                r = function(url, **request_args)
            except requests.exceptions.RequestException as e:
                error, r, status = True, None, None
                self.logger.error("Request to '{}' failed with '{}'".format(url, e))
            else:
                error, status = False, r.status_code

            if not error and status in accepted_status_codes:
                return r

            if not self._retry_policy.should_retry(status, error, retry_count):
                raise HttpClientException("Invalid status code '{}' or error '{}' from {}"
                                          .format(status, error, url))

            sleep(self._retry_policy.seconds_to_sleep(retry_count))
            retry_count += 1
class ClientManagerStateSerializer(object):
    """Livy client manager state serializer"""

    def __init__(self, reader_writer):
        """Create the serializer around a reader/writer, which must not be None."""
        assert reader_writer is not None

        self.logger = Log("ClientManagerStateSerializer")
        self._ipython_display = IpythonDisplay()
        self._reader_writer = reader_writer

    def deserialize_state(self):
        """Read the stored state and return a list of (name, client) tuples.

        Sessions whose status is final, or whose status lookup raises
        ValueError or ConnectionError, are logged and left out. Empty
        content yields an empty list.
        """
        self.logger.debug("Deserializing state.")

        content = ''.join(self._reader_writer.read_lines()).strip()
        if content == '':
            self.logger.debug("Empty manager state found.")
            return []

        self.logger.debug("Read content. Converting to JSON.")
        state = json.loads(content)

        restored = []
        for entry in state["clients"]:
            # Ignore version for now
            name = entry["name"]
            session_id = entry["id"]
            sql_context_created = entry["sqlcontext"]
            kind = entry["kind"].lower()
            connection_string = entry["connectionstring"]

            session = self._create_livy_session(connection_string, {"kind": kind},
                                                self._ipython_display, session_id,
                                                sql_context_created)

            # The session is only created here, never started; sessions that
            # are not usable are skipped.
            try:
                # Get status to know if it's alive or not.
                status = session.status
                if session.is_final_status(status):
                    self.logger.error("Skipping serialized session '{}' because session was in status {}."
                                      .format(session.id, status))
                else:
                    self.logger.debug("Adding session {}".format(session_id))
                    restored.append((name, self._create_livy_client(session)))
            except (ValueError, ConnectionError) as e:
                self.logger.error("Skipping serialized session '{}' because {}".format(session.id, str(e)))

        return restored

    def serialize_state(self, name_client_dictionary):
        """Write every client's serialized form, tagged with its name, as one JSON line."""
        self.logger.debug("Serializing state.")

        serialized_clients = []
        for name, client in list(name_client_dictionary.items()):
            record = client.serialize()
            record["name"] = name
            serialized_clients.append(record)

        payload = json.dumps({"clients": serialized_clients})
        self._reader_writer.overwrite_with_line(payload)

    def _create_livy_session(self, connection_string, properties, ipython_display,
                             session_id, sql_context_created):
        return LivySession.from_connection_string(connection_string, properties, ipython_display,
                                                  session_id, sql_context_created)

    def _create_livy_client(self, session):
        return LivyClient(session)
class SparkKernelBase(IPythonKernel):
    """IPython kernel that prepares itself for Spark sessions on startup.

    Unless ``testing=True`` is passed in ``kwargs``, construction loads the
    ``remotespark.kernels`` magics extension, switches the session language
    and, when ``conf.use_auto_viz()`` is truthy, registers the DataFrame
    auto-viz formatter. If one of those steps replies with an error, the
    message is stored as a fatal error and shown again on every later cell
    execution.
    """

    def __init__(self, implementation, implementation_version, language, language_version, language_info,
                 session_language, user_code_parser=None, **kwargs):
        """Store the kernel metadata, initialise the base kernel and run the startup cells.

        Args:
            implementation, implementation_version, language, language_version,
            language_info: kernel metadata attributes, stored as given.
            session_language: language passed to the change-language magic.
            user_code_parser: object exposing ``get_code_to_run(code)``; a
                ``UserCodeParser`` is created when omitted.
            **kwargs: forwarded to the base kernel; ``testing=True`` skips the
                startup cells.
        """
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.session_language = session_language

        super(SparkKernelBase, self).__init__(**kwargs)

        # The logger name is prefixed with the session language. The previous
        # literal had no "{}" placeholder, so format() silently dropped it.
        self.logger = Log("{}_jupyter_kernel".format(self.session_language))
        self._fatal_error = None
        self.ipython_display = IpythonDisplay()

        if user_code_parser is None:
            self.user_code_parser = UserCodeParser()
        else:
            self.user_code_parser = user_code_parser

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if not kwargs.get("testing", False):
            self._load_magics_extension()
            self._change_language()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        """Run a user cell, or re-display the queued fatal error if there is one.

        The work is wrapped with ``wrap_unexpected_exceptions`` and
        ``self._complete_cell`` is handed to it as the fallback.
        """
        def f(self):
            if self._fatal_error is not None:
                return self._repeat_fatal_error()

            return self._do_execute(code, silent, store_history, user_expressions, allow_stdin)
        return wrap_unexpected_exceptions(f, self._complete_cell)(self)

    def do_shutdown(self, restart):
        """Delete the session, then let the base kernel shut down."""
        # Cleanup
        self._delete_session()

        return self._do_shutdown_ipykernel(restart)

    def _do_execute(self, code, silent, store_history, user_expressions, allow_stdin):
        """Translate the user's code through the code parser and execute the result."""
        code_to_run = self.user_code_parser.get_code_to_run(code)

        return self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)

    def _load_magics_extension(self):
        """Load the remotespark kernel magics; an error reply becomes a fatal error."""
        register_magics_code = "%load_ext remotespark.kernels"
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark kernels magics library.")
        self.logger.debug("Loaded magics.")

    def _change_language(self):
        """Run the change-language magic for ``session_language``; an error reply becomes a fatal error."""
        change_language_code = "%%_do_not_call_change_language -l {}\n ".format(self.session_language)
        self._execute_cell(change_language_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to change language to {}.".format(self.session_language))
        self.logger.debug("Changed language.")

    def _register_auto_viz(self):
        """Register ``display_dataframe`` as the display formatter for pandas DataFrames."""
        register_auto_viz_code = """from remotespark.datawidgets.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(register_auto_viz_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to register auto viz for notebook.")
        self.logger.debug("Registered auto viz.")

    def _delete_session(self):
        """Run the delete-session magic."""
        code = "%%_do_not_call_delete_session\n "
        self._execute_cell_for_user(code, True, False)

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False,
                      shutdown_if_error=False, log_if_error=None):
        """Execute ``code`` and optionally escalate an error reply to a fatal error.

        Args:
            shutdown_if_error: when True and the reply status is ``"error"``,
                the failure is escalated (provided ``log_if_error`` is given).
            log_if_error: message prefix used for the fatal error text.

        Returns:
            The reply content of the executed cell, or the result of
            ``_abort_with_fatal_error`` when the failure was escalated.
        """
        reply_content = self._execute_cell_for_user(code, silent, store_history, user_expressions, allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            # The abort lives inside this check so ``message`` is never used unbound.
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(log_if_error, error_from_reply)
                return self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        """Delegate straight to the base kernel's ``do_execute``."""
        return super(SparkKernelBase, self).do_execute(code, silent, store_history, user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        """Delegate straight to the base kernel's ``do_shutdown``."""
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _complete_cell(self):
        """A method that runs a cell with no effect. Call this and return the value it
        returns when there's some sort of error preventing the user's cell from executing; this
        will register the cell from the Jupyter UI as being completed."""
        return self._execute_cell("None", False, True, None, False)

    def _show_user_error(self, message):
        """Log ``message`` as an error and send it to the display."""
        self.logger.error(message)
        self.ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed; does not
        raise an error immediately. We use this for errors that happen on kernel startup,
        since IPython crashes if we throw an exception in the __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        return self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self.logger.error(error)
        self.ipython_display.send_error(error)
        return self._complete_cell()
class SparkKernelBase(IPythonKernel):
    """IPython kernel that prepares itself for Spark sessions on startup.

    Unless ``testing=True`` is passed in ``kwargs``, construction loads the
    ``remotespark.kernels`` magics extension, switches the session language
    and, when ``conf.use_auto_viz()`` is truthy, registers the DataFrame
    auto-viz formatter. If one of those steps replies with an error, the
    message is stored as a fatal error and shown again on every later cell
    execution.
    """

    def __init__(self, implementation, implementation_version, language, language_version, language_info,
                 session_language, user_code_parser=None, **kwargs):
        """Store the kernel metadata, initialise the base kernel and run the startup cells.

        Args:
            implementation, implementation_version, language, language_version,
            language_info: kernel metadata attributes, stored as given.
            session_language: language passed to the change-language magic.
            user_code_parser: object exposing ``get_code_to_run(code)``; a
                ``UserCodeParser`` is created when omitted.
            **kwargs: forwarded to the base kernel; ``testing=True`` skips the
                startup cells.
        """
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.session_language = session_language

        super(SparkKernelBase, self).__init__(**kwargs)

        # The logger name is prefixed with the session language. The previous
        # literal had no "{}" placeholder, so format() silently dropped it.
        self.logger = Log("{}_jupyter_kernel".format(self.session_language))
        self._fatal_error = None
        self.ipython_display = IpythonDisplay()

        if user_code_parser is None:
            self.user_code_parser = UserCodeParser()
        else:
            self.user_code_parser = user_code_parser

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if not kwargs.get("testing", False):
            self._load_magics_extension()
            self._change_language()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        """Run a user cell, or re-display the queued fatal error if there is one.

        The work is wrapped with ``wrap_unexpected_exceptions`` and
        ``self._complete_cell`` is handed to it as the fallback.
        """
        def f(self):
            if self._fatal_error is not None:
                return self._repeat_fatal_error()

            return self._do_execute(code, silent, store_history, user_expressions, allow_stdin)
        return wrap_unexpected_exceptions(f, self._complete_cell)(self)

    def do_shutdown(self, restart):
        """Delete the session, then let the base kernel shut down."""
        # Cleanup
        self._delete_session()

        return self._do_shutdown_ipykernel(restart)

    def _do_execute(self, code, silent, store_history, user_expressions, allow_stdin):
        """Translate the user's code through the code parser and execute the result."""
        code_to_run = self.user_code_parser.get_code_to_run(code)

        return self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)

    def _load_magics_extension(self):
        """Load the remotespark kernel magics; an error reply becomes a fatal error."""
        register_magics_code = "%load_ext remotespark.kernels"
        self._execute_cell(
            register_magics_code,
            True,
            False,
            shutdown_if_error=True,
            log_if_error="Failed to load the Spark kernels magics library.")
        self.logger.debug("Loaded magics.")

    def _change_language(self):
        """Run the change-language magic for ``session_language``; an error reply becomes a fatal error."""
        change_language_code = "%%_do_not_call_change_language -l {}\n ".format(
            self.session_language)
        self._execute_cell(
            change_language_code,
            True,
            False,
            shutdown_if_error=True,
            log_if_error="Failed to change language to {}.".format(
                self.session_language))
        self.logger.debug("Changed language.")

    def _register_auto_viz(self):
        """Register ``display_dataframe`` as the display formatter for pandas DataFrames."""
        register_auto_viz_code = """from remotespark.datawidgets.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(
            register_auto_viz_code,
            True,
            False,
            shutdown_if_error=True,
            log_if_error="Failed to register auto viz for notebook.")
        self.logger.debug("Registered auto viz.")

    def _delete_session(self):
        """Run the delete-session magic."""
        code = "%%_do_not_call_delete_session\n "
        self._execute_cell_for_user(code, True, False)

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False,
                      shutdown_if_error=False, log_if_error=None):
        """Execute ``code`` and optionally escalate an error reply to a fatal error.

        Args:
            shutdown_if_error: when True and the reply status is ``"error"``,
                the failure is escalated (provided ``log_if_error`` is given).
            log_if_error: message prefix used for the fatal error text.

        Returns:
            The reply content of the executed cell, or the result of
            ``_abort_with_fatal_error`` when the failure was escalated.
        """
        reply_content = self._execute_cell_for_user(code, silent, store_history, user_expressions, allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            # The abort lives inside this check so ``message`` is never used unbound.
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(
                    log_if_error, error_from_reply)
                return self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        """Delegate straight to the base kernel's ``do_execute``."""
        return super(SparkKernelBase, self).do_execute(code, silent, store_history, user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        """Delegate straight to the base kernel's ``do_shutdown``."""
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _complete_cell(self):
        """A method that runs a cell with no effect. Call this and return the value it
        returns when there's some sort of error preventing the user's cell from executing; this
        will register the cell from the Jupyter UI as being completed."""
        return self._execute_cell("None", False, True, None, False)

    def _show_user_error(self, message):
        """Log ``message`` as an error and send it to the display."""
        self.logger.error(message)
        self.ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed; does not
        raise an error immediately. We use this for errors that happen on kernel startup,
        since IPython crashes if we throw an exception in the __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        return self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self.logger.error(error)
        self.ipython_display.send_error(error)
        return self._complete_cell()
def __init__(self):
    """Create the "SessionManager" logger and start with an empty session table."""
    self.logger = Log("SessionManager")
    # No sessions are registered yet.
    self._sessions = {}
class ReliableHttpClient(object):
    """Http client that is reliable in its requests. Uses requests library."""

    def __init__(self, endpoint, headers, retry_policy):
        """Store the endpoint, headers and retry policy and read the SSL verification setting."""
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy
        self.logger = Log("ReliableHttpClient")

        ignore_ssl = conf.ignore_ssl_errors()
        self.verify_ssl = not ignore_ssl
        if ignore_ssl:
            self.logger.debug("ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
            requests.packages.urllib3.disable_warnings()

    def compose_url(self, relative_url):
        """Join the endpoint URL and ``relative_url`` with exactly one leading slash and no trailing one."""
        trimmed = relative_url.strip("/")
        return self._endpoint.url + "/{}".format(trimmed)

    def get(self, relative_url, accepted_status_codes):
        """Sends a get request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.get)

    def post(self, relative_url, accepted_status_codes, data):
        """Sends a post request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.post, data)

    def delete(self, relative_url, accepted_status_codes):
        """Sends a delete request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.delete)

    def _send_request(self, relative_url, accepted_status_codes, function, data=None):
        """Resolve the full URL and issue the request, starting with a retry count of zero."""
        full_url = self.compose_url(relative_url)
        return self._send_request_helper(full_url, accepted_status_codes, function, data, 0)

    def _send_request_helper(self, url, accepted_status_codes, function, data, retry_count):
        """Call ``function`` until it yields an accepted status or the retry policy gives up.

        ``auth`` is passed only when the endpoint asks for authentication and
        ``data`` (JSON-encoded) only when a payload was supplied.

        Raises:
            HttpClientException: when the request failed or returned a status
                outside ``accepted_status_codes`` and the policy declines to retry.
        """
        while True:
            request_args = {"headers": self._headers}
            try:
                if self._endpoint.authenticate:
                    request_args["auth"] = (self._endpoint.username, self._endpoint.password)
                if data is not None:
                    request_args["data"] = json.dumps(data)
                request_args["verify"] = self.verify_ssl
                response = function(url, **request_args)
            except requests.exceptions.RequestException as exc:
                failed = True
                response = None
                status = None
                self.logger.error("Request to '{}' failed with '{}'".format(url, exc))
            else:
                failed = False
                status = response.status_code

            if not failed and status in accepted_status_codes:
                return response

            if not self._retry_policy.should_retry(status, failed, retry_count):
                raise HttpClientException("Invalid status code '{}' or error '{}' from {}"
                                          .format(status, failed, url))

            sleep(self._retry_policy.seconds_to_sleep(retry_count))
            retry_count += 1
class ClientManager(object):
    """Livy client manager"""

    def __init__(self, serializer=None):
        """Set up an empty client table and, when a serializer is given, restore state from it.

        With a serializer, the clients it deserializes are added, and a
        serialize timer is started if ``conf.serialize_periodically()`` is truthy.
        """
        has_serializer = serializer is not None
        periodic = conf.serialize_periodically() if has_serializer else False
        period = conf.serialize_period_seconds() if has_serializer else 3

        self.logger = Log("ClientManager")

        self._livy_clients = dict()
        self._serializer = serializer
        self._serialize_timer = None

        if self._serializer is None:
            return

        for restored_name, restored_client in self._serializer.deserialize_state():
            self.add_client(restored_name, restored_client)

        if periodic:
            self._serialize_state_periodically(period)

    def _serialize_state_periodically(self, serialize_period):
        """Start a ``Timer`` that calls ``_serialize_state`` after ``serialize_period`` seconds."""
        self.logger.debug("Starting state serialize timer.")

        timer = Timer(serialize_period, self._serialize_state)
        self._serialize_timer = timer
        timer.start()

    def _serialize_state(self):
        """Hand the current client table to the serializer."""
        self._serializer.serialize_state(self._livy_clients)

    @property
    def livy_clients(self):
        """The name -> client dictionary itself (not a copy)."""
        return self._livy_clients

    def get_sessions_list(self):
        """Return the registered client names as a new list."""
        return list(self._livy_clients.keys())

    def get_sessions_info(self):
        """Return one ``"Name: <name>\\t<client>"`` line per registered client."""
        lines = []
        for key in list(self._livy_clients.keys()):
            lines.append("Name: {}\t{}".format(key, str(self._livy_clients[key])))
        return lines

    def add_client(self, name, livy_client):
        """Register ``livy_client`` under ``name``; raises ValueError if the name is taken."""
        already_present = name in self.get_sessions_list()
        if already_present:
            raise ValueError("Session with name '{}' already exists. Please delete the session"
                             " first if you intend to replace it.".format(name))

        self._livy_clients[name] = livy_client

    def get_any_client(self):
        """Return the only registered client.

        Raises:
            AssertionError: when there are no clients, or more than one.
        """
        count = len(self._livy_clients)
        if count == 0:
            raise AssertionError("You need to have at least 1 client created to execute commands.")
        if count != 1:
            raise AssertionError("Please specify the client to use. Possible sessions are {}".format(
                self.get_sessions_list()))

        only_key = self.get_sessions_list()[0]
        return self._livy_clients[only_key]

    def get_client(self, name):
        """Return the client registered as ``name``; raises ValueError if unknown."""
        if name not in self.get_sessions_list():
            raise ValueError("Could not find '{}' session in list of saved sessions. Possible sessions are {}".format(
                name, self.get_sessions_list()))
        return self._livy_clients[name]

    def get_session_id_for_client(self, name):
        """Return the ``session_id`` of the named client, or None if the name is unknown."""
        if name not in self.get_sessions_list():
            return None
        return self._livy_clients[name].session_id

    def delete_client(self, name):
        """Close and forget the named client; raises ValueError if unknown."""
        self._remove_session(name)

    def clean_up_all(self):
        """Close and forget every client, then serialize state if a serializer is set."""
        for name in self.get_sessions_list():
            self._remove_session(name)

        if self._serializer is not None:
            self._serialize_state()

    def _remove_session(self, name):
        """Close the named client's session and drop it from the table."""
        if name not in self.get_sessions_list():
            raise ValueError("Could not find '{}' session in list of saved sessions. Possible sessions are {}"
                             .format(name, self.get_sessions_list()))

        self._livy_clients[name].close_session()
        del self._livy_clients[name]
class ClientManagerStateSerializer(object):
    """Saves and restores the named Livy clients of a client manager.

    State travels through the supplied reader/writer as a single JSON
    document shaped like ``{"clients": [...]}``.
    """

    def __init__(self, reader_writer):
        """Remember the reader/writer used for all state I/O (must not be None)."""
        assert reader_writer is not None

        self.logger = Log("ClientManagerStateSerializer")
        self._ipython_display = IpythonDisplay()
        self._reader_writer = reader_writer

    def deserialize_state(self):
        """Rebuild clients from the stored state.

        Returns:
            A list of ``(name, client)`` tuples, one for every stored session
            whose current status is not final. Sessions in a final status, or
            whose status lookup raises ``ValueError``/``ConnectionError``, are
            logged and left out.
        """
        self.logger.debug("Deserializing state.")

        stored_text = ''.join(self._reader_writer.read_lines()).strip()
        if stored_text == '':
            self.logger.debug("Empty manager state found.")
            return []

        self.logger.debug("Read content. Converting to JSON.")
        document = json.loads(stored_text)

        usable_clients = []
        for record in document["clients"]:
            # Ignore version for now
            name = record["name"]
            session_id = record["id"]
            sql_context_created = record["sqlcontext"]
            kind = record["kind"].lower()
            connection_string = record["connectionstring"]

            # Only a local session object is built here; the session is not started.
            session = self._create_livy_session(
                connection_string, {"kind": kind}, self._ipython_display,
                session_id, sql_context_created)

            try:
                # The current status tells us whether the session is still usable.
                status = session.status
                if session.is_final_status(status):
                    self.logger.error(
                        "Skipping serialized session '{}' because session was in status {}."
                        .format(session.id, status))
                    continue

                self.logger.debug("Adding session {}".format(session_id))
                client_obj = self._create_livy_client(session)
                usable_clients.append((name, client_obj))
            except (ValueError, ConnectionError) as problem:
                self.logger.error(
                    "Skipping serialized session '{}' because {}".format(
                        session.id, str(problem)))

        return usable_clients

    def serialize_state(self, name_client_dictionary):
        """Write every client in ``name_client_dictionary`` out as one JSON line.

        Each client's ``serialize()`` result is tagged with its name under the
        ``"name"`` key before being written.
        """
        self.logger.debug("Serializing state.")

        records = []
        # Iterate over a snapshot of the keys, as the original code does.
        for client_name in list(name_client_dictionary.keys()):
            record = name_client_dictionary[client_name].serialize()
            record["name"] = client_name
            records.append(record)

        line = json.dumps({"clients": records})
        self._reader_writer.overwrite_with_line(line)

    def _create_livy_session(self, connection_string, properties, ipython_display,
                             session_id, sql_context_created):
        """Build a LivySession from a connection string and the stored session details."""
        return LivySession.from_connection_string(
            connection_string, properties, ipython_display, session_id, sql_context_created)

    def _create_livy_client(self, session):
        """Wrap ``session`` in a LivyClient."""
        return LivyClient(session)
class SparkController(object):
    """Entry point for running commands against named Livy sessions and for
    listing or deleting sessions on an endpoint."""

    def __init__(self, ipython_display):
        """Create the logger and an empty SessionManager; keep the display for new sessions."""
        self.logger = Log("SparkController")
        self.ipython_display = ipython_display
        self.session_manager = SessionManager()

    def get_logs(self, client_name=None):
        """Return the logs of the named session (or of the default one when no name is given)."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_logs()

    def run_command(self, command, client_name=None):
        """Execute ``command`` on the named (or default) session and return its result."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return command.execute(session_to_use)

    def run_sqlquery(self, sqlquery, client_name=None):
        """Execute ``sqlquery`` on the named (or default) session and return its result."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return sqlquery.execute(session_to_use)

    def get_all_sessions_endpoint(self, endpoint):
        """Return session objects, with refreshed status, for every session the endpoint lists."""
        http_client = self._http_client(endpoint)
        sessions = http_client.get_sessions()["sessions"]
        session_list = [
            self._livy_session(http_client, {"kind": s["kind"]}, self.ipython_display, s["id"])
            for s in sessions
        ]
        for s in session_list:
            s.refresh_status()
        return session_list

    def get_all_sessions_endpoint_info(self, endpoint):
        """Return the string form of every session on the endpoint."""
        sessions = self.get_all_sessions_endpoint(endpoint)
        return [str(s) for s in sessions]

    def cleanup(self):
        """Clean up every session held by the session manager."""
        self.session_manager.clean_up_all()

    def cleanup_endpoint(self, endpoint):
        """Delete every session the endpoint lists."""
        for session in self.get_all_sessions_endpoint(endpoint):
            session.delete()

    def delete_session_by_name(self, name):
        """Delete the managed session registered under ``name``."""
        self.session_manager.delete_client(name)

    def delete_session_by_id(self, endpoint, session_id):
        """Delete the session with ``session_id`` on ``endpoint``.

        The session's kind is looked up first, then a session object is built
        with that kind and deleted.
        """
        # One client serves both the lookup and the session object; the
        # previous code built an identical second client for no reason.
        http_client = self._http_client(endpoint)
        response = http_client.get_session(session_id)
        session = self._livy_session(http_client, {"kind": response["kind"]},
                                     self.ipython_display, session_id, False)
        session.delete()

    def add_session(self, name, endpoint, skip_if_exists, properties):
        """Create a session on ``endpoint``, register it under ``name`` and start it.

        When ``skip_if_exists`` is true and ``name`` is already registered,
        nothing is created.
        """
        if skip_if_exists and (name in self.session_manager.get_sessions_list()):
            self.logger.debug("Skipping {} because it already exists in list of sessions.".format(name))
            return
        http_client = self._http_client(endpoint)
        session = self._livy_session(http_client, properties, self.ipython_display)
        self.session_manager.add_session(name, session)
        session.start()

    def get_session_id_for_client(self, name):
        """Return whatever the session manager reports as the session id for ``name``."""
        return self.session_manager.get_session_id_for_client(name)

    def get_client_keys(self):
        """Return the session manager's list of session names."""
        return self.session_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        """Return the session manager's session info."""
        return self.session_manager.get_sessions_info()

    def get_session_by_name_or_default(self, client_name):
        """Return the session for ``client_name`` (lower-cased), or any session when it is None."""
        if client_name is None:
            return self.session_manager.get_any_session()
        else:
            client_name = client_name.lower()
            return self.session_manager.get_session(client_name)

    def get_managed_clients(self):
        """Return the session manager's ``sessions`` attribute."""
        return self.session_manager.sessions

    @staticmethod
    def _livy_session(http_client, properties, ipython_display,
                      session_id=-1, sql_created=None):
        """Build a LivySession; kept as a separate method so it can be replaced."""
        return LivySession(http_client, properties, ipython_display, session_id, sql_created)

    @staticmethod
    def _http_client(endpoint):
        """Build a LivyReliableHttpClient for ``endpoint``."""
        return LivyReliableHttpClient.from_endpoint(endpoint)
def __init__(self, ipython_display):
    """Keep the display object and create the controller's logger and session manager."""
    # Plain attribute store; it does not depend on the objects built below.
    self.ipython_display = ipython_display

    self.logger = Log("SparkController")
    self.session_manager = SessionManager()
def __init__(self, session):
    """Wrap ``session`` and read the execute timeout from configuration."""
    # Plain attribute store; it does not depend on the objects built below.
    self._session = session

    self.logger = Log("LivyClient")
    timeout_seconds = conf.execute_timeout_seconds()
    self._execute_timeout_seconds = timeout_seconds
class SparkController(object):
    """Entry point for running cells against named Livy clients and for
    listing or deleting sessions on an endpoint."""

    def __init__(self, ipython_display, serialize_path=None):
        """Create the client factory and client manager.

        Args:
            ipython_display: display object handed to every session created here.
            serialize_path: when given, the client manager is built with a
                state serializer backed by a FileSystemReaderWriter on this path.
        """
        self.logger = Log("SparkController")
        self.ipython_display = ipython_display
        self.client_factory = LivyClientFactory()

        if serialize_path is not None:
            serializer = ClientManagerStateSerializer(self.client_factory, FileSystemReaderWriter(serialize_path))
            self.client_manager = ClientManager(serializer)
        else:
            self.client_manager = ClientManager()

    def get_logs(self, client_name=None):
        """Return the logs of the named client (or of the default one when no name is given)."""
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.get_logs()

    def run_cell(self, cell, client_name=None):
        """Execute ``cell`` on the named (or default) client and return its result."""
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.execute(cell)

    def run_cell_sql(self, cell, client_name=None):
        """Execute ``cell`` through the client's ``execute_sql`` and return its result."""
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.execute_sql(cell)

    def run_cell_hive(self, cell, client_name=None):
        """Execute ``cell`` through the client's ``execute_hive`` and return its result."""
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.execute_hive(cell)

    def get_all_sessions_endpoint(self, connection_string):
        """Return session objects, with refreshed status, for every session listed at ``/sessions``."""
        http_client = self.client_factory.create_http_client(connection_string)
        r = http_client.get("/sessions", [200])
        sessions = r.json()["sessions"]
        session_list = [self.client_factory.create_session(self.ipython_display, connection_string,
                                                           {"kind": s["kind"]}, s["id"])
                        for s in sessions]
        for s in session_list:
            s._refresh_status()
        return session_list

    def get_all_sessions_endpoint_info(self, connection_string):
        """Return the string form of every session on the endpoint."""
        sessions = self.get_all_sessions_endpoint(connection_string)
        return [str(s) for s in sessions]

    def cleanup(self):
        """Clean up every client held by the client manager."""
        self.client_manager.clean_up_all()

    def cleanup_endpoint(self, connection_string):
        """Delete every session the endpoint lists."""
        for session in self.get_all_sessions_endpoint(connection_string):
            session.delete()

    def delete_session_by_name(self, name):
        """Delete the managed client registered under ``name``."""
        self.client_manager.delete_client(name)

    def delete_session_by_id(self, connection_string, session_id):
        """Delete the session with ``session_id``; a 404 from the lookup is ignored."""
        http_client = self.client_factory.create_http_client(connection_string)
        r = http_client.get("/sessions/{}".format(session_id), [200, 404])
        if r.status_code != 404:
            session = self.client_factory.create_session(self.ipython_display, connection_string,
                                                         {"kind": r.json()["kind"]}, session_id, False)
            session.delete()

    def add_session(self, name, connection_string, skip_if_exists, properties):
        """Start a new session and register a client for it under ``name``.

        When ``skip_if_exists`` is true and ``name`` is already registered,
        nothing is created.

        Raises:
            ValueError: propagated from the client manager when it rejects the
                client; the session that was just started is deleted first so
                it is not left running unmanaged.
        """
        if skip_if_exists and (name in self.client_manager.get_sessions_list()):
            self.logger.debug("Skipping {} because it already exists in list of sessions.".format(name))
            return

        session = self.client_factory.create_session(self.ipython_display, connection_string, properties, "-1", False)
        session.start()
        livy_client = self.client_factory.build_client(session)
        try:
            self.client_manager.add_client(name, livy_client)
        except ValueError:
            # The session is already running remotely but nothing would track
            # it; delete it instead of leaking it, then report the failure.
            session.delete()
            raise
        livy_client.start()

    def get_client_keys(self):
        """Return the client manager's list of session names."""
        return self.client_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        """Return the client manager's session info."""
        return self.client_manager.get_sessions_info()

    def get_client_by_name_or_default(self, client_name):
        """Return the client for ``client_name`` (lower-cased), or any client when it is None."""
        if client_name is None:
            return self.client_manager.get_any_client()
        else:
            client_name = client_name.lower()
            return self.client_manager.get_client(client_name)
class Command(ObjectWithGuid):
    """A piece of code that can be executed as a statement on a Livy session."""

    def __init__(self, code, spark_events=None):
        """Store the dedented ``code`` and the event emitter.

        Args:
            code: source text to run; common leading whitespace is removed.
            spark_events: emitter for statement start/end events; a
                ``SparkEvents`` instance is created when omitted.
        """
        super(Command, self).__init__()
        self.code = textwrap.dedent(code)
        self.logger = Log("Command")
        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

    def __eq__(self, other):
        """Two commands are equal when their code is equal.

        Objects without a ``code`` attribute yield ``NotImplemented`` (so the
        comparison evaluates to False) instead of raising ``AttributeError``.
        """
        try:
            return self.code == other.code
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        """Inverse of ``__eq__``."""
        return not self == other

    def execute(self, session):
        """Run the code as a statement on ``session`` and return its output.

        A start event is emitted first and an end event afterwards, whether
        the statement succeeded or an exception was raised.

        Returns:
            ``(True, text)`` when the statement output status is ``"ok"``,
            ``(False, error_text)`` when it is ``"error"``.

        Raises:
            Any exception raised while waiting, posting or polling is
            re-raised after the failure event has been emitted.
        """
        self._spark_events.emit_statement_execution_start_event(
            session.guid, session.kind, session.id, self.guid)
        # Stays -1 in the end event if the statement never got an id.
        statement_id = -1
        try:
            session.wait_for_idle()
            data = {"code": self.code}
            response = session.http_client.post_statement(session.id, data)
            statement_id = response['id']
            output = self._get_statement_output(session, statement_id)
        except Exception as e:
            self._spark_events.emit_statement_execution_end_event(
                session.guid, session.kind, session.id, self.guid,
                statement_id, False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_statement_execution_end_event(
                session.guid, session.kind, session.id, self.guid,
                statement_id, True, "", "")
            return output

    def _get_statement_output(self, session, statement_id):
        """Poll the statement until it leaves the ``"running"`` state, then decode its output.

        Raises:
            LivyUnexpectedStatusException: when the output status is neither
                ``"ok"`` nor ``"error"``.
        """
        while True:
            statement = session.http_client.get_statement(session.id, statement_id)
            status = statement["state"]

            self.logger.debug("Status of statement {} is {}.".format(statement_id, status))

            if status == "running":
                session.sleep()
                continue

            statement_output = statement["output"]
            output_status = statement_output["status"]
            if output_status == "ok":
                return (True, statement_output["data"]["text/plain"])
            if output_status == "error":
                return (False,
                        statement_output["evalue"] + "\n" + "".join(statement_output["traceback"]))
            raise LivyUnexpectedStatusException(
                "Unknown output status from Livy: '{}'".format(output_status))
def test_log_init():
    """The ``logger`` attribute of a new Log is a standard library ``logging.Logger``."""
    log_wrapper = Log('something')
    wrapped = log_wrapper.logger
    assert isinstance(wrapped, logging.Logger)
def __init__(self, ipython_display):
    """Keep the display object and create the controller's logger and session manager."""
    # Plain attribute store; it does not depend on the objects built below.
    self.ipython_display = ipython_display

    self.logger = Log(u"SparkController")
    self.session_manager = SessionManager()
class LivySession(object):
    """Session that is livy specific."""

    def __init__(self, ipython_display, http_client, session_id, sql_created, properties):
        """Validate the configuration and arguments and set up the local session state.

        Args:
            ipython_display: display object used for user-facing messages.
            http_client: client used for every request; its
                ``connection_string`` is stored in the session state.
            session_id: ``"-1"`` for a session that has not been started yet.
            sql_created: whether the SQL context already exists; forced to
                False when ``session_id`` is ``"-1"``.
            properties: dict posted when the session is started; must contain ``"kind"``.

        Raises:
            ValueError: when ``sql_created`` is True without a session id, or
                when the kind is not in ``Constants.session_kinds_supported``.
        """
        assert "kind" in properties.keys()
        kind = properties["kind"]
        self.properties = properties
        self.ipython_display = ipython_display

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        create_sql_context_timeout_seconds = conf.create_sql_context_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert create_sql_context_timeout_seconds > 0
        if session_id == "-1" and sql_created is True:
            raise ValueError("Cannot indicate sql state without session id.")

        self.logger = Log("LivySession")

        kind = kind.lower()
        if kind not in Constants.session_kinds_supported:
            raise ValueError("Session of kind '{}' not supported. Session must be of kinds {}."
                             .format(kind, ", ".join(Constants.session_kinds_supported)))

        if session_id == "-1":
            self._status = Constants.not_started_session_status
            sql_created = False
        else:
            self._status = Constants.busy_session_status

        self._logs = ""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._create_sql_context_timeout_seconds = create_sql_context_timeout_seconds

        self._state = LivySessionState(session_id, http_client.connection_string, kind, sql_created)

    def __str__(self):
        return "Session id: {}\tKind: {}\tState: {}".format(self.id, self.kind, self._status)

    def get_state(self):
        """Return the LivySessionState object backing this session."""
        return self._state

    def start(self):
        """Start the session against actual livy server."""
        self.logger.debug("Starting '{}' session.".format(self.kind))

        r = self._http_client.post("/sessions", [201], self.properties)
        self._state.session_id = str(r.json()["id"])
        self._status = str(r.json()["state"])

        self.ipython_display.writeln("Creating SparkContext as 'sc'")
        self.logger.debug("Session '{}' started.".format(self.kind))

    def create_sql_context(self):
        """Create a sqlContext object on the session. Object will be accessible via variable 'sqlContext'."""
        if self.started_sql_context:
            return

        self.logger.debug("Starting '{}' sql and hive session.".format(self.kind))

        self.ipython_display.writeln("Creating SqlContext as 'sqlContext'")
        self._create_context(Constants.context_name_sql)

        self.ipython_display.writeln("Creating HiveContext as 'hiveContext'")
        self._create_context(Constants.context_name_hive)

        self._state.sql_context_created = True

    def _create_context(self, context_type):
        """Wait for the session to be idle, then run the creation command for ``context_type``.

        Raises:
            ValueError: for a context type other than the SQL or Hive one.
            LivyClientTimeoutError: when the session does not become idle
                within the configured context-creation timeout.
        """
        if context_type == Constants.context_name_sql:
            command = self._get_sql_context_creation_command()
        elif context_type == Constants.context_name_hive:
            command = self._get_hive_context_creation_command()
        else:
            raise ValueError("Cannot create context of type {}.".format(context_type))

        try:
            self.wait_for_idle(self._create_sql_context_timeout_seconds)
            self.execute(command)
            self.logger.debug("Started '{}' {} session.".format(self.kind, context_type))
        except LivyClientTimeoutError:
            raise LivyClientTimeoutError("Failed to create the {} context in time. Timed out after {} seconds."
                                         .format(context_type, self._create_sql_context_timeout_seconds))

    @property
    def id(self):
        """Session id held in the session state."""
        return self._state.session_id

    @property
    def started_sql_context(self):
        """Whether the session state records the SQL context as created."""
        return self._state.sql_context_created

    @property
    def kind(self):
        """Session kind held in the session state."""
        return self._state.kind

    @property
    def logs(self):
        """Session logs; fetched from the server on every access."""
        self._refresh_logs()
        return self._logs

    @property
    def http_client(self):
        """The HTTP client this session sends its requests through."""
        return self._http_client

    @staticmethod
    def is_final_status(status):
        """Return True when ``status`` is one of ``Constants.final_status``."""
        return status in Constants.final_status

    def execute(self, commands):
        """Post the dedented ``commands`` as a statement and return its output.

        Returns:
            ``(True, text)`` on success or ``(False, error_text)`` on a statement error.
        """
        code = textwrap.dedent(commands)

        data = {"code": code}
        r = self._http_client.post(self._statements_url(), [201], data)
        statement_id = r.json()['id']

        return self._get_statement_output(statement_id)

    def delete(self):
        """Delete the session on the server and mark it dead locally.

        Raises:
            ValueError: when the session has not been started or is already dead.
        """
        self.logger.debug("Deleting session '{}'".format(self.id))

        if self._status != Constants.not_started_session_status and self._status != Constants.dead_session_status:
            self._http_client.delete("/sessions/{}".format(self.id), [200, 404])
            self._status = Constants.dead_session_status
            self._state.session_id = "-1"
        else:
            raise ValueError("Cannot delete session {} that is in state '{}'."
                             .format(self.id, self._status))

    def wait_for_idle(self, seconds_to_wait):
        """Wait for session to go to idle status. Sleep meanwhile. Calls done every status_sleep_seconds as
        indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.

        Raises:
            LivyUnexpectedStatusError: when the session reaches a final status.
            LivyClientTimeoutError: when the time budget runs out first.
        """
        # Poll in a loop rather than recursing once per poll, so a long wait
        # with a short sleep interval cannot exhaust the interpreter's
        # recursion limit.
        while True:
            self._refresh_status()
            current_status = self._status
            if current_status == Constants.idle_session_status:
                return

            if current_status in Constants.final_status:
                error = "Session {} unexpectedly reached final status {}. See logs:\n{}"\
                    .format(self.id, current_status, self.logs)
                self.logger.error(error)
                raise LivyUnexpectedStatusError(error)

            if seconds_to_wait <= 0.0:
                error = "Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, current_status)
                self.logger.error(error)
                raise LivyClientTimeoutError(error)

            start_time = time()
            # Report the interval actually slept, not the remaining budget.
            self.logger.debug("Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, current_status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            seconds_to_wait -= (time() - start_time)

    def _statements_url(self):
        """Relative URL of this session's statements collection."""
        return "/sessions/{}/statements".format(self.id)

    def _refresh_status(self):
        """Fetch the latest status, store it and return it.

        Raises:
            ValueError: when the status is not in ``Constants.possible_session_status``.
        """
        status = self._get_latest_status()

        if status in Constants.possible_session_status:
            self._status = status
        else:
            raise ValueError("Status '{}' not supported by session.".format(status))

        return self._status

    def _refresh_logs(self):
        """Replace the cached logs with the latest ones from the server."""
        self._logs = self._get_latest_logs()

    def _get_latest_status(self):
        """Return the ``state`` field of this session as reported by the server."""
        r = self._http_client.get("/sessions/{}".format(self.id), [200])
        session = r.json()

        return session['state']

    def _get_latest_logs(self):
        """Return the session's log lines joined with newlines."""
        r = self._http_client.get("/sessions/{}/log?from=0".format(self.id), [200])
        log_array = r.json()['log']
        logs = "\n".join(log_array)

        return logs

    def _get_statement_output(self, statement_id):
        """Poll the statements list until ``statement_id`` stops running, then decode its output.

        Raises:
            ValueError: when the output status is neither ``"ok"`` nor ``"error"``.
        """
        statement_running = True
        out = ""
        while statement_running:
            r = self._http_client.get(self._statements_url(), [200])
            # Indexing [0] raises IndexError if the statement is not listed.
            statement = [i for i in r.json()["statements"] if i["id"] == statement_id][0]
            status = statement["state"]

            self.logger.debug("Status of statement {} is {}.".format(statement_id, status))

            if status == "running":
                sleep(self._statement_sleep_seconds)
            else:
                statement_running = False

                statement_output = statement["output"]
                if statement_output["status"] == "ok":
                    out = (True, statement_output["data"]["text/plain"])
                elif statement_output["status"] == "error":
                    out = (False, statement_output["evalue"] + "\n" +
                           "".join(statement_output["traceback"]))
                else:
                    raise ValueError("Unknown output status: '{}'".format(statement_output["status"]))

        return out

    def _get_sql_context_creation_command(self):
        """Return the code that creates ``sqlContext`` for this session's kind."""
        if self.kind == Constants.session_kind_spark:
            sql_context_command = "val sqlContext = new org.apache.spark.sql.SQLContext(sc)\n" \
                                  "import sqlContext.implicits._"
        elif self.kind == Constants.session_kind_pyspark:
            sql_context_command = "from pyspark.sql import SQLContext\nfrom pyspark.sql.types import *\n" \
                                  "sqlContext = SQLContext(sc)"
        elif self.kind == Constants.session_kind_sparkr:
            sql_context_command = "sqlContext <- sparkRSQL.init(sc)"
        else:
            raise ValueError("Do not know how to create sqlContext in session of kind {}.".format(self.kind))

        return sql_context_command

    def _get_hive_context_creation_command(self):
        """Return the code that creates ``hiveContext`` for this session's kind."""
        if self.kind == Constants.session_kind_spark:
            hive_context_command = "val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == Constants.session_kind_pyspark:
            hive_context_command = "from pyspark.sql import HiveContext\nhiveContext = HiveContext(sc)"
        elif self.kind == Constants.session_kind_sparkr:
            hive_context_command = "hiveContext <- sparkRHive.init(sc)"
        else:
            raise ValueError("Do not know how to create hiveContext in session of kind {}.".format(self.kind))

        return hive_context_command
class SparkController(object):
    """Runs commands and queries against Livy sessions and manages their lifecycle.

    Named sessions are tracked by a ``SessionManager``. Sessions that are only
    known by endpoint and id are wrapped in short-lived ``LivySession`` objects
    built through ``_livy_session``.
    """

    def __init__(self, ipython_display):
        """Store the display object and create an empty session manager.

        Parameters:
            ipython_display : display object handed to every session created here.
        """
        self.logger = Log(u"SparkController")
        self.ipython_display = ipython_display
        self.session_manager = SessionManager()

    def get_app_id(self, client_name=None):
        """Return ``get_app_id()`` of the named session (or of any session if no name)."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_app_id()

    def get_driver_log_url(self, client_name=None):
        """Return ``get_driver_log_url()`` of the named session (or of any session if no name)."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_driver_log_url()

    def get_logs(self, client_name=None):
        """Return ``get_logs()`` of the named session (or of any session if no name)."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_logs()

    def get_spark_ui_url(self, client_name=None):
        """Return ``get_spark_ui_url()`` of the named session (or of any session if no name)."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_spark_ui_url()

    def run_command(self, command, client_name=None):
        """Execute ``command`` on the selected session and return its result."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return command.execute(session_to_use)

    def run_sqlquery(self, sqlquery, client_name=None):
        """Execute ``sqlquery`` on the selected session and return its result."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return sqlquery.execute(session_to_use)

    def get_all_sessions_endpoint(self, endpoint):
        """Return session objects, with refreshed status, for every session listed by ``endpoint``."""
        http_client = self._http_client(endpoint)
        sessions = http_client.get_sessions()[u"sessions"]
        session_list = [self._livy_session(http_client, {u"kind": s[u"kind"]}, self.ipython_display, s[u"id"])
                        for s in sessions]
        for s in session_list:
            s.refresh_status()
        return session_list

    def get_all_sessions_endpoint_info(self, endpoint):
        """Return the string form of every session listed by ``endpoint``."""
        sessions = self.get_all_sessions_endpoint(endpoint)
        return [str(s) for s in sessions]

    def cleanup(self):
        """Ask the session manager to clean up every session it tracks."""
        self.session_manager.clean_up_all()

    def cleanup_endpoint(self, endpoint):
        """Delete every session listed by ``endpoint``."""
        for session in self.get_all_sessions_endpoint(endpoint):
            session.delete()

    def delete_session_by_name(self, name):
        """Delete the managed session registered under ``name``."""
        self.session_manager.delete_client(name)

    def delete_session_by_id(self, endpoint, session_id):
        """Delete the session ``session_id`` that lives on ``endpoint``.

        The session kind is read from the endpoint first because a session
        object cannot be built without it.
        """
        # One client serves both the lookup and the session object; building a
        # second client for the same endpoint is redundant work.
        http_client = self._http_client(endpoint)
        response = http_client.get_session(session_id)
        session = self._livy_session(http_client, {u"kind": response[u"kind"]}, self.ipython_display,
                                     session_id, False)
        session.delete()

    def add_session(self, name, endpoint, skip_if_exists, properties):
        """Create a session on ``endpoint``, register it under ``name`` and start it.

        When ``skip_if_exists`` is true and ``name`` is already registered,
        nothing is created.
        """
        if skip_if_exists and (name in self.session_manager.get_sessions_list()):
            self.logger.debug(u"Skipping {} because it already exists in list of sessions.".format(name))
            return
        http_client = self._http_client(endpoint)
        session = self._livy_session(http_client, properties, self.ipython_display)
        # Registered before start() is called, so the manager knows the session
        # even if start() raises.
        self.session_manager.add_session(name, session)
        session.start()

    def get_session_id_for_client(self, name):
        """Return the session manager's session id for ``name``."""
        return self.session_manager.get_session_id_for_client(name)

    def get_client_keys(self):
        """Return the session manager's list of sessions."""
        return self.session_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        """Return the session manager's description of its sessions."""
        return self.session_manager.get_sessions_info()

    def get_session_by_name_or_default(self, client_name):
        """Return the session for ``client_name`` (lower-cased), or any session when it is None."""
        if client_name is None:
            return self.session_manager.get_any_session()
        else:
            client_name = client_name.lower()
            return self.session_manager.get_session(client_name)

    def get_managed_clients(self):
        """Return the session manager's ``sessions`` attribute."""
        return self.session_manager.sessions

    @staticmethod
    def _livy_session(http_client, properties, ipython_display,
                      session_id=-1, sql_created=None):
        """Build a ``LivySession``; kept as a separate hook so it can be overridden."""
        return LivySession(http_client, properties, ipython_display, session_id, sql_created)

    @staticmethod
    def _http_client(endpoint):
        """Build the HTTP client for ``endpoint``; kept as a separate hook so it can be overridden."""
        return LivyReliableHttpClient.from_endpoint(endpoint)
def __init__(self):
    """Create the logger named "EventsHandler" for this instance."""
    handler_logger = Log("EventsHandler")
    self.logger = handler_logger
class SparkKernelBase(IPythonKernel):
    """IPython kernel that turns each cell into Spark magics run by the parent kernel.

    A cell is parsed into a subcommand, a transformer for that subcommand
    produces the code to run plus session actions, and the code is executed
    through the parent kernel's ``do_execute``.
    """

    def __init__(self, implementation, implementation_version, language, language_version, language_info,
                 kernel_conf_name, session_language, client_name, **kwargs):
        """Initialise the kernel and, unless ``testing`` is set in kwargs, load configuration and magics.

        If the configuration is missing, a fatal error is queued and the
        constructor returns early; the error is raised on the next ``do_execute``.
        """
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.kernel_conf_name = kernel_conf_name
        self.session_language = session_language
        self.client_name = client_name

        super(SparkKernelBase, self).__init__(**kwargs)

        self._logger = Log(self.client_name)
        self._session_started = False
        self._fatal_error = None
        self._ipython_display = IpythonDisplay()

        self.user_command_parser = UserCommandParser()

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if not kwargs.get("testing", False):
            configuration = self._get_configuration()
            if not configuration:
                # _get_configuration() sets the error for us so we can just return now.
                # The kernel is not in a good state and all do_execute calls will
                # fail with the fatal error.
                return
            (username, password, url) = configuration
            self.connection_string = get_connection_string(url, username, password)
            self._load_magics_extension()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        """Transform the cell into magics code, run it, and apply the session actions around it.

        Raises:
            ValueError: if a fatal error is queued, or the transformer returns an
                unsupported begin/end action.
        """
        if self._fatal_error is not None:
            self._repeat_fatal_error()

        # Parse command
        subcommand, force, output_var, command = self.user_command_parser.parse_user_command(code)

        # Get transformer
        transformer = self._get_code_transformer(subcommand)

        # Get instructions
        try:
            code_to_run, error_to_show, begin_action, end_action, deletes_session = \
                transformer.get_code_to_execute(self._session_started, self.connection_string,
                                                force, output_var, command)
        except SyntaxError as se:
            self._show_user_error("{}".format(se))
        else:
            # Execute instructions
            if error_to_show is not None:
                self._show_user_error(error_to_show)
                return self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)

            if begin_action == Constants.delete_session_action:
                self._delete_session()
            elif begin_action == Constants.start_session_action:
                self._start_session()
            elif begin_action == Constants.do_nothing_action:
                pass
            else:
                raise ValueError("Begin action {} not supported.".format(begin_action))

            res = self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)

            if end_action == Constants.delete_session_action:
                self._delete_session()
            elif end_action == Constants.start_session_action:
                self._start_session()
            elif end_action == Constants.do_nothing_action:
                pass
            else:
                raise ValueError("End action {} not supported.".format(end_action))

            if deletes_session:
                self._session_started = False

            return res

        # Only reached after a SyntaxError: run an empty cell so a reply is still returned.
        return self._execute_cell("", silent, store_history, user_expressions, allow_stdin)

    def do_shutdown(self, restart):
        """Delete the session (if one was started) and shut the parent kernel down."""
        # Cleanup
        self._delete_session()

        return self._do_shutdown_ipykernel(restart)

    @staticmethod
    def _get_code_transformer(subcommand):
        """Return the transformer for ``subcommand``; unknown subcommands get NotSupportedTransformer."""
        if subcommand == UserCommandParser.run_command:
            return SparkTransformer(subcommand)
        elif subcommand == UserCommandParser.sql_command:
            return SqlTransformer(subcommand)
        elif subcommand == UserCommandParser.hive_command:
            return HiveTransformer(subcommand)
        elif subcommand == UserCommandParser.config_command:
            return ConfigTransformer(subcommand)
        elif subcommand == UserCommandParser.info_command:
            return InfoTransformer(subcommand)
        elif subcommand == UserCommandParser.delete_command:
            return DeleteSessionTransformer(subcommand)
        elif subcommand == UserCommandParser.clean_up_command:
            return CleanUpTransformer(subcommand)
        elif subcommand == UserCommandParser.logs_command:
            return LogsTransformer(subcommand)
        elif subcommand == UserCommandParser.local_command:
            return PythonTransformer(subcommand)
        else:
            return NotSupportedTransformer(subcommand)

    def _load_magics_extension(self):
        """Load the ``remotespark`` extension; an error reply aborts with a fatal error."""
        register_magics_code = "%load_ext remotespark"
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark magics library.")
        self._logger.debug("Loaded magics.")

    def _register_auto_viz(self):
        """Register ``display_dataframe`` as the display formatter for pandas DataFrames."""
        # The snippet is executed as a cell, so each statement must be on its own line.
        register_auto_viz_code = """from remotespark.datawidgets.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(register_auto_viz_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to register auto viz for notebook.")
        self._logger.debug("Registered auto viz.")

    def _start_session(self):
        """Add the session through the ``%spark add`` magic if it has not been started yet."""
        if not self._session_started:
            # Flag is set before the magic runs, so a failure here is not retried.
            self._session_started = True

            add_session_code = "%spark add {} {} {} skip".format(
                self.client_name, self.session_language, self.connection_string)
            self._execute_cell(add_session_code, True, False, shutdown_if_error=True,
                               log_if_error="Failed to create a Livy session.")
            self._logger.debug("Added session.")

    def _delete_session(self):
        """Run ``%spark cleanup`` if a session was started and clear the started flag."""
        if self._session_started:
            code = "%spark cleanup"
            self._execute_cell_for_user(code, True, False)
            self._session_started = False

    def _get_configuration(self):
        """Returns (username, password, url). If there is an error (missing configuration),
        returns False.

        On error a fatal error is queued so that the next ``do_execute`` raises it.
        """
        try:
            credentials = getattr(conf, 'kernel_' + self.kernel_conf_name + '_credentials')()
            ret = (credentials['username'], credentials['password'], credentials['url'])
        except (KeyError, AssertionError):
            ret = None

        # The URL has to be set in the configuration. This is an explicit check
        # rather than an assert so that it still runs under ``python -O``.
        if ret is None or not ret[2]:
            message = "Please set configuration for 'kernel_{}_credentials' to initialize Kernel".format(
                self.kernel_conf_name)
            self._queue_fatal_error(message)
            return False

        return ret

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False,
                      shutdown_if_error=False, log_if_error=None):
        """Execute ``code`` and return the reply.

        When ``shutdown_if_error`` is true, the reply status is "error" and
        ``log_if_error`` is given, a fatal error is queued and raised.
        """
        reply_content = self._execute_cell_for_user(code, silent, store_history, user_expressions, allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(log_if_error, error_from_reply)
                self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        """Delegate straight to the parent kernel's ``do_execute``."""
        return super(SparkKernelBase, self).do_execute(code, silent, store_history, user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        """Delegate straight to the parent kernel's ``do_shutdown``."""
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _show_user_error(self, message):
        """Log ``message`` as an error and send it to the display."""
        self._logger.error(message)
        self._ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed; does not
        raise an error immediately. We use this for errors that happen on kernel startup,
        since IPython crashes if we throw an exception in the __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self._logger.error(error)
        self._ipython_display.send_error(error)
        raise ValueError(self._fatal_error)
class LivySession(ObjectWithGuid):
    """One Livy session: creation, context setup, status polling and deletion.

    Session lifecycle events (creation and deletion, start and end) are
    reported through the ``SparkEvents`` object held by the instance.
    """

    def __init__(self, http_client, properties, ipython_display,
                 session_id=-1, sql_created=None, spark_events=None):
        """Validate the inputs and record the initial state of the session.

        A ``session_id`` of -1 means the session has not been started yet.

        Raises:
            BadUserDataException: if ``sql_created`` is True without a session id,
                or the session kind is not supported.
        """
        super(LivySession, self).__init__()
        assert "kind" in list(properties.keys())
        requested_kind = properties["kind"]
        self.properties = properties
        self.ipython_display = ipython_display

        self._spark_events = SparkEvents() if spark_events is None else spark_events

        status_interval = conf.status_sleep_seconds()
        statement_interval = conf.statement_sleep_seconds()
        idle_timeout = conf.wait_for_idle_timeout_seconds()

        assert status_interval > 0
        assert statement_interval > 0
        assert idle_timeout > 0

        not_started = session_id == -1
        if not_started and sql_created is True:
            raise BadUserDataException("Cannot indicate sql state without session id.")

        self.logger = Log("LivySession")

        normalized_kind = requested_kind.lower()
        if normalized_kind not in constants.SESSION_KINDS_SUPPORTED:
            supported_kinds = ", ".join(constants.SESSION_KINDS_SUPPORTED)
            raise BadUserDataException("Session of kind '{}' not supported. Session must be of kinds {}."
                                       .format(normalized_kind, supported_kinds))

        if not_started:
            self.status = constants.NOT_STARTED_SESSION_STATUS
            sql_created = False
        else:
            self.status = constants.BUSY_SESSION_STATUS

        self._logs = ""
        self._http_client = http_client
        self._status_sleep_seconds = status_interval
        self._statement_sleep_seconds = statement_interval
        self._wait_for_idle_timeout_seconds = idle_timeout

        self.kind = normalized_kind
        self.id = session_id
        self.created_sql_context = sql_created

    def __str__(self):
        template = "Session id: {}\tKind: {}\tState: {}"
        return template.format(self.id, self.kind, self.status)

    def start(self, create_sql_context=True):
        """Post the session to the server, wait until it is idle and optionally create the SQL context.

        A creation-end event is emitted in both the success and the failure
        case; failures are re-raised after the event.
        """
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)

        try:
            response = self._http_client.post_session(self.properties)
            self.id = response["id"]
            self.status = str(response["state"])

            self.ipython_display.writeln("Creating SparkContext as 'sc'")

            # The session gets livy_session_startup_timeout_seconds() to become idle.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                timeout_message = "Session {} did not start up in {} seconds." \
                    .format(self.id, conf.livy_session_startup_timeout_seconds())
                raise LivyClientTimeoutException(timeout_message)

            if create_sql_context:
                self.create_sql_context()
        except Exception as error:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status,
                                                               False, error.__class__.__name__, str(error))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status,
                                                               True, "", "")

    def create_sql_context(self):
        """Create the 'sqlContext' variable on the session, unless it was already created.

        Raises:
            LivyClientTimeoutException: if the creation command times out.
            FailedToCreateSqlContextException: if the creation command reports failure.
        """
        if self.created_sql_context:
            return

        self.logger.debug("Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln("Creating HiveContext as 'sqlContext'")

        creation_command = self._get_sql_context_creation_command()
        try:
            (succeeded, output) = creation_command.execute(self)
        except LivyClientTimeoutException:
            raise LivyClientTimeoutException("Failed to create the SqlContext in time. Timed out after {} seconds."
                                             .format(self._wait_for_idle_timeout_seconds))

        if not succeeded:
            raise FailedToCreateSqlContextException("Failed to create the SqlContext.\nError: '{}'".format(output))
        self.created_sql_context = True

    def get_logs(self):
        """Fetch all session logs, cache them joined by newlines, and return them."""
        log_lines = self._http_client.get_all_session_logs(self.id)['log']
        self._logs = "\n".join(log_lines)
        return self._logs

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        """Tell whether ``status`` is one of the final statuses."""
        return status in constants.FINAL_STATUS

    def delete(self):
        """Delete the session on the server unless it was never started or is already dead.

        In the latter case an error is sent to the display instead. A
        deletion-end event is emitted in every case.
        """
        target_id = self.id
        self._spark_events.emit_session_deletion_start_event(self.guid, self.kind, target_id, self.status)

        try:
            self.logger.debug("Deleting session '{}'".format(target_id))

            undeletable = (constants.NOT_STARTED_SESSION_STATUS, constants.DEAD_SESSION_STATUS)
            if self.status in undeletable:
                self.ipython_display.send_error("Cannot delete session {} that is in state '{}'."
                                                .format(target_id, self.status))
            else:
                self._http_client.delete_session(target_id)
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
        except Exception as error:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, target_id, self.status,
                                                               False, error.__class__.__name__, str(error))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, target_id, self.status,
                                                               True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Poll the session status until it is idle, sleeping status_sleep_seconds between polls.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up;
                defaults to the configured wait-for-idle timeout.

        Raises:
            LivyUnexpectedStatusException: if the session reaches a final status.
            LivyClientTimeoutException: if the time budget runs out.
        """
        remaining = self._wait_for_idle_timeout_seconds if seconds_to_wait is None else seconds_to_wait

        while True:
            self.refresh_status()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = "Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException('{} See logs:\n{}'.format(error, self.get_logs()))

            if remaining <= 0.0:
                error = "Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            began = time()
            self.logger.debug("Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            # Charge the real elapsed time, not the nominal sleep, against the budget.
            remaining -= time() - began

    def sleep(self):
        """Sleep for the statement polling interval."""
        sleep(self._statement_sleep_seconds)

    def refresh_status(self):
        """Read the session state from the server, store it and return it.

        Raises:
            LivyUnexpectedStatusException: if the state is not a known session status.
        """
        latest = self._http_client.get_session(self.id)['state']
        if latest not in constants.POSSIBLE_SESSION_STATUS:
            raise LivyUnexpectedStatusException("Status '{}' not supported by session.".format(latest))
        self.status = latest
        return self.status

    def _get_sql_context_creation_command(self):
        """Return the Command that creates 'sqlContext' for this session's kind."""
        kind = self.kind
        if kind == constants.SESSION_KIND_SPARK:
            return Command("val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)")
        if kind == constants.SESSION_KIND_PYSPARK:
            return Command("from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)")
        if kind == constants.SESSION_KIND_SPARKR:
            return Command("sqlContext <- sparkRHive.init(sc)")
        raise BadUserDataException("Do not know how to create HiveContext in session of kind {}.".format(kind))
class LivySession(object):
    """Session that is livy specific."""

    def __init__(self, http_client, properties, ipython_display, session_id="-1", sql_created=None):
        """Validate the inputs and record the initial state of the session.

        A ``session_id`` of "-1" means the session has not been started yet.

        Raises:
            ValueError: if ``sql_created`` is True without a session id, or the
                session kind is not supported.
        """
        assert "kind" in properties
        kind = properties["kind"]
        self.properties = properties
        self.ipython_display = ipython_display

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        create_sql_context_timeout_seconds = conf.create_sql_context_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert create_sql_context_timeout_seconds > 0
        if session_id == "-1" and sql_created is True:
            raise ValueError("Cannot indicate sql state without session id.")

        self.logger = Log("LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise ValueError("Session of kind '{}' not supported. Session must be of kinds {}."
                             .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        if session_id == "-1":
            self.status = constants.NOT_STARTED_SESSION_STATUS
            sql_created = False
        else:
            self.status = constants.BUSY_SESSION_STATUS

        self._logs = ""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._create_sql_context_timeout_seconds = create_sql_context_timeout_seconds

        self._state = LivySessionState(session_id, self._http_client.connection_string, kind, sql_created)

    @staticmethod
    def from_connection_string(connection_string, properties, ipython_display, session_id="-1", sql_created=None):
        """Build a session whose HTTP client is created from ``connection_string``."""
        http_client = LivyReliableHttpClient.from_connection_string(connection_string)
        return LivySession(http_client, properties, ipython_display, session_id, sql_created)

    def __str__(self):
        return "Session id: {}\tKind: {}\tState: {}".format(self.id, self.kind, self.status)

    def get_state(self):
        """Return the ``LivySessionState`` object backing this session."""
        return self._state

    def start(self):
        """Start the session against actual livy server."""
        self.logger.debug("Starting '{}' session.".format(self.kind))

        r = self._http_client.post("/sessions", [201], self.properties)
        self._state.session_id = str(r.json()["id"])
        self.status = str(r.json()["state"])

        self.ipython_display.writeln("Creating SparkContext as 'sc'")
        self.logger.debug("Session '{}' started.".format(self.kind))

    def create_sql_context(self):
        """Create a sqlContext object on the session. Object will be accessible via variable 'sqlContext'."""
        if self.started_sql_context:
            return

        self.logger.debug("Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln("Creating HiveContext as 'sqlContext'")
        self._create_context(constants.CONTEXT_NAME_SQL)

        self._state.sql_context_created = True

    def _create_context(self, context_type):
        """Wait for the session to be idle, then run the creation command for ``context_type``.

        Raises:
            ValueError: if ``context_type`` is not the SQL context name.
            LivyClientTimeoutError: if the session does not become idle in time.
        """
        if context_type == constants.CONTEXT_NAME_SQL:
            command = self._get_sql_context_creation_command()
        else:
            raise ValueError("Cannot create context of type {}.".format(context_type))

        try:
            self.wait_for_idle(self._create_sql_context_timeout_seconds)
            self.execute(command)
            self.logger.debug("Started '{}' {} session.".format(self.kind, context_type))
        except LivyClientTimeoutError:
            raise LivyClientTimeoutError("Failed to create the {} context in time. Timed out after {} seconds."
                                         .format(context_type, self._create_sql_context_timeout_seconds))

    def get_logs(self):
        """Fetch the session log from the server, cache it joined by newlines, and return it."""
        r = self._http_client.get("/sessions/{}/log?from=0".format(self.id), [200])
        log_array = r.json()['log']
        self._logs = "\n".join(log_array)
        return self._logs

    @property
    def id(self):
        return self._state.session_id

    @property
    def started_sql_context(self):
        return self._state.sql_context_created

    @property
    def kind(self):
        return self._state.kind

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        """Tell whether ``status`` is one of the final statuses."""
        return status in constants.FINAL_STATUS

    def execute(self, commands):
        """Post ``commands`` (dedented) as a statement and return its output.

        Returns:
            The ``(success, text)`` tuple produced by ``_get_statement_output``.
        """
        code = textwrap.dedent(commands)

        data = {"code": code}
        r = self._http_client.post(self._statements_url(), [201], data)
        statement_id = r.json()['id']

        return self._get_statement_output(statement_id)

    def delete(self):
        """Delete the session on the server and mark it dead locally.

        Raises:
            ValueError: if the session was never started or is already dead.
        """
        self.logger.debug("Deleting session '{}'".format(self.id))

        if self.status != constants.NOT_STARTED_SESSION_STATUS and self.status != constants.DEAD_SESSION_STATUS:
            self._http_client.delete("/sessions/{}".format(self.id), [200, 404])
            self.status = constants.DEAD_SESSION_STATUS
            self._state.session_id = "-1"
        else:
            raise ValueError("Cannot delete session {} that is in state '{}'."
                             .format(self.id, self.status))

    def wait_for_idle(self, seconds_to_wait):
        """Wait for session to go to idle status. Sleep meanwhile. Calls done every status_sleep_seconds as
        indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.

        Raises:
            LivyUnexpectedStatusError: if the session reaches a final status.
            LivyClientTimeoutError: if the time budget runs out.
        """
        # Implemented as a loop: one stack frame per poll would exhaust the
        # recursion limit on long waits with a short poll interval.
        while True:
            self._refresh_status()
            current_status = self.status
            if current_status == constants.IDLE_SESSION_STATUS:
                return

            if current_status in constants.FINAL_STATUS:
                error = "Session {} unexpectedly reached final status '{}'. See logs:\n{}"\
                    .format(self.id, current_status, self.get_logs())
                self.logger.error(error)
                raise LivyUnexpectedStatusError(error)

            if seconds_to_wait <= 0.0:
                error = "Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, current_status)
                self.logger.error(error)
                raise LivyClientTimeoutError(error)

            start_time = time()
            # Report the actual sleep duration, not the remaining time budget.
            self.logger.debug("Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, current_status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            seconds_to_wait -= time() - start_time

    def _statements_url(self):
        """Return the statements URL path of this session."""
        return "/sessions/{}/statements".format(self.id)

    def _refresh_status(self):
        """Read the session state from the server, store it and return it.

        Raises:
            ValueError: if the state is not a known session status.
        """
        status = self._http_client.get("/sessions/{}".format(self.id), [200]).json()['state']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
        else:
            raise ValueError("Status '{}' not supported by session.".format(status))

        return self.status

    def _get_statement_output(self, statement_id):
        """Poll the statement list until ``statement_id`` stops running and return its output.

        Returns:
            ``(True, text)`` when the output status is "ok", ``(False, error text)``
            when it is "error".

        Raises:
            ValueError: if the output status is neither "ok" nor "error".
        """
        statement_running = True
        out = ""
        while statement_running:
            r = self._http_client.get(self._statements_url(), [200])
            statement = [i for i in r.json()["statements"] if i["id"] == statement_id][0]
            status = statement["state"]

            self.logger.debug("Status of statement {} is {}.".format(statement_id, status))

            if status == "running":
                sleep(self._statement_sleep_seconds)
            else:
                statement_running = False

                statement_output = statement["output"]
                if statement_output["status"] == "ok":
                    out = (True, statement_output["data"]["text/plain"])
                elif statement_output["status"] == "error":
                    out = (False, statement_output["evalue"] + "\n" +
                           "".join(statement_output["traceback"]))
                else:
                    raise ValueError("Unknown output status: '{}'".format(statement_output["status"]))

        return out

    def _get_sql_context_creation_command(self):
        """Return the code that creates 'sqlContext' for this session's kind.

        Raises:
            ValueError: if the session kind has no known creation command.
        """
        if self.kind == constants.SESSION_KIND_SPARK:
            sql_context_command = "val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_PYSPARK:
            sql_context_command = "from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_SPARKR:
            sql_context_command = "sqlContext <- sparkRHive.init(sc)"
        else:
            raise ValueError("Do not know how to create HiveContext in session of kind {}.".format(self.kind))

        return sql_context_command
class LivySession(ObjectWithGuid):
    """Session that is livy specific."""

    def __init__(self, http_client, properties, ipython_display, session_id=-1, sql_created=None,
                 spark_events=None):
        """Validate the inputs and record the initial state of the session.

        A ``session_id`` of -1 means the session has not been started yet.

        Parameters:
            spark_events : events emitter to use; a new ``SparkEvents`` is
                created when omitted.

        Raises:
            ValueError: if ``sql_created`` is True without a session id, or the
                session kind is not supported.
        """
        super(LivySession, self).__init__()
        assert "kind" in properties
        kind = properties["kind"]
        self.properties = properties
        self.ipython_display = ipython_display

        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert wait_for_idle_timeout_seconds > 0
        if session_id == -1 and sql_created is True:
            raise ValueError("Cannot indicate sql state without session id.")

        self.logger = Log("LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise ValueError("Session of kind '{}' not supported. Session must be of kinds {}."
                             .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
            sql_created = False
        else:
            self.status = constants.BUSY_SESSION_STATUS

        self._logs = ""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

        self.kind = kind
        self.id = session_id
        self.created_sql_context = sql_created

    def __str__(self):
        return "Session id: {}\tKind: {}\tState: {}".format(self.id, self.kind, self.status)

    def start(self, create_sql_context=True):
        """Start the session against actual livy server."""
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)

        r = self._http_client.post_session(self.properties)
        self.id = r["id"]
        self.status = str(r["state"])

        self.ipython_display.writeln("Creating SparkContext as 'sc'")

        # We wait for livy_session_startup_timeout_seconds() for the session to start up.
        try:
            self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
        except LivyClientTimeoutError:
            raise LivyClientTimeoutError("Session {} did not start up in {} seconds."
                                         .format(self.id, conf.livy_session_startup_timeout_seconds()))

        if create_sql_context:
            self.create_sql_context()

        # NOTE(review): the end event is only emitted on success; a failure above
        # leaves the start event unmatched - confirm whether that is intended.
        self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status)

    def create_sql_context(self):
        """Create a sqlContext object on the session. Object will be accessible via variable 'sqlContext'.

        Raises:
            LivyClientTimeoutError: if the creation command times out.
            ValueError: if the creation command reports failure.
        """
        if self.created_sql_context:
            return

        self.logger.debug("Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln("Creating HiveContext as 'sqlContext'")

        command = self._get_sql_context_creation_command()
        try:
            # Command.execute returns a (success, output) tuple.
            (success, out) = command.execute(self)
        except LivyClientTimeoutError:
            raise LivyClientTimeoutError("Failed to create the SqlContext in time. Timed out after {} seconds."
                                         .format(self._wait_for_idle_timeout_seconds))

        # Only mark the context as created when the statement actually succeeded.
        if not success:
            raise ValueError("Failed to create the SqlContext.\nError: '{}'".format(out))
        self.created_sql_context = True

    def get_logs(self):
        """Fetch all session logs, cache them joined by newlines, and return them."""
        log_array = self._http_client.get_all_session_logs(self.id)['log']
        self._logs = "\n".join(log_array)
        return self._logs

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        """Tell whether ``status`` is one of the final statuses."""
        return status in constants.FINAL_STATUS

    def delete(self):
        """Delete the session on the server and mark it dead locally.

        Raises:
            ValueError: if the session was never started or is already dead.
        """
        self.logger.debug("Deleting session '{}'".format(self.id))

        if self.status != constants.NOT_STARTED_SESSION_STATUS and self.status != constants.DEAD_SESSION_STATUS:
            self._http_client.delete_session(self.id)
            self.status = constants.DEAD_SESSION_STATUS
            self.id = -1
        else:
            raise ValueError("Cannot delete session {} that is in state '{}'."
                             .format(self.id, self.status))

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile. Calls done every status_sleep_seconds as
        indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up;
                defaults to the configured wait-for-idle timeout.

        Raises:
            LivyUnexpectedStatusError: if the session reaches a final status.
            LivyClientTimeoutError: if the time budget runs out.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        while True:
            self._refresh_status()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = "Session {} unexpectedly reached final status '{}'. See logs:\n{}"\
                    .format(self.id, self.status, self.get_logs())
                self.logger.error(error)
                raise LivyUnexpectedStatusError(error)

            if seconds_to_wait <= 0.0:
                error = "Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutError(error)

            start_time = time()
            self.logger.debug("Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            # Charge the real elapsed time, not the nominal sleep, against the budget.
            seconds_to_wait -= time() - start_time

    def sleep(self):
        """Sleep for the statement polling interval."""
        sleep(self._statement_sleep_seconds)

    def _refresh_status(self):
        """Read the session state from the server, store it and return it.

        Raises:
            ValueError: if the state is not a known session status.
        """
        status = self._http_client.get_session(self.id)['state']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
        else:
            raise ValueError("Status '{}' not supported by session.".format(status))

        return self.status

    def _get_sql_context_creation_command(self):
        """Return the Command that creates 'sqlContext' for this session's kind.

        Raises:
            ValueError: if the session kind has no known creation command.
        """
        if self.kind == constants.SESSION_KIND_SPARK:
            sql_context_command = "val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_PYSPARK:
            sql_context_command = "from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_SPARKR:
            sql_context_command = "sqlContext <- sparkRHive.init(sc)"
        else:
            raise ValueError("Do not know how to create HiveContext in session of kind {}.".format(self.kind))

        return Command(sql_context_command)