class Command(ObjectWithGuid):
    """A block of code to run against a remote Livy session.

    The code string is dedented on construction. `execute` submits it as a
    Livy statement, polls until completion, and emits telemetry events for
    the start and end of the execution.
    """

    def __init__(self, code, spark_events=None):
        super(Command, self).__init__()
        self.code = textwrap.dedent(code)
        self.logger = Log(u"Command")
        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

    def __eq__(self, other):
        # FIX: comparing against an object without a `code` attribute used to
        # raise AttributeError; defer to the other operand instead, per the
        # Python data model.
        if not isinstance(other, Command):
            return NotImplemented
        return self.code == other.code

    def __ne__(self, other):
        return not self == other

    def execute(self, session):
        """Run this command on `session` and return its (success, text) output.

        Emits a statement-execution start event first; on any exception the
        end event records the exception class and message and the exception
        is re-raised, otherwise the end event records success.
        """
        self._spark_events.emit_statement_execution_start_event(session.guid, session.kind,
                                                                session.id, self.guid)
        # -1 marks "statement never got an id" in the failure telemetry.
        statement_id = -1
        try:
            session.wait_for_idle()
            data = {u"code": self.code}
            response = session.http_client.post_statement(session.id, data)
            statement_id = response[u'id']
            output = self._get_statement_output(session, statement_id)
        except Exception as e:
            self._spark_events.emit_statement_execution_end_event(session.guid, session.kind,
                                                                  session.id, self.guid, statement_id,
                                                                  False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_statement_execution_end_event(session.guid, session.kind,
                                                                  session.id, self.guid, statement_id,
                                                                  True, "", "")
            return output

    def _get_statement_output(self, session, statement_id):
        """Poll the statement until it leaves the 'running' state.

        Returns (True, plain_text) when Livy reports "ok", or
        (False, error_text) when it reports "error"; raises
        LivyUnexpectedStatusException for any other output status.
        """
        statement_running = True
        out = u""
        while statement_running:
            statement = session.http_client.get_statement(session.id, statement_id)
            status = statement[u"state"]
            self.logger.debug(u"Status of statement {} is {}.".format(statement_id, status))
            if status == u"running":
                session.sleep()
            else:
                statement_running = False
                statement_output = statement[u"output"]
                if statement_output[u"status"] == u"ok":
                    out = (True, statement_output[u"data"][u"text/plain"])
                elif statement_output[u"status"] == u"error":
                    out = (False, statement_output[u"evalue"] + u"\n" +
                           u"".join(statement_output[u"traceback"]))
                else:
                    raise LivyUnexpectedStatusException(u"Unknown output status from Livy: '{}'"
                                                        .format(statement_output[u"status"]))
        return out
class Command(ObjectWithGuid):
    """A dedented block of code that can be submitted to a Livy session."""

    def __init__(self, code):
        super(Command, self).__init__()
        self.code = textwrap.dedent(code)
        self.logger = Log("Command")

    def __eq__(self, other):
        return self.code == other.code

    def __ne__(self, other):
        return not self == other

    def execute(self, session):
        """Submit this command to `session` and block until output is ready."""
        session.wait_for_idle()
        posted = session.http_client.post_statement(session.id, {"code": self.code})
        return self._get_statement_output(session, posted['id'])

    def _get_statement_output(self, session, statement_id):
        """Poll `statement_id` until it stops running; return (ok, text)."""
        while True:
            statement = session.http_client.get_statement(session.id, statement_id)
            state = statement["state"]
            self.logger.debug("Status of statement {} is {}.".format(statement_id, state))
            if state == "running":
                session.sleep()
                continue
            result = statement["output"]
            result_status = result["status"]
            if result_status == "ok":
                return (True, result["data"]["text/plain"])
            if result_status == "error":
                return (False, result["evalue"] + "\n" + "".join(result["traceback"]))
            raise ValueError("Unknown output status: '{}'".format(result_status))
class SparkMagicBase(Magics):
    # Base class for the Spark magics: wires up logging, IPython display and
    # the SparkController, optionally enabling state serialization.

    def __init__(self, shell, data=None):
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)

        self.logger = Log("SparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        try:
            # conf.serialize() may raise KeyError when the relevant
            # environment variables are missing; that is handled below.
            should_serialize = conf.serialize()
            if should_serialize:
                self.logger.debug("Serialization enabled.")
                self.magics_home_path = get_magics_home_path()
                path_to_serialize = join_paths(self.magics_home_path, "state.json")
                self.logger.debug("Will serialize to {}.".format(path_to_serialize))
                # Replace the plain controller with one that persists its state.
                self.spark_controller = SparkController(self.ipython_display,
                                                        serialize_path=path_to_serialize)
            else:
                self.logger.debug("Serialization NOT enabled.")
        except KeyError:
            self.logger.error("Could not read env vars for serialization.")

        self.logger.debug("Initialized spark magics.")

    def execute_sqlquery(self, sqlquery, session, output_var, quiet):
        # Run a SQL query against `session`; optionally bind the resulting
        # dataframe to `output_var` in the user namespace. Returns the
        # dataframe, or None when quiet or when the output fails to parse.
        try:
            df = self.spark_controller.run_cell_sql(sqlquery, session)
            if output_var is not None:
                self.shell.user_ns[output_var] = df
            if quiet:
                return None
            else:
                return df
        except DataFrameParseException as e:
            # Show the raw output to the user instead of a Python traceback.
            self.ipython_display.send_error(e.out)
            return None

    @staticmethod
    def print_endpoint_info(info_sessions):
        # Pretty-print the sessions available at an endpoint, one per line.
        sessions_info = ["        {}".format(i) for i in info_sessions]
        print("""Info for endpoint:
    Sessions:
{}
""".format("\n".join(sessions_info)))
class SparkMagicBase(Magics):
    """Shared plumbing for the Spark magics: logging, display, controller,
    and library-loaded telemetry."""

    def __init__(self, shell, data=None, spark_events=None):
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)

        self.logger = Log("SparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)
        self.logger.debug("Initialized spark magics.")

        if spark_events is None:
            spark_events = SparkEvents()
        spark_events.emit_library_loaded_event()

    def execute_sqlquery(self, cell, samplemethod, maxrows, samplefraction,
                         session, output_var, quiet):
        """Build a SQLQuery from the cell, run it, and return the dataframe
        (None when quiet). Binds the result to `output_var` when given."""
        query = self._sqlquery(cell, samplemethod, maxrows, samplefraction)
        df = self.spark_controller.run_sqlquery(query, session)
        if output_var is not None:
            self.shell.user_ns[output_var] = df
        return None if quiet else df

    @staticmethod
    def _sqlquery(cell, samplemethod, maxrows, samplefraction):
        # Factory hook so tests can substitute the query object.
        return SQLQuery(cell, samplemethod, maxrows, samplefraction)

    @staticmethod
    def print_endpoint_info(info_sessions):
        # Render one indented line per session under a header.
        formatted = ["        {}".format(entry) for entry in info_sessions]
        print("""Info for endpoint:
    Sessions:
{}
""".format("\n".join(formatted)))
class SparkMagicBase(Magics):
    # Base class for the Spark magics: wires up logging, IPython display and
    # the SparkController, optionally enabling state serialization.

    def __init__(self, shell, data=None):
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)

        self.logger = Log("SparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        try:
            # conf.serialize() may raise KeyError when the relevant
            # environment variables are missing; that is handled below.
            should_serialize = conf.serialize()
            if should_serialize:
                self.logger.debug("Serialization enabled.")
                self.magics_home_path = get_magics_home_path()
                path_to_serialize = join_paths(self.magics_home_path, "state.json")
                self.logger.debug("Will serialize to {}.".format(path_to_serialize))
                # Replace the plain controller with one that persists its state.
                self.spark_controller = SparkController(self.ipython_display,
                                                        serialize_path=path_to_serialize)
            else:
                self.logger.debug("Serialization NOT enabled.")
        except KeyError:
            self.logger.error("Could not read env vars for serialization.")

        self.logger.debug("Initialized spark magics.")

    def execute_sqlquery(self, sqlquery, session, output_var, quiet):
        # Run a SQL query against `session`; optionally bind the resulting
        # dataframe to `output_var` in the user namespace. Returns the
        # dataframe, or None when quiet or when the output fails to parse.
        try:
            df = self.spark_controller.run_cell_sql(sqlquery, session)
            if output_var is not None:
                self.shell.user_ns[output_var] = df
            if quiet:
                return None
            else:
                return df
        except DataFrameParseException as e:
            # Show the raw output to the user instead of a Python traceback.
            self.ipython_display.send_error(e.out)
            return None

    @staticmethod
    def print_endpoint_info(info_sessions):
        # Pretty-print the sessions available at an endpoint, one per line.
        sessions_info = ["        {}".format(i) for i in info_sessions]
        print("""Info for endpoint:
    Sessions:
{}
""".format("\n".join(sessions_info)))
class ClientManager(object):
    """Livy client manager.

    Tracks Livy clients by session name and, when a serializer is supplied,
    restores previously saved sessions and can persist the current set.
    """

    def __init__(self, serializer=None):
        serialize_periodically = False
        serialize_period = 3
        if serializer is not None:
            serialize_periodically = conf.serialize_periodically()
            serialize_period = conf.serialize_period_seconds()

        self.logger = Log("ClientManager")
        self._livy_clients = dict()
        self._serializer = serializer
        self._serialize_timer = None

        if self._serializer is not None:
            # Restore any previously saved sessions before scheduling saves.
            for (name, client) in self._serializer.deserialize_state():
                self.add_client(name, client)
            if serialize_periodically:
                self._serialize_state_periodically(serialize_period)

    def _serialize_state_periodically(self, serialize_period):
        # NOTE(review): threading.Timer fires exactly once, so despite the
        # name this schedules a single serialization — confirm intent.
        self.logger.debug("Starting state serialize timer.")
        self._serialize_timer = Timer(serialize_period, self._serialize_state)
        self._serialize_timer.start()

    def _serialize_state(self):
        self._serializer.serialize_state(self._livy_clients)

    @property
    def livy_clients(self):
        return self._livy_clients

    def get_sessions_list(self):
        """Names of all managed sessions."""
        return list(self._livy_clients.keys())

    def get_sessions_info(self):
        """Human-readable one-line summary per managed session."""
        return ["Name: {}\t{}".format(name, str(client))
                for name, client in self._livy_clients.items()]

    def add_client(self, name, livy_client):
        """Register `livy_client` under `name`; refuses to overwrite."""
        if name in self.get_sessions_list():
            raise ValueError("Session with name '{}' already exists. Please delete the session"
                             " first if you intend to replace it.".format(name))
        self._livy_clients[name] = livy_client

    def get_any_client(self):
        """Return the single managed client; error if zero or several exist."""
        count = len(self._livy_clients)
        if count == 1:
            only_name = self.get_sessions_list()[0]
            return self._livy_clients[only_name]
        if count == 0:
            raise AssertionError("You need to have at least 1 client created to execute commands.")
        raise AssertionError("Please specify the client to use. Possible sessions are {}".format(
            self.get_sessions_list()))

    def get_client(self, name):
        """Return the client registered under `name`, or raise ValueError."""
        if name in self.get_sessions_list():
            return self._livy_clients[name]
        raise ValueError("Could not find '{}' session in list of saved sessions. Possible sessions are {}".format(
            name, self.get_sessions_list()))

    def get_session_id_for_client(self, name):
        """Session id for `name`, or None when no such client exists."""
        if name in self.get_sessions_list():
            return self._livy_clients[name].session_id
        return None

    def delete_client(self, name):
        self._remove_session(name)

    def clean_up_all(self):
        """Close and remove every session, then persist the (empty) state."""
        for name in self.get_sessions_list():
            self._remove_session(name)
        if self._serializer is not None:
            self._serialize_state()

    def _remove_session(self, name):
        if name not in self.get_sessions_list():
            raise ValueError("Could not find '{}' session in list of saved sessions. Possible sessions are {}"
                             .format(name, self.get_sessions_list()))
        self._livy_clients[name].close_session()
        del self._livy_clients[name]
class SparkKernelBase(IPythonKernel):
    # Base class for the Spark wrapper kernels. Subclasses supply the Jupyter
    # kernel metadata and the language used for the remote Livy session.

    def __init__(self, implementation, implementation_version, language, language_version, language_info,
                 kernel_conf_name, session_language, client_name, **kwargs):
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.kernel_conf_name = kernel_conf_name
        self.session_language = session_language
        self.client_name = client_name

        super(SparkKernelBase, self).__init__(**kwargs)

        self._logger = Log(self.client_name)
        self._session_started = False
        self._fatal_error = None
        self._ipython_display = IpythonDisplay()

        self.user_command_parser = UserCommandParser()

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if not kwargs.get("testing", False):
            configuration = self._get_configuration()
            if not configuration:
                # _get_configuration() sets the error for us so we can just return now.
                # The kernel is not in a good state and all do_execute calls will
                # fail with the fatal error.
                return
            (username, password, url) = configuration
            self.connection_string = get_connection_string(url, username, password)
            self._load_magics_extension()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        # Entry point for every cell. A queued fatal error (e.g. from a bad
        # startup configuration) is raised before the cell is even parsed.
        if self._fatal_error is not None:
            self._repeat_fatal_error()

        # Parse command
        subcommand, force, output_var, command = self.user_command_parser.parse_user_command(code)

        # Get transformer
        transformer = self._get_code_transformer(subcommand)

        # Get instructions
        try:
            code_to_run, error_to_show, begin_action, end_action, deletes_session = \
                transformer.get_code_to_execute(self._session_started, self.connection_string,
                                                force, output_var, command)
        except SyntaxError as se:
            self._show_user_error("{}".format(se))
        else:
            # Execute instructions
            if error_to_show is not None:
                self._show_user_error(error_to_show)
                return self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)

            # Action to take before running the cell (start/delete session).
            if begin_action == Constants.delete_session_action:
                self._delete_session()
            elif begin_action == Constants.start_session_action:
                self._start_session()
            elif begin_action == Constants.do_nothing_action:
                pass
            else:
                raise ValueError("Begin action {} not supported.".format(begin_action))

            res = self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)

            # Action to take after running the cell.
            if end_action == Constants.delete_session_action:
                self._delete_session()
            elif end_action == Constants.start_session_action:
                self._start_session()
            elif end_action == Constants.do_nothing_action:
                pass
            else:
                raise ValueError("End action {} not supported.".format(end_action))

            if deletes_session:
                self._session_started = False

            return res

        # SyntaxError path: run an empty cell so the frontend still gets a reply.
        return self._execute_cell("", silent, store_history, user_expressions, allow_stdin)

    def do_shutdown(self, restart):
        # Cleanup
        self._delete_session()

        return self._do_shutdown_ipykernel(restart)

    @staticmethod
    def _get_code_transformer(subcommand):
        # Map a parsed subcommand to the transformer that produces the code to run.
        if subcommand == UserCommandParser.run_command:
            return SparkTransformer(subcommand)
        elif subcommand == UserCommandParser.sql_command:
            return SqlTransformer(subcommand)
        elif subcommand == UserCommandParser.hive_command:
            return HiveTransformer(subcommand)
        elif subcommand == UserCommandParser.config_command:
            return ConfigTransformer(subcommand)
        elif subcommand == UserCommandParser.info_command:
            return InfoTransformer(subcommand)
        elif subcommand == UserCommandParser.delete_command:
            return DeleteSessionTransformer(subcommand)
        elif subcommand == UserCommandParser.clean_up_command:
            return CleanUpTransformer(subcommand)
        elif subcommand == UserCommandParser.logs_command:
            return LogsTransformer(subcommand)
        elif subcommand == UserCommandParser.local_command:
            return PythonTransformer(subcommand)
        else:
            return NotSupportedTransformer(subcommand)

    def _load_magics_extension(self):
        # Load the %spark magics into the wrapped IPython kernel; a failure
        # here is fatal for the kernel.
        register_magics_code = "%load_ext remotespark"
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark magics library.")
        self._logger.debug("Loaded magics.")

    def _register_auto_viz(self):
        # Register automatic dataframe visualization in the wrapped kernel.
        register_auto_viz_code = """from remotespark.datawidgets.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(register_auto_viz_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to register auto viz for notebook.")
        self._logger.debug("Registered auto viz.")

    def _start_session(self):
        # Idempotent: only the first call creates the Livy session.
        if not self._session_started:
            self._session_started = True

            add_session_code = "%spark add {} {} {} skip".format(
                self.client_name, self.session_language, self.connection_string)
            self._execute_cell(add_session_code, True, False, shutdown_if_error=True,
                               log_if_error="Failed to create a Livy session.")
            self._logger.debug("Added session.")

    def _delete_session(self):
        # Idempotent: only runs cleanup when a session was started.
        if self._session_started:
            code = "%spark cleanup"
            self._execute_cell_for_user(code, True, False)
            self._session_started = False

    def _get_configuration(self):
        """Returns (username, password, url). If there is an error (missing configuration),
        returns False."""
        try:
            credentials = getattr(conf, 'kernel_' + self.kernel_conf_name + '_credentials')()
            ret = (credentials['username'], credentials['password'], credentials['url'])
            # The URL has to be set in the configuration.
            assert(ret[2])
            return ret
        except (KeyError, AssertionError):
            message = "Please set configuration for 'kernel_{}_credentials' to initialize Kernel".format(
                self.kernel_conf_name)
            self._queue_fatal_error(message)
            return False

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False,
                      shutdown_if_error=False, log_if_error=None):
        # Run `code` in the wrapped kernel; optionally escalate an error reply
        # to a fatal kernel error (used for startup-critical cells).
        reply_content = self._execute_cell_for_user(code, silent, store_history, user_expressions, allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(log_if_error, error_from_reply)
                self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        # Delegate straight to the IPython kernel implementation.
        return super(SparkKernelBase, self).do_execute(code, silent, store_history, user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _show_user_error(self, message):
        # Log and surface a user-facing (non-fatal) error.
        self._logger.error(message)
        self._ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed;
        does not raise an error immediately. We use this for errors that happen
        on kernel startup, since IPython crashes if we throw an exception in the
        __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self._logger.error(error)
        self._ipython_display.send_error(error)
        raise ValueError(self._fatal_error)
class SparkController(object):
    """Mediates between the magics/kernels and Livy sessions.

    Session bookkeeping is delegated to a SessionManager; per-endpoint HTTP
    access goes through LivyReliableHttpClient instances.
    """

    def __init__(self, ipython_display):
        self.logger = Log(u"SparkController")
        self.ipython_display = ipython_display
        self.session_manager = SessionManager()

    def get_app_id(self, client_name=None):
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_app_id()

    def get_driver_log_url(self, client_name=None):
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_driver_log_url()

    def get_logs(self, client_name=None):
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_logs()

    def get_spark_ui_url(self, client_name=None):
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_spark_ui_url()

    def run_command(self, command, client_name=None):
        """Execute a Command against the named (or only) session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return command.execute(session_to_use)

    def run_sqlquery(self, sqlquery, client_name=None):
        """Execute a SQLQuery against the named (or only) session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return sqlquery.execute(session_to_use)

    def get_all_sessions_endpoint(self, endpoint):
        """List all sessions at `endpoint`, with refreshed status."""
        http_client = self._http_client(endpoint)
        sessions = http_client.get_sessions()[u"sessions"]
        session_list = [self._livy_session(http_client, {u"kind": s[u"kind"]},
                                           self.ipython_display, s[u"id"])
                        for s in sessions]
        for s in session_list:
            s.refresh_status()
        return session_list

    def get_all_sessions_endpoint_info(self, endpoint):
        sessions = self.get_all_sessions_endpoint(endpoint)
        return [str(s) for s in sessions]

    def cleanup(self):
        self.session_manager.clean_up_all()

    def cleanup_endpoint(self, endpoint):
        for session in self.get_all_sessions_endpoint(endpoint):
            session.delete()

    def delete_session_by_name(self, name):
        self.session_manager.delete_client(name)

    def delete_session_by_id(self, endpoint, session_id):
        """Delete a session at `endpoint` by id (session need not be managed)."""
        http_client = self._http_client(endpoint)
        response = http_client.get_session(session_id)
        # FIX: previously constructed a second, identical http client here;
        # reuse the one we already have.
        session = self._livy_session(http_client, {u"kind": response[u"kind"]},
                                     self.ipython_display, session_id, False)
        session.delete()

    def add_session(self, name, endpoint, skip_if_exists, properties):
        """Create and start a new named session, optionally skipping duplicates."""
        if skip_if_exists and (name in self.session_manager.get_sessions_list()):
            self.logger.debug(u"Skipping {} because it already exists in list of sessions.".format(name))
            return
        http_client = self._http_client(endpoint)
        session = self._livy_session(http_client, properties, self.ipython_display)
        self.session_manager.add_session(name, session)
        session.start()

    def get_session_id_for_client(self, name):
        return self.session_manager.get_session_id_for_client(name)

    def get_client_keys(self):
        return self.session_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        return self.session_manager.get_sessions_info()

    def get_session_by_name_or_default(self, client_name):
        """Resolve `client_name` (case-insensitive) to a session; when None,
        fall back to the single managed session."""
        if client_name is None:
            return self.session_manager.get_any_session()
        else:
            client_name = client_name.lower()
            return self.session_manager.get_session(client_name)

    def get_managed_clients(self):
        return self.session_manager.sessions

    @staticmethod
    def _livy_session(http_client, properties, ipython_display, session_id=-1, sql_created=None):
        # Factory hook; overridable in tests.
        return LivySession(http_client, properties, ipython_display, session_id, sql_created)

    @staticmethod
    def _http_client(endpoint):
        # Factory hook; overridable in tests.
        return LivyReliableHttpClient.from_endpoint(endpoint)
class ClientManagerStateSerializer(object):
    """Livy client manager state serializer"""

    def __init__(self, reader_writer):
        assert reader_writer is not None

        self.logger = Log("ClientManagerStateSerializer")
        self._ipython_display = IpythonDisplay()
        self._reader_writer = reader_writer

    def deserialize_state(self):
        """Read persisted state and rebuild (name, client) pairs for every
        serialized session that is still alive."""
        self.logger.debug("Deserializing state.")

        clients_to_return = []

        raw = ''.join(self._reader_writer.read_lines()).strip()
        if raw == '':
            self.logger.debug("Empty manager state found.")
            return clients_to_return

        self.logger.debug("Read content. Converting to JSON.")
        state = json.loads(raw)
        for client in state["clients"]:
            # Ignore version for now
            name = client["name"]
            session_id = client["id"]
            sql_context_created = client["sqlcontext"]
            kind = client["kind"].lower()
            connection_string = client["connectionstring"]
            session = self._create_livy_session(connection_string, {"kind": kind},
                                                self._ipython_display, session_id,
                                                sql_context_created)
            # Do not start session automatically. Just create it but skip is not existent.
            try:
                # Get status to know if it's alive or not.
                status = session.status
                if not session.is_final_status(status):
                    self.logger.debug("Adding session {}".format(session_id))
                    clients_to_return.append((name, self._create_livy_client(session)))
                else:
                    self.logger.error("Skipping serialized session '{}' because session was in status {}."
                                      .format(session.id, status))
            except (ValueError, ConnectionError) as e:
                self.logger.error("Skipping serialized session '{}' because {}".format(session.id, str(e)))

        return clients_to_return

    def serialize_state(self, name_client_dictionary):
        """Persist every client (plus its name) as one JSON line."""
        self.logger.debug("Serializing state.")

        serialized_clients = []
        for name, client in name_client_dictionary.items():
            record = client.serialize()
            record["name"] = name
            serialized_clients.append(record)

        self._reader_writer.overwrite_with_line(json.dumps({"clients": serialized_clients}))

    def _create_livy_session(self, connection_string, properties, ipython_display, session_id,
                             sql_context_created):
        # Factory hook; overridable in tests.
        return LivySession.from_connection_string(connection_string, properties, ipython_display,
                                                  session_id, sql_context_created)

    def _create_livy_client(self, session):
        # Factory hook; overridable in tests.
        return LivyClient(session)
class RemoteSparkMagics(Magics):
    # IPython magics that manage Livy sessions and run Spark/SQL/Hive code
    # against them via the SparkController.

    def __init__(self, shell, data=None):
        # You must call the parent constructor
        super(RemoteSparkMagics, self).__init__(shell)

        self.logger = Log("RemoteSparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        try:
            # conf.serialize() may raise KeyError when env vars are missing.
            should_serialize = conf.serialize()
            if should_serialize:
                self.logger.debug("Serialization enabled.")
                self.magics_home_path = get_magics_home_path()
                path_to_serialize = join_paths(self.magics_home_path, "state.json")
                self.logger.debug("Will serialize to {}.".format(path_to_serialize))
                # Replace the plain controller with one that persists state.
                self.spark_controller = SparkController(self.ipython_display,
                                                        serialize_path=path_to_serialize)
            else:
                self.logger.debug("Serialization NOT enabled.")
        except KeyError:
            self.logger.error("Could not read env vars for serialization.")

        self.logger.debug("Initialized spark magics.")

    @magic_arguments()
    @argument("-c", "--context", type=str, default=Constants.context_name_spark,
              help="Context to use: '{}' for spark, '{}' for sql queries, and '{}' for hive queries. "
                   "Default is '{}'.".format(Constants.context_name_spark, Constants.context_name_sql,
                                             Constants.context_name_hive, Constants.context_name_spark))
    @argument("-s", "--session", help="The name of the Livy session to use. "
                                      "If only one session has been created, there's no need to specify one.")
    @argument("-o", "--output", type=str, default=None, help="If present, output when using SQL or Hive "
                                                             "query will be stored in variable of this name.")
    @argument("command", type=str, default=[""], nargs="*", help="Commands to execute.")
    @needs_local_scope
    @line_cell_magic
    def spark(self, line, cell="", local_ns=None):
        """Magic to execute spark remotely.

           This magic allows you to create a Livy Scala or Python session against a Livy endpoint.
           Every session can be used to execute either Spark code or SparkSQL code by executing
           against the SQL context in the session. When the SQL context is used, the result will
           be a Pandas dataframe of a sample of the results.

           If invoked with no subcommand, the cell will be executed against the specified session.

           Subcommands
           -----------
           info
               Display the available Livy sessions and other configurations for sessions.
           add
               Add a Livy session. First argument is the name of the session, second argument
               is the language, and third argument is the connection string of the Livy endpoint.
               A fourth argument specifying if session creation can be skipped if it already exists
               is optional: "skip" or empty.
               e.g. `%%spark add test python url=https://sparkcluster.net/livy;username=u;password=p skip`
               or
               e.g. `%%spark add test python url=https://sparkcluster.net/livy;username=u;password=p`
           config
               Override the livy session properties sent to Livy on session creation. All session
               creations will contain these config settings from then on.
               Expected value is a JSON key-value string to be sent as part of the Request Body
               for the POST /sessions endpoint in Livy.
               e.g. `%%spark config {"driverMemory":"1000M", "executorCores":4}`
           run
               Run Spark code against a session.
               e.g. `%%spark -s testsession` will execute the cell code against the testsession
               previously created
               e.g. `%%spark -s testsession -c sql` will execute the SQL code against the
               testsession previously created
               e.g. `%%spark -s testsession -c sql -o my_var` will execute the SQL code against
               the testsession previously created and store the pandas dataframe created in the
               my_var variable in the Python environment.
           logs
               Returns the logs for a given session.
               e.g. `%%spark logs -s testsession` will return the logs for the testsession
               previously created
           delete
               Delete a Livy session. Argument is the name of the session to be deleted.
               e.g. `%%spark delete defaultlivy`
           cleanup
               Delete all Livy sessions created by the notebook. No arguments required.
               e.g. `%%spark cleanup`
        """
        usage = "Please look at usage of %spark by executing `%spark?`."
        user_input = line
        args = parse_argstring(self.spark, user_input)

        subcommand = args.command[0].lower()

        try:
            # info
            if subcommand == "info":
                if len(args.command) == 2:
                    connection_string = args.command[1]
                    info_sessions = self.spark_controller.get_all_sessions_endpoint_info(connection_string)
                    self._print_endpoint_info(info_sessions)
                elif len(args.command) == 1:
                    self._print_local_info()
                else:
                    raise ValueError("Subcommand 'info' requires no value or a connection string to show all sessions.\n"
                                     "{}".format(usage))
            # config
            elif subcommand == "config":
                # Would normally do " ".join(args.command[1:]) but parse_argstring removes quotes...
                rest_of_line = user_input[7:]
                conf.override(conf.session_configs.__name__, json.loads(rest_of_line))
            # add
            elif subcommand == "add":
                if len(args.command) != 4 and len(args.command) != 5:
                    raise ValueError("Subcommand 'add' requires three or four arguments.\n{}".format(usage))
                name = args.command[1].lower()
                language = args.command[2].lower()
                connection_string = args.command[3]
                if len(args.command) == 5:
                    skip = args.command[4].lower() == "skip"
                else:
                    skip = False
                # Start from the configured session properties, then set kind.
                properties = copy.deepcopy(conf.session_configs())
                properties["kind"] = self._get_livy_kind(language)
                self.spark_controller.add_session(name, connection_string, skip, properties)
            # delete
            elif subcommand == "delete":
                if len(args.command) == 2:
                    name = args.command[1].lower()
                    self.spark_controller.delete_session_by_name(name)
                elif len(args.command) == 3:
                    connection_string = args.command[1]
                    session_id = args.command[2]
                    self.spark_controller.delete_session_by_id(connection_string, session_id)
                else:
                    raise ValueError("Subcommand 'delete' requires a session name or a connection string and id.\n{}"
                                     .format(usage))
            # cleanup
            elif subcommand == "cleanup":
                if len(args.command) == 2:
                    connection_string = args.command[1]
                    self.spark_controller.cleanup_endpoint(connection_string)
                elif len(args.command) == 1:
                    self.spark_controller.cleanup()
                else:
                    raise ValueError("Subcommand 'cleanup' requires no further values or a connection string to clean up "
                                     "sessions.\n{}".format(usage))
            # logs
            elif subcommand == "logs":
                if len(args.command) == 1:
                    (success, out) = self.spark_controller.get_logs(args.session)
                    if success:
                        self.ipython_display.write(out)
                    else:
                        self.ipython_display.send_error(out)
                else:
                    raise ValueError("Subcommand 'logs' requires no further values.\n{}".format(usage))
            # run
            elif len(subcommand) == 0:
                if args.context == Constants.context_name_spark:
                    (success, out) = self.spark_controller.run_cell(cell, args.session)
                    if success:
                        self.ipython_display.write(out)
                    else:
                        self.ipython_display.send_error(out)
                elif args.context == Constants.context_name_sql:
                    return self._execute_against_context_that_returns_df(self.spark_controller.run_cell_sql,
                                                                         cell, args.session, args.output)
                elif args.context == Constants.context_name_hive:
                    return self._execute_against_context_that_returns_df(self.spark_controller.run_cell_hive,
                                                                         cell, args.session, args.output)
                else:
                    raise ValueError("Context '{}' not found".format(args.context))
            # error
            else:
                raise ValueError("Subcommand '{}' not found. {}".format(subcommand, usage))
        except ValueError as err:
            # All user-facing errors surface through the display, not a traceback.
            self.ipython_display.send_error("{}".format(err))

    def _execute_against_context_that_returns_df(self, method, cell, session, output_var):
        # Run a SQL/Hive cell; optionally bind the dataframe to `output_var`.
        # Returns the dataframe, or None when the output fails to parse.
        try:
            df = method(cell, session)
            if output_var is not None:
                self.shell.user_ns[output_var] = df
            return df
        except DataFrameParseException as e:
            self.ipython_display.send_error(e.out)
            return None

    def _print_local_info(self):
        # Show sessions managed by this notebook plus the configured session properties.
        sessions_info = ["        {}".format(i) for i in self.spark_controller.get_manager_sessions_str()]
        print("""Info for running Spark:
    Sessions:
{}
    Session configs:
        {}
""".format("\n".join(sessions_info), conf.session_configs()))

    def _print_endpoint_info(self, info_sessions):
        # Show all sessions available at a remote endpoint.
        sessions_info = ["        {}".format(i) for i in info_sessions]
        print("""Info for endpoint:
    Sessions:
{}
""".format("\n".join(sessions_info)))

    @staticmethod
    def _get_livy_kind(language):
        # Translate a user-facing language name into a Livy session kind.
        if language == Constants.lang_scala:
            return Constants.session_kind_spark
        elif language == Constants.lang_python:
            return Constants.session_kind_pyspark
        elif language == Constants.lang_r:
            return Constants.session_kind_sparkr
        else:
            raise ValueError("Cannot get session kind for {}.".format(language))
class ClientManagerStateSerializer(object):
    """Livy client manager state serializer.

    Persists the set of named Livy clients to a reader/writer (e.g. a file)
    as a single JSON line, and restores them on startup, skipping sessions
    that are no longer alive.
    """

    def __init__(self, reader_writer):
        # reader_writer: object exposing read_lines() and overwrite_with_line();
        # required, hence the assertion.
        assert reader_writer is not None
        self.logger = Log("ClientManagerStateSerializer")
        self._ipython_display = IpythonDisplay()
        self._reader_writer = reader_writer

    def deserialize_state(self):
        """Read persisted state and return a list of (name, livy_client) tuples.

        Sessions whose status is final, or that raise ValueError/ConnectionError
        while being queried, are skipped (logged, not re-raised). Returns an
        empty list when the persisted state is empty.
        """
        self.logger.debug("Deserializing state.")
        clients_to_return = []
        lines = self._reader_writer.read_lines()
        # All persisted state is expected on effectively one JSON document.
        line = ''.join(lines).strip()
        if line != '':
            self.logger.debug("Read content. Converting to JSON.")
            json_str = json.loads(line)
            clients = json_str["clients"]
            for client in clients:
                # Ignore version for now
                name = client["name"]
                session_id = client["id"]
                sql_context_created = client["sqlcontext"]
                kind = client["kind"].lower()
                connection_string = client["connectionstring"]
                session = self._create_livy_session(connection_string, {"kind": kind}, self._ipython_display,
                                                    session_id, sql_context_created)
                # Do not start session automatically. Just create it but skip is not existent.
                try:
                    # Get status to know if it's alive or not.
                    status = session.status
                    if not session.is_final_status(status):
                        self.logger.debug("Adding session {}".format(session_id))
                        client_obj = self._create_livy_client(session)
                        clients_to_return.append((name, client_obj))
                    else:
                        self.logger.error("Skipping serialized session '{}' because session was in status {}."
                                          .format(session.id, status))
                except (ValueError, ConnectionError) as e:
                    # Session could not be queried (e.g. endpoint unreachable); skip it.
                    self.logger.error("Skipping serialized session '{}' because {}".format(session.id, str(e)))
        else:
            self.logger.debug("Empty manager state found.")
        return clients_to_return

    def serialize_state(self, name_client_dictionary):
        """Serialize the given {name: livy_client} mapping and overwrite the store."""
        self.logger.debug("Serializing state.")
        serialized_clients = []
        for name in list(name_client_dictionary.keys()):
            client = name_client_dictionary[name]
            serialized_client = client.serialize()
            # The client's own serialization does not include its manager name.
            serialized_client["name"] = name
            serialized_clients.append(serialized_client)
        serialized_str = json.dumps({"clients": serialized_clients})
        self._reader_writer.overwrite_with_line(serialized_str)

    def _create_livy_session(self, connection_string, properties, ipython_display, session_id, sql_context_created):
        # Factory hook; overridable for testing.
        return LivySession.from_connection_string(connection_string, properties, ipython_display, session_id,
                                                  sql_context_created)

    def _create_livy_client(self, session):
        # Factory hook; overridable for testing.
        return LivyClient(session)
class SparkKernelBase(IPythonKernel):
    """Base class for Jupyter kernels that proxy code execution to Spark via Livy.

    Subclasses supply the kernel identity fields and the remote session
    language; this base wires up magics loading, language selection, optional
    auto-visualization, and fatal-error handling around cell execution.
    """

    def __init__(self, implementation, implementation_version, language, language_version, language_info,
                 session_language, user_code_parser=None, **kwargs):
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.session_language = session_language

        super(SparkKernelBase, self).__init__(**kwargs)

        # BUGFIX: the format string previously had no placeholder
        # ("_jupyter_kernel".format(...)), so session_language was silently
        # dropped and every kernel logged under the same name.
        self.logger = Log("{}_jupyter_kernel".format(self.session_language))
        self._fatal_error = None
        self.ipython_display = IpythonDisplay()

        if user_code_parser is None:
            self.user_code_parser = UserCodeParser()
        else:
            self.user_code_parser = user_code_parser

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if not kwargs.get("testing", False):
            self._load_magics_extension()
            self._change_language()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        """Jupyter entry point for cell execution; re-raises a queued fatal error first."""
        def f(self):
            if self._fatal_error is not None:
                return self._repeat_fatal_error()
            return self._do_execute(code, silent, store_history, user_expressions, allow_stdin)
        return wrap_unexpected_exceptions(f, self._complete_cell)(self)

    def do_shutdown(self, restart):
        """Jupyter entry point for shutdown; deletes the remote session first."""
        # Cleanup
        self._delete_session()
        return self._do_shutdown_ipykernel(restart)

    def _do_execute(self, code, silent, store_history, user_expressions, allow_stdin):
        # Let the parser rewrite/augment the user's code before execution.
        code_to_run = self.user_code_parser.get_code_to_run(code)
        res = self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)
        return res

    def _load_magics_extension(self):
        # Load the remotespark magics; a failure here is fatal for the kernel.
        register_magics_code = "%load_ext remotespark.kernels"
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark kernels magics library.")
        self.logger.debug("Loaded magics.")

    def _change_language(self):
        # Tell the magics which remote language this kernel speaks.
        register_magics_code = "%%_do_not_call_change_language -l {}\n ".format(self.session_language)
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to change language to {}.".format(self.session_language))
        self.logger.debug("Changed language.")

    def _register_auto_viz(self):
        # Register a pandas DataFrame display hook so results render as widgets.
        register_auto_viz_code = """from remotespark.datawidgets.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(register_auto_viz_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to register auto viz for notebook.")
        self.logger.debug("Registered auto viz.")

    def _delete_session(self):
        code = "%%_do_not_call_delete_session\n "
        self._execute_cell_for_user(code, True, False)

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False,
                      shutdown_if_error=False, log_if_error=None):
        """Execute a cell; optionally treat an error reply as fatal for the kernel."""
        reply_content = self._execute_cell_for_user(code, silent, store_history, user_expressions, allow_stdin)
        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(log_if_error, error_from_reply)
                return self._abort_with_fatal_error(message)
        return reply_content

    def _execute_cell_for_user(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        # Delegate to the real IPython kernel implementation.
        return super(SparkKernelBase, self).do_execute(code, silent, store_history, user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _complete_cell(self):
        """A method that runs a cell with no effect.

        Call this and return the value it returns when there's some sort of error
        preventing the user's cell from executing; this will register the cell
        from the Jupyter UI as being completed."""
        return self._execute_cell("None", False, True, None, False)

    def _show_user_error(self, message):
        self.logger.error(message)
        self.ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed;
        does not raise an error immediately. We use this for errors that happen
        on kernel startup, since IPython crashes if we throw an exception in the
        __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        return self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self.logger.error(error)
        self.ipython_display.send_error(error)
        return self._complete_cell()
class SparkController(object):
    """Coordinates Livy clients: session creation, cell routing, and cleanup.

    Thin orchestration layer over a ClientManager plus per-session LivyClient
    objects; optionally persists manager state when a serialize path is given.
    """

    def __init__(self, ipython_display, serialize_path=None):
        self.logger = Log("SparkController")
        self.ipython_display = ipython_display
        if serialize_path is None:
            self.client_manager = ClientManager()
        else:
            state_serializer = ClientManagerStateSerializer(
                FileSystemReaderWriter(serialize_path))
            self.client_manager = ClientManager(state_serializer)

    def get_logs(self, client_name=None):
        """Return Livy logs from the named client (or the only one)."""
        return self.get_client_by_name_or_default(client_name).get_logs()

    def run_cell(self, cell, client_name=None):
        """Run a code cell against the named client (or the only one)."""
        return self.get_client_by_name_or_default(client_name).execute(cell)

    def run_cell_sql(self, sqlquery, client_name=None):
        """Run a SQL query against the named client (or the only one)."""
        return self.get_client_by_name_or_default(client_name).execute_sql(sqlquery)

    def get_all_sessions_endpoint(self, connection_string):
        """List every session on the Livy endpoint, with status refreshed."""
        http_client = self._http_client_from_connection_string(connection_string)
        response = http_client.get("/sessions", [200])
        discovered = [
            self._create_livy_session(connection_string,
                                      {"kind": entry["kind"]},
                                      self.ipython_display,
                                      entry["id"])
            for entry in response.json()["sessions"]
        ]
        for sess in discovered:
            sess._refresh_status()
        return discovered

    def get_all_sessions_endpoint_info(self, connection_string):
        """Return the endpoint's sessions rendered as display strings."""
        return [str(sess) for sess in self.get_all_sessions_endpoint(connection_string)]

    def cleanup(self):
        """Delete every managed session."""
        self.client_manager.clean_up_all()

    def cleanup_endpoint(self, connection_string):
        """Delete every session on the given endpoint, managed or not."""
        for sess in self.get_all_sessions_endpoint(connection_string):
            sess.delete()

    def delete_session_by_name(self, name):
        self.client_manager.delete_client(name)

    def delete_session_by_id(self, connection_string, session_id):
        """Delete an endpoint session by id; a 404 (already gone) is a no-op."""
        http_client = self._http_client_from_connection_string(connection_string)
        response = http_client.get("/sessions/{}".format(session_id), [200, 404])
        if response.status_code == 404:
            return
        doomed = self._create_livy_session(connection_string,
                                           {"kind": response.json()["kind"]},
                                           self.ipython_display,
                                           session_id,
                                           False)
        doomed.delete()

    def add_session(self, name, connection_string, skip_if_exists, properties):
        """Create, start, and register a new named session/client pair."""
        if skip_if_exists and (name in self.client_manager.get_sessions_list()):
            self.logger.debug(
                "Skipping {} because it already exists in list of sessions.".format(name))
            return
        session = self._create_livy_session(connection_string, properties, self.ipython_display)
        session.start()
        livy_client = self._create_livy_client(session)
        self.client_manager.add_client(name, livy_client)
        livy_client.start()

    def get_session_id_for_client(self, name):
        return self.client_manager.get_session_id_for_client(name)

    def get_client_keys(self):
        return self.client_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        return self.client_manager.get_sessions_info()

    def get_client_by_name_or_default(self, client_name):
        """Resolve a client by (case-insensitive) name, or the sole client when name is None."""
        if client_name is None:
            return self.client_manager.get_any_client()
        return self.client_manager.get_client(client_name.lower())

    def get_managed_clients(self):
        return self.client_manager.livy_clients

    @staticmethod
    def _create_livy_session(*args, **kwargs):
        # Factory hook; overridable for testing.
        return LivySession.from_connection_string(*args, **kwargs)

    @staticmethod
    def _http_client_from_connection_string(connection_string):
        # Factory hook; overridable for testing.
        return LivyReliableHttpClient.from_connection_string(connection_string)

    @staticmethod
    def _create_livy_client(session):
        # Factory hook; overridable for testing.
        return LivyClient(session)
class Command(ObjectWithGuid): def __init__(self, code, spark_events=None): super(Command, self).__init__() self.code = textwrap.dedent(code) self.logger = Log("Command") if spark_events is None: spark_events = SparkEvents() self._spark_events = spark_events def __eq__(self, other): return self.code == other.code def __ne__(self, other): return not self == other def execute(self, session): self._spark_events.emit_statement_execution_start_event( session.guid, session.kind, session.id, self.guid) statement_id = -1 try: session.wait_for_idle() data = {"code": self.code} response = session.http_client.post_statement(session.id, data) statement_id = response['id'] output = self._get_statement_output(session, statement_id) except Exception as e: self._spark_events.emit_statement_execution_end_event( session.guid, session.kind, session.id, self.guid, statement_id, False, e.__class__.__name__, str(e)) raise else: self._spark_events.emit_statement_execution_end_event( session.guid, session.kind, session.id, self.guid, statement_id, True, "", "") return output def _get_statement_output(self, session, statement_id): statement_running = True out = "" while statement_running: statement = session.http_client.get_statement( session.id, statement_id) status = statement["state"] self.logger.debug("Status of statement {} is {}.".format( statement_id, status)) if status == "running": session.sleep() else: statement_running = False statement_output = statement["output"] if statement_output["status"] == "ok": out = (True, statement_output["data"]["text/plain"]) elif statement_output["status"] == "error": out = (False, statement_output["evalue"] + "\n" + "".join(statement_output["traceback"])) else: raise LivyUnexpectedStatusException( "Unknown output status from Livy: '{}'".format( statement_output["status"])) return out
class LivySession(object):
    """Session that is livy specific.

    Wraps one Livy REST session: tracks its id/kind/status in a
    LivySessionState, and provides start/execute/delete plus idle-waiting.
    """

    def __init__(self, http_client, properties, ipython_display, session_id="-1", sql_created=None):
        # properties must at least carry the session "kind" (language).
        assert "kind" in list(properties.keys())
        kind = properties["kind"]
        self.properties = properties
        self.ipython_display = ipython_display

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        create_sql_context_timeout_seconds = conf.create_sql_context_timeout_seconds()
        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert create_sql_context_timeout_seconds > 0

        # session_id "-1" means "not created on the server yet"; such a
        # session cannot already have a sqlContext.
        if session_id == "-1" and sql_created is True:
            raise ValueError("Cannot indicate sql state without session id.")

        self.logger = Log("LivySession")
        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise ValueError("Session of kind '{}' not supported. Session must be of kinds {}."
                             .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        if session_id == "-1":
            self.status = constants.NOT_STARTED_SESSION_STATUS
            sql_created = False
        else:
            # Existing server-side session: assume busy until refreshed.
            self.status = constants.BUSY_SESSION_STATUS

        self._logs = ""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._create_sql_context_timeout_seconds = create_sql_context_timeout_seconds
        self._state = LivySessionState(session_id, self._http_client.connection_string, kind, sql_created)

    @staticmethod
    def from_connection_string(connection_string, properties, ipython_display, session_id="-1", sql_created=None):
        """Alternate constructor that builds the HTTP client from a connection string."""
        http_client = LivyReliableHttpClient.from_connection_string(connection_string)
        return LivySession(http_client, properties, ipython_display, session_id, sql_created)

    def __str__(self):
        return "Session id: {}\tKind: {}\tState: {}".format(self.id, self.kind, self.status)

    def get_state(self):
        return self._state

    def start(self):
        """Start the session against actual livy server."""
        self.logger.debug("Starting '{}' session.".format(self.kind))
        r = self._http_client.post("/sessions", [201], self.properties)
        self._state.session_id = str(r.json()["id"])
        self.status = str(r.json()["state"])
        self.ipython_display.writeln("Creating SparkContext as 'sc'")
        self.logger.debug("Session '{}' started.".format(self.kind))

    def create_sql_context(self):
        """Create a sqlContext object on the session. Object will be accessible via variable 'sqlContext'."""
        if self.started_sql_context:
            # Idempotent: context already exists.
            return
        self.logger.debug("Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln("Creating HiveContext as 'sqlContext'")
        self._create_context(constants.CONTEXT_NAME_SQL)
        self._state.sql_context_created = True

    def _create_context(self, context_type):
        # Only the SQL context type is supported here.
        if context_type == constants.CONTEXT_NAME_SQL:
            command = self._get_sql_context_creation_command()
        else:
            raise ValueError("Cannot create context of type {}.".format(context_type))
        try:
            self.wait_for_idle(self._create_sql_context_timeout_seconds)
            self.execute(command)
            self.logger.debug("Started '{}' {} session.".format(self.kind, context_type))
        except LivyClientTimeoutError:
            # Re-raise with a message naming the context and the timeout used.
            raise LivyClientTimeoutError("Failed to create the {} context in time. Timed out after {} seconds."
                                         .format(context_type, self._create_sql_context_timeout_seconds))

    def get_logs(self):
        """Fetch and cache the session's full log from the server; return it as one string."""
        r = self._http_client.get("/sessions/{}/log?from=0".format(self.id), [200])
        log_array = r.json()['log']
        self._logs = "\n".join(log_array)
        return self._logs

    @property
    def id(self):
        return self._state.session_id

    @property
    def started_sql_context(self):
        return self._state.sql_context_created

    @property
    def kind(self):
        return self._state.kind

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        # Final statuses are terminal; the session will never leave them.
        return status in constants.FINAL_STATUS

    def execute(self, commands):
        """Post dedented code as a statement and block until its output is ready."""
        code = textwrap.dedent(commands)
        data = {"code": code}
        r = self._http_client.post(self._statements_url(), [201], data)
        statement_id = r.json()['id']
        return self._get_statement_output(statement_id)

    def delete(self):
        """Delete the server-side session; raises if it was never started or is already dead."""
        self.logger.debug("Deleting session '{}'".format(self.id))
        if self.status != constants.NOT_STARTED_SESSION_STATUS and self.status != constants.DEAD_SESSION_STATUS:
            # 404 accepted: the session may already be gone server-side.
            self._http_client.delete("/sessions/{}".format(self.id), [200, 404])
            self.status = constants.DEAD_SESSION_STATUS
            self._state.session_id = "-1"
        else:
            raise ValueError("Cannot delete session {} that is in state '{}'."
                             .format(self.id, self.status))

    def wait_for_idle(self, seconds_to_wait):
        """Wait for session to go to idle status. Sleep meanwhile. Calls done every status_sleep_seconds as
        indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.

        Raises LivyUnexpectedStatusError when a final status is reached, and
        LivyClientTimeoutError when the budget is exhausted. Implemented by
        tail-recursing with the remaining budget after each sleep.
        """
        self._refresh_status()
        current_status = self.status
        if current_status == constants.IDLE_SESSION_STATUS:
            return

        if current_status in constants.FINAL_STATUS:
            error = "Session {} unexpectedly reached final status '{}'. See logs:\n{}"\
                .format(self.id, current_status, self.get_logs())
            self.logger.error(error)
            raise LivyUnexpectedStatusError(error)

        if seconds_to_wait <= 0.0:
            error = "Session {} did not reach idle status in time. Current status is {}."\
                .format(self.id, current_status)
            self.logger.error(error)
            raise LivyClientTimeoutError(error)

        start_time = time()
        self.logger.debug("Session {} in state {}. Sleeping {} seconds."
                          .format(self.id, current_status, seconds_to_wait))
        sleep(self._status_sleep_seconds)
        # Charge the actual elapsed time (sleep + refresh) against the budget.
        elapsed = (time() - start_time)
        return self.wait_for_idle(seconds_to_wait - elapsed)

    def _statements_url(self):
        return "/sessions/{}/statements".format(self.id)

    def _refresh_status(self):
        """Pull the session state from the server; reject unknown statuses."""
        status = self._http_client.get("/sessions/{}".format(self.id), [200]).json()['state']
        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
        else:
            raise ValueError("Status '{}' not supported by session.".format(status))
        return self.status

    def _get_statement_output(self, statement_id):
        """Poll the statements list until this statement leaves 'running'.

        Returns (True, text) on success, (False, error text) on a statement
        error; raises ValueError on an unrecognized output status.
        """
        statement_running = True
        out = ""
        while statement_running:
            r = self._http_client.get(self._statements_url(), [200])
            # The statements endpoint returns all statements; pick ours by id.
            statement = [i for i in r.json()["statements"] if i["id"] == statement_id][0]
            status = statement["state"]
            self.logger.debug("Status of statement {} is {}.".format(statement_id, status))
            if status == "running":
                sleep(self._statement_sleep_seconds)
            else:
                statement_running = False
                statement_output = statement["output"]
                if statement_output["status"] == "ok":
                    out = (True, statement_output["data"]["text/plain"])
                elif statement_output["status"] == "error":
                    out = (False, statement_output["evalue"] + "\n" + "".join(statement_output["traceback"]))
                else:
                    raise ValueError("Unknown output status: '{}'".format(statement_output["status"]))
        return out

    def _get_sql_context_creation_command(self):
        # Per-language snippet that creates a HiveContext bound to 'sqlContext'.
        if self.kind == constants.SESSION_KIND_SPARK:
            sql_context_command = "val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_PYSPARK:
            sql_context_command = "from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_SPARKR:
            sql_context_command = "sqlContext <- sparkRHive.init(sc)"
        else:
            raise ValueError("Do not know how to create HiveContext in session of kind {}.".format(self.kind))
        return sql_context_command
class ReliableHttpClient(object):
    """Http client that is reliable in its requests. Uses requests library.

    Wraps GET/POST/DELETE with a retry policy: failed requests and
    unaccepted status codes are retried until the policy gives up, at which
    point an HttpClientException is raised.
    """

    def __init__(self, endpoint, headers, retry_policy):
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy
        self.logger = Log("ReliableHttpClient")
        # SSL verification is on unless the configuration opts out.
        self.verify_ssl = not conf.ignore_ssl_errors()
        if not self.verify_ssl:
            self.logger.debug(
                "ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks."
            )
            requests.packages.urllib3.disable_warnings()

    def compose_url(self, relative_url):
        """Join the endpoint base URL with a relative path, normalizing slashes."""
        return self._endpoint.url + "/{}".format(relative_url.strip("/"))

    def get(self, relative_url, accepted_status_codes):
        """Sends a get request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.get)

    def post(self, relative_url, accepted_status_codes, data):
        """Sends a post request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.post, data)

    def delete(self, relative_url, accepted_status_codes):
        """Sends a delete request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.delete)

    def _send_request(self, relative_url, accepted_status_codes, function, data=None):
        return self._send_request_helper(
            self.compose_url(relative_url), accepted_status_codes, function, data, 0)

    def _send_request_helper(self, url, accepted_status_codes, function, data, retry_count):
        """Issue the request, retrying per the retry policy until accepted or exhausted."""
        while True:
            # Assemble keyword arguments instead of branching over every
            # auth/data combination.
            call_kwargs = {"headers": self._headers, "verify": self.verify_ssl}
            if self._endpoint.authenticate:
                call_kwargs["auth"] = (self._endpoint.username, self._endpoint.password)
            if data is not None:
                call_kwargs["data"] = json.dumps(data)

            try:
                response = function(url, **call_kwargs)
            except requests.exceptions.RequestException as e:
                failed = True
                response = None
                status = None
                self.logger.error("Request to '{}' failed with '{}'".format(url, e))
            else:
                failed = False
                status = response.status_code

            if failed or status not in accepted_status_codes:
                if self._retry_policy.should_retry(status, failed, retry_count):
                    sleep(self._retry_policy.seconds_to_sleep(retry_count))
                    retry_count += 1
                    continue
                raise HttpClientException(
                    "Invalid status code '{}' or error '{}' from {}".format(status, failed, url))
            return response
class SparkController(object):
    """Coordinates Livy clients via a LivyClientFactory and a ClientManager.

    Routes cell execution (code/SQL/Hive) to a named client, and manages
    session creation, listing, and deletion on a Livy endpoint.
    """

    def __init__(self, ipython_display, serialize_path=None):
        self.logger = Log("SparkController")
        self.ipython_display = ipython_display
        self.client_factory = LivyClientFactory()
        if serialize_path is not None:
            # NOTE: this serializer variant takes the client factory as well.
            serializer = ClientManagerStateSerializer(self.client_factory,
                                                      FileSystemReaderWriter(serialize_path))
            self.client_manager = ClientManager(serializer)
        else:
            self.client_manager = ClientManager()

    def get_logs(self, client_name=None):
        # Delegates to the named client, or the only client when name is None.
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.get_logs()

    def run_cell(self, cell, client_name=None):
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.execute(cell)

    def run_cell_sql(self, cell, client_name=None):
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.execute_sql(cell)

    def run_cell_hive(self, cell, client_name=None):
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.execute_hive(cell)

    def get_all_sessions_endpoint(self, connection_string):
        """List every session known to the Livy endpoint, with refreshed status."""
        http_client = self.client_factory.create_http_client(connection_string)
        r = http_client.get("/sessions", [200])
        sessions = r.json()["sessions"]
        session_list = [self.client_factory.create_session(self.ipython_display, connection_string,
                                                           {"kind": s["kind"]}, s["id"])
                        for s in sessions]
        # Refresh so each session object reflects current server-side state.
        for s in session_list:
            s._refresh_status()
        return session_list

    def get_all_sessions_endpoint_info(self, connection_string):
        """Return the endpoint's sessions rendered as display strings."""
        sessions = self.get_all_sessions_endpoint(connection_string)
        return [str(s) for s in sessions]

    def cleanup(self):
        # Delete every managed session.
        self.client_manager.clean_up_all()

    def cleanup_endpoint(self, connection_string):
        # Delete every session on the endpoint, managed or not.
        for session in self.get_all_sessions_endpoint(connection_string):
            session.delete()

    def delete_session_by_name(self, name):
        self.client_manager.delete_client(name)

    def delete_session_by_id(self, connection_string, session_id):
        """Delete an endpoint session by id; a 404 (already gone) is a no-op."""
        http_client = self.client_factory.create_http_client(connection_string)
        r = http_client.get("/sessions/{}".format(session_id), [200, 404])
        if r.status_code != 404:
            session = self.client_factory.create_session(self.ipython_display, connection_string,
                                                         {"kind": r.json()["kind"]}, session_id, False)
            session.delete()

    def add_session(self, name, connection_string, skip_if_exists, properties):
        """Create, start, and register a new named session/client pair."""
        if skip_if_exists and (name in self.client_manager.get_sessions_list()):
            self.logger.debug("Skipping {} because it already exists in list of sessions.".format(name))
            return
        session = self.client_factory.create_session(self.ipython_display, connection_string, properties, "-1", False)
        session.start()
        livy_client = self.client_factory.build_client(session)
        self.client_manager.add_client(name, livy_client)
        livy_client.start()

    def get_client_keys(self):
        return self.client_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        return self.client_manager.get_sessions_info()

    def get_client_by_name_or_default(self, client_name):
        """Resolve a client by (case-insensitive) name, or the sole client when name is None."""
        if client_name is None:
            return self.client_manager.get_any_client()
        else:
            client_name = client_name.lower()
            return self.client_manager.get_client(client_name)
class LivySession(object):
    """Session that is livy specific.

    Wraps one Livy REST session: tracks id/kind/status in a LivySessionState
    and provides start/execute/delete plus creation of both a SQLContext and
    a HiveContext on the remote side.
    """

    def __init__(self, ipython_display, http_client, session_id, sql_created, properties):
        # properties must at least carry the session "kind" (language).
        assert "kind" in properties.keys()
        kind = properties["kind"]
        self.properties = properties
        self.ipython_display = ipython_display

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        create_sql_context_timeout_seconds = conf.create_sql_context_timeout_seconds()
        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert create_sql_context_timeout_seconds > 0

        # session_id "-1" means "not created on the server yet"; such a
        # session cannot already have a sqlContext.
        if session_id == "-1" and sql_created is True:
            raise ValueError("Cannot indicate sql state without session id.")

        self.logger = Log("LivySession")
        kind = kind.lower()
        if kind not in Constants.session_kinds_supported:
            raise ValueError("Session of kind '{}' not supported. Session must be of kinds {}."
                             .format(kind, ", ".join(Constants.session_kinds_supported)))

        if session_id == "-1":
            self._status = Constants.not_started_session_status
            sql_created = False
        else:
            # Existing server-side session: assume busy until refreshed.
            self._status = Constants.busy_session_status

        self._logs = ""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._create_sql_context_timeout_seconds = create_sql_context_timeout_seconds
        self._state = LivySessionState(session_id, http_client.connection_string, kind, sql_created)

    def __str__(self):
        return "Session id: {}\tKind: {}\tState: {}".format(self.id, self.kind, self._status)

    def get_state(self):
        return self._state

    def start(self):
        """Start the session against actual livy server."""
        self.logger.debug("Starting '{}' session.".format(self.kind))
        r = self._http_client.post("/sessions", [201], self.properties)
        self._state.session_id = str(r.json()["id"])
        self._status = str(r.json()["state"])
        self.ipython_display.writeln("Creating SparkContext as 'sc'")
        self.logger.debug("Session '{}' started.".format(self.kind))

    def create_sql_context(self):
        """Create a sqlContext object on the session. Object will be accessible via variable 'sqlContext'."""
        if self.started_sql_context:
            # Idempotent: contexts already exist.
            return
        self.logger.debug("Starting '{}' sql and hive session.".format(self.kind))
        self.ipython_display.writeln("Creating SqlContext as 'sqlContext'")
        self._create_context(Constants.context_name_sql)
        self.ipython_display.writeln("Creating HiveContext as 'hiveContext'")
        self._create_context(Constants.context_name_hive)
        self._state.sql_context_created = True

    def _create_context(self, context_type):
        # Pick the per-language creation snippet for the requested context.
        if context_type == Constants.context_name_sql:
            command = self._get_sql_context_creation_command()
        elif context_type == Constants.context_name_hive:
            command = self._get_hive_context_creation_command()
        else:
            raise ValueError("Cannot create context of type {}.".format(context_type))
        try:
            self.wait_for_idle(self._create_sql_context_timeout_seconds)
            self.execute(command)
            self.logger.debug("Started '{}' {} session.".format(self.kind, context_type))
        except LivyClientTimeoutError:
            # Re-raise with a message naming the context and the timeout used.
            raise LivyClientTimeoutError("Failed to create the {} context in time. Timed out after {} seconds."
                                         .format(context_type, self._create_sql_context_timeout_seconds))

    @property
    def id(self):
        return self._state.session_id

    @property
    def started_sql_context(self):
        return self._state.sql_context_created

    @property
    def kind(self):
        return self._state.kind

    @property
    def logs(self):
        # Refreshes from the server on every access.
        self._refresh_logs()
        return self._logs

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        # Final statuses are terminal; the session will never leave them.
        return status in Constants.final_status

    def execute(self, commands):
        """Post dedented code as a statement and block until its output is ready."""
        code = textwrap.dedent(commands)
        data = {"code": code}
        r = self._http_client.post(self._statements_url(), [201], data)
        statement_id = r.json()['id']
        return self._get_statement_output(statement_id)

    def delete(self):
        """Delete the server-side session; raises if it was never started or is already dead."""
        self.logger.debug("Deleting session '{}'".format(self.id))
        if self._status != Constants.not_started_session_status and self._status != Constants.dead_session_status:
            # 404 accepted: the session may already be gone server-side.
            self._http_client.delete("/sessions/{}".format(self.id), [200, 404])
            self._status = Constants.dead_session_status
            self._state.session_id = "-1"
        else:
            raise ValueError("Cannot delete session {} that is in state '{}'."
                             .format(self.id, self._status))

    def wait_for_idle(self, seconds_to_wait):
        """Wait for session to go to idle status. Sleep meanwhile. Calls done every status_sleep_seconds as
        indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.

        Raises LivyUnexpectedStatusError when a final status is reached, and
        LivyClientTimeoutError when the budget is exhausted. Implemented by
        tail-recursing with the remaining budget after each sleep.
        """
        self._refresh_status()
        current_status = self._status
        if current_status == Constants.idle_session_status:
            return

        if current_status in Constants.final_status:
            error = "Session {} unexpectedly reached final status {}. See logs:\n{}"\
                .format(self.id, current_status, self.logs)
            self.logger.error(error)
            raise LivyUnexpectedStatusError(error)

        if seconds_to_wait <= 0.0:
            error = "Session {} did not reach idle status in time. Current status is {}."\
                .format(self.id, current_status)
            self.logger.error(error)
            raise LivyClientTimeoutError(error)

        start_time = time()
        self.logger.debug("Session {} in state {}. Sleeping {} seconds."
                          .format(self.id, current_status, seconds_to_wait))
        sleep(self._status_sleep_seconds)
        # Charge the actual elapsed time (sleep + refresh) against the budget.
        elapsed = (time() - start_time)
        return self.wait_for_idle(seconds_to_wait - elapsed)

    def _statements_url(self):
        return "/sessions/{}/statements".format(self.id)

    def _refresh_status(self):
        """Pull the session state from the server; reject unknown statuses."""
        status = self._get_latest_status()
        if status in Constants.possible_session_status:
            self._status = status
        else:
            raise ValueError("Status '{}' not supported by session.".format(status))
        return self._status

    def _refresh_logs(self):
        self._logs = self._get_latest_logs()

    def _get_latest_status(self):
        r = self._http_client.get("/sessions/{}".format(self.id), [200])
        session = r.json()
        return session['state']

    def _get_latest_logs(self):
        r = self._http_client.get("/sessions/{}/log?from=0".format(self.id), [200])
        log_array = r.json()['log']
        logs = "\n".join(log_array)
        return logs

    def _get_statement_output(self, statement_id):
        """Poll the statements list until this statement leaves 'running'.

        Returns (True, text) on success, (False, error text) on a statement
        error; raises ValueError on an unrecognized output status.
        """
        statement_running = True
        out = ""
        while statement_running:
            r = self._http_client.get(self._statements_url(), [200])
            # The statements endpoint returns all statements; pick ours by id.
            statement = [i for i in r.json()["statements"] if i["id"] == statement_id][0]
            status = statement["state"]
            self.logger.debug("Status of statement {} is {}.".format(statement_id, status))
            if status == "running":
                sleep(self._statement_sleep_seconds)
            else:
                statement_running = False
                statement_output = statement["output"]
                if statement_output["status"] == "ok":
                    out = (True, statement_output["data"]["text/plain"])
                elif statement_output["status"] == "error":
                    out = (False, statement_output["evalue"] + "\n" + "".join(statement_output["traceback"]))
                else:
                    raise ValueError("Unknown output status: '{}'".format(statement_output["status"]))
        return out

    def _get_sql_context_creation_command(self):
        # Per-language snippet that creates a SQLContext bound to 'sqlContext'.
        if self.kind == Constants.session_kind_spark:
            sql_context_command = "val sqlContext = new org.apache.spark.sql.SQLContext(sc)\n" \
                                  "import sqlContext.implicits._"
        elif self.kind == Constants.session_kind_pyspark:
            sql_context_command = "from pyspark.sql import SQLContext\nfrom pyspark.sql.types import *\n" \
                                  "sqlContext = SQLContext(sc)"
        elif self.kind == Constants.session_kind_sparkr:
            sql_context_command = "sqlContext <- sparkRSQL.init(sc)"
        else:
            raise ValueError("Do not know how to create sqlContext in session of kind {}.".format(self.kind))
        return sql_context_command

    def _get_hive_context_creation_command(self):
        # Per-language snippet that creates a HiveContext bound to 'hiveContext'.
        if self.kind == Constants.session_kind_spark:
            hive_context_command = "val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == Constants.session_kind_pyspark:
            hive_context_command = "from pyspark.sql import HiveContext\nhiveContext = HiveContext(sc)"
        elif self.kind == Constants.session_kind_sparkr:
            hive_context_command = "hiveContext <- sparkRHive.init(sc)"
        else:
            raise ValueError("Do not know how to create hiveContext in session of kind {}.".format(self.kind))
        return hive_context_command
class ClientManager(object):
    """Livy client manager"""

    def __init__(self, serializer=None):
        periodic = False
        period_seconds = 3
        if serializer is not None:
            periodic = conf.serialize_periodically()
            period_seconds = conf.serialize_period_seconds()

        self.logger = Log("ClientManager")
        self._livy_clients = {}
        self._serializer = serializer
        self._serialize_timer = None

        if self._serializer is not None:
            # Restore previously persisted clients before (optionally) starting the timer.
            for (name, client) in self._serializer.deserialize_state():
                self.add_client(name, client)
            if periodic:
                self._serialize_state_periodically(period_seconds)

    def _serialize_state_periodically(self, serialize_period):
        # Arm a timer that will serialize manager state once the period elapses.
        self.logger.debug("Starting state serialize timer.")
        timer = Timer(serialize_period, self._serialize_state)
        self._serialize_timer = timer
        timer.start()

    def _serialize_state(self):
        self._serializer.serialize_state(self._livy_clients)

    @property
    def livy_clients(self):
        return self._livy_clients

    def get_sessions_list(self):
        """Names of all managed clients."""
        return list(self._livy_clients.keys())

    def get_sessions_info(self):
        """Human-readable one-liners describing each managed client."""
        return ["Name: {}\t{}".format(name, str(client))
                for (name, client) in self._livy_clients.items()]

    def add_client(self, name, livy_client):
        """Register a client under a unique name; duplicates are rejected."""
        if name in self._livy_clients:
            raise ValueError("Session with name '{}' already exists. Please delete the session"
                             " first if you intend to replace it.".format(name))
        self._livy_clients[name] = livy_client

    def get_any_client(self):
        """Return the single managed client; fail when there are zero or several."""
        count = len(self._livy_clients)
        if count == 0:
            raise AssertionError("You need to have at least 1 client created to execute commands.")
        if count > 1:
            raise AssertionError("Please specify the client to use. Possible sessions are {}"
                                 .format(self.get_sessions_list()))
        return next(iter(self._livy_clients.values()))

    def get_client(self, name):
        """Look up a client by name, raising ValueError when unknown."""
        try:
            return self._livy_clients[name]
        except KeyError:
            raise ValueError("Could not find '{}' session in list of saved sessions. Possible sessions are {}"
                             .format(name, self.get_sessions_list()))

    def get_session_id_for_client(self, name):
        """Session id of the named client, or None when the name is unknown."""
        if name in self._livy_clients:
            return self._livy_clients[name].session_id
        return None

    def delete_client(self, name):
        self._remove_session(name)

    def clean_up_all(self):
        """Close and drop every managed session, then persist the (empty) state."""
        for name in self.get_sessions_list():
            self._remove_session(name)
        if self._serializer is not None:
            self._serialize_state()

    def _remove_session(self, name):
        # Close the remote session before forgetting the client.
        if name not in self._livy_clients:
            raise ValueError("Could not find '{}' session in list of saved sessions. Possible sessions are {}"
                             .format(name, self.get_sessions_list()))
        self._livy_clients[name].close_session()
        del self._livy_clients[name]
class LivySession(ObjectWithGuid):
    """Session that is livy specific."""

    def __init__(self, http_client, properties, ipython_display, session_id=-1, sql_created=None):
        """Wrap a Livy session.

        Parameters:
            http_client : client used for all Livy REST calls.
            properties : dict sent to Livy on creation; must contain "kind".
            ipython_display : sink for user-facing messages.
            session_id : id of an existing session, or -1 to create a new one.
            sql_created : whether a sqlContext already exists on an existing session.
        """
        super(LivySession, self).__init__()
        assert "kind" in list(properties.keys())
        kind = properties["kind"]
        self.properties = properties
        self.ipython_display = ipython_display
        self._spark_events = SparkEvents()

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert wait_for_idle_timeout_seconds > 0

        # sql_created only makes sense when attaching to an existing session.
        if session_id == -1 and sql_created is True:
            raise ValueError("Cannot indicate sql state without session id.")

        self.logger = Log("LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise ValueError("Session of kind '{}' not supported. Session must be of kinds {}."
                             .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
            sql_created = False
        else:
            self.status = constants.BUSY_SESSION_STATUS

        self._logs = ""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

        self.kind = kind
        self.id = session_id
        self.created_sql_context = sql_created

    def __str__(self):
        return "Session id: {}\tKind: {}\tState: {}".format(self.id, self.kind, self.status)

    def start(self, create_sql_context=True):
        """Start the session against actual livy server."""
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)

        r = self._http_client.post_session(self.properties)
        self.id = r["id"]
        self.status = str(r["state"])

        self.ipython_display.writeln("Creating SparkContext as 'sc'")

        # We wait for livy_session_startup_timeout_seconds() for the session to start up.
        try:
            self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
        except LivyClientTimeoutError:
            raise LivyClientTimeoutError("Session {} did not start up in {} seconds."
                                         .format(self.id, conf.livy_session_startup_timeout_seconds()))

        if create_sql_context:
            self.create_sql_context()

        self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status)

    def create_sql_context(self):
        """Create a sqlContext object on the session. Object will be accessible via variable 'sqlContext'."""
        if self.created_sql_context:
            return
        self.logger.debug("Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln("Creating HiveContext as 'sqlContext'")
        command = self._get_sql_context_creation_command()
        try:
            # FIX: Command.execute returns a (success, output) tuple; the result was
            # previously discarded, so a failed statement still marked the sqlContext
            # as created. Check it and surface the failure instead.
            (success, out) = command.execute(self)
        except LivyClientTimeoutError:
            raise LivyClientTimeoutError("Failed to create the SqlContext in time. Timed out after {} seconds."
                                         .format(self._wait_for_idle_timeout_seconds))
        if success:
            self.created_sql_context = True
        else:
            raise ValueError("Failed to create the SqlContext.\nError: '{}'".format(out))

    def get_logs(self):
        """Fetch and cache the full session log from the server."""
        log_array = self._http_client.get_all_session_logs(self.id)['log']
        self._logs = "\n".join(log_array)
        return self._logs

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        # True when the status is one of the terminal session states.
        return status in constants.FINAL_STATUS

    def delete(self):
        """Delete the session on the server unless it never started or is already dead."""
        self.logger.debug("Deleting session '{}'".format(self.id))

        if self.status != constants.NOT_STARTED_SESSION_STATUS and self.status != constants.DEAD_SESSION_STATUS:
            self._http_client.delete_session(self.id)
            self.status = constants.DEAD_SESSION_STATUS
            self.id = -1
        else:
            raise ValueError("Cannot delete session {} that is in state '{}'."
                             .format(self.id, self.status))

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile. Calls done every status_sleep_seconds as
        indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        while True:
            self._refresh_status()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = "Session {} unexpectedly reached final status '{}'. See logs:\n{}"\
                    .format(self.id, self.status, self.get_logs())
                self.logger.error(error)
                raise LivyUnexpectedStatusError(error)

            if seconds_to_wait <= 0.0:
                error = "Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutError(error)

            start_time = time()
            self.logger.debug("Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            seconds_to_wait -= time() - start_time

    def sleep(self):
        sleep(self._statement_sleep_seconds)

    def _refresh_status(self):
        # Pull the current state from the server; reject values not in the known set.
        status = self._http_client.get_session(self.id)['state']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
        else:
            raise ValueError("Status '{}' not supported by session.".format(status))

        return self.status

    def _get_sql_context_creation_command(self):
        # Code snippet (in the session's own language) that creates a HiveContext
        # bound to the name 'sqlContext'.
        if self.kind == constants.SESSION_KIND_SPARK:
            sql_context_command = "val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_PYSPARK:
            sql_context_command = "from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_SPARKR:
            sql_context_command = "sqlContext <- sparkRHive.init(sc)"
        else:
            raise ValueError("Do not know how to create HiveContext in session of kind {}.".format(self.kind))

        return Command(sql_context_command)
class SparkKernelBase(IPythonKernel):
    """Base class for the Spark Jupyter kernels; wires magics, language setup and fatal-error handling."""

    def __init__(self, implementation, implementation_version, language, language_version, language_info,
                 session_language, user_code_parser=None, **kwargs):
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.session_language = session_language

        super(SparkKernelBase, self).__init__(**kwargs)

        # FIX: this used to be Log("_jupyter_kernel".format(self.session_language)) —
        # a no-op .format on a string with no placeholders (presumably the intent was
        # to embed the language in the logger name). The dead call is removed; the
        # resulting logger name is unchanged.
        self.logger = Log("_jupyter_kernel")
        self._fatal_error = None
        self.ipython_display = IpythonDisplay()

        if user_code_parser is None:
            self.user_code_parser = UserCodeParser()
        else:
            self.user_code_parser = user_code_parser

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if not kwargs.get("testing", False):
            self._load_magics_extension()
            self._change_language()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        """Jupyter entry point; replays a queued fatal error or runs the cell."""
        def f(self):
            if self._fatal_error is not None:
                return self._repeat_fatal_error()
            return self._do_execute(code, silent, store_history, user_expressions, allow_stdin)
        # Any unexpected exception completes the cell instead of crashing the kernel.
        return wrap_unexpected_exceptions(f, self._complete_cell)(self)

    def do_shutdown(self, restart):
        # Cleanup: tear down the remote session before shutting down the ipykernel.
        self._delete_session()
        return self._do_shutdown_ipykernel(restart)

    def _do_execute(self, code, silent, store_history, user_expressions, allow_stdin):
        # Let the parser rewrite user code (e.g. inject magics) before execution.
        code_to_run = self.user_code_parser.get_code_to_run(code)
        res = self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)
        return res

    def _load_magics_extension(self):
        """Load the Spark magics library; a failure here is fatal for the kernel."""
        register_magics_code = "%load_ext remotespark.kernels"
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark kernels magics library.")
        self.logger.debug("Loaded magics.")

    def _change_language(self):
        """Switch the magics to this kernel's session language; fatal on failure."""
        register_magics_code = "%%_do_not_call_change_language -l {}\n ".format(self.session_language)
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to change language to {}.".format(self.session_language))
        self.logger.debug("Changed language.")

    def _register_auto_viz(self):
        """Install the automatic DataFrame visualization hook; fatal on failure."""
        register_auto_viz_code = """from remotespark.datawidgets.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(register_auto_viz_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to register auto viz for notebook.")
        self.logger.debug("Registered auto viz.")

    def _delete_session(self):
        # Best-effort deletion of the remote session via the dedicated magic.
        code = "%%_do_not_call_delete_session\n "
        self._execute_cell_for_user(code, True, False)

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False,
                      shutdown_if_error=False, log_if_error=None):
        """Run a cell; optionally turn an error reply into a fatal kernel error."""
        reply_content = self._execute_cell_for_user(code, silent, store_history, user_expressions, allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(log_if_error, error_from_reply)
                return self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        # Delegate to the real ipykernel implementation.
        return super(SparkKernelBase, self).do_execute(code, silent, store_history, user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _complete_cell(self):
        """A method that runs a cell with no effect. Call this and return the value it returns when there's some sort
        of error preventing the user's cell from executing; this will register the cell from the Jupyter UI as being
        completed."""
        return self._execute_cell("None", False, True, None, False)

    def _show_user_error(self, message):
        self.logger.error(message)
        self.ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed; does not
        raise an error immediately. We use this for errors that happen on kernel startup,
        since IPython crashes if we throw an exception in the __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        return self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self.logger.error(error)
        self.ipython_display.send_error(error)
        return self._complete_cell()
class SparkController(object):
    """Coordinates named Livy sessions (via a SessionManager) and runs commands/queries on them."""

    def __init__(self, ipython_display):
        self.logger = Log("SparkController")
        self.ipython_display = ipython_display
        self.session_manager = SessionManager()

    def get_logs(self, client_name=None):
        """Return the logs of the named session (or the single default session)."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_logs()

    def run_command(self, command, client_name=None):
        """Execute a Command object against the chosen session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return command.execute(session_to_use)

    def run_sqlquery(self, sqlquery, client_name=None):
        """Execute a SQL query object against the chosen session."""
        session_to_use = self.get_session_by_name_or_default(client_name)
        return sqlquery.execute(session_to_use)

    def get_all_sessions_endpoint(self, endpoint):
        """List all sessions currently alive on the given Livy endpoint."""
        http_client = self._http_client(endpoint)
        sessions = http_client.get_sessions()["sessions"]
        session_list = [self._livy_session(http_client, {"kind": s["kind"]},
                                           self.ipython_display, s["id"])
                        for s in sessions]
        # Pull the current server-side state for each wrapped session.
        for s in session_list:
            s.refresh_status()
        return session_list

    def get_all_sessions_endpoint_info(self, endpoint):
        """Human-readable description of every session on the endpoint."""
        sessions = self.get_all_sessions_endpoint(endpoint)
        return [str(s) for s in sessions]

    def cleanup(self):
        """Close and drop every managed session."""
        self.session_manager.clean_up_all()

    def cleanup_endpoint(self, endpoint):
        """Delete every session on the endpoint, managed or not."""
        for session in self.get_all_sessions_endpoint(endpoint):
            session.delete()

    def delete_session_by_name(self, name):
        self.session_manager.delete_client(name)

    def delete_session_by_id(self, endpoint, session_id):
        """Delete an arbitrary session on the endpoint by its numeric id."""
        http_client = self._http_client(endpoint)
        response = http_client.get_session(session_id)
        # FIX: a second, identical http client was constructed here and the first
        # one discarded; reuse the client we already have.
        session = self._livy_session(http_client, {"kind": response["kind"]},
                                     self.ipython_display, session_id, False)
        session.delete()

    def add_session(self, name, endpoint, skip_if_exists, properties):
        """Create and start a new named session, unless skip_if_exists suppresses a duplicate."""
        if skip_if_exists and (name in self.session_manager.get_sessions_list()):
            self.logger.debug("Skipping {} because it already exists in list of sessions.".format(name))
            return
        http_client = self._http_client(endpoint)
        session = self._livy_session(http_client, properties, self.ipython_display)
        self.session_manager.add_session(name, session)
        session.start()

    def get_session_id_for_client(self, name):
        return self.session_manager.get_session_id_for_client(name)

    def get_client_keys(self):
        return self.session_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        return self.session_manager.get_sessions_info()

    def get_session_by_name_or_default(self, client_name):
        """Resolve a session by (case-insensitive) name, or the single default session when None."""
        if client_name is None:
            return self.session_manager.get_any_session()
        else:
            client_name = client_name.lower()
            return self.session_manager.get_session(client_name)

    def get_managed_clients(self):
        return self.session_manager.sessions

    @staticmethod
    def _livy_session(http_client, properties, ipython_display, session_id=-1, sql_created=None):
        # Factory hook; tests can override to inject fakes.
        return LivySession(http_client, properties, ipython_display, session_id, sql_created)

    @staticmethod
    def _http_client(endpoint):
        # Factory hook; tests can override to inject fakes.
        return LivyReliableHttpClient.from_endpoint(endpoint)
class ReliableHttpClient(object):
    """Http client that is reliable in its requests. Uses requests library."""

    def __init__(self, endpoint, headers, retry_policy):
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy
        self.logger = Log("ReliableHttpClient")

        # SSL verification can be switched off through configuration.
        self.verify_ssl = not conf.ignore_ssl_errors()
        if not self.verify_ssl:
            self.logger.debug("ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
            requests.packages.urllib3.disable_warnings()

    def compose_url(self, relative_url):
        """Join the endpoint URL with a relative path, normalizing leading/trailing slashes."""
        return self._endpoint.url + "/" + relative_url.strip("/")

    def get(self, relative_url, accepted_status_codes):
        """Sends a get request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.get)

    def post(self, relative_url, accepted_status_codes, data):
        """Sends a post request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.post, data)

    def delete(self, relative_url, accepted_status_codes):
        """Sends a delete request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.delete)

    def _send_request(self, relative_url, accepted_status_codes, function, data=None):
        return self._send_request_helper(self.compose_url(relative_url), accepted_status_codes, function, data, 0)

    def _send_request_helper(self, url, accepted_status_codes, function, data, retry_count):
        """Issue the request, retrying per the retry policy on errors or unexpected status codes."""
        while True:
            try:
                # Build the keyword arguments for requests: auth only when the
                # endpoint requires it, body only when data was supplied.
                kwargs = {"headers": self._headers, "verify": self.verify_ssl}
                if self._endpoint.authenticate:
                    kwargs["auth"] = (self._endpoint.username, self._endpoint.password)
                if data is not None:
                    kwargs["data"] = json.dumps(data)
                response = function(url, **kwargs)
            except requests.exceptions.RequestException as e:
                failed = True
                response = None
                status = None
                self.logger.error("Request to '{}' failed with '{}'".format(url, e))
            else:
                failed = False
                status = response.status_code

            if failed or status not in accepted_status_codes:
                if self._retry_policy.should_retry(status, failed, retry_count):
                    sleep(self._retry_policy.seconds_to_sleep(retry_count))
                    retry_count += 1
                    continue
                raise HttpClientException("Invalid status code '{}' or error '{}' from {}"
                                          .format(status, failed, url))
            return response
class LivySession(ObjectWithGuid):
    """A Livy session wrapper that emits telemetry events for create/delete operations."""

    def __init__(self, http_client, properties, ipython_display, session_id=-1, sql_created=None,
                 spark_events=None):
        """Wrap a Livy session.

        Parameters:
            http_client : client used for all Livy REST calls.
            properties : dict sent to Livy on creation; must contain "kind".
            ipython_display : sink for user-facing messages.
            session_id : id of an existing session, or -1 to create a new one.
            sql_created : whether a sqlContext already exists on an existing session.
            spark_events : event emitter; a fresh SparkEvents is created when None.
        """
        super(LivySession, self).__init__()
        assert "kind" in list(properties.keys())
        kind = properties["kind"]
        self.properties = properties
        self.ipython_display = ipython_display
        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert wait_for_idle_timeout_seconds > 0

        # sql_created only makes sense when attaching to an existing session.
        if session_id == -1 and sql_created is True:
            raise BadUserDataException("Cannot indicate sql state without session id.")

        self.logger = Log("LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise BadUserDataException("Session of kind '{}' not supported. Session must be of kinds {}."
                                       .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
            sql_created = False
        else:
            self.status = constants.BUSY_SESSION_STATUS

        self._logs = ""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

        self.kind = kind
        self.id = session_id
        self.created_sql_context = sql_created

    def __str__(self):
        return "Session id: {}\tKind: {}\tState: {}".format(self.id, self.kind, self.status)

    def start(self, create_sql_context=True):
        """Start the session against actual livy server."""
        # Emit start/end events around the whole startup; the except branch records
        # the failure (with exception class and message) and re-raises.
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)

        try:
            r = self._http_client.post_session(self.properties)
            self.id = r["id"]
            self.status = str(r["state"])

            self.ipython_display.writeln("Creating SparkContext as 'sc'")

            # We wait for livy_session_startup_timeout_seconds() for the session to start up.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                raise LivyClientTimeoutException("Session {} did not start up in {} seconds."
                                                 .format(self.id, conf.livy_session_startup_timeout_seconds()))

            if create_sql_context:
                self.create_sql_context()
        except Exception as e:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status,
                                                               False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status,
                                                               True, "", "")

    def create_sql_context(self):
        """Create a sqlContext object on the session. Object will be accessible via variable 'sqlContext'."""
        if self.created_sql_context:
            # Already created; nothing to do.
            return
        self.logger.debug("Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln("Creating HiveContext as 'sqlContext'")
        command = self._get_sql_context_creation_command()
        try:
            # Command.execute returns a (success, output) tuple.
            (success, out) = command.execute(self)
        except LivyClientTimeoutException:
            raise LivyClientTimeoutException("Failed to create the SqlContext in time. Timed out after {} seconds."
                                             .format(self._wait_for_idle_timeout_seconds))
        if success:
            self.created_sql_context = True
        else:
            raise FailedToCreateSqlContextException("Failed to create the SqlContext.\nError: '{}'".format(out))

    def get_logs(self):
        """Fetch and cache the full session log from the server."""
        log_array = self._http_client.get_all_session_logs(self.id)['log']
        self._logs = "\n".join(log_array)
        return self._logs

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        # True when the status is one of the terminal session states.
        return status in constants.FINAL_STATUS

    def delete(self):
        """Delete the session on the server, emitting deletion start/end events."""
        # Capture the id before it is reset so both events report the same session.
        session_id = self.id
        self._spark_events.emit_session_deletion_start_event(self.guid, self.kind, session_id, self.status)

        try:
            self.logger.debug("Deleting session '{}'".format(session_id))

            if self.status != constants.NOT_STARTED_SESSION_STATUS and self.status != constants.DEAD_SESSION_STATUS:
                self._http_client.delete_session(session_id)
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
            else:
                # Not deletable in this state; report to the user rather than raising.
                self.ipython_display.send_error("Cannot delete session {} that is in state '{}'."
                                                .format(session_id, self.status))
        except Exception as e:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, session_id, self.status,
                                                               False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, session_id, self.status,
                                                               True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile. Calls done every status_sleep_seconds as
        indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        while True:
            self.refresh_status()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = "Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException('{} See logs:\n{}'.format(error, self.get_logs()))

            if seconds_to_wait <= 0.0:
                error = "Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            # Sleep a fixed interval and deduct the elapsed time from the budget.
            start_time = time()
            self.logger.debug("Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            seconds_to_wait -= time() - start_time

    def sleep(self):
        # Pause between statement polls; used by Command.execute.
        sleep(self._statement_sleep_seconds)

    def refresh_status(self):
        # Pull the current state from the server; reject values not in the known set.
        status = self._http_client.get_session(self.id)['state']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
        else:
            raise LivyUnexpectedStatusException("Status '{}' not supported by session.".format(status))

        return self.status

    def _get_sql_context_creation_command(self):
        # Code snippet (in the session's own language) that creates a HiveContext
        # bound to the name 'sqlContext'.
        if self.kind == constants.SESSION_KIND_SPARK:
            sql_context_command = "val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_PYSPARK:
            sql_context_command = "from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_SPARKR:
            sql_context_command = "sqlContext <- sparkRHive.init(sc)"
        else:
            raise BadUserDataException("Do not know how to create HiveContext in session of kind {}.".format(self.kind))

        return Command(sql_context_command)