Beispiel #1
0
class Command(ObjectWithGuid):
    def __init__(self, code, spark_events=None):
        super(Command, self).__init__()
        self.code = textwrap.dedent(code)
        self.logger = Log(u"Command")
        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

    def __eq__(self, other):
        return self.code == other.code

    def __ne__(self, other):
        return not self == other

    def execute(self, session):
        self._spark_events.emit_statement_execution_start_event(session.guid, session.kind, session.id, self.guid)
        statement_id = -1
        try:
            session.wait_for_idle()
            data = {u"code": self.code}
            response = session.http_client.post_statement(session.id, data)
            statement_id = response[u'id']
            output = self._get_statement_output(session, statement_id)
        except Exception as e:
            self._spark_events.emit_statement_execution_end_event(session.guid, session.kind, session.id,
                                                                  self.guid, statement_id, False, e.__class__.__name__,
                                                                  str(e))
            raise
        else:
            self._spark_events.emit_statement_execution_end_event(session.guid, session.kind, session.id,
                                                                  self.guid, statement_id, True, "", "")
            return output

    def _get_statement_output(self, session, statement_id):
        statement_running = True
        out = u""
        while statement_running:
            statement = session.http_client.get_statement(session.id, statement_id)
            status = statement[u"state"]

            self.logger.debug(u"Status of statement {} is {}.".format(statement_id, status))

            if status == u"running":
                session.sleep()
            else:
                statement_running = False

                statement_output = statement[u"output"]
                if statement_output[u"status"] == u"ok":
                    out = (True, statement_output[u"data"][u"text/plain"])
                elif statement_output[u"status"] == u"error":
                    out = (False,
                           statement_output[u"evalue"] + u"\n" + u"".join(statement_output[u"traceback"]))
                else:
                    raise LivyUnexpectedStatusException(u"Unknown output status from Livy: '{}'"
                                                        .format(statement_output[u"status"]))

        return out
Beispiel #2
0
class Command(ObjectWithGuid):
    def __init__(self, code):
        super(Command, self).__init__()
        self.code = textwrap.dedent(code)
        self.logger = Log("Command")

    def __eq__(self, other):
        return self.code == other.code

    def __ne__(self, other):
        return not self == other

    def execute(self, session):
        session.wait_for_idle()
        data = {"code": self.code}
        response = session.http_client.post_statement(session.id, data)
        statement_id = response['id']
        return self._get_statement_output(session, statement_id)

    def _get_statement_output(self, session, statement_id):
        statement_running = True
        out = ""
        while statement_running:
            statement = session.http_client.get_statement(session.id, statement_id)
            status = statement["state"]

            self.logger.debug("Status of statement {} is {}.".format(statement_id, status))

            if status == "running":
                session.sleep()
            else:
                statement_running = False

                statement_output = statement["output"]
                if statement_output["status"] == "ok":
                    out = (True, statement_output["data"]["text/plain"])
                elif statement_output["status"] == "error":
                    out = (False,
                           statement_output["evalue"] + "\n" + "".join(statement_output["traceback"]))
                else:
                    raise ValueError("Unknown output status: '{}'".format(statement_output["status"]))

        return out
Beispiel #3
0
class SparkMagicBase(Magics):
    def __init__(self, shell, data=None):
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)

        self.logger = Log("SparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        try:
            should_serialize = conf.serialize()
            if should_serialize:
                self.logger.debug("Serialization enabled.")

                self.magics_home_path = get_magics_home_path()
                path_to_serialize = join_paths(self.magics_home_path,
                                               "state.json")

                self.logger.debug(
                    "Will serialize to {}.".format(path_to_serialize))

                self.spark_controller = SparkController(
                    self.ipython_display, serialize_path=path_to_serialize)
            else:
                self.logger.debug("Serialization NOT enabled.")
        except KeyError:
            self.logger.error("Could not read env vars for serialization.")

        self.logger.debug("Initialized spark magics.")

    def execute_sqlquery(self, sqlquery, session, output_var, quiet):
        try:
            df = self.spark_controller.run_cell_sql(sqlquery, session)
            if output_var is not None:
                self.shell.user_ns[output_var] = df
            if quiet:
                return None
            else:
                return df
        except DataFrameParseException as e:
            self.ipython_display.send_error(e.out)
            return None

    @staticmethod
    def print_endpoint_info(info_sessions):
        sessions_info = ["        {}".format(i) for i in info_sessions]
        print("""Info for endpoint:
    Sessions:
{}
""".format("\n".join(sessions_info)))
Beispiel #4
0
class SparkMagicBase(Magics):
    def __init__(self, shell, data=None, spark_events=None):
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)

        self.logger = Log("SparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        self.logger.debug("Initialized spark magics.")

        if spark_events is None:
            spark_events = SparkEvents()
        spark_events.emit_library_loaded_event()

    def execute_sqlquery(self, cell, samplemethod, maxrows, samplefraction,
                         session, output_var, quiet):
        sqlquery = self._sqlquery(cell, samplemethod, maxrows, samplefraction)
        df = self.spark_controller.run_sqlquery(sqlquery, session)
        if output_var is not None:
            self.shell.user_ns[output_var] = df
        if quiet:
            return None
        else:
            return df

    @staticmethod
    def _sqlquery(cell, samplemethod, maxrows, samplefraction):
        return SQLQuery(cell, samplemethod, maxrows, samplefraction)

    @staticmethod
    def print_endpoint_info(info_sessions):
        sessions_info = ["        {}".format(i) for i in info_sessions]
        print("""Info for endpoint:
    Sessions:
{}
""".format("\n".join(sessions_info)))
Beispiel #5
0
class SparkMagicBase(Magics):
    def __init__(self, shell, data=None):
        # You must call the parent constructor
        super(SparkMagicBase, self).__init__(shell)

        self.logger = Log("SparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        try:
            should_serialize = conf.serialize()
            if should_serialize:
                self.logger.debug("Serialization enabled.")

                self.magics_home_path = get_magics_home_path()
                path_to_serialize = join_paths(self.magics_home_path, "state.json")

                self.logger.debug("Will serialize to {}.".format(path_to_serialize))

                self.spark_controller = SparkController(self.ipython_display, serialize_path=path_to_serialize)
            else:
                self.logger.debug("Serialization NOT enabled.")
        except KeyError:
            self.logger.error("Could not read env vars for serialization.")

        self.logger.debug("Initialized spark magics.")

    def execute_sqlquery(self, sqlquery, session, output_var, quiet):
        try:
            df = self.spark_controller.run_cell_sql(sqlquery, session)
            if output_var is not None:
                self.shell.user_ns[output_var] = df
            if quiet:
                return None
            else:
                return df
        except DataFrameParseException as e:
            self.ipython_display.send_error(e.out)
            return None

    @staticmethod
    def print_endpoint_info(info_sessions):
        sessions_info = ["        {}".format(i) for i in info_sessions]
        print("""Info for endpoint:
    Sessions:
{}
""".format("\n".join(sessions_info)))
Beispiel #6
0
class ClientManager(object):
    """Livy client manager"""

    def __init__(self, serializer=None):
        serialize_periodically = False
        serialize_period = 3

        if serializer is not None:
            serialize_periodically = conf.serialize_periodically()
            serialize_period = conf.serialize_period_seconds()

        self.logger = Log("ClientManager")

        self._livy_clients = dict()
        self._serializer = serializer
        self._serialize_timer = None

        if self._serializer is not None:
            for (name, client) in self._serializer.deserialize_state():
                self.add_client(name, client)

            if serialize_periodically:
                self._serialize_state_periodically(serialize_period)

    def _serialize_state_periodically(self, serialize_period):
        self.logger.debug("Starting state serialize timer.")

        self._serialize_timer = Timer(serialize_period, self._serialize_state)
        self._serialize_timer.start()

    def _serialize_state(self):
        self._serializer.serialize_state(self._livy_clients)

    @property
    def livy_clients(self):
        return self._livy_clients

    def get_sessions_list(self):
        return list(self._livy_clients.keys())

    def get_sessions_info(self):
        return ["Name: {}\t{}".format(k, str(self._livy_clients[k])) for k in list(self._livy_clients.keys())]

    def add_client(self, name, livy_client):
        if name in self.get_sessions_list():
            raise ValueError("Session with name '{}' already exists. Please delete the session"
                             " first if you intend to replace it.".format(name))

        self._livy_clients[name] = livy_client

    def get_any_client(self):
        number_of_sessions = len(self._livy_clients)
        if number_of_sessions == 1:
            key = self.get_sessions_list()[0]
            return self._livy_clients[key]
        elif number_of_sessions == 0:
            raise AssertionError("You need to have at least 1 client created to execute commands.")
        else:
            raise AssertionError("Please specify the client to use. Possible sessions are {}".format(
                self.get_sessions_list()))
        
    def get_client(self, name):
        if name in self.get_sessions_list():
            return self._livy_clients[name]
        raise ValueError("Could not find '{}' session in list of saved sessions. Possible sessions are {}".format(
            name, self.get_sessions_list()))

    def get_session_id_for_client(self, name):
        if name in self.get_sessions_list():
            return self._livy_clients[name].session_id
        return None

    def delete_client(self, name):
        self._remove_session(name)
    
    def clean_up_all(self):
        for name in self.get_sessions_list():
            self._remove_session(name)

        if self._serializer is not None:
            self._serialize_state()

    def _remove_session(self, name):
        if name in self.get_sessions_list():
            self._livy_clients[name].close_session()
            del self._livy_clients[name]
        else:
            raise ValueError("Could not find '{}' session in list of saved sessions. Possible sessions are {}"
                             .format(name, self.get_sessions_list()))
class SparkKernelBase(IPythonKernel):
    def __init__(self, implementation, implementation_version, language, language_version, language_info,
                 kernel_conf_name, session_language, client_name, **kwargs):
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.kernel_conf_name = kernel_conf_name
        self.session_language = session_language
        self.client_name = client_name

        super(SparkKernelBase, self).__init__(**kwargs)

        self._logger = Log(self.client_name)
        self._session_started = False
        self._fatal_error = None
        self._ipython_display = IpythonDisplay()

        self.user_command_parser = UserCommandParser()

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if not kwargs.get("testing", False):
            configuration = self._get_configuration()
            if not configuration:
                # _get_configuration() sets the error for us so we can just return now.
                # The kernel is not in a good state and all do_execute calls will
                # fail with the fatal error.
                return
            (username, password, url) = configuration
            self.connection_string = get_connection_string(url, username, password)
            self._load_magics_extension()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        if self._fatal_error is not None:
            self._repeat_fatal_error()

        # Parse command
        subcommand, force, output_var, command = self.user_command_parser.parse_user_command(code)

        # Get transformer
        transformer = self._get_code_transformer(subcommand)

        # Get instructions
        try:
            code_to_run, error_to_show, begin_action, end_action, deletes_session = \
                transformer.get_code_to_execute(self._session_started, self.connection_string,
                                                force, output_var, command)
        except SyntaxError as se:
            self._show_user_error("{}".format(se))
        else:
            # Execute instructions
            if error_to_show is not None:
                self._show_user_error(error_to_show)
                return self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)

            if begin_action == Constants.delete_session_action:
                self._delete_session()
            elif begin_action == Constants.start_session_action:
                self._start_session()
            elif begin_action == Constants.do_nothing_action:
                pass
            else:
                raise ValueError("Begin action {} not supported.".format(begin_action))

            res = self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)

            if end_action == Constants.delete_session_action:
                self._delete_session()
            elif end_action == Constants.start_session_action:
                self._start_session()
            elif end_action == Constants.do_nothing_action:
                pass
            else:
                raise ValueError("End action {} not supported.".format(end_action))

            if deletes_session:
                self._session_started = False

            return res

        return self._execute_cell("", silent, store_history, user_expressions, allow_stdin)

    def do_shutdown(self, restart):
        # Cleanup
        self._delete_session()

        return self._do_shutdown_ipykernel(restart)

    @staticmethod
    def _get_code_transformer(subcommand):
        if subcommand == UserCommandParser.run_command:
            return SparkTransformer(subcommand)
        elif subcommand == UserCommandParser.sql_command:
            return SqlTransformer(subcommand)
        elif subcommand == UserCommandParser.hive_command:
            return HiveTransformer(subcommand)
        elif subcommand == UserCommandParser.config_command:
            return ConfigTransformer(subcommand)
        elif subcommand == UserCommandParser.info_command:
            return InfoTransformer(subcommand)
        elif subcommand == UserCommandParser.delete_command:
            return DeleteSessionTransformer(subcommand)
        elif subcommand == UserCommandParser.clean_up_command:
            return CleanUpTransformer(subcommand)
        elif subcommand == UserCommandParser.logs_command:
            return LogsTransformer(subcommand)
        elif subcommand == UserCommandParser.local_command:
            return PythonTransformer(subcommand)
        else:
            return NotSupportedTransformer(subcommand)

    def _load_magics_extension(self):
        register_magics_code = "%load_ext remotespark"
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark magics library.")
        self._logger.debug("Loaded magics.")

    def _register_auto_viz(self):
        register_auto_viz_code = """from remotespark.datawidgets.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(register_auto_viz_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to register auto viz for notebook.")
        self._logger.debug("Registered auto viz.")

    def _start_session(self):
        if not self._session_started:
            self._session_started = True

            add_session_code = "%spark add {} {} {} skip".format(
                self.client_name, self.session_language, self.connection_string)
            self._execute_cell(add_session_code, True, False, shutdown_if_error=True,
                               log_if_error="Failed to create a Livy session.")
            self._logger.debug("Added session.")

    def _delete_session(self):
        if self._session_started:
            code = "%spark cleanup"
            self._execute_cell_for_user(code, True, False)
            self._session_started = False

    def _get_configuration(self):
        """Returns (username, password, url). If there is an error (missing configuration),
           returns False."""
        try:
            credentials = getattr(conf, 'kernel_' + self.kernel_conf_name + '_credentials')()
            ret = (credentials['username'], credentials['password'], credentials['url'])

            # The URL has to be set in the configuration.
            assert(ret[2])

            return ret
        except (KeyError, AssertionError):
            message = "Please set configuration for 'kernel_{}_credentials' to initialize Kernel".format(
                self.kernel_conf_name)
            self._queue_fatal_error(message)
            return False

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False,
                      shutdown_if_error=False, log_if_error=None):
        reply_content = self._execute_cell_for_user(code, silent, store_history, user_expressions, allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(log_if_error, error_from_reply)
                self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        return super(SparkKernelBase, self).do_execute(code, silent, store_history, user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _show_user_error(self, message):
        self._logger.error(message)
        self._ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed; does not
        raise an error immediately. We use this for errors that happen on kernel startup,
        since IPython crashes if we throw an exception in the __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self._logger.error(error)
        self._ipython_display.send_error(error)
        raise ValueError(self._fatal_error)
class SparkController(object):
    def __init__(self, ipython_display):
        self.logger = Log(u"SparkController")
        self.ipython_display = ipython_display
        self.session_manager = SessionManager()

    def get_app_id(self, client_name=None):
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_app_id()

    def get_driver_log_url(self, client_name=None):
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_driver_log_url()

    def get_logs(self, client_name=None):
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_logs()

    def get_spark_ui_url(self, client_name=None):
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_spark_ui_url()

    def run_command(self, command, client_name=None):
        session_to_use = self.get_session_by_name_or_default(client_name)
        return command.execute(session_to_use)

    def run_sqlquery(self, sqlquery, client_name=None):
        session_to_use = self.get_session_by_name_or_default(client_name)
        return sqlquery.execute(session_to_use)

    def get_all_sessions_endpoint(self, endpoint):
        http_client = self._http_client(endpoint)
        sessions = http_client.get_sessions()[u"sessions"]
        session_list = [self._livy_session(http_client, {u"kind": s[u"kind"]},
                                           self.ipython_display, s[u"id"])
                        for s in sessions]
        for s in session_list:
            s.refresh_status()
        return session_list

    def get_all_sessions_endpoint_info(self, endpoint):
        sessions = self.get_all_sessions_endpoint(endpoint)
        return [str(s) for s in sessions]

    def cleanup(self):
        self.session_manager.clean_up_all()

    def cleanup_endpoint(self, endpoint):
        for session in self.get_all_sessions_endpoint(endpoint):
            session.delete()

    def delete_session_by_name(self, name):
        self.session_manager.delete_client(name)

    def delete_session_by_id(self, endpoint, session_id):
        http_client = self._http_client(endpoint)
        response = http_client.get_session(session_id)
        http_client = self._http_client(endpoint)
        session = self._livy_session(http_client, {u"kind": response[u"kind"]},
                                     self.ipython_display, session_id, False)
        session.delete()

    def add_session(self, name, endpoint, skip_if_exists, properties):
        if skip_if_exists and (name in self.session_manager.get_sessions_list()):
            self.logger.debug(u"Skipping {} because it already exists in list of sessions.".format(name))
            return
        http_client = self._http_client(endpoint)
        session = self._livy_session(http_client, properties, self.ipython_display)
        self.session_manager.add_session(name, session)
        session.start()

    def get_session_id_for_client(self, name):
        return self.session_manager.get_session_id_for_client(name)

    def get_client_keys(self):
        return self.session_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        return self.session_manager.get_sessions_info()

    def get_session_by_name_or_default(self, client_name):
        if client_name is None:
            return self.session_manager.get_any_session()
        else:
            client_name = client_name.lower()
            return self.session_manager.get_session(client_name)

    def get_managed_clients(self):
        return self.session_manager.sessions

    @staticmethod
    def _livy_session(http_client, properties, ipython_display,
                      session_id=-1, sql_created=None):
        return LivySession(http_client, properties, ipython_display,
                           session_id, sql_created)

    @staticmethod
    def _http_client(endpoint):
        return LivyReliableHttpClient.from_endpoint(endpoint)
Beispiel #9
0
class ClientManagerStateSerializer(object):
    """Livy client manager state serializer"""
    def __init__(self, reader_writer):
        assert reader_writer is not None

        self.logger = Log("ClientManagerStateSerializer")
        self._ipython_display = IpythonDisplay()

        self._reader_writer = reader_writer

    def deserialize_state(self):
        self.logger.debug("Deserializing state.")

        clients_to_return = []

        lines = self._reader_writer.read_lines()
        line = ''.join(lines).strip()

        if line != '':
            self.logger.debug("Read content. Converting to JSON.")
            json_str = json.loads(line)
            clients = json_str["clients"]

            for client in clients:
                # Ignore version for now
                name = client["name"]
                session_id = client["id"]
                sql_context_created = client["sqlcontext"]
                kind = client["kind"].lower()
                connection_string = client["connectionstring"]

                session = self._create_livy_session(connection_string,
                                                    {"kind": kind},
                                                    self._ipython_display,
                                                    session_id,
                                                    sql_context_created)

                # Do not start session automatically. Just create it but skip is not existent.
                try:
                    # Get status to know if it's alive or not.
                    status = session.status
                    if not session.is_final_status(status):
                        self.logger.debug(
                            "Adding session {}".format(session_id))
                        client_obj = self._create_livy_client(session)
                        clients_to_return.append((name, client_obj))
                    else:
                        self.logger.error(
                            "Skipping serialized session '{}' because session was in status {}."
                            .format(session.id, status))
                except (ValueError, ConnectionError) as e:
                    self.logger.error(
                        "Skipping serialized session '{}' because {}".format(
                            session.id, str(e)))
        else:
            self.logger.debug("Empty manager state found.")

        return clients_to_return

    def serialize_state(self, name_client_dictionary):
        self.logger.debug("Serializing state.")

        serialized_clients = []
        for name in list(name_client_dictionary.keys()):
            client = name_client_dictionary[name]
            serialized_client = client.serialize()
            serialized_client["name"] = name
            serialized_clients.append(serialized_client)

        serialized_str = json.dumps({"clients": serialized_clients})
        self._reader_writer.overwrite_with_line(serialized_str)

    def _create_livy_session(self, connection_string, properties,
                             ipython_display, session_id, sql_context_created):
        return LivySession.from_connection_string(connection_string,
                                                  properties, ipython_display,
                                                  session_id,
                                                  sql_context_created)

    def _create_livy_client(self, session):
        return LivyClient(session)
class RemoteSparkMagics(Magics):
    def __init__(self, shell, data=None):
        # You must call the parent constructor
        super(RemoteSparkMagics, self).__init__(shell)

        self.logger = Log("RemoteSparkMagics")
        self.ipython_display = IpythonDisplay()
        self.spark_controller = SparkController(self.ipython_display)

        try:
            should_serialize = conf.serialize()
            if should_serialize:
                self.logger.debug("Serialization enabled.")

                self.magics_home_path = get_magics_home_path()
                path_to_serialize = join_paths(self.magics_home_path, "state.json")

                self.logger.debug("Will serialize to {}.".format(path_to_serialize))

                self.spark_controller = SparkController(self.ipython_display, serialize_path=path_to_serialize)
            else:
                self.logger.debug("Serialization NOT enabled.")
        except KeyError:
            self.logger.error("Could not read env vars for serialization.")

        self.logger.debug("Initialized spark magics.")

    @magic_arguments()
    @argument("-c", "--context", type=str, default=Constants.context_name_spark,
              help="Context to use: '{}' for spark, '{}' for sql queries, and '{}' for hive queries. "
                   "Default is '{}'.".format(Constants.context_name_spark,
                                             Constants.context_name_sql,
                                             Constants.context_name_hive,
                                             Constants.context_name_spark))
    @argument("-s", "--session", help="The name of the Livy session to use. "
                                      "If only one session has been created, there's no need to specify one.")
    @argument("-o", "--output", type=str, default=None, help="If present, output when using SQL or Hive "
                                                             "query will be stored in variable of this name.")
    @argument("command", type=str, default=[""], nargs="*", help="Commands to execute.")
    @needs_local_scope
    @line_cell_magic
    def spark(self, line, cell="", local_ns=None):
        """Magic to execute spark remotely.

           This magic allows you to create a Livy Scala or Python session against a Livy endpoint. Every session can
           be used to execute either Spark code or SparkSQL code by executing against the SQL context in the session.
           When the SQL context is used, the result will be a Pandas dataframe of a sample of the results.

           If invoked with no subcommand, the cell will be executed against the specified session.

           Subcommands
           -----------
           info
               Display the available Livy sessions and other configurations for sessions.
           add
               Add a Livy session. First argument is the name of the session, second argument
               is the language, and third argument is the connection string of the Livy endpoint.
               A fourth argument specifying if session creation can be skipped if it already exists is optional:
               "skip" or empty.
               e.g. `%%spark add test python url=https://sparkcluster.net/livy;username=u;password=p skip`
               or
               e.g. `%%spark add test python url=https://sparkcluster.net/livy;username=u;password=p`
           config
               Override the livy session properties sent to Livy on session creation. All session creations will
               contain these config settings from then on.
               Expected value is a JSON key-value string to be sent as part of the Request Body for the POST /sessions
               endpoint in Livy.
               e.g. `%%spark config {"driverMemory":"1000M", "executorCores":4}`
           run
               Run Spark code against a session.
               e.g. `%%spark -s testsession` will execute the cell code against the testsession previously created
               e.g. `%%spark -s testsession -c sql` will execute the SQL code against the testsession previously created
               e.g. `%%spark -s testsession -c sql -o my_var` will execute the SQL code against the testsession
                        previously created and store the pandas dataframe created in the my_var variable in the
                        Python environment.
           logs
               Returns the logs for a given session.
               e.g. `%%spark logs -s testsession` will return the logs for the testsession previously created
           delete
               Delete a Livy session. Argument is the name of the session to be deleted.
               e.g. `%%spark delete defaultlivy`
           cleanup
               Delete all Livy sessions created by the notebook. No arguments required.
               e.g. `%%spark cleanup`
        """
        usage = "Please look at usage of %spark by executing `%spark?`."
        user_input = line
        args = parse_argstring(self.spark, user_input)

        subcommand = args.command[0].lower()

        try:
            # info
            if subcommand == "info":
                if len(args.command) == 2:
                    connection_string = args.command[1]
                    info_sessions = self.spark_controller.get_all_sessions_endpoint_info(connection_string)
                    self._print_endpoint_info(info_sessions)
                elif len(args.command) == 1:
                    self._print_local_info()
                else:
                    raise ValueError("Subcommand 'info' requires no value or a connection string to show all sessions.\n"
                                     "{}".format(usage))
            # config
            elif subcommand == "config":
                # Would normally do " ".join(args.command[1:]) but parse_argstring removes quotes...
                rest_of_line = user_input[7:]
                conf.override(conf.session_configs.__name__, json.loads(rest_of_line))
            # add
            elif subcommand == "add":
                if len(args.command) != 4 and len(args.command) != 5:
                    raise ValueError("Subcommand 'add' requires three or four arguments.\n{}".format(usage))

                name = args.command[1].lower()
                language = args.command[2].lower()
                connection_string = args.command[3]

                if len(args.command) == 5:
                    skip = args.command[4].lower() == "skip"
                else:
                    skip = False

                properties = copy.deepcopy(conf.session_configs())
                properties["kind"] = self._get_livy_kind(language)

                self.spark_controller.add_session(name, connection_string, skip, properties)
            # delete
            elif subcommand == "delete":
                if len(args.command) == 2:
                    name = args.command[1].lower()
                    self.spark_controller.delete_session_by_name(name)
                elif len(args.command) == 3:
                    connection_string = args.command[1]
                    session_id = args.command[2]
                    self.spark_controller.delete_session_by_id(connection_string, session_id)
                else:
                    raise ValueError("Subcommand 'delete' requires a session name or a connection string and id.\n{}"
                                     .format(usage))
            # cleanup
            elif subcommand == "cleanup":
                if len(args.command) == 2:
                    connection_string = args.command[1]
                    self.spark_controller.cleanup_endpoint(connection_string)
                elif len(args.command) == 1:
                    self.spark_controller.cleanup()
                else:
                    raise ValueError("Subcommand 'cleanup' requires no further values or a connection string to clean up "
                                     "sessions.\n{}".format(usage))
            # logs
            elif subcommand == "logs":
                if len(args.command) == 1:
                    (success, out) = self.spark_controller.get_logs(args.session)
                    if success:
                        self.ipython_display.write(out)
                    else:
                        self.ipython_display.send_error(out)
                else:
                    raise ValueError("Subcommand 'logs' requires no further values.\n{}".format(usage))
            # run
            elif len(subcommand) == 0:
                if args.context == Constants.context_name_spark:
                    (success, out) = self.spark_controller.run_cell(cell, args.session)
                    if success:
                        self.ipython_display.write(out)
                    else:
                        self.ipython_display.send_error(out)
                elif args.context == Constants.context_name_sql:
                    return self._execute_against_context_that_returns_df(self.spark_controller.run_cell_sql, cell,
                                                                         args.session, args.output)
                elif args.context == Constants.context_name_hive:
                    return self._execute_against_context_that_returns_df(self.spark_controller.run_cell_hive, cell,
                                                                         args.session, args.output)
                else:
                    raise ValueError("Context '{}' not found".format(args.context))
            # error
            else:
                raise ValueError("Subcommand '{}' not found. {}".format(subcommand, usage))
        except ValueError as err:
            self.ipython_display.send_error("{}".format(err))

    def _execute_against_context_that_returns_df(self, method, cell, session, output_var):
        try:
            df = method(cell, session)
            if output_var is not None:
                self.shell.user_ns[output_var] = df
            return df
        except DataFrameParseException as e:
            self.ipython_display.send_error(e.out)
            return None

    def _print_local_info(self):
        sessions_info = ["        {}".format(i) for i in self.spark_controller.get_manager_sessions_str()]
        print("""Info for running Spark:
    Sessions:
{}
    Session configs:
        {}
""".format("\n".join(sessions_info), conf.session_configs()))


    def _print_endpoint_info(self, info_sessions):
        sessions_info = ["        {}".format(i) for i in info_sessions]
        print("""Info for endpoint:
    Sessions:
{}
""".format("\n".join(sessions_info)))

    @staticmethod
    def _get_livy_kind(language):
        if language == Constants.lang_scala:
            return Constants.session_kind_spark
        elif language == Constants.lang_python:
            return Constants.session_kind_pyspark
        elif language == Constants.lang_r:
            return Constants.session_kind_sparkr
        else:
            raise ValueError("Cannot get session kind for {}.".format(language))
class ClientManagerStateSerializer(object):
    """Livy client manager state serializer"""

    def __init__(self, reader_writer):
        assert reader_writer is not None

        self.logger = Log("ClientManagerStateSerializer")
        self._ipython_display = IpythonDisplay()

        self._reader_writer = reader_writer

    def deserialize_state(self):
        self.logger.debug("Deserializing state.")

        clients_to_return = []

        lines = self._reader_writer.read_lines()
        line = ''.join(lines).strip()

        if line != '':
            self.logger.debug("Read content. Converting to JSON.")
            json_str = json.loads(line)
            clients = json_str["clients"]

            for client in clients:
                # Ignore version for now
                name = client["name"]
                session_id = client["id"]
                sql_context_created = client["sqlcontext"]
                kind = client["kind"].lower()
                connection_string = client["connectionstring"]

                session = self._create_livy_session(connection_string, {"kind": kind}, self._ipython_display,
                                                    session_id, sql_context_created)

                # Do not start session automatically. Just create it but skip is not existent.
                try:
                    # Get status to know if it's alive or not.
                    status = session.status
                    if not session.is_final_status(status):
                        self.logger.debug("Adding session {}".format(session_id))
                        client_obj = self._create_livy_client(session)
                        clients_to_return.append((name, client_obj))
                    else:
                        self.logger.error("Skipping serialized session '{}' because session was in status {}."
                                          .format(session.id, status))
                except (ValueError, ConnectionError) as e:
                    self.logger.error("Skipping serialized session '{}' because {}".format(session.id, str(e)))
        else:
            self.logger.debug("Empty manager state found.")

        return clients_to_return

    def serialize_state(self, name_client_dictionary):
        self.logger.debug("Serializing state.")

        serialized_clients = []
        for name in list(name_client_dictionary.keys()):
            client = name_client_dictionary[name]
            serialized_client = client.serialize()
            serialized_client["name"] = name
            serialized_clients.append(serialized_client)

        serialized_str = json.dumps({"clients": serialized_clients})
        self._reader_writer.overwrite_with_line(serialized_str)

    def _create_livy_session(self, connection_string, properties, ipython_display,
                             session_id, sql_context_created):
        return LivySession.from_connection_string(connection_string, properties, ipython_display,
                                                  session_id, sql_context_created)

    def _create_livy_client(self, session):
        return LivyClient(session)
Beispiel #12
0
class SparkKernelBase(IPythonKernel):
    def __init__(self,
                 implementation,
                 implementation_version,
                 language,
                 language_version,
                 language_info,
                 session_language,
                 user_code_parser=None,
                 **kwargs):
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.session_language = session_language

        super(SparkKernelBase, self).__init__(**kwargs)

        self.logger = Log("_jupyter_kernel".format(self.session_language))
        self._fatal_error = None
        self.ipython_display = IpythonDisplay()

        if user_code_parser is None:
            self.user_code_parser = UserCodeParser()
        else:
            self.user_code_parser = user_code_parser

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if not kwargs.get("testing", False):
            self._load_magics_extension()
            self._change_language()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def do_execute(self,
                   code,
                   silent,
                   store_history=True,
                   user_expressions=None,
                   allow_stdin=False):
        def f(self):
            if self._fatal_error is not None:
                return self._repeat_fatal_error()

            return self._do_execute(code, silent, store_history,
                                    user_expressions, allow_stdin)

        return wrap_unexpected_exceptions(f, self._complete_cell)(self)

    def do_shutdown(self, restart):
        # Cleanup
        self._delete_session()

        return self._do_shutdown_ipykernel(restart)

    def _do_execute(self, code, silent, store_history, user_expressions,
                    allow_stdin):
        code_to_run = self.user_code_parser.get_code_to_run(code)

        res = self._execute_cell(code_to_run, silent, store_history,
                                 user_expressions, allow_stdin)

        return res

    def _load_magics_extension(self):
        register_magics_code = "%load_ext remotespark.kernels"
        self._execute_cell(
            register_magics_code,
            True,
            False,
            shutdown_if_error=True,
            log_if_error="Failed to load the Spark kernels magics library.")
        self.logger.debug("Loaded magics.")

    def _change_language(self):
        register_magics_code = "%%_do_not_call_change_language -l {}\n ".format(
            self.session_language)
        self._execute_cell(
            register_magics_code,
            True,
            False,
            shutdown_if_error=True,
            log_if_error="Failed to change language to {}.".format(
                self.session_language))
        self.logger.debug("Changed language.")

    def _register_auto_viz(self):
        register_auto_viz_code = """from remotespark.datawidgets.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(
            register_auto_viz_code,
            True,
            False,
            shutdown_if_error=True,
            log_if_error="Failed to register auto viz for notebook.")
        self.logger.debug("Registered auto viz.")

    def _delete_session(self):
        code = "%%_do_not_call_delete_session\n "
        self._execute_cell_for_user(code, True, False)

    def _execute_cell(self,
                      code,
                      silent,
                      store_history=True,
                      user_expressions=None,
                      allow_stdin=False,
                      shutdown_if_error=False,
                      log_if_error=None):
        reply_content = self._execute_cell_for_user(code, silent,
                                                    store_history,
                                                    user_expressions,
                                                    allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(
                    log_if_error, error_from_reply)
                return self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self,
                               code,
                               silent,
                               store_history=True,
                               user_expressions=None,
                               allow_stdin=False):
        return super(SparkKernelBase,
                     self).do_execute(code, silent, store_history,
                                      user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _complete_cell(self):
        """A method that runs a cell with no effect. Call this and return the value it
        returns when there's some sort of error preventing the user's cell from executing; this
        will register the cell from the Jupyter UI as being completed."""
        return self._execute_cell("None", False, True, None, False)

    def _show_user_error(self, message):
        self.logger.error(message)
        self.ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed; does not
        raise an error immediately. We use this for errors that happen on kernel startup,
        since IPython crashes if we throw an exception in the __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        return self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self.logger.error(error)
        self.ipython_display.send_error(error)
        return self._complete_cell()
Beispiel #13
0
class SparkController(object):
    def __init__(self, ipython_display, serialize_path=None):
        self.logger = Log("SparkController")
        self.ipython_display = ipython_display

        if serialize_path is not None:
            serializer = ClientManagerStateSerializer(
                FileSystemReaderWriter(serialize_path))
            self.client_manager = ClientManager(serializer)
        else:
            self.client_manager = ClientManager()

    def get_logs(self, client_name=None):
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.get_logs()

    def run_cell(self, cell, client_name=None):
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.execute(cell)

    def run_cell_sql(self, sqlquery, client_name=None):
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.execute_sql(sqlquery)

    def get_all_sessions_endpoint(self, connection_string):
        http_client = self._http_client_from_connection_string(
            connection_string)
        r = http_client.get("/sessions", [200])
        sessions = r.json()["sessions"]
        session_list = [
            self._create_livy_session(connection_string, {"kind": s["kind"]},
                                      self.ipython_display, s["id"])
            for s in sessions
        ]
        for s in session_list:
            s._refresh_status()
        return session_list

    def get_all_sessions_endpoint_info(self, connection_string):
        sessions = self.get_all_sessions_endpoint(connection_string)
        return [str(s) for s in sessions]

    def cleanup(self):
        self.client_manager.clean_up_all()

    def cleanup_endpoint(self, connection_string):
        for session in self.get_all_sessions_endpoint(connection_string):
            session.delete()

    def delete_session_by_name(self, name):
        self.client_manager.delete_client(name)

    def delete_session_by_id(self, connection_string, session_id):
        http_client = self._http_client_from_connection_string(
            connection_string)
        r = http_client.get("/sessions/{}".format(session_id), [200, 404])
        if r.status_code != 404:
            session = self._create_livy_session(connection_string,
                                                {"kind": r.json()["kind"]},
                                                self.ipython_display,
                                                session_id, False)
            session.delete()

    def add_session(self, name, connection_string, skip_if_exists, properties):
        if skip_if_exists and (name
                               in self.client_manager.get_sessions_list()):
            self.logger.debug(
                "Skipping {} because it already exists in list of sessions.".
                format(name))
            return

        session = self._create_livy_session(connection_string, properties,
                                            self.ipython_display)
        session.start()

        livy_client = self._create_livy_client(session)
        self.client_manager.add_client(name, livy_client)
        livy_client.start()

    def get_session_id_for_client(self, name):
        return self.client_manager.get_session_id_for_client(name)

    def get_client_keys(self):
        return self.client_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        return self.client_manager.get_sessions_info()

    def get_client_by_name_or_default(self, client_name):
        if client_name is None:
            return self.client_manager.get_any_client()
        else:
            client_name = client_name.lower()
            return self.client_manager.get_client(client_name)

    def get_managed_clients(self):
        return self.client_manager.livy_clients

    @staticmethod
    def _create_livy_session(*args, **kwargs):
        return LivySession.from_connection_string(*args, **kwargs)

    @staticmethod
    def _http_client_from_connection_string(connection_string):
        return LivyReliableHttpClient.from_connection_string(connection_string)

    @staticmethod
    def _create_livy_client(session):
        return LivyClient(session)
Beispiel #14
0
class Command(ObjectWithGuid):
    def __init__(self, code, spark_events=None):
        super(Command, self).__init__()
        self.code = textwrap.dedent(code)
        self.logger = Log("Command")
        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

    def __eq__(self, other):
        return self.code == other.code

    def __ne__(self, other):
        return not self == other

    def execute(self, session):
        self._spark_events.emit_statement_execution_start_event(
            session.guid, session.kind, session.id, self.guid)
        statement_id = -1
        try:
            session.wait_for_idle()
            data = {"code": self.code}
            response = session.http_client.post_statement(session.id, data)
            statement_id = response['id']
            output = self._get_statement_output(session, statement_id)
        except Exception as e:
            self._spark_events.emit_statement_execution_end_event(
                session.guid, session.kind, session.id, self.guid,
                statement_id, False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_statement_execution_end_event(
                session.guid, session.kind, session.id, self.guid,
                statement_id, True, "", "")
            return output

    def _get_statement_output(self, session, statement_id):
        statement_running = True
        out = ""
        while statement_running:
            statement = session.http_client.get_statement(
                session.id, statement_id)
            status = statement["state"]

            self.logger.debug("Status of statement {} is {}.".format(
                statement_id, status))

            if status == "running":
                session.sleep()
            else:
                statement_running = False

                statement_output = statement["output"]
                if statement_output["status"] == "ok":
                    out = (True, statement_output["data"]["text/plain"])
                elif statement_output["status"] == "error":
                    out = (False, statement_output["evalue"] + "\n" +
                           "".join(statement_output["traceback"]))
                else:
                    raise LivyUnexpectedStatusException(
                        "Unknown output status from Livy: '{}'".format(
                            statement_output["status"]))

        return out
Beispiel #15
0
class LivySession(object):
    """Session that is livy specific."""

    def __init__(self, http_client, properties, ipython_display,
                 session_id="-1", sql_created=None):
        assert "kind" in list(properties.keys())
        kind = properties["kind"]
        self.properties = properties
        self.ipython_display = ipython_display

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        create_sql_context_timeout_seconds = conf.create_sql_context_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert create_sql_context_timeout_seconds > 0
        if session_id == "-1" and sql_created is True:
            raise ValueError("Cannot indicate sql state without session id.")

        self.logger = Log("LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise ValueError("Session of kind '{}' not supported. Session must be of kinds {}."
                             .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        if session_id == "-1":
            self.status = constants.NOT_STARTED_SESSION_STATUS
            sql_created = False
        else:
            self.status = constants.BUSY_SESSION_STATUS

        self._logs = ""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._create_sql_context_timeout_seconds = create_sql_context_timeout_seconds

        self._state = LivySessionState(session_id, self._http_client.connection_string,
                                       kind, sql_created)

    @staticmethod
    def from_connection_string(connection_string, properties, ipython_display,
                               session_id="-1", sql_created=None):
        http_client = LivyReliableHttpClient.from_connection_string(connection_string)
        return LivySession(http_client, properties, ipython_display, session_id, sql_created)

    def __str__(self):
        return "Session id: {}\tKind: {}\tState: {}".format(self.id, self.kind, self.status)

    def get_state(self):
        return self._state

    def start(self):
        """Start the session against actual livy server."""
        self.logger.debug("Starting '{}' session.".format(self.kind))

        r = self._http_client.post("/sessions", [201], self.properties)
        self._state.session_id = str(r.json()["id"])
        self.status = str(r.json()["state"])

        self.ipython_display.writeln("Creating SparkContext as 'sc'")
        self.logger.debug("Session '{}' started.".format(self.kind))

    def create_sql_context(self):
        """Create a sqlContext object on the session. Object will be accessible via variable 'sqlContext'."""
        if self.started_sql_context:
            return
        self.logger.debug("Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln("Creating HiveContext as 'sqlContext'")
        self._create_context(constants.CONTEXT_NAME_SQL)
        self._state.sql_context_created = True

    def _create_context(self, context_type):
        if context_type == constants.CONTEXT_NAME_SQL:
            command = self._get_sql_context_creation_command()
        else:
            raise ValueError("Cannot create context of type {}.".format(context_type))

        try:
            self.wait_for_idle(self._create_sql_context_timeout_seconds)
            self.execute(command)
            self.logger.debug("Started '{}' {} session.".format(self.kind, context_type))
        except LivyClientTimeoutError:
            raise LivyClientTimeoutError("Failed to create the {} context in time. Timed out after {} seconds."
                                         .format(context_type, self._create_sql_context_timeout_seconds))

    def get_logs(self):
        r = self._http_client.get("/sessions/{}/log?from=0".format(self.id), [200])
        log_array = r.json()['log']
        self._logs = "\n".join(log_array)
        return self._logs

    @property
    def id(self):
        return self._state.session_id

    @property
    def started_sql_context(self):
        return self._state.sql_context_created

    @property
    def kind(self):
        return self._state.kind

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        return status in constants.FINAL_STATUS
    
    def execute(self, commands):
        code = textwrap.dedent(commands)

        data = {"code": code}
        r = self._http_client.post(self._statements_url(), [201], data)
        statement_id = r.json()['id']
        
        return self._get_statement_output(statement_id)

    def delete(self):
        self.logger.debug("Deleting session '{}'".format(self.id))

        if self.status != constants.NOT_STARTED_SESSION_STATUS and self.status != constants.DEAD_SESSION_STATUS:
            self._http_client.delete("/sessions/{}".format(self.id), [200, 404])
            self.status = constants.DEAD_SESSION_STATUS
            self._state.session_id = "-1"
        else:
            raise ValueError("Cannot delete session {} that is in state '{}'."
                             .format(self.id, self.status))

    def wait_for_idle(self, seconds_to_wait):
        """Wait for session to go to idle status. Sleep meanwhile. Calls done every status_sleep_seconds as
        indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.
        """
        self._refresh_status()
        current_status = self.status
        if current_status == constants.IDLE_SESSION_STATUS:
            return

        if current_status in constants.FINAL_STATUS:
            error = "Session {} unexpectedly reached final status '{}'. See logs:\n{}"\
                .format(self.id, current_status, self.get_logs())
            self.logger.error(error)
            raise LivyUnexpectedStatusError(error)

        if seconds_to_wait <= 0.0:
            error = "Session {} did not reach idle status in time. Current status is {}."\
                .format(self.id, current_status)
            self.logger.error(error)
            raise LivyClientTimeoutError(error)

        start_time = time()
        self.logger.debug("Session {} in state {}. Sleeping {} seconds."
                          .format(self.id, current_status, seconds_to_wait))
        sleep(self._status_sleep_seconds)
        elapsed = (time() - start_time)
        return self.wait_for_idle(seconds_to_wait - elapsed)

    def _statements_url(self):
        return "/sessions/{}/statements".format(self.id)

    def _refresh_status(self):
        status = self._http_client.get("/sessions/{}".format(self.id), [200]).json()['state']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
        else:
            raise ValueError("Status '{}' not supported by session.".format(status))

        return self.status
    
    def _get_statement_output(self, statement_id):
        statement_running = True
        out = ""
        while statement_running:
            r = self._http_client.get(self._statements_url(), [200])
            statement = [i for i in r.json()["statements"] if i["id"] == statement_id][0]
            status = statement["state"]

            self.logger.debug("Status of statement {} is {}.".format(statement_id, status))

            if status == "running":
                sleep(self._statement_sleep_seconds)
            else:
                statement_running = False
                
                statement_output = statement["output"]
                if statement_output["status"] == "ok":
                    out = (True, statement_output["data"]["text/plain"])
                elif statement_output["status"] == "error":
                    out = (False, statement_output["evalue"] + "\n" +
                           "".join(statement_output["traceback"]))
                else:
                    raise ValueError("Unknown output status: '{}'".format(statement_output["status"]))

        return out

    def _get_sql_context_creation_command(self):
        if self.kind == constants.SESSION_KIND_SPARK:
            sql_context_command = "val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_PYSPARK:
            sql_context_command = "from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_SPARKR:
            sql_context_command = "sqlContext <- sparkRHive.init(sc)"
        else:
            raise ValueError("Do not know how to create HiveContext in session of kind {}.".format(self.kind))

        return sql_context_command
Beispiel #16
0
class ReliableHttpClient(object):
    """Http client that is reliable in its requests. Uses requests library."""
    def __init__(self, endpoint, headers, retry_policy):
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy
        self.logger = Log("ReliableHttpClient")

        self.verify_ssl = not conf.ignore_ssl_errors()
        if not self.verify_ssl:
            self.logger.debug(
                "ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks."
            )
            requests.packages.urllib3.disable_warnings()

    def compose_url(self, relative_url):
        r_u = "/{}".format(relative_url.rstrip("/").lstrip("/"))
        return self._endpoint.url + r_u

    def get(self, relative_url, accepted_status_codes):
        """Sends a get request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.get)

    def post(self, relative_url, accepted_status_codes, data):
        """Sends a post request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.post, data)

    def delete(self, relative_url, accepted_status_codes):
        """Sends a delete request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes,
                                  requests.delete)

    def _send_request(self,
                      relative_url,
                      accepted_status_codes,
                      function,
                      data=None):
        return self._send_request_helper(self.compose_url(relative_url),
                                         accepted_status_codes, function, data,
                                         0)

    def _send_request_helper(self, url, accepted_status_codes, function, data,
                             retry_count):
        while True:
            try:
                if not self._endpoint.authenticate:
                    if data is None:
                        r = function(url,
                                     headers=self._headers,
                                     verify=self.verify_ssl)
                    else:
                        r = function(url,
                                     headers=self._headers,
                                     data=json.dumps(data),
                                     verify=self.verify_ssl)
                else:
                    if data is None:
                        r = function(url,
                                     headers=self._headers,
                                     auth=(self._endpoint.username,
                                           self._endpoint.password),
                                     verify=self.verify_ssl)
                    else:
                        r = function(url,
                                     headers=self._headers,
                                     auth=(self._endpoint.username,
                                           self._endpoint.password),
                                     data=json.dumps(data),
                                     verify=self.verify_ssl)
            except requests.exceptions.RequestException as e:
                error = True
                r = None
                status = None

                self.logger.error("Request to '{}' failed with '{}'".format(
                    url, e))
            else:
                error = False
                status = r.status_code

            if error or status not in accepted_status_codes:
                if self._retry_policy.should_retry(status, error, retry_count):
                    sleep(self._retry_policy.seconds_to_sleep(retry_count))
                    retry_count += 1
                    continue
                else:
                    raise HttpClientException(
                        "Invalid status code '{}' or error '{}' from {}".
                        format(status, error, url))
            return r
class SparkController(object):

    def __init__(self, ipython_display, serialize_path=None):
        self.logger = Log("SparkController")
        self.ipython_display = ipython_display
        self.client_factory = LivyClientFactory()

        if serialize_path is not None:
            serializer = ClientManagerStateSerializer(self.client_factory, FileSystemReaderWriter(serialize_path))
            self.client_manager = ClientManager(serializer)
        else:
            self.client_manager = ClientManager()

    def get_logs(self, client_name=None):
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.get_logs()

    def run_cell(self, cell, client_name=None):
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.execute(cell)

    def run_cell_sql(self, cell, client_name=None):
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.execute_sql(cell)

    def run_cell_hive(self, cell, client_name=None):
        client_to_use = self.get_client_by_name_or_default(client_name)
        return client_to_use.execute_hive(cell)

    def get_all_sessions_endpoint(self, connection_string):
        http_client = self.client_factory.create_http_client(connection_string)
        r = http_client.get("/sessions", [200])
        sessions = r.json()["sessions"]
        session_list = [self.client_factory.create_session(self.ipython_display, connection_string, {"kind": s["kind"]}, s["id"])
                        for s in sessions]
        for s in session_list:
            s._refresh_status()
        return session_list

    def get_all_sessions_endpoint_info(self, connection_string):
        sessions = self.get_all_sessions_endpoint(connection_string)
        return [str(s) for s in sessions]

    def cleanup(self):
        self.client_manager.clean_up_all()

    def cleanup_endpoint(self, connection_string):
        for session in self.get_all_sessions_endpoint(connection_string):
            session.delete()

    def delete_session_by_name(self, name):
        self.client_manager.delete_client(name)

    def delete_session_by_id(self, connection_string, session_id):
        http_client = self.client_factory.create_http_client(connection_string)
        r = http_client.get("/sessions/{}".format(session_id), [200, 404])
        if r.status_code != 404:
            session = self.client_factory.create_session(self.ipython_display, connection_string, {"kind": r.json()["kind"]}, session_id, False)
            session.delete()

    def add_session(self, name, connection_string, skip_if_exists, properties):
        if skip_if_exists and (name in self.client_manager.get_sessions_list()):
            self.logger.debug("Skipping {} because it already exists in list of sessions.".format(name))
            return

        session = self.client_factory.create_session(self.ipython_display, connection_string, properties, "-1", False)
        session.start()

        livy_client = self.client_factory.build_client(session)
        self.client_manager.add_client(name, livy_client)
        livy_client.start()

    def get_client_keys(self):
        return self.client_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        return self.client_manager.get_sessions_info()

    def get_client_by_name_or_default(self, client_name):
        if client_name is None:
            return self.client_manager.get_any_client()
        else:
            client_name = client_name.lower()
            return self.client_manager.get_client(client_name)
class LivySession(object):
    """Session that is livy specific."""

    def __init__(self, ipython_display, http_client, session_id, sql_created, properties):
        assert "kind" in properties.keys()
        kind = properties["kind"]
        self.properties = properties
        self.ipython_display = ipython_display

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        create_sql_context_timeout_seconds = conf.create_sql_context_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert create_sql_context_timeout_seconds > 0
        if session_id == "-1" and sql_created is True:
            raise ValueError("Cannot indicate sql state without session id.")

        self.logger = Log("LivySession")

        kind = kind.lower()
        if kind not in Constants.session_kinds_supported:
            raise ValueError("Session of kind '{}' not supported. Session must be of kinds {}."
                             .format(kind, ", ".join(Constants.session_kinds_supported)))

        if session_id == "-1":
            self._status = Constants.not_started_session_status
            sql_created = False
        else:
            self._status = Constants.busy_session_status

        self._logs = ""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._create_sql_context_timeout_seconds = create_sql_context_timeout_seconds

        self._state = LivySessionState(session_id, http_client.connection_string,
                                       kind, sql_created)

    def __str__(self):
        return "Session id: {}\tKind: {}\tState: {}".format(self.id, self.kind, self._status)

    def get_state(self):
        return self._state

    def start(self):
        """Start the session against actual livy server."""
        self.logger.debug("Starting '{}' session.".format(self.kind))

        r = self._http_client.post("/sessions", [201], self.properties)
        self._state.session_id = str(r.json()["id"])
        self._status = str(r.json()["state"])

        self.ipython_display.writeln("Creating SparkContext as 'sc'")
        self.logger.debug("Session '{}' started.".format(self.kind))

    def create_sql_context(self):
        """Create a sqlContext object on the session. Object will be accessible via variable 'sqlContext'."""
        if self.started_sql_context:
            return

        self.logger.debug("Starting '{}' sql and hive session.".format(self.kind))

        self.ipython_display.writeln("Creating SqlContext as 'sqlContext'")
        self._create_context(Constants.context_name_sql)

        self.ipython_display.writeln("Creating HiveContext as 'hiveContext'")
        self._create_context(Constants.context_name_hive)

        self._state.sql_context_created = True

    def _create_context(self, context_type):
        if context_type == Constants.context_name_sql:
            command = self._get_sql_context_creation_command()
        elif context_type == Constants.context_name_hive:
            command = self._get_hive_context_creation_command()
        else:
            raise ValueError("Cannot create context of type {}.".format(context_type))

        try:
            self.wait_for_idle(self._create_sql_context_timeout_seconds)
            self.execute(command)
            self.logger.debug("Started '{}' {} session.".format(self.kind, context_type))
        except LivyClientTimeoutError:
            raise LivyClientTimeoutError("Failed to create the {} context in time. Timed out after {} seconds."
                                         .format(context_type, self._create_sql_context_timeout_seconds))

    @property
    def id(self):
        return self._state.session_id

    @property
    def started_sql_context(self):
        return self._state.sql_context_created

    @property
    def kind(self):
        return self._state.kind

    @property
    def logs(self):
        self._refresh_logs()
        return self._logs

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        return status in Constants.final_status
    
    def execute(self, commands):
        code = textwrap.dedent(commands)

        data = {"code": code}
        r = self._http_client.post(self._statements_url(), [201], data)
        statement_id = r.json()['id']
        
        return self._get_statement_output(statement_id)

    def delete(self):
        self.logger.debug("Deleting session '{}'".format(self.id))

        if self._status != Constants.not_started_session_status and self._status != Constants.dead_session_status:
            self._http_client.delete("/sessions/{}".format(self.id), [200, 404])
            self._status = Constants.dead_session_status
            self._state.session_id = "-1"
        else:
            raise ValueError("Cannot delete session {} that is in state '{}'."
                             .format(self.id, self._status))

    def wait_for_idle(self, seconds_to_wait):
        """Wait for session to go to idle status. Sleep meanwhile. Calls done every status_sleep_seconds as
        indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.
        """
        self._refresh_status()
        current_status = self._status
        if current_status == Constants.idle_session_status:
            return

        if current_status in Constants.final_status:
            error = "Session {} unexpectedly reached final status {}. See logs:\n{}"\
                .format(self.id, current_status, self.logs)
            self.logger.error(error)
            raise LivyUnexpectedStatusError(error)

        if seconds_to_wait <= 0.0:
            error = "Session {} did not reach idle status in time. Current status is {}."\
                .format(self.id, current_status)
            self.logger.error(error)
            raise LivyClientTimeoutError(error)

        start_time = time()
        self.logger.debug("Session {} in state {}. Sleeping {} seconds."
                          .format(self.id, current_status, seconds_to_wait))
        sleep(self._status_sleep_seconds)
        elapsed = (time() - start_time)
        return self.wait_for_idle(seconds_to_wait - elapsed)

    def _statements_url(self):
        return "/sessions/{}/statements".format(self.id)

    def _refresh_status(self):
        status = self._get_latest_status()

        if status in Constants.possible_session_status:
            self._status = status
        else:
            raise ValueError("Status '{}' not supported by session.".format(status))

        return self._status

    def _refresh_logs(self):
        self._logs = self._get_latest_logs()

    def _get_latest_status(self):
        r = self._http_client.get("/sessions/{}".format(self.id), [200])
        session = r.json()
                    
        return session['state']

    def _get_latest_logs(self):
        r = self._http_client.get("/sessions/{}/log?from=0".format(self.id), [200])
        log_array = r.json()['log']
        logs = "\n".join(log_array)

        return logs
    
    def _get_statement_output(self, statement_id):
        statement_running = True
        out = ""
        while statement_running:
            r = self._http_client.get(self._statements_url(), [200])
            statement = [i for i in r.json()["statements"] if i["id"] == statement_id][0]
            status = statement["state"]

            self.logger.debug("Status of statement {} is {}.".format(statement_id, status))

            if status == "running":
                sleep(self._statement_sleep_seconds)
            else:
                statement_running = False
                
                statement_output = statement["output"]
                if statement_output["status"] == "ok":
                    out = (True, statement_output["data"]["text/plain"])
                elif statement_output["status"] == "error":
                    out = (False, statement_output["evalue"] + "\n" +
                           "".join(statement_output["traceback"]))
                else:
                    raise ValueError("Unknown output status: '{}'".format(statement_output["status"]))

        return out

    def _get_sql_context_creation_command(self):
        if self.kind == Constants.session_kind_spark:
            sql_context_command = "val sqlContext = new org.apache.spark.sql.SQLContext(sc)\n" \
                                  "import sqlContext.implicits._"
        elif self.kind == Constants.session_kind_pyspark:
            sql_context_command = "from pyspark.sql import SQLContext\nfrom pyspark.sql.types import *\n" \
                                  "sqlContext = SQLContext(sc)"
        elif self.kind == Constants.session_kind_sparkr:
            sql_context_command = "sqlContext <- sparkRSQL.init(sc)"
        else:
            raise ValueError("Do not know how to create sqlContext in session of kind {}.".format(self.kind))

        return sql_context_command

    def _get_hive_context_creation_command(self):
        if self.kind == Constants.session_kind_spark:
            hive_context_command = "val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == Constants.session_kind_pyspark:
            hive_context_command = "from pyspark.sql import HiveContext\nhiveContext = HiveContext(sc)"
        elif self.kind == Constants.session_kind_sparkr:
            hive_context_command = "hiveContext <- sparkRHive.init(sc)"
        else:
            raise ValueError("Do not know how to create hiveContext in session of kind {}.".format(self.kind))

        return hive_context_command
Beispiel #19
0
class ClientManager(object):
    """Livy client manager"""
    def __init__(self, serializer=None):
        serialize_periodically = False
        serialize_period = 3

        if serializer is not None:
            serialize_periodically = conf.serialize_periodically()
            serialize_period = conf.serialize_period_seconds()

        self.logger = Log("ClientManager")

        self._livy_clients = dict()
        self._serializer = serializer
        self._serialize_timer = None

        if self._serializer is not None:
            for (name, client) in self._serializer.deserialize_state():
                self.add_client(name, client)

            if serialize_periodically:
                self._serialize_state_periodically(serialize_period)

    def _serialize_state_periodically(self, serialize_period):
        self.logger.debug("Starting state serialize timer.")

        self._serialize_timer = Timer(serialize_period, self._serialize_state)
        self._serialize_timer.start()

    def _serialize_state(self):
        self._serializer.serialize_state(self._livy_clients)

    @property
    def livy_clients(self):
        return self._livy_clients

    def get_sessions_list(self):
        return list(self._livy_clients.keys())

    def get_sessions_info(self):
        return [
            "Name: {}\t{}".format(k, str(self._livy_clients[k]))
            for k in list(self._livy_clients.keys())
        ]

    def add_client(self, name, livy_client):
        if name in self.get_sessions_list():
            raise ValueError(
                "Session with name '{}' already exists. Please delete the session"
                " first if you intend to replace it.".format(name))

        self._livy_clients[name] = livy_client

    def get_any_client(self):
        number_of_sessions = len(self._livy_clients)
        if number_of_sessions == 1:
            key = self.get_sessions_list()[0]
            return self._livy_clients[key]
        elif number_of_sessions == 0:
            raise AssertionError(
                "You need to have at least 1 client created to execute commands."
            )
        else:
            raise AssertionError(
                "Please specify the client to use. Possible sessions are {}".
                format(self.get_sessions_list()))

    def get_client(self, name):
        if name in self.get_sessions_list():
            return self._livy_clients[name]
        raise ValueError(
            "Could not find '{}' session in list of saved sessions. Possible sessions are {}"
            .format(name, self.get_sessions_list()))

    def get_session_id_for_client(self, name):
        if name in self.get_sessions_list():
            return self._livy_clients[name].session_id
        return None

    def delete_client(self, name):
        self._remove_session(name)

    def clean_up_all(self):
        for name in self.get_sessions_list():
            self._remove_session(name)

        if self._serializer is not None:
            self._serialize_state()

    def _remove_session(self, name):
        if name in self.get_sessions_list():
            self._livy_clients[name].close_session()
            del self._livy_clients[name]
        else:
            raise ValueError(
                "Could not find '{}' session in list of saved sessions. Possible sessions are {}"
                .format(name, self.get_sessions_list()))
Beispiel #20
0
class LivySession(ObjectWithGuid):
    """Session that is livy specific."""

    def __init__(self, http_client, properties, ipython_display,
                 session_id=-1, sql_created=None):
        super(LivySession, self).__init__()
        assert "kind" in list(properties.keys())
        kind = properties["kind"]
        self.properties = properties
        self.ipython_display = ipython_display
        self._spark_events = SparkEvents()

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert wait_for_idle_timeout_seconds > 0
        if session_id == -1 and sql_created is True:
            raise ValueError("Cannot indicate sql state without session id.")

        self.logger = Log("LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise ValueError("Session of kind '{}' not supported. Session must be of kinds {}."
                             .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
            sql_created = False
        else:
            self.status = constants.BUSY_SESSION_STATUS

        self._logs = ""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

        self.kind = kind
        self.id = session_id
        self.created_sql_context = sql_created

    def __str__(self):
        return "Session id: {}\tKind: {}\tState: {}".format(self.id, self.kind, self.status)

    def start(self, create_sql_context=True):
        """Start the session against actual livy server."""
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)

        r = self._http_client.post_session(self.properties)
        self.id = r["id"]
        self.status = str(r["state"])

        self.ipython_display.writeln("Creating SparkContext as 'sc'")
        # We wait for livy_session_startup_timeout_seconds() for the session to start up.
        try:
            self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
        except LivyClientTimeoutError:
            raise LivyClientTimeoutError("Session {} did not start up in {} seconds."\
                                         .format(self.id, conf.livy_session_startup_timeout_seconds()))

        if create_sql_context:
            self.create_sql_context()
        self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status)

    def create_sql_context(self):
        """Create a sqlContext object on the session. Object will be accessible via variable 'sqlContext'."""
        if self.created_sql_context:
            return
        self.logger.debug("Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln("Creating HiveContext as 'sqlContext'")
        command = self._get_sql_context_creation_command()
        try:
            command.execute(self)
        except LivyClientTimeoutError:
            raise LivyClientTimeoutError("Failed to create the SqlContext in time. Timed out after {} seconds."
                                         .format(self._wait_for_idle_timeout_seconds))
        self.created_sql_context = True

    def get_logs(self):
        log_array = self._http_client.get_all_session_logs(self.id)['log']
        self._logs = "\n".join(log_array)
        return self._logs

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        return status in constants.FINAL_STATUS

    def delete(self):
        self.logger.debug("Deleting session '{}'".format(self.id))

        if self.status != constants.NOT_STARTED_SESSION_STATUS and self.status != constants.DEAD_SESSION_STATUS:
            self._http_client.delete_session(self.id)
            self.status = constants.DEAD_SESSION_STATUS
            self.id = -1
        else:
            raise ValueError("Cannot delete session {} that is in state '{}'."
                             .format(self.id, self.status))

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile. Calls done every status_sleep_seconds as
        indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        while True:
            self._refresh_status()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = "Session {} unexpectedly reached final status '{}'. See logs:\n{}"\
                    .format(self.id, self.status, self.get_logs())
                self.logger.error(error)
                raise LivyUnexpectedStatusError(error)

            if seconds_to_wait <= 0.0:
                error = "Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutError(error)

            start_time = time()
            self.logger.debug("Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            seconds_to_wait -= time() - start_time

    def sleep(self):
        sleep(self._statement_sleep_seconds)

    def _refresh_status(self):
        status = self._http_client.get_session(self.id)['state']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
        else:
            raise ValueError("Status '{}' not supported by session.".format(status))

        return self.status

    def _get_sql_context_creation_command(self):
        if self.kind == constants.SESSION_KIND_SPARK:
            sql_context_command = "val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_PYSPARK:
            sql_context_command = "from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_SPARKR:
            sql_context_command = "sqlContext <- sparkRHive.init(sc)"
        else:
            raise ValueError("Do not know how to create HiveContext in session of kind {}.".format(self.kind))

        return Command(sql_context_command)
Beispiel #21
0
class SparkKernelBase(IPythonKernel):
    def __init__(self, implementation, implementation_version, language, language_version, language_info,
                 session_language, user_code_parser=None, **kwargs):
        # Required by Jupyter - Override
        self.implementation = implementation
        self.implementation_version = implementation_version
        self.language = language
        self.language_version = language_version
        self.language_info = language_info

        # Override
        self.session_language = session_language

        super(SparkKernelBase, self).__init__(**kwargs)

        self.logger = Log("_jupyter_kernel".format(self.session_language))
        self._fatal_error = None
        self.ipython_display = IpythonDisplay()

        if user_code_parser is None:
            self.user_code_parser = UserCodeParser()
        else:
            self.user_code_parser = user_code_parser

        # Disable warnings for test env in HDI
        requests.packages.urllib3.disable_warnings()

        if not kwargs.get("testing", False):
            self._load_magics_extension()
            self._change_language()
            if conf.use_auto_viz():
                self._register_auto_viz()

    def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        def f(self):
            if self._fatal_error is not None:
                return self._repeat_fatal_error()

            return self._do_execute(code, silent, store_history, user_expressions, allow_stdin)
        return wrap_unexpected_exceptions(f, self._complete_cell)(self)

    def do_shutdown(self, restart):
        # Cleanup
        self._delete_session()

        return self._do_shutdown_ipykernel(restart)

    def _do_execute(self, code, silent, store_history, user_expressions, allow_stdin):
        code_to_run = self.user_code_parser.get_code_to_run(code)

        res = self._execute_cell(code_to_run, silent, store_history, user_expressions, allow_stdin)

        return res

    def _load_magics_extension(self):
        register_magics_code = "%load_ext remotespark.kernels"
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to load the Spark kernels magics library.")
        self.logger.debug("Loaded magics.")

    def _change_language(self):
        register_magics_code = "%%_do_not_call_change_language -l {}\n ".format(self.session_language)
        self._execute_cell(register_magics_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to change language to {}.".format(self.session_language))
        self.logger.debug("Changed language.")

    def _register_auto_viz(self):
        register_auto_viz_code = """from remotespark.datawidgets.utils import display_dataframe
ip = get_ipython()
ip.display_formatter.ipython_display_formatter.for_type_by_name('pandas.core.frame', 'DataFrame', display_dataframe)"""
        self._execute_cell(register_auto_viz_code, True, False, shutdown_if_error=True,
                           log_if_error="Failed to register auto viz for notebook.")
        self.logger.debug("Registered auto viz.")

    def _delete_session(self):
        code = "%%_do_not_call_delete_session\n "
        self._execute_cell_for_user(code, True, False)

    def _execute_cell(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False,
                      shutdown_if_error=False, log_if_error=None):
        reply_content = self._execute_cell_for_user(code, silent, store_history, user_expressions, allow_stdin)

        if shutdown_if_error and reply_content[u"status"] == u"error":
            error_from_reply = reply_content[u"evalue"]
            if log_if_error is not None:
                message = "{}\nException details:\n\t\"{}\"".format(log_if_error, error_from_reply)
                return self._abort_with_fatal_error(message)

        return reply_content

    def _execute_cell_for_user(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        return super(SparkKernelBase, self).do_execute(code, silent, store_history, user_expressions, allow_stdin)

    def _do_shutdown_ipykernel(self, restart):
        return super(SparkKernelBase, self).do_shutdown(restart)

    def _complete_cell(self):
        """A method that runs a cell with no effect. Call this and return the value it
        returns when there's some sort of error preventing the user's cell from executing; this
        will register the cell from the Jupyter UI as being completed."""
        return self._execute_cell("None", False, True, None, False)

    def _show_user_error(self, message):
        self.logger.error(message)
        self.ipython_display.send_error(message)

    def _queue_fatal_error(self, message):
        """Queues up a fatal error to be thrown when the next cell is executed; does not
        raise an error immediately. We use this for errors that happen on kernel startup,
        since IPython crashes if we throw an exception in the __init__ method."""
        self._fatal_error = message

    def _abort_with_fatal_error(self, message):
        """Queues up a fatal error and throws it immediately."""
        self._queue_fatal_error(message)
        return self._repeat_fatal_error()

    def _repeat_fatal_error(self):
        """Throws an error that has already been queued."""
        error = conf.fatal_error_suggestion().format(self._fatal_error)
        self.logger.error(error)
        self.ipython_display.send_error(error)
        return self._complete_cell()
Beispiel #22
0
class SparkController(object):
    def __init__(self, ipython_display):
        self.logger = Log("SparkController")
        self.ipython_display = ipython_display
        self.session_manager = SessionManager()

    def get_logs(self, client_name=None):
        session_to_use = self.get_session_by_name_or_default(client_name)
        return session_to_use.get_logs()

    def run_command(self, command, client_name=None):
        session_to_use = self.get_session_by_name_or_default(client_name)
        return command.execute(session_to_use)

    def run_sqlquery(self, sqlquery, client_name=None):
        session_to_use = self.get_session_by_name_or_default(client_name)
        return sqlquery.execute(session_to_use)

    def get_all_sessions_endpoint(self, endpoint):
        http_client = self._http_client(endpoint)
        sessions = http_client.get_sessions()["sessions"]
        session_list = [
            self._livy_session(http_client, {"kind": s["kind"]},
                               self.ipython_display, s["id"]) for s in sessions
        ]
        for s in session_list:
            s.refresh_status()
        return session_list

    def get_all_sessions_endpoint_info(self, endpoint):
        sessions = self.get_all_sessions_endpoint(endpoint)
        return [str(s) for s in sessions]

    def cleanup(self):
        self.session_manager.clean_up_all()

    def cleanup_endpoint(self, endpoint):
        for session in self.get_all_sessions_endpoint(endpoint):
            session.delete()

    def delete_session_by_name(self, name):
        self.session_manager.delete_client(name)

    def delete_session_by_id(self, endpoint, session_id):
        http_client = self._http_client(endpoint)
        response = http_client.get_session(session_id)
        http_client = self._http_client(endpoint)
        session = self._livy_session(http_client, {"kind": response["kind"]},
                                     self.ipython_display, session_id, False)
        session.delete()

    def add_session(self, name, endpoint, skip_if_exists, properties):
        if skip_if_exists and (name
                               in self.session_manager.get_sessions_list()):
            self.logger.debug(
                "Skipping {} because it already exists in list of sessions.".
                format(name))
            return
        http_client = self._http_client(endpoint)
        session = self._livy_session(http_client, properties,
                                     self.ipython_display)
        self.session_manager.add_session(name, session)
        session.start()

    def get_session_id_for_client(self, name):
        return self.session_manager.get_session_id_for_client(name)

    def get_client_keys(self):
        return self.session_manager.get_sessions_list()

    def get_manager_sessions_str(self):
        return self.session_manager.get_sessions_info()

    def get_session_by_name_or_default(self, client_name):
        if client_name is None:
            return self.session_manager.get_any_session()
        else:
            client_name = client_name.lower()
            return self.session_manager.get_session(client_name)

    def get_managed_clients(self):
        return self.session_manager.sessions

    @staticmethod
    def _livy_session(http_client,
                      properties,
                      ipython_display,
                      session_id=-1,
                      sql_created=None):
        return LivySession(http_client, properties, ipython_display,
                           session_id, sql_created)

    @staticmethod
    def _http_client(endpoint):
        return LivyReliableHttpClient.from_endpoint(endpoint)
Beispiel #23
0
class ReliableHttpClient(object):
    """Http client that is reliable in its requests. Uses requests library."""

    def __init__(self, endpoint, headers, retry_policy):
        self._endpoint = endpoint
        self._headers = headers
        self._retry_policy = retry_policy
        self.logger = Log("ReliableHttpClient")

        self.verify_ssl = not conf.ignore_ssl_errors()
        if not self.verify_ssl:
            self.logger.debug("ATTENTION: Will ignore SSL errors. This might render you vulnerable to attacks.")
            requests.packages.urllib3.disable_warnings()

    def compose_url(self, relative_url):
        r_u = "/{}".format(relative_url.rstrip("/").lstrip("/"))
        return self._endpoint.url + r_u

    def get(self, relative_url, accepted_status_codes):
        """Sends a get request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.get)

    def post(self, relative_url, accepted_status_codes, data):
        """Sends a post request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.post, data)

    def delete(self, relative_url, accepted_status_codes):
        """Sends a delete request. Returns a response."""
        return self._send_request(relative_url, accepted_status_codes, requests.delete)

    def _send_request(self, relative_url, accepted_status_codes, function, data=None):
        return self._send_request_helper(self.compose_url(relative_url), accepted_status_codes, function, data, 0)

    def _send_request_helper(self, url, accepted_status_codes, function, data, retry_count):
        while True:
            try:
                if not self._endpoint.authenticate:
                    if data is None:
                        r = function(url, headers=self._headers, verify=self.verify_ssl)
                    else:
                        r = function(url, headers=self._headers, data=json.dumps(data), verify=self.verify_ssl)
                else:
                    if data is None:
                        r = function(url, headers=self._headers, auth=(self._endpoint.username, self._endpoint.password),
                                     verify=self.verify_ssl)
                    else:
                        r = function(url, headers=self._headers, auth=(self._endpoint.username, self._endpoint.password),
                                     data=json.dumps(data), verify=self.verify_ssl)
            except requests.exceptions.RequestException as e:
                error = True
                r = None
                status = None

                self.logger.error("Request to '{}' failed with '{}'".format(url, e))
            else:
                error = False
                status = r.status_code

            if error or status not in accepted_status_codes:
                if self._retry_policy.should_retry(status, error, retry_count):
                    sleep(self._retry_policy.seconds_to_sleep(retry_count))
                    retry_count += 1
                    continue
                else:
                    raise HttpClientException("Invalid status code '{}' or error '{}' from {}"
                                              .format(status, error, url))
            return r
Beispiel #24
0
class LivySession(ObjectWithGuid):
    def __init__(self, http_client, properties, ipython_display,
                 session_id=-1, sql_created=None, spark_events=None):
        super(LivySession, self).__init__()
        assert "kind" in list(properties.keys())
        kind = properties["kind"]
        self.properties = properties
        self.ipython_display = ipython_display

        if spark_events is None:
            spark_events = SparkEvents()
        self._spark_events = spark_events

        status_sleep_seconds = conf.status_sleep_seconds()
        statement_sleep_seconds = conf.statement_sleep_seconds()
        wait_for_idle_timeout_seconds = conf.wait_for_idle_timeout_seconds()

        assert status_sleep_seconds > 0
        assert statement_sleep_seconds > 0
        assert wait_for_idle_timeout_seconds > 0
        if session_id == -1 and sql_created is True:
            raise BadUserDataException("Cannot indicate sql state without session id.")

        self.logger = Log("LivySession")

        kind = kind.lower()
        if kind not in constants.SESSION_KINDS_SUPPORTED:
            raise BadUserDataException("Session of kind '{}' not supported. Session must be of kinds {}."
                                       .format(kind, ", ".join(constants.SESSION_KINDS_SUPPORTED)))

        if session_id == -1:
            self.status = constants.NOT_STARTED_SESSION_STATUS
            sql_created = False
        else:
            self.status = constants.BUSY_SESSION_STATUS

        self._logs = ""
        self._http_client = http_client
        self._status_sleep_seconds = status_sleep_seconds
        self._statement_sleep_seconds = statement_sleep_seconds
        self._wait_for_idle_timeout_seconds = wait_for_idle_timeout_seconds

        self.kind = kind
        self.id = session_id
        self.created_sql_context = sql_created

    def __str__(self):
        return "Session id: {}\tKind: {}\tState: {}".format(self.id, self.kind, self.status)

    def start(self, create_sql_context=True):
        """Start the session against actual livy server."""
        self._spark_events.emit_session_creation_start_event(self.guid, self.kind)

        try:
            r = self._http_client.post_session(self.properties)
            self.id = r["id"]
            self.status = str(r["state"])

            self.ipython_display.writeln("Creating SparkContext as 'sc'")
            # We wait for livy_session_startup_timeout_seconds() for the session to start up.
            try:
                self.wait_for_idle(conf.livy_session_startup_timeout_seconds())
            except LivyClientTimeoutException:
                raise LivyClientTimeoutException("Session {} did not start up in {} seconds."
                                                 .format(self.id, conf.livy_session_startup_timeout_seconds()))

            if create_sql_context:
                self.create_sql_context()
        except Exception as e:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status,
                                                               False, e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_creation_end_event(self.guid, self.kind, self.id, self.status, True, "", "")

    def create_sql_context(self):
        """Create a sqlContext object on the session. Object will be accessible via variable 'sqlContext'."""
        if self.created_sql_context:
            return
        self.logger.debug("Starting '{}' hive session.".format(self.kind))
        self.ipython_display.writeln("Creating HiveContext as 'sqlContext'")
        command = self._get_sql_context_creation_command()
        try:
            (success, out) = command.execute(self)
        except LivyClientTimeoutException:
            raise LivyClientTimeoutException("Failed to create the SqlContext in time. Timed out after {} seconds."
                                             .format(self._wait_for_idle_timeout_seconds))
        if success:
            self.created_sql_context = True
        else:
            raise FailedToCreateSqlContextException("Failed to create the SqlContext.\nError: '{}'".format(out))

    def get_logs(self):
        log_array = self._http_client.get_all_session_logs(self.id)['log']
        self._logs = "\n".join(log_array)
        return self._logs

    @property
    def http_client(self):
        return self._http_client

    @staticmethod
    def is_final_status(status):
        return status in constants.FINAL_STATUS

    def delete(self):
        session_id = self.id
        self._spark_events.emit_session_deletion_start_event(self.guid, self.kind, session_id, self.status)

        try:
            self.logger.debug("Deleting session '{}'".format(session_id))

            if self.status != constants.NOT_STARTED_SESSION_STATUS and self.status != constants.DEAD_SESSION_STATUS:
                self._http_client.delete_session(session_id)
                self.status = constants.DEAD_SESSION_STATUS
                self.id = -1
            else:
                self.ipython_display.send_error("Cannot delete session {} that is in state '{}'."
                                                .format(session_id, self.status))
        except Exception as e:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, session_id, self.status, False,
                                                               e.__class__.__name__, str(e))
            raise
        else:
            self._spark_events.emit_session_deletion_end_event(self.guid, self.kind, session_id, self.status, True, "", "")

    def wait_for_idle(self, seconds_to_wait=None):
        """Wait for session to go to idle status. Sleep meanwhile. Calls done every status_sleep_seconds as
        indicated by the constructor.

        Parameters:
            seconds_to_wait : number of seconds to wait before giving up.
        """
        if seconds_to_wait is None:
            seconds_to_wait = self._wait_for_idle_timeout_seconds

        while True:
            self.refresh_status()
            if self.status == constants.IDLE_SESSION_STATUS:
                return

            if self.status in constants.FINAL_STATUS:
                error = "Session {} unexpectedly reached final status '{}'."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyUnexpectedStatusException('{} See logs:\n{}'.format(error, self.get_logs()))

            if seconds_to_wait <= 0.0:
                error = "Session {} did not reach idle status in time. Current status is {}."\
                    .format(self.id, self.status)
                self.logger.error(error)
                raise LivyClientTimeoutException(error)

            start_time = time()
            self.logger.debug("Session {} in state {}. Sleeping {} seconds."
                              .format(self.id, self.status, self._status_sleep_seconds))
            sleep(self._status_sleep_seconds)
            seconds_to_wait -= time() - start_time

    def sleep(self):
        sleep(self._statement_sleep_seconds)

    def refresh_status(self):
        status = self._http_client.get_session(self.id)['state']

        if status in constants.POSSIBLE_SESSION_STATUS:
            self.status = status
        else:
            raise LivyUnexpectedStatusException("Status '{}' not supported by session.".format(status))

        return self.status

    def _get_sql_context_creation_command(self):
        if self.kind == constants.SESSION_KIND_SPARK:
            sql_context_command = "val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_PYSPARK:
            sql_context_command = "from pyspark.sql import HiveContext\nsqlContext = HiveContext(sc)"
        elif self.kind == constants.SESSION_KIND_SPARKR:
            sql_context_command = "sqlContext <- sparkRHive.init(sc)"
        else:
            raise BadUserDataException("Do not know how to create HiveContext in session of kind {}.".format(self.kind))

        return Command(sql_context_command)