Example #1
0
def test_execute_sql():
    sqlquery = SQLQuery("HERE IS THE QUERY", "take", 100, 0.2)
    result = """{"z":100,"y":50}
{"z":25,"y":10}"""
    result_data = pd.DataFrame([{'z': 100, 'y': 50}, {'z':25, 'y':10}])
    mock_spark_session.kind = "pyspark"
    mock_spark_session.execute.return_value = (True, result)
    result = client.execute_sql(sqlquery)
    assert_frame_equal(result, result_data)
    mock_spark_session.execute.assert_called_once_with(sqlquery.to_command("pyspark"))
Example #2
0
def test_sqlquery_only_columns():
    query = "HERE IS MY SQL QUERY SELECT * FROM CREATE DROP TABLE"
    samplemethod = "take"
    maxrows = 120
    samplefraction = 0.6
    sqlquery = SQLQuery(query, samplemethod, maxrows, samplefraction)
    assert_false(sqlquery.only_columns)
    sqlquery2 = SQLQuery.as_only_columns_query(sqlquery)

    sqlquery.only_columns = True
    assert_equals(sqlquery, sqlquery2)
Example #3
0
def test_sqlquery_only_columns():
    query = "HERE IS MY SQL QUERY SELECT * FROM CREATE DROP TABLE"
    samplemethod = "take"
    maxrows = 120
    samplefraction = 0.6
    sqlquery = SQLQuery(query, samplemethod, maxrows, samplefraction)
    assert_false(sqlquery.only_columns)
    sqlquery2 = sqlquery.to_only_columns_query()

    sqlquery.only_columns = True
    assert_equals(sqlquery, sqlquery2)
Example #4
0
def test_execute_sql():
    sqlquery = SQLQuery("HERE IS THE QUERY", "take", 100, 0.2)
    sqlquery.to_command = MagicMock(return_value=MagicMock())
    result = """{"z":100,"y":50}
{"z":25,"y":10}"""
    sqlquery.to_command.return_value.execute = MagicMock(return_value=(True, result))
    result_data = pd.DataFrame([{'z': 100, 'y': 50}, {'z':25, 'y':10}])
    mock_spark_session = MagicMock()
    mock_spark_session.kind = "pyspark"
    result = sqlquery.execute(mock_spark_session)
    assert_frame_equal(result, result_data)
    sqlquery.to_command.return_value.execute.assert_called_once_with(mock_spark_session)
Example #5
0
def test_unicode_sql():
    query = u"SELECT 'รจ'"

    sqlquery = SQLQuery(query, samplemethod='take', maxrows=120)
    assert_equals(sqlquery._pyspark_command(),
                  Command(u'for {} in sqlContext.sql(u"""{}""").toJSON().take(120): print({}.encode("{}"))'\
                          .format(LONG_RANDOM_VARIABLE_NAME, query,
                                  LONG_RANDOM_VARIABLE_NAME, conf.pyspark_sql_encoding())))
    assert_equals(sqlquery._scala_command(),
                  Command(u'sqlContext.sql("""{}""").toJSON.take(120).foreach(println)'.format(query)))

    try:
        sqlquery._r_command()
        assert False
    except NotImplementedError:
        pass
Example #6
0
def test_execute_sql_no_results():
    global executed_once
    executed_once = False
    sqlquery = SQLQuery("SHOW TABLES", "take", maxrows=-1)
    sqlquery.to_command = MagicMock()
    sqlquery.to_only_columns_query = MagicMock()
    result1 = ""
    result2 = """column_a
THE_SECOND_COLUMN"""
    result_data = pd.DataFrame.from_records([], columns=['column_a', 'THE_SECOND_COLUMN'])
    mock_spark_session = MagicMock()
    sqlquery.to_command.return_value.execute.return_value = (True, result1)
    sqlquery.to_only_columns_query.return_value.to_command.return_value.execute.return_value = (True, result2)
    mock_spark_session.kind = "spark"
    result = sqlquery.execute(mock_spark_session)
    assert_frame_equal(result, result_data)
    sqlquery.to_command.return_value.execute.assert_called_once_with(mock_spark_session)
    sqlquery.to_only_columns_query.return_value.to_command.return_value.execute.assert_called_once_with(mock_spark_session)
Example #7
0
def test_execute_sql_failure_emits_event():
    spark_events = MagicMock()
    sqlquery = SQLQuery("HERE IS THE QUERY", "take", 100, 0.2, spark_events)
    sqlquery.to_command = MagicMock()
    sqlquery.to_command.return_value.execute = MagicMock(side_effect=ValueError('yo'))
    session = MagicMock()
    session.kind = "pyspark"
    try:
        result = sqlquery.execute(session)
        assert False
    except ValueError:
        sqlquery.to_command.return_value.execute.assert_called_once_with(session)
        spark_events.emit_sql_execution_start_event._assert_called_once_with(session.guid, session.kind,
                                                                             session.id, sqlquery.guid)
        spark_events.emit_sql_execution_end_event._assert_called_once_with(session.guid, session.kind,
                                                                           session.id, sqlquery.guid,
                                                                           sqlquery.to_command.return_value.guid,
                                                                           True, "ValueError", "yo")
Example #8
0
 def sql(self, line, cell="", local_ns=None):
     if self._do_not_call_start_session(""):
         args = parse_argstring(self.sql, line)
         sql_query = SQLQuery(cell, args.samplemethod, args.maxrows,
                              args.samplefraction)
         return self.execute_sqlquery(sql_query, None, args.output,
                                      args.quiet)
     else:
         return None
Example #9
0
def test_execute_sql_no_results():
    sqlquery = SQLQuery("SHOW TABLES", "take", maxrows=-1)
    result1 = ""
    result2 = """column_a
THE_SECOND_COLUMN"""
    result_data = pd.DataFrame.from_records([], columns=['column_a', 'THE_SECOND_COLUMN'])
    def calls(c):
        if c == sqlquery.to_command("spark"):
            return True, result1
        else:
            return True, result2
    mock_spark_session.execute = MagicMock(wraps=calls)
    mock_spark_session.kind = "spark"
    result = client.execute_sql(sqlquery)
    assert_frame_equal(result, result_data)
    assert_equals(mock_spark_session.execute.mock_calls,
                  [call(sqlquery.to_command("spark")),
                   call(SQLQuery.as_only_columns_query(sqlquery).to_command("spark"))])
Example #10
0
def test_sqlquery_initializes():
    query = "HERE IS MY SQL QUERY SELECT * FROM CREATE DROP TABLE"
    samplemethod = "take"
    maxrows = 120
    samplefraction = 0.6
    sqlquery = SQLQuery(query, samplemethod, maxrows, samplefraction)
    assert_equals(sqlquery.query, query)
    assert_equals(sqlquery.samplemethod, samplemethod)
    assert_equals(sqlquery.maxrows, maxrows)
    assert_equals(sqlquery.samplefraction, samplefraction)
Example #11
0
def test_execute_sql():
    spark_events = MagicMock()
    sqlquery = SQLQuery("HERE IS THE QUERY", "take", 100, 0.2, spark_events=spark_events)
    sqlquery.to_command = MagicMock(return_value=MagicMock())
    result = """{"z":100,"y":50}
{"z":25,"y":10}"""
    sqlquery.to_command.return_value.execute = MagicMock(return_value=(True, result))
    result_data = pd.DataFrame([{'z': 100, 'y': 50}, {'z':25, 'y':10}])
    session = MagicMock()
    session.kind = "pyspark"
    result = sqlquery.execute(session)
    assert_frame_equal(result, result_data)
    sqlquery.to_command.return_value.execute.assert_called_once_with(session)
    spark_events.emit_sql_execution_start_event._assert_called_once_with(session.guid, session.kind,
                                                                         session.id, sqlquery.guid)
    spark_events.emit_sql_execution_end_event._assert_called_once_with(session.guid, session.kind,
                                                                       session.id, sqlquery.guid,
                                                                       sqlquery.to_command.return_value.guid,
                                                                       True, "", "")
Example #12
0
def test_execute_sql_some_exception():
    sqlquery = SQLQuery("HERE IS THE QUERY", "take", 100, 0.2)
    client.execute = MagicMock(return_value=(False, ''))
    mock_spark_session.kind = "pyspark"

    try:
        result = client.execute_sql(sqlquery)
        assert False
    except DataFrameParseException as e:
        pass
Example #13
0
def test_sql_with_output():
    line = "-o my_var"
    cell = "some spark code"
    magic.execute_sqlquery = MagicMock()

    magic.sql(line, cell)

    spark_controller.add_session.assert_called_once_with(
        magic.session_name, magic.connection_string, False,
        {"kind": constants.SESSION_KIND_PYSPARK})
    magic.execute_sqlquery.assert_called_once_with(SQLQuery(cell), None,
                                                   "my_var", False)
Example #14
0
def test_sql_quiet():
    line = "-q -o Output"
    cell = ""
    magic.execute_sqlquery = MagicMock()

    ret = magic.sql(line, cell)

    spark_controller.add_session.assert_called_once_with(
        magic.session_name, magic.connection_string, False,
        {"kind": constants.SESSION_KIND_PYSPARK})
    magic.execute_sqlquery.assert_called_once_with(SQLQuery(cell), None,
                                                   "Output", True)
Example #15
0
def test_sqlquery_loads_defaults():
    defaults = {
        conf.default_samplemethod.__name__: "sample",
        conf.default_maxrows.__name__: 419,
        conf.default_samplefraction.__name__: 0.99,
    }
    conf.override_all(defaults)
    query = "DROP TABLE USERS;"
    sqlquery = SQLQuery(query)
    assert_equals(sqlquery.query, query)
    assert_equals(sqlquery.samplemethod, defaults[conf.default_samplemethod.__name__])
    assert_equals(sqlquery.maxrows, defaults[conf.default_maxrows.__name__])
    assert_equals(sqlquery.samplefraction, defaults[conf.default_samplefraction.__name__])
Example #16
0
def test_sql_sample_options():
    line = "-q -m sample -n 142 -r 0.3"
    cell = ""
    magic.execute_sqlquery = MagicMock()

    ret = magic.sql(line, cell)

    spark_controller.add_session.assert_called_once_with(
        magic.session_name, magic.connection_string, False,
        {"kind": constants.SESSION_KIND_PYSPARK})
    magic.execute_sqlquery.assert_called_once_with(
        SQLQuery(cell, samplemethod="sample", maxrows=142, samplefraction=0.3),
        None, None, True)
Example #17
0
def test_execute_sql_no_results():
    global executed_once
    executed_once = False
    spark_events = MagicMock()
    sqlquery = SQLQuery("SHOW TABLES", "take", maxrows=-1, spark_events=spark_events)
    sqlquery.to_command = MagicMock()
    sqlquery.to_only_columns_query = MagicMock()
    result1 = ""
    result_data = pd.DataFrame([])
    session = MagicMock()
    sqlquery.to_command.return_value.execute.return_value = (True, result1)
    session.kind = "spark"
    result = sqlquery.execute(session)
    assert_frame_equal(result, result_data)
    sqlquery.to_command.return_value.execute.assert_called_once_with(session)
    spark_events.emit_sql_execution_start_event._assert_called_once_with(session.guid, session.kind,
                                                                         session.id, sqlquery.guid,
                                                                         sqlquery.samplemethod, sqlquery.maxrows,
                                                                         sqlquery.samplefraction)
    spark_events.emit_sql_execution_end_event._assert_called_once_with(session.guid, session.kind,
                                                                       session.id, sqlquery.guid,
                                                                       sqlquery.to_command.return_value.guid,
                                                                       True, "", "")
Example #18
0
def test_df_execution_without_output_var():
    shell = MagicMock()
    shell.user_ns = {}
    magic = SparkMagicBase(None)
    magic.shell = shell

    df = 0
    query = SQLQuery("")
    session = MagicMock()
    output_var = None

    magic.spark_controller = MagicMock()
    magic.spark_controller.run_cell_sql = MagicMock(return_value=df)

    res = magic.execute_sqlquery(query, session, output_var, False)

    magic.spark_controller.run_cell_sql.assert_called_once_with(query, session)
    assert res == df
    assert_equals(list(shell.user_ns.keys()), [])
Example #19
0
def test_df_execution_quiet_with_output_var():
    shell = MagicMock()
    shell.user_ns = {}
    magic = SparkMagicBase(None)
    magic.shell = shell

    df = 0
    cell = SQLQuery("")
    session = MagicMock()
    output_var = "var_name"

    magic.spark_controller = MagicMock()
    magic.spark_controller.run_cell_sql = MagicMock(return_value=df)

    res = magic.execute_sqlquery(cell, session, output_var, True)

    magic.spark_controller.run_cell_sql.assert_called_once_with(cell, session)
    assert res is None
    assert shell.user_ns[output_var] == df
Example #20
0
def test_run_sql_command_parses():
    run_cell_method = MagicMock()
    run_cell_method.return_value = (True, "")
    spark_controller.run_sqlquery = run_cell_method

    command = "-s"
    name = "sessions_name"
    context = "-c"
    context_name = "sql"
    meth = "-m"
    method_name = "sample"
    line = " ".join([command, name, context, context_name, meth, method_name])
    cell = "cell code"

    result = magic.spark(line, cell)

    run_cell_method.assert_called_once_with(
        SQLQuery(cell, samplemethod=method_name), name)
    assert result is not None
Example #21
0
def test_run_sql_command_exception():
    run_cell_method = MagicMock()
    run_cell_method.side_effect = LivyUnexpectedStatusException('WOW')
    spark_controller.run_sqlquery = run_cell_method

    command = "-s"
    name = "sessions_name"
    context = "-c"
    context_name = "sql"
    meth = "-m"
    method_name = "sample"
    line = " ".join([command, name, context, context_name, meth, method_name])
    cell = "cell code"

    result = magic.spark(line, cell)

    run_cell_method.assert_called_once_with(
        SQLQuery(cell, samplemethod=method_name), name)
    ipython_display.send_error.assert_called_once_with(
        EXPECTED_ERROR_MSG.format(run_cell_method.side_effect))
Example #22
0
def test_df_execution_throws():
    shell = MagicMock()
    shell.user_ns = {}
    magic = SparkMagicBase(None)
    magic.shell = shell
    error = "error"

    query = SQLQuery("")
    session = MagicMock()
    output_var = "var_name"

    magic.spark_controller = MagicMock()
    magic.spark_controller.run_cell_sql = MagicMock(
        side_effect=DataFrameParseException(error))

    res = magic.execute_sqlquery(query, session, output_var, False)

    magic.spark_controller.run_cell_sql.assert_called_once_with(query, session)
    assert res is None
    assert_equals(list(shell.user_ns.keys()), [])
Example #23
0
def test_sqlquery_rejects_bad_data():
    query = "HERE IS MY SQL QUERY SELECT * FROM CREATE DROP TABLE"
    samplemethod = "foo"
    _ = SQLQuery(query, samplemethod)
Example #24
0
def test_scala_livy_sql_options():
    query = "abc"

    sqlquery = SQLQuery(query, samplemethod='take', maxrows=100)
    assert_equals(sqlquery._scala_command(),
                  Command('sqlContext.sql("""{}""").toJSON.take(100).foreach(println)'.format(query)))

    sqlquery = SQLQuery(query, samplemethod='take', maxrows=-1)
    assert_equals(sqlquery._scala_command(),
                  Command('sqlContext.sql("""{}""").toJSON.collect.foreach(println)'.format(query)))

    sqlquery = SQLQuery(query, samplemethod='sample', samplefraction=0.25, maxrows=-1)
    assert_equals(sqlquery._scala_command(),
                  Command('sqlContext.sql("""{}""").toJSON.sample(false, 0.25).collect.foreach(println)'.format(query)))

    sqlquery = SQLQuery(query, samplemethod='sample', samplefraction=0.33, maxrows=3234)
    assert_equals(sqlquery._scala_command(),
                  Command('sqlContext.sql("""{}""").toJSON.sample(false, 0.33).take(3234).foreach(println)'.format(query)))
Example #25
0
 def _sqlquery(cell, samplemethod, maxrows, samplefraction):
     return SQLQuery(cell, samplemethod, maxrows, samplefraction)
Example #26
0
    def spark(self, line, cell="", local_ns=None):
        """Magic to execute spark remotely.

           This magic allows you to create a Livy Scala or Python session against a Livy endpoint. Every session can
           be used to execute either Spark code or SparkSQL code by executing against the SQL context in the session.
           When the SQL context is used, the result will be a Pandas dataframe of a sample of the results.

           If invoked with no subcommand, the cell will be executed against the specified session.

           Subcommands
           -----------
           info
               Display the available Livy sessions and other configurations for sessions.
           add
               Add a Livy session. First argument is the name of the session, second argument
               is the language, and third argument is the connection string of the Livy endpoint.
               A fourth argument specifying if session creation can be skipped if it already exists is optional:
               "skip" or empty.
               e.g. `%%spark add test python url=https://sparkcluster.net/livy;username=u;password=p skip`
               or
               e.g. `%%spark add test python url=https://sparkcluster.net/livy;username=u;password=p`
           config
               Override the livy session properties sent to Livy on session creation. All session creations will
               contain these config settings from then on.
               Expected value is a JSON key-value string to be sent as part of the Request Body for the POST /sessions
               endpoint in Livy.
               e.g. `%%spark config {"driverMemory":"1000M", "executorCores":4}`
           run
               Run Spark code against a session.
               e.g. `%%spark -s testsession` will execute the cell code against the testsession previously created
               e.g. `%%spark -s testsession -c sql` will execute the SQL code against the testsession previously created
               e.g. `%%spark -s testsession -c sql -o my_var` will execute the SQL code against the testsession
                        previously created and store the pandas dataframe created in the my_var variable in the
                        Python environment.
           logs
               Returns the logs for a given session.
               e.g. `%%spark logs -s testsession` will return the logs for the testsession previously created
           delete
               Delete a Livy session. Argument is the name of the session to be deleted.
               e.g. `%%spark delete defaultlivy`
           cleanup
               Delete all Livy sessions created by the notebook. No arguments required.
               e.g. `%%spark cleanup`
        """
        usage = "Please look at usage of %spark by executing `%spark?`."
        user_input = line
        args = parse_argstring(self.spark, user_input)

        subcommand = args.command[0].lower()

        try:
            # info
            if subcommand == "info":
                if len(args.command) == 2:
                    connection_string = args.command[1]
                    info_sessions = self.spark_controller.get_all_sessions_endpoint_info(connection_string)
                    self.print_endpoint_info(info_sessions)
                elif len(args.command) == 1:
                    self._print_local_info()
                else:
                    raise ValueError("Subcommand 'info' requires no value or a connection string to show all sessions.\n"
                                     "{}".format(usage))
            # config
            elif subcommand == "config":
                # Would normally do " ".join(args.command[1:]) but parse_argstring removes quotes...
                rest_of_line = user_input[7:]
                conf.override(conf.session_configs.__name__, json.loads(rest_of_line))
            # add
            elif subcommand == "add":
                if len(args.command) != 4 and len(args.command) != 5:
                    raise ValueError("Subcommand 'add' requires three or four arguments.\n{}".format(usage))

                name = args.command[1].lower()
                language = args.command[2].lower()
                connection_string = args.command[3]

                if len(args.command) == 5:
                    skip = args.command[4].lower() == "skip"
                else:
                    skip = False

                properties = conf.get_session_properties(language)

                self.spark_controller.add_session(name, connection_string, skip, properties)
            # delete
            elif subcommand == "delete":
                if len(args.command) == 2:
                    name = args.command[1].lower()
                    self.spark_controller.delete_session_by_name(name)
                elif len(args.command) == 3:
                    connection_string = args.command[1]
                    session_id = args.command[2]
                    self.spark_controller.delete_session_by_id(connection_string, session_id)
                else:
                    raise ValueError("Subcommand 'delete' requires a session name or a connection string and id.\n{}"
                                     .format(usage))
            # cleanup
            elif subcommand == "cleanup":
                if len(args.command) == 2:
                    connection_string = args.command[1]
                    self.spark_controller.cleanup_endpoint(connection_string)
                elif len(args.command) == 1:
                    self.spark_controller.cleanup()
                else:
                    raise ValueError("Subcommand 'cleanup' requires no further values or a connection string to clean up "
                                     "sessions.\n{}".format(usage))
            # logs
            elif subcommand == "logs":
                if len(args.command) == 1:
                    (success, out) = self.spark_controller.get_logs(args.session)
                    if success:
                        self.ipython_display.write(out)
                    else:
                        self.ipython_display.send_error(out)
                else:
                    raise ValueError("Subcommand 'logs' requires no further values.\n{}".format(usage))
            # run
            elif len(subcommand) == 0:
                if args.context == CONTEXT_NAME_SPARK:
                    (success, out) = self.spark_controller.run_cell(cell, args.session)
                    if success:
                        self.ipython_display.write(out)
                    else:
                        self.ipython_display.send_error(out)
                elif args.context == CONTEXT_NAME_SQL:
                    sqlquery = SQLQuery(cell, args.samplemethod, args.maxrows, args.samplefraction)
                    return self.execute_sqlquery(sqlquery, args.session, args.output,
                                                 args.quiet)
                else:
                    raise ValueError("Context '{}' not found".format(args.context))
            # error
            else:
                raise ValueError("Subcommand '{}' not found. {}".format(subcommand, usage))
        except ValueError as err:
            self.ipython_display.send_error("{}".format(err))
Example #27
0
def test_pyspark_livy_sql_options():
    query = "abc"

    sqlquery = SQLQuery(query, samplemethod='take', maxrows=120)
    assert_equals(sqlquery._pyspark_command(),
                  Command('for {} in sqlContext.sql("""{}""").toJSON().take(120): print({})'\
                          .format(LONG_RANDOM_VARIABLE_NAME, query,
                                  LONG_RANDOM_VARIABLE_NAME)))

    sqlquery = SQLQuery(query, samplemethod='take', maxrows=-1)
    assert_equals(sqlquery._pyspark_command(),
                  Command('for {} in sqlContext.sql("""{}""").toJSON().collect(): print({})'\
                          .format(LONG_RANDOM_VARIABLE_NAME, query,
                                  LONG_RANDOM_VARIABLE_NAME)))

    sqlquery = SQLQuery(query, samplemethod='sample', samplefraction=0.25, maxrows=-1)
    assert_equals(sqlquery._pyspark_command(),
                  Command('for {} in sqlContext.sql("""{}""").toJSON().sample(False, 0.25).collect(): print({})'\
                          .format(LONG_RANDOM_VARIABLE_NAME, query,
                                  LONG_RANDOM_VARIABLE_NAME)))

    sqlquery = SQLQuery(query, samplemethod='sample', samplefraction=0.33, maxrows=3234)
    assert_equals(sqlquery._pyspark_command(),
                  Command('for {} in sqlContext.sql("""{}""").toJSON().sample(False, 0.33).take(3234): print({})'\
                          .format(LONG_RANDOM_VARIABLE_NAME, query,
                                  LONG_RANDOM_VARIABLE_NAME)))