Exemple #1
0
def test_list_sessions(mocker, server):
    """``list_sessions`` returns what ``Session.from_json`` produces."""
    from_json = mocker.patch.object(Session, "from_json")

    livy = LivyClient(server)
    listed = livy.list_sessions()

    # The result must be built from the single mocked session payload.
    assert listed == [from_json.return_value]
    from_json.assert_called_once_with(MOCK_SESSION_JSON)
Exemple #2
0
def test_get_session(mocker, server, auth):
    """``get_session`` returns the object built by ``Session.from_json``."""
    from_json = mocker.patch.object(Session, "from_json")

    livy = LivyClient(server, auth)
    fetched = livy.get_session(MOCK_SESSION_ID)

    # The payload handed to the parser must be the mocked session JSON.
    assert fetched == from_json.return_value
    from_json.assert_called_once_with(MOCK_SESSION_JSON)
Exemple #3
0
def test_verify(requests_mock, mocker, verify):
    """The ``verify`` option given to the client reaches the HTTP request."""
    base_url = "http://example.com"
    requests_mock.get(f"{base_url}/sessions", json={"sessions": []})
    mocker.patch.object(Session, "from_json")

    livy = LivyClient(base_url, verify=verify)
    livy.list_sessions()

    # Exactly one request is expected; unpacking fails otherwise.
    (sent,) = requests_mock.request_history
    assert sent.verify is verify
Exemple #4
0
def test_create_session(mocker, server):
    """``create_session`` returns the object built by ``Session.from_json``."""
    from_json = mocker.patch.object(Session, "from_json")

    livy = LivyClient(server)
    created = livy.create_session(SessionKind.PYSPARK)

    # The parser must have been fed the mocked session payload exactly once.
    assert created == from_json.return_value
    from_json.assert_called_once_with(MOCK_SESSION_JSON)
Exemple #5
0
def test_list_statements(mocker, server, auth):
    """``list_statements`` returns what ``Statement.from_json`` produces."""
    from_json = mocker.patch.object(Statement, "from_json")

    livy = LivyClient(server, auth)
    listed = livy.list_statements(MOCK_SESSION_ID)

    # The parser receives the session ID together with the statement payload.
    assert listed == [from_json.return_value]
    from_json.assert_called_once_with(MOCK_SESSION_ID, MOCK_STATEMENT_JSON)
Exemple #6
0
 def __init__(
     self,
     url: str,
     batch_id: int,
     auth: Auth = None,
     verify: Verify = True,
     requests_session: requests.Session = None,
 ) -> None:
     """Store the batch ID and build the ``LivyClient`` used for requests.

     :param url: The URL of the Livy server.
     :param batch_id: Stored unchanged as ``self.batch_id``.
     :param auth: A requests-compatible auth object to use when making
         requests.
     :param verify: Forwarded to ``LivyClient`` as its third argument.
     :param requests_session: Forwarded to ``LivyClient`` as its fourth
         argument.
     """
     connection_args = (url, auth, verify, requests_session)
     self.client = LivyClient(*connection_args)
     self.batch_id = batch_id
Exemple #7
0
def test_delete_session(requests_mock):
    """``delete_session`` issues a request against the mocked server."""
    endpoint = f"http://example.com/sessions/{MOCK_SESSION_ID}"
    requests_mock.delete(endpoint, json={"msg": "deleted"})

    livy = LivyClient("http://example.com")
    livy.delete_session(MOCK_SESSION_ID)

    # Only the DELETE endpoint is registered, so any other call would fail.
    assert requests_mock.called
Exemple #8
0
def test_get_statement(mocker, server):
    """``get_statement`` returns the object built by ``Statement.from_json``."""
    from_json = mocker.patch.object(Statement, "from_json")

    livy = LivyClient(server)
    fetched = livy.get_statement(MOCK_SESSION_ID, MOCK_STATEMENT_ID)

    # The parser receives the session ID together with the statement payload.
    assert fetched == from_json.return_value
    from_json.assert_called_once_with(MOCK_SESSION_ID, MOCK_STATEMENT_JSON)
def test_list_sessions(requests_mock, mocker):
    """``list_sessions`` parses each entry of the ``sessions`` response."""
    base_url = "http://example.com"
    requests_mock.get(
        f"{base_url}/sessions",
        json={"sessions": [MOCK_SESSION_JSON]},
    )
    from_json = mocker.patch.object(Session, "from_json")

    livy = LivyClient(base_url)
    listed = livy.list_sessions()

    # One payload in the response means one parsed session in the result.
    assert listed == [from_json.return_value]
    from_json.assert_called_once_with(MOCK_SESSION_JSON)
Exemple #10
0
def test_create_statement(mocker, server, auth):
    """``create_statement`` returns what ``Statement.from_json`` builds."""
    from_json = mocker.patch.object(Statement, "from_json")

    livy = LivyClient(server, auth)
    created = livy.create_statement(
        MOCK_SESSION_ID,
        MOCK_CODE,
        StatementKind.PYSPARK,
    )

    # The parser receives the session ID together with the statement payload.
    assert created == from_json.return_value
    from_json.assert_called_once_with(MOCK_SESSION_ID, MOCK_STATEMENT_JSON)
Exemple #11
0
 def __init__(
     self,
     url: str,
     kind: SessionKind = SessionKind.PYSPARK,
     spark_conf: Dict[str, Any] = None,
     echo: bool = True,
     check: bool = True,
 ) -> None:
     """Build the client and record the session settings.

     No session ID is known at this point, so ``session_id`` starts as
     ``None``.

     :param url: The URL of the Livy server.
     :param kind: The kind of session to create.
     :param spark_conf: Spark configuration properties.
     :param echo: Whether to echo output printed in the remote session.
     :param check: Whether to raise an exception when a statement in the
         remote session fails.
     """
     self.client = LivyClient(url)

     # Settings describing the session.
     self.kind = kind
     self.spark_conf = spark_conf

     # Flags stored for later use when running statements.
     self.echo = echo
     self.check = check

     self.session_id: Optional[int] = None
def test_get_batch_log(requests_mock, mocker):
    """``get_batch_log`` returns the object built by ``BatchLog.from_json``."""
    endpoint = f"http://example.com/batches/{MOCK_BATCH_ID}/log"
    requests_mock.get(endpoint, json=MOCK_BATCH_LOG_JSON)
    from_json = mocker.patch.object(BatchLog, "from_json")

    livy = LivyClient("http://example.com")
    fetched_log = livy.get_batch_log(MOCK_BATCH_ID)

    # The parser must have been fed the mocked log payload exactly once.
    assert fetched_log == from_json.return_value
    from_json.assert_called_once_with(MOCK_BATCH_LOG_JSON)
Exemple #13
0
def test_get_session(requests_mock, mocker):
    """``get_session`` parses the JSON served for the session endpoint."""
    endpoint = f"http://example.com/sessions/{MOCK_SESSION_ID}"
    requests_mock.get(endpoint, json=MOCK_SESSION_JSON)
    from_json = mocker.patch.object(Session, "from_json")

    livy = LivyClient("http://example.com")
    fetched = livy.get_session(MOCK_SESSION_ID)

    # The parser must have been fed the mocked session payload exactly once.
    assert fetched == from_json.return_value
    from_json.assert_called_once_with(MOCK_SESSION_JSON)
def test_list_statements(requests_mock, mocker):
    """``list_statements`` parses each entry of the ``statements`` response."""
    endpoint = f"http://example.com/sessions/{MOCK_SESSION_ID}/statements"
    requests_mock.get(endpoint, json={"statements": [MOCK_STATEMENT_JSON]})
    from_json = mocker.patch.object(Statement, "from_json")

    livy = LivyClient("http://example.com")
    listed = livy.list_statements(MOCK_SESSION_ID)

    # The parser receives the session ID together with the statement payload.
    assert listed == [from_json.return_value]
    from_json.assert_called_once_with(MOCK_SESSION_ID, MOCK_STATEMENT_JSON)
Exemple #15
0
def test_create_session(mocker, server, auth):
    """``create_session`` accepts ``proxy_user`` and ``spark_conf`` options."""
    from_json = mocker.patch.object(Session, "from_json")

    session_options = dict(
        proxy_user=MOCK_PROXY_USER,
        spark_conf=MOCK_SPARK_CONF,
    )
    livy = LivyClient(server, auth)
    created = livy.create_session(SessionKind.PYSPARK, **session_options)

    # The parser must have been fed the mocked session payload exactly once.
    assert created == from_json.return_value
    from_json.assert_called_once_with(MOCK_SESSION_JSON)
def test_auth(requests_mock, mocker):
    """The ``auth`` callable given to the client is applied to requests."""
    base_url = "http://example.com"
    requests_mock.get(f"{base_url}/sessions", json={"sessions": []})
    mocker.patch.object(Session, "from_json")

    def add_token(outgoing):
        # Mark the request so the assertion below can detect the hook ran.
        outgoing.headers["Authorization"] = "dummy-token"
        return outgoing

    livy = LivyClient(base_url, auth=add_token)
    livy.list_sessions()

    # Exactly one request is expected; unpacking fails otherwise.
    (sent,) = requests_mock.request_history
    assert sent.headers["Authorization"] == "dummy-token"
def test_get_statement(requests_mock, mocker):
    """``get_statement`` parses the JSON served for the statement endpoint."""
    endpoint = (
        f"http://example.com/sessions/{MOCK_SESSION_ID}"
        f"/statements/{MOCK_STATEMENT_ID}"
    )
    requests_mock.get(endpoint, json=MOCK_STATEMENT_JSON)
    from_json = mocker.patch.object(Statement, "from_json")

    livy = LivyClient("http://example.com")
    fetched = livy.get_statement(MOCK_SESSION_ID, MOCK_STATEMENT_ID)

    # The parser receives the session ID together with the statement payload.
    assert fetched == from_json.return_value
    from_json.assert_called_once_with(MOCK_SESSION_ID, MOCK_STATEMENT_JSON)
Exemple #18
0
 def __init__(
     self,
     url: str,
     session_id: int,
     auth: Auth = None,
     verify: Verify = True,
     requests_session: requests.Session = None,
     kind: SessionKind = SessionKind.PYSPARK,
     echo: bool = True,
     check: bool = True,
 ) -> None:
     """Build the client and record the session ID and settings.

     :param url: The URL of the Livy server.
     :param session_id: The ID of the Livy session.
     :param auth: A requests-compatible auth object to use when making
         requests.
     :param verify: Forwarded to ``LivyClient`` as its third argument.
     :param requests_session: Forwarded to ``LivyClient`` as its fourth
         argument.
     :param kind: The kind of session.
     :param echo: Whether to echo output printed in the remote session.
     :param check: Whether to raise an exception when a statement in the
         remote session fails.
     """
     connection_args = (url, auth, verify, requests_session)
     self.client = LivyClient(*connection_args)

     # Identity of the remote session.
     self.session_id = session_id
     self.kind = kind

     # Flags stored for later use when running statements.
     self.echo = echo
     self.check = check
def test_create_batch(requests_mock, mocker):
    """``create_batch`` posts every option under its camelCase JSON key."""
    base_url = "http://example.com"
    requests_mock.get(
        f"{base_url}/version",
        json={"version": "0.5.0-incubating"},
    )
    requests_mock.post(f"{base_url}/batches", json=MOCK_BATCH_JSON)
    from_json = mocker.patch.object(Batch, "from_json")

    batch_options = dict(
        file=MOCK_BATCH_FILE,
        class_name=MOCK_BATCH_CLASSNAME,
        args=MOCK_BATCH_ARGS,
        proxy_user=MOCK_PROXY_USER,
        jars=MOCK_JARS,
        py_files=MOCK_PY_FILES,
        files=MOCK_FILES,
        driver_memory=MOCK_DRIVER_MEMORY,
        driver_cores=MOCK_DRIVER_CORES,
        executor_memory=MOCK_EXECUTOR_MEMORY,
        executor_cores=MOCK_EXECUTOR_CORES,
        num_executors=MOCK_NUM_EXECUTORS,
        archives=MOCK_ARCHIVES,
        queue=MOCK_QUEUE,
        name=MOCK_NAME,
        spark_conf=MOCK_SPARK_CONF,
    )
    livy = LivyClient(base_url)
    created = livy.create_batch(**batch_options)

    assert created == from_json.return_value
    from_json.assert_called_once_with(MOCK_BATCH_JSON)

    # Body of the most recent request, which should be the POST.
    expected_body = {
        "file": MOCK_BATCH_FILE,
        "proxyUser": MOCK_PROXY_USER,
        "className": MOCK_BATCH_CLASSNAME,
        "args": MOCK_BATCH_ARGS,
        "jars": MOCK_JARS,
        "pyFiles": MOCK_PY_FILES,
        "files": MOCK_FILES,
        "driverMemory": MOCK_DRIVER_MEMORY,
        "driverCores": MOCK_DRIVER_CORES,
        "executorMemory": MOCK_EXECUTOR_MEMORY,
        "executorCores": MOCK_EXECUTOR_CORES,
        "numExecutors": MOCK_NUM_EXECUTORS,
        "archives": MOCK_ARCHIVES,
        "queue": MOCK_QUEUE,
        "name": MOCK_NAME,
        "conf": MOCK_SPARK_CONF,
    }
    assert requests_mock.last_request.json() == expected_body
Exemple #20
0
 def __init__(
     self,
     url: str,
     auth: Auth = None,
     kind: SessionKind = SessionKind.PYSPARK,
     proxy_user: str = None,
     spark_conf: Dict[str, Any] = None,
     echo: bool = True,
     check: bool = True,
 ) -> None:
     """Build the client and record the session settings.

     No session ID is known at this point, so ``session_id`` starts as
     ``None``.

     :param url: The URL of the Livy server.
     :param auth: A requests-compatible auth object to use when making
         requests.
     :param kind: The kind of session to create.
     :param proxy_user: User to impersonate when starting the session.
     :param spark_conf: Spark configuration properties.
     :param echo: Whether to echo output printed in the remote session.
     :param check: Whether to raise an exception when a statement in the
         remote session fails.
     """
     self.client = LivyClient(url, auth)

     # Flags stored for later use when running statements.
     self.check = check
     self.echo = echo

     # Settings describing the session.
     self.spark_conf = spark_conf
     self.proxy_user = proxy_user
     self.kind = kind

     self.session_id: Optional[int] = None
def test_create_session(requests_mock, mocker):
    """``create_session`` posts every option under its JSON key."""
    base_url = "http://example.com"
    requests_mock.get(
        f"{base_url}/version",
        json={"version": "0.5.0-incubating"},
    )
    requests_mock.post(f"{base_url}/sessions", json=MOCK_SESSION_JSON)
    from_json = mocker.patch.object(Session, "from_json")

    session_options = dict(
        proxy_user=MOCK_PROXY_USER,
        jars=MOCK_JARS,
        py_files=MOCK_PY_FILES,
        files=MOCK_FILES,
        driver_memory=MOCK_DRIVER_MEMORY,
        driver_cores=MOCK_DRIVER_CORES,
        executor_memory=MOCK_EXECUTOR_MEMORY,
        executor_cores=MOCK_EXECUTOR_CORES,
        num_executors=MOCK_NUM_EXECUTORS,
        archives=MOCK_ARCHIVES,
        queue=MOCK_QUEUE,
        name=MOCK_NAME,
        spark_conf=MOCK_SPARK_CONF,
        heartbeat_timeout=MOCK_HEARTBEAT_TIMEOUT,
    )
    livy = LivyClient(base_url)
    created = livy.create_session(SessionKind.PYSPARK, **session_options)

    assert created == from_json.return_value
    from_json.assert_called_once_with(MOCK_SESSION_JSON)

    # Body of the most recent request, which should be the POST.
    expected_body = {
        "kind": "pyspark",
        "proxyUser": MOCK_PROXY_USER,
        "conf": MOCK_SPARK_CONF,
        "heartbeatTimeoutInSecond": MOCK_HEARTBEAT_TIMEOUT,
        "jars": MOCK_JARS,
        "pyFiles": MOCK_PY_FILES,
        "files": MOCK_FILES,
        "driverMemory": MOCK_DRIVER_MEMORY,
        "driverCores": MOCK_DRIVER_CORES,
        "executorMemory": MOCK_EXECUTOR_MEMORY,
        "executorCores": MOCK_EXECUTOR_CORES,
        "numExecutors": MOCK_NUM_EXECUTORS,
        "archives": MOCK_ARCHIVES,
        "queue": MOCK_QUEUE,
        "name": MOCK_NAME,
    }
    assert requests_mock.last_request.json() == expected_body
def test_create_statement(requests_mock, mocker):
    """``create_statement`` posts the code and kind as the JSON body."""
    base_url = "http://example.com"
    requests_mock.get(
        f"{base_url}/version",
        json={"version": "0.5.0-incubating"},
    )
    requests_mock.post(
        f"{base_url}/sessions/{MOCK_SESSION_ID}/statements",
        json=MOCK_STATEMENT_JSON,
    )
    from_json = mocker.patch.object(Statement, "from_json")

    livy = LivyClient(base_url)
    created = livy.create_statement(
        MOCK_SESSION_ID,
        MOCK_CODE,
        StatementKind.PYSPARK,
    )

    assert created == from_json.return_value
    from_json.assert_called_once_with(MOCK_SESSION_ID, MOCK_STATEMENT_JSON)

    # Body of the most recent request, which should be the POST.
    expected_body = {"code": MOCK_CODE, "kind": "pyspark"}
    assert requests_mock.last_request.json() == expected_body
Exemple #23
0
 def __init__(
     self,
     url: str,
     auth: Auth = None,
     verify: Verify = True,
     kind: SessionKind = SessionKind.PYSPARK,
     proxy_user: str = None,
     jars: List[str] = None,
     py_files: List[str] = None,
     files: List[str] = None,
     driver_memory: str = None,
     driver_cores: int = None,
     executor_memory: str = None,
     executor_cores: int = None,
     num_executors: int = None,
     archives: List[str] = None,
     queue: str = None,
     name: str = None,
     spark_conf: Dict[str, Any] = None,
     echo: bool = True,
     check: bool = True,
     headers: dict = None,
 ) -> None:
     """Build the client and record the session settings.

     No session ID is known at this point, so ``session_id`` starts as
     ``None``.

     :param url: The URL of the Livy server.
     :param auth: A requests-compatible auth object to use when making
         requests.
     :param verify: Forwarded to ``LivyClient`` as its ``verify`` keyword.
     :param kind: The kind of session to create.
     :param proxy_user: User to impersonate when starting the session.
     :param jars: URLs of jars to be used in this session.
     :param py_files: URLs of Python files to be used in this session.
     :param files: URLs of files to be used in this session.
     :param driver_memory: Amount of memory to use for the driver process
         (e.g. '512m').
     :param driver_cores: Number of cores to use for the driver process.
     :param executor_memory: Amount of memory to use per executor process
         (e.g. '512m').
     :param executor_cores: Number of cores to use for each executor.
     :param num_executors: Number of executors to launch for this session.
     :param archives: URLs of archives to be used in this session.
     :param queue: The name of the YARN queue to submit the session to.
     :param name: The name of this session.
     :param spark_conf: Spark configuration properties.
     :param echo: Whether to echo output printed in the remote session.
     :param check: Whether to raise an exception when a statement in the
         remote session fails.
     :param headers: Forwarded to ``LivyClient`` as its ``headers`` keyword.
         When omitted, a new empty dict is passed.
     """
     # A ``{}`` default would be created once and shared by every instance,
     # so a mutation through one client would leak into all the others.
     # Build a fresh dict per call instead.
     if headers is None:
         headers = {}
     self.client = LivyClient(url, auth, verify=verify, headers=headers)
     self.kind = kind
     self.proxy_user = proxy_user
     self.jars = jars
     self.py_files = py_files
     self.files = files
     self.driver_memory = driver_memory
     self.driver_cores = driver_cores
     self.executor_memory = executor_memory
     self.executor_cores = executor_cores
     self.num_executors = num_executors
     self.archives = archives
     self.queue = queue
     self.name = name
     self.spark_conf = spark_conf
     self.echo = echo
     self.check = check
     self.session_id: Optional[int] = None
Exemple #24
0
 def __init__(
     self,
     url: str,
     file: str,
     auth: Auth = None,
     class_name: str = None,
     args: List[str] = None,
     proxy_user: str = None,
     jars: List[str] = None,
     py_files: List[str] = None,
     files: List[str] = None,
     driver_memory: str = None,
     driver_cores: int = None,
     executor_memory: str = None,
     executor_cores: int = None,
     num_executors: int = None,
     archives: List[str] = None,
     queue: str = None,
     name: str = None,
     spark_conf: Dict[str, Any] = None,
 ) -> None:
     """Build the client and record the batch settings.

     No batch ID is known at this point, so ``batch_id`` starts as ``None``.
     Every other argument is stored unchanged on the attribute of the same
     name.

     :param url: The URL of the Livy server.
     :param file: File containing the application to execute.
     :param auth: A requests-compatible auth object to use when making
         requests.
     """
     self.client = LivyClient(url, auth)

     # What to run.
     self.file = file
     self.class_name = class_name
     self.args = args
     self.proxy_user = proxy_user

     # Extra resources for the job.
     self.jars = jars
     self.py_files = py_files
     self.files = files
     self.archives = archives

     # Resource sizing.
     self.driver_memory = driver_memory
     self.driver_cores = driver_cores
     self.executor_memory = executor_memory
     self.executor_cores = executor_cores
     self.num_executors = num_executors

     # Scheduling and configuration.
     self.queue = queue
     self.name = name
     self.spark_conf = spark_conf

     self.batch_id: Optional[int] = None
def test_custom_requests_session(mocker):
    """A caller-supplied requests session is used and never closed."""
    mocker.patch.object(Session, "from_json")

    # Stand-in HTTP session whose ``request`` returns an empty session list.
    fake_response = mocker.Mock()
    fake_response.json.return_value = {"sessions": []}
    fake_http = mocker.Mock()
    fake_http.request.return_value = fake_response

    livy = LivyClient("http://example.com", requests_session=fake_http)
    livy.list_sessions()

    fake_http.request.assert_called_once()

    # Check that a custom session does not get closed
    livy.close()
    fake_http.close.assert_not_called()
Exemple #26
0
class LivyBatch:
    """Manages a remote Livy batch and high-level interactions with it.

    The py_files, files, jars and archives arguments are lists of URLs, e.g.
    ["s3://bucket/object", "hdfs://path/to/file", ...] and must be reachable by
    the Spark driver process.  If the provided URL has no scheme, it's
    considered to be relative to the default file system configured in the Livy
    server.

    URLs in the py_files argument are copied to a temporary staging area and
    inserted into Python's sys.path ahead of the standard library paths. This
    allows you to import .py, .zip and .egg files in Python.

    URLs for jars, py_files, files and archives arguments are all copied to the
    same working directory on the Spark cluster.

    The driver_memory and executor_memory arguments have the same format as JVM
    memory strings with a size unit suffix ("k", "m", "g" or "t") (e.g. 512m,
    2g).

    See https://spark.apache.org/docs/latest/configuration.html for more
    information on Spark configuration properties.

    :param url: The URL of the Livy server.
    :param file: File containing the application to execute.
    :param auth: A requests-compatible auth object to use when making requests.
    :param class_name: Application Java/Spark main class.
    :param args: Command line arguments for the application.
    :param proxy_user: User to impersonate when starting the session.
    :param jars: URLs of jars to be used in this session.
    :param py_files: URLs of Python files to be used in this session.
    :param files: URLs of files to be used in this session.
    :param driver_memory: Amount of memory to use for the driver process (e.g.
        '512m').
    :param driver_cores: Number of cores to use for the driver process.
    :param executor_memory: Amount of memory to use per executor process (e.g.
        '512m').
    :param executor_cores: Number of cores to use for each executor.
    :param num_executors: Number of executors to launch for this session.
    :param archives: URLs of archives to be used in this session.
    :param queue: The name of the YARN queue to submit the batch to.
    :param name: The name of this session.
    :param spark_conf: Spark configuration properties.
    """

    def __init__(
        self,
        url: str,
        file: str,
        auth: Auth = None,
        class_name: str = None,
        args: List[str] = None,
        proxy_user: str = None,
        jars: List[str] = None,
        py_files: List[str] = None,
        files: List[str] = None,
        driver_memory: str = None,
        driver_cores: int = None,
        executor_memory: str = None,
        executor_cores: int = None,
        num_executors: int = None,
        archives: List[str] = None,
        queue: str = None,
        name: str = None,
        spark_conf: Dict[str, Any] = None,
    ) -> None:
        self.client = LivyClient(url, auth)
        self.file = file
        self.class_name = class_name
        self.args = args
        self.proxy_user = proxy_user
        self.jars = jars
        self.py_files = py_files
        self.files = files
        self.driver_memory = driver_memory
        self.driver_cores = driver_cores
        self.executor_memory = executor_memory
        self.executor_cores = executor_cores
        self.num_executors = num_executors
        self.archives = archives
        self.queue = queue
        self.name = name
        self.spark_conf = spark_conf
        # Set by start(); None means the batch has not been created yet.
        self.batch_id: Optional[int] = None

    def start(self) -> None:
        """Create the batch session.

        Unlike LivySession, this does not wait for the session to be ready.
        """
        batch = self.client.create_batch(
            self.file,
            self.class_name,
            self.args,
            self.proxy_user,
            self.jars,
            self.py_files,
            self.files,
            self.driver_memory,
            self.driver_cores,
            self.executor_memory,
            self.executor_cores,
            self.num_executors,
            self.archives,
            self.queue,
            self.name,
            self.spark_conf,
        )
        self.batch_id = batch.batch_id

    def wait(self) -> SessionState:
        """Wait for the batch session to finish.

        Polls ``state`` until it is one of ``SESSION_STATE_FINISHED`` and
        returns that final state.

        :raises ValueError: If the batch has not been started or can no
            longer be found (raised by ``state``).
        """

        intervals = polling_intervals([0.1, 0.5, 1.0, 3.0], 5.0)

        while True:
            state = self.state
            if state in SESSION_STATE_FINISHED:
                break
            time.sleep(next(intervals))

        return state

    @property
    def state(self) -> SessionState:
        """The state of the managed Spark batch.

        :raises ValueError: If the batch has not been started or can no
            longer be found.
        """
        if self.batch_id is None:
            raise ValueError("batch session not yet started")
        batch = self.client.get_batch(self.batch_id)
        if batch is None:
            raise ValueError(
                "batch session not found - it may have been shut down"
            )
        return batch.state

    def log(self, from_: int = None, size: int = None) -> List[str]:
        """Get logs for this Spark batch.

        :param from_: The line number to start getting logs from.
        :param size: The number of lines of logs to get.
        :raises ValueError: If the batch has not been started or can no
            longer be found.
        """
        if self.batch_id is None:
            raise ValueError("batch session not yet started")
        log = self.client.get_batch_log(self.batch_id, from_, size)
        if log is None:
            raise ValueError(
                "batch session not found - it may have been shut down"
            )
        return log.lines

    def kill(self) -> None:
        """Kill the managed Spark batch session.

        The batch is only deleted if it has been started. The client is
        closed in every case, even when the delete request raises.
        """
        try:
            if self.batch_id is not None:
                self.client.delete_batch(self.batch_id)
        finally:
            # Always release the client so a failed delete does not leak it.
            self.client.close()
Exemple #27
0
    def create(
        cls,
        url: str,
        auth: Auth = None,
        verify: Verify = True,
        requests_session: requests.Session = None,
        kind: SessionKind = SessionKind.PYSPARK,
        proxy_user: str = None,
        jars: List[str] = None,
        py_files: List[str] = None,
        files: List[str] = None,
        driver_memory: str = None,
        driver_cores: int = None,
        executor_memory: str = None,
        executor_cores: int = None,
        num_executors: int = None,
        archives: List[str] = None,
        queue: str = None,
        name: str = None,
        spark_conf: Dict[str, Any] = None,
        heartbeat_timeout: int = None,
        echo: bool = True,
        check: bool = True,
    ) -> "LivySession":
        """Create a new Livy session.

        The py_files, files, jars and archives arguments are lists of URLs,
        e.g. ["s3://bucket/object", "hdfs://path/to/file", ...] and must be
        reachable by the Spark driver process. If the provided URL has no
        scheme, it's considered to be relative to the default file system
        configured in the Livy server.

        URLs in the py_files argument are copied to a temporary staging area
        and inserted into Python's sys.path ahead of the standard library
        paths. This allows you to import .py, .zip and .egg files in Python.

        URLs for jars, py_files, files and archives arguments are all copied to
        the same working directory on the Spark cluster.

        The driver_memory and executor_memory arguments have the same format as
        JVM memory strings with a size unit suffix ("k", "m", "g" or "t") (e.g.
        512m, 2g).

        See https://spark.apache.org/docs/latest/configuration.html for more
        information on Spark configuration properties.

        :param url: The URL of the Livy server.
        :param auth: A requests-compatible auth object to use when making
            requests.
        :param verify: Either a boolean, in which case it controls whether we
            verify the server's TLS certificate, or a string, in which case it
            must be a path to a CA bundle to use. Defaults to ``True``.
        :param requests_session: A specific ``requests.Session`` to use,
            allowing advanced customisation. The caller is responsible for
            closing the session.
        :param kind: The kind of session to create.
        :param proxy_user: User to impersonate when starting the session.
        :param jars: URLs of jars to be used in this session.
        :param py_files: URLs of Python files to be used in this session.
        :param files: URLs of files to be used in this session.
        :param driver_memory: Amount of memory to use for the driver process
            (e.g. '512m').
        :param driver_cores: Number of cores to use for the driver process.
        :param executor_memory: Amount of memory to use per executor process
            (e.g. '512m').
        :param executor_cores: Number of cores to use for each executor.
        :param num_executors: Number of executors to launch for this session.
        :param archives: URLs of archives to be used in this session.
        :param queue: The name of the YARN queue to submit the session to.
        :param name: The name of this session.
        :param spark_conf: Spark configuration properties.
        :param heartbeat_timeout: Optional timeout in seconds after which the
            session is automatically orphaned if no heartbeat is received.
        :param echo: Whether to echo output printed in the remote session.
            Defaults to ``True``.
        :param check: Whether to raise an exception when a statement in the
            remote session fails. Defaults to ``True``.
        :return: A new instance of ``cls`` bound to the created session's ID.
        """
        # This client only issues the creation request; the returned object
        # builds its own client from the same connection arguments.
        client = LivyClient(url, auth, verify, requests_session)
        try:
            session = client.create_session(
                kind,
                proxy_user,
                jars,
                py_files,
                files,
                driver_memory,
                driver_cores,
                executor_memory,
                executor_cores,
                num_executors,
                archives,
                queue,
                name,
                spark_conf,
                heartbeat_timeout,
            )
        finally:
            # Close the temporary client even when session creation fails.
            client.close()
        return cls(
            url,
            session.session_id,
            auth,
            verify,
            requests_session,
            kind,
            echo,
            check,
        )
Exemple #28
0
class LivySession:
    """Manages a remote Livy session and high-level interactions with it.

    :param url: The URL of the Livy server.
    :param session_id: The ID of the Livy session.
    :param auth: A requests-compatible auth object to use when making requests.
    :param verify: Either a boolean, in which case it controls whether we
        verify the server's TLS certificate, or a string, in which case it must
        be a path to a CA bundle to use. Defaults to ``True``.
    :param requests_session: A specific ``requests.Session`` to use, allowing
        advanced customisation. The caller is responsible for closing the
        session.
    :param kind: The kind of session to create.
    :param echo: Whether to echo output printed in the remote session. Defaults
        to ``True``.
    :param check: Whether to raise an exception when a statement in the remote
        session fails. Defaults to ``True``.
    """
    def __init__(
        self,
        url: str,
        session_id: int,
        auth: Auth = None,
        verify: Verify = True,
        requests_session: requests.Session = None,
        kind: SessionKind = SessionKind.PYSPARK,
        echo: bool = True,
        check: bool = True,
    ) -> None:
        self.client = LivyClient(url, auth, verify, requests_session)
        self.session_id = session_id
        self.kind = kind
        self.echo = echo
        self.check = check

    @classmethod
    def create(
        cls,
        url: str,
        auth: Auth = None,
        verify: Verify = True,
        requests_session: requests.Session = None,
        kind: SessionKind = SessionKind.PYSPARK,
        proxy_user: str = None,
        jars: List[str] = None,
        py_files: List[str] = None,
        files: List[str] = None,
        driver_memory: str = None,
        driver_cores: int = None,
        executor_memory: str = None,
        executor_cores: int = None,
        num_executors: int = None,
        archives: List[str] = None,
        queue: str = None,
        name: str = None,
        spark_conf: Dict[str, Any] = None,
        heartbeat_timeout: int = None,
        echo: bool = True,
        check: bool = True,
    ) -> "LivySession":
        """Create a new Livy session.

        The py_files, files, jars and archives arguments are lists of URLs,
        e.g. ["s3://bucket/object", "hdfs://path/to/file", ...] and must be
        reachable by the Spark driver process. If the provided URL has no
        scheme, it's considered to be relative to the default file system
        configured in the Livy server.

        URLs in the py_files argument are copied to a temporary staging area
        and inserted into Python's sys.path ahead of the standard library
        paths. This allows you to import .py, .zip and .egg files in Python.

        URLs for jars, py_files, files and archives arguments are all copied to
        the same working directory on the Spark cluster.

        The driver_memory and executor_memory arguments have the same format as
        JVM memory strings with a size unit suffix ("k", "m", "g" or "t") (e.g.
        512m, 2g).

        See https://spark.apache.org/docs/latest/configuration.html for more
        information on Spark configuration properties.

        :param url: The URL of the Livy server.
        :param auth: A requests-compatible auth object to use when making
            requests.
        :param verify: Either a boolean, in which case it controls whether we
            verify the server's TLS certificate, or a string, in which case it
            must be a path to a CA bundle to use. Defaults to ``True``.
        :param requests_session: A specific ``requests.Session`` to use,
            allowing advanced customisation. The caller is responsible for
            closing the session.
        :param kind: The kind of session to create.
        :param proxy_user: User to impersonate when starting the session.
        :param jars: URLs of jars to be used in this session.
        :param py_files: URLs of Python files to be used in this session.
        :param files: URLs of files to be used in this session.
        :param driver_memory: Amount of memory to use for the driver process
            (e.g. '512m').
        :param driver_cores: Number of cores to use for the driver process.
        :param executor_memory: Amount of memory to use per executor process
            (e.g. '512m').
        :param executor_cores: Number of cores to use for each executor.
        :param num_executors: Number of executors to launch for this session.
        :param archives: URLs of archives to be used in this session.
        :param queue: The name of the YARN queue to submit the session to.
        :param name: The name of this session.
        :param spark_conf: Spark configuration properties.
        :param heartbeat_timeout: Optional timeout in seconds after which the
            session is automatically orphaned if no heartbeat is received.
        :param echo: Whether to echo output printed in the remote session.
            Defaults to ``True``.
        :param check: Whether to raise an exception when a statement in the
            remote session fails. Defaults to ``True``.
        :return: A new instance of ``cls`` bound to the created session's ID.
        """
        # This client only issues the creation request; the returned object
        # builds its own client from the same connection arguments.
        client = LivyClient(url, auth, verify, requests_session)
        try:
            session = client.create_session(
                kind,
                proxy_user,
                jars,
                py_files,
                files,
                driver_memory,
                driver_cores,
                executor_memory,
                executor_cores,
                num_executors,
                archives,
                queue,
                name,
                spark_conf,
                heartbeat_timeout,
            )
        finally:
            # Close the temporary client even when session creation fails.
            client.close()
        return cls(
            url,
            session.session_id,
            auth,
            verify,
            requests_session,
            kind,
            echo,
            check,
        )

    def __enter__(self) -> "LivySession":
        # Entering the context blocks until the session is ready.
        self.wait()
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Leaving the context always kills the remote session.
        self.close()

    def wait(self) -> None:
        """Wait for the session to be ready.

        Polls ``state`` until it is no longer one of
        ``SESSION_STATE_NOT_READY``.

        :raises ValueError: If the session can no longer be found (raised by
            ``state``).
        """
        intervals = polling_intervals([0.1, 0.2, 0.3, 0.5], 1.0)
        while self.state in SESSION_STATE_NOT_READY:
            time.sleep(next(intervals))

    @property
    def state(self) -> SessionState:
        """The state of the managed Spark session.

        :raises ValueError: If the session can no longer be found.
        """
        session = self.client.get_session(self.session_id)
        if session is None:
            raise ValueError("session not found - it may have been shut down")
        return session.state

    def close(self) -> None:
        """Kill the managed Spark session.

        The client is closed even when deleting the session raises.
        """
        try:
            self.client.delete_session(self.session_id)
        finally:
            # Always release the client so a failed delete does not leak it.
            self.client.close()

    def run(self, code: str) -> Output:
        """Run some code in the managed Spark session.

        Prints the output text when ``echo`` is set and the text is non-empty,
        and calls ``raise_for_status`` on the output when ``check`` is set.

        :param code: The code to run.
        :return: The output of the executed statement.
        """
        output = self._execute(code)
        if self.echo and output.text:
            print(output.text)
        if self.check:
            output.raise_for_status()
        return output

    def download(self, dataframe_name: str) -> pandas.DataFrame:
        """Evaluate and download a Spark dataframe from the managed session.

        :param dataframe_name: The name of the Spark dataframe to download.
        :raises RuntimeError: If the statement produced no text output.
        """
        code = _spark_serialise_dataframe_code(dataframe_name, self.kind)
        output = self._execute(code)
        output.raise_for_status()
        if output.text is None:
            raise RuntimeError("statement had no text output")
        return _deserialise_dataframe(output.text)

    def read(self, dataframe_name: str) -> pandas.DataFrame:
        """Evaluate and retrieve a Spark dataframe in the managed session.

        :param dataframe_name: The name of the Spark dataframe to read.

        .. deprecated:: 0.8.0
            Use :meth:`download` instead.
        """
        warnings.warn(
            "LivySession.read is deprecated and will be removed in a future "
            "version. Use LivySession.download instead.",
            DeprecationWarning,
            # Attribute the warning to the caller, not to this wrapper.
            stacklevel=2,
        )
        return self.download(dataframe_name)

    def download_sql(self, query: str) -> pandas.DataFrame:
        """Evaluate a Spark SQL query and download the result.

        :param query: The Spark SQL query to evaluate.
        :raises ValueError: If this is not a SQL session.
        :raises RuntimeError: If the statement produced no JSON output.
        """
        if self.kind != SessionKind.SQL:
            raise ValueError("not a SQL session")
        output = self._execute(query)
        output.raise_for_status()
        if output.json is None:
            raise RuntimeError("statement had no JSON output")
        return _dataframe_from_json_output(output.json)

    def read_sql(self, code: str) -> pandas.DataFrame:
        """Evaluate a Spark SQL statement and retrieve the result.

        :param code: The Spark SQL statement to evaluate.

        .. deprecated:: 0.8.0
            Use :meth:`download_sql` instead.
        """
        warnings.warn(
            "LivySession.read_sql is deprecated and will be removed in a "
            "future version. Use LivySession.download_sql instead.",
            DeprecationWarning,
            # Attribute the warning to the caller, not to this wrapper.
            stacklevel=2,
        )
        return self.download_sql(code)

    def upload(self, dataframe_name: str, data: pandas.DataFrame) -> None:
        """Upload a pandas dataframe to a Spark dataframe in the session.

        :param dataframe_name: The name of the Spark dataframe to create.
        :param data: The pandas dataframe to upload.
        """
        code = _spark_create_dataframe_code(self.kind, dataframe_name, data)
        output = self._execute(code)
        output.raise_for_status()

    def _execute(self, code: str) -> Output:
        """Submit ``code`` as a statement and poll until its output exists.

        :param code: The code to submit to the session.
        :return: The output of the finished statement.
        :raises RuntimeError: If polling stops and the statement has no output.
        """
        statement = self.client.create_statement(self.session_id, code)
        intervals = polling_intervals([0.1, 0.2, 0.3, 0.5], 1.0)

        def waiting_for_output(statement):
            not_finished = statement.state in {
                StatementState.WAITING,
                StatementState.RUNNING,
            }
            # Keep polling while an AVAILABLE statement still has no output.
            available = statement.state == StatementState.AVAILABLE
            return not_finished or (available and statement.output is None)

        while waiting_for_output(statement):
            time.sleep(next(intervals))
            statement = self.client.get_statement(statement.session_id,
                                                  statement.statement_id)

        if statement.output is None:
            raise RuntimeError("statement had no output")

        return statement.output
Exemple #29
0
class LivySession:
    """Manages a remote Livy session and high-level interactions with it.

    The py_files, files, jars and archives arguments are lists of URLs, e.g.
    ["s3://bucket/object", "hdfs://path/to/file", ...] and must be reachable by
    the Spark driver process.  If the provided URL has no scheme, it's
    considered to be relative to the default file system configured in the Livy
    server.

    URLs in the py_files argument are copied to a temporary staging area and
    inserted into Python's sys.path ahead of the standard library paths. This
    allows you to import .py, .zip and .egg files in Python.

    URLs for jars, py_files, files and archives arguments are all copied to the
    same working directory on the Spark cluster.

    The driver_memory and executor_memory arguments have the same format as JVM
    memory strings with a size unit suffix ("k", "m", "g" or "t") (e.g. 512m,
    2g).

    See https://spark.apache.org/docs/latest/configuration.html for more
    information on Spark configuration properties.

    :param url: The URL of the Livy server.
    :param auth: A requests-compatible auth object to use when making requests.
    :param verify: Either a boolean, in which case it controls whether we
        verify the server’s TLS certificate, or a string, in which case it must
        be a path to a CA bundle to use. Defaults to ``True``.
    :param kind: The kind of session to create.
    :param proxy_user: User to impersonate when starting the session.
    :param jars: URLs of jars to be used in this session.
    :param py_files: URLs of Python files to be used in this session.
    :param files: URLs of files to be used in this session.
    :param driver_memory: Amount of memory to use for the driver process (e.g.
        '512m').
    :param driver_cores: Number of cores to use for the driver process.
    :param executor_memory: Amount of memory to use per executor process (e.g.
        '512m').
    :param executor_cores: Number of cores to use for each executor.
    :param num_executors: Number of executors to launch for this session.
    :param archives: URLs of archives to be used in this session.
    :param queue: The name of the YARN queue to which submitted.
    :param name: The name of this session.
    :param spark_conf: Spark configuration properties.
    :param echo: Whether to echo output printed in the remote session. Defaults
        to ``True``.
    :param check: Whether to raise an exception when a statement in the remote
        session fails. Defaults to ``True``.
    """

    def __init__(
        self,
        url: str,
        auth: Auth = None,
        verify: Verify = True,
        kind: SessionKind = SessionKind.PYSPARK,
        proxy_user: str = None,
        jars: List[str] = None,
        py_files: List[str] = None,
        files: List[str] = None,
        driver_memory: str = None,
        driver_cores: int = None,
        executor_memory: str = None,
        executor_cores: int = None,
        num_executors: int = None,
        archives: List[str] = None,
        queue: str = None,
        name: str = None,
        spark_conf: Dict[str, Any] = None,
        echo: bool = True,
        check: bool = True,
    ) -> None:
        self.client = LivyClient(url, auth, verify=verify)
        self.kind = kind
        self.proxy_user = proxy_user
        self.jars = jars
        self.py_files = py_files
        self.files = files
        self.driver_memory = driver_memory
        self.driver_cores = driver_cores
        self.executor_memory = executor_memory
        self.executor_cores = executor_cores
        self.num_executors = num_executors
        self.archives = archives
        self.queue = queue
        self.name = name
        self.spark_conf = spark_conf
        self.echo = echo
        self.check = check
        # Stays None until start() has created the remote session.
        self.session_id: Optional[int] = None

    def __enter__(self) -> "LivySession":
        """Start the session when entering a ``with`` block.

        :return: This session object.
        """
        try:
            self.start()
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so a session that
            # was created but failed while waiting would otherwise be leaked.
            self.close()
            raise
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the session when leaving a ``with`` block."""
        self.close()

    def start(self) -> None:
        """Create the remote Spark session and wait for it to be ready.

        Polling stops as soon as the session state is neither ``NOT_STARTED``
        nor ``STARTING``; the resulting state is not inspected further here.
        """

        session = self.client.create_session(
            self.kind,
            self.proxy_user,
            self.jars,
            self.py_files,
            self.files,
            self.driver_memory,
            self.driver_cores,
            self.executor_memory,
            self.executor_cores,
            self.num_executors,
            self.archives,
            self.queue,
            self.name,
            self.spark_conf,
        )
        self.session_id = session.session_id

        not_ready = {SessionState.NOT_STARTED, SessionState.STARTING}
        intervals = polling_intervals([0.1, 0.2, 0.3, 0.5], 1.0)

        # Each evaluation of self.state performs a fresh lookup via the client.
        while self.state in not_ready:
            time.sleep(next(intervals))

    @property
    def state(self) -> SessionState:
        """The state of the managed Spark session.

        :raises ValueError: If the session has not been started, or if the
            client reports no session for the stored session id.
        """
        if self.session_id is None:
            raise ValueError("session not yet started")
        session = self.client.get_session(self.session_id)
        if session is None:
            raise ValueError("session not found - it may have been shut down")
        return session.state

    def close(self) -> None:
        """Kill the managed Spark session.

        The underlying client is closed even if deleting the remote session
        raises, so the client is not leaked on failure.
        """
        try:
            if self.session_id is not None:
                self.client.delete_session(self.session_id)
        finally:
            self.client.close()

    def run(self, code: str) -> Output:
        """Run some code in the managed Spark session.

        :param code: The code to run.
        :return: The output of the executed statement.
        """
        output = self._execute(code)
        if self.echo and output.text:
            print(output.text)
        if self.check:
            output.raise_for_status()
        return output

    def read(self, dataframe_name: str) -> pandas.DataFrame:
        """Evaluate and retrieve a Spark dataframe in the managed session.

        :param dataframe_name: The name of the Spark dataframe to read.
        :raises RuntimeError: If the statement produced no text output.
        """
        code = serialise_dataframe_code(dataframe_name, self.kind)
        output = self._execute(code)
        output.raise_for_status()
        if output.text is None:
            raise RuntimeError("statement had no text output")
        return deserialise_dataframe(output.text)

    def read_sql(self, code: str) -> pandas.DataFrame:
        """Evaluate a Spark SQL statement and retrieve the result.

        :param code: The Spark SQL statement to evaluate.
        :raises ValueError: If this session's kind is not ``SessionKind.SQL``.
        :raises RuntimeError: If the statement produced no JSON output.
        """
        if self.kind != SessionKind.SQL:
            raise ValueError("not a SQL session")
        output = self._execute(code)
        output.raise_for_status()
        if output.json is None:
            raise RuntimeError("statement had no JSON output")
        return dataframe_from_json_output(output.json)

    def _execute(self, code: str) -> Output:
        """Submit ``code`` as a statement and poll until its output is ready.

        :param code: The code to run in the session.
        :return: The output attached to the finished statement.
        :raises ValueError: If the session has not been started.
        :raises RuntimeError: If polling ends and the statement has no output.
        """
        if self.session_id is None:
            raise ValueError("session not yet started")

        statement = self.client.create_statement(self.session_id, code)

        intervals = polling_intervals([0.1, 0.2, 0.3, 0.5], 1.0)

        def waiting_for_output(statement):
            not_finished = statement.state in {
                StatementState.WAITING,
                StatementState.RUNNING,
            }
            available = statement.state == StatementState.AVAILABLE
            # Keep polling while the statement is marked available but its
            # output has not been attached yet.
            return not_finished or (available and statement.output is None)

        while waiting_for_output(statement):
            time.sleep(next(intervals))
            statement = self.client.get_statement(statement.session_id,
                                                  statement.statement_id)

        if statement.output is None:
            raise RuntimeError("statement had no output")

        return statement.output
Exemple #30
0
class LivySession:
    """Manages a remote Livy session and high-level interactions with it.

    :param url: The URL passed to the underlying ``LivyClient``.
    :param kind: The kind of session to create.
    :param spark_conf: Spark configuration passed on when creating the
        session.
    :param echo: Whether :meth:`run` prints the text output of each statement.
    :param check: Whether :meth:`run` calls ``raise_for_status()`` on each
        statement's output.
    """

    def __init__(self,
                 url: str,
                 kind: SessionKind = SessionKind.PYSPARK,
                 spark_conf: Dict[str, Any] = None,
                 echo: bool = True,
                 check: bool = True) -> None:
        self.client = LivyClient(url)
        self.kind = kind
        self.echo = echo
        self.check = check
        # Stays None until start() has created the remote session.
        self.session_id: Optional[int] = None
        self.spark_conf = spark_conf

    def __enter__(self) -> 'LivySession':
        """Start the session when entering a ``with`` block.

        :return: This session object.
        """
        try:
            self.start()
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so a session that
            # was created but failed while waiting would otherwise be leaked.
            self.close()
            raise
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the session when leaving a ``with`` block."""
        self.close()

    def start(self) -> None:
        """Create the remote session and wait until it has left start-up.

        Polling stops as soon as the session state is neither ``NOT_STARTED``
        nor ``STARTING``; the resulting state is not inspected further here.
        """
        session = self.client.create_session(self.kind, self.spark_conf)
        self.session_id = session.session_id

        not_ready = {SessionState.NOT_STARTED, SessionState.STARTING}
        intervals = polling_intervals([0.1, 0.2, 0.3, 0.5], 1.0)

        # Each evaluation of self.state performs a fresh lookup via the client.
        while self.state in not_ready:
            time.sleep(next(intervals))

    @property
    def state(self) -> SessionState:
        """The state of the managed session.

        :raises ValueError: If the session has not been started, or if the
            client reports no session for the stored session id.
        """
        if self.session_id is None:
            raise ValueError('session not yet started')
        session = self.client.get_session(self.session_id)
        if session is None:
            raise ValueError('session not found - it may have been shut down')
        return session.state

    def close(self) -> None:
        """Delete the managed session (if one was started) and close the client.

        The client is closed even if deleting the remote session raises, so
        the client is not leaked on failure.
        """
        try:
            if self.session_id is not None:
                self.client.delete_session(self.session_id)
        finally:
            self.client.close()

    def run(self, code: str) -> Output:
        """Run some code in the managed session.

        :param code: The code to run.
        :return: The output of the executed statement.
        """
        output = self._execute(code)
        if self.echo and output.text:
            print(output.text)
        if self.check:
            output.raise_for_status()
        return output

    def read(self, dataframe_name: str) -> pandas.DataFrame:
        """Evaluate and retrieve a dataframe from the managed session.

        :param dataframe_name: The name of the dataframe to read.
        :raises RuntimeError: If the statement produced no text output.
        """
        code = serialise_dataframe_code(dataframe_name, self.kind)
        output = self._execute(code)
        output.raise_for_status()
        if output.text is None:
            raise RuntimeError('statement had no text output')
        return deserialise_dataframe(output.text)

    def read_sql(self, code: str) -> pandas.DataFrame:
        """Evaluate a SQL statement and retrieve the result.

        :param code: The SQL statement to evaluate.
        :raises ValueError: If this session's kind is not ``SessionKind.SQL``.
        :raises RuntimeError: If the statement produced no JSON output.
        """
        if self.kind != SessionKind.SQL:
            raise ValueError('not a SQL session')
        output = self._execute(code)
        output.raise_for_status()
        if output.json is None:
            raise RuntimeError('statement had no JSON output')
        return dataframe_from_json_output(output.json)

    def _execute(self, code: str) -> Output:
        """Submit ``code`` as a statement and poll until its output is ready.

        :param code: The code to run in the session.
        :return: The output attached to the finished statement.
        :raises ValueError: If the session has not been started.
        :raises RuntimeError: If polling ends and the statement has no output.
        """
        if self.session_id is None:
            raise ValueError('session not yet started')

        statement = self.client.create_statement(self.session_id, code)

        not_finished = {StatementState.WAITING, StatementState.RUNNING}
        intervals = polling_intervals([0.1, 0.2, 0.3, 0.5], 1.0)

        def waiting_for_output(statement) -> bool:
            if statement.state in not_finished:
                return True
            # Guard against a statement being reported as AVAILABLE before
            # its output has been populated: keep polling instead of failing
            # with a spurious "no output" error.
            return (statement.state == StatementState.AVAILABLE
                    and statement.output is None)

        while waiting_for_output(statement):
            time.sleep(next(intervals))
            statement = self.client.get_statement(statement.session_id,
                                                  statement.statement_id)

        if statement.output is None:
            raise RuntimeError('statement had no output')

        return statement.output