Example #1
  def test_get_schemas(self):
    get_schemas_req = TCLIService.TGetSchemasReq()
    get_schemas_req.sessionHandle = self.session_handle
    get_schemas_resp = self.hs2_client.GetSchemas(get_schemas_req)
    TestHS2.check_response(get_schemas_resp)
    fetch_results_req = TCLIService.TFetchResultsReq()
    fetch_results_req.operationHandle = get_schemas_resp.operationHandle
    fetch_results_req.maxRows = 100
    fetch_results_resp = self.hs2_client.FetchResults(fetch_results_req)
    TestHS2.check_response(fetch_results_resp)
    query_id = operation_id_to_query_id(get_schemas_resp.operationHandle.operationId)
    profile_page = self.impalad_test_service.read_query_profile_page(query_id)

    # Test fix for IMPALA-619
    assert "Sql Statement: GET_SCHEMAS" in profile_page
    assert "Query Type: DDL" in profile_page
Example #2
class TestGracefulShutdown(CustomClusterTestSuite, HS2TestSuite):
    IDLE_SHUTDOWN_GRACE_PERIOD_S = 1
    IMPALA_SHUTDOWN_SIGNAL = signal.SIGRTMIN

    @classmethod
    def get_workload(cls):
        return 'functional-query'

    @SkipIfGCS.jira(reason="IMPALA-10562")
    @pytest.mark.execute_serially
    @CustomClusterTestSuite.with_args(
        impalad_args="--shutdown_grace_period_s={grace_period} \
          --hostname={hostname}".format(
            grace_period=IDLE_SHUTDOWN_GRACE_PERIOD_S,
            hostname=socket.gethostname()))
    def test_shutdown_idle(self):
        """Test that idle impalads shut down in a timely manner after the shutdown grace
    period elapses."""
        impalad1 = psutil.Process(self.cluster.impalads[0].get_pid())
        impalad2 = psutil.Process(self.cluster.impalads[1].get_pid())
        impalad3 = psutil.Process(self.cluster.impalads[2].get_pid())

        # Test that a failed shut down from a bogus host or port fails gracefully.
        ex = self.execute_query_expect_failure(
            self.client, ":shutdown('e6c00ca5cd67b567eb96c6ecfb26f05')")
        assert "Could not find IPv4 address for:" in str(ex)
        ex = self.execute_query_expect_failure(
            self.client, ":shutdown('localhost:100000')")
        assert "invalid port:" in str(ex)
        assert (
            "This may be because the port specified is wrong.") not in str(ex)

        # Test that pointing to the wrong thrift service (the HS2 port) fails gracefully-ish.
        thrift_port = 21051  # HS2 port.
        ex = self.execute_query_expect_failure(
            self.client, ":shutdown('localhost:{0}')".format(thrift_port))
        assert ("failed with error 'RemoteShutdown() RPC failed") in str(ex)
        assert ("This may be because the port specified is wrong.") in str(ex)

        # Test RPC error handling with debug action.
        ex = self.execute_query_expect_failure(
            self.client,
            ":shutdown('localhost:27001')",
            query_options={'debug_action': 'CRS_SHUTDOWN_RPC:FAIL'})
        assert 'Rpc to 127.0.0.1:27001 failed with error \'Debug Action: ' \
            'CRS_SHUTDOWN_RPC:FAIL' in str(ex)

        # Test remote shutdown.
        LOG.info("Start remote shutdown {0}".format(time.time()))
        self.execute_query_expect_success(self.client,
                                          ":shutdown('localhost:27001')",
                                          query_options={})

        # Remote shutdown does not require statestore.
        self.cluster.statestored.kill()
        self.cluster.statestored.wait_for_exit()
        self.execute_query_expect_success(self.client,
                                          ":shutdown('localhost:27002')",
                                          query_options={})

        # Test local shutdown, which should succeed even with injected RPC error.
        LOG.info("Start local shutdown {0}".format(time.time()))
        self.execute_query_expect_success(
            self.client,
            ":shutdown('{0}:27000')".format(socket.gethostname()),
            query_options={'debug_action': 'CRS_SHUTDOWN_RPC:FAIL'})

        # Make sure that the impala daemons exit after the shutdown grace period plus a 10
        # second margin of error.
        start_time = time.time()
        LOG.info("Waiting for impalads to exit {0}".format(start_time))
        impalad1.wait()
        LOG.info("First impalad exited {0}".format(time.time()))
        impalad2.wait()
        LOG.info("Second impalad exited {0}".format(time.time()))
        impalad3.wait()
        LOG.info("Third impalad exited {0}".format(time.time()))
        shutdown_duration = time.time() - start_time
        assert shutdown_duration <= self.IDLE_SHUTDOWN_GRACE_PERIOD_S + 10

    EXEC_SHUTDOWN_GRACE_PERIOD_S = 5
    EXEC_SHUTDOWN_DEADLINE_S = 10

    @pytest.mark.execute_serially
    @SkipIfNotHdfsMinicluster.scheduling
    @CustomClusterTestSuite.with_args(
        impalad_args="--shutdown_grace_period_s={grace_period} \
          --shutdown_deadline_s={deadline} \
          --hostname={hostname}".format(
            grace_period=EXEC_SHUTDOWN_GRACE_PERIOD_S,
            deadline=EXEC_SHUTDOWN_DEADLINE_S,
            hostname=socket.gethostname()))
    def test_shutdown_executor(self):
        self.do_test_shutdown_executor(fetch_delay_s=0)

    @pytest.mark.execute_serially
    @SkipIfNotHdfsMinicluster.scheduling
    @CustomClusterTestSuite.with_args(
        impalad_args="--shutdown_grace_period_s={grace_period} \
          --shutdown_deadline_s={deadline} \
          --stress_status_report_delay_ms={status_report_delay_ms} \
          --hostname={hostname}".format(
            grace_period=EXEC_SHUTDOWN_GRACE_PERIOD_S,
            deadline=EXEC_SHUTDOWN_DEADLINE_S,
            status_report_delay_ms=5000,
            hostname=socket.gethostname()))
    def test_shutdown_executor_with_delay(self):
        """Regression test for IMPALA-7931 that adds delays to status reporting and
    to fetching of results to trigger races that previously resulted in query failures."""
        if self.exploration_strategy() != 'exhaustive':
            pytest.skip()
        self.do_test_shutdown_executor(fetch_delay_s=5)

    def do_test_shutdown_executor(self, fetch_delay_s):
        """Implementation of test that shuts down and then restarts an executor. This should
    not disrupt any queries that start after the shutdown or complete before the shutdown
    time limit. The test is parameterized by 'fetch_delay_s', the amount to delay before
    fetching from the query that must survive shutdown of an executor."""
        # Add sleeps to make sure that the query takes a couple of seconds to execute on the
        # executors.
        QUERY = "select count(*) from functional_parquet.alltypes where sleep(1) = bool_col"
        # Subtle: use a splittable file format like text for lineitem so that each backend
        # is guaranteed to get scan ranges that contain some actual rows. With Parquet on
        # S3, the files get broken into 32MB scan ranges and a backend might get unlucky
        # and only get scan ranges that don't contain the midpoint of any row group, and
        # therefore not actually produce any rows.
        SLOW_QUERY = "select count(*) from tpch.lineitem where sleep(1) = l_orderkey"
        SHUTDOWN_EXEC2 = ": shutdown('localhost:27001')"

        # Run this query before shutdown and make sure that it executes successfully on
        # all executors through the shutdown grace period without disruption.
        before_shutdown_handle = self.__exec_and_wait_until_running(QUERY)

        # Run this query which simulates getting stuck in admission control until after
        # the shutdown grace period expires. This demonstrates that queries don't get
        # cancelled if the cluster membership changes while they're waiting for admission.
        before_shutdown_admission_handle = self.execute_query_async(
            QUERY, {'debug_action': 'AC_BEFORE_ADMISSION:SLEEP@30000'})

        # Shut down and wait for the shutdown state to propagate through statestore.
        result = self.execute_query_expect_success(self.client, SHUTDOWN_EXEC2)
        assert parse_shutdown_result(result) == ("{0}s000ms".format(
            self.EXEC_SHUTDOWN_GRACE_PERIOD_S), "{0}s000ms".format(
                self.EXEC_SHUTDOWN_DEADLINE_S), "0", "1")

        # Check that the status is reflected on the debug page.
        web_json = self.cluster.impalads[1].service.get_debug_webpage_json("")
        assert web_json.get('is_quiescing', None) is True, web_json
        assert 'shutdown_status' in web_json, web_json

        self.impalad_test_service.wait_for_num_known_live_backends(
            2,
            timeout=self.EXEC_SHUTDOWN_GRACE_PERIOD_S + 5,
            interval=0.2,
            include_shutting_down=False)

        # Run another query, which shouldn't get scheduled on the new executor. We'll let
        # this query continue running through the full shutdown and restart cycle.
        after_shutdown_handle = self.__exec_and_wait_until_running(QUERY)

        # Wait for the impalad to exit, then start it back up and run another query, which
        # should be scheduled on it again.
        self.cluster.impalads[1].wait_for_exit()

        # Finish fetching results from the first query (which will be buffered on the
        # coordinator) after the backend exits. Add a delay before fetching to ensure
        # that the query is not torn down on the coordinator when the failure is
        # detected by the statestore (see IMPALA-7931).
        assert self.__fetch_and_get_num_backends(QUERY,
                                                 before_shutdown_handle,
                                                 delay_s=fetch_delay_s) == 3

        # Confirm that the query stuck in admission succeeded.
        assert self.__fetch_and_get_num_backends(
            QUERY, before_shutdown_admission_handle, timeout_s=30) == 2

        # Start the impalad back up and run another query, which should be scheduled on it
        # again.
        self.cluster.impalads[1].start()
        self.impalad_test_service.wait_for_num_known_live_backends(
            3, timeout=30, interval=0.2, include_shutting_down=False)
        after_restart_handle = self.__exec_and_wait_until_running(QUERY)

        # The query started while the backend was shut down should not run on that backend.
        assert self.__fetch_and_get_num_backends(QUERY,
                                                 after_shutdown_handle) == 2
        assert self.__fetch_and_get_num_backends(QUERY,
                                                 after_restart_handle) == 3

        # Test that a query will fail when the executor shuts down after the limit.
        deadline_expiry_handle = self.__exec_and_wait_until_running(SLOW_QUERY)
        result = self.execute_query_expect_success(self.client, SHUTDOWN_EXEC2)
        assert parse_shutdown_result(result) == ("{0}s000ms".format(
            self.EXEC_SHUTDOWN_GRACE_PERIOD_S), "{0}s000ms".format(
                self.EXEC_SHUTDOWN_DEADLINE_S), "0", "1")
        self.cluster.impalads[1].wait_for_exit()
        self.__check_deadline_expired(SLOW_QUERY, deadline_expiry_handle)

        # Test that we can reduce the deadline after setting it to a high value.
        # Run a query that will fail as a result of the reduced deadline.
        deadline_expiry_handle = self.__exec_and_wait_until_running(SLOW_QUERY)
        SHUTDOWN_EXEC3 = ": shutdown('localhost:27002', {0})"
        VERY_HIGH_DEADLINE = 5000
        HIGH_DEADLINE = 1000
        LOW_DEADLINE = 5
        result = self.execute_query_expect_success(
            self.client, SHUTDOWN_EXEC3.format(HIGH_DEADLINE))
        grace, deadline, _, _ = parse_shutdown_result(result)
        assert grace == "{0}s000ms".format(self.EXEC_SHUTDOWN_GRACE_PERIOD_S)
        assert deadline == "{0}m{1}s".format(HIGH_DEADLINE / 60,
                                             HIGH_DEADLINE % 60)

        result = self.execute_query_expect_success(
            self.client, SHUTDOWN_EXEC3.format(VERY_HIGH_DEADLINE))
        _, deadline, _, _ = parse_shutdown_result(result)
        LOG.info("Deadline is {0}".format(deadline))
        min_string, sec_string = re.match("([0-9]*)m([0-9]*)s",
                                          deadline).groups()
        assert int(min_string) * 60 + int(sec_string) <= HIGH_DEADLINE, \
            "Cannot increase deadline " + deadline

        result = self.execute_query_expect_success(
            self.client, SHUTDOWN_EXEC3.format(LOW_DEADLINE))
        _, deadline, _, queries_executing = parse_shutdown_result(result)
        assert deadline == "{0}s000ms".format(LOW_DEADLINE)
        assert int(
            queries_executing) > 0, "Slow query should still be running."
        self.cluster.impalads[2].wait_for_exit()
        self.__check_deadline_expired(SLOW_QUERY, deadline_expiry_handle)

    COORD_SHUTDOWN_GRACE_PERIOD_S = 5
    COORD_SHUTDOWN_DEADLINE_S = 120

    @pytest.mark.execute_serially
    @CustomClusterTestSuite.with_args(
        impalad_args="--shutdown_grace_period_s={grace_period} \
          --shutdown_deadline_s={deadline} \
          --hostname={hostname}".format(
            grace_period=COORD_SHUTDOWN_GRACE_PERIOD_S,
            deadline=COORD_SHUTDOWN_DEADLINE_S,
            hostname=socket.gethostname()),
        default_query_options=[("num_scanner_threads", "1")])
    @needs_session(TCLIService.TProtocolVersion.HIVE_CLI_SERVICE_PROTOCOL_V6,
                   close_session=False)
    def test_shutdown_coordinator(self):
        """Test that shuts down the coordinator. Running queries should finish but new
    requests should be rejected."""
        # Start a query running. This should complete successfully and keep the coordinator
        # up until it finishes. We set NUM_SCANNER_THREADS=1 above to make the runtime more
        # predictable.
        SLOW_QUERY = """select * from tpch_parquet.lineitem where sleep(1) < l_orderkey"""
        SHUTDOWN = ": shutdown()"
        SHUTDOWN_ERROR_PREFIX = 'Server is being shut down:'

        before_shutdown_handle = self.__exec_and_wait_until_running(SLOW_QUERY)
        before_shutdown_hs2_handle = self.execute_statement(
            SLOW_QUERY).operationHandle

        # Shut down the coordinator. Operations that start after this point should fail.
        result = self.execute_query_expect_success(self.client, SHUTDOWN)
        grace, deadline, registered, _ = parse_shutdown_result(result)
        assert grace == "{0}s000ms".format(self.COORD_SHUTDOWN_GRACE_PERIOD_S)
        assert deadline == "{0}m".format(self.COORD_SHUTDOWN_DEADLINE_S /
                                         60), "4"
        assert registered == "3"

        # Expect that the beeswax shutdown error occurs when calling fn()
        def expect_beeswax_shutdown_error(fn):
            try:
                fn()
            except ImpalaBeeswaxException as e:
                assert SHUTDOWN_ERROR_PREFIX in str(e)

        expect_beeswax_shutdown_error(lambda: self.client.execute("select 1"))
        expect_beeswax_shutdown_error(
            lambda: self.client.execute_async("select 1"))

        # Test that the HS2 shutdown error occurs for various HS2 operations.
        self.execute_statement("select 1", None,
                               TCLIService.TStatusCode.ERROR_STATUS,
                               SHUTDOWN_ERROR_PREFIX)

        def check_hs2_shutdown_error(hs2_response):
            HS2TestSuite.check_response(hs2_response,
                                        TCLIService.TStatusCode.ERROR_STATUS,
                                        SHUTDOWN_ERROR_PREFIX)

        check_hs2_shutdown_error(
            self.hs2_client.OpenSession(TCLIService.TOpenSessionReq()))
        check_hs2_shutdown_error(
            self.hs2_client.GetInfo(
                TCLIService.TGetInfoReq(
                    self.session_handle,
                    TCLIService.TGetInfoType.CLI_MAX_DRIVER_CONNECTIONS)))
        check_hs2_shutdown_error(
            self.hs2_client.GetTypeInfo(
                TCLIService.TGetTypeInfoReq(self.session_handle)))
        check_hs2_shutdown_error(
            self.hs2_client.GetCatalogs(
                TCLIService.TGetCatalogsReq(self.session_handle)))
        check_hs2_shutdown_error(
            self.hs2_client.GetSchemas(
                TCLIService.TGetSchemasReq(self.session_handle)))
        check_hs2_shutdown_error(
            self.hs2_client.GetTables(
                TCLIService.TGetTablesReq(self.session_handle)))
        check_hs2_shutdown_error(
            self.hs2_client.GetTableTypes(
                TCLIService.TGetTableTypesReq(self.session_handle)))
        check_hs2_shutdown_error(
            self.hs2_client.GetColumns(
                TCLIService.TGetColumnsReq(self.session_handle)))
        check_hs2_shutdown_error(
            self.hs2_client.GetFunctions(
                TCLIService.TGetFunctionsReq(self.session_handle,
                                             functionName="")))

        # Operations on running HS2 query still work.
        self.fetch_until(before_shutdown_hs2_handle,
                         TCLIService.TFetchOrientation.FETCH_NEXT, 10)
        HS2TestSuite.check_response(
            self.hs2_client.CancelOperation(
                TCLIService.TCancelOperationReq(before_shutdown_hs2_handle)))
        HS2TestSuite.check_response(
            self.hs2_client.CloseOperation(
                TCLIService.TCloseOperationReq(before_shutdown_hs2_handle)))

        # Make sure that the beeswax query is still executing, then close it to allow the
        # coordinator to shut down.
        self.impalad_test_service.wait_for_query_state(
            self.client,
            before_shutdown_handle,
            self.client.QUERY_STATES['FINISHED'],
            timeout=20)
        self.client.close_query(before_shutdown_handle)
        self.cluster.impalads[0].wait_for_exit()
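
Both shutdown tests rely on a parse_shutdown_result() helper defined elsewhere in the same test module. Based on how its four return values are used above (grace period left, deadline left, queries registered, queries executing), a minimal sketch might look like the following; the exact wording of the :shutdown() status string and the shape of the result object are assumptions, not taken from this excerpt:

import re


def parse_shutdown_result(result):
    """Sketch of the helper used above: extract (grace period left, deadline
    left, queries registered, queries executing) from the single-row result of
    a :shutdown() statement. The field names in the regex are assumptions about
    how the status string is worded."""
    assert len(result.data) == 1, result.data
    summary = result.data[0]
    match = re.search(r"grace period left: ([0-9a-z]+), "
                      r"deadline left: ([0-9a-z]+), "
                      r"queries registered on coordinator: ([0-9]+), "
                      r"queries executing: ([0-9]+)", summary)
    assert match is not None, summary
    return match.groups()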