def test_spilling(self, vector):
  """Tests that query results which don't fully fit into memory are spilled to disk.
  The test runs a query asynchronously and waits for the PeakUnpinnedBytes counter in
  the PLAN_ROOT_SINK section of the runtime profile to reach a non-zero value. Then it
  fetches all the results and validates them."""
  query = "select * from functional.alltypes order by id limit 1500"
  exec_options = vector.get_value('exec_option')

  # Set lower values for spill-to-disk configs to force the above query to spill
  # spooled results.
  exec_options['min_spillable_buffer_size'] = 8 * 1024
  exec_options['default_spillable_buffer_size'] = 8 * 1024
  exec_options['max_result_spooling_mem'] = 32 * 1024
  exec_options['max_row_size'] = 16 * 1024

  # Execute the query without result spooling and save the results for later
  # validation.
  base_result = self.execute_query(query, exec_options)
  assert base_result.success, "Failed to run {0} when result spooling is disabled" \
      .format(query)

  exec_options['spool_query_results'] = 'true'

  # Amount of time to wait for the PeakUnpinnedBytes counter in the PLAN_ROOT_SINK
  # section of the profile to reach a non-zero value.
  timeout = 30

  # Regexes to look for in the runtime profiles.
  # PeakUnpinnedBytes can show up in exec nodes as well, so we only look for the
  # PeakUnpinnedBytes metric in the PLAN_ROOT_SINK section of the profile.
  unpinned_bytes_regex = r"PLAN_ROOT_SINK[\s\S]*?PeakUnpinnedBytes.*\([1-9][0-9]*\)"
  # The PLAN_ROOT_SINK should have 'Spilled' in the 'ExecOption' info string.
  spilled_exec_option_regex = "ExecOption:.*Spilled"
  # PLAN_ROOT_SINK's reservation limit should be set at MAX_RESULT_SPOOLING_MEM,
  # i.e. 32 KB.
  plan_root_sink_reservation_limit = r"PLAN_ROOT_SINK[\s\S]*?ReservationLimit: 32.00 KB"

  # Fetch the runtime profile every 0.5 seconds until either the timeout is hit or
  # PeakUnpinnedBytes shows up in the profile.
  start_time = time.time()
  handle = self.execute_query_async(query, exec_options)
  try:
    while not re.search(unpinned_bytes_regex,
                        self.client.get_runtime_profile(handle)) \
        and time.time() - start_time < timeout:
      time.sleep(0.5)
    profile = self.client.get_runtime_profile(handle)
    if not re.search(unpinned_bytes_regex, profile):
      raise Timeout("Query {0} did not spill spooled results within the timeout {1}"
                    .format(query, timeout))
    # At this point PLAN_ROOT_SINK must have spilled, so spilled_exec_option_regex
    # should be in the profile as well.
    assert re.search(spilled_exec_option_regex, profile)
    # Check that the PLAN_ROOT_SINK reservation limit is set accordingly.
    assert re.search(plan_root_sink_reservation_limit, profile)
    result = self.client.fetch(query, handle)
    assert result.data == base_result.data
  finally:
    self.client.close_query(handle)
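# A quick self-check of the unpinned-bytes regex used above, run against a
# fabricated profile fragment (the fragment text is illustrative only, not real
# Impala profile output):
import re

unpinned_bytes_regex = r"PLAN_ROOT_SINK[\s\S]*?PeakUnpinnedBytes.*\([1-9][0-9]*\)"
# Matches once PeakUnpinnedBytes is non-zero inside the PLAN_ROOT_SINK section...
assert re.search(unpinned_bytes_regex,
                 "PLAN_ROOT_SINK:\n  - PeakUnpinnedBytes: 24.00 KB (24576)")
# ...but not while the counter is still zero.
assert not re.search(unpinned_bytes_regex,
                     "PLAN_ROOT_SINK:\n  - PeakUnpinnedBytes: 0 (0)")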
def restart(self):
  LOG.info("Restarting Impala")
  command = self._api.restart()
  command = command.wait(timeout=(60 * 15))
  if command.active:
    raise Timeout("Timeout waiting for Impala to restart")
  if not command.success:
    raise Exception("Failed to restart Impala: %s" % command.resultMessage)
def _request_web_page(self, relative_url, params={}, timeout_secs=DEFAULT_TIMEOUT):
  url = "http://%s:%s%s" % (self.host_name, self.web_ui_port, relative_url)
  try:
    resp = requests.get(url, params=params, timeout=timeout_secs)
  except requests.exceptions.Timeout as e:
    raise Timeout(underlying_exception=e)
  resp.raise_for_status()
  return resp
def wait_for_state(self, handle, expected_state, timeout):
  """Waits for the given query 'handle' to reach 'expected_state'. If it does not
  reach the given state within 'timeout' seconds, the method raises a Timeout."""
  start_time = time.time()
  actual_state = self.client.get_state(handle)
  while actual_state != expected_state and time.time() - start_time < timeout:
    actual_state = self.client.get_state(handle)
    time.sleep(0.5)
  if actual_state != expected_state:
    raise Timeout("query '%s' did not reach expected state '%s', last known state '%s'"
                  % (handle.get_handle().id, expected_state, actual_state))
def wait_for_any_state(self, handle, expected_states, timeout):
  """Waits for the given query 'handle' to reach one of 'expected_states'. If it does
  not reach one of the given states within 'timeout' seconds, the method raises a
  Timeout. Returns the final state."""
  start_time = time.time()
  actual_state = self.client.get_state(handle)
  while actual_state not in expected_states and time.time() - start_time < timeout:
    actual_state = self.client.get_state(handle)
    time.sleep(0.5)
  if actual_state not in expected_states:
    raise Timeout("query {0} did not reach one of the expected states {1}, "
                  "last known state {2}".format(handle.get_handle().id, expected_states,
                                                actual_state))
  return actual_state
def __wait_until_retried(self, handle, timeout=300):
  """Wait until the given query handle has been retried. This is achieved by polling
  the runtime profile of the query and checking the 'Retry Status' field."""
  retried_state = "RETRIED"

  def __get_retry_status():
    profile = self.__get_original_query_profile(handle.get_handle().id)
    retry_status = re.search("Retry Status: (.*)", profile)
    return retry_status.group(1) if retry_status else None

  start_time = time.time()
  retry_status = __get_retry_status()
  while retry_status != retried_state and time.time() - start_time < timeout:
    retry_status = __get_retry_status()
    time.sleep(0.5)
  if retry_status != retried_state:
    raise Timeout("query {0} was not retried within timeout"
                  .format(handle.get_handle().id))
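# The wait helpers above all share one polling idiom: sample some state every
# 0.5 seconds against a deadline and raise Timeout once it expires. A minimal
# standalone sketch of that idiom, assuming only the standard library (the
# Timeout class below is a stand-in for the test framework's own exception):
import time

class Timeout(Exception):
  pass

def poll_until(condition, timeout_secs, interval_secs=0.5):
  """Calls condition() until it returns a truthy value or 'timeout_secs' elapses.
  Returns the truthy value on success and raises Timeout otherwise."""
  deadline = time.time() + timeout_secs
  while time.time() < deadline:
    value = condition()
    if value:
      return value
    time.sleep(interval_secs)
  raise Timeout("condition not met within %s seconds" % timeout_secs)

# Hypothetical usage mirroring wait_for_state() above:
# poll_until(lambda: client.get_state(handle) == expected_state, timeout)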
def test_multiple_fetch_multiple_batches_timeout(self):
  """Test the query option FETCH_ROWS_TIMEOUT_MS by running a query with a DELAY
  DEBUG_ACTION and a low value for the fetch timeout. This test issues fetch requests
  in a loop until all results have been returned, and validates that some of the fetch
  requests timed out. It is similar to test_fetch_multiple_batches_timeout except that
  it issues multiple fetch requests that are expected to time out."""
  num_rows = 100
  statement = "select * from functional.alltypes limit {0}".format(num_rows)
  execute_statement_resp = self.execute_statement(statement,
      conf_overlay={'batch_size': '1', 'debug_action': '0:GETNEXT:DELAY',
                    'fetch_rows_timeout_ms': '1'})
  HS2TestSuite.check_response(execute_statement_resp)

  # Wait for rows to be available for fetch.
  get_operation_status_resp = self.wait_for_operation_state(
      execute_statement_resp.operationHandle,
      TCLIService.TOperationState.FINISHED_STATE, timeout=30)
  HS2TestSuite.check_response(get_operation_status_resp)

  # The timeout to wait for fetch requests to fetch all rows.
  timeout = 30
  start_time = time()
  num_fetched = 0
  num_fetch_requests = 0

  # Fetch results until either the timeout is hit or all rows have been fetched.
  while num_fetched != num_rows and time() - start_time < timeout:
    sleep(0.5)
    fetch_results_resp = self.hs2_client.FetchResults(TCLIService.TFetchResultsReq(
        operationHandle=execute_statement_resp.operationHandle, maxRows=num_rows))
    HS2TestSuite.check_response(fetch_results_resp)
    num_fetched += HS2TestSuite.get_num_rows(fetch_results_resp.results)
    num_fetch_requests += 1
  if num_fetched != num_rows:
    raise Timeout("Query {0} did not fetch all results within the timeout {1}"
                  .format(statement, timeout))

  # The query produces 100 RowBatches, and each batch is delayed 100ms before it is
  # sent to the PlanRootSink. Each fetch request asks for all 100 rows, but since the
  # fetch timeout is set to such a low value, multiple fetch requests should be
  # necessary to read all rows.
  assert num_fetch_requests >= 5
def _request_web_page(self, relative_url, params={}, timeout_secs=DEFAULT_TIMEOUT):
  if self.cluster.use_ssl:
    scheme = 'https'
  else:
    scheme = 'http'
  url = '{scheme}://{host}:{port}{url}'.format(scheme=scheme, host=self.host_name,
                                               port=self.web_ui_port, url=relative_url)
  try:
    verify_ca = self.cluster.ca_cert if self.cluster.ca_cert is not None else False
    resp = requests.get(url, params=params, timeout=timeout_secs, verify=verify_ca)
  except requests.exceptions.Timeout as e:
    raise Timeout(underlying_exception=e)
  resp.raise_for_status()
  return resp
def fetch_num_rows(hs2_client, op_handle, num_rows, statement):
  """Fetches the specified number of rows from the given op_handle and validates that
  the number of rows returned matches the expected number. If the op_handle does not
  return the expected number of rows within a timeout, an error is raised."""
  # The timeout to wait for fetch requests to fetch all rows.
  timeout = 30
  start_time = time()
  num_fetched = 0
  # Fetch results until either the timeout is hit or all rows have been fetched.
  while num_fetched != num_rows and time() - start_time < timeout:
    sleep(0.5)
    fetch_results_resp = hs2_client.FetchResults(
        TCLIService.TFetchResultsReq(operationHandle=op_handle,
                                     maxRows=num_rows - num_fetched))
    HS2TestSuite.check_response(fetch_results_resp)
    num_fetched += HS2TestSuite.get_num_rows(fetch_results_resp.results)
  if num_fetched != num_rows:
    raise Timeout("Query {0} did not fetch all results within the timeout {1}"
                  .format(statement, timeout))
  assert num_fetched == num_rows
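# A self-contained sketch of the drain loop inside fetch_num_rows() above, with
# a stub standing in for the HS2 FetchResults RPC. drain_rows and fake_fetch are
# hypothetical names used for illustration only:
from time import sleep, time

def drain_rows(fetch_batch, num_rows, timeout_secs=30):
  """Repeatedly calls fetch_batch(max_rows) until 'num_rows' rows have been
  returned or the deadline passes; returns the total number fetched."""
  deadline = time() + timeout_secs
  num_fetched = 0
  while num_fetched < num_rows and time() < deadline:
    sleep(0.5)
    num_fetched += fetch_batch(num_rows - num_fetched)
  return num_fetched

# Stub that hands out at most 3 rows per call, emulating a small batch_size:
remaining = [10]

def fake_fetch(max_rows):
  batch = min(3, max_rows, remaining[0])
  remaining[0] -= batch
  return batch

assert drain_rows(fake_fetch, 10) == 10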
def _request_web_page(self, relative_url, params={}, timeout_secs=DEFAULT_TIMEOUT):
  if self.cluster.use_ssl:
    scheme = 'https'
  else:
    scheme = 'http'
  url = '{scheme}://{host}:{port}{url}'.format(scheme=scheme, host=self.host_name,
                                               port=self.web_ui_port, url=relative_url)
  try:
    # verify=False is needed because of self-signed certificates.
    # TODO: support a CA bundle that users could point to instead.
    resp = requests.get(url, params=params, timeout=timeout_secs, verify=False)
  except requests.exceptions.Timeout as e:
    raise Timeout(underlying_exception=e)
  resp.raise_for_status()
  return resp
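# The TODO above can be addressed the way the ca_cert-aware variant earlier in
# this section does it: requests' 'verify' parameter accepts a CA bundle path in
# addition to True/False. A minimal sketch, assuming 'ca_cert' is either a path
# to a PEM bundle or None (get_with_optional_ca is a hypothetical helper name):
import requests

def get_with_optional_ca(url, ca_cert=None, timeout_secs=30):
  # verify=True uses the system CAs, verify=False disables verification, and a
  # string value is treated as a path to a CA bundle file.
  verify = ca_cert if ca_cert is not None else False
  resp = requests.get(url, timeout=timeout_secs, verify=verify)
  resp.raise_for_status()
  return resp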
def shell(self, cmd, cmd_prepend="set -euo pipefail\n", timeout_secs=None):
  """Executes a command and returns its output. If the command's return code is
  non-zero or the command times out, an exception is raised."""
  cmd = textwrap.dedent(cmd.strip())
  if cmd_prepend:
    cmd = cmd_prepend + cmd
  LOG.debug("Running command via ssh on %s:\n%s" % (self.host_name, cmd))
  transport = self.get_transport()
  # Retry once: if opening the session fails, reconnect and try again.
  for is_first_attempt in (True, False):
    try:
      channel = transport.open_session()
      break
    except Exception as e:
      if is_first_attempt:
        LOG.warn("Error opening ssh session: %s" % e)
        self.close()
        self.connect(self.host_name, **self.connect_kwargs)
      else:
        raise Exception("Unable to open ssh session to %s: %s"
                        % (self.host_name, e))
  channel.set_combine_stderr(True)
  channel.exec_command(cmd)
  process = RemoteProcess(channel)
  deadline = time.time() + timeout_secs if timeout_secs is not None else None
  while True:
    retcode = process.poll()
    if retcode is not None or (deadline and time.time() > deadline):
      break
    time.sleep(0.1)
  if retcode == 0:
    return process.stdout.read().decode("utf-8").encode("ascii", errors="ignore")
  if retcode is None:
    # The command timed out; grab whatever output is already buffered on the
    # channel without blocking.
    if process.channel.recv_ready():
      output = process.channel.recv(None)
    else:
      output = ""
    if process.channel.recv_stderr_ready():
      err = process.channel.recv_stderr(None)
    else:
      err = ""
  else:
    output = process.stdout.read()
    err = process.stderr.read()
  if output:
    output = output.decode("utf-8").encode("ascii", errors="ignore")
  else:
    output = "(No stdout)"
  if err:
    err = err.decode("utf-8").encode("ascii", errors="ignore")
  else:
    err = "(No stderr)"
  if retcode is None:
    raise Timeout(
        "Command timed out after %s seconds\ncmd: %s\nstdout: %s\nstderr: %s"
        % (timeout_secs, cmd, output, err))
  raise Exception(("Command returned non-zero exit code: %s"
                   "\ncmd: %s\nstdout: %s\nstderr: %s")
                  % (retcode, cmd, output, err))
def shell(cmd, cmd_prepend="set -euo pipefail\n", stdout=PIPE, stderr=STDOUT,
          timeout_secs=None, **popen_kwargs):
  """Executes a command and returns its output. If the command's return code is
  non-zero or the command times out, an exception is raised."""
  cmd = dedent(cmd.strip())
  if cmd_prepend:
    cmd = cmd_prepend + cmd
  LOG.debug("Running command with %s timeout: %s"
            % ("no" if timeout_secs is None else ("%s second" % timeout_secs), cmd))
  process = Popen(cmd, shell=True, executable="/bin/bash", stdout=stdout,
                  stderr=stderr, **popen_kwargs)
  stdout_fileno = process.stdout and process.stdout.fileno()
  stderr_fileno = process.stderr and process.stderr.fileno()
  remaining_fds = list()
  if stdout_fileno is not None:
    remaining_fds.append(stdout_fileno)
  if stderr_fileno is not None:
    remaining_fds.append(stderr_fileno)
  stdout = list()
  stderr = list()

  def _read_available_output():
    while True:
      available_fds, _, _ = select(remaining_fds, [], [], 0)
      if not available_fds:
        return
      for fd in available_fds:
        data = os.read(fd, 4096)
        if fd == stdout_fileno:
          if not data:
            del remaining_fds[0]
          else:
            stdout.append(data)
        elif fd == stderr_fileno:
          if not data:
            del remaining_fds[-1]
          else:
            stderr.append(data)

  deadline = time() + timeout_secs if timeout_secs is not None else None
  while True:
    # The subprocess docs indicate that stdout/err need to be drained while waiting
    # if the PIPE option is used.
    _read_available_output()
    retcode = process.poll()
    if retcode is not None or (deadline and time() > deadline):
      break
    sleep(0.1)
  _read_available_output()
  output = "".join(stdout)
  if retcode == 0:
    return output
  if not output:
    output = "(No stdout)"
  err = "".join(stderr) if stderr else "(No stderr)"
  if retcode is None:
    raise Timeout(
        "Command timed out after %s seconds\ncmd: %s\nstdout: %s\nstderr: %s"
        % (timeout_secs, cmd, output, err))
  raise Exception(
      ("Command returned non-zero exit code: %s"
       "\ncmd: %s\nstdout: %s\nstderr: %s") % (retcode, cmd, output, err))
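# Illustrative calls to shell() defined above, assuming the Timeout class that
# shell() raises is importable in this module; the commands are arbitrary:
if __name__ == "__main__":
  print(shell("echo hello"))  # prints "hello"
  try:
    shell("sleep 10", timeout_secs=1)
  except Timeout:
    print("timed out as expected after ~1 second")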