# Stdlib imports used by the code below. Project-local names (DbConnector,
# IMPALA, HIVE, LOG, QueryReport, QueryTimeout, create_and_start_daemon_thread)
# are assumed to be imported elsewhere in the file.
import re
from multiprocessing import Value
from random import randint
from sys import maxint
from threading import current_thread
from time import sleep, time


def populate_db_with_random_data(self, db_name, db_connectors,
    min_number_of_tables, max_number_of_tables, min_number_of_cols,
    max_number_of_cols, min_number_of_rows, max_number_of_rows,
    allowed_storage_formats, create_files):
  '''Create tables with a random number of cols.

     The given db_name must have already been created.
  '''
  connections = list()
  hive_connection = None
  for connector in db_connectors:
    connection = connector.create_connection(db_name=db_name)
    connections.append(connection)
    if connector.db_type == IMPALA:
      # The Impala table creator needs help from Hive for some storage formats.
      # Eventually Impala should be able to write in all formats and this can be
      # removed.
      hive_connection = DbConnector(HIVE).create_connection(db_name=db_name)
      connection.hive_connection = hive_connection
  tables = list()
  for table_idx in xrange(randint(min_number_of_tables, max_number_of_tables)):
    table = self.create_random_table(
        'table_%s' % (table_idx + 1),
        min_number_of_cols,
        max_number_of_cols,
        allowed_storage_formats)
    tables.append(table)
  self.populate_tables_with_random_data(
      tables, connections, min_number_of_rows, max_number_of_rows,
      create_tables=True)
  for connection in connections:
    connection.close()
  if hive_connection:
    hive_connection.close()


def populate_tables_with_random_data(self, tables, connections,
    min_number_of_rows, max_number_of_rows, create_tables=False):
  '''Bulk load each table in "tables" with a random number of rows through every
     connection in "connections".
  '''
  for table in tables:
    for connection in connections:
      connection.bulk_load_data_file = open(
          "/tmp/%s_%s.data" % (table.name, connection.db_type.lower()), "w")
      # The pre-refactor inline version of this logic always created the tables
      # as part of the bulk load; "create_tables" is accepted to match the call
      # site above and is assumed True here.
      connection.begin_bulk_load_table(table)
    row_count = randint(min_number_of_rows, max_number_of_rows)
    LOG.info('Inserting %s rows into %s', row_count, table.name)
    while row_count:
      batch_size = min(1000, row_count)
      rows = self.generate_table_data(table, number_of_rows=batch_size)
      row_count -= batch_size
      for connection in connections:
        connection.handle_bulk_load_table_data(rows)
    for connection in connections:
      connection.end_bulk_load_table()
  self.index_tables_in_database(connections)
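# Usage sketch (hedged): how populate_db_with_random_data() might be driven.
# "DbPopulator" (the owning class of the methods above) and the POSTGRESQL
# constant are assumptions for illustration; only DbConnector and IMPALA appear
# in the code above.
#
#   populator = DbPopulator()
#   populator.populate_db_with_random_data(
#       db_name='randomness',
#       db_connectors=[DbConnector(IMPALA), DbConnector(POSTGRESQL)],
#       min_number_of_tables=5, max_number_of_tables=10,
#       min_number_of_cols=1, max_number_of_cols=50,
#       min_number_of_rows=100, max_number_of_rows=10000,
#       allowed_storage_formats=('TEXTFILE', 'PARQUET'),
#       create_files=False)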
class QueryRunner(object):
  """Encapsulates functionality to run a query and provide a runtime report."""

  SPILLED_PATTERN = re.compile("ExecOption:.*Spilled")
  BATCH_SIZE = 1024

  def __init__(self):
    self.impalad = None
    self.impalad_conn = None

  def connect(self):
    self.impalad_conn = DbConnector(IMPALA,
        host_name=self.impalad.host_name,
        port=self.impalad.port).create_connection()

  def disconnect(self):
    if self.impalad_conn:
      self.impalad_conn.close()
      self.impalad_conn = None

  def run_query(self, query, timeout_secs, mem_limit_mb):
    """Run a query and return an execution report."""
    if not self.impalad_conn:
      raise Exception("connect() must first be called")

    timeout_unix_time = time() + timeout_secs
    report = QueryReport()
    try:
      with self.impalad_conn.open_cursor() as cursor:
        start_time = time()
        LOG.debug("Setting mem limit to %s MB", mem_limit_mb)
        cursor.execute("SET MEM_LIMIT=%sM" % mem_limit_mb)
        LOG.debug("Using %s database", query.db_name)
        cursor.execute("USE %s" % query.db_name)
        LOG.debug(
            "Running query with %s MB mem limit at %s with timeout secs %s:\n%s",
            mem_limit_mb, self.impalad.host_name, timeout_secs, query.sql)
        error = None
        try:
          cursor.execute_async("/* Mem: %s MB. Coordinator: %s. */\n"
              % (mem_limit_mb, self.impalad.host_name) + query.sql)
          LOG.debug("Query id is %s", cursor._last_operation_handle)
          while cursor.is_executing():
            if time() > timeout_unix_time:
              self._cancel(cursor, report)
              return report
            sleep(0.1)
          try:
            report.result_hash = self._hash_result(cursor, timeout_unix_time)
          except QueryTimeout:
            self._cancel(cursor, report)
            return report
        except Exception as error:
          LOG.debug("Error running query with id %s: %s",
              cursor._last_operation_handle, error)
          self._check_for_mem_limit_exceeded(report, cursor, error)
        if report.non_mem_limit_error or report.mem_limit_exceeded:
          return report
        report.runtime_secs = time() - start_time
        report.profile = cursor.get_profile()
        report.mem_was_spilled = \
            QueryRunner.SPILLED_PATTERN.search(report.profile) is not None
    except Exception as error:
      # A mem limit error would have been caught above, no need to check for
      # that here.
      report.non_mem_limit_error = error
    return report

  def _cancel(self, cursor, report):
    report.timed_out = True
    if cursor._last_operation_handle:
      LOG.debug("Attempting cancellation of query with id %s",
          cursor._last_operation_handle)
      cursor.cancel_operation()

  def _check_for_mem_limit_exceeded(self, report, cursor, caught_exception):
    """To be called after a query failure to check for signs that the query failed
       due to a mem limit. The report will be updated accordingly.
    """
    if cursor._last_operation_handle:
      try:
        report.profile = cursor.get_profile()
      except Exception as e:
        LOG.debug("Error getting profile for query with id %s: %s",
            cursor._last_operation_handle, e)
    if "memory limit exceeded" in str(caught_exception).lower():
      report.mem_limit_exceeded = True
      return
    LOG.error("Non-mem limit error for query with id %s: %s",
        cursor._last_operation_handle, caught_exception, exc_info=True)
    report.non_mem_limit_error = caught_exception

  def _hash_result(self, cursor, timeout_unix_time):
    """Returns a hash that is independent of row order."""
    # A value of 1 indicates that the hash thread should continue to work.
    should_continue = Value("i", 1)

    def hash_result_impl():
      try:
        current_thread().result = 1
        while should_continue.value:
          LOG.debug("Fetching result for query with id %s"
              % cursor._last_operation_handle)
          rows = cursor.fetchmany(self.BATCH_SIZE)
          if not rows:
            return
          for row in rows:
            for idx, val in enumerate(row):
              # Floats returned by Impala may not be deterministic, the ending
              # insignificant digits may differ. Only the first 6 digits will be
              # used after rounding.
              if isinstance(val, float):
                sval = "%f" % val
                dot_idx = sval.find(".")
                val = round(val, 6 - dot_idx)
              current_thread().result += (idx + 1) * hash(val)
              # Modulo the result to keep it "small" otherwise the math ops can
              # be slow since python does infinite precision math.
              current_thread().result %= maxint
      except Exception as e:
        current_thread().error = e

    hash_thread = create_and_start_daemon_thread(hash_result_impl)
    hash_thread.join(max(timeout_unix_time - time(), 0))
    if hash_thread.is_alive():
      should_continue.value = 0
      raise QueryTimeout()
    if hash_thread.error:
      raise hash_thread.error
    return hash_thread.result
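# Worked example for the rounding in _hash_result() above: with
# val = 1234.567891, "%f" % val gives "1234.567891", dot_idx is 4, and
# round(val, 6 - 4) == 1234.57, so only the first 6 digits of the float
# contribute to the hash.
#
# Usage sketch (hedged): driving QueryRunner end to end. The "impalad" object
# (anything exposing host_name and port) and the "query" object (db_name and
# sql attributes) are assumptions; only QueryRunner itself is defined above.
#
#   runner = QueryRunner()
#   runner.impalad = impalad
#   runner.connect()
#   try:
#     report = runner.run_query(query, timeout_secs=60, mem_limit_mb=512)
#     if report.mem_limit_exceeded:
#       LOG.info("Query hit its mem limit")
#     elif report.timed_out:
#       LOG.info("Query timed out")
#     else:
#       LOG.info("Runtime: %s secs, result hash: %s",
#           report.runtime_secs, report.result_hash)
#   finally:
#     runner.disconnect()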