Example #1
  def populate_db_with_random_data(self,
      db_name,
      db_connectors,
      min_number_of_tables,
      max_number_of_tables,
      min_number_of_cols,
      max_number_of_cols,
      min_number_of_rows,
      max_number_of_rows,
      allowed_storage_formats,
      create_files):
    '''Create tables with a random number of cols.

       The given db_name must have already been created.
    '''
    connections = list()
    hive_connection = None
    for connector in db_connectors:
      connection = connector.create_connection(db_name=db_name)
      connections.append(connection)
      if connector.db_type == IMPALA:
        # The Impala table creator needs help from Hive for some storage formats.
        # Eventually Impala should be able to write in all formats and this can be
        # removed.
        hive_connection = DbConnector(HIVE).create_connection(db_name=db_name)
        connection.hive_connection = hive_connection
    for table_idx in xrange(randint(min_number_of_tables, max_number_of_tables)):
      table = self.create_random_table(
          'table_%s' % (table_idx + 1),
          min_number_of_cols,
          max_number_of_cols,
          allowed_storage_formats)

      for connection in connections:
        connection.bulk_load_data_file = open(
            "/tmp/%s_%s.data" % (table.name, connection.db_type.lower()), "w")
        connection.begin_bulk_load_table(table)

      row_count = randint(min_number_of_rows, max_number_of_rows)
      LOG.info('Inserting %s rows into %s', row_count, table.name)
      while row_count:
        batch_size = min(1000, row_count)
        rows = self.generate_table_data(table, number_of_rows=batch_size)
        row_count -= batch_size
        for connection in connections:
          connection.handle_bulk_load_table_data(rows)

      for connection in connections:
        connection.end_bulk_load_table()

    self.index_tables_in_database(connections)

    for connection in connections:
      connection.close()
    if hive_connection:
      hive_connection.close()
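For orientation, a call to this method might look like the sketch below. The DbPopulator class name, the POSTGRESQL constant, and the storage format strings are assumptions for illustration; only DbConnector, IMPALA, and HIVE appear in the code above.

# Hypothetical driver for the method above. DbPopulator, POSTGRESQL, and the
# storage format strings are assumed names; DbConnector and IMPALA are real.
populator = DbPopulator()
populator.populate_db_with_random_data(
    db_name='random_db',
    db_connectors=[DbConnector(IMPALA), DbConnector(POSTGRESQL)],
    min_number_of_tables=5,
    max_number_of_tables=10,
    min_number_of_cols=1,
    max_number_of_cols=20,
    min_number_of_rows=1000,
    max_number_of_rows=10000,
    allowed_storage_formats=['TEXTFILE', 'PARQUET'],
    create_files=False)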
Example #2
    def populate_db_with_random_data(self, db_name, db_connectors,
                                     min_number_of_tables,
                                     max_number_of_tables, min_number_of_cols,
                                     max_number_of_cols, min_number_of_rows,
                                     max_number_of_rows,
                                     allowed_storage_formats, create_files):
        '''Create tables with a random number of cols.

           The given db_name must have already been created.
        '''
        connections = list()
        hive_connection = None
        for connector in db_connectors:
            connection = connector.create_connection(db_name=db_name)
            connections.append(connection)
            if connector.db_type == IMPALA:
                # The Impala table creator needs help from Hive for some storage formats.
                # Eventually Impala should be able to write in all formats and this can be
                # removed.
                hive_connection = DbConnector(HIVE).create_connection(
                    db_name=db_name)
                connection.hive_connection = hive_connection
        for table_idx in xrange(
                randint(min_number_of_tables, max_number_of_tables)):
            table = self.create_random_table('table_%s' % (table_idx + 1),
                                             min_number_of_cols,
                                             max_number_of_cols,
                                             allowed_storage_formats)

            for connection in connections:
                connection.bulk_load_data_file = open(
                    "/tmp/%s_%s.data" %
                    (table.name, connection.db_type.lower()), "w")
                connection.begin_bulk_load_table(table)

            row_count = randint(min_number_of_rows, max_number_of_rows)
            LOG.info('Inserting %s rows into %s', row_count, table.name)
            while row_count:
                batch_size = min(1000, row_count)
                rows = self.generate_table_data(table,
                                                number_of_rows=batch_size)
                row_count -= batch_size
                for connection in connections:
                    connection.handle_bulk_load_table_data(rows)

            for connection in connections:
                connection.end_bulk_load_table()

        self.index_tables_in_database(connections)

        for connection in connections:
            connection.close()
        if hive_connection:
            hive_connection.close()
Example #3
  def populate_db_with_random_data(self,
      db_name,
      db_connectors,
      min_number_of_tables,
      max_number_of_tables,
      min_number_of_cols,
      max_number_of_cols,
      min_number_of_rows,
      max_number_of_rows,
      allowed_storage_formats,
      create_files):
    '''Create tables with a random number of cols.

       The given db_name must have already been created.
    '''
    connections = list()
    hive_connection = None
    for connector in db_connectors:
      connection = connector.create_connection(db_name=db_name)
      connections.append(connection)
      if connector.db_type == IMPALA:
        # The Impala table creator needs help from Hive for some storage formats.
        # Eventually Impala should be able to write in all formats and this can be
        # removed.
        hive_connection = DbConnector(HIVE).create_connection(db_name=db_name)
        connection.hive_connection = hive_connection
    tables = list()
    for table_idx in xrange(randint(min_number_of_tables, max_number_of_tables)):
      table = self.create_random_table(
          'table_%s' % (table_idx + 1),
          min_number_of_cols,
          max_number_of_cols,
          allowed_storage_formats)
      tables.append(table)

    self.populate_tables_with_random_data(
        tables,
        connections,
        min_number_of_rows,
        max_number_of_rows,
        create_tables=True)

    for connection in connections:
      connection.close()
    if hive_connection:
      hive_connection.close()
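This variant extracts the loading loop into populate_tables_with_random_data, which is not shown on this page. Reconstructing it from the inline logic of Examples #1 and #2 gives roughly the sketch below; the exact signature and how create_tables is consumed are assumptions.

  def populate_tables_with_random_data(self, tables, connections,
      min_number_of_rows, max_number_of_rows, create_tables=False):
    '''Sketch reconstructed from the inline loop in Examples #1 and #2. How
       create_tables is consumed is an assumption; this sketch presumes
       begin_bulk_load_table() issues the CREATE TABLE when it is set.
    '''
    for table in tables:
      for connection in connections:
        connection.bulk_load_data_file = open(
            "/tmp/%s_%s.data" % (table.name, connection.db_type.lower()), "w")
        connection.begin_bulk_load_table(table)
      row_count = randint(min_number_of_rows, max_number_of_rows)
      LOG.info('Inserting %s rows into %s', row_count, table.name)
      while row_count:
        # Generate and load data in batches of at most 1000 rows.
        batch_size = min(1000, row_count)
        rows = self.generate_table_data(table, number_of_rows=batch_size)
        row_count -= batch_size
        for connection in connections:
          connection.handle_bulk_load_table_data(rows)
      for connection in connections:
        connection.end_bulk_load_table()
    self.index_tables_in_database(connections)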
Example #4
    def populate_db_with_random_data(self, db_name, db_connectors,
                                     min_number_of_tables,
                                     max_number_of_tables, min_number_of_cols,
                                     max_number_of_cols, min_number_of_rows,
                                     max_number_of_rows,
                                     allowed_storage_formats, create_files):
        '''Create tables with a random number of cols.

           The given db_name must have already been created.
        '''
        connections = list()
        hive_connection = None
        for connector in db_connectors:
            connection = connector.create_connection(db_name=db_name)
            connections.append(connection)
            if connector.db_type == IMPALA:
                # The Impala table creator needs help from Hive for some storage formats.
                # Eventually Impala should be able to write in all formats and this can be
                # removed.
                hive_connection = DbConnector(HIVE).create_connection(
                    db_name=db_name)
                connection.hive_connection = hive_connection
        tables = list()
        for table_idx in xrange(
                randint(min_number_of_tables, max_number_of_tables)):
            table = self.create_random_table('table_%s' % (table_idx + 1),
                                             min_number_of_cols,
                                             max_number_of_cols,
                                             allowed_storage_formats)
            tables.append(table)

        self.populate_tables_with_random_data(tables,
                                              connections,
                                              min_number_of_rows,
                                              max_number_of_rows,
                                              create_tables=True)

        for connection in connections:
            connection.close()
        if hive_connection:
            hive_connection.close()
Example #5
class QueryRunner(object):
  """Encapsulates functionality to run a query and provide a runtime report."""

  SPILLED_PATTERN = re.compile("ExecOption:.*Spilled")
  BATCH_SIZE = 1024

  def __init__(self):
    self.impalad = None
    self.impalad_conn = None

  def connect(self):
    self.impalad_conn = DbConnector(
        IMPALA, host_name=self.impalad.host_name, port=self.impalad.port
        ).create_connection()

  def disconnect(self):
    if self.impalad_conn:
      self.impalad_conn.close()
      self.impalad_conn = None

  def run_query(self, query, timeout_secs, mem_limit_mb):
    """Run a query and return an execution report."""
    if not self.impalad_conn:
      raise Exception("connect() must first be called")

    timeout_unix_time = time() + timeout_secs
    report = QueryReport()
    try:
      with self.impalad_conn.open_cursor() as cursor:
        start_time = time()
        LOG.debug("Setting mem limit to %s MB", mem_limit_mb)
        cursor.execute("SET MEM_LIMIT=%sM" % mem_limit_mb)
        LOG.debug("Using %s database", query.db_name)
        cursor.execute("USE %s" % query.db_name)
        LOG.debug("Running query with %s MB mem limit at %s with timeout secs %s:\n%s",
            mem_limit_mb, self.impalad.host_name, timeout_secs, query.sql)
        error = None
        try:
          cursor.execute_async("/* Mem: %s MB. Coordinator: %s. */\n"
              % (mem_limit_mb, self.impalad.host_name) + query.sql)
          LOG.debug("Query id is %s", cursor._last_operation_handle)
          while cursor.is_executing():
            if time() > timeout_unix_time:
              self._cancel(cursor, report)
              return report
            sleep(0.1)
          try:
            report.result_hash = self._hash_result(cursor, timeout_unix_time)
          except QueryTimeout:
            self._cancel(cursor, report)
            return report
        except Exception as error:
          LOG.debug("Error running query with id %s: %s", cursor._last_operation_handle,
              error)
          self._check_for_mem_limit_exceeded(report, cursor, error)
        if report.non_mem_limit_error or report.mem_limit_exceeded:
          return report
        report.runtime_secs = time() - start_time
        report.profile = cursor.get_profile()
        report.mem_was_spilled = \
            QueryRunner.SPILLED_PATTERN.search(report.profile) is not None
    except Exception as error:
      # A mem limit error would have been caught above, no need to check for that here.
      report.non_mem_limit_error = error
    return report

  def _cancel(self, cursor, report):
    report.timed_out = True
    if cursor._last_operation_handle:
      LOG.debug("Attempting cancellation of query with id %s",
          cursor._last_operation_handle)
      cursor.cancel_operation()

  def _check_for_mem_limit_exceeded(self, report, cursor, caught_exception):
    """To be called after a query failure to check for signs of failed due to a
       mem limit. The report will be updated accordingly.
    """
    if cursor._last_operation_handle:
      try:
        report.profile = cursor.get_profile()
      except Exception as e:
        LOG.debug("Error getting profile for query with id %s: %s",
            cursor._last_operation_handle, e)
    if "memory limit exceeded" in str(caught_exception).lower():
      report.mem_limit_exceeded = True
      return
    LOG.error("Non-mem limit error for query with id %s: %s",
        cursor._last_operation_handle, caught_exception, exc_info=True)
    report.non_mem_limit_error = caught_exception

  def _hash_result(self, cursor, timeout_unix_time):
    """Returns a hash that is independent of row order."""
    # A value of 1 indicates that the hash thread should continue to work.
    should_continue = Value("i", 1)
    def hash_result_impl():
      try:
        current_thread().result = 1
        while should_continue.value:
          LOG.debug("Fetching result for query with id %s"
              % cursor._last_operation_handle)
          rows = cursor.fetchmany(self.BATCH_SIZE)
          if not rows:
            return
          for row in rows:
            for idx, val in enumerate(row):
              # Floats returned by Impala may not be deterministic; the trailing
              # insignificant digits may differ. Only the first 6 digits are
              # kept after rounding.
              if isinstance(val, float):
                sval = "%f" % val
                dot_idx = sval.find(".")
                val = round(val, 6 - dot_idx)
              current_thread().result += (idx + 1) * hash(val)
              # Modulo the result to keep it "small"; otherwise the math ops can
              # be slow since Python uses arbitrary-precision integers.
              current_thread().result %= maxint
      except Exception as e:
        current_thread().error = e
    hash_thread = create_and_start_daemon_thread(hash_result_impl)
    hash_thread.join(max(timeout_unix_time - time(), 0))
    if hash_thread.is_alive():
      should_continue.value = 0
      raise QueryTimeout()
    if hash_thread.error:
      raise hash_thread.error
    return hash_thread.result
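A minimal driver for QueryRunner might look like the sketch below. The Impalad and Query stand-ins are assumptions that carry only the attributes the class actually reads (host_name/port and db_name/sql); the report fields are the ones set by run_query() above. Note the class also relies on imports not shown here (re, time.time, time.sleep, multiprocessing.Value, threading.current_thread, sys.maxint) plus project-local names such as LOG, DbConnector, QueryReport, and QueryTimeout.

# Hypothetical usage sketch. Impalad and Query are stand-ins holding only the
# attributes QueryRunner reads; the report fields are set by run_query() above.
class Impalad(object):
  def __init__(self, host_name, port):
    self.host_name = host_name
    self.port = port

class Query(object):
  def __init__(self, db_name, sql):
    self.db_name = db_name
    self.sql = sql

runner = QueryRunner()
runner.impalad = Impalad('localhost', 21050)
runner.connect()
try:
  report = runner.run_query(
      Query('tpch', 'SELECT COUNT(*) FROM lineitem'),
      timeout_secs=60,
      mem_limit_mb=256)
  if report.timed_out:
    LOG.info('Query timed out')
  elif report.mem_limit_exceeded:
    LOG.info('Query exceeded its mem limit')
  elif report.non_mem_limit_error:
    LOG.info('Query failed: %s', report.non_mem_limit_error)
  else:
    LOG.info('Query finished in %s secs; spilled: %s; result hash: %s',
        report.runtime_secs, report.mem_was_spilled, report.result_hash)
finally:
  runner.disconnect()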