Example #1
0
    def execute_test_case_setup(self, setup_section, table_format):
        """
        Executes a test case 'SETUP' section

        The test case 'SETUP' section is mainly used for insert tests. These tests need
        to have some actions performed before each test case to ensure the target
        tables are empty. The current supported setup actions:
        RESET <table name> - Drop and recreate the table
        DROP PARTITIONS <table name> - Drop all partitions from the table

        Every action is followed by an 'invalidate metadata' on the affected table.
        Raises AssertionError for any row that is not one of the supported actions.
        """
        setup_section = QueryTestSectionReader.build_query(setup_section)
        for row in setup_section.split('\n'):
            row = row.lstrip()
            if row.startswith('RESET'):
                keyword, action = 'RESET', self.__reset_table
            elif row.startswith('DROP PARTITIONS'):
                keyword, action = 'DROP PARTITIONS', self.__drop_partitions
            else:
                # Raised explicitly (rather than 'assert False') so that unsupported
                # commands are still rejected when assertions are compiled out.
                raise AssertionError('Unsupported setup command: %s' % row)

            # Strip only the leading keyword. Splitting on the keyword would also cut
            # a table name that happens to contain it (e.g. 'RESET tbl_RESET').
            db_name, table_name = QueryTestSectionReader.get_table_name_components(
                table_format, row[len(keyword):])
            action(db_name, table_name)
            self.client.execute("invalidate metadata " + db_name + "." + table_name)
  def execute_test_case_setup(self, setup_section, table_format):
    """
    Executes a test case 'SETUP' section

    The test case 'SETUP' section is mainly used for insert tests. These tests need to
    have some actions performed before each test case to ensure the target tables are
    empty. The current supported setup actions:
    RESET <table name> - Drop and recreate the table
    DROP PARTITIONS <table name> - Drop all partitions from the table

    Every action is followed by an 'invalidate metadata' on the affected table.
    Raises AssertionError for any row that is not one of the supported actions.
    """
    setup_section = QueryTestSectionReader.build_query(setup_section)
    for row in setup_section.split('\n'):
      row = row.lstrip()
      if row.startswith('RESET'):
        keyword, action = 'RESET', self.__reset_table
      elif row.startswith('DROP PARTITIONS'):
        keyword, action = 'DROP PARTITIONS', self.__drop_partitions
      else:
        # Raised explicitly (rather than 'assert False') so that unsupported commands
        # are still rejected when assertions are compiled out.
        raise AssertionError('Unsupported setup command: %s' % row)

      # Strip only the leading keyword. Splitting on the keyword would also cut a table
      # name that happens to contain it (e.g. 'RESET tbl_RESET').
      db_name, table_name = QueryTestSectionReader.get_table_name_components(
          table_format, row[len(keyword):])
      action(db_name, table_name)
      self.client.execute("invalidate metadata " + db_name + "." + table_name)
Example #3
0
File: query.py Project: 1ack/Impala
 def __build_query(self):
   """Fills in the db, query_str and table_format_str attributes.

   db comes from QueryTestSectionReader.get_db_name for the test vector and scale
   factor, query_str is replaced by the result of passing its stripped text through
   QueryTestSectionReader.build_query, and table_format_str is rendered as
   '<file_format>/<compression_codec>/<compression_type>'.
   """
   vector = self.test_vector
   self.db = QueryTestSectionReader.get_db_name(vector, self.scale_factor)

   stripped_query = self.query_str.strip()
   self.query_str = QueryTestSectionReader.build_query(stripped_query)

   format_parts = (vector.file_format, vector.compression_codec,
                   vector.compression_type)
   self.table_format_str = '%s/%s/%s' % format_parts
Example #4
0
 def _build_query(self):
     """Sets self.db, self.query_str and self.table_format_str.

     The database name is looked up from the test vector and scale factor, the
     query text is stripped and run through QueryTestSectionReader.build_query,
     and the table format string joins file format, compression codec and
     compression type with '/'.
     """
     test_vector = self.test_vector
     db_name = QueryTestSectionReader.get_db_name(test_vector, self.scale_factor)
     self.db = db_name

     built_query = QueryTestSectionReader.build_query(self.query_str.strip())
     self.query_str = built_query

     file_format = test_vector.file_format
     codec = test_vector.compression_codec
     compression_type = test_vector.compression_type
     self.table_format_str = '%s/%s/%s' % (file_format, codec, compression_type)
Example #5
0
  def test_wide_table(self, vector):
    """Checks the row count and the contents of the wide table for this vector.

    The table is '<db>.widetable_<num_cols>_cols'. Kudu is xfailed. For HBase only the
    number of returned rows is checked; for every other file format the rows are
    compared, ignoring order, against the data produced by widetable.get_data.
    """
    table_format = vector.get_value('table_format')
    if table_format.file_format == 'kudu':
      pytest.xfail("IMPALA-3718: Extend Kudu functional test support")

    num_cols = vector.get_value('num_cols')
    is_hbase = table_format.file_format == 'hbase'
    # Due to the way HBase handles duplicate row keys, HBase tables hold a different
    # number of rows than HDFS tables.
    num_rows = 2 if is_hbase else 10
    db_name = QueryTestSectionReader.get_db_name(table_format)
    wide_table = "%s.widetable_%s_cols" % (db_name, num_cols)

    count_result = self.client.execute("select count(*) from %s " % wide_table)
    assert count_result.data == [str(num_rows)]

    expected_rows = widetable.get_data(num_cols, num_rows, quote_strings=True)
    result = self.client.execute("select * from %s" % wide_table)

    if is_hbase:
      assert len(result.data) == num_rows
      return

    col_types = parse_column_types(result.schema)
    col_labels = parse_column_labels(result.schema)
    expected = QueryTestResult(expected_rows, col_types, col_labels,
        order_matters=False)
    actual_rows = parse_result_rows(result)
    actual = QueryTestResult(actual_rows, col_types, col_labels, order_matters=False)
    assert expected == actual
Example #6
0
  def test_fuzz_nested_types(self, vector, unique_database):
    """Calls run_fuzz_test on complextypestbl; skipped unless the format is parquet."""
    fmt = vector.get_value('table_format')
    source_db = QueryTestSectionReader.get_db_name(fmt)
    if fmt.file_format != 'parquet':
      pytest.skip()

    # The same table name is used for the source table and for the copy that is
    # created in unique_database.
    nested_tbl = "complextypestbl"
    self.run_fuzz_test(vector, source_db, nested_tbl, unique_database, nested_tbl, 10)
Example #7
0
    def test_fuzz_nested_types(self, vector, unique_database):
        """Calls run_fuzz_test on complextypestbl with two extra nested-value queries.

        Skipped unless the vector's file format is parquet or orc.
        """
        fmt = vector.get_value('table_format')
        source_db = QueryTestSectionReader.get_db_name(fmt)
        if fmt.file_format not in ['parquet', 'orc']:
            pytest.skip()

        nested_tbl = "complextypestbl"
        # Additional queries to scan the nested values.
        arrays_and_maps_query = (
            "select count(*) from ("
            "  select distinct t.id, a.pos, a.item, aa.pos, aa.item, m.key, m.value,"
            "    ma.key, ma.value, t.nested_struct.* "
            "  from complextypestbl t, t.int_array a, t.int_array_array.item aa, "
            "    t.int_map m, t.int_map_array.item ma) q")
        nested_struct_query = (
            "select count(*) from ("
            "  select t.id, t.nested_struct.a, b.pos, b.item, i.e, i.f, m.key,"
            "    arr.pos, arr.item "
            "  from complextypestbl t, t.nested_struct.b, t.nested_struct.c.d.item i,"
            "    t.nested_struct.g m, m.value.h.i arr) q")
        extra_queries = [arrays_and_maps_query, nested_struct_query]

        self.run_fuzz_test(vector, source_db, nested_tbl, unique_database,
                           nested_tbl, 10, custom_queries=extra_queries)
  def test_fuzz_nested_types(self, vector, unique_database):
    """Calls run_fuzz_test on complextypestbl; skipped unless the format is parquet."""
    fmt = vector.get_value('table_format')
    source_db = QueryTestSectionReader.get_db_name(fmt)
    if fmt.file_format != 'parquet':
      pytest.skip()

    # The same table name is used for the source table and for the copy that is
    # created in unique_database.
    nested_tbl = "complextypestbl"
    self.run_fuzz_test(vector, source_db, nested_tbl, unique_database, nested_tbl, 10)
Example #9
0
  def __init__(self, test_section, test_file_name, test_db_name):
    """Derives the SQL statements and expected results for one view test case.

    test_section maps section names to their text. CREATE_VIEW and CREATE_VIEW_RESULTS
    are required, as is at least one of QUERY_HIVE_VIEW_RESULTS and
    QUERY_IMPALA_VIEW_RESULTS; a missing section fails an assertion that names
    test_file_name. The view name found in CREATE_VIEW must not be qualified with a
    database: it is rewritten to '<test_db_name>.<view>_hive' and
    '<test_db_name>.<view>_impala' for the two engines.
    """
    def expected_results_or_none(section_name):
      # Absent sections yield None, which later code uses as the "not provided" marker.
      if section_name not in test_section:
        return None
      return self._get_expected_results(test_section[section_name])

    assert 'CREATE_VIEW' in test_section, \
        'Error in test file %s. Test cases require a '\
        'CREATE_VIEW section.\n%s' %\
        (test_file_name, pprint.pformat(test_section))

    # get map of expected results from test sections
    assert 'CREATE_VIEW_RESULTS' in test_section, \
        'Error in test file %s. Test cases require a '\
        'CREATE_VIEW_RESULTS section.\n%s' %\
        (test_file_name, pprint.pformat(test_section))
    self.create_exp_res = expected_results_or_none('CREATE_VIEW_RESULTS')

    self.query_hive_exp_res = expected_results_or_none('QUERY_HIVE_VIEW_RESULTS')
    self.query_impala_exp_res = expected_results_or_none('QUERY_IMPALA_VIEW_RESULTS')
    assert (self.query_hive_exp_res is not None or
            self.query_impala_exp_res is not None), \
        'Error in test file %s. Test cases require a QUERY_HIVE_VIEW_RESULTS '\
        'or QUERY_IMPALA_VIEW_RESULTS section.\n%s' %\
        (test_file_name, pprint.pformat(test_section))

    # clean test section, remove comments etc.
    create_sql = QueryTestSectionReader.build_query(test_section['CREATE_VIEW'])
    self.create_view_sql = create_sql

    view_name = self._get_view_name(create_sql)
    assert view_name.find(".") == -1, \
        'Error in test file %s. Found unexpected view name %s that is '\
        'qualified with a database' % (test_file_name, view_name)

    # add db prefix and suffixes to indicate which engine created the view
    qualified_view = test_db_name + '.' + view_name
    self.hive_view_name = qualified_view + '_hive'
    self.impala_view_name = qualified_view + '_impala'

    # Only the first occurrence of the view name is rewritten in each statement.
    self.hive_create_view_sql = create_sql.replace(view_name, self.hive_view_name, 1)
    self.impala_create_view_sql = create_sql.replace(
        view_name, self.impala_view_name, 1)

    # SQL to explain a simple query on the view created by Hive in Hive and Impala
    if self.query_hive_exp_res is not None:
      self.query_hive_view_sql = 'explain select * from %s' % self.hive_view_name

    # SQL to explain a simple query on the view created by Impala in Hive and Impala
    if self.query_impala_exp_res is not None:
      self.query_impala_view_sql = 'explain select * from %s' % self.impala_view_name

    self.drop_hive_view_sql = "drop view %s" % self.hive_view_name
    self.drop_impala_view_sql = "drop view %s" % self.impala_view_name
  def __init__(self, test_section, test_file_name, test_db_name):
    """Derives the SQL statements and expected results for one view test case.

    test_section maps section names to their text. CREATE_VIEW and CREATE_VIEW_RESULTS
    are required, as is at least one of QUERY_HIVE_VIEW_RESULTS and
    QUERY_IMPALA_VIEW_RESULTS; a missing section fails an assertion that names
    test_file_name. The view name found in CREATE_VIEW must not be qualified with a
    database: it is rewritten to '<test_db_name>.<view>_hive' and
    '<test_db_name>.<view>_impala' for the two engines.
    """
    def expected_results_or_none(section_name):
      # Absent sections yield None, which later code uses as the "not provided" marker.
      if section_name not in test_section:
        return None
      return self._get_expected_results(test_section[section_name])

    assert 'CREATE_VIEW' in test_section, \
        'Error in test file %s. Test cases require a '\
        'CREATE_VIEW section.\n%s' %\
        (test_file_name, pprint.pformat(test_section))

    # get map of expected results from test sections
    assert 'CREATE_VIEW_RESULTS' in test_section, \
        'Error in test file %s. Test cases require a '\
        'CREATE_VIEW_RESULTS section.\n%s' %\
        (test_file_name, pprint.pformat(test_section))
    self.create_exp_res = expected_results_or_none('CREATE_VIEW_RESULTS')

    self.query_hive_exp_res = expected_results_or_none('QUERY_HIVE_VIEW_RESULTS')
    self.query_impala_exp_res = expected_results_or_none('QUERY_IMPALA_VIEW_RESULTS')
    assert (self.query_hive_exp_res is not None or
            self.query_impala_exp_res is not None), \
        'Error in test file %s. Test cases require a QUERY_HIVE_VIEW_RESULTS '\
        'or QUERY_IMPALA_VIEW_RESULTS section.\n%s' %\
        (test_file_name, pprint.pformat(test_section))

    # clean test section, remove comments etc.
    create_sql = QueryTestSectionReader.build_query(test_section['CREATE_VIEW'])
    self.create_view_sql = create_sql

    view_name = self._get_view_name(create_sql)
    assert view_name.find(".") == -1, \
        'Error in test file %s. Found unexpected view name %s that is '\
        'qualified with a database' % (test_file_name, view_name)

    # add db prefix and suffixes to indicate which engine created the view
    qualified_view = test_db_name + '.' + view_name
    self.hive_view_name = qualified_view + '_hive'
    self.impala_view_name = qualified_view + '_impala'

    # Only the first occurrence of the view name is rewritten in each statement.
    self.hive_create_view_sql = create_sql.replace(view_name, self.hive_view_name, 1)
    self.impala_create_view_sql = create_sql.replace(
        view_name, self.impala_view_name, 1)

    # SQL to explain a simple query on the view created by Hive in Hive and Impala
    if self.query_hive_exp_res is not None:
      self.query_hive_view_sql = 'explain select * from %s' % self.hive_view_name

    # SQL to explain a simple query on the view created by Impala in Hive and Impala
    if self.query_impala_exp_res is not None:
      self.query_impala_view_sql = 'explain select * from %s' % self.impala_view_name

    self.drop_hive_view_sql = "drop view %s" % self.hive_view_name
    self.drop_impala_view_sql = "drop view %s" % self.impala_view_name
Example #11
0
 def __process_create_section(self, section, test_file_name, test_db_name, table_type):
   """Prepares the create / show-create / drop statements for a CREATE section.

   Comments are removed from 'section' and the text is passed through
   QueryTestSectionReader.build_query. The object name found in the resulting statement
   must not be qualified with a database; its first occurrence is replaced by
   '<test_db_name>.<name>'. 'table_type' is interpolated into the show-create and drop
   statements and into the error message.

   Sets existing_table (False), create_table_sql, table_name, show_create_table_sql
   and drop_table_sql on self.
   """
   self.existing_table = False
   self.create_table_sql = QueryTestSectionReader.build_query(remove_comments(section))
   name = self.__get_table_name(self.create_table_sql, table_type)
   # Placeholder order in the message is: test file, object type, object name.
   assert name.find(".") == -1, 'Error in test file %s. Found unexpected %s '\
       'name %s that is qualified with a database' % (test_file_name, table_type, name)
   self.table_name = test_db_name + '.' + name
   # Only the first occurrence of the unqualified name is rewritten.
   self.create_table_sql = self.create_table_sql.replace(name, self.table_name, 1)
   self.show_create_table_sql = 'show create %s %s' % (table_type, self.table_name)
   self.drop_table_sql = "drop %s %s" % (table_type, self.table_name)
 def __process_create_section(self, section, test_file_name, test_db_name, table_type):
   """Prepares the create / show-create / drop statements for a CREATE section.

   Comments are removed from 'section' and the text is passed through
   QueryTestSectionReader.build_query. The object name found in the resulting statement
   must not be qualified with a database; its first occurrence is replaced by
   '<test_db_name>.<name>'. 'table_type' is interpolated into the show-create and drop
   statements and into the error message.

   Sets existing_table (False), create_table_sql, table_name, show_create_table_sql
   and drop_table_sql on self.
   """
   self.existing_table = False
   self.create_table_sql = QueryTestSectionReader.build_query(remove_comments(section))
   name = self.__get_table_name(self.create_table_sql, table_type)
   # Placeholder order in the message is: test file, object type, object name.
   assert name.find(".") == -1, 'Error in test file %s. Found unexpected %s '\
       'name %s that is qualified with a database' % (test_file_name, table_type, name)
   self.table_name = test_db_name + '.' + name
   # Only the first occurrence of the unqualified name is rewritten.
   self.create_table_sql = self.create_table_sql.replace(name, self.table_name, 1)
   self.show_create_table_sql = 'show create %s %s' % (table_type, self.table_name)
   self.drop_table_sql = "drop %s %s" % (table_type, self.table_name)
 def change_database(cls, impala_client, table_format=None,
     db_name=None, scale_factor=None):
   """Issues 'use <db_name>' on impala_client.

   When db_name is not given it is derived from table_format (required in that case)
   and scale_factor through QueryTestSectionReader.get_db_name; a falsy scale_factor
   is passed on as ''. The client's configuration is cleared before the statement is
   executed.
   """
   # Identity checks: a custom __eq__ on the arguments must not influence whether
   # they count as "not provided".
   if db_name is None:
     assert table_format is not None, \
         'table_format is required when db_name is not given'
     db_name = QueryTestSectionReader.get_db_name(table_format,
         scale_factor if scale_factor else '')
   query = 'use %s' % db_name
   # Clear the exec_options before executing a USE statement.
   # The USE statement should not fail for negative exec_option tests.
   impala_client.clear_configuration()
   impala_client.execute(query)
 def change_database(cls, impala_client, table_format=None,
     db_name=None, scale_factor=None):
   """Issues 'use <db_name>' on impala_client.

   When db_name is not given it is derived from table_format (required in that case)
   and scale_factor through QueryTestSectionReader.get_db_name; a falsy scale_factor
   is passed on as ''. The client's configuration is cleared before the statement is
   executed.
   """
   # Identity checks: a custom __eq__ on the arguments must not influence whether
   # they count as "not provided".
   if db_name is None:
     assert table_format is not None, \
         'table_format is required when db_name is not given'
     db_name = QueryTestSectionReader.get_db_name(table_format,
         scale_factor if scale_factor else '')
   query = 'use %s' % db_name
   # Clear the exec_options before executing a USE statement.
   # The USE statement should not fail for negative exec_option tests.
   impala_client.clear_configuration()
   impala_client.execute(query)
Example #15
0
  def test_exprs(self, vector):
    """Runs QueryTest/exprs, then checks which database the session ended up in.

    Skipped for avro and xfailed for hbase.
    """
    # TODO: Enable some of these tests for Avro if possible
    table_format = vector.get_value('table_format')
    file_format = table_format.file_format
    if file_format == 'avro':
      # Don't attempt to evaluate timestamp expressions with Avro tables (which don't
      # support a timestamp type).
      pytest.skip()
    elif file_format == 'hbase':
      pytest.xfail("A lot of queries check for NULLs, which hbase does not recognize")
    self.run_test_case('QueryTest/exprs', vector)

    # This will change the current database to matching table format and then execute
    # select current_database(). An error will be thrown if multiple values are returned.
    current_db = self.execute_scalar('select current_database()', vector=vector)
    expected_db = QueryTestSectionReader.get_db_name(table_format)
    assert current_db == expected_db
Example #16
0
  def test_fuzz_decimal_tbl(self, vector, unique_database):
    """Calls run_fuzz_test on the decimal table that matches the vector's format.

    Avro uses avro_decimal_tbl and only runs for snap/block compression; compressed
    text is skipped; every other format uses decimal_tbl.
    """
    fmt = vector.get_value('table_format')
    if fmt.file_format == 'avro':
      decimal_tbl = "avro_decimal_tbl"
      if not (fmt.compression_codec == 'snap' and fmt.compression_type == 'block'):
        pytest.skip()
    else:
      decimal_tbl = "decimal_tbl"
      if fmt.file_format == 'text' and fmt.compression_codec != 'none':
        # decimal_tbl is not present for these file formats
        pytest.skip()

    source_db = QueryTestSectionReader.get_db_name(fmt)
    self.run_fuzz_test(vector, source_db, decimal_tbl, unique_database, decimal_tbl, 10)
Example #17
0
  def test_exprs(self, vector):
    """Runs QueryTest/exprs, then checks which database the session ended up in.

    Skipped for avro and xfailed for hbase.
    """
    # TODO: Enable some of these tests for Avro if possible
    table_format = vector.get_value('table_format')
    file_format = table_format.file_format
    if file_format == 'avro':
      # Don't attempt to evaluate timestamp expressions with Avro tables (which don't
      # support a timestamp type).
      pytest.skip()
    elif file_format == 'hbase':
      pytest.xfail("A lot of queries check for NULLs, which hbase does not recognize")
    self.run_test_case('QueryTest/exprs', vector)

    # This will change the current database to matching table format and then execute
    # select current_database(). An error will be thrown if multiple values are returned.
    current_db = self.execute_scalar('select current_database()', vector=vector)
    expected_db = QueryTestSectionReader.get_db_name(table_format)
    assert current_db == expected_db
  def test_fuzz_decimal_tbl(self, vector, unique_database):
    """Calls run_fuzz_test on the decimal table that matches the vector's format.

    Avro uses avro_decimal_tbl and only runs for snap/block compression; rc, seq and
    compressed text are skipped; every other format uses decimal_tbl.
    """
    fmt = vector.get_value('table_format')
    if fmt.file_format == 'avro':
      decimal_tbl = "avro_decimal_tbl"
      if not (fmt.compression_codec == 'snap' and fmt.compression_type == 'block'):
        pytest.skip()
    else:
      decimal_tbl = "decimal_tbl"
      if fmt.file_format in ('rc', 'seq'):
        pytest.skip()
      elif fmt.file_format == 'text' and fmt.compression_codec != 'none':
        # decimal_tbl is not present for these file formats
        pytest.skip()

    source_db = QueryTestSectionReader.get_db_name(fmt)
    self.run_fuzz_test(vector, source_db, decimal_tbl, unique_database, decimal_tbl, 10)
Example #19
0
  def test_exprs(self, vector):
    """Runs QueryTest/exprs, then checks which database the session ended up in.

    The vector's 'enable_expr_rewrites' value is copied into its exec options first.
    Skipped for avro; xfailed for hbase and kudu.
    """
    rewrites_enabled = vector.get_value('enable_expr_rewrites')
    exec_options = vector.get_value('exec_option')
    exec_options['enable_expr_rewrites'] = rewrites_enabled

    # TODO: Enable some of these tests for Avro if possible
    table_format = vector.get_value('table_format')
    file_format = table_format.file_format
    if file_format == 'avro':
      # Don't attempt to evaluate timestamp expressions with Avro tables (which don't
      # support a timestamp type).
      pytest.skip()
    elif file_format == 'hbase':
      pytest.xfail("A lot of queries check for NULLs, which hbase does not recognize")
    elif file_format == 'kudu':
      # Can't load LikeTbl without KUDU-1570.
      pytest.xfail("Need support for Kudu tables with nullable PKs (KUDU-1570)")
    self.run_test_case('QueryTest/exprs', vector)

    # This will change the current database to matching table format and then execute
    # select current_database(). An error will be thrown if multiple values are returned.
    current_db = self.execute_scalar('select current_database()', vector=vector)
    expected_db = QueryTestSectionReader.get_db_name(table_format)
    assert current_db == expected_db
  def test_wide_table(self, vector):
    """Checks the row count and the contents of the wide table for this vector.

    The table is '<db>.widetable_<num_cols>_cols'. For HBase only the number of
    returned rows is checked; for every other file format the rows are compared,
    ignoring order, against the data produced by widetable.get_data.
    """
    num_cols = vector.get_value('num_cols')
    table_format = vector.get_value('table_format')
    is_hbase = table_format.file_format == 'hbase'
    # Due to the way HBase handles duplicate row keys, HBase tables hold a different
    # number of rows than HDFS tables.
    num_rows = 2 if is_hbase else 10
    db_name = QueryTestSectionReader.get_db_name(table_format)
    wide_table = "%s.widetable_%s_cols" % (db_name, num_cols)

    count_result = self.client.execute("select count(*) from %s " % wide_table)
    assert count_result.data == [str(num_rows)]

    expected_rows = widetable.get_data(num_cols, num_rows, quote_strings=True)
    result = self.client.execute("select * from %s" % wide_table)

    if is_hbase:
      assert len(result.data) == num_rows
      return

    col_types = parse_column_types(result.schema)
    col_labels = parse_column_labels(result.schema)
    expected = QueryTestResult(expected_rows, col_types, col_labels,
        order_matters=False)
    actual_rows = parse_result_rows(result)
    actual = QueryTestResult(actual_rows, col_types, col_labels, order_matters=False)
    assert expected == actual
  def run_test_case(self, test_file_name, vector, use_db=None, multiple_impalad=False,
      encoding=None):
    """
    Runs every section of the specified test file based on the vector values

    The queries target the file format/compression specified in the test vector and
    run with the exec options specified in the test vector. If multiple_impalad=True,
    a client is created for each entry of IMPALAD_HOST_PORT_LIST and one of them is
    picked at random to execute each test section. Otherwise, the default impalad
    client (self.client) is used.

    Additionally, the encoding for all test data can be specified using the 'encoding'
    parameter. This is useful when data is ingested in a different encoding (ex.
    latin). If not set, the default system encoding will be used.

    Parameters:
      test_file_name   - passed to load_query_test_file together with the workload;
                         also used to name the output file when --update_results is set
      vector           - supplies 'table_format' and 'exec_option' through get_value()
      use_db           - optional database name; passed to change_database and
                         substituted for $DATABASE in shell commands and queries
      multiple_impalad - see above
      encoding         - see above

    Per section: a SHELL section is run as a shell command and nothing else is done.
    Otherwise a QUERY section is required, an optional SETUP section is executed
    first, the ';'-separated statements of QUERY are run in order, and only the result
    of the last statement is verified against RESULTS / RUNTIME_PROFILE. An exception
    is checked against the CATCH section if there is one and re-raised otherwise.
    """
    table_format_info = vector.get_value('table_format')
    exec_options = vector.get_value('exec_option')

    # Resolve the current user's primary group name.
    group_id = pwd.getpwnam(getuser()).pw_gid
    group_name = grp.getgrgid(group_id).gr_name

    target_impalad_clients = list()
    if multiple_impalad:
      target_impalad_clients =\
          map(ImpalaTestSuite.create_impala_client, IMPALAD_HOST_PORT_LIST)
    else:
      target_impalad_clients = [self.client]

    # Change the database to reflect the file_format, compression codec etc, or the
    # user specified database for all targeted impalad.
    for impalad_client in target_impalad_clients:
      ImpalaTestSuite.change_database(impalad_client,
          table_format_info, use_db, pytest.config.option.scale_factor)
      impalad_client.set_configuration(exec_options)

    sections = self.load_query_test_file(self.get_workload(), test_file_name,
        encoding=encoding)
    for test_section in sections:
      # A SHELL section must stand alone: its command is run through the shell after
      # placeholder substitution and the rest of the loop body is skipped.
      if 'SHELL' in test_section:
        assert len(test_section) == 1, \
          "SHELL test sections can't contain other sections"
        cmd = test_section['SHELL']\
          .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX)\
          .replace('$IMPALA_HOME', IMPALA_HOME)
        if use_db: cmd = cmd.replace('$DATABASE', use_db)
        LOG.info("Shell command: " + cmd)
        check_call(cmd, shell=True)
        continue

      if 'QUERY' not in test_section:
        assert 0, 'Error in test file %s. Test cases require a -- QUERY section.\n%s' %\
            (test_file_name, pprint.pformat(test_section))

      if 'SETUP' in test_section:
        self.execute_test_case_setup(test_section['SETUP'], table_format_info)

      # Placeholders are substituted in the raw section text before it is handed to
      # build_query; $DATABASE is substituted afterwards and only when use_db is set.
      # TODO: support running query tests against different scale factors
      query = QueryTestSectionReader.build_query(test_section['QUERY']
          .replace('$GROUP_NAME', group_name)
          .replace('$IMPALA_HOME', IMPALA_HOME)
          .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX)
          .replace('$SECONDARY_FILESYSTEM', os.getenv("SECONDARY_FILESYSTEM") or str()))
      if use_db: query = query.replace('$DATABASE', use_db)

      if 'QUERY_NAME' in test_section:
        LOG.info('Query Name: \n%s\n' % test_section['QUERY_NAME'])

      # Support running multiple queries within the same test section, only verifying the
      # result of the final query. The main use case is to allow for 'USE database'
      # statements before a query executes, but it is not limited to that.
      # TODO: consider supporting result verification of all queries in the future
      result = None
      target_impalad_client = choice(target_impalad_clients)
      # Names captured from statements matching SET_PATTERN; they are handed to
      # __restore_query_options in the 'finally' clause below.
      query_options_changed = []
      try:
        user = None
        if 'USER' in test_section:
          # Create a new client so the session will use the new username.
          user = test_section['USER'].strip()
          target_impalad_client = self.create_impala_client()
        # NOTE: the loop variable reuses the name 'query'; the split is evaluated once,
        # before the first iteration rebinds it.
        for query in query.split(';'):
          set_pattern_match = SET_PATTERN.match(query)
          if set_pattern_match != None:
            query_options_changed.append(set_pattern_match.groups()[0])
          result = self.__execute_query(target_impalad_client, query, user=user)
      except Exception as e:
        # With a CATCH section the exception text is verified and the remaining result
        # checks for this section are skipped; without one the exception propagates.
        if 'CATCH' in test_section:
          self.__verify_exceptions(test_section['CATCH'], str(e), use_db)
          continue
        raise
      finally:
        # Runs on every path out of the 'try', including the 'continue' and 'raise'.
        if len(query_options_changed) > 0:
          self.__restore_query_options(query_options_changed, target_impalad_client)

      # Reaching this point means no exception was raised, so a CATCH section is only
      # acceptable when it is empty.
      if 'CATCH' in test_section:
        assert test_section['CATCH'].strip() == ''

      assert result is not None
      assert result.success

      # Decode the results read back if the data is stored with a specific encoding.
      if encoding: result.data = [row.decode(encoding) for row in result.data]
      # Replace $NAMENODE in the expected results with the actual namenode URI.
      if 'RESULTS' in test_section:
        self.__verify_results_and_errors(vector, test_section, result, use_db)
      else:
        # TODO: Can't validate errors without expected results for now.
        assert 'ERRORS' not in test_section,\
          "'ERRORS' sections must have accompanying 'RESULTS' sections"
      # If --update_results, then replace references to the namenode URI with $NAMENODE.
      if pytest.config.option.update_results and 'RESULTS' in test_section:
        test_section['RESULTS'] = test_section['RESULTS'] \
            .replace(NAMENODE, '$NAMENODE') \
            .replace('$IMPALA_HOME', IMPALA_HOME)
      if 'RUNTIME_PROFILE' in test_section:
        verify_runtime_profile(test_section['RUNTIME_PROFILE'], result.runtime_profile)
    # With --update_results all sections are written to
    # /tmp/<test_file_name with '/' replaced by '_'>.test once every section has run.
    if pytest.config.option.update_results:
      output_file = os.path.join('/tmp', test_file_name.replace('/','_') + ".test")
      write_test_file(output_file, sections, encoding=encoding)
Example #22
0
 def test_fuzz_alltypes(self, vector, unique_database):
     """Calls run_fuzz_test on the alltypes table of the vector's database."""
     alltypes_tbl = "alltypes"
     fmt = vector.get_value('table_format')
     source_db = QueryTestSectionReader.get_db_name(fmt)
     # The same table name is used for the source table and for the copy in
     # unique_database.
     self.run_fuzz_test(
         vector, source_db, alltypes_tbl, unique_database, alltypes_tbl)
  def run_test_case(self, test_file_name, vector, use_db=None, multiple_impalad=False,
      encoding=None, test_file_vars=None):
    """
    Runs the queries in the specified test based on the vector values

    Runs the query using targeting the file format/compression specified in the test
    vector and the exec options specified in the test vector. If multiple_impalad=True
    a connection to a random impalad will be chosen to execute each test section.
    Otherwise, the default impalad client will be used. If 'protocol' (either 'hs2' or
    'beeswax') is set in the vector, a client for that protocol is used. Otherwise we
    use the default: beeswax.

    Additionally, the encoding for all test data can be specified using the 'encoding'
    parameter. This is useful when data is ingested in a different encoding (ex.
    latin). If not set, the default system encoding will be used.
    If a dict 'test_file_vars' is provided, then all keys will be replaced with their
    values in queries before they are executed. Callers need to avoid using reserved key
    names, see 'reserved_keywords' below.
    """
    table_format_info = vector.get_value('table_format')
    exec_options = vector.get_value('exec_option')
    protocol = vector.get_value('protocol')

    # Resolve the current user's primary group name.
    group_id = pwd.getpwnam(getuser()).pw_gid
    group_name = grp.getgrgid(group_id).gr_name

    target_impalad_clients = list()
    if multiple_impalad:
      target_impalad_clients =\
          [ImpalaTestSuite.create_impala_client(host_port, protocol=protocol)
           for host_port in self.__get_cluster_host_ports(protocol)]
    else:
      if protocol == 'beeswax':
        target_impalad_clients = [self.client]
      else:
        assert protocol == 'hs2'
        target_impalad_clients = [self.hs2_client]

    # Change the database to reflect the file_format, compression codec etc, or the
    # user specified database for all targeted impalad.
    for impalad_client in target_impalad_clients:
      ImpalaTestSuite.change_database(impalad_client,
          table_format_info, use_db, pytest.config.option.scale_factor)
      impalad_client.set_configuration(exec_options)

    sections = self.load_query_test_file(self.get_workload(), test_file_name,
        encoding=encoding)
    for test_section in sections:
      if 'SHELL' in test_section:
        assert len(test_section) == 1, \
          "SHELL test sections can't contain other sections"
        cmd = test_section['SHELL']\
          .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX)\
          .replace('$FILESYSTEM_NAME', FILESYSTEM_NAME)\
          .replace('$IMPALA_HOME', IMPALA_HOME)
        if use_db: cmd = cmd.replace('$DATABASE', use_db)
        LOG.info("Shell command: " + cmd)
        check_call(cmd, shell=True)
        continue

      if 'QUERY' not in test_section:
        assert 0, 'Error in test file %s. Test cases require a -- QUERY section.\n%s' %\
            (test_file_name, pprint.pformat(test_section))

      if 'SETUP' in test_section:
        self.execute_test_case_setup(test_section['SETUP'], table_format_info)

      # TODO: support running query tests against different scale factors
      query = QueryTestSectionReader.build_query(test_section['QUERY']
          .replace('$GROUP_NAME', group_name)
          .replace('$IMPALA_HOME', IMPALA_HOME)
          .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX)
          .replace('$FILESYSTEM_NAME', FILESYSTEM_NAME)
          .replace('$SECONDARY_FILESYSTEM', os.getenv("SECONDARY_FILESYSTEM") or str())
          .replace('$USER', getuser())
          .replace('$INTERNAL_LISTEN_HOST', INTERNAL_LISTEN_HOST)
          .replace('$INTERNAL_LISTEN_IP', INTERNAL_LISTEN_IP))
      if use_db: query = query.replace('$DATABASE', use_db)

      reserved_keywords = ["$DATABASE", "$FILESYSTEM_PREFIX", "$FILESYSTEM_NAME",
                           "$GROUP_NAME", "$IMPALA_HOME", "$NAMENODE", "$QUERY",
                           "$SECONDARY_FILESYSTEM", "$USER"]

      if test_file_vars:
        for key, value in test_file_vars.iteritems():
          if key in reserved_keywords:
            raise RuntimeError("Key {0} is reserved".format(key))
          query = query.replace(key, value)

      if 'QUERY_NAME' in test_section:
        LOG.info('Query Name: \n%s\n' % test_section['QUERY_NAME'])

      # Support running multiple queries within the same test section, only verifying the
      # result of the final query. The main use case is to allow for 'USE database'
      # statements before a query executes, but it is not limited to that.
      # TODO: consider supporting result verification of all queries in the future
      result = None
      target_impalad_client = choice(target_impalad_clients)
      query_options_changed = []
      try:
        user = None
        if 'USER' in test_section:
          # Create a new client so the session will use the new username.
          user = test_section['USER'].strip()
          target_impalad_client = self.create_impala_client(protocol=protocol)
        for query in query.split(';'):
          set_pattern_match = SET_PATTERN.match(query)
          if set_pattern_match != None:
            query_options_changed.append(set_pattern_match.groups()[0])
            assert set_pattern_match.groups()[0] not in vector.get_value("exec_option"), \
                "%s cannot be set in  the '.test' file since it is in the test vector. " \
                "Consider deepcopy()-ing the vector and removing this option in the " \
                "python test." % set_pattern_match.groups()[0]
          result = self.__execute_query(target_impalad_client, query, user=user)
      except Exception as e:
        if 'CATCH' in test_section:
          self.__verify_exceptions(test_section['CATCH'], str(e), use_db)
          continue
        raise
      finally:
        if len(query_options_changed) > 0:
          self.__restore_query_options(query_options_changed, target_impalad_client)

      if 'CATCH' in test_section and '__NO_ERROR__' not in test_section['CATCH']:
        expected_str = " or ".join(test_section['CATCH']).strip() \
          .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX) \
          .replace('$FILESYSTEM_NAME', FILESYSTEM_NAME) \
          .replace('$NAMENODE', NAMENODE) \
          .replace('$IMPALA_HOME', IMPALA_HOME)
        assert False, "Expected exception: %s" % expected_str

      assert result is not None
      assert result.success

      # Decode the results read back if the data is stored with a specific encoding.
      if encoding: result.data = [row.decode(encoding) for row in result.data]
      # Replace $NAMENODE in the expected results with the actual namenode URI.
      if 'RESULTS' in test_section:
        # Combining 'RESULTS' with 'DML_RESULTS" is currently unsupported because
        # __verify_results_and_errors calls verify_raw_results which always checks
        # ERRORS, TYPES, LABELS, etc. which doesn't make sense if there are two
        # different result sets to consider (IMPALA-4471).
        assert 'DML_RESULTS' not in test_section
        self.__verify_results_and_errors(vector, test_section, result, use_db)
      else:
        # TODO: Can't validate errors without expected results for now.
        assert 'ERRORS' not in test_section,\
          "'ERRORS' sections must have accompanying 'RESULTS' sections"
      # If --update_results, then replace references to the namenode URI with $NAMENODE.
      if pytest.config.option.update_results and 'RESULTS' in test_section:
        test_section['RESULTS'] = test_section['RESULTS'] \
            .replace(NAMENODE, '$NAMENODE') \
            .replace('$IMPALA_HOME', IMPALA_HOME) \
            .replace(INTERNAL_LISTEN_HOST, '$INTERNAL_LISTEN_HOST') \
            .replace(INTERNAL_LISTEN_IP, '$INTERNAL_LISTEN_IP')
      rt_profile_info = None
      if 'RUNTIME_PROFILE_%s' % table_format_info.file_format in test_section:
        # If this table format has a RUNTIME_PROFILE section specifically for it, evaluate
        # that section and ignore any general RUNTIME_PROFILE sections.
        rt_profile_info = 'RUNTIME_PROFILE_%s' % table_format_info.file_format
      elif 'RUNTIME_PROFILE' in test_section:
        rt_profile_info = 'RUNTIME_PROFILE'

      if rt_profile_info is not None:
        rt_profile = verify_runtime_profile(test_section[rt_profile_info],
                               result.runtime_profile,
                               update_section=pytest.config.option.update_results)
        if pytest.config.option.update_results:
          test_section[rt_profile_info] = "".join(rt_profile)

      if 'DML_RESULTS' in test_section:
        assert 'ERRORS' not in test_section
        # The limit is specified to ensure the queries aren't unbounded. We shouldn't have
        # test files that are checking the contents of tables larger than that anyways.
        dml_results_query = "select * from %s limit 1000" % \
            test_section['DML_RESULTS_TABLE']
        dml_result = self.__execute_query(target_impalad_client, dml_results_query)
        verify_raw_results(test_section, dml_result,
            vector.get_value('table_format').file_format, result_section='DML_RESULTS',
            update_section=pytest.config.option.update_results)
    if pytest.config.option.update_results:
      output_file = os.path.join(EE_TEST_LOGS_DIR,
                                 test_file_name.replace('/','_') + ".test")
      write_test_file(output_file, sections, encoding=encoding)
Example #24
0
    def run_test_case(self,
                      test_file_name,
                      vector,
                      use_db=None,
                      multiple_impalad=False,
                      encoding=None,
                      test_file_vars=None):
        """
        Runs the queries in the specified test based on the vector values

        Runs the queries targeting the file format/compression specified in the test
        vector ('table_format') with the exec options specified in the test vector
        ('exec_option'). Every section of the test file is processed in order:
        - A 'SHELL' section is run as a shell command and must be the only entry in
          its test section.
        - Any other test section must contain a 'QUERY'. An optional 'SETUP' section
          is executed first. The query text may hold several ';'-separated statements;
          all are executed but only the result of the last one is verified.
        - If a statement raises and the section has a 'CATCH' entry, the exception is
          checked against it and the rest of the section is skipped. If no statement
          raises but 'CATCH' is present (and does not contain '__NO_ERROR__'), the
          test fails.
        - 'RESULTS', 'RUNTIME_PROFILE' and 'DML_RESULTS' sections, when present, are
          verified against the query output.

        Parameters:
        test_file_name - name of the test file, loaded via load_query_test_file() for
          the workload returned by get_workload().
        vector - test vector supplying the 'table_format' and 'exec_option' values.
        use_db - if set, passed to change_database() and substituted for '$DATABASE'
          in shell commands and queries.
        multiple_impalad - if True, a client is created for every entry of
          IMPALAD_HOST_PORT_LIST and a random one is chosen to execute each test
          section. Otherwise self.client is used.
        encoding - if set, passed to the test file reader/writer, and every row of
          the query result is decoded with it before verification. This is useful
          when data is ingested in a different encoding (ex. latin).
        test_file_vars - optional dict; every key is replaced with its value in the
          queries before they are executed. Callers need to avoid using reserved key
          names, see 'reserved_keywords' below.

        Raises RuntimeError if 'test_file_vars' contains a reserved key, and
        AssertionError when a test file is malformed or a verification fails.
        If --update_results is set, the (possibly updated) sections are written to
        EE_TEST_LOGS_DIR after all sections have been processed.
        """
        table_format_info = vector.get_value('table_format')
        exec_options = vector.get_value('exec_option')

        # Resolve the current user's primary group name.
        group_id = pwd.getpwnam(getuser()).pw_gid
        group_name = grp.getgrgid(group_id).gr_name

        if multiple_impalad:
            # Build a real list: choice() below needs an indexable sequence, which a
            # lazily evaluated map() would not provide.
            target_impalad_clients = [
                ImpalaTestSuite.create_impala_client(host_port)
                for host_port in IMPALAD_HOST_PORT_LIST
            ]
        else:
            target_impalad_clients = [self.client]

        # Change the database to reflect the file_format, compression codec etc, or the
        # user specified database for all targeted impalad.
        for impalad_client in target_impalad_clients:
            ImpalaTestSuite.change_database(impalad_client, table_format_info,
                                            use_db,
                                            pytest.config.option.scale_factor)
            impalad_client.set_configuration(exec_options)

        sections = self.load_query_test_file(self.get_workload(),
                                             test_file_name,
                                             encoding=encoding)

        # Placeholders that are handled by this method and therefore must not be used
        # as keys of 'test_file_vars'. Constant, so it is built once for all sections.
        reserved_keywords = (
            "$DATABASE", "$FILESYSTEM_PREFIX", "$GROUP_NAME",
            "$IMPALA_HOME", "$NAMENODE", "$QUERY", "$SECONDARY_FILESYSTEM"
        )

        for test_section in sections:
            if 'SHELL' in test_section:
                assert len(test_section) == 1, \
                  "SHELL test sections can't contain other sections"
                cmd = test_section['SHELL']\
                  .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX)\
                  .replace('$IMPALA_HOME', IMPALA_HOME)
                if use_db: cmd = cmd.replace('$DATABASE', use_db)
                LOG.info("Shell command: " + cmd)
                check_call(cmd, shell=True)
                continue

            assert 'QUERY' in test_section, \
                'Error in test file %s. Test cases require a -- QUERY section.\n%s' %\
                (test_file_name, pprint.pformat(test_section))

            if 'SETUP' in test_section:
                self.execute_test_case_setup(test_section['SETUP'],
                                             table_format_info)

            # TODO: support running query tests against different scale factors
            query_text = test_section['QUERY'] \
                .replace('$GROUP_NAME', group_name) \
                .replace('$IMPALA_HOME', IMPALA_HOME) \
                .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX) \
                .replace('$SECONDARY_FILESYSTEM',
                         os.getenv("SECONDARY_FILESYSTEM") or "")
            query = QueryTestSectionReader.build_query(query_text)
            if use_db: query = query.replace('$DATABASE', use_db)

            if test_file_vars:
                for key, value in test_file_vars.items():
                    if key in reserved_keywords:
                        raise RuntimeError("Key {0} is reserved".format(key))
                    query = query.replace(key, value)

            if 'QUERY_NAME' in test_section:
                LOG.info('Query Name: \n%s\n' % test_section['QUERY_NAME'])

            # Support running multiple queries within the same test section, only verifying the
            # result of the final query. The main use case is to allow for 'USE database'
            # statements before a query executes, but it is not limited to that.
            # TODO: consider supporting result verification of all queries in the future
            result = None
            target_impalad_client = choice(target_impalad_clients)
            # Query options set by 'SET' statements of this section. They are restored
            # in the 'finally' clause below, whether or not the statements succeed.
            query_options_changed = []
            try:
                user = None
                if 'USER' in test_section:
                    # Create a new client so the session will use the new username.
                    # NOTE(review): this client is not passed through change_database() /
                    # set_configuration() in this method and is not closed here - confirm
                    # that create_impala_client() and the caller take care of that.
                    user = test_section['USER'].strip()
                    target_impalad_client = self.create_impala_client()
                # A dedicated loop variable is used so that 'query' is not rebound
                # while its own split() result is being iterated.
                for statement in query.split(';'):
                    set_pattern_match = SET_PATTERN.match(statement)
                    if set_pattern_match is not None:
                        query_options_changed.append(
                            set_pattern_match.groups()[0])
                    result = self.__execute_query(target_impalad_client,
                                                  statement,
                                                  user=user)
            except Exception as e:
                if 'CATCH' in test_section:
                    # The failure was anticipated: check it against the expectation and
                    # move on to the next section (the 'finally' clause still runs).
                    self.__verify_exceptions(test_section['CATCH'], str(e),
                                             use_db)
                    continue
                raise
            finally:
                if query_options_changed:
                    self.__restore_query_options(query_options_changed,
                                                 target_impalad_client)

            # Reaching this point means no statement raised. That is a failure if an
            # exception was expected.
            if 'CATCH' in test_section and '__NO_ERROR__' not in test_section[
                    'CATCH']:
                expected_str = " or ".join(test_section['CATCH']).strip() \
                  .replace('$FILESYSTEM_PREFIX', FILESYSTEM_PREFIX) \
                  .replace('$NAMENODE', NAMENODE) \
                  .replace('$IMPALA_HOME', IMPALA_HOME)
                assert False, "Expected exception: %s" % expected_str

            assert result is not None
            assert result.success

            # Decode the results read back if the data is stored with a specific encoding.
            if encoding:
                result.data = [row.decode(encoding) for row in result.data]
            # Replace $NAMENODE in the expected results with the actual namenode URI.
            if 'RESULTS' in test_section:
                # Combining 'RESULTS' with 'DML_RESULTS" is currently unsupported because
                # __verify_results_and_errors calls verify_raw_results which always checks
                # ERRORS, TYPES, LABELS, etc. which doesn't make sense if there are two
                # different result sets to consider (IMPALA-4471).
                assert 'DML_RESULTS' not in test_section
                self.__verify_results_and_errors(vector, test_section, result,
                                                 use_db)
            else:
                # TODO: Can't validate errors without expected results for now.
                assert 'ERRORS' not in test_section,\
                  "'ERRORS' sections must have accompanying 'RESULTS' sections"
            # If --update_results, then replace references to the namenode URI with $NAMENODE.
            # NOTE(review): unlike NAMENODE, the second replace() expands '$IMPALA_HOME'
            # into its value instead of re-introducing the placeholder - confirm that
            # this direction is intended.
            if pytest.config.option.update_results and 'RESULTS' in test_section:
                test_section['RESULTS'] = test_section['RESULTS'] \
                    .replace(NAMENODE, '$NAMENODE') \
                    .replace('$IMPALA_HOME', IMPALA_HOME)
            if 'RUNTIME_PROFILE' in test_section:
                verify_runtime_profile(test_section['RUNTIME_PROFILE'],
                                       result.runtime_profile)

            if 'DML_RESULTS' in test_section:
                assert 'ERRORS' not in test_section
                # Fail with a readable message instead of a bare KeyError when the
                # table to read the DML results from was not specified.
                assert 'DML_RESULTS_TABLE' in test_section, \
                    "'DML_RESULTS' sections must have an accompanying " \
                    "'DML_RESULTS_TABLE' section"
                # The limit is specified to ensure the queries aren't unbounded. We shouldn't have
                # test files that are checking the contents of tables larger than that anyways.
                dml_results_query = "select * from %s limit 1000" % \
                    test_section['DML_RESULTS_TABLE']
                dml_result = self.__execute_query(target_impalad_client,
                                                  dml_results_query)
                verify_raw_results(
                    test_section,
                    dml_result,
                    vector.get_value('table_format').file_format,
                    pytest.config.option.update_results,
                    result_section='DML_RESULTS')
        if pytest.config.option.update_results:
            # Persist the sections, including any updates made above, so they can be
            # copied back into the test file.
            output_file = os.path.join(
                EE_TEST_LOGS_DIR,
                test_file_name.replace('/', '_') + ".test")
            write_test_file(output_file, sections, encoding=encoding)
 def test_fuzz_alltypes(self, vector, unique_database):
   """
   Runs the fuzz test against the "alltypes" table.

   The source database is derived from the 'table_format' value of the test vector.
   The work is delegated to run_fuzz_test(), which receives "alltypes" for both of its
   table name arguments together with 'unique_database'.
   """
   alltypes = "alltypes"
   source_database = QueryTestSectionReader.get_db_name(
       vector.get_value('table_format'))
   self.run_fuzz_test(vector, source_database, alltypes, unique_database, alltypes)