def test_add_delete_data_to_hdfs_and_refresh(self, vector, unique_database):
   """
   Data added or deleted directly in HDFS becomes visible in Impala after the
   partition is refreshed.
   """
   table_name = unique_database + '.' + "partition_test_table"
   table_location = get_fs_path("/test-warehouse/%s" % unique_database)
   file_name = "alltypes.parq"
   src_file = get_fs_path("/test-warehouse/alltypesagg_parquet/year=2010/month=1/"
     "day=9/*.parq")
   file_num_rows = 1000
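   # Create a copy of functional.alltypes at a known warehouse location so data files
   # can be added and removed directly in HDFS.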
   self.client.execute("""
     create table %s like functional.alltypes stored as parquet
     location '%s'
   """ % (table_name, table_location))
   self.client.execute("alter table %s add partition (year=2010, month=1)" %
       table_name)
   self.client.execute("refresh %s" % table_name)
   # Check that there is no data in table
   result = self.client.execute("select count(*) from %s" % table_name)
   assert result.data == [str(0)]
   dst_path = "%s/year=2010/month=1/%s" % (table_location, file_name)
   check_call(["hadoop", "fs", "-cp", "-f", src_file, dst_path], shell=False)
   # Check that data added is not visible before refresh
   result = self.client.execute("select count(*) from %s" % table_name)
   assert result.data == [str(0)]
   # Check that data is visible after refresh
   self.client.execute("refresh %s partition (year=2010, month=1)" % table_name)
   result = self.client.execute("select count(*) from %s" % table_name)
   assert result.data == [str(file_num_rows)]
   # Check that after deleting the file and refreshing, it returns zero rows
   check_call(["hadoop", "fs", "-rm", dst_path], shell=False)
   self.client.execute("refresh %s partition (year=2010, month=1)" % table_name)
   result = self.client.execute("select count(*) from %s" % table_name)
   assert result.data == [str(0)]
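These examples come from the Impala end-to-end test suite and are shown without their imports. The sketch below is illustrative only: it assumes get_fs_path simply prepends the configured default-filesystem prefix to a path, and the names and module layout in the real test framework may differ.

# Illustrative sketch of the helpers the examples on this page assume (not the real
# Impala test framework code).
import os
from subprocess import call, check_call  # used by the examples for 'hadoop fs' / 'hdfs dfs' calls

# Assumed: a default filesystem prefix, e.g. '' for the local HDFS minicluster or
# 's3a://<bucket>' when running against S3.
FILESYSTEM_PREFIX = os.environ.get('FILESYSTEM_PREFIX', '')
WAREHOUSE = FILESYSTEM_PREFIX + '/test-warehouse'
IS_S3 = FILESYSTEM_PREFIX.startswith('s3a://')


def get_fs_path(path):
  # Assumed behaviour: turn a warehouse-rooted path into an absolute path on the
  # default filesystem.
  return '%s%s' % (FILESYSTEM_PREFIX, path)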
  def setup_method(self, method):
    self.cleanup_db('impala_test_desc_db1')
    self.cleanup_db('impala_test_desc_db2')
    self.cleanup_db('impala_test_desc_db3')
    self.cleanup_db('impala_test_desc_db4')
    self.cleanup_db('hive_test_desc_db')
    self.cleanup_db('hive_test_db')

    self.client.execute("create database if not exists impala_test_desc_db1")
    self.client.execute(
        "create database if not exists impala_test_desc_db2 "
        "comment \"test comment\"")
    self.client.execute(
        "create database if not exists impala_test_desc_db3 "
        "location \"" + get_fs_path("/testdb") + "\"")
    self.client.execute(
        "create database if not exists impala_test_desc_db4 "
        "comment \"test comment\" location \"" + get_fs_path("/test2.db") + "\"")

    self.client.execute(
        "create table if not exists impala_test_desc_db1.complex_types_tbl ("
        "map_array_struct_col map<string, array<struct<f1:int, f2:string>>>, "
        "struct_array_struct_col "
        "struct<f1:int, f2:array<struct<f11:bigint, f12:string>>>, "
        "map_array_map_struct_col "
        "map<string, array<map<string, struct<f1:string, f2:int>>>>)")
Example #3
  def test_hive_udfs_missing_jar(self, vector):
    """ IMPALA-2365: Impalad shouldn't crash if the udf jar isn't present
    on HDFS"""
    # Copy hive-exec.jar to a temporary file
    jar_path = get_fs_path("/test-warehouse/" + get_random_id(5) + ".jar")
    hive_jar = get_fs_path("/test-warehouse/hive-exec.jar")
    check_call(["hadoop", "fs", "-cp", hive_jar, jar_path])
    drop_fn_stmt = "drop function if exists default.pi_missing_jar()"
    create_fn_stmt = "create function default.pi_missing_jar() returns double \
        location '%s' symbol='org.apache.hadoop.hive.ql.udf.UDFPI'" % jar_path

    cluster = ImpalaCluster()
    impalad = cluster.get_any_impalad()
    client = impalad.service.create_beeswax_client()
    # Create and drop functions with sync_ddl to make sure they are reflected
    # in every impalad.
    exec_option = vector.get_value('exec_option')
    exec_option['sync_ddl'] = 1

    self.execute_query_expect_success(client, drop_fn_stmt, exec_option)
    self.execute_query_expect_success(client, create_fn_stmt, exec_option)
    # Delete the udf jar
    check_call(["hadoop", "fs", "-rm", jar_path])

    different_impalad = cluster.get_different_impalad(impalad)
    client = different_impalad.service.create_beeswax_client()
    # Run a query using the udf from an impalad other than the one
    # we used to create the function. This is to bypass loading from
    # the cache
    try:
      self.execute_query_using_client(client,
          "select default.pi_missing_jar()", vector)
      assert False, "Query expected to fail"
    except ImpalaBeeswaxException as e:
      assert "Failed to get file info" in str(e)
 def test_java_udfs(self, vector):
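   # Create scratch databases at explicit warehouse locations for the Java UDF tests;
   # they are dropped again in the finally block.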
   self.client.execute("create database if not exists java_udfs_test "
       "location '%s'" % get_fs_path('/test-warehouse/java_udf_test.db'))
   self.client.execute("create database if not exists udf_test "
       "location '%s'" % get_fs_path('/test-warehouse/udf_test.db'))
   try:
     self.run_test_case('QueryTest/load-java-udfs', vector)
     self.run_test_case('QueryTest/java-udf', vector)
   finally:
     self.client.execute("drop database if exists java_udfs_test cascade")
     self.client.execute("drop database if exists udf_test cascade")
Example #5
  def test_native_functions(self, vector):
    database = 'native_function_test'

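    # Register the native scalar UDFs and UDAs from the prebuilt test shared libraries
    # before running the query test files.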
    self.__load_functions(
      self.create_udfs_template, vector, database,
      get_fs_path('/test-warehouse/libTestUdfs.so'))
    self.__load_functions(
      self.create_udas_template, vector, database,
      get_fs_path('/test-warehouse/libudasample.so'))

    self.run_test_case('QueryTest/udf', vector, use_db=database)
    if not IS_S3: # S3 doesn't support INSERT
      self.run_test_case('QueryTest/udf-init-close', vector, use_db=database)
    self.run_test_case('QueryTest/uda', vector, use_db=database)
Example #6
    def test_def_level_encoding(self, vector, unique_database, tmpdir):
        """IMPALA-3376: Tests that parquet files are written to HDFS correctly by generating a
        parquet table and running the parquet-reader tool on it, which performs sanity
        checking, such as that the correct number of definition levels were encoded.
        """
        table_name = "test_hdfs_parquet_table_writer"
        qualified_table_name = "%s.%s" % (unique_database, table_name)
        self.execute_query(
            "create table %s stored as parquet as select l_linenumber from "
            "tpch_parquet.lineitem limit 180000" % qualified_table_name)

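        # Copy the Parquet file(s) written by the CTAS to a local directory so the
        # parquet-reader tool can sanity-check each one.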
        hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/*.parq' %
                                (unique_database, table_name))
        check_call(['hdfs', 'dfs', '-copyToLocal', hdfs_file, tmpdir.strpath])

        for root, subdirs, files in os.walk(tmpdir.strpath):
            for f in files:
                if not f.endswith('parq'):
                    continue
                check_call([
                    os.path.join(IMPALA_HOME, "bin/run-binary.sh"),
                    os.path.join(impalad_basedir, 'util/parquet-reader'),
                    '--file',
                    os.path.join(tmpdir.strpath, str(f))
                ])
Example #7
    def test_set_column_orders(self, vector, unique_database, tmpdir):
        """Tests that the Parquet writers set FileMetaData::column_orders."""
        source_table = "functional_parquet.alltypessmall"
        target_table = "test_set_column_orders"
        qualified_target_table = "{0}.{1}".format(unique_database,
                                                  target_table)
        hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(
            unique_database, target_table))

        # Create table
        query = "create table {0} like {1} stored as parquet".format(
            qualified_target_table, source_table)
        self.execute_query(query)

        # Insert data
        query = (
            "insert into {0} partition(year, month) select * from {1}").format(
                qualified_target_table, source_table)
        self.execute_query(query)

        # Download hdfs files and verify column orders
        file_metadata_list = get_parquet_metadata_from_hdfs_folder(
            hdfs_path, tmpdir.strpath)

        expected_col_orders = [ColumnOrder(TYPE_ORDER=TypeDefinedOrder())] * 11

        for file_metadata in file_metadata_list:
            assert file_metadata.column_orders == expected_col_orders
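Several examples on this page (including the one above) collect Parquet FileMetaData for every file a query wrote via get_parquet_metadata_from_hdfs_folder. A hedged sketch of that pattern, built from the same 'hdfs dfs -get' plus per-file footer-parsing steps that other examples perform inline, could look like the following; the helper name and signature here are hypothetical and the real utility may behave differently.

import os
from subprocess import check_call


def parquet_metadata_from_local_copy(hdfs_path, local_dir, parse_footer):
  # Hypothetical sketch: copy the table directory out of HDFS, then parse the footer of
  # every Parquet file found locally and return the resulting FileMetaData objects.
  # 'parse_footer' is expected to be something like get_parquet_metadata from the
  # Impala test utilities.
  check_call(['hdfs', 'dfs', '-get', hdfs_path, local_dir])
  metadata_list = []
  for root, _, files in os.walk(local_dir):
    for f in files:
      if f.endswith('.parq') or f.endswith('.parquet'):
        metadata_list.append(parse_footer(os.path.join(root, f)))
  return metadata_list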
  def test_write_statistics_multiple_row_groups(self, vector, unique_database, tmpdir):
    """Test that writing multiple row groups works as expected. This is done by inserting
    into a table using the SORT BY clause and then making sure that the min and max values
    of row groups don't overlap."""
    source_table = "tpch_parquet.orders"
    target_table = "test_hdfs_parquet_table_writer"
    qualified_target_table = "{0}.{1}".format(unique_database, target_table)
    hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(
        unique_database, target_table))

    # Insert a large amount of data on a single backend with a limited parquet file size.
    # This will result in several files being written, exercising code that tracks
    # statistics for row groups.
    query = "create table {0} sort by (o_orderkey) like {1} stored as parquet".format(
        qualified_target_table, source_table)
    self.execute_query(query, vector.get_value('exec_option'))
    query = ("insert into {0} select * from {1}").format(
        qualified_target_table, source_table)
    vector.get_value('exec_option')['num_nodes'] = 1
    vector.get_value('exec_option')['parquet_file_size'] = 8 * 1024 * 1024
    self.execute_query(query, vector.get_value('exec_option'))

    # Get all stats for the o_orderkey column
    row_group_stats = self._get_row_group_stats_from_hdfs_folder(hdfs_path,
                                                                 tmpdir.strpath)
    assert len(row_group_stats) > 1
    orderkey_stats = [s[0] for s in row_group_stats]

    # Make sure that they don't overlap by ordering by the min value, then looking at
    # boundaries.
    orderkey_stats.sort(key = lambda s: s.min)
    for l, r in zip(orderkey_stats, orderkey_stats[1:]):
      assert l.max <= r.min
Example #9
    def test_insert_parquet_verify_size(self, vector, unique_database):
        # Test to verify that the result file size is close to what we expect.
        tbl_name = "parquet_insert_size"
        fq_tbl_name = unique_database + "." + tbl_name
        location = get_fs_path("test-warehouse/{0}.db/{1}/".format(
            unique_database, tbl_name))
        create = ("create table {0} like tpch_parquet.orders stored as parquet"
                  .format(fq_tbl_name, location))
        query = "insert overwrite {0} select * from tpch.orders".format(
            fq_tbl_name)
        block_size = 40 * 1024 * 1024

        self.execute_query(create)
        vector.get_value('exec_option')['PARQUET_FILE_SIZE'] = block_size
        vector.get_value('exec_option')['COMPRESSION_CODEC'] =\
            vector.get_value('compression_codec')
        vector.get_value('exec_option')['num_nodes'] = 1
        self.execute_query(query, vector.get_value('exec_option'))

        # Get the files in hdfs and verify. There can be at most 1 file that is smaller
        # than the block_size. The rest should be at least 80% of it and not over it.
        found_small_file = False
        sizes = self.filesystem_client.get_all_file_sizes(location)
        for size in sizes:
            assert size < block_size, "File size greater than expected.\
          Expected: {0}, Got: {1}".format(block_size, size)
            if size < block_size * 0.80:
                assert not found_small_file
                found_small_file = True
Example #10
 def test_deprecated_stats(self, vector, unique_database):
     """Test that reading parquet files with statistics with deprecated 'min'/'max' fields
     works correctly. The statistics will be used for known-good types (boolean, integral,
     float) and will be ignored for all other types (string, decimal, timestamp)."""
     table_name = 'deprecated_stats'
     # We use CTAS instead of "create table like" to convert the partition columns into
     # normal table columns.
     self.client.execute(
         'create table %s.%s stored as parquet as select * from '
         'functional.alltypessmall limit 0' % (unique_database, table_name))
     table_location = get_fs_path('/test-warehouse/%s.db/%s' %
                                  (unique_database, table_name))
     local_file = os.path.join(
         os.environ['IMPALA_HOME'],
         'testdata/data/deprecated_statistics.parquet')
     assert os.path.isfile(local_file)
     check_call(
         ['hdfs', 'dfs', '-copyFromLocal', local_file, table_location])
     self.client.execute('invalidate metadata %s.%s' %
                         (unique_database, table_name))
     # The test makes assumptions about the number of row groups that are processed and
     # skipped inside a fragment, so we ensure that the tests run in a single fragment.
     vector.get_value('exec_option')['num_nodes'] = 1
     self.run_test_case('QueryTest/parquet-deprecated-stats', vector,
                        unique_database)
  def test_def_level_encoding(self, vector, unique_database):
    """IMPALA-3376: Tests that parquet files are written to HDFS correctly by generating a
    parquet table and running the parquet-reader tool on it, which performs sanity
    checking, such as that the correct number of definition levels were encoded.
    """
    table_name = "test_hdfs_parquet_table_writer"
    qualified_table_name = "%s.%s" % (unique_database, table_name)
    self.execute_query("drop table if exists %s" % qualified_table_name)
    self.execute_query("create table %s stored as parquet as select l_linenumber from "
        "tpch_parquet.lineitem limit 180000" % qualified_table_name)

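    # Copy the written Parquet files to a local scratch directory and run parquet-reader
    # on each; the finally block drops the table and removes the directory.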
    tmp_dir = make_tmp_dir()
    try:
      hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/*.parq'
          % (unique_database, table_name))
      check_call(['hdfs', 'dfs', '-copyToLocal', hdfs_file, tmp_dir])

      for root, subdirs, files in os.walk(tmp_dir):
        for f in files:
          if not f.endswith('parq'):
            continue
          check_call([os.path.join(impalad_basedir, 'util/parquet-reader'), '--file',
              os.path.join(tmp_dir, str(f))])
    finally:
      self.execute_query("drop table %s" % qualified_table_name)
      rmtree(tmp_dir)
  def _ctas_table_and_verify_stats(self, vector, unique_database, source_table,
                                   expected_values, hive_skip_col_idx = None):
    """Copies 'source_table' into a parquet table and makes sure that the row group
    statistics in the resulting parquet file match those in 'expected_values'. The
    comparison is performed against both Hive and Impala. For Hive, columns indexed by
    'hive_skip_col_idx' are excluded from the verification of the expected values.
    """
    table_name = "test_hdfs_parquet_table_writer"
    qualified_table_name = "{0}.{1}".format(unique_database, table_name)
    hdfs_path = get_fs_path('/test-warehouse/{0}.db/{1}/'.format(unique_database,
                                                                 table_name))

    # Validate against Hive.
    self.execute_query("drop table if exists {0}".format(qualified_table_name))
    self.run_stmt_in_hive("create table {0} stored as parquet as select * from "
                          "{1}".format(qualified_table_name, source_table))
    self.execute_query("invalidate metadata {0}".format(qualified_table_name))
    self._validate_min_max_stats(hdfs_path, expected_values, hive_skip_col_idx)

    # Validate against Impala. Setting exec_single_node_rows_threshold and adding a limit
    # clause ensures that the query is executed on the coordinator, resulting in a single
    # parquet file being written.
    num_rows = self.execute_scalar("select count(*) from {0}".format(source_table))
    self.execute_query("drop table {0}".format(qualified_table_name))
    query = ("create table {0} stored as parquet as select * from {1} limit "
             "{2}").format(qualified_table_name, source_table, num_rows)
    vector.get_value('exec_option')['EXEC_SINGLE_NODE_ROWS_THRESHOLD'] = num_rows
    self.execute_query(query, vector.get_value('exec_option'))
    self._validate_min_max_stats(hdfs_path, expected_values)
  def test_write_statistics_multiple_row_groups(self, vector, unique_database, tmpdir):
    """Test that writing multiple row groups works as expected. This is done by inserting
    into a table using the SORT BY clause and then making sure that the min and max values
    of row groups don't overlap."""
    source_table = "tpch_parquet.orders"
    target_table = "test_hdfs_parquet_table_writer"
    qualified_target_table = "{0}.{1}".format(unique_database, target_table)
    hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(
        unique_database, target_table))

    # Insert a large amount of data on a single backend with a limited parquet file size.
    # This will result in several files being written, exercising code that tracks
    # statistics for row groups.
    query = "create table {0} sort by (o_orderkey) like {1} stored as parquet".format(
        qualified_target_table, source_table)
    self.execute_query(query, vector.get_value('exec_option'))
    query = ("insert into {0} select * from {1}").format(
        qualified_target_table, source_table)
    vector.get_value('exec_option')['num_nodes'] = 1
    vector.get_value('exec_option')['parquet_file_size'] = 8 * 1024 * 1024
    self.execute_query(query, vector.get_value('exec_option'))

    # Get all stats for the o_orderkey column
    row_group_stats = self._get_row_group_stats_from_hdfs_folder(hdfs_path,
                                                                 tmpdir.strpath)
    assert len(row_group_stats) > 1
    orderkey_stats = [s[0] for s in row_group_stats]

    # Make sure that they don't overlap by ordering by the min value, then looking at
    # boundaries.
    orderkey_stats.sort(key = lambda s: s.min)
    for l, r in zip(orderkey_stats, orderkey_stats[1:]):
      assert l.max <= r.min
Example #14
    def test_insert_alter_partition_location(self):
        """Test that inserts after changing the location of a partition work correctly,
        including the creation of a non-existent partition dir"""
        PART_DIR = "tmp/test_insert_alter_partition_location"
        QUALIFIED_PART_DIR = get_fs_path('/' + PART_DIR)
        TBL_NAME = "functional.insert_alter_partition_location"

        self.execute_query_expect_success(self.client,
                                          "DROP TABLE IF EXISTS %s" % TBL_NAME)
        self.hdfs_client.delete_file_dir(PART_DIR, recursive=True)

        self.execute_query_expect_success(
            self.client,
            "CREATE TABLE  %s (c int) PARTITIONED BY (p int)" % TBL_NAME)
        self.execute_query_expect_success(
            self.client, "ALTER TABLE %s ADD PARTITION(p=1)" % TBL_NAME)
        self.execute_query_expect_success(
            self.client, "ALTER TABLE %s PARTITION(p=1) SET LOCATION '%s'" %
            (TBL_NAME, QUALIFIED_PART_DIR))
        self.execute_query_expect_success(
            self.client,
            "INSERT OVERWRITE %s PARTITION(p=1) VALUES(1)" % TBL_NAME)

        result = self.execute_query_expect_success(
            self.client, "SELECT COUNT(*) FROM %s" % TBL_NAME)
        assert int(result.get_data()) == 1

        # Should have created the partition dir, which should contain exactly one file (not in
        # a subdirectory)
        ls = self.hdfs_client.list_dir(PART_DIR)
        assert len(ls['FileStatuses']['FileStatus']) == 1
Example #15
    def test_insert_parquet_verify_size(self, vector):
        # Test to verify that the result file size is close to what we expect.
        TBL = "parquet_insert_size"
        DROP = "drop table if exists {0}".format(TBL)
        CREATE = ("create table parquet_insert_size like tpch_parquet.orders"
                  " stored as parquet location '{0}/{1}'".format(
                      WAREHOUSE, TBL))
        QUERY = "insert overwrite {0} select * from tpch.orders".format(TBL)
        DIR = get_fs_path("test-warehouse/{0}/".format(TBL))
        BLOCK_SIZE = 40 * 1024 * 1024

        self.execute_query(DROP)
        self.execute_query(CREATE)

        vector.get_value('exec_option')['PARQUET_FILE_SIZE'] = BLOCK_SIZE
        vector.get_value('exec_option')['COMPRESSION_CODEC'] =\
            vector.get_value('compression_codec')
        vector.get_value('exec_option')['num_nodes'] = 1
        self.execute_query(QUERY, vector.get_value('exec_option'))

        # Get the files in hdfs and verify. There can be at most 1 file that is smaller
        # than the BLOCK_SIZE. The rest should be at least 80% of it and not over it.
        found_small_file = False
        ls = self.hdfs_client.list_dir(DIR)
        for f in ls['FileStatuses']['FileStatus']:
            if f['type'] != 'FILE':
                continue
            length = f['length']
            print(length)
            assert length < BLOCK_SIZE
            if length < BLOCK_SIZE * 0.80:
                assert not found_small_file
                found_small_file = True
Example #16
    def test_strings_utf8(self, vector, unique_database):
        # Create table
        table_name = "ice_str_utf8"
        qualified_table_name = "%s.%s" % (unique_database, table_name)
        query = 'create table %s (a string) stored as iceberg' % qualified_table_name
        self.client.execute(query)

        # Inserted string data should have UTF8 annotation regardless of query options.
        query = 'insert into %s values ("impala")' % qualified_table_name
        self.execute_query(query, {'parquet_annotate_strings_utf8': False})

        # Copy the created file to the local filesystem and parse metadata
        local_file = '/tmp/iceberg_utf8_test_%s.parq' % random.randint(
            0, 10000)
        LOG.info("test_strings_utf8 local file name: " + local_file)
        hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/data/*.parq' %
                                (unique_database, table_name))
        check_call(['hadoop', 'fs', '-copyToLocal', hdfs_file, local_file])
        metadata = get_parquet_metadata(local_file)

        # Extract SchemaElements corresponding to the table column
        a_schema_element = metadata.schema[1]
        assert a_schema_element.name == 'a'

        # Check that the schema uses the UTF8 annotation
        assert a_schema_element.converted_type == ConvertedType.UTF8

        os.remove(local_file)
Example #17
 def test_permanent_udfs(self):
     # Make sure the pre-calculated count tallies with the number of
     # functions shown using "show [aggregate] functions" statement
     self.verify_function_count(
         "SHOW FUNCTIONS in {0}".format(self.DATABASE), self.udf_count)
     self.verify_function_count(
         "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE),
         self.uda_count)
     # invalidate metadata and make sure the count tallies
     result = self.client.execute("INVALIDATE METADATA")
     self.verify_function_count(
         "SHOW FUNCTIONS in {0}".format(self.DATABASE), self.udf_count)
     self.verify_function_count(
         "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE),
         self.uda_count)
     # Restart the cluster, this triggers a full metadata reload
     self.__restart_cluster()
     # Make sure the counts of udfs and udas match post restart
     self.verify_function_count(
         "SHOW FUNCTIONS in {0}".format(self.DATABASE), self.udf_count)
     self.verify_function_count(
         "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE),
         self.uda_count)
     # Drop sample udas and verify the count matches pre and post restart
     self.__load_drop_functions(
         self.DROP_SAMPLE_UDAS_TEMPLATE, self.DATABASE,
         get_fs_path('/test-warehouse/libudasample.so'))
     self.verify_function_count(
         "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), 1)
     self.__restart_cluster()
     self.verify_function_count(
         "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), 1)
 def _ctas_table_and_verify_index(self,
                                  vector,
                                  unique_database,
                                  source_table,
                                  tmpdir,
                                  sorting_column=None):
     """Copies 'source_table' into a parquet table and makes sure that the index
     in the resulting parquet file is valid.
     """
     table_name = "test_hdfs_parquet_table_writer"
     qualified_table_name = "{0}.{1}".format(unique_database, table_name)
     hdfs_path = get_fs_path('/test-warehouse/{0}.db/{1}/'.format(
         unique_database, table_name))
     # Setting num_nodes = 1 ensures that the query is executed on the coordinator,
     # resulting in a single parquet file being written.
     vector.get_value('exec_option')['num_nodes'] = 1
     self.execute_query(
         "drop table if exists {0}".format(qualified_table_name))
     if sorting_column is None:
         query = ("create table {0} stored as parquet as select * from {1}"
                  ).format(qualified_table_name, source_table)
     else:
         query = (
             "create table {0} sort by({1}) stored as parquet as select * from {2}"
         ).format(qualified_table_name, sorting_column, source_table)
     self.execute_query(query, vector.get_value('exec_option'))
     self._validate_parquet_page_index(hdfs_path, tmpdir.join(source_table))
Example #19
    def _ctas_and_get_metadata(self,
                               vector,
                               unique_database,
                               tmp_dir,
                               source_table,
                               table_name="test_hdfs_parquet_table_writer"):
        """CTAS 'source_table' into a Parquet table and returns its Parquet metadata."""
        qualified_table_name = "{0}.{1}".format(unique_database, table_name)
        hdfs_path = get_fs_path('/test-warehouse/{0}.db/{1}/'.format(
            unique_database, table_name))

        # Setting num_nodes = 1 ensures that the query is executed on the coordinator,
        # resulting in a single parquet file being written.
        query = (
            "create table {0} stored as parquet as select * from {1}").format(
                qualified_table_name, source_table)
        vector.get_value('exec_option')['num_nodes'] = 1
        self.execute_query_expect_success(self.client, query,
                                          vector.get_value('exec_option'))

        file_metadata_list = get_parquet_metadata_from_hdfs_folder(
            hdfs_path, tmp_dir)
        assert len(file_metadata_list) == 1
        assert file_metadata_list[0] is not None
        return file_metadata_list[0]
  def test_sorting_columns(self, vector, unique_database, tmpdir):
    """Tests that RowGroup::sorting_columns gets populated when specifying a sortby()
    insert hint."""
    source_table = "functional_parquet.alltypessmall"
    target_table = "test_write_sorting_columns"
    qualified_target_table = "{0}.{1}".format(unique_database, target_table)
    hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(unique_database,
        target_table))

    # Create table
    # TODO: Simplify once IMPALA-4167 (insert hints in CTAS) has been fixed.
    query = "create table {0} like {1} stored as parquet".format(qualified_target_table,
        source_table)
    self.execute_query(query)

    # Insert data
    query = ("insert into {0} partition(year, month) /* +sortby(int_col, id) */ "
        "select * from {1}").format(qualified_target_table, source_table)
    self.execute_query(query)

    # Download hdfs files and extract rowgroup metadata
    row_groups = []
    check_call(['hdfs', 'dfs', '-get', hdfs_path, tmpdir.strpath])

    for root, subdirs, files in os.walk(tmpdir.strpath):
      for f in files:
        parquet_file = os.path.join(root, str(f))
        file_meta_data = get_parquet_metadata(parquet_file)
        row_groups.extend(file_meta_data.row_groups)

    # Verify that the files have the sorted_columns set
    expected = [SortingColumn(4, False, False), SortingColumn(0, False, False)]
    for row_group in row_groups:
      assert row_group.sorting_columns == expected
Example #21
    def _ctas_table_and_verify_stats(
            self,
            vector,
            unique_database,
            tmp_dir,
            source_table,
            expected_values,
            table_name="test_hdfs_parquet_table_writer"):
        """Copies 'source_table' into a parquet table and makes sure that the row group
        statistics in the resulting parquet file match those in 'expected_values'. 'tmp_dir'
        needs to be supplied by the caller and will be used to store temporary files. The
        caller is responsible for cleaning up 'tmp_dir'.
        """
        qualified_table_name = "{0}.{1}".format(unique_database, table_name)
        hdfs_path = get_fs_path('/test-warehouse/{0}.db/{1}/'.format(
            unique_database, table_name))

        # Setting num_nodes = 1 ensures that the query is executed on the coordinator,
        # resulting in a single parquet file being written.
        self.execute_query(
            "drop table if exists {0}".format(qualified_table_name))
        query = (
            "create table {0} stored as parquet as select * from {1}").format(
                qualified_table_name, source_table)
        vector.get_value('exec_option')['num_nodes'] = 1
        self.execute_query(query, vector.get_value('exec_option'))
        self._validate_parquet_stats(hdfs_path, tmp_dir, expected_values)
 def test_libs_with_same_filenames(self, vector):
   self.client.execute("create database if not exists same_lib_filename_udf_test "
       "location '%s'" % get_fs_path('/test-warehouse/same_lib_filename_udf_test.db'))
   try:
     self.run_test_case('QueryTest/libs_with_same_filenames', vector)
   finally:
     self.client.execute("drop database if exists same_lib_filename_udf_test cascade")
Example #23
    def test_udf_constant_folding(self, vector, unique_database):
        """Test that constant folding of UDFs is handled correctly. Uses count_rows(),
        which returns a unique value every time it is evaluated in the same thread."""
        exec_options = copy(vector.get_value('exec_option'))
        # Execute on a single node so that all counter values will be unique.
        exec_options["num_nodes"] = 1
        create_fn_query = """create function {database}.count_rows() returns bigint
                         location '{location}' symbol='Count' prepare_fn='CountPrepare'
                         close_fn='CountClose'"""
        self._load_functions(create_fn_query, vector, unique_database,
                             get_fs_path('/test-warehouse/libTestUdfs.so'))

        # Only one distinct value if the expression is constant folded, otherwise one
        # value per row in alltypes
        expected_ndv = 1 if exec_options['enable_expr_rewrites'] else 7300

        # Test fully constant expression, evaluated in FE.
        query = "select `{0}`.count_rows() from functional.alltypes".format(
            unique_database)
        result = self.execute_query_expect_success(self.client, query,
                                                   exec_options)
        actual_ndv = len(set(result.data))
        assert actual_ndv == expected_ndv

        # Test constant argument to a non-constant expr. The argument value can be
        # cached in the backend.
        query = """select concat(cast(`{0}`.count_rows() as string), '-', string_col)
               from functional.alltypes""".format(unique_database)
        result = self.execute_query_expect_success(self.client, query,
                                                   exec_options)
        actual_ndv = len(set(value.split("-")[0] for value in result.data))
        assert actual_ndv == expected_ndv
Example #24
    def test_set_column_orders(self, vector, unique_database, tmpdir):
        """Tests that the Parquet writers set FileMetaData::column_orders."""
        source_table = "functional_parquet.alltypessmall"
        target_table = "test_set_column_orders"
        qualified_target_table = "{0}.{1}".format(unique_database,
                                                  target_table)
        hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(
            unique_database, target_table))

        # Create table
        query = "create table {0} like {1} stored as parquet".format(
            qualified_target_table, source_table)
        self.execute_query(query)

        # Insert data
        query = (
            "insert into {0} partition(year, month) select * from {1}").format(
                qualified_target_table, source_table)
        self.execute_query(query)

        # Download hdfs files and verify column orders
        check_call(['hdfs', 'dfs', '-get', hdfs_path, tmpdir.strpath])

        expected_col_orders = [ColumnOrder(TYPE_ORDER=TypeDefinedOrder())] * 11

        for root, subdirs, files in os.walk(tmpdir.strpath):
            for f in files:
                parquet_file = os.path.join(root, str(f))
                file_meta_data = get_parquet_metadata(parquet_file)
                assert file_meta_data.column_orders == expected_col_orders
Example #25
    def test_udf_invalid_symbol(self, vector, unique_database):
        """ IMPALA-1642: Impala crashes if the symbol for a Hive UDF doesn't exist
        Crashing is non-deterministic so we run the UDF several times."""
        src_udf_path = os.path.join(os.environ['IMPALA_HOME'],
                                    'testdata/udfs/impala-hive-udfs.jar')
        tgt_udf_path = get_fs_path(
            '/test-warehouse/{0}.db/impala-hive-udfs.jar'.format(
                unique_database))
        drop_fn_stmt = (
            "drop function if exists `{0}`.fn_invalid_symbol(STRING)".format(
                unique_database))
        create_fn_stmt = (
            "create function `{0}`.fn_invalid_symbol(STRING) returns "
            "STRING LOCATION '{1}' SYMBOL='not.a.Symbol'".format(
                unique_database, tgt_udf_path))
        query = "select `{0}`.fn_invalid_symbol('test')".format(
            unique_database)

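        # Stage the jar in the test warehouse, create the function, and run the query
        # several times; each attempt should fail cleanly rather than crash the impalad.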
        self.filesystem_client.copy_from_local(src_udf_path, tgt_udf_path)
        self.client.execute(drop_fn_stmt)
        self.client.execute(create_fn_stmt)
        for _ in xrange(5):
            ex = self.execute_query_expect_failure(self.client, query)
            assert "Unable to find class" in str(ex)
        self.client.execute(drop_fn_stmt)
Example #26
    def test_drop_function_while_running(self, vector):
        self.client.execute(
            "drop function if exists default.drop_while_running(BIGINT)")
        self.client.execute("create function default.drop_while_running(BIGINT) returns "\
            "BIGINT LOCATION '%s' SYMBOL='Identity'" %
            get_fs_path('/test-warehouse/libTestUdfs.so'))
        query = \
            "select default.drop_while_running(l_orderkey) from tpch.lineitem limit 10000"

        # Run this query asynchronously.
        handle = self.execute_query_async(
            query,
            vector.get_value('exec_option'),
            table_format=vector.get_value('table_format'))

        # Fetch some rows from the async query to make sure the UDF is being used
        results = self.client.fetch(query, handle, 1)
        assert results.success
        assert len(results.data) == 1

        # Drop the function while the original query is running.
        self.client.execute("drop function default.drop_while_running(BIGINT)")

        # Fetch the rest of the rows, this should still be able to run the UDF
        results = self.client.fetch(query, handle, -1)
        assert results.success
        assert len(results.data) == 9999
  def test_udf_update_via_drop(self, vector, unique_database):
    """Test updating the UDF binary without restarting Impala. Dropping
    the function should remove the binary from the local cache."""
    # Run with sync_ddl to guarantee the drop is processed by all impalads.
    exec_options = vector.get_value('exec_option')
    exec_options['sync_ddl'] = 1
    old_udf = os.path.join(
        os.environ['IMPALA_HOME'], 'testdata/udfs/impala-hive-udfs.jar')
    new_udf = os.path.join(
        os.environ['IMPALA_HOME'], 'tests/test-hive-udfs/target/test-hive-udfs-1.0.jar')
    udf_dst = get_fs_path('/test-warehouse/impala-hive-udfs2.jar')

    drop_fn_stmt = (
        'drop function if exists `{0}`.`udf_update_test_drop`()'.format(unique_database))
    create_fn_stmt = (
        "create function `{0}`.`udf_update_test_drop`() returns string LOCATION '{1}' "
        "SYMBOL='com.cloudera.impala.TestUpdateUdf'".format(unique_database, udf_dst))
    query_stmt = "select `{0}`.`udf_update_test_drop`()".format(unique_database)

    # Put the old UDF binary on HDFS, make the UDF in Impala and run it.
    check_call(["hadoop", "fs", "-put", "-f", old_udf, udf_dst])
    self.execute_query_expect_success(self.client, drop_fn_stmt, exec_options)
    self.execute_query_expect_success(self.client, create_fn_stmt, exec_options)
    self.__run_query_all_impalads(exec_options, query_stmt, ["Old UDF"])

    # Update the binary, drop and create the function again. The new binary should
    # be running.
    check_call(["hadoop", "fs", "-put", "-f", new_udf, udf_dst])
    self.execute_query_expect_success(self.client, drop_fn_stmt, exec_options)
    self.execute_query_expect_success(self.client, create_fn_stmt, exec_options)
    self.__run_query_all_impalads(exec_options, query_stmt, ["New UDF"])
  def test_sorting_columns(self, vector, unique_database, tmpdir):
    """Tests that RowGroup::sorting_columns gets populated when the table has SORT BY
    columns."""
    source_table = "functional_parquet.alltypessmall"
    target_table = "test_write_sorting_columns"
    qualified_target_table = "{0}.{1}".format(unique_database, target_table)
    hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(unique_database,
        target_table))

    # Create table
    query = "create table {0} sort by (int_col, id) like {1} stored as parquet".format(
        qualified_target_table, source_table)
    self.execute_query(query)

    # Insert data
    query = ("insert into {0} partition(year, month) select * from {1}").format(
        qualified_target_table, source_table)
    self.execute_query(query)

    # Download hdfs files and extract rowgroup metadata
    file_metadata_list = get_parquet_metadata_from_hdfs_folder(hdfs_path, tmpdir.strpath)
    row_groups = []

    for file_metadata in file_metadata_list:
      row_groups.extend(file_metadata.row_groups)

    # Verify that the files have the sorted_columns set
    expected = [SortingColumn(4, False, False), SortingColumn(0, False, False)]
    for row_group in row_groups:
      assert row_group.sorting_columns == expected
Example #29
    def test_udf_update_via_drop(self, vector):
        """Test updating the UDF binary without restarting Impala. Dropping
        the function should remove the binary from the local cache."""
        # Run with sync_ddl to guarantee the drop is processed by all impalads.
        exec_options = vector.get_value('exec_option')
        exec_options['sync_ddl'] = 1
        old_udf = os.path.join(os.environ['IMPALA_HOME'],
                               'testdata/udfs/impala-hive-udfs.jar')
        new_udf = os.path.join(
            os.environ['IMPALA_HOME'],
            'tests/test-hive-udfs/target/test-hive-udfs-1.0.jar')
        udf_dst = get_fs_path('/test-warehouse/impala-hive-udfs2.jar')

        drop_fn_stmt = 'drop function if exists default.udf_update_test_drop()'
        create_fn_stmt = "create function default.udf_update_test_drop() returns string "\
            "LOCATION '" + udf_dst + "' SYMBOL='com.cloudera.impala.TestUpdateUdf'"
        query_stmt = "select default.udf_update_test_drop()"

        # Put the old UDF binary on HDFS, make the UDF in Impala and run it.
        call(["hadoop", "fs", "-put", "-f", old_udf, udf_dst])
        self.execute_query_expect_success(self.client, drop_fn_stmt,
                                          exec_options)
        self.execute_query_expect_success(self.client, create_fn_stmt,
                                          exec_options)
        self.__run_query_all_impalads(exec_options, query_stmt, ["Old UDF"])

        # Update the binary, drop and create the function again. The new binary should
        # be running.
        call(["hadoop", "fs", "-put", "-f", new_udf, udf_dst])
        self.execute_query_expect_success(self.client, drop_fn_stmt,
                                          exec_options)
        self.execute_query_expect_success(self.client, create_fn_stmt,
                                          exec_options)
        self.__run_query_all_impalads(exec_options, query_stmt, ["New UDF"])
  def test_insert_alter_partition_location(self):
    """Test that inserts after changing the location of a partition work correctly,
    including the creation of a non-existent partition dir"""
    PART_DIR = "tmp/test_insert_alter_partition_location"
    QUALIFIED_PART_DIR = get_fs_path('/' + PART_DIR)
    TBL_NAME = "functional.insert_alter_partition_location"

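    # Start from a clean slate: drop any existing table and remove the old partition
    # directory before recreating both.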
    self.execute_query_expect_success(self.client, "DROP TABLE IF EXISTS %s" % TBL_NAME)
    self.hdfs_client.delete_file_dir(PART_DIR, recursive=True)

    self.execute_query_expect_success(self.client,
        "CREATE TABLE  %s (c int) PARTITIONED BY (p int)" % TBL_NAME)
    self.execute_query_expect_success(self.client,
        "ALTER TABLE %s ADD PARTITION(p=1)" % TBL_NAME)
    self.execute_query_expect_success(self.client,
        "ALTER TABLE %s PARTITION(p=1) SET LOCATION '%s'" %
        (TBL_NAME, QUALIFIED_PART_DIR))
    self.execute_query_expect_success(self.client,
        "INSERT OVERWRITE %s PARTITION(p=1) VALUES(1)" % TBL_NAME)

    result = self.execute_query_expect_success(self.client,
        "SELECT COUNT(*) FROM %s" % TBL_NAME)
    assert int(result.get_data()) == 1

    # Should have created the partition dir, which should contain exactly one file (not in
    # a subdirectory)
    ls = self.hdfs_client.list_dir(PART_DIR)
    assert len(ls['FileStatuses']['FileStatus']) == 1
  def test_set_column_orders(self, vector, unique_database, tmpdir):
    """Tests that the Parquet writers set FileMetaData::column_orders."""
    source_table = "functional_parquet.alltypessmall"
    target_table = "test_set_column_orders"
    qualified_target_table = "{0}.{1}".format(unique_database, target_table)
    hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(unique_database,
        target_table))

    # Create table
    query = "create table {0} like {1} stored as parquet".format(qualified_target_table,
        source_table)
    self.execute_query(query)

    # Insert data
    query = ("insert into {0} partition(year, month) select * from {1}").format(
        qualified_target_table, source_table)
    self.execute_query(query)

    # Download hdfs files and verify column orders
    file_metadata_list = get_parquet_metadata_from_hdfs_folder(hdfs_path, tmpdir.strpath)

    expected_col_orders = [ColumnOrder(TYPE_ORDER=TypeDefinedOrder())] * 11

    for file_metadata in file_metadata_list:
      assert file_metadata.column_orders == expected_col_orders
 def test_corrupt_rle_counts(self, vector, unique_database):
     """IMPALA-3646: Tests that a certain type of file corruption for plain
     dictionary encoded values is gracefully handled. Cases tested:
     - incorrect literal count of 0 for the RLE encoded dictionary indexes
     - incorrect repeat count of 0 for the RLE encoded dictionary indexes
     """
     # Create test table and copy the corrupt files into it.
     self.client.execute(
         "create table %s.bad_rle_counts (c bigint) stored as parquet" %
         unique_database)
     bad_rle_counts_tbl_loc =\
         get_fs_path("/test-warehouse/%s.db/%s" % (unique_database, "bad_rle_counts"))
     check_call([
         'hdfs', 'dfs', '-copyFromLocal', os.environ['IMPALA_HOME'] +
         "/testdata/data/bad_rle_literal_count.parquet",
         bad_rle_counts_tbl_loc
     ])
     check_call([
         'hdfs', 'dfs', '-copyFromLocal', os.environ['IMPALA_HOME'] +
         "/testdata/data/bad_rle_repeat_count.parquet",
         bad_rle_counts_tbl_loc
     ])
     # Querying the corrupted files should not DCHECK or crash.
     vector.get_value('exec_option')['abort_on_error'] = 0
     self.run_test_case('QueryTest/parquet-corrupt-rle-counts', vector,
                        unique_database)
     vector.get_value('exec_option')['abort_on_error'] = 1
     self.run_test_case('QueryTest/parquet-corrupt-rle-counts-abort',
                        vector, unique_database)
  def test_insert_parquet_verify_size(self, vector, unique_database):
    # Test to verify that the result file size is close to what we expect.
    tbl_name = "parquet_insert_size"
    fq_tbl_name = unique_database + "." + tbl_name
    location = get_fs_path("test-warehouse/{0}.db/{1}/"
                           .format(unique_database, tbl_name))
    create = ("create table {0} like tpch_parquet.orders stored as parquet"
              .format(fq_tbl_name, location))
    query = "insert overwrite {0} select * from tpch.orders".format(fq_tbl_name)
    block_size = 40 * 1024 * 1024

    self.execute_query(create)
    vector.get_value('exec_option')['PARQUET_FILE_SIZE'] = block_size
    vector.get_value('exec_option')['COMPRESSION_CODEC'] =\
        vector.get_value('compression_codec')
    vector.get_value('exec_option')['num_nodes'] = 1
    self.execute_query(query, vector.get_value('exec_option'))

    # Get the files in hdfs and verify. There can be at most 1 file that is smaller
    # than the block_size. The rest should be at least 80% of it and not over it.
    found_small_file = False
    sizes = self.filesystem_client.get_all_file_sizes(location)
    for size in sizes:
      assert size < block_size, "File size greater than expected.\
          Expected: {0}, Got: {1}".format(block_size, size)
      if size < block_size * 0.80:
        assert not found_small_file
        found_small_file = True
  def test_sorting_columns(self, vector, unique_database, tmpdir):
    """Tests that RowGroup::sorting_columns gets populated when the table has SORT BY
    columns."""
    source_table = "functional_parquet.alltypessmall"
    target_table = "test_write_sorting_columns"
    qualified_target_table = "{0}.{1}".format(unique_database, target_table)
    hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(unique_database,
        target_table))

    # Create table
    query = "create table {0} sort by (int_col, id) like {1} stored as parquet".format(
        qualified_target_table, source_table)
    self.execute_query(query)

    # Insert data
    query = ("insert into {0} partition(year, month) select * from {1}").format(
        qualified_target_table, source_table)
    self.execute_query(query)

    # Download hdfs files and extract rowgroup metadata
    file_metadata_list = get_parquet_metadata_from_hdfs_folder(hdfs_path, tmpdir.strpath)
    row_groups = []

    for file_metadata in file_metadata_list:
      row_groups.extend(file_metadata.row_groups)

    # Verify that the files have the sorted_columns set
    expected = [SortingColumn(4, False, False), SortingColumn(0, False, False)]
    for row_group in row_groups:
      assert row_group.sorting_columns == expected
  def test_drop_function_while_running(self, vector, unique_database):
    self.client.execute("drop function if exists `{0}`.drop_while_running(BIGINT)"
                        .format(unique_database))
    self.client.execute(
        "create function `{0}`.drop_while_running(BIGINT) returns "
        "BIGINT LOCATION '{1}' SYMBOL='Identity'".format(
            unique_database,
            get_fs_path('/test-warehouse/libTestUdfs.so')))
    query = ("select `{0}`.drop_while_running(l_orderkey) from tpch.lineitem limit 10000"
             .format(unique_database))

    # Run this query asynchronously.
    handle = self.execute_query_async(query, vector.get_value('exec_option'),
                                      table_format=vector.get_value('table_format'))

    # Fetch some rows from the async query to make sure the UDF is being used
    results = self.client.fetch(query, handle, 1)
    assert results.success
    assert len(results.data) == 1

    # Drop the function while the original query is running.
    self.client.execute(
        "drop function `{0}`.drop_while_running(BIGINT)".format(unique_database))

    # Fetch the rest of the rows, this should still be able to run the UDF
    results = self.client.fetch(query, handle, -1)
    assert results.success
    assert len(results.data) == 9999
  def test_insert_alter_partition_location(self, unique_database):
    """Test that inserts after changing the location of a partition work correctly,
    including the creation of a non-existent partition dir"""
    part_dir = "tmp/{0}".format(unique_database)
    qualified_part_dir = get_fs_path('/' + part_dir)
    table_name = "`{0}`.`insert_alter_partition_location`".format(unique_database)

    self.execute_query_expect_success(self.client, "DROP TABLE IF EXISTS %s" % table_name)
    self.filesystem_client.delete_file_dir(part_dir, recursive=True)

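    # Recreate the table, point partition p=1 at a fresh directory, and insert a row;
    # the insert should create the partition directory with exactly one file.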
    self.execute_query_expect_success(
        self.client,
        "CREATE TABLE  %s (c int) PARTITIONED BY (p int)" % table_name)
    self.execute_query_expect_success(
        self.client,
        "ALTER TABLE %s ADD PARTITION(p=1)" % table_name)
    self.execute_query_expect_success(
        self.client,
        "ALTER TABLE %s PARTITION(p=1) SET LOCATION '%s'" % (table_name,
                                                             qualified_part_dir))
    self.execute_query_expect_success(
        self.client,
        "INSERT OVERWRITE %s PARTITION(p=1) VALUES(1)" % table_name)

    result = self.execute_query_expect_success(
        self.client,
        "SELECT COUNT(*) FROM %s" % table_name)
    assert int(result.get_data()) == 1

    # Should have created the partition dir, which should contain exactly one file (not in
    # a subdirectory)
    assert len(self.filesystem_client.ls(part_dir)) == 1
  def test_clustered_partition_single_file(self, unique_database):
    """IMPALA-2523: Tests that clustered insert creates one file per partition, even when
    inserting over multiple row batches."""
    # On s3 this test takes about 220 seconds and we are unlikely to break it, so only run
    # it in exhaustive strategy.
    if self.exploration_strategy() != 'exhaustive' and IS_S3:
      pytest.skip("only runs in exhaustive")
    table = "{0}.insert_clustered".format(unique_database)
    table_path = "test-warehouse/{0}.db/insert_clustered".format(unique_database)
    table_location = get_fs_path("/" + table_path)

    create_stmt = """create table {0} like functional.alltypes""".format(table)
    self.execute_query_expect_success(self.client, create_stmt)

    set_location_stmt = """alter table {0} set location '{1}'""".format(
        table, table_location)
    self.execute_query_expect_success(self.client, set_location_stmt)

    # Setting a lower batch size will result in multiple row batches being written.
    self.execute_query_expect_success(self.client, "set batch_size=10")

    insert_stmt = """insert into {0} partition(year, month) /*+ clustered,shuffle */
                     select * from functional.alltypes""".format(table)
    self.execute_query_expect_success(self.client, insert_stmt)

    # We expect exactly one partition per year and month, since subsequent row batches of
    # a partition will be written into the same file.
    expected_partitions = \
        ["year=%s/month=%s" % (y, m) for y in [2009, 2010] for m in range(1,13)]

    for partition in expected_partitions:
      partition_path = "{0}/{1}".format(table_path, partition)
      files = self.filesystem_client.ls(partition_path)
      assert len(files) == 1, "%s: %s" % (partition, files)
  def test_udf_constant_folding(self, vector, unique_database):
    """Test that constant folding of UDFs is handled correctly. Uses count_rows(),
    which returns a unique value every time it is evaluated in the same thread."""
    exec_options = copy(vector.get_value('exec_option'))
    # Execute on a single node so that all counter values will be unique.
    exec_options["num_nodes"] = 1
    create_fn_query = """create function {database}.count_rows() returns bigint
                         location '{location}' symbol='Count' prepare_fn='CountPrepare'
                         close_fn='CountClose'"""
    self._load_functions(create_fn_query, vector, unique_database,
        get_fs_path('/test-warehouse/libTestUdfs.so'))

    # Only one distinct value if the expression is constant folded, otherwise one
    # value per row in alltypes
    expected_ndv = 1 if exec_options['enable_expr_rewrites'] else 7300

    # Test fully constant expression, evaluated in FE.
    query = "select `{0}`.count_rows() from functional.alltypes".format(unique_database)
    result = self.execute_query_expect_success(self.client, query, exec_options)
    actual_ndv = len(set(result.data))
    assert actual_ndv == expected_ndv

    # Test constant argument to a non-constant expr. The argument value can be
    # cached in the backend.
    query = """select concat(cast(`{0}`.count_rows() as string), '-', string_col)
               from functional.alltypes""".format(unique_database)
    result = self.execute_query_expect_success(self.client, query, exec_options)
    actual_ndv = len(set(value.split("-")[0] for value in result.data))
    assert actual_ndv == expected_ndv
  def test_insert_parquet_verify_size(self, vector):
    # Test to verify that the result file size is close to what we expect.
    TBL = "parquet_insert_size"
    DROP = "drop table if exists {0}".format(TBL)
    CREATE = ("create table parquet_insert_size like tpch_parquet.orders"
              " stored as parquet location '{0}/{1}'".format(WAREHOUSE, TBL))
    QUERY = "insert overwrite {0} select * from tpch.orders".format(TBL)
    DIR = get_fs_path("test-warehouse/{0}/".format(TBL))
    BLOCK_SIZE = 40 * 1024 * 1024

    self.execute_query(DROP)
    self.execute_query(CREATE)

    vector.get_value('exec_option')['PARQUET_FILE_SIZE'] = BLOCK_SIZE
    vector.get_value('exec_option')['COMPRESSION_CODEC'] =\
        vector.get_value('compression_codec')
    vector.get_value('exec_option')['num_nodes'] = 1
    self.execute_query(QUERY, vector.get_value('exec_option'))

    # Get the files in hdfs and verify. There can be at most 1 file that is smaller
    # than the BLOCK_SIZE. The rest should be at least 80% of it and not over it.
    found_small_file = False
    sizes = self.filesystem_client.get_all_file_sizes(DIR)
    for size in sizes:
      assert size < BLOCK_SIZE, "File size greater than expected.\
          Expected: {0}, Got: {1}".format(BLOCK_SIZE, size)
      if size < BLOCK_SIZE * 0.80:
        assert not found_small_file
        found_small_file = True
 def test_permanent_udfs(self):
   # Make sure the pre-calculated count tallies with the number of
   # functions shown using "show [aggregate] functions" statement
   self.verify_function_count(
            "SHOW FUNCTIONS in {0}".format(self.DATABASE), self.udf_count)
   self.verify_function_count(
           "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), self.uda_count)
   # invalidate metadata and make sure the count tallies
   result = self.client.execute("INVALIDATE METADATA")
   self.verify_function_count(
            "SHOW FUNCTIONS in {0}".format(self.DATABASE), self.udf_count)
   self.verify_function_count(
           "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), self.uda_count)
   # Restart the cluster, this triggers a full metadata reload
   self.__restart_cluster()
   # Make sure the counts of udfs and udas match post restart
   self.verify_function_count(
            "SHOW FUNCTIONS in {0}".format(self.DATABASE), self.udf_count)
   self.verify_function_count(
           "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), self.uda_count)
   # Drop sample udas and verify the count matches pre and post restart
   self.__load_drop_functions(
       self.DROP_SAMPLE_UDAS_TEMPLATE, self.DATABASE,
       get_fs_path('/test-warehouse/libudasample.so'))
   self.verify_function_count(
           "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), 1)
   self.__restart_cluster()
   self.verify_function_count(
           "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), 1)
  def test_insert_parquet_verify_size(self, vector):
    # Test to verify that the result file size is close to what we expect.
    TBL = "parquet_insert_size"
    DROP = "drop table if exists {0}".format(TBL)
    CREATE = ("create table parquet_insert_size like tpch_parquet.orders"
              " stored as parquet location '{0}/{1}'".format(WAREHOUSE, TBL))
    QUERY = "insert overwrite {0} select * from tpch.orders".format(TBL)
    DIR = get_fs_path("/test-warehouse/{0}/".format(TBL))
    BLOCK_SIZE = 40 * 1024 * 1024

    self.execute_query(DROP)
    self.execute_query(CREATE)

    vector.get_value('exec_option')['PARQUET_FILE_SIZE'] = BLOCK_SIZE
    vector.get_value('exec_option')['COMPRESSION_CODEC'] =\
        vector.get_value('compression_codec')
    vector.get_value('exec_option')['num_nodes'] = 1
    self.execute_query(QUERY, vector.get_value('exec_option'))

    # Get the files in hdfs and verify. At most one file may be smaller than 80% of
    # BLOCK_SIZE; every file must stay strictly under BLOCK_SIZE.
    found_small_file = False
    ls = self.hdfs_client.list_dir(DIR)
    for f in ls['FileStatuses']['FileStatus']:
      if f['type'] != 'FILE':
        continue
      length = f['length']
      print(length)
      assert length < BLOCK_SIZE
      if length < BLOCK_SIZE * 0.80:
        assert not found_small_file
        found_small_file = True
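For reference, the dict shape walked above is the WebHDFS LISTSTATUS response returned by hdfs_client.list_dir; a minimal sketch (helper name is ours) of pulling out only the plain-file lengths:

def parquet_file_lengths(liststatus_response):
  # Sketch: 'FileStatuses' -> 'FileStatus' is the WebHDFS LISTSTATUS layout; skip
  # directories and keep only the lengths of FILE entries.
  return [f['length'] for f in liststatus_response['FileStatuses']['FileStatus']
          if f['type'] == 'FILE']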
Example #42
    def test_drop_then_add_function_while_running(self, vector,
                                                  unique_database):
        self.client.execute(
            "drop function if exists `{0}`.drop_while_running(BIGINT)".format(
                unique_database))
        self.client.execute(
            "create function `{0}`.drop_while_running(BIGINT) returns "
            "BIGINT LOCATION '{1}' SYMBOL='Identity'".format(
                unique_database,
                get_fs_path('/test-warehouse/libTestUdfs.so')))
        query = (
            "select `{0}`.drop_while_running(l_orderkey) from tpch.lineitem limit 10000"
            .format(unique_database))

        # Run this query asynchronously.
        handle = self.execute_query_async(
            query,
            vector.get_value('exec_option'),
            table_format=vector.get_value('table_format'))

        # Fetch some rows from the async query to make sure the UDF is being used
        results = self.client.fetch(query, handle, 1)
        assert results.success
        assert len(results.data) == 1

        # Drop the function while the original query is running.
        self.client.execute(
            "drop function `{0}`.drop_while_running(BIGINT)".format(
                unique_database))

        # Fetch some rows from the async query to make sure the UDF is being used
        results = self.client.fetch(query, handle, 1)
        assert results.success
        assert len(results.data) == 1

        # Re-create function associated with the same binary while the original query is
        # running
        self.client.execute(
            "create function `{0}`.drop_while_running(BIGINT) returns "
            "BIGINT LOCATION '{1}' SYMBOL='Identity'".format(
                unique_database,
                get_fs_path('/test-warehouse/libTestUdfs.so')))

        # Fetch the rest of the rows, this should still be able to run the UDF
        results = self.client.fetch(query, handle, -1)
        assert results.success
        assert len(results.data) == 9998
Example #43
 def _create_test_table(self, dbname, tablename, filename, columns):
   """Creates a table in the given database with the given name and columns. Copies
   the file with the given name from TESTFILE_DIR into the table."""
   location = get_fs_path("/test-warehouse/%s.db/%s" % (dbname, tablename))
   self.client.execute("create table %s.%s (%s) stored as parquet location '%s'" %
                       (dbname, tablename, columns, location))
   local_path = self.TESTFILE_DIR + "/" + filename
   self.filesystem_client.copy_from_local(local_path, location)
Example #44
 def test_confirm_individual_refresh(self, vector, unique_database):
     """
      Data added directly to HDFS is only visible in the partition that was refreshed.
      """
     table_name = unique_database + '.' + "partition_test_table"
     table_location = get_fs_path("/test-warehouse/%s" % unique_database)
     file_name = "alltypes.parq"
     src_file = get_fs_path(
         "/test-warehouse/alltypesagg_parquet/year=2010/month=1/"
         "day=9/*.parq")
     file_num_rows = 1000
     self.client.execute("""
   create table %s like functional.alltypes stored as parquet
   location '%s'
 """ % (table_name, table_location))
     for month in [1, 2]:
         self.client.execute(
             "alter table %s add partition (year=2010, month=%s)" %
             (table_name, month))
     self.client.execute("refresh %s" % table_name)
     # Check that there is no data in table
     result = self.client.execute("select count(*) from %s" % table_name)
     assert result.data == [str(0)]
     dst_path = table_location + "/year=2010/month=%s/" + file_name
     for month in [1, 2]:
         check_call(
             ["hadoop", "fs", "-cp", "-f", src_file, dst_path % month],
             shell=False)
     # Check that data added is not visible before refresh
     result = self.client.execute("select count(*) from %s" % table_name)
     assert result.data == [str(0)]
     # Check that data is visible after refresh on the first partition only
     self.client.execute("refresh %s partition (year=2010, month=1)" %
                         table_name)
     result = self.client.execute("select count(*) from %s" % table_name)
     assert result.data == [str(file_num_rows)]
     # Check that the data is not yet visible for the second partition
     # that was not refreshed
     result = self.client.execute(
         "select count(*) from %s where year=2010 and month=2" % table_name)
     assert result.data == [str(0)]
     # Check that data is visible for the second partition after refresh
     self.client.execute("refresh %s partition (year=2010, month=2)" %
                         table_name)
     result = self.client.execute("select count(*) from %s" % table_name)
     assert result.data == [str(file_num_rows * 2)]
Example #45
    def test_native_functions(self, vector):
        database = 'native_function_test'

        self.__load_functions(self.create_udfs_template, vector, database,
                              get_fs_path('/test-warehouse/libTestUdfs.so'))
        self.__load_functions(self.create_sample_udas_template, vector,
                              database,
                              get_fs_path('/test-warehouse/libudasample.so'))
        self.__load_functions(self.create_test_udas_template, vector, database,
                              get_fs_path('/test-warehouse/libTestUdas.so'))

        self.run_test_case('QueryTest/udf', vector, use_db=database)
        if not IS_S3:  # S3 doesn't support INSERT
            self.run_test_case('QueryTest/udf-init-close',
                               vector,
                               use_db=database)
        self.run_test_case('QueryTest/uda', vector, use_db=database)
Example #46
 def test_describe_db(self, vector):
   self.__test_describe_db_cleanup()
   try:
     self.client.execute("create database impala_test_desc_db1")
     self.client.execute("create database impala_test_desc_db2 "
                         "comment 'test comment'")
     self.client.execute("create database impala_test_desc_db3 "
                         "location '" + get_fs_path("/testdb") + "'")
     self.client.execute("create database impala_test_desc_db4 comment 'test comment' "
                         "location \"" + get_fs_path("/test2.db") + "\"")
     self.run_stmt_in_hive("create database hive_test_desc_db comment 'test comment' "
                          "with dbproperties('pi' = '3.14', 'e' = '2.82')")
     self.run_stmt_in_hive("alter database hive_test_desc_db set owner user test")
     self.client.execute("invalidate metadata")
     self.run_test_case('QueryTest/describe-db', vector)
   finally:
     self.__test_describe_db_cleanup()
 def test_ir_functions(self, vector):
   database = 'ir_function_test'
   self.__load_functions(
     self.create_udfs_template, vector, database,
     get_fs_path('/test-warehouse/test-udfs.ll'))
   self.run_test_case('QueryTest/udf', vector, use_db=database)
   if not IS_S3: # S3 doesn't support INSERT
     self.run_test_case('QueryTest/udf-init-close', vector, use_db=database)
 def _create_test_table(self, dbname, tablename, filename, columns):
   """Creates a table in the given database with the given name and columns. Copies
   the file with the given name from TESTFILE_DIR into the table."""
   location = get_fs_path("/test-warehouse/%s.db/%s" % (dbname, tablename))
   self.client.execute("create table %s.%s (%s) stored as parquet location '%s'" %
                       (dbname, tablename, columns, location))
   local_path = self.TESTFILE_DIR + "/" + filename
   check_call(["hadoop", "fs", "-put", local_path, location], shell=False)
Example #49
 def _create_test_table(self, dbname, tablename, filename, columns):
   """Creates a table in the given database with the given name and columns. Copies
   the file with the given name from TESTFILE_DIR into the table."""
   location = get_fs_path("/test-warehouse/%s.db/%s" % (dbname, tablename))
   self.client.execute("create table %s.%s (%s) stored as parquet location '%s'" %
                       (dbname, tablename, columns, location))
   local_path = self.TESTFILE_DIR + "/" + filename
   check_call(["hadoop", "fs", "-put", local_path, location], shell=False)
 def test_scan_lazy_timestamp(self, vector, unique_database):
   self.client.execute(("""CREATE TABLE {0}.lazy_ts (ts TIMESTAMP)""").format
         (unique_database))
   tbl_loc = get_fs_path("/test-warehouse/%s.db/%s" % (unique_database,
         "lazy_ts"))
   check_call(['hdfs', 'dfs', '-copyFromLocal', os.environ['IMPALA_HOME'] +
         "/testdata/data/lazy_timestamp.csv", tbl_loc])
   self.run_test_case('QueryTest/select-lazy-timestamp', vector, unique_database)
Example #51
  def test_hdfs_caching_fallback_path(self, vector, unique_database, testid_checksum):
    """ This tests the code path of the query execution where the hdfs cache read fails
    and the execution falls back to the normal read path. To reproduce this situation we
    rely on IMPALA-3679, where zcrs are not supported with encryption zones. This makes
    sure ReadFromCache() fails and falls back to ReadRange() to read the scan range."""

    if self.exploration_strategy() != 'exhaustive' or\
        vector.get_value('table_format').file_format != 'text':
      pytest.skip()

    # Create a new encryption zone and copy the tpch.nation table data into it.
    encrypted_table_dir = get_fs_path("/test-warehouse/" + testid_checksum)
    create_query_sql = "CREATE EXTERNAL TABLE %s.cached_nation like tpch.nation "\
        "LOCATION '%s'" % (unique_database, encrypted_table_dir)
    check_call(["hdfs", "dfs", "-mkdir", encrypted_table_dir], shell=False)
    check_call(["hdfs", "crypto", "-createZone", "-keyName", "testKey1", "-path",\
        encrypted_table_dir], shell=False)
    check_call(["hdfs", "dfs", "-cp", get_fs_path("/test-warehouse/tpch.nation/*.tbl"),\
        encrypted_table_dir], shell=False)
    # Reduce the scan range size to force the query to have multiple scan ranges.
    exec_options = vector.get_value('exec_option')
    exec_options['max_scan_range_length'] = 1024
    try:
      self.execute_query_expect_success(self.client, create_query_sql)
      # Cache the table data
      self.execute_query_expect_success(self.client, "ALTER TABLE %s.cached_nation set "
         "cached in 'testPool'" % unique_database)
      # Wait till the whole path is cached. We set a deadline of 20 seconds for the path
      # to be cached to make sure this doesn't loop forever in case of caching errors.
      caching_deadline = time.time() + 20
      while not is_path_fully_cached(encrypted_table_dir):
        if time.time() > caching_deadline:
          pytest.fail("Timed out caching path: " + encrypted_table_dir)
        time.sleep(2)
      self.execute_query_expect_success(self.client, "invalidate metadata "
          "%s.cached_nation" % unique_database);
      result = self.execute_query_expect_success(self.client, "select count(*) from "
          "%s.cached_nation" % unique_database, exec_options)
      assert(len(result.data) == 1)
      assert(result.data[0] == '25')
    except Exception as e:
      pytest.fail("Failure in test_hdfs_caching_fallback_path: " + str(e))
    finally:
      check_call(["hdfs", "dfs", "-rm", "-r", "-f", "-skipTrash", encrypted_table_dir],\
          shell=False)
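The caching wait above is a poll-with-deadline loop; a generic standalone sketch of that pattern (the helper name is ours, and is_path_fully_cached itself is provided by the test utilities):

import time

def wait_until(predicate, timeout_s=20, interval_s=2):
  # Sketch: poll until predicate() is true or the deadline passes; return False on
  # timeout so the caller decides whether to fail the test.
  deadline = time.time() + timeout_s
  while not predicate():
    if time.time() > deadline:
      return False
    time.sleep(interval_s)
  return True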
Example #53
    def test_refresh_native(self):
        ''' This test checks that a native function is visible in Impala after a
        REFRESH FUNCTIONS command. We will add the native function through Hive
        by setting DBPROPERTIES of a database.'''
        # First we create the function in Impala.
        create_func_impala = (
            "create function {database}.identity_tmp(bigint) "
            "returns bigint location '{location}' symbol='Identity'")
        self.client.execute(
            create_func_impala.format(
                database=self.HIVE_IMPALA_INTEGRATION_DB,
                location=get_fs_path('/test-warehouse/libTestUdfs.so')))

        # Impala puts the native function into a database property table. We extract the key
        # value pair that represents the function from the table.
        describe_db_hive = "DESCRIBE DATABASE EXTENDED {database}".format(
            database=self.HIVE_IMPALA_INTEGRATION_DB)
        result = self.run_stmt_in_hive(describe_db_hive)
        regex = r"{(.*?)=(.*?)}"
        match = re.search(regex, result)
        func_name = match.group(1)
        func_contents = match.group(2)

        # Recreate the database, this deletes the function.
        self.client.execute("DROP DATABASE {database} CASCADE".format(
            database=self.HIVE_IMPALA_INTEGRATION_DB))
        self.client.execute("CREATE DATABASE {database}".format(
            database=self.HIVE_IMPALA_INTEGRATION_DB))
        result = self.client.execute("SHOW FUNCTIONS IN {database}".format(
            database=self.HIVE_IMPALA_INTEGRATION_DB))
        assert result is not None and len(result.data) == 0

        # Place the function into the recreated database by modifying its properties.
        alter_db_hive = "ALTER DATABASE {database} SET DBPROPERTIES ('{fn_name}'='{fn_val}')"
        self.run_stmt_in_hive(
            alter_db_hive.format(database=self.HIVE_IMPALA_INTEGRATION_DB,
                                 fn_name=func_name,
                                 fn_val=func_contents))
        result = self.client.execute("SHOW FUNCTIONS IN {database}".format(
            database=self.HIVE_IMPALA_INTEGRATION_DB))
        assert result is not None and len(result.data) == 0

        # The function should be visible in Impala after a REFRESH FUNCTIONS.
        self.client.execute("REFRESH FUNCTIONS {database}".format(
            database=self.HIVE_IMPALA_INTEGRATION_DB))
        result = self.client.execute("SHOW FUNCTIONS IN {database}".format(
            database=self.HIVE_IMPALA_INTEGRATION_DB))
        assert result is not None and len(result.data) > 0 and\
            "identity_tmp" in str(result.data)

        # Verify that the function returns a correct result.
        result = self.client.execute(
            "SELECT {database}.identity_tmp(10)".format(
                database=self.HIVE_IMPALA_INTEGRATION_DB))
        assert result.data[0] == "10"
        # Make sure we deleted all the temporary jars we copied to the local fs
        assert len(glob.glob(self.LOCAL_LIBRARY_DIR + "/*.jar")) == 0
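For reference, the {key=value} extraction above behaves as follows on a made-up DESCRIBE DATABASE EXTENDED line (the sample string is illustrative, not real Hive output):

import re

sample = "db  comment  hdfs://nn/db  owner  USER  {impala_fn_key=serialized_fn_value}"
m = re.search(r"{(.*?)=(.*?)}", sample)
print(m.group(1) + " -> " + m.group(2))  # impala_fn_key -> serialized_fn_value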
Example #54
 def test_ir_functions(self, vector):
     database = 'ir_function_test'
     self.__load_functions(self.create_udfs_template, vector, database,
                           get_fs_path('/test-warehouse/test-udfs.ll'))
     self.run_test_case('QueryTest/udf', vector, use_db=database)
     if not IS_S3:  # S3 doesn't support INSERT
         self.run_test_case('QueryTest/udf-init-close',
                            vector,
                            use_db=database)
 def _populate_hdfs_partitions(self):
     """ Copy some data to defaultFS HDFS filesystem so that the test can verify tables
 that span the default (HDFS) and secondary filesystem (e.g. S3A)."""
     check_call([
         "hadoop", "fs", "-cp",
         get_fs_path("/test-warehouse/alltypes_parquet"),
         "/test-warehouse/%s.db/" % self.TEST_DB
     ],
                shell=False)
 def test_hidden_symbol(self, vector, unique_database):
   """Test that symbols in the test UDFs are hidden by default and that therefore
   they cannot be used as a UDF entry point."""
   symbol = "_Z16UnexportedSymbolPN10impala_udf15FunctionContextE"
   ex = self.execute_query_expect_failure(self.client, """
       create function `{0}`.unexported() returns BIGINT LOCATION '{1}'
       SYMBOL='{2}'""".format(
       unique_database, get_fs_path('/test-warehouse/libTestUdfs.so'), symbol))
   assert "Could not find symbol '{0}'".format(symbol) in str(ex), str(ex)
   # IMPALA-8196: IR UDFs ignore whether symbol is hidden or not. Exercise the current
   # behaviour, where the UDF can be created and executed.
   result = self.execute_query_expect_success(self.client, """
       create function `{0}`.unexported() returns BIGINT LOCATION '{1}'
       SYMBOL='{2}'""".format(
       unique_database, get_fs_path('/test-warehouse/test-udfs.ll'), symbol))
   result = self.execute_query_expect_success(self.client,
       "select `{0}`.unexported()".format(unique_database))
   assert result.data[0][0] == '5'
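As a side note, symbol visibility in the shared object can also be inspected outside Impala with binutils' nm; a rough sketch (the helper is ours, assuming nm is on PATH and the .so is available locally):

import subprocess

def symbol_is_exported(so_path, symbol):
  # Sketch: `nm -D --defined-only` lists the dynamic symbols the shared object
  # exports; a hidden symbol like the one above should not appear in this listing.
  out = subprocess.check_output(["nm", "-D", "--defined-only", so_path])
  names = [line.split()[-1] for line in out.decode().splitlines() if line.strip()]
  return symbol in names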
 def test_confirm_individual_refresh(self, vector, unique_database):
   """
    Data added directly to HDFS is only visible in the partition that was refreshed.
   """
   table_name = unique_database + '.' + "partition_test_table"
   table_location = get_fs_path("/test-warehouse/%s" % unique_database)
   file_name = "alltypes.parq"
   src_file = get_fs_path("/test-warehouse/alltypesagg_parquet/year=2010/month=1/"
     "day=9/*.parq")
   file_num_rows = 1000
   self.client.execute("""
     create table %s like functional.alltypes stored as parquet
     location '%s'
   """ % (table_name, table_location))
   for month in [1, 2]:
       self.client.execute("alter table %s add partition (year=2010, month=%s)" %
       (table_name, month))
   self.client.execute("refresh %s" % table_name)
   # Check that there is no data in table
   result = self.client.execute("select count(*) from %s" % table_name)
   assert result.data == [str(0)]
   dst_path = table_location + "/year=2010/month=%s/" + file_name
   for month in [1, 2]:
       check_call(["hadoop", "fs", "-cp", "-f", src_file, dst_path % month],
                  shell=False)
   # Check that data added is not visible before refresh
   result = self.client.execute("select count(*) from %s" % table_name)
   assert result.data == [str(0)]
   # Check that data is visible after refresh on the first partition only
   self.client.execute("refresh %s partition (year=2010, month=1)" %
       table_name)
   result = self.client.execute("select count(*) from %s" % table_name)
   assert result.data == [str(file_num_rows)]
   # Check that the data is not yet visible for the second partition
   # that was not refreshed
   result = self.client.execute(
       "select count(*) from %s where year=2010 and month=2" % table_name)
   assert result.data == [str(0)]
   # Check that data is visible for the second partition after refresh
   self.client.execute("refresh %s partition (year=2010, month=2)" % table_name)
   result = self.client.execute("select count(*) from %s" % table_name)
   assert result.data == [str(file_num_rows*2)]
  def test_refresh_native(self):
    ''' This test checks that a native function is visible in Impala after a
    REFRESH FUNCTIONS command. We will add the native function through Hive
    by setting DBPROPERTIES of a database.'''
    # First we create the function in Impala.
    create_func_impala = ("create function {database}.identity_tmp(bigint) "
        "returns bigint location '{location}' symbol='Identity'")
    self.client.execute(create_func_impala.format(
        database=self.HIVE_IMPALA_INTEGRATION_DB,
        location=get_fs_path('/test-warehouse/libTestUdfs.so')))

    # Impala puts the native function into a database property table. We extract the key
    # value pair that represents the function from the table.
    describe_db_hive = "DESCRIBE DATABASE EXTENDED {database}".format(
        database=self.HIVE_IMPALA_INTEGRATION_DB)
    result = self.run_stmt_in_hive(describe_db_hive)
    regex = r"{(.*?)=(.*?)}"
    match = re.search(regex, result)
    func_name = match.group(1)
    func_contents = match.group(2)

    # Recreate the database, this deletes the function.
    self.client.execute("DROP DATABASE {database} CASCADE".format(
        database=self.HIVE_IMPALA_INTEGRATION_DB))
    self.client.execute("CREATE DATABASE {database}".format(
        database=self.HIVE_IMPALA_INTEGRATION_DB))
    result = self.client.execute("SHOW FUNCTIONS IN {database}".format(
        database=self.HIVE_IMPALA_INTEGRATION_DB))
    assert result is not None and len(result.data) == 0

    # Place the function into the recreated database by modifying its properties.
    alter_db_hive = "ALTER DATABASE {database} SET DBPROPERTIES ('{fn_name}'='{fn_val}')"
    self.run_stmt_in_hive(alter_db_hive.format(
        database=self.HIVE_IMPALA_INTEGRATION_DB,
        fn_name=func_name,
        fn_val=func_contents))
    result = self.client.execute("SHOW FUNCTIONS IN {database}".format(
        database=self.HIVE_IMPALA_INTEGRATION_DB))
    assert result is not None and len(result.data) == 0

    # The function should be visible in Impala after a REFRESH FUNCTIONS.
    self.client.execute("REFRESH FUNCTIONS {database}".format(
        database=self.HIVE_IMPALA_INTEGRATION_DB))
    result = self.client.execute("SHOW FUNCTIONS IN {database}".format(
        database=self.HIVE_IMPALA_INTEGRATION_DB))
    assert result is not None and len(result.data) > 0 and\
        "identity_tmp" in str(result.data)

    # Verify that the function returns a correct result.
    result = self.client.execute("SELECT {database}.identity_tmp(10)".format(
        database=self.HIVE_IMPALA_INTEGRATION_DB))
    assert result.data[0] == "10"
    # Make sure we deleted all the temporary jars we copied to the local fs
    assert len(glob.glob(self.LOCAL_LIBRARY_DIR + "/*.jar")) == 0
 def _create_test_table(self, tablename, filename, columns):
   """Returns a unique tablename based on the input 'tablename'. This allows multiple
   instances of the same test to be run in parallel (e.g. during an exhaustive run)."""
   tablename = "%s_%s" % (tablename, random.randint(0, 10**5))
   location = get_fs_path("/test-warehouse/%s_%s" % (self.DATABASE, tablename))
   self.client.execute("create table %s.%s (%s) stored as parquet location '%s'" %
                         (self.DATABASE, tablename, columns, location))
   local_path = self.TESTFILE_DIR + "/" + filename
   check_call(["hadoop", "fs", "-put", local_path, location], shell=False)
   self.client.execute("invalidate metadata %s.%s" % (self.DATABASE, tablename))
   return tablename
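The random suffix is what lets parallel test instances avoid table-name collisions; a trivial standalone restatement of that naming scheme (function name is ours):

import random

def unique_table_name(base):
  # Sketch of the scheme above: append a random integer so two concurrent runs of
  # the same test create differently named tables.
  return "%s_%s" % (base, random.randint(0, 10**5))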