def test_add_delete_data_to_hdfs_and_refresh(self, vector, unique_database):
  """
  Data added/deleted directly in HDFS is visible in Impala after refresh of partition.
  """
  table_name = unique_database + '.' + "partition_test_table"
  table_location = get_fs_path("/test-warehouse/%s" % unique_database)
  file_name = "alltypes.parq"
  src_file = get_fs_path("/test-warehouse/alltypesagg_parquet/year=2010/month=1/"
      "day=9/*.parq")
  file_num_rows = 1000
  self.client.execute("""
    create table %s like functional.alltypes stored as parquet
    location '%s'
  """ % (table_name, table_location))
  self.client.execute("alter table %s add partition (year=2010, month=1)" % table_name)
  self.client.execute("refresh %s" % table_name)
  # Check that there is no data in table
  result = self.client.execute("select count(*) from %s" % table_name)
  assert result.data == [str(0)]
  dst_path = "%s/year=2010/month=1/%s" % (table_location, file_name)
  check_call(["hadoop", "fs", "-cp", "-f", src_file, dst_path], shell=False)
  # Check that data added is not visible before refresh
  result = self.client.execute("select count(*) from %s" % table_name)
  assert result.data == [str(0)]
  # Check that data is visible after refresh
  self.client.execute("refresh %s partition (year=2010, month=1)" % table_name)
  result = self.client.execute("select count(*) from %s" % table_name)
  assert result.data == [str(file_num_rows)]
  # Check that after deleting the file and refreshing, it returns zero rows
  check_call(["hadoop", "fs", "-rm", dst_path], shell=False)
  self.client.execute("refresh %s partition (year=2010, month=1)" % table_name)
  result = self.client.execute("select count(*) from %s" % table_name)
  assert result.data == [str(0)]
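# The copy-then-refresh sequence above is repeated in this test and again in the
# per-partition variant (test_confirm_individual_refresh) later in this section. A
# minimal sketch of a helper that factors it out is shown below; the helper name and
# signature are hypothetical and not part of the original test suite.
def _copy_file_and_refresh_partition(self, src_file, dst_path, table_name,
                                     partition_spec):
  """Hypothetical helper: copies 'src_file' to 'dst_path' on HDFS and refreshes the
  given partition of 'table_name'. Assumes the same check_call/self.client environment
  as the tests above."""
  check_call(["hadoop", "fs", "-cp", "-f", src_file, dst_path], shell=False)
  self.client.execute("refresh %s partition (%s)" % (table_name, partition_spec))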
def setup_method(self, method): self.cleanup_db('impala_test_desc_db1') self.cleanup_db('impala_test_desc_db2') self.cleanup_db('impala_test_desc_db3') self.cleanup_db('impala_test_desc_db4') self.cleanup_db('hive_test_desc_db') self.cleanup_db('hive_test_db') self.client.execute("create database if not exists impala_test_desc_db1") self.client.execute( "create database if not exists impala_test_desc_db2 " "comment \"test comment\"") self.client.execute( "create database if not exists impala_test_desc_db3 " "location \"" + get_fs_path("/testdb") + "\"") self.client.execute( "create database if not exists impala_test_desc_db4 " "comment \"test comment\" location \"" + get_fs_path("/test2.db") + "\"") self.client.execute( "create table if not exists impala_test_desc_db1.complex_types_tbl (" "map_array_struct_col map<string, array<struct<f1:int, f2:string>>>, " "struct_array_struct_col " "struct<f1:int, f2:array<struct<f11:bigint, f12:string>>>, " "map_array_map_struct_col " "map<string, array<map<string, struct<f1:string, f2:int>>>>)")
def test_hive_udfs_missing_jar(self, vector): """ IMPALA-2365: Impalad shouldn't crash if the udf jar isn't present on HDFS""" # Copy hive-exec.jar to a temporary file jar_path = get_fs_path("/test-warehouse/" + get_random_id(5) + ".jar") hive_jar = get_fs_path("/test-warehouse/hive-exec.jar") check_call(["hadoop", "fs", "-cp", hive_jar, jar_path]) drop_fn_stmt = "drop function if exists default.pi_missing_jar()" create_fn_stmt = "create function default.pi_missing_jar() returns double \ location '%s' symbol='org.apache.hadoop.hive.ql.udf.UDFPI'" % jar_path cluster = ImpalaCluster() impalad = cluster.get_any_impalad() client = impalad.service.create_beeswax_client() # Create and drop functions with sync_ddl to make sure they are reflected # in every impalad. exec_option = vector.get_value('exec_option') exec_option['sync_ddl'] = 1 self.execute_query_expect_success(client, drop_fn_stmt, exec_option) self.execute_query_expect_success(client, create_fn_stmt, exec_option) # Delete the udf jar check_call(["hadoop", "fs", "-rm", jar_path]) different_impalad = cluster.get_different_impalad(impalad) client = different_impalad.service.create_beeswax_client() # Run a query using the udf from an impalad other than the one # we used to create the function. This is to bypass loading from # the cache try: self.execute_query_using_client(client, "select default.pi_missing_jar()", vector) assert False, "Query expected to fail" except ImpalaBeeswaxException, e: assert "Failed to get file info" in str(e)
def test_java_udfs(self, vector): self.client.execute("create database if not exists java_udfs_test " "location '%s'" % get_fs_path('/test-warehouse/java_udf_test.db')) self.client.execute("create database if not exists udf_test " "location '%s'" % get_fs_path('/test-warehouse/udf_test.db')) try: self.run_test_case('QueryTest/load-java-udfs', vector) self.run_test_case('QueryTest/java-udf', vector) finally: self.client.execute("drop database if exists java_udfs_test cascade") self.client.execute("drop database if exists udf_test cascade")
def test_native_functions(self, vector): database = 'native_function_test' self.__load_functions( self.create_udfs_template, vector, database, get_fs_path('/test-warehouse/libTestUdfs.so')) self.__load_functions( self.create_udas_template, vector, database, get_fs_path('/test-warehouse/libudasample.so')) self.run_test_case('QueryTest/udf', vector, use_db=database) if not IS_S3: # S3 doesn't support INSERT self.run_test_case('QueryTest/udf-init-close', vector, use_db=database) self.run_test_case('QueryTest/uda', vector, use_db=database)
def test_def_level_encoding(self, vector, unique_database, tmpdir): """IMPALA-3376: Tests that parquet files are written to HDFS correctly by generating a parquet table and running the parquet-reader tool on it, which performs sanity checking, such as that the correct number of definition levels were encoded. """ table_name = "test_hdfs_parquet_table_writer" qualified_table_name = "%s.%s" % (unique_database, table_name) self.execute_query( "create table %s stored as parquet as select l_linenumber from " "tpch_parquet.lineitem limit 180000" % qualified_table_name) hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/*.parq' % (unique_database, table_name)) check_call(['hdfs', 'dfs', '-copyToLocal', hdfs_file, tmpdir.strpath]) for root, subdirs, files in os.walk(tmpdir.strpath): for f in files: if not f.endswith('parq'): continue check_call([ os.path.join(IMPALA_HOME, "bin/run-binary.sh"), os.path.join(impalad_basedir, 'util/parquet-reader'), '--file', os.path.join(tmpdir.strpath, str(f)) ])
def test_set_column_orders(self, vector, unique_database, tmpdir): """Tests that the Parquet writers set FileMetaData::column_orders.""" source_table = "functional_parquet.alltypessmall" target_table = "test_set_column_orders" qualified_target_table = "{0}.{1}".format(unique_database, target_table) hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format( unique_database, target_table)) # Create table query = "create table {0} like {1} stored as parquet".format( qualified_target_table, source_table) self.execute_query(query) # Insert data query = ( "insert into {0} partition(year, month) select * from {1}").format( qualified_target_table, source_table) self.execute_query(query) # Download hdfs files and verify column orders file_metadata_list = get_parquet_metadata_from_hdfs_folder( hdfs_path, tmpdir.strpath) expected_col_orders = [ColumnOrder(TYPE_ORDER=TypeDefinedOrder())] * 11 for file_metadata in file_metadata_list: assert file_metadata.column_orders == expected_col_orders
def test_write_statistics_multiple_row_groups(self, vector, unique_database, tmpdir): """Test that writing multiple row groups works as expected. This is done by inserting into a table using the SORT BY clause and then making sure that the min and max values of row groups don't overlap.""" source_table = "tpch_parquet.orders" target_table = "test_hdfs_parquet_table_writer" qualified_target_table = "{0}.{1}".format(unique_database, target_table) hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format( unique_database, target_table)) # Insert a large amount of data on a single backend with a limited parquet file size. # This will result in several files being written, exercising code that tracks # statistics for row groups. query = "create table {0} sort by (o_orderkey) like {1} stored as parquet".format( qualified_target_table, source_table) self.execute_query(query, vector.get_value('exec_option')) query = ("insert into {0} select * from {1}").format( qualified_target_table, source_table) vector.get_value('exec_option')['num_nodes'] = 1 vector.get_value('exec_option')['parquet_file_size'] = 8 * 1024 * 1024 self.execute_query(query, vector.get_value('exec_option')) # Get all stats for the o_orderkey column row_group_stats = self._get_row_group_stats_from_hdfs_folder(hdfs_path, tmpdir.strpath) assert len(row_group_stats) > 1 orderkey_stats = [s[0] for s in row_group_stats] # Make sure that they don't overlap by ordering by the min value, then looking at # boundaries. orderkey_stats.sort(key = lambda s: s.min) for l, r in zip(orderkey_stats, orderkey_stats[1:]): assert l.max <= r.min
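# The test above depends on a helper that returns decoded per-row-group min/max values.
# Below is a minimal sketch of how such statistics could be read for a BIGINT column
# like o_orderkey, assuming the get_parquet_metadata_from_hdfs_folder helper used by
# other tests in this section and plain little-endian INT64 encoding of the Parquet
# min_value/max_value statistics. Unlike the real helper, it only returns stats for the
# first column; names and structure are illustrative.
import struct
from collections import namedtuple

ColumnStats = namedtuple('ColumnStats', ['min', 'max'])

def _decode_bigint_stats_from_folder(hdfs_path, tmp_dir):
  """Returns one ColumnStats per row group for the first column of each file."""
  stats = []
  for file_metadata in get_parquet_metadata_from_hdfs_folder(hdfs_path, tmp_dir):
    for row_group in file_metadata.row_groups:
      col_stats = row_group.columns[0].meta_data.statistics
      stats.append(ColumnStats(
          min=struct.unpack('<q', col_stats.min_value)[0],
          max=struct.unpack('<q', col_stats.max_value)[0]))
  return stats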
def test_insert_parquet_verify_size(self, vector, unique_database):
  # Test to verify that the result file size is close to what we expect.
  tbl_name = "parquet_insert_size"
  fq_tbl_name = unique_database + "." + tbl_name
  location = get_fs_path("test-warehouse/{0}.db/{1}/".format(unique_database, tbl_name))
  create = ("create table {0} like tpch_parquet.orders stored as parquet"
            .format(fq_tbl_name))
  query = "insert overwrite {0} select * from tpch.orders".format(fq_tbl_name)
  block_size = 40 * 1024 * 1024

  self.execute_query(create)
  vector.get_value('exec_option')['PARQUET_FILE_SIZE'] = block_size
  vector.get_value('exec_option')['COMPRESSION_CODEC'] = \
      vector.get_value('compression_codec')
  vector.get_value('exec_option')['num_nodes'] = 1
  self.execute_query(query, vector.get_value('exec_option'))

  # Get the files in hdfs and verify. There can be at most 1 file that is smaller
  # than the block_size. The rest should be within 80% of it and not over.
  found_small_file = False
  sizes = self.filesystem_client.get_all_file_sizes(location)
  for size in sizes:
    assert size < block_size, "File size greater than expected.\
        Expected: {0}, Got: {1}".format(block_size, size)
    if size < block_size * 0.80:
      assert not found_small_file
      found_small_file = True
def test_deprecated_stats(self, vector, unique_database): """Test that reading parquet files with statistics with deprecated 'min'/'max' fields works correctly. The statistics will be used for known-good types (boolean, integral, float) and will be ignored for all other types (string, decimal, timestamp).""" table_name = 'deprecated_stats' # We use CTAS instead of "create table like" to convert the partition columns into # normal table columns. self.client.execute( 'create table %s.%s stored as parquet as select * from ' 'functional.alltypessmall limit 0' % (unique_database, table_name)) table_location = get_fs_path('/test-warehouse/%s.db/%s' % (unique_database, table_name)) local_file = os.path.join( os.environ['IMPALA_HOME'], 'testdata/data/deprecated_statistics.parquet') assert os.path.isfile(local_file) check_call( ['hdfs', 'dfs', '-copyFromLocal', local_file, table_location]) self.client.execute('invalidate metadata %s.%s' % (unique_database, table_name)) # The test makes assumptions about the number of row groups that are processed and # skipped inside a fragment, so we ensure that the tests run in a single fragment. vector.get_value('exec_option')['num_nodes'] = 1 self.run_test_case('QueryTest/parquet-deprecated-stats', vector, unique_database)
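# For context on "deprecated" statistics: the Parquet Statistics thrift struct carries
# both the old 'min'/'max' fields and the newer 'min_value'/'max_value' fields. A short
# sketch of inspecting which fields a file populates, using the get_parquet_metadata
# helper referenced by other tests in this section; the local file path is illustrative.
metadata = get_parquet_metadata('/tmp/deprecated_statistics.parquet')
for row_group in metadata.row_groups:
  for column in row_group.columns:
    stats = column.meta_data.statistics
    if stats is None:
      continue
    # Older writers populate 'min'/'max'; newer writers populate 'min_value'/'max_value'.
    print("deprecated min/max set: %s, new min_value/max_value set: %s" %
          (stats.min is not None, stats.min_value is not None))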
def test_def_level_encoding(self, vector, unique_database): """IMPALA-3376: Tests that parquet files are written to HDFS correctly by generating a parquet table and running the parquet-reader tool on it, which performs sanity checking, such as that the correct number of definition levels were encoded. """ table_name = "test_hdfs_parquet_table_writer" qualified_table_name = "%s.%s" % (unique_database, table_name) self.execute_query("drop table if exists %s" % qualified_table_name) self.execute_query("create table %s stored as parquet as select l_linenumber from " "tpch_parquet.lineitem limit 180000" % qualified_table_name) tmp_dir = make_tmp_dir() try: hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/*.parq' % (unique_database, table_name)) check_call(['hdfs', 'dfs', '-copyToLocal', hdfs_file, tmp_dir]) for root, subdirs, files in os.walk(tmp_dir): for f in files: if not f.endswith('parq'): continue check_call([os.path.join(impalad_basedir, 'util/parquet-reader'), '--file', os.path.join(tmp_dir, str(f))]) finally: self.execute_query("drop table %s" % qualified_table_name) rmtree(tmp_dir)
def _ctas_table_and_verify_stats(self, vector, unique_database, source_table, expected_values, hive_skip_col_idx = None): """Copies 'source_table' into a parquet table and makes sure that the row group statistics in the resulting parquet file match those in 'expected_values'. The comparison is performed against both Hive and Impala. For Hive, columns indexed by 'hive_skip_col_idx' are excluded from the verification of the expected values. """ table_name = "test_hdfs_parquet_table_writer" qualified_table_name = "{0}.{1}".format(unique_database, table_name) hdfs_path = get_fs_path('/test-warehouse/{0}.db/{1}/'.format(unique_database, table_name)) # Validate against Hive. self.execute_query("drop table if exists {0}".format(qualified_table_name)) self.run_stmt_in_hive("create table {0} stored as parquet as select * from " "{1}".format(qualified_table_name, source_table)) self.execute_query("invalidate metadata {0}".format(qualified_table_name)) self._validate_min_max_stats(hdfs_path, expected_values, hive_skip_col_idx) # Validate against Impala. Setting exec_single_node_rows_threshold and adding a limit # clause ensures that the query is executed on the coordinator, resulting in a single # parquet file being written. num_rows = self.execute_scalar("select count(*) from {0}".format(source_table)) self.execute_query("drop table {0}".format(qualified_table_name)) query = ("create table {0} stored as parquet as select * from {1} limit " "{2}").format(qualified_table_name, source_table, num_rows) vector.get_value('exec_option')['EXEC_SINGLE_NODE_ROWS_THRESHOLD'] = num_rows self.execute_query(query, vector.get_value('exec_option')) self._validate_min_max_stats(hdfs_path, expected_values)
def test_insert_alter_partition_location(self):
  """Test that inserts after changing the location of a partition work correctly,
  including the creation of a non-existent partition dir"""
  PART_DIR = "tmp/test_insert_alter_partition_location"
  QUALIFIED_PART_DIR = get_fs_path('/' + PART_DIR)
  TBL_NAME = "functional.insert_alter_partition_location"

  self.execute_query_expect_success(self.client,
      "DROP TABLE IF EXISTS %s" % TBL_NAME)
  self.hdfs_client.delete_file_dir(PART_DIR, recursive=True)

  self.execute_query_expect_success(
      self.client, "CREATE TABLE %s (c int) PARTITIONED BY (p int)" % TBL_NAME)
  self.execute_query_expect_success(
      self.client, "ALTER TABLE %s ADD PARTITION(p=1)" % TBL_NAME)
  self.execute_query_expect_success(
      self.client, "ALTER TABLE %s PARTITION(p=1) SET LOCATION '%s'" %
      (TBL_NAME, QUALIFIED_PART_DIR))
  self.execute_query_expect_success(
      self.client, "INSERT OVERWRITE %s PARTITION(p=1) VALUES(1)" % TBL_NAME)

  result = self.execute_query_expect_success(
      self.client, "SELECT COUNT(*) FROM %s" % TBL_NAME)
  assert int(result.get_data()) == 1

  # Should have created the partition dir, which should contain exactly one file (not
  # in a subdirectory)
  ls = self.hdfs_client.list_dir(PART_DIR)
  assert len(ls['FileStatuses']['FileStatus']) == 1
def test_insert_parquet_verify_size(self, vector):
  # Test to verify that the result file size is close to what we expect.
  TBL = "parquet_insert_size"
  DROP = "drop table if exists {0}".format(TBL)
  CREATE = ("create table parquet_insert_size like tpch_parquet.orders"
            " stored as parquet location '{0}/{1}'".format(WAREHOUSE, TBL))
  QUERY = "insert overwrite {0} select * from tpch.orders".format(TBL)
  DIR = get_fs_path("test-warehouse/{0}/".format(TBL))
  BLOCK_SIZE = 40 * 1024 * 1024

  self.execute_query(DROP)
  self.execute_query(CREATE)
  vector.get_value('exec_option')['PARQUET_FILE_SIZE'] = BLOCK_SIZE
  vector.get_value('exec_option')['COMPRESSION_CODEC'] = \
      vector.get_value('compression_codec')
  vector.get_value('exec_option')['num_nodes'] = 1
  self.execute_query(QUERY, vector.get_value('exec_option'))

  # Get the files in hdfs and verify. There can be at most 1 file that is smaller
  # than the BLOCK_SIZE. The rest should be within 80% of it and not over.
  found_small_file = False
  ls = self.hdfs_client.list_dir(DIR)
  for f in ls['FileStatuses']['FileStatus']:
    if f['type'] != 'FILE':
      continue
    length = f['length']
    print length
    assert length < BLOCK_SIZE
    if length < BLOCK_SIZE * 0.80:
      assert not found_small_file
      found_small_file = True
def test_strings_utf8(self, vector, unique_database): # Create table table_name = "ice_str_utf8" qualified_table_name = "%s.%s" % (unique_database, table_name) query = 'create table %s (a string) stored as iceberg' % qualified_table_name self.client.execute(query) # Inserted string data should have UTF8 annotation regardless of query options. query = 'insert into %s values ("impala")' % qualified_table_name self.execute_query(query, {'parquet_annotate_strings_utf8': False}) # Copy the created file to the local filesystem and parse metadata local_file = '/tmp/iceberg_utf8_test_%s.parq' % random.randint( 0, 10000) LOG.info("test_strings_utf8 local file name: " + local_file) hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/data/*.parq' % (unique_database, table_name)) check_call(['hadoop', 'fs', '-copyToLocal', hdfs_file, local_file]) metadata = get_parquet_metadata(local_file) # Extract SchemaElements corresponding to the table column a_schema_element = metadata.schema[1] assert a_schema_element.name == 'a' # Check that the schema uses the UTF8 annotation assert a_schema_element.converted_type == ConvertedType.UTF8 os.remove(local_file)
def test_permanent_udfs(self): # Make sure the pre-calculated count tallies with the number of # functions shown using "show [aggregate] functions" statement self.verify_function_count( "SHOW FUNCTIONS in {0}".format(self.DATABASE), self.udf_count) self.verify_function_count( "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), self.uda_count) # invalidate metadata and make sure the count tallies result = self.client.execute("INVALIDATE METADATA") self.verify_function_count( "SHOW FUNCTIONS in {0}".format(self.DATABASE), self.udf_count) self.verify_function_count( "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), self.uda_count) # Restart the cluster, this triggers a full metadata reload self.__restart_cluster() # Make sure the counts of udfs and udas match post restart self.verify_function_count( "SHOW FUNCTIONS in {0}".format(self.DATABASE), self.udf_count) self.verify_function_count( "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), self.uda_count) # Drop sample udas and verify the count matches pre and post restart self.__load_drop_functions( self.DROP_SAMPLE_UDAS_TEMPLATE, self.DATABASE, get_fs_path('/test-warehouse/libudasample.so')) self.verify_function_count( "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), 1) self.__restart_cluster() self.verify_function_count( "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), 1)
def _ctas_table_and_verify_index(self, vector, unique_database, source_table, tmpdir, sorting_column=None): """Copies 'source_table' into a parquet table and makes sure that the index in the resulting parquet file is valid. """ table_name = "test_hdfs_parquet_table_writer" qualified_table_name = "{0}.{1}".format(unique_database, table_name) hdfs_path = get_fs_path('/test-warehouse/{0}.db/{1}/'.format( unique_database, table_name)) # Setting num_nodes = 1 ensures that the query is executed on the coordinator, # resulting in a single parquet file being written. vector.get_value('exec_option')['num_nodes'] = 1 self.execute_query( "drop table if exists {0}".format(qualified_table_name)) if sorting_column is None: query = ("create table {0} stored as parquet as select * from {1}" ).format(qualified_table_name, source_table) else: query = ( "create table {0} sort by({1}) stored as parquet as select * from {2}" ).format(qualified_table_name, sorting_column, source_table) self.execute_query(query, vector.get_value('exec_option')) self._validate_parquet_page_index(hdfs_path, tmpdir.join(source_table))
def _ctas_and_get_metadata(self, vector, unique_database, tmp_dir, source_table, table_name="test_hdfs_parquet_table_writer"): """CTAS 'source_table' into a Parquet table and returns its Parquet metadata.""" qualified_table_name = "{0}.{1}".format(unique_database, table_name) hdfs_path = get_fs_path('/test-warehouse/{0}.db/{1}/'.format( unique_database, table_name)) # Setting num_nodes = 1 ensures that the query is executed on the coordinator, # resulting in a single parquet file being written. query = ( "create table {0} stored as parquet as select * from {1}").format( qualified_table_name, source_table) vector.get_value('exec_option')['num_nodes'] = 1 self.execute_query_expect_success(self.client, query, vector.get_value('exec_option')) file_metadata_list = get_parquet_metadata_from_hdfs_folder( hdfs_path, tmp_dir) assert len(file_metadata_list) == 1 assert file_metadata_list[0] is not None return file_metadata_list[0]
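# A hypothetical call site for the helper above, to show how it is meant to be used;
# the source table and the num_rows assertion are illustrative and not taken from the
# original tests.
file_metadata = self._ctas_and_get_metadata(
    vector, unique_database, tmpdir.strpath, "functional.alltypessmall")
# FileMetaData.num_rows should equal the row count of the source table.
assert file_metadata.num_rows == 100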
def test_sorting_columns(self, vector, unique_database, tmpdir): """Tests that RowGroup::sorting_columns gets populated when specifying a sortby() insert hint.""" source_table = "functional_parquet.alltypessmall" target_table = "test_write_sorting_columns" qualified_target_table = "{0}.{1}".format(unique_database, target_table) hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(unique_database, target_table)) # Create table # TODO: Simplify once IMPALA-4167 (insert hints in CTAS) has been fixed. query = "create table {0} like {1} stored as parquet".format(qualified_target_table, source_table) self.execute_query(query) # Insert data query = ("insert into {0} partition(year, month) /* +sortby(int_col, id) */ " "select * from {1}").format(qualified_target_table, source_table) self.execute_query(query) # Download hdfs files and extract rowgroup metadata row_groups = [] check_call(['hdfs', 'dfs', '-get', hdfs_path, tmpdir.strpath]) for root, subdirs, files in os.walk(tmpdir.strpath): for f in files: parquet_file = os.path.join(root, str(f)) file_meta_data = get_parquet_metadata(parquet_file) row_groups.extend(file_meta_data.row_groups) # Verify that the files have the sorted_columns set expected = [SortingColumn(4, False, False), SortingColumn(0, False, False)] for row_group in row_groups: assert row_group.sorting_columns == expected
def _ctas_table_and_verify_stats( self, vector, unique_database, tmp_dir, source_table, expected_values, table_name="test_hdfs_parquet_table_writer"): """Copies 'source_table' into a parquet table and makes sure that the row group statistics in the resulting parquet file match those in 'expected_values'. 'tmp_dir' needs to be supplied by the caller and will be used to store temporary files. The caller is responsible for cleaning up 'tmp_dir'. """ qualified_table_name = "{0}.{1}".format(unique_database, table_name) hdfs_path = get_fs_path('/test-warehouse/{0}.db/{1}/'.format( unique_database, table_name)) # Setting num_nodes = 1 ensures that the query is executed on the coordinator, # resulting in a single parquet file being written. self.execute_query( "drop table if exists {0}".format(qualified_table_name)) query = ( "create table {0} stored as parquet as select * from {1}").format( qualified_table_name, source_table) vector.get_value('exec_option')['num_nodes'] = 1 self.execute_query(query, vector.get_value('exec_option')) self._validate_parquet_stats(hdfs_path, tmp_dir, expected_values)
def test_libs_with_same_filenames(self, vector): self.client.execute("create database if not exists same_lib_filename_udf_test " "location '%s'" % get_fs_path('/test-warehouse/same_lib_filename_udf_test.db')) try: self.run_test_case('QueryTest/libs_with_same_filenames', vector) finally: self.client.execute("drop database if exists same_lib_filename_udf_test cascade")
def test_udf_constant_folding(self, vector, unique_database): """Test that constant folding of UDFs is handled correctly. Uses count_rows(), which returns a unique value every time it is evaluated in the same thread.""" exec_options = copy(vector.get_value('exec_option')) # Execute on a single node so that all counter values will be unique. exec_options["num_nodes"] = 1 create_fn_query = """create function {database}.count_rows() returns bigint location '{location}' symbol='Count' prepare_fn='CountPrepare' close_fn='CountClose'""" self._load_functions(create_fn_query, vector, unique_database, get_fs_path('/test-warehouse/libTestUdfs.so')) # Only one distinct value if the expression is constant folded, otherwise one # value per row in alltypes expected_ndv = 1 if exec_options['enable_expr_rewrites'] else 7300 # Test fully constant expression, evaluated in FE. query = "select `{0}`.count_rows() from functional.alltypes".format( unique_database) result = self.execute_query_expect_success(self.client, query, exec_options) actual_ndv = len(set(result.data)) assert actual_ndv == expected_ndv # Test constant argument to a non-constant expr. The argument value can be # cached in the backend. query = """select concat(cast(`{0}`.count_rows() as string), '-', string_col) from functional.alltypes""".format(unique_database) result = self.execute_query_expect_success(self.client, query, exec_options) actual_ndv = len(set(value.split("-")[0] for value in result.data)) assert actual_ndv == expected_ndv
def test_set_column_orders(self, vector, unique_database, tmpdir): """Tests that the Parquet writers set FileMetaData::column_orders.""" source_table = "functional_parquet.alltypessmall" target_table = "test_set_column_orders" qualified_target_table = "{0}.{1}".format(unique_database, target_table) hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format( unique_database, target_table)) # Create table query = "create table {0} like {1} stored as parquet".format( qualified_target_table, source_table) self.execute_query(query) # Insert data query = ( "insert into {0} partition(year, month) select * from {1}").format( qualified_target_table, source_table) self.execute_query(query) # Download hdfs files and verify column orders check_call(['hdfs', 'dfs', '-get', hdfs_path, tmpdir.strpath]) expected_col_orders = [ColumnOrder(TYPE_ORDER=TypeDefinedOrder())] * 11 for root, subdirs, files in os.walk(tmpdir.strpath): for f in files: parquet_file = os.path.join(root, str(f)) file_meta_data = get_parquet_metadata(parquet_file) assert file_meta_data.column_orders == expected_col_orders
def test_udf_invalid_symbol(self, vector, unique_database): """ IMPALA-1642: Impala crashes if the symbol for a Hive UDF doesn't exist Crashing is non-deterministic so we run the UDF several times.""" src_udf_path = os.path.join(os.environ['IMPALA_HOME'], 'testdata/udfs/impala-hive-udfs.jar') tgt_udf_path = get_fs_path( '/test-warehouse/{0}.db/impala-hive-udfs.jar'.format( unique_database)) drop_fn_stmt = ( "drop function if exists `{0}`.fn_invalid_symbol(STRING)".format( unique_database)) create_fn_stmt = ( "create function `{0}`.fn_invalid_symbol(STRING) returns " "STRING LOCATION '{1}' SYMBOL='not.a.Symbol'".format( unique_database, tgt_udf_path)) query = "select `{0}`.fn_invalid_symbol('test')".format( unique_database) self.filesystem_client.copy_from_local(src_udf_path, tgt_udf_path) self.client.execute(drop_fn_stmt) self.client.execute(create_fn_stmt) for _ in xrange(5): ex = self.execute_query_expect_failure(self.client, query) assert "Unable to find class" in str(ex) self.client.execute(drop_fn_stmt)
def test_drop_function_while_running(self, vector): self.client.execute( "drop function if exists default.drop_while_running(BIGINT)") self.client.execute("create function default.drop_while_running(BIGINT) returns "\ "BIGINT LOCATION '%s' SYMBOL='Identity'" % get_fs_path('/test-warehouse/libTestUdfs.so')) query = \ "select default.drop_while_running(l_orderkey) from tpch.lineitem limit 10000" # Run this query asynchronously. handle = self.execute_query_async( query, vector.get_value('exec_option'), table_format=vector.get_value('table_format')) # Fetch some rows from the async query to make sure the UDF is being used results = self.client.fetch(query, handle, 1) assert results.success assert len(results.data) == 1 # Drop the function while the original query is running. self.client.execute("drop function default.drop_while_running(BIGINT)") # Fetch the rest of the rows, this should still be able to run the UDF results = self.client.fetch(query, handle, -1) assert results.success assert len(results.data) == 9999
def test_udf_update_via_drop(self, vector, unique_database): """Test updating the UDF binary without restarting Impala. Dropping the function should remove the binary from the local cache.""" # Run with sync_ddl to guarantee the drop is processed by all impalads. exec_options = vector.get_value('exec_option') exec_options['sync_ddl'] = 1 old_udf = os.path.join( os.environ['IMPALA_HOME'], 'testdata/udfs/impala-hive-udfs.jar') new_udf = os.path.join( os.environ['IMPALA_HOME'], 'tests/test-hive-udfs/target/test-hive-udfs-1.0.jar') udf_dst = get_fs_path('/test-warehouse/impala-hive-udfs2.jar') drop_fn_stmt = ( 'drop function if exists `{0}`.`udf_update_test_drop`()'.format(unique_database)) create_fn_stmt = ( "create function `{0}`.`udf_update_test_drop`() returns string LOCATION '{1}' " "SYMBOL='com.cloudera.impala.TestUpdateUdf'".format(unique_database, udf_dst)) query_stmt = "select `{0}`.`udf_update_test_drop`()".format(unique_database) # Put the old UDF binary on HDFS, make the UDF in Impala and run it. check_call(["hadoop", "fs", "-put", "-f", old_udf, udf_dst]) self.execute_query_expect_success(self.client, drop_fn_stmt, exec_options) self.execute_query_expect_success(self.client, create_fn_stmt, exec_options) self.__run_query_all_impalads(exec_options, query_stmt, ["Old UDF"]) # Update the binary, drop and create the function again. The new binary should # be running. check_call(["hadoop", "fs", "-put", "-f", new_udf, udf_dst]) self.execute_query_expect_success(self.client, drop_fn_stmt, exec_options) self.execute_query_expect_success(self.client, create_fn_stmt, exec_options) self.__run_query_all_impalads(exec_options, query_stmt, ["New UDF"])
def test_sorting_columns(self, vector, unique_database, tmpdir): """Tests that RowGroup::sorting_columns gets populated when the table has SORT BY columns.""" source_table = "functional_parquet.alltypessmall" target_table = "test_write_sorting_columns" qualified_target_table = "{0}.{1}".format(unique_database, target_table) hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(unique_database, target_table)) # Create table query = "create table {0} sort by (int_col, id) like {1} stored as parquet".format( qualified_target_table, source_table) self.execute_query(query) # Insert data query = ("insert into {0} partition(year, month) select * from {1}").format( qualified_target_table, source_table) self.execute_query(query) # Download hdfs files and extract rowgroup metadata file_metadata_list = get_parquet_metadata_from_hdfs_folder(hdfs_path, tmpdir.strpath) row_groups = [] for file_metadata in file_metadata_list: row_groups.extend(file_metadata.row_groups) # Verify that the files have the sorted_columns set expected = [SortingColumn(4, False, False), SortingColumn(0, False, False)] for row_group in row_groups: assert row_group.sorting_columns == expected
def test_udf_update_via_drop(self, vector): """Test updating the UDF binary without restarting Impala. Dropping the function should remove the binary from the local cache.""" # Run with sync_ddl to guarantee the drop is processed by all impalads. exec_options = vector.get_value('exec_option') exec_options['sync_ddl'] = 1 old_udf = os.path.join(os.environ['IMPALA_HOME'], 'testdata/udfs/impala-hive-udfs.jar') new_udf = os.path.join( os.environ['IMPALA_HOME'], 'tests/test-hive-udfs/target/test-hive-udfs-1.0.jar') udf_dst = get_fs_path('/test-warehouse/impala-hive-udfs2.jar') drop_fn_stmt = 'drop function if exists default.udf_update_test_drop()' create_fn_stmt = "create function default.udf_update_test_drop() returns string "\ "LOCATION '" + udf_dst + "' SYMBOL='com.cloudera.impala.TestUpdateUdf'" query_stmt = "select default.udf_update_test_drop()" # Put the old UDF binary on HDFS, make the UDF in Impala and run it. call(["hadoop", "fs", "-put", "-f", old_udf, udf_dst]) self.execute_query_expect_success(self.client, drop_fn_stmt, exec_options) self.execute_query_expect_success(self.client, create_fn_stmt, exec_options) self.__run_query_all_impalads(exec_options, query_stmt, ["Old UDF"]) # Update the binary, drop and create the function again. The new binary should # be running. call(["hadoop", "fs", "-put", "-f", new_udf, udf_dst]) self.execute_query_expect_success(self.client, drop_fn_stmt, exec_options) self.execute_query_expect_success(self.client, create_fn_stmt, exec_options) self.__run_query_all_impalads(exec_options, query_stmt, ["New UDF"])
def test_insert_alter_partition_location(self):
  """Test that inserts after changing the location of a partition work correctly,
  including the creation of a non-existent partition dir"""
  PART_DIR = "tmp/test_insert_alter_partition_location"
  QUALIFIED_PART_DIR = get_fs_path('/' + PART_DIR)
  TBL_NAME = "functional.insert_alter_partition_location"

  self.execute_query_expect_success(self.client,
      "DROP TABLE IF EXISTS %s" % TBL_NAME)
  self.hdfs_client.delete_file_dir(PART_DIR, recursive=True)

  self.execute_query_expect_success(self.client,
      "CREATE TABLE %s (c int) PARTITIONED BY (p int)" % TBL_NAME)
  self.execute_query_expect_success(self.client,
      "ALTER TABLE %s ADD PARTITION(p=1)" % TBL_NAME)
  self.execute_query_expect_success(self.client,
      "ALTER TABLE %s PARTITION(p=1) SET LOCATION '%s'" %
      (TBL_NAME, QUALIFIED_PART_DIR))
  self.execute_query_expect_success(self.client,
      "INSERT OVERWRITE %s PARTITION(p=1) VALUES(1)" % TBL_NAME)

  result = self.execute_query_expect_success(self.client,
      "SELECT COUNT(*) FROM %s" % TBL_NAME)
  assert int(result.get_data()) == 1

  # Should have created the partition dir, which should contain exactly one file (not
  # in a subdirectory)
  ls = self.hdfs_client.list_dir(PART_DIR)
  assert len(ls['FileStatuses']['FileStatus']) == 1
def test_set_column_orders(self, vector, unique_database, tmpdir): """Tests that the Parquet writers set FileMetaData::column_orders.""" source_table = "functional_parquet.alltypessmall" target_table = "test_set_column_orders" qualified_target_table = "{0}.{1}".format(unique_database, target_table) hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(unique_database, target_table)) # Create table query = "create table {0} like {1} stored as parquet".format(qualified_target_table, source_table) self.execute_query(query) # Insert data query = ("insert into {0} partition(year, month) select * from {1}").format( qualified_target_table, source_table) self.execute_query(query) # Download hdfs files and verify column orders file_metadata_list = get_parquet_metadata_from_hdfs_folder(hdfs_path, tmpdir.strpath) expected_col_orders = [ColumnOrder(TYPE_ORDER=TypeDefinedOrder())] * 11 for file_metadata in file_metadata_list: assert file_metadata.column_orders == expected_col_orders
def test_corrupt_rle_counts(self, vector, unique_database): """IMPALA-3646: Tests that a certain type of file corruption for plain dictionary encoded values is gracefully handled. Cases tested: - incorrect literal count of 0 for the RLE encoded dictionary indexes - incorrect repeat count of 0 for the RLE encoded dictionary indexes """ # Create test table and copy the corrupt files into it. self.client.execute( "create table %s.bad_rle_counts (c bigint) stored as parquet" % unique_database) bad_rle_counts_tbl_loc =\ get_fs_path("/test-warehouse/%s.db/%s" % (unique_database, "bad_rle_counts")) check_call([ 'hdfs', 'dfs', '-copyFromLocal', os.environ['IMPALA_HOME'] + "/testdata/data/bad_rle_literal_count.parquet", bad_rle_counts_tbl_loc ]) check_call([ 'hdfs', 'dfs', '-copyFromLocal', os.environ['IMPALA_HOME'] + "/testdata/data/bad_rle_repeat_count.parquet", bad_rle_counts_tbl_loc ]) # Querying the corrupted files should not DCHECK or crash. vector.get_value('exec_option')['abort_on_error'] = 0 self.run_test_case('QueryTest/parquet-corrupt-rle-counts', vector, unique_database) vector.get_value('exec_option')['abort_on_error'] = 1 self.run_test_case('QueryTest/parquet-corrupt-rle-counts-abort', vector, unique_database)
def test_insert_parquet_verify_size(self, vector, unique_database):
  # Test to verify that the result file size is close to what we expect.
  tbl_name = "parquet_insert_size"
  fq_tbl_name = unique_database + "." + tbl_name
  location = get_fs_path("test-warehouse/{0}.db/{1}/"
                         .format(unique_database, tbl_name))
  create = ("create table {0} like tpch_parquet.orders stored as parquet"
            .format(fq_tbl_name))
  query = "insert overwrite {0} select * from tpch.orders".format(fq_tbl_name)
  block_size = 40 * 1024 * 1024

  self.execute_query(create)
  vector.get_value('exec_option')['PARQUET_FILE_SIZE'] = block_size
  vector.get_value('exec_option')['COMPRESSION_CODEC'] = \
      vector.get_value('compression_codec')
  vector.get_value('exec_option')['num_nodes'] = 1
  self.execute_query(query, vector.get_value('exec_option'))

  # Get the files in hdfs and verify. There can be at most 1 file that is smaller
  # than the block_size. The rest should be within 80% of it and not over.
  found_small_file = False
  sizes = self.filesystem_client.get_all_file_sizes(location)
  for size in sizes:
    assert size < block_size, "File size greater than expected.\
        Expected: {0}, Got: {1}".format(block_size, size)
    if size < block_size * 0.80:
      assert not found_small_file
      found_small_file = True
def test_drop_function_while_running(self, vector, unique_database): self.client.execute("drop function if exists `{0}`.drop_while_running(BIGINT)" .format(unique_database)) self.client.execute( "create function `{0}`.drop_while_running(BIGINT) returns " "BIGINT LOCATION '{1}' SYMBOL='Identity'".format( unique_database, get_fs_path('/test-warehouse/libTestUdfs.so'))) query = ("select `{0}`.drop_while_running(l_orderkey) from tpch.lineitem limit 10000" .format(unique_database)) # Run this query asynchronously. handle = self.execute_query_async(query, vector.get_value('exec_option'), table_format=vector.get_value('table_format')) # Fetch some rows from the async query to make sure the UDF is being used results = self.client.fetch(query, handle, 1) assert results.success assert len(results.data) == 1 # Drop the function while the original query is running. self.client.execute( "drop function `{0}`.drop_while_running(BIGINT)".format(unique_database)) # Fetch the rest of the rows, this should still be able to run the UDF results = self.client.fetch(query, handle, -1) assert results.success assert len(results.data) == 9999
def test_insert_alter_partition_location(self, unique_database):
  """Test that inserts after changing the location of a partition work correctly,
  including the creation of a non-existent partition dir"""
  part_dir = "tmp/{0}".format(unique_database)
  qualified_part_dir = get_fs_path('/' + part_dir)
  table_name = "`{0}`.`insert_alter_partition_location`".format(unique_database)

  self.execute_query_expect_success(self.client,
      "DROP TABLE IF EXISTS %s" % table_name)
  self.filesystem_client.delete_file_dir(part_dir, recursive=True)

  self.execute_query_expect_success(
      self.client, "CREATE TABLE %s (c int) PARTITIONED BY (p int)" % table_name)
  self.execute_query_expect_success(
      self.client, "ALTER TABLE %s ADD PARTITION(p=1)" % table_name)
  self.execute_query_expect_success(
      self.client, "ALTER TABLE %s PARTITION(p=1) SET LOCATION '%s'" %
      (table_name, qualified_part_dir))
  self.execute_query_expect_success(
      self.client, "INSERT OVERWRITE %s PARTITION(p=1) VALUES(1)" % table_name)

  result = self.execute_query_expect_success(
      self.client, "SELECT COUNT(*) FROM %s" % table_name)
  assert int(result.get_data()) == 1

  # Should have created the partition dir, which should contain exactly one file (not
  # in a subdirectory)
  assert len(self.filesystem_client.ls(part_dir)) == 1
def test_clustered_partition_single_file(self, unique_database): """IMPALA-2523: Tests that clustered insert creates one file per partition, even when inserting over multiple row batches.""" # On s3 this test takes about 220 seconds and we are unlikely to break it, so only run # it in exhaustive strategy. if self.exploration_strategy() != 'exhaustive' and IS_S3: pytest.skip("only runs in exhaustive") table = "{0}.insert_clustered".format(unique_database) table_path = "test-warehouse/{0}.db/insert_clustered".format(unique_database) table_location = get_fs_path("/" + table_path) create_stmt = """create table {0} like functional.alltypes""".format(table) self.execute_query_expect_success(self.client, create_stmt) set_location_stmt = """alter table {0} set location '{1}'""".format( table, table_location) self.execute_query_expect_success(self.client, set_location_stmt) # Setting a lower batch size will result in multiple row batches being written. self.execute_query_expect_success(self.client, "set batch_size=10") insert_stmt = """insert into {0} partition(year, month) /*+ clustered,shuffle */ select * from functional.alltypes""".format(table) self.execute_query_expect_success(self.client, insert_stmt) # We expect exactly one partition per year and month, since subsequent row batches of # a partition will be written into the same file. expected_partitions = \ ["year=%s/month=%s" % (y, m) for y in [2009, 2010] for m in range(1,13)] for partition in expected_partitions: partition_path = "{0}/{1}".format(table_path, partition) files = self.filesystem_client.ls(partition_path) assert len(files) == 1, "%s: %s" % (partition, files)
def test_udf_constant_folding(self, vector, unique_database): """Test that constant folding of UDFs is handled correctly. Uses count_rows(), which returns a unique value every time it is evaluated in the same thread.""" exec_options = copy(vector.get_value('exec_option')) # Execute on a single node so that all counter values will be unique. exec_options["num_nodes"] = 1 create_fn_query = """create function {database}.count_rows() returns bigint location '{location}' symbol='Count' prepare_fn='CountPrepare' close_fn='CountClose'""" self._load_functions(create_fn_query, vector, unique_database, get_fs_path('/test-warehouse/libTestUdfs.so')) # Only one distinct value if the expression is constant folded, otherwise one # value per row in alltypes expected_ndv = 1 if exec_options['enable_expr_rewrites'] else 7300 # Test fully constant expression, evaluated in FE. query = "select `{0}`.count_rows() from functional.alltypes".format(unique_database) result = self.execute_query_expect_success(self.client, query, exec_options) actual_ndv = len(set(result.data)) assert actual_ndv == expected_ndv # Test constant argument to a non-constant expr. The argument value can be # cached in the backend. query = """select concat(cast(`{0}`.count_rows() as string), '-', string_col) from functional.alltypes""".format(unique_database) result = self.execute_query_expect_success(self.client, query, exec_options) actual_ndv = len(set(value.split("-")[0] for value in result.data)) assert actual_ndv == expected_ndv
def test_insert_parquet_verify_size(self, vector):
  # Test to verify that the result file size is close to what we expect.
  TBL = "parquet_insert_size"
  DROP = "drop table if exists {0}".format(TBL)
  CREATE = ("create table parquet_insert_size like tpch_parquet.orders"
            " stored as parquet location '{0}/{1}'".format(WAREHOUSE, TBL))
  QUERY = "insert overwrite {0} select * from tpch.orders".format(TBL)
  DIR = get_fs_path("test-warehouse/{0}/".format(TBL))
  BLOCK_SIZE = 40 * 1024 * 1024

  self.execute_query(DROP)
  self.execute_query(CREATE)
  vector.get_value('exec_option')['PARQUET_FILE_SIZE'] = BLOCK_SIZE
  vector.get_value('exec_option')['COMPRESSION_CODEC'] = \
      vector.get_value('compression_codec')
  vector.get_value('exec_option')['num_nodes'] = 1
  self.execute_query(QUERY, vector.get_value('exec_option'))

  # Get the files in hdfs and verify. There can be at most 1 file that is smaller
  # than the BLOCK_SIZE. The rest should be within 80% of it and not over.
  found_small_file = False
  sizes = self.filesystem_client.get_all_file_sizes(DIR)
  for size in sizes:
    assert size < BLOCK_SIZE, "File size greater than expected.\
        Expected: {0}, Got: {1}".format(BLOCK_SIZE, size)
    if size < BLOCK_SIZE * 0.80:
      assert not found_small_file
      found_small_file = True
def test_permanent_udfs(self):
  # Make sure the pre-calculated count tallies with the number of
  # functions shown using "show [aggregate] functions" statement
  self.verify_function_count(
      "SHOW FUNCTIONS in {0}".format(self.DATABASE), self.udf_count)
  self.verify_function_count(
      "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), self.uda_count)
  # invalidate metadata and make sure the count tallies
  result = self.client.execute("INVALIDATE METADATA")
  self.verify_function_count(
      "SHOW FUNCTIONS in {0}".format(self.DATABASE), self.udf_count)
  self.verify_function_count(
      "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), self.uda_count)
  # Restart the cluster, this triggers a full metadata reload
  self.__restart_cluster()
  # Make sure the counts of udfs and udas match post restart
  self.verify_function_count(
      "SHOW FUNCTIONS in {0}".format(self.DATABASE), self.udf_count)
  self.verify_function_count(
      "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), self.uda_count)
  # Drop sample udas and verify the count matches pre and post restart
  self.__load_drop_functions(
      self.DROP_SAMPLE_UDAS_TEMPLATE, self.DATABASE,
      get_fs_path('/test-warehouse/libudasample.so'))
  self.verify_function_count(
      "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), 1)
  self.__restart_cluster()
  self.verify_function_count(
      "SHOW AGGREGATE FUNCTIONS in {0}".format(self.DATABASE), 1)
def test_insert_parquet_verify_size(self, vector):
  # Test to verify that the result file size is close to what we expect.
  TBL = "parquet_insert_size"
  DROP = "drop table if exists {0}".format(TBL)
  CREATE = ("create table parquet_insert_size like tpch_parquet.orders"
            " stored as parquet location '{0}/{1}'".format(WAREHOUSE, TBL))
  QUERY = "insert overwrite {0} select * from tpch.orders".format(TBL)
  DIR = get_fs_path("test-warehouse/{0}/".format(TBL))
  BLOCK_SIZE = 40 * 1024 * 1024

  self.execute_query(DROP)
  self.execute_query(CREATE)
  vector.get_value('exec_option')['PARQUET_FILE_SIZE'] = BLOCK_SIZE
  vector.get_value('exec_option')['COMPRESSION_CODEC'] = \
      vector.get_value('compression_codec')
  vector.get_value('exec_option')['num_nodes'] = 1
  self.execute_query(QUERY, vector.get_value('exec_option'))

  # Get the files in hdfs and verify. There can be at most 1 file that is smaller
  # than the BLOCK_SIZE. The rest should be within 80% of it and not over.
  found_small_file = False
  ls = self.hdfs_client.list_dir(DIR)
  for f in ls['FileStatuses']['FileStatus']:
    if f['type'] != 'FILE':
      continue
    length = f['length']
    print length
    assert length < BLOCK_SIZE
    if length < BLOCK_SIZE * 0.80:
      assert not found_small_file
      found_small_file = True
def test_drop_then_add_function_while_running(self, vector, unique_database): self.client.execute( "drop function if exists `{0}`.drop_while_running(BIGINT)".format( unique_database)) self.client.execute( "create function `{0}`.drop_while_running(BIGINT) returns " "BIGINT LOCATION '{1}' SYMBOL='Identity'".format( unique_database, get_fs_path('/test-warehouse/libTestUdfs.so'))) query = ( "select `{0}`.drop_while_running(l_orderkey) from tpch.lineitem limit 10000" .format(unique_database)) # Run this query asynchronously. handle = self.execute_query_async( query, vector.get_value('exec_option'), table_format=vector.get_value('table_format')) # Fetch some rows from the async query to make sure the UDF is being used results = self.client.fetch(query, handle, 1) assert results.success assert len(results.data) == 1 # Drop the function while the original query is running. self.client.execute( "drop function `{0}`.drop_while_running(BIGINT)".format( unique_database)) # Fetch some rows from the async query to make sure the UDF is being used results = self.client.fetch(query, handle, 1) assert results.success assert len(results.data) == 1 # Re-create function associated with the same binary while the original query is # running self.client.execute( "create function `{0}`.drop_while_running(BIGINT) returns " "BIGINT LOCATION '{1}' SYMBOL='Identity'".format( unique_database, get_fs_path('/test-warehouse/libTestUdfs.so'))) # Fetch the rest of the rows, this should still be able to run the UDF results = self.client.fetch(query, handle, -1) assert results.success assert len(results.data) == 9998
def _create_test_table(self, dbname, tablename, filename, columns): """Creates a table in the given database with the given name and columns. Copies the file with the given name from TESTFILE_DIR into the table.""" location = get_fs_path("/test-warehouse/%s.db/%s" % (dbname, tablename)) self.client.execute("create table %s.%s (%s) stored as parquet location '%s'" % (dbname, tablename, columns, location)) local_path = self.TESTFILE_DIR + "/" + filename self.filesystem_client.copy_from_local(local_path, location)
def test_confirm_individual_refresh(self, vector, unique_database): """ Data added directly to HDFS is only visible for the partition refreshed """ table_name = unique_database + '.' + "partition_test_table" table_location = get_fs_path("/test-warehouse/%s" % unique_database) file_name = "alltypes.parq" src_file = get_fs_path( "/test-warehouse/alltypesagg_parquet/year=2010/month=1/" "day=9/*.parq") file_num_rows = 1000 self.client.execute(""" create table %s like functional.alltypes stored as parquet location '%s' """ % (table_name, table_location)) for month in [1, 2]: self.client.execute( "alter table %s add partition (year=2010, month=%s)" % (table_name, month)) self.client.execute("refresh %s" % table_name) # Check that there is no data in table result = self.client.execute("select count(*) from %s" % table_name) assert result.data == [str(0)] dst_path = table_location + "/year=2010/month=%s/" + file_name for month in [1, 2]: check_call( ["hadoop", "fs", "-cp", "-f", src_file, dst_path % month], shell=False) # Check that data added is not visible before refresh result = self.client.execute("select count(*) from %s" % table_name) assert result.data == [str(0)] # Check that data is visible after refresh on the first partition only self.client.execute("refresh %s partition (year=2010, month=1)" % table_name) result = self.client.execute("select count(*) from %s" % table_name) assert result.data == [str(file_num_rows)] # Check that the data is not yet visible for the second partition # that was not refreshed result = self.client.execute( "select count(*) from %s where year=2010 and month=2" % table_name) assert result.data == [str(0)] # Check that data is visible for the second partition after refresh self.client.execute("refresh %s partition (year=2010, month=2)" % table_name) result = self.client.execute("select count(*) from %s" % table_name) assert result.data == [str(file_num_rows * 2)]
def test_native_functions(self, vector): database = 'native_function_test' self.__load_functions(self.create_udfs_template, vector, database, get_fs_path('/test-warehouse/libTestUdfs.so')) self.__load_functions(self.create_sample_udas_template, vector, database, get_fs_path('/test-warehouse/libudasample.so')) self.__load_functions(self.create_test_udas_template, vector, database, get_fs_path('/test-warehouse/libTestUdas.so')) self.run_test_case('QueryTest/udf', vector, use_db=database) if not IS_S3: # S3 doesn't support INSERT self.run_test_case('QueryTest/udf-init-close', vector, use_db=database) self.run_test_case('QueryTest/uda', vector, use_db=database)
def test_describe_db(self, vector): self.__test_describe_db_cleanup() try: self.client.execute("create database impala_test_desc_db1") self.client.execute("create database impala_test_desc_db2 " "comment 'test comment'") self.client.execute("create database impala_test_desc_db3 " "location '" + get_fs_path("/testdb") + "'") self.client.execute("create database impala_test_desc_db4 comment 'test comment' " "location \"" + get_fs_path("/test2.db") + "\"") self.run_stmt_in_hive("create database hive_test_desc_db comment 'test comment' " "with dbproperties('pi' = '3.14', 'e' = '2.82')") self.run_stmt_in_hive("alter database hive_test_desc_db set owner user test") self.client.execute("invalidate metadata") self.run_test_case('QueryTest/describe-db', vector) finally: self.__test_describe_db_cleanup()
def test_ir_functions(self, vector): database = 'ir_function_test' self.__load_functions( self.create_udfs_template, vector, database, get_fs_path('/test-warehouse/test-udfs.ll')) self.run_test_case('QueryTest/udf', vector, use_db=database) if not IS_S3: # S3 doesn't support INSERT self.run_test_case('QueryTest/udf-init-close', vector, use_db=database)
def _create_test_table(self, dbname, tablename, filename, columns): """Creates a table in the given database with the given name and columns. Copies the file with the given name from TESTFILE_DIR into the table.""" location = get_fs_path("/test-warehouse/%s.db/%s" % (dbname, tablename)) self.client.execute("create table %s.%s (%s) stored as parquet location '%s'" % (dbname, tablename, columns, location)) local_path = self.TESTFILE_DIR + "/" + filename check_call(["hadoop", "fs", "-put", local_path, location], shell=False)
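# A hypothetical call site for _create_test_table, showing the intended argument order;
# the table name, file name and column spec below are illustrative only.
self._create_test_table(unique_database, "bad_magic_number", "bad_magic_number.parquet",
                        "field string")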
def test_scan_lazy_timestamp(self, vector, unique_database): self.client.execute(("""CREATE TABLE {0}.lazy_ts (ts TIMESTAMP)""").format (unique_database)) tbl_loc = get_fs_path("/test-warehouse/%s.db/%s" % (unique_database, "lazy_ts")) check_call(['hdfs', 'dfs', '-copyFromLocal', os.environ['IMPALA_HOME'] + "/testdata/data/lazy_timestamp.csv", tbl_loc]) self.run_test_case('QueryTest/select-lazy-timestamp', vector, unique_database)
def test_hdfs_caching_fallback_path(self, vector, unique_database, testid_checksum):
  """ This tests the code path of the query execution where the hdfs cache read fails
  and the execution falls back to the normal read path. To reproduce this situation we
  rely on IMPALA-3679, where zero-copy reads (zcrs) are not supported with encryption
  zones. This makes sure ReadFromCache() fails and falls back to ReadRange() to read
  the scan range."""
  if self.exploration_strategy() != 'exhaustive' or \
     vector.get_value('table_format').file_format != 'text':
    pytest.skip()

  # Create a new encryption zone and copy the tpch.nation table data into it.
  encrypted_table_dir = get_fs_path("/test-warehouse/" + testid_checksum)
  create_query_sql = "CREATE EXTERNAL TABLE %s.cached_nation like tpch.nation " \
      "LOCATION '%s'" % (unique_database, encrypted_table_dir)
  check_call(["hdfs", "dfs", "-mkdir", encrypted_table_dir], shell=False)
  check_call(["hdfs", "crypto", "-createZone", "-keyName", "testKey1", "-path",
      encrypted_table_dir], shell=False)
  check_call(["hdfs", "dfs", "-cp", get_fs_path("/test-warehouse/tpch.nation/*.tbl"),
      encrypted_table_dir], shell=False)
  # Reduce the scan range size to force the query to have multiple scan ranges.
  exec_options = vector.get_value('exec_option')
  exec_options['max_scan_range_length'] = 1024
  try:
    self.execute_query_expect_success(self.client, create_query_sql)
    # Cache the table data
    self.execute_query_expect_success(self.client, "ALTER TABLE %s.cached_nation set "
        "cached in 'testPool'" % unique_database)
    # Wait till the whole path is cached. We set a deadline of 20 seconds for the path
    # to be cached to make sure this doesn't loop forever in case of caching errors.
    caching_deadline = time.time() + 20
    while not is_path_fully_cached(encrypted_table_dir):
      if time.time() > caching_deadline:
        pytest.fail("Timed out caching path: " + encrypted_table_dir)
      time.sleep(2)
    self.execute_query_expect_success(self.client, "invalidate metadata "
        "%s.cached_nation" % unique_database)
    result = self.execute_query_expect_success(self.client, "select count(*) from "
        "%s.cached_nation" % unique_database, exec_options)
    assert(len(result.data) == 1)
    assert(result.data[0] == '25')
  except Exception as e:
    pytest.fail("Failure in test_hdfs_caching_fallback_path: " + str(e))
  finally:
    check_call(["hdfs", "dfs", "-rm", "-r", "-f", "-skipTrash", encrypted_table_dir],
        shell=False)
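# The test above polls is_path_fully_cached() until the HDFS cache directive is fully
# satisfied. A minimal sketch of how such a check could be built on
# 'hdfs cacheadmin -listDirectives -stats' follows; the column positions parsed from the
# command output are assumptions, and the real helper may be implemented differently.
import subprocess

def is_path_fully_cached(path):
  """Sketch: returns True when BYTES_NEEDED equals BYTES_CACHED for the directive on
  'path'."""
  out = subprocess.check_output(
      ["hdfs", "cacheadmin", "-listDirectives", "-stats", "-path", path])
  for line in out.splitlines():
    if path not in line:
      continue
    fields = line.split()
    # Assumed column layout: ... PATH BYTES_NEEDED BYTES_CACHED FILES_NEEDED FILES_CACHED
    bytes_needed, bytes_cached = fields[-4], fields[-3]
    return bytes_needed.isdigit() and bytes_needed == bytes_cached
  return False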
def test_refresh_native(self):
  ''' This test checks that a native function is visible in Impala after a
  REFRESH FUNCTIONS command. We will add the native function through Hive by setting
  DBPROPERTIES of a database.'''
  # First we create the function in Impala.
  create_func_impala = ("create function {database}.identity_tmp(bigint) "
      "returns bigint location '{location}' symbol='Identity'")
  self.client.execute(create_func_impala.format(
      database=self.HIVE_IMPALA_INTEGRATION_DB,
      location=get_fs_path('/test-warehouse/libTestUdfs.so')))

  # Impala puts the native function into a database property table. We extract the key
  # value pair that represents the function from the table.
  describe_db_hive = "DESCRIBE DATABASE EXTENDED {database}".format(
      database=self.HIVE_IMPALA_INTEGRATION_DB)
  result = self.run_stmt_in_hive(describe_db_hive)
  regex = r"{(.*?)=(.*?)}"
  match = re.search(regex, result)
  func_name = match.group(1)
  func_contents = match.group(2)

  # Recreate the database, this deletes the function.
  self.client.execute("DROP DATABASE {database} CASCADE".format(
      database=self.HIVE_IMPALA_INTEGRATION_DB))
  self.client.execute("CREATE DATABASE {database}".format(
      database=self.HIVE_IMPALA_INTEGRATION_DB))
  result = self.client.execute("SHOW FUNCTIONS IN {database}".format(
      database=self.HIVE_IMPALA_INTEGRATION_DB))
  assert result is not None and len(result.data) == 0

  # Place the function into the recreated database by modifying its properties.
  alter_db_hive = "ALTER DATABASE {database} SET DBPROPERTIES ('{fn_name}'='{fn_val}')"
  self.run_stmt_in_hive(alter_db_hive.format(
      database=self.HIVE_IMPALA_INTEGRATION_DB,
      fn_name=func_name,
      fn_val=func_contents))
  result = self.client.execute("SHOW FUNCTIONS IN {database}".format(
      database=self.HIVE_IMPALA_INTEGRATION_DB))
  assert result is not None and len(result.data) == 0

  # The function should be visible in Impala after a REFRESH FUNCTIONS.
  self.client.execute("REFRESH FUNCTIONS {database}".format(
      database=self.HIVE_IMPALA_INTEGRATION_DB))
  result = self.client.execute("SHOW FUNCTIONS IN {database}".format(
      database=self.HIVE_IMPALA_INTEGRATION_DB))
  assert result is not None and len(result.data) > 0 and \
      "identity_tmp" in str(result.data)

  # Verify that the function returns a correct result.
  result = self.client.execute("SELECT {database}.identity_tmp(10)".format(
      database=self.HIVE_IMPALA_INTEGRATION_DB))
  assert result.data[0] == "10"
  # Make sure we deleted all the temporary jars we copied to the local fs
  assert len(glob.glob(self.LOCAL_LIBRARY_DIR + "/*.jar")) == 0
def test_ir_functions(self, vector): database = 'ir_function_test' self.__load_functions(self.create_udfs_template, vector, database, get_fs_path('/test-warehouse/test-udfs.ll')) self.run_test_case('QueryTest/udf', vector, use_db=database) if not IS_S3: # S3 doesn't support INSERT self.run_test_case('QueryTest/udf-init-close', vector, use_db=database)
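# __load_functions() is defined elsewhere in this class; roughly, it expands a
# CREATE FUNCTION template for the given database and UDF library and executes each
# statement. A minimal sketch under that assumption (the template layout, placeholder
# names, and signature are illustrative only, not the real helper):
#
# def load_functions_sketch(client, template, database, location):
#   """Illustrative only: create each UDF described by 'template' in 'database'."""
#   client.execute("create database if not exists %s" % database)
#   # Assumes the template holds semicolon-separated CREATE FUNCTION statements
#   # parameterized by the target database and the UDF library location.
#   for stmt in template.format(database=database, location=location).split(";"):
#     if stmt.strip():
#       client.execute(stmt)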
def _populate_hdfs_partitions(self): """ Copy some data to defaultFS HDFS filesystem so that the test can verify tables that span the default (HDFS) and secondary filesystem (e.g. S3A).""" check_call([ "hadoop", "fs", "-cp", get_fs_path("/test-warehouse/alltypes_parquet"), "/test-warehouse/%s.db/" % self.TEST_DB ], shell=False)
def test_hidden_symbol(self, vector, unique_database):
  """Test that symbols in the test UDFs are hidden by default and that therefore
  they cannot be used as a UDF entry point."""
  symbol = "_Z16UnexportedSymbolPN10impala_udf15FunctionContextE"
  ex = self.execute_query_expect_failure(self.client, """
      create function `{0}`.unexported() returns BIGINT
      LOCATION '{1}' SYMBOL='{2}'""".format(
          unique_database, get_fs_path('/test-warehouse/libTestUdfs.so'), symbol))
  assert "Could not find symbol '{0}'".format(symbol) in str(ex), str(ex)

  # IMPALA-8196: IR UDFs ignore whether a symbol is hidden or not. Exercise the
  # current behaviour, where the UDF can be created and executed.
  self.execute_query_expect_success(self.client, """
      create function `{0}`.unexported() returns BIGINT
      LOCATION '{1}' SYMBOL='{2}'""".format(
          unique_database, get_fs_path('/test-warehouse/test-udfs.ll'), symbol))
  result = self.execute_query_expect_success(self.client,
      "select `{0}`.unexported()".format(unique_database))
  assert result.data[0] == '5'
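# The "hidden by default" premise can also be checked outside Impala: a symbol hidden
# at build time does not appear in the shared object's dynamic symbol table, which is
# why Impala cannot resolve it as a UDF entry point. A small sketch, assuming a local
# copy of the library (the HDFS path used above would first have to be copied down);
# the local path is a placeholder.
#
# from subprocess import check_output
#
# def symbol_is_exported(local_so_path, symbol):
#   """Illustrative check: is 'symbol' present in the dynamic symbol table?"""
#   return symbol in check_output(["nm", "-D", local_so_path]).decode()
#
# # Hypothetical usage with a locally staged copy of the test UDF library:
# # assert not symbol_is_exported("./libTestUdfs.so",
# #     "_Z16UnexportedSymbolPN10impala_udf15FunctionContextE")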
def test_confirm_individual_refresh(self, vector, unique_database): """ Data added directly to HDFS is only visible for the partition refreshed """ table_name = unique_database + '.' + "partition_test_table" table_location = get_fs_path("/test-warehouse/%s" % unique_database) file_name = "alltypes.parq" src_file = get_fs_path("/test-warehouse/alltypesagg_parquet/year=2010/month=1/" "day=9/*.parq") file_num_rows = 1000 self.client.execute(""" create table %s like functional.alltypes stored as parquet location '%s' """ % (table_name, table_location)) for month in [1, 2]: self.client.execute("alter table %s add partition (year=2010, month=%s)" % (table_name, month)) self.client.execute("refresh %s" % table_name) # Check that there is no data in table result = self.client.execute("select count(*) from %s" % table_name) assert result.data == [str(0)] dst_path = table_location + "/year=2010/month=%s/" + file_name for month in [1, 2]: check_call(["hadoop", "fs", "-cp", "-f", src_file, dst_path % month], shell=False) # Check that data added is not visible before refresh result = self.client.execute("select count(*) from %s" % table_name) assert result.data == [str(0)] # Check that data is visible after refresh on the first partition only self.client.execute("refresh %s partition (year=2010, month=1)" % table_name) result = self.client.execute("select count(*) from %s" % table_name) assert result.data == [str(file_num_rows)] # Check that the data is not yet visible for the second partition # that was not refreshed result = self.client.execute( "select count(*) from %s where year=2010 and month=2" % table_name) assert result.data == [str(0)] # Check that data is visible for the second partition after refresh self.client.execute("refresh %s partition (year=2010, month=2)" % table_name) result = self.client.execute("select count(*) from %s" % table_name) assert result.data == [str(file_num_rows*2)]
def _create_test_table(self, tablename, filename, columns): """Returns a unique tablename based on the input 'tablename'. This allows multiple instances of the same test to be run in parallel (e.g. during an exhaustive run).""" tablename = "%s_%s" % (tablename, random.randint(0, 10**5)) location = get_fs_path("/test-warehouse/%s_%s" % (self.DATABASE, tablename)) self.client.execute("create table %s.%s (%s) stored as parquet location '%s'" % (self.DATABASE, tablename, columns, location)) local_path = self.TESTFILE_DIR + "/" + filename check_call(["hadoop", "fs", "-put", local_path, location], shell=False) self.client.execute("invalidate metadata %s.%s" % (self.DATABASE, tablename)) return tablename
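# A hypothetical call site for the helper above (the filename and column definition
# are placeholders, not files guaranteed to exist in the test data), showing how the
# returned unique table name is then used:
#
# tablename = self._create_test_table("decimal_tbl", "decimals.parq", "d decimal(10, 2)")
# result = self.client.execute(
#     "select count(*) from %s.%s" % (self.DATABASE, tablename))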