def test_change_parquet_column_type(self, vector):
  """
  Changing column types in Parquet doesn't work in Hive, and it causes
  'select *' to fail in Impala as well, after invalidating metadata. This is
  a known issue with changing column types in Hive/Parquet.
  """
  with HiveDbWrapper(self, self.unique_string()) as db_name:
    with HiveTableWrapper(self, db_name + '.' + self.unique_string(),
                          '(x int, y int) stored as parquet') as table_name:
      self.run_stmt_in_hive(
          'insert into table %s values (33,44)' % table_name)
      assert '33,44' == self.run_stmt_in_hive(
          'select * from %s' % table_name).split('\n')[1]
      self.client.execute('invalidate metadata')
      assert '33\t44' == self.client.execute(
          'select * from %s' % table_name).get_data()
      self.run_stmt_in_hive('alter table %s change y y string' % table_name)
      self.assert_sql_error(
          self.run_stmt_in_hive,
          'select * from %s' % table_name,
          'Cannot inspect org.apache.hadoop.io.IntWritable')
      self.client.execute('invalidate metadata %s' % table_name)
      self.assert_sql_error(
          self.client.execute,
          'select * from %s' % table_name,
          "Column type: STRING, Parquet schema:")
def test_drop_table_events(self):
  """IMPALA-10187: Event processing fails on multiple events + DROP TABLE.
  This test issues ALTER TABLE + DROP in quick succession and checks whether
  event processing still works.
  """
  event_proc_timeout = 15
  db_name = ImpalaTestSuite.get_random_name("drop_event_db_")
  with HiveDbWrapper(self, db_name):
    tbl_name = "foo"
    self.run_stmt_in_hive("""
        drop table if exists {db}.{tbl};
        create table {db}.{tbl} (id int);
        insert into {db}.{tbl} values(1);""".format(db=db_name, tbl=tbl_name))
    # With MetastoreEventProcessor running, the insert event will be
    # processed. Query the table from Impala.
    EventProcessorUtils.wait_for_event_processing(self, event_proc_timeout)
    # Verify that the data is present in Impala.
    data = self.execute_scalar("select * from %s.%s" % (db_name, tbl_name))
    assert data == '1'
    # Execute ALTER TABLE + DROP in quick succession so they will be
    # processed in the same event batch.
    self.run_stmt_in_hive("""
        alter table {db}.{tbl} set tblproperties ('foo'='bar');
        drop table {db}.{tbl};""".format(db=db_name, tbl=tbl_name))
    EventProcessorUtils.wait_for_event_processing(self, event_proc_timeout)
    # Check that the event processor status is still ACTIVE.
    assert EventProcessorUtils.get_event_processor_status() == "ACTIVE"
def test_change_parquet_column_type(self, vector):
  """
  Changing column types in Parquet doesn't always work in Hive, and it causes
  'select *' to fail in Impala as well, after invalidating metadata. This is
  a known issue with changing column types in Hive/Parquet.
  """
  with HiveDbWrapper(self, self.unique_string()) as db_name:
    with HiveTableWrapper(
        self, db_name + '.' + self.unique_string(),
        '(x int, y int) stored as parquet') as table_name:
      # The following INSERT statement creates a Parquet file with INT columns.
      self.run_stmt_in_hive('insert into table %s values (33,44)' % table_name)
      assert '33,44' == self.run_stmt_in_hive(
          'select * from %s' % table_name).split('\n')[1]
      self.client.execute('invalidate metadata')
      assert '33\t44' == self.client.execute(
          'select * from %s' % table_name).get_data()
      # Modify table metadata. After this statement, the table metadata in
      # HMS and the Parquet file metadata won't agree on the type of
      # column 'y'.
      self.run_stmt_in_hive('alter table %s change y y string' % table_name)
      if HIVE_MAJOR_VERSION == 2:
        # Hive 2 doesn't allow implicit conversion from INT to STRING.
        self.assert_sql_error(
            self.run_stmt_in_hive,
            'select * from %s' % table_name,
            'Cannot inspect org.apache.hadoop.io.IntWritable')
      else:
        # Hive 3 implicitly converts INTs to STRINGs.
        assert '33,44' == self.run_stmt_in_hive(
            'select * from %s' % table_name).split('\n')[1]
      self.client.execute('invalidate metadata %s' % table_name)
      # Impala doesn't convert INTs to STRINGs implicitly.
      self.assert_sql_error(
          self.client.execute, 'select * from %s' % table_name,
          "Column type: STRING, Parquet schema:")
      # Insert a STRING value; it will create a Parquet file where column 'y'
      # has type STRING.
      self.run_stmt_in_hive(
          'insert into table %s values (33,\'100\')' % table_name)
      # Modify HMS table metadata again, changing the type of column 'y'
      # back to INT.
      self.run_stmt_in_hive('alter table %s change y y int' % table_name)
      # Neither Hive 2 nor Hive 3, nor Impala, converts STRINGs to INTs
      # implicitly.
      self.assert_sql_error(
          self.run_stmt_in_hive,
          'select * from %s' % table_name,
          'org.apache.hadoop.io.Text cannot be '
          'cast to org.apache.hadoop.io.IntWritable')
      self.client.execute('invalidate metadata %s' % table_name)
      self.assert_sql_error(
          self.client.execute, 'select * from %s' % table_name,
          "Column type: INT, Parquet schema:")
def test_drop_database(self, vector):
  """
  If a DB is created, then dropped, in Hive, Impala can create one with the
  same name without invalidating metadata.
  """
  test_db = self.unique_string()
  with HiveDbWrapper(self, test_db) as db_name:
    pass
  self.assert_sql_error(
      self.client.execute,
      'create table %s.%s (x int)' % (test_db, self.unique_string()),
      'Database does not exist: %s' % test_db)
  with self.ImpalaDbWrapper(self, test_db) as db_name:
    pass
def test_change_column_type(self, vector):
  """Hive column type changes propagate to Impala."""
  with HiveDbWrapper(self, self.unique_string()) as db_name:
    with HiveTableWrapper(self, db_name + '.' + self.unique_string(),
                          '(x int, y int)') as table_name:
      self.run_stmt_in_hive(
          'insert into table %s values (33,44)' % table_name)
      # Change the type of column 'y' in Hive and verify Hive still reads
      # the data.
      self.run_stmt_in_hive('alter table %s change y y string' % table_name)
      assert '33,44' == self.run_stmt_in_hive(
          'select * from %s' % table_name).split('\n')[1]
      # After invalidation, Impala picks up the new column type.
      self.client.execute('invalidate metadata %s' % table_name)
      assert '33\t44' == self.client.execute(
          'select * from %s' % table_name).get_data()
      assert 'string' == self.impala_columns(table_name)['y']['type']
def test_table_format_change(self, vector):
  """
  Hive storage format changes propagate to Impala.
  """
  # TODO: check results of insert, then select * before and after
  # storage format change.
  with HiveDbWrapper(self, self.unique_string()) as db_name:
    with HiveTableWrapper(self, db_name + '.' + self.unique_string(),
                          '(x int, y int) stored as parquet') as table_name:
      self.client.execute('invalidate metadata')
      self.client.execute('invalidate metadata %s' % table_name)
      # impala_table_stats() returns per-partition stats keyed by partition
      # tuple; an unpartitioned table has a single entry under the empty
      # tuple.
      assert 'PARQUET' == self.impala_table_stats(table_name)[()]['format']
      self.run_stmt_in_hive(
          'alter table %s set fileformat avro' % table_name)
      self.client.execute('invalidate metadata %s' % table_name)
      assert 'AVRO' == self.impala_table_stats(table_name)[()]['format']
def test_drop_database(self, vector):
  """
  If a DB is created, then dropped, in Hive, Impala can create one with the
  same name without invalidating metadata.
  """
  test_db = self.unique_string()
  with HiveDbWrapper(self, test_db) as db_name:
    pass
  # If events processing is turned on, we should make sure that the drop
  # database event above is processed, to avoid flakiness.
  EventProcessorUtils.wait_for_event_processing(self)
  self.assert_sql_error(
      self.client.execute,
      'create table %s.%s (x int)' % (test_db, self.unique_string()),
      'Database does not exist: %s' % test_db)
  with self.ImpalaDbWrapper(self, test_db) as db_name:
    pass
def test_catalog_restart(self, testid_checksum):
  """
  IMPALA-6948: reproduces the issue by deleting a table from Hive while the
  catalogd is down. When catalogd is restarted, if the regression is present,
  the deleted table will still be present at the impalads.
  """
  db_name = "test_catalog_restart_%s" % testid_checksum
  try:
    with HiveDbWrapper(self, db_name):
      # Issue several invalidates to boost the version of the current
      # incarnation of the catalog. As a result, the table we'll add to Hive
      # will get a version that is visibly higher than the highest version
      # of the restarted catalogd incarnation.
      for i in xrange(0, 50):
        self.client.execute("invalidate metadata functional.alltypes")
      assert self.cluster.catalogd.service.get_catalog_version() >= 50
      # Create a database and table with Hive and make it visible to Impala.
      self.run_stmt_in_hive("create table %s.x (a string)" % db_name)
      self.client.execute("invalidate metadata %s.x" % db_name)
      assert "x" in self.client.execute("show tables in %s" % db_name).data
      # Stop the catalogd.
      self.cluster.catalogd.kill()
      # Drop the table from the catalog using Hive.
      self.run_stmt_in_hive("drop table %s.x" % db_name)
      # Restart the catalogd.
      self.cluster.catalogd.start()
      # Refresh the state of the catalogd process.
      self.cluster.refresh()
      # Wait until the impalad catalog versions agree with the catalogd's
      # version.
      catalogd_version = self.cluster.catalogd.service.get_catalog_version()
      for impalad in self.cluster.impalads:
        impalad.service.wait_for_metric_value(
            "catalog.curr-version", catalogd_version)
      self.__validate_metadata()
  except Exception as e:
    assert False, "Unexpected exception: " + str(e)
  finally:
    # Hack to work around IMPALA-5695.
    self.cluster.catalogd.kill()
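# A minimal, hypothetical sketch of the __validate_metadata() helper called
# above; the real helper is not shown in this excerpt. Based on the test's
# docstring, it is assumed to verify that the table dropped in Hive while
# catalogd was down is no longer visible from the impalads after the restart.
# The db_name parameter is an assumption made here for illustration.
def __validate_metadata(self, db_name):
  # If IMPALA-6948 regressed, the dropped table 'x' would still show up here.
  tables = self.client.execute("show tables in %s" % db_name).data
  assert "x" not in tables, \
      "Dropped table still visible after catalogd restart"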
def test_change_table_name(self, vector):
  """
  Changing the table name in Hive propagates to Impala after
  'invalidate metadata'.
  """
  with HiveDbWrapper(self, self.unique_string()) as db_name:
    with HiveTableWrapper(self, db_name + '.' + self.unique_string(),
                          '(x int, y int)') as table_name:
      self.client.execute('invalidate metadata')
      int_column = {'type': 'int', 'comment': ''}
      expected_columns = {'x': int_column, 'y': int_column}
      assert expected_columns == self.impala_columns(table_name)
      new_name = table_name + '2'
      self.run_stmt_in_hive('alter table %s rename to %s' %
                            (table_name, new_name))
      self.client.execute('invalidate metadata')
      assert expected_columns == self.impala_columns(new_name)
      self.assert_sql_error(self.client.execute,
                            'describe %s' % table_name,
                            'Could not resolve path')
def test_compute_stats_get_to_impala(self, vector):
  """Column stats computed in Hive are also visible in Impala."""
  with HiveDbWrapper(self, self.unique_string()) as db_name:
    with HiveTableWrapper(self, db_name + '.' + self.unique_string(),
                          '(x int)') as table_name:
      # Snapshot the stats of the empty table from both engines.
      hive_stats = self.hive_column_stats(table_name, 'x')
      self.client.execute('invalidate metadata')
      self.client.execute('refresh %s' % table_name)
      impala_stats = self.impala_all_column_stats(table_name)
      # Insert a row and compute column stats in Hive.
      self.run_stmt_in_hive('insert into table %s values (33)' % table_name)
      self.run_stmt_in_hive(
          'use %s; analyze table %s compute statistics for columns' %
          (db_name, table_name.split('.')[1]))
      new_hive_stats = self.hive_column_stats(table_name, 'x')
      assert hive_stats != new_hive_stats
      assert '33' == new_hive_stats['min']
      assert '33' == new_hive_stats['max']
      assert '0' == new_hive_stats['num_nulls']
      # After a refresh, the new stats are visible in Impala too.
      self.client.execute('refresh %s' % table_name)
      new_impala_stats = self.impala_all_column_stats(table_name)
      assert impala_stats != new_impala_stats
      assert '0' == new_impala_stats['x']['#nulls']
def run_test_insert_events(self, is_transactional=False):
  """Test for insert event processing. Events are created in Hive and
  processed in Impala. The following cases are tested:
  Insert into table --> for partitioned and non-partitioned table
  Insert overwrite table --> for partitioned and non-partitioned table
  Insert into partition --> for partitioned table
  """
  db_name = 'test_db'
  with HiveDbWrapper(self, db_name):
    # Test table with no partitions.
    TBL_INSERT_NOPART = 'tbl_insert_nopart'
    self.run_stmt_in_hive("drop table if exists %s.%s"
                          % (db_name, TBL_INSERT_NOPART))
    last_synced_event_id = self.get_last_synced_event_id()
    TBLPROPERTIES = ""
    if is_transactional:
      TBLPROPERTIES = "TBLPROPERTIES ('transactional'='true'," \
          "'transactional_properties'='insert_only')"
    self.run_stmt_in_hive("create table %s.%s (id int, val int) %s"
                          % (db_name, TBL_INSERT_NOPART, TBLPROPERTIES))
    # Test insert into table; this will fire an insert event.
    self.run_stmt_in_hive("insert into %s.%s values(101, 200)"
                          % (db_name, TBL_INSERT_NOPART))
    # With MetastoreEventProcessor running, the insert event will be
    # processed. Query the table from Impala.
    assert self.wait_for_insert_event_processing(last_synced_event_id) is True
    # Verify that the data is present in Impala.
    data = self.execute_scalar("select * from %s.%s"
                               % (db_name, TBL_INSERT_NOPART))
    assert data.split('\t') == ['101', '200']
    # Test insert overwrite. Overwrite the existing value.
    last_synced_event_id = self.get_last_synced_event_id()
    self.run_stmt_in_hive("insert overwrite table %s.%s values(101, 201)"
                          % (db_name, TBL_INSERT_NOPART))
    # Make sure the event has been processed.
    assert self.wait_for_insert_event_processing(last_synced_event_id) is True
    # Verify that the data is present in Impala.
    data = self.execute_scalar("select * from %s.%s"
                               % (db_name, TBL_INSERT_NOPART))
    assert data.split('\t') == ['101', '201']
    # Test partitioned table.
    last_synced_event_id = self.get_last_synced_event_id()
    TBL_INSERT_PART = 'tbl_insert_part'
    self.run_stmt_in_hive("drop table if exists %s.%s"
                          % (db_name, TBL_INSERT_PART))
    self.run_stmt_in_hive(
        "create table %s.%s (id int, name string) "
        "partitioned by(day int, month int, year int) %s"
        % (db_name, TBL_INSERT_PART, TBLPROPERTIES))
    # Insert data into partitions.
    self.run_stmt_in_hive(
        "insert into %s.%s partition(day=28, month=03, year=2019)"
        "values(101, 'x')" % (db_name, TBL_INSERT_PART))
    # Make sure the event has been processed.
    assert self.wait_for_insert_event_processing(last_synced_event_id) is True
    # Verify that the data is present in Impala.
    data = self.execute_scalar("select * from %s.%s"
                               % (db_name, TBL_INSERT_PART))
    assert data.split('\t') == ['101', 'x', '28', '3', '2019']
    # Test inserting into existing partitions.
    last_synced_event_id = self.get_last_synced_event_id()
    self.run_stmt_in_hive(
        "insert into %s.%s partition(day=28, month=03, year=2019)"
        "values(102, 'y')" % (db_name, TBL_INSERT_PART))
    assert self.wait_for_insert_event_processing(last_synced_event_id) is True
    # Verify that the data is present in Impala.
    data = self.execute_scalar(
        "select count(*) from %s.%s where day=28 and month=3 "
        "and year=2019" % (db_name, TBL_INSERT_PART))
    assert data.split('\t') == ['2']
    # Test insert overwrite into existing partitions.
    last_synced_event_id = self.get_last_synced_event_id()
    self.run_stmt_in_hive(
        "insert overwrite table %s.%s partition(day=28, month=03, "
        "year=2019)" "values(101, 'z')" % (db_name, TBL_INSERT_PART))
    assert self.wait_for_insert_event_processing(last_synced_event_id) is True
    # Verify that the data is present in Impala.
    data = self.execute_scalar(
        "select * from %s.%s where day=28 and month=3 and"
        " year=2019 and id=101" % (db_name, TBL_INSERT_PART))
    assert data.split('\t') == ['101', 'z', '28', '3', '2019']
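# A hypothetical sketch of the polling pattern that
# wait_for_insert_event_processing() above is assumed to implement: block
# until the event processor's last synced event id advances past the id
# recorded before the Hive statement ran, returning True on success and
# False on timeout. The names and the timeout value are illustrative, not
# the exact Impala test API.
import time

def wait_for_insert_event_processing(self, last_synced_event_id, timeout=10):
  end_time = time.time() + timeout
  while time.time() < end_time:
    # A new insert event has been synced once the id moves past the snapshot.
    if self.get_last_synced_event_id() > last_synced_event_id:
      return True
    time.sleep(0.1)
  return False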
def run_test_insert_events(self, is_transactional=False):
  """Test for insert event processing. Events are created in Hive and
  processed in Impala. The following cases are tested:
  Insert into table --> for partitioned and non-partitioned table
  Insert overwrite table --> for partitioned and non-partitioned table
  Insert into partition --> for partitioned table
  """
  db_name = self.__get_random_name("insert_event_db_")
  tblproperties = self.__get_transactional_tblproperties(is_transactional)
  with HiveDbWrapper(self, db_name):
    # Test table with no partitions.
    test_tbl_name = 'tbl_insert_nopart'
    self.run_stmt_in_hive("drop table if exists %s.%s"
                          % (db_name, test_tbl_name))
    self.run_stmt_in_hive("create table %s.%s (id int, val int) %s"
                          % (db_name, test_tbl_name, tblproperties))
    # Test insert into table; this will fire an insert event.
    self.run_stmt_in_hive("insert into %s.%s values(101, 200)"
                          % (db_name, test_tbl_name))
    # With MetastoreEventProcessor running, the insert event will be
    # processed. Query the table from Impala.
    EventProcessorUtils.wait_for_event_processing(self)
    # Verify that the data is present in Impala.
    data = self.execute_scalar("select * from %s.%s"
                               % (db_name, test_tbl_name))
    assert data.split('\t') == ['101', '200']
    # Test insert overwrite. Overwrite the existing value.
    self.run_stmt_in_hive("insert overwrite table %s.%s values(101, 201)"
                          % (db_name, test_tbl_name))
    # Make sure the event has been processed.
    EventProcessorUtils.wait_for_event_processing(self)
    # Verify that the data is present in Impala.
    data = self.execute_scalar("select * from %s.%s"
                               % (db_name, test_tbl_name))
    assert data.split('\t') == ['101', '201']
    # Test partitioned table.
    test_part_tblname = 'tbl_insert_part'
    self.run_stmt_in_hive("drop table if exists %s.%s"
                          % (db_name, test_part_tblname))
    self.run_stmt_in_hive("create table %s.%s (id int, name string) "
                          "partitioned by(day int, month int, year int) %s"
                          % (db_name, test_part_tblname, tblproperties))
    # Insert data into partitions.
    self.run_stmt_in_hive(
        "insert into %s.%s partition(day=28, month=03, year=2019)"
        "values(101, 'x')" % (db_name, test_part_tblname))
    # Make sure the event has been processed.
    EventProcessorUtils.wait_for_event_processing(self)
    # Verify that the data is present in Impala.
    data = self.execute_scalar("select * from %s.%s"
                               % (db_name, test_part_tblname))
    assert data.split('\t') == ['101', 'x', '28', '3', '2019']
    # Test inserting into existing partitions.
    self.run_stmt_in_hive(
        "insert into %s.%s partition(day=28, month=03, year=2019)"
        "values(102, 'y')" % (db_name, test_part_tblname))
    EventProcessorUtils.wait_for_event_processing(self)
    # Verify that the data is present in Impala.
    data = self.execute_scalar(
        "select count(*) from %s.%s where day=28 and month=3 "
        "and year=2019" % (db_name, test_part_tblname))
    assert data.split('\t') == ['2']
    # Test insert overwrite into existing partitions.
    self.run_stmt_in_hive(
        "insert overwrite table %s.%s partition(day=28, month=03, "
        "year=2019)" "values(101, 'z')" % (db_name, test_part_tblname))
    EventProcessorUtils.wait_for_event_processing(self)
    # Verify that the data is present in Impala.
    data = self.execute_scalar(
        "select * from %s.%s where day=28 and month=3 and"
        " year=2019 and id=101" % (db_name, test_part_tblname))
    assert data.split('\t') == ['101', 'z', '28', '3', '2019']
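# A sketch of the __get_transactional_tblproperties() helper referenced above,
# reconstructed from the inline TBLPROPERTIES string used by the earlier
# version of this test; treat the exact body as an assumption rather than the
# canonical Impala implementation.
def __get_transactional_tblproperties(self, is_transactional):
  """Returns the TBLPROPERTIES clause for creating an insert-only
  transactional table when is_transactional is True, otherwise an empty
  string."""
  if not is_transactional:
    return ""
  return ("TBLPROPERTIES ('transactional'='true',"
          "'transactional_properties'='insert_only')")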