Example #1
  def test_change_parquet_column_type(self, vector):
    """
    Changing column types in Parquet doesn't work in Hive and it causes
    'select *' to fail in Impala as well, after invalidating metadata. This is a
    known issue with changing column types in Hive/parquet.
    """

    with HiveDbWrapper(self, self.unique_string()) as db_name:
      with HiveTableWrapper(self, db_name + '.' + self.unique_string(),
                            '(x int, y int) stored as parquet') as table_name:
        self.run_stmt_in_hive(
            'insert into table %s values (33,44)' % table_name)
        assert '33,44' == self.run_stmt_in_hive(
            'select * from %s' % table_name).split('\n')[1]
        self.client.execute('invalidate metadata')
        assert '33\t44' == self.client.execute(
            'select * from %s' % table_name).get_data()
        self.run_stmt_in_hive('alter table %s change y y string' % table_name)
        self.assert_sql_error(
            self.run_stmt_in_hive, 'select * from %s' %
            table_name, 'Cannot inspect org.apache.hadoop.io.IntWritable')
        self.client.execute('invalidate metadata %s' % table_name)
        self.assert_sql_error(
            self.client.execute,
            'select * from %s' %
            table_name,
            "Column type: STRING, Parquet schema:")
Example #2
 def test_drop_table_events(self):
     """IMPALA-10187: Event processing fails on multiple events + DROP TABLE.
     This test issues ALTER TABLE + DROP in quick succession and checks whether event
     processing still works.
     """
     event_proc_timeout = 15
     db_name = ImpalaTestSuite.get_random_name("drop_event_db_")
     with HiveDbWrapper(self, db_name):
         tbl_name = "foo"
         self.run_stmt_in_hive("""
       drop table if exists {db}.{tbl};
       create table {db}.{tbl} (id int);
       insert into {db}.{tbl} values(1);""".format(db=db_name,
                                                   tbl=tbl_name))
         # With MetastoreEventProcessor running, the insert event will be processed. Query
         # the table from Impala.
         EventProcessorUtils.wait_for_event_processing(
             self, event_proc_timeout)
         # Verify that the data is present in Impala.
         data = self.execute_scalar("select * from %s.%s" %
                                    (db_name, tbl_name))
         assert data == '1'
         # Execute ALTER TABLE + DROP in quick succession so they will be processed in the
         # same event batch.
         self.run_stmt_in_hive("""
       alter table {db}.{tbl} set tblproperties ('foo'='bar');
       drop table {db}.{tbl};""".format(db=db_name, tbl=tbl_name))
         EventProcessorUtils.wait_for_event_processing(
             self, event_proc_timeout)
         # Check that the event processor status is still ACTIVE.
         assert EventProcessorUtils.get_event_processor_status() == "ACTIVE"
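
Example #2 takes its database name from ImpalaTestSuite.get_random_name("drop_event_db_"), which only needs to produce a reasonably unique identifier with the given prefix. A sketch of what such a helper might do, purely illustrative:

import random
import string

def get_random_name(prefix):
  """Sketch: append a short random suffix, e.g. drop_event_db_k3f9x2ab."""
  suffix = ''.join(random.choice(string.ascii_lowercase + string.digits)
                   for _ in range(8))
  return prefix + suffix
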
Example #3
    def test_change_parquet_column_type(self, vector):
        """
        Changing column types in Parquet doesn't always work in Hive and it causes
        'select *' to fail in Impala as well, after invalidating metadata. This is a
        known issue with changing column types in Hive/parquet.
        """

        with HiveDbWrapper(self, self.unique_string()) as db_name:
            with HiveTableWrapper(
                    self, db_name + '.' + self.unique_string(),
                    '(x int, y int) stored as parquet') as table_name:
                # The following INSERT statement creates a Parquet file with INT columns.
                self.run_stmt_in_hive('insert into table %s values (33,44)' %
                                      table_name)
                assert '33,44' == self.run_stmt_in_hive(
                    'select * from %s' % table_name).split('\n')[1]
                self.client.execute('invalidate metadata')
                assert '33\t44' == self.client.execute('select * from %s' %
                                                       table_name).get_data()
                # Modify table metadata. After this statement, the table metadata in HMS
                # and the Parquet file metadata won't agree on the type of column 'y'.
                self.run_stmt_in_hive('alter table %s change y y string' %
                                      table_name)
                if HIVE_MAJOR_VERSION == 2:
                    # Hive 2 doesn't allow implicit conversion from INT to STRING.
                    self.assert_sql_error(
                        self.run_stmt_in_hive, 'select * from %s' % table_name,
                        'Cannot inspect org.apache.hadoop.io.IntWritable')
                else:
                    # Hive 3 implicitly converts INTs to STRINGs.
                    assert '33,44' == self.run_stmt_in_hive(
                        'select * from %s' % table_name).split('\n')[1]
                self.client.execute('invalidate metadata %s' % table_name)
                # Impala doesn't convert INTs to STRINGs implicitly.
                self.assert_sql_error(self.client.execute,
                                      'select * from %s' % table_name,
                                      "Column type: STRING, Parquet schema:")
                # Insert STRING value, it will create a Parquet file where column 'y'
                # has type STRING.
                self.run_stmt_in_hive(
                    'insert into table %s values (33,\'100\')' % table_name)
                # Modify HMS table metadata again, change the type of column 'y' back to INT.
                self.run_stmt_in_hive('alter table %s change y y int' %
                                      table_name)
                # Neither Hive (2 or 3) nor Impala converts STRINGs to INTs implicitly.
                self.assert_sql_error(
                    self.run_stmt_in_hive, 'select * from %s' % table_name,
                    'org.apache.hadoop.io.Text cannot be '
                    'cast to org.apache.hadoop.io.IntWritable')
                self.client.execute('invalidate metadata %s' % table_name)
                self.assert_sql_error(self.client.execute,
                                      'select * from %s' % table_name,
                                      "Column type: INT, Parquet schema:")
Example #4
    def test_drop_database(self, vector):
        """
        If a DB is created and then dropped in Hive, Impala can create one with the
        same name without invalidating metadata.
        """

        test_db = self.unique_string()
        with HiveDbWrapper(self, test_db) as db_name:
            pass
        self.assert_sql_error(
            self.client.execute,
            'create table %s.%s (x int)' % (test_db, self.unique_string()),
            'Database does not exist: %s' % test_db)
        with self.ImpalaDbWrapper(self, test_db) as db_name:
            pass
Example #5
  def test_change_column_type(self, vector):
    """Hive column type changes propagate to Impala."""

    with HiveDbWrapper(self, self.unique_string()) as db_name:
      with HiveTableWrapper(self, db_name + '.' + self.unique_string(),
                            '(x int, y int)') as table_name:
        self.run_stmt_in_hive(
            'insert into table %s values (33,44)' % table_name)
        self.run_stmt_in_hive('alter table %s change y y string' % table_name)
        assert '33,44' == self.run_stmt_in_hive(
            'select * from %s' % table_name).split('\n')[1]
        self.client.execute('invalidate metadata %s' % table_name)
        assert '33\t44' == self.client.execute(
            'select * from %s' % table_name).get_data()
        assert 'string' == self.impala_columns(table_name)['y']['type']
Example #6
 def test_table_format_change(self, vector):
   """
   Hive storage format changes propagate to Impala.
   """
   # TODO: check results of insert, then select * before and after
   # storage format change.
   with HiveDbWrapper(self, self.unique_string()) as db_name:
     with HiveTableWrapper(self, db_name + '.' + self.unique_string(),
                           '(x int, y int) stored as parquet') as table_name:
       self.client.execute('invalidate metadata')
       self.client.execute('invalidate metadata %s' % table_name)
       print(self.impala_table_stats(table_name))
       assert 'PARQUET' == self.impala_table_stats(table_name)[()]['format']
       self.run_stmt_in_hive(
           'alter table %s set fileformat avro' % table_name)
       self.client.execute('invalidate metadata %s' % table_name)
       assert 'AVRO' == self.impala_table_stats(table_name)[()]['format']
Example #7
    def test_drop_database(self, vector):
        """
        If a DB is created and then dropped in Hive, Impala can create one with the
        same name without invalidating metadata.
        """

        test_db = self.unique_string()
        with HiveDbWrapper(self, test_db) as db_name:
            pass
        # if events processing is turned on we should make sure that the drop
        # database event above is processed to avoid flakiness
        EventProcessorUtils.wait_for_event_processing(self)
        self.assert_sql_error(
            self.client.execute,
            'create table %s.%s (x int)' % (test_db, self.unique_string()),
            'Database does not exist: %s' % test_db)
        with self.ImpalaDbWrapper(self, test_db) as db_name:
            pass
Example #8
    def test_catalog_restart(self, testid_checksum):
        """ IMPALA-6948: reproduces the issue by deleting a table from Hive while the catalogd
        is down. When catalogd is restarted, if the regression is present, the deleted
        table will still be present at the impalads."""
        db_name = "test_catalog_restart_%s" % testid_checksum
        try:
            with HiveDbWrapper(self, db_name):
                # Issue several invalidates to boost the version for the current incarnation of the
                # catalog. As a result, the table we'll add to Hive will get a version that's easier
                # to see is higher than the highest version of the restarted catalogd incarnation.
                for i in range(0, 50):
                    self.client.execute(
                        "invalidate metadata functional.alltypes")
                assert self.cluster.catalogd.service.get_catalog_version(
                ) >= 50
                # Creates a database and table with Hive and makes it visible to Impala.
                self.run_stmt_in_hive("create table %s.x (a string)" % db_name)
                self.client.execute("invalidate metadata %s.x" % db_name)
                assert "x" in self.client.execute("show tables in %s" %
                                                  db_name).data
                # Stops the catalog
                self.cluster.catalogd.kill()
                # Drops the table from the catalog using Hive.
                self.run_stmt_in_hive("drop table %s.x" % db_name)
                # Restarts the catalog
                self.cluster.catalogd.start()
                # Refreshes the state of the catalogd process.
                self.cluster.refresh()
                # Wait until the impalad catalog versions agree with the catalogd's version.
                catalogd_version = self.cluster.catalogd.service.get_catalog_version(
                )
                for impalad in self.cluster.impalads:
                    impalad.service.wait_for_metric_value(
                        "catalog.curr-version", catalogd_version)

                self.__validate_metadata()
        except Exception as e:
            assert False, "Unexpected exception: " + str(e)
        finally:
            # Hack to work-around IMPALA-5695.
            self.cluster.catalogd.kill()
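
Example #8 relies on wait_for_metric_value to block until every impalad has caught up with the version of the restarted catalogd. As a rough illustration of that kind of convergence check, a generic polling loop could look like the sketch below; the get_metric_value accessor, the timeout and the poll interval are assumptions made for the sketch, not necessarily the real ImpaladService API.

import time

def wait_for_metric(service, metric_name, expected_value, timeout_s=60):
  """Sketch: poll a hypothetical get_metric_value() until it reports expected_value."""
  deadline = time.time() + timeout_s
  while time.time() < deadline:
    if service.get_metric_value(metric_name) == expected_value:
      return
    time.sleep(1)
  raise AssertionError("metric %s did not reach %s within %s seconds"
                       % (metric_name, expected_value, timeout_s))
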
Example #9
    def test_change_table_name(self, vector):
        """
        Changing the table name in Hive propagates to Impala after 'invalidate
        metadata'.
        """

        with HiveDbWrapper(self, self.unique_string()) as db_name:
            with HiveTableWrapper(self, db_name + '.' + self.unique_string(),
                                  '(x int, y int)') as table_name:
                self.client.execute('invalidate metadata')
                int_column = {'type': 'int', 'comment': ''}
                expected_columns = {'x': int_column, 'y': int_column}
                assert expected_columns == self.impala_columns(table_name)
                new_name = table_name + '2'
                self.run_stmt_in_hive('alter table %s rename to %s' %
                                      (table_name, new_name))
                self.client.execute('invalidate metadata')
                assert expected_columns == self.impala_columns(new_name)
                self.assert_sql_error(self.client.execute,
                                      'describe %s' % table_name,
                                      'Could not resolve path')
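
Examples #5 and #9 compare against the dictionary returned by impala_columns, which maps each column name to its type and comment. A sketch of how such a helper could be built on top of a DESCRIBE statement follows; the parsing details are illustrative and the helper in the actual test suite may differ.

  def impala_columns(self, table_name):
    """Sketch: map column name -> {'type': ..., 'comment': ...} from DESCRIBE output."""
    rows = self.client.execute('describe %s' % table_name).data
    columns = {}
    for row in rows:
      fields = row.split('\t')
      comment = fields[2] if len(fields) > 2 else ''
      columns[fields[0]] = {'type': fields[1], 'comment': comment}
    return columns
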
Example #10
 def test_compute_stats_get_to_impala(self, vector):
     """Column stats computed in Hive are also visible in Impala."""
     with HiveDbWrapper(self, self.unique_string()) as db_name:
         with HiveTableWrapper(self, db_name + '.' + self.unique_string(),
                               '(x int)') as table_name:
             hive_stats = self.hive_column_stats(table_name, 'x')
             self.client.execute('invalidate metadata')
             self.client.execute('refresh %s' % table_name)
             impala_stats = self.impala_all_column_stats(table_name)
             self.run_stmt_in_hive('insert into table %s values (33)' %
                                   table_name)
             self.run_stmt_in_hive(
                 'use %s; analyze table %s compute statistics for columns' %
                 (db_name, table_name.split('.')[1]))
             new_hive_stats = self.hive_column_stats(table_name, 'x')
             assert hive_stats != new_hive_stats
             assert '33' == new_hive_stats['min']
             assert '33' == new_hive_stats['max']
             assert '0' == new_hive_stats['num_nulls']
             self.client.execute('refresh %s' % table_name)
             new_impala_stats = self.impala_all_column_stats(table_name)
             assert impala_stats != new_impala_stats
             assert '0' == new_impala_stats['x']['#nulls']
Example #11
    def run_test_insert_events(self, is_transactional=False):
        """Test for insert event processing. Events are created in Hive and processed in
        Impala. The following cases are tested:
        Insert into table --> for partitioned and non-partitioned table
        Insert overwrite table --> for partitioned and non-partitioned table
        Insert into partition --> for partitioned table
        """
        db_name = 'test_db'
        with HiveDbWrapper(self, db_name):
            # Test table with no partitions.
            TBL_INSERT_NOPART = 'tbl_insert_nopart'
            self.run_stmt_in_hive("drop table if exists %s.%s" %
                                  (db_name, TBL_INSERT_NOPART))
            last_synced_event_id = self.get_last_synced_event_id()
            TBLPROPERTIES = ""
            if is_transactional:
                TBLPROPERTIES = "TBLPROPERTIES ('transactional'='true'," \
                    "'transactional_properties'='insert_only')"
            self.run_stmt_in_hive("create table %s.%s (id int, val int) %s" %
                                  (db_name, TBL_INSERT_NOPART, TBLPROPERTIES))
            # Test insert into table, this will fire an insert event.
            self.run_stmt_in_hive("insert into %s.%s values(101, 200)" %
                                  (db_name, TBL_INSERT_NOPART))
            # With MetastoreEventProcessor running, the insert event will be processed. Query the
            # table from Impala.
            assert self.wait_for_insert_event_processing(
                last_synced_event_id) is True
            # Verify that the data is present in Impala.
            data = self.execute_scalar("select * from %s.%s" %
                                       (db_name, TBL_INSERT_NOPART))
            assert data.split('\t') == ['101', '200']

            # Test insert overwrite. Overwrite the existing value.
            last_synced_event_id = self.get_last_synced_event_id()
            self.run_stmt_in_hive(
                "insert overwrite table %s.%s values(101, 201)" %
                (db_name, TBL_INSERT_NOPART))
            # Make sure the event has been processed.
            assert self.wait_for_insert_event_processing(
                last_synced_event_id) is True
            # Verify that the data is present in Impala.
            data = self.execute_scalar("select * from %s.%s" %
                                       (db_name, TBL_INSERT_NOPART))
            assert data.split('\t') == ['101', '201']

            # Test partitioned table.
            last_synced_event_id = self.get_last_synced_event_id()
            TBL_INSERT_PART = 'tbl_insert_part'
            self.run_stmt_in_hive("drop table if exists %s.%s" %
                                  (db_name, TBL_INSERT_PART))
            self.run_stmt_in_hive(
                "create table %s.%s (id int, name string) "
                "partitioned by(day int, month int, year int) %s" %
                (db_name, TBL_INSERT_PART, TBLPROPERTIES))
            # Insert data into partitions.
            self.run_stmt_in_hive(
                "insert into %s.%s partition(day=28, month=03, year=2019)"
                "values(101, 'x')" % (db_name, TBL_INSERT_PART))
            # Make sure the event has been processed.
            assert self.wait_for_insert_event_processing(
                last_synced_event_id) is True
            # Verify that the data is present in Impala.
            data = self.execute_scalar("select * from %s.%s" %
                                       (db_name, TBL_INSERT_PART))
            assert data.split('\t') == ['101', 'x', '28', '3', '2019']

            # Test inserting into existing partitions.
            last_synced_event_id = self.get_last_synced_event_id()
            self.run_stmt_in_hive(
                "insert into %s.%s partition(day=28, month=03, year=2019)"
                "values(102, 'y')" % (db_name, TBL_INSERT_PART))
            assert self.wait_for_insert_event_processing(
                last_synced_event_id) is True
            # Verify that the data is present in Impala.
            data = self.execute_scalar(
                "select count(*) from %s.%s where day=28 and month=3 "
                "and year=2019" % (db_name, TBL_INSERT_PART))
            assert data.split('\t') == ['2']

            # Test insert overwrite into existing partitions
            last_synced_event_id = self.get_last_synced_event_id()
            self.run_stmt_in_hive(
                "insert overwrite table %s.%s partition(day=28, month=03, "
                "year=2019)"
                "values(101, 'z')" % (db_name, TBL_INSERT_PART))
            assert self.wait_for_insert_event_processing(
                last_synced_event_id) is True
            # Verify that the data is present in Impala.
            data = self.execute_scalar(
                "select * from %s.%s where day=28 and month=3 and"
                " year=2019 and id=101" % (db_name, TBL_INSERT_PART))
            assert data.split('\t') == ['101', 'z', '28', '3', '2019']
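
Example #11 gates each check on wait_for_insert_event_processing(last_synced_event_id), which returns True once the event processor has advanced past the given event id. A sketch of such a wait, assuming only the get_last_synced_event_id helper already used above; the timeout and poll interval are illustrative, and the method would live on the test suite class.

import time

def wait_for_insert_event_processing(self, last_synced_event_id, timeout_s=30):
  """Sketch: return True once events newer than last_synced_event_id are synced."""
  deadline = time.time() + timeout_s
  while time.time() < deadline:
    if self.get_last_synced_event_id() > last_synced_event_id:
      return True
    time.sleep(1)
  return False
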
Example #12
  def run_test_insert_events(self, is_transactional=False):
    """Test for insert event processing. Events are created in Hive and processed in
    Impala. The following cases are tested:
    Insert into table --> for partitioned and non-partitioned table
    Insert overwrite table --> for partitioned and non-partitioned table
    Insert into partition --> for partitioned table
    """
    db_name = self.__get_random_name("insert_event_db_")
    tblproperties = self.__get_transactional_tblproperties(is_transactional)
    with HiveDbWrapper(self, db_name):
      # Test table with no partitions.
      test_tbl_name = 'tbl_insert_nopart'
      self.run_stmt_in_hive("drop table if exists %s.%s" % (db_name, test_tbl_name))
      self.run_stmt_in_hive("create table %s.%s (id int, val int) %s"
         % (db_name, test_tbl_name, tblproperties))
      # Test insert into table, this will fire an insert event.
      self.run_stmt_in_hive("insert into %s.%s values(101, 200)"
         % (db_name, test_tbl_name))
      # With MetastoreEventProcessor running, the insert event will be processed. Query
      # the table from Impala.
      EventProcessorUtils.wait_for_event_processing(self)
      # Verify that the data is present in Impala.
      data = self.execute_scalar("select * from %s.%s" % (db_name, test_tbl_name))
      assert data.split('\t') == ['101', '200']

      # Test insert overwrite. Overwrite the existing value.
      self.run_stmt_in_hive("insert overwrite table %s.%s values(101, 201)"
         % (db_name, test_tbl_name))
      # Make sure the event has been processed.
      EventProcessorUtils.wait_for_event_processing(self)
      # Verify that the data is present in Impala.
      data = self.execute_scalar("select * from %s.%s" % (db_name, test_tbl_name))
      assert data.split('\t') == ['101', '201']

      # Test partitioned table.
      test_part_tblname = 'tbl_insert_part'
      self.run_stmt_in_hive("drop table if exists %s.%s" % (db_name, test_part_tblname))
      self.run_stmt_in_hive("create table %s.%s (id int, name string) "
         "partitioned by(day int, month int, year int) %s"
         % (db_name, test_part_tblname, tblproperties))
      # Insert data into partitions.
      self.run_stmt_in_hive("insert into %s.%s partition(day=28, month=03, year=2019)"
         "values(101, 'x')" % (db_name, test_part_tblname))
      # Make sure the event has been processed.
      EventProcessorUtils.wait_for_event_processing(self)
      # Verify that the data is present in Impala.
      data = self.execute_scalar("select * from %s.%s" % (db_name, test_part_tblname))
      assert data.split('\t') == ['101', 'x', '28', '3', '2019']

      # Test inserting into existing partitions.
      self.run_stmt_in_hive("insert into %s.%s partition(day=28, month=03, year=2019)"
         "values(102, 'y')" % (db_name, test_part_tblname))
      EventProcessorUtils.wait_for_event_processing(self)
      # Verify that the data is present in Impala.
      data = self.execute_scalar("select count(*) from %s.%s where day=28 and month=3 "
         "and year=2019" % (db_name, test_part_tblname))
      assert data.split('\t') == ['2']

      # Test insert overwrite into existing partitions
      self.run_stmt_in_hive("insert overwrite table %s.%s partition(day=28, month=03, "
         "year=2019)" "values(101, 'z')" % (db_name, test_part_tblname))
      EventProcessorUtils.wait_for_event_processing(self)
      # Verify that the data is present in Impala.
      data = self.execute_scalar("select * from %s.%s where day=28 and month=3 and"
         " year=2019 and id=101" % (db_name, test_part_tblname))
      assert data.split('\t') == ['101', 'z', '28', '3', '2019']
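
Example #12 is the same scenario as Example #11, but it takes the table properties from a __get_transactional_tblproperties helper and a random database name from __get_random_name. Going by the inline TBLPROPERTIES string in Example #11, a plausible sketch of the properties helper, illustrative only:

  def __get_transactional_tblproperties(self, is_transactional):
    """Sketch: TBLPROPERTIES for an insert-only transactional table, else empty."""
    if not is_transactional:
      return ""
    return "TBLPROPERTIES ('transactional'='true'," \
           "'transactional_properties'='insert_only')"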