Example no. 1
    def test_query_large_file(self):
        self.__create_test_table(self.COMPRESSED_TABLE_NAME,
                                 self.COMPRESSED_TABLE_LOCATION)
        self.__create_test_table(self.UNCOMPRESSED_TABLE_NAME,
                                 self.UNCOMPRESSED_TABLE_LOCATION)
        self.__generate_file(self.COMPRESSED_LOCAL_FILE_PATH,
                             self.COMPRESSED_TABLE_LOCATION)
        self.__generate_file(self.UNCOMPRESSED_LOCAL_FILE_PATH,
                             self.UNCOMPRESSED_TABLE_LOCATION)
        self.client.execute("refresh %s" % self.COMPRESSED_TABLE_NAME)
        self.client.execute("refresh %s" % self.UNCOMPRESSED_TABLE_NAME)

        # Compare row counts between the compressed and uncompressed tables
        result = self.client.execute("select count(*) from %s" %
                                     self.COMPRESSED_TABLE_NAME)
        result_uncompressed = self.client.execute("select count(*) from %s" %
                                                  self.UNCOMPRESSED_TABLE_NAME)
        assert int(result.get_data()) == int(result_uncompressed.get_data())

        # Read top 10k rows from compressed table and uncompressed table, compare results
        base_result = self.execute_query_expect_success(
            self.client, "select * from {0} order by col limit 10000".format(
                self.UNCOMPRESSED_TABLE_NAME))
        test_result = self.execute_query_expect_success(
            self.client, "select * from {0} order by col limit 10000".format(
                self.COMPRESSED_TABLE_NAME))
        verify_query_result_is_equal(test_result.data, base_result.data)
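All of these examples end with verify_query_result_is_equal, which asserts that two
result sets match row for row. A minimal sketch of such a helper (hypothetical; the
real utility lives in Impala's test framework and may differ) could look like this:

def verify_query_result_is_equal(actual_data, expected_data):
  # Compare two result sets row by row. Both arguments are lists of row strings as
  # returned by the test client; the queries above use ORDER BY, so a simple
  # element-wise comparison is enough.
  assert len(actual_data) == len(expected_data)
  for actual_row, expected_row in zip(actual_data, expected_data):
    assert actual_row == expected_row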
Example no. 2
  def test_hive_impala_interop(self, unique_database, cluster_properties):
    """Tests compressed text file written by Hive with different codecs
    can be read from impala. And verify results."""
    # Setup source table.
    source_table = "{0}.{1}".format(unique_database, "t1_source")
    # TODO: Once IMPALA-8721 is fixed add coverage for TimeStamp data type.
    self.execute_query_expect_success(self.client,
        "create table {0} stored as textfile as select id, bool_col, tinyint_col, "
        "smallint_col, int_col, bigint_col, float_col, double_col, date_string_col,"
        "string_col, year, month from functional_parquet.alltypes".format(source_table))
    self.execute_query_expect_success(self.client,
        "insert into {0}(id) values (7777), (8888), (9999), (11111), (22222), (33333)"
        .format(source_table))

    # For Hive 3+, workaround for HIVE-22371 (CTAS puts files in the wrong place) by
    # explicitly creating an external table so that files are in the external warehouse
    # directory. Use external.table.purge=true so that it is equivalent to a Hive 2
    # managed table. Hive 2 stays the same.
    external = ""
    tblproperties = ""
    if HIVE_MAJOR_VERSION >= 3:
      external = "external"
      tblproperties = "TBLPROPERTIES('external.table.purge'='TRUE')"
    # Loop through the compression codecs and run interop tests.
    for codec in TEXT_CODECS:
      # Write data in Hive and read it back from Impala.
      # Map the codec name to the compression codec class Hive accepts.
      switcher = {
          'snappy': 'org.apache.hadoop.io.compress.SnappyCodec',
          'gzip': 'org.apache.hadoop.io.compress.GzipCodec',
          'zstd': 'org.apache.hadoop.io.compress.ZStandardCodec',
          'bzip2': 'org.apache.hadoop.io.compress.BZip2Codec',
          'deflate': 'org.apache.hadoop.io.compress.DeflateCodec',
          'default': 'org.apache.hadoop.io.compress.DefaultCodec'
      }
      hive_table = "{0}.{1}".format(unique_database, "t1_hive")
      self.run_stmt_in_hive("drop table if exists {0}".format(hive_table))
      self.run_stmt_in_hive("set hive.exec.compress.output=true;\
          set mapreduce.output.fileoutputformat.compress.codec={0};\
          create {1} table {2} stored as textfile {3} as select * from {4}"
          .format(switcher.get(codec, 'Invalid codec'), external, hive_table,
          tblproperties, source_table))

      # Make sure hive CTAS table is not empty
      assert self.run_stmt_in_hive("select count(*) from {0}".format(
          hive_table)).split("\n")[1] != "0", "CTAS created Hive table is empty."

      # Make sure Impala's metadata is in sync.
      if cluster_properties.is_catalog_v2_cluster():
        self.wait_for_table_to_appear(unique_database, hive_table, timeout_s=10)
      else:
        self.client.execute("invalidate metadata {0}".format(hive_table))

      # Read Hive data in Impala and verify results.
      base_result = self.execute_query_expect_success(self.client,
          "select * from {0} order by id".format(source_table))
      test_result = self.execute_query_expect_success(self.client,
          "select * from {0} order by id".format(hive_table))
      verify_query_result_is_equal(test_result.data, base_result.data)
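The example above relies on module-level constants that are not shown. Judging from the
keys of the codec switcher, a plausible definition of TEXT_CODECS would be the one below;
HIVE_MAJOR_VERSION is likewise assumed here and is normally derived from the test
environment rather than hard-coded:

# Assumed definitions for illustration only; the real values are defined elsewhere
# in the Impala test suite.
TEXT_CODECS = ['snappy', 'gzip', 'zstd', 'bzip2', 'deflate', 'default']
HIVE_MAJOR_VERSION = 3  # assumption: a Hive 3 environment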
Example no. 3
    def test_hive_impala_interop(self, unique_database, cluster_properties):
        """Tests compressed text file written by Hive with different codecs
    can be read from impala. And verify results."""
        # Setup source table.
        source_table = "{0}.{1}".format(unique_database, "t1_source")
        # TODO: Once IMPALA-8721 is fixed add coverage for TimeStamp data type.
        self.execute_query_expect_success(
            self.client,
            "create table {0} stored as textfile as select id, bool_col, tinyint_col, "
            "smallint_col, int_col, bigint_col, float_col, double_col, date_string_col,"
            "string_col, year, month from functional_parquet.alltypes".format(
                source_table))
        self.execute_query_expect_success(
            self.client,
            "insert into {0}(id) values (7777), (8888), (9999), (11111), (22222), (33333)"
            .format(source_table))

        # Loop through the compression codecs and run interop tests.
        for codec in TEXT_CODECS:
            # Write data in Hive and read it back from Impala.
            # Map the codec name to the compression codec class Hive accepts.
            switcher = {
                'snappy': 'org.apache.hadoop.io.compress.SnappyCodec',
                'gzip': 'org.apache.hadoop.io.compress.GzipCodec',
                'zstd': 'org.apache.hadoop.io.compress.ZStandardCodec',
                'lzo': 'com.hadoop.compression.lzo.LzopCodec',
                'bzip2': 'org.apache.hadoop.io.compress.BZip2Codec',
                'deflate': 'org.apache.hadoop.io.compress.DeflateCodec',
                'default': 'org.apache.hadoop.io.compress.DefaultCodec'
            }
            hive_table = "{0}.{1}".format(unique_database, "t1_hive")
            self.run_stmt_in_hive(
                "drop table if exists {0}".format(hive_table))
            self.run_stmt_in_hive("set hive.exec.compress.output=true;\
          set mapreduce.output.fileoutputformat.compress.codec={0};\
          create table {1} stored as textfile as select * from {2}".format(
                switcher.get(codec, 'Invalid codec'), hive_table,
                source_table))

            # Make sure Impala's metadata is in sync.
            if cluster_properties.is_catalog_v2_cluster():
                self.wait_for_table_to_appear(unique_database,
                                              hive_table,
                                              timeout_s=10)
            else:
                self.client.execute(
                    "invalidate metadata {0}".format(hive_table))

            # Read Hive data in Impala and verify results.
            base_result = self.execute_query_expect_success(
                self.client,
                "select * from {0} order by id".format(source_table))
            test_result = self.execute_query_expect_success(
                self.client,
                "select * from {0} order by id".format(hive_table))
            verify_query_result_is_equal(test_result.data, base_result.data)
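On a catalog-v2 cluster the test waits for the Hive-created table to become visible
instead of issuing INVALIDATE METADATA. A simplified sketch of what such a wait helper
could do (hypothetical; the real helper in Impala's test framework is more involved):

import time

def wait_for_table_to_appear(self, db_name, table_name, timeout_s):
  # Poll the database until the table is visible to Impala or the timeout expires.
  # Assumes event-based metadata sync (catalog v2) eventually propagates the table.
  deadline = time.time() + timeout_s
  while time.time() < deadline:
    result = self.client.execute("show tables in {0}".format(db_name))
    if table_name in result.data:
      return
    time.sleep(0.2)
  raise Exception("Table {0}.{1} did not appear within {2}s".format(
      db_name, table_name, timeout_s))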
Example no. 4
  def test_insert_parquet_multi_codecs(self, vector, unique_database):
    # Tests that parquet files are written/read correctly when using multiple codecs
    self.run_test_case('QueryTest/insert_parquet_multi_codecs', vector, unique_database,
        multiple_impalad=True)
    base_table = "{0}.{1}".format(unique_database, "t1_default")
    test_table = "{0}.{1}".format(unique_database, "t1_zstd_gzip")
    # select all rows and compare the data in base_table and test_table
    base_result = self.execute_query("select * from {0} order by c3".format(base_table))
    test_result = self.execute_query("select * from {0} order by c3".format(test_table))
    verify_query_result_is_equal(test_result.data, base_result.data)
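The workload driven by run_test_case lives in a separate .test file and is not shown.
The idea behind "multiple codecs" is that different data files of the same Parquet table
are written with different compression_codec settings; a hypothetical inline equivalent
(assumed column names and statements, not the actual .test workload) would be:

# Hypothetical illustration: write one batch of rows with zstd and another with gzip
# into the same Parquet table, so its data files end up using mixed codecs.
self.execute_query("create table {0}.t1_zstd_gzip (c1 int, c2 string, c3 bigint) "
                   "stored as parquet".format(unique_database))
self.execute_query("insert into {0}.t1_zstd_gzip select * from {0}.t1_default"
                   .format(unique_database), {'compression_codec': 'zstd'})
self.execute_query("insert into {0}.t1_zstd_gzip select * from {0}.t1_default"
                   .format(unique_database), {'compression_codec': 'gzip'})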
Example no. 5
    def test_hive_impala_interop(self, vector, unique_database,
                                 cluster_properties):
        # Setup source table.
        source_table = "{0}.{1}".format(unique_database, "t1_source")
        # TODO: Once IMPALA-8721 is fixed add coverage for TimeStamp data type.
        self.execute_query_expect_success(
            self.client,
            "create table {0} as select id, bool_col, tinyint_col, smallint_col, int_col, "
            "bigint_col, float_col, double_col, date_string_col, string_col, year, month "
            "from functional_parquet.alltypes".format(source_table))
        self.execute_query_expect_success(
            self.client,
            "insert into {0}(id) values (7777), (8888), (9999), (11111), (22222), (33333)"
            .format(source_table))

        # Loop through the compression codecs and run interop tests.
        for codec in PARQUET_CODECS:
            # Write data in Impala.
            vector.get_value('exec_option')['compression_codec'] = codec
            impala_table = "{0}.{1}".format(unique_database, "t1_impala")
            self.execute_query_expect_success(
                self.client, "drop table if exists {0}".format(impala_table))
            self.execute_query_expect_success(
                self.client,
                "create table {0} stored as parquet as select * from {1}".
                format(impala_table, source_table),
                vector.get_value('exec_option'))

            # Read the data back from Impala and rewrite it in Hive.
            if (codec == 'none'): codec = 'uncompressed'
            elif (codec == 'zstd:7'): codec = 'zstd'
            hive_table = "{0}.{1}".format(unique_database, "t1_hive")
            self.run_stmt_in_hive(
                "drop table if exists {0}".format(hive_table))
            self.run_stmt_in_hive("set parquet.compression={0};\
          create table {1} stored as parquet as select * from {2}".format(
                codec, hive_table, impala_table))

            # Make sure Impala's metadata is in sync.
            if cluster_properties.is_catalog_v2_cluster():
                self.wait_for_table_to_appear(unique_database,
                                              hive_table,
                                              timeout_s=10)
            else:
                self.client.execute(
                    "invalidate metadata {0}".format(hive_table))

            # Read Hive data in Impala and verify results.
            base_result = self.execute_query_expect_success(
                self.client,
                "select * from {0} order by id".format(source_table))
            test_result = self.execute_query_expect_success(
                self.client,
                "select * from {0} order by id".format(hive_table))
            verify_query_result_is_equal(test_result.data, base_result.data)
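As with TEXT_CODECS earlier, PARQUET_CODECS is defined elsewhere. The remapping of
'none' to 'uncompressed' and 'zstd:7' to 'zstd' before handing the value to Hive
implies a list along these lines (assumed, not verbatim):

# Assumed definition for illustration; the real list may include further codecs
# such as 'lz4'.
PARQUET_CODECS = ['none', 'snappy', 'gzip', 'zstd', 'zstd:7']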
Example no. 6
  def test_hive_impala_interop(self, vector, unique_database, cluster_properties):
    # Setup source table.
    source_table = "{0}.{1}".format(unique_database, "t1_source")
    self.execute_query_expect_success(self.client,
        "create table {0} as select * from functional_parquet.alltypes"
        .format(source_table))
    self.execute_query_expect_success(self.client,
        "insert into {0}(id) values (7777), (8888), (9999), (11111), (22222), (33333)"
        .format(source_table))

    # Loop through the compression codecs and run interop tests.
    for codec in PARQUET_CODECS:
      # Write data in Impala.
      vector.get_value('exec_option')['compression_codec'] = codec
      impala_table = "{0}.{1}".format(unique_database, "t1_impala")
      self.execute_query_expect_success(self.client,
          "drop table if exists {0}".format(impala_table))
      self.execute_query_expect_success(self.client,
          "create table {0} stored as parquet as select * from {1}"
          .format(impala_table, source_table), vector.get_value('exec_option'))

      # Read the data back from Impala and rewrite it in Hive.
      if (codec == 'none'): codec = 'uncompressed'
      elif (codec == 'zstd:7'): codec = 'zstd'
      hive_table = "{0}.{1}".format(unique_database, "t1_hive")
      self.run_stmt_in_hive("drop table if exists {0}".format(hive_table))
      # For Hive 3+, workaround for HIVE-22371 (CTAS puts files in the wrong place) by
      # explicitly creating an external table so that files are in the external warehouse
      # directory. Use external.table.purge=true so that it is equivalent to a Hive 2
      # managed table. Hive 2 stays the same.
      external = ""
      tblproperties = ""
      if HIVE_MAJOR_VERSION >= 3:
        external = "external"
        tblproperties = "TBLPROPERTIES('external.table.purge'='TRUE')"
      self.run_stmt_in_hive("set parquet.compression={0};\
          create {1} table {2} stored as parquet {3} as select * from {4}"
          .format(codec, external, hive_table, tblproperties, impala_table))

      # Make sure Impala's metadata is in sync.
      if cluster_properties.is_catalog_v2_cluster():
        self.wait_for_table_to_appear(unique_database, hive_table, timeout_s=10)
      else:
        self.client.execute("invalidate metadata {0}".format(hive_table))

      # Read Hive data in Impala and verify results.
      base_result = self.execute_query_expect_success(self.client,
          "select * from {0} order by id".format(source_table))
      test_result = self.execute_query_expect_success(self.client,
          "select * from {0} order by id".format(hive_table))
      verify_query_result_is_equal(test_result.data, base_result.data)
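Every example funnels Hive-side DDL through run_stmt_in_hive and parses its raw string
output (the count(*) check in Example no. 2 splits it on newlines). A minimal sketch of
how such a helper could shell out to beeline (hypothetical; the connection URL and the
real helper's error handling are assumptions):

import subprocess

def run_stmt_in_hive(self, stmt, hs2_url='jdbc:hive2://localhost:11050'):
  # Run the statement through beeline against HiveServer2 and return its stdout;
  # csv2 output gives a header line followed by the values, which matches the
  # split("\n")[1] parsing used in the tests above.
  return subprocess.check_output(
      ['beeline', '-u', hs2_url, '--outputformat=csv2', '-e', stmt],
      universal_newlines=True)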