Example #1
    def test_strings_utf8(self, vector, unique_database):
        # Create table
        table_name = "ice_str_utf8"
        qualified_table_name = "%s.%s" % (unique_database, table_name)
        query = 'create table %s (a string) stored as iceberg' % qualified_table_name
        self.client.execute(query)

        # Inserted string data should have UTF8 annotation regardless of query options.
        query = 'insert into %s values ("impala")' % qualified_table_name
        self.execute_query(query, {'parquet_annotate_strings_utf8': False})

        # Copy the created file to the local filesystem and parse metadata
        local_file = '/tmp/iceberg_utf8_test_%s.parq' % random.randint(
            0, 10000)
        LOG.info("test_strings_utf8 local file name: " + local_file)
        hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/data/*.parq' %
                                (unique_database, table_name))
        check_call(['hadoop', 'fs', '-copyToLocal', hdfs_file, local_file])
        metadata = get_parquet_metadata(local_file)

        # Extract SchemaElements corresponding to the table column
        a_schema_element = metadata.schema[1]
        assert a_schema_element.name == 'a'

        # Check that the schema uses the UTF8 annotation
        assert a_schema_element.converted_type == ConvertedType.UTF8

        os.remove(local_file)
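
All of these examples depend on a get_parquet_metadata helper to deserialize the Thrift FileMetaData stored in the Parquet footer. The sketch below shows roughly what such a helper does; the helper name and the parquet.ttypes import path are assumptions and may differ from the actual test utility.

import os
import struct

from thrift.protocol import TCompactProtocol
from thrift.transport import TTransport
# Assumed import path for the parquet-format Thrift bindings; adjust to your build.
from parquet.ttypes import FileMetaData


def read_parquet_file_metadata(filename):
  """Minimal sketch: deserialize the FileMetaData stored in the Parquet footer.
  Footer layout: <metadata> <4-byte little-endian metadata length> <'PAR1' magic>."""
  file_size = os.path.getsize(filename)
  with open(filename, 'rb') as f:
    f.seek(file_size - 8)
    metadata_len, magic = struct.unpack('<i4s', f.read(8))
    assert magic == b'PAR1', 'not a Parquet file: %s' % filename
    f.seek(file_size - 8 - metadata_len)
    transport = TTransport.TMemoryBuffer(f.read(metadata_len))
    protocol = TCompactProtocol.TCompactProtocol(transport)
    metadata = FileMetaData()
    metadata.read(protocol)
    return metadata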
Example #2
    def test_set_column_orders(self, vector, unique_database, tmpdir):
        """Tests that the Parquet writers set FileMetaData::column_orders."""
        source_table = "functional_parquet.alltypessmall"
        target_table = "test_set_column_orders"
        qualified_target_table = "{0}.{1}".format(unique_database,
                                                  target_table)
        hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(
            unique_database, target_table))

        # Create table
        query = "create table {0} like {1} stored as parquet".format(
            qualified_target_table, source_table)
        self.execute_query(query)

        # Insert data
        query = (
            "insert into {0} partition(year, month) select * from {1}").format(
                qualified_target_table, source_table)
        self.execute_query(query)

        # Download hdfs files and verify column orders
        check_call(['hdfs', 'dfs', '-get', hdfs_path, tmpdir.strpath])

        expected_col_orders = [ColumnOrder(TYPE_ORDER=TypeDefinedOrder())] * 11

        for root, subdirs, files in os.walk(tmpdir.strpath):
            for f in files:
                parquet_file = os.path.join(root, str(f))
                file_meta_data = get_parquet_metadata(parquet_file)
                assert file_meta_data.column_orders == expected_col_orders
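
A note on the hardcoded 11 above: it matches the number of non-partition columns written for alltypessmall. If you prefer to derive the count from the file itself, a hedged variant of the check could look like this (the Thrift import path is an assumption; ColumnOrder and TypeDefinedOrder are the same types used in the test).

from parquet.ttypes import ColumnOrder, TypeDefinedOrder


def check_column_orders(file_meta_data):
  """Sketch: verify column_orders without hardcoding the leaf column count."""
  # schema[0] is the root element, so the leaf column count is len(schema) - 1.
  num_leaf_columns = len(file_meta_data.schema) - 1
  expected = [ColumnOrder(TYPE_ORDER=TypeDefinedOrder())] * num_leaf_columns
  assert file_meta_data.column_orders == expected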
Example #3
  def test_sorting_columns(self, vector, unique_database, tmpdir):
    """Tests that RowGroup::sorting_columns gets populated when the table has SORT BY
    columns."""
    source_table = "functional_parquet.alltypessmall"
    target_table = "test_write_sorting_columns"
    qualified_target_table = "{0}.{1}".format(unique_database, target_table)
    hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(unique_database,
        target_table))

    # Create table
    query = "create table {0} sort by (int_col, id) like {1} stored as parquet".format(
        qualified_target_table, source_table)
    self.execute_query(query)

    # Insert data
    query = ("insert into {0} partition(year, month) select * from {1}").format(
        qualified_target_table, source_table)
    self.execute_query(query)

    # Download hdfs files and extract rowgroup metadata
    row_groups = []
    check_call(['hdfs', 'dfs', '-get', hdfs_path, tmpdir.strpath])

    for root, subdirs, files in os.walk(tmpdir.strpath):
      for f in files:
        parquet_file = os.path.join(root, str(f))
        file_meta_data = get_parquet_metadata(parquet_file)
        row_groups.extend(file_meta_data.row_groups)

    # Verify that the files have the sorted_columns set
    expected = [SortingColumn(4, False, False), SortingColumn(0, False, False)]
    for row_group in row_groups:
      assert row_group.sorting_columns == expected
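
The expected SortingColumn entries above are positional: the fields are (column_idx, descending, nulls_first), and indexes 4 and 0 correspond to int_col and id in the flat schema of alltypessmall. A small hedged helper to map them back to names, assuming the same get_parquet_metadata output, might look like:

def sorting_column_names(file_meta_data):
  """Sketch: resolve RowGroup::sorting_columns to column names. For a flat schema the
  leaf columns are schema[1:], and SortingColumn.column_idx indexes into them."""
  leaf_names = [s.name for s in file_meta_data.schema[1:]]
  return [[leaf_names[sc.column_idx] for sc in (rg.sorting_columns or [])]
          for rg in file_meta_data.row_groups]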
Example #4
  def test_sorting_columns(self, vector, unique_database, tmpdir):
    """Tests that RowGroup::sorting_columns gets populated when specifying a sortby()
    insert hint."""
    source_table = "functional_parquet.alltypessmall"
    target_table = "test_write_sorting_columns"
    qualified_target_table = "{0}.{1}".format(unique_database, target_table)
    hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(unique_database,
        target_table))

    # Create table
    # TODO: Simplify once IMPALA-4167 (insert hints in CTAS) has been fixed.
    query = "create table {0} like {1} stored as parquet".format(qualified_target_table,
        source_table)
    self.execute_query(query)

    # Insert data
    query = ("insert into {0} partition(year, month) /* +sortby(int_col, id) */ "
        "select * from {1}").format(qualified_target_table, source_table)
    self.execute_query(query)

    # Download hdfs files and extract rowgroup metadata
    row_groups = []
    check_call(['hdfs', 'dfs', '-get', hdfs_path, tmpdir.strpath])

    for root, subdirs, files in os.walk(tmpdir.strpath):
      for f in files:
        parquet_file = os.path.join(root, str(f))
        file_meta_data = get_parquet_metadata(parquet_file)
        row_groups.extend(file_meta_data.row_groups)

    # Verify that the files have the sorted_columns set
    expected = [SortingColumn(4, False, False), SortingColumn(0, False, False)]
    for row_group in row_groups:
      assert row_group.sorting_columns == expected
Example #5
  def _get_row_group_stats_from_file(self, parquet_file):
    """Returns a list of statistics for each row group in file 'parquet_file'. The result
    is a two-dimensional list, containing stats by row group and column."""
    file_meta_data = get_parquet_metadata(parquet_file)
    # We only support flat schemas, the additional element is the root element.
    schemas = file_meta_data.schema[1:]
    file_stats = []
    for row_group in file_meta_data.row_groups:
      num_columns = len(row_group.columns)
      assert num_columns == len(schemas)
      column_stats = [c.meta_data.statistics for c in row_group.columns]
      file_stats.append(self._decode_row_group_stats(schemas, column_stats))

    return file_stats
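
The _decode_row_group_stats helper used above is not shown here; it is assumed to turn the raw min/max byte strings in each column's Statistics into Python values according to the column's physical type. A rough, incomplete sketch of that kind of decoding (the Type enum import path is an assumption):

import struct

# Assumed import of the Parquet physical type enum from the Thrift bindings.
from parquet.ttypes import Type


def decode_stat_value(schema_element, raw_value):
  """Sketch: decode one raw Statistics min/max value for a column. Only a few physical
  types are covered; real code also needs BOOLEAN, FLOAT, INT96 and logical types."""
  if raw_value is None:
    return None
  if schema_element.type == Type.INT32:
    return struct.unpack('<i', raw_value)[0]
  if schema_element.type == Type.INT64:
    return struct.unpack('<q', raw_value)[0]
  if schema_element.type == Type.DOUBLE:
    return struct.unpack('<d', raw_value)[0]
  if schema_element.type == Type.BYTE_ARRAY:
    return raw_value  # raw bytes; decode as UTF-8 if the column is annotated as UTF8
  raise NotImplementedError('physical type %s not handled in this sketch'
                            % schema_element.type)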
Example #7
    def _get_row_group_from_file(self, parquet_file):
        """Returns namedtuples that contain the schema, stats, offset_index, column_index,
    and page_headers for each column in the first row group in file 'parquet_file'. Fails
    if the file contains multiple row groups.
    """
        ColumnInfo = namedtuple('ColumnInfo', [
            'schema', 'stats', 'offset_index', 'column_index', 'page_headers'
        ])

        file_meta_data = get_parquet_metadata(parquet_file)
        assert len(file_meta_data.row_groups) == 1
        # We only support flat schemas, the additional element is the root element.
        schemas = file_meta_data.schema[1:]
        row_group = file_meta_data.row_groups[0]
        assert len(schemas) == len(row_group.columns)
        row_group_index = []
        with open(parquet_file) as file_handle:
            for column, schema in zip(row_group.columns, schemas):
                column_index_offset = column.column_index_offset
                column_index_length = column.column_index_length
                column_index = None
                if column_index_offset and column_index_length:
                    column_index = read_serialized_object(
                        ColumnIndex, file_handle, column_index_offset,
                        column_index_length)
                column_meta_data = column.meta_data
                stats = None
                if column_meta_data:
                    stats = column_meta_data.statistics

                offset_index_offset = column.offset_index_offset
                offset_index_length = column.offset_index_length
                offset_index = None
                page_headers = []
                if offset_index_offset and offset_index_length:
                    offset_index = read_serialized_object(
                        OffsetIndex, file_handle, offset_index_offset,
                        offset_index_length)
                    for page_loc in offset_index.page_locations:
                        page_header = read_serialized_object(
                            PageHeader, file_handle, page_loc.offset,
                            page_loc.compressed_page_size)
                        page_headers.append(page_header)

                column_info = ColumnInfo(schema, stats, offset_index,
                                         column_index, page_headers)
                row_group_index.append(column_info)
        return row_group_index
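
Example #7 relies on a read_serialized_object helper to deserialize a Thrift object (ColumnIndex, OffsetIndex, PageHeader) from a known offset and length in the file. A minimal sketch of such a helper, assuming the standard thrift Python package and a file handle opened in binary mode:

from thrift.protocol import TCompactProtocol
from thrift.transport import TTransport


def read_serialized_object(thrift_class, file_handle, offset, length):
  """Sketch: read 'length' bytes at 'offset' from an open Parquet file and deserialize
  them into an instance of the given Thrift-generated class."""
  file_handle.seek(offset)
  transport = TTransport.TMemoryBuffer(file_handle.read(length))
  protocol = TCompactProtocol.TCompactProtocol(transport)
  obj = thrift_class()
  obj.read(protocol)
  return obj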
Example #8
  def _get_first_row_group_bloom_filters(self, parquet_file):
    # Other functions take a filename relative to $IMPALA_HOME and prepend the path of
    # $IMPALA_HOME themselves, but this one does not, so we have to prepend it here.
    filename = os.path.join(os.environ['IMPALA_HOME'], parquet_file)
    file_meta_data = get_parquet_metadata(filename)
    # We only support flat schemas, the additional element is the root element.
    schemas = file_meta_data.schema[1:]
    # We are only interested in the first row group.
    row_group = file_meta_data.row_groups[0]
    assert len(schemas) == len(row_group.columns)
    col_to_bloom_filter = dict()
    with open(filename) as file_handle:
      for i, column in enumerate(row_group.columns):
        column_meta_data = column.meta_data
        if column_meta_data and column_meta_data.bloom_filter_offset:
          bloom_filter = self._try_read_bloom_filter(
              file_handle, column_meta_data.bloom_filter_offset)
          if bloom_filter:
            col_to_bloom_filter[i] = bloom_filter
    return col_to_bloom_filter
Example #9
    def get_schema_elements():
      # Copy the created file to the local filesystem and parse metadata
      local_file = '/tmp/utf8_test_%s.parq' % random.randint(0, 10000)
      LOG.info("test_annotate_utf8_option local file name: " + local_file)
      hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/*.parq'
          % (unique_database, TABLE_NAME))
      check_call(['hadoop', 'fs', '-copyToLocal', hdfs_file, local_file])
      metadata = get_parquet_metadata(local_file)

      # Extract SchemaElements corresponding to the table columns
      a_schema_element = metadata.schema[1]
      assert a_schema_element.name == 'a'
      b_schema_element = metadata.schema[2]
      assert b_schema_element.name == 'b'
      c_schema_element = metadata.schema[3]
      assert c_schema_element.name == 'c'
      d_schema_element = metadata.schema[4]
      assert d_schema_element.name == 'd'

      os.remove(local_file)
      return a_schema_element, b_schema_element, c_schema_element, d_schema_element