Example No. 1
    def _ctas_and_get_metadata(self,
                               vector,
                               unique_database,
                               tmp_dir,
                               source_table,
                               table_name="test_hdfs_parquet_table_writer"):
        """CTAS 'source_table' into a Parquet table and returns its Parquet metadata."""
        qualified_table_name = "{0}.{1}".format(unique_database, table_name)
        hdfs_path = get_fs_path('/test-warehouse/{0}.db/{1}/'.format(
            unique_database, table_name))

        # Setting num_nodes = 1 ensures that the query is executed on the coordinator,
        # resulting in a single parquet file being written.
        query = (
            "create table {0} stored as parquet as select * from {1}").format(
                qualified_table_name, source_table)
        vector.get_value('exec_option')['num_nodes'] = 1
        self.execute_query_expect_success(self.client, query,
                                          vector.get_value('exec_option'))

        file_metadata_list = get_parquet_metadata_from_hdfs_folder(
            hdfs_path, tmp_dir)
        assert len(file_metadata_list) == 1
        assert file_metadata_list[0] is not None
        return file_metadata_list[0]
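A typical caller just forwards the test fixtures and then inspects the returned FileMetaData thrift struct. A minimal sketch of such a caller (the test name, source table, and the final assertions are illustrative assumptions, not part of the example above):

    def test_ctas_footer_smoke(self, vector, unique_database, tmpdir):
        """Hypothetical caller: CTAS a small table and sanity-check the Parquet footer."""
        # 'tmpdir' is the pytest fixture; its 'strpath' is used as the scratch directory
        # for the downloaded Parquet file.
        file_metadata = self._ctas_and_get_metadata(
            vector, unique_database, tmpdir.strpath, "functional.alltypestiny")
        # num_rows and row_groups are standard fields of the FileMetaData struct
        # returned by the helper.
        assert file_metadata.num_rows > 0
        assert len(file_metadata.row_groups) >= 1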
Example No. 2
  def test_set_column_orders(self, vector, unique_database, tmpdir):
    """Tests that the Parquet writers set FileMetaData::column_orders."""
    source_table = "functional_parquet.alltypessmall"
    target_table = "test_set_column_orders"
    qualified_target_table = "{0}.{1}".format(unique_database, target_table)
    hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(unique_database,
        target_table))

    # Create table
    query = "create table {0} like {1} stored as parquet".format(qualified_target_table,
        source_table)
    self.execute_query(query)

    # Insert data
    query = ("insert into {0} partition(year, month) select * from {1}").format(
        qualified_target_table, source_table)
    self.execute_query(query)

    # Download hdfs files and verify column orders
    file_metadata_list = get_parquet_metadata_from_hdfs_folder(hdfs_path, tmpdir.strpath)

    expected_col_orders = [ColumnOrder(TYPE_ORDER=TypeDefinedOrder())] * 11

    for file_metadata in file_metadata_list:
      assert file_metadata.column_orders == expected_col_orders
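ColumnOrder and TypeDefinedOrder are the structs generated from parquet.thrift; when set, FileMetaData::column_orders has one entry per leaf column, and alltypessmall has 11 non-partition columns, which is why the expected list holds 11 identical entries. As an illustration, the hard-coded 11 could also be derived from each file's own flat schema; a hedged sketch of such an extra check (not part of the original test):

    for file_metadata in file_metadata_list:
      # The first SchemaElement is the root of the flat schema list; alltypessmall has
      # no nested columns, so every remaining element is a leaf column.
      num_leaf_columns = len(file_metadata.schema) - 1
      assert len(file_metadata.column_orders) == num_leaf_columns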
Example No. 3
    def test_set_column_orders(self, vector, unique_database, tmpdir):
        """Tests that the Parquet writers set FileMetaData::column_orders."""
        source_table = "functional_parquet.alltypessmall"
        target_table = "test_set_column_orders"
        qualified_target_table = "{0}.{1}".format(unique_database,
                                                  target_table)
        hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(
            unique_database, target_table))

        # Create table
        query = "create table {0} like {1} stored as parquet".format(
            qualified_target_table, source_table)
        self.execute_query(query)

        # Insert data
        query = (
            "insert into {0} partition(year, month) select * from {1}").format(
                qualified_target_table, source_table)
        self.execute_query(query)

        # Download hdfs files and verify column orders
        file_metadata_list = get_parquet_metadata_from_hdfs_folder(
            hdfs_path, tmpdir.strpath)

        expected_col_orders = [ColumnOrder(TYPE_ORDER=TypeDefinedOrder())] * 11

        for file_metadata in file_metadata_list:
            assert file_metadata.column_orders == expected_col_orders
Example No. 4
  def test_sorting_columns(self, vector, unique_database, tmpdir):
    """Tests that RowGroup::sorting_columns gets populated when the table has SORT BY
    columns."""
    source_table = "functional_parquet.alltypessmall"
    target_table = "test_write_sorting_columns"
    qualified_target_table = "{0}.{1}".format(unique_database, target_table)
    hdfs_path = get_fs_path("/test-warehouse/{0}.db/{1}/".format(unique_database,
        target_table))

    # Create table
    query = "create table {0} sort by (int_col, id) like {1} stored as parquet".format(
        qualified_target_table, source_table)
    self.execute_query(query)

    # Insert data
    query = ("insert into {0} partition(year, month) select * from {1}").format(
        qualified_target_table, source_table)
    self.execute_query(query)

    # Download hdfs files and extract rowgroup metadata
    file_metadata_list = get_parquet_metadata_from_hdfs_folder(hdfs_path, tmpdir.strpath)
    row_groups = []

    for file_metadata in file_metadata_list:
      row_groups.extend(file_metadata.row_groups)

    # Verify that the files have the sorted_columns set
    expected = [SortingColumn(4, False, False), SortingColumn(0, False, False)]
    for row_group in row_groups:
      assert row_group.sorting_columns == expected
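SortingColumn is the thrift struct defined in parquet.thrift with the fields (column_idx, descending, nulls_first); indices 4 and 0 are int_col and id in the alltypessmall schema, matching the SORT BY (int_col, id) clause, with ascending order and nulls last. The same expectation spelled out with keyword arguments, as an illustrative equivalent rather than code from the original test:

    # Assumes the standard thrift-generated constructor of SortingColumn.
    expected = [
      SortingColumn(column_idx=4, descending=False, nulls_first=False),  # int_col
      SortingColumn(column_idx=0, descending=False, nulls_first=False),  # id
    ]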
Example No. 5
  def _get_row_group_stats_from_hdfs_folder(self, hdfs_path, tmp_dir):
    """Returns a list of statistics for each row group in all parquet files i 'hdfs_path'.
    'tmp_dir' needs to be supplied by the caller and will be used to store temporary
    files. The caller is responsible for cleaning up 'tmp_dir'. The result is a
    two-dimensional list, containing stats by row group and column."""
    row_group_stats = []

    file_metadata_list = get_parquet_metadata_from_hdfs_folder(hdfs_path, tmp_dir)
    for file_metadata in file_metadata_list:
      row_group_stats.extend(self._get_row_group_stats_from_file_metadata(file_metadata))

    return row_group_stats
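The helper above delegates to _get_row_group_stats_from_file_metadata, which is not shown in this listing. A plausible sketch of it, based only on the standard parquet.thrift layout (RowGroup.columns is a list of ColumnChunk structs whose meta_data.statistics field holds the per-column Statistics), and not necessarily identical to the real implementation:

  def _get_row_group_stats_from_file_metadata(self, file_metadata):
    """Sketch: collect the per-column Statistics struct of every row group."""
    row_group_stats = []
    for row_group in file_metadata.row_groups:
      # Each entry is a list of Statistics objects, one per column chunk, giving the
      # two-dimensional stats-by-row-group-and-column structure described above.
      row_group_stats.append([col.meta_data.statistics for col in row_group.columns])
    return row_group_stats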
Example No. 6
  def _ctas_and_get_metadata(self, vector, unique_database, tmp_dir, source_table,
                             table_name="test_hdfs_parquet_table_writer"):
    """CTAS 'source_table' into a Parquet table and returns its Parquet metadata."""
    qualified_table_name = "{0}.{1}".format(unique_database, table_name)
    hdfs_path = get_fs_path('/test-warehouse/{0}.db/{1}/'.format(unique_database,
                                                                 table_name))

    # Setting num_nodes = 1 ensures that the query is executed on the coordinator,
    # resulting in a single parquet file being written.
    query = ("create table {0} stored as parquet as select * from {1}").format(
      qualified_table_name, source_table)
    vector.get_value('exec_option')['num_nodes'] = 1
    self.execute_query_expect_success(self.client, query,
                                      vector.get_value('exec_option'))

    file_metadata_list = get_parquet_metadata_from_hdfs_folder(hdfs_path, tmp_dir)
    assert len(file_metadata_list) == 1
    assert file_metadata_list[0] is not None
    return file_metadata_list[0]