Ejemplo n.º 1
0
    def _apply_row_group_selector(self, dataset, rowgroup_selector,
                                  filtered_row_group_indexes):
        """Narrow a list of row group indexes down to the ones picked by a rowgroup selector.

        :param dataset: dataset object passed to ``rowgroup_indexing.get_row_group_indexes`` in order to load
            the row group indexes.
        :param rowgroup_selector: a ``RowGroupSelectorBase`` instance; it reports the index names it needs and
            selects row groups out of the loaded indexes.
        :param filtered_row_group_indexes: candidate row group indexes. Their relative order is kept in the
            result.
        :return: a new list holding the entries of ``filtered_row_group_indexes`` that the selector picked.
        :raises ValueError: if ``rowgroup_selector`` is not a ``RowGroupSelectorBase``, or if it requires an
            index name that is not among the loaded indexes.
        """
        selector_is_valid = isinstance(rowgroup_selector, RowGroupSelectorBase)
        if not selector_is_valid:
            raise ValueError('rowgroup_selector parameter is expected to be derived from RowGroupSelectorBase')

        # Indexes are loaded from the dataset metadata, keyed by index name
        indexes_by_name = rowgroup_indexing.get_row_group_indexes(dataset)

        # Every index name the selector asks for must be present among the loaded ones
        needed_names = rowgroup_selector.get_index_names()
        unavailable_names = set(needed_names) - set(indexes_by_name.keys())
        if unavailable_names:
            message = 'Some of required indexes {} are not available in {}'.format(
                needed_names, list(indexes_by_name.keys()))
            raise ValueError(message)

        chosen_row_groups = rowgroup_selector.select_row_groups(indexes_by_name)

        # Walk the incoming list (not the selection) so that the caller's ordering is preserved
        kept_row_groups = []
        for row_group_index in filtered_row_group_indexes:
            if row_group_index in chosen_row_groups:
                kept_row_groups.append(row_group_index)
        return kept_row_groups
Ejemplo n.º 2
0
                             'Current choices are libhdfs (java through JNI) or libhdfs3 (C++)')

    args = parser.parse_args()

    if args.dataset_url and args.dataset_url[-1] == '/':
        args.dataset_url = args.dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(args.dataset_url, hdfs_driver=args.hdfs_driver)
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    print_all = not args.schema and not args.index
    if args.schema or print_all:
        print('*** Schema from dataset metadata ***')
        print((dataset_metadata.get_schema(dataset)))

    if args.index or print_all:
        index_dict = rowgroup_indexing.get_row_group_indexes(dataset)
        print('*** Row group indexes from dataset metadata ***')
        for index_name in index_dict:
            print(('Index: {}'.format(index_name)))
            if args.skip_index is None or index_name not in args.skip_index:
                for field_value in index_dict[index_name].indexed_values:
                    print('  -- {}({})'.format(field_value,
                                               len(index_dict[index_name].get_row_group_indexes(field_value))))
                    if args.print_values:
                        print(index_dict[index_name].get_row_group_indexes(field_value))
            else:
                print('  (skipped)')