def _apply_row_group_selector(self, dataset, rowgroup_selector, filtered_row_group_indexes):
    """Keep only the row group indexes that the given selector picks.

    The row group indexes stored in the dataset metadata are loaded, checked
    against the index names the selector asks for, and then handed to the
    selector. The incoming candidates are filtered by membership in the
    selector's result, so their relative order is preserved.

    :param dataset: dataset object forwarded to
        ``rowgroup_indexing.get_row_group_indexes``.
    :param rowgroup_selector: selector instance; must derive from
        ``RowGroupSelectorBase``.
    :param filtered_row_group_indexes: iterable of candidate row group
        indexes to narrow down.
    :return: a new list holding the candidates that are also in the
        selector's result, in their original order.
    :raises ValueError: if ``rowgroup_selector`` is not a
        ``RowGroupSelectorBase`` instance, or if it requires an index name
        that the dataset metadata does not provide.
    """
    if not isinstance(rowgroup_selector, RowGroupSelectorBase):
        raise ValueError('rowgroup_selector parameter is expected to be derived from RowGroupSelectorBase')

    # Indexes recorded in the dataset metadata, looked up by index name.
    indexes_by_name = rowgroup_indexing.get_row_group_indexes(dataset)

    # Every index name the selector depends on has to be present.
    wanted_names = rowgroup_selector.get_index_names()
    all_present = set(wanted_names) <= set(indexes_by_name.keys())
    if not all_present:
        raise ValueError(
            'Some of required indexes {} are not available in {}'.format(
                wanted_names, list(indexes_by_name.keys())))

    chosen = rowgroup_selector.select_row_groups(indexes_by_name)

    # Walk the candidates (not the selection) so the caller's ordering wins.
    kept = []
    for candidate in filtered_row_group_indexes:
        if candidate in chosen:
            kept.append(candidate)
    return kept
'Current choices are libhdfs (java through JNI) or libhdfs3 (C++)') args = parser.parse_args() if args.dataset_url and args.dataset_url[-1] == '/': args.dataset_url = args.dataset_url[:-1] # Create pyarrow file system resolver = FilesystemResolver(args.dataset_url, hdfs_driver=args.hdfs_driver) dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(), validate_schema=False) print_all = not args.schema and not args.index if args.schema or print_all: print('*** Schema from dataset metadata ***') print((dataset_metadata.get_schema(dataset))) if args.index or print_all: index_dict = rowgroup_indexing.get_row_group_indexes(dataset) print('*** Row group indexes from dataset metadata ***') for index_name in index_dict: print(('Index: {}'.format(index_name))) if args.skip_index is None or index_name not in args.skip_index: for field_value in index_dict[index_name].indexed_values: print(' -- {}({})'.format(field_value, len(index_dict[index_name].get_row_group_indexes(field_value)))) if args.print_values: print(index_dict[index_name].get_row_group_indexes(field_value)) else: print(' (skipped)')