def test_arrow_schema_convertion_fail(self):
    """An arrow schema with an unsupported column type must make from_arrow_schema raise ValueError."""
    bad_schema = pa.schema([
        pa.field('string', pa.string()),
        pa.field('binary', pa.binary(10)),
    ])

    with self.assertRaises(ValueError) as raised:
        Unischema.from_arrow_schema(bad_schema)

    # The error must clearly point at the unsupported column type.
    assert 'Cannot auto-create unischema due to unsupported column type' in str(raised.exception)
def test_arrow_schema_convertion_fail():
    """A dataset whose schema contains an unsupported type (float16) must be rejected."""
    schema_with_float16 = pa.schema([pa.field('list_of_int', pa.float16())])
    dataset = _mock_parquet_dataset([], schema_with_float16)

    expected_message = 'Cannot auto-create unischema due to unsupported column type'
    with pytest.raises(ValueError, match=expected_message):
        Unischema.from_arrow_schema(dataset)
def test_arrow_schema_convertion_fail(self):
    """A list-typed arrow column is unsupported and must trigger a ValueError."""
    schema_with_list = pa.schema([pa.field('list_of_int', pa.list_(pa.int8()))])
    dataset = _mock_parquet_dataset([], schema_with_list)

    with self.assertRaises(ValueError) as raised:
        Unischema.from_arrow_schema(dataset)

    assert 'Cannot auto-create unischema due to unsupported column type' in str(raised.exception)
def test_arrow_schema_convertion():
    """Every supported scalar arrow type converts into a unischema field; column order is preserved."""
    fields = [
        pa.field('string', pa.string()),
        pa.field('int8', pa.int8()),
        pa.field('int16', pa.int16()),
        pa.field('int32', pa.int32()),
        pa.field('int64', pa.int64()),
        pa.field('float', pa.float32()),
        pa.field('double', pa.float64()),
        pa.field('bool', pa.bool_(), False),
        pa.field('fixed_size_binary', pa.binary(10)),
        pa.field('variable_size_binary', pa.binary()),
        pa.field('decimal', pa.decimal128(3, 4)),
        pa.field('timestamp_s', pa.timestamp('s')),
        pa.field('timestamp_ns', pa.timestamp('ns')),
        pa.field('date_32', pa.date32()),
        pa.field('date_64', pa.date64())
    ]
    mock_dataset = _mock_parquet_dataset([], pa.schema(fields))

    unischema = Unischema.from_arrow_schema(mock_dataset)

    for field in fields:
        converted = getattr(unischema, field.name)
        assert converted.name == field.name
        assert converted.codec is None
        if field.name == 'bool':
            # 'bool' is the only column declared non-nullable above.
            assert not converted.nullable
        else:
            assert converted.nullable

    # The resulting unischema must keep the original column order.
    assert [field.name for field in fields] == list(unischema.fields.keys())
def test_arrow_schema_convertion(self):
    """All supported scalar arrow types convert into ScalarCodec-backed unischema fields.

    Bug fix: the original schema listed ``pa.field('timestamp_ns', pa.timestamp('ns'))``
    twice — the accidental duplicate has been removed.
    """
    arrow_schema = pa.schema([
        pa.field('string', pa.string()),
        pa.field('int8', pa.int8()),
        pa.field('int16', pa.int16()),
        pa.field('int32', pa.int32()),
        pa.field('int64', pa.int64()),
        pa.field('float', pa.float32()),
        pa.field('double', pa.float64()),
        pa.field('bool', pa.bool_(), False),
        pa.field('fixed_size_binary', pa.binary(10)),
        pa.field('variable_size_binary', pa.binary()),
        pa.field('decimal', pa.decimal128(3, 4)),
        pa.field('timestamp_s', pa.timestamp('s')),
        pa.field('timestamp_ns', pa.timestamp('ns')),
        pa.field('date_32', pa.date32()),
        pa.field('date_64', pa.date64()),
    ])
    mock_dataset = _mock_parquet_dataset([], arrow_schema)

    unischema = Unischema.from_arrow_schema(mock_dataset)
    for name in arrow_schema.names:
        field = getattr(unischema, name)
        assert field.name == name
        assert isinstance(field.codec, ScalarCodec)
        if name == 'bool':
            # 'bool' is declared non-nullable in the schema above.
            assert not field.nullable
        else:
            assert field.nullable
def test_arrow_schema_convertion_with_int_partitions():
    """A partition key whose values are all integer strings maps to an int64 unischema field."""
    arrow_schema = pa.schema([pa.field('int8', pa.int8())])
    partitions = [pq.PartitionSet('part_name', ['0', '1', '2'])]
    dataset = _mock_parquet_dataset(partitions, arrow_schema)

    unischema = Unischema.from_arrow_schema(dataset)
    assert np.int64 == unischema.part_name.numpy_dtype
def infer_or_load_unischema(dataset):
    """Try to recover Unischema object stored by ``materialize_dataset`` function.

    If it can not be loaded, infer the Unischema from the native Parquet schema instead.

    :param dataset: a ``pyarrow.parquet.ParquetDataset`` instance.
    :return: a Unischema instance, either loaded from Petastorm metadata or inferred from
        the native Parquet schema.
    """
    try:
        return get_schema(dataset)
    except PetastormMetadataError:
        # Bug fix: the '%s' placeholder previously had no corresponding argument, which makes
        # the logging module report a string-formatting error instead of emitting this message.
        logger.info('Failed loading Unischema from metadata in %s. Assuming the dataset was not created with '
                    'Petastorm. Will try to construct from native Parquet schema.', dataset.paths)
        return Unischema.from_arrow_schema(dataset)
def test_arrow_schema_convertion_ignore():
    """With ``omit_unsupported_fields=True``, unsupported columns are silently dropped.

    Improvement: the original test created two unsupported columns but only verified that
    'list_of_int' was omitted; 'struct' is equally unsupported and is now asserted as well.
    """
    arrow_schema = pa.schema([
        pa.field('list_of_int', pa.float16()),
        pa.field('struct', pa.struct([pa.field('a', pa.string()), pa.field('b', pa.int32())])),
    ])
    mock_dataset = _mock_parquet_dataset([], arrow_schema)

    unischema = Unischema.from_arrow_schema(mock_dataset, omit_unsupported_fields=True)
    # Both columns use unsupported arrow types, so neither should appear on the unischema.
    assert not hasattr(unischema, 'list_of_int')
    assert not hasattr(unischema, 'struct')
def test_arrow_schema_convertion_with_partitions(self):
    """A partition column is exposed as a string-typed unischema field."""
    dataset = _mock_parquet_dataset(['part_name'], pa.schema([pa.field('int8', pa.int8())]))

    unischema = Unischema.from_arrow_schema(dataset)
    spark_type_name = unischema.part_name.codec.spark_dtype().typeName()
    assert spark_type_name == 'string'
def test_arrow_schema_arrow_1644_list_of_struct():
    """A list-of-struct column is skipped during conversion while scalar columns survive (ARROW-1644)."""
    struct_type = pa.struct([pa.field('a', pa.string()), pa.field('b', pa.int32())])
    arrow_schema = pa.schema([
        pa.field('id', pa.string()),
        pa.field('list_of_struct', pa.list_(struct_type)),
    ])
    dataset = _mock_parquet_dataset([], arrow_schema)

    unischema = Unischema.from_arrow_schema(dataset)
    assert unischema.id.name == 'id'
    assert not hasattr(unischema, 'list_of_struct')
def test_arrow_schema_convertion(self):
    """Scalar arrow types convert into ScalarCodec-backed unischema fields with correct nullability."""
    fields = [
        pa.field('string', pa.string()),
        pa.field('int8', pa.int8()),
        pa.field('int16', pa.int16()),
        pa.field('int32', pa.int32()),
        pa.field('int64', pa.int64()),
        pa.field('float', pa.float32()),
        pa.field('double', pa.float64()),
        pa.field('bool', pa.bool_(), False),
    ]

    unischema = Unischema.from_arrow_schema(pa.schema(fields))

    for field in fields:
        converted = getattr(unischema, field.name)
        assert converted.name == field.name
        assert isinstance(converted.codec, ScalarCodec)
        if field.name == 'bool':
            # Declared non-nullable in the schema above.
            assert not converted.nullable
        else:
            assert converted.nullable
def __init__(self, pyarrow_filesystem, dataset_path, schema_fields=None,
             shuffle_row_groups=True, shuffle_row_drop_partitions=1,
             predicate=None, rowgroup_selector=None, reader_pool=None, num_epochs=1,
             cur_shard=None, shard_count=None, cache=None, infer_schema=False):
    """Initializes a reader object.

    :param pyarrow_filesystem: An instance of ``pyarrow.FileSystem`` that will be used. If not specified,
        then a default one will be selected based on the url (only for ``hdfs://`` or ``file://``; for
        ``s3://`` support, use ``make_reader``). The default hdfs driver is ``libhdfs3``. If you want to
        to use ``libhdfs``, use
        ``pyarrow_filesystem=pyarrow.hdfs.connect('hdfs:///some/path', driver='libhdfs')``.
    :param dataset_path: filepath to a parquet directory on the specified filesystem.
        e.g. ``'/user/yevgeni/parquet8'``, or ``'/tmp/mydataset'``.
    :param schema_fields: Either list of unischema fields to subset, or ``None`` to read all fields.
        OR an NGram object, then it will return an NGram of the specified properties.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of predicate object to filter rows to be returned by reader.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param reader_pool: parallelization pool. ``ThreadPool(10)`` (10 threads) is used by default.
        This pool is a custom implementation used to parallelize reading data from the dataset.
        Any object from workers_pool package can be used
        (e.g. :class:`petastorm.workers_pool.process_pool.ProcessPool`).
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting
        ``num_epochs`` to ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number used. Each reader instance should
        pass in a unique shard number in the range ``[0, shard_count)``.
        ``shard_count`` must be supplied as well. Defaults to None
    :param shard_count: An int denoting the number of shard partitions there are. Defaults to None
    :param cache: An object conforming to :class:`.CacheBase` interface. Before loading row groups from a parquet
        file the Reader will attempt to load these values from cache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage
        to store entire dataset (or a partition of a dataset if shards are used). By default, use the
        :class:`.NullCache` implementation.
    :param infer_schema: If True, the Unischema is constructed from the native Parquet schema of the first
        dataset piece instead of being loaded from Petastorm metadata (presumably for datasets not created
        with ``materialize_dataset`` — TODO confirm).
    """

    # High-level flow of this constructor:
    # 1. Open the parquet storage (dataset)
    # 2. Get a list of all groups
    # 3. Filter rowgroups
    #    a. predicates
    #    b. row-group selector (our indexing mechanism)
    #    c. partition: used to get a subset of data for distributed training
    # 4. Create a rowgroup ventilator object
    # 5. Start workers pool

    # schema_fields may be: None (read everything), an iterable of fields, or an NGram spec.
    if not (isinstance(schema_fields, collections.Iterable) or isinstance(schema_fields, NGram)
            or schema_fields is None):
        raise ValueError("""Fields must be either None, an iterable collection of Unischema fields or an NGram object.""")

    self.ngram = schema_fields if isinstance(schema_fields, NGram) else None

    # Row-drop partitioning splits ngram windows; without timestamp overlap this combination
    # is not supported.
    if self.ngram and not self.ngram.timestamp_overlap and shuffle_row_drop_partitions > 1:
        raise NotImplementedError('Using timestamp_overlap=False is not implemented with'
                                  ' shuffle_options.shuffle_row_drop_partitions > 1')

    cache = cache or NullCache()

    self._workers_pool = reader_pool or ThreadPool(10)

    # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
    self.dataset = pq.ParquetDataset(dataset_path, filesystem=pyarrow_filesystem,
                                     validate_schema=False)

    if infer_schema:
        # If inferring schema, just retrieve the schema from a file of the dataset
        meta = self.dataset.pieces[0].get_metadata(self.dataset.fs.open)
        arrow_schema = meta.schema.to_arrow_schema()
        stored_schema = Unischema.from_arrow_schema(arrow_schema)
    else:
        # Otherwise, get the stored schema
        stored_schema = dataset_metadata.get_schema(self.dataset)

    # Make a schema view (a view is a Unischema containing only a subset of fields
    # Will raise an exception if invalid schema fields are in schema_fields
    fields = schema_fields if isinstance(schema_fields, collections.Iterable) else None
    self.schema = stored_schema.create_schema_view(fields) if fields else stored_schema

    # 2. Get a list of all groups
    row_groups = dataset_metadata.load_row_groups(self.dataset, infer_schema)

    # 3. Filter rowgroups (predicate, rowgroup selector, and shard partitioning)
    filtered_row_group_indexes, worker_predicate = self._filter_row_groups(self.dataset, row_groups, predicate,
                                                                           rowgroup_selector, cur_shard,
                                                                           shard_count)

    # 4. Create a rowgroup ventilator object that feeds rowgroup indexes to the worker pool.
    normalized_shuffle_row_drop_partitions = \
        self._normalize_shuffle_options(shuffle_row_drop_partitions, self.dataset)
    ventilator = self._create_ventilator(filtered_row_group_indexes, shuffle_row_groups,
                                         normalized_shuffle_row_drop_partitions, num_epochs, worker_predicate,
                                         self._workers_pool.workers_count + _VENTILATE_EXTRA_ROWGROUPS)

    # 5. Start workers pool
    self._workers_pool.start(ReaderWorker,
                             (pyarrow_filesystem, dataset_path, self.schema, self.ngram, row_groups, cache),
                             ventilator=ventilator)
    logger.debug('Workers pool started')

    self.last_row_consumed = False

    # _result
    self._result_buffer = []
def __init__(self, dataset_url, schema_fields=None, predicate=None, rowgroup_selector=None, num_epochs=1,
             sequence=None, cur_shard=None, shard_count=None, read_timeout_s=None, cache=None,
             loader_pool=None, decoder_pool=None, shuffling_queue=None, shuffle_row_groups=True,
             shuffle_row_drop_partitions=1, pyarrow_filesystem=None, infer_schema=False):
    """Initializes a reader object.

    :param dataset_url: an filepath or a url to a parquet directory,
        e.g. 'hdfs://some_hdfs_cluster/user/yevgeni/parquet8', or '/tmp/mydataset'
        or ``'s3://bucket/mydataset'``.
    :param schema_fields: Either list of unischema fields to subset, or None to read all fields.
        OR an NGram object, then it will return an NGram of the specified properties.
    :param predicate: instance of predicate object to filter rows to be returned by reader.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param reader_pool: parallelization pool. ThreadPool(10) (10 threads) is used by default.
        This pool is a custom implementation used to parallelize reading data from the dataset.
        Any object from workers_pool package can be used (e.g. ProcessPool)
    :param num_epochs: An epoch is a single pass over all samples in the dataset. Setting
        num_epochs to 'None' will result in an infinite number of epochs.
    :param sequence: This is deprecated. To use sequence/ngram, please supply the argument
        in schema_fields instead.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should
        pass in a unique shard number in the range [0, shard_count). shard count must be supplied as well.
    :param shard_count An int denoting the number of shards to break this dataset into.
    :param read_timeout_s: A numeric with the amount of time in seconds you would like to give a read
        before it times out and raises an EmptyResultError. Pass in None for an infinite timeout
    :param cache: An object conforming to `cache.CacheBase` interface. Before loading row groups from a parquet
        file the Reader will attempt to load these values from cache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage
        to store entire dataset (or a partition of a dataset if num_training_partitions is used).
    :param decoder_pool: An instance of a concurrent.futures pool executor used for decoding. If None,
        a default ThreadPoolExecutor(5) will be used.
    :param loader_pool: An instance of a concurrent.futures pool executor used for decoding. If None,
        a default ThreadPoolExecutor(5) will be used. By default, `NullCache` implementation
    :param shuffling_queue: presumably an object implementing a shuffling-buffer interface; a
        NoopShufflingBuffer is substituted when falsy — TODO confirm expected interface.
    :param pyarrow_filesystem: if provided, used directly instead of resolving a filesystem
        from ``dataset_url``.
    :param infer_schema: if True, the Unischema is constructed from the native Parquet schema of
        the first dataset piece instead of being loaded from Petastorm metadata.
    """

    # High-level flow of this constructor:
    # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
    # 2. Get a list of all groups
    # 3. Filter rowgroups
    #    a. predicates
    #    b. row-group selector (our indexing mechanism)
    #    c. partition: used to get a subset of data for distributed training
    # 4. Launch a new thread running `worker_loop` function.

    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    # schema_fields may be: None (read everything), an iterable of fields, or an NGram spec.
    if not (isinstance(schema_fields, collections.Iterable) or isinstance(schema_fields, NGram)
            or schema_fields is None):
        raise ValueError("""Fields must be either None, an iterable collection of Unischema fields or an NGram object.""")

    if sequence is not None:
        raise ValueError("""'sequence' argument of Reader object is deprecated. Please pass an NGram instance to 'schema_fields' argument instead.""")

    self.ngram = schema_fields if isinstance(schema_fields, NGram) else None

    # Row-drop partitioning splits ngram windows; without timestamp overlap this combination
    # is not supported.
    if self.ngram and not self.ngram.timestamp_overlap and shuffle_row_drop_partitions > 1:
        raise NotImplementedError('Using timestamp_overlap=False is not implemented with'
                                  ' shuffle_options.shuffle_row_drop_partitions > 1')

    cache = cache or NullCache()

    # Normalize the url by stripping a single trailing slash, if present.
    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url

    # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
    logger.debug('dataset_url: %s', dataset_url)

    if pyarrow_filesystem is not None:
        # Caller supplied a filesystem explicitly; only the path component of the url is used.
        filesystem = pyarrow_filesystem
        dataset_path = urlparse(dataset_url).path
    else:
        resolver = FilesystemResolver(dataset_url)
        filesystem = resolver.filesystem()
        dataset_path = resolver.get_dataset_path()

    self._dataset = pq.ParquetDataset(dataset_path, filesystem=filesystem, validate_schema=False)

    shuffle_row_drop_partitions = self._normalize_shuffle_options(shuffle_row_drop_partitions, self._dataset)

    if infer_schema:
        # If inferring schema, just retrieve the schema from a file of the dataset
        meta = self._dataset.pieces[0].get_metadata(self._dataset.fs.open)
        arrow_schema = meta.schema.to_arrow_schema()
        stored_schema = Unischema.from_arrow_schema(arrow_schema)
    else:
        # Otherwise, get the stored schema
        stored_schema = dataset_metadata.get_schema(self._dataset)

    # Make a schema view (a view is a Unischema containing only a subset of fields
    # Will raise an exception if invalid schema fields are in schema_fields
    fields = schema_fields if isinstance(schema_fields, collections.Iterable) else None
    self.schema = stored_schema.create_schema_view(fields) if fields else stored_schema

    # 2. Get a list of all groups
    row_groups = dataset_metadata.load_row_groups(self._dataset, infer_schema)

    # 3. Filter rowgroups (predicate, rowgroup selector, and shard partitioning)
    filtered_row_groups, worker_predicate = self._filter_row_groups(self._dataset, row_groups, predicate,
                                                                    rowgroup_selector, cur_shard,
                                                                    shard_count)

    epoch_items = self._apply_row_drop_partition(filtered_row_groups, shuffle_row_drop_partitions)

    # 4. Launch a new thread running `worker_loop` function.
    def epochs_iterator():
        # Regenerates the (possibly shuffled) sequence of work items for each epoch.
        return epoch_generator(epoch_items, num_epochs, shuffle_row_groups)

    self._results_queue = Queue(_OUTPUT_QUEUE_SIZE)

    loader = RowGroupLoader(dataset_url, self.schema, self.ngram, cache, worker_predicate)
    decoder = RowDecoder(self.schema, self.ngram)
    self._loader_pool = loader_pool or ThreadPoolExecutor(5)
    self._decoder_pool = decoder_pool or ThreadPoolExecutor(5)
    self._stop_flow_manager_event = threading.Event()
    # Diagnostics counters populated by worker_loop.
    self._diags = Counter()

    if not shuffling_queue:
        shuffling_queue = NoopShufflingBuffer()

    # The flow-manager thread drives loading/decoding and feeds results into _results_queue;
    # it is a daemon so it will not block interpreter shutdown.
    self._flow_manager_thread = threading.Thread(target=worker_loop,
                                                 args=(epochs_iterator, self._loader_pool, loader,
                                                       self._decoder_pool, decoder,
                                                       shuffling_queue, self._results_queue,
                                                       self._stop_flow_manager_event, self._diags))
    self._flow_manager_thread.daemon = True
    self._flow_manager_thread.start()

    self._read_timeout_s = read_timeout_s