def test_max_ventilation_size(self):
    """Tests that we don't exceed the maximum ventilation size in each pool type (since it relies on
    accurate ventilation size reporting)"""
    max_ventilation_size = 10
    for pool in [DummyPool(), ProcessPool(10), ThreadPool(10)]:
        ventilator = ConcurrentVentilator(ventilate_fn=pool.ventilate,
                                          items_to_ventilate=[{'item': i} for i in range(100)],
                                          max_ventilation_queue_size=max_ventilation_size)
        pool.start(IdentityWorker, ventilator=ventilator)

        # Give time for the thread to fill the ventilation queue
        while ventilator._ventilated_items_count - ventilator._processed_items_count < max_ventilation_size:
            time.sleep(.1)

        # After stopping the ventilator, we should only get 10 results
        ventilator.stop()
        for _ in range(max_ventilation_size):
            pool.get_results()
        with self.assertRaises(EmptyResultError):
            pool.get_results()

        pool.stop()
        pool.join()
def test_reset_ventilator(self):
    """Resetting the ventilator after all items were ventilated makes it re-ventilate the same items"""
    items_count = 100
    for pool in [DummyPool(), ThreadPool(10)]:
        ventilator = ConcurrentVentilator(ventilate_fn=pool.ventilate,
                                          items_to_ventilate=[{'item': i} for i in range(items_count)],
                                          iterations=1)
        pool.start(IdentityWorker, ventilator=ventilator)

        # Read out all ventilated items
        for _ in range(items_count):
            pool.get_results()

        # Reading the next item should fail, as all items were read by now
        with self.assertRaises(EmptyResultError):
            pool.get_results()

        # After resetting, the same items should be read out all over again
        ventilator.reset()
        for _ in range(items_count):
            pool.get_results()
        with self.assertRaises(EmptyResultError):
            pool.get_results()

        pool.stop()
        pool.join()
def _test_noncontinuous_ngram_tf(ngram_fields, synthetic_dataset):
    """Test non-continuous ngram in tf of a certain length. Non-continuous here means the reader will not
    necessarily return consecutive ngrams, because there is more than one partition and shuffling is enabled."""
    dataset_dicts = synthetic_dataset.data
    ngram = NGram(fields=ngram_fields, delta_threshold=10, timestamp_field=TestSchema.id)
    reader = Reader(
        schema_fields=ngram,
        dataset_url=synthetic_dataset.url,
        reader_pool=ThreadPool(1),
    )

    readout_examples = tf_tensors(reader)

    # Make sure we have static shape info for all fields
    for timestep in readout_examples:
        for field in readout_examples[timestep]:
            assert field.get_shape().dims is not None

    # Read a bunch of entries from the dataset and compare the data to reference
    with tf.Session() as sess:
        for _ in range(5):
            actual = sess.run(readout_examples)
            expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_dicts, actual[min(actual.keys())].id)
            _assert_equal_ngram(actual, expected_ngram)

    reader.stop()
    reader.join()
def _test_continuous_ngram_tf(ngram_fields, dataset_num_files_1):
    """Tests continuous ngram in tf of a certain length. Continuous here means the reader will always return
    consecutive ngrams, because shuffling is disabled and there is only one partition."""
    ngram = NGram(fields=ngram_fields, delta_threshold=10, timestamp_field=TestSchema.id)
    with Reader(schema_fields=ngram,
                dataset_url=dataset_num_files_1.url,
                reader_pool=ThreadPool(1),
                shuffle_options=ShuffleOptions(False)) as reader:

        readout_examples = tf_tensors(reader)

        # Make sure we have static shape info for all fields
        for timestep in readout_examples:
            for field in readout_examples[timestep]:
                assert field.get_shape().dims is not None

        # Read a bunch of entries from the dataset and compare the data to reference
        expected_id = 0
        with tf.Session() as sess:
            for _ in range(5):
                actual = sess.run(readout_examples)
                expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_num_files_1.data, expected_id)
                _assert_equal_ngram(actual, expected_ngram)
                expected_id = expected_id + 1
def test_single_column_predicate(synthetic_dataset):
    """Test querying a single column with a predicate on the same column"""
    with Reader(synthetic_dataset.url,
                schema_fields=[TestSchema.id],
                predicate=EqualPredicate({'id': 1}),
                reader_pool=ThreadPool(1)) as reader:
        # Read a bunch of entries from the dataset and compare the data to reference
        for row in reader:
            actual = dict(row._asdict())
            expected = next(d for d in synthetic_dataset.data if d['id'] == actual['id'])
            np.testing.assert_equal(expected['id'], actual['id'])
def _create_worker_pool(pool_type, workers_count, profiling_enabled, pyarrow_serialize):
    """Creates a worker pool implementation: in-process (dummy or thread pool) or out-of-process (process pool)."""
    if pool_type == WorkerPoolType.THREAD:
        worker_pool = ThreadPool(workers_count, profiling_enabled=profiling_enabled)
    elif pool_type == WorkerPoolType.PROCESS:
        worker_pool = ProcessPool(workers_count,
                                  serializer=PyArrowSerializer() if pyarrow_serialize else PickleSerializer())
    elif pool_type == WorkerPoolType.NONE:
        worker_pool = DummyPool()
    else:
        raise ValueError('Supported pool types are thread, process or dummy. Got {}.'.format(pool_type))
    return worker_pool
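# Hedged usage sketch (not from the original source): this helper is typically invoked by a reader
# factory. The call below assumes WorkerPoolType is the same enum referenced above and simply requests
# an in-process thread pool with profiling disabled; the function name is a placeholder.
def _example_create_thread_pool():
    return _create_worker_pool(WorkerPoolType.THREAD, workers_count=10,
                               profiling_enabled=False, pyarrow_serialize=False)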
def test_exception_in_worker_thread(self):
    """ Test exception handler in thread pool """
    QUEUE_SIZE = 100
    pool = ThreadPool(10, results_queue_size=QUEUE_SIZE)
    self._test_exception_in_worker_impl(pool, QUEUE_SIZE)
    pool.stop()
    pool.join()
def test_invalid_schema_field(synthetic_dataset):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. Would expect to get a reasonable error message
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError) as e:
        Reader(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(),
               shuffle_options=ShuffleOptions(False),
               predicate=EqualPredicate(expected_values), reader_pool=ThreadPool(1))

    assert 'bogus_key' in str(e)
def test_predicate_with_invalid_fields(synthetic_dataset):
    """Try passing an invalid field name from a predicate to the reader. An error should be raised."""
    TEST_CASES = [
        {'invalid_field_name': 1},
        dict(),
        {'invalid_field_name': 1, 'id': 11},
        {'invalid_field_name': 1, 'invalid_field_name_2': 11}]

    for predicate_spec in TEST_CASES:
        with Reader(synthetic_dataset.url, shuffle_options=ShuffleOptions(False),
                    predicate=EqualPredicate(predicate_spec), reader_pool=ThreadPool(1)) as reader:
            with pytest.raises(ValueError):
                next(reader)
def test_worker_produces_no_results(self):
    """Check the edge case when workers consistently do not produce any results"""
    # 10000 is an interesting case: in the original implementation it caused a stack overflow
    for ventilate_count in [10, 10000]:
        for pool in [DummyPool(), ThreadPool(2)]:
            pool.start(PreprogrammedReturnValueWorker, ventilate_count * [[]])
            for _ in range(ventilate_count):
                pool.ventilate('not_important')

            with self.assertRaises(EmptyResultError):
                pool.get_results()

            pool.stop()
            pool.join()
def test_worker_produces_some_results(self):
    """Check the edge case when workers produce results only for some of the ventilated items"""
    VENTILATE_COUNT = 4
    for pool in [DummyPool(), ThreadPool(1)]:
        pool.start(PreprogrammedReturnValueWorker, [[], [], [42], []])
        for _ in range(VENTILATE_COUNT):
            pool.ventilate('not_important')

        self.assertEqual(42, pool.get_results())
        with self.assertRaises(EmptyResultError):
            pool.get_results()

        pool.stop()
        pool.join()
def test_reset_in_the_middle_of_ventilation(self):
    """Can not reset ventilator in the middle of ventilation"""
    for pool in [DummyPool(), ThreadPool(10)]:
        ventilator = ConcurrentVentilator(ventilate_fn=pool.ventilate,
                                          items_to_ventilate=[{'item': i} for i in range(100)],
                                          iterations=None)
        pool.start(IdentityWorker, ventilator=ventilator)

        # Resetting is supported only when the ventilator has finished
        with self.assertRaises(NotImplementedError):
            ventilator.reset()

        pool.stop()
        pool.join()
def _test_continuous_ngram(ngram_fields, dataset_num_files_1):
    """Test continuous ngram of a certain length. Continuous here means the reader will always return
    consecutive ngrams, because shuffling is disabled and there is only one partition."""
    ngram = NGram(fields=ngram_fields, delta_threshold=10, timestamp_field=TestSchema.id)
    with Reader(schema_fields=ngram,
                dataset_url=dataset_num_files_1.url,
                reader_pool=ThreadPool(1),
                shuffle_options=ShuffleOptions(False)) as reader:

        expected_id = 0
        for _ in range(ngram.length):
            actual = next(reader)
            expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_num_files_1.data, expected_id)
            np.testing.assert_equal(actual, expected_ngram)
            expected_id = expected_id + 1
def test_ngram_delta_threshold(dataset_0_3_8_10_11_20_23):
    """Verify that the delta threshold works as expected within one partition, both inside a single ngram and
    between consecutive ngrams. Delta threshold here means that the values of the field specified by
    timestamp_field must not be more than delta_threshold apart within an ngram."""
    fields = {
        0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
    }
    ngram = NGram(fields=fields, delta_threshold=4, timestamp_field=TestSchema.id)
    with Reader(schema_fields=ngram,
                dataset_url=dataset_0_3_8_10_11_20_23.url,
                reader_pool=ThreadPool(1),
                shuffle_options=ShuffleOptions(False)) as reader:
        # NGrams expected: (0, 3), (8, 10), (10, 11)

        first_item = next(reader)
        expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_0_3_8_10_11_20_23.data, 0)
        np.testing.assert_equal(first_item, expected_ngram)

        second_item = next(reader)
        expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_0_3_8_10_11_20_23.data, 3)
        np.testing.assert_equal(second_item, expected_ngram)

        third_item = next(reader)
        expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_0_3_8_10_11_20_23.data, 5)
        np.testing.assert_equal(third_item, expected_ngram)

        with pytest.raises(StopIteration):
            next(reader)
def test_ngram_delta_small_threshold():
    """Verify that a small delta threshold works in ngrams."""
    with temporary_directory() as tmp_dir:
        tmp_url = 'file://{}'.format(tmp_dir)
        ids = range(0, 99, 5)
        create_test_dataset(tmp_url, ids)

        fields = {
            0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
            1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        }
        ngram = NGram(fields=fields, delta_threshold=1, timestamp_field=TestSchema.id)
        with Reader(schema_fields=ngram, dataset_url=tmp_url, reader_pool=ThreadPool(10)) as reader:
            with pytest.raises(StopIteration):
                next(reader)
from petastorm.tests.test_end_to_end_predicates_impl import \
    PartitionKeyInSetPredicate, EqualPredicate
from petastorm.unischema import UnischemaField, Unischema
from petastorm.workers_pool.dummy_pool import DummyPool
from petastorm.workers_pool.process_pool import ProcessPool
from petastorm.workers_pool.thread_pool import ThreadPool

# pylint: disable=unnecessary-lambda
MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: Reader(url, reader_pool=DummyPool(), **kwargs),
    lambda url, **kwargs: ReaderV2(url, **kwargs)
]

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: Reader(url, reader_pool=ThreadPool(10), **kwargs),
    lambda url, **kwargs: Reader(url, reader_pool=ProcessPool(10), **kwargs),
    lambda url, **kwargs: ReaderV2(url, decoder_pool=ProcessPoolExecutor(10), **kwargs)
]


def _check_simple_reader(reader, expected_data):
    # Read a bunch of entries from the dataset and compare the data to reference
    def _type(v):
        return v.dtype if isinstance(v, np.ndarray) else type(v)

    for row in reader:
        actual = row._asdict()
        expected = next(d for d in expected_data if d['id'] == actual['id'])
        np.testing.assert_equal(actual, expected)
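# Hedged illustration (not part of the original test module): the factory lists above are meant to be
# consumed via pytest parametrization, so one test body runs against every reader flavor. pytest is
# assumed to be imported elsewhere in the module; the function and fixture names below are placeholders.
@pytest.mark.parametrize('reader_factory', ALL_READER_FLAVOR_FACTORIES)
def _example_test_simple_read(synthetic_dataset, reader_factory):
    with reader_factory(synthetic_dataset.url) as reader:
        _check_simple_reader(reader, synthetic_dataset.data)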
def test_exception_reusing_thread_pool(self):
    WORKERS_COUNT = 10
    pool = ThreadPool(WORKERS_COUNT)
    pool.start(WorkerIdGeneratingWorker)
    with self.assertRaises(EmptyResultError):
        pool.get_results()
    pool.ventilate()
    self.assertIsNotNone(pool.get_results())
    with self.assertRaises(EmptyResultError):
        pool.get_results()
    pool.stop()
    pool.join()
    with self.assertRaises(RuntimeError) as e:
        pool.start(WorkerIdGeneratingWorker)
    self.assertTrue(
        'ThreadPool({}) cannot be reused! stop_event set? {}'.format(WORKERS_COUNT, True) in str(e.exception))
def test_exception_in_worker_thread(self):
    """ Test exception handler in thread pool """
    QUEUE_SIZE = 100
    self._test_exception_in_worker_impl(
        ThreadPool(10, results_queue_size=QUEUE_SIZE), QUEUE_SIZE)
def test_stop_when_result_queue_is_full(self):
    """Makes sure we don't block indefinitely on ventilator queue"""
    SLEEP_DELTA = 0.01
    TIMEOUT = 20
    QUEUE_SIZE = 2

    pool = ThreadPool(10, results_queue_size=QUEUE_SIZE)
    pool.start(WorkerIdGeneratingWorker)
    for _ in range(100):
        pool.ventilate()

    cumulative_wait = 0
    while pool.results_qsize() != QUEUE_SIZE:
        time.sleep(SLEEP_DELTA)
        cumulative_wait += SLEEP_DELTA
        # Make sure we wait no longer than the timeout. Otherwise, something is very wrong
        self.assertLess(cumulative_wait, TIMEOUT,
                        msg='Timeout while waiting for the results queue to fill')

    # No need to read from the queue. We are testing ability to exit when workers might be blocked on the
    # results queue
    pool.stop()
    pool.join()
def test_ventilator_threads(self):
    self._test_simple_ventilation(lambda: ThreadPool(10))
def make_reader(dataset_url,
                schema_fields=None,
                reader_pool_type='thread', workers_count=10, pyarrow_serialize=False,
                results_queue_size=50,
                shuffle_row_groups=True, shuffle_row_drop_partitions=1,
                predicate=None,
                rowgroup_selector=None,
                num_epochs=1,
                cur_shard=None, shard_count=None, shard_seed=None,
                cache_type=NULL_CACHE, cache_location=None, cache_size_limit=None,
                cache_row_size_estimate=None, cache_extra_settings=None,
                hdfs_driver='libhdfs3',
                transform_spec=None,
                filters=None,
                storage_options=None,
                zmq_copy_buffers=True,
                filesystem=None):
    """
    Creates an instance of Reader for reading Petastorm datasets. A Petastorm dataset is a dataset generated using
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset` context manager as explained
    `here <https://petastorm.readthedocs.io/en/latest/readme_include.html#generating-a-dataset>`_.

    See :func:`~petastorm.make_batch_reader` to read from a Parquet store that was not generated using
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset`.

    :param dataset_url: a filepath or a url to a parquet directory,
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``,
        or ``'s3://bucket/mydataset'``, or ``'gs://bucket/mydataset'``.
    :param schema_fields: Can be: a list of unischema fields and/or regex pattern strings; ``None`` to read all
        fields; an NGram object, then it will return an NGram of the specified fields.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'.
    :param workers_count: An int for the number of workers to use in the reader pool. This only is used for the
        thread or process pool. Defaults to 10.
    :param pyarrow_serialize: THE ARGUMENT IS DEPRECATED AND WILL BE REMOVED IN FUTURE VERSIONS.
    :param results_queue_size: Size of the results queue to store prefetched row-groups. Currently only applicable
        to the thread reader pool type.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read).
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader. The
        predicate will be passed a single row and must return a boolean value indicating whether to include it
        in the results.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read.
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to ``None``
        will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should pass in a unique
        shard number in the range [0, shard_count). shard_count must be supplied as well. Defaults to None.
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None.
    :param shard_seed: Random seed to shuffle row groups for data sharding. Defaults to None.
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage to store
        the entire dataset (or a partition of a dataset if shard_count is used). By default will be a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes.
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset.
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++).
    :param transform_spec: An instance of :class:`~petastorm.transform.TransformSpec` object defining how a record
        is transformed after it is loaded and decoded. The transformation occurs on a worker thread/process
        (depends on the ``reader_pool_type`` value).
    :param filters: (List[Tuple] or List[List[Tuple]]): Standard PyArrow filters. These will be applied when
        loading the parquet file with PyArrow. More information here:
        https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
    :param storage_options: Dict of kwargs forwarded to ``fsspec`` to initialize the filesystem.
    :param zmq_copy_buffers: A bool indicating whether to use 0mq copy buffers with ProcessPool.
    :param filesystem: An instance of ``pyarrow.FileSystem`` to use. Will ignore storage_options and other
        filesystem configs if it's provided.
    :return: A :class:`Reader` object
    """
    dataset_url = normalize_dir_url(dataset_url)

    filesystem, dataset_path = get_filesystem_and_path_or_paths(
        dataset_url, hdfs_driver, storage_options=storage_options, filesystem=filesystem)

    if cache_type is None or cache_type == NULL_CACHE:
        cache = NullCache()
    elif cache_type == LOCAL_DISK_CACHE:
        cache = LocalDiskCache(cache_location, cache_size_limit, cache_row_size_estimate,
                               **cache_extra_settings or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    try:
        dataset_metadata.get_schema_from_dataset_url(dataset_url, hdfs_driver=hdfs_driver,
                                                     storage_options=storage_options, filesystem=filesystem)
    except PetastormMetadataError:
        warnings.warn('Currently make_reader supports reading only Petastorm datasets. '
                      'To read from a non-Petastorm Parquet store use make_batch_reader')

    if reader_pool_type == 'thread':
        reader_pool = ThreadPool(workers_count, results_queue_size)
    elif reader_pool_type == 'process':
        if pyarrow_serialize:
            warnings.warn("pyarrow_serializer was deprecated and will be removed in future versions. "
                          "The argument no longer has any effect.")
        serializer = PickleSerializer()
        reader_pool = ProcessPool(workers_count, serializer, zmq_copy_buffers=zmq_copy_buffers)
    elif reader_pool_type == 'dummy':
        reader_pool = DummyPool()
    else:
        raise ValueError('Unknown reader_pool_type: {}'.format(reader_pool_type))

    kwargs = {
        'schema_fields': schema_fields,
        'reader_pool': reader_pool,
        'shuffle_row_groups': shuffle_row_groups,
        'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
        'predicate': predicate,
        'rowgroup_selector': rowgroup_selector,
        'num_epochs': num_epochs,
        'cur_shard': cur_shard,
        'shard_count': shard_count,
        'shard_seed': shard_seed,
        'cache': cache,
        'transform_spec': transform_spec,
        'filters': filters
    }

    try:
        return Reader(filesystem, dataset_path,
                      worker_class=PyDictReaderWorker,
                      is_batched_reader=False,
                      **kwargs)
    except PetastormMetadataError as e:
        logger.error('Unexpected exception: %s', str(e))
        raise RuntimeError('make_reader has failed. If you were trying to open a Parquet store that was not '
                           'created using Petastorm materialize_dataset and it contains only scalar columns, '
                           'you may use make_batch_reader to read it.\n'
                           'Inner exception: %s', str(e))
def make_batch_reader(dataset_url_or_urls,
                      schema_fields=None,
                      reader_pool_type='thread', workers_count=10,
                      shuffle_row_groups=True, shuffle_row_drop_partitions=1,
                      predicate=None,
                      rowgroup_selector=None,
                      num_epochs=1,
                      cur_shard=None, shard_count=None, shard_seed=None,
                      cache_type='null', cache_location=None, cache_size_limit=None,
                      cache_row_size_estimate=None, cache_extra_settings=None,
                      hdfs_driver='libhdfs3',
                      transform_spec=None,
                      filters=None,
                      storage_options=None,
                      zmq_copy_buffers=True,
                      filesystem=None):
    """
    Creates an instance of Reader for reading batches out of a non-Petastorm Parquet store.

    Currently, only stores having native scalar parquet data types are supported.
    Use :func:`~petastorm.make_reader` to read Petastorm Parquet stores generated with
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset`.

    NOTE: only scalar columns or array type (of primitive type element) columns are currently supported.

    NOTE: If `schema_fields` is not specified, the reader schema will be inferred from the parquet dataset, and the
    reader schema fields order will preserve the parquet dataset fields order (partition columns come first).
    However, if `transform_spec` is set and `TransformSpec.selected_fields` is specified, then the reader schema
    fields order will be the order of 'selected_fields'.

    :param dataset_url_or_urls: a url to a parquet directory or a url list (with the same scheme) to parquet files.
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``,
        or ``'s3://bucket/mydataset'``, or ``'gs://bucket/mydataset'``,
        or ``[file:///tmp/mydataset/00000.parquet, file:///tmp/mydataset/00001.parquet]``.
    :param schema_fields: A list of regex pattern strings. Only columns matching at least one of the
        patterns in the list will be loaded.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'.
    :param workers_count: An int for the number of workers to use in the reader pool. This only is used for the
        thread or process pool. Defaults to 10.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read).
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader. The
        predicate will be passed a pandas DataFrame object and must return a pandas Series with boolean values
        of matching dimensions.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read.
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to ``None``
        will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should pass in a unique
        shard number in the range [0, shard_count). shard_count must be supplied as well. Defaults to None.
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None.
    :param shard_seed: Random seed to shuffle row groups for data sharding. Defaults to None.
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage to store
        the entire dataset (or a partition of a dataset if shard_count is used). By default will be a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes.
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset.
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++).
    :param transform_spec: An instance of :class:`~petastorm.transform.TransformSpec` object defining how a record
        is transformed after it is loaded and decoded. The transformation occurs on a worker thread/process
        (depends on the ``reader_pool_type`` value).
    :param filters: (List[Tuple] or List[List[Tuple]]): Standard PyArrow filters. These will be applied when
        loading the parquet file with PyArrow. More information here:
        https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
    :param storage_options: Dict of kwargs forwarded to ``fsspec`` to initialize the filesystem.
    :param zmq_copy_buffers: A bool indicating whether to use 0mq copy buffers with ProcessPool.
    :param filesystem: An instance of ``pyarrow.FileSystem`` to use. Will ignore storage_options and other
        filesystem configs if it's provided.
    :return: A :class:`Reader` object
    """
    dataset_url_or_urls = normalize_dataset_url_or_urls(dataset_url_or_urls)

    filesystem, dataset_path_or_paths = get_filesystem_and_path_or_paths(
        dataset_url_or_urls, hdfs_driver, storage_options=storage_options, filesystem=filesystem)

    try:
        dataset_metadata.get_schema_from_dataset_url(dataset_url_or_urls, hdfs_driver=hdfs_driver,
                                                     storage_options=storage_options, filesystem=filesystem)
        warnings.warn('Please use make_reader (instead of \'make_batch_dataset\' function to read this dataset. '
                      'You may get unexpected results. '
                      'Currently make_batch_reader supports reading only Parquet stores that contain '
                      'standard Parquet data types and do not require petastorm decoding.')
    except PetastormMetadataError:
        pass

    if cache_type is None or cache_type == NULL_CACHE:
        cache = NullCache()
    elif cache_type == LOCAL_DISK_CACHE:
        cache = LocalDiskArrowTableCache(cache_location, cache_size_limit, cache_row_size_estimate,
                                         **cache_extra_settings or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    if reader_pool_type == 'thread':
        reader_pool = ThreadPool(workers_count)
    elif reader_pool_type == 'process':
        serializer = ArrowTableSerializer()
        reader_pool = ProcessPool(workers_count, serializer, zmq_copy_buffers=zmq_copy_buffers)
    elif reader_pool_type == 'dummy':
        reader_pool = DummyPool()
    else:
        raise ValueError('Unknown reader_pool_type: {}'.format(reader_pool_type))

    return Reader(filesystem, dataset_path_or_paths,
                  schema_fields=schema_fields,
                  worker_class=ArrowReaderWorker,
                  reader_pool=reader_pool,
                  shuffle_row_groups=shuffle_row_groups,
                  shuffle_row_drop_partitions=shuffle_row_drop_partitions,
                  predicate=predicate,
                  rowgroup_selector=rowgroup_selector,
                  num_epochs=num_epochs,
                  cur_shard=cur_shard,
                  shard_count=shard_count,
                  shard_seed=shard_seed,
                  cache=cache,
                  transform_spec=transform_spec,
                  is_batched_reader=True,
                  filters=filters)
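# Hedged usage sketch (not part of this module): make_batch_reader yields batches rather than individual
# rows; every attribute of a yielded namedtuple is a column array for that batch. The URL is a placeholder.
def _example_make_batch_reader_usage(dataset_url='file:///tmp/my_parquet_store'):
    with make_batch_reader(dataset_url, num_epochs=1) as reader:
        for batch in reader:
            print(batch._fields)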
def __init__(self, pyarrow_filesystem, dataset_path, schema_fields=None,
             shuffle_row_groups=True, shuffle_row_drop_partitions=1,
             predicate=None, rowgroup_selector=None, reader_pool=None, num_epochs=1,
             cur_shard=None, shard_count=None, cache=None, worker_class=None):
    """Initializes a reader object.

    :param pyarrow_filesystem: An instance of ``pyarrow.FileSystem`` that will be used. If not specified,
        then a default one will be selected based on the url (only for ``hdfs://`` or ``file://``; for
        ``s3://`` support, use ``make_reader``). The default hdfs driver is ``libhdfs3``. If you want to use
        ``libhdfs``, use ``pyarrow_filesystem=pyarrow.hdfs.connect('hdfs:///some/path', driver='libhdfs')``.
    :param dataset_path: filepath to a parquet directory on the specified filesystem.
        e.g. ``'/user/yevgeni/parquet8'``, or ``'/tmp/mydataset'``.
    :param schema_fields: Either list of unischema fields to subset, or ``None`` to read all fields.
        OR an NGram object, then it will return an NGram of the specified properties.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read).
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of predicate object to filter rows to be returned by reader.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read.
    :param reader_pool: parallelization pool. ``ThreadPool(10)`` (10 threads) is used by default.
        This pool is a custom implementation used to parallelize reading data from the dataset.
        Any object from workers_pool package can be used
        (e.g. :class:`petastorm.workers_pool.process_pool.ProcessPool`).
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to ``None``
        will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number used. Each reader instance should pass in a unique
        shard number in the range ``[0, shard_count)``. ``shard_count`` must be supplied as well. Defaults to None.
    :param shard_count: An int denoting the number of shard partitions there are. Defaults to None.
    :param cache: An object conforming to :class:`.CacheBase` interface. Before loading row groups from a parquet
        file the Reader will attempt to load these values from cache. Caching is useful when communication to the
        main data store is either slow or expensive and the local machine has large enough storage to store the
        entire dataset (or a partition of a dataset if shards are used). By default, use the :class:`.NullCache`
        implementation.
    :param worker_class: This is the class that will be instantiated on a different thread/process. Its
        responsibility is to load and filter the data.
    """

    # 1. Open the parquet storage (dataset)
    # 2. Get a list of all row groups
    # 3. Filter rowgroups
    #    a. predicates
    #    b. row-group selector (our indexing mechanism)
    #    c. partition: used to get a subset of data for distributed training
    # 4. Create a rowgroup ventilator object
    # 5. Start the workers pool
    if not (isinstance(schema_fields, collections.Iterable) or isinstance(schema_fields, NGram)
            or schema_fields is None):
        raise ValueError("""Fields must be either None, an iterable collection of Unischema fields or an NGram
        object.""")

    self.ngram = schema_fields if isinstance(schema_fields, NGram) else None

    # By default, use original method of working with list of dictionaries and not arrow tables
    worker_class = worker_class or PyDictReaderWorker
    self._results_queue_reader = worker_class.new_results_queue_reader()

    if self.ngram and not self.ngram.timestamp_overlap and shuffle_row_drop_partitions > 1:
        raise NotImplementedError('Using timestamp_overlap=False is not implemented with'
                                  ' shuffle_options.shuffle_row_drop_partitions > 1')

    cache = cache or NullCache()

    self._workers_pool = reader_pool or ThreadPool(10)

    # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
    self.dataset = pq.ParquetDataset(dataset_path, filesystem=pyarrow_filesystem, validate_schema=False)

    stored_schema = infer_or_load_unischema(self.dataset)

    # Make a schema view (a view is a Unischema containing only a subset of fields)
    # Will raise an exception if invalid schema fields are in schema_fields
    fields = schema_fields if isinstance(schema_fields, collections.Iterable) else None
    self.schema = stored_schema.create_schema_view(fields) if fields else stored_schema

    # 2. Get a list of all row groups
    row_groups = dataset_metadata.load_row_groups(self.dataset)

    # 3. Filter rowgroups
    filtered_row_group_indexes, worker_predicate = self._filter_row_groups(self.dataset, row_groups, predicate,
                                                                           rowgroup_selector, cur_shard,
                                                                           shard_count)
    # 4. Create a rowgroup ventilator object
    normalized_shuffle_row_drop_partitions = \
        self._normalize_shuffle_options(shuffle_row_drop_partitions, self.dataset)
    ventilator = self._create_ventilator(filtered_row_group_indexes, shuffle_row_groups,
                                         normalized_shuffle_row_drop_partitions, num_epochs, worker_predicate,
                                         self._workers_pool.workers_count + _VENTILATE_EXTRA_ROWGROUPS)

    # 5. Start the workers pool
    self._workers_pool.start(worker_class,
                             (pyarrow_filesystem, dataset_path, self.schema, self.ngram, row_groups, cache),
                             ventilator=ventilator)
    logger.debug('Workers pool started')

    self.last_row_consumed = False
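# Hedged sketch (not part of this class): constructing Reader directly instead of going through
# make_reader. The dataset path is a placeholder, and the example assumes legacy pyarrow, which exposes
# a local filesystem handle as `pyarrow.localfs`.
def _example_direct_reader_construction(dataset_path='/tmp/mydataset'):
    import pyarrow
    from petastorm.workers_pool.thread_pool import ThreadPool
    with Reader(pyarrow.localfs, dataset_path, reader_pool=ThreadPool(4)) as reader:
        for row in reader:
            print(row)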
def test_passing_args_threads(self):
    self._passing_args_impl(lambda: ThreadPool(10))
def make_batch_carbon_reader(dataset_url,
                             key=None,
                             secret=None,
                             endpoint=None,
                             proxy=None,
                             proxy_port=None,
                             schema_fields=None,
                             reader_pool_type='thread', workers_count=10, results_queue_size=100,
                             shuffle_blocklets=True, shuffle_row_drop_partitions=1,
                             predicate=None,
                             blocklet_selector=None,
                             num_epochs=1,
                             cur_shard=None, shard_count=None,
                             cache_type='null', cache_location=None, cache_size_limit=None,
                             cache_row_size_estimate=None, cache_extra_settings=None,
                             hdfs_driver='libhdfs3',
                             transform_spec=None):
    """
    Creates an instance of Reader for reading batches out of a non-Pycarbon Carbon store.

    Currently, only stores having native scalar carbon data types are supported.
    Use :func:`~pycarbon.make_carbon_reader` to read Pycarbon Carbon stores generated with
    :func:`~pycarbon.etl.carbon_dataset_metadata.materialize_dataset_carbon`.

    NOTE: only scalar columns are currently supported.

    :param dataset_url: a filepath or a url to a carbon directory,
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/carbon8'``, or ``'file:///tmp/mydataset'``
        or ``'s3://bucket/mydataset'``.
    :param key: access key
    :param secret: secret key
    :param endpoint: endpoint_url
    :param proxy: proxy
    :param proxy_port: proxy_port
    :param schema_fields: A list of regex pattern strings. Only columns matching at least one of the
        patterns in the list will be loaded.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'.
    :param workers_count: An int for the number of workers to use in the reader pool. This only is used for the
        thread or process pool. Defaults to 10.
    :param results_queue_size: Size of the results queue to store prefetched rows. Currently only applicable to
        thread reader pool type.
    :param shuffle_blocklets: Whether to shuffle blocklets (the order in which full blocklets are read).
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a blocklet into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each blocklet read will drop half of the rows within every blocklet and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader. The
        predicate will be passed a pandas DataFrame object and must return a pandas Series with boolean values
        of matching dimensions.
    :param blocklet_selector: instance of blocklet selector object to select blocklets to be read.
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to ``None``
        will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should pass in a unique
        shard number in the range [0, shard_count). shard_count must be supplied as well. Defaults to None.
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None.
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage to store
        the entire dataset (or a partition of a dataset if shard_count is used). By default will be a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes.
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset.
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++).
    :param transform_spec: An instance of :class:`~petastorm.transform.TransformSpec` object defining how a record
        is transformed after it is loaded and decoded. The transformation occurs on a worker thread/process
        (depends on the ``reader_pool_type`` value).
    :return: A :class:`Reader` object
    """
    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
    logger.debug('dataset_url: %s', dataset_url)

    resolver = CarbonFilesystemResolver(dataset_url,
                                        key=key,
                                        secret=secret,
                                        endpoint=endpoint,
                                        proxy=proxy,
                                        proxy_port=proxy_port,
                                        hdfs_driver=hdfs_driver)
    filesystem = resolver.filesystem()

    try:
        carbon_dataset_metadata.get_schema_from_dataset_url_carbon(dataset_url,
                                                                   key=key,
                                                                   secret=secret,
                                                                   endpoint=endpoint,
                                                                   proxy=proxy,
                                                                   proxy_port=proxy_port,
                                                                   filesystem=filesystem)
        warnings.warn('Please use make_carbon_reader (instead of \'make_batch_carbon_reader\' function '
                      'to read this dataset as it contains unischema file.')
    except PycarbonMetadataError:
        pass

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskCache(cache_location, cache_size_limit, cache_row_size_estimate,
                               **cache_extra_settings or {})
    elif cache_type == 'memory-cache':
        cache = LocalMemoryCache(cache_size_limit)
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    if reader_pool_type == 'thread':
        reader_pool = ThreadPool(workers_count, results_queue_size)
    elif reader_pool_type == 'process':
        raise NotImplementedError('process reader_pool_type is not supported yet.')
    elif reader_pool_type == 'dummy':
        raise NotImplementedError('dummy reader_pool_type is not supported yet.')
    else:
        raise ValueError('Unknown reader_pool_type: {}'.format(reader_pool_type))

    return CarbonDataReader(filesystem, dataset_url,
                            key=key,
                            secret=secret,
                            endpoint=endpoint,
                            proxy=proxy,
                            proxy_port=proxy_port,
                            schema_fields=schema_fields,
                            worker_class=ArrowCarbonReaderWorker,
                            reader_pool=reader_pool,
                            shuffle_blocklets=shuffle_blocklets,
                            shuffle_row_drop_partitions=shuffle_row_drop_partitions,
                            predicate=predicate,
                            blocklet_selector=blocklet_selector,
                            num_epochs=num_epochs,
                            cur_shard=cur_shard,
                            shard_count=shard_count,
                            cache=cache,
                            transform_spec=transform_spec)
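# Hedged usage sketch (not part of this module): batch-wise reading from a plain Carbon store.
# The URL is a placeholder; access credentials are omitted for a local store.
def _example_make_batch_carbon_reader_usage(dataset_url='file:///tmp/my_carbon_store'):
    with make_batch_carbon_reader(dataset_url, num_epochs=1) as reader:
        for batch in reader:
            print(batch._fields)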
def make_carbon_reader(dataset_url,
                       key=None,
                       secret=None,
                       endpoint=None,
                       proxy=None,
                       proxy_port=None,
                       schema_fields=None,
                       reader_pool_type='thread', workers_count=10, results_queue_size=100,
                       shuffle_blocklets=True, shuffle_row_drop_partitions=1,
                       predicate=None,
                       blocklet_selector=None,
                       num_epochs=1,
                       cur_shard=None, shard_count=None,
                       cache_type='null', cache_location=None, cache_size_limit=None,
                       cache_row_size_estimate=None, cache_extra_settings=None,
                       hdfs_driver='libhdfs3',
                       reader_engine='reader_v1', reader_engine_params=None,
                       transform_spec=None):
    """
    Creates an instance of Reader for reading Pycarbon datasets. A Pycarbon dataset is a dataset generated using
    the :func:`~pycarbon.etl.carbon_dataset_metadata.materialize_dataset_carbon` context manager.

    See :func:`~pycarbon.make_batch_carbon_reader` to read from a Carbon store that was not generated using
    :func:`~pycarbon.etl.carbon_dataset_metadata.materialize_dataset_carbon`.

    :param dataset_url: a filepath or a url to a carbon directory,
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/carbon8'``, or ``'file:///tmp/mydataset'``
        or ``'s3://bucket/mydataset'``.
    :param key: access key
    :param secret: secret key
    :param endpoint: endpoint_url
    :param proxy: proxy
    :param proxy_port: proxy_port
    :param schema_fields: Can be: a list of unischema fields and/or regex pattern strings; ``None`` to read all
        fields; an NGram object, then it will return an NGram of the specified fields.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'.
        TODO: process support
    :param workers_count: An int for the number of workers to use in the reader pool. This only is used for the
        thread or process pool. Defaults to 10.
    :param results_queue_size: Size of the results queue to store prefetched rows. Currently only applicable to
        thread reader pool type.
    :param shuffle_blocklets: Whether to shuffle blocklets (the order in which full blocklets are read).
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a blocklet into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each blocklet read will drop half of the rows within every blocklet and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader. The
        predicate will be passed a single row and must return a boolean value indicating whether to include it
        in the results.
    :param blocklet_selector: instance of blocklet selector object to select blocklets to be read.
        TODO: blocklet_selector
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to ``None``
        will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should pass in a unique
        shard number in the range [0, shard_count). shard_count must be supplied as well. Defaults to None.
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None.
        TODO: cur_shard & shard_count
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage to store
        the entire dataset (or a partition of a dataset if shard_count is used). By default will be a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes.
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset.
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++).
    :param reader_engine: Multiple engine implementations exist ('reader_v1' and 'experimental_reader_v2').
        'reader_v1' (the default value) selects a stable reader implementation.
        TODO: experimental_reader_v2 for carbon
    :param reader_engine_params: For advanced usage: a dictionary with arguments passed directly to a reader
        implementation constructor chosen by the ``reader_engine`` argument. You should not use this parameter
        unless you are fine-tuning a reader.
    :param transform_spec: An instance of :class:`~petastorm.transform.TransformSpec` object defining how a record
        is transformed after it is loaded and decoded. The transformation occurs on a worker thread/process
        (depends on the ``reader_pool_type`` value).
    :return: A :class:`Reader` object
    """
    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
    logger.debug('dataset_url: %s', dataset_url)

    resolver = CarbonFilesystemResolver(dataset_url,
                                        key=key,
                                        secret=secret,
                                        endpoint=endpoint,
                                        proxy=proxy,
                                        proxy_port=proxy_port,
                                        hdfs_driver=hdfs_driver)
    filesystem = resolver.filesystem()

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskCache(cache_location, cache_size_limit, cache_row_size_estimate,
                               **cache_extra_settings or {})
    elif cache_type == 'memory-cache':
        cache = LocalMemoryCache(cache_size_limit)
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    # Fail if this is a non-pycarbon dataset. Typically, a Carbon store will have hundreds of thousands of rows in
    # a single blocklet. Using the PyDictCarbonReaderWorker or ReaderV2 implementation is very inefficient as it
    # processes data on a row by row basis. ArrowCarbonReaderWorker (used by make_batch_carbon_reader) is much more
    # efficient in these cases.
    try:
        carbon_dataset_metadata.get_schema_from_dataset_url_carbon(dataset_url,
                                                                   key=key,
                                                                   secret=secret,
                                                                   endpoint=endpoint,
                                                                   proxy=proxy,
                                                                   proxy_port=proxy_port,
                                                                   filesystem=filesystem)
    except PycarbonMetadataError:
        raise RuntimeError('Currently make_carbon_reader supports reading only Pycarbon datasets (with unischema). '
                           'To read from a non-Pycarbon Carbon store use make_batch_carbon_reader')

    if reader_engine == 'reader_v1':
        if reader_pool_type == 'thread':
            reader_pool = ThreadPool(workers_count, results_queue_size)
        elif reader_pool_type == 'process':
            raise NotImplementedError('process reader_pool_type is not supported yet.')
        elif reader_pool_type == 'dummy':
            raise NotImplementedError('dummy reader_pool_type is not supported yet.')
        else:
            raise ValueError('Unknown reader_pool_type: {}'.format(reader_pool_type))

        # Create a dictionary with all reader parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'key': key,
            'secret': secret,
            'endpoint': endpoint,
            'proxy': proxy,
            'proxy_port': proxy_port,
            'schema_fields': schema_fields,
            'reader_pool': reader_pool,
            'shuffle_blocklets': shuffle_blocklets,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
            'predicate': predicate,
            'blocklet_selector': blocklet_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
            'transform_spec': transform_spec,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        try:
            return CarbonDataReader(filesystem, dataset_url,
                                    worker_class=PyDictCarbonReaderWorker,
                                    **kwargs)
        except PycarbonMetadataError as e:
            logger.error('Unexpected exception: %s', str(e))
            raise RuntimeError('make_carbon_reader has failed. If you were trying to open a Carbon store that was '
                               'not created using Pycarbon materialize_dataset_carbon and it contains only scalar '
                               'columns, you may use make_batch_reader to read it.\n'
                               'Inner exception: %s', str(e))

    elif reader_engine == 'experimental_reader_v2':
        raise NotImplementedError('experimental_reader_v2 reader engine is not supported yet.')
    else:
        raise ValueError('Unexpected value of reader_engine argument \'%s\'. '
                         'Supported reader_engine values are \'reader_v1\' and \'experimental_reader_v2\'',
                         reader_engine)
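# Hedged usage sketch (not part of this module): row-wise reading of a Pycarbon dataset with the default
# reader engine and thread pool. The URL is a placeholder; credentials are omitted for a local store.
def _example_make_carbon_reader_usage(dataset_url='file:///tmp/my_pycarbon_dataset'):
    with make_carbon_reader(dataset_url, num_epochs=1) as reader:
        for row in reader:
            print(row)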
def make_batch_reader(dataset_url,
                      schema_fields=None,
                      reader_pool_type='thread', workers_count=10,
                      shuffle_row_groups=True, shuffle_row_drop_partitions=1,
                      predicate=None,
                      rowgroup_selector=None,
                      num_epochs=1,
                      cur_shard=None, shard_count=None,
                      cache_type='null', cache_location=None, cache_size_limit=None,
                      cache_row_size_estimate=None, cache_extra_settings=None,
                      hdfs_driver='libhdfs3'):
    """
    Creates an instance of Reader for reading batches out of a non-Petastorm Parquet store.

    Currently, only stores having native scalar parquet data types are supported.
    Use :func:`~petastorm.make_reader` to read Petastorm Parquet stores generated with
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset`.

    NOTE: only scalar columns are currently supported.

    :param dataset_url: a filepath or a url to a parquet directory,
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``
        or ``'s3://bucket/mydataset'``.
    :param schema_fields: A list of regex pattern strings. Only columns matching at least one of the
        patterns in the list will be loaded.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'.
    :param workers_count: An int for the number of workers to use in the reader pool. This only is used for the
        thread or process pool. Defaults to 10.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read).
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader. The
        predicate will be passed a pandas DataFrame object and must return a pandas Series with boolean values
        of matching dimensions.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read.
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to ``None``
        will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should pass in a unique
        shard number in the range [0, shard_count). shard_count must be supplied as well. Defaults to None.
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None.
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage to store
        the entire dataset (or a partition of a dataset if shard_count is used). By default will be a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes.
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset.
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++).
    :return: A :class:`Reader` object
    """
    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
    logger.debug('dataset_url: %s', dataset_url)

    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    filesystem = resolver.filesystem()
    dataset_path = resolver.parsed_dataset_url().path

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskArrowTableCache(cache_location, cache_size_limit, cache_row_size_estimate,
                                         **cache_extra_settings or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    if reader_pool_type == 'thread':
        reader_pool = ThreadPool(workers_count)
    elif reader_pool_type == 'process':
        serializer = ArrowTableSerializer()
        reader_pool = ProcessPool(workers_count, serializer)
    elif reader_pool_type == 'dummy':
        reader_pool = DummyPool()
    else:
        raise ValueError('Unknown reader_pool_type: {}'.format(reader_pool_type))

    return Reader(filesystem, dataset_path,
                  schema_fields=schema_fields,
                  worker_class=ArrowReaderWorker,
                  reader_pool=reader_pool,
                  shuffle_row_groups=shuffle_row_groups,
                  shuffle_row_drop_partitions=shuffle_row_drop_partitions,
                  predicate=predicate,
                  rowgroup_selector=rowgroup_selector,
                  num_epochs=num_epochs,
                  cur_shard=cur_shard,
                  shard_count=shard_count,
                  cache=cache)
def __init__(self, dataset_url, schema_fields=None, shuffle=None, predicate=None, rowgroup_selector=None,
             reader_pool=None, num_epochs=1, sequence=None, training_partition=None,
             num_training_partitions=None, read_timeout_s=None, cache=None, shuffle_options=None):
    """Initializes a reader object.

    :param dataset_url: a filepath or a URL to a parquet directory, e.g.
        ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'/tmp/mydataset'``.
    :param schema_fields: Either a list of unischema fields to subset, or ``None`` to read all fields,
        or an NGram object, in which case an NGram of the specified properties is returned.
    :param predicate: instance of predicate object to filter rows to be returned by reader.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param reader_pool: parallelization pool. ``ThreadPool(10)`` (10 threads) is used by default.
        This pool is a custom implementation used to parallelize reading data from the dataset.
        Any object from workers_pool package can be used
        (e.g. :class:`petastorm.workers_pool.process_pool.ProcessPool`).
    :param num_epochs: An epoch is a single pass over all samples in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param training_partition: An int denoting the partition number used for multi node training. Each node
        should pass in a unique partition number in the range ``[0, num_training_partitions)``.
        ``num_training_partitions`` must be supplied as well.
    :param num_training_partitions: An int denoting the number of training partitions (how many nodes are
        performing the multi node training).
    :param read_timeout_s: A numeric with the amount of time in seconds you would like to give a read before it
        times out and raises an EmptyResultError. Pass in None for an infinite timeout.
    :param cache: An object conforming to the :class:`.CacheBase` interface. Before loading row groups from a
        parquet file the Reader will attempt to load these values from cache. Caching is useful when
        communication to the main data store is either slow or expensive and the local machine has large enough
        storage to store the entire dataset (or a partition of a dataset if num_training_partitions is used).
        By default, the :class:`.NullCache` implementation is used.
    :param shuffle_options: ShuffleOptions object to describe how to shuffle the dataset (supersedes the shuffle
        parameter). Defaults to shuffling row groups but not dropping rows based on partitions.
    :param sequence: *DEPRECATED* To use sequence/ngram, please supply the argument in ``schema_fields`` instead.
    :param shuffle: *DEPRECATED* Boolean whether to shuffle the row group order. Use ``shuffle_row_groups`` in
        :class:`.ShuffleOptions` instead.
    """
    # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
    # 2. Get a list of all groups
    # 3. Filter rowgroups
    #    a. predicates
    #    b. row-group selector (our indexing mechanism)
    #    c. partition: used to get a subset of data for distributed training
    # 4. Create a rowgroup ventilator object
    # 5. Start workers pool
    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    if not (isinstance(schema_fields, collections.Iterable) or isinstance(schema_fields, NGram)
            or schema_fields is None):
        raise ValueError("""Fields must be either None, an iterable collection of Unischema fields
        or an NGram object.""")

    if sequence is not None:
        raise ValueError("""'sequence' argument of Reader object is deprecated. Please pass an NGram instance
        to 'schema_fields' argument instead.""")

    self.ngram = schema_fields if isinstance(schema_fields, NGram) else None

    if self.ngram and not self.ngram.timestamp_overlap and shuffle_options.shuffle_row_drop_partitions > 1:
        raise NotImplementedError('Using timestamp_overlap=False is not implemented with'
                                  ' shuffle_options.shuffle_row_drop_partitions > 1')

    cache = cache or NullCache()

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url

    self._workers_pool = reader_pool or ThreadPool(10)

    # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
    logger.debug('dataset_url: %s', dataset_url)
    resolver = FilesystemResolver(dataset_url)
    self.dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path, filesystem=resolver.filesystem(),
                                     validate_schema=False)

    # Get a unischema stored in the dataset metadata.
    stored_schema = dataset_metadata.get_schema(self.dataset)

    # Make a schema view (a view is a Unischema containing only a subset of fields).
    # Will raise an exception if invalid schema fields are in schema_fields
    fields = schema_fields if isinstance(schema_fields, collections.Iterable) else None
    self.schema = stored_schema.create_schema_view(fields) if fields else stored_schema

    # 2. Get a list of all groups
    row_groups = dataset_metadata.load_row_groups(self.dataset)

    # 3. Filter rowgroups
    filtered_row_group_indexes, worker_predicate = self._filter_row_groups(self.dataset, row_groups, predicate,
                                                                           rowgroup_selector, training_partition,
                                                                           num_training_partitions)

    # 4. Create a rowgroup ventilator object
    if shuffle_options is None:
        if shuffle is None:
            shuffle = True
        else:
            logger.warning('shuffle option is deprecated. Please use shuffle_options instead')
        shuffle_options = ShuffleOptions(shuffle)
    self._normalize_shuffle_options(shuffle_options, self.dataset)
    ventilator = self._create_ventilator(filtered_row_group_indexes, shuffle_options, num_epochs,
                                         worker_predicate)

    # 5. Start workers pool
    self._workers_pool.start(ReaderWorker,
                             (dataset_url, self.schema, self.ngram, row_groups, cache, worker_predicate),
                             ventilator=ventilator)

    self._read_timeout_s = read_timeout_s
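A hedged sketch of constructing this Reader directly, using only arguments documented in the docstring above. The dataset URL and timeout are placeholders, and the iteration/stop/join protocol assumed here mirrors how readers are driven in the accompanying tests; treat it as an illustration rather than the canonical entry point (the make_reader factory is the usual one).

from petastorm.reader import Reader
from petastorm.workers_pool.thread_pool import ThreadPool

# Hedged example: URL is a placeholder; ThreadPool(10) matches the documented default pool.
reader = Reader('hdfs://some_hdfs_cluster/user/yevgeni/parquet8',
                reader_pool=ThreadPool(10),
                num_epochs=1,
                read_timeout_s=30)
try:
    for row in reader:   # assumes the standard Reader iteration protocol
        pass             # each row is a namedtuple built from the stored Unischema
finally:
    reader.stop()
    reader.join()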
def make_reader(dataset_url, schema_fields=None, reader_pool_type='thread', workers_count=10,
                pyarrow_serialize=False, results_queue_size=50, shuffle_row_groups=True,
                shuffle_row_drop_partitions=1, predicate=None, rowgroup_selector=None, num_epochs=1,
                cur_shard=None, shard_count=None, cache_type='null', cache_location=None, cache_size_limit=None,
                cache_row_size_estimate=None, cache_extra_settings=None, hdfs_driver='libhdfs3',
                reader_engine='reader_v1', reader_engine_params=None):
    """
    Creates an instance of Reader for reading Petastorm datasets. A Petastorm dataset is a dataset generated using
    the :func:`~petastorm.etl.dataset_metadata.materialize_dataset` context manager as explained
    `here <https://petastorm.readthedocs.io/en/latest/readme_include.html#generating-a-dataset>`_.

    See :func:`~petastorm.make_batch_reader` to read from a Parquet store that was not generated using
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset`.

    :param dataset_url: a filepath or a URL to a parquet directory, e.g.
        ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``
        or ``'s3://bucket/mydataset'``.
    :param schema_fields: Can be: a list of unischema fields and/or regex pattern strings; ``None`` to read all
        fields; or an NGram object, in which case an NGram of the specified fields is returned.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process',
        'dummy'] denoting a thread pool, process pool, or running everything in the master thread.
        Defaults to 'thread'
    :param workers_count: An int for the number of workers to use in the reader pool. This is only used for the
        thread or process pool. Defaults to 10
    :param pyarrow_serialize: Whether to use pyarrow for serialization. Currently only applicable to process pool.
        Defaults to False.
    :param results_queue_size: Size of the results queue to store prefetched rows. Currently only applicable to
        thread reader pool type.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader.
        The predicate will be passed a single row and must return a boolean value indicating whether to include
        it in the results.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should pass in a unique
        shard number in the range [0, shard_count). shard_count must be supplied as well. Defaults to None
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage to store
        the entire dataset (or a partition of a dataset if shard_count is used). By default, a null cache is used.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation,
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :param reader_engine: Multiple engine implementations exist ('reader_v1' and 'experimental_reader_v2').
        'reader_v1' (the default value) selects a stable reader implementation.
    :param reader_engine_params: For advanced usage: a dictionary with arguments passed directly to a reader
        implementation constructor chosen by the ``reader_engine`` argument. You should not use this parameter
        unless you are fine-tuning a reader.
    :return: A :class:`Reader` object
    """
    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
    logger.debug('dataset_url: %s', dataset_url)

    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    filesystem = resolver.filesystem()
    dataset_path = resolver.get_dataset_path()

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskCache(cache_location, cache_size_limit, cache_row_size_estimate,
                               **cache_extra_settings or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    # Fail if this is a non-petastorm dataset. Typically, a Parquet store will have hundreds of thousands of rows
    # in a single rowgroup. Using PyDictReaderWorker or the ReaderV2 implementation is very inefficient as it
    # processes data on a row-by-row basis. ArrowReaderWorker (used by make_batch_reader) is much more efficient
    # in these cases.
    try:
        dataset_metadata.get_schema_from_dataset_url(dataset_url)
    except PetastormMetadataError:
        raise RuntimeError('Currently make_reader supports reading only Petastorm datasets. '
                           'To read from a non-Petastorm Parquet store use make_batch_reader')

    if reader_engine == 'reader_v1':
        if reader_pool_type == 'thread':
            reader_pool = ThreadPool(workers_count, results_queue_size)
        elif reader_pool_type == 'process':
            if pyarrow_serialize:
                serializer = PyArrowSerializer()
            else:
                serializer = PickleSerializer()
            reader_pool = ProcessPool(workers_count, serializer)
        elif reader_pool_type == 'dummy':
            reader_pool = DummyPool()
        else:
            raise ValueError('Unknown reader_pool_type: {}'.format(reader_pool_type))

        # Create a dictionary with all Reader parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'reader_pool': reader_pool,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        try:
            return Reader(filesystem, dataset_path, worker_class=PyDictReaderWorker, **kwargs)
        except PetastormMetadataError as e:
            logger.error('Unexpected exception: %s', str(e))
            raise RuntimeError('make_reader has failed. If you were trying to open a Parquet store that was not '
                               'created using Petastorm materialize_dataset and it contains only scalar columns, '
                               'you may use make_batch_reader to read it.\n'
                               'Inner exception: %s' % str(e))

    elif reader_engine == 'experimental_reader_v2':
        if reader_pool_type == 'thread':
            decoder_pool = ThreadPoolExecutor(workers_count)
        elif reader_pool_type == 'process':
            decoder_pool = ProcessPoolExecutor(workers_count)
        elif reader_pool_type == 'dummy':
            decoder_pool = SameThreadExecutor()
        else:
            raise ValueError('Unknown reader_pool_type: {}'.format(reader_pool_type))

        # TODO(yevgeni): once ReaderV2 is ready to be out of experimental status, we should extend
        # the make_reader interfaces to take shuffling buffer parameters explicitly
        shuffling_queue = RandomShufflingBuffer(1000, 800) if shuffle_row_groups else NoopShufflingBuffer()

        # Create a dictionary with all ReaderV2 parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
            'decoder_pool': decoder_pool,
            'shuffling_queue': shuffling_queue,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        return ReaderV2(dataset_url, **kwargs)

    else:
        raise ValueError('Unexpected value of reader_engine argument \'%s\'. '
                         'Supported reader_engine values are \'reader_v1\' and \'experimental_reader_v2\''
                         % reader_engine)
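A hedged usage sketch for the make_reader factory documented above, showing a sharded, multi-epoch read with the process pool. The dataset URL and shard numbers are illustrative placeholders.

from petastorm import make_reader

# Hedged example: node 0 of a 2-node job reads its shard indefinitely. The URL is a placeholder.
with make_reader('hdfs://some_hdfs_cluster/user/yevgeni/parquet8',
                 reader_pool_type='process', workers_count=10, pyarrow_serialize=True,
                 num_epochs=None,            # None means loop over the dataset indefinitely
                 cur_shard=0, shard_count=2) as reader:
    for row in reader:
        pass  # row is a namedtuple of decoded Unischema fields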
def make_reader(dataset_url, schema_fields=None, reader_pool_type='thread', workers_count=10,
                pyarrow_serialize=False, shuffle_row_groups=True, shuffle_row_drop_partitions=1,
                predicate=None, rowgroup_selector=None, num_epochs=1, cur_shard=None, shard_count=None,
                cache_type='null', cache_location=None, cache_size_limit=None, cache_row_size_estimate=None,
                cache_extra_settings=None, hdfs_driver='libhdfs3', infer_schema=False,
                reader_engine='reader_v1', reader_engine_params=None):
    """
    Factory convenience method for :class:`Reader`.

    :param dataset_url: a filepath or a URL to a parquet directory, e.g.
        ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``
        or ``'s3://bucket/mydataset'``.
    :param schema_fields: Either a list of unischema fields to subset, or ``None`` to read all fields,
        or an NGram object, in which case an NGram of the specified properties is returned.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process',
        'dummy'] denoting a thread pool, process pool, or running everything in the master thread.
        Defaults to 'thread'
    :param workers_count: An int for the number of workers to use in the reader pool. This is only used for the
        thread or process pool. Defaults to 10
    :param pyarrow_serialize: Whether to use pyarrow for serialization. Currently only applicable to process pool.
        Defaults to False.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should pass in a unique
        shard number in the range [0, shard_count). shard_count must be supplied as well. Defaults to None
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage to store
        the entire dataset (or a partition of a dataset if shard_count is used). By default, a null cache is used.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation,
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :param infer_schema: Whether to infer the unischema object from the parquet schema. Only works for schemas
        containing certain scalar types. This option allows getting around explicitly generating petastorm
        metadata using :func:`petastorm.etl.dataset_metadata.materialize_dataset` or petastorm-generate-metadata.py
    :param reader_engine: Multiple engine implementations exist ('reader_v1' and 'experimental_reader_v2').
        'reader_v1' (the default value) selects a stable reader implementation.
    :param reader_engine_params: For advanced usage: a dictionary with arguments passed directly to a reader
        implementation constructor chosen by the ``reader_engine`` argument. You should not use this parameter
        unless you are fine-tuning a reader.
    :return: A :class:`Reader` object
    """
    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
    logger.debug('dataset_url: %s', dataset_url)

    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    filesystem = resolver.filesystem()
    dataset_path = resolver.get_dataset_path()

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskCache(cache_location, cache_size_limit, cache_row_size_estimate,
                               **cache_extra_settings or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    if reader_engine == 'reader_v1':
        if reader_pool_type == 'thread':
            reader_pool = ThreadPool(workers_count)
        elif reader_pool_type == 'process':
            reader_pool = ProcessPool(workers_count, pyarrow_serialize=pyarrow_serialize)
        elif reader_pool_type == 'dummy':
            reader_pool = DummyPool()
        else:
            raise ValueError('Unknown reader_pool_type: {}'.format(reader_pool_type))

        # Create a dictionary with all Reader parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'reader_pool': reader_pool,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
            'infer_schema': infer_schema,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        return Reader(filesystem, dataset_path, **kwargs)

    elif reader_engine == 'experimental_reader_v2':
        if reader_pool_type == 'thread':
            decoder_pool = ThreadPoolExecutor(workers_count)
        elif reader_pool_type == 'process':
            decoder_pool = ProcessPoolExecutor(workers_count)
        elif reader_pool_type == 'dummy':
            decoder_pool = SameThreadExecutor()
        else:
            raise ValueError('Unknown reader_pool_type: {}'.format(reader_pool_type))

        # TODO(yevgeni): once ReaderV2 is ready to be out of experimental status, we should extend
        # the make_reader interfaces to take shuffling buffer parameters explicitly
        shuffling_queue = RandomShufflingBuffer(1000, 800) if shuffle_row_groups else NoopShufflingBuffer()

        # Create a dictionary with all ReaderV2 parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
            'decoder_pool': decoder_pool,
            'shuffling_queue': shuffling_queue,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
            'infer_schema': infer_schema,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        return ReaderV2(dataset_url, **kwargs)

    else:
        raise ValueError('Unexpected value of reader_engine argument \'%s\'. '
                         'Supported reader_engine values are \'reader_v1\' and \'experimental_reader_v2\''
                         % reader_engine)
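For this earlier make_reader variant, a hedged sketch of the infer_schema path described in its docstring: reading a scalar-only Parquet store without Petastorm metadata. The dataset URL is a placeholder.

from petastorm import make_reader

# Hedged example: infer a Unischema from the Parquet schema itself (scalar columns only).
with make_reader('file:///tmp/scalar_only_parquet',
                 reader_pool_type='dummy',   # run everything in the master thread
                 infer_schema=True,
                 num_epochs=1) as reader:
    for row in reader:
        pass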