def test_copy_not_null_rows_cli(tmpdir, synthetic_dataset):
    target_url = 'file://' + os.path.join(tmpdir.strpath, 'copied_data')
    _main([
        synthetic_dataset.url,
        target_url,
        '--not-null-fields', 'string_array_nullable'
    ])
    with ReaderV2(target_url, num_epochs=1) as reader:
        not_null_data = list(reader)
        assert len(not_null_data) < len(synthetic_dataset.data)
def test_copy_some_fields_with_repartition_cli(tmpdir, synthetic_dataset):
    target_path = os.path.join(tmpdir.strpath, 'copied_data')
    target_url = 'file://' + target_path
    _main([
        synthetic_dataset.url,
        target_url,
        '--field-regex', r'\bid\b',
        '--partition-count', '1'
    ])

    # Check repartitioning
    assert 1 == len(glob.glob(os.path.join(target_path, 'part-*')))

    # Check that the regex filter worked
    with ReaderV2(target_url, num_epochs=1) as reader:
        assert list(reader.schema.fields.keys()) == ['id']
def test_copy_and_overwrite_cli(tmpdir, synthetic_dataset):
    target_url = 'file:///' + os.path.join(tmpdir.strpath, 'copied_data')
    _main([synthetic_dataset.url, target_url])

    with ReaderV2(target_url, num_epochs=1) as reader:
        for row in reader:
            actual = row._asdict()
            expected = next(d for d in synthetic_dataset.data if d['id'] == actual['id'])
            np.testing.assert_equal(actual, expected)

    with pytest.raises(AnalysisException, match='already exists'):
        _main([synthetic_dataset.url, target_url])

    _main([synthetic_dataset.url, target_url, '--overwrite'])
def reader_v2_throughput(dataset_url, field_regex=None, warmup_cycles_count=300, measure_cycles_count=1000,
                         pool_type=WorkerPoolType.THREAD, loaders_count=3, decoders_count=3,
                         read_method=ReadMethod.PYTHON, shuffling_queue_size=500, min_after_dequeue=400,
                         reader_extra_args=None, pyarrow_serialize=False, spawn_new_process=True):
    """Constructs a ReaderV2 instance and uses it to perform throughput measurements.

    The function will spawn a new process if ``spawn_new_process`` is set. This is needed to make memory footprint
    measurements accurate.

    :param dataset_url: A url of the dataset to be used for measurements.
    :param field_regex: A list of regular expressions. Only fields that match one of the regex patterns will be used
      during the benchmark.
    :param warmup_cycles_count: Number of warmup cycles. During warmup cycles no measurements are being recorded.
    :param measure_cycles_count: Number of measurement cycles. Only time elapsed during measurement cycles is used
      in throughput calculations.
    :param pool_type: :class:`WorkerPoolType` enum value.
    :param loaders_count: Number of IO threads.
    :param decoders_count: Number of threads or processes used for decoding. ``pool_type`` parameter defines
      whether multiple processes or threads are used for parallel decoding.
    :param read_method: A :class:`ReadMethod` enum value that defines whether samples are read as plain Python
      objects or via Tensorflow tensors.
    :param shuffling_queue_size: Maximum number of elements in the shuffling queue.
    :param min_after_dequeue: Minimum number of elements in a shuffling queue before entries can be read from it.
    :param reader_extra_args: Extra arguments that would be passed to Reader constructor.
    :param pyarrow_serialize: When True, pyarrow.serialize library will be used for serializing decoded payloads.
    :param spawn_new_process: This function will respawn itself in a new process if the argument is True. Spawning
      a new process is needed to get an accurate memory footprint.

    :return: An instance of ``BenchmarkResult`` namedtuple with the results of the benchmark. The namedtuple has
      the following fields: `time_mean`, `samples_per_second`, `memory_info` and `cpu`
    """
    if not reader_extra_args:
        reader_extra_args = dict()

    if spawn_new_process:
        args = copy.deepcopy(locals())
        args['spawn_new_process'] = False
        executor = ProcessPoolExecutor(1)
        future = executor.submit(reader_v2_throughput, **args)
        return future.result()

    logger.info('Arguments: %s', locals())

    if 'schema_fields' not in reader_extra_args:
        unischema_fields = match_unischema_fields(get_schema_from_dataset_url(dataset_url), field_regex)
        reader_extra_args['schema_fields'] = unischema_fields

    logger.info('Fields used in the benchmark: %s', str(reader_extra_args['schema_fields']))

    decoder_pool_executor = _create_concurrent_executor(pool_type, decoders_count)

    with ReaderV2(dataset_url, num_epochs=None,
                  loader_pool=ThreadPoolExecutor(loaders_count),
                  decoder_pool=decoder_pool_executor,
                  shuffling_queue=RandomShufflingBuffer(shuffling_queue_size, min_after_dequeue),
                  **reader_extra_args) as reader:

        if read_method == ReadMethod.PYTHON:
            result = _time_warmup_and_work(reader, warmup_cycles_count, measure_cycles_count)
        elif read_method == ReadMethod.TF:
            result = _time_warmup_and_work_tf(reader, warmup_cycles_count, measure_cycles_count, 0, 0)
        else:
            raise RuntimeError('Unexpected read_method value: %s' % read_method)

    return result
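

# Hedged usage sketch (not part of the original module): one way reader_v2_throughput might be invoked
# directly. The dataset URL is a placeholder and the helper name is hypothetical; the result fields come
# from the BenchmarkResult namedtuple described in the docstring above, and WorkerPoolType/ReadMethod are
# the enums referenced in the function signature.
def _example_reader_v2_benchmark_run():
    results = reader_v2_throughput('file:///tmp/some_petastorm_dataset',
                                   field_regex=[r'\bid\b'],
                                   warmup_cycles_count=100,
                                   measure_cycles_count=500,
                                   pool_type=WorkerPoolType.THREAD,
                                   read_method=ReadMethod.PYTHON)
    print('mean cycle time: {} s; throughput: {} samples/s'.format(
        results.time_mean, results.samples_per_second))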
@pytest.mark.parametrize('reader_factory', MINIMAL_READER_FLAVOR_FACTORIES)
def test_reading_subset_of_columns_using_regex(synthetic_dataset, reader_factory):
    """Just a bunch of read and compares of all values to the expected values"""
    with reader_factory(synthetic_dataset.url, schema_fields=['id$', 'id_.*$', 'partition_key$']) as reader:
        # Read a bunch of entries from the dataset and compare the data to reference
        for row in reader:
            actual = dict(row._asdict())
            assert set(actual.keys()) == {'id_float', 'id_odd', 'id', 'partition_key'}
            expected = next(d for d in synthetic_dataset.data if d['id'] == actual['id'])
            np.testing.assert_equal(expected['id_float'], actual['id_float'])


@pytest.mark.parametrize('reader_factory', [
    lambda url, **kwargs: make_reader(url, reader_pool_type='dummy', **kwargs),
    lambda url, **kwargs: make_batch_reader(url, reader_pool_type='dummy', **kwargs),
    lambda url, **kwargs: ReaderV2(url, loader_pool=SameThreadExecutor(),
                                   decoder_pool=SameThreadExecutor(), **kwargs)])
def test_shuffle(synthetic_dataset, reader_factory):
    rows_count = len(synthetic_dataset.data)

    # Read ids twice without shuffle: assert we have the same array and all expected ids are in the array
    with reader_factory(synthetic_dataset.url, shuffle_row_groups=False) as reader_1:
        first_readout = _readout_all_ids(reader_1)
    with reader_factory(synthetic_dataset.url, shuffle_row_groups=False) as reader_2:
        second_readout = _readout_all_ids(reader_2)
    np.testing.assert_array_equal(range(rows_count), sorted(first_readout))
    np.testing.assert_array_equal(first_readout, second_readout)

    # Now read with shuffling
    with reader_factory(synthetic_dataset.url, shuffle_row_groups=True) as shuffled_reader:
        shuffled_readout = _readout_all_ids(shuffled_reader)
from concurrent.futures import ProcessPoolExecutor

import pyarrow  # noqa: F401 pylint: disable=W0611
import torch

from petastorm import make_reader
from petastorm.pytorch import _sanitize_pytorch_types, DataLoader, decimal_friendly_collate
from petastorm.reader import ReaderV2
from petastorm.tests.test_common import TestSchema

BATCHABLE_FIELDS = set(TestSchema.fields.values()) - \
    {TestSchema.matrix_nullable, TestSchema.string_array_nullable,
     TestSchema.matrix_string, TestSchema.empty_matrix_string}

# pylint: disable=unnecessary-lambda
MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: make_reader(url, reader_pool_type='dummy', **kwargs),
    lambda url, **kwargs: ReaderV2(url, **kwargs)
]

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: make_reader(url, reader_pool_type='thread', **kwargs),
    lambda url, **kwargs: make_reader(url, reader_pool_type='process', pyarrow_serialize=False, **kwargs),
    lambda url, **kwargs: make_reader(url, reader_pool_type='process', workers_count=1,
                                      pyarrow_serialize=True, **kwargs),
    lambda url, **kwargs: ReaderV2(url, decoder_pool=ProcessPoolExecutor(10), **kwargs)
]
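

# Hedged sketch (not part of the original test module): how one of the factory lambdas above is typically
# consumed together with petastorm's pytorch DataLoader. The reader is restricted to BATCHABLE_FIELDS so
# the default collate can stack every column into torch tensors. The helper name and dataset URL are
# assumptions for illustration only.
def _example_dataloader_roundtrip(dataset_url='file:///tmp/synthetic_petastorm_dataset'):
    reader_factory = MINIMAL_READER_FLAVOR_FACTORIES[0]  # make_reader with a dummy pool
    with DataLoader(reader_factory(dataset_url, schema_fields=list(BATCHABLE_FIELDS))) as loader:
        batch = next(iter(loader))
        # DataLoader yields dicts mapping field names to batched torch.Tensor values
        assert isinstance(batch['id'], torch.Tensor)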
from concurrent.futures import ProcessPoolExecutor

from petastorm.reader import Reader, ReaderV2
from petastorm.reader_impl.same_thread_executor import SameThreadExecutor
from petastorm.selectors import SingleIndexSelector
from petastorm.shuffle_options import ShuffleOptions
from petastorm.tests.test_common import create_test_dataset, TestSchema
from petastorm.tests.test_end_to_end_predicates_impl import \
    PartitionKeyInSetPredicate, EqualPredicate
from petastorm.unischema import UnischemaField, Unischema
from petastorm.workers_pool.dummy_pool import DummyPool
from petastorm.workers_pool.process_pool import ProcessPool
from petastorm.workers_pool.thread_pool import ThreadPool

# pylint: disable=unnecessary-lambda
MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: Reader(url, reader_pool=DummyPool(), **kwargs),
    lambda url, **kwargs: ReaderV2(url, **kwargs)
]

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: Reader(url, reader_pool=ThreadPool(10), **kwargs),
    lambda url, **kwargs: Reader(url, reader_pool=ProcessPool(10), **kwargs),
    lambda url, **kwargs: ReaderV2(url, decoder_pool=ProcessPoolExecutor(10), **kwargs)
]


def _check_simple_reader(reader, expected_data):
    # Read a bunch of entries from the dataset and compare the data to reference
    def _type(v):
        return v.dtype if isinstance(v, np.ndarray) else type(v)