def test_copy_not_null_rows_cli(tmpdir, synthetic_dataset):
    target_url = 'file://' + os.path.join(tmpdir.strpath, 'copied_data')

    _main([
        synthetic_dataset.url, target_url, '--not-null-fields',
        'string_array_nullable'
    ])
    with ReaderV2(target_url, num_epochs=1) as reader:
        not_null_data = list(reader)
    assert len(not_null_data) < len(synthetic_dataset.data)


def test_copy_some_fields_with_repartition_cli(tmpdir, synthetic_dataset):
    target_path = os.path.join(tmpdir.strpath, 'copied_data')
    target_url = 'file://' + target_path
    _main([
        synthetic_dataset.url, target_url, '--field-regex', r'\bid\b',
        '--partition-count', '1'
    ])

    # Check repartitioning
    assert 1 == len(glob.glob(os.path.join(target_path, 'part-*')))

    # Check that the regex filter worked
    with ReaderV2(target_url, num_epochs=1) as reader:
        assert list(reader.schema.fields.keys()) == ['id']


def test_copy_and_overwrite_cli(tmpdir, synthetic_dataset):
    target_url = 'file://' + os.path.join(tmpdir.strpath, 'copied_data')
    _main([synthetic_dataset.url, target_url])

    with ReaderV2(target_url, num_epochs=1) as reader:
        for row in reader:
            actual = row._asdict()
            expected = next(d for d in synthetic_dataset.data
                            if d['id'] == actual['id'])
            np.testing.assert_equal(actual, expected)

    with pytest.raises(AnalysisException, match='already exists'):
        _main([synthetic_dataset.url, target_url])

    _main([synthetic_dataset.url, target_url, '--overwrite'])
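

# The three CLI tests above exercise --field-regex, --not-null-fields, --partition-count and
# --overwrite individually. A minimal sketch only (combining the flags in a single invocation is
# an illustrative assumption, not taken from the original tests):
def example_copy_cli_sketch(tmpdir, synthetic_dataset):
    target_url = 'file://' + os.path.join(tmpdir.strpath, 'copied_data')
    _main([
        synthetic_dataset.url, target_url,
        '--field-regex', r'\bid\b',   # keep only fields whose names match the regex
        '--partition-count', '1',     # repartition the copy into a single output file
        '--overwrite',                # replace the target if it already exists
    ])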
Example 4
def reader_v2_throughput(dataset_url, field_regex=None, warmup_cycles_count=300, measure_cycles_count=1000,
                         pool_type=WorkerPoolType.THREAD, loaders_count=3, decoders_count=3,
                         read_method=ReadMethod.PYTHON, shuffling_queue_size=500, min_after_dequeue=400,
                         reader_extra_args=None, pyarrow_serialize=False, spawn_new_process=True):
    """Constructs a ReaderV2 instance and uses it to performs throughput measurements.

    The function will spawn a new process if ``spawn_new_process`` is set. This is needed to make memory footprint
    measurements accurate.

    :param dataset_url: A url of the dataset to be used for measurements.
    :param field_regex:  A list of regular expressions. Only fields that match one of the regex patterns will be used
      during the benchmark.
    :param warmup_cycles_count: Number of warmup cycles. During warmup cycles no measurements are recorded.
    :param measure_cycles_count: Number of measurement cycles. Only time elapsed during measurement cycles is used
      in throughput calculations.
    :param pool_type: :class:`WorkerPoolType` enum value.
    :param loaders_count: Number of IO threads.
    :param decoders_count: Number of threads or processes used for decoding. ``pool_type`` parameter defines
      whether multiple processes or threads are used for parallel decoding.
    :param read_method: A :class:`ReadMethod` enum value that defines whether rows are read with plain Python or
      via the TensorFlow tensor interface.
    :param shuffling_queue_size: Maximum number of elements in the shuffling queue.
    :param min_after_dequeue: Minimum number of elements in a shuffling queue before entries can be read from it.
    :param reader_extra_args: Extra keyword arguments passed to the :class:`ReaderV2` constructor.
    :param pyarrow_serialize: When True, pyarrow.serialize library will be used for serializing decoded payloads.
    :param spawn_new_process: This function will respawn itself in a new process if the argument is True. Spawning
      a new process is needed to get an accurate memory footprint.

    :return: An instance of ``BenchmarkResult`` namedtuple with the results of the benchmark. The namedtuple has
      the following fields: `time_mean`, `samples_per_second`, `memory_info` and `cpu`
    """
    if not reader_extra_args:
        reader_extra_args = dict()

    if spawn_new_process:
        args = copy.deepcopy(locals())
        args['spawn_new_process'] = False
        executor = ProcessPoolExecutor(1)
        future = executor.submit(reader_v2_throughput, **args)
        return future.result()

    logger.info('Arguments: %s', locals())

    if 'schema_fields' not in reader_extra_args:
        unischema_fields = match_unischema_fields(get_schema_from_dataset_url(dataset_url), field_regex)
        reader_extra_args['schema_fields'] = unischema_fields

    logger.info('Fields used in the benchmark: %s', str(reader_extra_args['schema_fields']))

    decoder_pool_executor = _create_concurrent_executor(pool_type, decoders_count)

    with ReaderV2(dataset_url, num_epochs=None,
                  loader_pool=ThreadPoolExecutor(loaders_count),
                  decoder_pool=decoder_pool_executor,
                  shuffling_queue=RandomShufflingBuffer(shuffling_queue_size, min_after_dequeue),
                  **reader_extra_args) as reader:

        if read_method == ReadMethod.PYTHON:
            result = _time_warmup_and_work(reader, warmup_cycles_count, measure_cycles_count)
        elif read_method == ReadMethod.TF:
            result = _time_warmup_and_work_tf(reader, warmup_cycles_count, measure_cycles_count, 0, 0)
        else:
            raise RuntimeError('Unexpected read_method value: %s' % read_method)

    return result
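

# Usage sketch only: the argument values below are illustrative assumptions and
# 'file:///tmp/some_dataset' is a placeholder URL, not a dataset from this repository.
if __name__ == '__main__':
    benchmark = reader_v2_throughput(
        'file:///tmp/some_dataset',
        field_regex=[r'\bid\b'],
        warmup_cycles_count=100,
        measure_cycles_count=500,
        pool_type=WorkerPoolType.THREAD,
        read_method=ReadMethod.PYTHON,
        spawn_new_process=False)
    # BenchmarkResult fields documented above: time_mean, samples_per_second, memory_info, cpu
    print('samples/sec: %.1f, mean cycle time: %.6f s'
          % (benchmark.samples_per_second, benchmark.time_mean))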
Example 5
@pytest.mark.parametrize('reader_factory', MINIMAL_READER_FLAVOR_FACTORIES)
def test_reading_subset_of_columns_using_regex(synthetic_dataset, reader_factory):
    """Just a bunch of read and compares of all values to the expected values"""
    with reader_factory(synthetic_dataset.url, schema_fields=['id$', 'id_.*$', 'partition_key$']) as reader:
        # Read a bunch of entries from the dataset and compare the data to reference
        for row in reader:
            actual = dict(row._asdict())
            assert set(actual.keys()) == {'id_float', 'id_odd', 'id', 'partition_key'}
            expected = next(d for d in synthetic_dataset.data if d['id'] == actual['id'])
            np.testing.assert_equal(expected['id_float'], actual['id_float'])


@pytest.mark.parametrize('reader_factory', [
    lambda url, **kwargs: make_reader(url, reader_pool_type='dummy', **kwargs),
    lambda url, **kwargs: make_batch_reader(url, reader_pool_type='dummy', **kwargs),
    lambda url, **kwargs: ReaderV2(url, loader_pool=SameThreadExecutor(), decoder_pool=SameThreadExecutor(), **kwargs)])
def test_shuffle(synthetic_dataset, reader_factory):
    rows_count = len(synthetic_dataset.data)

    # Read ids twice without shuffle: assert we have the same array and all expected ids are in the array
    with reader_factory(synthetic_dataset.url, shuffle_row_groups=False) as reader_1:
        first_readout = _readout_all_ids(reader_1)
    with reader_factory(synthetic_dataset.url, shuffle_row_groups=False) as reader_2:
        second_readout = _readout_all_ids(reader_2)

    np.testing.assert_array_equal(range(rows_count), sorted(first_readout))
    np.testing.assert_array_equal(first_readout, second_readout)

    # Now read with shuffling and make sure the order differs from the unshuffled readout
    with reader_factory(synthetic_dataset.url, shuffle_row_groups=True) as shuffled_reader:
        shuffled_readout = _readout_all_ids(shuffled_reader)
    assert np.any(np.not_equal(first_readout, shuffled_readout))
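

# _readout_all_ids is used above but not included in this snippet. A minimal hypothetical
# sketch of such a helper (assumption: every row exposes an 'id' field; make_batch_reader
# yields an array of ids per batch, hence np.atleast_1d to cover both cases):
def _readout_all_ids(reader):
    ids = []
    for row in reader:
        ids.extend(np.atleast_1d(row.id))
    return ids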
Example 6
from concurrent.futures import ProcessPoolExecutor

import pyarrow  # noqa: F401 pylint: disable=W0611
import torch

from petastorm import make_reader
from petastorm.pytorch import _sanitize_pytorch_types, DataLoader, decimal_friendly_collate
from petastorm.reader import ReaderV2
from petastorm.tests.test_common import TestSchema

BATCHABLE_FIELDS = set(TestSchema.fields.values()) - \
                   {TestSchema.matrix_nullable, TestSchema.string_array_nullable,
                    TestSchema.matrix_string, TestSchema.empty_matrix_string}

# pylint: disable=unnecessary-lambda
MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: make_reader(url, reader_pool_type='dummy', **kwargs),
    lambda url, **kwargs: ReaderV2(url, **kwargs)
]

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: make_reader(url, reader_pool_type='thread', **kwargs),
    lambda url, **kwargs: make_reader(url, reader_pool_type='process',
                                      pyarrow_serialize=False, **kwargs),
    lambda url, **kwargs: make_reader(url, reader_pool_type='process',
                                      workers_count=1, pyarrow_serialize=True, **kwargs),
    lambda url, **kwargs: ReaderV2(url, decoder_pool=ProcessPoolExecutor(10), **kwargs)
]
Example 7
from concurrent.futures import ProcessPoolExecutor

import numpy as np

from petastorm.reader import Reader, ReaderV2
from petastorm.reader_impl.same_thread_executor import SameThreadExecutor
from petastorm.selectors import SingleIndexSelector
from petastorm.shuffle_options import ShuffleOptions
from petastorm.tests.test_common import create_test_dataset, TestSchema
from petastorm.tests.test_end_to_end_predicates_impl import \
    PartitionKeyInSetPredicate, EqualPredicate
from petastorm.unischema import UnischemaField, Unischema
from petastorm.workers_pool.dummy_pool import DummyPool
from petastorm.workers_pool.process_pool import ProcessPool
from petastorm.workers_pool.thread_pool import ThreadPool

# pylint: disable=unnecessary-lambda
MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: Reader(url, reader_pool=DummyPool(), **kwargs),
    lambda url, **kwargs: ReaderV2(url, **kwargs)
]

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: Reader(url, reader_pool=ThreadPool(10), **kwargs),
    lambda url, **kwargs: Reader(url, reader_pool=ProcessPool(10), **kwargs),
    lambda url, **kwargs: ReaderV2(url, decoder_pool=ProcessPoolExecutor(10), **kwargs)
]


def _check_simple_reader(reader, expected_data):
    # Read a bunch of entries from the dataset and compare the data to reference
    def _type(v):
        return v.dtype if isinstance(v, np.ndarray) else type(v)