Example #1
def test_should_fail_if_reading_out_of_context_manager(synthetic_dataset):
    with make_reader(synthetic_dataset.url, workers_count=1) as reader:
        next(reader)

    with pytest.raises(RuntimeError, match='Trying to read a sample.*'):
        next(reader)
Example #2
def test_simple_read_with_pyarrow_serialize(synthetic_dataset):
    """Same as test_simple_read, but don't check type correctness as pyarrow_serialize messes up integer types"""
    with make_reader(synthetic_dataset.url, reader_pool_type='process', workers_count=1,
                     pyarrow_serialize=True) as reader:
        _check_simple_reader(reader, synthetic_dataset.data, check_types=False)
Example #3
from pyspark.sql.types import LongType, ShortType, StringType

from petastorm import make_reader
from petastorm.codecs import ScalarCodec
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.reader import ReaderV2
from petastorm.reader_impl.same_thread_executor import SameThreadExecutor
from petastorm.selectors import SingleIndexSelector
from petastorm.tests.test_common import create_test_dataset, TestSchema
from petastorm.tests.test_end_to_end_predicates_impl import \
    PartitionKeyInSetPredicate, EqualPredicate
from petastorm.unischema import UnischemaField, Unischema

# pylint: disable=unnecessary-lambda
MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: make_reader(url, reader_pool_type='dummy', **kwargs),
    lambda url, **kwargs: make_reader(
        url, reader_engine='experimental_reader_v2', **kwargs),
]

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: make_reader(url, reader_pool_type='thread', **kwargs),
    lambda url, **kwargs: make_reader(
        url, reader_pool_type='process', pyarrow_serialize=False, **kwargs),
    lambda url, **kwargs: make_reader(url,
                                      reader_pool_type='process',
                                      workers_count=1,
                                      pyarrow_serialize=True,
                                      **kwargs),
Example #4
import numpy as np

from petastorm import make_reader, TransformSpec, make_batch_reader
from petastorm.pytorch import _sanitize_pytorch_types, DataLoader, BatchedDataLoader, decimal_friendly_collate
from petastorm.tests.test_common import TestSchema

ALL_DATA_LOADERS = [DataLoader, BatchedDataLoader]

BATCHABLE_FIELDS = set(TestSchema.fields.values()) - \
    {TestSchema.matrix_nullable, TestSchema.string_array_nullable,
     TestSchema.matrix_string, TestSchema.empty_matrix_string, TestSchema.integer_nullable}

TORCH_BATCHABLE_FIELDS = BATCHABLE_FIELDS - \
    {TestSchema.decimal, TestSchema.partition_key, }

# pylint: disable=unnecessary-lambda
MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: make_reader(url, reader_pool_type='dummy', **kwargs),
]

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: make_reader(url, reader_pool_type='thread', **kwargs),
    lambda url, **kwargs: make_reader(
        url, reader_pool_type='process', workers_count=1, **kwargs),
]


def _check_simple_reader(loader, expected_data, expected_fields):
    # Read a bunch of entries from the dataset and compare the data to reference
    def _type(v):
        return v.dtype if isinstance(v, np.ndarray) else type(v)
Example #5
def reader_throughput(dataset_url,
                      field_regex=None,
                      warmup_cycles_count=300,
                      measure_cycles_count=1000,
                      pool_type=WorkerPoolType.THREAD,
                      loaders_count=3,
                      profile_threads=False,
                      read_method=ReadMethod.PYTHON,
                      shuffling_queue_size=500,
                      min_after_dequeue=400,
                      reader_extra_args=None,
                      spawn_new_process=True):
    """Constructs a Reader instance and uses it to performs throughput measurements.

    The function will spawn a new process if ``spawn_separate_process`` is set. This is needed to make memory footprint
    measurements accurate.

    :param dataset_url: A url of the dataset to be used for measurements.
    :param field_regex:  A list of regular expressions. Only fields that match one of the regex patterns will be used
      during the benchmark.
    :param warmup_cycles_count: Number of warmup cycles. No measurements are recorded during warmup cycles.
    :param measure_cycles_count: Number of measurement cycles. Only time elapsed during measurement cycles is used
      in throughput calculations.
    :param pool_type: :class:`WorkerPoolType` enum value.
    :param loaders_count: Number of workers in the pool (the same worker handles both IO and decoding).
    :param profile_threads:  Enables profiling threads. Will print result when thread pool is shut down.
    :param read_method:  A :class:`ReadMethod` enum value that defines whether samples are read as plain Python
      objects or via TensorFlow tensors.
    :param shuffling_queue_size: Maximum number of elements in the shuffling queue.
    :param min_after_dequeue: Minimum number of elements in a shuffling queue before entries can be read from it.
    :param reader_extra_args: Extra arguments that would be passed to Reader constructor.
    :param spawn_new_process: This function will respawn itself in a new process if the argument is True. Spawning
      a new process is needed to get an accurate memory footprint.

    :return: An instance of ``BenchmarkResult`` namedtuple with the results of the benchmark. The namedtuple has
      the following fields: `time_mean`, `samples_per_second`, `memory_info` and `cpu`
    """
    if not reader_extra_args:
        reader_extra_args = dict()

    if spawn_new_process:
        args = copy.deepcopy(locals())
        args['spawn_new_process'] = False
        executor = ProcessPoolExecutor(1)
        future = executor.submit(reader_throughput, **args)
        return future.result()

    logger.info('Arguments: %s', locals())

    if 'schema_fields' not in reader_extra_args:
        unischema_fields = match_unischema_fields(
            get_schema_from_dataset_url(dataset_url), field_regex)
        reader_extra_args['schema_fields'] = unischema_fields

    logger.info('Fields used in the benchmark: %s',
                str(reader_extra_args['schema_fields']))

    with make_reader(dataset_url,
                     num_epochs=None,
                     reader_pool_type=str(pool_type),
                     workers_count=loaders_count,
                     **reader_extra_args) as reader:

        if read_method == ReadMethod.PYTHON:
            result = _time_warmup_and_work(reader, warmup_cycles_count,
                                           measure_cycles_count)
        elif read_method == ReadMethod.TF:
            result = _time_warmup_and_work_tf(reader, warmup_cycles_count,
                                              measure_cycles_count,
                                              shuffling_queue_size,
                                              min_after_dequeue)
        else:
            raise RuntimeError('Unexpected read_method value: %s' % read_method)

    return result
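
The docstring above spells out the benchmark knobs; purely for illustration, a hypothetical invocation might look like the sketch below (the dataset path is made up, and WorkerPoolType / ReadMethod are assumed to be importable from the same benchmark module that defines reader_throughput).

# Hypothetical usage sketch of reader_throughput; the dataset URL and the
# WorkerPoolType/ReadMethod values are assumptions, not part of the example above.
results = reader_throughput('file:///tmp/some_petastorm_dataset',
                            field_regex=[r'.*_id$'],
                            warmup_cycles_count=100,
                            measure_cycles_count=500,
                            pool_type=WorkerPoolType.THREAD,
                            loaders_count=3,
                            read_method=ReadMethod.PYTHON,
                            spawn_new_process=False)
print('%.1f samples/sec' % results.samples_per_second)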
Example #6
def test_diagnostics_reader_v1(synthetic_dataset):
    with make_reader(synthetic_dataset.url) as reader:
        next(reader)
        diags = reader.diagnostics
        # Hard to make a meaningful assert on the content of the diags without potentially introducing a race
        assert 'output_queue_size' in diags
Example #7
import numpy as np
import pytest
import six
import tensorflow as tf

from petastorm import make_reader, make_batch_reader
from petastorm.ngram import NGram
from petastorm.predicates import in_lambda
from petastorm.tests.test_common import TestSchema
from petastorm.tf_utils import make_petastorm_dataset

_EXCLUDE_FIELDS = set(TestSchema.fields.values()) \
                  - {TestSchema.matrix_nullable, TestSchema.string_array_nullable, TestSchema.decimal}

MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: make_reader(url, **_merge_params({'reader_pool_type': 'dummy',
                                                            'schema_fields': _EXCLUDE_FIELDS}, kwargs)),
]

ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: make_reader(url, **_merge_params({'reader_pool_type': 'thread', 'workers_count': 1,
                                                            'schema_fields': _EXCLUDE_FIELDS}, kwargs)),
    lambda url, **kwargs: make_reader(url, **_merge_params({'reader_pool_type': 'process', 'workers_count': 1,
                                                            'schema_fields': _EXCLUDE_FIELDS}, kwargs)),
]


def _merge_params(base, overwrite):
    """Merges two dictionaries when values from ``overwrite`` takes precedence over values of ``base`` dictionary.

    Both input parameters are not modified.
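
The body of _merge_params is cut off in this snippet; a minimal sketch consistent with its docstring (values from overwrite win on key collisions, and neither input is mutated) could look like:

def _merge_params(base, overwrite):
    # Copy base so neither input dictionary is modified; overwrite's values win on collisions.
    merged = dict(base)
    merged.update(overwrite or {})
    return merged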
Example #8
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='Petastorm MNIST Example')
    default_dataset_url = 'file://{}'.format(DEFAULT_MNIST_DATA_PATH)
    parser.add_argument(
        '--dataset-url',
        type=str,
        default=default_dataset_url,
        metavar='S',
        help='hdfs:// or file:/// URL to the MNIST petastorm dataset '
        '(default: %s)' % default_dataset_url)
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--all-epochs',
                        action='store_true',
                        default=False,
                        help='train all epochs before testing accuracy/loss')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device('cuda' if use_cuda else 'cpu')

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum)

    # Configure loop and Reader epoch for illustrative purposes.
    # Typical training usage would use the `all_epochs` approach.
    #
    if args.all_epochs:
        # Run training across all the epochs before testing for accuracy
        loop_epochs = 1
        reader_epochs = args.epochs
    else:
        # Test training accuracy after each epoch
        loop_epochs = args.epochs
        reader_epochs = 1

    transform = TransformSpec(_transform_row, removed_fields=['idx'])

    # Instantiate each petastorm Reader with a single thread, shuffle enabled, and appropriate epoch setting
    for epoch in range(1, loop_epochs + 1):
        with DataLoader(make_reader('{}/train'.format(args.dataset_url),
                                    num_epochs=reader_epochs,
                                    transform_spec=transform),
                        batch_size=args.batch_size) as train_loader:
            train(model, device, train_loader, args.log_interval, optimizer,
                  epoch)
        with DataLoader(make_reader('{}/test'.format(args.dataset_url),
                                    num_epochs=reader_epochs,
                                    transform_spec=transform),
                        batch_size=args.test_batch_size) as test_loader:
            test(model, device, test_loader)
Example #9
def test_predicate_on_partition_filters_out_everything(synthetic_dataset, reader_factory):
    with pytest.warns(UserWarning, match='No matching data is available for loading'):
        # This predicate filters out every rowgroup; the reader should warn that no matching data is available.
        make_reader(synthetic_dataset.url, reader_pool_type='dummy',
                    predicate=PartitionKeyInSetPredicate({'non existing value'}))
Example #10
def main():
    parser = argparse.ArgumentParser(
        description='Petastorm/Sagemaker/Tensorflow MNIST Example')

    # Data, model, and output directories
    # model_dir is always passed in from SageMaker. By default this is an S3 path under the default bucket.
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--sm-model-dir',
                        type=str,
                        default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train',
                        type=str,
                        default=os.environ.get('SM_CHANNEL_TRAINING'))
    parser.add_argument('--hosts',
                        type=list,
                        default=json.loads(os.environ.get('SM_HOSTS')))
    parser.add_argument('--current-host',
                        type=str,
                        default=os.environ.get('SM_CURRENT_HOST'))

    parser.add_argument('--dataset-url',
                        type=str,
                        metavar='S',
                        help='S3:// URL to the MNIST petastorm dataset')

    parser.add_argument('--training_steps', type=int, default=300)
    parser.add_argument('--evaluation_steps', type=int, default=10)
    parser.add_argument('--log_step_count_steps', type=int, default=100)
    parser.add_argument('--save_checkpoints_steps', type=int, default=500)
    parser.add_argument('--save_summary_steps', type=int, default=50)
    parser.add_argument('--throttle_secs', type=int, default=10)

    parser.add_argument('--prefetch_size', type=int, default=16)
    parser.add_argument('--num_parallel_batches', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=256)

    args = parser.parse_args()

    tf.logging.set_verbosity(tf.logging.DEBUG)

    # TF 1.13 and 1.14 handle logging a bit differently, so the logging setup is wrapped in a try/except block
    try:
        tf_logger = tf_logging._get_logger()
        handler = tf_logger.handlers[0]
        handler.setFormatter(
            _logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    except Exception:  # logging setup is best-effort; ignore failures
        pass

    # In TF 1.14, multi-worker synchronous training can be achieved using CollectiveAllReduceStrategy.
    # See https://github.com/tensorflow/tensorflow/issues/23664
    # Without providing train_distribute, training is presumably asynchronous.
    run_config = tf.estimator.RunConfig(
        save_checkpoints_steps=args.save_checkpoints_steps,
        log_step_count_steps=args.log_step_count_steps,
        save_summary_steps=args.save_summary_steps,
    )

    model_dir_parent_path = args.model_dir[:-5]
    model_dir_parent = model_dir_parent_path.split("/")[-2]

    print(
        f"Launch tensorboard by running the following in terminal:\n"
        f"aws s3 sync {model_dir_parent_path} ~/Downloads/{model_dir_parent} && "
        f"tensorboard --logdir=~/Downloads/{model_dir_parent}")

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=args.model_dir,
                                       params={"batch_size": args.batch_size},
                                       config=run_config)

    workers = json.loads(os.environ['SM_HOSTS'])
    worker_index = workers.index(os.environ['SM_CURRENT_HOST'])
    nr_workers = len(workers)
    print(
        f"Inside training script on worker with (0-based) index {worker_index} out of {nr_workers - 1}."
    )

    with make_reader(os.path.join(args.dataset_url, 'train'),
                     num_epochs=None,
                     cur_shard=worker_index,
                     shard_count=nr_workers,
                     workers_count=nr_workers) as train_reader:
        with make_reader(os.path.join(args.dataset_url, 'test'),
                         num_epochs=None,
                         cur_shard=0,
                         shard_count=1) as eval_reader:

            train_fn = lambda: _input_fn(reader=train_reader,
                                         batch_size=args.batch_size,
                                         num_parallel_batches=args.num_parallel_batches)

            eval_fn = lambda: _input_fn(reader=eval_reader,
                                        batch_size=args.batch_size,
                                        num_parallel_batches=args.num_parallel_batches)

            train_spec = tf.estimator.TrainSpec(input_fn=train_fn,
                                                max_steps=args.training_steps)

            eval_spec = tf.estimator.EvalSpec(input_fn=eval_fn,
                                              throttle_secs=args.throttle_secs,
                                              steps=args.evaluation_steps)

            tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #11
def pytorch_hello_world(dataset_url='file:///tmp/hello_world_dataset'):
    with DataLoader(make_reader(dataset_url)) as train_loader:
        sample = next(iter(train_loader))
        print(sample['id'])
Example #12
 def test_no_metadata(self):
     self.vanish_metadata()
     with self.assertRaises(RuntimeError) as e:
         make_reader(self._dataset_url, reader_pool_type='dummy')
     self.assertTrue('make_reader supports reading only Petastorm datasets' in str(e.exception))
     self.restore_metadata()
Example #13
import numpy as np
import pytest
import tensorflow as tf
from tensorflow.python.framework.errors_impl import OutOfRangeError

from petastorm import make_reader
from petastorm.ngram import NGram
from petastorm.reader_impl.same_thread_executor import SameThreadExecutor
from petastorm.tests.conftest import SyntheticDataset, maybe_cached_dataset
from petastorm.tests.test_common import create_test_dataset, TestSchema
from petastorm.tf_utils import tf_tensors

# Tests in this module will run once for each entry in the READER_FACTORIES
# pylint: disable=unnecessary-lambda
READER_FACTORIES = [
    lambda url, **kwargs: make_reader(url, reader_pool_type='dummy', **kwargs),
    lambda url, **kwargs: make_reader(
        url, reader_pool_type='process', workers_count=1, **kwargs),
    lambda url, **kwargs: make_reader(url,
                                      reader_engine='experimental_reader_v2',
                                      reader_pool_type='dummy',
                                      reader_engine_params={'loader_pool': SameThreadExecutor()},
                                      **kwargs),
]
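
As the comment above notes, tests in this module are parametrized over READER_FACTORIES. A hypothetical test illustrating that pattern (synthetic_dataset is assumed to be a dataset fixture, as in the other examples) might look like:

@pytest.mark.parametrize('reader_factory', READER_FACTORIES)
def test_can_read_one_sample(synthetic_dataset, reader_factory):
    # Each factory builds a reader with a different pool/engine configuration.
    with reader_factory(synthetic_dataset.url) as reader:
        assert next(reader) is not None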


@pytest.fixture(scope="session")
def dataset_num_files_1(request, tmpdir_factory):
    def _dataset_generator():
        path = tmpdir_factory.mktemp("data").strpath
Example #14
def test_schema_mismatch(synthetic_dataset):
    readers = [make_reader(synthetic_dataset.url, schema_fields=['id'], workers_count=1),
               make_reader(synthetic_dataset.url, schema_fields=['image_png'], workers_count=1)]

    with pytest.raises(ValueError, match='.*should have the same schema.*'):
        WeightedSamplingReader(readers, [0.5, 0.5])
Example #15
def train_and_test(dataset_url, training_iterations, batch_size, evaluation_interval):
    """
    Train a model for training_iterations iterations with batch size batch_size, printing accuracy
    every evaluation_interval iterations.
    :param dataset_url: The MNIST dataset url.
    :param training_iterations: The training iterations to train for.
    :param batch_size: The batch size for training.
    :param evaluation_interval: The interval used to print the accuracy.
    :return:
    """
    with make_reader(os.path.join(dataset_url, 'train'), num_epochs=None) as train_reader:
        with make_reader(os.path.join(dataset_url, 'test'), num_epochs=None) as test_reader:
            train_readout = tf_tensors(train_reader)
            train_image = tf.cast(tf.reshape(train_readout.image, [784]), tf.float32)
            train_label = train_readout.digit
            batch_image, batch_label = tf.train.batch(
                [train_image, train_label], batch_size=batch_size
            )

            W = tf.Variable(tf.zeros([784, 10]))
            b = tf.Variable(tf.zeros([10]))
            y = tf.matmul(batch_image, W) + b

            # The raw formulation of cross-entropy,
            #
            #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
            #                                 reduction_indices=[1]))
            #
            # can be numerically unstable.
            #
            # So here we use tf.losses.sparse_softmax_cross_entropy on the raw
            # outputs of 'y', and then average across the batch.
            cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=batch_label, logits=y)
            train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

            correct_prediction = tf.equal(tf.argmax(y, 1), batch_label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

            test_readout = tf_tensors(test_reader)
            test_image = tf.cast(tf.reshape(test_readout.image, [784]), tf.float32)
            test_label = test_readout.digit
            test_batch_image, test_batch_label = tf.train.batch(
                [test_image, test_label], batch_size=batch_size
            )

            # Train
            print('Training model for {0} training iterations with batch size {1} and evaluation interval {2}'.format(
                training_iterations, batch_size, evaluation_interval
            ))
            with tf.Session() as sess:
                sess.run([
                    tf.local_variables_initializer(),
                    tf.global_variables_initializer(),
                ])
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(sess=sess, coord=coord)
                try:
                    for i in range(training_iterations):
                        if coord.should_stop():
                            break

                        sess.run(train_step)

                        if (i % evaluation_interval) == 0 or i == (training_iterations - 1):
                            feed_batch_image, feed_batch_label = sess.run([test_batch_image, test_batch_label])
                            print('After {0} training iterations, the accuracy of the model is: {1:.2f}'.format(
                                i,
                                sess.run(accuracy, feed_dict={
                                    batch_image: feed_batch_image, batch_label: feed_batch_label
                                })))
                finally:
                    coord.request_stop()
                    coord.join(threads)
Example #16
def test_too_many_shards(synthetic_dataset, reader_factory):
    with pytest.raises(NoDataAvailableError, match='Number of row-groups in the dataset'):
        # If the number of shards is greater than the number of row-groups, users might be surprised if the reader
        # silently produces no data, hence we raise an explicit exception
        make_reader(synthetic_dataset.url, reader_pool_type='dummy', cur_shard=0, shard_count=10000000)
Example #17
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from time import sleep

import pyarrow.parquet as pq
import pytest

from petastorm import make_reader
from petastorm.reader import Reader

# pylint: disable=unnecessary-lambda
READER_FACTORIES = [
    make_reader,
    lambda url, **kwargs: make_reader(
        url, reader_engine='experimental_reader_v2', **kwargs),
]


@pytest.mark.parametrize('reader_factory', READER_FACTORIES)
def test_dataset_url_must_be_string(reader_factory):
    with pytest.raises(ValueError):
        reader_factory(None)

    with pytest.raises(ValueError):
        reader_factory(123)

    with pytest.raises(ValueError):
        reader_factory([])

Example #18
def test_basic_pytorch_dataloader(synthetic_dataset):
    loader = DataLoader(make_reader(synthetic_dataset.url,
                                    reader_pool_type='dummy'),
                        collate_fn=_noop_collate)
    for item in loader:
        assert len(item) == 1
Example #19
def test_invalid_reader_engine(synthetic_dataset, reader_factory):
    with pytest.raises(ValueError, match='Supported reader_engine values'):
        make_reader(synthetic_dataset.url, reader_engine='bogus reader engine')
Example #20
def test_pytorch_dataloader_context(synthetic_dataset):
    with DataLoader(make_reader(synthetic_dataset.url,
                                reader_pool_type='dummy'),
                    collate_fn=_noop_collate) as loader:
        for item in loader:
            assert len(item) == 1
Example #21
# Must import pyarrow before torch. See: https://github.com/uber/petastorm/blob/master/docs/troubleshoot.rst
import pyarrow  # noqa: F401 pylint: disable=W0611
import torch

from petastorm import make_reader
from petastorm.pytorch import _sanitize_pytorch_types, DataLoader, decimal_friendly_collate
from petastorm.reader import ReaderV2
from petastorm.tests.test_common import TestSchema

BATCHABLE_FIELDS = set(TestSchema.fields.values()) - \
                   {TestSchema.matrix_nullable, TestSchema.string_array_nullable,
                    TestSchema.matrix_string, TestSchema.empty_matrix_string}

# pylint: disable=unnecessary-lambda
MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: make_reader(url, reader_pool_type='dummy', **kwargs),
    lambda url, **kwargs: ReaderV2(url, **kwargs)
]

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: make_reader(url, reader_pool_type='thread', **kwargs),
    lambda url, **kwargs: make_reader(
        url, reader_pool_type='process', pyarrow_serialize=False, **kwargs),
    lambda url, **kwargs: make_reader(url,
                                      reader_pool_type='process',
                                      workers_count=1,
                                      pyarrow_serialize=True,
                                      **kwargs),
    lambda url, **kwargs: ReaderV2(url, decoder_pool=ProcessPoolExecutor(10), **kwargs)
Example #22
def test_reader_engine_v2_with_transform_is_not_supported(
        synthetic_dataset, reader_factory):
    with pytest.raises(NotImplementedError):
        make_reader(synthetic_dataset.url,
                    reader_engine='experimental_reader_v2',
                    transform_spec=TransformSpec(lambda x: x))
Example #23
 def test_no_metadata(self):
     self.vanish_metadata()
     with self.assertRaises(PetastormMetadataError) as e:
         make_reader(self._dataset_url, reader_pool_type='dummy')
     self.assertTrue('Could not find _common_metadata file' in str(e.exception))
     self.restore_metadata()
Example #24
def test_generate(petastorm_dataset):
    # Read from it using a plain reader
    with make_reader(petastorm_dataset.url) as reader:
        all_samples = list(reader)
    assert all_samples
Example #25
from petastorm import make_reader, make_batch_reader, TransformSpec
from petastorm.codecs import ScalarCodec
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.predicates import in_lambda
from petastorm.reader import ReaderV2
from petastorm.reader_impl.same_thread_executor import SameThreadExecutor
from petastorm.selectors import SingleIndexSelector
from petastorm.tests.test_common import create_test_dataset, TestSchema
from petastorm.tests.test_end_to_end_predicates_impl import \
    PartitionKeyInSetPredicate, EqualPredicate, VectorizedEqualPredicate
from petastorm.unischema import UnischemaField, Unischema

# pylint: disable=unnecessary-lambda
MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: make_reader(url, reader_pool_type='dummy', **kwargs),
    lambda url, **kwargs: make_reader(url, reader_engine='experimental_reader_v2', **kwargs),
]

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: make_reader(url, reader_pool_type='thread', **kwargs),
    lambda url, **kwargs: make_reader(url, reader_pool_type='process', workers_count=2, **kwargs),
    lambda url, **kwargs: make_reader(url, workers_count=2, reader_engine='experimental_reader_v2', **kwargs),
]

SCALAR_FIELDS = [f for f in TestSchema.fields.values() if isinstance(f.codec, ScalarCodec)]

SCALAR_ONLY_READER_FACTORIES = [
Example #26
def _check_reader(path, rowgroup_selector=None):
    # Just check that you can open and read from a reader successfully
    with make_reader('file://{}'.format(path), reader_pool_type='dummy', rowgroup_selector=rowgroup_selector) as reader:
        [next(reader) for _ in range(10)]
Example #27
def test_make_reader_fails_loading_non_petastrom_dataset(scalar_dataset):
    with pytest.raises(RuntimeError, match='use make_batch_reader'):
        make_reader(scalar_dataset.url)
Example #28

# define how to decorate the open method
def retry_open(decorated_open, retry):
    def open(self,
             path,
             mode='rb',
             buffer_size=None,
             replication=None,
             default_block_size=None):
        print('opening {}'.format(path))
        return retry.call(decorated_open,
                          self,
                          path,
                          mode=mode,
                          buffer_size=buffer_size,
                          replication=replication,
                          default_block_size=default_block_size)

    return open


# decorate open
retry = tenacity.Retrying()
HadoopFileSystem.open = retry_open(HadoopFileSystem.open, retry)

file = 'hdfs://ip-10-1-1-36.example.com/user/spark/petastorm_dataset.parquet'
with make_reader(file, hdfs_driver='libhdfs',
                 pyarrow_serialize=True) as train_reader:
    pass