Ejemplo n.º 1
0
    def _iter_impl(self):
        """
        The Data Loader iterator stops the for-loop when reader runs out of samples.
        """
        # As we iterate over incoming samples, we are going to store them in `self._batch_acc`, until we have a batch of
        # the requested batch_size ready.

        keys = None
        if self.shuffling_queue_capacity > 0:
            # We can not know what is the reasonable number to use for the extra capacity, so we set a huge number
            # and give up on the unbound growth protection mechanism.
            # To keep the same behavior as DataLoader, we need to increase the shuffling_queue_capacity
            min_after_dequeue = self.shuffling_queue_capacity - 1
            shuffling_queue_capacity = min_after_dequeue + self.batch_size
            self._shuffling_buffer = BatchedRandomShufflingBuffer(
                shuffling_queue_capacity,
                min_after_retrieve=min_after_dequeue,
                extra_capacity=100000000,
                batch_size=self.batch_size)
        else:
            self._shuffling_buffer = BatchedNoopShufflingBuffer(
                batch_size=self.batch_size)

        for row in self.reader:
            # Default collate does not work nicely on namedtuples and treat them as lists
            # Using dict will result in the yielded structures being dicts as well
            row_as_dict = row._asdict()

            keys = row_as_dict.keys()

            # Promote some types that are incompatible with pytorch to be pytorch friendly.
            _sanitize_pytorch_types(row_as_dict)

            # Add rows to shuffling buffer
            for k, v in row_as_dict.items():
                if not self.reader.is_batched_reader:
                    row_as_dict[k] = self.transform_fn([v])
                else:
                    row_as_dict[k] = self.transform_fn(v)
            self._shuffling_buffer.add_many(row_as_dict.values())

            # _yield_batches will emit as much batches as are allowed by the shuffling_buffer (RandomShufflingBuffer
            # will avoid underflowing below a certain number of samples to guarantee some samples decorrelation)
            for batch in self._yield_batches(keys):
                yield batch

        # Once reader can not read new rows, we might still have a bunch of rows waiting in the shuffling buffer.
        # Telling shuffling buffer that we are finished allows to deplete the buffer completely, regardless its
        # min_after_dequeue setting.
        self._shuffling_buffer.finish()

        for batch in self._yield_batches(keys):
            yield batch
Ejemplo n.º 2
0
    def __init__(self, reader, batch_size=1, transform_fn=None,
                 shuffling_queue_capacity=0):
        """
        Initializes a data loader object.

        Number of epochs is defined by the configuration of the reader argument.

        An optional shuffling queue is created if shuffling_queue_capacity is greater than 0. No samples will be
        returned to a user by the ``BatchedDataLoader`` until the queue is full. After that, batches of `batch_size`
        will be created by uniformly sampling the shuffling queue. Once no more samples are available from the data
        reader, the shuffling queue is allowed to be consumed till no further samples are available.

        Note that the last returned batch could have less then ``batch_size`` samples.

        NOTE: if you are using ``make_batch_reader``, this shuffling queue will be randomizing the order of the
        entire batches and not changing the order of elements within a batch. This is likely not what you intend to do.

        This class does not support special types that are not supported in PyTorch (decimal/string).

        :param reader: petastorm Reader instance
        :param batch_size: the number of items to return per batch; factored into the len() of this reader
        :param transform_fn: an optional callable to convert batches from the reader to PyTorch tensors
        :param shuffling_queue_capacity: Queue capacity is passed to the underlying :class:`tf.RandomShuffleQueue`
          instance. If set to 0, no suffling will be done.
        """
        self.reader = reader
        self.batch_size = batch_size
        self.transform_fn = transform_fn or torch.as_tensor

        # _batch_acc accumulates samples for a single batch.
        self._batch_acc = []
        if shuffling_queue_capacity > 0:
            # We can not know what is the reasonable number to use for the extra capacity, so we set a huge number
            # and give up on the unbound growth protection mechanism.
            # To keep the same behavior as DataLoader, we need to increase the shuffling_queue_capacity
            min_after_dequeue = shuffling_queue_capacity - 1
            shuffling_queue_capacity = min_after_dequeue + batch_size
            self._shuffling_buffer = BatchedRandomShufflingBuffer(
                shuffling_queue_capacity,
                min_after_retrieve=min_after_dequeue,
                extra_capacity=100000000,
                batch_size=batch_size
            )
        else:
            self._shuffling_buffer = BatchedNoopShufflingBuffer(batch_size=batch_size)
def test_batched_buffer_stream_through(batch_size):
    """Feed a 0:99 sequence through a BatchedNoopShufflingBuffer. Check that the order has not changed."""
    expected = list(range(100))
    actual = _feed_a_sequence_through_the_queue(BatchedNoopShufflingBuffer(batch_size), expected)
    assert expected == actual
Ejemplo n.º 4
0
class BatchedDataLoader(object):
    """
    Same as DataLoader except it uses torch-based shuffling buffers which enable batched buffering
    (significantly faster for small data).
    """

    def __init__(self, reader, batch_size=1, transform_fn=None,
                 shuffling_queue_capacity=0):
        """
        Initializes a data loader object.

        Number of epochs is defined by the configuration of the reader argument.

        An optional shuffling queue is created if shuffling_queue_capacity is greater than 0. No samples will be
        returned to a user by the ``BatchedDataLoader`` until the queue is full. After that, batches of `batch_size`
        will be created by uniformly sampling the shuffling queue. Once no more samples are available from the data
        reader, the shuffling queue is allowed to be consumed till no further samples are available.

        Note that the last returned batch could have less then ``batch_size`` samples.

        NOTE: if you are using ``make_batch_reader``, this shuffling queue will be randomizing the order of the
        entire batches and not changing the order of elements within a batch. This is likely not what you intend to do.

        This class does not support special types that are not supported in PyTorch (decimal/string).

        :param reader: petastorm Reader instance
        :param batch_size: the number of items to return per batch; factored into the len() of this reader
        :param transform_fn: an optional callable to convert batches from the reader to PyTorch tensors
        :param shuffling_queue_capacity: Queue capacity is passed to the underlying :class:`tf.RandomShuffleQueue`
          instance. If set to 0, no suffling will be done.
        """
        self.reader = reader
        self.batch_size = batch_size
        self.transform_fn = transform_fn or torch.as_tensor

        # _batch_acc accumulates samples for a single batch.
        self._batch_acc = []
        if shuffling_queue_capacity > 0:
            # We can not know what is the reasonable number to use for the extra capacity, so we set a huge number
            # and give up on the unbound growth protection mechanism.
            # To keep the same behavior as DataLoader, we need to increase the shuffling_queue_capacity
            min_after_dequeue = shuffling_queue_capacity - 1
            shuffling_queue_capacity = min_after_dequeue + batch_size
            self._shuffling_buffer = BatchedRandomShufflingBuffer(
                shuffling_queue_capacity,
                min_after_retrieve=min_after_dequeue,
                extra_capacity=100000000,
                batch_size=batch_size
            )
        else:
            self._shuffling_buffer = BatchedNoopShufflingBuffer(batch_size=batch_size)

    def __iter__(self):
        """
        The Data Loader iterator stops the for-loop when reader runs out of samples.
        """
        # As we iterate over incoming samples, we are going to store them in `self._batch_acc`, until we have a batch of
        # the requested batch_size ready.

        keys = None

        for row in self.reader:
            # Default collate does not work nicely on namedtuples and treat them as lists
            # Using dict will result in the yielded structures being dicts as well
            row_as_dict = row._asdict()

            keys = row_as_dict.keys()

            # Promote some types that are incompatible with pytorch to be pytorch friendly.
            _sanitize_pytorch_types(row_as_dict)

            # Add rows to shuffling buffer
            for k, v in row_as_dict.items():
                if not self.reader.is_batched_reader:
                    row_as_dict[k] = self.transform_fn([v])
                else:
                    row_as_dict[k] = self.transform_fn(v)
            self._shuffling_buffer.add_many(row_as_dict.values())

            # _yield_batches will emit as much batches as are allowed by the shuffling_buffer (RandomShufflingBuffer
            # will avoid underflowing below a certain number of samples to guarantee some samples decorrelation)
            for batch in self._yield_batches(keys):
                yield batch

        # Once reader can not read new rows, we might still have a bunch of rows waiting in the shuffling buffer.
        # Telling shuffling buffer that we are finished allows to deplete the buffer completely, regardless its
        # min_after_dequeue setting.
        self._shuffling_buffer.finish()

        for batch in self._yield_batches(keys):
            yield batch

    def _yield_batches(self, keys):
        while self._shuffling_buffer.can_retrieve():
            batch = self._shuffling_buffer.retrieve()
            if not isinstance(batch, dict):
                # This is for the case of batched reads. Here we restore back the
                # dictionary format of records
                batch = dict(zip(keys, batch))
            yield batch

    # Functions needed to treat data loader as a context manager
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.reader.stop()
        self.reader.join()
Ejemplo n.º 5
0
 def _instantiate_noop_buffer():
     return BatchedNoopShufflingBuffer(batch_size=self.batch_size)