Example 1
    def __call__(self,
                 instances: Iterable[Instance],
                 num_epochs: int = None,
                 shuffle: bool = False) -> Iterator[TensorDict]:
        """
        Yield tensor-dict batches over ``instances`` for ``num_epochs`` epochs
        (forever when ``num_epochs`` is ``None``).  Each instance is split into
        chunks which are greedily distributed across ``self._batch_size`` queues
        so that the queues stay roughly equal in total length.
        """
        # ``instances`` is typically a list (unhashable), so we key the
        # per-dataset epoch counter on the object's id.
        key = id(instances)
        starting_epoch = self._epochs[key]

        if num_epochs is None:
            epochs: Iterable[int] = itertools.count(starting_epoch)
        else:
            epochs = range(starting_epoch, starting_epoch + num_epochs)

        for epoch in epochs:

            # In order to ensure that we are (almost) constantly streaming data to the model we
            # need to have all of the instances in memory ($$$)
            instance_list = list(instances)
            if shuffle:
                random.shuffle(instance_list)

            # We create queues for each instance in the batch, and greedily fill them to try and
            # ensure each queue's length is roughly equal in size.
            queues: List[Deque[Instance]] = [
                deque() for _ in range(self._batch_size)
            ]
            queue_lengths = np.zeros(self._batch_size, dtype=int)
            # BUG FIX: iterate the (possibly shuffled) ``instance_list``.
            # The original iterated ``instances`` directly, which silently
            # ignored the ``shuffle`` flag.
            for instance in instance_list:

                # Now we split the instance into chunks.
                chunks, length = self._split(instance)

                # Next we identify which queue is the shortest and add the chunks to that queue.
                destination = np.argmin(queue_lengths)
                queues[destination].extend(chunks)
                queue_lengths[destination] += length

            # We need a NULL instance to replace the output of an exhausted queue if we are evaluating.
            # NOTE(review): ``chunks`` is the loop variable from above, so this line
            # raises if ``instances`` is empty — confirm callers never pass an
            # empty iterable before hardening this.
            prototype = deepcopy(chunks[-1])
            new_fields: Dict[str, Field] = {}
            for name, field in prototype.fields.items():
                # Metadata is carried over as-is; every other field is blanked.
                if isinstance(field, MetadataField):
                    new_fields[name] = field
                else:
                    new_fields[name] = field.empty_field()
            blank_instance = Instance(new_fields)

            for batch in self._generate_batches(queues, blank_instance):
                if self._track_epoch:
                    add_epoch_number(batch, epoch)

                if self.vocab is not None:
                    batch.index_instances(self.vocab)

                padding_lengths = batch.get_padding_lengths()
                # NOTE(review): this yields a ``(tensor_dict, 1)`` tuple even though
                # the annotated return type is ``Iterator[TensorDict]`` — the trailing
                # ``1`` looks like a leftover but is kept for caller compatibility.
                yield batch.as_tensor_dict(padding_lengths), 1

            self._epochs[key] = epoch + 1
Example 2
    def __call__(self,
                 instances: Iterable[Instance],
                 num_epochs: int = None,
                 shuffle: bool = True) -> Iterator[TensorDict]:
        """
        Returns a generator that yields batches over the given dataset
        for the given number of epochs. If ``num_epochs`` is not specified,
        it will yield batches forever.

        Parameters
        ----------
        instances : ``Iterable[Instance]``
            The instances in the dataset. IMPORTANT: this must be able to be
            iterated over *multiple times*. That is, it must be either a List
            or some other object whose ``__iter__`` method returns a fresh iterator
            each time it's called.
        num_epochs : ``int``, optional (default=``None``)
            How times should we iterate over this dataset?  If ``None``, we will iterate over it
            forever.
        shuffle : ``bool``, optional (default=``True``)
            If ``True``, we will shuffle the instances in ``dataset`` before constructing batches
            and iterating over the data.
        """
        # Instances is likely to be a list, which cannot be used as a key,
        # so we take the object id instead.
        key = id(instances)
        starting_epoch = self._epochs[key]

        if num_epochs is None:
            epochs: Iterable[int] = itertools.count(starting_epoch)
        else:
            epochs = range(starting_epoch, starting_epoch + num_epochs)

        for epoch in epochs:
            if self._cache_instances and key in self._cache:
                # Serve the results from the cache.
                tensor_dicts = self._cache[key]

                if shuffle:
                    # NOTE(review): this shuffles the cached list in place, so the
                    # cache's stored order changes across epochs — harmless here
                    # since order is only consumed by this loop.
                    random.shuffle(tensor_dicts)
                for tensor_dict in tensor_dicts:
                    if self._track_epoch:
                        # The tensor_dict already has an "epoch_num" tensor,
                        # so just fill it with the right value.
                        epoch_tensor: torch.Tensor = tensor_dict['epoch_num']
                        epoch_tensor.fill_(epoch)
                    for split_tensor_dict in self._splitter(tensor_dict):
                        yield split_tensor_dict
            else:
                batches = self._create_batches(instances, shuffle)

                # Should we add the instances to the cache this epoch?
                add_to_cache = self._cache_instances and key not in self._cache

                for batch in batches:
                    if self._track_epoch:
                        add_epoch_number(batch, epoch)

                    if self.vocab is not None:
                        batch.index_instances(self.vocab)

                    # In order to make  gradient updates fair in expectation,
                    # we randomly choose a sequence to cutoff at.
                    all_instance_lengths = [
                        instance.get_padding_lengths()
                        for instance in batch.instances
                    ]
                    random_instance = random.choice(all_instance_lengths)
                    truncate_at = random_instance['tokens']['num_tokens']
                    padding_lengths = batch.get_padding_lengths()
                    # BUG FIX: log message previously misspelled as 'trunacate at'.
                    logger.debug('truncate at: %s', truncate_at)
                    logger.debug('padding_lengths: %s', padding_lengths)

                    tensor_dict = batch.as_tensor_dict(padding_lengths)

                    if add_to_cache:
                        self._cache[key].append(tensor_dict)

                    for split_tensor_dict in self._splitter(
                            tensor_dict, truncate_at):
                        yield split_tensor_dict

            # Increment epoch tracker
            self._epochs[key] = epoch + 1