Esempio n. 1
0
    def _memory_sized_lists(
            self, instances: Iterable[Instance]) -> Iterable[List[Instance]]:
        """
        Breaks the dataset into "memory-sized" lists of instances,
        which it yields up one at a time until it gets through a full epoch.

        For example, if the dataset is already an in-memory list, and each epoch
        represents one pass through the dataset, it just yields back the dataset.
        Whereas if the dataset is lazily read from disk and we've specified to
        load 1000 instances at a time, then it yields lists of 1000 instances each.
        """
        lazy = is_lazy(instances)

        # Get an iterator over the next epoch worth of instances.
        iterator = self._take_instances(instances, self._instances_per_epoch)

        # We have four different cases to deal with:

        # With lazy instances and no guidance about how many to load into memory,
        # we just load ``batch_size`` instances at a time:
        if lazy and self._max_instances_in_memory is None:
            yield from lazy_groups_of(iterator, self._batch_size)
        # If we specified max instances in memory, lazy or not, we just
        # load ``max_instances_in_memory`` instances at a time:
        elif self._max_instances_in_memory is not None:
            yield from lazy_groups_of(iterator, self._max_instances_in_memory)
        # If we have non-lazy instances, and we want all instances each epoch,
        # then we just yield back the list of instances:
        elif self._instances_per_epoch is None:
            yield ensure_list(instances)
        # In the final case we have non-lazy instances, we want a specific number
        # of instances each epoch, and we didn't specify how to many instances to load
        # into memory. So we convert the whole iterator to a list:
        else:
            yield list(iterator)
Esempio n. 2
0
    def run(self) -> None:
        has_reader = self._dataset_reader is not None
        if has_reader:
            for batch in lazy_groups_of(self._get_instance_data(), self._batch_size):
                for model_input_instance, result in zip(batch, self._predict_instances(batch)):
                    self._maybe_print_to_console_and_file(result, str(model_input_instance))
        else:
            for batch_json in lazy_groups_of(self._get_json_data(), self._batch_size):
                for model_input_json, result in zip(batch_json, self._predict_json(batch_json)):
                    self._maybe_print_to_console_and_file(result, json.dumps(model_input_json))

        if self._output_file is not None:
            self._output_file.close()
Esempio n. 3
0
    def _create_batches(self, instances: Iterable[Instance],
                        shuffle: bool) -> Iterable[Batch]:
        for instance_list in self._memory_sized_lists(instances):

            instance_list = sort_by_padding(instance_list, self._sorting_keys,
                                            self.vocab, self._padding_noise)

            batches = []
            for batch_instances in lazy_groups_of(iter(instance_list),
                                                  self._batch_size):
                for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(
                        batch_instances):
                    batches.append(Batch(possibly_smaller_batches))

            move_to_front = self._biggest_batch_first and len(batches) > 1
            if move_to_front:
                # We'll actually pop the last _two_ batches, because the last one might not be full.
                last_batch = batches.pop()
                penultimate_batch = batches.pop()
            if shuffle:
                # NOTE: if shuffle is false, the data will still be in a different order
                # because of the bucket sorting.
                random.shuffle(batches)
            if move_to_front:
                batches.insert(0, penultimate_batch)
                batches.insert(0, last_batch)

            yield from batches
Esempio n. 4
0
 def _create_batches(self, instances: Iterable[Instance],
                     shuffle: bool) -> Iterable[Batch]:
     # First break the dataset into memory-sized lists:
     for instance_list in self._memory_sized_lists(instances):
         if shuffle:
             random.shuffle(instance_list)
         iterator = iter(instance_list)
         # Then break each memory-sized list into batches.
         for batch_instances in lazy_groups_of(iterator, self._batch_size):
             for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(
                     batch_instances):
                 batch = Batch(possibly_smaller_batches)
                 yield batch