Example 1
    def test_multiple_cursors(self):
        # pylint: disable=protected-access
        lazy_instances1 = _LazyInstances(lambda: (i for i in self.instances))
        lazy_instances2 = _LazyInstances(lambda: (i for i in self.instances))

        eager_instances1 = self.instances[:]
        eager_instances2 = self.instances[:]

        for instances1, instances2 in [(eager_instances1, eager_instances2),
                                       (lazy_instances1, lazy_instances2)]:
            iterator = BasicIterator(batch_size=1, instances_per_epoch=2)
            iterator.index_with(self.vocab)

            # First epoch through dataset1
            batches = list(iterator._create_batches(instances1, shuffle=False))
            grouped_instances = [batch.instances for batch in batches]
            assert grouped_instances == [[self.instances[0]], [self.instances[1]]]

            # First epoch through dataset2
            batches = list(iterator._create_batches(instances2, shuffle=False))
            grouped_instances = [batch.instances for batch in batches]
            assert grouped_instances == [[self.instances[0]], [self.instances[1]]]

            # Second epoch through dataset1
            batches = list(iterator._create_batches(instances1, shuffle=False))
            grouped_instances = [batch.instances for batch in batches]
            assert grouped_instances == [[self.instances[2]], [self.instances[3]]]

            # Second epoch through dataset2
            batches = list(iterator._create_batches(instances2, shuffle=False))
            grouped_instances = [batch.instances for batch in batches]
            assert grouped_instances == [[self.instances[2]], [self.instances[3]]]
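This test only passes if ``_LazyInstances`` hands out a fresh iterator every time it is iterated. Below is a minimal sketch of such a wrapper, under the assumption that the real class simply stores a zero-argument generator function (the actual class may take extra arguments such as a cache file):

from typing import Callable, Iterator

class LazyInstancesSketch:
    """Illustrative stand-in for _LazyInstances: wraps a function that builds
    a fresh generator, so the data can be iterated any number of times."""

    def __init__(self, instance_generator: Callable[[], Iterator]) -> None:
        self.instance_generator = instance_generator

    def __iter__(self) -> Iterator:
        # Every iteration calls the generator function again, so two loops
        # over the same object never share a cursor.
        return self.instance_generator()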
Example 2
    def test_multiple_cursors(self):
        # pylint: disable=protected-access
        instances1 = _LazyInstances(lambda:
                                    (i for i in self.instance_iterable))
        instances2 = _LazyInstances(lambda:
                                    (i for i in self.instance_iterable))

        iterator = LazyBasicIterator(batch_size=1, instances_per_epoch=2)
        iterator.index_with(self.vocab)

        # First epoch through dataset1
        batches = list(iterator._create_batches(instances1, shuffle=False))
        grouped_instances = [batch.instances for batch in batches]
        assert grouped_instances == [[self.instances[0]], [self.instances[1]]]

        # First epoch through dataset2
        batches = list(iterator._create_batches(instances2, shuffle=False))
        grouped_instances = [batch.instances for batch in batches]
        assert grouped_instances == [[self.instances[0]], [self.instances[1]]]

        # Second epoch through dataset1
        batches = list(iterator._create_batches(instances1, shuffle=False))
        grouped_instances = [batch.instances for batch in batches]
        assert grouped_instances == [[self.instances[2]], [self.instances[3]]]

        # Second epoch through dataset2
        batches = list(iterator._create_batches(instances2, shuffle=False))
        grouped_instances = [batch.instances for batch in batches]
        assert grouped_instances == [[self.instances[2]], [self.instances[3]]]
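The assertions above also imply that the iterator keeps a separate cursor per dataset object, so each epoch resumes where the previous one stopped. A hedged sketch of that bookkeeping (the names below are illustrative, not the library's actual attributes):

import itertools
from typing import Any, Dict, Iterable, Iterator, List

class CursorSketch:
    """Keeps one iterator ("cursor") per dataset object and takes
    instances_per_epoch items from it on every call."""

    def __init__(self, instances_per_epoch: int) -> None:
        self.instances_per_epoch = instances_per_epoch
        self._cursors: Dict[int, Iterator[Any]] = {}

    def take_epoch(self, instances: Iterable[Any]) -> List[Any]:
        key = id(instances)
        if key not in self._cursors:
            self._cursors[key] = iter(instances)
        return list(itertools.islice(self._cursors[key], self.instances_per_epoch))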
Example 3
    def test_multiple_cursors(self):

        lazy_instances1 = _LazyInstances(lambda: (i for i in self.instances))
        lazy_instances2 = _LazyInstances(lambda: (i for i in self.instances))

        eager_instances1 = self.instances[:]
        eager_instances2 = self.instances[:]

        for instances1, instances2 in [
            (eager_instances1, eager_instances2),
            (lazy_instances1, lazy_instances2),
        ]:
            iterator = BasicIterator(batch_size=1, instances_per_epoch=2)
            iterator.index_with(self.vocab)

            # First epoch through dataset1
            batches = list(iterator._create_batches(instances1, shuffle=False))
            grouped_instances = [batch.instances for batch in batches]
            assert grouped_instances == [[self.instances[0]], [self.instances[1]]]

            # First epoch through dataset2
            batches = list(iterator._create_batches(instances2, shuffle=False))
            grouped_instances = [batch.instances for batch in batches]
            assert grouped_instances == [[self.instances[0]], [self.instances[1]]]

            # Second epoch through dataset1
            batches = list(iterator._create_batches(instances1, shuffle=False))
            grouped_instances = [batch.instances for batch in batches]
            assert grouped_instances == [[self.instances[2]], [self.instances[3]]]

            # Second epoch through dataset2
            batches = list(iterator._create_batches(instances2, shuffle=False))
            grouped_instances = [batch.instances for batch in batches]
            assert grouped_instances == [[self.instances[2]], [self.instances[3]]]
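Both branches of the loop above pass because lists and the ``_LazyInstances`` wrapper are repeatedly iterable, unlike a bare generator. A small illustration with plain integers:

plain_gen = (i for i in range(4))
assert list(plain_gen) == [0, 1, 2, 3]
assert list(plain_gen) == []            # a bare generator is exhausted after one pass

numbers = [0, 1, 2, 3]
assert list(numbers) == [0, 1, 2, 3]
assert list(numbers) == [0, 1, 2, 3]    # a list can be iterated again and again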
Example 4
    def read(self, *args, **kwargs) -> Iterable[Instance]:
        """
        Returns an ``Iterable`` containing all the instances
        in the specified dataset.

        If ``self.lazy`` is False, this calls ``self._read()``,
        ensures that the result is a list, then returns the resulting list.

        If ``self.lazy`` is True, this returns an object whose
        ``__iter__`` method calls ``self._read()`` each iteration.
        In this case your implementation of ``_read()`` must also be lazy
        (that is, not load all instances into memory at once), otherwise
        you will get a ``ConfigurationError``.

        In either case, the returned ``Iterable`` can be iterated
        over multiple times. It's unlikely you want to override this function,
        but if you do your result should likewise be repeatedly iterable.
        """
        lazy = getattr(self, 'lazy', None)
        if lazy is None:
            logger.warning(
                "DatasetReader.lazy is not set, "
                "did you forget to call the superclass constructor?")

        if lazy:
            return _LazyInstances(lambda: iter(self._read(*args, **kwargs)))
        else:
            instances = self._read(*args, **kwargs)
            if not isinstance(instances, list):
                instances = [instance for instance in Tqdm.tqdm(instances)]
            if not instances:
                raise ConfigurationError(
                    f"No instances were read from the given args ({args}). "
                    f"and kwargs ({kwargs})Is the path correct?")
            return instances
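To make the docstring's contract concrete, here is a hedged usage sketch; ``MyReader`` is a hypothetical subclass whose ``_read`` yields instances lazily, and the file name is only a placeholder:

reader = MyReader(lazy=True)            # hypothetical subclass with a lazy _read
dataset = reader.read("train.jsonl")    # returns a _LazyInstances wrapper

first_pass = list(dataset)              # invokes MyReader._read(...)
second_pass = list(dataset)             # invokes MyReader._read(...) again
assert len(first_pass) == len(second_pass)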
Example 5
    def read(self, file_path: str) -> Iterable[Instance]:
        """
        Returns an ``Iterable`` containing all the instances
        in the specified dataset.

        If ``self.lazy`` is False, this calls ``self._read()``,
        ensures that the result is a list, then returns the resulting list.

        If ``self.lazy`` is True, this returns an object whose
        ``__iter__`` method calls ``self._read()`` each iteration.
        In this case your implementation of ``_read()`` must also be lazy
        (that is, not load all instances into memory at once), otherwise
        you will get a ``ConfigurationError``.

        In either case, the returned ``Iterable`` can be iterated
        over multiple times. It's unlikely you want to override this function,
        but if you do your result should likewise be repeatedly iterable.
        """
        # self.folder_path = file_path
        img_file = os.path.join(file_path, 'imgs.tsv')
        imgid2img = self.load_feature(img_file)

        lazy = getattr(self, 'lazy', None)

        if lazy is None:
            logger.warning("DatasetReader.lazy is not set, "
                           "did you forget to call the superclass constructor?")

        if self._cache_directory:
            cache_file = self._get_cache_location_for_file_path(file_path)
        else:
            cache_file = None

        if lazy:
            return _LazyInstances(lambda: self._read(file_path, imgid2img),
                                  cache_file,
                                  self.deserialize_instance,
                                  self.serialize_instance)
        else:
            # First we read the instances, either from a cache or from the original file.
            if cache_file and os.path.exists(cache_file):
                instances = self._instances_from_cache_file(cache_file)
            else:
                instances = self._read(file_path, imgid2img)

            # Then some validation.
            if not isinstance(instances, list):
                instances = [instance for instance in Tqdm.tqdm(instances)]
            if not instances:
                raise ConfigurationError("No instances were read from the given filepath {}. "
                                         "Is the path correct?".format(file_path))

            # And finally we write to the cache if we need to.
            if cache_file and not os.path.exists(cache_file):
                logger.info(f"Caching instances to {cache_file}")
                self._instances_to_cache_file(cache_file, instances)

            return instances
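When ``self._cache_directory`` is set, the example above maps the data file path to a cache file. A minimal sketch of that kind of mapping (the real ``_get_cache_location_for_file_path`` may derive the name differently; this is only an assumption):

import hashlib
import os

def cache_location(cache_directory: str, file_path: str) -> str:
    # Derive a stable file name from the data path so repeated reads of the
    # same file reuse the same cache entry.
    digest = hashlib.md5(file_path.encode("utf-8")).hexdigest()
    return os.path.join(cache_directory, digest + ".cache")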
Example 6
    def read(self, file_path: str) -> Iterable[Instance]:
        return _LazyInstances(self._one_epoch)
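This one-liner works provided that ``self._one_epoch`` is a bound generator method: each call produces a brand-new iterator, which is exactly the zero-argument callable ``_LazyInstances`` expects. A self-contained sketch of that pattern:

class OneEpochReaderSketch:
    def __init__(self, examples):
        self.examples = examples      # placeholder backing store

    def _one_epoch(self):
        # A fresh generator is created on every call, so wrapping the bound
        # method in _LazyInstances yields a repeatedly iterable dataset.
        for example in self.examples:
            yield example

sketch = OneEpochReaderSketch([1, 2, 3])
assert list(sketch._one_epoch()) == [1, 2, 3]
assert list(sketch._one_epoch()) == [1, 2, 3]   # a new pass every call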
Example 7
    def read(self, file_path: str) -> Iterable[Instance]:
        """
        Returns an ``Iterable`` containing all the instances
        in the specified dataset.

        If ``self.lazy`` is False, this calls ``self._read()``,
        ensures that the result is a list, then returns the resulting list.

        If ``self.lazy`` is True, this returns an object whose
        ``__iter__`` method calls ``self._read()`` each iteration.
        In this case your implementation of ``_read()`` must also be lazy
        (that is, not load all instances into memory at once), otherwise
        you will get a ``ConfigurationError``.

        In either case, the returned ``Iterable`` can be iterated
        over multiple times. It's unlikely you want to override this function,
        but if you do your result should likewise be repeatedly iterable.
        """
        lazy = getattr(self, 'lazy', None)
        if lazy is None:
            logger.warning(
                "DatasetReader.lazy is not set, "
                "did you forget to call the superclass constructor?")
        if lazy:
            return _LazyInstances(lambda: iter(self._read(file_path)))
        else:
            if self.cache_path is not None:
                # create a key for the file based on the reader config
                hash_ = self.get_hash(file_path)
                pathlib.Path(self.cache_path).mkdir(parents=True,
                                                    exist_ok=True)
                cache_file = os.path.join(self.cache_path, (hash_ + '.cache'))
                if not os.path.exists(cache_file) or self.overwrite_cache:
                    instances = self._read(file_path)
                    if not isinstance(instances, list):
                        instances = [
                            instance for instance in Tqdm.tqdm(instances)
                        ]
                    if not instances:
                        raise ConfigurationError(
                            "No instances were read from the given filepath {}. "
                            "Is the path correct?".format(file_path))
                    logger.info(f'caching instances to file: {cache_file}')

                    with open(cache_file, 'wb') as cache:
                        dill.dump(instances, cache)
                else:
                    logger.info(
                        f'Reading instances from cache file: {cache_file}')
                    with open(cache_file, 'rb') as f_in:
                        instances = dill.load(f_in)
            else:
                instances = self._read(file_path)
                if not isinstance(instances, list):
                    instances = [instance for instance in Tqdm.tqdm(instances)]
                if not instances:
                    raise ConfigurationError(
                        "No instances were read from the given filepath {}. "
                        "Is the path correct?".format(file_path))
            return instances
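For reference, the caching in this example boils down to a ``dill`` round trip. A minimal sketch, assuming ``dill`` is installed and using placeholder objects instead of real ``Instance``s:

import dill

def save_instances(instances, cache_file: str) -> None:
    # Serialize the whole list in one shot, as the example above does.
    with open(cache_file, "wb") as cache:
        dill.dump(instances, cache)

def load_instances(cache_file: str):
    with open(cache_file, "rb") as f_in:
        return dill.load(f_in)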