Example #1
    def get_data_iterator(self,
                          path: str,
                          batch_size: int,
                          is_train: bool,
                          shuffle: bool = True,
                          shuffle_seed: int = 0,
                          offset: int = 0) -> ShardedDataIterator:
        # resolve the path (which may be a glob pattern) into concrete files
        data_files = glob.glob(path)
        logger.info("Data files: %s", data_files)
        if not data_files:
            raise RuntimeError("No data files found")
        # preprocess the raw files if needed, then load the serialized samples
        preprocessed_data_files = self._get_preprocessed_files(
            data_files, is_train)
        data = read_serialized_data_from_files(preprocessed_data_files)

        # shard the samples across distributed workers and batch within each shard
        iterator = ShardedDataIterator(data,
                                       shard_id=self.shard_id,
                                       num_shards=self.distributed_factor,
                                       batch_size=batch_size,
                                       shuffle=shuffle,
                                       shuffle_seed=shuffle_seed,
                                       offset=offset)

        # apply deserialization hook
        iterator.apply(lambda sample: sample.on_deserialize())
        return iterator
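The iterator.apply(...) call at the end is a visitor hook: after samples are read back from serialized files, each one gets a chance to rebuild state that was not serialized. Below is a minimal, self-contained sketch of that pattern; Sample and MiniIterator are illustrative stand-ins, not the actual DPR classes.

from dataclasses import dataclass, field
from typing import Callable, List

@dataclass
class Sample:
    token_ids: List[int] = field(default_factory=list)
    is_restored: bool = False

    def on_deserialize(self):
        # hook invoked once per sample after loading serialized data,
        # e.g. to rebuild tensors or other state that was not pickled
        self.is_restored = True

class MiniIterator:
    def __init__(self, data: List[Sample]):
        self.data = data

    def apply(self, visitor: Callable[[Sample], None]):
        # visit every sample in place, mirroring the apply() call above
        for sample in self.data:
            visitor(sample)

samples = [Sample([1, 2, 3]), Sample([4, 5])]
MiniIterator(samples).apply(lambda s: s.on_deserialize())
assert all(s.is_restored for s in samples)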
Example #2
    def get_data_iterator(
        self,
        path: str,
        batch_size: int,
        is_train: bool,
        shuffle: bool = True,
        shuffle_seed: int = 0,
        offset: int = 0,
    ) -> ShardedDataIterator:

        # preprocess only once: either there is a single worker, or this is
        # the rank-0 process of a distributed run
        run_preprocessing = (self.distributed_factor == 1
                             or self.cfg.local_rank in [-1, 0])

        # Original, raw gold passages
        gold_passages_src = self.cfg.gold_passages_src
        if gold_passages_src:
            if not is_train:
                gold_passages_src = self.cfg.gold_passages_src_dev

            assert os.path.exists(
                gold_passages_src
            ), "Please specify valid gold_passages_src/gold_passages_src_dev"

        # Processed, 100-word split gold passages
        gold_passages_processed = (self.cfg.gold_passages_processed if is_train
                                   else self.cfg.gold_passages_processed_dev)

        if self.wiki_data is None:
            self.wiki_data = TokenizedWikipediaPassages(
                data_file=self.cfg.wiki_psgs_tokenized)

        bm25_retrieval_results = self.cfg.bm25_retrieval_results if is_train else None
        dataset = ExtractiveReaderGeneralDataset(
            path,
            bm25_retrieval_results,
            self.wiki_data,
            is_train,
            gold_passages_src,
            gold_passages_processed,
            self.tensorizer,
            run_preprocessing,
            self.cfg.num_workers,
            debugging=self.debugging,
        )

        dataset.load_data()

        iterator = ShardedDataIterator(
            dataset,
            shard_id=self.shard_id,
            num_shards=self.distributed_factor,
            batch_size=batch_size,
            shuffle=shuffle,
            shuffle_seed=shuffle_seed,
            offset=offset,
        )

        # apply deserialization hook
        iterator.apply(lambda sample: sample.on_deserialize())
        return iterator
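The shard_id and num_shards arguments split the dataset across distributed workers so that each process batches only its own slice. Here is a toy sketch of that splitting; the shard_batches helper and its contiguous-slice assumption are illustrative, not the library's actual implementation (which also handles shuffling, seeds, and uneven shards).

from typing import Iterator, List, Sequence, TypeVar

T = TypeVar("T")

def shard_batches(data: Sequence[T],
                  shard_id: int,
                  num_shards: int,
                  batch_size: int) -> Iterator[List[T]]:
    # each shard takes a contiguous slice of the data, then batches locally
    shard_size = len(data) // num_shards
    start = shard_id * shard_size
    shard = data[start:start + shard_size]
    for i in range(0, len(shard), batch_size):
        yield list(shard[i:i + batch_size])

# worker 1 of 2 over 10 samples with batch size 2 sees samples 5..9
batches = list(shard_batches(range(10), shard_id=1, num_shards=2, batch_size=2))
assert batches == [[5, 6], [7, 8], [9]]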
Example #3
    def get_data_iterator(
        self,
        path: str,
        batch_size: int,
        is_train: bool,
        shuffle: bool = True,
        shuffle_seed: int = 0,
        offset: int = 0,
    ) -> ShardedDataIterator:

        # preprocess only once: either there is a single worker, or this is
        # the rank-0 process of a distributed run
        run_preprocessing = (
            self.distributed_factor == 1 or self.cfg.local_rank in [-1, 0]
        )

        gold_passages_src = self.cfg.gold_passages_src
        if gold_passages_src:
            if not is_train:
                gold_passages_src = self.cfg.gold_passages_src_dev

            assert os.path.exists(
                gold_passages_src
            ), "Please specify valid gold_passages_src/gold_passages_src_dev"

        dataset = ExtractiveReaderDataset(
            path,
            is_train,
            gold_passages_src,
            self.tensorizer,
            run_preprocessing,
            self.cfg.num_workers,
        )

        dataset.load_data()

        iterator = ShardedDataIterator(
            dataset,
            shard_id=self.shard_id,
            num_shards=self.distributed_factor,
            batch_size=batch_size,
            shuffle=shuffle,
            shuffle_seed=shuffle_seed,
            offset=offset,
        )

        # apply deserialization hook
        iterator.apply(lambda sample: sample.on_deserialize())
        return iterator
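Finally, a hypothetical caller wiring one of these iterators into a training loop. The reader_trainer object, the file path, and the epoch count are assumptions for illustration; iterate_ds_data is the epoch-aware batch loop exposed by DPR's ShardedDataIterator in recent versions, so substitute the corresponding method if your version differs.

# hypothetical caller: reader_trainer stands in for whatever class
# defines the get_data_iterator method shown above
train_iterator = reader_trainer.get_data_iterator(
    path="data/reader_train.pkl",  # illustrative path
    batch_size=16,
    is_train=True,
    shuffle=True,
    shuffle_seed=0,
)

for epoch in range(3):
    # iterate_ds_data is assumed from DPR's ShardedDataIterator; adjust
    # if your version exposes a different iteration method
    for batch in train_iterator.iterate_ds_data(epoch=epoch):
        ...  # run the forward/backward pass on this batch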