def get_data_iterator(
    self,
    path: str,
    batch_size: int,
    is_train: bool,
    shuffle: bool = True,
    shuffle_seed: int = 0,
    offset: int = 0,
) -> ShardedDataIterator:
    data_files = glob.glob(path)
    logger.info("Data files: %s", data_files)
    if not data_files:
        raise RuntimeError("No data files found")

    preprocessed_data_files = self._get_preprocessed_files(data_files, is_train)
    data = read_serialized_data_from_files(preprocessed_data_files)

    iterator = ShardedDataIterator(
        data,
        shard_id=self.shard_id,
        num_shards=self.distributed_factor,
        batch_size=batch_size,
        shuffle=shuffle,
        shuffle_seed=shuffle_seed,
        offset=offset,
    )

    # apply deserialization hook
    iterator.apply(lambda sample: sample.on_deserialize())
    return iterator

def get_data_iterator(
    self,
    path: str,
    batch_size: int,
    is_train: bool,
    shuffle: bool = True,
    shuffle_seed: int = 0,
    offset: int = 0,
) -> ShardedDataIterator:
    # Run preprocessing on a single process only: either in non-distributed
    # mode or on the rank-0 worker.
    run_preprocessing = (
        self.distributed_factor == 1 or self.cfg.local_rank in [-1, 0]
    )

    # Original, raw gold passages
    gold_passages_src = self.cfg.gold_passages_src
    if gold_passages_src:
        if not is_train:
            gold_passages_src = self.cfg.gold_passages_src_dev
        assert os.path.exists(
            gold_passages_src
        ), "Please specify valid gold_passages_src/gold_passages_src_dev"

    # Processed, 100-word split gold passages
    gold_passages_processed = (
        self.cfg.gold_passages_processed
        if is_train
        else self.cfg.gold_passages_processed_dev
    )

    # Tokenized Wikipedia passages are loaded lazily and cached on the instance
    if self.wiki_data is None:
        self.wiki_data = TokenizedWikipediaPassages(
            data_file=self.cfg.wiki_psgs_tokenized
        )

    # BM25 retrieval results are only used during training
    bm25_retrieval_results = self.cfg.bm25_retrieval_results if is_train else None

    dataset = ExtractiveReaderGeneralDataset(
        path,
        bm25_retrieval_results,
        self.wiki_data,
        is_train,
        gold_passages_src,
        gold_passages_processed,
        self.tensorizer,
        run_preprocessing,
        self.cfg.num_workers,
        debugging=self.debugging,
    )
    dataset.load_data()

    iterator = ShardedDataIterator(
        dataset,
        shard_id=self.shard_id,
        num_shards=self.distributed_factor,
        batch_size=batch_size,
        shuffle=shuffle,
        shuffle_seed=shuffle_seed,
        offset=offset,
    )

    # apply deserialization hook
    iterator.apply(lambda sample: sample.on_deserialize())
    return iterator

def get_data_iterator(
    self,
    path: str,
    batch_size: int,
    is_train: bool,
    shuffle: bool = True,
    shuffle_seed: int = 0,
    offset: int = 0,
) -> ShardedDataIterator:
    # Run preprocessing on a single process only: either in non-distributed
    # mode or on the rank-0 worker.
    run_preprocessing = (
        self.distributed_factor == 1 or self.cfg.local_rank in [-1, 0]
    )

    gold_passages_src = self.cfg.gold_passages_src
    if gold_passages_src:
        if not is_train:
            gold_passages_src = self.cfg.gold_passages_src_dev
        assert os.path.exists(
            gold_passages_src
        ), "Please specify valid gold_passages_src/gold_passages_src_dev"

    dataset = ExtractiveReaderDataset(
        path,
        is_train,
        gold_passages_src,
        self.tensorizer,
        run_preprocessing,
        self.cfg.num_workers,
    )
    dataset.load_data()

    iterator = ShardedDataIterator(
        dataset,
        shard_id=self.shard_id,
        num_shards=self.distributed_factor,
        batch_size=batch_size,
        shuffle=shuffle,
        shuffle_seed=shuffle_seed,
        offset=offset,
    )

    # apply deserialization hook
    iterator.apply(lambda sample: sample.on_deserialize())
    return iterator
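
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how a training loop might consume the iterator returned
# by get_data_iterator(). The names `trainer`, `train_files`, `batch_size`, and
# `num_epochs` are hypothetical, and `iterate_data(epoch=...)` is assumed to be
# ShardedDataIterator's epoch-wise iteration method; adjust to the actual API of
# your ShardedDataIterator implementation.
def _example_training_loop(trainer, train_files: str, batch_size: int, num_epochs: int) -> None:
    train_iterator = trainer.get_data_iterator(
        train_files,
        batch_size,
        is_train=True,
        shuffle=True,
        shuffle_seed=0,
    )
    for epoch in range(num_epochs):
        for samples_batch in train_iterator.iterate_data(epoch=epoch):
            # Each batch is a list of deserialized reader samples; build the
            # model input from it and run a single optimization step here.
            ...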