Exemple #1
0
 def get_cc_shard(self, shard: int) -> process_wet_file.CCShardReader:
     """Build the reader for one shard of ``self.dump``.

     Args:
         shard: Forwarded to the reader as its ``shard`` keyword.

     Returns:
         A ``process_wet_file.CCShardReader`` configured with this object's
         ``dump``, ``num_shards``, ``num_segments_per_shard`` and ``min_len``.
     """
     reader_factory = process_wet_file.CCShardReader
     shard_reader = reader_factory(
         self.dump,
         shard=shard,
         num_shards=self.num_shards,
         num_segments_per_shard=self.num_segments_per_shard,
         min_len=self.min_len,
     )
     return shard_reader
Exemple #2
0
    def get_cc_shard(self, shard: int) -> process_wet_file.CCShardReader:
        """Build the reader for one shard of ``self.dump``.

        When ``self.cache_dir`` is set, a per-dump sub-directory
        (``self.cache_dir / self.dump``) is created if it does not exist yet
        and is handed to the reader as its ``cache_dir``. Otherwise the reader
        receives ``cache_dir=None``.

        Args:
            shard: Forwarded to the reader as its ``shard`` keyword.

        Returns:
            A ``process_wet_file.CCShardReader`` configured with this object's
            ``dump``, ``num_shards``, ``num_segments_per_shard`` and ``min_len``.
        """
        dump_cache: Optional[Path] = None
        if self.cache_dir:
            dump_cache = self.cache_dir / self.dump
            # parents=True creates cache_dir itself and any missing ancestors
            # in the same call, so a fresh nested cache location no longer
            # fails with FileNotFoundError.
            dump_cache.mkdir(parents=True, exist_ok=True)

        return process_wet_file.CCShardReader(
            self.dump,
            shard=shard,
            num_shards=self.num_shards,
            num_segments_per_shard=self.num_segments_per_shard,
            min_len=self.min_len,
            cache_dir=dump_cache,
        )