def _init_file_record_count(self, recordio_files): self._data_blocks = [] start = 0 for file_path in recordio_files: with closing(recordio.Index(file_path)) as rio: num_records = rio.num_records() end = start + num_records self._data_blocks.append(RecordBlock(file_path, start, end)) start = end
def _collect_file_records_from_dir(data_dir): if not data_dir: return {} f_records = {} for f in os.listdir(data_dir): p = os.path.join(data_dir, f) with closing(recordio.Index(p)) as rio: f_records[p] = rio.num_records() return f_records
def create_shards(self):
    """Describe each entry of the configured data directory as a shard.

    The directory is read from ``self._kwargs["data_dir"]``.

    Returns:
        A dict mapping ``os.path.join(data_dir, name)`` to a tuple
        ``(start, count)``, where ``start`` is always 0 and ``count`` is
        the result of ``num_records()`` on that file's RecordIO index.
    """
    directory = self._kwargs["data_dir"]
    # Every shard starts at 0; this value never changes in this method.
    first_record = 0
    paths = [os.path.join(directory, name) for name in os.listdir(directory)]
    shards = {}
    for path in paths:
        with closing(recordio.Index(path)) as index:
            shards[path] = (first_record, index.num_records())
    return shards