def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
    """Build the indexed dataset and log how long the build took.

    The three arguments are forwarded unchanged to ``make_indexed_dataset``;
    their exact meaning is defined by that helper.

    Args:
        data_prefix: Forwarded to ``make_indexed_dataset`` (presumably the
            path prefix of the dataset files -- TODO confirm).
        data_impl: Forwarded to ``make_indexed_dataset``.
        skip_warmup: Forwarded to ``make_indexed_dataset``.

    Returns:
        The object returned by ``make_indexed_dataset``. Its
        ``sizes.shape[0]`` is read here and logged as the document count.
    """
    logging.info(' > building dataset index ...')

    start_time = time.time()
    indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup)
    elapsed = time.time() - start_time

    # '{:.4f}' prints four decimal places. The previous '{:4f}' only set a
    # minimum field width of 4 and left the precision at the default six.
    logging.info(' > finished creating indexed dataset in {:.4f} '
                 'seconds'.format(elapsed))
    logging.info(' number of documents: {}'.format(
        indexed_dataset.sizes.shape[0]))

    return indexed_dataset
# NOTE(review): an earlier definition with this same name is visible in this
# file; if both live in one module, this later one is the binding callers
# get -- confirm which variant is intended.
def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
    """Build the indexed dataset, sanity-check it, and log basic stats.

    The three arguments are forwarded unchanged to ``make_indexed_dataset``;
    their exact meaning is defined by that helper.

    Args:
        data_prefix: Forwarded to ``make_indexed_dataset`` (presumably the
            path prefix of the dataset files -- TODO confirm).
        data_impl: Forwarded to ``make_indexed_dataset``.
        skip_warmup: Forwarded to ``make_indexed_dataset``.

    Returns:
        The object returned by ``make_indexed_dataset``. It must expose
        ``sizes`` and ``doc_idx`` attributes, both of which are read here.

    Raises:
        AssertionError: If ``sizes.shape[0]`` differs from ``doc_idx[-1]``.
    """
    logging.info(' > building dataset index ...')

    start_time = time.time()
    indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup)

    # The last entry of doc_idx must equal the number of entries in sizes.
    # The message makes a mismatch diagnosable from the traceback alone.
    num_sentences = indexed_dataset.sizes.shape[0]
    assert num_sentences == indexed_dataset.doc_idx[-1], (
        'indexed dataset is inconsistent: sizes has {} entries but '
        'doc_idx[-1] is {}'.format(num_sentences, indexed_dataset.doc_idx[-1]))

    elapsed = time.time() - start_time
    # '{:.4f}' prints four decimal places. The previous '{:4f}' only set a
    # minimum field width of 4 and left the precision at the default six.
    logging.info(' > finished creating indexed dataset in {:.4f} '
                 'seconds'.format(elapsed))

    logging.info(' > indexed dataset stats:')
    # doc_idx has one more entry than there are documents, hence the -1.
    logging.info(' number of documents: {}'.format(
        indexed_dataset.doc_idx.shape[0] - 1))
    logging.info(' number of sentences: {}'.format(num_sentences))

    return indexed_dataset