Beispiel #1
0
def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
    """Build the indexed dataset and log the build time and its size.

    Args:
        data_prefix: Forwarded unchanged to ``make_indexed_dataset``.
        data_impl: Forwarded unchanged to ``make_indexed_dataset``.
        skip_warmup: Forwarded unchanged to ``make_indexed_dataset``.

    Returns:
        The object returned by ``make_indexed_dataset``.
    """
    logging.info(' > building dataset index ...')

    start_time = time.time()
    indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup)
    elapsed = time.time() - start_time

    # Use '.4f' (four decimals). The previous '4f' only set a minimum field
    # width of 4, which never affects a float printed with six decimals.
    logging.info(' > finished creating indexed dataset in %.4f seconds', elapsed)
    # The length of the first axis of ``sizes`` is reported as the document count.
    logging.info('    number of documents: %s', indexed_dataset.sizes.shape[0])

    return indexed_dataset
Beispiel #2
0
def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
    """Build the indexed dataset, sanity-check it, and log basic statistics.

    Args:
        data_prefix: Forwarded unchanged to ``make_indexed_dataset``.
        data_impl: Forwarded unchanged to ``make_indexed_dataset``.
        skip_warmup: Forwarded unchanged to ``make_indexed_dataset``.

    Returns:
        The object returned by ``make_indexed_dataset``.

    Raises:
        AssertionError: If the length of ``sizes`` does not equal the last
            entry of ``doc_idx``.
    """
    logging.info(' > building dataset index ...')

    start_time = time.time()
    indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup)

    num_sentences = indexed_dataset.sizes.shape[0]
    # Explicit check instead of a bare ``assert`` so it still runs under
    # ``python -O``; AssertionError is kept so existing callers see the same
    # exception type.
    if num_sentences != indexed_dataset.doc_idx[-1]:
        raise AssertionError(
            'indexed dataset is inconsistent: sizes has {} entries but '
            'doc_idx ends at {}'.format(num_sentences, indexed_dataset.doc_idx[-1])
        )

    elapsed = time.time() - start_time
    # Use '.4f' (four decimals). The previous '4f' only set a minimum field
    # width of 4, which never affects a float printed with six decimals.
    logging.info(' > finished creating indexed dataset in %.4f seconds', elapsed)

    logging.info(' > indexed dataset stats:')
    # The document count is reported as one less than the length of ``doc_idx``.
    logging.info('    number of documents: %s', indexed_dataset.doc_idx.shape[0] - 1)
    logging.info('    number of sentences: %s', num_sentences)

    return indexed_dataset