# Example 1
def UDPOS(root, split):
    """Load the raw UDPOS dataset for *split* ('train', 'valid', or 'test')."""
    archive = download_from_url(URL, root=root, hash_value=MD5, hash_type='md5')
    members = extract_archive(archive)
    # The archive names the validation file "dev.txt"; map 'valid' onto it.
    target = "dev.txt" if split == 'valid' else split + ".txt"
    data_path = _find_match(target, members)
    return _RawTextIterableDataset(DATASET_NAME, NUM_LINES[split],
                                   _create_data_from_iob(data_path))
# Example 2
def WikiText103(root, split):
    """Load the raw WikiText103 dataset split as an iterable of text lines."""
    dataset_tar = download_from_url(URL, root=root, hash_value=MD5, hash_type='md5')
    extracted_files = extract_archive(dataset_tar)

    path = _find_match(split, extracted_files)
    # Lazy %-style args: the message is only formatted when INFO is enabled.
    logging.info('Creating %s data', split)
    # NOTE(review): the file handle is intentionally not closed here — the
    # returned dataset consumes it lazily as an iterator.
    return _RawTextIterableDataset('WikiText103',
                                   NUM_LINES[split], iter(io.open(path, encoding="utf8")))
# Example 3
def WikiText2(root, split):
    """Load the raw WikiText2 dataset split as an iterable of text lines."""
    dataset_tar = download_from_url(URL,
                                    root=root,
                                    hash_value=MD5,
                                    hash_type='md5')
    extracted_files = extract_archive(dataset_tar)
    path = _find_match(split, extracted_files)
    # Lazy %-style args: avoids formatting the message when INFO is disabled.
    logging.info('Creating %s data', split)
    return _RawTextIterableDataset(DATASET_NAME, NUM_LINES[split],
                                   _read_text_iterator(path))
# Example 4
def DBpedia(root, split):
    """Load the raw DBpedia dataset for the requested split."""
    # Download (or reuse) the archive at a fixed path, verifying its MD5.
    archive = download_from_url(URL,
                                root=root,
                                path=os.path.join(root, _PATH),
                                hash_value=MD5,
                                hash_type='md5')
    members = extract_archive(archive)

    csv_path = _find_match('{}.csv'.format(split), members)
    return _RawTextIterableDataset(DATASET_NAME, NUM_LINES[split],
                                   _create_data_from_csv(csv_path))
# Example 5
def DBpedia(root, split):
    """Load the raw DBpedia split, yielding (label, text) pairs."""
    def _rows_from_csv(csv_file):
        # Stream (label, text) tuples: column 0 is the integer label,
        # the remaining columns are joined into one text field.
        with io.open(csv_file, encoding="utf8") as handle:
            for record in unicode_csv_reader(handle):
                yield int(record[0]), ' '.join(record[1:])

    archive = download_from_url(URL,
                                root=root,
                                path=os.path.join(root, _PATH),
                                hash_value=MD5,
                                hash_type='md5')
    extracted = extract_archive(archive)

    csv_path = _find_match(split + '.csv', extracted)
    return _RawTextIterableDataset("DBpedia", NUM_LINES[split],
                                   _rows_from_csv(csv_path))