def UDPOS(root, split):
    # Download the archive (verifying its MD5 checksum) and extract it.
    # URL, MD5, NUM_LINES, DATASET_NAME and the private helpers used below
    # are module-level definitions in the surrounding dataset module.
    dataset_tar = download_from_url(URL, root=root, hash_value=MD5, hash_type='md5')
    extracted_files = extract_archive(dataset_tar)
    # The validation split is stored on disk as "dev.txt".
    if split == 'valid':
        path = _find_match("dev.txt", extracted_files)
    else:
        path = _find_match(split + ".txt", extracted_files)
    return _RawTextIterableDataset(DATASET_NAME, NUM_LINES[split],
                                   _create_data_from_iob(path))
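
# A minimal usage sketch (an assumption for illustration, not part of the
# module): each item should be one tagged-sentence record as produced by
# _create_data_from_iob. The '.data' directory name is arbitrary.
train_iter = UDPOS(root='.data', split='train')
for i, columns in enumerate(train_iter):
    print(columns)      # one tokenized, tagged sentence per item
    if i == 2:          # peek at the first few samples only
        break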
def WikiText103(root, split):
    dataset_tar = download_from_url(URL, root=root, hash_value=MD5, hash_type='md5')
    extracted_files = extract_archive(dataset_tar)
    path = _find_match(split, extracted_files)
    logging.info('Creating {} data'.format(split))
    return _RawTextIterableDataset('WikiText103', NUM_LINES[split],
                                   iter(io.open(path, encoding="utf8")))
def WikiText2(root, split):
    dataset_tar = download_from_url(URL, root=root, hash_value=MD5, hash_type='md5')
    extracted_files = extract_archive(dataset_tar)
    path = _find_match(split, extracted_files)
    logging.info('Creating {} data'.format(split))
    return _RawTextIterableDataset(DATASET_NAME, NUM_LINES[split],
                                   _read_text_iterator(path))
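
# A minimal usage sketch (an assumption for illustration, not part of the
# module) that applies to both WikiText builders above: each item is one
# raw text line, so a quick smoke test just pulls the first few lines.
train_iter = WikiText2(root='.data', split='train')
for i, line in enumerate(train_iter):
    print(repr(line))
    if i == 4:
        break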
def DBpedia(root, split):
    dataset_tar = download_from_url(URL, root=root, path=os.path.join(root, _PATH),
                                    hash_value=MD5, hash_type='md5')
    extracted_files = extract_archive(dataset_tar)
    path = _find_match(split + '.csv', extracted_files)
    return _RawTextIterableDataset(DATASET_NAME, NUM_LINES[split],
                                   _create_data_from_csv(path))
def DBpedia(root, split):
    # Variant of the DBpedia builder above, with the CSV-reading helper inlined.
    def _create_data_from_csv(data_path):
        # Each CSV row is label-first; yield the label as an int and the
        # remaining columns joined into a single text string.
        with io.open(data_path, encoding="utf8") as f:
            reader = unicode_csv_reader(f)
            for row in reader:
                yield int(row[0]), ' '.join(row[1:])
    dataset_tar = download_from_url(URL, root=root, path=os.path.join(root, _PATH),
                                    hash_value=MD5, hash_type='md5')
    extracted_files = extract_archive(dataset_tar)
    path = _find_match(split + '.csv', extracted_files)
    return _RawTextIterableDataset("DBpedia", NUM_LINES[split],
                                   _create_data_from_csv(path))
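
# A minimal usage sketch (an assumption for illustration, not part of the
# module): each item is an (int label, str text) pair, so tallying class
# labels over a slice of the stream is straightforward.
import itertools
from collections import Counter

test_iter = DBpedia(root='.data', split='test')
label_counts = Counter(label for label, _ in itertools.islice(test_iter, 1000))
print(label_counts)     # label distribution over the first 1000 rows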