Example #1
def _setup_datasets(dataset_name, root='.data'):
    select_to_index = {'train': 0, 'dev': 1}
    extracted_files = [
        download_from_url(URLS[dataset_name][select_to_index[key]], root=root)
        for key in select_to_index.keys()
    ]
    train_iter = _create_data_from_json(extracted_files[0])
    dev_iter = _create_data_from_json(extracted_files[1])
    return (RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name],
                                   train_iter),
            RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name],
                                   dev_iter))
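For context, `_create_data_from_json` is referenced above but not shown in these examples. A minimal sketch of a generator over a SQuAD-style JSON file might look like the following; the field names and the (context, question, answers) output layout are assumptions for illustration, not the torchtext implementation.

import json

def _create_data_from_json(json_path):
    # Illustrative sketch only: walk a SQuAD-style JSON file and yield one
    # (context, question, answers) tuple per question.
    with open(json_path, encoding="utf8") as f:
        raw = json.load(f)
    for article in raw["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                answers = [ans["text"] for ans in qa["answers"]]
                yield (context, qa["question"], answers)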
Example #2
def _setup_datasets(dataset_name, root, split_, offset):
    split = check_default_set(split_, ('train', 'test'), dataset_name)
    if dataset_name == 'AG_NEWS':
        extracted_files = [
            download_from_url(URLS[dataset_name][item],
                              root=root,
                              hash_value=MD5['AG_NEWS'][item],
                              hash_type='md5') for item in ('train', 'test')
        ]
    else:
        dataset_tar = download_from_url(URLS[dataset_name],
                                        root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files = extract_archive(dataset_tar)

    csv_path = {}
    for fname in extracted_files:
        if fname.endswith('train.csv'):
            csv_path['train'] = fname
        if fname.endswith('test.csv'):
            csv_path['test'] = fname
    return wrap_datasets(
        tuple(
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name][item],
                                   _create_data_from_csv(csv_path[item]),
                                   offset=offset) for item in split), split_)
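The public dataset constructors typically do little more than forward their arguments to this private helper. A hypothetical wrapper (name and defaults assumed for illustration) could look like:

def AG_NEWS(root='.data', split=('train', 'test'), offset=0):
    # Hypothetical thin wrapper for illustration: the public constructor
    # simply delegates to the _setup_datasets helper above.
    return _setup_datasets('AG_NEWS', root, split, offset)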
Example #3
def _setup_datasets(dataset_name, separator, root=".data"):

    extracted_files = []
    if isinstance(URLS[dataset_name], list):
        for f in URLS[dataset_name]:
            dataset_tar = download_from_url(f, root=root)
            extracted_files.extend(extract_archive(dataset_tar))
    elif isinstance(URLS[dataset_name], str):
        dataset_tar = download_from_url(URLS[dataset_name], root=root)
        extracted_files.extend(extract_archive(dataset_tar))
    else:
        raise ValueError(
            "URLS for {} has to be in a form or list or string".format(
                dataset_name))

    data_filenames = {
        "train": _construct_filepath(extracted_files, "train.txt"),
        "valid": _construct_filepath(extracted_files, "dev.txt"),
        "test": _construct_filepath(extracted_files, "test.txt")
    }

    datasets = []
    for key in data_filenames.keys():
        if data_filenames[key] is not None:
            datasets.append(
                RawTextIterableDataset(
                    dataset_name, NUM_LINES[dataset_name],
                    _create_data_from_iob(data_filenames[key], separator)))
        else:
            datasets.append(None)

    return datasets
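`_construct_filepath` is used here but not shown. A plausible sketch, assuming it simply returns the first extracted path ending with the requested filename (or None when that split is missing):

def _construct_filepath(paths, file_suffix):
    # Illustrative sketch: return the first extracted path whose name ends
    # with file_suffix (e.g. 'train.txt'), or None if no file matches.
    for p in paths:
        if p.endswith(file_suffix):
            return p
    return None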
Example #4
def _setup_datasets(dataset_name, separator, root, split, offset):
    extracted_files = []
    if isinstance(URLS[dataset_name], dict):
        for name, item in URLS[dataset_name].items():
            dataset_tar = download_from_url(item,
                                            root=root,
                                            hash_value=MD5[dataset_name][name],
                                            hash_type='md5')
            extracted_files.extend(extract_archive(dataset_tar))
    elif isinstance(URLS[dataset_name], str):
        dataset_tar = download_from_url(URLS[dataset_name],
                                        root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files.extend(extract_archive(dataset_tar))
    else:
        raise ValueError(
            "URLS for {} has to be in a form of dictionary or string".format(
                dataset_name))

    data_filenames = {
        "train": _construct_filepath(extracted_files, "train.txt"),
        "valid": _construct_filepath(extracted_files, "dev.txt"),
        "test": _construct_filepath(extracted_files, "test.txt")
    }
    return [
        RawTextIterableDataset(
            dataset_name,
            NUM_LINES[dataset_name][item],
            _create_data_from_iob(data_filenames[item], separator),
            offset=offset) if data_filenames[item] is not None else None
        for item in split
    ]
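`_create_data_from_iob` is also not shown. A minimal sketch under the assumption that the files are CoNLL/IOB-style, with one token per line, fields joined by `separator`, and blank lines separating sentences:

def _create_data_from_iob(data_path, separator='\t'):
    # Illustrative sketch: group lines into sentences at blank lines and
    # yield, for each sentence, a list with one column per field
    # (e.g. [words, tags]). The real torchtext helper may differ.
    with open(data_path, encoding="utf8") as f:
        columns = []
        for line in f:
            line = line.strip()
            if line == "":
                if columns:
                    yield columns
                columns = []
                continue
            for i, field in enumerate(line.split(separator)):
                if i == len(columns):
                    columns.append([])
                columns[i].append(field)
        if columns:
            yield columns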
Example #5
def _setup_datasets(dataset_name, root, split, year, language, offset):
    if dataset_name == 'PennTreebank':
        extracted_files = [download_from_url(URLS['PennTreebank'][key],
                                             root=root, hash_value=MD5['PennTreebank'][key],
                                             hash_type='md5') for key in split]
    else:
        dataset_tar = download_from_url(URLS[dataset_name], root=root, hash_value=MD5[dataset_name], hash_type='md5')
        extracted_files = extract_archive(dataset_tar)

    if dataset_name == 'WMTNewsCrawl':
        file_name = 'news.{}.{}.shuffled'.format(year, language)
        extracted_files = [f for f in extracted_files if file_name in f]

    path = {}
    for item in split:
        for fname in extracted_files:
            if item in fname:
                path[item] = fname

    datasets = []
    for item in split:
        logging.info('Creating {} data'.format(item))
        datasets.append(RawTextIterableDataset(
            dataset_name, NUM_LINES[dataset_name][item],
            iter(io.open(path[item], encoding="utf8")), offset=offset))

    return datasets
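All of these builders hand their iterators to `RawTextIterableDataset`. A minimal sketch of such a wrapper, assuming it only skips `offset` leading items and reports the remaining length via `__len__` (the real class carries more bookkeeping):

import torch.utils.data

class RawTextIterableDataset(torch.utils.data.IterableDataset):
    # Illustrative sketch: expose a plain Python iterator as an
    # IterableDataset, skipping the first `offset` items.
    def __init__(self, name, full_num_lines, iterator, offset=0):
        super().__init__()
        self.name = name
        self.full_num_lines = full_num_lines
        self._iterator = iterator
        self.start = offset
        self.num_lines = full_num_lines - offset

    def __iter__(self):
        for i, item in enumerate(self._iterator):
            if i < self.start:
                continue
            if i >= self.start + self.num_lines:
                break
            yield item

    def __len__(self):
        return self.num_lines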
Example #6
def IMDB(root='.data', split=('train', 'test'), offset=0):
    """ Defines raw IMDB datasets.

    Create supervised learning dataset: IMDB

    Separately returns the raw training and test datasets

    Args:
        root: Directory where the datasets are saved. Default: ".data"
        split: a string or tuple for the returned datasets. Default: ('train', 'test')
            By default, both datasets (train, test) are generated. Users can also choose any one or both of them,
            for example ('train', 'test') or just the string 'train'.
        offset: the number of the starting line. Default: 0

    Examples:
        >>> train, test = torchtext.experimental.datasets.raw.IMDB()
    """
    split_ = check_default_set(split, ('train', 'test'), 'IMDB')
    dataset_tar = download_from_url(URLS['IMDB'],
                                    root=root,
                                    hash_value=MD5['IMDB'],
                                    hash_type='md5')
    extracted_files = extract_archive(dataset_tar)
    return wrap_datasets(
        tuple(
            RawTextIterableDataset("IMDB",
                                   NUM_LINES["IMDB"][item],
                                   generate_imdb_data(item, extracted_files),
                                   offset=offset) for item in split_), split)
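A short usage sketch for the constructor above; the (label, text) item layout is an assumption based on how these raw text-classification datasets are typically shaped:

train, test = IMDB(split=('train', 'test'))
label, text = next(iter(train))   # e.g. a sentiment label and the review text
print(label, text[:80])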
Example #7
def _setup_datasets(dataset_name, root='.data'):
    dataset_tar = download_from_url(URLS[dataset_name], root=root)
    extracted_files = extract_archive(dataset_tar)

    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        if fname.endswith('test.csv'):
            test_csv_path = fname

    train_iter = _create_data_from_csv(train_csv_path)
    test_iter = _create_data_from_csv(test_csv_path)
    return (RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name],
                                   train_iter),
            RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name],
                                   test_iter))
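A plausible sketch of `_create_data_from_csv`, assuming the text-classification CSVs keep the label in the first column and the text in the remaining columns (the real helper may parse them differently):

import csv
import io

def _create_data_from_csv(data_path):
    # Illustrative sketch: yield one (label, text) pair per CSV row.
    with io.open(data_path, encoding="utf8") as f:
        for row in csv.reader(f):
            yield int(row[0]), ' '.join(row[1:])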
Example #8
def _setup_datasets(dataset_name, separator, root, data_select):
    data_select = check_default_set(data_select,
                                    target_select=('train', 'valid', 'test'))
    extracted_files = []
    if isinstance(URLS[dataset_name], list):
        for f in URLS[dataset_name]:
            dataset_tar = download_from_url(f, root=root)
            extracted_files.extend(extract_archive(dataset_tar))
    elif isinstance(URLS[dataset_name], str):
        dataset_tar = download_from_url(URLS[dataset_name], root=root)
        extracted_files.extend(extract_archive(dataset_tar))
    else:
        raise ValueError(
            "URLS for {} has to be in a form or list or string".format(
                dataset_name))

    data_filenames = {
        "train": _construct_filepath(extracted_files, "train.txt"),
        "valid": _construct_filepath(extracted_files, "dev.txt"),
        "test": _construct_filepath(extracted_files, "test.txt")
    }
    return tuple(
        RawTextIterableDataset(
            dataset_name, NUM_LINES[dataset_name],
            _create_data_from_iob(data_filenames[item], separator)
        ) if data_filenames[item] is not None else None
        for item in data_select)
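`check_default_set`, used throughout these examples, validates the requested splits against those the dataset provides. A rough sketch of the assumed behavior (normalize a single string to a tuple and reject unknown splits):

def check_default_set(data_select, target_select, dataset_name='dataset'):
    # Illustrative sketch: accept a split name or a tuple of names, check
    # every requested split is available, and return the request as a tuple.
    if isinstance(data_select, str):
        data_select = (data_select,)
    if not set(data_select).issubset(set(target_select)):
        raise TypeError('Given data selection {} is not supported for '
                        '{}; choose from {}'.format(data_select,
                                                    dataset_name,
                                                    target_select))
    return tuple(data_select)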
Example #9
def _setup_datasets(dataset_name, root, split, offset):
    if dataset_name == 'AG_NEWS':
        extracted_files = [
            download_from_url(URLS[dataset_name][item],
                              root=root,
                              path=os.path.join(root,
                                                _PATHS[dataset_name][item]),
                              hash_value=MD5['AG_NEWS'][item],
                              hash_type='md5') for item in ('train', 'test')
        ]
    else:
        dataset_tar = download_from_url(URLS[dataset_name],
                                        root=root,
                                        path=os.path.join(
                                            root, _PATHS[dataset_name]),
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files = extract_archive(dataset_tar)

    csv_path = {}
    for fname in extracted_files:
        if fname.endswith('train.csv'):
            csv_path['train'] = fname
        if fname.endswith('test.csv'):
            csv_path['test'] = fname
    return [
        RawTextIterableDataset(dataset_name,
                               NUM_LINES[dataset_name][item],
                               _create_data_from_csv(csv_path[item]),
                               offset=offset) for item in split
    ]
Example #10
def IMDB(root='.data', split=('train', 'test'), offset=0):
    """
    Examples:
        >>> train, test = torchtext.experimental.datasets.raw.IMDB()
    """
    dataset_tar = download_from_url(URLS['IMDB'], root=root,
                                    hash_value=MD5['IMDB'], hash_type='md5')
    extracted_files = extract_archive(dataset_tar)
    return [RawTextIterableDataset("IMDB", NUM_LINES["IMDB"][item],
                                   generate_imdb_data(item, extracted_files),
                                   offset=offset) for item in split]
Example #11
def IMDB(root='.data'):
    """ Defines IMDB datasets.

    Create supervised learning dataset: IMDB

    Separately returns the training and test datasets

    Arguments:
        root: Directory where the datasets are saved. Default: ".data"

    Examples:
        >>> train, test = torchtext.experimental.datasets.raw.IMDB()
    """

    dataset_tar = download_from_url(URLS['IMDB'], root=root)
    extracted_files = extract_archive(dataset_tar)
    train_iter = generate_imdb_data('train', extracted_files)
    test_iter = generate_imdb_data('test', extracted_files)
    return (RawTextIterableDataset("IMDB", NUM_LINES["IMDB"], train_iter),
            RawTextIterableDataset("IMDB", NUM_LINES["IMDB"], test_iter))
Example #12
def _setup_datasets(dataset_name, root, data_select):
    data_select = check_default_set(data_select, ('train', 'dev'))
    extracted_files = {
        key: download_from_url(URLS[dataset_name][key],
                               root=root,
                               hash_value=MD5[dataset_name][key],
                               hash_type='md5')
        for key in data_select
    }
    return tuple(
        RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][item],
                               _create_data_from_json(extracted_files[item]))
        for item in data_select)
Example #13
def _setup_datasets(dataset_name, root, data_select, year, language):
    data_select = check_default_set(data_select, ('train', 'test', 'valid'))
    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(('train', 'test', 'valid'))):
        raise TypeError('data_select is not supported!')

    if dataset_name == 'PennTreebank':
        select_to_index = {'train': 0, 'test': 1, 'valid': 2}
        extracted_files = [
            download_from_url(URLS['PennTreebank'][select_to_index[key]],
                              root=root,
                              hash_value=MD5['PennTreebank'][key],
                              hash_type='md5') for key in data_select
        ]
    elif dataset_name == 'WMTNewsCrawl':
        if not (data_select == ['train']
                or set(data_select).issubset(set(('train', )))):
            raise ValueError("WMTNewsCrawl only creates a training dataset. "
                             "data_select should be 'train' "
                             "or ('train',), got {}.".format(data_select))
        dataset_tar = download_from_url(URLS[dataset_name],
                                        root=root,
                                        hash_value=MD5['WMTNewsCrawl'],
                                        hash_type='md5')
        extracted_files = extract_archive(dataset_tar)
        file_name = 'news.{}.{}.shuffled'.format(year, language)
        extracted_files = [f for f in extracted_files if file_name in f]
    else:
        dataset_tar = download_from_url(URLS[dataset_name],
                                        root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files = extract_archive(dataset_tar)

    _path = {}
    for item in data_select:
        for fname in extracted_files:
            if item in fname:
                _path[item] = fname

    data = {}
    for item in _path.keys():
        logging.info('Creating {} data'.format(item))
        data[item] = iter(io.open(_path[item], encoding="utf8"))

    return tuple(
        RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][item],
                               data[item]) for item in data_select)
Example #14
def _setup_datasets(dataset_name, root, split, offset):
    extracted_files = {
        key: download_from_url(URLS[dataset_name][key],
                               root=root,
                               hash_value=MD5[dataset_name][key],
                               hash_type='md5')
        for key in split
    }
    return [
        RawTextIterableDataset(dataset_name,
                               NUM_LINES[dataset_name][item],
                               _create_data_from_json(extracted_files[item]),
                               offset=offset) for item in split
    ]
Example #15
def _setup_datasets(dataset_name, root, split_, offset):
    split = check_default_set(split_, ('train', 'dev'), dataset_name)
    extracted_files = {
        key: download_from_url(URLS[dataset_name][key],
                               root=root,
                               hash_value=MD5[dataset_name][key],
                               hash_type='md5')
        for key in split
    }
    return wrap_datasets(
        tuple(
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name][item],
                                   _create_data_from_json(
                                       extracted_files[item]),
                                   offset=offset) for item in split), split_)
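`wrap_datasets` pairs the built datasets back up with the caller's original `split` argument. A plausible sketch, assuming its only job is to unwrap the one-element tuple when a single split was requested as a string:

def wrap_datasets(datasets, split):
    # Illustrative sketch: return a lone dataset when the caller passed a
    # single split name as a string, otherwise return the tuple unchanged.
    if isinstance(split, str):
        if len(datasets) != 1:
            raise ValueError("Expected one dataset for split '{}', got "
                             "{}".format(split, len(datasets)))
        return datasets[0]
    return datasets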
Example #16
def _setup_datasets(dataset_name, root, data_select):
    data_select = check_default_set(data_select,
                                    target_select=('train', 'test'))
    if dataset_name == 'AG_NEWS':
        extracted_files = [
            download_from_url(URLS[dataset_name][item], root=root)
            for item in ('train', 'test')
        ]
    else:
        dataset_tar = download_from_url(URLS[dataset_name], root=root)
        extracted_files = extract_archive(dataset_tar)
    csv_path = {}
    for fname in extracted_files:
        if fname.endswith('train.csv'):
            csv_path['train'] = fname
        if fname.endswith('test.csv'):
            csv_path['test'] = fname
    return tuple(
        RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name],
                               _create_data_from_csv(csv_path[item]))
        for item in data_select)
Example #17
def IMDB(root='.data', data_select=('train', 'test')):
    """ Defines raw IMDB datasets.

    Create supervised learning dataset: IMDB

    Separately returns the raw training and test datasets

    Arguments:
        root: Directory where the datasets are saved. Default: ".data"
        data_select: a string or tuple for the returned datasets. Default: ('train', 'test')
            By default, both datasets (train, test) are generated. Users can also choose any one or both of them,
            for example ('train', 'test') or just the string 'train'.

    Examples:
        >>> train, test = torchtext.experimental.datasets.raw.IMDB()
    """
    data_select = check_default_set(data_select,
                                    target_select=('train', 'test'))
    dataset_tar = download_from_url(URLS['IMDB'], root=root)
    extracted_files = extract_archive(dataset_tar)
    return tuple(
        RawTextIterableDataset("IMDB", NUM_LINES["IMDB"],
                               generate_imdb_data(item, extracted_files))
        for item in data_select)
Example #18
def _setup_datasets(dataset_name, train_filenames, valid_filenames,
                    test_filenames, split, root, offset):
    # Each of the train/valid/test filename arguments must be a (src, tgt) tuple.
    if not isinstance(train_filenames, tuple) or not isinstance(valid_filenames, tuple) \
            or not isinstance(test_filenames, tuple):
        raise ValueError("All filenames must be tuples")
    src_train, tgt_train = train_filenames
    src_eval, tgt_eval = valid_filenames
    src_test, tgt_test = test_filenames

    extracted_files = []  # list of paths to the extracted files
    if isinstance(URLS[dataset_name], list):
        for idx, f in enumerate(URLS[dataset_name]):
            dataset_tar = download_from_url(f,
                                            root=root,
                                            hash_value=MD5[dataset_name][idx],
                                            hash_type='md5')
            extracted_files.extend(extract_archive(dataset_tar))
    elif isinstance(URLS[dataset_name], str):
        dataset_tar = download_from_url(URLS[dataset_name],
                                        root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_dataset_tar = extract_archive(dataset_tar)
        if dataset_name == 'IWSLT':
            # IWSLT dataset's url downloads a multilingual tgz.
            # We need to take an extra step to pick out the specific language pair from it.
            src_language = train_filenames[0].split(".")[-1]
            tgt_language = train_filenames[1].split(".")[-1]
            languages = "-".join([src_language, tgt_language])
            iwslt_tar = os.path.join(
                root, '2016-01/texts/{}/{}/{}.tgz'.format(
                    src_language, tgt_language, languages))
            extracted_dataset_tar = extract_archive(iwslt_tar)
        extracted_files.extend(extracted_dataset_tar)
    else:
        raise ValueError(
            "URLS for {} has to be in a form or list or string".format(
                dataset_name))

    # Clean the xml and tag file in the archives
    file_archives = []
    for fname in extracted_files:
        if 'xml' in fname:
            _clean_xml_file(fname)
            file_archives.append(os.path.splitext(fname)[0])
        elif "tags" in fname:
            _clean_tags_file(fname)
            file_archives.append(fname.replace('.tags', ''))
        else:
            file_archives.append(fname)

    data_filenames = {
        "train": _construct_filepaths(file_archives, src_train, tgt_train),
        "valid": _construct_filepaths(file_archives, src_eval, tgt_eval),
        "test": _construct_filepaths(file_archives, src_test, tgt_test)
    }

    for key in data_filenames.keys():
        if data_filenames[key] is None or len(data_filenames[key]) == 0:
            raise FileNotFoundError(
                "Files are not found for data type {}".format(key))

    datasets = []
    for key in split:
        src_data_iter = _read_text_iterator(data_filenames[key][0])
        tgt_data_iter = _read_text_iterator(data_filenames[key][1])

        def _iter(src_data_iter, tgt_data_iter):
            for item in zip(src_data_iter, tgt_data_iter):
                yield item

        datasets.append(
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name][key],
                                   _iter(src_data_iter, tgt_data_iter),
                                   offset=offset))

    return datasets
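`_construct_filepaths` (note the plural) resolves the source/target file pair for the translation datasets. A sketch assuming it matches the extracted paths by basename:

import os

def _construct_filepaths(paths, src_filename, tgt_filename):
    # Illustrative sketch: find the extracted path whose basename equals the
    # requested source filename, likewise for the target, and return the
    # (src_path, tgt_path) pair (entries stay None when nothing matches).
    src_path, tgt_path = None, None
    for p in paths:
        if os.path.basename(p) == src_filename:
            src_path = p
        if os.path.basename(p) == tgt_filename:
            tgt_path = p
    return (src_path, tgt_path)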
Example #19
def _setup_datasets(dataset_name, train_filenames, valid_filenames,
                    test_filenames, data_select, root):
    data_select = check_default_set(data_select, ('train', 'valid', 'test'))
    # Each of the train/valid/test filename arguments must be a (src, tgt) tuple.
    if not isinstance(train_filenames, tuple) or not isinstance(valid_filenames, tuple) \
            or not isinstance(test_filenames, tuple):
        raise ValueError("All filenames must be tuples")
    src_train, tgt_train = train_filenames
    src_eval, tgt_eval = valid_filenames
    src_test, tgt_test = test_filenames

    extracted_files = []
    if isinstance(URLS[dataset_name], list):
        for idx, f in enumerate(URLS[dataset_name]):
            dataset_tar = download_from_url(f,
                                            root=root,
                                            hash_value=MD5[dataset_name][idx],
                                            hash_type='md5')
            extracted_files.extend(extract_archive(dataset_tar))
    elif isinstance(URLS[dataset_name], str):
        dataset_tar = download_from_url(URLS[dataset_name],
                                        root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files.extend(extract_archive(dataset_tar))
    else:
        raise ValueError(
            "URLS for {} has to be in a form or list or string".format(
                dataset_name))

    # Clean the xml and tag file in the archives
    file_archives = []
    for fname in extracted_files:
        if 'xml' in fname:
            _clean_xml_file(fname)
            file_archives.append(os.path.splitext(fname)[0])
        elif "tags" in fname:
            _clean_tags_file(fname)
            file_archives.append(fname.replace('.tags', ''))
        else:
            file_archives.append(fname)

    data_filenames = {
        "train": _construct_filepaths(file_archives, src_train, tgt_train),
        "valid": _construct_filepaths(file_archives, src_eval, tgt_eval),
        "test": _construct_filepaths(file_archives, src_test, tgt_test)
    }

    for key in data_filenames.keys():
        if data_filenames[key] is None or len(data_filenames[key]) == 0:
            raise FileNotFoundError(
                "Files are not found for data type {}".format(key))

    datasets = []
    for key in data_select:
        src_data_iter = _read_text_iterator(data_filenames[key][0])
        tgt_data_iter = _read_text_iterator(data_filenames[key][1])

        def _iter(src_data_iter, tgt_data_iter):
            for item in zip(src_data_iter, tgt_data_iter):
                yield item

        datasets.append(
            RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][key],
                                   _iter(src_data_iter, tgt_data_iter)))

    return tuple(datasets)
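Assuming a public constructor such as the Multi30k wrapper shown (partially) in the next example forwards its arguments to this helper, consuming a split amounts to iterating (source, target) line pairs:

# Hypothetical usage sketch; names follow the Multi30k signature in the
# next example, and the (source, target) item layout follows _iter above.
train, valid, test = Multi30k(data_select=('train', 'valid', 'test'))
src_line, tgt_line = next(iter(train))
print(src_line.strip(), '->', tgt_line.strip())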
Example #20
    for key in data_filenames.keys():
        if data_filenames[key] is None or len(data_filenames[key]) == 0:
            raise FileNotFoundError(
                "Files are not found for data type {}".format(key))
    print('#data_filenames 2: ', data_filenames)
    datasets = []
    for key in data_select:
        src_data_iter = _read_text_iterator(data_filenames[key][0])
        tgt_data_iter = _read_text_iterator(data_filenames[key][1])

        def _iter(src_data_iter, tgt_data_iter):
            for item in zip(src_data_iter, tgt_data_iter):
                yield item

        datasets.append(
            RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][key], _iter(src_data_iter, tgt_data_iter)))
    print('#datasets: ', datasets)
    return tuple(datasets)


def Multi30k(train_filenames=("train.de", "train.en"),
             valid_filenames=("val.de", "val.en"),
             test_filenames=("test_2016_flickr.de", "test_2016_flickr.en"),
             data_select=('train', 'valid', 'test'), root='.data'):
    """ Define translation datasets: Multi30k
        Separately returns train/valid/test datasets as a tuple
        The available dataset include:
            test_2016_flickr.cs
            test_2016_flickr.de
            test_2016_flickr.en
            test_2016_flickr.fr