def IMDB(root='.data', split=('train', 'test'), offset=0):
    """ Defines raw IMDB datasets.

    Create supervised learning dataset: IMDB

    Separately returns the raw training and test dataset

    Args:
        root: Directory where the datasets are saved. Default: ".data"
        split: a string or tuple for the returned datasets. Default: ('train', 'test')
            By default, both datasets (train, test) are generated. Users could also choose any one or two of them,
            for example ('train', 'test') or just a string 'train'.
        offset: the number of the starting line. Default: 0

    Examples:
        >>> train, test = torchtext.experimental.datasets.raw.IMDB()
    """
    split_ = check_default_set(split, ('train', 'test'), 'IMDB')
    dataset_tar = download_from_url(URLS['IMDB'],
                                    root=root,
                                    hash_value=MD5['IMDB'],
                                    hash_type='md5')
    extracted_files = extract_archive(dataset_tar)
    return wrap_datasets(
        tuple(
            RawTextIterableDataset("IMDB",
                                   NUM_LINES["IMDB"][item],
                                   generate_imdb_data(item, extracted_files),
                                   offset=offset) for item in split_), split)
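
# A minimal sketch of what ``generate_imdb_data`` (referenced above but defined
# elsewhere) might look like, assuming the standard aclImdb layout of
# ``aclImdb/<split>/<pos|neg>/*.txt``. This is an illustration under those
# assumptions, not the actual implementation; the helper name is hypothetical.
def _generate_imdb_data_sketch(key, extracted_files):
    import io
    for fname in extracted_files:
        # Keep only labelled review files belonging to the requested split.
        if key in fname and ('pos' in fname or 'neg' in fname) and fname.endswith('.txt'):
            label = 'pos' if 'pos' in fname else 'neg'
            with io.open(fname, encoding='utf8') as f:
                yield label, f.read()
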
def _setup_datasets(dataset_name, root, split_, offset):
    split = check_default_set(split_, ('train', 'test'), dataset_name)
    if dataset_name == 'AG_NEWS':
        extracted_files = [
            download_from_url(URLS[dataset_name][item],
                              root=root,
                              hash_value=MD5['AG_NEWS'][item],
                              hash_type='md5') for item in ('train', 'test')
        ]
    else:
        dataset_tar = download_from_url(URLS[dataset_name],
                                        root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files = extract_archive(dataset_tar)

    csv_path = {}
    for fname in extracted_files:
        if fname.endswith('train.csv'):
            csv_path['train'] = fname
        if fname.endswith('test.csv'):
            csv_path['test'] = fname
    return wrap_datasets(
        tuple(
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name][item],
                                   _create_data_from_csv(csv_path[item]),
                                   offset=offset) for item in split), split_)
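
# A minimal sketch of what ``_create_data_from_csv`` (referenced above) might
# look like: stream ``(label, text)`` rows from an extracted CSV file, where the
# first column is the label and the remaining columns form the text. This is an
# assumption for illustration, not the module's actual helper.
def _create_data_from_csv_sketch(data_path):
    import csv
    import io
    with io.open(data_path, encoding='utf8') as f:
        for row in csv.reader(f):
            yield int(row[0]), ' '.join(row[1:])
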
def _setup_datasets(dataset_name, tokenizer, root, vocab, split_, year, language):
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')

    split = check_default_set(split_, ('train', 'test', 'valid'), dataset_name)

    if vocab is None:
        if 'train' not in split:
            raise TypeError("Must pass a vocab if train is not selected.")
        if dataset_name == 'WMTNewsCrawl':
            raw_train, = raw.DATASETS[dataset_name](root=root, split=('train',), year=year, language=language)
        else:
            raw_train, = raw.DATASETS[dataset_name](root=root, split=('train',))
        logger_.info('Building Vocab based on train data')
        vocab = build_vocab(raw_train, tokenizer)
    logger_.info('Vocab has %d entries', len(vocab))

    def text_transform(line):
        return torch.tensor([vocab[token] for token in tokenizer(line)], dtype=torch.long)

    if dataset_name == 'WMTNewsCrawl':
        raw_datasets = raw.DATASETS[dataset_name](root=root, split=split, year=year, language=language)
    else:
        raw_datasets = raw.DATASETS[dataset_name](root=root, split=split)
    raw_data = {name: list(map(text_transform, raw_dataset)) for name, raw_dataset in zip(split, raw_datasets)}
    logger_.info('Building datasets for {}'.format(split))
    return wrap_datasets(tuple(LanguageModelingDataset(raw_data[item], vocab, text_transform)
                               for item in split), split_)
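
# A minimal sketch of how the language-modeling ``build_vocab`` used above
# might work, assuming ``build_vocab_from_iterator`` with the
# (iterator, num_lines) call form that appears later in this listing. The
# function name is hypothetical and shown for illustration only.
def _build_lm_vocab_sketch(raw_iter, tokenizer):
    lines = list(raw_iter)
    # Tokenize every raw line and build the vocabulary from the token lists.
    return build_vocab_from_iterator((tokenizer(line) for line in lines), len(lines))
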
# Example #4
def _setup_datasets(dataset_name, separator, root, split_, offset):
    split = check_default_set(split_, ('train', 'valid', 'test'), dataset_name)
    extracted_files = []
    if isinstance(URLS[dataset_name], dict):
        for name, item in URLS[dataset_name].items():
            dataset_tar = download_from_url(item,
                                            root=root,
                                            hash_value=MD5[dataset_name][name],
                                            hash_type='md5')
            extracted_files.extend(extract_archive(dataset_tar))
    elif isinstance(URLS[dataset_name], str):
        dataset_tar = download_from_url(URLS[dataset_name],
                                        root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files.extend(extract_archive(dataset_tar))
    else:
        raise ValueError(
            "URLS for {} must be either a dictionary or a string".format(
                dataset_name))

    data_filenames = {
        "train": _construct_filepath(extracted_files, "train.txt"),
        "valid": _construct_filepath(extracted_files, "dev.txt"),
        "test": _construct_filepath(extracted_files, "test.txt")
    }
    return wrap_datasets(
        tuple(
            RawTextIterableDataset(
                dataset_name,
                NUM_LINES[dataset_name][item],
                _create_data_from_iob(data_filenames[item], separator),
                offset=offset) if data_filenames[item] is not None else None
            for item in split), split_)
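
# A minimal sketch of what ``_create_data_from_iob`` (referenced above) might
# look like: group blank-line-separated sentences and yield one list of columns
# per sentence, splitting each line on ``separator``. This is an assumption for
# illustration, not the module's actual helper.
def _create_data_from_iob_sketch(data_path, separator='\t'):
    import io
    columns = []
    with io.open(data_path, encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line terminates the current sentence.
                if columns:
                    yield columns
                columns = []
                continue
            for i, col in enumerate(line.split(separator)):
                if len(columns) <= i:
                    columns.append([])
                columns[i].append(col)
    if columns:
        yield columns
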
def _setup_datasets(dataset_name, root, ngrams, vocab, tokenizer, split_):
    if tokenizer is None:
        tokenizer = get_tokenizer("basic_english")
    text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams))
    split = check_default_set(split_, ('train', 'test'), dataset_name)
    raw_datasets = raw.DATASETS[dataset_name](root=root, split=split)
    # Materialize raw text iterable dataset
    raw_data = {name: list(raw_dataset) for name, raw_dataset in zip(split, raw_datasets)}

    if vocab is None:
        if "train" not in split:
            raise TypeError("Must pass a vocab if train is not selected.")
        logger_.info('Building Vocab based on train data')
        vocab = build_vocab(raw_data["train"], text_transform)
    logger_.info('Vocab has %d entries', len(vocab))
    text_transform = sequential_transforms(
        text_transform, vocab_func(vocab), totensor(dtype=torch.long)
    )
    if dataset_name == 'IMDB':
        label_transform = sequential_transforms(lambda x: 1 if x == 'pos' else 0, totensor(dtype=torch.long))
    else:
        label_transform = sequential_transforms(totensor(dtype=torch.long))
    logger_.info('Building datasets for {}'.format(split))
    return wrap_datasets(tuple(
        TextClassificationDataset(
            raw_data[item], vocab, (label_transform, text_transform)
        )
        for item in split
    ), split_)
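
# A minimal sketch of what ``ngrams_func`` (used above) might do: return a
# transform that extends a token list with all n-grams up to ``ngrams``. The
# exact output format is an assumption for illustration.
def _ngrams_func_sketch(ngrams):
    def func(token_list):
        result = list(token_list)
        for n in range(2, ngrams + 1):
            # Join every sliding window of n tokens into a single n-gram string.
            result += [' '.join(token_list[i:i + n])
                       for i in range(len(token_list) - n + 1)]
        return result
    return func
# Example: _ngrams_func_sketch(2)(['the', 'quick', 'fox'])
# -> ['the', 'quick', 'fox', 'the quick', 'quick fox']
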
# Example #6
def _setup_datasets(dataset_name, root, split_, offset):
    split = check_default_set(split_, ('train', 'dev'), dataset_name)
    extracted_files = {
        key: download_from_url(URLS[dataset_name][key],
                               root=root,
                               hash_value=MD5[dataset_name][key],
                               hash_type='md5')
        for key in split
    }
    return wrap_datasets(
        tuple(
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name][item],
                                   _create_data_from_json(
                                       extracted_files[item]),
                                   offset=offset) for item in split), split_)
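
# A minimal sketch of what ``_create_data_from_json`` (referenced above) might
# look like for SQuAD-style files: articles nest paragraphs, which nest
# question/answer pairs, and the reader flattens them into
# (context, question, answers, answer_start) tuples. This is an assumption for
# illustration, not the module's actual helper.
def _create_data_from_json_sketch(data_path):
    import json
    with open(data_path, encoding='utf8') as f:
        for article in json.load(f)['data']:
            for paragraph in article['paragraphs']:
                context = paragraph['context']
                for qa in paragraph['qas']:
                    answers = [a['text'] for a in qa['answers']]
                    ans_pos = [a['answer_start'] for a in qa['answers']]
                    yield context, qa['question'], answers, ans_pos
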
# Example #7
def _setup_datasets(dataset_name, root, split_, year, language, offset):
    if dataset_name == 'WMTNewsCrawl':
        split = check_default_set(split_, ('train', ), dataset_name)
    else:
        split = check_default_set(split_, ('train', 'test', 'valid'),
                                  dataset_name)

    if dataset_name == 'PennTreebank':
        extracted_files = [
            download_from_url(URLS['PennTreebank'][key],
                              root=root,
                              hash_value=MD5['PennTreebank'][key],
                              hash_type='md5') for key in split
        ]
    else:
        dataset_tar = download_from_url(URLS[dataset_name],
                                        root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files = extract_archive(dataset_tar)

    if dataset_name == 'WMTNewsCrawl':
        file_name = 'news.{}.{}.shuffled'.format(year, language)
        extracted_files = [f for f in extracted_files if file_name in f]

    path = {}
    for item in split:
        for fname in extracted_files:
            if item in fname:
                path[item] = fname

    datasets = []
    for item in split:
        logging.info('Creating {} data'.format(item))
        datasets.append(
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name][item],
                                   iter(io.open(path[item], encoding="utf8")),
                                   offset=offset))

    return wrap_datasets(tuple(datasets), split_)
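
# Hedged usage sketch: a public raw language-modeling constructor would
# typically delegate to the helper above. The wrapper name, dataset key and
# defaults below are assumptions for illustration only.
def _raw_WikiText2_sketch(root='.data', split=('train', 'valid', 'test'), offset=0):
    return _setup_datasets('WikiText2', root, split, None, None, offset)
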
# Example #8
def _setup_datasets(dataset_name, root, vocab, tokenizer, split_):
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')
    text_transform = sequential_transforms(tokenizer)
    split = check_default_set(split_, ('train', 'dev'), dataset_name)
    raw_datasets = raw.DATASETS[dataset_name](root=root, split=split)
    raw_data = {
        name: list(raw_dataset)
        for name, raw_dataset in zip(split, raw_datasets)
    }
    if vocab is None:
        if 'train' not in split:
            raise TypeError("Must pass a vocab if train is not selected.")

        def apply_transform(data):
            for (_context, _question, _answers, _ans_pos) in data:
                tok_ans = []
                for item in _answers:
                    tok_ans += text_transform(item)
                yield text_transform(_context) + text_transform(
                    _question) + tok_ans

        logger_.info('Building Vocab based on train data')
        vocab = build_vocab_from_iterator(apply_transform(raw_data['train']),
                                          len(raw_data['train']))
    logger_.info('Vocab has %d entries', len(vocab))
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    transforms = {
        'context': text_transform,
        'question': text_transform,
        'answers': text_transform,
        'ans_pos': totensor(dtype=torch.long)
    }
    logger_.info('Building datasets for {}'.format(split))
    return wrap_datasets(
        tuple(
            QuestionAnswerDataset(raw_data[item], vocab, transforms)
            for item in split), split_)
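
# Hedged usage sketch: a public question-answering constructor would typically
# delegate to the helper above. The wrapper name, dataset key and defaults are
# assumptions for illustration only.
def _SQuAD1_sketch(root='.data', vocab=None, tokenizer=None, split=('train', 'dev')):
    return _setup_datasets('SQuAD1', root, vocab, tokenizer, split)
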
# Example #9
def _setup_datasets(dataset_name, root, vocabs, split_):
    split = check_default_set(split_, ('train', 'valid', 'test'), dataset_name)
    raw_iter_tuple = raw.DATASETS[dataset_name](root=root, split=split)
    raw_data = {}
    for name, raw_iter in zip(split, raw_iter_tuple):
        raw_data[name] = list(raw_iter)

    if vocabs is None:
        if "train" not in split:
            raise TypeError("Must pass a vocab if train is not selected.")
        logger_.info('Building Vocab based on train data')
        vocabs = build_vocab(raw_data["train"])
    else:
        if not isinstance(vocabs, list):
            raise TypeError("vocabs must be an instance of list")

        # Find data that's not None
        notnone_data = None
        for key in raw_data.keys():
            if raw_data[key] is not None:
                notnone_data = raw_data[key]
                break
        if len(vocabs) != len(notnone_data[0]):
            raise ValueError(
                "Number of vocabs must match the number of columns "
                "in the data")

    transformers = [
        sequential_transforms(vocab_func(vocabs[idx]),
                              totensor(dtype=torch.long))
        for idx in range(len(vocabs))
    ]
    logger_.info('Building datasets for {}'.format(split))
    return wrap_datasets(
        tuple(
            SequenceTaggingDataset(raw_data[item], vocabs, transformers)
            for item in split), split_)
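
# A minimal sketch of how the sequence-tagging ``build_vocab`` used above might
# work: build one vocabulary per column of the IOB data (e.g. tokens, POS tags),
# reusing ``build_vocab_from_iterator`` as it is called elsewhere in this
# listing. The function name is hypothetical and the behavior is an assumption.
def _build_seq_tagging_vocabs_sketch(data):
    # Each example is a list of columns; transpose to group column values
    # across all examples, then build one vocab per column.
    columns = list(zip(*data))
    return [build_vocab_from_iterator(col, len(col)) for col in columns]
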
# Example #10
def _setup_datasets(dataset_name, train_filenames, valid_filenames,
                    test_filenames, split_, root, vocab, tokenizer):
    split = check_default_set(split_, ('train', 'valid', 'test'), dataset_name)
    src_vocab, tgt_vocab = vocab
    if tokenizer is None:
        src_tokenizer = get_tokenizer("spacy", language='de_core_news_sm')
        tgt_tokenizer = get_tokenizer("spacy", language='en_core_web_sm')
    elif isinstance(tokenizer, tuple):
        if len(tokenizer) == 2:
            src_tokenizer, tgt_tokenizer = tokenizer
        else:
            raise ValueError("tokenizer must have length of two for"
                             "source and target")
    else:
        raise ValueError(
            "tokenizer must be an instance of tuple with length two "
            "or None")
    raw_datasets = raw.DATASETS[dataset_name](train_filenames=train_filenames,
                                              valid_filenames=valid_filenames,
                                              test_filenames=test_filenames,
                                              split=split,
                                              root=root)
    raw_data = {
        name: list(raw_dataset)
        for name, raw_dataset in zip(split, raw_datasets)
    }
    src_text_vocab_transform = sequential_transforms(src_tokenizer)
    tgt_text_vocab_transform = sequential_transforms(tgt_tokenizer)

    if src_vocab is None:
        if 'train' not in split:
            raise TypeError("Must pass a vocab if train is not selected.")
        logger_.info('Building src Vocab based on train data')
        src_vocab = build_vocab(raw_data["train"],
                                src_text_vocab_transform,
                                index=0)
    else:
        if not isinstance(src_vocab, Vocab):
            raise TypeError("Passed src vocabulary is not of type Vocab")
    logger_.info('src Vocab has %d entries', len(src_vocab))

    if tgt_vocab is None:
        if 'train' not in split:
            raise TypeError("Must pass a vocab if train is not selected.")
        logger_.info('Building tgt Vocab based on train data')
        tgt_vocab = build_vocab(raw_data["train"],
                                tgt_text_vocab_transform,
                                index=1)
    else:
        if not isinstance(tgt_vocab, Vocab):
            raise TypeError("Passed tgt vocabulary is not of type Vocab")
    logger_.info('tgt Vocab has %d entries', len(tgt_vocab))

    logger_.info('Building datasets for {}'.format(split))
    datasets = []
    for key in split:
        src_text_transform = sequential_transforms(src_text_vocab_transform,
                                                   vocab_func(src_vocab),
                                                   totensor(dtype=torch.long))
        tgt_text_transform = sequential_transforms(tgt_text_vocab_transform,
                                                   vocab_func(tgt_vocab),
                                                   totensor(dtype=torch.long))
        datasets.append(
            TranslationDataset(raw_data[key], (src_vocab, tgt_vocab),
                               (src_text_transform, tgt_text_transform)))

    return wrap_datasets(tuple(datasets), split_)
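
# Hedged usage sketch: a public translation dataset constructor would typically
# delegate to the helper above. The wrapper name, dataset key, default
# filenames and other defaults are assumptions for illustration only.
def _Multi30k_sketch(train_filenames=('train.de', 'train.en'),
                     valid_filenames=('val.de', 'val.en'),
                     test_filenames=('test.de', 'test.en'),
                     split=('train', 'valid', 'test'),
                     root='.data', vocab=(None, None), tokenizer=None):
    return _setup_datasets('Multi30k', train_filenames, valid_filenames,
                           test_filenames, split, root, vocab, tokenizer)
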
# Example #11
def _setup_datasets(dataset_name, train_filenames, valid_filenames,
                    test_filenames, split_, root, offset):
    split = check_default_set(split_, ('train', 'valid', 'test'), dataset_name)
    if not isinstance(train_filenames, tuple) or not isinstance(valid_filenames, tuple) \
            or not isinstance(test_filenames, tuple):
        raise ValueError("All filenames must be tuples")
    src_train, tgt_train = train_filenames
    src_eval, tgt_eval = valid_filenames
    src_test, tgt_test = test_filenames

    extracted_files = []  # list of paths to the extracted files
    if isinstance(URLS[dataset_name], list):
        for idx, f in enumerate(URLS[dataset_name]):
            dataset_tar = download_from_url(f,
                                            root=root,
                                            hash_value=MD5[dataset_name][idx],
                                            hash_type='md5')
            extracted_files.extend(extract_archive(dataset_tar))
    elif isinstance(URLS[dataset_name], str):
        dataset_tar = download_from_url(URLS[dataset_name],
                                        root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_dataset_tar = extract_archive(dataset_tar)
        if dataset_name == 'IWSLT':
            # IWSLT dataset's url downloads a multilingual tgz.
            # We need to take an extra step to pick out the specific language pair from it.
            src_language = train_filenames[0].split(".")[-1]
            tgt_language = train_filenames[1].split(".")[-1]
            languages = "-".join([src_language, tgt_language])
            # Use the download root rather than a hardcoded '.data' directory.
            iwslt_tar = os.path.join(
                root, '2016-01/texts/{}/{}/{}.tgz'.format(
                    src_language, tgt_language, languages))
            extracted_dataset_tar = extract_archive(iwslt_tar)
        extracted_files.extend(extracted_dataset_tar)
    else:
        raise ValueError(
            "URLS for {} must be either a list or a string".format(
                dataset_name))

    # Clean the xml and tag file in the archives
    file_archives = []
    for fname in extracted_files:
        if 'xml' in fname:
            _clean_xml_file(fname)
            file_archives.append(os.path.splitext(fname)[0])
        elif "tags" in fname:
            _clean_tags_file(fname)
            file_archives.append(fname.replace('.tags', ''))
        else:
            file_archives.append(fname)

    data_filenames = {
        "train": _construct_filepaths(file_archives, src_train, tgt_train),
        "valid": _construct_filepaths(file_archives, src_eval, tgt_eval),
        "test": _construct_filepaths(file_archives, src_test, tgt_test)
    }

    for key in data_filenames.keys():
        if data_filenames[key] is None or len(data_filenames[key]) == 0:
            raise FileNotFoundError(
                "Files are not found for data type {}".format(key))

    datasets = []
    for key in split:
        src_data_iter = _read_text_iterator(data_filenames[key][0])
        tgt_data_iter = _read_text_iterator(data_filenames[key][1])

        def _iter(src_data_iter, tgt_data_iter):
            for item in zip(src_data_iter, tgt_data_iter):
                yield item

        datasets.append(
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name][key],
                                   _iter(src_data_iter, tgt_data_iter),
                                   offset=offset))

    return wrap_datasets(tuple(datasets), split_)
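
# A minimal sketch of what ``_read_text_iterator`` (used above) might look
# like: lazily yield one line of text per example from a plain-text file. This
# is an assumption for illustration, not the module's actual helper.
def _read_text_iterator_sketch(path):
    import io
    with io.open(path, encoding='utf8') as f:
        for line in f:
            yield line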