Example #1
    def __init__(self, language, model: str = "bleurt-base-128"):
        super().__init__(language)
        # HACK TO SILENCE tensorflow and errors related to tf.FLAGS
        from silence_tensorflow import silence_tensorflow

        silence_tensorflow()
        import tensorflow.compat.v1 as tf

        flags = tf.flags
        flags.DEFINE_string("source", "", help="Source segments", required=False)
        flags.DEFINE_string("s", "", help="Source segments", required=False)
        flags.DEFINE_string("hypothesis", "", help="MT segments", required=False)
        flags.DEFINE_string("h", "", help="MT segments", required=False)
        flags.DEFINE_string("reference", "", help="Reference segments", required=False)
        flags.DEFINE_string("r", "", help="Reference segments", required=False)
        flags.DEFINE_string("language", "", help="Language", required=False)
        flags.DEFINE_string("l", "", help="Language", required=False)
        flags.DEFINE_string("metric", "", help="Metric to run.", required=False)
        flags.DEFINE_string("m", "", help="Metric to run.", required=False)

        self.model = model
        if not os.path.isdir(telescope_cache_folder() + model):
            download_file_maybe_extract(
                url=f"https://storage.googleapis.com/bleurt-oss/{model}.zip",
                directory=telescope_cache_folder(),
            )
        self.scorer = score.BleurtScorer(telescope_cache_folder() + model)
        self.system_only = False
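
The constructor above only triggers a download when the BLEURT model folder is missing from the local cache. Below is a minimal standalone sketch of that pattern; CACHE_DIR is a stand-in for whatever telescope_cache_folder() returns, and the URL is the one used above.

import os

from torchnlp.download import download_file_maybe_extract

# Stand-in for telescope_cache_folder(); point this at the real cache location.
CACHE_DIR = os.path.expanduser("~/.cache/telescope/")


def maybe_fetch_bleurt(model: str = "bleurt-base-128") -> str:
    """Download and extract a BLEURT checkpoint unless it is already cached."""
    if not os.path.isdir(CACHE_DIR + model):
        os.makedirs(CACHE_DIR, exist_ok=True)
        download_file_maybe_extract(
            url=f"https://storage.googleapis.com/bleurt-oss/{model}.zip",
            directory=CACHE_DIR,
        )
    return CACHE_DIR + model
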
Example #2
    def __init__(self, dictionary: Dict[str, int]) -> None:
        super().__init__()

        if not os.path.exists(saving_directory):
            os.makedirs(saving_directory)

        download_file_maybe_extract(L93_CODES_URL,
                                    directory=saving_directory,
                                    check_files=[L93_CODES_FILE])

        download_file_maybe_extract(L93_VOCAB_URL,
                                    directory=saving_directory,
                                    check_files=[L93_VOCAB_FILE])

        self.bpe = fastBPE.fastBPE(saving_directory + L93_CODES_FILE,
                                   saving_directory + L93_VOCAB_FILE)
        self.bpe_symbol = "@@ "

        # Properties from the base class
        self.stoi = dictionary
        self.itos = [key for key in dictionary.keys()]
        self._pad_index = dictionary["<pad>"]
        self._eos_index = dictionary["</s>"]
        self._unk_index = dictionary["<unk>"]
        self._mask_index = None
Example #3
    def from_pretrained(cls,
                        hparams: HyperOptArgumentParser,
                        lm_head: bool = False):
        if not os.path.exists("pretrained/"):
            os.mkdir("pretrained/")

        pretrained_model = hparams.pretrained_model
        if pretrained_model == "roberta.base":
            download_file_maybe_extract(
                ROBERTA_BASE_URL,
                directory="pretrained",
                check_files=[ROBERTA_BASE_MODEL_NAME],
            )

        elif pretrained_model == "roberta.large":
            download_file_maybe_extract(
                ROBERTA_LARGE_URL,
                directory="pretrained",
                check_files=[ROBERTA_LARGE_MODEL_NAME],
            )
        else:
            raise Exception(f"{pretrained_model} is an invalid RoBERTa model.")

        roberta = RobertaModel.from_pretrained("pretrained/" +
                                               pretrained_model,
                                               checkpoint_file="model.pt")
        roberta.eval()
        tokenizer = RoBERTaTextEncoder(
            roberta.encode, roberta.task.source_dictionary.__dict__["indices"])
        return RoBERTa(roberta=roberta,
                       tokenizer=tokenizer,
                       hparams=hparams,
                       lm_head=lm_head)
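
The if/elif ladder above can also be written as a lookup table. The sketch below is illustrative only: the roberta.base URL and check file appear verbatim elsewhere in these examples, while the roberta.large entry is assumed to follow the same fairseq naming scheme.

from torchnlp.download import download_file_maybe_extract

# roberta.base URL/check file appear verbatim in these examples; roberta.large is
# assumed to follow the same fairseq naming scheme.
PRETRAINED_ROBERTA = {
    "roberta.base": (
        "https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz",
        "roberta.base/model.pt",
    ),
    "roberta.large": (
        "https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz",
        "roberta.large/model.pt",
    ),
}


def fetch_roberta(pretrained_model: str, directory: str = "pretrained") -> None:
    """Download and extract a RoBERTa archive unless its checkpoint is already present."""
    if pretrained_model not in PRETRAINED_ROBERTA:
        raise ValueError(f"{pretrained_model} is an invalid RoBERTa model.")
    url, check_file = PRETRAINED_ROBERTA[pretrained_model]
    download_file_maybe_extract(url, directory=directory, check_files=[check_file])
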
Example #4
def download_corpus(corpus: str, saving_directory: str = None) -> None:
    """Function that downloads a corpus from AWS.

    :param corpus: Name of the corpus to be loaded.
    :param saving_directory: RELATIVE path to the saving folder.
    """
    corpus = corpus.lower()
    if not saving_directory:
        saving_directory = "data/"

    if not os.path.exists(saving_directory):
        os.makedirs(saving_directory)

    if os.path.isdir(saving_directory + corpus):
        click.secho(f"{corpus} is already in cache.", fg="yellow")
        return

    elif corpus in corpus2download:
        download_file_maybe_extract(
            corpus2download[corpus],
            directory=saving_directory,
        )

    else:
        raise Exception(f"{corpus} is not a valid corpus!")

    click.secho("Download succeeded.", fg="yellow")
    if os.path.exists(saving_directory + corpus + ".zip"):
        os.remove(saving_directory + corpus + ".zip")

    elif os.path.exists(saving_directory + corpus + ".tar.gz"):
        os.remove(saving_directory + corpus + ".tar.gz")

    else:
        click.secho("Fail to delete compressed file.", fg="red")
Example #5
def squad_dataset(
    directory='./data',
    train=True,
    dev=True,
    train_filename='train-v2.0.json',
    dev_filename='dev-v2.0.json',
    check_files_train=['train-v2.0.json'],
    check_files_dev=['dev-v2.0.json'],
    url_train='https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json',
    url_dev='https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json'
):

    download_file_maybe_extract(url=url_train,
                                directory=directory,
                                check_files=check_files_train)
    download_file_maybe_extract(url=url_dev,
                                directory=directory,
                                check_files=check_files_dev)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, filename)
        with open(full_path, 'r') as temp:
            ret.append(json.load(temp)['data'])

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
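
A short usage sketch for the loader above (Example #15 shows the same function with its full docstring); the nested field access follows the SQuAD v2.0 JSON layout.

# Assumes squad_dataset is importable from the module above.
train, dev = squad_dataset(directory="./data", train=True, dev=True)

# Each split is the "data" list of the SQuAD JSON: articles -> paragraphs -> question/answer pairs.
print(train[0]["paragraphs"][0]["qas"][0]["question"])
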
Example #6
    def __init__(
        self,
        language,
        temperature=1.0,
        model_dir=telescope_cache_folder() + "m39v1",
        **kwargs,
    ):
        """
        model_dir should contain:
         1) checkpoint.pt: the fairseq model
         2) spm.model: the sentencepiece model
         3) dict.src.txt: the fairseq source dictionary
         4) dict.tgt.txt: the fairseq target dictionary (likely a copy of the source)
        lang: ISO 639-1 Code (e.g. "en"). Must be a language compatible with the model.
        """

        if not os.path.isdir(model_dir):
            download_file_maybe_extract(
                url="http://data.statmt.org/prism/m39v1.tar",
                directory=telescope_cache_folder(),
            )

        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_dir + "/spm.model")

        self.lang = language
        self.temperature = temperature

        # this prints things and I can't figure out how to disable it
        with open(os.devnull, "w") as sys.stdout:
            (
                self.models,
                self.args,
                self.task,
            ) = checkpoint_utils.load_model_ensemble_and_task(
                [
                    model_dir + "/checkpoint.pt",
                ],
                arg_overrides=dict(data=model_dir + "/"),
            )
            sys.stdout = sys.__stdout__

        self.use_cuda = torch.cuda.is_available()

        self.generator = SequenceScorer(self.task.target_dictionary,
                                        temperature=temperature)

        for model in self.models:
            if self.use_cuda:
                model.cuda()
            model.make_generation_fast_(
                beamable_mm_beam_size=None,
                need_attn=False,
            )

        # hash model
        self.model_hash = hash_model(model_dir)
        if not self.language_support(language):
            raise Exception(f"{language} is not supported by {self.name}.")
Example #7
class TestXLMRTextEncoder(unittest.TestCase):
    download_file_maybe_extract(
        "https://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz",
        directory=os.environ["HOME"] + "/.cache/torch/unbabel_comet/",
        check_files=["xlmr.base/model.pt"],
    )
    xlmr = XLMRModel.from_pretrained(
        os.environ["HOME"] + "/.cache/torch/unbabel_comet/xlmr.base",
        checkpoint_file="model.pt",
    )
    original_vocab = xlmr.task.source_dictionary.__dict__["indices"]
    tokenizer = XLMRTextEncoder(xlmr.encode, original_vocab)

    def test_unk_property(self):
        self.assertEqual(self.tokenizer.unk_index, self.original_vocab["<unk>"])

    def test_pad_property(self):
        self.assertEqual(self.tokenizer.padding_index, self.original_vocab["<pad>"])

    def test_bos_property(self):
        self.assertEqual(self.tokenizer.bos_index, self.original_vocab["<s>"])

    def test_eos_property(self):
        self.assertEqual(self.tokenizer.eos_index, self.original_vocab["</s>"])

    def test_mask_property(self):
        self.assertEqual(self.tokenizer.mask_index, self.original_vocab["<mask>"])

    def test_vocab_property(self):
        self.assertEqual(self.tokenizer.vocab, self.original_vocab)

    def test_vocab_size_property(self):
        self.assertEqual(self.tokenizer.vocab_size, len(self.original_vocab))

    def test_encode(self):
        sentence = "Hello, my dog is cute"
        expected = self.xlmr.encode(sentence)
        result = self.tokenizer.encode(sentence)
        self.assertTrue(torch.equal(expected, result))
        # Make sure the bos and eos tokens were added.
        self.assertEqual(result[0], self.tokenizer.bos_index)
        self.assertEqual(result[-1], self.tokenizer.eos_index)

    def test_batch_encode(self):
        # Test batch_encode.
        batch = ["Hello, my dog is cute", "hello world!"]
        encoded_batch, lengths = self.tokenizer.batch_encode(batch)

        self.assertTrue(torch.equal(encoded_batch[0], self.tokenizer.encode(batch[0])))
        self.assertTrue(
            torch.equal(encoded_batch[1][: lengths[1]], self.tokenizer.encode(batch[1]))
        )
        self.assertEqual(lengths[0], len(self.xlmr.encode("Hello, my dog is cute")))
        self.assertEqual(lengths[1], len(self.xlmr.encode("hello world!")))

        # Check if last sentence is padded.
        self.assertEqual(encoded_batch[1][-1], self.tokenizer.padding_index)
        self.assertEqual(encoded_batch[1][-2], self.tokenizer.padding_index)
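
The download and model load above run at class-definition time, i.e. as soon as the test module is imported. A hedged alternative defers them to setUpClass, which unittest calls once per test class; XLMRModel and XLMRTextEncoder would be constructed there exactly as above.

import os
import unittest

from torchnlp.download import download_file_maybe_extract


class TestXLMRTextEncoderLazy(unittest.TestCase):
    """Variant of the test class above that downloads the checkpoint at set-up time."""

    @classmethod
    def setUpClass(cls):
        cache = os.path.join(os.environ["HOME"], ".cache/torch/unbabel_comet/")
        download_file_maybe_extract(
            "https://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz",
            directory=cache,
            check_files=["xlmr.base/model.pt"],
        )
        # cls.xlmr / cls.tokenizer would be built here exactly as in the class above.
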
Example #8
def download_model(model: str, saving_directory: str = None) -> ModelBase:
    """Function that loads pretrained models from AWS.
    :param model: Name of the model to be loaded.
    :param saving_directory: RELATIVE path to the saving folder (must end with /).

    Return:
        - Pretrained model.
    """
    if saving_directory is None:
        saving_directory = get_cache_folder()

    if not os.path.exists(saving_directory):
        os.makedirs(saving_directory)

    models = model2download(saving_directory)

    if os.path.isdir(saving_directory + model):
        click.secho(f"{model} is already in cache.", fg="yellow")
        if not model.endswith("/"):
            model += "/"

    elif model not in models.keys():
        raise Exception(f"{model} is not a valid COMET model!")

    elif models[model].startswith("https://"):
        download_file_maybe_extract(models[model], directory=saving_directory)

    else:
        raise Exception("Something went wrong while dowloading the model!")

    if os.path.exists(saving_directory + model + ".zip"):
        os.remove(saving_directory + model + ".zip")

    click.secho("Download succeeded. Loading model...", fg="yellow")
    experiment_folder = saving_directory + model
    checkpoints = [
        file for file in os.listdir(experiment_folder)
        if file.endswith(".ckpt")
    ]
    checkpoint = checkpoints[-1]
    checkpoint_path = experiment_folder + "/" + checkpoint
    return load_checkpoint(checkpoint_path)
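
One detail worth noting above: os.listdir makes no ordering guarantee, so checkpoints[-1] only happens to pick the "last" checkpoint. A hedged sketch of the same selection with an explicit sort:

import os


def latest_checkpoint(experiment_folder: str) -> str:
    """Return the path of the lexicographically last .ckpt file in a folder."""
    checkpoints = sorted(f for f in os.listdir(experiment_folder) if f.endswith(".ckpt"))
    if not checkpoints:
        raise FileNotFoundError(f"No .ckpt files found in {experiment_folder}")
    return os.path.join(experiment_folder, checkpoints[-1])
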
Example #9
    def from_pretrained(cls, hparams: Namespace):
        """Function that loads a pretrained LASER encoder and the respective tokenizer.
        
        :param hparams: Namespace.

        :returns: LASER Encoder model
        """
        if not os.path.exists(saving_directory):
            os.makedirs(saving_directory)

        download_file_maybe_extract(
            L93_LASER_MODEL_URL,
            directory=saving_directory,
            check_files=[L93_MODEL_NAME],
        )
        state_dict = torch.load(saving_directory + L93_MODEL_NAME)
        encoder = LASEREncoder(**state_dict["params"])
        encoder.load_state_dict(state_dict["model"])
        encoder.tokenizer = FastBPEEncoder(state_dict["dictionary"])
        return encoder
Example #10
    def _get_dataset(
        self,
        dataset_path: str = "",
        data_folder: str = "data/",
    ):
        """Downloads PersonaChat corpus from S3 if no dataset_path is provided.

        :param dataset_path: Path to a json file containing the train and validation dataset.
        :param data_folder: Folder used to store data.

        :return: Returns a dictionary with the training and validation data.
        """
        if not os.path.isfile(dataset_path):
            click.secho(f"Download dataset from {PERSONACHAT_URL}", fg="yellow")
            dataset_file = download_file_maybe_extract(
                PERSONACHAT_URL,
                directory=data_folder,
                check_files=["personachat_self_original.json"],
            )
            dataset_path = "data/personachat_self_original.json"

        dataset_hash = (
            int(hashlib.sha256(dataset_path.encode("utf-8")).hexdigest(), 16) % 10 ** 8
        )
        # To avoid using cache for different models
        # split(/) for microsoft/DialoGPT-small
        pretrained_model = (
            self.hparams.pretrained_model.split("/")[1]
            if "/" in self.hparams.pretrained_model
            else self.hparams.pretrained_model
        )
        dataset_cache = data_folder + ".dataset_" + str(dataset_hash) + pretrained_model

        if os.path.isfile(dataset_cache):
            click.secho(f"Loading tokenized dataset from cache: {dataset_cache}.")
            return torch.load(dataset_cache)
        else:
            dataset_file = dataset_path

        with open(dataset_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        click.secho("Running tokenization: This might take some time!", fg="yellow")
        dataset = self._tokenize(dataset)
        torch.save(dataset, dataset_cache)

        return dataset
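
The cache key above combines an 8-digit hash of the dataset path with the model name so that different models never reuse each other's tokenized cache. A standalone sketch of that naming, under the same assumptions:

import hashlib


def dataset_cache_path(dataset_path: str, pretrained_model: str, data_folder: str = "data/") -> str:
    """Mirror the cache naming used above: <data_folder>.dataset_<hash><model>."""
    dataset_hash = int(hashlib.sha256(dataset_path.encode("utf-8")).hexdigest(), 16) % 10 ** 8
    # Keep only the model name after an organisation prefix such as "microsoft/".
    model_name = pretrained_model.split("/")[-1]
    return data_folder + ".dataset_" + str(dataset_hash) + model_name
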
Example #11
    def from_pretrained(cls, hparams: HyperOptArgumentParser, lm_head: bool = False):
        if not os.path.exists("pretrained/"):
            os.mkdir("pretrained/")

        pretrained_model = hparams.pretrained_model
        if pretrained_model == "xlmr.base":
            download_file_maybe_extract(
                XLMR_BASE_URL,
                directory="pretrained",
                check_files=[XLMR_BASE_MODEL_NAME],
            )

        elif pretrained_model == "xlmr.large":
            download_file_maybe_extract(
                XLMR_LARGE_URL,
                directory="pretrained",
                check_files=[XLMR_LARGE_MODEL_NAME],
            )
        elif pretrained_model == "xlmr.base.v0":
            download_file_maybe_extract(
                XLMR_BASE_V0_URL,
                directory="pretrained",
                check_files=[XLMR_BASE_V0_MODEL_NAME],
            )

        elif pretrained_model == "xlmr.large.v0":
            download_file_maybe_extract(
                XLMR_LARGE_V0_URL,
                directory="pretrained",
                check_files=[XLMR_LARGE_V0_MODEL_NAME],
            )
        else:
            raise Exception(f"{pretrained_model} is an invalid XLM-R model.")

        xlmr = XLMRModel.from_pretrained(
            "pretrained/" + pretrained_model, checkpoint_file="model.pt"
        )
        xlmr.eval()
        tokenizer = RoBERTaTextEncoder(
            xlmr.encode, xlmr.task.source_dictionary.__dict__["indices"]
        )
        return XLMRoBERTa(
            xlmr=xlmr, tokenizer=tokenizer, hparams=hparams, lm_head=lm_head
        )
Example #12
    def from_pretrained(cls, hparams: Namespace):
        if not os.path.exists(saving_directory):
            os.makedirs(saving_directory)

        pretrained_model = hparams.pretrained_model
        if pretrained_model == "xlmr.base":
            download_file_maybe_extract(
                XLMR_BASE_URL,
                directory=saving_directory,
                check_files=[XLMR_BASE_MODEL_NAME],
            )

        elif pretrained_model == "xlmr.large":
            download_file_maybe_extract(
                XLMR_LARGE_URL,
                directory=saving_directory,
                check_files=[XLMR_LARGE_MODEL_NAME],
            )
        elif pretrained_model == "xlmr.base.v0":
            download_file_maybe_extract(
                XLMR_BASE_V0_URL,
                directory=saving_directory,
                check_files=[XLMR_BASE_V0_MODEL_NAME],
            )

        elif pretrained_model == "xlmr.large.v0":
            download_file_maybe_extract(
                XLMR_LARGE_V0_URL,
                directory=saving_directory,
                check_files=[XLMR_LARGE_V0_MODEL_NAME],
            )
        else:
            raise Exception(f"{pretrained_model} is an invalid XLM-R model.")

        xlmr = XLMRModel.from_pretrained(saving_directory + pretrained_model,
                                         checkpoint_file="model.pt")
        # xlmr.eval()
        tokenizer = XLMRTextEncoder(
            xlmr.encode, xlmr.task.source_dictionary.__dict__["indices"])
        return XLMREncoder(xlmr=xlmr, tokenizer=tokenizer, hparams=hparams)
Example #13
def model2download(
    saving_directory: str = get_cache_folder(),
    url: str = MODELS_URL,
) -> dict:
    """Download a dictionary with the mapping between models and downloading urls.
    :param saving_directory: RELATIVE path to the saving folder (must end with /).
    Return:
        - dictionary with the mapping between models and downloading urls.
    """
    if not os.path.exists(saving_directory):
        raise FileNotFoundError("The folder to save the model does not exist.")

    if os.path.exists(saving_directory + "available_models.yaml"):
        os.remove(saving_directory + "available_models.yaml")

    file_path = download_file_maybe_extract(
        url=url,
        directory=saving_directory,
        extension="yaml",
    )
    with open(file_path) as fp:
        return yaml.load(fp.read(), Loader=yaml.FullLoader)
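
A brief usage sketch for the helper above; Example #8 consumes the same mapping when resolving a model name to its download URL. It assumes model2download and get_cache_folder are importable.

models = model2download()  # {model_name: download_url}
for name, url in sorted(models.items()):
    print(f"{name} -> {url}")
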
Example #14
# -*- coding: utf-8 -*-
import os
import unittest

import torch

from caption.tokenizers import RoBERTaTextEncoder
from fairseq.models.roberta import RobertaModel
from test_tube import HyperOptArgumentParser
from torchnlp.download import download_file_maybe_extract

download_file_maybe_extract(
    "https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz",
    directory=os.environ["HOME"] + "/.cache/caption/",
    check_files=["roberta.base/model.pt"],
)
roberta = RobertaModel.from_pretrained(
    os.environ["HOME"] + "/.cache/caption/roberta.base",
    checkpoint_file="model.pt",
)
original_vocab = roberta.task.source_dictionary.__dict__["indices"]
tokenizer = RoBERTaTextEncoder(roberta.encode, original_vocab)


class TestRoBERTaTextEncoder(unittest.TestCase):
    def test_unk_property(self):
        assert tokenizer.unk_index == original_vocab["<unk>"]

    def test_pad_property(self):
        assert tokenizer.padding_index == original_vocab["<pad>"]
Example #15
def squad_dataset(
    directory='data/',
    train=False,
    dev=False,
    train_filename='train-v2.0.json',
    dev_filename='dev-v2.0.json',
    check_files_train=['train-v2.0.json'],
    check_files_dev=['dev-v2.0.json'],
    url_train='https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json',
    url_dev='https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json'
):
    """
    Load the Stanford Question Answering Dataset (SQuAD) dataset.

    Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of
    questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every
    question is a segment of text, or span, from the corresponding reading passage, or the question
    might be unanswerable. SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000
    unanswerable questions written adversarially by crowdworkers to look similar to answerable
    ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but also
    determine when no answer is supported by the paragraph and abstain from answering.

    **Reference:** https://rajpurkar.github.io/SQuAD-explorer/
    **Citation:**
    Rajpurkar, P., Jia, R. and Liang, P., 2018.
    Know what you don't know: Unanswerable questions for SQuAD.
    arXiv preprint arXiv:1806.03822.

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): If to load the training split of the dataset.
        dev (bool, optional): If to load the development split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        check_files_train (list, optional): All train filenames
        check_files_dev (list, optional): All development filenames
        url_train (str, optional): URL of the train dataset `.json` file.
        url_dev (str, optional): URL of the dev dataset `.json` file.

    Returns:
        :class:`tuple` of :class:`iterable` or :class:`iterable`:
        Returns between one and all dataset splits (train and dev) depending on if their
        respective boolean argument is ``True``.

    Example:
        >>> from torchnlp.datasets import squad_dataset  # doctest: +SKIP
        >>> train = squad_dataset(train=True)  # doctest: +SKIP
        >>> train[0]['paragraphs'][0]['qas'][0]['question']  # doctest: +SKIP
        'When did Beyonce start becoming popular?'
        >>> train[0]['paragraphs'][0]['qas'][0]['answers'][0]  # doctest: +SKIP
        {'text': 'in the late 1990s', 'answer_start': 269}
    """
    download_file_maybe_extract(url=url_dev,
                                directory=directory,
                                check_files=check_files_dev)
    download_file_maybe_extract(url=url_train,
                                directory=directory,
                                check_files=check_files_train)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, filename)
        with open(full_path, 'r') as temp:
            ret.append(json.load(temp)['data'])

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
Example #16
def ud_pos_dataset(
        directory='data/',
        train=False,
        dev=False,
        test=False,
        train_filename='en-ud-tag.v2.train.txt',
        dev_filename='en-ud-tag.v2.dev.txt',
        test_filename='en-ud-tag.v2.test.txt',
        extracted_name='en-ud-v2',
        check_files=['en-ud-v2/en-ud-tag.v2.train.txt'],
        url='https://bitbucket.org/sivareddyg/public/downloads/en-ud-v2.zip'):
    """
    Load the Universal Dependencies - English Dependency Treebank dataset.

    Corpus of sentences annotated using Universal Dependencies annotation. The corpus comprises
    254,830 words and 16,622 sentences, taken from various web media including weblogs, newsgroups,
    emails, reviews, and Yahoo! answers.

    References:
        * http://universaldependencies.org/
        * https://github.com/UniversalDependencies/UD_English

    **Citation:**
    Natalia Silveira and Timothy Dozat and Marie-Catherine de Marneffe and Samuel Bowman and
    Miriam Connor and John Bauer and Christopher D. Manning (2014).
    A Gold Standard Dependency Corpus for {E}nglish

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): If to load the training split of the dataset.
        dev (bool, optional): If to load the development split of the dataset.
        test (bool, optional): If to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_files (str, optional): Check if these files exist, then this download was successful.
        url (str, optional): URL of the dataset `tar.gz` file.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
        Returns between one and all dataset splits (train, dev and test) depending on if their
        respective boolean argument is ``True``.

    Example:
        >>> from torchnlp.datasets import ud_pos_dataset
        >>> train = ud_pos_dataset(train=True)
        >>> train[17] # Sentence at index 17 is shortish
        {
          'tokens': ['Guerrillas', 'killed', 'an', 'engineer', ',', 'Asi', 'Ali', ',', 'from',
                     'Tikrit', '.'],
          'ud_tags': ['NOUN', 'VERB', 'DET', 'NOUN', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'ADP',
                      'PROPN', 'PUNCT'],
          'ptb_tags': ['NNS', 'VBD', 'DT', 'NN', ',', 'NNP', 'NNP', ',', 'IN', 'NNP', '.']
        }
    """
    download_file_maybe_extract(url=url,
                                directory=directory,
                                check_files=check_files)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename),
              (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        examples = []
        with io.open(full_path, encoding='utf-8') as f:
            sentence = {'tokens': [], 'ud_tags': [], 'ptb_tags': []}
            for line in f:
                line = line.strip()
                if line == '' and len(sentence['tokens']) > 0:
                    examples.append(sentence)
                    sentence = {'tokens': [], 'ud_tags': [], 'ptb_tags': []}
                elif line != '':
                    token, ud_tag, ptb_tag = tuple(line.split('\t'))
                    sentence['tokens'].append(token)
                    sentence['ud_tags'].append(ud_tag)
                    sentence['ptb_tags'].append(ptb_tag)
        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
Example #17
def wmt_dataset(directory='data/wmt16_en_de',
                train=False,
                dev=False,
                test=False,
                train_filename='train.tok.clean.bpe.32000',
                dev_filename='newstest2013.tok.bpe.32000',
                test_filename='newstest2014.tok.bpe.32000',
                check_files=['train.tok.clean.bpe.32000.en'],
                url='https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8'):
    """
    The Workshop on Machine Translation (WMT) 2014 English-German dataset.

    Initially this dataset was preprocessed by Google Brain. Though this download contains test sets
    from 2015 and 2016, the train set differs slightly from WMT 2015 and 2016 and significantly from
    WMT 2017.

    The provided data is mainly taken from version 7 of the Europarl corpus, which is freely
    available. Note that this is the same data as last year, since Europarl is no longer translated
    across all 23 official European languages. Additional training data is taken from the new News
    Commentary corpus. There are about 50 million words of training data per language from the
    Europarl corpus and 3 million words from the News Commentary corpus.

    A new data resource from 2013 is the Common Crawl corpus which was collected from web sources.
    Each parallel corpus comes with an annotation file that gives the source of each sentence pair.

    References:
        * https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/translate_ende.py # noqa: E501
        * http://www.statmt.org/wmt14/translation-task.html

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): If to load the training split of the dataset.
        dev (bool, optional): If to load the dev split of the dataset.
        test (bool, optional): If to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the dev split.
        test_filename (str, optional): The filename of the test split.
        check_files (str, optional): Check if these files exist, then this download was successful.
        url (str, optional): URL of the dataset `tar.gz` file.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training tokens, dev
        tokens and test tokens in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import wmt_dataset
        >>> train = wmt_dataset(train=True)
        >>> train[:2]
        [{
          'en': 'Res@@ um@@ ption of the session',
          'de': 'Wiederaufnahme der Sitzungsperiode'
        }, {
          'en': 'I declare resumed the session of the European Parliament ad@@ jour@@ ned on...'
          'de': 'Ich erklär@@ e die am Freitag , dem 17. Dezember unterbro@@ ch@@ ene...'
        }]
    """
    download_file_maybe_extract(
        url=url, directory=directory, check_files=check_files, filename='wmt16_en_de.tar.gz')

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]

    for filename in splits:
        examples = []

        en_path = os.path.join(directory, filename + '.en')
        de_path = os.path.join(directory, filename + '.de')
        en_file = [l.strip() for l in open(en_path, 'r', encoding='utf-8')]
        de_file = [l.strip() for l in open(de_path, 'r', encoding='utf-8')]
        assert len(en_file) == len(de_file)
        for i in range(len(en_file)):
            if en_file[i] != '' and de_file[i] != '':
                examples.append({'en': en_file[i], 'de': de_file[i]})

        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
Example #18
def wikitext_2_dataset(
        directory='data/',
        train=False,
        dev=False,
        test=False,
        train_filename='wiki.train.tokens',
        dev_filename='wiki.valid.tokens',
        test_filename='wiki.test.tokens',
        extracted_name='wikitext-2',
        check_files=['wikitext-2/wiki.train.tokens'],
        url='https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip',
        unknown_token=DEFAULT_UNKNOWN_TOKEN,
        eos_token=DEFAULT_EOS_TOKEN):
    """
    Load the WikiText-2 dataset.

    The WikiText language modeling dataset is a collection of over 100 million tokens extracted
    from the set of verified Good and Featured articles on Wikipedia. The dataset is available
    under the Creative Commons Attribution-ShareAlike License.

    **Reference:**
    https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): If to load the training split of the dataset.
        dev (bool, optional): If to load the development split of the dataset.
        test (bool, optional): If to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_files (str, optional): Check if these files exist, then this download was successful.
        url (str, optional): URL of the dataset `tar.gz` file.
        unknown_token (str, optional): Token to use for unknown words.
        eos_token (str, optional): Token to use at the end of sentences.

    Returns:
        :class:`tuple` of :class:`iterable` or :class:`iterable`:
        Returns between one and all dataset splits (train, dev and test) depending on if their
        respective boolean argument is ``True``.

    Example:
        >>> from torchnlp.datasets import wikitext_2_dataset  # doctest: +SKIP
        >>> train = wikitext_2_dataset(train=True)  # doctest: +SKIP
        >>> train[:10]  # doctest: +SKIP
        ['</s>', '=', 'Valkyria', 'Chronicles', 'III', '=', '</s>', '</s>', 'Senjō', 'no']
    """
    download_file_maybe_extract(url=url, directory=directory, check_files=check_files)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        text = []
        with io.open(full_path, encoding='utf-8') as f:
            for line in f:
                text.extend(line.replace('<unk>', unknown_token).split())
                text.append(eos_token)
        ret.append(text)

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
Example #19
def smt_dataset(
        directory='data/',
        train=False,
        dev=False,
        test=False,
        train_filename='train.txt',
        dev_filename='dev.txt',
        test_filename='test.txt',
        extracted_name='trees',
        check_files=['trees/train.txt'],
        url='http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip',
        fine_grained=False,
        subtrees=False):
    """
    Load the Stanford Sentiment Treebank dataset.
    Semantic word spaces have been very useful but cannot express the meaning of longer phrases in
    a principled way. Further progress towards understanding compositionality in tasks such as
    sentiment detection requires richer supervised training and evaluation resources and more
    powerful models of composition. To remedy this, we introduce a Sentiment Treebank. It includes
    fine grained sentiment labels for 215,154 phrases in the parse trees of 11,855 sentences and
    presents new challenges for sentiment compositionality.
    **Reference**:
    https://nlp.stanford.edu/sentiment/index.html
    **Citation:**
    Richard Socher, Alex Perelygin, Jean Y. Wu, Jason Chuang, Christopher D. Manning,
    Andrew Y. Ng and Christopher Potts. Recursive Deep Models for Semantic Compositionality Over a
    Sentiment Treebank
    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): If to load the training split of the dataset.
        dev (bool, optional): If to load the development split of the dataset.
        test (bool, optional): If to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_files (str, optional): Check if these files exist, then this download was successful.
        url (str, optional): URL of the dataset `tar.gz` file.
        subtrees (bool, optional): Whether to include sentiment-tagged subphrases in addition to
            complete examples.
        fine_grained (bool, optional): Whether to use 5-class instead of 3-class labeling.
    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training tokens, dev
        tokens and test tokens in order if their respective boolean argument is true.
    Example:
        # >>> from torchnlp.datasets import smt_dataset
        # >>> train = smt_dataset(train=True)
        # >>> train[5]
        {
          'text': "Whether or not you 're enlightened by any of Derrida 's lectures on ...",
          'label': 'positive'
        }
    """
    download_file_maybe_extract(url=url,
                                directory=directory,
                                check_files=check_files)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename),
              (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        examples = []
        with io.open(full_path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if subtrees:
                    examples.extend(
                        parse_tree(line,
                                   subtrees=subtrees,
                                   fine_grained=fine_grained))
                else:
                    examples.append(
                        parse_tree(line,
                                   subtrees=subtrees,
                                   fine_grained=fine_grained))
        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
Example #20
    def cache(self, name, cache, url=None):
        if os.path.isfile(name):
            path = name
            path_pt = os.path.join(cache, os.path.basename(name)) + '.pt'
        else:
            path = os.path.join(cache, name)
            path_pt = path + '.pt'

        if not os.path.isfile(path_pt) or self.is_include is not None:
            if url:
                download_file_maybe_extract(url=url, directory=cache, check_files=[name])

            if not os.path.isfile(path):
                raise RuntimeError('no vectors found at {}'.format(path))

            index_to_token, vectors, dim = [], None, None

            # Try to read the whole file with utf-8 encoding.
            binary_lines = False
            try:
                with io.open(path, encoding="utf8") as f:
                    lines = [line for line in f]
            # If there are malformed lines, read in binary mode
            # and manually decode each word from utf-8
            except:
                logger.warning("Could not read {} as UTF8 file, "
                               "reading file as bytes and skipping "
                               "words with malformed UTF8.".format(path))
                with open(path, 'rb') as f:
                    lines = [line for line in f]
                binary_lines = True

            logger.info("Loading vectors from {}".format(path))
            for line in tqdm(lines, total=len(lines)):
                # Explicitly splitting on " " is important, so we don't
                # get rid of Unicode non-breaking spaces in the vectors.
                entries = line.rstrip().split(b" " if binary_lines else " ")

                word, entries = entries[0], entries[1:]
                if dim is None and vectors is None and len(entries) > 1:
                    dim = len(entries)
                    vectors = torch.empty(len(lines), dim, dtype=torch.float)
                elif len(entries) == 1:
                    logger.warning("Skipping token {} with 1-dimensional "
                                   "vector {}; likely a header".format(word, entries))
                    continue
                elif dim != len(entries):
                    raise RuntimeError("Vector for token {} has {} dimensions, but previously "
                                       "read vectors have {} dimensions. All vectors must have "
                                       "the same number of dimensions.".format(
                                           word, len(entries), dim))

                if binary_lines:
                    try:
                        if isinstance(word, six.binary_type):
                            word = word.decode('utf-8')
                    except:
                        logger.info("Skipping non-UTF8 token {}".format(repr(word)))
                        continue

                if self.is_include is not None and not self.is_include(word):
                    continue

                vectors[len(index_to_token)] = torch.tensor([float(x) for x in entries])
                index_to_token.append(word)

            self.index_to_token = index_to_token
            self.token_to_index = {word: i for i, word in enumerate(index_to_token)}
            self.vectors = vectors[:len(index_to_token)]
            self.dim = dim
            logger.info('Saving vectors to {}'.format(path_pt))
            if not os.path.exists(cache):
                os.makedirs(cache)
            torch.save((self.index_to_token, self.token_to_index, self.vectors, self.dim), path_pt)
        else:
            logger.info('Loading vectors from {}'.format(path_pt))
            self.index_to_token, self.token_to_index, self.vectors, self.dim = torch.load(path_pt)
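
The method above caches the parsed vectors in a single .pt file so later runs can skip the text parsing entirely. A minimal standalone sketch of that round-trip, using the same tuple layout as the torch.save call above:

import os

import torch


def save_vector_cache(path_pt, index_to_token, token_to_index, vectors, dim):
    """Persist parsed embeddings in the (index_to_token, token_to_index, vectors, dim) layout."""
    os.makedirs(os.path.dirname(path_pt) or ".", exist_ok=True)
    torch.save((index_to_token, token_to_index, vectors, dim), path_pt)


def load_vector_cache(path_pt):
    """Load a previously cached (index_to_token, token_to_index, vectors, dim) tuple."""
    return torch.load(path_pt)
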
Example #21
def imdb_dataset(
        directory='data/',
        train=False,
        test=False,
        train_directory='train',
        test_directory='test',
        extracted_name='aclImdb',
        check_files=['aclImdb/README'],
        url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
        sentiments=['pos', 'neg']):
    """
    Load the IMDB dataset (Large Movie Review Dataset v1.0).

    This is a dataset for binary sentiment classification containing substantially more data than
    previous benchmark datasets. Provided a set of 25,000 highly polar movie reviews for
    training, and 25,000 for testing. There is additional unlabeled data for use as well. Raw text
    and already processed bag of words formats are provided.

    Note:
        The order in which examples are returned is not guaranteed due to ``iglob``.

    **Reference:** http://ai.stanford.edu/~amaas/data/sentiment/

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): If to load the training split of the dataset.
        test (bool, optional): If to load the test split of the dataset.
        train_directory (str, optional): The directory of the training split.
        test_directory (str, optional): The directory of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_files (str, optional): Check if these files exist, then this download was successful.
        url (str, optional): URL of the dataset ``tar.gz`` file.
        sentiments (list of str, optional): Sentiments to load from the dataset.

    Returns:
        :class:`tuple` of :class:`iterable` or :class:`iterable`:
        Returns between one and all dataset splits (train, dev and test) depending on if their
        respective boolean argument is ``True``.

    Example:
        >>> from torchnlp.datasets import imdb_dataset  # doctest: +SKIP
        >>> train = imdb_dataset(train=True)  # doctest: +SKIP
        >>> train[0:2]  # doctest: +SKIP
        [{
          'text': 'For a movie that gets no respect there sure are a lot of memorable quotes...',
          'sentiment': 'pos'
        }, {
          'text': 'Bizarre horror movie filled with famous faces but stolen by Cristina Raines...',
          'sentiment': 'pos'
        }]
    """
    download_file_maybe_extract(url=url,
                                directory=directory,
                                check_files=check_files)

    ret = []
    splits = [
        dir_ for (requested,
                  dir_) in [(train, train_directory), (test, test_directory)]
        if requested
    ]
    for split_directory in splits:
        full_path = os.path.join(directory, extracted_name, split_directory)
        examples = []
        for sentiment in sentiments:
            for filename in glob.iglob(
                    os.path.join(full_path, sentiment, '*.txt')):
                with open(filename, 'r', encoding="utf-8") as f:
                    text = f.readline()
                examples.append({
                    'text': text,
                    'sentiment': sentiment,
                })
        ret.append(examples)

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
Example #22
def glue_dataset(directory='data/GLUE/',
                 train=False,
                 dev=False,
                 test=False,
                 dev_mismatched=False,
                 test_mismatched=False,
                 train_filename='train.tsv',
                 dev_filename='dev.tsv',
                 test_filename='test.tsv',
                 dev_filename_mismatched='dev_mismatched.tsv',
                 test_filename_mismatched='test_mismatched.tsv',
                 check_files=['train.tsv'],
                 dataset=None):
    """
    Load the General Language Understanding Evaluation (GLUE) benchmark.

    The GLUE benchmark is a collection of resources for training, evaluating, and analyzing natural language understanding systems.

    References:
        - https://github.com/nyu-mll/GLUE-baselines
        - https://www.nyu.edu/projects/bowman/glue.pdf
        - https://gluebenchmark.com

    **Citation**
    ::
        @unpublished{wang2018glue
             title={{GLUE}: A Multi-Task Benchmark and Analysis Platform for
                     Natural Language Understanding}
             author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill,
                     Felix and Levy, Omer and Bowman, Samuel R.}
             note={arXiv preprint 1804.07461}
             year={2018}
         }

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): If to load the training split of the dataset.
        dev (bool, optional): If to load the dev split of the dataset.
        test (bool, optional): If to load the test split of the dataset.
        dev_mismatched (bool, optional): Only used for the MNLI set, if to load the dev mismatched split of the dataset.
        test_mismatched (bool, optional): Only used for the MNLI set, if to load the test mismatched split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        dev_filename_mismatched (str, optional): Only used for the MNLI set, the filename of the mismatched development split.
        test_filename_mismatched (str, optional): Only used for the MNLI set, the filename of the mismatched test split.
        check_files (str, optional): Check if these files exist, then this download was successful.
        dataset (str, optional): Dataset of GLUE to download.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
        Returns between one and all dataset splits (train, dev and test) depending on if their
        respective boolean argument is ``True``.
        Remark: when loading the MNLI dataset, values are returned in the following order if their respective boolean arguments are true:
                (train, dev_matched, dev_mismatched, test_matched, test_mismatched)

    Remarks:
        To load the 'in domain' or 'out of domain' splits of CoLA, set train_filename='original/tokenized/in_domain_train.tsv'
        (or the corresponding out-of-domain file); the returned train Dataset will then be the requested split.

    Example: 
        >>> from torchnlp.datasets import glue_dataset
        >>> train = glue_dataset(train=True, dataset = 'QNLI')
        >>> train[:2]
        [{
          'index': '0',
          'question': 'What is the Grotto at Notre Dame?',
          'sentence': 'Immediately behind the basilica ... ',
          'label': 'entailment'
        }, {
          'index': '1',
          'question': 'What is the Grotto at Notre Dame?',
          'sentence': 'It is a replica of the grotto at ... ',
          'label': 'not_entailment'
        }]
    """
    if dataset is None:
        print(
            'You must select one of the GLUE datasets. (CoLA, SST, MRPC, QQP, STS, MNLI, SNLI, QNLI, RTE, WNLI or diagnostic)'
        )
        return

    assert dataset in ZIP_FOLDER_NAME, "Dataset %s not found!" % dataset
    folder_path = os.path.join(directory, ZIP_FOLDER_NAME[dataset])
    url = GLUE_DATASETS_PATHS[dataset]
    check_file_list = [
        os.path.join(ZIP_FOLDER_NAME[dataset], f) for f in check_files
    ]
    if dataset == 'MRPC':
        download_file_maybe_extract(url=url,
                                    directory=folder_path,
                                    filename='dev_ids.tsv')
        download_file_maybe_extract(url=MRPC_TRAIN,
                                    directory=folder_path,
                                    filename='msr_paraphrase_train.txt')
        download_file_maybe_extract(url=MRPC_TEST,
                                    directory=folder_path,
                                    filename='msr_paraphrase_test.txt')
        mrpc_train_file = os.path.join(folder_path, 'msr_paraphrase_train.txt')
        mrpc_test_file = os.path.join(folder_path, 'msr_paraphrase_test.txt')
        mrpc_dev_file = os.path.join(folder_path, 'dev_ids.tsv')
        assert os.path.isfile(
            mrpc_dev_file), "Dev data not found at %s" % mrpc_dev_file
        assert os.path.isfile(
            mrpc_train_file), "Train data not found at %s" % mrpc_train_file
        assert os.path.isfile(
            mrpc_test_file), "Test data not found at %s" % mrpc_test_file
        mrpc_processing(folder_path, mrpc_train_file, mrpc_test_file)
        os.remove(mrpc_train_file)
        os.remove(mrpc_test_file)
        os.remove(mrpc_dev_file)
    elif dataset == 'diagnostic':
        download_file_maybe_extract(url=url,
                                    directory=folder_path,
                                    filename='diagnostic.tsv',
                                    check_files=[])
    else:
        download_file_maybe_extract(url=url,
                                    directory=directory,
                                    check_files=check_file_list)
        parse = urlparse(url)
        zip_file = os.path.join(directory, os.path.basename(parse.path))
        if os.path.isfile(zip_file):
            os.remove(zip_file)
    ret = []
    if dataset == 'MNLI':
        dev_filename = "dev_matched.tsv"
        test_filename = "test_matched.tsv"
        splits = [(train, train_filename), (dev, dev_filename),
                  (dev_mismatched, dev_filename_mismatched),
                  (test, test_filename),
                  (test_mismatched, test_filename_mismatched)]
        splits = [f for (requested, f) in splits if requested]
    else:
        splits = [(train, train_filename), (dev, dev_filename),
                  (test, test_filename)]
        splits = [f for (requested, f) in splits if requested]
    if dataset == 'diagnostic':
        examples = []
        with open(os.path.join(folder_path, 'diagnostic.tsv'),
                  newline='') as tsvfile:
            tsvreader = csv.reader(tsvfile,
                                   delimiter='\t',
                                   quoting=csv.QUOTE_NONE)
            keys = next(tsvreader)
            nb_arguments = len(keys)
            for line in tsvreader:
                examples.append(
                    {keys[i]: line[i]
                     for i in range(nb_arguments)})
        ret.append(Dataset(examples))
    elif dataset == 'CoLA':
        for filename in splits:
            examples = []
            if filename == 'test.tsv':
                with open(os.path.join(folder_path, filename),
                          newline='') as tsvfile:
                    tsvreader = csv.reader(tsvfile,
                                           delimiter='\t',
                                           quoting=csv.QUOTE_NONE)
                    keys = next(tsvreader)
                    nb_arguments = len(keys)
                    for line in tsvreader:
                        examples.append(
                            {keys[i]: line[i]
                             for i in range(nb_arguments)})
            else:
                with open(os.path.join(folder_path, filename),
                          newline='') as tsvfile:
                    tsvreader = csv.reader(tsvfile,
                                           delimiter='\t',
                                           quoting=csv.QUOTE_NONE)
                    keys = [
                        'source', 'acceptability judgment',
                        'original acceptability judgment', 'sentence'
                    ]
                    nb_arguments = len(keys)
                    for line in tsvreader:
                        examples.append(
                            {keys[i]: line[i]
                             for i in range(nb_arguments)})
            ret.append(Dataset(examples))
    elif dataset == 'MRPC':
        for filename in splits:
            examples = []
            with open(os.path.join(folder_path, filename),
                      newline='') as tsvfile:
                tsvreader = csv.reader(tsvfile,
                                       delimiter='\t',
                                       quoting=csv.QUOTE_NONE)
                keys = next(tsvreader)
                if filename != 'test.tsv':
                    # Use an explicit key list: a UTF-8 BOM ('\ufeff') sometimes precedes 'Quality' after mrpc_processing()
                    keys = [
                        'Quality', '#1 ID', '#2 ID', '#1 String', '#2 String'
                    ]
                nb_arguments = len(keys)
                for line in tsvreader:
                    examples.append(
                        {keys[i]: line[i]
                         for i in range(nb_arguments)})
            ret.append(Dataset(examples))
    else:
        for filename in splits:
            examples = []
            with open(os.path.join(folder_path, filename),
                      newline='') as tsvfile:
                tsvreader = csv.reader(tsvfile,
                                       delimiter='\t',
                                       quoting=csv.QUOTE_NONE)
                keys = next(tsvreader)
                nb_arguments = len(keys)
                for line in tsvreader:
                    """
                        problem with QQP and SNLI : len(line) is different from nb_arguments for some lines ...
                    """
                    if len(line) == nb_arguments:
                        examples.append(
                            {keys[i]: line[i]
                             for i in range(nb_arguments)})
                    else:
                        pass
            ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
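
The four nearly identical TSV-reading blocks above could share one helper. A hedged sketch: keys default to the header row as in the generic branch, while headerless splits such as CoLA train/dev would pass has_header=False together with an explicit key list.

import csv
import os


def read_tsv_examples(folder_path, filename, keys=None, has_header=True):
    """Read a GLUE-style TSV into dicts, skipping rows whose width does not match the keys."""
    examples = []
    with open(os.path.join(folder_path, filename), newline="") as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter="\t", quoting=csv.QUOTE_NONE)
        header = next(tsvreader) if has_header else None
        keys = keys or header
        for line in tsvreader:
            if len(line) == len(keys):
                examples.append({keys[i]: line[i] for i in range(len(keys))})
    return examples
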
Example #23
def iwslt_dataset(
    directory='data/iwslt/',
    train=False,
    dev=False,
    test=False,
    language_extensions=['en', 'de'],
    train_filename='{source}-{target}/train.{source}-{target}.{lang}',
    dev_filename='{source}-{target}/IWSLT16.TED.tst2013.{source}-{target}.{lang}',
    test_filename='{source}-{target}/IWSLT16.TED.tst2014.{source}-{target}.{lang}',
    check_files=['{source}-{target}/train.tags.{source}-{target}.{source}'],
    url='https://wit3.fbk.eu/archive/2016-01/texts/{source}/{target}/{source}-{target}.tgz'
):
    """
    Load the International Workshop on Spoken Language Translation (IWSLT) 2017 translation dataset.

    In-domain training, development and evaluation sets were supplied through the website of the
    WIT3 project, while out-of-domain training data were linked in the workshop’s website. With
    respect to edition 2016 of the evaluation campaign, some of the talks added to the TED
    repository during the last year have been used to define the evaluation sets (tst2017), while
    the remaining new talks have been included in the training sets.

    The English data that participants were asked to recognize and translate consists in part of
    TED talks as in the years before, and in part of real-life lectures and talks that have been
    mainly recorded in lecture halls at KIT and Carnegie Mellon University. TED talks are
    challenging due to their variety of topics, but they are comparatively benign because they are
    thoroughly rehearsed and planned, which makes the language easy to recognize and translate.

    Note:
        The order in which examples are returned is not guaranteed due to ``iglob``.

    References:
      * http://workshop2017.iwslt.org/downloads/iwslt2017_proceeding_v2.pdf
      * http://workshop2017.iwslt.org/

    **Citation:**
    M. Cettolo, C. Girardi, and M. Federico. 2012. WIT3: Web Inventory of Transcribed and Translated
    Talks. In Proc. of EAMT, pp. 261-268, Trento, Italy.

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the dev split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        language_extensions (:class:`list` of :class:`str`): Two language extensions
            ['en'|'de'|'it'|'nl'|'ro'] to load.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the dev split.
        test_filename (str, optional): The filename of the test split.
        check_files (:class:`list` of :class:`str`, optional): Files whose presence confirms
            that the download succeeded.
        url (str, optional): URL of the dataset file.

    Returns:
        :class:`tuple` of :class:`iterable` or :class:`iterable`:
        Returns between one and all dataset splits (train, dev and test) depending on if their
        respective boolean argument is ``True``.

    Example:
        >>> from torchnlp.datasets import iwslt_dataset  # doctest: +SKIP
        >>> train = iwslt_dataset(train=True)  # doctest: +SKIP
        >>> train[:2]  # doctest: +SKIP
        [{
          'en': "David Gallo: This is Bill Lange. I'm Dave Gallo.",
          'de': 'David Gallo: Das ist Bill Lange. Ich bin Dave Gallo.'
        }, {
          'en': "And we're going to tell you some stories from the sea here in video.",
          'de': 'Wir werden Ihnen einige Geschichten über das Meer in Videoform erzählen.'
        }]
    """
    if len(language_extensions) != 2:
        raise ValueError(
            "`language_extensions` must be two language extensions "
            "['en'|'de'|'it'|'ni'|'ro'] to load.")

    # Format Filenames
    source, target = tuple(language_extensions)
    check_files = [s.format(source=source, target=target) for s in check_files]
    url = url.format(source=source, target=target)

    download_file_maybe_extract(url=url,
                                directory=directory,
                                check_files=check_files)

    iwslt_clean(
        os.path.join(directory, '{source}-{target}'.format(source=source,
                                                           target=target)))

    ret = []
    splits = [(train, train_filename), (dev, dev_filename),
              (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        examples = []
        for extension in language_extensions:
            path = os.path.join(
                directory,
                filename.format(lang=extension, source=source, target=target))
            with open(path, 'r', encoding='utf-8') as f:
                language_specific_examples = [l.strip() for l in f]

            if len(examples) == 0:
                examples = [{} for _ in range(len(language_specific_examples))]
            for i, example in enumerate(language_specific_examples):
                examples[i][extension] = example

        ret.append(examples)

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
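
# Usage sketch (not part of the original example): each requested split is
# returned as a list of {language_extension: sentence} dicts, and requesting
# several splits returns a tuple that can be unpacked directly. The en-de pair
# matches the defaults above; whether other pairs exist on the WIT3 server is
# an assumption, and the call below downloads the data on first use.
if __name__ == '__main__':
    train, dev = iwslt_dataset(train=True, dev=True, language_extensions=['en', 'de'])
    print(len(train), 'training pairs;', len(dev), 'dev pairs')
    print(train[0]['en'], '=>', train[0]['de'])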
Exemple #24
0
def snli_dataset(directory='data/',
                 train=False,
                 dev=False,
                 test=False,
                 train_filename='snli_1.0_train.jsonl',
                 dev_filename='snli_1.0_dev.jsonl',
                 test_filename='snli_1.0_test.jsonl',
                 extracted_name='snli_1.0',
                 check_files=['snli_1.0/snli_1.0_train.jsonl'],
                 url='http://nlp.stanford.edu/projects/snli/snli_1.0.zip'):
    """
    Load the Stanford Natural Language Inference (SNLI) dataset.

    The SNLI corpus (version 1.0) is a collection of 570k human-written English sentence pairs
    manually labeled for balanced classification with the labels entailment, contradiction, and
    neutral, supporting the task of natural language inference (NLI), also known as recognizing
    textual entailment (RTE). It is intended to serve both as a benchmark for evaluating
    representational systems for text, especially those induced by representation
    learning methods, and as a resource for developing NLP models of any kind.

    **Reference:** https://nlp.stanford.edu/projects/snli/

    **Citation:**
    Samuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning. 2015. A large
    annotated corpus for learning natural language inference. In Proceedings of the 2015 Conference
    on Empirical Methods in Natural Language Processing (EMNLP).

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the development split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_files (:class:`list` of :class:`str`, optional): Files whose presence confirms
            that the download succeeded.
        url (str, optional): URL of the dataset `zip` file.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
        Returns between one and all dataset splits (train, dev and test) depending on if their
        respective boolean argument is ``True``.

    Example:
        >>> from torchnlp.datasets import snli_dataset  # doctest: +SKIP
        >>> train = snli_dataset(train=True)  # doctest: +SKIP
        >>> train[0]  # doctest: +SKIP
        {
          'premise': 'Kids are on a amusement ride.',
          'hypothesis': 'A car is broke down on the side of the road.',
          'label': 'contradiction',
          'premise_transitions': ['shift', 'shift', 'shift', 'shift', 'shift', 'shift', ...],
          'hypothesis_transitions': ['shift', 'shift', 'shift', 'shift', 'shift', 'shift', ...],
        }
    """
    download_file_maybe_extract(url=url,
                                directory=directory,
                                check_files=check_files)

    get_transitions = lambda parse: [
        'reduce' if t == ')' else 'shift' for t in parse if t != '('
    ]
    ret = []
    splits = [(train, train_filename), (dev, dev_filename),
              (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        examples = []
        with io.open(full_path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                line = json.loads(line)
                examples.append({
                    'premise': line['sentence1'],
                    'hypothesis': line['sentence2'],
                    'label': line['gold_label'],
                    'premise_transitions':
                        get_transitions(line['sentence1_binary_parse']),
                    'hypothesis_transitions':
                        get_transitions(line['sentence2_binary_parse'])
                })
        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
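
# Illustration (not part of the original example) of the shift/reduce encoding
# used by `get_transitions` above: '(' tokens are dropped, ')' becomes 'reduce',
# and every other token becomes 'shift'. Note that the example above passes the
# raw binary-parse string, so the comprehension iterates over characters;
# splitting on whitespace first, as below, gives the conventional
# one-transition-per-token encoding.
get_transitions = lambda parse: [
    'reduce' if t == ')' else 'shift' for t in parse if t != '('
]

binary_parse = '( ( Kids ( are playing ) ) )'  # hypothetical SNLI-style parse
print(get_transitions(binary_parse.split()))
# -> ['shift', 'shift', 'shift', 'reduce', 'reduce', 'reduce']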
Exemple #25
0
def simple_qa_dataset(
        directory='data/',
        train=False,
        dev=False,
        test=False,
        extracted_name='SimpleQuestions_v2',
        train_filename='annotated_fb_data_train.txt',
        dev_filename='annotated_fb_data_valid.txt',
        test_filename='annotated_fb_data_test.txt',
        check_files=['SimpleQuestions_v2/annotated_fb_data_train.txt'],
        url='https://www.dropbox.com/s/tohrsllcfy7rch4/SimpleQuestions_v2.tgz?raw=1',
):  # pragma: no cover
    """
    Load the SimpleQuestions dataset.

    Single-relation factoid questions (simple questions) are common in many settings
    (e.g. Microsoft’s search query logs and WikiAnswers questions). The SimpleQuestions dataset is
    one of the most commonly used benchmarks for studying single-relation factoid questions.

    **Reference:**
    https://research.fb.com/publications/large-scale-simple-question-answering-with-memory-networks/

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the development split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        extracted_name (str, optional): Name of the extracted dataset directory.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        check_files (:class:`list` of :class:`str`, optional): Files whose presence confirms
            that the download succeeded.
        url (str, optional): URL of the dataset `tar.gz` file.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
        Returns between one and all dataset splits (train, dev and test) depending on if their
        respective boolean argument is ``True``.

    Example:
        >>> from torchnlp.datasets import simple_qa_dataset  # doctest: +SKIP
        >>> train = simple_qa_dataset(train=True)  # doctest: +SKIP
        SimpleQuestions_v2.tgz:  15%|▏| 62.3M/423M [00:09<00:41, 8.76MB/s]
        >>> train[0:2]  # doctest: +SKIP
        [{
          'question': 'what is the book e about',
          'relation': 'www.freebase.com/book/written_work/subjects',
          'object': 'www.freebase.com/m/01cj3p',
          'subject': 'www.freebase.com/m/04whkz5'
        }, {
          'question': 'to what release does the release track cardiac arrest come from',
          'relation': 'www.freebase.com/music/release_track/release',
          'object': 'www.freebase.com/m/0sjc7c1',
          'subject': 'www.freebase.com/m/0tp2p24'
        }]
    """
    download_file_maybe_extract(url=url, directory=directory, check_files=check_files)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        data = pd.read_csv(
            full_path, header=None, sep='\t', names=['subject', 'relation', 'object', 'question'])
        ret.append(
            Dataset([{
                'question': row['question'],
                'relation': row['relation'],
                'object': row['object'],
                'subject': row['subject'],
            } for _, row in data.iterrows()]))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
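
# Design note (not from the original example): the `iterrows()` loop in
# `simple_qa_dataset` above can be replaced by pandas' built-in record
# conversion, which yields the same per-row dicts (up to key order) without the
# slow per-row iteration. A minimal sketch under that assumption:
import pandas as pd


def read_simple_qa_split(full_path):
    data = pd.read_csv(full_path, header=None, sep='\t',
                       names=['subject', 'relation', 'object', 'question'])
    return data.to_dict(orient='records')  # one dict per question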