Example #1
0
    def __init__(self,
                 label_type="entailment",
                 base_path: Union[str, Path] = None,
                 max_tokens_per_doc=-1,
                 max_chars_per_doc=-1,
                 use_tokenizer=True,
                 in_memory: bool = True,
                 sample_missing_splits: bool = True):
        """
        Creates a Winograd Schema Challenge Corpus formatted as a Natural Language Inference task (WNLI).

        The task is to predict whether the sentence with the pronoun substituted is
        entailed by the original sentence. In addition to the corpus splits, an
        ``eval_dataset`` attribute holds the unlabeled GLUE test file so that models
        can be evaluated on the GLUE WNLI task.

        :param label_type: name of the label to predict (default: "entailment")
        :param base_path: cache folder for the data; defaults to the flair cache root
        :param max_tokens_per_doc: truncate each document to this many tokens (-1 = unlimited)
        :param max_chars_per_doc: truncate each document to this many characters (-1 = unlimited)
        :param use_tokenizer: whether to tokenize the sentences
        :param in_memory: keep the dataset in memory instead of loading lazily
        :param sample_missing_splits: sample missing dev/test splits from the train split
        """

        # isinstance is the idiomatic type check (also accepts str subclasses)
        if isinstance(base_path, str):
            base_path = Path(base_path)

        dataset_name = "glue"

        # if no base_path provided take cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "WNLI/train.tsv"

        # if data is not downloaded yet, download it
        if not data_file.is_file():
            # get the zip file
            zipped_data_path = cached_path(
                "https://dl.fbaipublicfiles.com/glue/data/WNLI.zip",
                Path("datasets") / dataset_name)

            unpack_file(zipped_data_path, data_folder, mode="zip", keep=False)

            # the GLUE test split is unlabeled; rename it so it is not picked up
            # as a labeled test split
            os.rename(str(data_folder / "WNLI/test.tsv"),
                      str(data_folder / "WNLI/eval_dataset.tsv"))

        super(GLUE_WNLI,
              self).__init__(data_folder / "WNLI",
                             label_type=label_type,
                             columns=[1, 2, 3],
                             skip_first_line=True,
                             use_tokenizer=use_tokenizer,
                             max_tokens_per_doc=max_tokens_per_doc,
                             max_chars_per_doc=max_chars_per_doc,
                             in_memory=in_memory,
                             sample_missing_splits=sample_missing_splits)

        # unlabeled test data for evaluation against the GLUE benchmark
        self.eval_dataset = DataPairDataset(
            data_folder / "WNLI/eval_dataset.tsv",
            columns=[1, 2, 3],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False)
Example #2
0
    def __init__(self,
                 label_type="paraphrase",
                 base_path: Union[str, Path] = None,
                 max_tokens_per_doc=-1,
                 max_chars_per_doc=-1,
                 use_tokenizer=True,
                 in_memory: bool = True,
                 sample_missing_splits: bool = True):
        """
        Creates a Quora Question Pairs (QQP) Corpus from the GLUE benchmark (https://gluebenchmark.com/tasks).

        The task is to determine whether a pair of questions are semantically
        equivalent. In addition to the corpus splits, an ``eval_dataset`` attribute
        holds the unlabeled GLUE test file so that models can be evaluated on the
        GLUE QQP task.

        :param label_type: name of the label to predict (default: "paraphrase")
        :param base_path: cache folder for the data; defaults to the flair cache root
        :param max_tokens_per_doc: truncate each document to this many tokens (-1 = unlimited)
        :param max_chars_per_doc: truncate each document to this many characters (-1 = unlimited)
        :param use_tokenizer: whether to tokenize the sentences
        :param in_memory: keep the dataset in memory instead of loading lazily
        :param sample_missing_splits: sample missing dev/test splits from the train split
        """

        # isinstance is the idiomatic type check (also accepts str subclasses)
        if isinstance(base_path, str):
            base_path = Path(base_path)

        dataset_name = "glue"

        # if no base_path provided take cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "QQP/train.tsv"

        # if data is not downloaded yet, download it
        if not data_file.is_file():
            # get the zip file
            zipped_data_path = cached_path(
                "https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip",
                Path("datasets") / dataset_name)

            unpack_file(zipped_data_path, data_folder, mode="zip", keep=False)

            # the GLUE test split is unlabeled; rename it so it is not picked up
            # as a labeled test split
            os.rename(str(data_folder / "QQP/test.tsv"),
                      str(data_folder / "QQP/eval_dataset.tsv"))

        super(GLUE_QQP,
              self).__init__(data_folder / "QQP",
                             label_type=label_type,
                             columns=[3, 4, 5],
                             skip_first_line=True,
                             use_tokenizer=use_tokenizer,
                             max_tokens_per_doc=max_tokens_per_doc,
                             max_chars_per_doc=max_chars_per_doc,
                             in_memory=in_memory,
                             sample_missing_splits=sample_missing_splits)

        # NOTE(review): the eval columns [1, 2, 0] differ from the train columns
        # [3, 4, 5] — presumably because the unlabeled test.tsv has a different
        # column layout than train.tsv; verify against the downloaded file
        self.eval_dataset = DataPairDataset(
            data_folder / "QQP/eval_dataset.tsv",
            columns=[1, 2, 0],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False)
Example #3
0
    def __init__(self,
                 label_type="entailment",
                 base_path: Union[str, Path] = None,
                 max_tokens_per_doc=-1,
                 max_chars_per_doc=-1,
                 use_tokenizer=True,
                 in_memory: bool = True,
                 sample_missing_splits: bool = True):
        """
        Creates a DataPairCorpus for the GLUE Recognizing Textual Entailment (RTE) data (https://gluebenchmark.com/tasks).

        In addition to the corpus splits, an ``eval_dataset`` attribute holds the
        unlabeled GLUE test file so that models can be evaluated on the GLUE RTE task.

        :param label_type: name of the label to predict (default: "entailment")
        :param base_path: cache folder for the data; defaults to the flair cache root
        :param max_tokens_per_doc: truncate each document to this many tokens (-1 = unlimited)
        :param max_chars_per_doc: truncate each document to this many characters (-1 = unlimited)
        :param use_tokenizer: whether to tokenize the sentences
        :param in_memory: keep the dataset in memory instead of loading lazily
        :param sample_missing_splits: sample missing dev/test splits from the train split
        """

        # isinstance is the idiomatic type check (also accepts str subclasses)
        if isinstance(base_path, str):
            base_path = Path(base_path)

        dataset_name = "glue"

        # if no base_path provided take cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "RTE/train.tsv"

        # if data is not downloaded yet, download it
        if not data_file.is_file():
            # get the zip file
            zipped_data_path = cached_path(
                'https://dl.fbaipublicfiles.com/glue/data/RTE.zip',
                Path("datasets") / dataset_name)

            unpack_file(zipped_data_path, data_folder, mode="zip", keep=False)

            # the GLUE test split is unlabeled; rename it so it is not picked up
            # as a labeled test split
            os.rename(str(data_folder / "RTE/test.tsv"),
                      str(data_folder / "RTE/eval_dataset.tsv"))

        super(GLUE_RTE,
              self).__init__(data_folder / "RTE",
                             label_type=label_type,
                             columns=[1, 2, 3],
                             skip_first_line=True,
                             use_tokenizer=use_tokenizer,
                             max_tokens_per_doc=max_tokens_per_doc,
                             max_chars_per_doc=max_chars_per_doc,
                             in_memory=in_memory,
                             sample_missing_splits=sample_missing_splits)

        # unlabeled test data for evaluation against the GLUE benchmark
        self.eval_dataset = DataPairDataset(
            data_folder / "RTE/eval_dataset.tsv",
            columns=[1, 2, 3],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False)
Example #4
0
    def __init__(
            self,
            base_path: Union[str, Path] = None,
            max_tokens_per_doc=-1,
            max_chars_per_doc=-1,
            use_tokenizer=True,
            in_memory: bool = True,
            sample_missing_splits: bool = True
    ):
        """
        Creates a DataPairCorpus for the SuperGLUE Recognizing Textual Entailment (RTE) data (https://super.gluebenchmark.com/tasks).

        In addition to the corpus splits, an ``eval_dataset`` attribute holds the
        unlabeled SuperGLUE test file so that models can be evaluated on the
        SuperGLUE RTE task.

        :param base_path: cache folder for the data; defaults to the flair cache root
        :param max_tokens_per_doc: truncate each document to this many tokens (-1 = unlimited)
        :param max_chars_per_doc: truncate each document to this many characters (-1 = unlimited)
        :param use_tokenizer: whether to tokenize the sentences
        :param in_memory: keep the dataset in memory instead of loading lazily
        :param sample_missing_splits: sample missing dev/test splits from the train split
        """

        # isinstance is the idiomatic type check (also accepts str subclasses)
        if isinstance(base_path, str):
            base_path = Path(base_path)

        dataset_name = "superglue"

        # if no base_path provided take cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "RTE/train.tsv"

        # if data not downloaded yet, download it
        if not data_file.is_file():
            # get the zip file
            zipped_data_path = cached_path(
                'https://dl.fbaipublicfiles.com/glue/superglue/data/v2/RTE.zip',
                Path("datasets") / dataset_name
            )

            unpack_file(
                zipped_data_path,
                data_folder,
                mode="zip",
                keep=False
            )

            # the downloaded files have json format, we transform them to tsv
            rte_jsonl_to_tsv(data_folder / "RTE/train.jsonl", remove=True)
            rte_jsonl_to_tsv(data_folder / "RTE/test.jsonl", remove=True, label=False)
            rte_jsonl_to_tsv(data_folder / "RTE/val.jsonl", remove=True)

            # val -> dev (the split name the corpus loader expects); the test
            # split is unlabeled, so rename it to eval_dataset
            os.rename(str(data_folder / "RTE/val.tsv"), str(data_folder / "RTE/dev.tsv"))
            os.rename(str(data_folder / "RTE/test.tsv"), str(data_folder / "RTE/eval_dataset.tsv"))

        super(SUPERGLUE_RTE, self).__init__(
            data_folder / "RTE",
            columns=[0, 1, 2],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            label_type='textual_entailment',
            sample_missing_splits=sample_missing_splits
        )

        # unlabeled test data for evaluation against the SuperGLUE benchmark;
        # the converted tsv files have no header row, hence skip_first_line=False
        self.eval_dataset = DataPairDataset(
            data_folder / "RTE/eval_dataset.tsv",
            columns=[0, 1, 2],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=False,
            label=False
        )
Example #5
0
    def __init__(self,
                 label_type="entailment",
                 evaluate_on_matched: bool = True,
                 base_path: Union[str, Path] = None,
                 max_tokens_per_doc=-1,
                 max_chars_per_doc=-1,
                 use_tokenizer=True,
                 in_memory: bool = True,
                 sample_missing_splits: bool = True):
        """
        Creates a DataPairCorpus for the Multi-Genre Natural Language Inference Corpus (MNLI)
        from the GLUE benchmark (https://gluebenchmark.com/tasks).

        Entailment annotations are: entailment, contradiction, neutral. This corpus
        includes two dev sets (matched/mismatched) and two unlabeled test sets:
        eval_dataset_matched, eval_dataset_mismatched.

        :param label_type: name of the label to predict (default: "entailment")
        :param evaluate_on_matched: if True use the "matched" dev/eval files, else "mismatched"
        :param base_path: cache folder for the data; defaults to the flair cache root
        :param max_tokens_per_doc: truncate each document to this many tokens (-1 = unlimited)
        :param max_chars_per_doc: truncate each document to this many characters (-1 = unlimited)
        :param use_tokenizer: whether to tokenize the sentences
        :param in_memory: keep the dataset in memory instead of loading lazily
        :param sample_missing_splits: sample missing dev/test splits from the train split
        """

        # isinstance is the idiomatic type check (also accepts str subclasses)
        if isinstance(base_path, str):
            base_path = Path(base_path)

        dataset_name = "glue"

        # if no base_path provided take cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "MNLI/train.tsv"

        # if data is not downloaded yet, download it
        if not data_file.is_file():
            # get the zip file
            zipped_data_path = cached_path(
                "https://dl.fbaipublicfiles.com/glue/data/MNLI.zip",
                Path("datasets") / dataset_name)

            unpack_file(zipped_data_path, data_folder, mode="zip", keep=False)

            # reorder dev datasets to have same columns as in train set: 8, 9, and 11
            # dev sets include 5 different annotations but we will only keep the gold label
            for dev_filename in ["dev_matched.tsv", "dev_mismatched.tsv"]:

                # move the original aside, then rewrite it with reordered columns
                temp_file = "temp_" + dev_filename
                os.rename(str(data_folder / "MNLI" / dev_filename),
                          str(data_folder / "MNLI" / temp_file))

                # "w" instead of "a": the original was renamed away above, so we
                # always write a fresh file (append could silently duplicate rows)
                with open(data_folder / "MNLI" / dev_filename,
                          "w") as out_file, open(data_folder / "MNLI" /
                                                 temp_file) as in_file:
                    for line in in_file:
                        fields = line.split('\t')
                        reordered_columns = '\t'.join(
                            fields[column_id] for column_id in range(11))
                        # fields[15] is the gold label; being the last field of
                        # the split it still carries the trailing newline
                        reordered_columns += '\t' + fields[15]
                        out_file.write(reordered_columns)
                os.remove(str(data_folder / "MNLI" / temp_file))

            # the GLUE test splits are unlabeled; rename them so they are not
            # picked up as labeled test splits
            os.rename(str(data_folder / "MNLI/test_matched.tsv"),
                      str(data_folder / "MNLI/eval_dataset_matched.tsv"))
            os.rename(str(data_folder / "MNLI/test_mismatched.tsv"),
                      str(data_folder / "MNLI/eval_dataset_mismatched.tsv"))

        matched_suffix = "matched" if evaluate_on_matched else "mismatched"

        dev_dataset = "dev_" + matched_suffix + ".tsv"
        eval_dataset = "eval_dataset_" + matched_suffix + ".tsv"

        self.evaluate_on_matched = evaluate_on_matched

        super(GLUE_MNLI,
              self).__init__(data_folder / "MNLI",
                             train_file=data_file,
                             dev_file=dev_dataset,
                             label_type=label_type,
                             columns=[8, 9, 11],
                             skip_first_line=True,
                             use_tokenizer=use_tokenizer,
                             max_tokens_per_doc=max_tokens_per_doc,
                             max_chars_per_doc=max_chars_per_doc,
                             in_memory=in_memory,
                             sample_missing_splits=sample_missing_splits)

        # unlabeled test data (matched or mismatched, per evaluate_on_matched)
        self.eval_dataset = DataPairDataset(
            data_folder / "MNLI" / eval_dataset,
            columns=[8, 9, 11],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False)
Example #6
0
    def __init__(
        self,
        label_type="entailment",
        base_path: Union[str, Path] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True,
    ):
        """
        Creates a DataPairCorpus for the Question-answering Natural Language Inference
        dataset (QNLI) from the GLUE benchmark (https://gluebenchmark.com/tasks).

        In addition to the corpus splits, an ``eval_dataset`` attribute holds the
        unlabeled GLUE test file so that models can be evaluated on the GLUE QNLI task.
        """

        # resolve the cache directory, accepting either a str or a Path
        base_path = Path(base_path) if base_path else flair.cache_root / "datasets"

        data_folder = base_path / "glue"
        train_file = data_folder / "QNLI/train.tsv"

        # download and unpack the data on first use
        if not train_file.is_file():
            archive = cached_path(
                "https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip",
                Path("datasets") / "glue",
            )
            unpack_file(archive, data_folder, mode="zip", keep=False)

            # the test split carries no labels, so store it under a name that
            # will not be mistaken for a labeled test set
            os.rename(
                str(data_folder / "QNLI/test.tsv"),
                str(data_folder / "QNLI/eval_dataset.tsv"),
            )

        super(GLUE_QNLI, self).__init__(
            data_folder / "QNLI",
            label_type=label_type,
            columns=[1, 2, 3],
            skip_first_line=True,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            sample_missing_splits=sample_missing_splits,
        )

        # NOTE(review): unlike the sibling GLUE corpora, label_type is also passed
        # to DataPairDataset here — presumably its signature accepts it; verify
        self.eval_dataset = DataPairDataset(
            data_folder / "QNLI/eval_dataset.tsv",
            label_type=label_type,
            columns=[1, 2, 3],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False,
        )