Example #1
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     # TODO(wiki_qa): Downloads the data and defines the splits
     # dl_manager is a nlp.download.DownloadManager that can be used to
     # download and extract URLs
     dl_dir = dl_manager.download_and_extract(_DATA_URL)
     dl_dir = os.path.join(dl_dir, "WikiQACorpus")
     # dl_dir = os.path.join(dl_dir, '')
     return [
         nlp.SplitGenerator(name=nlp.Split.TEST,
                            gen_kwargs={
                                "filepath":
                                os.path.join(dl_dir, "WikiQA-test.tsv")
                            }),
         nlp.SplitGenerator(name=nlp.Split.VALIDATION,
                            gen_kwargs={
                                "filepath":
                                os.path.join(dl_dir, "WikiQA-dev.tsv")
                            }),
         nlp.SplitGenerator(
             name=nlp.Split.TRAIN,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 "filepath": os.path.join(dl_dir, "WikiQA-train.tsv")
             },
         ),
     ]
Example #2
    def _split_generators(self, dl_manager):
        """ The `datafiles` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]].

            If str or List[str], then the dataset returns only the 'train' split.
            If dict, then keys should be from the `nlp.Split` enum.
        """
        if isinstance(self.config.data_files, (str, list, tuple)):
            # Handle case with only one split
            files = self.config.data_files
            if isinstance(files, str):
                files = [files]
            return [
                nlp.SplitGenerator(name=nlp.Split.TRAIN,
                                   gen_kwargs={"files": files})
            ]
        else:
            # Handle case with several splits and a dict mapping
            splits = []
            for split_name in [
                    nlp.Split.TRAIN, nlp.Split.VALIDATION, nlp.Split.TEST
            ]:
                if split_name in self.config.data_files:
                    files = self.config.data_files[split_name]
                    if isinstance(files, str):
                        files = [files]
                    splits.append(
                        nlp.SplitGenerator(name=split_name,
                                           gen_kwargs={"files": files}))
            return splits
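
The docstring in Example #2 above describes the shapes `data_files` can take. The snippet below is a minimal usage sketch from the caller's side; the "csv" loader name and all file names are placeholders, not part of the example above.

import nlp

# str or List[str]: everything goes into the 'train' split.
ds = nlp.load_dataset("csv", data_files="my_data.csv")
ds = nlp.load_dataset("csv", data_files=["part1.csv", "part2.csv"])

# Dict keyed by the `nlp.Split` enum: one split per key.
ds = nlp.load_dataset(
    "csv",
    data_files={
        nlp.Split.TRAIN: ["train_part1.csv", "train_part2.csv"],
        nlp.Split.VALIDATION: "dev.csv",
        nlp.Split.TEST: "test.csv",
    },
)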
Example #3
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     # TODO(race): Downloads the data and defines the splits
     # dl_manager is a nlp.download.DownloadManager that can be used to
     # download and extract URLs
     dl_dir = dl_manager.download_and_extract(_URL)
     return [
         nlp.SplitGenerator(
             name=nlp.Split.TEST,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 "files": os.listdir(os.path.join(dl_dir,
                                                  "RACE/test/high")),
                 "filespath": os.path.join(dl_dir, "RACE/test/high"),
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.TRAIN,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 "files": os.listdir(os.path.join(dl_dir,
                                                  "RACE/train/high")),
                 "filespath": os.path.join(dl_dir, "RACE/train/high"),
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.VALIDATION,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 "files": os.listdir(os.path.join(dl_dir, "RACE/dev/high")),
                 "filespath": os.path.join(dl_dir, "RACE/dev/high"),
             },
         ),
     ]
Example #4
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     dl_path = dl_manager.download_and_extract(_DOWNLOAD_URL)
     return [
         nlp.SplitGenerator(
             name=nlp.Split.TRAIN,
             gen_kwargs={
                 "filename": os.path.join(dl_path, "train.csv"),
                 "toxicity_label": "target"
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.VALIDATION,
             gen_kwargs={
                 "filename": os.path.join(dl_path,
                                          "test_public_expanded.csv"),
                 "toxicity_label": "toxicity",
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.TEST,
             gen_kwargs={
                 "filename": os.path.join(dl_path,
                                          "test_private_expanded.csv"),
                 "toxicity_label": "toxicity",
             },
         ),
     ]
Example #5
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     dl_path = dl_manager.download_and_extract(_URL)
     input_path = os.path.join(dl_path, "AESLC-master",
                               "enron_subject_line")
     return [
         nlp.SplitGenerator(
             name=nlp.Split.TRAIN,
             gen_kwargs={
                 "pattern": os.path.join(input_path, "train", "*.subject")
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.VALIDATION,
             gen_kwargs={
                 "pattern": os.path.join(input_path, "dev", "*.subject")
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.TEST,
             gen_kwargs={
                 "pattern": os.path.join(input_path, "test", "*.subject")
             },
         ),
     ]
Example #6
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        en_data_path = dl_manager.download_and_extract(_EN_URL)
        de_data_path = dl_manager.download_and_extract(_DE_URL)
        fr_data_path = dl_manager.download_and_extract(_FR_URL)

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={
                    "en_path": os.path.join(en_data_path, "train.jsonl"),
                    "de_path": os.path.join(de_data_path, "train.jsonl"),
                    "fr_path": os.path.join(fr_data_path, "train.jsonl"),
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                gen_kwargs={
                    "en_path": os.path.join(en_data_path, "validation.jsonl"),
                    "de_path": os.path.join(de_data_path, "validation.jsonl"),
                    "fr_path": os.path.join(fr_data_path, "validation.jsonl"),
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                gen_kwargs={
                    "en_path": os.path.join(en_data_path, "test.jsonl"),
                    "de_path": os.path.join(de_data_path, "test.jsonl"),
                    "fr_path": os.path.join(fr_data_path, "test.jsonl"),
                },
            ),
        ]
Example #7
    def _split_generators(self, dl_manager):
        path = dl_manager.download_and_extract(_URL)
        test_file = os.path.join(path, "CRD3-master", "data", "aligned data", "test_files")
        train_file = os.path.join(path, "CRD3-master", "data", "aligned data", "train_files")
        dev_file = os.path.join(path, "CRD3-master", "data", "aligned data", "val_files")
        with open(test_file) as f:
            test_splits = [file.replace("\n", "") for file in f.readlines()]

        with open(train_file) as f:
            train_splits = [file.replace("\n", "") for file in f.readlines()]
        with open(dev_file) as f:
            dev_splits = [file.replace("\n", "") for file in f.readlines()]
        c2 = "CRD3-master/data/aligned data/c=2"
        c3 = "CRD3-master/data/aligned data/c=3"
        c4 = "CRD3-master/data/aligned data/c=4"
        files = [os.path.join(path, c2, file) for file in sorted(os.listdir(os.path.join(path, c2)))]
        files.extend([os.path.join(path, c3, file) for file in sorted(os.listdir(os.path.join(path, c3)))])
        files.extend([os.path.join(path, c4, file) for file in sorted(os.listdir(os.path.join(path, c4)))])

        test_files, train_files, dev_files = get_train_test_dev_files(files, test_splits, train_splits, dev_splits)

        return [
            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"files_path": train_files},),
            nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"files_path": test_files},),
            nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"files_path": dev_files},),
        ]
Example #8
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     # TODO(wiki_split): Downloads the data and defines the splits
     # dl_manager is a nlp.download.DownloadManager that can be used to
     # download and extract URLs
     urls_to_download = {
         'train': os.path.join(_URL, _TRAIN_FILE),
         'test': os.path.join(_URL, _TEST_FILE),
         'dev': os.path.join(_URL, _DEV_FILE)
     }
     dl_dir = dl_manager.download_and_extract(urls_to_download)
     return [
         nlp.SplitGenerator(
             name=nlp.Split.TRAIN,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 'filepath': os.path.join(dl_dir['train'], 'train.tsv')
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.TEST,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={'filepath': dl_dir['test']},
         ),
         nlp.SplitGenerator(
             name=nlp.Split.VALIDATION,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={'filepath': dl_dir['dev']},
         ),
     ]
Example #9
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # TODO(reclor): Downloads the data and defines the splits
        # dl_manager is a nlp.download.DownloadManager that can be used to
        # download and extract URLs
        data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))

        if not os.path.exists(data_dir):
            raise FileNotFoundError(
                "{} does not exist. Make sure you insert a manual dir via `nlp.load('wikihow', data_dir=...)` that includes files unzipped from the reclor zip. Manual download instructions: {}".format(
                    data_dir, self.MANUAL_DOWNLOAD_INSTRUCTIONS
                )
            )
        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": os.path.join(data_dir, "train.json")},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": os.path.join(data_dir, "test.json")},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": os.path.join(data_dir, "val.json")},
            ),
        ]
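
Example #9 relies on `dl_manager.manual_dir`: the data has to be downloaded by hand first, and its local directory is passed in at load time via `data_dir`, as the error message above indicates. A minimal usage sketch (the local path is a placeholder):

import nlp

# The reclor files must be downloaded manually first (see
# MANUAL_DOWNLOAD_INSTRUCTIONS in the script); the unzipped directory is then
# passed as data_dir, which ends up as dl_manager.manual_dir.
dataset = nlp.load_dataset("reclor", data_dir="/path/to/unzipped/reclor")
print(dataset["train"][0])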
Example #10
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # TODO(squad_es): Downloads the data and defines the splits
        # dl_manager is a nlp.download.DownloadManager that can be used to

        # download and extract URLs
        v1_urls = {
            "train": os.path.join(_URL, "SQuAD-es-v1.1/train-v1.1-es.json"),
            "dev": os.path.join(_URL, "SQuAD-es-v1.1/dev-v1.1-es.json"),
        }
        v2_urls = {
            "train": os.path.join(_URL, "SQuAD-es-v2.0/train-v2.0-es.json"),
            "dev": os.path.join(_URL, "SQuAD-es-v2.0/dev-v2.0-es.json"),
        }
        if self.config.name == "v1.1.0":
            dl_dir = dl_manager.download_and_extract(v1_urls)
        elif self.config.name == "v2.0.0":
            dl_dir = dl_manager.download_and_extract(v2_urls)
        else:
            raise Exception("version does not match any existing one")
        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": dl_dir["train"]},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": dl_dir["dev"]},
            ),
        ]
Example #11
    def _split_generators(self, dl_manager):
        downloads = {}
        for key in _URLS.keys():
            downloads[key] = dl_manager.download_and_extract(_URLS[key])
            #  Fix for dummy data
            if os.path.isdir(downloads[key]):
                downloads[key] = os.path.join(downloads[key], key + ".json")

        return [
            nlp.SplitGenerator(name=nlp.Split.VALIDATION,
                               gen_kwargs={
                                   "filepath": downloads["dev"],
                                   "rel_info": downloads["rel_info"]
                               }),
            nlp.SplitGenerator(name=nlp.Split.TEST,
                               gen_kwargs={
                                   "filepath": downloads["test"],
                                   "rel_info": downloads["rel_info"]
                               }),
            nlp.SplitGenerator(
                name="train_annotated",
                gen_kwargs={
                    "filepath": downloads["train_annotated"],
                    "rel_info": downloads["rel_info"]
                },
            ),
            nlp.SplitGenerator(
                name="train_distant",
                gen_kwargs={
                    "filepath": downloads["train_distant"],
                    "rel_info": downloads["rel_info"]
                },
            ),
        ]
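
Example #11 also shows that split names are not limited to the `nlp.Split` enum: custom names such as "train_annotated" and "train_distant" are allowed and appear as regular keys of the loaded dataset. A hedged usage sketch; "dataset_script_name" is a placeholder, not the actual script name:

import nlp

dataset = nlp.load_dataset("dataset_script_name")
annotated = dataset["train_annotated"]
distant = dataset["train_distant"]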
Example #12
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""

        dl_dir = dl_manager.download_and_extract(_DOWNLOAD_URL)
        data_dir = os.path.join(dl_dir, "multirc")
        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "data_dir": data_dir,
                    "filepath": os.path.join(data_dir, "train.jsonl")
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "data_dir": data_dir,
                    "filepath": os.path.join(data_dir, "val.jsonl")
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "data_dir": data_dir,
                    "filepath": os.path.join(data_dir, "test.jsonl")
                },
            ),
        ]
Example #13
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))

        splits = [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={
                    "articles_file":
                    os.path.join(
                        data_dir, "articles-training-" + self.config.name +
                        "-20181122.xml"),
                    "labels_file":
                    os.path.join(
                        data_dir, "ground-truth-training-" + self.config.name +
                        "-20181122.xml"),
                },
            )
        ]
        if self.config.name == "bypublisher":
            splits.append(
                nlp.SplitGenerator(
                    name=nlp.Split.VALIDATION,
                    gen_kwargs={
                        "articles_file":
                        os.path.join(
                            data_dir, "articles-validation-" +
                            self.config.name + "-20181122.xml"),
                        "labels_file":
                        os.path.join(
                            data_dir, "ground-truth-validation-" +
                            self.config.name + "-20181122.xml"),
                    },
                ))
        return splits
Example #14
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        schemas_file = dl_manager.download_and_extract(_DATA_URL)

        if os.path.isdir(schemas_file):
            # During testing the download manager mock gives us a directory
            schemas_file = os.path.join(schemas_file, "schema.txt")

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={
                    "filepath": schemas_file,
                    "split": "train"
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                gen_kwargs={
                    "filepath": schemas_file,
                    "split": "test"
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                gen_kwargs={
                    "filepath": schemas_file,
                    "split": "dev"
                },
            )
        ]
Example #15
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # dl_manager is a nlp.download.DownloadManager that can be used to
        # download and extract URLs
        dl_dir = dl_manager.download_and_extract(_URL)

        # This folder contains the original/2013 dataset
        data_dir = os.path.join(dl_dir, "Guardian", "Guardian_original")

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"data_dir": data_dir, "samples_folders": self.config.train_folder, "split": "train"},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"data_dir": data_dir, "samples_folders": self.config.test_folder, "split": "test"},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"data_dir": data_dir, "samples_folders": self.config.valid_folder, "split": "valid"},
            ),
        ]
Example #16
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        dl_dir = dl_manager.download_and_extract(_DATA_URL)
        dl_dir = os.path.join(dl_dir, "relation_splits")

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                gen_kwargs={
                    "filepaths": [
                        os.path.join(dl_dir, "test." + str(i))
                        for i in range(10)
                    ],
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                gen_kwargs={
                    "filepaths":
                    [os.path.join(dl_dir, "dev." + str(i)) for i in range(10)],
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={
                    "filepaths": [
                        os.path.join(dl_dir, "train." + str(i))
                        for i in range(10)
                    ],
                },
            ),
        ]
Example #17
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
     if not os.path.exists(data_dir):
         raise FileNotFoundError(
             "{} does not exist. Make sure you insert a manual dir via `nlp.load('newsroom', data_dir=...)` that includes files unzipped from the reclor zip. Manual download instructions: {}"
             .format(data_dir, self.MANUAL_DOWNLOAD_INSTRUCTIONS))
     return [
         nlp.SplitGenerator(
             name=nlp.Split.TRAIN,
             gen_kwargs={
                 "input_file": os.path.join(data_dir, "train.jsonl")
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.VALIDATION,
             gen_kwargs={"input_file": os.path.join(data_dir, "dev.jsonl")},
         ),
         nlp.SplitGenerator(
             name=nlp.Split.TEST,
             gen_kwargs={
                 "input_file": os.path.join(data_dir, "test.jsonl")
             },
         ),
     ]
Example #18
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     # TODO(fquad): Downloads the data and defines the splits
     # dl_manager is a nlp.download.DownloadManager that can be used to
     # download and extract URLs
     download_urls = {
         'train': os.path.join(_URL, _TRAIN_DATA),
         'valid': os.path.join(_URL, _VALID_DATA)
     }
     dl_dir = dl_manager.download_and_extract(download_urls)
     return [
         nlp.SplitGenerator(
             name=nlp.Split.TRAIN,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 'filepath': os.path.join(dl_dir['train'], 'train.json')
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.VALIDATION,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 'filepath': os.path.join(dl_dir['valid'], 'valid.json')
             },
         ),
     ]
Example #19
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # TODO(ubuntu_dialogs_corpus): Downloads the data and defines the splits
        # dl_manager is a nlp.download.DownloadManager that can be used to
        # download and extract URLs
        manual_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))

        if self.config.name == 'train':
            return [
                nlp.SplitGenerator(
                    name=nlp.Split.TRAIN,
                    # These kwargs will be passed to _generate_examples
                    gen_kwargs={
                        'filepath': os.path.join(manual_dir, 'train.csv')
                    },
                ),
            ]
        else:
            return [
                nlp.SplitGenerator(
                    name=nlp.Split.TEST,
                    # These kwargs will be passed to _generate_examples
                    gen_kwargs={
                        'filepath': os.path.join(manual_dir, 'test.csv')
                    },
                ),
                nlp.SplitGenerator(
                    name=nlp.Split.VALIDATION,
                    # These kwargs will be passed to _generate_examples
                    gen_kwargs={
                        'filepath': os.path.join(manual_dir, 'valid.csv')
                    },
                ),
            ]
Example #20
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     # TODO(quarel): Downloads the data and defines the splits
     # dl_manager is a nlp.download.DownloadManager that can be used to
     # download and extract URLs
     dl_dir = dl_manager.download_and_extract(_URL)
     data_dir = os.path.join(dl_dir, 'quarel-dataset-v1')
     return [
         nlp.SplitGenerator(
             name=nlp.Split.TRAIN,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 'filepath': os.path.join(data_dir, 'quarel-v1-train.jsonl')
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.TEST,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 'filepath': os.path.join(data_dir, 'quarel-v1-test.jsonl')
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.VALIDATION,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 'filepath': os.path.join(data_dir, 'quarel-v1-dev.jsonl')
             },
         ),
     ]
Example #21
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""

        del dl_manager  # Unused

        lang = self._builder_config.language

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={
                    "filepaths":
                    "%s/train/%s_examples-*" % (_DATA_DIRECTORY, lang)
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                gen_kwargs={
                    "filepaths":
                    "%s/dev/%s_examples-*" % (_DATA_DIRECTORY, lang)
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                gen_kwargs={
                    "filepaths":
                    "%s/test/%s_examples-*" % (_DATA_DIRECTORY, lang)
                },
            ),
        ]
Example #22
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     dl_path = dl_manager.download_and_extract(_URL)
     pattern = os.path.join(dl_path, "org_data", "%s.%s.txt")
     return [
         nlp.SplitGenerator(
             name=nlp.Split.TRAIN,
             gen_kwargs={
                 "src_path": pattern % ("train", "src"),
                 "tgt_path": pattern % ("train", "tgt"),
                 "replace_unk": True,
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.VALIDATION,
             gen_kwargs={
                 "src_path": pattern % ("dev", "src"),
                 "tgt_path": pattern % ("dev", "tgt"),
                 "replace_unk": True,
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.TEST,
             gen_kwargs={
                 "src_path": pattern % ("test", "src"),
                 "tgt_path": pattern % ("test", "tgt"),
                 "replace_unk": False,
             },
         ),
     ]
Example #23
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # TODO(scifact): Downloads the data and defines the splits
        # dl_manager is a nlp.download.DownloadManager that can be used to
        # download and extract URLs
        dl_dir = dl_manager.download_and_extract(_URL)

        if self.config.name == "corpus":
            return [
                nlp.SplitGenerator(
                    name=nlp.Split.TRAIN,
                    # These kwargs will be passed to _generate_examples
                    gen_kwargs={"filepath": os.path.join(dl_dir, "data", "corpus.jsonl"), "split": "train"},
                ),
            ]
        else:
            return [
                nlp.SplitGenerator(
                    name=nlp.Split.TRAIN,
                    # These kwargs will be passed to _generate_examples
                    gen_kwargs={"filepath": os.path.join(dl_dir, "data", "claims_train.jsonl"), "split": "train"},
                ),
                nlp.SplitGenerator(
                    name=nlp.Split.TEST,
                    # These kwargs will be passed to _generate_examples
                    gen_kwargs={"filepath": os.path.join(dl_dir, "data", "claims_test.jsonl"), "split": "test"},
                ),
                nlp.SplitGenerator(
                    name=nlp.Split.VALIDATION,
                    # These kwargs will be passed to _generate_examples
                    gen_kwargs={"filepath": os.path.join(dl_dir, "data", "claims_dev.jsonl"), "split": "dev"},
                ),
            ]
Example #24
 def _split_generators(self, dl_manager):
   """Returns SplitGenerators."""
   # TODO(social_i_qa): Downloads the data and defines the splits
   # dl_manager is a nlp.download.DownloadManager that can be used to
   # download and extract URLs
   dl_dir = dl_manager.download_and_extract(_URL)
   dl_dir = os.path.join(dl_dir, 'socialiqa-train-dev')
   return [
       nlp.SplitGenerator(
           name=nlp.Split.TRAIN,
           # These kwargs will be passed to _generate_examples
           gen_kwargs={
               'filepath': os.path.join(dl_dir, 'train.jsonl'),
               'labelpath': os.path.join(dl_dir, 'train-labels.lst')
           },
       ),
       nlp.SplitGenerator(
           name=nlp.Split.VALIDATION,
           # These kwargs will be passed to _generate_examples
           gen_kwargs={
               'filepath': os.path.join(dl_dir, 'dev.jsonl'),
               'labelpath': os.path.join(dl_dir, 'dev-labels.lst')
           },
       ),
   ]
Example #25
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        if self.config.name == "blog-authorship-corpus":
            data = dl_manager.download_and_extract(self.config.data_url)
            data_dir = os.path.join(data, "blogs")
            files = glob.glob(os.path.join(data_dir, "*.xml"))
            train_files = []
            validation_files = []

            for i, file_path in enumerate(files):
                # 95% / 5% (train / val) split
                if i % 20 == 0:
                    validation_files.append(file_path)
                else:
                    train_files.append(file_path)

            return [
                nlp.SplitGenerator(
                    name=nlp.Split.TRAIN,
                    gen_kwargs={
                        "files": train_files,
                        "split": "train"
                    },
                ),
                nlp.SplitGenerator(
                    name=nlp.Split.VALIDATION,
                    gen_kwargs={
                        "files": validation_files,
                        "split": "validation"
                    },
                ),
            ]
        else:
            raise ValueError("{} does not exist".format(self.config.name))
Example #26
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # TODO(xcopa): Downloads the data and defines the splits
        # dl_manager is a nlp.download.DownloadManager that can be used to
        # download and extract URLs
        dl_dir = dl_manager.download_and_extract(_URL)

        data_dir = os.path.join(dl_dir, 'xcopa-master', 'data',
                                self.config.name)
        return [
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    'filepath':
                    os.path.join(data_dir,
                                 'test.' + self.config.name + '.jsonl')
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    'filepath':
                    os.path.join(data_dir,
                                 'val.' + self.config.name + '.jsonl')
                },
            ),
        ]
Example #27
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""

        # NB: The CQA Dataset should be read only once, and only by callers who
        # want to _create_ the Cos-E dataset from scratch.
        cqa_indexed = _download_and_index_cqa(dl_manager)

        files = dl_manager.download_and_extract({
            "dev": [
                os.path.join(_COS_E_URL,
                             "v1.11/dev/cose_dev_v1.11_processed.jsonl")
            ],
            "train": [
                os.path.join(_COS_E_URL,
                             "v1.11/train/cose_train_v1.11_processed.jsonl")
            ],
        })

        # We use the CoS-E/CQA dev set as our validation set.
        return [
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                gen_kwargs={
                    "files": files["dev"],
                    "cqa_indexed": cqa_indexed
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={
                    "files": files["train"],
                    "cqa_indexed": cqa_indexed
                },
            ),
        ]
Example #28
 def _split_generators(self, dl_manager):
     """ The `datafiles` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]].
         If str or List[str], then the dataset returns only the 'train' split.
         If dict, then keys should be from the `nlp.Split` enum.
     """
     data_dir = self.config.data_dir
     return [
         nlp.SplitGenerator(
             name=nlp.Split.TRAIN,
             gen_kwargs={
                 "filepath": os.path.join(data_dir, "train.data"),
                 "split": "train",
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.TEST,
             gen_kwargs={
                 "filepath": os.path.join(data_dir, "test.data"),
                 "split": "test"
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split.VALIDATION,
             gen_kwargs={
                 "filepath": os.path.join(data_dir, "dev.data"),
                 "split": "dev",
             },
         ),
     ]
Example #29
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))

        train_dir = os.path.join(data_dir, "train",
                                 "dataset-" + self.config.name)
        val_dir = os.path.join(data_dir, "validation",
                               "dataset-" + self.config.name)

        if not os.path.exists(train_dir):
            raise FileNotFoundError(
                "{} does not exist. Make sure you insert a manual dir via `nlp.load_dataset('style_change_detection', data_dir=...)` that includes {}. Manual download instructions: {}"
                .format(train_dir, train_dir,
                        self.manual_download_instructions))

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={
                    "articles":
                    [f for f in os.listdir(train_dir) if f.endswith(".txt")],
                    "base_dir":
                    train_dir,
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                gen_kwargs={
                    "articles":
                    [f for f in os.listdir(val_dir) if f.endswith(".txt")],
                    "base_dir":
                    val_dir
                },
            ),
        ]
Example #30
 def _split_generators(self, dl_manager):
     train_path = dl_manager.download_and_extract(_TRAIN_DOWNLOAD_URL)
     test_path = dl_manager.download_and_extract(_TEST_DOWNLOAD_URL)
     return [
         nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": train_path }),
         nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"filepath": test_path}),
     ]
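
All of the methods above follow the same contract: each `nlp.SplitGenerator` names a split, and its `gen_kwargs` are forwarded verbatim to the builder's `_generate_examples`. The skeleton below (not one of the datasets above; the URL, feature names and TSV layout are assumptions) is a minimal sketch of where `_split_generators` sits inside a complete `nlp.GeneratorBasedBuilder`:

import csv
import os

import nlp

_DATA_URL = "https://example.com/my_dataset.zip"  # placeholder URL


class MyDataset(nlp.GeneratorBasedBuilder):
    """Toy builder; feature names and the TSV layout are assumptions."""

    def _info(self):
        return nlp.DatasetInfo(
            description="Toy dataset used to illustrate the builder contract.",
            features=nlp.Features({
                "question": nlp.Value("string"),
                "answer": nlp.Value("string"),
            }),
        )

    def _split_generators(self, dl_manager):
        dl_dir = dl_manager.download_and_extract(_DATA_URL)
        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                # Forwarded verbatim to _generate_examples below.
                gen_kwargs={"filepath": os.path.join(dl_dir, "train.tsv")},
            ),
        ]

    def _generate_examples(self, filepath):
        # Yields (key, example) pairs; keys must be unique within the split.
        with open(filepath, encoding="utf-8") as f:
            reader = csv.DictReader(f, delimiter="\t")
            for idx, row in enumerate(reader):
                yield idx, {"question": row["question"], "answer": row["answer"]}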