Exemple #1
0
    def _split_generators(self, dl_manager: datasets.DownloadManager):
        """Build the TRAIN and VALIDATION split generators.

        ``_TRAIN_DOWNLOAD_URL`` and ``_DEV_DOWNLOAD_URL`` are downloaded and
        extracted (train first, then dev), and each resulting path is handed
        to its split under the ``filepath`` keyword.
        """
        # One download call per URL; the tuple order fixes both the download
        # order and the order of the returned generators.
        split_sources = [
            (datasets.Split.TRAIN,
             dl_manager.download_and_extract(_TRAIN_DOWNLOAD_URL)),
            (datasets.Split.VALIDATION,
             dl_manager.download_and_extract(_DEV_DOWNLOAD_URL)),
        ]
        return [
            datasets.SplitGenerator(name=split, gen_kwargs={"filepath": path})
            for split, path in split_sources
        ]
Exemple #2
0
    def _split_generators(
            self, dl_manager: datasets.DownloadManager
    ) -> List[datasets.SplitGenerator]:
        """Create one split generator per entry of ``self.SPLITS``.

        For every split, the ``(key, url)`` pairs produced by
        ``self.generate_urls(split)`` are collected; a URL that does not
        start with ``"http"`` is prefixed with ``self.info["raw_url"]`` and
        a slash. Each split's URL mapping is then downloaded and extracted
        in a single call and passed on as ``file_paths``, together with the
        split key as ``split_name``.
        """
        split_table = self.SPLITS
        base_url = self.info["raw_url"]

        # Phase 1: resolve the full URL of every file, grouped by split.
        pending = {}
        for split_key in split_table:
            per_split = pending.setdefault(split_key, {})
            for file_key, location in self.generate_urls(split_key):
                if location.startswith("http"):
                    per_split[file_key] = location
                else:
                    per_split[file_key] = base_url + "/" + location

        # Phase 2: fetch each split's files once all URLs are known.
        fetched = {
            split_key: dl_manager.download_and_extract(url_map)
            for split_key, url_map in pending.items()
        }

        return [
            datasets.SplitGenerator(
                name=split_table[split_key],
                gen_kwargs={
                    "split_name": split_key,
                    "file_paths": fetched[split_key],
                },
            )
            for split_key in split_table
        ]
Exemple #3
0
 def _split_generators(
         self, dl_manager: datasets.DownloadManager
 ) -> List[datasets.SplitGenerator]:
     """Return a single TRAIN split pointing at the ``PetImages`` folder.

     ``_URL`` is downloaded and extracted, and the ``PetImages`` entry
     beneath the extraction result is passed on as ``images_path`` (a
     ``Path`` object).
     """
     extracted_root = Path(dl_manager.download_and_extract(_URL))
     train_split = datasets.SplitGenerator(
         name=datasets.Split.TRAIN,
         gen_kwargs={"images_path": extracted_root.joinpath("PetImages")},
     )
     return [train_split]
 def _split_generators(
         self, dl_manager: datasets.DownloadManager
 ) -> List[datasets.SplitGenerator]:
     """Return a single TRAIN split fed by the files under ``PetImages``.

     ``_URL`` is downloaded and extracted; the ``PetImages`` path below the
     extraction result is given to ``dl_manager.iter_files`` and whatever
     that returns is passed on as ``files``.
     """
     extracted_root = dl_manager.download_and_extract(_URL)
     pet_images_dir = os.path.join(extracted_root, "PetImages")
     image_files = dl_manager.iter_files([pet_images_dir])
     train_split = datasets.SplitGenerator(
         name=datasets.Split.TRAIN,
         gen_kwargs={"files": image_files},
     )
     return [train_split]
Exemple #5
0
 def _split_generators(self, dl_manager: datasets.DownloadManager):
     """Return the single split generator for the configured uniref dataset.

     The file at ``self.config.url`` is downloaded and extracted; the split
     is named after ``self.config.name`` (converted to ``str``).
     """
     archive_path = dl_manager.download_and_extract(self.config.url)
     # These kwargs will be passed to _generate_examples; the one downloaded
     # path is wrapped in a list.
     generator_kwargs = {"file_paths": [archive_path]}
     only_split = datasets.SplitGenerator(
         name=str(self.config.name),
         gen_kwargs=generator_kwargs,
     )
     return [only_split]
Exemple #6
0
 def _split_generators(
         self, dl_manager: datasets.DownloadManager
 ) -> List[datasets.SplitGenerator]:
     """Return the TRAIN and TEST split generators.

     The CSV locations are formed by appending ``train.csv`` and
     ``test.csv`` directly to ``self.config.data_dir`` (plain string
     concatenation, so ``data_dir`` must already end with its separator).
     Both are fetched in one ``download_and_extract`` call.
     """
     base = self.config.data_dir
     local_files = dl_manager.download_and_extract(
         {"train": base + "train.csv", "test": base + "test.csv"}
     )
     # (split, key into the downloaded mapping), in output order.
     split_keys = (
         (datasets.Split.TRAIN, "train"),
         (datasets.Split.TEST, "test"),
     )
     return [
         datasets.SplitGenerator(
             name=split,
             gen_kwargs={"filepath": local_files[key]},
         )
         for split, key in split_keys
     ]
Exemple #7
0
    def _split_generators(self, dl_manager: datasets.DownloadManager):
        """Returns SplitGenerators.

        Downloads and extracts ``_URL``, unpacks the per-split zip archives
        found in its ``ijcnlp_dailydialog`` folder into that same folder, and
        returns the TRAIN, TEST and VALIDATION generators (in that order).

        Args:
            dl_manager: download manager used to download and extract
                ``_URL``.

        Returns:
            A list of three ``datasets.SplitGenerator`` objects whose
            ``gen_kwargs`` carry ``file_path``, ``act_path``,
            ``emotion_path`` and ``split``.
        """
        dl_dir = dl_manager.download_and_extract(_URL)
        data_dir = os.path.join(dl_dir, "ijcnlp_dailydialog")

        # The splits are nested inside the zip. The ``with`` block closes
        # each archive on exit, so no explicit close() call is needed.
        for name in ("train", "validation", "test"):
            zip_fpath = os.path.join(data_dir, f"{name}.zip")
            with ZipFile(zip_fpath) as zip_file:
                zip_file.extractall(path=data_dir)

        # (split, folder/file stem inside data_dir, value of the "split"
        # kwarg). Note the validation split is labelled "dev".
        split_specs = (
            (datasets.Split.TRAIN, "train", "train"),
            (datasets.Split.TEST, "test", "test"),
            (datasets.Split.VALIDATION, "validation", "dev"),
        )

        return [
            datasets.SplitGenerator(
                name=split,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "file_path": os.path.join(
                        data_dir, stem, f"dialogues_{stem}.txt"),
                    "act_path": os.path.join(
                        data_dir, stem, f"dialogues_act_{stem}.txt"),
                    "emotion_path": os.path.join(
                        data_dir, stem, f"dialogues_emotion_{stem}.txt"),
                    "split": label,
                },
            )
            for split, stem, label in split_specs
        ]
Exemple #8
0
 def _split_generators(self, dl_manager: datasets.DownloadManager):
     """Return the TRAIN, VALIDATION and TEST split generators.

     ``_URL`` is downloaded and extracted. Each split receives the path of
     its own ``<split>.zip`` inside the ``ijcnlp_dailydialog`` folder as
     ``data_zip``, plus three paths (``dialog_path``, ``act_path``,
     ``emotion_path``) written relative to that archive rather than joined
     with the data directory.
     """
     extracted_root = dl_manager.download_and_extract(_URL)
     data_dir = os.path.join(extracted_root, "ijcnlp_dailydialog")

     def make_generator(split):
         # The split object is formatted straight into the names below.
         return datasets.SplitGenerator(
             name=split,
             gen_kwargs={
                 "data_zip": os.path.join(data_dir, f"{split}.zip"),
                 "dialog_path": f"{split}/dialogues_{split}.txt",
                 "act_path": f"{split}/dialogues_act_{split}.txt",
                 "emotion_path": f"{split}/dialogues_emotion_{split}.txt",
             },
         )

     ordered_splits = (
         datasets.Split.TRAIN,
         datasets.Split.VALIDATION,
         datasets.Split.TEST,
     )
     return [make_generator(split) for split in ordered_splits]
Exemple #9
0
 def _split_generators(self, dl_manager: datasets.DownloadManager):
     """Return the split generators for the configured task.

     ``self.config.data_url`` is downloaded and extracted (a falsy result
     is replaced by ``""``), and the task name derived from that URL by
     ``_get_task_name_from_data_url`` is appended to form the task folder.
     The ``"lidirus"`` config yields only a TEST split read from
     ``<task_name>.jsonl``; every other config yields TRAIN, VALIDATION and
     TEST read from ``train.jsonl``, ``val.jsonl`` and ``test.jsonl``.
     """
     extracted = dl_manager.download_and_extract(self.config.data_url) or ""
     task_name = _get_task_name_from_data_url(self.config.data_url)
     task_dir = os.path.join(extracted, task_name)

     # (split, file name inside task_dir), in output order.
     if self.config.name == "lidirus":
         layout = [(datasets.Split.TEST, f"{task_name}.jsonl")]
     else:
         layout = [
             (datasets.Split.TRAIN, "train.jsonl"),
             (datasets.Split.VALIDATION, "val.jsonl"),
             (datasets.Split.TEST, "test.jsonl"),
         ]

     return [
         datasets.SplitGenerator(
             name=split,
             gen_kwargs={
                 "data_file": os.path.join(task_dir, file_name),
                 "split": split,
             },
         )
         for split, file_name in layout
     ]
Exemple #10
0
 def _split_generators(self, dl_manager: datasets.DownloadManager):
     """Returns SplitGenerators.

     Uses local files if a data_dir is specified. Otherwise downloads the
     files from their official url.

     Args:
         dl_manager: download manager used when no ``data_dir`` is
             configured.

     Returns:
         The TRAIN, TEST and VALIDATION split generators, reading
         ``train.jsonl``, ``test.jsonl`` and ``eval.jsonl`` from the data
         directory.

     Raises:
         ValueError: if the data was downloaded and the config name is not
             one of "NER", "ROLES", "BORING" or "PANELIZATION".
     """
     if self.config.data_dir:
         data_dir = self.config.data_dir
     else:
         url = _URLS[self.config.name]
         data_dir = dl_manager.download_and_extract(url)
         # The extracted archive keeps the files in a config-specific
         # sub-folder.
         if self.config.name in ["NER", "ROLES", "BORING"]:
             data_dir += "/sd_panels"
         elif self.config.name == "PANELIZATION":
             data_dir += "/sd_figs"
         else:
             raise ValueError(f"unknown config name: {self.config.name}")

     # (split, label); the label is both the file stem and the value of
     # the "split" kwarg. The validation split uses "eval".
     split_labels = (
         (datasets.Split.TRAIN, "train"),
         (datasets.Split.TEST, "test"),
         (datasets.Split.VALIDATION, "eval"),
     )
     return [
         datasets.SplitGenerator(
             name=split,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 "filepath": data_dir + "/" + label + ".jsonl",
                 "split": label,
             },
         )
         for split, label in split_labels
     ]