def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
    """Download the programs archive and build one generator per split.

    All three splits iterate the same downloaded tarball; the per-split
    filtering is done downstream in ``_generate_examples`` via ``split_name``.
    """
    config_info = DEFINITIONS[self.config.name]
    archive_path = dl_manager.download(config_info["raw_url"] + "/programs.tar.gz")
    split_pairs = [
        (datasets.Split.TRAIN, "train"),
        (datasets.Split.VALIDATION, "valid"),
        (datasets.Split.TEST, "test"),
    ]
    return [
        datasets.SplitGenerator(
            name=split,
            # These kwargs will be passed to _generate_examples
            gen_kwargs={
                "files": dl_manager.iter_archive(archive_path),
                "split_name": tag,
            },
        )
        for split, tag in split_pairs
    ]
def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
    """Extract the archive and expose every file under PetImages as the train split."""
    extracted_root = dl_manager.download_and_extract(_URL)
    images_path = os.path.join(extracted_root, "PetImages")
    train_split = datasets.SplitGenerator(
        name=datasets.Split.TRAIN,
        gen_kwargs={"files": dl_manager.iter_files([images_path])},
    )
    return [train_split]
def _split_generators(self, dl_manager: datasets.DownloadManager):
    """Fetch the train and dev files and map them to TRAIN/VALIDATION splits."""
    split_paths = {
        datasets.Split.TRAIN: dl_manager.download_and_extract(_TRAIN_DOWNLOAD_URL),
        datasets.Split.VALIDATION: dl_manager.download_and_extract(_DEV_DOWNLOAD_URL),
    }
    return [
        datasets.SplitGenerator(name=split, gen_kwargs={"filepath": path})
        for split, path in split_paths.items()
    ]
def _split_generators(self, dl_manager: datasets.DownloadManager):
    """Download the archive (without extracting) and stream it as the train split."""
    archive_path = dl_manager.download(_URL)
    train_gen = datasets.SplitGenerator(
        name=datasets.Split.TRAIN,
        gen_kwargs={"files": dl_manager.iter_archive(archive_path)},
    )
    return [train_gen]
def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
    """Resolve per-split download URLs, fetch them, and build one generator per split.

    Relative URLs yielded by ``generate_urls`` are resolved against the
    dataset's ``raw_url``; absolute (``http...``) URLs are used unchanged.

    Fixes over the original:
    - removed the redundant ``if split not in urls_to_download`` guard
      (the dict starts empty and SPLITS keys are unique, so it was always true);
    - renamed the local ``_URL`` to ``raw_url`` so it no longer mimics a
      module-level constant;
    - replaced the manual accumulation loops with dict comprehensions.
    """
    raw_url = self.info["raw_url"]

    def _resolve(url: str) -> str:
        # generate_urls may yield paths relative to the dataset root.
        return url if url.startswith("http") else raw_url + "/" + url

    urls_to_download = {
        split: {key: _resolve(url) for key, url in self.generate_urls(split)}
        for split in self.SPLITS
    }
    downloaded_files = {
        split: dl_manager.download_and_extract(urls)
        for split, urls in urls_to_download.items()
    }
    return [
        datasets.SplitGenerator(
            name=self.SPLITS[split],
            gen_kwargs={
                "split_name": split,
                "file_paths": downloaded_files[split],
            },
        )
        for split in self.SPLITS
    ]
def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
    """Extract the archive and point the train split at the PetImages directory."""
    extracted = dl_manager.download_and_extract(_URL)
    pet_images_dir = Path(extracted) / "PetImages"
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"images_path": pet_images_dir},
        )
    ]
def _split_generators(self, dl_manager: datasets.DownloadManager):
    """Return the split generators for the uniref datasets."""
    data_file = dl_manager.download_and_extract(self.config.url)
    single_split = datasets.SplitGenerator(
        name=str(self.config.name),
        # These kwargs will be passed to _generate_examples
        gen_kwargs={"file_paths": [data_file]},
    )
    return [single_split]
def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
    """Returns SplitGenerators."""
    # NOTE(review): assumes config.data_dir already ends with a separator — confirm.
    csv_urls = {
        "train": self.config.data_dir + "train.csv",
        "test": self.config.data_dir + "test.csv",
    }
    local_files = dl_manager.download_and_extract(csv_urls)
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"filepath": local_files["train"]},
        ),
        datasets.SplitGenerator(
            name=datasets.Split.TEST,
            gen_kwargs={"filepath": local_files["test"]},
        ),
    ]
def _split_generators(self, dl_manager: datasets.DownloadManager):
    """Returns SplitGenerators.

    Downloads the DailyDialog archive, extracts the three per-split zips
    nested inside it, and wires up the dialogue/act/emotion file paths
    that ``_generate_examples`` consumes.
    """
    # dl_manager is a datasets.download.DownloadManager that can be used to
    # download and extract URLs
    dl_dir = dl_manager.download_and_extract(_URL)
    data_dir = os.path.join(dl_dir, "ijcnlp_dailydialog")

    # The splits are nested inside the zip.
    # Fix: the `with` statement already closes the zip; the original's
    # explicit zip_file.close() inside the block was redundant and removed.
    for name in ("train", "validation", "test"):
        zip_fpath = os.path.join(data_dir, f"{name}.zip")
        with ZipFile(zip_fpath) as zip_file:
            zip_file.extractall(path=data_dir)

    def _split_kwargs(folder: str, split_tag: str) -> dict:
        # Build gen_kwargs for one extracted split directory.
        base = os.path.join(data_dir, folder)
        return {
            "file_path": os.path.join(base, f"dialogues_{folder}.txt"),
            "act_path": os.path.join(base, f"dialogues_act_{folder}.txt"),
            "emotion_path": os.path.join(base, f"dialogues_emotion_{folder}.txt"),
            "split": split_tag,
        }

    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs=_split_kwargs("train", "train"),
        ),
        datasets.SplitGenerator(
            name=datasets.Split.TEST,
            gen_kwargs=_split_kwargs("test", "test"),
        ),
        datasets.SplitGenerator(
            name=datasets.Split.VALIDATION,
            # NOTE: the original labels the validation split "dev" here.
            gen_kwargs=_split_kwargs("validation", "dev"),
        ),
    ]
def _split_generators(self, dl_manager: datasets.DownloadManager):
    """One generator per split; each split streams from its own nested zip."""
    dl_dir = dl_manager.download_and_extract(_URL)
    data_dir = os.path.join(dl_dir, "ijcnlp_dailydialog")
    generators = []
    # str(split) yields "train"/"validation"/"test", matching the zip names.
    for split in (datasets.Split.TRAIN, datasets.Split.VALIDATION, datasets.Split.TEST):
        kwargs = {
            "data_zip": os.path.join(data_dir, f"{split}.zip"),
            "dialog_path": f"{split}/dialogues_{split}.txt",
            "act_path": f"{split}/dialogues_act_{split}.txt",
            "emotion_path": f"{split}/dialogues_emotion_{split}.txt",
        }
        generators.append(datasets.SplitGenerator(name=split, gen_kwargs=kwargs))
    return generators
def _split_generators(self, dl_manager: datasets.DownloadManager):
    """Build split generators; 'lidirus' is test-only, other configs get all three splits."""
    extracted = dl_manager.download_and_extract(self.config.data_url) or ""
    task_name = _get_task_name_from_data_url(self.config.data_url)
    dl_dir = os.path.join(extracted, task_name)

    def _make(split, filename):
        # Keep the data file and its split label paired in one place.
        return datasets.SplitGenerator(
            name=split,
            gen_kwargs={
                "data_file": os.path.join(dl_dir, filename),
                "split": split,
            },
        )

    if self.config.name == "lidirus":
        # The diagnostic task ships a single file named after the task itself.
        return [_make(datasets.Split.TEST, f"{task_name}.jsonl")]
    return [
        _make(datasets.Split.TRAIN, "train.jsonl"),
        _make(datasets.Split.VALIDATION, "val.jsonl"),
        _make(datasets.Split.TEST, "test.jsonl"),
    ]
def _split_generators(self, dl_manager: datasets.DownloadManager):
    """Returns SplitGenerators.

    Uses local files if a data_dir is specified. Otherwise downloads the
    files from their official url and selects the task-specific subfolder.

    Raises:
        ValueError: if the config name is not one of the known tasks.
    """
    if self.config.data_dir:
        data_dir = self.config.data_dir
    else:
        url = _URLS[self.config.name]
        data_dir = dl_manager.download_and_extract(url)
        # Panel-level tasks and the figure-level task live in different subdirs.
        if self.config.name in ["NER", "ROLES", "BORING"]:
            data_dir += "/sd_panels"
        elif self.config.name == "PANELIZATION":
            data_dir += "/sd_figs"
        else:
            # Fix: corrected the typo "unkonwn" in the error message.
            raise ValueError(f"unknown config name: {self.config.name}")
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            # These kwargs will be passed to _generate_examples
            gen_kwargs={
                "filepath": data_dir + "/train.jsonl",
                "split": "train",
            },
        ),
        datasets.SplitGenerator(
            name=datasets.Split.TEST,
            gen_kwargs={
                "filepath": data_dir + "/test.jsonl",
                "split": "test",
            },
        ),
        datasets.SplitGenerator(
            name=datasets.Split.VALIDATION,
            gen_kwargs={
                "filepath": data_dir + "/eval.jsonl",
                "split": "eval",
            },
        ),
    ]
def _split_generators(self, dl_manager: datasets.DownloadManager):
    """Download every configured archive and stream them all through the train split."""
    archive_paths = dl_manager.download(self.config.data_urls)
    archive_iters = [dl_manager.iter_archive(path) for path in archive_paths]
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"data_dirs": archive_iters},
        )
    ]