def _split_generators(self, dl_manager):
    """Returns SplitGenerators.

    One CSV URL per split is built from ``_BASE_URL`` and
    ``self.config.name`` (``<base>/stsb-<name>-<split>.csv``) and the whole
    mapping is passed to ``dl_manager.download``.

    Args:
        dl_manager: object exposing ``download``; it is given a dict keyed by
            "train", "dev" and "test" and its result is indexed by the same keys.

    Returns:
        A list of three ``datasets.SplitGenerator`` objects, in the order
        train, test, dev. The dev split is registered under the custom name
        ``"dev"`` via ``datasets.NamedSplit``.
    """
    config_name = self.config.name
    urls_to_download = {
        part: f"{_BASE_URL}/stsb-{config_name}-{part}.csv"
        for part in ("train", "dev", "test")
    }
    local_paths = dl_manager.download(urls_to_download)

    # (split name, key into the downloaded mapping), listed in output order.
    split_plan = (
        (datasets.Split.TRAIN, "train"),
        (datasets.Split.TEST, "test"),
        (datasets.NamedSplit("dev"), "dev"),
    )
    return [
        datasets.SplitGenerator(
            name=split_name,
            # These kwargs will be passed to _generate_examples
            gen_kwargs={"filepath": local_paths[key]},
        )
        for split_name, key in split_plan
    ]
def _split_generators(self, dl_manager):
    """Build the split generators for the config named by ``self.config.name``.

    Behaviour per config name:

    * ``"asr"``: downloads and extracts three archives whose names are
      appended to ``self.config.data_url``; yields train / validation / test.
    * ``"ks"``: downloads and extracts two archives whose URLs come from
      ``self.config.data_url.format(filename=...)``; train and validation share
      one archive, test uses the other.
    * ``"ic"``: downloads and extracts ``self.config.data_url``; all three
      splits share that single archive.
    * ``"si"``: no download; uses ``dl_manager.manual_dir`` (user-expanded and
      made absolute) with integer split ids 1, 2 and 3.
    * ``"sd"``: downloads and extracts four files per split ("train", "dev",
      "test"); each split is registered via ``datasets.NamedSplit``.
    * ``"er"``: no download; uses ``dl_manager.manual_dir`` and yields five
      splits named ``session1`` .. ``session5``.

    Args:
        dl_manager: object exposing ``download_and_extract`` and ``manual_dir``.

    Returns:
        A list of ``datasets.SplitGenerator``. Each one's ``gen_kwargs`` carries
        ``archive_path`` and, for every config except ``"asr"``, ``split``.
        None of the branches visible here handles any other config name, so
        the method then falls through without an explicit return.
    """
    task = self.config.name

    def make_split(split_name, **gen_kwargs):
        # Thin factory so each branch only spells out what differs.
        return datasets.SplitGenerator(name=split_name, gen_kwargs=gen_kwargs)

    if task == "asr":
        urls = {
            "dev": self.config.data_url + "dev-clean.tar.gz",
            "test": self.config.data_url + "test-clean.tar.gz",
            "train": self.config.data_url + "train-clean-100.tar.gz",
        }
        extracted = dl_manager.download_and_extract(urls)
        return [
            make_split(datasets.Split.TRAIN, archive_path=extracted["train"]),
            make_split(datasets.Split.VALIDATION, archive_path=extracted["dev"]),
            make_split(datasets.Split.TEST, archive_path=extracted["test"]),
        ]

    if task == "ks":
        urls = {
            "train_val_test": self.config.data_url.format(filename="speech_commands_v0.01.tar.gz"),
            "test": self.config.data_url.format(filename="speech_commands_test_set_v0.01.tar.gz"),
        }
        extracted = dl_manager.download_and_extract(urls)
        shared_archive = extracted["train_val_test"]
        return [
            make_split(datasets.Split.TRAIN, archive_path=shared_archive, split="train"),
            make_split(datasets.Split.VALIDATION, archive_path=shared_archive, split="val"),
            make_split(datasets.Split.TEST, archive_path=extracted["test"], split="test"),
        ]

    if task == "ic":
        extracted = dl_manager.download_and_extract(self.config.data_url)
        labelled_splits = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.VALIDATION, "valid"),
            (datasets.Split.TEST, "test"),
        )
        return [
            make_split(split_name, archive_path=extracted, split=label)
            for split_name, label in labelled_splits
        ]

    if task == "si":
        data_root = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
        ordered_splits = (datasets.Split.TRAIN, datasets.Split.VALIDATION, datasets.Split.TEST)
        # Split ids are the integers 1, 2, 3 in train / validation / test order.
        return [
            make_split(split_name, archive_path=data_root, split=split_id)
            for split_id, split_name in enumerate(ordered_splits, start=1)
        ]

    if task == "sd":
        split_names = ["train", "dev", "test"]
        file_names = ["reco2dur", "segments", "utt2spk", "wav.zip"]
        urls = {}
        for split_name in split_names:
            urls[split_name] = {
                file_name: self.config.data_url.format(split=split_name, filename=file_name)
                for file_name in file_names
            }
        extracted = dl_manager.download_and_extract(urls)
        return [
            make_split(
                datasets.NamedSplit(split_name),
                archive_path=extracted[split_name],
                split=split_name,
            )
            for split_name in split_names
        ]

    if task == "er":
        data_root = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
        return [
            make_split(f"session{session}", archive_path=data_root, split=session)
            for session in range(1, 6)
        ]