def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # TODO(wiki_qa): Downloads the data and defines the splits # dl_manager is a nlp.download.DownloadManager that can be used to # download and extract URLs dl_dir = dl_manager.download_and_extract(_DATA_URL) dl_dir = os.path.join(dl_dir, "WikiQACorpus") # dl_dir = os.path.join(dl_dir, '') return [ nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={ "filepath": os.path.join(dl_dir, "WikiQA-test.tsv") }), nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={ "filepath": os.path.join(dl_dir, "WikiQA-dev.tsv") }), nlp.SplitGenerator( name=nlp.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath": os.path.join(dl_dir, "WikiQA-train.tsv") }, ), ]
def _split_generators(self, dl_manager): """ The `datafiles` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]]. If str or List[str], then the dataset returns only the 'train' split. If dict, then keys should be from the `nlp.Split` enum. """ if isinstance(self.config.data_files, (str, list, tuple)): # Handle case with only one split files = self.config.data_files if isinstance(files, str): files = [files] return [ nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"files": files}) ] else: # Handle case with several splits and a dict mapping splits = [] for split_name in [ nlp.Split.TRAIN, nlp.Split.VALIDATION, nlp.Split.TEST ]: if split_name in self.config.data_files: files = self.config.data_files[split_name] if isinstance(files, str): files = [files] splits.append( nlp.SplitGenerator(name=split_name, gen_kwargs={"files": files})) return splits
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # TODO(race): Downloads the data and defines the splits # dl_manager is a nlp.download.DownloadManager that can be used to # download and extract URLs dl_dir = dl_manager.download_and_extract(_URL) return [ nlp.SplitGenerator( name=nlp.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={ "files": os.listdir(os.path.join(dl_dir, "RACE/test/high")), "filespath": os.path.join(dl_dir, "RACE/test/high"), }, ), nlp.SplitGenerator( name=nlp.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ "files": os.listdir(os.path.join(dl_dir, "RACE/train/high")), "filespath": os.path.join(dl_dir, "RACE/train/high"), }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ "files": os.listdir(os.path.join(dl_dir, "RACE/dev/high")), "filespath": os.path.join(dl_dir, "RACE/dev/high"), }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" dl_path = dl_manager.download_and_extract(_DOWNLOAD_URL) return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, gen_kwargs={ "filename": os.path.join(dl_path, "train.csv"), "toxicity_label": "target" }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, gen_kwargs={ "filename": os.path.join(dl_path, "test_public_expanded.csv"), "toxicity_label": "toxicity", }, ), nlp.SplitGenerator( name=nlp.Split.TEST, gen_kwargs={ "filename": os.path.join(dl_path, "test_private_expanded.csv"), "toxicity_label": "toxicity", }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" dl_path = dl_manager.download_and_extract(_URL) input_path = os.path.join(dl_path, "AESLC-master", "enron_subject_line") return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, gen_kwargs={ "pattern": os.path.join(input_path, "train", "*.subject") }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, gen_kwargs={ "pattern": os.path.join(input_path, "dev", "*.subject") }, ), nlp.SplitGenerator( name=nlp.Split.TEST, gen_kwargs={ "pattern": os.path.join(input_path, "test", "*.subject") }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" en_data_path = dl_manager.download_and_extract(_EN_URL) de_data_path = dl_manager.download_and_extract(_DE_URL) fr_data_path = dl_manager.download_and_extract(_FR_URL) return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, gen_kwargs={ "en_path": os.path.join(en_data_path, "train.jsonl"), "de_path": os.path.join(de_data_path, "train.jsonl"), "fr_path": os.path.join(fr_data_path, "train.jsonl"), }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, gen_kwargs={ "en_path": os.path.join(en_data_path, "validation.jsonl"), "de_path": os.path.join(de_data_path, "validation.jsonl"), "fr_path": os.path.join(fr_data_path, "validation.jsonl"), }, ), nlp.SplitGenerator( name=nlp.Split.TEST, gen_kwargs={ "en_path": os.path.join(en_data_path, "test.jsonl"), "de_path": os.path.join(de_data_path, "test.jsonl"), "fr_path": os.path.join(fr_data_path, "test.jsonl"), }, ), ]
def _split_generators(self, dl_manager):
    path = dl_manager.download_and_extract(_URL)
    test_file = os.path.join(path, "CRD3-master", "data", "aligned data", "test_files")
    train_file = os.path.join(path, "CRD3-master", "data", "aligned data", "train_files")
    dev_file = os.path.join(path, "CRD3-master", "data", "aligned data", "val_files")
    with open(test_file) as f:
        test_splits = [file.replace("\n", "") for file in f.readlines()]
    with open(train_file) as f:
        train_splits = [file.replace("\n", "") for file in f.readlines()]
    with open(dev_file) as f:
        dev_splits = [file.replace("\n", "") for file in f.readlines()]
    c2 = "CRD3-master/data/aligned data/c=2"
    c3 = "CRD3-master/data/aligned data/c=3"
    c4 = "CRD3-master/data/aligned data/c=4"
    files = [os.path.join(path, c2, file) for file in sorted(os.listdir(os.path.join(path, c2)))]
    files.extend([os.path.join(path, c3, file) for file in sorted(os.listdir(os.path.join(path, c3)))])
    files.extend([os.path.join(path, c4, file) for file in sorted(os.listdir(os.path.join(path, c4)))])
    test_files, train_files, dev_files = get_train_test_dev_files(files, test_splits, train_splits, dev_splits)
    return [
        nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"files_path": train_files}),
        nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"files_path": test_files}),
        nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"files_path": dev_files}),
    ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # TODO(wiki_split): Downloads the data and defines the splits # dl_manager is a nlp.download.DownloadManager that can be used to # download and extract URLs urls_to_download = { 'train': os.path.join(_URL, _TRAIN_FILE), 'test': os.path.join(_URL, _TEST_FILE), 'dev': os.path.join(_URL, _DEV_FILE) } dl_dir = dl_manager.download_and_extract(urls_to_download) return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ 'filepath': os.path.join(dl_dir['train'], 'train.tsv') }, ), nlp.SplitGenerator( name=nlp.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={'filepath': dl_dir['test']}, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={'filepath': dl_dir['dev']}, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # TODO(reclor): Downloads the data and defines the splits # dl_manager is a nlp.download.DownloadManager that can be used to # download and extract URLs data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) if not os.path.exists(data_dir): raise FileNotFoundError( "{} does not exist. Make sure you insert a manual dir via `nlp.load('wikihow', data_dir=...)` that includes files unzipped from the reclor zip. Manual download instructions: {}".format( data_dir, self.MANUAL_DOWNLOAD_INSTRUCTIONS ) ) return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={"filepath": os.path.join(data_dir, "train.json")}, ), nlp.SplitGenerator( name=nlp.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={"filepath": os.path.join(data_dir, "test.json")}, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={"filepath": os.path.join(data_dir, "val.json")}, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # TODO(squad_es): Downloads the data and defines the splits # dl_manager is a nlp.download.DownloadManager that can be used to # download and extract URLs v1_urls = { "train": os.path.join(_URL, "SQuAD-es-v1.1/train-v1.1-es.json"), "dev": os.path.join(_URL, "SQuAD-es-v1.1/dev-v1.1-es.json"), } v2_urls = { "train": os.path.join(_URL, "SQuAD-es-v2.0/train-v2.0-es.json"), "dev": os.path.join(_URL, "SQuAD-es-v2.0/dev-v2.0-es.json"), } if self.config.name == "v1.1.0": dl_dir = dl_manager.download_and_extract(v1_urls) elif self.config.name == "v2.0.0": dl_dir = dl_manager.download_and_extract(v2_urls) else: raise Exception("version does not match any existing one") return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={"filepath": dl_dir["train"]}, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={"filepath": dl_dir["dev"]}, ), ]
def _split_generators(self, dl_manager):
    downloads = {}
    for key in _URLS.keys():
        downloads[key] = dl_manager.download_and_extract(_URLS[key])
        # Fix for dummy data
        if os.path.isdir(downloads[key]):
            downloads[key] = os.path.join(downloads[key], key + ".json")
    return [
        nlp.SplitGenerator(
            name=nlp.Split.VALIDATION,
            gen_kwargs={"filepath": downloads["dev"], "rel_info": downloads["rel_info"]},
        ),
        nlp.SplitGenerator(
            name=nlp.Split.TEST,
            gen_kwargs={"filepath": downloads["test"], "rel_info": downloads["rel_info"]},
        ),
        nlp.SplitGenerator(
            name="train_annotated",
            gen_kwargs={"filepath": downloads["train_annotated"], "rel_info": downloads["rel_info"]},
        ),
        nlp.SplitGenerator(
            name="train_distant",
            gen_kwargs={"filepath": downloads["train_distant"], "rel_info": downloads["rel_info"]},
        ),
    ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" dl_dir = dl_manager.download_and_extract(_DOWNLOAD_URL) data_dir = os.path.join(dl_dir, "multirc") return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ "data_dir": data_dir, "filepath": os.path.join(data_dir, "train.jsonl") }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ "data_dir": data_dir, "filepath": os.path.join(data_dir, "val.jsonl") }, ), nlp.SplitGenerator( name=nlp.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={ "data_dir": data_dir, "filepath": os.path.join(data_dir, "test.jsonl") }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) splits = [ nlp.SplitGenerator( name=nlp.Split.TRAIN, gen_kwargs={ "articles_file": os.path.join( data_dir, "articles-training-" + self.config.name + "-20181122.xml"), "labels_file": os.path.join( data_dir, "ground-truth-training-" + self.config.name + "-20181122.xml"), }, ) ] if self.config.name == "bypublisher": splits.append( nlp.SplitGenerator( name=nlp.Split.VALIDATION, gen_kwargs={ "articles_file": os.path.join( data_dir, "articles-validation-" + self.config.name + "-20181122.xml"), "labels_file": os.path.join( data_dir, "ground-truth-validation-" + self.config.name + "-20181122.xml"), }, )) return splits
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" schemas_file = dl_manager.download_and_extract(_DATA_URL) if os.path.isdir(schemas_file): # During testing the download manager mock gives us a directory schemas_file = os.path.join(schemas_file, "schema.txt") return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, gen_kwargs={ "filepath": schemas_file, "split": "train" }, ), nlp.SplitGenerator( name=nlp.Split.TEST, gen_kwargs={ "filepath": schemas_file, "split": "test" }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, gen_kwargs={ "filepath": schemas_file, "split": "dev" }, ) ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # dl_manager is a nlp.download.DownloadManager that can be used to # download and extract URLs dl_dir = dl_manager.download_and_extract(_URL) # This folder contains the orginal/2013 dataset data_dir = os.path.join(dl_dir, "Guardian", "Guardian_original") return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={"data_dir": data_dir, "samples_folders": self.config.train_folder, "split": "train"}, ), nlp.SplitGenerator( name=nlp.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={"data_dir": data_dir, "samples_folders": self.config.test_folder, "split": "test"}, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={"data_dir": data_dir, "samples_folders": self.config.valid_folder, "split": "valid"}, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" dl_dir = dl_manager.download_and_extract(_DATA_URL) dl_dir = os.path.join(dl_dir, "relation_splits") return [ nlp.SplitGenerator( name=nlp.Split.TEST, gen_kwargs={ "filepaths": [ os.path.join(dl_dir, "test." + str(i)) for i in range(10) ], }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, gen_kwargs={ "filepaths": [os.path.join(dl_dir, "dev." + str(i)) for i in range(10)], }, ), nlp.SplitGenerator( name=nlp.Split.TRAIN, gen_kwargs={ "filepaths": [ os.path.join(dl_dir, "train." + str(i)) for i in range(10) ], }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) if not os.path.exists(data_dir): raise FileNotFoundError( "{} does not exist. Make sure you insert a manual dir via `nlp.load('newsroom', data_dir=...)` that includes files unzipped from the reclor zip. Manual download instructions: {}" .format(data_dir, self.MANUAL_DOWNLOAD_INSTRUCTIONS)) return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, gen_kwargs={ "input_file": os.path.join(data_dir, "train.jsonl") }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, gen_kwargs={"input_file": os.path.join(data_dir, "dev.jsonl")}, ), nlp.SplitGenerator( name=nlp.Split.TEST, gen_kwargs={ "input_file": os.path.join(data_dir, "test.jsonl") }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # TODO(fquad): Downloads the data and defines the splits # dl_manager is a nlp.download.DownloadManager that can be used to # download and extract URLs download_urls = { 'train': os.path.join(_URL, _TRAIN_DATA), 'valid': os.path.join(_URL, _VALID_DATA) } dl_dir = dl_manager.download_and_extract(download_urls) return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ 'filepath': os.path.join(dl_dir['train'], 'train.json') }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ 'filepath': os.path.join(dl_dir['valid'], 'valid.json') }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # TODO(ubuntu_dialogs_corpus): Downloads the data and defines the splits # dl_manager is a nlp.download.DownloadManager that can be used to # download and extract URLs manual_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) if self.config.name == 'train': return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ 'filepath': os.path.join(manual_dir, 'train.csv') }, ), ] else: return [ nlp.SplitGenerator( name=nlp.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={ 'filepath': os.path.join(manual_dir, 'test.csv') }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ 'filepath': os.path.join(manual_dir, 'valid.csv') }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # TODO(quarel): Downloads the data and defines the splits # dl_manager is a nlp.download.DownloadManager that can be used to # download and extract URLs dl_dir = dl_manager.download_and_extract(_URL) data_dir = os.path.join(dl_dir, 'quarel-dataset-v1') return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ 'filepath': os.path.join(data_dir, 'quarel-v1-train.jsonl') }, ), nlp.SplitGenerator( name=nlp.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={ 'filepath': os.path.join(data_dir, 'quarel-v1-test.jsonl') }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ 'filepath': os.path.join(data_dir, 'quarel-v1-dev.jsonl') }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" del dl_manager # Unused lang = self._builder_config.language return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, gen_kwargs={ "filepaths": "%s/train/%s_examples-*" % (_DATA_DIRECTORY, lang) }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, gen_kwargs={ "filepaths": "%s/dev/%s_examples-*" % (_DATA_DIRECTORY, lang) }, ), nlp.SplitGenerator( name=nlp.Split.TEST, gen_kwargs={ "filepaths": "%s/test/%s_examples-*" % (_DATA_DIRECTORY, lang) }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" dl_path = dl_manager.download_and_extract(_URL) pattern = os.path.join(dl_path, "org_data", "%s.%s.txt") return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, gen_kwargs={ "src_path": pattern % ("train", "src"), "tgt_path": pattern % ("train", "tgt"), "replace_unk": True, }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, gen_kwargs={ "src_path": pattern % ("dev", "src"), "tgt_path": pattern % ("dev", "tgt"), "replace_unk": True, }, ), nlp.SplitGenerator( name=nlp.Split.TEST, gen_kwargs={ "src_path": pattern % ("test", "src"), "tgt_path": pattern % ("test", "tgt"), "replace_unk": False, }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # TODO(scifact): Downloads the data and defines the splits # dl_manager is a nlp.download.DownloadManager that can be used to # download and extract URLs dl_dir = dl_manager.download_and_extract(_URL) if self.config.name == "corpus": return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={"filepath": os.path.join(dl_dir, "data", "corpus.jsonl"), "split": "train"}, ), ] else: return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={"filepath": os.path.join(dl_dir, "data", "claims_train.jsonl"), "split": "train"}, ), nlp.SplitGenerator( name=nlp.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={"filepath": os.path.join(dl_dir, "data", "claims_test.jsonl"), "split": "test"}, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={"filepath": os.path.join(dl_dir, "data", "claims_dev.jsonl"), "split": "dev"}, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # TODO(social_i_qa): Downloads the data and defines the splits # dl_manager is a nlp.download.DownloadManager that can be used to # download and extract URLs dl_dir = dl_manager.download_and_extract(_URL) dl_dir = os.path.join(dl_dir, 'socialiqa-train-dev') return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ 'filepath': os.path.join(dl_dir, 'train.jsonl'), 'labelpath': os.path.join(dl_dir, 'train-labels.lst') }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ 'filepath': os.path.join(dl_dir, 'dev.jsonl'), 'labelpath': os.path.join(dl_dir, 'dev-labels.lst') }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" if self.config.name == "blog-authorship-corpus": data = dl_manager.download_and_extract(self.config.data_url) data_dir = os.path.join(data, "blogs") files = glob.glob(os.path.join(data_dir, "*.xml")) train_files = [] validation_files = [] for i, file_path in enumerate(files): # 95% / 5% (train / val) split if i % 20 == 0: validation_files.append(file_path) else: train_files.append(file_path) return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, gen_kwargs={ "files": train_files, "split": "train" }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, gen_kwargs={ "files": validation_files, "split": "validation" }, ), ] else: raise ValueError("{} does not exist".format(self.config.name))
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # TODO(xcopa): Downloads the data and defines the splits # dl_manager is a nlp.download.DownloadManager that can be used to # download and extract URLs dl_dir = dl_manager.download_and_extract(_URL) data_dir = os.path.join(dl_dir, 'xcopa-master', 'data', self.config.name) return [ nlp.SplitGenerator( name=nlp.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={ 'filepath': os.path.join(data_dir, 'test.' + self.config.name + '.jsonl') }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ 'filepath': os.path.join(data_dir, 'val.' + self.config.name + '.jsonl') }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # NB: The CQA Dataset should be read only once, and only by callers who # want to _create_ the Cos-E dataset from scratch. cqa_indexed = _download_and_index_cqa(dl_manager) files = dl_manager.download_and_extract({ "dev": [ os.path.join(_COS_E_URL, "v1.11/dev/cose_dev_v1.11_processed.jsonl") ], "train": [ os.path.join(_COS_E_URL, "v1.11/train/cose_train_v1.11_processed.jsonl") ], }) # We use the CoS-E/CQA dev set as our validation set. return [ nlp.SplitGenerator( name=nlp.Split.VALIDATION, gen_kwargs={ "files": files["dev"], "cqa_indexed": cqa_indexed }, ), nlp.SplitGenerator( name=nlp.Split.TRAIN, gen_kwargs={ "files": files["train"], "cqa_indexed": cqa_indexed }, ), ]
def _split_generators(self, dl_manager): """ The `datafiles` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]]. If str or List[str], then the dataset returns only the 'train' split. If dict, then keys should be from the `nlp.Split` enum. """ data_dir = self.config.data_dir return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, gen_kwargs={ "filepath": os.path.join(data_dir, "train.data"), "split": "train", }, ), nlp.SplitGenerator( name=nlp.Split.TEST, gen_kwargs={ "filepath": os.path.join(data_dir, "test.data"), "split": "test" }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, gen_kwargs={ "filepath": os.path.join(data_dir, "dev.data"), "split": "dev", }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) train_dir = os.path.join(data_dir, "train", "dataset-" + self.config.name) val_dir = os.path.join(data_dir, "validation", "dataset-" + self.config.name) if not os.path.exists(train_dir): raise FileNotFoundError( "{} does not exist. Make sure you insert a manual dir via `nlp.load_dataset('style_change_detection', data_dir=...)` that includes {}. Manual download instructions: {}" .format(train_dir, train_dir, self.manual_download_instructions)) return [ nlp.SplitGenerator( name=nlp.Split.TRAIN, gen_kwargs={ "articles": [f for f in os.listdir(train_dir) if f.endswith(".txt")], "base_dir": train_dir, }, ), nlp.SplitGenerator( name=nlp.Split.VALIDATION, gen_kwargs={ "articles": [f for f in os.listdir(val_dir) if f.endswith(".txt")], "base_dir": val_dir }, ), ]
def _split_generators(self, dl_manager):
    train_path = dl_manager.download_and_extract(_TRAIN_DOWNLOAD_URL)
    test_path = dl_manager.download_and_extract(_TEST_DOWNLOAD_URL)
    return [
        nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": train_path}),
        nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"filepath": test_path}),
    ]