def _split_generators(self, dl_manager):
    _URL = "https://jingshensn2.github.io/eli5c/datasets/"
    downloaded_files = dl_manager.download_and_extract(
        {
            "train": _URL + "eli5-category-train.json.gz",
            "val1": _URL + "eli5-category-validation-1.json.gz",
            "val2": _URL + "eli5-category-validation-2.json.gz",
            "test": _URL + "eli5-category-test.json.gz",
        }
    )
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"filepath": downloaded_files["train"]},
        ),
        datasets.SplitGenerator(
            name=datasets.Split("validation1"),
            gen_kwargs={"filepath": downloaded_files["val1"]},
        ),
        datasets.SplitGenerator(
            name=datasets.Split("validation2"),
            gen_kwargs={"filepath": downloaded_files["val2"]},
        ),
        datasets.SplitGenerator(
            name=datasets.Split.TEST,
            gen_kwargs={"filepath": downloaded_files["test"]},
        ),
    ]
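# Usage sketch (not part of the loading script above): custom split names defined via
# `datasets.Split("...")` are addressed directly through the `split=` argument of
# `load_dataset`. The Hub id "eli5_category" is an assumption here.
from datasets import load_dataset

eli5c_val1 = load_dataset("eli5_category", split="validation1")
print(eli5c_val1)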
def _split_generators(self, dl_manager):
    all_data_folder = dl_manager.download_and_extract(_XGLUE_ALL_DATA)
    data_folder = os.path.join(all_data_folder, "xglue_full_dataset", self.config.data_dir)
    name = self.config.name
    languages = _LANGUAGES[name]
    return (
        [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"data_file": os.path.join(data_folder, _PATHS[name]["train"]), "split": "train"},
            ),
        ]
        + [
            datasets.SplitGenerator(
                name=datasets.Split(f"validation.{lang}"),
                gen_kwargs={
                    "data_file": os.path.join(data_folder, _PATHS[name]["dev"].format(lang)),
                    "split": "dev",
                },
            )
            for lang in languages
        ]
        + [
            datasets.SplitGenerator(
                name=datasets.Split(f"test.{lang}"),
                gen_kwargs={
                    "data_file": os.path.join(data_folder, _PATHS[name]["test"].format(lang)),
                    "split": "test",
                },
            )
            for lang in languages
        ]
    )
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" if self.config.name != "task4_reddit": my_urls = ZIP_URL # Cannot download just one single type as it is a compressed file. else: my_urls = REDDIT_URL data_dir = dl_manager.download_and_extract(my_urls) splits = [ datasets.SplitGenerator( name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath": os.path.join(data_dir, paths[self.config.name]["train"]), }, ), datasets.SplitGenerator( name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath": os.path.join(data_dir, paths[self.config.name]["test"]), }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath": os.path.join(data_dir, paths[self.config.name]["dev"]), }, ), ] if self.config.name == "task4_reddit": splits += [ datasets.SplitGenerator( name=datasets.Split("cand_valid"), # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath": os.path.join(data_dir, paths[self.config.name]["cand_valid"]), }, ), datasets.SplitGenerator( name=datasets.Split("cand_test"), # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath": os.path.join(data_dir, paths[self.config.name]["cand_test"]), }, ), ] return splits
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" my_urls = _URL data_dir = dl_manager.download_and_extract(my_urls) data_path = os.path.join(data_dir, "wikipedia-biography-dataset") return [ datasets.SplitGenerator( name=datasets.Split("train"), gen_kwargs={ "id_file": os.path.join(data_path, "train", "train.id"), "infobox_file": os.path.join(data_path, "train", "train.box"), "nb_lines_file": os.path.join(data_path, "train", "train.nb"), "sentences_file": os.path.join(data_path, "train", "train.sent"), "article_title_file": os.path.join(data_path, "train", "train.title"), }, ), datasets.SplitGenerator( name=datasets.Split("test"), gen_kwargs={ "id_file": os.path.join(data_path, "test", "test.id"), "infobox_file": os.path.join(data_path, "test", "test.box"), "nb_lines_file": os.path.join(data_path, "test", "test.nb"), "sentences_file": os.path.join(data_path, "test", "test.sent"), "article_title_file": os.path.join(data_path, "test", "test.title"), }, ), datasets.SplitGenerator( name=datasets.Split("val"), gen_kwargs={ "id_file": os.path.join(data_path, "valid", "valid.id"), "infobox_file": os.path.join(data_path, "valid", "valid.box"), "nb_lines_file": os.path.join(data_path, "valid", "valid.nb"), "sentences_file": os.path.join(data_path, "valid", "valid.sent"), "article_title_file": os.path.join(data_path, "valid", "valid.title"), }, ), ]
def _split_generators(self, dl_manager):
    qa_data_file = pjoin(
        self._cache_dir_root, self._relative_data_dir(with_version=False), "reddit_downloaded_qa_lists.json"
    )
    if isfile(qa_data_file):
        logger.info("loading pre-computed QA list")
        self.filtered_reddit = json.load(open(qa_data_file))
    else:
        self.filtered_reddit = _download_and_filter_reddit(
            dl_manager, start_year=2011, start_month=7, end_year=2019, end_month=7
        )
        logger.info("saving pre-computed QA list")
        json.dump(self.filtered_reddit, open(qa_data_file, "w"))
    # download data splits from AWS
    fpath_splits = dl_manager.download(self._DATA_SPLIT_URL)
    self.data_split = json.load(open(fpath_splits))
    return [
        datasets.SplitGenerator(
            name=datasets.Split("train_eli5"),
            gen_kwargs={"split": "train", "subreddit_name": "explainlikeimfive"},
        ),
        datasets.SplitGenerator(
            name=datasets.Split("validation_eli5"),
            gen_kwargs={"split": "validation", "subreddit_name": "explainlikeimfive"},
        ),
        datasets.SplitGenerator(
            name=datasets.Split("test_eli5"),
            gen_kwargs={"split": "test", "subreddit_name": "explainlikeimfive"},
        ),
        datasets.SplitGenerator(
            name=datasets.Split("train_asks"),
            gen_kwargs={"split": "train", "subreddit_name": "askscience"},
        ),
        datasets.SplitGenerator(
            name=datasets.Split("validation_asks"),
            gen_kwargs={"split": "validation", "subreddit_name": "askscience"},
        ),
        datasets.SplitGenerator(
            name=datasets.Split("test_asks"),
            gen_kwargs={"split": "test", "subreddit_name": "askscience"},
        ),
        datasets.SplitGenerator(
            name=datasets.Split("train_askh"),
            gen_kwargs={"split": "train", "subreddit_name": "AskHistorians"},
        ),
        datasets.SplitGenerator(
            name=datasets.Split("validation_askh"),
            gen_kwargs={"split": "validation", "subreddit_name": "AskHistorians"},
        ),
        datasets.SplitGenerator(
            name=datasets.Split("test_askh"),
            gen_kwargs={"split": "test", "subreddit_name": "AskHistorians"},
        ),
    ]
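# Sketch of how the per-subreddit splits above would be consumed; the dataset id
# "eli5" is assumed, and building it requires downloading the Reddit dumps first.
from datasets import load_dataset

asks_train = load_dataset("eli5", split="train_asks")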
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" dl_dir = dl_manager.download_and_extract(self.config.data_url) splits_gen = [] for split_id, split_filename in self.config.splits.items(): if self.config.gameplay_scenario == "original": if "train" in split_id: split_name = datasets.Split.TRAIN elif "valid" in split_id: split_name = datasets.Split.VALIDATION elif "test" in split_id: split_name = datasets.Split.TEST else: split_name = datasets.Split(split_id) full_split_name = "-".join(["compguesswhat", self.config.gameplay_scenario]) splits_gen.append( datasets.SplitGenerator( name=split_name, gen_kwargs={ "filepath": os.path.join( dl_dir, full_split_name, self.VERSION.version_str, split_filename, ) }, ) ) return splits_gen
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" train_path = dl_manager.download_and_extract(_TRAIN_DOWNLOAD_URL) valid_path = dl_manager.download_and_extract(_VALID_DOWNLOAD_URL) test_lay_path = dl_manager.download_and_extract(_TEST_LAY_DOWNLOAD_URL) test_expert_path = dl_manager.download_and_extract( _TEST_EXPERT_DOWNLOAD_URL) return [ datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": train_path}), datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": valid_path}), datasets.SplitGenerator(name=datasets.Split("test_lay"), gen_kwargs={"filepath": test_lay_path}), datasets.SplitGenerator(name=datasets.Split("test_expert"), gen_kwargs={"filepath": test_expert_path}), ]
def _split_generators(self, dl_manager):
    dl_dir = dl_manager.download_and_extract(self.config.data_url)
    data_dir = os.path.join(dl_dir, self.config.data_dir)
    if self.config.name in {"chid", "c3"}:
        test_file = "test1.1.json"
    elif self.config.name == "diagnostics":
        test_file = "diagnostics_test.json"
    else:
        test_file = "test.json"
    test_split = datasets.SplitGenerator(
        name=datasets.Split.TEST,
        gen_kwargs={
            "data_file": os.path.join(data_dir, test_file),
            "split": "test",
        },
    )
    split_list = [test_split]
    if self.config.name != "diagnostics":
        train_split = datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={
                "data_file": os.path.join(
                    data_dir or "", "train.json" if self.config.name != "c3" else "d-train.json"
                ),
                "split": "train",
            },
        )
        val_split = datasets.SplitGenerator(
            name=datasets.Split.VALIDATION,
            gen_kwargs={
                "data_file": os.path.join(
                    data_dir or "", "dev.json" if self.config.name != "c3" else "d-dev.json"
                ),
                "split": "dev",
            },
        )
        split_list += [train_split, val_split]
    if self.config.name == "cmrc2018":
        split_list.append(
            datasets.SplitGenerator(
                name=datasets.Split("trial"),
                gen_kwargs={
                    "data_file": os.path.join(data_dir or "", "trial.json"),
                    "split": "trial",
                },
            )
        )
    return split_list
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" downloaded_files = dl_manager.download_and_extract(_URLS) return [ datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}), datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["dev"]}), datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": downloaded_files["test"]}), datasets.SplitGenerator( name=datasets.Split("test_wikipedia"), gen_kwargs={"filepath": downloaded_files["test_wikipedia"]} ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # This method is tasked with downloading/extracting the data and defining the splits depending on the configuration # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) if not os.path.exists(data_dir): raise FileNotFoundError( "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('jigsaw_unintended_bias', data_dir=...)`. Manual download instructions: {}" .format(data_dir, self.manual_download_instructions)) return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ "path": os.path.join(data_dir, "train.csv"), "split": "train" }, ), datasets.SplitGenerator( name=datasets.Split("test_private_leaderboard"), # These kwargs will be passed to _generate_examples gen_kwargs={ "path": os.path.join(data_dir, "test_private_expanded.csv"), "split": "test" }, ), datasets.SplitGenerator( name=datasets.Split("test_public_leaderboard"), # These kwargs will be passed to _generate_examples gen_kwargs={ "path": os.path.join(data_dir, "test_public_expanded.csv"), "split": "test" }, ), ]
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" data_dir = dl_manager.download_and_extract(_URL) return [ datasets.SplitGenerator( name=datasets.Split("auxiliary_train"), # These kwargs will be passed to _generate_examples gen_kwargs={ "datadir": os.path.join(data_dir, "data", "auxiliary_train"), "split": "auxiliary_train", }, ), datasets.SplitGenerator( name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={ "datadir": os.path.join(data_dir, "data", "test"), "split": "test" }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ "datadir": os.path.join(data_dir, "data", "val"), "split": "val", }, ), datasets.SplitGenerator( name=datasets.Split("dev"), # These kwargs will be passed to _generate_examples gen_kwargs={ "datadir": os.path.join(data_dir, "data", "dev"), "split": "dev", }, ), ]
def _split_generators(self, dl_manager):
    archive = dl_manager.download(_DOWNLOAD_URL)
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train"}
        ),
        datasets.SplitGenerator(
            name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "test"}
        ),
        datasets.SplitGenerator(
            name=datasets.Split("unsupervised"),
            gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train", "labeled": False},
        ),
    ]
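# The "unsupervised" split declared above sits alongside the standard train/test splits;
# a minimal usage sketch, assuming the canonical "imdb" dataset id:
from datasets import load_dataset

imdb_unsup = load_dataset("imdb", split="unsupervised")  # unlabeled reviews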
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" data_dir = dl_manager.download_and_extract(_URL) ROOT = "semeval-2020-task-7-dataset" return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath": os.path.join(data_dir, ROOT, self.config.name, "train.csv"), "split": "train", }, ), datasets.SplitGenerator( name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath": os.path.join(data_dir, ROOT, self.config.name, "test.csv"), "split": "test" }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath": os.path.join(data_dir, ROOT, self.config.name, "dev.csv"), "split": "dev", }, ), datasets.SplitGenerator( name=datasets.Split("funlines"), # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath": os.path.join(data_dir, ROOT, self.config.name, "train_funlines.csv"), "split": "funlines", }, ), ]
def _split_generators(self, mgr):
    # Assumes the loading script imports the library as `import datasets as ds`.
    all_data_folder = mgr.download_and_extract(_XGLUE_ALL_DATA)
    data_folder = os.path.join(all_data_folder, "xglue_full_dataset", self.config.data_dir)
    name = self.config.name
    languages = _LANGS[name]
    return (
        [
            ds.SplitGenerator(
                name=ds.Split.TRAIN,
                gen_kwargs={
                    "data_file": os.path.join(data_folder, _PATHS[name]["train"]),
                    "split": "train",
                },
            ),
        ]
        + [
            ds.SplitGenerator(
                name=ds.Split(f"validation.{lang}"),
                gen_kwargs={
                    "data_file": os.path.join(data_folder, _PATHS[name]["dev"].format(lang)),
                    "split": "dev",
                },
            )
            for lang in languages
        ]
        + [
            ds.SplitGenerator(
                name=ds.Split(f"test.{lang}"),
                gen_kwargs={
                    "data_file": os.path.join(data_folder, _PATHS[name]["test"].format(lang)),
                    "split": "test",
                },
            )
            for lang in languages
        ]
    )
def _split_generators(self, dl_manager):
    archive = dl_manager.download(_XGLUE_ALL_DATA)
    data_folder = f"xglue_full_dataset/{self.config.data_dir}"
    name = self.config.name
    languages = _LANGUAGES[name]
    return (
        [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "archive": dl_manager.iter_archive(archive),
                    "data_path": f"{data_folder}/{_PATHS[name]['train']}",
                    "split": "train",
                },
            ),
        ]
        + [
            datasets.SplitGenerator(
                name=datasets.Split(f"validation.{lang}"),
                gen_kwargs={
                    "archive": dl_manager.iter_archive(archive),
                    "data_path": f"{data_folder}/{_PATHS[name]['dev'].format(lang)}",
                    "split": "dev",
                },
            )
            for lang in languages
        ]
        + [
            datasets.SplitGenerator(
                name=datasets.Split(f"test.{lang}"),
                gen_kwargs={
                    "archive": dl_manager.iter_archive(archive),
                    "data_path": f"{data_folder}/{_PATHS[name]['test'].format(lang)}",
                    "split": "test",
                },
            )
            for lang in languages
        ]
    )
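# The per-language splits generated above follow the pattern "validation.<lang>" and
# "test.<lang>". A sketch, assuming the "xglue" dataset id and that "de" appears in
# _LANGUAGES for the "ner" config:
from datasets import load_dataset

xglue_ner_dev_de = load_dataset("xglue", "ner", split="validation.de")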
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" data_dir = dl_manager.download_and_extract(_URLs) return [ datasets.SplitGenerator( name=datasets.Split(key), # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath": data_dir[key], "pid2name": data_dir["pid2name"], "return_names": key in ["train_wiki", "val_wiki", "val_nyt"], }, ) for key in data_dir.keys() if key != "pid2name" ]
def _split_generators(self, dl_manager):
    arch_path = dl_manager.download_and_extract(_DOWNLOAD_URL)
    data_dir = os.path.join(arch_path, "aclImdb")
    return [
        datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"directory": os.path.join(data_dir, "train")}),
        datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"directory": os.path.join(data_dir, "test")}),
        datasets.SplitGenerator(
            name=datasets.Split("unsupervised"),
            gen_kwargs={"directory": os.path.join(data_dir, "train"), "labeled": False},
        ),
    ]
def _split_generators(self, dl_manager):
    arch_path = dl_manager.download_and_extract(self.config.data_url)

    if "relations" in self.config.name:
        train_file = "train.csv"
        test_file = "test.csv"
        generators = []

        # for k in [1, 2, 3, 4]:
        for aspect in self.config.aspects:
            for k in ["sample"] + [1, 2, 3, 4]:
                folds_path = os.path.join(arch_path, "folds", aspect, str(k))
                generators += [
                    datasets.SplitGenerator(
                        name=get_train_split(aspect, k),
                        gen_kwargs={"filepath": os.path.join(folds_path, train_file)},
                    ),
                    datasets.SplitGenerator(
                        name=get_test_split(aspect, k),
                        gen_kwargs={"filepath": os.path.join(folds_path, test_file)},
                    ),
                ]
        return generators
    elif "docs" in self.config.name:
        # docs
        docs_file = os.path.join(arch_path, "docs.jsonl")
        return [
            datasets.SplitGenerator(name=datasets.Split("docs"), gen_kwargs={"filepath": docs_file}),
        ]
    else:
        raise ValueError()
def _split_generators(self, dl_manager):
    urls = _URLS
    data_dir = dl_manager.download_and_extract(urls)
    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            # These kwargs will be passed to _generate_examples
            gen_kwargs={
                "filepath": data_dir["train"],
                "split": "train",
            },
        ),
        datasets.SplitGenerator(
            name=datasets.Split.VALIDATION,
            # These kwargs will be passed to _generate_examples
            gen_kwargs={
                "filepath": data_dir["dev"],
                "split": "dev",
            },
        ),
        datasets.SplitGenerator(
            name=datasets.Split.TEST,
            # These kwargs will be passed to _generate_examples
            gen_kwargs={
                "filepath": data_dir["test_unlabeled"],
                "split": "test_unlabeled",
            },
        ),
        datasets.SplitGenerator(
            name=datasets.Split("test_ood"),
            # These kwargs will be passed to _generate_examples
            gen_kwargs={
                "filepath": data_dir["ood_unlabeled"],
                "split": "ood_unlabeled",
            },
        ),
    ]
def get_test_split(k):
    return datasets.Split(f"fold_{k}_test")
def get_train_split(k):
    return datasets.Split(f"fold_{k}_train")
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" dl_dir = dl_manager.download_and_extract(_URL) return [ datasets.SplitGenerator( name=datasets.Split("exercise_contest_train"), gen_kwargs={ "filepath": os.path.join( dl_dir, "final_all_data/exercise_contest/data_train.json"), "split": "exercise_contest_train", }, ), datasets.SplitGenerator( name=datasets.Split("exercise_contest_valid"), gen_kwargs={ "filepath": os.path.join( dl_dir, "final_all_data/exercise_contest/data_valid.json"), "split": "exercise_contest_valid", }, ), datasets.SplitGenerator( name=datasets.Split("exercise_contest_test"), gen_kwargs={ "filepath": os.path.join( dl_dir, "final_all_data/exercise_contest/data_test.json"), "split": "exercise_contest_test", }, ), datasets.SplitGenerator( name=datasets.Split("first_stage_train"), gen_kwargs={ "filepath": os.path.join(dl_dir, "final_all_data/first_stage/train.json"), "split": "first_stage_train", }, ), datasets.SplitGenerator( name=datasets.Split("first_stage_test"), gen_kwargs={ "filepath": os.path.join(dl_dir, "final_all_data/first_stage/test.json"), "split": "first_stage_test", }, ), datasets.SplitGenerator( name=datasets.Split("final_test"), gen_kwargs={ "filepath": os.path.join(dl_dir, "final_all_data/final_test.json"), "split": "final_test" }, ), ]
def main():
    # Auto-environment
    env = get_env()

    parser = HfArgumentParser((ModelArguments, TrainingArguments, ExperimentArguments))
    model_args, training_args, experiment_args = parser.parse_args_into_dataclasses()

    # Adjust output with folds and model name  # TODO: disabled
    # training_args.output_dir = os.path.join(training_args.output_dir, str(experiment_args.cv_fold), model_args.get_model_name())

    # Model path from env
    if not os.path.exists(model_args.model_name_or_path) and os.path.exists(
        os.path.join(env['bert_dir'], model_args.model_name_or_path)
    ):
        model_args.model_name_or_path = os.path.join(env['bert_dir'], model_args.model_name_or_path)

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Dataset args
    label_classes = get_label_classes_from_hf_dataset(get_local_hf_dataset_path(experiment_args.hf_dataset))
    num_labels = len(label_classes)

    if num_labels > 1 and experiment_args.binary_classification:
        # In binary classification we have only a single label (with y=[0;1])
        num_labels = 1
        logger.warning(f'Forcing label classes to binary: {label_classes}')

    columns = ['input_ids', 'attention_mask', 'token_type_ids', 'labels']  # Input to transformers.forward

    # Build dataset for splits
    train_ds = load_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset),
        name='relations',
        cache_dir=experiment_args.hf_dataset_cache_dir,
        split=get_train_split(experiment_args.aspect, experiment_args.cv_fold),
    )
    test_ds = load_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset),
        name='relations',
        cache_dir=experiment_args.hf_dataset_cache_dir,
        split=get_test_split(experiment_args.aspect, experiment_args.cv_fold),
    )
    docs_ds = load_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset),
        name='docs',
        cache_dir=experiment_args.hf_dataset_cache_dir,
        split=datasets.Split('docs'),
    )

    # Forced limit
    if experiment_args.dataset_limit > 0:
        logger.info(f'Train and test datasets limited to {experiment_args.dataset_limit} samples')
        train_ds = Dataset(train_ds.data[:experiment_args.dataset_limit])
        test_ds = Dataset(test_ds.data[:experiment_args.dataset_limit])

    # Build ID => Doc mapping
    doc_id2doc = {doc[experiment_args.doc_id_col]: doc for doc in docs_ds}

    if model_args.model_name_or_path.startswith('baseline-rnn'):
        # Load Spacy as tokenizer
        spacy_nlp = spacy.load(experiment_args.spacy_model, disable=["tagger", "ner", "textcat"])

        if experiment_args.multi_label:
            # Baseline models
            model = RNNForMultiLabelSequenceClassification(
                word_vectors=get_vectors_from_spacy_model(spacy_nlp),
                hidden_size=experiment_args.rnn_hidden_size,
                rnn=experiment_args.rnn_type,
                num_labels=num_labels,
                num_layers=experiment_args.rnn_num_layers,
                dropout=experiment_args.rnn_dropout,
            )
        else:
            raise NotImplementedError('RNN baseline is only available for multi label classification')

        tokenizer = None
    else:
        # Load pretrained Transformers models and tokenizers
        model_config = AutoConfig.from_pretrained(
            model_args.model_name_or_path, num_labels=num_labels, cache_dir=model_args.cache_dir
        )

        # No need for spacy
        spacy_nlp = None

        if 'longformer' in model_args.model_name_or_path:
            # TVM: a custom CUDA kernel implementation of our sliding window attention (works only on GPU)
            model_config.attention_mode = 'tvm'

            # override tokenizer name if not set
            if model_args.tokenizer_name is None:
                roberta_path = os.path.join(env['bert_dir'], 'roberta-base')
                model_args.tokenizer_name = roberta_path if os.path.exists(roberta_path) else 'roberta-base'
                logger.info(f'Overriding tokenizer: {model_args.tokenizer_name}')

            # override max length
            experiment_args.max_length = 4096

        if experiment_args.multi_label:
            model_cls = AutoModelForMultiLabelSequenceClassification
        else:
            model_cls = AutoModelForSequenceClassification

        model = model_cls.from_pretrained(
            model_args.model_name_or_path, config=model_config, cache_dir=model_args.cache_dir
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )

        # Set token limit if defined by model (for Longformer)
        if model.config.max_position_embeddings > 0:
            tokenizer.model_max_length = model.config.max_position_embeddings

    # Init helper
    dpt = DocRelTrainerHelper(
        id2doc=doc_id2doc,
        transformers_tokenizer=tokenizer,
        spacy_nlp=spacy_nlp,
        label_classes=label_classes,
        binary_classification=experiment_args.binary_classification,
        doc_a_col=experiment_args.doc_a_col,
        doc_b_col=experiment_args.doc_b_col,
        label_col=experiment_args.label_col,
        text_from_doc_func=get_non_empty_text_from_doc,
        classification_threshold=experiment_args.classification_threshold,
        max_length=experiment_args.max_length,
        multi_label=experiment_args.multi_label,
    )

    logger.info('Converting to features (doc mapping, tokenize, ...)')

    # Build hash from settings for caching
    data_settings_hash = hashlib.md5(
        dataclasses.asdict(experiment_args).__str__().encode("utf-8")
        + dataclasses.asdict(model_args).__str__().encode("utf-8")
    ).hexdigest()

    train_tensor_ds = train_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        num_proc=int(env['workers']),
        cache_file_name=os.path.join(
            experiment_args.hf_dataset_cache_dir, "cache-train-" + data_settings_hash + ".arrow"
        ),
    )
    train_tensor_ds.set_format(type='torch', columns=columns)

    test_tensor_ds = test_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        num_proc=int(env['workers']),
        cache_file_name=os.path.join(
            experiment_args.hf_dataset_cache_dir, "cache-test-" + data_settings_hash + ".arrow"
        ),
    )
    test_tensor_ds.set_format(type='torch', columns=columns)

    logger.info(f'Dataset columns: {columns}')
    logger.info(f'Train sample: {train_ds[0]}')
    logger.debug(f'- as tensor: {train_tensor_ds[0]}')
    logger.info(f'Test sample: {test_ds[0]}')
    logger.debug(f'- as tensor: {test_tensor_ds[0]}')

    # Load model weights (when no training but predictions)
    model_weights_path = os.path.join(training_args.output_dir, 'pytorch_model.bin')

    if not training_args.do_train and experiment_args.save_predictions:
        logger.info(f'Loading existing model weights from disk: {model_weights_path}')
        if os.path.exists(model_weights_path):
            model.load_state_dict(torch.load(model_weights_path))
        else:
            logger.error('Weights file does not exist!')

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tensor_ds,
        eval_dataset=test_tensor_ds,
        data_collator=DocRelDataCollator(),
        # prediction_loss_only=False,
        compute_metrics=dpt.compute_metrics,
    )

    # Log additional config (to Weights & Biases)
    if is_wandb_available():
        extra_config = {}
        extra_config.update(dataclasses.asdict(experiment_args))
        extra_config.update(dataclasses.asdict(model_args))

        wandb.config.update(extra_config, allow_val_change=True)

    if training_args.do_train:
        logger.info('Training started...')
        trainer.train()

        if isinstance(model, PreTrainedModel):
            trainer.save_model()
            tokenizer.save_pretrained(training_args.output_dir)
        elif isinstance(model, nn.Module):
            # RNN model
            torch.save(model.state_dict(), model_weights_path)

    if experiment_args.save_predictions:
        logger.info('Predicting...')
        predictions = trainer.predict(test_tensor_ds)

        df = dpt.get_df_from_predictions(test_ds, docs_ds, predictions, exclude_columns=['abstract'])

        # Save results to disk
        df.to_csv(os.path.join(training_args.output_dir, 'results.csv'), index=False)
        json.dump(predictions.metrics, open(os.path.join(training_args.output_dir, 'metrics.json'), 'w'))

    logger.info('Done')
def get_test_split(aspect, k):
    return datasets.Split(f'fold_{aspect}_{k}_test')
def get_train_split(aspect, k):
    return datasets.Split(f'fold_{aspect}_{k}_train')
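# The two helpers above compose fold-specific split names such as "fold_<aspect>_1_train",
# matching the SplitGenerator names built in the "relations" config earlier. A sketch of
# how they pair with `load_dataset`, mirroring the calls in main(); the local script path
# and the aspect name "citations" are hypothetical:
from datasets import load_dataset

fold_train = load_dataset("./relations_dataset_script.py", name="relations", split=get_train_split("citations", 1))
fold_test = load_dataset("./relations_dataset_script.py", name="relations", split=get_test_split("citations", 1))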
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" qanta_path = dl_manager.download_and_extract(_QANTA_URL) trick_path = dl_manager.download_and_extract(_TRICK_URL) return [ datasets.SplitGenerator( name=datasets.Split("guesstrain"), gen_kwargs={ "qanta_filepath": qanta_path, "trick_filepath": trick_path, "fold": "guesstrain", "mode": self.config.mode, "char_skip": self.config.char_skip, }, ), datasets.SplitGenerator( name=datasets.Split("buzztrain"), gen_kwargs={ "qanta_filepath": qanta_path, "trick_filepath": trick_path, "fold": "buzztrain", "mode": self.config.mode, "char_skip": self.config.char_skip, }, ), datasets.SplitGenerator( name=datasets.Split("guessdev"), gen_kwargs={ "qanta_filepath": qanta_path, "trick_filepath": trick_path, "fold": "guessdev", "mode": self.config.mode, "char_skip": self.config.char_skip, }, ), datasets.SplitGenerator( name=datasets.Split("buzzdev"), gen_kwargs={ "qanta_filepath": qanta_path, "trick_filepath": trick_path, "fold": "buzzdev", "mode": self.config.mode, "char_skip": self.config.char_skip, }, ), datasets.SplitGenerator( name=datasets.Split("guesstest"), gen_kwargs={ "qanta_filepath": qanta_path, "trick_filepath": trick_path, "fold": "guesstest", "mode": self.config.mode, "char_skip": self.config.char_skip, }, ), datasets.SplitGenerator( name=datasets.Split("buzztest"), gen_kwargs={ "qanta_filepath": qanta_path, "trick_filepath": trick_path, "fold": "buzztest", "mode": self.config.mode, "char_skip": self.config.char_skip, }, ), datasets.SplitGenerator( name=datasets.Split("adversarial"), gen_kwargs={ "qanta_filepath": qanta_path, "trick_filepath": trick_path, "fold": "adversarial", "mode": self.config.mode, "char_skip": self.config.char_skip, }, ), ]
def _split_generators(self, dl_manager):
    if '_cross_validation_' in self.config.name:
        return self._split_generators_for_cross_validation(dl_manager)

    dl_dir = dl_manager.download_and_extract(self.config.data_url) or ""
    task_name = _get_task_name_from_data_url(self.config.data_url)
    dl_dir = os.path.join(dl_dir, task_name)

    if self.config.name in ["axb", "axg"]:
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "data_file": os.path.join(dl_dir, "{}.jsonl".format(task_name)),
                    "split": datasets.Split.TEST,
                },
            ),
        ]

    if self.config.train_path is not None:
        train_path = dl_manager.download_and_extract(self.config.train_path)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_file": train_path,
                    "split": datasets.Split.TRAIN,
                },
            )
        ]
    elif self.config.is_few_shot:
        train_path = dl_manager.download_and_extract(os.path.join(self.config.few_shot_url, "train.jsonl"))
    else:
        train_path = os.path.join(dl_dir, "train.jsonl")

    splits = [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={
                "data_file": train_path,
                "split": datasets.Split.TRAIN,
            },
        ),
        datasets.SplitGenerator(
            name=datasets.Split.VALIDATION,
            gen_kwargs={
                "data_file": os.path.join(dl_dir, "val.jsonl"),
                "split": datasets.Split.VALIDATION,
            },
        ),
        datasets.SplitGenerator(
            name=datasets.Split.TEST,
            gen_kwargs={
                "data_file": os.path.join(dl_dir, "test.jsonl"),
                "split": datasets.Split.TEST,
            },
        ),
    ]

    if self.config.is_few_shot:
        splits.append(
            datasets.SplitGenerator(
                name=datasets.Split("unlabeled"),
                gen_kwargs={
                    "data_file": (
                        os.path.join(dl_dir, "unlabeled.jsonl")
                        if not self.config.is_few_shot
                        else dl_manager.download_and_extract(os.path.join(self.config.few_shot_url, "unlabeled.jsonl"))
                    ),
                    "split": datasets.Split("unlabeled"),
                },
            )
        )

    return splits
parser.add_argument("--RATE", type=float, default=0.5, help="learning rate") parser.add_argument("--BACKEND", default="cpu", help="backend mode") parser.add_argument("--DATASET", default="simple", help="dataset") parser.add_argument("--PLOT", default=False, help="dataset") args = parser.parse_args() PTS = args.PTS if args.DATASET == "xor": DATASET = datasets.Xor(PTS, vis=True) elif args.DATASET == "simple": DATASET = datasets.Simple(PTS, vis=True) elif args.DATASET == "split": DATASET = datasets.Split(PTS, vis=True) HIDDEN = int(args.HIDDEN) RATE = args.RATE # Change which backend to use if args.BACKEND == "cpu": BACKEND = minitorch.make_tensor_backend(minitorch.FastOps) elif args.BACKEND == "old": # Module-2 backend # You can use this to debug, but you will need to add a # Matrix multiplication @ operator BACKEND = minitorch.TensorFunctions elif args.BACKEND == "gpu":