def eval_on_file(self, data_dir: str, test_filename: str, device: str):
    """
    Evaluate the inferencer's model on a SQuAD-formatted test file.

    A dedicated ``SquadProcessor`` is created for the file. It reuses the
    tokenizer, maximum sequence length, label list and metric of the
    inferencer's own processor, so the file is preprocessed the same way as
    at inference time.

    The returned dict contains the following metrics:

    - "EM": exact match score
    - "f1": F1-Score
    - "top_n_accuracy": Proportion of predicted answers that overlap with
      correct answer

    :param data_dir: The directory in which the test set can be found
    :type data_dir: Path or str
    :param test_filename: The name of the file containing the test data in
        SQuAD format.
    :type test_filename: str
    :param device: The device on which the tensors should be processed.
        Choose from "cpu" and "cuda".
    :type device: str
    :return: dict with the keys "EM", "f1" and "top_n_accuracy", read from the
        first entry of the evaluator's result list.
    """
    source_processor = self.inferencer.processor
    tokenizer = source_processor.tokenizer
    max_seq_len = source_processor.max_seq_len
    qa_task = source_processor.tasks["question_answering"]

    # Only a test set is configured: no train/dev files and no dev split.
    file_processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        label_list=qa_task["label_list"],
        metric=qa_task["metric"],
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=test_filename,
        data_dir=Path(data_dir),
    )

    silo = DataSilo(
        processor=file_processor,
        batch_size=self.inferencer.batch_size,
        distributed=False,
    )
    file_evaluator = Evaluator(
        data_loader=silo.get_data_loader("test"),
        tasks=file_processor.tasks,
        device=device,
    )

    head_result = file_evaluator.eval(self.inferencer.model)[0]
    return {key: head_result[key] for key in ("EM", "f1", "top_n_accuracy")}
def setup_evaluator(dataset_name, data_silo, device):
    """
    Build an ``Evaluator`` for one dataset of a data silo.

    :param dataset_name: Name of the dataset whose data loader is requested
        from ``data_silo`` via ``get_data_loader``.
    :param data_silo: Silo providing the data loader and a ``processor`` with
        ``label_maps`` and ``metrics``.
    :param device: Device handed to the evaluator.
    :return: The configured ``Evaluator`` (classification report disabled).
    """
    loader = data_silo.get_data_loader(dataset_name)
    silo_processor = data_silo.processor
    return Evaluator(
        data_loader=loader,
        label_maps=silo_processor.label_maps,
        device=device,
        metrics=silo_processor.metrics,
        classification_report=False,
    )
def perform_fine_tuning(current_info_need,
                        bert_model,
                        label_list,
                        num_epochs,
                        condition,
                        folds=10,
                        stratified=True,
                        learning_rate=2e-5,
                        batch_size=32,
                        embeds_dropout_prob=.1):
    """
    Fine-tune a text classifier with k-fold cross-validation and evaluate it.

    For every fold a fresh model is trained with early stopping on
    ``f1_infoneed`` and evaluated on that fold's test split. The per-fold
    results are dumped to a JSON file, metrics over the pooled predictions of
    all folds are logged, and finally the saved model of the best fold is
    reloaded and evaluated on the test set of the original data silo.

    :param current_info_need: Label treated as the positive class
        (``pos_label``); also part of the data file names, metric name and
        output paths.
    :param bert_model: Model name or path passed to ``Tokenizer.load`` and
        ``LanguageModel.load``.
    :param label_list: All class labels; the code expects it to contain
        ``"Other"`` and ``current_info_need``.
    :param num_epochs: Number of training epochs per fold; also part of the
        file names.
    :param condition: Experiment condition identifier used in file names,
        metric name and output paths.
    :param folds: Number of cross-validation splits.
    :param stratified: Currently unused by this function.
    :param learning_rate: Learning rate handed to ``initialize_optimizer``.
    :param batch_size: Batch size of the data silo.
    :param embeds_dropout_prob: Embedding dropout probability of the model.
    :return: None. Results are logged and written to disk.
    """

    def evaluation_metrics(preds, labels):
        """Compute accuracy, per-class / averaged P, R, F1 and MCC."""
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="Other")
        f1infoneed = f1_score(y_true=labels, y_pred=preds,
                              pos_label=current_info_need)
        recall_infoneed = recall_score(y_true=labels, y_pred=preds,
                                       pos_label=current_info_need)
        precision_infoneed = precision_score(y_true=labels, y_pred=preds,
                                             pos_label=current_info_need)
        recall_other = recall_score(y_true=labels, y_pred=preds,
                                    pos_label="Other")
        precision_other = precision_score(y_true=labels, y_pred=preds,
                                          pos_label="Other")
        recall_macro = recall_score(y_true=labels, y_pred=preds,
                                    average="macro")
        precision_macro = precision_score(y_true=labels, y_pred=preds,
                                          average="macro")
        recall_micro = recall_score(y_true=labels, y_pred=preds,
                                    average="micro")
        precision_micro = precision_score(y_true=labels, y_pred=preds,
                                          average="micro")
        recall_weighted = recall_score(y_true=labels, y_pred=preds,
                                       average="weighted")
        precision_weighted = precision_score(y_true=labels, y_pred=preds,
                                             average="weighted")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        f1weighted = f1_score(y_true=labels, y_pred=preds, average="weighted")
        # Key order matches the original literal; the duplicated
        # "f1_weighted" entry has been dropped (same position, same value).
        return {
            "info_need": current_info_need,
            "model": bert_model,
            "num_epochs": num_epochs,
            "condition": condition,
            "acc": acc,
            "f1_other": f1other,
            "f1_infoneed": f1infoneed,
            "precision_infoneed": precision_infoneed,
            "recall_infoneed": recall_infoneed,
            "recall_other": recall_other,
            "precision_other": precision_other,
            "recall_macro": recall_macro,
            "precision_macro": precision_macro,
            "recall_micro": recall_micro,
            "precision_micro": precision_micro,
            "recall_weighted": recall_weighted,
            "precision_weighted": precision_weighted,
            "f1_weighted": f1weighted,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc,
        }

    # Build the metric name once so the registered name and the name handed
    # to the processor can never drift apart.
    metric = f'eval_metrics_{current_info_need}_{bert_model}_{condition}__{num_epochs}_epochs'
    register_metrics(metric, evaluation_metrics)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    logger, ml_logger = init_logging()

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=bert_model,
                               do_lower_case=False)

    # NOTE(review): `level` is not defined inside this function; it has to be
    # provided at module level for this call to work - confirm.
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        train_filename=f"{current_info_need}_{condition}_{num_epochs}_epochs_train.csv",
        test_filename=f"{current_info_need}_{condition}_{num_epochs}_epochs_test.csv",
        data_dir="data/",
        label_list=label_list,
        metric=metric,
        text_column_name="utterance",
        label_column_name=level,
        delimiter=";")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    silos = DataSiloForCrossVal.make(data_silo,
                                     n_splits=folds,
                                     sets=['train', 'test'])

    # The following steps are run for each fold of the cross-validation, so
    # they live in a function.
    def train_on_split(silo_to_use, n_fold, save_dir):
        """Train a fresh model on one fold and return the trained model."""
        logger.info(
            f"############ Crossvalidation: Fold {n_fold} ############")
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(bert_model)
        # b) and a prediction head on top that is suited for text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(
                task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(language_model=language_model,
                              prediction_heads=[prediction_head],
                              embeds_dropout_prob=embeds_dropout_prob,
                              lm_output_types=["per_sequence"],
                              device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=learning_rate,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=num_epochs,
            use_amp=None)

        # Early stopping saves the model that performs best according to the
        # given metric and stops training when there is no improvement for
        # `patience` evaluations.
        # NOTE: a separate save directory per fold makes it possible to use
        # the n best fold models in an ensemble afterwards.
        fold_save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_infoneed",
            mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=fold_save_dir,  # where to save the best model
            patience=5  # evaluations to wait for improvement before stopping
        )

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=num_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=100,
                          device=device,
                          early_stopping=earlystopping,
                          evaluator_test=False)
        trainer.train()
        return trainer.model

    # For each fold: train with early stopping, then evaluate on the fold's
    # test set. All predictions/labels are kept for pooled metrics later on.
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_info_need = -1

    # Model names such as "org/model" must not introduce extra path levels.
    language_model_name = bert_model.replace("/", "_")
    save_dir = Path(
        f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}"
    )

    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # Evaluate on the test set here (and not in the Trainer) so that the
        # actual preds and labels can be stored for a "global" evaluation
        # across all folds.
        evaluator_test = Evaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of the best fold
        f1_info_need = result[0]["f1_infoneed"]
        if f1_info_need > bestf1_info_need:
            bestf1_info_need = f1_info_need
            bestfold = num_fold

        # empty the cache to avoid memory leaks and CUDA OOM across folds
        model.cpu()
        torch.cuda.empty_cache()

    # Save the per-fold results to JSON for a separate, more detailed
    # analysis. Create the target directory first so that a missing folder
    # does not discard the results of all the training done above.
    results_path = Path(
        f"classification_results/test/{current_info_need}-{language_model_name}-{condition}-{num_epochs}_epochs-{folds}-fold-cv.results.json"
    )
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with open(results_path, "wt") as fp:
        json.dump(allresults, fp)

    # calculate overall metrics across all folds
    xval_f1_other = f1_score(all_labels, all_preds,
                             labels=label_list, pos_label="Other")
    xval_f1_info_need = f1_score(all_labels, all_preds,
                                 labels=label_list,
                                 pos_label=current_info_need)
    xval_f1_micro = f1_score(all_labels, all_preds,
                             labels=label_list, average="micro")
    xval_f1_macro = f1_score(all_labels, all_preds,
                             labels=label_list, average="macro")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    # NOTE(review): this summary (and `test_set_results` below) is built but
    # neither returned nor persisted in the code visible here - confirm
    # whether it should be.
    xval_overall_results = {
        "xval_f1_other": xval_f1_other,
        "xval_f1_infoneed": xval_f1_info_need,
        "xval_f1_micro": xval_f1_micro,
        "xval_f1_macro": xval_f1_macro,
        "xval_f1_mcc": xval_mcc
    }

    logger.info(f"XVAL F1 MICRO: {xval_f1_micro}")
    logger.info(f"XVAL F1 MACRO: {xval_f1_macro}")
    logger.info(f"XVAL F1 OTHER: {xval_f1_other}")
    logger.info(
        f"XVAL F1 {current_info_need} {condition} {num_epochs} epochs: {xval_f1_info_need}"
    )
    logger.info(f"XVAL MCC: {xval_mcc}")

    # -----------------------------------------------------
    # Just for illustration: use the model of the best cross-validation fold
    # for evaluation on the original (still unseen) test set.
    logger.info(
        "###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)

    # restore the model saved by early stopping for the best fold
    lm_name = model.language_model.name
    best_model_dir = Path(
        f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}-{bestfold}"
    )
    model = AdaptiveModel.load(best_model_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info("TEST F1 MICRO: {}".format(result[0]["f1_micro"]))
    logger.info("TEST F1 MACRO: {}".format(result[0]["f1_macro"]))
    logger.info("TEST F1 OTHER: {}".format(result[0]["f1_other"]))
    logger.info("TEST F1 {0}: {1}".format(current_info_need,
                                          result[0]["f1_infoneed"]))
    logger.info("TEST MCC: {}".format(result[0]["mcc"]))

    test_set_results = {
        "test_f1_other": result[0]["f1_other"],
        "test_f1_infoneed": result[0]["f1_infoneed"],
        "test_f1_micro": result[0]["f1_micro"],
        "test_f1_macro": result[0]["f1_macro"],
        "test_f1_mcc": result[0]["mcc"]
    }
# for each fold, run the whole training, earlystopping to get a model, then evaluate the model # on the test set of each fold # Remember all the results for overall metrics over all predictions of all folds and for averaging allresults = [] all_preds = [] all_labels = [] bestfold = None bestf1_offense = -1 save_dir = "saved_models/bert-german-doc-tutorial-es" for num_fold, silo in enumerate(silos): model = train_on_split(silo, num_fold, save_dir) # do eval on test set here (and not in Trainer), # so that we can easily store the actual preds and labels for a "global" eval across all folds. evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"), tasks=silo.processor.tasks, device=device) result = evaluator_test.eval(model, return_preds_and_labels=True) evaluator_test.log_results(result, "Test", steps=len(silo.get_data_loader("test")), num_fold=num_fold) allresults.append(result) all_preds.extend(result[0].get("preds")) all_labels.extend(result[0].get("labels")) # keep track of best fold f1_offense = result[0]["f1_offense"] if f1_offense > bestf1_offense: bestf1_offense = f1_offense
def outcome_pretraining(task_config,
                        model_name,
                        cache_dir,
                        run_name="0",
                        lr=1e-05,
                        warmup_steps=5000,
                        embeds_dropout=0.1,
                        epochs=200,  # large because we use early stopping by default
                        batch_size=20,
                        grad_acc_steps=1,
                        early_stopping_metric="loss",
                        early_stopping_mode="min",
                        early_stopping_patience=10,
                        model_class="Bert",
                        tokenizer_class="BertTokenizer",
                        do_lower_case=True,
                        do_train=True,
                        do_eval=True,
                        do_hpo=False,
                        max_seq_len=512,
                        seed=11,
                        eval_every=500,
                        use_amp=False,
                        use_cuda=True,
                        ):
    """
    Run outcome pre-training and/or evaluation as described by a task config.

    :param task_config: Path to a YAML file. The keys read here are
        ``data.data_dir``, ``data.train_filename``, ``data.dev_filename``,
        ``output_dir``, ``experiment_name`` and ``log_dir``.
    :param model_name: Model name or path used for tokenizer and language
        model; when ``do_train`` is False it is also the directory the model
        for evaluation is loaded from.
    :param cache_dir: Cache directory handed to processor and data silo.
    :param run_name: Suffix for the experiment directory and MLFlow run name.
    :param lr: Learning rate.
    :param warmup_steps: Warm-up steps of the ``LinearWarmup`` schedule.
    :param embeds_dropout: Embedding dropout probability of the model.
    :param epochs: Maximum number of training epochs.
    :param batch_size: Batch size of the data silo.
    :param grad_acc_steps: Gradient accumulation steps.
    :param early_stopping_metric: Metric watched by early stopping.
    :param early_stopping_mode: Mode passed to ``EarlyStopping``; the string
        ``"none"`` disables early stopping.
    :param early_stopping_patience: Patience passed to ``EarlyStopping``.
    :param model_class: Language model class name; ``"Bert"`` selects a
        ``NextSentenceHead``, anything else a two-label
        ``TextClassificationHead``.
    :param tokenizer_class: Tokenizer class name for ``Tokenizer.load``.
    :param do_lower_case: Whether the tokenizer lower-cases input.
    :param do_train: Run the training part.
    :param do_eval: Run the evaluation part on the test set.
    :param do_hpo: If True, the save directory is named after the current
        tune trial and scores are reported through ``tune.report``.
    :param max_seq_len: Maximum sequence length of the processor.
    :param seed: Random seed.
    :param eval_every: Evaluate every this many steps during training.
    :param use_amp: AMP setting forwarded to device and optimizer setup.
    :param use_cuda: Whether to use CUDA.
    :return: None.
    """
    # Load task config; the context manager closes the file handle, which the
    # previous `yaml.safe_load(open(...))` left open.
    with open(task_config) as config_file:
        task_config = yaml.safe_load(config_file)
    data_dir = Path(task_config["data"]["data_dir"])

    # General settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda,
                                               use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name,
                               tokenizer_class=tokenizer_class,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text
    #    into a pytorch Dataset
    processor = OutcomePretrainingProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=data_dir,
        train_filename=task_config["data"]["train_filename"],
        dev_filename=task_config["data"]["dev_filename"],
        seed=seed,
        max_size_admission=50,
        max_size_discharge=50,
        cache_dir=cache_dir)

    # 3. Create a DataSilo that loads several datasets (train/dev/test),
    #    provides DataLoaders for them and calculates a few descriptive
    #    statistics of our datasets
    data_silo = OutcomePretrainingDataSilo(
        processor=processor,
        caching=True,
        cache_dir=cache_dir,
        batch_size=batch_size,
        max_multiprocessing_chunksize=200)

    if do_train:
        # Set save dir for experiment output
        save_dir = Path(task_config["output_dir"]) / f'{task_config["experiment_name"]}_{run_name}'

        # HPO runs get one directory per trial, other runs a random name
        if do_hpo:
            save_dir = save_dir / tune.session.get_trial_name()
        else:
            exp_name = f"exp_{random.randint(100000, 999999)}"
            save_dir = save_dir / exp_name

        # Create save dir; exist_ok avoids the check-then-create race of a
        # separate os.path.exists() test.
        os.makedirs(save_dir, exist_ok=True)

        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(
            experiment_name=task_config["experiment_name"],
            run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(model_name,
                                            language_model_class=model_class)

        # b) and a NextSentenceHead, or a TextClassificationHead if it is not
        #    a Bert model
        if model_class == "Bert":
            # NOTE(review): this passes the class name ("Bert") rather than
            # `model_name` to NextSentenceHead.load - confirm this is intended.
            next_sentence_head = NextSentenceHead.load(model_class)
        else:
            next_sentence_head = TextClassificationHead(num_labels=2)

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[next_sentence_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=["per_sequence"],
            device=device,
        )

        # 5. Create an optimizer
        schedule_opts = {"name": "LinearWarmup",
                         "num_warmup_steps": warmup_steps}
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(
                mode=early_stopping_mode,
                min_delta=0.0001,
                save_dir=save_dir,
                metric=early_stopping_metric,
                patience=early_stopping_patience
            )

        # 7. Feed everything to the Trainer, which trains the model and
        #    evaluates it from time to time
        trainer = ExtendedTrainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=eval_every,
            early_stopping=early_stopping,
            device=device,
            grad_acc_steps=grad_acc_steps,
            evaluator_test=do_eval
        )

        def score_callback(eval_score, train_loss):
            """Report dev score and train loss to the HPO framework."""
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save the final model and processor
        model.save(save_dir / "final_model")
        processor.save(save_dir / "final_model")

    if do_eval:
        # Load the newly trained model or an existing model
        if do_train:
            model_dir = save_dir
        else:
            model_dir = Path(model_name)

        logger.info("###### Eval on TEST SET #####")

        evaluator_test = Evaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device
        )

        # Load trained model for evaluation
        model = AdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results,
                          dataset_name="test",
                          steps=len(evaluator_test.data_loader),
                          save_path=model_dir / "eval_results.txt")
def eval(
    self,
    document_store: BaseDocumentStore,
    device: str,
    label_index: str = "label",
    doc_index: str = "eval_document",
    label_origin: str = "gold_label",
):
    """
    Performs evaluation on evaluation documents in the DocumentStore.

    The labels are fetched from the store, grouped per document and per
    question into SQuAD-style dicts, converted into a dataset by the
    inferencer's processor and scored with an ``Evaluator``.

    Returns a dict containing the following metrics:

    - "EM": Proportion of exact matches of predicted answers with their
      corresponding correct answers
    - "f1": Average overlap between predicted answers and their
      corresponding correct answers
    - "top_n_accuracy": Proportion of predicted answers that match with
      correct answer

    :param document_store: DocumentStore containing the evaluation documents
    :param device: The device on which the tensors should be processed.
        Choose from "cpu" and "cuda".
    :param label_index: Index/Table name where labeled questions are stored
    :param doc_index: Index/Table name where documents that are used for
        evaluation are stored
    :param label_origin: Value used for the "origin" filter when the labels
        are fetched from the document store
    """
    # Fetch every label with the requested origin
    gold_labels = document_store.get_all_labels(
        index=label_index, filters={"origin": [label_origin]})

    # Group the labels by the document they refer to
    labels_by_doc = defaultdict(list)
    for gold in gold_labels:
        if gold.document_id:
            labels_by_doc[gold.document_id].append(gold)
        else:
            logger.error("Label does not contain a document_id")

    # Build one SQuAD-style dict per document
    squad_docs: Dict[str, Any] = {}
    for doc_id, doc_labels in labels_by_doc.items():
        doc = document_store.get_document_by_id(doc_id, index=doc_index)
        if not doc:
            logger.error(
                f"Document with the ID '{doc_id}' is not present in the document store."
            )
            continue

        entry = {"context": doc.text}
        squad_docs[str(doc_id)] = entry

        # Collect all answers of this document, one record per question
        qas_by_question: Dict[str, Any] = {}
        for gold in doc_labels:
            answer = {
                "text": gold.answer,
                "answer_start": gold.offset_start_in_doc
            }
            if gold.question in qas_by_question:
                qas_by_question[gold.question]["answers"].append(answer)
            else:
                qas_by_question[gold.question] = {
                    "id": str(hash(str(doc_id) + gold.question)),
                    "question": gold.question,
                    "answers": [answer]
                }

        # The question text was only needed as grouping key
        entry["qas"] = list(qas_by_question.values())

    # Input format expected by FARM
    farm_input = list(squad_docs.values())

    # Data loader that can be passed to the Evaluator
    dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(
        farm_input, indices=range(len(farm_input)))
    loader = NamedDataLoader(dataset=dataset,
                             batch_size=self.inferencer.batch_size,
                             tensor_names=tensor_names)

    store_evaluator = Evaluator(data_loader=loader,
                                tasks=self.inferencer.processor.tasks,
                                device=device)
    head_result = store_evaluator.eval(self.inferencer.model)[0]
    return {key: head_result[key] for key in ("EM", "f1", "top_n_accuracy")}
def question_answering_crossvalidation():
    """
    Run a k-fold cross-validation of a question-answering model.

    For each fold a pre-trained QA model is fine-tuned on the fold's training
    data and evaluated on the fold's test data. Per-fold EM / F1 / top-n
    accuracy are logged, and the predictions of all folds are pooled to
    compute overall scores, which are logged and sent to the MLFlow logger.
    """
    # ------------------------------------------------------------------
    # Logging
    # ------------------------------------------------------------------
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # Local MLFlow logging. Public-server alternative:
    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="QA_X-Validation", run_name="Squad_Roberta_Base")

    # ------------------------------------------------------------------
    # Settings
    # ------------------------------------------------------------------
    save_per_fold_results = False  # unsupported for now
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False

    n_epochs = 2
    batch_size = 80
    learning_rate = 3e-5
    use_amp = None

    data_dir = Path("../data/covidqa")
    filename = "COVID-QA.json"
    xval_folds = 5
    dev_split = 0
    evaluate_every = 0
    # use large negative values to disable giving "no answer" option
    no_ans_boost = -100
    # accuracy at n is useful for answers inside long documents
    accuracy_at = 3

    # ------------------------------------------------------------------
    # k-fold cross-validation
    # ------------------------------------------------------------------
    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text
    #    into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=filename,
        dev_filename=None,
        dev_split=dev_split,
        test_filename=None,
        data_dir=data_dir,
        doc_stride=192,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test),
    #    provides DataLoaders for them and calculates a few descriptive
    #    statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # One silo per cross-validation fold
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    def train_on_split(silo_to_use, n_fold):
        """Fine-tune a fresh QA model on one fold and return it."""
        logger.info(
            f"############ Crossvalidation: Fold {n_fold} ############")

        # fine-tune a pre-trained question-answering model
        fold_model = AdaptiveModel.convert_from_transformers(
            lang_model, device=device, task_type="question_answering")
        fold_model.connect_heads_with_processor(data_silo.processor.tasks,
                                                require_labels=True)
        qa_head = fold_model.prediction_heads[0]
        # If positive, this will boost "No Answer" as prediction.
        # If negative, this will prevent the model from giving "No Answer"
        # as prediction.
        qa_head.no_ans_boost = no_ans_boost
        # Number of predictions the model will make per question.
        # The multiple predictions are used for evaluating top n recall.
        qa_head.n_best = accuracy_at

        # Alternative: train a question-answering model from scratch
        # language_model = LanguageModel.load(lang_model)
        # prediction_head = QuestionAnsweringHead(no_ans_boost=no_ans_boost, n_best=accuracy_at)
        # fold_model = AdaptiveModel(
        #     language_model=language_model,
        #     prediction_heads=[prediction_head],
        #     embeds_dropout_prob=0.1,
        #     lm_output_types=["per_token"],
        #     device=device,)

        # Create an optimizer
        fold_model, optimizer, lr_schedule = initialize_optimizer(
            model=fold_model,
            learning_rate=learning_rate,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Hand everything to the Trainer, which trains the model and
        # evaluates it from time to time
        trainer = Trainer(model=fold_model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          evaluator_test=False)
        trainer.train()
        return trainer.model

    # Train and evaluate fold by fold; keep everything needed for pooled
    # metrics over all folds and for per-fold reporting.
    all_results = []
    pooled_preds = []
    pooled_labels = []
    fold_f1 = []
    fold_em = []
    fold_top_n = []

    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold)

        # Evaluate on the test set here (and not in the Trainer) so that the
        # actual preds and labels can be stored for a "global" evaluation
        # across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   logging=False,
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        head_result = result[0]
        all_results.append(result)
        pooled_preds.extend(head_result.get("preds"))
        pooled_labels.extend(head_result.get("labels"))
        fold_f1.append(head_result["f1"])
        fold_em.append(head_result["EM"])
        fold_top_n.append(head_result["top_n_accuracy"])

        # empty the cache to avoid memory leaks and CUDA OOM across folds
        model.cpu()
        torch.cuda.empty_cache()

    # Saving the per-fold results to JSON for a separate, more detailed
    # analysis is currently not supported.
    # TODO: adjust to QAPred and QACandidate objects
    # if save_per_fold_results:
    #     def convert_numpy_dtype(obj):
    #         if type(obj).__module__ == "numpy":
    #             return obj.item()
    #
    #         raise TypeError("Unknown type:", type(obj))
    #
    #     with open("qa_xval.results.json", "wt") as fp:
    #         json.dump(all_results, fp, default=convert_numpy_dtype)

    # calculate overall metrics across all folds
    xval_score = squad(preds=pooled_preds, labels=pooled_labels)

    logger.info(f"Single EM-Scores: {fold_em}")
    logger.info(f"Single F1-Scores: {fold_f1}")
    logger.info(
        f"Single top_{accuracy_at}_accuracy Scores: {fold_top_n}")
    logger.info(f"XVAL EM: {xval_score['EM']}")
    logger.info(f"XVAL f1: {xval_score['f1']}")
    logger.info(
        f"XVAL top_{accuracy_at}_accuracy: {xval_score['top_n_accuracy']}")

    overall_metrics = (
        ("XVAL EM", xval_score["EM"]),
        ("XVAL f1", xval_score["f1"]),
        (f"XVAL top_{accuracy_at}_accuracy", xval_score["top_n_accuracy"]),
    )
    for metric_name, metric_value in overall_metrics:
        ml_logger.log_metrics({metric_name: metric_value}, 0)
def test_evaluation():
    """
    Benchmark QA evaluation accuracy and speed on the SQuAD 2.0 dev set.

    Part 1 evaluates the model with FARM's internal ``Evaluator`` and checks
    EM, F1, top-1 recall and runtime against reference values. Part 2 writes
    predictions with an ``Inferencer``, scores them with the official SQuAD
    evaluation script and checks EM, F1 and runtime against reference values.

    The reference runtimes were measured on 4x V100.
    """
    ##########################
    ########## Settings
    ##########################
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False

    test_assertions = True

    data_dir = Path("testsave/data/squad20")
    evaluation_filename = "dev-v2.0.json"
    predictions_path = "testsave/predictions.json"

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # loading models and evals
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="question_answering")
    model.prediction_heads[0].no_ans_boost = 0
    model.prediction_heads[0].n_best = 1

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    # The timer deliberately includes data loading and preprocessing.
    starttime = time()

    data_silo = DataSilo(processor=processor, batch_size=50)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    # 1. Test FARM internal evaluation
    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnrecall = results[0]["top_n_recall"] * 100
    elapsed = time() - starttime
    print(results)
    print(elapsed)

    gold_EM = 77.7478
    gold_f1 = 82.1557
    gold_tnrecall = 84.0646  # top 1 recall
    gold_elapsed = 70  # 4x V100
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=f"FARM Eval changed for EM by: {em_score-gold_EM}")
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=f"FARM Eval changed for f1 score by: {f1_score-gold_f1}")
        # The message reports the recall delta (it used to print the EM
        # delta, which made failures of this check misleading).
        np.testing.assert_allclose(
            tnrecall,
            gold_tnrecall,
            rtol=0.001,
            err_msg=
            f"FARM Eval changed for top 1 recall by: {tnrecall-gold_tnrecall}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )

    # 2. Test FARM predictions with outside eval script
    starttime = time()
    inferencer = Inferencer(model=model,
                            processor=processor,
                            task_type="question_answering",
                            batch_size=50,
                            gpu=device.type == "cuda")
    filename = data_dir / evaluation_filename
    result = inferencer.inference_from_file(file=filename)

    elapsed = time() - starttime

    # Create the directory the predictions are actually written to (the
    # previous code created "../testsave" but wrote into "testsave/").
    os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
    write_squad_predictions(predictions=result,
                            predictions_filename=filename,
                            out_filename=predictions_path)
    script_params = {
        "data_file": filename,
        "pred_file": predictions_path,
        "na_prob_thresh": 1,
        "na_prob_file": False,
        "out_file": False
    }
    results_official = squad_evaluation.main(OPTS=DotMap(script_params))
    f1_score = results_official["f1"]
    em_score = results_official["exact"]

    gold_EM = 78.4890
    gold_f1 = 81.7104
    gold_elapsed = 66  # 4x V100
    print(elapsed)
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for EM by: {em_score - gold_EM}"
        )
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for f1 score by: {f1_score - gold_f1}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"Inference speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )
def train(self):
    """
    Perform the training procedure.

    The training is visualized by a progress bar. It counts the epochs in a zero based manner.
    For example, when you specify ``epochs=20`` it starts to count from 0 to 19.

    If trainer evaluates the model with a test set the result of the
    evaluation is stored in ``test_result``.

    If ``self.sigterm_handler`` reports a pending SIGTERM, rank 0 (or the single
    non-distributed process) saves a checkpoint and leaves via ``sys.exit(0)``.

    :return: Returns the model after training. When you do ``early_stopping``
        with a ``save_dir`` the best model is loaded and returned.
    """
    # connect the prediction heads with the right output from processor
    self.model.connect_heads_with_processor(self.data_silo.processor.tasks, require_labels=True)

    # Check that the tokenizer(s) fits the language model(s)
    if hasattr(self.model, "language_model2"):
        self.model.verify_vocab_size(
            vocab_size1=len(self.data_silo.processor.query_tokenizer),
            vocab_size2=len(self.data_silo.processor.passage_tokenizer))
    else:
        self.model.verify_vocab_size(vocab_size=len(self.data_silo.processor.tokenizer))
    self.model.train()

    do_stopping = False
    evalnr = 0
    loss = 0
    # Step of the checkpoint we resume from; reset to None once it has been reached.
    resume_from_step = self.from_step

    if self.local_rank in [0, -1]:
        logger.info(f"\n {GROWING_TREE}")

    for epoch in range(self.from_epoch, self.epochs):
        early_break = False
        self.from_epoch = epoch
        train_data_loader = self.data_silo.get_data_loader("train")
        progress_bar = tqdm(train_data_loader,
                            disable=self.local_rank not in [0, -1] or self.disable_tqdm)
        for step, batch in enumerate(progress_bar):
            # when resuming training from a checkpoint, we want to fast forward to the step of the checkpoint
            if resume_from_step and step <= resume_from_step:
                # TODO: Improve skipping for StreamingDataSilo
                # The seeds before and within the loop are currently needed, if you need full reproducibility
                # of runs with vs. without checkpointing using StreamingDataSilo. Reason: While skipping steps
                # in StreamingDataSilo, we update the state of the random number generator (e.g. due to masking
                # words), which can impact the model behaviour (e.g. dropout)
                if step % 10000 == 0:
                    logger.info(f"Skipping {step} out of {resume_from_step} steps ...")
                if resume_from_step == step:
                    logger.info(f"Finished skipping {resume_from_step} steps ...")
                    resume_from_step = None
                else:
                    continue

            progress_bar.set_description(
                f"Train epoch {epoch}/{self.epochs-1} (Cur. train loss: {loss:.4f})")

            # Only for distributed training: we need to ensure that all ranks still have a batch left for training
            if self.local_rank != -1:
                if not self._all_ranks_have_data(has_data=1, step=step):
                    early_break = True
                    break

            # Move batch of samples to device
            batch = {key: batch[key].to(self.device) for key in batch}

            # Forward & backward pass through model
            logits = self.model.forward(**batch)
            per_sample_loss = self.model.logits_to_loss(
                logits=logits, global_step=self.global_step, **batch)
            loss = self.backward_propagate(per_sample_loss, step)

            # Perform evaluation (only on rank 0 / non-distributed, never at global step 0)
            if (self.evaluate_every != 0
                    and self.global_step % self.evaluate_every == 0
                    and self.global_step != 0
                    and self.local_rank in [0, -1]):
                # When using StreamingDataSilo, each evaluation creates a new instance of
                # dev_data_loader. In cases like training from scratch, this could cause
                # some variance across evaluators due to the randomness in word masking.
                dev_data_loader = self.data_silo.get_data_loader("dev")
                if dev_data_loader is not None:
                    evaluator_dev = Evaluator(
                        data_loader=dev_data_loader,
                        tasks=self.data_silo.processor.tasks,
                        device=self.device,
                        report=self.eval_report)
                    evalnr += 1
                    result = evaluator_dev.eval(self.model)
                    evaluator_dev.log_results(result, "Dev", self.global_step)
                    if self.early_stopping:
                        do_stopping, save_model, eval_value = self.early_stopping.check_stopping(result)
                        if save_model:
                            logger.info(
                                "Saving current best model to {}, eval={}".format(
                                    self.early_stopping.save_dir, eval_value))
                            self.model.save(self.early_stopping.save_dir)
                            self.data_silo.processor.save(self.early_stopping.save_dir)
                        if do_stopping:
                            # log the stopping
                            logger.info(
                                "STOPPING EARLY AT EPOCH {}, STEP {}, EVALUATION {}".format(
                                    epoch, step, evalnr))
            if do_stopping:
                break

            self.global_step += 1
            self.from_step = step + 1

            # save the current state as a checkpoint before exiting if a SIGTERM signal is received
            if self.sigterm_handler and self.sigterm_handler.kill_now:
                logger.info(
                    "Received a SIGTERM signal. Saving the current train state as a checkpoint ...")
                if self.local_rank in [0, -1]:
                    self._save()
                    # A process group only exists in distributed runs. Destroying a
                    # non-existent one raises and would prevent the clean exit below.
                    if torch.distributed.is_available() and torch.distributed.is_initialized():
                        torch.distributed.destroy_process_group()
                    sys.exit(0)

            # save a checkpoint and continue train
            if self.checkpoint_every and step % self.checkpoint_every == 0:
                if self.local_rank in [0, -1]:
                    self._save()
                # Let other ranks wait until rank 0 has finished saving
                if self.local_rank != -1:
                    torch.distributed.barrier()

        if do_stopping:
            break

        # Only for distributed training: we need to ensure that all ranks still have a batch left for training
        if self.local_rank != -1 and not early_break:
            self._all_ranks_have_data(has_data=False)

    # With early stopping we want to restore the best model
    if self.early_stopping and self.early_stopping.save_dir:
        logger.info("Restoring best model so far from {}".format(self.early_stopping.save_dir))
        lm_name = self.model.language_model.name
        self.model = AdaptiveModel.load(self.early_stopping.save_dir, self.device, lm_name=lm_name)
        self.model.connect_heads_with_processor(self.data_silo.processor.tasks, require_labels=True)

    # Eval on test set
    if self.evaluator_test and self.local_rank in [0, -1]:
        test_data_loader = self.data_silo.get_data_loader("test")
        if test_data_loader is not None:
            evaluator_test = Evaluator(
                data_loader=test_data_loader,
                tasks=self.data_silo.processor.tasks,
                device=self.device)
            self.test_result = evaluator_test.eval(self.model)
            evaluator_test.log_results(self.test_result, "Test", self.global_step)
    return self.model
def eval(
    self,
    document_store: BaseDocumentStore,
    device: str,
    label_index: str = "label",
    doc_index: str = "eval_document",
    label_origin: str = "gold_label",
):
    """
    Performs evaluation on evaluation documents in the DocumentStore.

    Returns a dict containing the following metrics:
        - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers
        - "f1": Average overlap between predicted answers and their corresponding correct answers
        - "top_n_accuracy": Proportion of predicted answers that overlap with correct answer
        - "top_n": ``n_best`` of the model's first prediction head
        - "reader_time": Seconds spent on building the dataset and running the evaluator
        - "seconds_per_query": ``reader_time`` divided by the number of evaluated questions

    :param document_store: DocumentStore containing the evaluation documents
    :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
    :param label_index: Index/Table name where labeled questions are stored
    :param doc_index: Index/Table name where documents that are used for evaluation are stored
    :param label_origin: Only labels whose "origin" equals this value are used (passed as a filter
                         to ``document_store.get_all_labels``)
    """
    if self.top_k_per_candidate != 4:
        logger.info(
            f"Performing Evaluation using top_k_per_candidate = {self.top_k_per_candidate} \n"
            f"and consequently, QuestionAnsweringPredictionHead.n_best = {self.top_k_per_candidate + 1}. \n"
            f"This deviates from FARM's default where QuestionAnsweringPredictionHead.n_best = 5"
        )

    # extract all questions for evaluation
    filters = {"origin": [label_origin]}
    labels = document_store.get_all_labels(index=label_index, filters=filters)

    # Group all answer labels by the document they belong to
    aggregated_per_doc = defaultdict(list)
    for label in labels:
        if not label.document_id:
            logger.error("Label does not contain a document_id")
            continue
        aggregated_per_doc[label.document_id].append(label)

    # Hack to fix problem where duplicate questions are merged by doc_store processing
    # creating a QA example with 8 annotations > 6 annotation max
    max_answers_per_question = 6

    # Create squad style dicts
    d: Dict[str, Any] = {}
    all_doc_ids = [x.id for x in document_store.get_all_documents(doc_index)]
    for doc_id in all_doc_ids:
        doc = document_store.get_document_by_id(doc_id, index=doc_index)
        if not doc:
            logger.error(f"Document with the ID '{doc_id}' is not present in the document store.")
            continue
        d[str(doc_id)] = {"context": doc.text}

        # get all questions / answers of this document, keyed by question text
        aggregated_per_question: Dict[str, Any] = {}
        # .get() keeps the lookup from inserting empty entries into the defaultdict
        for label in aggregated_per_doc.get(doc_id, ()):
            answer = {
                "text": label.answer,
                "answer_start": label.offset_start_in_doc,
            }
            existing = aggregated_per_question.get(label.question)
            if existing is None:
                # create new one
                aggregated_per_question[label.question] = {
                    "id": str(hash(str(doc_id) + label.question)),
                    "question": label.question,
                    "answers": [answer],
                }
            elif len(existing["answers"]) < max_answers_per_question:
                # add to existing answers
                existing["answers"].append(answer)

        # Get rid of the question key again (after we aggregated we don't need it anymore)
        d[str(doc_id)]["qas"] = list(aggregated_per_question.values())

    # Convert input format for FARM
    farm_input = list(d.values())
    n_queries = sum(len(x["qas"]) for x in farm_input)

    # Create DataLoader that can be passed to the Evaluator
    tic = perf_counter()
    indices = range(len(farm_input))
    dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(farm_input, indices=indices)
    data_loader = NamedDataLoader(dataset=dataset,
                                  batch_size=self.inferencer.batch_size,
                                  tensor_names=tensor_names)

    evaluator = Evaluator(data_loader=data_loader,
                          tasks=self.inferencer.processor.tasks,
                          device=device)

    eval_results = evaluator.eval(self.inferencer.model)
    toc = perf_counter()
    reader_time = toc - tic
    results = {
        "EM": eval_results[0]["EM"],
        "f1": eval_results[0]["f1"],
        "top_n_accuracy": eval_results[0]["top_n_accuracy"],
        "top_n": self.inferencer.model.prediction_heads[0].n_best,
        "reader_time": reader_time,
        # NOTE: raises ZeroDivisionError when no labelled question was found
        "seconds_per_query": reader_time / n_queries,
    }
    return results
def __init__(
    self,
    model,
    optimizer,
    data_silo,
    epochs,
    n_gpu,
    device,
    lr_schedule=None,
    evaluate_every=100,
    evaluator_dev=None,
    evaluator_test=None,
    use_amp=None,
    grad_acc_steps=1,
    local_rank=-1,
    early_stopping=None,
    log_learning_rate=False,
    checkpoint_on_sigterm=False,
    checkpoint_every=None,
    checkpoint_root_dir=None,
    checkpoints_to_keep=3,
    from_epoch=0,
    from_step=0,
):
    """
    :param model: The model to be trained; stored as ``self.model``.
    :param optimizer: An optimizer object that determines the learning strategy to be used during training
    :param data_silo: A DataSilo object that will contain the train, dev and test datasets as PyTorch DataLoaders
    :type data_silo: DataSilo
    :param epochs: How many times the training procedure will loop through the train dataset
    :type epochs: int
    :param n_gpu: The number of gpus available for training and evaluation.
    :type n_gpu: int
    :param device: The device on which the train, dev and test tensors should be hosted. Choose from "cpu" and "cuda".
    :param lr_schedule: An optional scheduler object that can regulate the learning rate of the optimizer
    :param evaluate_every: Perform dev set evaluation after this many steps of training.
    :type evaluate_every: int
    :param evaluator_dev: Evaluator for dev set. Options:
                          `None` (Default) => will init a new evaluator, if there's a dev set in the DataSilo
                          `Evaluator Object` => use the manually supplied evaluator
                          `False` => Don't use any evaluator
    :type evaluator_dev: Evaluator, None or False
    :param evaluator_test: Evaluator for test set. Options:
                           `None` (Default) => will init a new evaluator, if there's a test set in the DataSilo
                           `Evaluator Object` => use the manually supplied evaluator
                           `False` => Don't use any evaluator
    :type evaluator_test: Evaluator, None or False
    :param use_amp: Whether to use automatic mixed precision with Apex. One of the optimization levels must be chosen.
                    "O1" is recommended in almost all cases.
    :type use_amp: str
    :param grad_acc_steps: Stored as ``self.grad_acc_steps``. NOTE(review): presumably the number of batches
                           over which gradients are accumulated before an optimizer step - confirm.
    :type grad_acc_steps: int
    :param local_rank: Stored as ``self.local_rank``. NOTE(review): presumably the process rank in distributed
                       training, with -1 meaning non-distributed - confirm.
    :type local_rank: int
    :param early_stopping: an initialized EarlyStopping object to control early stopping and saving of best models.
    :type early_stopping: EarlyStopping
    :param log_learning_rate: Whether to log learning rate to Mlflow
    :type log_learning_rate: bool
    :param checkpoint_on_sigterm: save a checkpoint for the Trainer when a SIGTERM signal is sent. The checkpoint
           can be used to resume training. It is useful in frameworks like AWS SageMaker with Spot instances where
           a SIGTERM notifies to save the training state and subsequently the instance is terminated.
    :type checkpoint_on_sigterm: bool
    :param checkpoint_every: save a train checkpoint after this many steps of training.
    :type checkpoint_every: int
    :param checkpoint_root_dir: the Path of directory where all train checkpoints are saved. For each individual
           checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is created.
           Required when ``checkpoint_every`` or ``checkpoint_on_sigterm`` is used.
    :type checkpoint_root_dir: Path
    :param checkpoints_to_keep: maximum number of train checkpoints to save.
    :type checkpoints_to_keep: int
    :param from_epoch: the epoch number to start the training from. In the case when training resumes from a saved
           checkpoint, it is used to fast-forward training to the last epoch in the checkpoint.
    :type from_epoch: int
    :param from_step: the step number to start the training from. In the case when training resumes from a saved
           checkpoint, it is used to fast-forward training to the last step in the checkpoint.
    :type from_step: int
    :raises ImportError: if ``use_amp`` is set but Apex is not available.
    :raises ValueError: if checkpointing is requested without ``checkpoint_root_dir``.
    """
    self.model = model
    self.data_silo = data_silo
    self.epochs = int(epochs)
    self.optimizer = optimizer
    self.evaluate_every = evaluate_every
    self.n_gpu = n_gpu
    self.grad_acc_steps = grad_acc_steps
    self.use_amp = use_amp
    self.lr_schedule = lr_schedule
    self.data_loader_train = data_silo.get_data_loader("train")
    self.device = device
    self.local_rank = local_rank
    self.log_params()
    self.early_stopping = early_stopping
    self.log_learning_rate = log_learning_rate

    if use_amp and not AMP_AVAILABLE:
        raise ImportError(f'Got use_amp = {use_amp}, but cannot find apex. '
                          'Please install Apex if you want to make use of automatic mixed precision. '
                          'https://github.com/NVIDIA/apex')

    self.checkpoint_on_sigterm = checkpoint_on_sigterm
    if checkpoint_on_sigterm:
        self.sigterm_handler = GracefulKiller()
    else:
        self.sigterm_handler = None
    self.checkpoint_root_dir = checkpoint_root_dir
    self.checkpoints_to_keep = checkpoints_to_keep
    self.checkpoint_every = checkpoint_every
    # The messages name the actual parameter so the caller knows what to pass.
    if self.checkpoint_every and not checkpoint_root_dir:
        raise ValueError("checkpoint_root_dir needs to be supplied when using checkpoint_every.")
    if checkpoint_on_sigterm and not checkpoint_root_dir:
        raise ValueError("checkpoint_root_dir needs to be supplied when using checkpoint_on_sigterm.")

    self.from_epoch = from_epoch
    self.from_step = from_step
    # With the default from_epoch=0 / from_step=0 this evaluates to -1.
    # NOTE(review): for resumed runs the product is not an obvious step count - confirm the formula.
    self.global_step = (from_epoch * from_step) - 1

    # evaluator on dev set: only build one when the caller passed None and a dev loader exists
    if evaluator_dev is None:
        dev_data_loader = self.data_silo.get_data_loader("dev")
        if dev_data_loader:
            evaluator_dev = Evaluator(
                data_loader=dev_data_loader,
                tasks=self.data_silo.processor.tasks,
                device=device,
            )
    self.evaluator_dev = evaluator_dev

    # evaluator on test set: same rule as for the dev set
    if evaluator_test is None:
        test_data_loader = self.data_silo.get_data_loader("test")
        if test_data_loader:
            evaluator_test = Evaluator(
                data_loader=test_data_loader,
                tasks=self.data_silo.processor.tasks,
                device=device,
            )
    self.evaluator_test = evaluator_test
def dense_passage_retrieval():
    """
    Example script: train a bi-encoder for dense passage retrieval and evaluate it.

    Sets up logging and MLflow tracking, builds query/passage tokenizers, a
    TextSimilarityProcessor, a DataSilo and a BiAdaptiveModel with a
    TextSimilarityHead, trains it with a Trainer, saves model and processor to
    ``../saved_models/dpr-tutorial`` and finally runs an Evaluator on the test
    loader if one exists. Returns nothing.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(experiment_name="FARM-dense_passage_retrieval", run_name="Run_dpr")

    # ------------------------------------------------------------------
    # Settings
    # ------------------------------------------------------------------
    set_all_seeds(seed=42)
    batch_size = 4
    n_epochs = 3
    distributed = False  # enable for multi GPU training via DDP
    evaluate_every = 1000
    question_lang_model = "facebook/dpr-question_encoder-single-nq-base"
    passage_lang_model = "facebook/dpr-ctx_encoder-single-nq-base"
    do_lower_case = True
    use_fast = True
    embed_title = True
    num_hard_negatives = 1
    similarity_function = "dot_product"
    train_filename = "nq-train.json"
    dev_filename = "nq-dev.json"
    test_filename = "nq-dev.json"
    max_samples = None  # load a smaller dataset (e.g. for debugging)

    # For multi GPU Training via DDP we need to get the local rank
    cli_args = parse_arguments()
    device, n_gpu = initialize_device_settings(use_cuda=True, local_rank=cli_args.local_rank)

    # 1. Question and passage tokenizers share the same casing / fast-tokenizer options
    tokenizer_options = {"do_lower_case": do_lower_case, "use_fast": use_fast}
    query_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=question_lang_model, **tokenizer_options)
    passage_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=passage_lang_model, **tokenizer_options)

    # 2. DataProcessor that converts raw text into a pytorch Dataset.
    # data_dir "data/retriever" should contain DPR training and dev files downloaded from
    # https://github.com/facebookresearch/DPR
    # i.e., nq-train.json, nq-dev.json or trivia-train.json, trivia-dev.json
    label_list = ["hard_negative", "positive"]
    metric = "text_similarity_metric"
    processor = TextSimilarityProcessor(
        query_tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=64,
        max_seq_len_passage=256,
        label_list=label_list,
        metric=metric,
        data_dir="../data/retriever",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        embed_title=embed_title,
        num_hard_negatives=num_hard_negatives,
        max_samples=max_samples,
    )

    # 3. DataSilo: loads the train/dev/test datasets and provides DataLoaders for them
    # NOTE: In FARM, the dev set metrics differ from test set metrics in that they are
    # calculated on a token level instead of a word level
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=distributed)

    # 4. BiAdaptiveModel
    # a) two pretrained language models as a basis, both started from the same checkpoint
    base_checkpoint = "bert-base-uncased"
    question_language_model = LanguageModel.load(
        pretrained_model_name_or_path=base_checkpoint,
        language_model_class="DPRQuestionEncoder",
    )
    passage_language_model = LanguageModel.load(
        pretrained_model_name_or_path=base_checkpoint,
        language_model_class="DPRContextEncoder",
    )

    # b) a prediction head on top that scores query/passage similarity
    prediction_head = TextSimilarityHead(similarity_function=similarity_function)

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    # 5. Optimizer and learning rate schedule
    optimizer_opts = {
        "name": "TransformersAdamW",
        "correct_bias": True,
        "weight_decay": 0.0,
        "eps": 1e-08,
    }
    schedule_opts = {"name": "LinearWarmup", "num_warmup_steps": 100}
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        optimizer_opts=optimizer_opts,
        schedule_opts=schedule_opts,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        grad_acc_steps=1,
        device=device,
        distributed=distributed,
    )

    # 6. The Trainer takes care of training the model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server:
    # https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Store the trained model together with its processor
    output_dir = Path("../saved_models/dpr-tutorial")
    model.save(output_dir)
    processor.save(output_dir)

    # 9. Evaluate on the test set, if the silo provides one
    test_data_loader = data_silo.get_data_loader("test")
    if test_data_loader is None:
        return
    evaluator_test = Evaluator(
        data_loader=test_data_loader,
        tasks=data_silo.processor.tasks,
        device=device,
    )
    model.connect_heads_with_processor(processor.tasks)
    evaluator_test.eval(model)
def doc_classification_crossvalidation():
    """
    Run a cross-validated document classification training and return the best model.

    For every fold produced by ``DataSiloForCrossVal.make`` a fresh model is trained
    and evaluated on that fold's test split; each fold's report is appended to
    ``BESTMODEL/classification_report.txt``. The fold with the highest ``f1_macro``
    is reloaded from disk, evaluated on the original test set, and saved (together
    with the processor) to ``BESTMODEL``.

    :return: the model reloaded from the best fold.
    """
    # the code for this function is partially taken from:
    # https://github.com/deepset-ai/FARM/blob/master/examples/doc_classification_multilabel.py and
    # https://github.com/deepset-ai/FARM/blob/master/examples/doc_classification_crossvalidation.py

    # for local logging:
    ml_logger = MLFlowLogger(tracking_uri="")
    ml_logger.init_experiment(experiment_name="covid-document-classification", run_name=RUNNAME)

    # model settings
    xval_folds = FOLDS
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    if RUNLOCAL:
        device = "cpu"
    n_epochs = NEPOCHS
    batch_size = BATCHSIZE
    evaluate_every = EVALEVERY
    lang_model = MODELTYPE
    do_lower_case = False

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    metric = "f1_macro"

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # The processor wants to know the possible labels ...
    label_list = LABELS
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=MAXLEN,
        data_dir=DATADIR,
        train_filename=TRAIN,
        test_filename=TEST,
        dev_split=0.1,
        label_list=label_list,
        metric=metric,
        label_column_name="Categories",
        # confusing parameter name: it should be called multiCLASS
        # not multiLABEL
        multilabel=True,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir, dev):
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = MultiLabelTextClassificationHead(
            # there is still an error with class weights ...
            # class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.2,
            lm_output_types=["per_sequence"],
            device=dev)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=dev,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs)

        # Feed everything to the Trainer, which takes care of training the model and evaluates it
        # from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        # unfortunately, early stopping is still not working, so the instance below is
        # created but deliberately not handed to the Trainer yet
        earlystopping = EarlyStopping(
            metric="f1_macro", mode="max",
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=silo_to_use,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=dev,
            evaluator_test=False,
            # early_stopping=earlystopping
        )
        # train it
        trainer.train()
        trainer.model.save(save_dir)
        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_macro = -1
    # Base path shared by the per-fold saves and by the reload of the best fold below.
    base_save_dir = Path("saved_models/covid-classification-v1")
    report_path = BESTMODEL + "/classification_report.txt"

    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, base_save_dir, device)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=device,
        )
        result = evaluator_test.eval(model, return_preds_and_labels=True)

        os.makedirs(os.path.dirname(report_path), exist_ok=True)
        # the with-statement closes the file; no explicit close() is needed
        with open(report_path, "a+") as report_file:
            report_file.write("Evaluation on withheld split for numfold no. {} \n".format(num_fold))
            report_file.write(result[0]["report"])
            report_file.write("\n\n")

        evaluator_test.log_results(result, "Test", steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_macro = result[0]["f1_macro"]
        if f1_macro > bestf1_macro:
            bestf1_macro = f1_macro
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("../data/predictions/covid-classification-xval.results.json", "wt") as fp:
        json.dump(allresults, fp, cls=NumpyArrayEncoder)

    # calculate overall f1 score across all folds
    xval_f1_macro = f1_score(all_labels, all_preds, average="macro")
    ml_logger.log_metrics({"f1 macro across all folds": xval_f1_macro}, step=None)

    # test performance
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )
    # restore model from the best fold (same path scheme as used in train_on_split)
    lm_name = model.language_model.name
    save_dir = Path(str(base_save_dir) + f"-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    ml_logger.log_metrics({"f1 macro on final test set": result[0]["f1_macro"]}, step=None)

    with open(report_path, "a+") as report_file:
        report_file.write("Final result of the best model \n")
        report_file.write(result[0]["report"])
        report_file.write("\n\n")

    ml_logger.log_artifacts(BESTMODEL + "/")

    # save model for later use
    processor.save(BESTMODEL)
    model.save(BESTMODEL)
    return model
def test_evaluation():
    """
    Benchmark evaluation of a pretrained question answering model on SQuAD 2.0 dev.

    Two runs are performed and compared against stored gold values:
      1. FARM's internal ``Evaluator`` (EM, f1, top-n accuracy, elapsed time)
      2. ``Inferencer`` predictions scored with the official SQuAD evaluation script

    Deviations beyond the tolerances are appended to the module-level
    ``error_messages`` list; hard assertions only run when ``test_assertions`` is True.

    :return: list with one result dict per run (first internal, then official script).
    """
    ##########################
    ########## Settings
    ##########################
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False

    test_assertions = False

    data_dir = Path("testsave/data/squad20")
    evaluation_filename = "dev-v2.0.json"
    # File the predictions are written to and the official script reads from.
    predictions_file = "testsave/predictions.json"

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # loading models and evals
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="question_answering")
    model.prediction_heads[0].no_ans_boost = 0
    model.prediction_heads[0].n_best = 1
    model.prediction_heads[0].n_best_per_sample = 1

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    starttime = time()

    data_silo = DataSilo(processor=processor, batch_size=40 * n_gpu_factor)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)
    model, _ = optimize_model(model=model, device=device, local_rank=-1, optimizer=None,
                              distributed=False, use_amp=None)

    evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    # 1. Test FARM internal evaluation
    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnacc = results[0]["top_n_accuracy"] * 100
    elapsed = time() - starttime
    print(results)
    print(elapsed)

    gold_EM = 78.4721
    gold_f1 = 82.6671
    gold_tnacc = 84.3594  # top 1 recall
    gold_elapsed = 40  # 4x V100
    if test_assertions:
        np.testing.assert_allclose(
            em_score, gold_EM, rtol=0.001,
            err_msg=f"FARM Eval changed for EM by: {em_score-gold_EM}")
        np.testing.assert_allclose(
            f1_score, gold_f1, rtol=0.001,
            err_msg=f"FARM Eval changed for f1 score by: {f1_score-gold_f1}")
        np.testing.assert_allclose(
            tnacc, gold_tnacc, rtol=0.001,
            err_msg=f"FARM Eval changed for top 1 accuracy by: {tnacc-gold_tnacc}")
        np.testing.assert_allclose(
            elapsed, gold_elapsed, rtol=0.1,
            err_msg=f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds")

    if not np.allclose(f1_score, gold_f1, rtol=0.001):
        error_messages.append(
            f"FARM Eval changed for f1 score by: {round(f1_score - gold_f1, 4)}")
    if not np.allclose(em_score, gold_EM, rtol=0.001):
        error_messages.append(
            f"FARM Eval changed for EM by: {round(em_score - gold_EM, 4)}")
    if not np.allclose(tnacc, gold_tnacc, rtol=0.001):
        error_messages.append(
            f"FARM Eval changed for top 1 accuracy by: {round(tnacc-gold_tnacc, 4)}")
    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
        error_messages.append(
            f"FARM Eval speed changed significantly by: {round(elapsed - gold_elapsed, 4)} seconds")

    benchmark_result = [{
        "run": "FARM internal evaluation",
        "f1_change": round(f1_score - gold_f1, 4),
        "em_change": round(em_score - gold_EM, 4),
        "tnacc_change": round(tnacc - gold_tnacc, 4),
        "elapsed_change": round(elapsed - gold_elapsed, 4),
        "f1": f1_score,
        "em": em_score,
        "tnacc": round(tnacc, 4),
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": gold_tnacc,
        "elapsed_gold": gold_elapsed
    }]
    logger.info("\n\n" + pformat(benchmark_result[0]) + "\n")

    # 2. Test FARM predictions with outside eval script
    starttime = time()
    model = Inferencer(model=model, processor=processor, task_type="question_answering",
                       batch_size=40 * n_gpu_factor, gpu=device.type == "cuda")
    filename = data_dir / evaluation_filename
    result = model.inference_from_file(file=filename, return_json=False,
                                       multiprocessing_chunksize=80)
    results_squad = [x.to_squad_eval() for x in result]
    model.close_multiprocessing_pool()

    elapsed = time() - starttime

    # Create the directory the predictions are actually written to.
    os.makedirs(os.path.dirname(predictions_file), exist_ok=True)
    write_squad_predictions(
        predictions=results_squad,
        predictions_filename=filename,
        out_filename=predictions_file
    )
    script_params = {
        "data_file": filename,
        "pred_file": predictions_file,
        "na_prob_thresh": 1,
        "na_prob_file": False,
        "out_file": False
    }
    results_official = squad_evaluation.main(OPTS=DotMap(script_params))
    f1_score = results_official["f1"]
    em_score = results_official["exact"]

    gold_EM = 79.878
    gold_f1 = 82.917
    gold_elapsed = 27  # 4x V100
    print(elapsed)
    if test_assertions:
        np.testing.assert_allclose(
            em_score, gold_EM, rtol=0.001,
            err_msg=f"Eval with official script changed for EM by: {em_score - gold_EM}")
        np.testing.assert_allclose(
            f1_score, gold_f1, rtol=0.001,
            err_msg=f"Eval with official script changed for f1 score by: {f1_score - gold_f1}")
        np.testing.assert_allclose(
            elapsed, gold_elapsed, rtol=0.1,
            err_msg=f"Inference speed changed significantly by: {elapsed - gold_elapsed} seconds")

    if not np.allclose(f1_score, gold_f1, rtol=0.001):
        error_messages.append(
            f"Eval with official script changed for f1 score by: {round(f1_score - gold_f1, 4)}")
    if not np.allclose(em_score, gold_EM, rtol=0.001):
        error_messages.append(
            f"Eval with official script changed for EM by: {round(em_score - gold_EM, 4)}")
    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
        error_messages.append(
            f"Inference speed changed significantly by: {round(elapsed - gold_elapsed,4)} seconds")

    benchmark_result.append({
        "run": "outside eval script",
        "f1_change": round(f1_score - gold_f1, 4),
        "em_change": round(em_score - gold_EM, 4),
        "tnacc_change": "-",
        "elapsed_change": round(elapsed - gold_elapsed, 4),
        "f1": f1_score,
        "em": em_score,
        "tnacc": "-",
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": "-",
        "elapsed_gold": gold_elapsed
    })
    logger.info("\n\n" + pformat(benchmark_result[1]) + "\n")
    return benchmark_result
def train_evaluation_single(seed=42):
    """
    Fine-tune "roberta-base" on the SQuAD 2.0 train file, evaluate it on the dev file and
    compare F1, EM, top-n accuracy and training time against hard-coded gold values.

    The trained model and its processor are saved to "testsave/roberta-qa-dev". Deviations
    from the gold values are appended to `error_messages`. `error_messages`, `n_gpu_factor`
    and `logger` are not defined in this function and are expected to exist in the
    enclosing module scope.

    :param seed: Seed passed to set_all_seeds before any other setup happens.
    :type seed: int
    :return: A list with a single dict holding the measured values, the gold values and the
             differences between them for the "train evaluation" run.
    """
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval
    test_assertions = False
    # GPU utilization on 4x V100
    # 40*4, 14.3/16GB on master, 12.6/16 on others
    batch_size = 40 * n_gpu_factor

    # Load model and train
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case,
    )
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = QuestionAnsweringHead(n_best=5, n_best_per_sample=1)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    schedule_opts = {"name": "LinearWarmup", "warmup_proportion": 0.2}
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts=schedule_opts,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # Only the call to train() itself is timed
    train_start = time()
    trainer.train()
    elapsed = time() - train_start

    save_dir = Path("testsave/roberta-qa-dev")
    model.save(save_dir)
    processor.save(save_dir)

    # Create Evaluator
    dev_evaluator = Evaluator(
        data_loader=data_silo.get_data_loader("dev"),
        tasks=data_silo.processor.tasks,
        device=device,
    )
    results = dev_evaluator.eval(model)
    head_result = results[0]
    # the evaluator scores are scaled by 100 so they are comparable to the gold values
    f1_pct = head_result["f1"] * 100
    em_pct = head_result["EM"] * 100
    top_n_pct = head_result["top_n_accuracy"] * 100

    print(results)
    print(elapsed)

    gold_f1 = 82.155
    gold_EM = 78.6575  # 77.714
    gold_tnrecall = 97.3721
    gold_elapsed = 1135

    # One row per compared quantity, in reporting order:
    # (measured, gold, relative tolerance, assertion message, collected error message)
    comparisons = [
        (
            f1_pct, gold_f1, 0.01,
            f"FARM Training changed for f1 score by: {f1_pct - gold_f1}",
            f"FARM Training changed for f1 score by: {round(f1_pct - gold_f1, 4)}",
        ),
        (
            em_pct, gold_EM, 0.01,
            f"FARM Training changed for EM by: {em_pct - gold_EM}",
            f"FARM Training changed for EM by: {round(em_pct - gold_EM, 4)}",
        ),
        (
            top_n_pct, gold_tnrecall, 0.01,
            f"FARM Training changed for top 5 accuracy by: {top_n_pct - gold_tnrecall}",
            f"FARM Training changed for top 5 accuracy by: {round(top_n_pct - gold_tnrecall, 4)}",
        ),
        (
            elapsed, gold_elapsed, 0.1,
            f"FARM Training speed changed significantly by: {elapsed - gold_elapsed} seconds",
            f"FARM Training speed changed significantly by: {round(elapsed - gold_elapsed, 4)} seconds",
        ),
    ]

    if test_assertions:
        # hard failure on the first deviation
        for measured, gold, rtol, assert_msg, _ in comparisons:
            np.testing.assert_allclose(measured, gold, rtol=rtol, err_msg=assert_msg)

    # soft reporting: collect every deviation without aborting
    for measured, gold, rtol, _, report_msg in comparisons:
        if not np.allclose(measured, gold, rtol=rtol):
            error_messages.append(report_msg)

    benchmark_result = [{
        "run": "train evaluation",
        "f1_change": round(f1_pct - gold_f1, 4),
        "em_change": round(em_pct - gold_EM, 4),
        "tnacc_change": round(top_n_pct - gold_tnrecall, 4),
        "elapsed_change": round(elapsed - gold_elapsed, 4),
        "f1": f1_pct,
        "em": em_pct,
        "tnacc": round(top_n_pct, 4),
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": gold_tnrecall,
        "elapsed_gold": gold_elapsed,
    }]
    logger.info("\n\n" + pformat(benchmark_result) + "\n")
    return benchmark_result
def __init__(self, optimizer, data_silo, epochs, n_gpu, device, lr_schedule=None, evaluate_every=100,
             evaluator_dev=None, evaluator_test=None, use_amp=None, grad_acc_steps=1, local_rank=-1,
             early_stopping=None, log_learning_rate=False):
    """
    Store the training configuration and set up the dev and test evaluators.

    :param optimizer: An optimizer object that determines the learning strategy to be used during training
    :param data_silo: A DataSilo object that will contain the train, dev and test datasets as PyTorch DataLoaders
    :type data_silo: DataSilo
    :param epochs: How many times the training procedure will loop through the train dataset
                   (converted with int()).
    :type epochs: int
    :param n_gpu: The number of gpus available for training and evaluation.
    :type n_gpu: int
    :param device: The device on which the train, dev and test tensors should be hosted.
                   Choose from "cpu" and "cuda".
    :param lr_schedule: An optional scheduler object that can regulate the learning rate of the optimizer
    :param evaluate_every: Perform dev set evaluation after this many steps of training.
    :type evaluate_every: int
    :param evaluator_dev: Evaluator for dev set. Options:
                          `None` (Default) => will init a new evaluator, if there's a dev set in the DataSilo
                          `Evaluator Object` => use the manually supplied evaluator
                          `False` => Don't use any evaluator
    :type evaluator_dev: Evaluator, None or False
    :param evaluator_test: Evaluator for test set. Options:
                           `None` (Default) => will init a new evaluator, if there's a test set in the DataSilo
                           `Evaluator Object` => use the manually supplied evaluator
                           `False` => Don't use any evaluator
    :type evaluator_test: Evaluator, None or False
    :param use_amp: Whether to use automatic mixed precision with Apex. One of the optimization
                    levels must be chosen. "O1" is recommended in almost all cases.
    :type use_amp: str
    :param grad_acc_steps: Stored on the trainer unchanged; not used further in this constructor.
    :type grad_acc_steps: int
    :param local_rank: Stored on the trainer unchanged; not used further in this constructor.
    :type local_rank: int
    :param early_stopping: an initialized EarlyStopping object to control early stopping and
                           saving of best models.
    :type early_stopping: EarlyStopping
    :param log_learning_rate: Whether to log learning rate to Mlflow
    :type log_learning_rate: bool
    :raises ImportError: if `use_amp` is truthy while AMP_AVAILABLE is falsy.
    """

    def _evaluator_for(dataset_name):
        # Default evaluator for one of the silo's datasets
        return Evaluator(
            data_loader=self.data_silo.get_data_loader(dataset_name),
            tasks=self.data_silo.processor.tasks,
            device=device,
        )

    # optimisation setup
    self.optimizer = optimizer
    self.lr_schedule = lr_schedule
    self.use_amp = use_amp
    self.grad_acc_steps = grad_acc_steps

    # schedule and bookkeeping
    self.epochs = int(epochs)
    self.evaluate_every = evaluate_every
    self.global_step = 0

    # data and hardware
    self.data_silo = data_silo
    self.data_loader_train = data_silo.get_data_loader("train")
    self.device = device
    self.n_gpu = n_gpu
    self.local_rank = local_rank

    # log_params() runs before the attributes below are assigned
    self.log_params()
    self.early_stopping = early_stopping
    self.log_learning_rate = log_learning_rate

    if use_amp and not AMP_AVAILABLE:
        raise ImportError(
            f'Got use_amp = {use_amp}, but cannot find apex. '
            'Please install Apex if you want to make use of automatic mixed precision. '
            'https://github.com/NVIDIA/apex')

    # evaluator on dev set: only built when the caller passed None and a dev loader exists
    if evaluator_dev is None:
        if self.data_silo.get_data_loader("dev"):
            evaluator_dev = _evaluator_for("dev")
    self.evaluator_dev = evaluator_dev

    # evaluator on test set: same rule as for dev
    if evaluator_test is None:
        if self.data_silo.get_data_loader("test"):
            evaluator_test = _evaluator_for("test")
    self.evaluator_test = evaluator_test
def __init__(
    self,
    optimizer,
    data_silo,
    epochs,
    n_gpu,
    device,
    warmup_linear=0.1,
    evaluate_every=100,
    evaluator_dev=None,
    evaluator_test=None,
    fp16=False,
    grad_acc_steps=1,
    local_rank=-1,
    early_stopping=None,
):
    """
    Store the training configuration and set up the dev and test evaluators.

    :param optimizer: An optimizer object that determines the learning strategy to be used during
                      training. Its get_lr() result is stored as `self.learning_rate`.
    :param data_silo: A DataSilo object that will contain the train, dev and test datasets as PyTorch DataLoaders
    :type data_silo: DataSilo
    :param epochs: How many times the training procedure will loop through the train dataset
                   (converted with int()).
    :type epochs: int
    :param n_gpu: The number of gpus available for training and evaluation.
    :type n_gpu: int
    :param device: The device on which the train, dev and test tensors should be hosted.
                   Choose from "cpu" and "cuda".
    :param warmup_linear: Stored on the trainer unchanged; not used further in this constructor.
    :param evaluate_every: Perform dev set evaluation after this many steps of training.
    :type evaluate_every: int
    :param evaluator_dev: Evaluator for dev set. Options:
                          `None` (Default) => will init a new evaluator, if there's a dev set in the DataSilo
                          `Evaluator Object` => use the manually supplied evaluator
                          `False` => Don't use any evaluator
    :type evaluator_dev: Evaluator, None or False
    :param evaluator_test: Evaluator for test set. Options:
                           `None` (Default) => will init a new evaluator, if there's a test set in the DataSilo
                           `Evaluator Object` => use the manually supplied evaluator
                           `False` => Don't use any evaluator
    :type evaluator_test: Evaluator, None or False
    :param fp16: Whether to use floating point 16 mode.
    :type fp16: bool
    :param grad_acc_steps: Stored on the trainer unchanged; not used further in this constructor.
    :type grad_acc_steps: int
    :param local_rank: Stored on the trainer unchanged; not used further in this constructor.
    :type local_rank: int
    :param early_stopping: an initialized EarlyStopping object to control early stopping and
                           saving of best models.
    :type early_stopping: EarlyStopping
    """

    def _evaluator_for(dataset_name):
        # Default evaluator for one of the silo's datasets
        return Evaluator(
            data_loader=self.data_silo.get_data_loader(dataset_name),
            tasks=self.data_silo.processor.tasks,
            device=device,
        )

    # optimisation setup
    self.optimizer = optimizer
    self.fp16 = fp16
    self.grad_acc_steps = grad_acc_steps
    self.learning_rate = self.optimizer.get_lr()
    self.warmup_linear = warmup_linear

    # schedule and bookkeeping
    self.epochs = int(epochs)
    self.evaluate_every = evaluate_every
    self.global_step = 0

    # data and hardware
    self.data_silo = data_silo
    self.data_loader_train = data_silo.get_data_loader("train")
    self.device = device
    self.n_gpu = n_gpu
    self.local_rank = local_rank

    # log_params() runs before early_stopping is assigned
    self.log_params()
    self.early_stopping = early_stopping

    # evaluator on dev set: only built when the caller passed None and a dev loader exists
    if evaluator_dev is None:
        if self.data_silo.get_data_loader("dev"):
            evaluator_dev = _evaluator_for("dev")
    self.evaluator_dev = evaluator_dev

    # evaluator on test set: same rule as for dev
    if evaluator_test is None:
        if self.data_silo.get_data_loader("test"):
            evaluator_test = _evaluator_for("test")
    self.evaluator_test = evaluator_test
def doc_classification_crossvalidation():
    """
    Run a cross-validated text classification training on GermEval 2018 (coarse labels).

    For every fold a fresh model is trained with early stopping on "f1_offense" and evaluated
    on the fold's test set. The per-fold results are written to
    "doc_classification_xval.results.json", overall metrics are computed over the pooled
    predictions of all folds, and finally the model saved for the best fold is reloaded and
    evaluated on the test set of the original DataSilo.
    """
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    xval_folds = 5
    # NOTE(review): defined but not passed to DataSiloForCrossVal.make() below - confirm whether
    # the installed version accepts a stratification argument before wiring it in.
    xval_stratified = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    use_amp = None

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    # For xval, we also store the actual predictions and labels in each result so we can
    # calculate overall metrics over all folds later
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        # micro-averaged F1 (this previously used average="macro" and so duplicated f1_macro)
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    Here we load GermEval 2018 Data.
    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(f"############ Crossvalidation: Fold {n_fold} ############")
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        # (class weights are taken from the full data_silo, not from the fold's silo)
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.2,
            lm_output_types=["per_sequence"],
            device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of training the model and evaluates it
        # from time to time.
        # Also create an EarlyStopping instance and pass it on to the trainer.
        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold allows us afterwards to use the
        # nfolds best models in an ensemble!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_offense",
            mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=silo_to_use,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
            early_stopping=earlystopping,
            evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result, "Test", steps=len(silo.get_data_loader("test")), num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_xval.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # calculate overall metrics across all folds
    xval_f1_micro = f1_score(all_labels, all_preds, labels=label_list, average="micro")
    xval_f1_macro = f1_score(all_labels, all_preds, labels=label_list, average="macro")
    xval_f1_offense = f1_score(all_labels, all_preds, labels=label_list, pos_label="OFFENSE")
    xval_f1_other = f1_score(all_labels, all_preds, labels=label_list, pos_label="OTHER")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    # The values are formatted into the message itself: passing them as extra positional
    # arguments without a %-placeholder makes logging report a formatting error instead of
    # printing the metric.
    logger.info(f"XVAL F1 MICRO: {xval_f1_micro}")
    logger.info(f"XVAL F1 MACRO: {xval_f1_macro}")
    logger.info(f"XVAL F1 OFFENSE: {xval_f1_offense}")
    logger.info(f"XVAL F1 OTHER: {xval_f1_other}")
    logger.info(f"XVAL MCC: {xval_mcc}")

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval fold for evaluation on
    # the original (still unseen) test set.
    logger.info("###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore model from the best fold
    # (`model` is the model returned for the last fold; only its language model name is reused)
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info(f"TEST F1 MICRO: {result[0]['f1_micro']}")
    logger.info(f"TEST F1 MACRO: {result[0]['f1_macro']}")
    logger.info(f"TEST F1 OFFENSE: {result[0]['f1_offense']}")
    logger.info(f"TEST F1 OTHER: {result[0]['f1_other']}")
    logger.info(f"TEST MCC: {result[0]['mcc']}")
def doc_classification_crossvalidation():
    """
    Run a cross-validated text classification training on GermEval 2018 (coarse labels).

    For every fold a fresh model is trained with early stopping on "f1_offense" inside its own
    nested MLflow run and evaluated on the fold's test set. The per-fold results are written to
    "doc_classification_xval.results.json", mean and standard deviation of every numeric metric
    of the first prediction head are computed over the folds, and finally the model saved for
    the best fold is reloaded and evaluated on the test set of the original DataSilo.
    """
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    xval_folds = 5
    xval_stratification = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    dev_split = 0.1
    # For xval the dev_stratification parameter must not be None: with None, the devset cannot be created
    # using the default method of only splitting by the available chunks as initial train set for each fold
    # is just a single chunk!
    dev_stratification = True
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    use_amp = None

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    # For xval, we also store the actual predictions and labels in each result so we can
    # calculate overall metrics over all folds later
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    Here we load GermEval 2018 Data automatically if it is not available.
    #    GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv
    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        dev_split=dev_split,
        dev_stratification=dev_stratification,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(
        data_silo,
        sets=["train", "dev"],
        n_splits=xval_folds,
        stratification=xval_stratification)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(f"############ Crossvalidation: Fold {n_fold} of {xval_folds} ############")
        logger.info(f"Fold training   samples: {len(silo_to_use.data['train'])}")
        logger.info(f"Fold dev        samples: {len(silo_to_use.data['dev'])}")
        logger.info(f"Fold testing    samples: {len(silo_to_use.data['test'])}")
        logger.info(
            "Total number of samples: "
            f"{len(silo_to_use.data['train'])+len(silo_to_use.data['dev'])+len(silo_to_use.data['test'])}"
        )

        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        # (class weights are taken from the full data_silo, not from the fold's silo)
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.2,
            lm_output_types=["per_sequence"],
            device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of training the model and evaluates it
        # from time to time.
        # Also create an EarlyStopping instance and pass it on to the trainer.
        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold allows us afterwards to use the
        # nfolds best models in an ensemble!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_offense",
            mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=silo_to_use,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
            early_stopping=earlystopping,
            evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold
    # remember all individual evaluation results
    allresults = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        # The nested run is used as a context manager so that it is also closed when training
        # or evaluation of this fold raises; otherwise the run would stay active.
        with mlflow.start_run(run_name=f"fold-{num_fold + 1}-of-{len(silos)}", nested=True):
            model = train_on_split(silo, num_fold, save_dir)

            # do eval on test set here (and not in Trainer),
            # so that we can easily store the actual preds and labels for a "global" eval across all folds.
            evaluator_test = Evaluator(
                data_loader=silo.get_data_loader("test"),
                tasks=silo.processor.tasks,
                device=device)
            result = evaluator_test.eval(model, return_preds_and_labels=True)
            evaluator_test.log_results(result, "Test", steps=len(silo.get_data_loader("test")), num_fold=num_fold)

            allresults.append(result)

            # keep track of best fold
            f1_offense = result[0]["f1_offense"]
            if f1_offense > bestf1_offense:
                bestf1_offense = f1_offense
                bestfold = num_fold

        # empty cache to avoid memory leak and cuda OOM across multiple folds
        model.cpu()
        torch.cuda.empty_cache()

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_xval.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # log the best fold metric and fold
    logger.info(f"Best fold f1_offense: {bestf1_offense} in fold {bestfold}")

    # calculate overall metrics across all folds: we only have one head so we do this only for the first head
    # information in each of the per-fold results

    # First create a dict where for each metric, we have a list of values from each fold
    xval_metric_lists_head0 = defaultdict(list)
    for results in allresults:
        head0results = results[0]
        for name, value in head0results.items():
            # skip the raw predictions/labels and private entries; only numeric metrics are aggregated
            if name in ("preds", "labels") or name.startswith("_"):
                continue
            if isinstance(value, numbers.Number):
                xval_metric_lists_head0[name].append(value)

    # Now calculate the mean and stdev for each metric, also copy over the task name
    xval_metric = {}
    xval_metric["task_name"] = allresults[0][0].get("task_name", "UNKNOWN TASKNAME")
    for name, values in xval_metric_lists_head0.items():
        # statistics.stdev needs at least two values, i.e. at least two folds
        xval_metric[name + "_mean"] = statistics.mean(values)
        xval_metric[name + "_stdev"] = statistics.stdev(values)

    logger.info(f"XVAL Accuracy:   mean {xval_metric['acc_mean']} stdev {xval_metric['acc_stdev']}")
    logger.info(f"XVAL F1 MICRO:   mean {xval_metric['f1_micro_mean']} stdev {xval_metric['f1_micro_stdev']}")
    logger.info(f"XVAL F1 MACRO:   mean {xval_metric['f1_macro_mean']} stdev {xval_metric['f1_macro_stdev']}")
    logger.info(f"XVAL F1 OFFENSE: mean {xval_metric['f1_offense_mean']} stdev {xval_metric['f1_offense_stdev']}")
    logger.info(f"XVAL F1 OTHER:   mean {xval_metric['f1_other_mean']} stdev {xval_metric['f1_other_stdev']}")
    logger.info(f"XVAL MCC:        mean {xval_metric['mcc_mean']} stdev {xval_metric['mcc_stdev']}")

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval fold for evaluation on
    # the original (still unseen) test set.
    logger.info("###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore model from the best fold
    # (`model` is the model returned for the last fold; only its language model name is reused)
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info(f"TEST Accuracy:   {result[0]['acc']}")
    logger.info(f"TEST F1 MICRO:   {result[0]['f1_micro']}")
    logger.info(f"TEST F1 MACRO:   {result[0]['f1_macro']}")
    logger.info(f"TEST F1 OFFENSE: {result[0]['f1_offense']}")
    logger.info(f"TEST F1 OTHER:   {result[0]['f1_other']}")
    logger.info(f"TEST MCC:        {result[0]['mcc']}")
def eval(self, document_store: ElasticsearchDocumentStore, device: str, label_index: str = "feedback",
         doc_index: str = "eval_document", label_origin: str = "gold_label"):
    """
    Performs evaluation on evaluation documents in Elasticsearch DocumentStore.

    Returns a dict containing the following metrics:
        - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers
        - "f1": Average overlap between predicted answers and their corresponding correct answers
        - "top_n_recall": Proportion of predicted answers that overlap with correct answer

    Documents in `doc_index` that have no labeled question of the requested origin are
    skipped, since there is nothing to evaluate them against.

    :param document_store: The ElasticsearchDocumentStore containing the evaluation documents
    :type document_store: ElasticsearchDocumentStore
    :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
    :type device: str
    :param label_index: Elasticsearch index where labeled questions are stored
    :type label_index: str
    :param doc_index: Elasticsearch index where documents that are used for evaluation are stored
    :type doc_index: str
    :param label_origin: Only labels whose "origin" field equals this value are used
    :type label_origin: str
    :return: dict with the keys "EM", "f1" and "top_n_recall", taken from the first entry of
             the evaluator's results
    """
    # extract all questions for evaluation
    label_filter = {"origin": label_origin}
    questions = document_store.get_all_documents_in_index(index=label_index, filters=label_filter)

    # mapping from doc_id to questions; each question gets a running integer id
    doc_questions_dict = {}
    for question_id, question in enumerate(questions):
        source = question["_source"]
        doc_questions_dict.setdefault(source["doc_id"], []).append({
            "id": question_id,
            "question": source["question"],
            "answers": source["answers"],
            # a question without any answer is treated as unanswerable
            "is_impossible": not source["answers"],
        })

    # extract eval documents and convert data back to SQuAD-like format
    documents = document_store.get_all_documents_in_index(index=doc_index)
    dicts = []
    for document in documents:
        source = document["_source"]
        doc_questions = doc_questions_dict.get(source["doc_id"])
        if not doc_questions:
            # no label with the requested origin for this document (previously a KeyError)
            continue
        dicts.append({"qas": doc_questions, "context": source["text"]})

    # Create DataLoader that can be passed to the Evaluator
    indices = range(len(dicts))
    dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(dicts, indices=indices)
    data_loader = NamedDataLoader(
        dataset=dataset,
        batch_size=self.inferencer.batch_size,
        tensor_names=tensor_names)

    evaluator = Evaluator(
        data_loader=data_loader,
        tasks=self.inferencer.processor.tasks,
        device=device)

    eval_results = evaluator.eval(self.inferencer.model)
    results = {
        "EM": eval_results[0]["EM"],
        "f1": eval_results[0]["f1"],
        "top_n_recall": eval_results[0]["top_n_recall"]
    }
    return results
def question_answering_confidence():
    """
    Tutorial: calibrate the confidence scores of a question-answering model.

    Loads a pre-trained QA model, calibrates its confidence scores on the dev set
    via temperature scaling, reports exact match and average confidence per
    confidence bin on the test set, stores the model, and finally reloads it to
    answer a sample question only if the confidence exceeds 0.9.
    """
    # --- Logging -----------------------------------------------------------
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    # the transformers library is rather chatty; only show warnings and above
    transformers_logger = logging.getLogger('transformers')
    transformers_logger.setLevel(logging.WARNING)

    # --- Settings ----------------------------------------------------------
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    pretrained_name = "deepset/roberta-base-squad2"
    lower_case = False
    silo_batch_size = 80
    squad_dir = Path("../data/squad20")
    # Dev and test set point to the same file, for demo purposes only
    dev_file = "dev-v2.0.json"
    test_file = "dev-v2.0.json"
    top_n = 3  # accuracy at n is useful for answers inside long documents
    num_bins = 10

    # 1. Tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=pretrained_name,
        do_lower_case=lower_case,
    )

    # 2. DataProcessor: converts raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=dev_file,
        test_filename=test_file,
        data_dir=squad_dir,
        doc_stride=192,
    )

    # 3. DataSilo: loads the datasets (train/dev/test), provides DataLoaders for
    #    them and calculates a few descriptive statistics
    data_silo = DataSilo(processor=processor, batch_size=silo_batch_size)

    # 4. Pre-trained question-answering model
    model = AdaptiveModel.convert_from_transformers(
        pretrained_name,
        device=device,
        task_type="question_answering",
    )
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)
    # Number of predictions the model makes per question; the multiple
    # predictions are used for evaluating top n recall.
    model.prediction_heads[0].n_best = top_n

    # 5. Calibration sets a single parameter, the temperature, which is accessible
    #    through the prediction head. In the forward pass every logit is divided by
    #    the temperature; a softmax afterwards yields confidence scores in [0, 1].
    #    A temperature larger than 1 decreases the model's confidence scores.
    logger.info(f"Parameter used for temperature scaling of model confidence scores: {model.prediction_heads[0].temperature_for_confidence}")

    # 6a. Either set the temperature manually (default value is 1.0)...
    unit_temperature = (torch.ones(1) * 1.0).to(device=device)
    model.prediction_heads[0].temperature_for_confidence = torch.nn.Parameter(unit_temperature)

    # 6b. ...or run the evaluator on the dev set and let it calibrate the confidence
    #     scores with temperature scaling. This aligns the confidence scores with the
    #     model's accuracy on the dev set by tuning the temperature, which is stored
    #     internally as an attribute of the prediction head during calibration.
    dev_evaluator = Evaluator(
        data_loader=data_silo.get_data_loader("dev"),
        tasks=data_silo.processor.tasks,
        device=device,
    )
    dev_evaluator.eval(model, return_preds_and_labels=True, calibrate_conf_scores=True)

    # 7. Optionally, check on the test set how well the confidence scores are
    #    aligned with the model's accuracy
    test_evaluator = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device,
    )
    test_outcome = test_evaluator.eval(model, return_preds_and_labels=True)[0]
    logger.info("Grouping predictions by confidence score and calculating metrics for each bin.")
    em_per_bin, confidence_per_bin, count_per_bin = metrics_per_bin(
        test_outcome["preds"],
        test_outcome["labels"],
        num_bins=num_bins,
    )
    for bin_number in range(num_bins):
        logger.info(f"Bin {bin_number} - exact match: {em_per_bin[bin_number]}, average confidence score: {confidence_per_bin[bin_number]}")

    # 8. Store the calibrated model; the temperature is stored automatically as an
    #    attribute of the prediction head.
    target_dir = Path("../saved_models/qa-confidence-tutorial")
    model.save(target_dir)
    processor.save(target_dir)

    # 9. At prediction time, answers the model is not confident about can be
    #    filtered out. Loading the stored model also loads the stored temperature,
    #    so the confidence scores are adjusted automatically.
    inferencer = QAInferencer.load(target_dir, batch_size=40, gpu=True)
    logger.info(f"Loaded model with stored temperature: {inferencer.model.prediction_heads[0].temperature_for_confidence}")

    sample_input = [
        {
            "questions": ["Who counted the game among the best ever made?"],
            "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
        }]
    first_result = inferencer.inference_from_dicts(dicts=sample_input, return_json=False)[0]
    best = first_result.prediction[0]
    print(best.answer if best.confidence > 0.9 else "The confidence is not high enough to give an answer.")
def __init__(
    self,
    optimizer,
    data_silo,
    epochs,
    n_gpu,
    device,
    warmup_linear=0.1,
    evaluate_every=100,
    evaluator_dev=None,
    evaluator_test=None,
    fp16=False,
    grad_acc_steps=1,
):
    """
    Store the training configuration and create default evaluators where needed.

    :param optimizer: An optimizer object that determines the learning strategy to be used during training
    :param data_silo: A DataSilo object that will contain the train, dev and test datasets as PyTorch DataLoaders
    :type data_silo: DataSilo
    :param epochs: How many times the training procedure will loop through the train dataset
    :type epochs: int
    :param n_gpu: The number of gpus available for training and evaluation.
    :type n_gpu: int
    :param device: The device on which the train, dev and test tensors should be hosted. Choose from "cpu" and "cuda".
    :param warmup_linear: Stored unchanged on the instance; not otherwise used in this constructor.
    :param evaluate_every: Perform dev set evaluation after this many steps of training.
    :type evaluate_every: int
    :param evaluator_dev: The dev set Evaluator object. When None, one is built from the
                          data silo if it provides a (truthy) "dev" data loader.
    :type evaluator_dev: Evaluator
    :param evaluator_test: The test set Evaluator object. When None, one is built from the
                           data silo if it provides a (truthy) "test" data loader.
    :type evaluator_test: Evaluator
    :param fp16: Whether to use floating point 16 mode.
    :type fp16: bool
    :param grad_acc_steps: Stored unchanged on the instance; not otherwise used in this constructor.
    """
    # Plain configuration values
    self.data_silo = data_silo
    self.epochs = int(epochs)
    self.optimizer = optimizer
    self.evaluate_every = evaluate_every
    self.n_gpu = n_gpu
    self.grad_acc_steps = grad_acc_steps
    self.fp16 = fp16
    self.warmup_linear = warmup_linear
    self.device = device

    # Derived state
    self.learning_rate = self.optimizer.get_lr()
    self.global_step = 0
    self.data_loader_train = data_silo.get_data_loader("train")

    self.log_params()

    def _default_evaluator(dataset_name):
        # Evaluator wired to the silo's loader, label maps and metrics
        return Evaluator(
            data_loader=self.data_silo.get_data_loader(dataset_name),
            label_maps=self.data_silo.processor.label_maps,
            device=device,
            metrics=self.data_silo.processor.metrics,
        )

    # evaluator on dev set
    if evaluator_dev is None and self.data_silo.get_data_loader("dev"):
        evaluator_dev = _default_evaluator("dev")
    self.evaluator_dev = evaluator_dev

    # evaluator on test set
    if evaluator_test is None and self.data_silo.get_data_loader("test"):
        evaluator_test = _default_evaluator("test")
    self.evaluator_test = evaluator_test
def evaluate_question_answering():
    """
    Evaluate a pre-trained question-answering model on a SQuAD-formatted file.

    Builds tokenizer, processor and data silo for the evaluation file, runs the
    Evaluator on the "test" data loader and prints F1, exact match and top-n recall.
    """
    # --- Settings ----------------------------------------------------------
    device, _ = initialize_device_settings(use_cuda=True)
    model_name = "deepset/roberta-base-squad2"
    lower_case = True

    squad_dir = Path("../data/squad20")
    eval_file = "dev-v2.0.json"

    silo_batch_size = 50
    null_answer_boost = 0
    recall_at = 3  # recall at n is only useful for answers inside long documents

    # 1. Tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=model_name,
        do_lower_case=lower_case,
    )

    # 2. DataProcessor: handles the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=eval_file,
        data_dir=squad_dir,
        doc_stride=128,
    )

    # 3. DataSilo: loads the dataset, provides DataLoaders and descriptive statistics
    data_silo = DataSilo(processor=processor, batch_size=silo_batch_size)

    # 4. Evaluator on the test split
    evaluator = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device,
    )

    # 5. Model. For a local model that was trained with FARM, use
    #    AdaptiveModel.load(lang_model, device=device) instead.
    model = AdaptiveModel.convert_from_transformers(
        model_name,
        device=device,
        task_type="question_answering",
    )
    qa_head = model.prediction_heads[0]
    qa_head.no_ans_boost = null_answer_boost
    qa_head.n_best = recall_at
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    # 6. Run the Evaluator and report the metrics of the first prediction head
    eval_results = evaluator.eval(model)
    f1 = eval_results[0]["f1"]
    exact_match = eval_results[0]["EM"]
    top_n_recall = eval_results[0]["top_n_recall"]

    print("F1-Score:", f1)
    print("Exact Match Score:", exact_match)
    print(f"top_{recall_at}_recall:", top_n_recall)
def train_evaluation_single(seed=42):
    """
    Benchmark: train a QA model on SQuAD 2.0 and compare results to gold values.

    Trains ``roberta-base`` with a QuestionAnsweringHead, saves model and processor,
    evaluates on the dev set and asserts that F1, EM, top-n recall (relative
    tolerance 1%) and the training time (relative tolerance 10%) match the
    recorded gold values.

    :param seed: Seed passed to ``set_all_seeds`` before anything else happens.
    :type seed: int
    :raises AssertionError: If one of the metrics or the elapsed time deviates
        from its gold value by more than the tolerance.
    """
    # --- Settings ----------------------------------------------------------
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 32 * 4  # 4x V100
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # --- Load model and train ------------------------------------------------
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case,
    )
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
    language_model = LanguageModel.load(lang_model)
    prediction_head = QuestionAnsweringHead()
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # Only the training run itself is timed
    starttime = time()
    trainer.train()
    elapsed = time() - starttime

    save_dir = Path("testsave/roberta-qa-dev")
    model.save(save_dir)
    processor.save(save_dir)

    # --- Evaluate on the dev set ---------------------------------------------
    evaluator = Evaluator(
        data_loader=data_silo.get_data_loader("dev"),
        tasks=data_silo.processor.tasks,
        device=device,
    )
    results = evaluator.eval(model)
    # metrics are compared as percentages
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnrecall = results[0]["top_n_recall"] * 100

    print(results)
    print(elapsed)

    gold_f1 = 82.155
    gold_EM = 77.714
    gold_tnrecall = 97.3721
    # Must be bound: the elapsed-time assertion below reads it.
    gold_elapsed = 1286.30

    np.testing.assert_allclose(
        f1_score,
        gold_f1,
        rtol=0.01,
        err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
    np.testing.assert_allclose(
        em_score,
        gold_EM,
        rtol=0.01,
        err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
    # Report the recall delta here (not the EM delta) so a failure is diagnosable
    np.testing.assert_allclose(
        tnrecall,
        gold_tnrecall,
        rtol=0.01,
        err_msg=f"FARM Training changed for top n recall by: {tnrecall - gold_tnrecall}")
    np.testing.assert_allclose(
        elapsed,
        gold_elapsed,
        rtol=0.1,
        err_msg=f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds")
def train(self):
    """
    Perform the training procedure.

    Loops over epochs and steps, optionally fast-forwarding to the step of a
    checkpoint that training is resumed from. During training it saves a
    checkpoint on SIGTERM (and exits) or every ``self.checkpoint_every`` steps,
    evaluates on the dev set every ``self.evaluate_every`` global steps and
    applies early stopping if configured. Afterwards the best model saved by
    early stopping (if any) is restored into ``self.model`` and the test set is
    evaluated when a test data loader exists.

    :return: ``self.model`` after training (the restored best model when early
        stopping has a save directory).
    """
    # connect the prediction heads with the right output from processor
    self.model.connect_heads_with_processor(self.data_silo.processor.tasks, require_labels=True)
    # Check that the tokenizer fits the language model
    self.model.verify_vocab_size(vocab_size=len(self.data_silo.processor.tokenizer))

    logger.info(f"\n {GROWING_TREE}")
    self.model.train()

    do_stopping = False
    evalnr = 0
    loss = 0

    resume_from_step = self.from_step
    # True only for the first step processed right after fast-forwarding to a checkpoint
    just_resumed = False

    for epoch in range(self.from_epoch + 1, self.epochs + 1):
        train_data_loader = self.data_silo.get_data_loader("train")
        progress_bar = tqdm(train_data_loader)
        for step, batch in enumerate(progress_bar, start=1):
            # when resuming training from a checkpoint, we want to fast forward to the step of the checkpoint
            if resume_from_step and step <= resume_from_step:
                if resume_from_step == step:
                    resume_from_step = None
                    just_resumed = True
                continue

            if self.sigterm_handler and self.sigterm_handler.kill_now:
                # save the current state as a checkpoint
                logger.info(
                    "Received a SIGTERM signal. Saving the current train state as a checkpoint ..."
                )
                self._save()
                sys.exit(0)

            # save a checkpoint and continue train (do not create a new checkpoint if just resumed from a
            # checkpoint). resume_from_step is None once fast-forwarding finished, so it must not be used
            # in arithmetic unguarded; the flag remembers that this is the step right after the resume.
            skip_checkpoint = just_resumed or (
                resume_from_step is not None and resume_from_step + 1 == step
            )
            just_resumed = False
            if self.checkpoint_every and step % self.checkpoint_every == 0 and not skip_checkpoint:
                self._save()

            progress_bar.set_description(
                f"Train epoch {epoch}/{self.epochs} (Cur. train loss: {loss:.4f})"
            )

            # Move batch of samples to device
            batch = {key: batch[key].to(self.device) for key in batch}

            # Forward pass through model
            logits = self.model.forward(**batch)
            per_sample_loss = self.model.logits_to_loss(
                logits=logits, global_step=self.global_step, **batch)

            loss = self.backward_propagate(per_sample_loss, step)

            # Perform evaluation
            if self.global_step % self.evaluate_every == 0 and self.global_step != 0:
                # When using StreamingDataSilo, each evaluation creates a new instance of
                # dev_data_loader. In cases like training from scratch, this could cause
                # some variance across evaluators due to the randomness in word masking.
                dev_data_loader = self.data_silo.get_data_loader("dev")
                if dev_data_loader is not None:
                    evaluator_dev = Evaluator(
                        data_loader=dev_data_loader,
                        tasks=self.data_silo.processor.tasks,
                        device=self.device)
                    evalnr += 1
                    result = evaluator_dev.eval(self.model)
                    evaluator_dev.log_results(result, "Dev", self.global_step)
                    if self.early_stopping:
                        do_stopping, save_model, eval_value = self.early_stopping.check_stopping(
                            result)
                        if save_model:
                            logger.info(
                                "Saving current best model to {}, eval={}".
                                format(self.early_stopping.save_dir, eval_value))
                            self.model.save(self.early_stopping.save_dir)
                            self.data_silo.processor.save(
                                self.early_stopping.save_dir)
                        if do_stopping:
                            # log the stopping
                            logger.info(
                                "STOPPING EARLY AT EPOCH {}, STEP {}, EVALUATION {}"
                                .format(epoch, step, evalnr))
            if do_stopping:
                break
            self.global_step += 1
            self.from_step = step
            self.from_epoch = epoch
        if do_stopping:
            break

    # With early stopping we want to restore the best model. It has to replace self.model,
    # otherwise the test evaluation and the return value would still use the last-step model.
    if self.early_stopping and self.early_stopping.save_dir:
        logger.info("Restoring best model so far from {}".format(
            self.early_stopping.save_dir))
        lm_name = self.model.language_model.name
        self.model = AdaptiveModel.load(self.early_stopping.save_dir,
                                        self.device,
                                        lm_name=lm_name)
        self.model.connect_heads_with_processor(self.data_silo.processor.tasks,
                                                require_labels=True)

    # Eval on test set
    test_data_loader = self.data_silo.get_data_loader("test")
    if test_data_loader is not None:
        evaluator_test = Evaluator(data_loader=test_data_loader,
                                   tasks=self.data_silo.processor.tasks,
                                   device=self.device)
        result = evaluator_test.eval(self.model)
        evaluator_test.log_results(result, "Test", self.global_step)
    return self.model
def doc_classification(
        task_config,
        model_name_or_path,
        cache_dir,
        data_dir,
        save_dir,
        model_dir,
        run_name="0",
        lr=1e-05,
        warmup_steps=5000,
        balance_classes=True,
        embeds_dropout=0.1,
        epochs=200,  # large because we use early stopping by default
        batch_size=20,
        grad_acc_steps=1,
        early_stopping_metric="roc_auc",
        early_stopping_mode="max",
        early_stopping_patience=10,
        model_class="Bert",
        tokenizer_class="BertTokenizer",
        do_lower_case=False,
        do_train=True,
        do_eval=True,
        do_hpo=False,
        print_preds=False,
        print_dev_preds=False,
        max_seq_len=512,
        seed=11,
        eval_every=500,
        use_amp=False,
        use_cuda=True,
):
    """
    Train and/or evaluate a document classification model described by a task config.

    :param task_config: Path to a YAML file with the task settings (data files, parsing
        options, metric, "multilabel", "task_type", "output_type", "log_dir",
        "experiment_name").
    :param model_name_or_path: Language model / tokenizer to load. In eval-only mode
        (``do_train=False``) the model to evaluate is loaded from this path.
    :param cache_dir: Cache directory handed to the DataSilo.
    :param data_dir: Directory of the data files; also the base for ``labels/<file>`` when
        the config's label list is a file name instead of a list.
    :param save_dir: Not used by this function.
    :param model_dir: Directory for early-stopping saves, the ``final_model`` sub-directory
        and the evaluation result files.
    :param run_name: Suffix of the MLflow run name.
    :param lr: Learning rate passed to ``initialize_optimizer``.
    :param warmup_steps: ``num_warmup_steps`` of the "LinearWarmup" schedule.
    :param balance_classes: When True, class weights are calculated by the DataSilo and
        passed to the prediction head.
    :param embeds_dropout: ``embeds_dropout_prob`` of the model.
    :param early_stopping_mode: Pass "none" to disable early stopping.
    :param do_train: Train a model and save it to ``<model_dir>/final_model``.
    :param do_eval: Evaluate on the test set and log the results to
        ``<model_dir>/eval_results.txt``.
    :param do_hpo: When True, a callback reporting dev score and train loss via
        ``tune.report`` is handed to the trainer.
    :param print_preds: Save the test predictions (only evaluated when ``do_eval``).
    :param print_dev_preds: Additionally evaluate on the dev set and save those
        predictions (only evaluated when ``do_eval``).

    The remaining parameters are forwarded to the tokenizer, processor, data silo,
    optimizer, early stopping or trainer under the corresponding keyword.
    """
    # Load task config (context manager so the file handle is closed again)
    with open(task_config) as config_file:
        task_config = yaml.safe_load(config_file)

    data_config = task_config["data"]
    parsing_config = data_config["parsing"]

    # Create label list from args list or (for large label lists) create from file by splitting by space
    if isinstance(data_config["label_list"], list):
        label_list = data_config["label_list"]
    else:
        with open(Path(data_dir) / 'labels' / data_config["label_list"]) as code_file:
            label_list = code_file.read().split(" ")

    # Register Outcome Metrics
    register_task_metrics(label_list)

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=model_name_or_path,
        tokenizer_class=tokenizer_class,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=data_dir,
        label_list=label_list,
        metric=task_config["metric"],
        multilabel=task_config["multilabel"],
        train_filename=data_config["train_filename"],
        dev_filename=data_config["dev_filename"],
        # None when the config does not define a dev split
        dev_split=data_config.get("dev_split"),
        test_filename=data_config["test_filename"],
        delimiter=parsing_config["delimiter"],
        quote_char=parsing_config["quote_char"],
        label_column_name=parsing_config["label_column"])

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         caching=True,
                         cache_path=Path(cache_dir),
                         batch_size=batch_size)

    if do_train:
        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(
            experiment_name=task_config["experiment_name"],
            run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(model_name_or_path,
                                            language_model_class=model_class)

        # b) and a prediction head on top that is suited for our task

        # Define class weights
        if balance_classes:
            class_weights = data_silo.calculate_class_weights(
                task_name=task_config["task_type"])
        else:
            class_weights = None

        # Create Multi- or Single-Label Classification Heads
        if task_config["multilabel"]:
            prediction_head = MultiLabelTextClassificationHead(
                class_weights=class_weights, num_labels=len(label_list))
        else:
            prediction_head = ExtendedTextClassificationHead(
                class_weights=class_weights, num_labels=len(label_list))

        model = ExtendedAdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=[task_config["output_type"]],
            device=device)

        # 5. Create an optimizer
        schedule_opts = {
            "name": "LinearWarmup",
            "num_warmup_steps": warmup_steps
        }

        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(mode=early_stopping_mode,
                                           min_delta=0.0001,
                                           save_dir=model_dir,
                                           metric=early_stopping_metric,
                                           patience=early_stopping_patience)

        # 7. Feed everything to the Trainer, which takes care of growing our model into powerful plant and evaluates it
        #    from time to time
        trainer = ExtendedTrainer(model=model,
                                  optimizer=optimizer,
                                  data_silo=data_silo,
                                  epochs=epochs,
                                  n_gpu=n_gpu,
                                  lr_schedule=lr_schedule,
                                  evaluate_every=eval_every,
                                  early_stopping=early_stopping,
                                  device=device,
                                  grad_acc_steps=grad_acc_steps,
                                  evaluator_test=do_eval)

        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        final_model_dir = f"{model_dir}/final_model"
        model.save(final_model_dir)
        processor.save(final_model_dir)

    if do_eval:
        # Load newly trained model (from model_dir) or, in eval-only mode, an existing model
        if not do_train:
            model_dir = Path(model_name_or_path)

        logger.info("###### Eval on TEST SET #####")

        evaluator_test = ExtendedEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)

        # Load trained model for evaluation
        model = ExtendedAdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results. The path is built by formatting instead of "+", because model_dir
        # is a Path in eval-only mode and Path + str raises a TypeError.
        utils.log_results(results,
                          dataset_name="test",
                          steps=len(evaluator_test.data_loader),
                          save_path=f"{model_dir}/eval_results.txt")

        if print_preds:
            # Print model test predictions
            utils.save_predictions(results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"])

        if print_dev_preds:
            # Evaluate on dev set, e.g. for threshold tuning
            evaluator_dev = Evaluator(
                data_loader=data_silo.get_data_loader("dev"),
                tasks=data_silo.processor.tasks,
                device=device)
            dev_results = evaluator_dev.eval(model, return_preds_and_labels=True)
            utils.log_results(dev_results,
                              dataset_name="dev",
                              steps=len(evaluator_dev.data_loader),
                              save_path=f"{model_dir}/eval_dev_results.txt")

            # Print model dev predictions
            utils.save_predictions(dev_results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"],
                                   dataset_name="dev")
processor=processor, batch_size=4, gpu=True, # TODO: how to mix for multihead? task_type="classification") basic_texts = [ { "text": "Some text you want to classify" }, { "text": "A second sample" }, ] ret = inferencer.inference_from_dicts(basic_texts) logger.info(f"Result of inference: {ret}") logger.info(f"Evaluating on training set...") evaluator = Evaluator(data_loader=data_silo.get_data_loader("train"), tasks=processor.tasks, device=device) result = evaluator.eval(inferencer.model, return_preds_and_labels=True) evaluator.log_results(result, "Test", steps=len(data_silo.get_data_loader("test"))) inferencer.close_multiprocessing_pool() logger.info("PROCESSING FINISHED")