def log_results(results, dataset_name, steps, logging=True, print=True): # Print a header header = "\n\n" header += BUSH_SEP + "\n" header += "***************************************************\n" header += f"***** EVALUATION | {dataset_name.upper()} SET | AFTER {steps} BATCHES *****\n" header += "***************************************************\n" header += BUSH_SEP + "\n" logger.info(header) for head_num, head in enumerate(results): logger.info("\n _________ {} _________".format(head['task_name'])) for metric_name, metric_val in head.items(): # log with ML framework (e.g. Mlflow) if logging: if isinstance(metric_val, numbers.Number): MlLogger.log_metrics( metrics={ f"{dataset_name}_{metric_name}_{head['task_name']}": metric_val }, step=steps, ) # print via standard python logger if print: if metric_name == "report": if isinstance(metric_val, str) and len(metric_val) > 8000: metric_val = metric_val[:7500] + "\n ............................. \n" + metric_val[-500:] logger.info("{}: \n {}".format(metric_name, metric_val)) else: logger.info("{}: {}".format(metric_name, metric_val))
def _calculate_statistics(self):
    self.counts = {
        "train": len(self.data["train"]),
        "dev": len(self.data["dev"]),
        "test": len(self.data.get("test", [])),
    }

    train_input_numpy = self.data["train"][:][0].numpy()
    seq_lens = np.sum(train_input_numpy != 0, axis=1)
    self.ave_len = np.mean(seq_lens)
    max_seq_len = self.data["train"][:][0].shape[1]
    self.clipped = np.mean(seq_lens == max_seq_len)

    logger.info("Examples in train: {}".format(self.counts["train"]))
    logger.info("Examples in dev  : {}".format(self.counts["dev"]))
    logger.info("Examples in test : {}".format(self.counts["test"]))
    logger.info("")
    logger.info("Max sequence length: {}".format(max(seq_lens)))
    logger.info("Average sequence length: {}".format(self.ave_len))
    logger.info("Proportion clipped: {}".format(self.clipped))

    MlLogger.log_params({
        "n_samples_train": self.counts["train"],
        "n_samples_dev": self.counts["dev"],
        "n_samples_test": self.counts["test"],
        "ave_seq_len": self.ave_len,
        "clipped": self.clipped
    })
def build_task_data(params: Params, data_supplier) -> SeqTagTaskData: dataset_dict: Dict[str, List[TaggedSequence]] = data_supplier() ner_labels = ["[PAD]", NIT] + list( set(tag for taggedseqs in dataset_dict.values() for taggedseq in taggedseqs for tok, tag in taggedseq)) ml_logger = MLFlowLogger(tracking_uri=os.environ["HOME"] + "/data/mlflow_experiments/mlruns") ml_logger.init_experiment(experiment_name="Sequence_Tagging", run_name="Run_ner") lang_model = "bert-base-cased" do_lower_case = False tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) processor = NERProcessor( tokenizer=tokenizer, max_seq_len=128, data_dir=None, # noqa metric="seq_f1", label_list=ner_labels, ) task_data = { "num_labels": len(ner_labels), "lang_model": lang_model, "ml_logger": ml_logger, "processor": processor, "params": params, } return SeqTagTaskData(data=dataset_dict, task_data=task_data)
def logits_to_loss(self, logits, global_step=None, **kwargs): """ Get losses from all prediction heads & reduce to single loss *per sample*. :param logits: logits, can vary in shape and type, depending on task :type logits: object :param global_step: number of current training step :type global_step: int :param kwargs: placeholder for passing generic parameters. Note: Contains the batch (as dict of tensors), when called from Trainer.train(). :type kwargs: object :return loss: torch.tensor that is the per sample loss (len: batch_size) """ all_losses = self.logits_to_loss_per_head(logits, **kwargs) # This aggregates the loss per sample across multiple prediction heads # Default is sum(), but you can configure any fn that takes [Tensor, Tensor ...] and returns [Tensor] # Log the loss per task for i, per_sample_loss in enumerate(all_losses): task_name = self.prediction_heads[i].task_name task_loss = per_sample_loss.mean() MlLogger.log_metrics( { f"train_loss_{task_name}": float(task_loss.detach().cpu().numpy()) }, step=global_step) loss = self.loss_aggregation_fn(all_losses, global_step=global_step, batch=kwargs) return loss
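# A minimal sketch (not from the original source) of a custom `loss_aggregation_fn` as described in the
# comment above: any callable that takes the list of per-head, per-sample loss tensors and returns a single
# per-sample loss tensor. Here the heads are combined with hypothetical fixed weights instead of the default sum.
import torch

def weighted_loss_aggregation(all_losses, global_step=None, batch=None):
    # all_losses: list of tensors, each of shape [batch_size], one entry per prediction head
    weights = [1.0, 0.5]  # hypothetical per-head weights; must match the number of heads you actually use
    stacked = torch.stack([w * l for w, l in zip(weights, all_losses)])
    return stacked.sum(dim=0)  # shape [batch_size], same contract as the default sum aggregation

# usage sketch (assumes an AdaptiveModel-like object exposing this attribute):
# model.loss_aggregation_fn = weighted_loss_aggregation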
def log_params(self):
    """
    Logs parameters to generic logger MlLogger
    """
    params = {
        "lm1_type": self.language_model1.__class__.__name__,
        "lm1_name": self.language_model1.name,
        "lm1_output_types": ",".join(self.lm1_output_types),
        "lm2_type": self.language_model2.__class__.__name__,
        "lm2_name": self.language_model2.name,
        "lm2_output_types": ",".join(self.lm2_output_types),
        "prediction_heads": ",".join(
            [head.__class__.__name__ for head in self.prediction_heads])
    }
    try:
        MlLogger.log_params(params)
    except Exception as e:
        logger.warning(f"ML logging didn't work: {e}")
def main(): config_files = [ "experiments/ner/conll2003_de_config.json", "experiments/ner/germEval14_config.json", "experiments/text_classification/germEval18Fine_config.json", "experiments/text_classification/germEval18Coarse_config.json", "experiments/text_classification/gnad_config.json", "experiments/qa/squad20_config.json", ] for conf_file in config_files: experiments = load_experiments(conf_file) for args in experiments: logger.info( "\n***********************************************" f"\n************* Experiment: {args.task.name} ************" "\n************************************************" ) ml_logger = MLFlowLogger(tracking_uri=args.logging.mlflow_url) ml_logger.init_experiment( experiment_name=args.logging.mlflow_experiment, run_name=args.logging.mlflow_run_name, nested=args.logging.mlflow_nested, ) run_experiment(args)
def backward_propagate(self, loss, step): loss = self.adjust_loss(loss) if self.global_step % 10 == 1: MlLogger.log_metrics( {"Train_loss_total": float(loss.detach().cpu().numpy())}, step=self.global_step, ) if self.use_amp: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if self.log_learning_rate: MlLogger.log_metrics( {"learning_rate": self.lr_schedule.get_lr()[0]}, step=self.global_step) if step % self.grad_acc_steps == 0: # TODO We might wanna add gradient clipping here self.optimizer.step() self.optimizer.zero_grad() if self.lr_schedule: self.lr_schedule.step() return loss
def log_params(self): params = { "epochs": self.epochs, "n_gpu": self.n_gpu, "device": self.device } MlLogger.log_params(params)
def log_results(results, dataset_name, steps, logging=True, print=True): logger.info( "\n***** Evaluation Results on {} data after {} steps *****".format( dataset_name, steps ) ) for head_num, head in enumerate(results): logger.info("\n _________ Prediction Head {} _________".format(head_num)) for metric_name, metric_val in head.items(): # log with ML framework (e.g. Mlflow) if logging: if isinstance(metric_val, numbers.Number): MlLogger.log_metrics( metrics={ f"{dataset_name}_{metric_name}_head{head_num}": metric_val }, step=steps, ) # print via standard python logger if print: if metric_name == "report": if isinstance(metric_val, str) and len(metric_val) > 8000: metric_val = metric_val[:7500] + "\n ............................. \n" + metric_val[-500:] logger.info("{}: \n {}".format(metric_name, metric_val)) else: logger.info("{}: {}".format(metric_name, metric_val))
def _calculate_statistics(self):
    self.counts = {
        "train": len(self.data["train"]),
        "dev": len(self.data["dev"]),
        "test": len(self.data.get("test", [])),
    }

    train_input_numpy = self.data["train"][:][0].numpy()
    seq_lens = np.sum(train_input_numpy != 0, axis=1)
    self.ave_len = np.mean(seq_lens)
    max_seq_len = self.data["train"][:][0].shape[1]
    self.clipped = np.mean(seq_lens == max_seq_len)

    logger.info("Examples in train: {}".format(self.counts["train"]))
    logger.info("Examples in dev  : {}".format(self.counts["dev"]))
    logger.info("Examples in test : {}".format(self.counts["test"]))
    logger.info("")
    logger.info("Max sequence length: {}".format(max(seq_lens)))
    logger.info("Average sequence length: {}".format(self.ave_len))
    logger.info("Proportion clipped: {}".format(self.clipped))
    if self.clipped > 0.5:
        logger.info(
            "[Farmer's Tip] {}% of your samples got cut down to {} tokens. "
            "Consider increasing max_seq_len. "
            "This will lead to higher memory consumption but is likely to "
            "improve your model performance".format(
                round(self.clipped * 100, 1), max_seq_len))

    MlLogger.log_params({
        "n_samples_train": self.counts["train"],
        "n_samples_dev": self.counts["dev"],
        "n_samples_test": self.counts["test"],
        "ave_seq_len": self.ave_len,
        "clipped": self.clipped
    })
def backward_propagate(self, loss, step):
    loss = self.adjust_loss(loss)
    # only log from the main process (local_rank -1 or 0) and only every `log_loss_every` steps
    if self.global_step % self.log_loss_every == 0 and self.local_rank in [-1, 0]:
        MlLogger.log_metrics(
            {"Train_loss_total": float(loss.detach().cpu().numpy())},
            step=self.global_step,
        )
        if self.log_learning_rate:
            MlLogger.log_metrics(
                {"learning_rate": self.lr_schedule.get_last_lr()[0]},
                step=self.global_step)

    if self.use_amp:
        with amp.scale_loss(loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()

    if step % self.grad_acc_steps == 0:
        if self.max_grad_norm is not None:
            if self.use_amp:
                torch.nn.utils.clip_grad_norm_(
                    amp.master_params(self.optimizer), self.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self.max_grad_norm)
        self.optimizer.step()
        self.optimizer.zero_grad()
        if self.lr_schedule:
            self.lr_schedule.step()
    return loss
def doc_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification")
def _get_optim(model, opts):
    """ Get the optimizer based on dictionary with options. Options are passed to the optimizer constructor.

    :param model: model to optimize
    :param opts: config dictionary that will be passed to optimizer together with the params
        (e.g. lr, weight_decay, correct_bias ...). 'no_decay' can be given - parameters containing any of those
        strings will have weight_decay set to 0.
    :return: created optimizer
    """

    optimizer_name = opts.pop('name', None)

    # Logging
    logger.info(f"Loading optimizer `{optimizer_name}`: '{opts}'")
    MlLogger.log_params(opts)
    MlLogger.log_params({"optimizer_name": optimizer_name})

    weight_decay = opts.pop('weight_decay', None)
    no_decay = opts.pop('no_decay', None)

    if no_decay:
        optimizable_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay) and p.requires_grad], **opts},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay) and p.requires_grad],
             'weight_decay': 0.0, **opts}
        ]
    else:
        optimizable_parameters = [{'params': [p for p in model.parameters() if p.requires_grad], **opts}]

    # default weight decay is not the same for all optimizers, so we can't use default value
    # only explicitly add weight decay if it's given
    if weight_decay is not None:
        optimizable_parameters[0]['weight_decay'] = weight_decay

    # Import optimizer by checking in order: torch, transformers, apex and local imports
    try:
        optim_constructor = getattr(import_module('torch.optim'), optimizer_name)
    except AttributeError:
        try:
            optim_constructor = getattr(import_module('transformers.optimization'), optimizer_name)
        except AttributeError:
            try:
                optim_constructor = getattr(import_module('apex.optimizers'), optimizer_name)
            except (AttributeError, ImportError):
                try:
                    # Workaround to allow loading AdamW from transformers
                    # pytorch > 1.2 now also has an AdamW (but without the option to set bias_correction = False,
                    # which is done in the original BERT implementation)
                    optim_constructor = getattr(sys.modules[__name__], optimizer_name)
                except (AttributeError, ImportError):
                    raise AttributeError(
                        f"Optimizer '{optimizer_name}' not found in 'torch', 'transformers', 'apex' or 'local imports'")

    return optim_constructor(optimizable_parameters)
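# A small usage sketch (assumption, not part of the original module): the options dict mirrors what the
# docstring above describes - the optimizer name plus constructor kwargs, with an optional 'no_decay' list.
# The stand-in model and all values here are purely illustrative; MlLogger calls inside _get_optim assume
# the module's logging setup is available.
import torch

example_model = torch.nn.Linear(10, 2)  # stand-in model just for illustration
example_opts = {
    "name": "AdamW",                           # resolved from torch.optim (or transformers) by _get_optim
    "lr": 2e-5,
    "weight_decay": 0.01,
    "no_decay": ["bias", "LayerNorm.weight"],  # these parameter groups get weight_decay = 0.0
}
example_optimizer = _get_optim(example_model, example_opts)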
def _calculate_statistics(self): """ Calculate and log simple summary statistics of the datasets """ self.counts = {} if self.data["train"]: self.counts["train"] = len(self.data["train"]) else: self.counts["train"] = 0 if self.data["dev"]: self.counts["dev"] = len(self.data["dev"]) else: self.counts["dev"] = 0 if self.data["test"]: self.counts["test"] = len(self.data["test"]) else: self.counts["test"] = 0 seq_lens = [] if self.data["train"]: for dataset in self.data["train"].datasets: train_input_numpy = dataset[:][0].numpy() seq_lens.extend(np.sum(train_input_numpy != self.processor.tokenizer.pad_token_id, axis=1)) max_seq_len = dataset[:][0].shape[1] self.clipped = np.mean(np.array(seq_lens) == max_seq_len) if seq_lens else 0 self.ave_len = np.mean(seq_lens) if seq_lens else 0 logger.info("Examples in train: {}".format(self.counts["train"])) logger.info("Examples in dev : {}".format(self.counts["dev"])) logger.info("Examples in test : {}".format(self.counts["test"])) logger.info("") if self.data["train"]: logger.info("Longest sequence length observed after clipping: {}".format(max(seq_lens))) logger.info("Average sequence length after clipping: {}".format(self.ave_len)) logger.info("Proportion clipped: {}".format(self.clipped)) if self.clipped > 0.5: logger.info("[Farmer's Tip] {}% of your samples got cut down to {} tokens. " "Consider increasing max_seq_len. " "This will lead to higher memory consumption but is likely to " "improve your model performance".format(round(self.clipped * 100, 1), max_seq_len)) MlLogger.log_params( { "n_samples_train": self.counts["train"], "n_samples_dev": self.counts["dev"], "n_samples_test": self.counts["test"], "batch_size": self.batch_size, "ave_seq_len": self.ave_len, "clipped": self.clipped, } )
def _log_params(self): params = { "processor": self.__class__.__name__, "tokenizer": self.tokenizer.__class__.__name__, } names = ["max_seq_len", "dev_split"] for name in names: value = getattr(self, name) params.update({name: str(value)}) try: MlLogger.log_params(params) except Exception as e: logger.warning(f"ML logging didn't work: {e}")
def log_results(results, dataset_name, steps, logging=True, print=True, save_path=None, num_fold=None): logger = get_logger(__name__) # Print a header header = "\n\n" header += BUSH_SEP + "\n" header += "***************************************************\n" if num_fold: header += f"***** EVALUATION | FOLD: {num_fold} | {dataset_name.upper()} SET | AFTER {steps} BATCHES *****\n" else: header += f"***** EVALUATION | {dataset_name.upper()} SET | AFTER {steps} BATCHES *****\n" header += "***************************************************\n" header += BUSH_SEP + "\n" logger.info(header) save_log = header for head_num, head in enumerate(results): logger.info("\n _________ {} _________".format(head['task_name'])) for metric_name, metric_val in head.items(): metric_log = None # log with ML framework (e.g. Mlflow) if logging: if not metric_name in ["preds", "probs", "labels"] and not metric_name.startswith("_"): if isinstance(metric_val, numbers.Number): MlLogger.log_metrics( metrics={ f"{dataset_name}_{metric_name}_{head['task_name']}": metric_val }, step=steps, ) # print via standard python logger if print: if metric_name == "report": if isinstance(metric_val, str) and len(metric_val) > 8000: metric_val = metric_val[:7500] + "\n ............................. \n" + metric_val[-500:] metric_log = "{}: \n {}".format(metric_name, metric_val) logger.info(metric_log) else: if not metric_name in ["preds", "probs", "labels"] and not metric_name.startswith("_"): metric_log = "{}: {};".format(metric_name, metric_val) logger.info(metric_log) if save_path and metric_log: save_log += "\n" + metric_log if save_path: with open(save_path, "w", encoding="utf-8") as log_file: log_file.write(save_log)
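# Sketch of the `results` structure this function expects, inferred from how it is accessed above
# (a list with one dict per prediction head); the concrete metric values are made up for illustration.
example_results = [
    {
        "task_name": "text_classification",
        "loss": 0.31,
        "acc": 0.91,
        "f1_macro": 0.88,
        "report": "precision    recall  f1-score ...",  # printed, and truncated if very long
        "preds": [0, 1, 1],    # skipped for metric logging and console output
        "labels": [0, 1, 0],   # skipped as well
    }
]
# logging=False keeps this standalone sketch from requiring an initialized MLflow run
log_results(example_results, dataset_name="dev", steps=500, logging=False, save_path="dev_eval.log")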
def eval_question_similarity(y_true, y_pred, lang, model_name, params, user=None, log_to_mlflow=True, run_name="default"): # basic metrics mean_diff = np.mean(np.abs(y_true - y_pred)) roc_auc = roc_auc_score(y_true, y_pred) f1 = f1_score(y_true, y_pred.round(0)) metrics = {"roc_auc": roc_auc, "mean_abs_diff": mean_diff, "f1_score": f1} print(metrics) # log experiment results to MLFlow (visit https://public-mlflow.deepset.ai/) if log_to_mlflow: params["lang"] = lang params["model_name"] = model_name if user: params["user"] = user ml_logger = MLFlowLogger( tracking_uri="https://public-mlflow.deepset.ai/") ml_logger.init_experiment(experiment_name="COVID-question-sim", run_name=run_name) ml_logger.log_params(params) ml_logger.log_metrics(metrics, step=0)
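# Minimal usage sketch with illustrative values only: binary gold labels vs. predicted similarity scores,
# with MLflow logging switched off so nothing is sent to the public tracking server. Model name and params
# are hypothetical.
import numpy as np

example_y_true = np.array([1, 0, 1, 1, 0])
example_y_pred = np.array([0.92, 0.18, 0.64, 0.71, 0.40])
eval_question_similarity(example_y_true, example_y_pred, lang="en", model_name="my-sim-model",
                         params={"epochs": 3}, log_to_mlflow=False)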
def log_params(self):
    """
    Logs parameters to generic logger MlLogger

    :return: just log into the void
    """
    params = {
        "lm": self.language_model.__class__.__name__,
        "prediction_heads": ",".join(
            [head.__class__.__name__ for head in self.prediction_heads]
        ),
        "lm_output_types": ",".join(self.lm_output_types),
    }
    try:
        MlLogger.log_params(params)
    except Exception as e:
        logger.warning(f"ML logging didn't work: {e}")
def _calculate_statistics(self):
    self.counts = {
        "train": len(self.data["train"]),
        "dev": len(self.data["dev"]),
        "test": len(self.data.get("test", [])),
    }

    logger.info("Examples in train: {}".format(self.counts["train"]))
    logger.info("Examples in dev  : {}".format(self.counts["dev"]))
    logger.info("Examples in test : {}".format(self.counts["test"]))

    MlLogger.log_params({
        "n_samples_train": self.counts["train"],
        "n_samples_dev": self.counts["dev"],
        "n_samples_test": self.counts["test"],
    })
def init_logging(): logger = logging.getLogger(__name__) logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO) # reduce verbosity from transformers library logging.getLogger('transformers').setLevel(logging.WARNING) ml_logger = MLFlowLogger(tracking_uri="logs") return logger, ml_logger
def dataset_from_file(self, file, log_time=True): """ Contains all the functionality to turn a data file into a PyTorch Dataset and a list of tensor names. This is used for training and evaluation. :param file: Name of the file containing the data. :type file: str :return: a Pytorch dataset and a list of tensor names. """ if log_time: a = time.time() self._init_baskets_from_file(file) b = time.time() MlLogger.log_metrics(metrics={"t_from_file": (b - a) / 60}, step=0) self._init_samples_in_baskets() c = time.time() MlLogger.log_metrics(metrics={"t_init_samples": (c - b) / 60}, step=0) self._featurize_samples() d = time.time() MlLogger.log_metrics(metrics={"t_featurize_samples": (d - c) / 60}, step=0) self._log_samples(3) else: self._init_baskets_from_file(file) self._init_samples_in_baskets() self._featurize_samples() self._log_samples(3) dataset, tensor_names = self._create_dataset() return dataset, tensor_names
def get_scheduler(optimizer, opts): """ Get the scheduler based on dictionary with options. Options are passed to the scheduler constructor. :param optimizer: optimizer whose learning rate to control :param opts: dictionary of args to be passed to constructor of schedule :return: created scheduler """ schedule_name = opts.get('name') try: sched_constructor = getattr(import_module('torch.optim.lr_scheduler'), schedule_name) except AttributeError: try: # The method names in transformers became quite long and unhandy. # for convenience we offer usage of shorter alias (e.g. "LinearWarmup") scheduler_translations = { "LinearWarmup": "get_linear_schedule_with_warmup", "ConstantWarmup": "get_constant_schedule_with_warmup", "Constant": "get_constant_schedule", "CosineWarmup": "get_cosine_schedule_with_warmup", "CosineWarmupWithRestarts": "get_cosine_with_hard_restarts_schedule_with_warmup" } if schedule_name in scheduler_translations.keys(): schedule_name = scheduler_translations[schedule_name] # in contrast to torch, we actually get here a method and not a class sched_constructor = getattr( import_module('transformers.optimization'), schedule_name) except AttributeError: raise AttributeError( f"Scheduler '{schedule_name}' not found in 'torch' or 'transformers'" ) logger.info(f"Using scheduler '{schedule_name}'") # get supported args of constructor allowed_args = inspect.signature(sched_constructor).parameters.keys() # convert from warmup proportion to steps if required if 'num_warmup_steps' in allowed_args and 'num_warmup_steps' not in opts and 'warmup_proportion' in opts: opts['num_warmup_steps'] = int(opts["warmup_proportion"] * opts["num_training_steps"]) MlLogger.log_params({"warmup_proportion": opts["warmup_proportion"]}) # only pass args that are supported by the constructor constructor_opts = {k: v for k, v in opts.items() if k in allowed_args} # Logging logger.info(f"Loading schedule `{schedule_name}`: '{constructor_opts}'") MlLogger.log_params(constructor_opts) MlLogger.log_params({"schedule_name": schedule_name}) scheduler = sched_constructor(optimizer, **constructor_opts) scheduler.opts = opts # save the opts with the scheduler to use in load/save return scheduler
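# Usage sketch (assumed values): the alias "LinearWarmup" resolves to transformers'
# get_linear_schedule_with_warmup, and warmup_proportion is converted to num_warmup_steps by the code above.
# The stand-in model/optimizer are only for illustration; the MlLogger calls assume the module's logging setup.
import torch

dummy_model = torch.nn.Linear(4, 2)
dummy_optimizer = torch.optim.SGD(dummy_model.parameters(), lr=0.01)
example_schedule_opts = {
    "name": "LinearWarmup",
    "warmup_proportion": 0.1,      # 10% of training steps used for warmup
    "num_training_steps": 1000,
}
example_lr_schedule = get_scheduler(dummy_optimizer, example_schedule_opts)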
def backward_propagate(self, loss, step): loss = self.adjust_loss(loss) if self.global_step % 10 == 1: MlLogger.log_metrics( {"Train_loss_total": float(loss.detach().cpu().numpy())}, step=self.global_step, ) if self.fp16: self.optimizer.backward(loss) else: loss.backward() if (step + 1) % self.grad_acc_steps == 0: if self.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = self.learning_rate * self.warmup_linear.get_lr( self.global_step, self.warmup_proportion) for param_group in self.optimizer.param_groups: param_group["lr"] = lr_this_step # MlLogger.write_metrics({"learning_rate": lr_this_step}, step=self.global_step) self.optimizer.step() self.optimizer.zero_grad()
def doc_classification( task_config, model_name_or_path, cache_dir, data_dir, save_dir, model_dir, run_name="0", lr=1e-05, warmup_steps=5000, balance_classes=True, embeds_dropout=0.1, epochs=200, # large because we use early stopping by default batch_size=20, grad_acc_steps=1, early_stopping_metric="roc_auc", early_stopping_mode="max", early_stopping_patience=10, model_class="Bert", tokenizer_class="BertTokenizer", do_lower_case=False, do_train=True, do_eval=True, do_hpo=False, print_preds=False, print_dev_preds=False, max_seq_len=512, seed=11, eval_every=500, use_amp=False, use_cuda=True, ): # Load task config task_config = yaml.safe_load(open(task_config)) data_dir = data_dir save_dir = save_dir model_dir = model_dir # Create label list from args list or (for large label lists) create from file by splitting by space if isinstance(task_config["data"]["label_list"], list): label_list = task_config["data"]["label_list"] else: with open(data_dir / 'labels' / task_config["data"]["label_list"]) as code_file: label_list = code_file.read().split(" ") # Register Outcome Metrics register_task_metrics(label_list) # General Settings set_all_seeds(seed=seed) device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp) # 1.Create a tokenizer tokenizer = Tokenizer.load( pretrained_model_name_or_path=model_name_or_path, tokenizer_class=tokenizer_class, do_lower_case=do_lower_case) # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset processor = TextClassificationProcessor( tokenizer=tokenizer, max_seq_len=max_seq_len, data_dir=data_dir, label_list=label_list, metric=task_config["metric"], multilabel=task_config["multilabel"], train_filename=task_config["data"]["train_filename"], dev_filename=task_config["data"]["dev_filename"], dev_split=task_config["data"]["dev_split"] if "dev_split" in task_config["data"] else None, test_filename=task_config["data"]["test_filename"], delimiter=task_config["data"]["parsing"]["delimiter"], quote_char=task_config["data"]["parsing"]["quote_char"], label_column_name=task_config["data"]["parsing"]["label_column"]) # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a # few descriptive statistics of our datasets data_silo = DataSilo(processor=processor, caching=True, cache_path=Path(cache_dir), batch_size=batch_size) if do_train: # Setup MLFlow logger ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"]) ml_logger.init_experiment( experiment_name=task_config["experiment_name"], run_name=f'{task_config["experiment_name"]}_{run_name}') # 4. Create an AdaptiveModel # a) which consists of a pretrained language model as a basis language_model = LanguageModel.load(model_name_or_path, language_model_class=model_class) # b) and a prediction head on top that is suited for our task # Define class weights if balance_classes: class_weights = data_silo.calculate_class_weights( task_name=task_config["task_type"]) else: class_weights = None # Create Multi- or Single-Label Classification Heads if task_config["multilabel"]: prediction_head = MultiLabelTextClassificationHead( class_weights=class_weights, num_labels=len(label_list)) else: prediction_head = ExtendedTextClassificationHead( class_weights=class_weights, num_labels=len(label_list)) model = ExtendedAdaptiveModel( language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=embeds_dropout, lm_output_types=[task_config["output_type"]], device=device) # 5. 
        # Create an optimizer
        schedule_opts = {
            "name": "LinearWarmup",
            "num_warmup_steps": warmup_steps
        }

        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(mode=early_stopping_mode,
                                           min_delta=0.0001,
                                           save_dir=model_dir,
                                           metric=early_stopping_metric,
                                           patience=early_stopping_patience)

        # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful plant
        # and evaluates it from time to time
        trainer = ExtendedTrainer(model=model,
                                  optimizer=optimizer,
                                  data_silo=data_silo,
                                  epochs=epochs,
                                  n_gpu=n_gpu,
                                  lr_schedule=lr_schedule,
                                  evaluate_every=eval_every,
                                  early_stopping=early_stopping,
                                  device=device,
                                  grad_acc_steps=grad_acc_steps,
                                  evaluator_test=do_eval)

        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        model.save(model_dir + "/final_model")
        processor.save(model_dir + "/final_model")

    if do_eval:
        # Load newly trained model or existing model
        if not do_train:
            model_dir = Path(model_name_or_path)

        logger.info("###### Eval on TEST SET #####")

        evaluator_test = ExtendedEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)

        # Load trained model for evaluation
        model = ExtendedAdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results,
                          dataset_name="test",
                          steps=len(evaluator_test.data_loader),
                          save_path=model_dir + "/eval_results.txt")

        if print_preds:
            # Print model test predictions
            utils.save_predictions(results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"])

        if print_dev_preds:
            # Evaluate on dev set, e.g. for threshold tuning
            evaluator_dev = Evaluator(
                data_loader=data_silo.get_data_loader("dev"),
                tasks=data_silo.processor.tasks,
                device=device)
            dev_results = evaluator_dev.eval(model, return_preds_and_labels=True)
            utils.log_results(dev_results,
                              dataset_name="dev",
                              steps=len(evaluator_dev.data_loader),
                              save_path=model_dir + "/eval_dev_results.txt")

            # Print model dev predictions
            utils.save_predictions(dev_results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"],
                                   dataset_name="dev")
def finetune_sentence_level(args): logging.basicConfig( format="%(asctime)s %(levelname)s %(name)s %(message)s", datefmt="%d-%m-%y %H:%M:%S", level=logging.INFO) args.logger = logging.getLogger(__name__) if args.do_logfile: filehandler = logging.FileHandler( os.path.join(args.log_dir, f"{args.run_name}.log")) args.logger.addHandler(filehandler) args.logger.info(vars(args)) # Setup MLFlow ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") ml_logger.init_experiment(experiment_name=args.experiment_name, run_name=args.run_name) set_all_seeds(seed=args.seed) args.device, args.n_gpu = initialize_device_settings(use_cuda=True) # Create a tokenizer tok_class = None if not args.model_class_name else f"{args.model_class_name}Tokenizer" tokenizer = CustomTokenizer.load( pretrained_model_name_or_path=args.model_name, do_lower_case=args.do_lower_case, tokenizer_class=tok_class) # Create a processor for the dataset processor = load_processor(args, tokenizer) # Create a DataSilo that loads several datasets (train/dev/test) # provides DataLoaders and calculates descriptive statistics data_silo = DataSilo(processor=processor, batch_size=args.batch_size) if args.do_feat_embeds: args.feat_size = processor.feat_size # We do cross-validation if args.folds > 1: evaluate_kfold(args, data_silo, processor) else: adapt_model = train_on_split(args, data_silo, processor) evaluator_test = MultitaskEvaluator( data_loader=data_silo.get_data_loader("test"), tasks=data_silo.processor.tasks, device=args.device) result = evaluator_test.eval(adapt_model, return_preds_and_labels=True) evaluator_test.log_results(result, "Test", steps=len( data_silo.get_data_loader("test"))) pred_tsv = pd.DataFrame() args.logger.info("Test results:") for res in result[1:]: args.logger.info(f"__{res['task_name']}__") if args.train_mode == "classification": metrics = classification_metrics(res.get("preds"), res.get("labels")) args.logger.info(metrics) else: metrics = regression_metrics(res.get("preds"), res.get("labels")) for metric in metrics.keys(): args.logger.info(f"{metric}: {metrics[metric]}") if args.save_predictions: pred_tsv[f"{res['task_name']}_preds"] = res.get("preds")[0] pred_tsv[f"{res['task_name']}_labels"] = res.get("labels")[0] if args.save_predictions: save_tsv(pred_tsv, os.path.join(args.out_dir, f"{args.run_name}.tsv")) # Load trained model and perform inference dicts = [ { "text": "The intense interest aroused in the public has now somewhat subsided." }, { "text": "The quick brown fox jumped over the lazy dog." }, ] model = MultitaskInferencer.load(args.save_dir, gpu=True, level="sentence") result = model.inference_from_dicts(dicts=dicts) args.logger.info("Inference example:") args.logger.info(result)
def doc_classification_with_earlystopping():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant
    # and evaluates it from time to time
    # Also create an EarlyStopping instance and pass it on to the trainer
    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
earlystopping = EarlyStopping( metric="f1_offense", mode= "max", # use the metric from our own metrics function instead of loss # metric="f1_macro", mode="max", # use f1_macro from the dev evaluator of the trainer # metric="loss", mode="min", # use loss from the dev evaluator of the trainer save_dir=Path("saved_models/bert-german-doc-tutorial-es" ), # where to save the best model patience= 5 # number of evaluations to wait for improvement before terminating the training ) trainer = Trainer(model=model, optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device, early_stopping=earlystopping) # 7. Let it grow trainer.train() # 8. Hooray! You have a model. # NOTE: if early stopping is used, the best model has been stored already in the directory # defined with the EarlyStopping instance # The model we have at this moment is the model from the last training epoch that was carried # out before early stopping terminated the training save_dir = Path("saved_models/bert-german-doc-tutorial") model.save(save_dir) processor.save(save_dir) # 9. Load it & harvest your fruits (Inference) basic_texts = [ { "text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei" }, { "text": "Martin Müller spielt Handball in Berlin" }, ] # Load from the final epoch directory and apply print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING") model = Inferencer.load(save_dir) result = model.inference_from_dicts(dicts=basic_texts) print(result) model.close_multiprocessing_pool() # Load from saved best model print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING") model = Inferencer.load(earlystopping.save_dir) result = model.inference_from_dicts(dicts=basic_texts) print("APPLICATION ON BEST MODEL") print(result) model.close_multiprocessing_pool()
from farm.data_handler.processor import RegressionProcessor from farm.experiment import initialize_optimizer from farm.infer import Inferencer from farm.modeling.adaptive_model import AdaptiveModel from farm.modeling.language_model import LanguageModel from farm.modeling.prediction_head import RegressionHead from farm.modeling.tokenization import Tokenizer from farm.train import Trainer from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO) ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_regression") ########################## ########## Settings ########################## set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=True) n_epochs = 5 batch_size = 32 evaluate_every = 30 lang_model = "bert-base-cased" # 1.Create a tokenizer tokenizer = Tokenizer.from_pretrained(pretrained_model_name_or_path=lang_model,
from farm.modeling.optimization import initialize_optimizer from farm.infer import Inferencer from farm.modeling.adaptive_model import AdaptiveModel from farm.modeling.language_model import Bert from farm.modeling.prediction_head import TokenClassificationHead from farm.modeling.tokenization import BertTokenizer from farm.train import Trainer from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_minimal_example_ner") ########################## ########## Settings ########################## set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=True) n_epochs = 4 batch_size = 32 evaluate_every = 50 lang_model = "bert-base-german-cased" # 1.Create a tokenizer tokenizer = BertTokenizer.from_pretrained(
def question_answering(): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_question_answering") ########################## ########## Settings ########################## set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=True) batch_size = 24 n_epochs = 2 evaluate_every = 2000 lang_model = "roberta-base" do_lower_case = False # roberta is a cased model train_filename = "train-v2.0.json" dev_filename = "dev-v2.0.json" # 1.Create a tokenizer tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset label_list = ["start_token", "end_token"] metric = "squad" processor = SquadProcessor( tokenizer=tokenizer, max_seq_len=384, label_list=label_list, metric=metric, train_filename=train_filename, dev_filename=dev_filename, test_filename=None, data_dir=Path("../data/squad20"), ) # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets # NOTE: In FARM, the dev set metrics differ from test set metrics in that they are calculated on a token level instead of a word level data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False) # 4. Create an AdaptiveModel # a) which consists of a pretrained language model as a basis language_model = LanguageModel.load(lang_model) # b) and a prediction head on top that is suited for our task => Question Answering prediction_head = QuestionAnsweringHead() model = AdaptiveModel( language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.1, lm_output_types=["per_token"], device=device, ) # 5. Create an optimizer model, optimizer, lr_schedule = initialize_optimizer( model=model, learning_rate=3e-5, schedule_opts={ "name": "LinearWarmup", "warmup_proportion": 0.2 }, n_batches=len(data_silo.loaders["train"]), n_epochs=n_epochs, device=device) # 6. Feed everything to the Trainer, which keeps care of growing our model and evaluates it from time to time trainer = Trainer( model=model, optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device, ) # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai trainer.train() # 8. Hooray! You have a model. Store it: save_dir = Path("../saved_models/bert-english-qa-tutorial") model.save(save_dir) processor.save(save_dir) # 9. Load it & harvest your fruits (Inference) QA_input = [{ "qas": ["Who counted the game among the best ever made?"], "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created." 
}] model = QAInferencer.load(save_dir, batch_size=40, gpu=True) result = model.inference_from_dicts(dicts=QA_input)[0] pprint.pprint(result) # 10. Do Inference on whole SQuAD Dataset & write the predictions file to disk filename = os.path.join(processor.data_dir, processor.dev_filename) result = model.inference_from_file(file=filename, return_json=False) result_squad = [x.to_squad_eval() for x in result] write_squad_predictions(predictions=result_squad, predictions_filename=filename, out_filename="predictions.json")
def question_answering_crossvalidation(): ########################## ########## Logging ########################## logger = logging.getLogger(__name__) logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO) # reduce verbosity from transformers library logging.getLogger('transformers').setLevel(logging.WARNING) #ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") # for local logging instead: ml_logger = MLFlowLogger(tracking_uri="logs") #ml_logger.init_experiment(experiment_name="QA_X-Validation", run_name="Squad_Roberta_Base") ########################## ########## Settings ########################## save_per_fold_results = False # unsupported for now set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=True) lang_model = "deepset/roberta-base-squad2" do_lower_case = False n_epochs = 2 batch_size = 80 learning_rate = 3e-5 data_dir = Path("../data/covidqa") filename = "COVID-QA.json" xval_folds = 5 dev_split = 0 evaluate_every = 0 no_ans_boost = -100 # use large negative values to disable giving "no answer" option accuracy_at = 3 # accuracy at n is useful for answers inside long documents use_amp = None ########################## ########## k fold Cross validation ########################## # 1.Create a tokenizer tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset processor = SquadProcessor( tokenizer=tokenizer, max_seq_len=384, label_list=["start_token", "end_token"], metric="squad", train_filename=filename, dev_filename=None, dev_split=dev_split, test_filename=None, data_dir=data_dir, doc_stride=192, ) # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets data_silo = DataSilo(processor=processor, batch_size=batch_size) # Load one silo for each fold in our cross-validation silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds) # the following steps should be run for each of the folds of the cross validation, so we put them # into a function def train_on_split(silo_to_use, n_fold): logger.info( f"############ Crossvalidation: Fold {n_fold} ############") # fine-tune pre-trained question-answering model model = AdaptiveModel.convert_from_transformers( lang_model, device=device, task_type="question_answering") model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True) # If positive, thjs will boost "No Answer" as prediction. # If negative, this will prevent the model from giving "No Answer" as prediction. model.prediction_heads[0].no_ans_boost = no_ans_boost # Number of predictions the model will make per Question. # The multiple predictions are used for evaluating top n recall. 
model.prediction_heads[0].n_best = accuracy_at # # or train question-answering models from scratch # # Create an AdaptiveModel # # a) which consists of a pretrained language model as a basis # language_model = LanguageModel.load(lang_model) # # b) and a prediction head on top that is suited for our task => Question-answering # prediction_head = QuestionAnsweringHead(no_ans_boost=no_ans_boost, n_best=accuracy_at) # model = AdaptiveModel( # language_model=language_model, # prediction_heads=[prediction_head], # embeds_dropout_prob=0.1, # lm_output_types=["per_token"], # device=device,) # Create an optimizer model, optimizer, lr_schedule = initialize_optimizer( model=model, learning_rate=learning_rate, device=device, n_batches=len(silo_to_use.loaders["train"]), n_epochs=n_epochs, use_amp=use_amp) # Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time # Also create an EarlyStopping instance and pass it on to the trainer trainer = Trainer(model=model, optimizer=optimizer, data_silo=silo_to_use, epochs=n_epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device, evaluator_test=False) # train it trainer.train() return trainer.model # for each fold, run the whole training, then evaluate the model on the test set of each fold # Remember all the results for overall metrics over all predictions of all folds and for averaging all_results = [] all_preds = [] all_labels = [] all_f1 = [] all_em = [] all_topnaccuracy = [] for num_fold, silo in enumerate(silos): model = train_on_split(silo, num_fold) # do eval on test set here (and not in Trainer), # so that we can easily store the actual preds and labels for a "global" eval across all folds. evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"), tasks=silo.processor.tasks, device=device) result = evaluator_test.eval(model, return_preds_and_labels=True) evaluator_test.log_results(result, "Test", logging=False, steps=len(silo.get_data_loader("test")), num_fold=num_fold) all_results.append(result) all_preds.extend(result[0].get("preds")) all_labels.extend(result[0].get("labels")) all_f1.append(result[0]["f1"]) all_em.append(result[0]["EM"]) all_topnaccuracy.append(result[0]["top_n_accuracy"]) # emtpy cache to avoid memory leak and cuda OOM across multiple folds model.cpu() torch.cuda.empty_cache() # Save the per-fold results to json for a separate, more detailed analysis # TODO currently not supported - adjust to QAPred and QACandidate objects # if save_per_fold_results: # def convert_numpy_dtype(obj): # if type(obj).__module__ == "numpy": # return obj.item() # # raise TypeError("Unknown type:", type(obj)) # # with open("qa_xval.results.json", "wt") as fp: # json.dump(all_results, fp, default=convert_numpy_dtype) # calculate overall metrics across all folds xval_score = squad(preds=all_preds, labels=all_labels) logger.info(f"Single EM-Scores: {all_em}") logger.info(f"Single F1-Scores: {all_f1}") logger.info( f"Single top_{accuracy_at}_accuracy Scores: {all_topnaccuracy}") logger.info(f"XVAL EM: {xval_score['EM']}") logger.info(f"XVAL f1: {xval_score['f1']}") logger.info( f"XVAL top_{accuracy_at}_accuracy: {xval_score['top_n_accuracy']}") ml_logger.log_metrics({"XVAL EM": xval_score["EM"]}, 0) ml_logger.log_metrics({"XVAL f1": xval_score["f1"]}, 0) ml_logger.log_metrics( {f"XVAL top_{accuracy_at}_accuracy": xval_score["top_n_accuracy"]}, 0)