def build_trainer(
    config,
    lr: float,
    serialization_dir: str,
    num_epochs: int,
    model: Model,
    train_loader: DataLoader,
    dev_loader: DataLoader,
) -> Trainer:
    # Move the model to GPU before building the optimizer so that any
    # optimizer state is created on the same device as the parameters.
    if torch.cuda.is_available():
        model.cuda()

    parameters = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=lr)

    # Optionally wipe the serialization dir left over from a finished
    # experiment, then (re)create it.
    if os.path.exists(serialization_dir) and config.shutil_pre_finished_experiment:
        shutil.rmtree(serialization_dir)
    if not os.path.exists(serialization_dir):
        os.makedirs(serialization_dir)

    trainer = GradientDescentTrainer(
        model=model,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=num_epochs,
        optimizer=optimizer,
        serialization_dir=serialization_dir,
        cuda_device=0 if torch.cuda.is_available() else -1,
    )
    return trainer

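# Usage sketch for build_trainer above (illustrative only): the config
# namespace, paths, and hyperparameters below are assumptions, not values
# from the original code; it presumes an AllenNLP Model and DataLoaders
# have already been constructed elsewhere.
from types import SimpleNamespace

def example_build_and_train(model, train_loader, dev_loader):
    config = SimpleNamespace(shutil_pre_finished_experiment=True)  # hypothetical config object
    trainer = build_trainer(
        config,
        lr=1e-3,
        serialization_dir="checkpoints/demo_run",
        num_epochs=5,
        model=model,
        train_loader=train_loader,
        dev_loader=dev_loader,
    )
    # GradientDescentTrainer.train() runs the full loop and returns a metrics dict.
    return trainer.train()
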
def train(
    model: Model,
    binary_class: str,
    train_data: DatasetType,
    valid_reader: DatasetReader,
    vocab: Vocabulary,
    optimizer_type: str,
    optimizer_learning_rate: float,
    optimizer_weight_decay: float,
    batch_size: int,
    patience: int,
    num_epochs: int,
    device: str,
) -> Tuple[Model, MetricsType]:
    # Wrap the in-memory training data in a reader that emits ELMo character
    # indices; 'tmp.txt' is a placeholder path, since the wrapped dataset
    # supplies the instances.
    train_reader = BIODatasetReader(
        ActiveBIODataset(train_data, dataset_id=0, binary_class=binary_class),
        token_indexers={
            'tokens': ELMoTokenCharactersIndexer(),
        },
    )
    train_dataset = train_reader.read('tmp.txt')
    valid_dataset = valid_reader.read('tmp.txt')

    if device == 'cuda':
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    # NOTE: optimizer_type is currently unused; SGD is hard-coded here.
    optimizer = optim.SGD(
        model.parameters(),
        lr=optimizer_learning_rate,
        weight_decay=optimizer_weight_decay,
    )

    # Bucket batches by sentence length to minimize padding.
    iterator = BucketIterator(
        batch_size=batch_size,
        sorting_keys=[("sentence", "num_tokens")],
    )
    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        patience=patience,
        num_epochs=num_epochs,
        cuda_device=cuda_device,
        # AllenNLP expects a '+'/'-' prefix marking the metric's direction.
        validation_metric='+f1-measure-overall',
    )
    metrics = trainer.train()

    return model, metrics

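# Usage sketch for train() above (illustrative only): the tag class, reader,
# vocabulary, and hyperparameters are all assumptions; it presumes
# `valid_reader` and `vocab` were built elsewhere with the same ELMo token
# indexing as the training side.
def example_active_learning_round(model, labelled_batch, valid_reader, vocab):
    model, metrics = train(
        model=model,
        binary_class='ADR',          # hypothetical tag class
        train_data=labelled_batch,
        valid_reader=valid_reader,
        vocab=vocab,
        optimizer_type='sgd',
        optimizer_learning_rate=0.01,
        optimizer_weight_decay=1e-4,
        batch_size=32,
        patience=5,
        num_epochs=20,
        device='cuda',
    )
    return model, metrics
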
def build_trainer(
    config,
    model: Model,
    train_loader: DataLoader,
    dev_loader: DataLoader,
) -> Trainer:
    parameters = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=config.lr)  # type: ignore

    # Guard the GPU move so the function also runs on CPU-only machines.
    cuda_device = 0 if torch.cuda.is_available() else -1
    if cuda_device >= 0:
        model.cuda()

    trainer = GradientDescentTrainer(
        model=model,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=config.num_epochs,
        optimizer=optimizer,
        cuda_device=cuda_device,
        serialization_dir=config.serialization_dir,
    )
    return trainer

def run(  # type: ignore
    self,
    model: Model,
    dataset: DatasetDict,
    split: str = "validation",
    data_loader: Optional[Lazy[TangoDataLoader]] = None,
) -> EvaluationResult:
    """
    Runs an evaluation on a dataset.

    * `model` is the model we want to evaluate.
    * `dataset` is the dataset we want to evaluate on.
    * `split` is the name of the split we want to evaluate on.
    * `data_loader` gives you the chance to choose a custom dataloader for the
      evaluation. By default this step evaluates on batches of 32 instances each.
    """
    concrete_data_loader: TangoDataLoader
    if data_loader is None:
        concrete_data_loader = BatchSizeDataLoader(
            dataset.splits[split], batch_size=32, shuffle=False)
    else:
        concrete_data_loader = data_loader.construct(
            instances=dataset.splits[split])

    if torch.cuda.device_count() > 0:
        model = model.cuda()
        cuda_device = torch.device(0)
    else:
        cuda_device = torch.device("cpu")

    generator_tqdm = Tqdm.tqdm(iter(concrete_data_loader))

    predictions: List[Dict[str, Any]] = []
    # Number of batches seen so far.
    batch_count = 0
    # Number of batches where the model produces a loss.
    loss_count = 0
    # Cumulative loss.
    total_loss = 0.0

    with torch.inference_mode():
        model.eval()

        # Iterate over the tqdm wrapper so the progress bar actually advances.
        for batch in generator_tqdm:
            batch_count += 1
            batch = move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            metrics = model.get_metrics()

            loss = output_dict.pop("loss", None)
            if loss is not None:
                loss_count += 1
                total_loss += loss.item()
                metrics["loss"] = total_loss / loss_count

            if any(metric_name.startswith("_") for metric_name in metrics):
                self.logger.warning_once(
                    'Metrics with names beginning with "_" will '
                    "not be logged to the tqdm progress bar.")
            description = (", ".join([
                "%s: %.2f" % (name, value)
                for name, value in metrics.items()
                if not name.startswith("_")
            ]) + " ||")
            generator_tqdm.set_description(description, refresh=False)

            output_dict = sanitize(output_dict)

            # This is write-only code, but it's quite fast: transpose the
            # dict of batched outputs into one prediction dict per instance.
            predictions.extend(
                dict(zip(output_dict.keys(), x))
                for x in zip(*output_dict.values()))

    final_metrics = model.get_metrics(reset=True)
    if loss_count > 0:
        # Sanity check
        if loss_count != batch_count:
            raise RuntimeError(
                "The model you are trying to evaluate only sometimes produced a loss!")
        final_metrics["loss"] = total_loss / loss_count

    return self.EvaluationResult(final_metrics, predictions)

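# Usage sketch for the evaluation step above (illustrative only): it assumes
# this `run` method belongs to an AllenNLP Tango `Step` subclass, that `model`
# and `dataset` come from earlier steps, and that EvaluationResult is a
# NamedTuple constructed positionally (metrics first, predictions second),
# as the return statement above suggests.
def example_evaluate(eval_step, model, dataset):
    result = eval_step.run(model=model, dataset=dataset, split="validation")
    print(result[0])        # aggregate metrics, e.g. {"loss": ...}
    print(len(result[1]))   # number of per-instance prediction dicts
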
def _from_params(
        cls,  # type: ignore
        model: Model,
        serialization_dir: str,
        iterator: DataIterator,
        train_data: Iterable[Instance],
        validation_data: Optional[Iterable[Instance]],
        params: Params,
        validation_iterator: DataIterator = None) -> DecompTrainer:
    # pylint: disable=arguments-differ
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)

    validation_data_path = params.pop("validation_data_path", None)
    validation_prediction_path = params.pop("validation_prediction_path", None)

    semantics_only = params.pop("semantics_only", False)
    drop_syntax = params.pop("drop_syntax", True)
    include_attribute_scores = params.pop("include_attribute_scores", False)
    warmup_epochs = params.pop("warmup_epochs", 0)

    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets
        # constructed on the right device.
        model = model.cuda(model_device)

    bert_optim_params = params.pop("bert_optimizer", None)
    bert_name = "_bert_encoder"

    if bert_optim_params is not None:
        tune_after_layer_num = params.pop("bert_tune_layer", 12)
        frozen_regex_str = [r"(_bert_encoder\.bert_model\.embeddings.*)",
                            r"(_bert_encoder\.bert_model\.pooler.*)"]
        tune_regex_str = []
        for i in range(0, 12):
            # match all layers at or above tune_after_layer_num via disjunction
            tune_regex_one = rf"({bert_name}\.bert_model\.encoder\.layer\.{i}\..*)"
            if i >= tune_after_layer_num:
                tune_regex_str.append(tune_regex_one)
            else:
                frozen_regex_str.append(tune_regex_one)
        tune_regex = re.compile("|".join(tune_regex_str))
        frozen_regex = re.compile("|".join(frozen_regex_str))

        # decide which params require grad for which optimizer
        all_names = [n for n, p in model.named_parameters()]
        tune_bert_names = [n for n in all_names if tune_regex.match(n) is not None]
        frozen_names = [n for n in all_names if frozen_regex.match(n) is not None]
        # assert that they're disjoint
        assert len(set(frozen_names) & set(tune_bert_names)) == 0
        # set tunable params to require gradient, frozen ones to not require
        for n, p in model.named_parameters():
            if n in frozen_names:
                p.requires_grad = False
            else:
                p.requires_grad = True

        # extract the BERT parameters for their dedicated optimizer
        bert_params = [[n, p] for n, p in model.named_parameters()
                       if p.requires_grad and n in tune_bert_names]
        # make sure this matches the tunable BERT params
        assert [x[0] for x in bert_params] == tune_bert_names
        bert_optimizer = Optimizer.from_params(bert_params, bert_optim_params)
    else:
        # freeze all BERT params
        tune_bert_names = []
        bert_optimizer = None
        for n, p in model.named_parameters():
            if "_bert_encoder" in n:
                p.requires_grad = False

    # remaining model params go to the main optimizer
    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad and n not in tune_bert_names]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    if "moving_average" in params:
        moving_average = MovingAverage.from_params(
            params.pop("moving_average"), parameters=parameters)
    else:
        moving_average = None

    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(
            optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None

    if 'checkpointer' in params:
        if ('keep_serialized_model_every_num_seconds' in params
                or 'num_serialized_models_to_keep' in params):
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int(
            "num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds)

    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool(
        "should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)
    syntactic_method = params.pop("syntactic_method", None)
    accumulate_batches = params.pop("accumulate_batches", 1)

    params.assert_empty(cls.__name__)
    return cls(model=model,
               optimizer=optimizer,
               bert_optimizer=bert_optimizer,
               iterator=iterator,
               train_dataset=train_data,
               validation_dataset=validation_data,
               validation_data_path=validation_data_path,
               validation_prediction_path=validation_prediction_path,
               semantics_only=semantics_only,
               warmup_epochs=warmup_epochs,
               syntactic_method=syntactic_method,
               drop_syntax=drop_syntax,
               include_attribute_scores=include_attribute_scores,
               patience=patience,
               validation_metric=validation_metric,
               validation_iterator=validation_iterator,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_norm=grad_norm,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               momentum_scheduler=momentum_scheduler,
               checkpointer=checkpointer,
               model_save_interval=model_save_interval,
               summary_interval=summary_interval,
               histogram_interval=histogram_interval,
               should_log_parameter_statistics=should_log_parameter_statistics,
               should_log_learning_rate=should_log_learning_rate,
               log_batch_size_period=log_batch_size_period,
               moving_average=moving_average,
               accumulate_batches=accumulate_batches)

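# Usage sketch for _from_params above (illustrative only): the config keys
# mirror the pops in the method, but the concrete values, the metric name,
# and the model/iterator/dataset construction are assumptions. It presumes
# the method is exposed as a classmethod on DecompTrainer.
def example_trainer_from_params(model, serialization_dir, iterator,
                                train_data, validation_data):
    params = Params({
        "num_epochs": 40,
        "patience": 10,
        "validation_metric": "+s_f1",   # hypothetical metric name
        "cuda_device": 0,
        "optimizer": {"type": "adam", "lr": 1e-3},
        "bert_optimizer": {"type": "adam", "lr": 1e-5},
        "bert_tune_layer": 8,           # tune layers 8-11, freeze the rest
    })
    return DecompTrainer._from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        train_data=train_data,
        validation_data=validation_data,
        params=params,
        validation_iterator=None)
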
def train_model(args, model: Model, train_dataset, valid_dataset,
                test_dataset=None, metric='fscore'):
    output_model_path = args.model_path

    iterator = BucketIterator(sorting_keys=[('text', 'num_tokens')],
                              batch_size=args.batch)
    iterator.index_with(model.vocab)

    # Persist the vocabulary and model options alongside the checkpoints.
    model.vocab.save_to_files(os.path.join(output_model_path, 'vocab'))
    save_model_options(file_path=os.path.join(output_model_path, 'model.option'),
                       options=args)

    optimizer = env_utils.prepare_optimizer(args, model)

    if torch.cuda.is_available():
        cuda_device = args.device
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    logger.info(model)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        patience=args.patience,
        num_epochs=args.epoch,
        cuda_device=cuda_device,
        serialization_dir=output_model_path,
        num_serialized_models_to_keep=1,
        validation_metric='+' + metric,
        learning_rate_scheduler=LearningRateScheduler.from_params(
            optimizer,
            Params({
                'type': 'reduce_on_plateau',
                'patience': args.lr_reduce_patience,
                'verbose': True,
                'factor': args.lr_reduce_factor,
                'mode': 'max',
            })),
        automatic_mixed_precision=args.fp16)

    train_result = trainer.train()
    dump_metrics(os.path.join(output_model_path, 'metrics.json'), train_result)

    valid_result = {
        'loss': train_result['best_validation_loss'],
        'precision': train_result['best_validation_precision'],
        'recall': train_result['best_validation_recall'],
        'fscore': train_result['best_validation_fscore'],
        'accuracy': train_result['best_validation_accuracy'],
    }
    # Keep the argument order aligned with the format string
    # (loss first, then accuracy).
    result_str = "Final Valid Loss: %.4f, Acc: %.2f, P: %.2f, R: %.2f, F1: %.2f" % (
        valid_result['loss'], valid_result['accuracy'],
        valid_result['precision'], valid_result['recall'],
        valid_result['fscore'])
    logger.info(result_str)

    if test_dataset:
        test_result = evaluate(model, test_dataset, iterator,
                               cuda_device=cuda_device, batch_weight_key="")
        result_str = "Final Test Loss: %.4f, Acc: %.2f, P: %.2f, R: %.2f, F1: %.2f" % (
            test_result['loss'], test_result['accuracy'],
            test_result['precision'], test_result['recall'],
            test_result['fscore'])
        logger.info(result_str)

    logger.info("Model Path: %s" % output_model_path)

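# Usage sketch for train_model above (illustrative only): `args` mimics the
# argparse namespace the function reads from, and every value here is an
# assumption; the namespace would also need whatever optimizer fields
# env_utils.prepare_optimizer expects (e.g. a learning rate).
from types import SimpleNamespace

def example_train_model(model, train_dataset, valid_dataset, test_dataset):
    args = SimpleNamespace(
        model_path="output/demo_model",
        batch=32,
        device=0,
        patience=10,
        epoch=50,
        lr_reduce_patience=3,
        lr_reduce_factor=0.5,
        fp16=False,
    )
    train_model(args, model, train_dataset, valid_dataset,
                test_dataset=test_dataset, metric='fscore')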