def dry_run_from_params(params: Params, serialization_dir: str) -> None: prepare_environment(params) vocab_params = params.pop("vocabulary", {}) os.makedirs(serialization_dir, exist_ok=True) vocab_dir = os.path.join(serialization_dir, "vocabulary") if os.path.isdir(vocab_dir) and os.listdir(vocab_dir): raise ConfigurationError("The 'vocabulary' directory in the provided " "serialization directory is non-empty") all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info( "From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) instances = [ instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation ] vocab = Vocabulary.from_params(vocab_params, instances) dataset = Batch(instances) dataset.index_instances(vocab) dataset.print_statistics() vocab.print_statistics() logger.info(f"writing the vocabulary to {vocab_dir}.") vocab.save_to_files(vocab_dir) model = Model.from_params(vocab=vocab, params=params.pop('model')) trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) stdout_handler = prepare_global_logging(serialization_dir, False) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) cleanup_global_logging(stdout_handler)
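# Usage sketch (config path and output directory are hypothetical): dry_run_from_params expects a
# parsed AllenNLP Params object and a serialization directory whose "vocabulary" subdirectory is
# empty or absent; it builds and saves the vocabulary and logs frozen/tunable parameters without
# running any training.
from allennlp.common.params import Params  # assumed import path
dry_run_from_params(Params.from_file("experiment.jsonnet"), "runs/dry_run")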
def main(args): params = Params.from_file(args.config_path) stdout_handler = prepare_global_logging(args.output_dir, False) prepare_environment(params) reader = DatasetReader.from_params(params["dataset_reader"]) train_dataset = reader.read(params.pop("train_data_path", None)) valid_dataset = reader.read(params.pop("validation_data_path", None)) test_data_path = params.pop("test_data_path", None) if test_data_path: test_dataset = reader.read(test_data_path) vocab = Vocabulary.from_instances(train_dataset + valid_dataset + test_dataset) else: test_dataset = None vocab = Vocabulary.from_instances(train_dataset + valid_dataset) model_params = params.pop("model", None) model = Model.from_params(model_params.duplicate(), vocab=vocab) vocab.save_to_files(os.path.join(args.output_dir, "vocabulary")) # copy config file with open(args.config_path, "r", encoding="utf-8") as f_in: with open(os.path.join(args.output_dir, "config.json"), "w", encoding="utf-8") as f_out: f_out.write(f_in.read()) iterator = DataIterator.from_params(params.pop("iterator", None)) iterator.index_with(vocab) trainer_params = params.pop("trainer", None) trainer = Trainer.from_params(model=model, serialization_dir=args.output_dir, iterator=iterator, train_data=train_dataset, validation_data=valid_dataset, params=trainer_params.duplicate()) trainer.train() # evaluate on the test set if test_dataset: logging.info("Evaluating on the test set") import torch # import here to ensure the republication of the experiment model.load_state_dict( torch.load(os.path.join(args.output_dir, "best.th"))) test_metrics = evaluate(model, test_dataset, iterator, cuda_device=trainer_params.pop( "cuda_device", 0), batch_weight_key=None) logging.info(f"Metrics on the test set: {test_metrics}") with open(os.path.join(args.output_dir, "test_metrics.txt"), "w", encoding="utf-8") as f_out: f_out.write(f"Metrics on the test set: {test_metrics}") cleanup_global_logging(stdout_handler)
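# Assumed command-line interface for main() above (a sketch inferred from the attributes the
# function reads, args.config_path and args.output_dir; the actual flag names are not shown in
# the source).
import argparse
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", help="AllenNLP experiment config (JSON/Jsonnet)")
    parser.add_argument("--output_dir", help="directory for logs, vocabulary, weights and metrics")
    main(parser.parse_args())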
def elmo_command(args): elmo_embedder = ElmoEmbedder(args.options_file, args.weight_file, args.cuda_device) output_format = "" if args.all: output_format = "all" elif args.top: output_format = "top" elif args.average: output_format = "average" prepare_global_logging(os.path.realpath(os.path.dirname(args.output_file)), args.file_friendly_logging) with torch.no_grad(): elmo_embedder.embed_file( args.input_file, args.output_file, output_format, args.batch_size, args.forget_sentences, args.use_sentence_keys)
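# Usage sketch (all file names are hypothetical): elmo_command() only reads attributes off an
# argparse-style namespace, so it can also be driven without a CLI.
import argparse
args = argparse.Namespace(
    options_file="elmo_options.json", weight_file="elmo_weights.hdf5", cuda_device=-1,
    all=False, top=True, average=False,  # exactly one of these selects the output format
    input_file=open("sentences.txt", "r", encoding="utf-8"),  # embed_file typically expects an open handle
    output_file="embeddings.hdf5",
    batch_size=64, forget_sentences=False, use_sentence_keys=False,
    file_friendly_logging=False)
elmo_command(args)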
def train_model(params: Params, serialization_dir: str, selector: str, num_ensemble_models: Optional[int], file_friendly_logging: bool = False, recover: bool = False, force: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover, force) prepare_global_logging(serialization_dir, file_friendly_logging) cuda_device = params.params.get('trainer').get('cuda_device', -1) if isinstance(cuda_device, list): for device in cuda_device: check_for_gpu(device) else: check_for_gpu(cuda_device) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation) ) model_params = params.pop('model') if selector == 'qbc': assert num_ensemble_models is not None models_list = [Model.from_params(vocab=vocab, params=model_params.duplicate()) for i in range(num_ensemble_models)] ensemble_model = CorefEnsemble(models_list) model = ensemble_model.submodels[0] else: model = Model.from_params(vocab=vocab, params=model_params) ensemble_model = None # Initializing the model can have side effect of expanding the vocabulary vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params(validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None held_out_iterator_params = params.pop("held_out_iterator", None) if held_out_iterator_params: held_out_iterator = DataIterator.from_params(held_out_iterator_params) held_out_iterator.index_with(vocab) else: held_out_iterator = None train_data = all_datasets['train'] held_out_train_data = all_datasets.get('held_out_train') validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if 
any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer_choice = trainer_params.pop("type") trainer = ALCorefTrainer.by_name(trainer_choice).from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, train_data=train_data, held_out_train_data=held_out_train_data, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator, held_out_iterator=held_out_iterator, ensemble_model=ensemble_model) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics, query_info = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info("Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) best_model = None logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info("The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0], batch_weight_key="", ) for key, value in test_metrics.items(): metrics["test_" + key] = value return best_model, metrics, query_info
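# Usage sketch (config path, output directory and ensemble size are hypothetical): with
# selector="qbc" the function above builds num_ensemble_models copies of the model config and
# wraps them in CorefEnsemble; any other selector trains a single model and leaves
# ensemble_model as None.
from allennlp.common.params import Params  # assumed import path
best_model, metrics, query_info = train_model(
    Params.from_file("coref_active_learning.jsonnet"), "runs/al_qbc",
    selector="qbc", num_ensemble_models=3)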
def debug_vocab(parameter_filename: str, serialization_dir: str, overrides: str = "", file_friendly_logging: bool = False, recover: bool = False, force: bool = False) -> Model: """ A wrapper around :func:`train_model` which loads the params from a file. Parameters ---------- parameter_filename : ``str`` A json parameter file specifying an AllenNLP experiment. serialization_dir : ``str`` The directory in which to save results and logs. We just pass this along to :func:`train_model`. overrides : ``str`` A JSON string that we will use to override values in the input parameter file. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we make our output more friendly to saved model files. We just pass this along to :func:`train_model`. recover : ``bool`, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. force : ``bool``, optional (default=False) If ``True``, we will overwrite the serialization directory if it already exists. """ # Load the experiment config from a file and pass it to ``train_model``. params = Params.from_file(parameter_filename, overrides) prepare_global_logging(serialization_dir, file_friendly_logging) check_for_gpu(params.get('trainer').get('cuda_device', -1)) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info( "From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) model = Model.from_params(vocab=vocab, params=params.pop('model')) vocab = model.vocab vocab_namespace_dict = vocab._token_to_index vocab_oov_token = vocab._oov_token vocab_non_padded_namespaces = vocab._non_padded_namespaces # this is a set vocab_tokens_dict = vocab_namespace_dict['tokens'] vocab_labels_dict = vocab_namespace_dict['labels'] print() print("Vocab's OOV token: " + vocab_oov_token) print("Non-padded namespaces in vocab: " + str(list(vocab_non_padded_namespaces))) print() print("Number of words in vocab's tokens dict: " + str(len(vocab_tokens_dict))) if any( namespace_match(pattern, 'tokens') for pattern in vocab_non_padded_namespaces): is_padded = False else: is_padded = True print("tokens will return True for is_padded: " + str(is_padded)) print("Vocab's OOV token is in its tokens dict (should be True): " + str(vocab_oov_token in vocab_tokens_dict)) print() print("Number of words in vocab's labels dict: " + str(len(vocab_labels_dict))) if any( namespace_match(pattern, 'labels') for pattern in vocab_non_padded_namespaces): is_padded = False else: is_padded = True print("labels will return True for is_padded: " + str(is_padded)) print("Vocab's OOV token is in its labels dict (should be False): " + str(vocab_oov_token in vocab_labels_dict))
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False, force: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover, force) prepare_global_logging(serialization_dir, file_friendly_logging) cuda_device = params.params.get('trainer').get('cuda_device', -1) if isinstance(cuda_device, list): for device in cuda_device: check_for_gpu(device) else: check_for_gpu(cuda_device) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation) ) model = Model.from_params(vocab=vocab, params=params.pop('model')) ############################################################################################################################################ prev_state_dict = torch.load("/home/ubuntu/Hurricanes/model/bestS.th", map_location='cpu') for n, p in model.named_parameters(): if ( n in prev_state_dict and n != 'linear.weight' and n != 'linear.bias' and n != 'classifier_feedforward._linear_layers.1.weight' and n != 'classifier_feedforward._linear_layers.1.bias' and n != 'classifier_feedforward._linear_layers.0.weight' and n != 'classifier_feedforward._linear_layers.0.bias' ): w = prev_state_dict[n] p.data.copy_(w.data) # Initializing the model can have side effect of expanding the vocabulary vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params(validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in 
no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer_choice = trainer_params.pop_choice("type", Trainer.list_available(), default_to_first_choice=True) trainer = Trainer.by_name(trainer_choice).from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, train_data=train_data, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info("Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info("The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access ) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info("To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) return best_model
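# Note on the freezing logic above (a sketch; "no_grad" and "type" are the keys the code actually
# reads, the regex value is illustrative): which parameters get requires_grad_(False) is decided
# entirely by the trainer section of the experiment config.
trainer_config_sketch = {
    "type": "default",                     # optional; pop_choice falls back to the first available trainer
    "no_grad": ["text_field_embedder.*"],  # hypothetical regex matching parameter names to freeze
}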
is_master_rank = (dist.get_rank() == args.local_rank) serialize_config_file = os.path.join(serialization_dir, CONFIG_NAME) recover = os.path.exists(serialize_config_file) if is_master_rank: if not os.path.exists(serialization_dir): os.makedirs(serialization_dir, exist_ok=True) params = ConstParams.from_file(param_fil) params.to_file(serialize_config_file) dist.barrier() params = ConstParams.from_file(serialize_config_file) log_dir = os.path.join(serialization_dir, str(dist.get_rank())) os.makedirs(log_dir, exist_ok=True) stdout_handler = prepare_global_logging(log_dir, file_friendly_logging=False) prepare_environment(params) cuda_device = params.trainer.get('cuda_device', -1) check_for_gpu(cuda_device) trainer_type = params.trainer.type trainer = TrainerBase.from_params(params, serialization_dir, recover) params_cnt, params_trainable_cnt = count_parameters(trainer.model) print("all params cnt: ", params_cnt) print("all trainable params cnt: ", params_trainable_cnt) metrics = trainer.train() cleanup_global_logging(stdout_handler)
def train_model( params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False, force: bool = False, cache_directory: str = None, cache_prefix: str = None, ) -> Model: create_serialization_dir(params, serialization_dir, recover, force) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) stdout_handler = prepare_global_logging(serialization_dir, file_friendly_logging) prepare_environment(params) cuda_device = params.params.get("trainer").get("cuda_device", -1) check_for_gpu(cuda_device) evaluate_on_test = params.pop_bool("evaluate_on_test", False) trainer_type = params.get("trainer", {}).get("type", "default") if True: # Special logic to instantiate backward-compatible trainer. pieces = TrainerPieces.from_params(params, serialization_dir, recover, cache_directory, cache_prefix) logger.info("Using MultiTrainer") from lm.trainining.MultiTaskTrainer import MultiTaskTrainer # MultiTrainer trainer = MultiTaskTrainer.from_params( model=pieces.model, serialization_dir=serialization_dir, iterator=pieces.iterator, train_data=pieces.train_dataset, validation_data=pieces.validation_dataset, params=pieces.params, validation_iterator=pieces.validation_iterator, ) evaluation_iterator = pieces.validation_iterator or pieces.iterator evaluation_dataset = pieces.test_dataset else: if evaluate_on_test: raise ValueError( "--evaluate-on-test only works with the default Trainer. " "If you're using the CallbackTrainer you can use a callback " "to evaluate at Events.TRAINING_END; otherwise you'll have " "to run allennlp evaluate separately.") """ The only main difference """ print("Using MultuTrainer") logger.info("Using MultiTrainer") trainer = MultiTrainer.from_params(params, serialization_dir, recover, cache_directory, cache_prefix) evaluation_dataset = None params.assert_empty("base train command") try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Evaluate if evaluation_dataset and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( trainer.model, evaluation_dataset, evaluation_iterator, cuda_device=trainer._cuda_devices[0], # TODO(brendanr): Pass in an arg following Joel's trainer refactor. batch_weight_key="", ) for key, value in test_metrics.items(): metrics["test_" + key] = value elif evaluation_dataset: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") cleanup_global_logging(stdout_handler) # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) # We count on the trainer to have the model with best weights return trainer.model
def train_model(params, serialization_dir, file_friendly_logging=False, recover=False, model="bidaf"): """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. """ print("Starting model training...") prepare_environment(params) create_serialization_dir(params, serialization_dir, recover) prepare_global_logging(serialization_dir, file_friendly_logging) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: json.dump(serialization_params, param_file, indent=4) all_datasets = datasets_from_params(params) print("Loaded all of the datasets.") datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") print("creating vocabulary...") logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) if model == "self": model = BiDAFSelfAttention.from_params(vocab, params.pop("model")) else: model = BidirectionalAttentionFlow.from_params(vocab, params.pop("model")) print("Initialized a BiDAF model.") # This is for debugging. print(model) print(serialization_dir) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) print("create iterator") train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') print("initializing a trainer") trainer_params = params.pop("trainer") trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, validation_data, trainer_params) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info("Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) if test_data and evaluate_on_test: test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return model
parser.add_argument("-a", "--alpha", type=float, help="SIA alpha") parser.add_argument("-b", "--beta", type=float, help="SIA alpha") parser.add_argument("-o", "--output_dir", help="SIA alpha", default="results") args = parser.parse_args() # put it at the start of the file logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', level=logging.INFO) logger = logging.getLogger(__name__) create_serialization_dir(Params({"seed": 123}), args.output_dir, recover=False, force=True) stdout_handler = prepare_global_logging(serialization_dir=args.output_dir, file_friendly_logging=False) checkpointer = Checkpointer(args.output_dir, keep_serialized_model_every_num_seconds=3600) tokenizer = BertTokenizer.from_pretrained("bert-base-chinese") class SIAHeadlineDataset(Dataset): def __init__(self, data): super().__init__() self.dataset = data self.CLS = [101] self.SEP = [102] def __len__(self): return len(self.dataset)
def main(): ############################################################################################### prepare_global_logging(serialization_dir=args.serialization_dir, file_friendly_logging=False) #DATA reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(), target_tokenizer=CharacterTokenizer(), source_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')}, target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')}, target=False, label=True, lazy=False) # train_data = reader.read("../../datasets/math/label-data/train-all") # val_data = reader.read("../../datasets/math/label-data/interpolate") val_data = reader.read("./generate_files") vocab = Vocabulary() vocab.add_tokens_to_namespace([START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}'], namespace='tokens') vocab.add_tokens_to_namespace(['algebra', 'arithmetic', 'calculus', 'comparison', 'measurement', 'numbers', 'polynomials', 'probability'], namespace='labels') # MODEL embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM) source_embedder = BasicTextFieldEmbedder({"tokens": embedding}) if args.model == 'lstm': encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, num_layers=NUM_LAYERS, batch_first=True)) elif args.model == 'cnn': encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM, num_filters=NUM_FILTERS, output_dim=HIDDEN_DIM) else: raise NotImplemented("The classifier model should be LSTM or CNN") model = TextClassifier(vocab=vocab, source_text_embedder=source_embedder, encoder=encoder, ) model.to(device) if not Path(args.serialization_dir).exists() or not Path(args.serialization_dir).is_dir(): raise NotImplementedError("The model seems not to exist") with open(Path(args.serialization_dir) / "best.th", "rb") as model_path: model_state = torch.load(model_path, map_location=nn_util.device_mapping(-1)) model.load_state_dict(model_state) model.eval() predictor = TextClassifierPredictor(model, dataset_reader=reader) # TEST correct = 0 total = 0 pbar = tqdm(val_data) batch_instance = list() batch_gt = list() idx_last = 0 for idx, instance in enumerate(pbar): if idx != (idx_last + BATCH_SIZE): batch_instance.append(instance) batch_gt.append(instance.fields["labels"].label) # str else: idx_last = idx outputs = predictor.predict(batch_instance) for i, output in enumerate(outputs): if batch_gt[i] == output['predict_labels']: correct += 1 total += 1 batch_instance = list() batch_gt = list() pbar.set_description("correct/total %.3f" % (correct / total))
def fine_tune_model(model: Model, params: Params, serialization_dir: str, extend_vocab: bool = False, file_friendly_logging: bool = False, batch_weight_key: str = "", embedding_sources_mapping: Dict[str, str] = None, in_fold = None, num_folds = None, ewc_weight=None) -> Model: """ Fine tunes the given model, using a set of parameters that is largely identical to those used for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored, if it is present (as we are already given a ``Model`` here). The main difference between the logic done here and the logic done in ``train_model`` is that here we do not worry about vocabulary construction or creating the model object. Everything else is the same. Parameters ---------- model : ``Model`` A model to fine tune. params : ``Params`` A parameter object specifying an AllenNLP Experiment serialization_dir : ``str`` The directory in which to save results and logs. extend_vocab: ``bool``, optional (default=False) If ``True``, we use the new instances to extend your vocabulary. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. batch_weight_key : ``str``, optional (default="") If non-empty, name of metric used to weight the loss on a per-batch basis. embedding_sources_mapping: ``Dict[str, str]``, optional (default=None) mapping from model paths to the pretrained embedding filepaths used during fine-tuning. """ prepare_environment(params) if os.path.exists(serialization_dir) and os.listdir(serialization_dir): raise ConfigurationError(f"Serialization directory ({serialization_dir}) " f"already exists and is not empty.") os.makedirs(serialization_dir, exist_ok=True) prepare_global_logging(serialization_dir, file_friendly_logging) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: json.dump(serialization_params, param_file, indent=4) if params.pop('model', None): logger.warning("You passed parameters for the model in your configuration file, but we " "are ignoring them, using instead the model parameters in the archive.") vocabulary_params = params.pop('vocabulary', {}) if vocabulary_params.get('directory_path', None): logger.warning("You passed `directory_path` in parameters for the vocabulary in " "your configuration file, but it will be ignored. 
") all_datasets = datasets_from_params(params) vocab = model.vocab if extend_vocab: datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation)) vocab.extend_from_instances(vocabulary_params, (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) model.extend_embedder_vocab(embedding_sources_mapping) trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') dl_params = params.pop("data_loader") if test_data is not None: rand = random.Random(1234) test_data.index_with(vocab) shuffled_test = copy(test_data.instances) rand.shuffle(shuffled_test) extra_test = shuffled_test[:2000] keys = deepcopy(dl_params.as_dict()) keys.update({"dataset": AllennlpDataset(extra_test, vocab)}) extra_test_loader = DataLoader.from_params(params.pop("test_data_loader", keys)) keys = deepcopy(dl_params.as_dict()) keys.update({"dataset": test_data}) test_loader = DataLoader.from_params(params.pop("test_data_loader", keys)) master_model = model global_metrics = {} training_metrics = [] final_metrics = {} master_trainer = trainer_params.as_dict() if num_folds is not None: rand = random.Random(1234) fold_train = [] fold_test = [] fold_train_loader = [] fold_test_loader = [] shuffled_instances = copy(train_data.instances) rand.shuffle(shuffled_instances) kfold = KFold(n_splits=num_folds, random_state=None, shuffle=False) computed_folds = list(kfold.split(shuffled_instances)) for fold in range(num_folds): train_indexes, test_indexes = computed_folds[fold] new_train = [shuffled_instances[i] for i in train_indexes] new_test = [shuffled_instances[i] for i in test_indexes] fold_train.append(AllennlpDataset(new_train, vocab=vocab)) fold_test.append(AllennlpDataset(new_test, vocab=vocab)) keys = deepcopy(dl_params.as_dict()) keys.update({"dataset": fold_test[-1]}) fold_test_loader.append(DataLoader.from_params(params.pop("fold_test_data_loader",keys))) keys = deepcopy(dl_params.as_dict()) keys.update({"dataset": fold_train[-1]}) fold_train_loader.append(DataLoader.from_params(params.pop("fold_train_data_loader", keys))) for fold in ([in_fold] if in_fold is not None else range(num_folds)): fold_model = deepcopy(master_model) eval_epoch_callback = EvalEpochCallback(fold, fold_test_loader[fold], test_loader, global_metrics) callbacks = [eval_epoch_callback] if ewc_weight is not None: ewc = EWC(extra_test_loader) def ewc_forward(*args, **kwargs) -> Dict[str, torch.Tensor]: ewc_loss = 0 if ewc.model.training: ewc_loss = ewc.penalty(ewc.model) ret = ewc.model.old_forward(*args, **kwargs) ret["loss"] += 
ewc_weight * ewc_loss return ret fold_model.old_forward = fold_model.forward fold_model.forward = ewc_forward callbacks.append(CallLossCallback(ewc)) trainer = Trainer.from_params(model=fold_model, serialization_dir=serialization_dir, data_loader=fold_train_loader[fold], train_data=train_data, validation_data=None, params=Params(deepcopy(master_trainer)), validation_data_loader=None, epoch_callbacks=callbacks) training_metrics.append(trainer.train()) del fold_model del trainer del eval_epoch_callback state = glob(serialization_dir+"/*.th") for file in state: logger.info("deleting state - {}".format(file)) os.unlink(file) else: callbacks = [] if ewc_weight is not None: ewc = EWC(extra_test_loader) def ewc_forward(*args, **kwargs) -> Dict[str, torch.Tensor]: ewc_loss = 0 if ewc.model.training: ewc_loss = ewc.penalty(ewc.model) ret = ewc.model.old_forward(*args, **kwargs) ret["loss"] += ewc_weight * ewc_loss return ret model.old_forward = model.forward model.forward = ewc_forward callbacks.append(CallLossCallback(ewc)) keys = deepcopy(dl_params.as_dict()) keys.update({"dataset": train_data}) train_data.index_with(vocab) train_data_loader = DataLoader.from_params(params.pop("train_loader",keys)) if validation_data is not None: validation_data.index_with(vocab) keys = deepcopy(dl_params.as_dict()) keys.update({"dataset": validation_data}) validation_data_loader = DataLoader.from_params(params.pop("validation_loader", keys)) else: validation_data_loader = None if "finetune" in dir(model): model.finetune() logger.info("Fine tuning model") trainer = Trainer.from_params(model=model, serialization_dir=serialization_dir, data_loader=train_data_loader, train_data=train_data, validation_data=None, params=Params(deepcopy(master_trainer)), validation_data_loader=validation_data_loader, epoch_callbacks=callbacks) training_metrics = trainer.train() archive_model(serialization_dir) final_metrics["fine_tune"] = global_metrics final_metrics["training"] = training_metrics metrics_json = json.dumps(final_metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return model
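# Usage sketch (archive path, fold count and EWC weight are hypothetical, and load_archive is
# AllenNLP's archive loader, which is not shown in this snippet): the k-fold loop above only runs
# when num_folds is given, and the EWC penalty is only attached when ewc_weight is given.
from allennlp.models.archival import load_archive  # assumed import path
from allennlp.common.params import Params          # assumed import path
model = load_archive("model.tar.gz").model
fine_tune_model(model, Params.from_file("fine_tune.jsonnet"), "runs/fine_tune",
                num_folds=5, ewc_weight=0.1)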
def main(): ############################################################################################### prepare_global_logging(serialization_dir=args.serialization_dir, file_friendly_logging=False) #DATA reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(), target_tokenizer=CharacterTokenizer(), source_token_indexers={ 'tokens': SingleIdTokenIndexer(namespace='tokens') }, target_token_indexers={ 'tokens': SingleIdTokenIndexer(namespace='tokens') }, target=False, label=True, lazy=True) train_data = reader.read("../../datasets/math/label-data/train-all") # val_data = reader.read("../../datasets/math/label-data/interpolate") vocab = Vocabulary() vocab.add_tokens_to_namespace([ START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}' ], namespace='tokens') vocab.add_tokens_to_namespace([ 'algebra', 'arithmetic', 'calculus', 'comparison', 'measurement', 'numbers', 'polynomials', 'probability' ], namespace='labels') # MODEL embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM) source_embedder = BasicTextFieldEmbedder({"tokens": embedding}) if args.model == 'lstm': encoder = PytorchSeq2VecWrapper( torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, num_layers=NUM_LAYERS, batch_first=True)) elif args.model == 'cnn': encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM, num_filters=NUM_FILTERS, output_dim=HIDDEN_DIM) else: raise NotImplemented("The classifier model should be LSTM or CNN") model = TextClassifier( vocab=vocab, source_text_embedder=source_embedder, encoder=encoder, ) model.to(device) optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.995), eps=1e-6) train_iterator = BucketIterator(batch_size=BATCH_SIZE, max_instances_in_memory=1024, sorting_keys=[("source_tokens", "num_tokens")]) train_iterator = MultiprocessIterator(train_iterator, num_workers=16) train_iterator.index_with(vocab) val_iterator = BucketIterator(batch_size=BATCH_SIZE, max_instances_in_memory=1024, sorting_keys=[("source_tokens", "num_tokens") ]) val_iterator = MultiprocessIterator(val_iterator, num_workers=16) val_iterator.index_with(vocab) #pdb.set_trace() LR_SCHEDULER = {"type": "exponential", "gamma": 0.5, "last_epoch": -1} lr_scheduler = LearningRateScheduler.from_params(optimizer, Params(LR_SCHEDULER)) # TRAIN trainer = Trainer(model=model, optimizer=optimizer, iterator=train_iterator, validation_iterator=None, train_dataset=train_data, validation_dataset=None, patience=None, shuffle=True, num_epochs=1, summary_interval=100, learning_rate_scheduler=lr_scheduler, cuda_device=CUDA_DEVICES, grad_norm=5, grad_clipping=5, model_save_interval=600, serialization_dir=args.serialization_dir, keep_serialized_model_every_num_seconds=3600, should_log_parameter_statistics=True, should_log_learning_rate=True) trainer.train()
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False, del_models: bool = False, del_vocab: bool = False, convert: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. del_models : ``bool``, optional (default=False) If ``True``, we will delete existing models and logs if they already exist. del_vocab : ``bool``, optional (default=False) If ``True``, we will delete existing vocabulary if it already exists. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ cuda_device = params.params.get('trainer').get('cuda_device', -1) if cuda_device >= 0: check_for_gpu(cuda_device) torch.cuda.set_device(cuda_device) # Sometimes we might change the config a bit but still want to continue training # if recover: # create_serialization_dir( # params, serialization_dir, recover, del_models) if del_models: for path in glob(f'{serialization_dir}/*'): if os.path.isfile(path) and not path.endswith('config.yaml'): os.remove(path) log_path = f'{serialization_dir}/log' if os.path.isdir(log_path): shutil.rmtree(log_path) if del_vocab: vocab_path = f'{serialization_dir}/vocabulary' if os.path.isdir(vocab_path): shutil.rmtree(vocab_path) prepare_global_logging(serialization_dir, file_friendly_logging) cuda_device = params.params.get('trainer').get('cuda_device', -1) check_for_gpu(cuda_device) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) evaluate_on_test = params.pop_bool("evaluate_on_test", False) trainer_type = params.get("trainer", {}).get("type", "default") if trainer_type == 'default': # Special logic to instantiate backward-compatible trainer. pieces = TrainerPieces.from_params(params, serialization_dir, recover) # pylint: disable=no-member trainer = Trainer.from_params( model=pieces.model, serialization_dir=serialization_dir, iterator=pieces.iterator, train_data=pieces.corpus.train, validation_data=pieces.corpus.valid, params=pieces.params, validation_iterator=pieces.validation_iterator) evaluation_iterator = pieces.validation_iterator or pieces.iterator evaluation_dataset = pieces.corpus.test batch_weight_key = pieces.batch_weight_key elif trainer_type == 'trainer_fp16_single': params.get("trainer").pop('type') # Special logic to instantiate backward-compatible trainer. 
pieces = TrainerPieces.from_params(params, serialization_dir, recover) # pylint: disable=no-member trainer = TrainerF16SingleTask.from_params( model=pieces.model, serialization_dir=serialization_dir, files_to_archive=params.files_to_archive, iterator=pieces.iterator, train_data=pieces.corpus.train, validation_data=pieces.corpus.valid, params=pieces.params, validation_iterator=pieces.validation_iterator) evaluation_iterator = pieces.validation_iterator or pieces.iterator evaluation_dataset = pieces.corpus.test batch_weight_key = pieces.batch_weight_key else: trainer = TrainerBase.from_params(params, serialization_dir, recover) # TODO(joelgrus): handle evaluation in the general case evaluation_iterator = evaluation_dataset = None params.assert_empty('base train command') if convert: logging.info('In conversion mode.') trainer._save_checkpoint(epoch=0) create_model_archive(serialization_dir, params) sys.exit(0) try: metrics = trainer.train() except (KeyboardInterrupt, RuntimeError): # if we have completed an epoch, try to create a model archive. logging.info("Training stopped. Attempting to create " "a model archive using the current best epoch weights.") create_model_archive(serialization_dir, params) raise # Evaluate if evaluation_dataset and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( trainer.model, evaluation_dataset, evaluation_iterator, cuda_device=trainer._cuda_devices[0], # pylint: disable=protected-access, # TODO(brendanr): Pass in an arg following Joel's trainer refactor. batch_weight_key=batch_weight_key) for key, value in test_metrics.items(): metrics["test_" + key] = value elif evaluation_dataset: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) # We count on the trainer to have the model with best weights return trainer.model
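# Configuration note (a sketch; "trainer_fp16_single" is the literal the code above checks for,
# the other key is illustrative): mixed-precision training is selected purely via the trainer
# type in the experiment config, while everything else flows through TrainerPieces as usual.
fp16_trainer_config_sketch = {
    "type": "trainer_fp16_single",  # routes construction to TrainerF16SingleTask.from_params above
    "cuda_device": 0,               # hypothetical device id
}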
def fine_tune_model(model: Model, params: Params, serialization_dir: str, file_friendly_logging: bool = False) -> Model: """ Fine tunes the given model, using a set of parameters that is largely identical to those used for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored, if it is present (as we are already given a ``Model`` here). The main difference between the logic done here and the logic done in ``train_model`` is that here we do not worry about vocabulary construction or creating the model object. Everything else is the same. Parameters ---------- archive : ``Archive`` A saved model archive that is the result of running the ``train`` command. train_data_path : ``str`` Path to the training data to use for fine-tuning. serialization_dir : ``str`` The directory in which to save results and logs. validation_data_path : ``str``, optional Path to the validation data to use while fine-tuning. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. """ prepare_environment(params) os.makedirs(serialization_dir) prepare_global_logging(serialization_dir, file_friendly_logging) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: json.dump(serialization_params, param_file, indent=4) if params.pop('model', None): logger.warning( "You passed parameters for the model in your configuration file, but we " "are ignoring them, using instead the model parameters in the archive." ) if params.pop('vocabulary', None): logger.warning( "You passed parameters for the vocabulary in your configuration file, but " "we are ignoring them, using instead the vocabulary from the saved model." ) vocab = model.vocab vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) all_datasets = datasets_from_params(params) train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, validation_data, trainer_params) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Fine-tuning interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) if test_data and evaluate_on_test: test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return model
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False, force: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. force : ``bool``, optional (default=False) If ``True``, we will overwrite the serialization directory if it already exists. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover, force) prepare_global_logging(serialization_dir, file_friendly_logging) cuda_device = params.params.get('trainer').get('cuda_device', -1) if isinstance(cuda_device, list): for device in cuda_device: check_for_gpu(device) else: check_for_gpu(cuda_device) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")): vocab = Vocabulary.from_files(os.path.join(serialization_dir, "vocabulary")) else: vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation) ) model = Model.from_params(vocab=vocab, params=params.pop('model')) # Initializing the model can have side effect of expanding the vocabulary vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover) prepare_global_logging(serialization_dir, file_friendly_logging) check_for_gpu(params.get('trainer').get('cuda_device', -1)) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) all_datasets, all_datasets_aux, all_datasets_aux2 = datasets_from_params(params) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) datasets_for_vocab_creation_aux = set(params.pop("auxiliary_datasets_for_vocab_creation", all_datasets_aux)) datasets_for_vocab_creation_aux2 = set(params.pop("auxiliary_datasets_for_vocab_creation_2", all_datasets_aux2)) mixing_ratio = params.pop_float("mixing_ratio") mixing_ratio2 = params.pop_float("mixing_ratio2") cutoff_epoch = params.pop("cutoff_epoch", -1) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab_instances_aux = [ instance for key, dataset in all_datasets_aux.items() for instance in dataset if key in datasets_for_vocab_creation_aux ] vocab_instances_aux.extend([ instance for key, dataset in all_datasets_aux2.items() for instance in dataset if key in datasets_for_vocab_creation_aux2 ]) vocab = VocabularyMultitask.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation), instances_aux=vocab_instances_aux ) model = Model.from_params(vocab=vocab, params=params.pop('model')) # Initializing the model can have side effect of expanding the vocabulary vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) iterator_aux = DataIterator.from_params(params.pop("iterator_aux")) iterator_aux.index_with(vocab) iterator_aux2 = DataIterator.from_params(params.pop("iterator_aux2")) iterator_aux2.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params(validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None # TODO: if validation in multi-task need to add validation iterator as above train_data = all_datasets.get('train') 
validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') train_data_aux = all_datasets_aux.get('train_aux') validation_data_aux = all_datasets_aux.get('validation_aux') test_data_aux = all_datasets_aux.get('test_aux') train_data_aux2 = all_datasets_aux2.get('train_aux') validation_data_aux2 = all_datasets_aux2.get('validation_aux') test_data_aux2 = all_datasets_aux2.get('test_aux') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer = MultiTaskTrainer2.from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, iterator_aux=iterator_aux, iterator_aux2=iterator_aux2, train_data=train_data, train_data_aux=train_data_aux, train_data_aux2=train_data_aux2, mixing_ratio=mixing_ratio, mixing_ratio2=mixing_ratio2, cutoff_epoch=cutoff_epoch, validation_data_aux=validation_data_aux, validation_data_aux2=validation_data_aux2, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator) evaluate_on_test = params.pop_bool("evaluate_on_test", False) evaluate_aux_on_test = params.pop_bool("evaluate_aux_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info("Training interrupted by the user. 
Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info("The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access ) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info("To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") if test_data_aux and evaluate_aux_on_test: # for instance in test_data_aux: # instance.index_fields(vocab) # for instance in test_data_aux2: # instance.index_fields(vocab) test_metrics_aux = evaluate(best_model, test_data_aux, iterator_aux, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access test_metrics_aux2 = evaluate(best_model, test_data_aux2, iterator_aux2, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access for key, value in test_metrics_aux.items(): metrics["test_aux_" + key] = value for key, value in test_metrics_aux2.items(): metrics["test_aux2_" + key] = value elif test_data_aux: logger.info("To evaluate on the auxiliary test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) return best_model
def train_model(params: Params, serialization_dir: str, results_fn: str, file_friendly_logging: bool = False, recover: bool = False, force: bool = False) -> Tuple[Model, Dict[str, Any]]: prepare_environment(params) create_serialization_dir(params, serialization_dir, recover, force) prepare_global_logging(serialization_dir, file_friendly_logging) cuda_device = params.params.get('trainer').get('cuda_device', -1) if isinstance(cuda_device, list): for device in cuda_device: check_for_gpu(device) else: check_for_gpu(cuda_device) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info( "From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) model = Model.from_params(vocab=vocab, params=params.pop('model')) # Initializing the model can have side effect of expanding the vocabulary vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None held_out_iterator_params = params.pop("held_out_iterator", None) if held_out_iterator_params: held_out_iterator = DataIterator.from_params(held_out_iterator_params) held_out_iterator.index_with(vocab) else: held_out_iterator = None train_data = all_datasets['train'] held_out_train_data = all_datasets.get('held_out_train') validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer_choice = trainer_params.pop_choice("type", Trainer.list_available(), default_to_first_choice=True) trainer = Trainer.by_name(trainer_choice).from_params( model=model, serialization_dir=serialization_dir, iterator=iterator, train_data=train_data, held_out_train_data=held_out_train_data, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator, held_out_iterator=held_out_iterator) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. 
Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access ) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") dump_metrics(os.path.join(results_dir, results_fn), metrics, log=True) return best_model, metrics
def modified_train_model(serialization_dir, training_config_filename, cuda_device=-1, file_friendly_logging: bool = False) -> Model: """ Function not currently in use. This is from back when I was trying to keep each successive addition to the model's training in the same serialization directory. Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ model, params, prev_optimizer_params, cur_optimizer_params = \ load_model_from_serialization_dir(serialization_dir, training_config_filename, cuda_device=cuda_device) prepare_environment(params) prepare_global_logging(serialization_dir, file_friendly_logging) cuda_device = params.params.get('trainer').get('cuda_device', -1) if isinstance(cuda_device, list): for device in cuda_device: check_for_gpu(device) else: check_for_gpu(cuda_device) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info( "From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) params.pop('model') iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) list_of_cur_optimizer_param_keys = [ key for key in cur_optimizer_params.as_flat_dict().keys() ] list_of_prev_optimizer_param_keys = [ key for key in prev_optimizer_params.as_flat_dict().keys() ] optimizer_params_match = True for key in list_of_cur_optimizer_param_keys: if key not in list_of_prev_optimizer_param_keys: optimizer_params_match = False break for key in list_of_prev_optimizer_param_keys: if key not in list_of_cur_optimizer_param_keys: optimizer_params_match = False break if not optimizer_params_match: # a list of each p is what will be passed to the optimizer constructor while constructing Trainer-- # adjust if necessary (i.e., 
if we changed optimizers) model_params = [[n, p] for n, p in model.named_parameters() if p.requires_grad] assert "parameter_groups" not in list_of_cur_optimizer_param_keys, \ "Current way of dealing with optimizer change doesn't take parameter groups into account" assert "parameter_groups" not in list_of_prev_optimizer_param_keys, \ "Current way of dealing with optimizer change doesn't take parameter groups into account" for param_tup in model_params: # modify the second element of param_tup in-place (it's a dict) to match the keys specified in # cur_optimizer_params param_dict = param_tup[1] keys_to_del = [] keys_already_in_dict = [] try: for key in param_dict.keys(): if not key in list_of_cur_optimizer_param_keys: keys_to_del.append(key) else: keys_already_in_dict.append(key) for key in keys_to_del: del param_dict[key] for key_to_have in list_of_cur_optimizer_param_keys: if key_to_have != "type" and key_to_have not in keys_already_in_dict: param_dict[key_to_have] = cur_optimizer_params.get( key_to_have) except: pass trainer = Trainer.from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, train_data=train_data, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access ) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) return best_model
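# The optimizer-compatibility check above compares the flattened key sets of the previous
# and current optimizer configs via Params.as_flat_dict(). A small, self-contained
# illustration (the option values are made up):
from allennlp.common import Params

prev_keys = set(Params({"type": "adam", "lr": 0.001}).as_flat_dict().keys())
cur_keys = set(Params({"type": "sgd", "lr": 0.01, "momentum": 0.9}).as_flat_dict().keys())
# cur_keys contains "momentum", which prev_keys does not, so the key sets differ,
# optimizer_params_match becomes False, and the per-parameter option dicts passed to the
# new optimizer are rebuilt from the current config as in the loop above.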
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool`, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover) prepare_global_logging(serialization_dir, file_friendly_logging) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: json.dump(serialization_params, param_file, indent=4) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params(params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) model = Model.from_params(vocab, params.pop('model')) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, validation_data, trainer_params) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info("Training interrupted by the user. 
Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) if test_data and evaluate_on_test: test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info("To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return model
def fine_tune_model(model: Model, params: Params, serialization_dir: str, extend_vocab: bool = False, file_friendly_logging: bool = False, batch_weight_key: str = "") -> Model: """ Fine tunes the given model, using a set of parameters that is largely identical to those used for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored, if it is present (as we are already given a ``Model`` here). The main difference between the logic done here and the logic done in ``train_model`` is that here we do not worry about vocabulary construction or creating the model object. Everything else is the same. Parameters ---------- archive : ``Archive`` A saved model archive that is the result of running the ``train`` command. train_data_path : ``str`` Path to the training data to use for fine-tuning. serialization_dir : ``str`` The directory in which to save results and logs. validation_data_path : ``str``, optional Path to the validation data to use while fine-tuning. extend_vocab: ``bool``, optional (default=False) If ``True``, we use the new instances to extend your vocabulary. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. """ prepare_environment(params) if os.path.exists(serialization_dir) and os.listdir(serialization_dir): raise ConfigurationError( f"Serialization directory ({serialization_dir}) " f"already exists and is not empty.") os.makedirs(serialization_dir, exist_ok=True) prepare_global_logging(serialization_dir, file_friendly_logging) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: json.dump(serialization_params, param_file, indent=4) if params.pop('model', None): logger.warning( "You passed parameters for the model in your configuration file, but we " "are ignoring them, using instead the model parameters in the archive." ) vocabulary_params = params.pop('vocabulary', {}) if vocabulary_params.get('directory_path', None): logger.warning( "You passed `directory_path` in parameters for the vocabulary in " "your configuration file, but it will be ignored. 
") all_datasets = datasets_from_params(params) vocab = model.vocab if extend_vocab: datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation)) vocab.extend_from_instances( vocabulary_params, (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(model.vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer_type = trainer_params.pop("type", "default") if trainer_type == "default": trainer = Trainer.from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, train_data=train_data, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator) else: raise ConfigurationError( "currently fine-tune only works with the default Trainer") evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Fine-tuning interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Evaluate if test_data and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0], # pylint: disable=protected-access, batch_weight_key=batch_weight_key) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return model
#if semi_supervision: params['unlabelled_train_data_path'] = os.path.join( params['data_dir'], 'shuffle' + str(shuffle_id), params['unlabelled_train_data_file']) params['unlabelled_dataset_reader']['start_from'] = train_size params['dataset_reader']['how_many_sentences'] = train_size #params['model']['train_size'] = train_size params['serialization_dir'] = os.path.join( os.path.dirname(os.path.dirname(params['serialization_dir'])), 'shuffle' + str(shuffle_id), 'ts' + str(params['train_size'])) serialization_dir = params['serialization_dir'] training_util.create_serialization_dir(params, serialization_dir, args.recover, args.force) common_util.prepare_global_logging(serialization_dir, True) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) for key in [ 'warmup_epochs', 'unlabelled_train_data_file', 'test_data_file', 'data_dir', 'cuda_device', 'serialization_dir', 'train_data_file', 'validation_data_file', 'constraints_wt', 'train_size', 'shuffle_id', 'semi_supervised', 'which_mixer', 'distributed_lambda_update' ]: params.pop(key, None) #Pdb().set_trace() pieces = gan_trainer_hm.TrainerPiecesForSemi.from_params( params, serialization_dir, args.recover, semi_supervision) # pylint: disable=no-member trainer = Trainer.from_params(
def fine_tune_model(model: Model, params: Params, serialization_dir: str, file_friendly_logging: bool = False) -> Model: """ Fine tunes the given model, using a set of parameters that is largely identical to those used for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored, if it is present (as we are already given a ``Model`` here). The main difference between the logic done here and the logic done in ``train_model`` is that here we do not worry about vocabulary construction or creating the model object. Everything else is the same. Parameters ---------- archive : ``Archive`` A saved model archive that is the result of running the ``train`` command. train_data_path : ``str`` Path to the training data to use for fine-tuning. serialization_dir : ``str`` The directory in which to save results and logs. validation_data_path : ``str``, optional Path to the validation data to use while fine-tuning. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. """ prepare_environment(params) os.makedirs(serialization_dir) prepare_global_logging(serialization_dir, file_friendly_logging) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: json.dump(serialization_params, param_file, indent=4) if params.pop('model', None): logger.warning("You passed parameters for the model in your configuration file, but we " "are ignoring them, using instead the model parameters in the archive.") vocabulary_params = params.pop('vocabulary', {}) if vocabulary_params.get('directory_path', None): logger.warning("You passed `directory_path` in parameters for the vocabulary in " "your configuration file, but it will be ignored. 
" "Vocabulary from the saved model will be extended with current data.") all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation)) vocab = model.vocab vocab.extend_from_instances(vocabulary_params, (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, validation_data, trainer_params) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info("Fine-tuning interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) if test_data and evaluate_on_test: test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info("To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return model
folds = [train_dataset[:ltd // 2], train_dataset[ltd // 2:]] # FIXME: their code trains on each fold before updating either one. Consider doing this also... for big_iter in range(iterations): print(" ---- BIG ITERATION: {} ---- ".format(big_iter)) for i, fold in enumerate(folds): model, optimizer, cuda_device = get_model(pretrained_file, WORD_EMB_DIM, vocab, len(reader.alltags)) iterator = BasicIterator(batch_size=batch_size) iterator.index_with(vocab) ser_dir_iter = serialization_dir + "/iter-{}-{}".format(big_iter, i) prepare_global_logging(ser_dir_iter, False) trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=fold, validation_dataset=validation_dataset, patience=10, num_epochs=45, validation_metric="+f1-measure-overall", cuda_device=cuda_device, num_serialized_models_to_keep=3, serialization_dir=ser_dir_iter) metrics = trainer.train()
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False, force: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover, force) prepare_global_logging(serialization_dir, file_friendly_logging) cuda_device = params.params.get('trainer').get('cuda_device', -1) if isinstance(cuda_device, list): for device in cuda_device: check_for_gpu(device) else: check_for_gpu(cuda_device) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation) ) model = Model.from_params(vocab=vocab, params=params.pop('model')) # Initializing the model can have side effect of expanding the vocabulary vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params(validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer_choice = trainer_params.pop_choice("type", Trainer.list_available(), default_to_first_choice=True) trainer = Trainer.by_name(trainer_choice).from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, 
train_data=train_data, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info("Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info("The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access ) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info("To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) return best_model
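# pop_choice above selects the trainer implementation by its registered name and falls back
# to the first available choice when "type" is omitted ("default" in stock AllenNLP). A
# trainer block that makes the choice explicit might look like this; all other values are
# illustrative:
example_trainer_block = {
    "type": "default",
    "num_epochs": 20,
    "patience": 5,
    "cuda_device": -1,
    "optimizer": {"type": "adam"},
}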
def _train_worker( process_rank: int, params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False, cache_directory: str = None, cache_prefix: str = None, include_package: List[str] = None, node_rank: int = 0, master_addr: str = "127.0.0.1", master_port: int = 29500, world_size: int = 1, distributed_device_ids: List[str] = None, ) -> Optional[Model]: """ Helper to train the configured model/experiment. In distributed mode, this is spawned as a worker process. In a single GPU experiment, this returns the ``Model`` object and in distributed training, nothing is returned. # Parameters process_rank : ``int`` The process index that is initialized using the GPU device id. params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. cache_directory : ``str``, optional For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`. cache_prefix : ``str``, optional For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`. include_package : ``List[str]``, optional In distributed mode, since this function would have been spawned as a separate process, the extra imports need to be done again. NOTE: This does not have any effect in single GPU training. node_rank : ``int``, optional Rank of the node world_size : ``int``, optional The number of processes involved in distributed training. # Returns best_model : ``Model`` The model with the best epoch weights. """ prepare_global_logging(serialization_dir, file_friendly_logging, rank=process_rank, world_size=world_size) prepare_environment(params) distributed = world_size > 1 # not using `allennlp.common.util.is_master` as the process group is yet to be initialized master = process_rank == 0 evaluate_on_test = params.pop_bool("evaluate_on_test", False) if distributed: # Since the worker is spawned and not forked, the extra imports # need to be done again. if include_package is not None: for package_name in include_package: import_submodules(package_name) num_procs_per_node = len(distributed_device_ids) # The Unique identifier of the worker process among all the processes in the # distributed training group is computed here. This is used while initializing # the process group using `init_process_group` global_rank = node_rank * num_procs_per_node + process_rank # In distributed training, the configured device is always going to be a list. # The corresponding gpu id for the particular worker is obtained by picking the id # from the device list with the rank as index gpu_id = distributed_device_ids[process_rank] # type: ignore # Till now, "cuda_device" might not be set in the trainer params. # But a worker trainer needs to only know about its specific GPU id. 
params["trainer"]["cuda_device"] = gpu_id params["trainer"]["world_size"] = world_size params["trainer"]["distributed"] = True torch.cuda.set_device(gpu_id) dist.init_process_group( backend="nccl", init_method=f"tcp://{master_addr}:{master_port}", world_size=world_size, rank=global_rank, ) logging.info(f"Process group of world size {world_size} initialized " f"for distributed training in worker {global_rank}") trainer_type = params.get("trainer", {}).get("type", "default") if trainer_type == "default": # Special logic to instantiate backward-compatible trainer. pieces = TrainerPieces.from_params(params, serialization_dir, recover, cache_directory, cache_prefix) trainer = Trainer.from_params( model=pieces.model, serialization_dir=serialization_dir, iterator=pieces.iterator, train_data=pieces.train_dataset, validation_data=pieces.validation_dataset, params=pieces.params, validation_iterator=pieces.validation_iterator, ) evaluation_iterator = pieces.validation_iterator or pieces.iterator evaluation_dataset = pieces.test_dataset else: if evaluate_on_test: raise ValueError( "--evaluate-on-test only works with the default Trainer. " "If you're using the CallbackTrainer you can use a callback " "to evaluate at Events.TRAINING_END; otherwise you'll have " "to run allennlp evaluate separately.") trainer = TrainerBase.from_params(params, serialization_dir, recover, cache_directory, cache_prefix) evaluation_dataset = None params.assert_empty("base train command") try: if distributed: # let the setup get ready for all the workers dist.barrier() metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if master and os.path.exists( os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise if master: if evaluation_dataset and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( trainer.model, evaluation_dataset, evaluation_iterator, cuda_device=trainer.cuda_device, # TODO(brendanr): Pass in an arg following Joel's trainer refactor. batch_weight_key="", ) for key, value in test_metrics.items(): metrics["test_" + key] = value elif evaluation_dataset: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command." ) dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) if not distributed: return trainer.model return None # to make mypy happy
print("serialization directory exists... ") r = input( "Serialization dir {} exists. Remove? y/n ".format(serialization_dir)) if r == "y": shutil.rmtree(serialization_dir) else: print("Not removing directory") sys.exit() iterator = BasicIterator(batch_size=batch_size) iterator.index_with(vocab) model, optimizer, cuda_device = get_model(pretrained_file, WORD_EMB_DIM, vocab, len(reader.alltags)) prepare_global_logging(serialization_dir, False) trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_dataset, validation_dataset=validation_dataset, patience=10, num_epochs=75, validation_metric="+f1-measure-overall", cuda_device=cuda_device, num_serialized_models_to_keep=3, serialization_dir=serialization_dir) metrics = trainer.train()
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False, force: bool = False, cache_directory: str = None, cache_prefix: str = None) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. force : ``bool``, optional (default=False) If ``True``, we will overwrite the serialization directory if it already exists. cache_directory : ``str``, optional For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`. cache_prefix : ``str``, optional For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ create_serialization_dir(params, serialization_dir, recover, force) stdout_handler = prepare_global_logging(serialization_dir, file_friendly_logging) prepare_environment(params) cuda_device = params.params.get('trainer').get('cuda_device', -1) check_for_gpu(cuda_device) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) evaluate_on_test = params.pop_bool("evaluate_on_test", False) trainer_type = params.get("trainer", {}).get("type", "default") if trainer_type == "default": # Special logic to instantiate backward-compatible trainer. pieces = TrainerPieces.from_params( params, # pylint: disable=no-member serialization_dir, recover, cache_directory, cache_prefix) trainer = Trainer.from_params( model=pieces.model, serialization_dir=serialization_dir, iterator=pieces.iterator, train_data=pieces.train_dataset, validation_data=pieces.validation_dataset, params=pieces.params, validation_iterator=pieces.validation_iterator) evaluation_iterator = pieces.validation_iterator or pieces.iterator evaluation_dataset = pieces.test_dataset else: if evaluate_on_test: raise ValueError( "--evaluate-on-test only works with the default Trainer. " "If you're using the CallbackTrainer you can use a callback " "to evaluate at Events.TRAINING_END; otherwise you'll have " "to run allennlp evaluate separately.") trainer = TrainerBase.from_params(params, serialization_dir, recover, cache_directory, cache_prefix) evaluation_dataset = None params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. 
Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Evaluate if evaluation_dataset and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( trainer.model, evaluation_dataset, evaluation_iterator, cuda_device=trainer._cuda_devices[0], # pylint: disable=protected-access, # TODO(brendanr): Pass in an arg following Joel's trainer refactor. batch_weight_key="") for key, value in test_metrics.items(): metrics["test_" + key] = value elif evaluation_dataset: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") cleanup_global_logging(stdout_handler) # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) # We count on the trainer to have the model with best weights return trainer.model
default=False, help='outputs tqdm status on separate lines and slows tqdm refresh rate') args = parser.parse_args() params = Params.from_file(args.param_path, args.overrides) random_seed, numpy_seed, pytorch_seed = random.randint( 0, 999999999), random.randint(0, 999999999), random.randint(0, 999999999) params["random_seed"] = random_seed params["numpy_seed"] = numpy_seed params["pytorch_seed"] = pytorch_seed prepare_environment(params) from graph_dependency_parser.components.evaluation.predictors import Evaluator, EmptyMRPEvaluator from graph_dependency_parser.train.amtrainer import AMTrainer, TrainerPieces serialization_dir = args.serialization_dir create_serialization_dir(params, serialization_dir, args.recover, args.force) stdout_handler = prepare_global_logging(serialization_dir, args.file_friendly_logging) cuda_device = params.params.get('trainer').get('cuda_device', -1) check_for_gpu(cuda_device) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) evaluate_on_test = params.pop_bool("evaluate_on_test", False) test_evaluators = params.pop("test_evaluators", []) test_evaluators: List[Tuple[str, Evaluator]] = [ (name, Evaluator.from_params(evaluator)) for formalism in test_evaluators for name, evaluator in formalism ] if len(test_evaluators) == 0: logger.warning(
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover) prepare_global_logging(serialization_dir, file_friendly_logging) check_for_gpu(params.params.get('trainer').get('cuda_device', -1)) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: json.dump(serialization_params, param_file, indent=4) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) model = Model.from_params(vocab=vocab, params=params.pop('model')) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, validation_data, trainer_params, validation_iterator=validation_iterator) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to 
create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access ) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return best_model
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False, force: bool = False, debate_mode: List[str] = ('f'), judge_filename: str = None, update_judge: bool = False, eval_mode: bool = False, reward_method: str = None, detach_value_head: bool = False, breakpoint_level: int = 0, search_outputs_path: str = None, accumulation_steps: int = 1, multi_gpu: bool = False, choice_mode: str = None, qa_loss_weight: float = 0., influence_reward: bool = False, theory_of_mind: bool = False, num_pred_rounds: int = -1, x_order_prob: float = 0., require_action: bool = False, single_shot: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. debate_mode : ``List[str]`` List of debate turns (e.g. aa, ar, rr, Ar) => capitalization implies search agent file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. force : ``bool``, optional (default=False) If ``True``, we will overwrite the serialization directory if it already exists. judge_filename : ``str``, optional (default=None) Path to judge config or pre-trained judge model. If config, judge trained during debate. Necessary parameter if running in debate mode. update_judge : ``bool``, optional (default=False) Boolean whether or not to update Judge model during debate training. eval_mode : ``bool``, optional (default=False) Boolean whether or not to run in eval-only mode, on test data. Does not update/train any of the models. reward_method : ``str``, optional (default=False) Choice of reward function (RL) or loss function (Supervised Learning) for training debate agents detach_value_head : ``bool``, optional (default=False) Boolean whether or not to detatch value function gradient updates from the policy network. This prevents value function gradients from affecting policy network parameters. breakpoint_level : ``int`` optional (default=0) Debugging option to set breakpoint sensitivity (0 - no breakpoints). id_to_search_filename : ``str`` optional (default=None) Path to file with search predictions for each agent - necessary for supervised training accumulation_steps : ``int`` (default=1) Number of gradient steps to accumulate over before performing an update. Poor-man's batching for instances where number of examples per batch is small (limited GPU memory) multi_gpu : ``bool`` (default=False) Boolean whether or not to run models/training in model parallel mode. Requires specifying GPU allocations for trainer, judge, and debaters in the training config file (see training_config/bidaf.race.size=0.5.gpu=2.jsonnet for example usage). Returns ------- best_model: ``Model`` The model with the best epoch weights. """ assert ( not single_shot ) or eval_mode, 'Using single shot prediction outside eval_mode not yet supported.' 
assert (not single_shot) or (num_pred_rounds == -1), \ 'Using single shot prediction for a specific number of rounds is not yet supported.' # Get number of debate turns, and assert that not performing judge-only training num_no_qa_turns = sum([(('l' in debate_turn) or ('w' in debate_turn)) for debate_turn in debate_mode]) if (qa_loss_weight > 0) and (num_no_qa_turns == 0): warnings.warn( 'Unused argument qa_loss_weight in debate mode ' + str(debate_mode) + '. If this was unintentional, please remove the -q flag.', UserWarning) not_using_trained_debater = len( set('ablwⅰⅱⅲⅳ').intersection(''.join(debate_mode))) == 0 if (judge_filename is not None) and not_using_trained_debater: warnings.warn( 'Unnecessary to have debaters in debate mode ' + str(debate_mode) + '. If this was unintentional, please remove the -j flag.', UserWarning) prepare_environment(params) create_serialization_dir(params, serialization_dir, recover, force) prepare_global_logging(serialization_dir, file_friendly_logging) # Check that all Desired CUDA Devices exist => trainer => cuda_devices should contain list of required devices cuda_device = params.params.get('trainer').get('cuda_device', -1) check_for_gpu(cuda_device) # Build Allocation Dictionary (to be passed to all future functions) if multi_gpu: gpu_allocations, allocation_dict = params.params.pop( 'gpu_allocations', {}), {} assert len(gpu_allocations ) == 3, 'Must set gpu_allocations in config if multi-gpu' for k in ['debate', 'judge', 'trainer']: assert gpu_allocations[ k] in cuda_device, "Desired GPU not available... current: %s" % str( cuda_device) allocation_dict[k] = gpu_allocations[k] else: allocation_dict = {} params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) evaluate_on_test = params.pop_bool("evaluate_on_test", False) trainer_type = params.get("trainer", {}).get("type", "default") if trainer_type == "default": # Special logic to instantiate backward-compatible trainer. params['dataset_reader'][ 'debate_mode'] = debate_mode # If debate_mode requires sample duplicates pieces = TrainerPieces.from_params(params, serialization_dir, cuda_device, recover, judge_filename=judge_filename, update_judge=update_judge, eval_mode=eval_mode, reward_method=reward_method, detach_value_head=detach_value_head, allocation_dict=allocation_dict, qa_loss_weight=qa_loss_weight, influence_reward=influence_reward, theory_of_mind=theory_of_mind) # pylint: disable=no-member trainer = Trainer.from_params( model=pieces.model, serialization_dir=serialization_dir, debate_mode=debate_mode, iterator=pieces.iterator, train_data=pieces.train_dataset, validation_data=pieces.validation_dataset, params=pieces.params, validation_iterator=pieces.validation_iterator, eval_mode=eval_mode, breakpoint_level=breakpoint_level, search_outputs_path=search_outputs_path, accumulation_steps=accumulation_steps, allocation_dict=allocation_dict, choice_mode=choice_mode, num_pred_rounds=num_pred_rounds, x_order_prob=x_order_prob, require_action=require_action, single_shot=single_shot) evaluation_iterator = pieces.validation_iterator or pieces.iterator evaluation_dataset = pieces.test_dataset else: assert (len(debate_mode) == 1) and (debate_mode[0] == 'f'), 'TrainerBase untested for debate training.' trainer = TrainerBase.from_params(params, serialization_dir, recover) evaluation_iterator = evaluation_dataset = None params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. 
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)) and not eval_mode:
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(trainer.model,
                                evaluation_dataset,
                                evaluation_iterator,
                                cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access
                                batch_weight_key="")
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif evaluation_dataset:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    # Now tar up results
    if not eval_mode:
        archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)
    else:
        dump_metrics(os.path.join(serialization_dir,
                                  "metrics.eval.d=" + '-'.join(debate_mode) + ".json"),
                     metrics, log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
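
# Hedged usage sketch (illustrative, not part of the original source): one way ``train_model``
# might be driven from a script. The config path, output directory, and the helper name
# ``_example_train_debate`` are hypothetical placeholders; only the keyword arguments mirror
# the signature above.
def _example_train_debate() -> Model:
    example_params = Params.from_file("path/to/experiment.jsonnet")  # hypothetical config path
    return train_model(example_params,
                       serialization_dir="tmp/debate_example",       # hypothetical output dir
                       debate_mode=['f'],
                       force=True)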
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    include_package: List[str] = None,
    batch_weight_key: str = "",
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[str] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the ``Model`` object; in distributed
    training, nothing is returned.

    # Parameters

    process_rank : ``int``
        The process index that is initialized using the GPU device id.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    include_package : ``List[str]``, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of the metric used to weight the loss on a per-batch basis.
    node_rank : ``int``, optional
        Rank of the node.
    master_addr : ``str``, optional (default="127.0.0.1")
        Address of the master node for distributed training.
    master_port : ``int``, optional (default=29500)
        Port of the master node for distributed training.
    world_size : ``int``, optional
        The number of processes involved in distributed training.
    distributed_device_ids : ``List[str]``, optional
        IDs of the devices involved in distributed training.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    common_util.prepare_global_logging(serialization_dir,
                                       file_friendly_logging,
                                       rank=process_rank,
                                       world_size=world_size)
    common_util.prepare_environment(params)

    distributed = world_size > 1

    # Not using `allennlp.common.util.is_master` as the process group is yet to be initialized.
    master = process_rank == 0

    include_package = include_package or []

    if distributed:
        # Since the worker is spawned and not forked, the extra imports need to be done again.
        import_plugins()
        for package_name in include_package:
            common_util.import_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group via `init_process_group`.
        global_rank = node_rank * num_procs_per_node + process_rank

        # The number of processes per node is useful to know whether a process
        # is the master within its local node (the node on which it is running).
        os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node)

        # In distributed training, the configured device is always going to be a list.
        # The corresponding GPU id for this particular worker is obtained by picking the id
        # from the device list with the rank as index.
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Until now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
params["trainer"]["cuda_device"] = gpu_id params["trainer"]["world_size"] = world_size params["trainer"]["distributed"] = True torch.cuda.set_device(int(gpu_id)) dist.init_process_group( backend="nccl", init_method=f"tcp://{master_addr}:{master_port}", world_size=world_size, rank=global_rank, ) logging.info(f"Process group of world size {world_size} initialized " f"for distributed training in worker {global_rank}") train_loop = TrainModel.from_params( params=params, serialization_dir=serialization_dir, local_rank=process_rank, batch_weight_key=batch_weight_key, ) try: if distributed: # let the setup get ready for all the workers dist.barrier() metrics = train_loop.run() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if master and os.path.exists( os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir) raise if master: train_loop.finish(metrics) if not distributed: return train_loop.model return None # to make mypy happy
def main(serialization_dir, evaluation_data_file, split, cuda_device, weights_file, overrides):
    archive_file = os.path.join(serialization_dir, "model.tar.gz")
    logging_dir = os.path.join(serialization_dir, "logging")

    if os.path.isfile(archive_file):
        weights_file = None
        archive = load_archive(archive_file, cuda_device, overrides, weights_file)
        config = archive.config
        prepare_environment(config)
        prepare_global_logging(logging_dir, file_friendly_logging=False, file_name=split)
        model = archive.model
    else:
        # Load config
        config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
        prepare_environment(config)
        prepare_global_logging(logging_dir, file_friendly_logging=False, file_name=split)

        if weights_file:
            weights_path = os.path.join(serialization_dir, weights_file)
        else:
            weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

        logger.info("Using weights_file located at : %s", weights_path)
        # Instantiate model. Use a duplicate of the config, as it will get consumed.
        model = Model.load(config.duplicate(),
                           weights_file=weights_path,
                           serialization_dir=serialization_dir,
                           cuda_device=cuda_device)

    # Eval mode ON
    model.eval()

    # Load the evaluation data. Try to use the validation dataset reader if there is one -
    # otherwise fall back to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))

    if evaluation_data_file is None:
        logger.info("--evaluation_data_file not provided. So using --split=%s to read data", split)
        data_path_key = split + '_data_path'
        evaluation_data_path = config.pop(data_path_key)
    else:
        evaluation_data_path = evaluation_data_file

    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)
    logger.info("No. of instances = %d", len(instances))

    iterator = BasicIterator(batch_size=128)
    iterator.index_with(model.vocab)

    # Use the cuda_device passed to this function (the original referenced a module-level
    # ``args`` object here).
    metrics, model_predictions = get_model_predictions(model, instances, iterator, cuda_device)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    write_predictions(serialization_dir=serialization_dir,
                      instances=instances,
                      model_predictions=model_predictions,
                      split=split)

    analyze_gold_data(serialization_dir=serialization_dir, instances=instances, split=split)

    analyze_model_predictions(serialization_dir=serialization_dir,
                              instances=instances,
                              model_predictions=model_predictions,
                              split=split)

    analyze_bio_violations(instances=instances, model_predictions=model_predictions)
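
# Hedged sketch (illustrative, not part of the original source): a minimal argparse entry point
# for the evaluation ``main`` above. The --evaluation_data_file and --split flag names follow the
# log messages inside ``main``; the remaining flag names and defaults are assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Evaluate a trained model and analyze its predictions.")
    parser.add_argument("--serialization_dir", required=True)
    parser.add_argument("--evaluation_data_file", default=None)
    parser.add_argument("--split", default="validation")
    parser.add_argument("--cuda_device", type=int, default=-1)
    parser.add_argument("--weights_file", default=None)
    parser.add_argument("--overrides", default="")
    args = parser.parse_args()

    main(args.serialization_dir, args.evaluation_data_file, args.split,
         args.cuda_device, args.weights_file, args.overrides)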