def from_params(self, params: Params, **extras) -> PytorchSeq2SeqWrapper:
    """
    Build the wrapped module from ``params`` and return it in a ``PytorchSeq2SeqWrapper``.

    The wrapper-level keys ``batch_first``, ``stateful``, ``weight_dropout`` and
    ``variational`` are popped from ``params``; whatever is left is handed to the
    module constructor. When ``weight_dropout`` is positive, the module is built
    through ``weight_drop_factory`` and the names of the ``weight_hh_l*`` parameters
    are passed to it as ``weights``.

    Parameters
    ----------
    params : ``Params``
        Configuration of the module. ``num_layers`` and ``bidirectional`` are only
        read (not popped), so they are still forwarded to the module.
    extras :
        Accepted but not used by this method.

    Returns
    -------
    ``PytorchSeq2SeqWrapper``
        The constructed module, wrapped with the requested ``stateful`` setting.

    Raises
    ------
    ConfigurationError
        If ``batch_first`` is present in ``params`` and is false.
    """
    if not params.pop_bool('batch_first', True):
        raise ConfigurationError(
            "Our encoder semantics assumes batch is always first!")
    if self._module_class in self.PYTORCH_MODELS:
        params['batch_first'] = True

    stateful = params.pop_bool('stateful', False)
    weight_dropout = params.pop_float('weight_dropout', 0.0)
    # ``variational`` is an on/off switch (its default is ``True``), so it is read as
    # a boolean instead of being coerced through ``float``.
    variational = params.pop_bool('variational', True)

    # Peek only: these two keys must stay in ``params`` for the module constructor.
    num_layers = params.get('num_layers', 1)
    bidirectional = params.get('bidirectional', False)

    # One hidden-to-hidden weight per layer, plus its "_reverse" twin when bidirectional.
    suffixes = ["", "_reverse"] if bidirectional else [""]
    all_recurrent_weights = [
        f"weight_hh_l{layer}{suffix}"
        for layer, suffix in product(range(num_layers), suffixes)
    ]

    if weight_dropout > 0.0:
        module = weight_drop_factory(self._module_class)(
            module_args=params.as_dict(infer_type_and_cast=True),
            weights=all_recurrent_weights,
            wdrop=weight_dropout,
            variational=variational,
        )
    else:
        module = self._module_class(**params.as_dict(infer_type_and_cast=True))

    return PytorchSeq2SeqWrapper(module, stateful=stateful)
def from_params(cls, params: Params) -> 'LinearTransformSumReprCombination':
    """
    Construct the combination from ``params``.

    Reads the three input dimensions and the output dimension (each defaulting to
    ``0``) and instantiates the activation named by ``activation`` (default
    ``"linear"``). All keys are read with ``get``, so ``params`` is left untouched.
    """
    dimension_keys = ("tensor_1_dim", "tensor_2_dim", "tensor_3_dim", "output_dim")
    dimensions = [params.get(key, 0) for key in dimension_keys]

    activation_name = params.get("activation", "linear")
    activation = Activation.by_name(activation_name)()

    return cls(*dimensions, activation)
def from_params(cls, params: Params) -> 'WeightedSumReprCombination':
    """
    Construct the combination from ``params``.

    Reads ``keep_context_threshold`` (default ``0.5``), the two input dimensions and
    the output dimension (each defaulting to ``0``), and instantiates the activation
    named by ``activation`` (default ``"linear"``). Nothing is popped from ``params``.
    """
    keep_context_threshold = params.get("keep_context_threshold", 0.5)
    tensor1_dim, tensor2_dim, output_dim = (
        params.get(key, 0) for key in ("tensor1_dim", "tensor2_dim", "output_dim")
    )

    activation_name = params.get("activation", "linear")
    activation = Activation.by_name(activation_name)()

    return cls(tensor1_dim,
               tensor2_dim,
               output_dim,
               keep_context_threshold,
               activation)
def create_or_extend_vocab(
        params: Params,
        datasets: Dict[str, Dict[str, Iterable[Instance]]],
        vocabulary_params: Params,
        vocabulary_path: str,
        vocab: Vocabulary = None,
        recover: bool = False,
) -> Vocabulary:
    """
    Extend ``vocab`` with dataset instances, or build/load a vocabulary if none is given.

    ``datasets_for_vocab_creation`` is popped from ``params`` (defaulting to every key
    of ``datasets``) and selects which datasets contribute instances.

    * If ``vocab`` is truthy, it is extended in place with those instances and returned.
    * Otherwise, if ``recover`` is set and ``vocabulary_path`` exists, the vocabulary
      is loaded from that path.
    * Otherwise it is created with ``Vocabulary.from_params``.

    Raises
    ------
    ConfigurationError
        If a requested dataset key is not present in ``datasets``.
    """
    selected_keys = set(params.pop("datasets_for_vocab_creation", datasets))

    for key in selected_keys:
        if key not in datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {key}")

    datasets = {key: datasets[key] for key in datasets if key in selected_keys}
    flat_datasets = training_util.as_flat_dict(datasets)

    # Kept lazy on purpose: the datasets are only iterated if the vocabulary is
    # actually built or extended from instances.
    instance_generator = (instance
                          for dataset in flat_datasets.values()
                          for instance in dataset)

    dataset_keys_to_use_str = ", ".join(selected_keys)

    if vocab:
        logger.info(
            f"Extending model vocabulary using {dataset_keys_to_use_str} data."
        )
        vocab.extend_from_instances(instances=instance_generator)
        return vocab

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        dataset_keys_to_use_str,
    )

    if recover and os.path.exists(vocabulary_path):
        return Vocabulary.from_files(
            vocabulary_path,
            vocabulary_params.get("padding_token", None),
            vocabulary_params.get("oov_token", None),
        )

    return Vocabulary.from_params(vocabulary_params,
                                  instances=instance_generator)
def from_params(cls,  # type: ignore
                params: Params,
                serialization_dir: str,
                recover: bool = False):
    # pylint: disable=arguments-differ
    """
    Instantiate the trainer selected by ``params["trainer"]["type"]``.

    The ``type`` key is popped from the ``trainer`` sub-config (default ``"default"``).

    * ``"default"``: the pieces are assembled with ``TrainerPieces.from_params`` and
      passed to ``Trainer.from_params``.
    * anything else: the class registered under that name builds itself through its
      own ``from_params``.

    Parameters
    ----------
    params : ``Params``
        The full experiment configuration.
    serialization_dir : ``str``
        Forwarded to the trainer being built.
    recover : ``bool``, optional (default=False)
        Forwarded to ``TrainerPieces.from_params`` / the subclass ``from_params``.

    Raises
    ------
    AssertionError
        If the registered class does not override ``from_params``.
    """
    typ3 = params.get("trainer", {}).pop("type", "default")

    if typ3 == "default":
        # Special logic to keep old from_params behavior.
        from allennlp.training.trainer import Trainer, TrainerPieces

        pieces = TrainerPieces.from_params(params, serialization_dir, recover)  # pylint: disable=no-member
        return Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator)

    klass = TrainerBase.by_name(typ3)
    # Explicit check to prevent re-entering this very method: a subclass that merely
    # inherits `from_params` would be dispatched straight back here, with the "type"
    # key already consumed.
    is_overridden = klass.from_params.__func__ != TrainerBase.from_params.__func__  # type: ignore
    assert is_overridden, f"Class {klass.__name__} must override `from_params`."
    return klass.from_params(params, serialization_dir, recover)
def from_params(cls, params: Params) -> 'ArcMultiChoiceJsonReader':
    """
    Build an ``ArcMultiChoiceJsonReader`` from ``params``.

    ``tokenizer``, ``token_indexers`` and ``lazy`` are popped from ``params``;
    ``choice_value_type`` and ``question_value_type`` are only read.
    """
    # Dict entries are evaluated top to bottom, i.e. in the order the keys are consumed.
    reader_kwargs = {
        "tokenizer": Tokenizer.from_params(params.pop('tokenizer', {})),
        "token_indexers": token_indexer_dict_from_params(
            params.pop('token_indexers', {})),
        "choice_value_type": params.get('choice_value_type', None),
        "question_value_type": params.get('question_value_type', None),
        "lazy": params.pop('lazy', False),
    }
    return ArcMultiChoiceJsonReader(**reader_kwargs)
def __init__(self,
             model: Model,
             train_dataset: List[Instance],
             iterator: DataIterator,
             subtrainer_params: Params,
             cross_validation_splitter: CrossValidationSplitter,
             serialization_dir: str,
             group_key: Optional[str] = None,
             leave_model_trained: bool = False,
             validation_dataset: Optional[List[Instance]] = None,
             recover: bool = False) -> None:
    """
    Store everything needed to run the sub-trainers.

    The base class is initialised with ``serialization_dir`` and with the CUDA device
    taken from ``subtrainer_params`` (default ``-1``); all other arguments are kept
    as attributes of the same name.
    """
    # FIXME: does recover make sense? Maybe to continue the CV.

    # To use the same device as the subtrainers, in case `self._cuda_devices` is queried.
    subtrainer_device = subtrainer_params.get('cuda_device', -1)
    super().__init__(serialization_dir,
                     cuda_device=parse_cuda_device(subtrainer_device))

    # What is trained, and on which data.
    self.model = model
    self.train_dataset = train_dataset
    self.validation_dataset = validation_dataset
    self.iterator = iterator

    # How the training is organised.
    self.subtrainer_params = subtrainer_params
    self.cross_validation_splitter = cross_validation_splitter
    self.group_key = group_key
    self.leave_model_trained = leave_model_trained
    self.recover = recover
def from_params(cls,  # type: ignore
                params: Params,
                serialization_dir: str,
                recover: bool = False):
    # pylint: disable=arguments-differ
    """
    Instantiate the trainer selected by ``params["trainer"]["type"]``.

    The ``type`` key is popped from the ``trainer`` sub-config (default ``"default"``).
    A non-default type is resolved through the registry and must provide its own
    ``from_params``; the default type is assembled from ``TrainerPieces``.
    """
    trainer_type = params.get("trainer", {}).pop("type", "default")

    if trainer_type != "default":
        klass = TrainerBase.by_name(trainer_type)
        # Explicit check to prevent recursion.
        overrides_from_params = klass.from_params.__func__ != TrainerBase.from_params.__func__  # type: ignore
        assert overrides_from_params, f"Class {klass.__name__} must override `from_params`."
        return klass.from_params(params, serialization_dir, recover)

    # Special logic to keep old from_params behavior.
    from allennlp.training.trainer import Trainer
    from allennlp.training.trainer_pieces import TrainerPieces

    pieces = TrainerPieces.from_params(params, serialization_dir, recover)  # pylint: disable=no-member
    trainer_kwargs = dict(
        model=pieces.model,
        serialization_dir=serialization_dir,
        iterator=pieces.iterator,
        train_data=pieces.train_dataset,
        validation_data=pieces.validation_dataset,
        params=pieces.params,
        validation_iterator=pieces.validation_iterator,
    )
    return Trainer.from_params(**trainer_kwargs)
def _load(cls,
          config: Params,
          serialization_dir: str,
          weights_file: Optional[str] = None,
          cuda_device: int = -1,
          opt_level: Optional[str] = None) -> Model:
    """
    Ensembles don't have vocabularies or weights of their own, so they override _load.

    The model is built from the ``model`` section of ``config`` with ``vocab=None``
    and then moved to the requested device. ``serialization_dir`` and
    ``weights_file`` are not used here.

    Raises
    ------
    NotImplementedError
        If ``opt_level`` is given.
    """
    if opt_level is not None:
        raise NotImplementedError(f"{cls.__name__} does not support AMP yet.")

    model_params = config.get("model")

    # The config describes how to *train* the model, including where pretrained
    # weights come from. We are *loading* now, so those entries are stripped to keep
    # the code from looking for files it no longer needs.
    remove_weights_related_keys_from_params(model_params)
    model = Model.from_params(vocab=None, params=model_params)

    # Force the model onto cpu or gpu, as appropriate, so that the embeddings are in
    # sync with the weights.
    if cuda_device < 0:
        model.cpu()
    else:
        model.cuda(cuda_device)

    return model
def from_params(cls, vocab: Vocabulary, params: Params) -> 'QAMultiChoice_OneVsRest_Choices_v1':
    """
    Build the model from ``params``.

    Pops the embedder, dropout, encoder, aggregation and initializer settings from
    ``params`` and passes the remaining ``params`` object on to the constructor.
    When ``share_encoders`` is true the question encoder and its aggregation are
    reused for the choices, and the ``choice_encoder*`` keys are left unconsumed.
    """
    text_field_embedder = TextFieldEmbedder.from_params(
        vocab, params.pop("text_field_embedder"))
    embeddings_dropout_value = params.pop("embeddings_dropout", 0.0)

    # Question encoder settings.
    question_encoder_config = params.pop("question_encoder", None)
    aggregate_question = params.pop("question_encoder_aggregate", "max")
    share_encoders = params.pop("share_encoders", False)

    # Condition the choices or facts encoding on question output states.
    choices_init_from_question_states = params.pop(
        "choices_init_from_question_states", False)

    question_encoder = None
    if question_encoder_config is not None:
        question_encoder = Seq2SeqEncoder.from_params(question_encoder_config)

    if share_encoders:
        choice_encoder, aggregate_choice = question_encoder, aggregate_question
    else:
        # Dedicated choice encoder.
        choice_encoder_config = params.pop("choice_encoder", None)
        aggregate_choice = params.pop("choice_encoder_aggregate", "max")
        choice_encoder = None
        if choice_encoder_config is not None:
            choice_encoder = Seq2SeqEncoder.from_params(choice_encoder_config)

    use_choice_sum_instead_of_question = params.get(
        "use_choice_sum_instead_of_question", False)

    initializer_config = params.pop('initializer', None)
    if initializer_config is None:
        initializer = InitializerApplicator()
    else:
        initializer = InitializerApplicator.from_params(initializer_config)

    return cls(
        vocab=vocab,
        text_field_embedder=text_field_embedder,
        question_encoder=question_encoder,
        choice_encoder=choice_encoder,
        initializer=initializer,
        aggregate_choice=aggregate_choice,
        aggregate_question=aggregate_question,
        embeddings_dropout_value=embeddings_dropout_value,
        share_encoders=share_encoders,
        choices_init_from_question_states=choices_init_from_question_states,
        use_choice_sum_instead_of_question=use_choice_sum_instead_of_question,
        params=params)
def __init__(self, vocab_size, params: Params):
    """
    Create the character embedding table, the CNN and the highway network.

    ``char_emb_size`` and ``dropout_rate`` are read from ``params``; the CNN receives
    the ``params`` object itself, and the highway network receives a fresh ``Params``
    holding ``dropout_rate`` and ``input_size`` (set to the embedding size).
    """
    super().__init__()

    char_emb_size = params.get("char_emb_size")
    dropout_rate = params.get("dropout_rate")

    self.vocab_size = vocab_size
    self.char_emb_size = char_emb_size
    self.dropout_rate = dropout_rate

    highway_params = Params(
        {"dropout_rate": dropout_rate, "input_size": char_emb_size}
    )

    # Sub-modules are created in the same order as before: embeddings, CNN, highway.
    self.init_char_embs = Embedding(
        num_embeddings=vocab_size,
        embedding_dim=char_emb_size,
        padding_idx=0,
    )
    self.cnn = CNN(params)
    self.highway_network = HighwayNetwork(highway_params)
def setup_output_dir(config: Params, loglevel: Optional[str] = None) -> str:
    """Set up the experiment folder for a new run.

    The output directory (``base_output_dir`` in ``config``, default ``./Outputs``)
    stores each run as ``run-<k>``. This creates the next run directory, numbered one
    above the highest existing run (``run-0`` when there is none), and optionally
    sets up the logger.

    A run directory has the following structure

    - run-<k>
        - models
            - vocabulary
        * config.jsonnet
        * githash.log of current run
        * gitdiff.log of current run
        * logfile.log (the log of the current run, only when ``loglevel`` is given)

    Arguments:
        config (``allennlp.common.Params``): The experiment parameters
        loglevel (str): The logger mode [INFO/DEBUG/ERROR]; no logger is set up
            when it is empty or ``None``

    Returns
        str: The path of the ``models`` directory of the new run
    """
    output_dir = config.get('base_output_dir', "./Outputs")
    make_directory(output_dir)

    last_run = -1
    for dirname in os.listdir(output_dir):
        if not dirname.startswith('run-'):
            continue
        try:
            run_index = int(dirname.split('-')[1])
        except ValueError:
            # Something like "run-backup": not a numbered run, so it must not abort
            # the set-up of a new one.
            continue
        last_run = max(last_run, run_index)

    new_dirname = os.path.join(output_dir, 'run-%d' % (last_run + 1))
    make_directory(new_dirname)
    best_model_dirname = os.path.join(new_dirname, 'models')
    make_directory(best_model_dirname)
    vocab_dirname = os.path.join(best_model_dirname, 'vocabulary')
    make_directory(vocab_dirname)

    config_file = os.path.join(new_dirname, 'config.jsonnet')
    write_config_to_file(config_file, config)

    # Save the git hash. No shell is involved, so the quotes of the format string
    # reach git literally and are stripped from its output afterwards.
    process = Popen(['git', 'log', '-1', '--format="%H"'], stdout=PIPE, stderr=PIPE)
    stdout, _ = process.communicate()
    git_hash = stdout.decode('ascii').strip('\n').strip('"')
    with open(os.path.join(new_dirname, "githash.log"), "w") as fp:
        fp.write(git_hash)

    # Save the git diff; bytes that are not ASCII are dropped rather than failing.
    process = Popen(['git', 'diff'], stdout=PIPE, stderr=PIPE)
    stdout, _ = process.communicate()
    git_diff = stdout.decode('ascii', errors="ignore")
    with open(os.path.join(new_dirname, "gitdiff.log"), "w") as fp:
        fp.write(git_diff)

    if loglevel:
        # Set up the logger
        logfile = os.path.join(new_dirname, 'logfile.log')
        setup_logger(logfile, loglevel)

    return best_model_dirname
def _sanitize_config(config: Params) -> None:
    """
    Drop ``model.evaluation_json_file`` from ``config`` when the file does not exist.

    Some elements of the model config refer to paths on the training machine that
    are unlikely to exist on the machine loading the archive, so they are removed to
    be extra safe. This is a temporary fix; once
    https://github.com/allenai/allennlp/issues/244 is implemented it should become
    unnecessary.
    """
    model_config = config.get("model", {})
    evaluation_json_file = model_config.get('evaluation_json_file', None)

    if not evaluation_json_file or os.path.exists(evaluation_json_file):
        return

    logger.warning(
        "specified evaluation_json_file %s does not exist, removing key",
        evaluation_json_file)
    model_config.pop('evaluation_json_file')
def from_params(cls, vocab: Vocabulary, params: Params) -> 'QAMultiChoiceMaxAttention':
    """
    Build the model from ``params``.

    Pops the embedder, dropout, encoder, aggregation and initializer settings and
    reads (without popping) ``att_question_to_choice`` to build the similarity
    function. When ``share_encoders`` is true the question encoder and its
    aggregation are reused for the choices.
    """
    text_field_embedder = TextFieldEmbedder.from_params(
        vocab, params.pop("text_field_embedder"))
    embeddings_dropout_value = params.pop("embeddings_dropout", 0.0)

    # Question encoder settings.
    question_encoder_config = params.pop("question_encoder", None)
    aggregate_question = params.pop("question_encoder_aggregate", "max")
    share_encoders = params.pop("share_encoders", False)

    question_encoder = None
    if question_encoder_config is not None:
        question_encoder = Seq2SeqEncoder.from_params(question_encoder_config)

    if share_encoders:
        choice_encoder, aggregate_choice = question_encoder, aggregate_question
    else:
        # Dedicated choice encoder.
        choice_encoder_config = params.pop("choice_encoder", None)
        aggregate_choice = params.pop("choice_encoder_aggregate", "max")
        choice_encoder = None
        if choice_encoder_config is not None:
            choice_encoder = Seq2SeqEncoder.from_params(choice_encoder_config)

    # Question-to-choice attention.
    att_question_to_choice = SimilarityFunction.from_params(
        params.get("att_question_to_choice"))

    initializer_config = params.pop('initializer', None)
    if initializer_config is None:
        initializer = InitializerApplicator()
    else:
        initializer = InitializerApplicator.from_params(initializer_config)

    return cls(vocab=vocab,
               text_field_embedder=text_field_embedder,
               question_encoder=question_encoder,
               choice_encoder=choice_encoder,
               initializer=initializer,
               aggregate_choice=aggregate_choice,
               aggregate_question=aggregate_question,
               embeddings_dropout_value=embeddings_dropout_value,
               att_question_to_choice=att_question_to_choice)
def from_params(cls, params: Params, **extras) -> "Tokenizer":  # type: ignore
    """
    Build a tokenizer, translating legacy ``"word"`` tokenizer configs first.

    A config counts as legacy when its ``type`` is ``"word"``, or when it has no
    ``type`` but a truthy ``word_splitter``. In that case the ``word_splitter``
    settings (``spacy`` when absent) become the new config, with ``start_tokens`` and
    ``end_tokens`` copied over when set. Any other config is passed through as is.

    Raises
    ------
    ConfigurationError
        If a legacy config sets ``word_filter`` or ``word_stemmer``.
    """
    tokenizer_type = params.get("type")
    splitter_params = params.get("word_splitter")

    is_legacy_word_config = tokenizer_type == "word" or (
        tokenizer_type is None and splitter_params)
    if not is_legacy_word_config:
        return super().from_params(params, **extras)

    # Backwards compatibility: the legacy "word" tokenizer carried the arguments of
    # the current tokenizers inside its "word_splitter" key.
    if not splitter_params:
        splitter_params = Params({"type": "spacy"})
    elif isinstance(splitter_params, str):
        splitter_params = Params({"type": splitter_params})

    if params.get("word_filter") or params.get("word_stemmer"):
        raise ConfigurationError(
            "Support for word_filter, word_stemmer is dropped in the current default tokenizer."
        )

    for boundary_key in ("start_tokens", "end_tokens"):
        boundary_tokens = params.get(boundary_key)
        if boundary_tokens:
            splitter_params[boundary_key] = boundary_tokens

    logger.warning(
        "Converting old WordTokenizer params - %s \n"
        "to new params %s.",
        str(params),
        str(splitter_params),
    )
    return super().from_params(splitter_params, **extras)
def from_params(
        cls, params: Params
) -> 'ArcMultiChoiceWithFactsTextJsonReaderMultiSource':
    """
    Build the reader from ``params``.

    ``external_knowledge`` (required) and ``lazy`` are popped; every other setting is
    only read. Leftover keys are not checked (the ``assert_empty`` call is disabled).
    """
    # Dict entries are evaluated top to bottom, i.e. in the order the keys are consumed.
    reader_kwargs = {
        # per-field tokenizers and the token indexers
        "field_tokenizers": tokenizer_dict_from_params(
            params.get('tokenizers', {})),
        "token_indexers": token_indexer_dict_from_params(
            params.get('token_indexers', {})),
        # external knowledge
        "external_know_config": params.pop('external_knowledge'),
        "choice_value_type": params.get('choice_value_type', None),
        "question_value_type": params.get('question_value_type', None),
        "no_relevant_fact_add": params.get('no_relevant_fact_add', False),
        "no_relevant_fact_text": params.get('no_relevant_fact_text',
                                            NO_RELEVANT_FACT_TEXT),
        "lazy": params.pop('lazy', False),
    }
    # params.assert_empty(cls.__name__)
    return ArcMultiChoiceWithFactsTextJsonReaderMultiSource(**reader_kwargs)
def setup_datasets(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Read the train, validation and test datasets named in ``params``.

    Nothing is popped from ``params``. Validation and test data are read with the
    ``validation_dataset_reader`` when one is configured, otherwise with the training
    reader. The result always has a ``"train"`` entry; ``"validation"`` and
    ``"test"`` appear only when their data path is set.
    """
    train_reader_params = params.get('dataset_reader')
    eval_reader_params = params.get('validation_dataset_reader', None)

    train_reader = DatasetReader.from_params(train_reader_params)
    eval_reader: DatasetReader = train_reader
    if eval_reader_params is not None:
        eval_reader = DatasetReader.from_params(eval_reader_params)

    datasets: Dict[str, Iterable[Instance]] = {
        "train": train_reader.read(params.get('train_data_path')),
    }

    for split_name, path_key in (("validation", 'validation_data_path'),
                                 ("test", "test_data_path")):
        data_path = params.get(path_key, None)
        if data_path is not None:
            datasets[split_name] = eval_reader.read(data_path)

    return datasets
def from_config(cls, config: Params, predictor_name: str = None) -> 'Predictor':
    """
    Build a predictor for the model described by ``config``.

    The tokenizer and token indexers are taken from the configured dataset reader
    (falling back to a ``WordTokenizer`` when the reader has no tokenizer), the model
    is loaded and put in eval mode, and the predictor class is looked up by
    ``predictor_name``, or by the model type in ``DEFAULT_PREDICTORS`` when no name
    is given.
    """
    reader = DatasetReader.from_params(config["dataset_reader"])

    # pylint: disable=protected-access
    tokenizer = reader._tokenizer or WordTokenizer()
    token_indexers = reader._token_indexers

    model_name = config.get("model").get("type")
    model = Model.load(config)
    model.eval()

    if not predictor_name:
        predictor_name = DEFAULT_PREDICTORS[model_name]

    predictor_class = Predictor.by_name(predictor_name)
    return predictor_class(model, tokenizer, token_indexers)
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Load all the datasets specified by the config.

    Every configured data path is checked up front. The reader settings and data
    paths are then popped from ``params``. Validation and test data use the
    ``validation_dataset_reader`` when one is configured, otherwise the training
    reader. ``"validation"`` and ``"test"`` are only present in the result when their
    path is set.
    """
    for path_key in ("train_data_path", "validation_data_path", "test_data_path"):
        configured_path = params.get(path_key, None)
        if configured_path is not None:
            check_for_data_path(configured_path, path_key)

    train_reader = DatasetReader.from_params(params.pop('dataset_reader'))

    eval_reader: DatasetReader = train_reader
    eval_reader_params = params.pop("validation_dataset_reader", None)
    if eval_reader_params is not None:
        logger.info(
            "Using a separate dataset reader to load validation and test data."
        )
        eval_reader = DatasetReader.from_params(eval_reader_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    datasets: Dict[str, Iterable[Instance]] = {
        "train": train_reader.read(train_data_path),
    }

    optional_splits = (
        ("validation", 'validation_data_path', "Reading validation data from %s"),
        ("test", "test_data_path", "Reading test data from %s"),
    )
    for split_name, path_key, log_message in optional_splits:
        data_path = params.pop(path_key, None)
        if data_path is None:
            continue
        logger.info(log_message, data_path)
        datasets[split_name] = eval_reader.read(data_path)

    return datasets
def from_params(  # type: ignore
        cls,
        params: Params,
        serialization_dir: str,
        recover: bool = False,
        cache_directory: str = None,
        cache_prefix: str = None,
):
    """
    Instantiate the trainer selected by ``params["trainer"]["type"]``.

    The ``type`` key is popped from the ``trainer`` sub-config (default ``"default"``).
    A non-default type is resolved through the registry and must provide its own
    ``from_params``; the default type is assembled from ``TrainerPieces``. The cache
    arguments are forwarded unchanged in both cases.
    """
    trainer_type = params.get("trainer", {}).pop("type", "default")

    if trainer_type != "default":
        klass = TrainerBase.by_name(trainer_type)
        # Explicit check to prevent recursion.
        overrides_from_params = (
            klass.from_params.__func__ != TrainerBase.from_params.__func__  # type: ignore
        )
        assert overrides_from_params, f"Class {klass.__name__} must override `from_params`."
        return klass.from_params(params, serialization_dir, recover,
                                 cache_directory, cache_prefix)

    # Special logic to keep old from_params behavior.
    from allennlp.training.trainer import Trainer
    from allennlp.training.trainer_pieces import TrainerPieces

    pieces = TrainerPieces.from_params(params, serialization_dir, recover,
                                       cache_directory, cache_prefix)
    trainer_kwargs = dict(
        model=pieces.model,
        serialization_dir=serialization_dir,
        iterator=pieces.iterator,
        train_data=pieces.train_dataset,
        validation_data=pieces.validation_dataset,
        params=pieces.params,
        validation_iterator=pieces.validation_iterator,
    )
    return Trainer.from_params(**trainer_kwargs)
def modified_model_load(config: Params,
                        serialization_dir: str,
                        weights_file: str = None,
                        cuda_device: int = -1) -> Model:
    """
    Instantiates an already-trained model, based on the experiment configuration
    and some optional overrides.

    The vocabulary is read from ``<serialization_dir>/vocabulary``, the model is
    built from the ``model`` section of ``config``, and the weights are loaded
    non-strictly from ``weights_file`` (default: ``_DEFAULT_WEIGHTS`` inside
    ``serialization_dir``). The model is finally moved to the requested device.
    """
    if not weights_file:
        weights_file = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab = Vocabulary.from_files(os.path.join(serialization_dir, 'vocabulary'))

    model_params = config.get('model')

    # The config describes how to *train* the model, including where pretrained
    # embeddings come from. We are *loading* now: the embeddings are already part of
    # the stored weights, so the pretrained-file entries are stripped to keep the
    # code from looking for them.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)

    state_dict = torch.load(weights_file,
                            map_location=util.device_mapping(cuda_device))
    model.load_state_dict(state_dict, strict=False)

    # Force the model onto cpu or gpu, as appropriate, so that the embeddings are in
    # sync with the weights.
    if cuda_device < 0:
        model.cpu()
    else:
        model.cuda(cuda_device)

    return model
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Load all the datasets specified by the config.

    Each configured data path is validated first; the reader settings and the data
    paths are then popped from ``params``. The training reader also reads validation
    and test data unless a ``validation_dataset_reader`` is configured. Only the
    ``"train"`` entry is guaranteed to be present in the result.
    """
    for data_name in ("train_data_path", "validation_data_path", "test_data_path"):
        data_path = params.get(data_name, None)
        if data_path is not None:
            check_for_data_path(data_path, data_name)

    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    separate_reader_params = params.pop("validation_dataset_reader", None)

    if separate_reader_params is None:
        validation_and_test_dataset_reader: DatasetReader = dataset_reader
    else:
        logger.info("Using a separate dataset reader to load validation and test data.")
        validation_and_test_dataset_reader = DatasetReader.from_params(separate_reader_params)

    datasets: Dict[str, Iterable[Instance]] = {}

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    datasets["train"] = dataset_reader.read(train_data_path)

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        datasets["validation"] = validation_and_test_dataset_reader.read(validation_data_path)

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        datasets["test"] = validation_and_test_dataset_reader.read(test_data_path)

    return datasets
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    After training, the run is archived with ``archive_model`` and the metrics are
    written to ``metrics.json`` inside ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.

    Raises
    ------
    ValueError
        If ``evaluate_on_test`` is set while a non-default trainer type is configured.
    KeyboardInterrupt
        Re-raised when training is interrupted; an archive is attempted first if a
        weights file already exists in ``serialization_dir``.
    """
    create_serialization_dir(params, serialization_dir, recover, force)
    stdout_handler = prepare_global_logging(serialization_dir, file_friendly_logging)
    prepare_environment(params)

    # Read from the underlying ``params.params`` mapping rather than through the
    # ``Params`` accessors.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    # The config is written out before any top-level key is popped below.
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    # Only peeked at here; the trainer construction below receives ``params`` itself.
    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(
            params,  # pylint: disable=no-member
            serialization_dir,
            recover,
            cache_directory,
            cache_prefix)
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator)
        # Test-set evaluation uses the validation iterator when there is one.
        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset
    else:
        if evaluate_on_test:
            raise ValueError(
                "--evaluate-on-test only works with the default Trainer. "
                "If you're using the CallbackTrainer you can use a callback "
                "to evaluate at Events.TRAINING_END; otherwise you'll have "
                "to run allennlp evaluate separately.")
        trainer = TrainerBase.from_params(params, serialization_dir, recover,
                                          cache_directory, cache_prefix)
        # ``evaluation_iterator`` is not defined on this path; that is fine only
        # because a ``None`` dataset skips both evaluation branches below.
        evaluation_dataset = None

    # Everything in the config must have been consumed by now.
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        # The interrupt always propagates, whether or not an archive was written.
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            trainer.model,
            evaluation_dataset,
            evaluation_iterator,
            cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
            # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
            batch_weight_key="")

        # Test metrics are merged into the training metrics under a "test_" prefix.
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif evaluation_dataset:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    cleanup_global_logging(stdout_handler)

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                 metrics,
                 log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
def _load(config: Params,
          adapters_dir: str,
          serialization_dir: str,
          weights_file: str = None,
          cuda_device: int = -1) -> 'Model':
    """
    Instantiates a model from the experiment configuration and loads adapter weights
    into its BERT encoder layers.

    The vocabulary is read from ``<serialization_dir>/vocabulary`` using the
    vocabulary class named in the config. The model is built from the ``model``
    section, and for every encoder layer ``i`` the files in ``adapters_dir`` are
    paired positionally with the layer's attention and output adapters; each adapter
    receives the ``attention_adapter_<i>`` / ``output_adapter_<i>`` entry of its file.

    NOTE(review): ``weights_file`` is accepted but not used. The base-model state
    dict is not loaded here (that call was commented out), only adapter weights are.

    NOTE(review): files are paired with adapters in ``os.listdir`` order, which is
    not guaranteed to be sorted; confirm that this is the intended pairing.

    Parameters
    ----------
    config : ``Params``
        The experiment configuration.
    adapters_dir : ``str``
        Directory containing the serialized adapter state files.
    serialization_dir : ``str``
        Directory containing the ``vocabulary`` folder.
    weights_file : ``str``, optional
        Unused, see the note above.
    cuda_device : ``int``, optional (default=-1)
        Device to move the model to; negative values select the CPU.

    Returns
    -------
    ``Model`` or ``None``
        The model with adapter weights loaded, or ``None`` when an ``AttributeError``
        is raised while loading the adapters (logged as "could not find the adapter
        model").
    """
    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, 'vocabulary')
    # If the config specifies a vocabulary subclass, we need to use it.
    vocab_params = config.get("vocabulary", Params({}))
    vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
    vocab = Vocabulary.by_name(vocab_choice).from_files(vocab_dir)

    model_params = config.get('model')

    # The experiment config tells us how to _train_ a model, including where to get
    # pre-trained embeddings from. We're now _loading_ the model, so we don't want the
    # code to look for any pretrained weight file and remove it from the parameters.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)

    # If vocab+embedding extension was done, the freshly built model and the stored
    # weights might not have the same embedding shapes, so the embedder is extended
    # first. If vocab and model embeddings are in sync, this is just a no-op.
    model.extend_embedder_vocab()

    # List the directory once, so that logging and every layer see the same files in
    # the same order.
    adapter_files = os.listdir(adapters_dir)
    for file in adapter_files:
        logger.info(f"{file} is loading..")

    # Each adapter file is deserialized at most once, on first use, instead of once
    # per encoder layer.
    adapter_states = {}

    encoder_layers = model.text_field_embedder.token_embedder_bert.bert_model.encoder.layer
    for i, layer in enumerate(encoder_layers):
        try:
            for file, attention_adapter, output_attention in zip(adapter_files,
                                                                 layer.attention.output.adapter,
                                                                 layer.output.adapter):
                if file not in adapter_states:
                    adapter_states[file] = torch.load(os.path.join(adapters_dir, file))
                adapter_state = adapter_states[file]
                attention_adapter.load_state_dict(adapter_state['attention_adapter_' + str(i)])
                output_attention.load_state_dict(adapter_state['output_adapter_' + str(i)])
        except AttributeError:
            logger.warning(f"Could not find the adapter model inside the archive {adapters_dir}")
            traceback.print_exc()
            # Best effort, as before: the failure is reported and no model is returned.
            return None

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
    include_package: List[str] = None,
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[str] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the ``Model`` object and in distributed
    training, nothing is returned.

    # Parameters

    process_rank : ``int``
        The process index that is initialized using the GPU device id.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    include_package : ``List[str]``, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    node_rank : ``int``, optional
        Rank of the node
    master_addr : ``str``, optional (default="127.0.0.1")
        Host part of the ``tcp://`` URL passed as ``init_method`` when the process group is
        initialized. Only used when ``world_size > 1``.
    master_port : ``int``, optional (default=29500)
        Port part of the ``tcp://`` URL passed as ``init_method`` when the process group is
        initialized. Only used when ``world_size > 1``.
    world_size : ``int``, optional
        The number of processes involved in distributed training. Training is treated as
        distributed when this is greater than 1.
    distributed_device_ids : ``List[str]``, optional
        Device ids for the workers on this node. Its length is taken as the number of processes
        per node, and the entry at index ``process_rank`` is the GPU used by this worker.
        Only read when ``world_size > 1``.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights. ``None`` is returned in distributed mode.

    # Raises

    ValueError
        If ``evaluate_on_test`` is set in ``params`` while a non-default trainer type is
        configured.
    """
    prepare_global_logging(serialization_dir, file_friendly_logging, rank=process_rank, world_size=world_size)
    prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    if distributed:
        # Since the worker is spawned and not forked, the extra imports
        # need to be done again.
        if include_package is not None:
            for package_name in include_package:
                import_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        torch.cuda.set_device(gpu_id)
        dist.init_process_group(
            backend="nccl",
            init_method=f"tcp://{master_addr}:{master_port}",
            world_size=world_size,
            rank=global_rank,
        )
        logging.info(f"Process group of world size {world_size} initialized "
                     f"for distributed training in worker {global_rank}")

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params, serialization_dir, recover, cache_directory, cache_prefix)
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
        )

        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset

    else:
        if evaluate_on_test:
            raise ValueError(
                "--evaluate-on-test only works with the default Trainer. "
                "If you're using the CallbackTrainer you can use a callback "
                "to evaluate at Events.TRAINING_END; otherwise you'll have "
                "to run allennlp evaluate separately.")

        trainer = TrainerBase.from_params(params, serialization_dir, recover, cache_directory, cache_prefix)
        # No test set is available on this path, so the evaluation step below is skipped
        # (``evaluation_iterator`` is intentionally left undefined here).
        evaluation_dataset = None

    # Every top-level key must have been consumed by now; leftovers indicate a config error.
    params.assert_empty("base train command")

    try:
        if distributed:
            # let the setup get ready for all the workers
            dist.barrier()

        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if master and os.path.exists(
                os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Only the rank-0 worker evaluates on the test set and writes the metrics file.
    if master:
        if evaluation_dataset and evaluate_on_test:
            logger.info(
                "The model will be evaluated using the best epoch weights.")
            test_metrics = evaluate(
                trainer.model,
                evaluation_dataset,
                evaluation_iterator,
                cuda_device=trainer.cuda_device,
                # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
                batch_weight_key="",
            )

            for key, value in test_metrics.items():
                metrics["test_" + key] = value
        elif evaluation_dataset:
            logger.info(
                "To evaluate on the test set after training, pass the "
                "'evaluate_on_test' flag, or use the 'allennlp evaluate' command."
            )
        dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    if not distributed:
        return trainer.model

    return None  # to make mypy happy
def from_params(cls, vocab: Vocabulary, params: Params) -> 'QAMultiChoiceKnowReader_v1':
    """
    Builds the reader model from ``params``.

    Pops the embedder, dropout, knowledge flags and encoder settings from
    ``params``, constructs the question / choice / knowledge encoders
    (sharing the question encoder when ``share_encoders`` is set), sets the
    ``input_dim`` of the knowledge-interaction aggregate feed-forward to the
    number of configured interactions, and passes the remaining ``params``
    on to the constructor.
    """

    def _optional_encoder(encoder_params):
        # An encoder is only built when its params section is present.
        if encoder_params is None:
            return None
        return Seq2SeqEncoder.from_params(encoder_params)

    text_field_embedder = TextFieldEmbedder.from_params(
        vocab, params.pop("text_field_embedder"))
    embeddings_dropout_value = params.pop("embeddings_dropout", 0.0)

    # Knowledge usage switches.
    use_knowledge = params.pop("use_knowledge", True)
    use_ctx2facts_retrieval_map_as_mask = params.pop(
        "use_ctx2facts_retrieval_map_as_mask", False)

    # Question encoder settings.
    question_encoder_params = params.pop("question_encoder", None)
    question_enc_aggregate = params.pop("question_encoder_aggregate", "max")
    share_encoders = params.pop("share_encoders", False)

    # Whether choice / fact encoding is conditioned on the question output states.
    choices_init_from_question_states = params.pop(
        "choices_init_from_question_states", False)
    facts_init_from_question_states = params.pop(
        "facts_init_from_question_states", False)

    question_encoder = _optional_encoder(question_encoder_params)

    # Defaults used when knowledge is disabled.
    knowledge_encoder = None
    knowledge_enc_aggregate = "max"

    if share_encoders:
        # Reuse the question encoder (and its aggregation) everywhere.
        choice_encoder = question_encoder
        choice_enc_aggregate = question_enc_aggregate
        if use_knowledge:
            knowledge_encoder = question_encoder
            knowledge_enc_aggregate = question_enc_aggregate
    else:
        # Dedicated choice encoder.
        choice_encoder_params = params.pop("choice_encoder", None)
        choice_enc_aggregate = params.pop("choice_encoder_aggregate", "max")
        choice_encoder = _optional_encoder(choice_encoder_params)

        if use_knowledge:
            # Dedicated knowledge encoder.
            knowledge_encoder_params = params.pop("knowledge_encoder", None)
            knowledge_enc_aggregate = params.pop(
                "knowledge_encoder_aggregate", "max")
            knowledge_encoder = _optional_encoder(knowledge_encoder_params)

    know_interactions_params = params.get("know_interactions")
    know_interactions_aggregate_ffw_params = know_interactions_params.get(
        'aggregate_feedforward')

    # The aggregate feed-forward input size is the number of interactions.
    interaction_count = len(know_interactions_params.get("interactions", []))
    update_params(know_interactions_aggregate_ffw_params,
                  {"input_dim": interaction_count})
    know_aggregate_feedforward = FeedForward.from_params(
        params.get("know_interactions").get('aggregate_feedforward'))

    init_params = params.pop('initializer', None)
    if init_params is not None:
        initializer = InitializerApplicator.from_params(init_params)
    else:
        initializer = InitializerApplicator()

    return cls(
        vocab=vocab,
        text_field_embedder=text_field_embedder,
        question_encoder=question_encoder,
        choice_encoder=choice_encoder,
        use_knowledge=use_knowledge,
        facts_encoder=knowledge_encoder,
        know_aggregate_feedforward=know_aggregate_feedforward,
        initializer=initializer,
        aggregate_choice=choice_enc_aggregate,
        aggregate_question=question_enc_aggregate,
        aggregate_facts=knowledge_enc_aggregate,
        embeddings_dropout_value=embeddings_dropout_value,
        share_encoders=share_encoders,
        choices_init_from_question_states=choices_init_from_question_states,
        facts_init_from_question_states=facts_init_from_question_states,
        use_ctx2facts_retrieval_map_as_mask=use_ctx2facts_retrieval_map_as_mask,
        params=params)
def find_learning_rate_model(params: Params,
                             serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs a learning rate search over ``num_batches`` mini-batches and saves a
    plot of the smoothed losses to ``lr-losses.png`` in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr : ``float``
        Learning rate to start the search.
    end_lr : ``float``
        Learning rate up to which the search is done.
    num_batches : ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps : ``bool``
        Passed through to ``search_learning_rate``: increase the learning
        rate linearly; if False, exponentially.
    stopping_factor : ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        multiple of stopping factor. If ``None`` search proceeds till the ``end_lr``
    force : ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.

    Raises
    ------
    ConfigurationError
        If ``serialization_dir`` exists and is non-empty (and ``force`` is
        not set), or if ``datasets_for_vocab_creation`` names an unknown
        dataset.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    # Refuse to write into a directory that already holds results.
    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(
            f'Serialization directory {serialization_dir} already exists and is '
            f'not empty.')
    os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset_name in datasets_for_vocab_creation:
        if dataset_name not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset_name}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))

    vocab_params = params.pop("vocabulary", {})
    # Lazy on purpose: instances are only pulled if the vocabulary needs them.
    vocab_instances = (instance
                       for key, dataset in all_datasets.items()
                       for instance in dataset
                       if key in datasets_for_vocab_creation)
    vocab = Vocabulary.from_params(vocab_params, vocab_instances)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")

    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        for regex in no_grad_regexes:
            if re.search(regex, name):
                parameter.requires_grad_(False)
                break

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(
        f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.'
    )
    learning_rates, losses = search_learning_rate(
        trainer,
        start_lr=start_lr,
        end_lr=end_lr,
        num_batches=num_batches,
        linear_steps=linear_steps,
        stopping_factor=stopping_factor)
    logger.info('Finished learning rate search.')

    smoothed_losses = _smooth(losses, 0.98)
    plot_path = os.path.join(serialization_dir, 'lr-losses.png')
    _save_plot(learning_rates, smoothed_losses, plot_path)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'ThriftyEmbedding':  # type: ignore
    """
    Copies the ``weights_file`` entry of ``params`` into ``pretrained_file``
    and then delegates construction to the parent class's ``from_params``.
    """
    # ``weights_file`` is read with ``get`` (not popped), so it stays in ``params``.
    params.params['pretrained_file'] = params.get('weights_file')
    return super(ThriftyEmbedding, cls).from_params(vocab, params)
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model described by ``params``, archives the results in
    ``serialization_dir`` and returns the model with the best epoch weights
    loaded.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    # Keep a copy of the experiment config next to the results.
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset_name in datasets_for_vocab_creation:
        if dataset_name not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset_name}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    vocab_params = params.pop("vocabulary", {})
    vocab_instances = (instance
                       for key, dataset in all_datasets.items()
                       for instance in dataset
                       if key in datasets_for_vocab_creation)
    vocab = Vocabulary.from_params(vocab_params, vocab_instances)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Saved only after model construction, which may have expanded the vocabulary.
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    # A separate validation iterator is optional.
    validation_iterator = None
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")

    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        for regex in no_grad_regexes:
            if re.search(regex, name):
                parameter.requires_grad_(False)
                break

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    parameter_report = (
        ("Following parameters are Frozen  (without gradient):", frozen_parameter_names),
        ("Following parameters are Tunable (with gradient):", tunable_parameter_names),
    )
    for heading, parameter_names in parameter_report:
        logger.info(heading)
        for name in parameter_names:
            logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer_class = Trainer.by_name(trainer_choice)
    trainer = trainer_class.from_params(model=model,
                                        serialization_dir=serialization_dir,
                                        iterator=iterator,
                                        train_data=train_data,
                                        validation_data=validation_data,
                                        params=trainer_params,
                                        validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # If at least one epoch finished, salvage an archive before re-raising.
        weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)
        if os.path.exists(weights_path):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Package the results.
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state = torch.load(os.path.join(serialization_dir, 'best.th'))
    # The trained model object is reused and overwritten with the best weights.
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data:
        if evaluate_on_test:
            logger.info("The model will be evaluated using the best epoch weights.")
            test_metrics = evaluate(
                best_model, test_data, validation_iterator or iterator,
                cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
            )
            for key, value in test_metrics.items():
                metrics["test_" + key] = value
        else:
            logger.info("To evaluate on the test set after training, pass the "
                        "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    batch_weight_key: str = "",
) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    If ``params`` has no ``distributed`` section a single worker is run in this process;
    otherwise one worker per configured cuda device is spawned.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see ``Model.from_archive``.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    node_rank : ``int``, optional
        Rank of the current node in distributed training
    include_package : ``List[str]``, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.

    # Raises

    ConfigurationError
        If a ``distributed`` section is present but configures neither multiple cuda devices
        nor multiple nodes, or if its ``cuda_devices`` entry is missing or empty.
    """
    training_util.create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            file_friendly_logging=file_friendly_logging,
            include_package=include_package,
            batch_weight_key=batch_weight_key,
        )
        archive_model(serialization_dir)
        return model

    # Otherwise, we are running multiple processes for training.

    # We are careful here so that we can raise a good error if someone
    # passed the wrong thing - cuda_devices are required.
    device_ids = distributed_params.pop("cuda_devices", None)
    multi_device = isinstance(device_ids, list) and len(device_ids) > 1
    num_nodes = distributed_params.pop("num_nodes", 1)

    if not (multi_device or num_nodes > 1):
        raise ConfigurationError(
            "Multiple cuda devices/nodes need to be configured to run distributed training."
        )
    # With several nodes the check above passes even when ``cuda_devices`` is
    # absent, which would otherwise surface as a TypeError from ``len`` below.
    if not isinstance(device_ids, (list, tuple)) or len(device_ids) == 0:
        raise ConfigurationError(
            "'cuda_devices' must be a non-empty list of device ids to run distributed training."
        )
    check_for_gpu(device_ids)

    master_addr = distributed_params.pop("master_address", "127.0.0.1")
    master_port = distributed_params.pop("master_port", 29500)
    num_procs = len(device_ids)
    world_size = num_nodes * num_procs

    logging.info(
        f"Switching to distributed training mode since multiple GPUs are configured | "
        f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
        f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
        f"World size: {world_size}")

    # Creating `Vocabulary` objects from workers could be problematic since
    # the data iterators in each worker will yield only `rank` specific
    # instances. Hence it is safe to construct the vocabulary and write it
    # to disk before initializing the distributed context. The workers will
    # load the vocabulary from the path specified.
    if params.get("vocabulary", Params({})).get("type", "") != "from_files":
        vocab = training_util.make_vocab_from_params(
            params.duplicate(), serialization_dir)
        params["vocabulary"] = {
            "type": "from_files",
            "directory": os.path.join(serialization_dir, "vocabulary"),
            "padding_token": vocab._padding_token,
            "oov_token": vocab._oov_token,
        }

    # NOTE(review): ``mp.spawn`` prepends the process index, so these positional
    # args must line up with the worker's parameters after ``process_rank`` --
    # confirm against the ``_train_worker`` definition in use.
    mp.spawn(
        _train_worker,
        args=(
            params.duplicate(),
            serialization_dir,
            file_friendly_logging,
            include_package,
            batch_weight_key,
            node_rank,
            master_addr,
            master_port,
            world_size,
            device_ids,
        ),
        nprocs=num_procs,
    )
    archive_model(serialization_dir)
    model = Model.load(params, serialization_dir)
    return model
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False,
                debate_mode: List[str] = ('f'),
                judge_filename: str = None,
                update_judge: bool = False,
                eval_mode: bool = False,
                reward_method: str = None,
                detach_value_head: bool = False,
                breakpoint_level: int = 0,
                search_outputs_path: str = None,
                accumulation_steps: int = 1,
                multi_gpu: bool = False,
                choice_mode: str = None,
                qa_loss_weight: float = 0.,
                influence_reward: bool = False,
                theory_of_mind: bool = False,
                num_pred_rounds: int = -1,
                x_order_prob: float = 0.,
                require_action: bool = False,
                single_shot: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    debate_mode : ``List[str]``
        List of debate turns (e.g. aa, ar, rr, Ar) => capitalization implies search agent
        NOTE(review): the default ``('f')`` is the plain string ``'f'``, not a one-element
        tuple; that string is what gets written to ``params['dataset_reader']['debate_mode']``
        when the default is used.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    judge_filename : ``str``, optional (default=None)
        Path to judge config or pre-trained judge model. If config, judge trained during debate.
        Necessary parameter if running in debate mode.
    update_judge : ``bool``, optional (default=False)
        Boolean whether or not to update Judge model during debate training.
    eval_mode : ``bool``, optional (default=False)
        Boolean whether or not to run in eval-only mode, on test data. Does not update/train any of the models.
        In this mode no model archive is created and metrics are written to
        ``metrics.eval.d=<debate_mode>.json`` instead of ``metrics.json``.
    reward_method : ``str``, optional (default=None)
        Choice of reward function (RL) or loss function (Supervised Learning) for training debate agents
    detach_value_head : ``bool``, optional (default=False)
        Boolean whether or not to detach value function gradient updates from the policy network.
        This prevents value function gradients from affecting policy network parameters.
    breakpoint_level : ``int`` optional (default=0)
        Debugging option to set breakpoint sensitivity (0 - no breakpoints).
    search_outputs_path : ``str`` optional (default=None)
        Path to file with search predictions for each agent - necessary for supervised training
    accumulation_steps : ``int`` (default=1)
        Number of gradient steps to accumulate over before performing an update. Poor-man's batching for instances where
        number of examples per batch is small (limited GPU memory)
    multi_gpu : ``bool`` (default=False)
        Boolean whether or not to run models/training in model parallel mode. Requires specifying GPU allocations for
        trainer, judge, and debaters in the training config file (see training_config/bidaf.race.size=0.5.gpu=2.jsonnet
        for example usage). The ``gpu_allocations`` config entry must have exactly three entries
        (``debate``, ``judge``, ``trainer``), each contained in the trainer's ``cuda_device``.
    choice_mode : ``str``, optional (default=None)
        Forwarded unchanged to ``Trainer.from_params``.
    qa_loss_weight : ``float``, optional (default=0.)
        Forwarded to ``TrainerPieces.from_params``. A warning is issued if it is positive while no
        debate turn contains ``'l'`` or ``'w'``.
    influence_reward : ``bool``, optional (default=False)
        Forwarded unchanged to ``TrainerPieces.from_params``.
    theory_of_mind : ``bool``, optional (default=False)
        Forwarded unchanged to ``TrainerPieces.from_params``.
    num_pred_rounds : ``int``, optional (default=-1)
        Forwarded unchanged to ``Trainer.from_params``. Must be left at ``-1`` when
        ``single_shot`` is set.
    x_order_prob : ``float``, optional (default=0.)
        Forwarded unchanged to ``Trainer.from_params``.
    require_action : ``bool``, optional (default=False)
        Forwarded unchanged to ``Trainer.from_params``.
    single_shot : ``bool``, optional (default=False)
        Forwarded to ``Trainer.from_params``. Only accepted together with ``eval_mode``.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    # NOTE: argument validation below uses ``assert`` and is therefore skipped under ``python -O``.
    assert (
        not single_shot
    ) or eval_mode, 'Using single shot prediction outside eval_mode not yet supported.'
    assert (not single_shot) or (num_pred_rounds == -1), \
        'Using single shot prediction for a specific number of rounds is not yet supported.'

    # Count the debate turns containing 'l' or 'w'; qa_loss_weight is reported as unused
    # when there are none.
    num_no_qa_turns = sum([(('l' in debate_turn) or ('w' in debate_turn))
                           for debate_turn in debate_mode])
    if (qa_loss_weight > 0) and (num_no_qa_turns == 0):
        warnings.warn(
            'Unused argument qa_loss_weight in debate mode ' +
            str(debate_mode) +
            '. If this was unintentional, please remove the -q flag.',
            UserWarning)
    # True when none of the characters below occur in any debate turn.
    not_using_trained_debater = len(
        set('ablwⅰⅱⅲⅳ').intersection(''.join(debate_mode))) == 0
    if (judge_filename is not None) and not_using_trained_debater:
        warnings.warn(
            'Unnecessary to have debaters in debate mode ' +
            str(debate_mode) +
            '. If this was unintentional, please remove the -j flag.',
            UserWarning)

    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Check that all Desired CUDA Devices exist => trainer => cuda_devices should contain list of required devices
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    # Build Allocation Dictionary (to be passed to all future functions)
    if multi_gpu:
        gpu_allocations, allocation_dict = params.params.pop(
            'gpu_allocations', {}), {}
        assert len(gpu_allocations
                   ) == 3, 'Must set gpu_allocations in config if multi-gpu'
        for k in ['debate', 'judge', 'trainer']:
            # NOTE(review): ``in`` requires ``cuda_device`` to be a container here; with the
            # ``-1`` fallback above this would raise TypeError -- confirm configs always list devices.
            assert gpu_allocations[
                k] in cuda_device, "Desired GPU not available... current: %s" % str(
                    cuda_device)
            allocation_dict[k] = gpu_allocations[k]
    else:
        allocation_dict = {}

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        params['dataset_reader'][
            'debate_mode'] = debate_mode  # If debate_mode requires sample duplicates
        pieces = TrainerPieces.from_params(params,
                                           serialization_dir,
                                           cuda_device,
                                           recover,
                                           judge_filename=judge_filename,
                                           update_judge=update_judge,
                                           eval_mode=eval_mode,
                                           reward_method=reward_method,
                                           detach_value_head=detach_value_head,
                                           allocation_dict=allocation_dict,
                                           qa_loss_weight=qa_loss_weight,
                                           influence_reward=influence_reward,
                                           theory_of_mind=theory_of_mind)  # pylint: disable=no-member
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            debate_mode=debate_mode,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
            eval_mode=eval_mode,
            breakpoint_level=breakpoint_level,
            search_outputs_path=search_outputs_path,
            accumulation_steps=accumulation_steps,
            allocation_dict=allocation_dict,
            choice_mode=choice_mode,
            num_pred_rounds=num_pred_rounds,
            x_order_prob=x_order_prob,
            require_action=require_action,
            single_shot=single_shot)
        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset
    else:
        # Non-default trainers are only allowed with the single-turn 'f' debate mode.
        assert (len(debate_mode)
                == 1) and (debate_mode[0]
                           == 'f'), 'TrainerBase untested for debate training.'
        trainer = TrainerBase.from_params(params, serialization_dir, recover)
        evaluation_iterator = evaluation_dataset = None

    # Every top-level key must have been consumed by now.
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir,
                                       _DEFAULT_WEIGHTS)) and not eval_mode:
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            trainer.model,
            evaluation_dataset,
            evaluation_iterator,
            cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
            batch_weight_key="")
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif evaluation_dataset:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    # Now tar up results
    if not eval_mode:
        archive_model(serialization_dir,
                      files_to_archive=params.files_to_archive)
        dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                     metrics,
                     log=True)
    else:
        # Eval-only runs write to a file named after the debate mode and create no archive.
        dump_metrics(os.path.join(
            serialization_dir,
            "metrics.eval.d=" + '-'.join(debate_mode) + ".json"),
                     metrics,
                     log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model