def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Read the train / validation / test datasets named in an experiment config.

    The reader built from ``dataset_reader`` always reads the training data.
    When a ``validation_dataset_reader`` section is present, a second reader is
    built from it and used for the validation and test data; otherwise the
    training reader is reused for them.

    Parameters
    ----------
    params : ``Params``
        Experiment configuration.  The keys ``dataset_reader``,
        ``validation_dataset_reader``, ``train_data_path``,
        ``validation_data_path`` and ``test_data_path`` are popped from it.

    Returns
    -------
    A dictionary that always contains ``"train"``, and contains ``"validation"``
    and / or ``"test"`` only when the corresponding path was configured.
    """
    train_reader = DatasetReader.from_params(params.pop('dataset_reader'))

    held_out_reader_params = params.pop("validation_dataset_reader", None)
    if held_out_reader_params is None:
        held_out_reader: DatasetReader = train_reader
    else:
        logger.info("Using a separate dataset reader to load validation and test data.")
        held_out_reader = DatasetReader.from_params(held_out_reader_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    datasets: Dict[str, Iterable[Instance]] = {"train": train_reader.read(train_data_path)}

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        datasets["validation"] = held_out_reader.read(validation_data_path)

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        datasets["test"] = held_out_reader.read(test_data_path)

    return datasets
def create_serialization_dir(params: Params, serialization_dir: str, recover: bool) -> None:
    """
    This function creates the serialization directory if it doesn't exist.  If it already exists,
    then it verifies that we're recovering from a training with an identical configuration.

    Parameters
    ----------
    params: ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: ``str``
        The directory in which to save results and logs.
    recover: ``bool``
        If ``True``, we will try to recover from an existing serialization directory, and crash if
        the directory doesn't exist, or doesn't match the configuration we're given.

    Raises
    ------
    ConfigurationError
        If the directory exists and ``recover`` is ``False``; if it exists but holds no saved
        config file; if the saved config differs from ``params``; or if ``recover`` is ``True``
        and the directory does not exist.
    """
    if os.path.exists(serialization_dir):
        if serialization_dir == '/output':
            # Special-casing the beaker output directory, which will already exist when training
            # starts.
            return

        if not recover:
            raise ConfigurationError(f"Serialization directory ({serialization_dir}) already exists. "
                                     f"Specify --recover to recover training from existing output.")

        logger.info(f"Recovering from prior training at {serialization_dir}.")

        recovered_config_file = os.path.join(serialization_dir, CONFIG_NAME)
        if not os.path.exists(recovered_config_file):
            raise ConfigurationError("The serialization directory already exists but doesn't "
                                     "contain a config.json. You probably gave the wrong directory.")
        else:
            loaded_params = Params.from_file(recovered_config_file)

            # Check whether any of the training configuration differs from the configuration we are
            # resuming.  Every difference is logged before we fail, so the user sees all of them.
            fail = False
            flat_params = params.as_flat_dict()
            flat_loaded = loaded_params.as_flat_dict()
            for key in flat_params.keys() - flat_loaded.keys():
                logger.error(f"Key '{key}' found in training configuration but not in the serialization "
                             f"directory we're recovering from.")
                fail = True
            for key in flat_loaded.keys() - flat_params.keys():
                logger.error(f"Key '{key}' found in the serialization directory we're recovering from "
                             f"but not in the training config.")
                fail = True
            for key in flat_params.keys():
                # Keys missing from the saved config were already reported above.  Comparing them
                # here would index ``flat_loaded[key]`` in the message and raise ``KeyError``
                # instead of the intended ``ConfigurationError``.
                if key not in flat_loaded:
                    continue
                if flat_params[key] != flat_loaded[key]:
                    logger.error(f"Value for '{key}' in training configuration does not match that the value in "
                                 f"the serialization directory we're recovering from: "
                                 f"{flat_params[key]} != {flat_loaded[key]}")
                    fail = True
            if fail:
                raise ConfigurationError("Training configuration does not match the configuration we're "
                                         "recovering from.")
    else:
        if recover:
            raise ConfigurationError(f"--recover specified but serialization_dir ({serialization_dir}) "
                                     "does not exist.  There is nothing to recover from.")
        os.makedirs(serialization_dir)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'DecomposableAttention':
    """
    Build a ``DecomposableAttention`` model from its configuration.

    ``text_field_embedder``, ``attend_feedforward``, ``similarity_function``,
    ``compare_feedforward`` and ``aggregate_feedforward`` are required keys.
    ``premise_encoder`` and ``hypothesis_encoder`` are optional and are passed
    on as ``None`` when absent.  ``initializer`` and ``regularizer`` default to
    an empty list.  Any key left over afterwards is reported by
    ``params.assert_empty``.
    """
    text_field_embedder = TextFieldEmbedder.from_params(vocab, params.pop("text_field_embedder"))

    premise_config = params.pop("premise_encoder", None)
    premise_encoder = (None if premise_config is None
                       else Seq2SeqEncoder.from_params(premise_config))

    hypothesis_config = params.pop("hypothesis_encoder", None)
    hypothesis_encoder = (None if hypothesis_config is None
                          else Seq2SeqEncoder.from_params(hypothesis_config))

    attend_feedforward = FeedForward.from_params(params.pop('attend_feedforward'))
    similarity_function = SimilarityFunction.from_params(params.pop("similarity_function"))
    compare_feedforward = FeedForward.from_params(params.pop('compare_feedforward'))
    aggregate_feedforward = FeedForward.from_params(params.pop('aggregate_feedforward'))

    initializer = InitializerApplicator.from_params(params.pop('initializer', []))
    regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))

    params.assert_empty(cls.__name__)
    return cls(vocab=vocab,
               text_field_embedder=text_field_embedder,
               attend_feedforward=attend_feedforward,
               similarity_function=similarity_function,
               compare_feedforward=compare_feedforward,
               aggregate_feedforward=aggregate_feedforward,
               premise_encoder=premise_encoder,
               hypothesis_encoder=hypothesis_encoder,
               initializer=initializer,
               regularizer=regularizer)
def from_params(self, params: Params) -> PytorchSeq2SeqWrapper:
    """
    Instantiate the wrapped module class and wrap it in ``PytorchSeq2SeqWrapper``.

    ``batch_first`` is popped from ``params`` (default ``True``); a false value
    raises ``ConfigurationError``.  When the module class is one of
    ``self.PYTORCH_MODELS``, ``batch_first=True`` is put back so that it is
    forwarded to the module constructor together with the remaining keys.
    """
    batch_first = params.pop_bool('batch_first', True)
    if not batch_first:
        raise ConfigurationError("Our encoder semantics assumes batch is always first!")

    if self._module_class in self.PYTORCH_MODELS:
        params['batch_first'] = True

    constructor_kwargs = params.as_dict()
    wrapped_module = self._module_class(**constructor_kwargs)
    return PytorchSeq2SeqWrapper(wrapped_module)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'BasicTextFieldEmbedder':
    """
    Build a ``BasicTextFieldEmbedder`` with one ``TokenEmbedder`` per top-level key.

    Every key of ``params`` is popped and its value handed to
    ``TokenEmbedder.from_params``; the resulting embedders are stored under the
    same key.
    """
    # Snapshot the keys first: popping mutates ``params`` while we iterate.
    embedder_names = list(params.keys())
    token_embedders = {
        name: TokenEmbedder.from_params(vocab, params.pop(name))
        for name in embedder_names
    }
    params.assert_empty(cls.__name__)
    return cls(token_embedders)
def from_params(cls, params: Params) -> 'SnliReader':
    """
    Build an ``SnliReader`` from its configuration.

    Optional keys: ``tokenizer`` and ``token_indexers`` (both default to an
    empty dict) and ``lazy`` (default ``False``).  Any other key is reported by
    ``params.assert_empty``.
    """
    tokenizer_config = params.pop('tokenizer', {})
    tokenizer = Tokenizer.from_params(tokenizer_config)

    indexer_config = params.pop('token_indexers', {})
    token_indexers = TokenIndexer.dict_from_params(indexer_config)

    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)

    return SnliReader(tokenizer=tokenizer,
                      token_indexers=token_indexers,
                      lazy=lazy)
def from_params(cls, params: Params) -> 'BasicIterator':
    """
    Build a ``BasicIterator`` from its configuration.

    Optional integer keys: ``batch_size`` (default 32), ``instances_per_epoch``
    and ``max_instances_in_memory`` (both default ``None``).  Any other key is
    reported by ``params.assert_empty``.
    """
    # A dict literal evaluates its values in order, so the keys are still
    # popped in the sequence written here.
    iterator_kwargs = {
        'batch_size': params.pop_int('batch_size', 32),
        'instances_per_epoch': params.pop_int('instances_per_epoch', None),
        'max_instances_in_memory': params.pop_int('max_instances_in_memory', None),
    }
    params.assert_empty(cls.__name__)
    return cls(**iterator_kwargs)
def from_params(cls, params: Params) -> 'PennTreeBankConstituencySpanDatasetReader':
    """
    Build a ``PennTreeBankConstituencySpanDatasetReader`` from its configuration.

    Optional keys: ``token_indexers`` (default empty dict), ``use_pos_tags``
    (default ``True``) and ``lazy`` (default ``False``).  Any other key is
    reported by ``params.assert_empty``.
    """
    indexer_config = params.pop('token_indexers', {})
    token_indexers = TokenIndexer.dict_from_params(indexer_config)

    use_pos_tags = params.pop('use_pos_tags', True)
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)

    return PennTreeBankConstituencySpanDatasetReader(token_indexers=token_indexers,
                                                     use_pos_tags=use_pos_tags,
                                                     lazy=lazy)
def from_params(cls, params: Params) -> 'B':
    """
    Build a ``B`` from ``params``.

    ``filename`` is registered with ``params.add_file_to_archive`` before it is
    popped; the nested ``c`` section is used to build a ``C``.
    """
    params.add_file_to_archive("filename")
    filename = params.pop("filename")

    nested_c = C.from_params(params.pop("c"))

    return cls(filename, nested_c)
def from_params(cls, params: Params) -> 'SrlReader':
    """
    Build an ``SrlReader`` from its configuration.

    Optional keys: ``token_indexers`` (default empty dict),
    ``domain_identifier`` (default ``None``) and ``lazy`` (default ``False``).
    Any other key is reported by ``params.assert_empty``.
    """
    indexer_config = params.pop('token_indexers', {})
    token_indexers = TokenIndexer.dict_from_params(indexer_config)

    domain_identifier = params.pop("domain_identifier", None)
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)

    return SrlReader(token_indexers=token_indexers,
                     domain_identifier=domain_identifier,
                     lazy=lazy)
def from_params(cls, params: Params) -> 'LinearSimilarity':
    """
    Build a ``LinearSimilarity`` from its configuration.

    Required integer keys: ``tensor_1_dim`` and ``tensor_2_dim``.  Optional
    keys: ``combination`` (default ``"x,y"``) and ``activation`` (default
    ``"linear"``).  The activation is looked up by name and instantiated with
    no arguments.
    """
    tensor_1_dim = params.pop_int("tensor_1_dim")
    tensor_2_dim = params.pop_int("tensor_2_dim")
    combination = params.pop("combination", "x,y")

    activation_name = params.pop("activation", "linear")
    activation_class = Activation.by_name(activation_name)
    activation = activation_class()

    params.assert_empty(cls.__name__)
    return cls(tensor_1_dim=tensor_1_dim,
               tensor_2_dim=tensor_2_dim,
               combination=combination,
               activation=activation)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'SpanConstituencyParser':
    """
    Build a ``SpanConstituencyParser`` from its configuration.

    ``text_field_embedder``, ``span_extractor`` and ``encoder`` are required
    keys.  ``feedforward`` and ``pos_tag_embedding`` are optional and are passed
    on as ``None`` when absent.  ``initializer`` and ``regularizer`` default to
    an empty list, and ``evalb_directory_path`` defaults to ``None``.
    """
    text_field_embedder = TextFieldEmbedder.from_params(vocab, params.pop("text_field_embedder"))
    span_extractor = SpanExtractor.from_params(params.pop("span_extractor"))
    encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))

    feedforward_config = params.pop("feedforward", None)
    feedforward_layer = (None if feedforward_config is None
                         else FeedForward.from_params(feedforward_config))

    pos_tag_config = params.pop("pos_tag_embedding", None)
    pos_tag_embedding = (None if pos_tag_config is None
                         else Embedding.from_params(vocab, pos_tag_config))

    initializer = InitializerApplicator.from_params(params.pop('initializer', []))
    regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))
    evalb_directory_path = params.pop("evalb_directory_path", None)

    params.assert_empty(cls.__name__)
    return cls(vocab=vocab,
               text_field_embedder=text_field_embedder,
               span_extractor=span_extractor,
               encoder=encoder,
               feedforward_layer=feedforward_layer,
               pos_tag_embedding=pos_tag_embedding,
               initializer=initializer,
               regularizer=regularizer,
               evalb_directory_path=evalb_directory_path)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'TokenCharactersEncoder':  # type: ignore
    """
    Build a ``TokenCharactersEncoder`` from its configuration.

    Required keys: ``embedding`` and ``encoder``.  Optional key: ``dropout``
    (float, default 0.0).
    """
    # pylint: disable=arguments-differ
    embedding_config: Params = params.pop("embedding")
    # ``Embedding.from_params`` falls back to the "tokens" namespace; characters
    # need "token_characters" unless the config says otherwise.
    embedding_config.setdefault("vocab_namespace", "token_characters")
    embedding = Embedding.from_params(vocab, embedding_config)

    encoder_config: Params = params.pop("encoder")
    encoder = Seq2VecEncoder.from_params(encoder_config)

    dropout = params.pop_float("dropout", 0.0)
    params.assert_empty(cls.__name__)
    return cls(embedding, encoder, dropout)
def test_mismatched_dimensions_raise_configuration_errors(self):
    """Corrupting a single layer dimension must make model construction fail."""
    corrupted_dimensions = [
        # The input_dim to the first feedforward layer - it should be 2.
        ("attend_feedforward", "input_dim"),
        # The projection output_dim of the last layer - it should be 3, equal
        # to the number of classes.
        ("aggregate_feedforward", "output_dim"),
    ]
    for component, dimension in corrupted_dimensions:
        # Reload the config each time so only one dimension is wrong per case.
        params = Params.from_file(self.param_file)
        params["model"][component][dimension] = 10
        with pytest.raises(ConfigurationError):
            Model.from_params(vocab=self.vocab, params=params.pop("model"))
def from_params(cls, vocab: Vocabulary, params: Params) -> 'SimpleTagger':
    """
    Build a ``SimpleTagger`` from its configuration.

    Required keys: ``text_field_embedder`` and ``encoder``.  ``initializer`` and
    ``regularizer`` are optional and default to an empty list.
    """
    text_field_embedder = TextFieldEmbedder.from_params(vocab, params.pop("text_field_embedder"))
    encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))

    initializer_config = params.pop('initializer', [])
    initializer = InitializerApplicator.from_params(initializer_config)
    regularizer_config = params.pop('regularizer', [])
    regularizer = RegularizerApplicator.from_params(regularizer_config)

    params.assert_empty(cls.__name__)
    return cls(vocab=vocab,
               text_field_embedder=text_field_embedder,
               encoder=encoder,
               initializer=initializer,
               regularizer=regularizer)
def extend_from_instances(self,
                          params: Params,
                          instances: Iterable['adi.Instance'] = ()) -> None:
    """
    Extends an already generated vocabulary using a collection of instances.

    Token counts are gathered by calling ``count_vocab_items`` on every
    instance, then passed to ``self._extend`` together with the options popped
    from ``params``: ``min_count``, the max vocab size, ``non_padded_namespaces``,
    ``pretrained_files``, ``min_pretrained_embeddings``,
    ``only_include_pretrained_words`` and ``tokens_to_add``.
    """
    extend_options = {
        "min_count": params.pop("min_count", None),
        "max_vocab_size": pop_max_vocab_size(params),
        "non_padded_namespaces": params.pop("non_padded_namespaces",
                                            DEFAULT_NON_PADDED_NAMESPACES),
        "pretrained_files": params.pop("pretrained_files", {}),
        "min_pretrained_embeddings": params.pop("min_pretrained_embeddings", None),
        "only_include_pretrained_words": params.pop_bool("only_include_pretrained_words", False),
        "tokens_to_add": params.pop("tokens_to_add", None),
    }
    params.assert_empty("Vocabulary - from dataset")

    logger.info("Fitting token dictionary from dataset.")
    # namespace -> token -> count; both levels are created on first access.
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)

    self._extend(counter=namespace_token_counts, **extend_options)
def setUp(self):
    """Build a small simple-tagger ``Trainer`` and store it on ``self.trainer``."""
    super().setUp()

    tagging_data = str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    model_config = {
        "type": "simple_tagger",
        "text_field_embedder": {
            "token_embedders": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            }
        },
        "encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    }
    trainer_config = {
        "cuda_device": -1,
        "num_epochs": 2,
        "optimizer": "adam"
    }
    params = Params({
        "model": model_config,
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": tagging_data,
        "validation_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": trainer_config
    })

    all_datasets = datasets_from_params(params)
    # The vocabulary is fitted on the instances of every dataset that was loaded.
    every_instance = (instance
                      for dataset in all_datasets.values()
                      for instance in dataset)
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}), every_instance)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    trainer_params = params.pop("trainer")
    serialization_dir = os.path.join(self.TEST_DIR, 'test_search_learning_rate')

    self.trainer = Trainer.from_params(model,
                                       serialization_dir,
                                       iterator,
                                       train_data,
                                       params=trainer_params,
                                       validation_data=None,
                                       validation_iterator=None)
def test_batch_predictions_are_consistent(self):
    # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
    # changing the amount of padding in the batch will result in small differences in the
    # output of the encoder.  Because BiDAF is so deep, these differences get magnified through
    # the network and make this test impossible.  So, we'll remove the CNN encoder entirely
    # from the model for this test.  If/when we fix the CNN encoder to work correctly with
    # masking, we can change this back to how the other models run this test, with just a
    # single line.
    # pylint: disable=protected-access,attribute-defined-outside-init

    # Remember the fixture state so it can be put back afterwards.
    original_model, original_instances = self.model, self.instances

    # Rebuild the instances with only the word-level "tokens" indexer.
    params = Params.from_file(self.param_file)
    reader = DatasetReader.from_params(params['dataset_reader'])
    reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
    self.instances = reader.read('tests/fixtures/data/squad.json')
    vocab = Vocabulary.from_instances(self.instances)
    for instance in self.instances:
        instance.index_fields(vocab)

    # Drop the character embedder from the model config and adjust the phrase
    # layer's input size accordingly.
    model_params = params['model']
    del model_params['text_field_embedder']['token_characters']
    model_params['phrase_layer']['input_size'] = 2
    self.model = Model.from_params(vocab, params['model'])

    self.ensure_batch_predictions_are_consistent()

    # Restore the state.
    self.model = original_model
    self.instances = original_instances
def test_file_archiving(self):
    # This happens to be a good place to test auxiliary file archiving.

    # Train the model.
    config_path = self.FIXTURES_ROOT / 'elmo' / 'config' / 'characters_token_embedder.json'
    params = Params.from_file(config_path)
    serialization_dir = os.path.join(self.TEST_DIR, 'serialization')
    train_model(params, serialization_dir)

    # Unpack the archive that training produced.
    archive_file = os.path.join(serialization_dir, 'model.tar.gz')
    unarchive_dir = os.path.join(self.TEST_DIR, 'unarchive')
    with tarfile.open(archive_file, 'r:gz') as archive:
        archive.extractall(unarchive_dir)

    # It should contain `files_to_archive.json`
    fta_file = os.path.join(unarchive_dir, 'files_to_archive.json')
    assert os.path.exists(fta_file)

    # Which should properly contain { flattened_key -> original_filename }
    with open(fta_file) as fta:
        files_to_archive = json.load(fta)

    elmo_fixtures = pathlib.Path('allennlp') / 'tests' / 'fixtures' / 'elmo'
    expected_files = {
        'model.text_field_embedder.token_embedders.elmo.options_file':
            str(elmo_fixtures / 'options.json'),
        'model.text_field_embedder.token_embedders.elmo.weight_file':
            str(elmo_fixtures / 'lm_weights.hdf5'),
    }
    assert files_to_archive == expected_files

    # Check that the unarchived contents of those files match the original contents.
    for key, original_filename in files_to_archive.items():
        archived_copy = os.path.join(unarchive_dir, "fta", key)
        assert filecmp.cmp(original_filename, archived_copy)
def setUp(self):
    """Store a small simple-tagger experiment configuration on ``self.params``."""
    super().setUp()

    embedder_config = {
        "tokens": {
            "type": "embedding",
            "embedding_dim": 5
        }
    }
    encoder_config = {
        "type": "lstm",
        "input_size": 5,
        "hidden_size": 7,
        "num_layers": 2
    }
    self.params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": embedder_config,
            "encoder": encoder_config
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
        "validation_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "num_epochs": 2,
            "optimizer": "adam",
        }
    })
def train_model_from_file(parameter_filename: str,
                          serialization_dir: str,
                          overrides: str = "",
                          file_friendly_logging: bool = False,
                          recover: bool = False) -> Model:
    """
    A wrapper around :func:`train_model` which loads the params from a file.

    Parameters
    ----------
    parameter_filename : ``str``
        A json parameter file specifying an AllenNLP experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs. We just pass this along to
        :func:`train_model`.
    overrides : ``str``
        A HOCON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  We just pass this
        along to :func:`train_model`.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    Whatever :func:`train_model` returns for the loaded configuration.
    """
    # Read the experiment config (with overrides applied) and delegate.
    experiment_params = Params.from_file(parameter_filename, overrides)
    return train_model(experiment_params,
                       serialization_dir,
                       file_friendly_logging,
                       recover)
def setUp(self):
    """Load the CopyNet fixture config, then build the reader, instances and vocabulary."""
    super(TestCopyNetReader, self).setUp()

    experiment_file = (self.FIXTURES_ROOT / "encoder_decoder" /
                       "copynet_seq2seq" / "experiment.json")
    params = Params.from_file(experiment_file)
    self.reader = DatasetReader.from_params(params["dataset_reader"])

    data_file = self.FIXTURES_ROOT / "data" / "copynet" / "copyover.tsv"
    instances = self.reader.read(data_file)
    self.instances = ensure_list(instances)
    # The vocabulary is built from the object returned by ``read``, not from
    # the list stored on ``self.instances``.
    self.vocab = Vocabulary.from_params(params=params["vocabulary"],
                                        instances=instances)
def fine_tune_model_from_file_paths(model_archive_path: str,
                                    config_file: str,
                                    serialization_dir: str,
                                    overrides: str = "",
                                    file_friendly_logging: bool = False) -> Model:
    """
    A wrapper around :func:`fine_tune_model` which loads the model archive from a file.

    Parameters
    ----------
    model_archive_path : ``str``
        Path to a saved model archive that is the result of running the ``train`` command.
    config_file : ``str``
        A configuration file specifying how to continue training.  The format is identical to the
        configuration file for the ``train`` command, but any contents in the ``model`` section is
        ignored (as we are using the provided model archive instead).
    serialization_dir : ``str``
        The directory in which to save results and logs. We just pass this along to
        :func:`fine_tune_model`.
    overrides : ``str``
        A JSON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  We just pass this
        along to :func:`fine_tune_model`.

    Returns
    -------
    Whatever :func:`fine_tune_model` returns.
    """
    # We don't need to pass in `cuda_device` here, because the trainer will call `model.cuda()` if
    # necessary.
    pretrained_model = load_archive(model_archive_path).model
    fine_tune_params = Params.from_file(config_file, overrides)
    return fine_tune_model(model=pretrained_model,
                           params=fine_tune_params,
                           serialization_dir=serialization_dir,
                           file_friendly_logging=file_friendly_logging)
def test_load_from_file(self):
    """A config loaded from disk exposes its top-level sections and model type."""
    params = Params.from_file('tests/fixtures/bidaf/experiment.json')

    for section in ("dataset_reader", "trainer"):
        assert section in params

    model_section = params.pop("model")
    model_type = model_section.pop("type")
    assert model_type == "bidaf"
def from_params(cls, params: Params):
    """
    Build an instance of ``cls`` from its configuration.

    Required integer keys: ``input_dim``, ``hidden_dim`` and
    ``feedforward_hidden_dim``.  Optional keys and their defaults:
    ``projection_dim`` (``None``), ``num_layers`` (2), ``num_attention_heads``
    (3), ``use_positional_encoding`` (``True``) and ``dropout_prob`` (0.2).
    """
    # Values are evaluated top to bottom, so the pop order is unchanged.
    encoder_kwargs = {
        'input_dim': params.pop_int('input_dim'),
        'hidden_dim': params.pop_int('hidden_dim'),
        'projection_dim': params.pop_int('projection_dim', None),
        'feedforward_hidden_dim': params.pop_int("feedforward_hidden_dim"),
        'num_layers': params.pop_int("num_layers", 2),
        'num_attention_heads': params.pop_int('num_attention_heads', 3),
        'use_positional_encoding': params.pop_bool('use_positional_encoding', True),
        'dropout_prob': params.pop_float("dropout_prob", 0.2),
    }
    params.assert_empty(cls.__name__)
    return cls(**encoder_kwargs)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'BidafEnsemble':  # type: ignore
    """
    Build a ``BidafEnsemble`` from the model archives listed under ``submodels``.

    Each entry of ``submodels`` is loaded with ``load_archive`` and its
    ``model`` attribute is collected.  A truthy ``vocab`` raises
    ``ConfigurationError``.
    """
    # pylint: disable=arguments-differ
    if vocab:
        raise ConfigurationError("vocab should be None")

    archive_paths = params.pop("submodels")
    loaded_models = [load_archive(archive_path).model for archive_path in archive_paths]
    return cls(submodels=loaded_models)
def find_learning_rate_from_args(args: argparse.Namespace) -> None:
    """
    Start learning rate finder for given args

    The config is loaded from ``args.param_path`` with ``args.overrides``
    applied; every other option is forwarded to
    :func:`find_learning_rate_model` unchanged.
    """
    params = Params.from_file(args.param_path, args.overrides)
    search_options = dict(start_lr=args.start_lr,
                          end_lr=args.end_lr,
                          num_batches=args.num_batches,
                          linear_steps=args.linear,
                          stopping_factor=args.stopping_factor,
                          force=args.force)
    find_learning_rate_model(params, args.serialization_dir, **search_options)
def test_mismatching_dimensions_throws_configuration_error(self):
    """Each wrong ``input_size`` must make model construction fail."""
    wrong_input_sizes = [
        # Make the phrase layer wrong - it should be 10 to match
        # the embedding + char cnn dimensions.
        ("phrase_layer", 12),
        # A second wrong phrase layer size.
        # NOTE(review): the earlier comment here described the modeling layer
        # (expected 40 = 4 * output_dim of the phrase_layer), but the key that
        # is changed is the phrase layer's input_size - confirm the intent.
        ("phrase_layer", 30),
        # Make the span end encoder wrong.
        # NOTE(review): the earlier comment described the modeling layer and an
        # expected size of 70 (4 * phrase_layer.output_dim +
        # 3 * modeling_layer.output_dim) - confirm.
        ("span_end_encoder", 50),
    ]
    for layer_name, bad_size in wrong_input_sizes:
        # Reload the config each time so only one dimension is wrong per case.
        params = Params.from_file(self.param_file)
        params["model"][layer_name]["input_size"] = bad_size
        with pytest.raises(ConfigurationError):
            Model.from_params(self.vocab, params.pop("model"))
def from_params(cls, params: Params) -> 'WordSplitter':
    """
    Build a word splitter from its configuration.

    Optional keys: ``language`` (default ``'en_core_web_sm'``) and the boolean
    flags ``pos_tags``, ``parse`` and ``ner`` (all default ``False``).  The
    values are passed to ``cls`` positionally in that order.
    """
    language = params.pop('language', 'en_core_web_sm')
    # Pop the three flags in declaration order: pos_tags, parse, ner.
    pos_tags, parse, ner = (params.pop_bool(flag, False)
                            for flag in ('pos_tags', 'parse', 'ner'))
    params.assert_empty(cls.__name__)
    return cls(language, pos_tags, parse, ner)
def from_params(cls, params: Params) -> 'MultiHeadedSimilarity':
    """
    Build a ``MultiHeadedSimilarity`` from its configuration.

    Required integer keys: ``num_heads`` and ``tensor_1_dim``.  Optional integer
    keys, all defaulting to ``None``: ``tensor_1_projected_dim``,
    ``tensor_2_dim`` and ``tensor_2_projected_dim``.  ``internal_similarity``
    is optional and defaults to an empty configuration, which is handed to
    ``SimilarityFunction.from_params``.  Any other key is reported by
    ``params.assert_empty``.
    """
    num_heads = params.pop_int("num_heads")
    tensor_1_dim = params.pop_int("tensor_1_dim")
    tensor_1_projected_dim = params.pop_int("tensor_1_projected_dim", None)
    tensor_2_dim = params.pop_int("tensor_2_dim", None)
    # This used to pop "tensor_1_projected_dim" a second time.  That key was
    # already consumed above, so the value was always None and a configured
    # "tensor_2_projected_dim" was left behind for assert_empty to reject.
    tensor_2_projected_dim = params.pop_int("tensor_2_projected_dim", None)
    internal_similarity = SimilarityFunction.from_params(params.pop("internal_similarity", {}))
    params.assert_empty(cls.__name__)
    return cls(num_heads=num_heads,
               tensor_1_dim=tensor_1_dim,
               tensor_1_projected_dim=tensor_1_projected_dim,
               tensor_2_dim=tensor_2_dim,
               tensor_2_projected_dim=tensor_2_projected_dim,
               internal_similarity=internal_similarity)
def load_config_file_print_params(params_fname: str, param_depth: int = -1):
    """
    Load the config at ``params_fname`` and print its parameters.

    Printing is delegated to ``print_params_at_depth``, starting at depth 1 and
    using ``param_depth`` as the depth cap.
    """
    loaded_config = Params.from_file(params_fname)
    print_params_at_depth(loaded_config, 1, depth_cap=param_depth)
def test_can_build_from_params(self):
    """An empty config yields a reader whose "tokens" indexer is a SingleIdTokenIndexer."""
    empty_config = Params({})
    reader = QangarooReader.from_params(empty_config)
    tokens_indexer = reader._token_indexers["tokens"]
    assert tokens_indexer.__class__.__name__ == "SingleIdTokenIndexer"
def test_read(self, lazy):
    """Read the QuAC sample and check the first instance's fields and answer markers."""
    reader = QuACReader.from_params(Params({"lazy": lazy, "num_context_answers": 2}))
    sample_path = str(AllenNlpTestCase.FIXTURES_ROOT / "data" / "quac_sample.json")
    instances = ensure_list(reader.read(sample_path))
    fields = instances[0].fields

    assert fields["question"].sequence_length() == 6
    assert fields["yesno_list"].sequence_length() == 6

    first_question_tokens = fields["question"].field_list[0].tokens
    assert [t.text for t in first_question_tokens[:3]] == ["What", "was", "the"]

    assert len(instances) == 2

    passage_length = len(fields["passage"].tokens)
    assert [t.text for t in fields["passage"].tokens[:3]] == ["DJ", "Kool", "Herc"]

    yesno_labels = [x.label for x in fields["yesno_list"].field_list]
    assert yesno_labels == ["x", "x", "y", "x", "x", "x"]

    followup_labels = [x.label for x in fields["followup_list"].field_list]
    assert followup_labels == ["y", "m", "m", "n", "m", "y"]

    assert fields["p1_answer_marker"].field_list[0].labels == ["O"] * passage_length

    # Check the previous answer marking here
    q0_span_start = fields["span_start"].field_list[0].sequence_index
    q0_span_end = fields["span_end"].field_list[0].sequence_index
    make_tag = "<{0:d}_{1:s}>".format

    def expected_marking(answer_distance):
        # Write start, then end, then the interior, in that order.
        labels = ["O"] * passage_length
        labels[q0_span_start] = make_tag(answer_distance, "start")
        labels[q0_span_end] = make_tag(answer_distance, "end")
        for passage_index in range(q0_span_start + 1, q0_span_end):
            labels[passage_index] = make_tag(answer_distance, "in")
        return labels

    prev_1_list = expected_marking(1)
    prev_2_list = expected_marking(2)

    assert fields["p1_answer_marker"].field_list[1].labels == prev_1_list
    assert fields["p2_answer_marker"].field_list[2].labels == prev_2_list
def from_params(cls, params: Params) -> 'BeamSearch':
    """
    Build a ``BeamSearch`` from its configuration.

    ``beam_size`` is the only key read, and it is required.
    """
    return cls(beam_size=params.pop('beam_size'))
def from_params(cls, vocab: Vocabulary, params: Params) -> 'ElmoTokenEmbedder':  # type: ignore
    """
    Build an ``ElmoTokenEmbedder`` from its configuration.

    Required keys: ``options_file`` and ``weight_file``; both are registered
    with ``params.add_file_to_archive`` before being popped.  Optional keys and
    defaults: ``requires_grad`` (``False``), ``do_layer_norm`` (``False``),
    ``dropout`` (0.5), ``namespace_to_cache`` (``None``), ``projection_dim``
    (``None``) and ``scalar_mix_parameters`` (``None``).

    When ``namespace_to_cache`` is given, the tokens of that vocabulary
    namespace are passed on as ``vocab_to_cache``; otherwise ``None`` is passed.
    """
    # pylint: disable=arguments-differ
    for archived_key in ('options_file', 'weight_file'):
        params.add_file_to_archive(archived_key)

    options_file = params.pop('options_file')
    weight_file = params.pop('weight_file')
    requires_grad = params.pop('requires_grad', False)
    do_layer_norm = params.pop_bool('do_layer_norm', False)
    dropout = params.pop_float("dropout", 0.5)

    namespace_to_cache = params.pop("namespace_to_cache", None)
    vocab_to_cache = None
    if namespace_to_cache is not None:
        token_to_index = vocab.get_token_to_index_vocabulary(namespace_to_cache)
        vocab_to_cache = list(token_to_index.keys())

    projection_dim = params.pop_int("projection_dim", None)
    scalar_mix_parameters = params.pop('scalar_mix_parameters', None)
    params.assert_empty(cls.__name__)

    return cls(options_file=options_file,
               weight_file=weight_file,
               do_layer_norm=do_layer_norm,
               dropout=dropout,
               requires_grad=requires_grad,
               projection_dim=projection_dim,
               vocab_to_cache=vocab_to_cache,
               scalar_mix_parameters=scalar_mix_parameters)
def from_params(cls,
                model: Model,
                serialization_dir: str,
                iterator: DataIterator,
                train_data: Iterable[Instance],
                validation_data: Optional[Iterable[Instance]],
                params: Params) -> 'Trainer':
    """
    Build a ``Trainer`` from the ``trainer`` section of an experiment config.

    ``optimizer`` is the only required key.  Optional keys and defaults:
    ``patience`` (``None``), ``validation_metric`` (``"-loss"``),
    ``num_epochs`` (20), ``cuda_device`` (-1), ``grad_norm`` (``None``),
    ``grad_clipping`` (``None``), ``learning_rate_scheduler`` (``None``),
    ``num_serialized_models_to_keep`` (``None``),
    ``keep_serialized_model_every_num_seconds`` (``None``),
    ``model_save_interval`` (``None``), ``summary_interval`` (100) and
    ``histogram_interval`` (``None``).

    When ``cuda_device`` is non-negative the model is moved with
    ``model.cuda(cuda_device)`` before the optimizer is created.  Only
    parameters with ``requires_grad`` set are given to the optimizer.
    """
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = params.pop_int("cuda_device", -1)
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)

    # The move happens first, so the optimizer below is built from the
    # parameters of the model returned by ``cuda``.
    if cuda_device >= 0:
        model = model.cuda(cuda_device)

    trainable_parameters = [[name, parameter]
                            for name, parameter in model.named_parameters()
                            if parameter.requires_grad]
    optimizer = Optimizer.from_params(trainable_parameters, params.pop("optimizer"))

    scheduler = (LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
                 if lr_scheduler_params else None)

    checkpoint_and_logging = {
        "num_serialized_models_to_keep":
            params.pop_int("num_serialized_models_to_keep", None),
        "keep_serialized_model_every_num_seconds":
            params.pop_int("keep_serialized_model_every_num_seconds", None),
        "model_save_interval": params.pop_float("model_save_interval", None),
        "summary_interval": params.pop_int("summary_interval", 100),
        "histogram_interval": params.pop_int("histogram_interval", None),
    }

    params.assert_empty(cls.__name__)
    return Trainer(model, optimizer, iterator,
                   train_data, validation_data,
                   patience=patience,
                   validation_metric=validation_metric,
                   num_epochs=num_epochs,
                   serialization_dir=serialization_dir,
                   cuda_device=cuda_device,
                   grad_norm=grad_norm,
                   grad_clipping=grad_clipping,
                   learning_rate_scheduler=scheduler,
                   **checkpoint_and_logging)
def from_params(cls, params: Params) -> 'DataIterator':
    """
    Dispatch to the ``from_params`` of the iterator subclass named by ``type``.

    ``type`` is popped with ``pop_choice`` against ``cls.list_available()``;
    the remaining ``params`` are handed to the chosen subclass.
    """
    # TODO(Mark): The adaptive iterator will need a bit of work here,
    # to retrieve the scaling function etc.
    available_types = cls.list_available()
    chosen_type = params.pop_choice("type", available_types)
    iterator_class = cls.by_name(chosen_type)
    return iterator_class.from_params(params)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'ProGlobal':
    """
    Build a ``ProGlobal`` model from its configuration.

    Required keys: ``text_field_embedder``, ``pos_field_embedder``,
    ``sent_pos_field_embedder``, ``modeling_layer``, ``span_end_encoder_bef``,
    ``span_start_encoder_aft`` and ``span_end_encoder_aft``.  Optional keys:
    ``dropout`` (default 0.2) and ``initializer``; when ``initializer`` is
    absent a default-constructed ``InitializerApplicator`` is used.
    """
    # All three embedder sections are popped before any embedder is built.
    token_embedder_params = params.pop("text_field_embedder")
    pos_embedder_params = params.pop("pos_field_embedder")
    sent_pos_embedder_params = params.pop("sent_pos_field_embedder")

    text_field_embedder = TextFieldEmbedder.from_params(vocab, token_embedder_params)
    pos_field_embedder = TextFieldEmbedder.from_params(vocab, pos_embedder_params)
    sent_pos_field_embedder = TextFieldEmbedder.from_params(vocab, sent_pos_embedder_params)

    modeling_layer = Seq2SeqEncoder.from_params(params.pop("modeling_layer"))
    span_end_encoder_before = Seq2SeqEncoder.from_params(params.pop("span_end_encoder_bef"))
    span_start_encoder_after = Seq2SeqEncoder.from_params(params.pop("span_start_encoder_aft"))
    span_end_encoder_after = Seq2SeqEncoder.from_params(params.pop("span_end_encoder_aft"))

    dropout = params.pop('dropout', 0.2)

    init_params = params.pop('initializer', None)
    if init_params is None:
        initializer = InitializerApplicator()
    else:
        initializer = InitializerApplicator.from_params(init_params)

    params.assert_empty(cls.__name__)
    return cls(vocab=vocab,
               text_field_embedder=text_field_embedder,
               pos_field_embedder=pos_field_embedder,
               sent_pos_field_embedder=sent_pos_field_embedder,
               modeling_layer=modeling_layer,
               span_start_encoder_after=span_start_encoder_after,
               span_end_encoder_before=span_end_encoder_before,
               span_end_encoder_after=span_end_encoder_after,
               dropout=dropout,
               initializer=initializer)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding':  # type: ignore
    """
    Build an ``Embedding`` from its configuration.

    The number of rows in the embedding matrix is taken from the ``num_embeddings`` key
    when it is given; otherwise it is the size of the vocabulary namespace named by
    ``vocab_namespace`` (default ``"tokens"``).  Pass ``num_embeddings`` explicitly when
    you know the size up front or are not embedding items from a vocabulary.

    Pretrained embeddings can be loaded by setting ``"pretrained_file"`` to a local path
    or to the URL of a (cached) remote file.  Two formats are supported:

        * hdf5 file - containing an embedding matrix in the form of a torch.Tensor;

        * text file - a utf-8 encoded text file with space separated fields::

                [word] [dim 1] [dim 2] ...

          The text file may optionally be compressed with gzip, bz2, lzma or zip.
          A single file inside a multi-file archive can be selected with the URI::

                "(archive_uri)#file_path_inside_the_archive"

          where ``archive_uri`` can be a file system path or a URL. For example::

                "(http://nlp.stanford.edu/data/glove.twitter.27B.zip)#glove.twitter.27B.200d.txt"
    """
    # pylint: disable=arguments-differ
    explicit_size = params.pop_int('num_embeddings', None)
    namespace = params.pop("vocab_namespace", "tokens")
    # Fall back to the vocabulary only when no explicit size was configured.
    num_embeddings = vocab.get_vocab_size(namespace) if explicit_size is None else explicit_size

    embedding_dim = params.pop_int('embedding_dim')
    pretrained_path = params.pop("pretrained_file", None)
    remaining_options = {
            "projection_dim": params.pop_int("projection_dim", None),
            "trainable": params.pop_bool("trainable", True),
            "padding_index": params.pop_int('padding_index', None),
            "max_norm": params.pop_float('max_norm', None),
            "norm_type": params.pop_float('norm_type', 2.),
            "scale_grad_by_freq": params.pop_bool('scale_grad_by_freq', False),
            "sparse": params.pop_bool('sparse', False),
    }
    params.assert_empty(cls.__name__)

    # If we're loading a saved model, we don't want to actually read a pre-trained
    # embedding file - the embeddings will just be in our saved weights, and we might not
    # have the original embedding file anymore, anyway.
    weight = None
    if pretrained_path:
        weight = _read_pretrained_embeddings_file(pretrained_path,
                                                  embedding_dim,
                                                  vocab,
                                                  namespace)

    return cls(num_embeddings=num_embeddings,
               embedding_dim=embedding_dim,
               weight=weight,
               **remaining_options)
def from_params(cls,  # type: ignore
                model: Model,
                serialization_dir: str,
                iterator: DataIterator,
                train_data: Iterable[Instance],
                validation_data: Optional[Iterable[Instance]],
                params: Params,
                validation_iterator: DataIterator = None) -> 'Trainer':
    """
    Construct a trainer from the trainer section of an experiment configuration.

    Parameters
    ----------
    model : ``Model``
        The model to train.  It is moved to the first configured CUDA device (if any)
        before the optimizer is created.
    serialization_dir : ``str``
        Forwarded to the trainer, and to the ``Checkpointer`` built from the legacy keys.
    iterator : ``DataIterator``
        Forwarded to the trainer unchanged.
    train_data : ``Iterable[Instance]``
        Forwarded to the trainer unchanged.
    validation_data : ``Optional[Iterable[Instance]]``
        Forwarded to the trainer unchanged.
    params : ``Params``
        The trainer options.  All keys are popped; leftovers fail ``assert_empty``.
    validation_iterator : ``DataIterator``, optional (default=None)
        Forwarded to the trainer unchanged.

    Raises
    ------
    ConfigurationError
        If ``checkpointer`` is configured together with either of the legacy keys
        ``num_serialized_models_to_keep`` / ``keep_serialized_model_every_num_seconds``.
    """
    # pylint: disable=arguments-differ
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_config = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_config = params.pop("momentum_scheduler", None)

    # With several devices configured, the model itself goes on the first one.
    primary_device = cuda_device[0] if isinstance(cuda_device, list) else cuda_device
    if primary_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(primary_device)

    parameters = [[name, tensor]
                  for name, tensor in model.named_parameters()
                  if tensor.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    moving_average = None
    if "moving_average" in params:
        moving_average = MovingAverage.from_params(params.pop("moving_average"),
                                                   parameters=parameters)

    lr_scheduler = (LearningRateScheduler.from_params(optimizer, lr_scheduler_config)
                    if lr_scheduler_config else None)
    momentum_scheduler = (MomentumScheduler.from_params(optimizer, momentum_scheduler_config)
                          if momentum_scheduler_config else None)

    if 'checkpointer' not in params:
        # Legacy configuration: build the checkpointer from the two top-level keys.
        checkpointer = Checkpointer(
                serialization_dir=serialization_dir,
                num_serialized_models_to_keep=params.pop_int(
                        "num_serialized_models_to_keep", 20),
                keep_serialized_model_every_num_seconds=params.pop_int(
                        "keep_serialized_model_every_num_seconds", None))
    else:
        legacy_keys = ('keep_serialized_model_every_num_seconds',
                       'num_serialized_models_to_keep')
        if any(key in params for key in legacy_keys):
            raise ConfigurationError(
                    "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                    "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                    " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))

    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)

    params.assert_empty(cls.__name__)

    return cls(model,
               optimizer,
               iterator,
               train_data,
               validation_data,
               patience=patience,
               validation_metric=validation_metric,
               validation_iterator=validation_iterator,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_norm=grad_norm,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               momentum_scheduler=momentum_scheduler,
               checkpointer=checkpointer,
               model_save_interval=model_save_interval,
               summary_interval=summary_interval,
               histogram_interval=histogram_interval,
               should_log_parameter_statistics=should_log_parameter_statistics,
               should_log_learning_rate=should_log_learning_rate,
               log_batch_size_period=log_batch_size_period,
               moving_average=moving_average)
def from_params(cls, params: Params) -> 'SimilarityFunction':
    """
    Instantiate the registered similarity function named by the ``'type'`` key.

    When ``'type'`` is absent the first entry of ``cls.list_available()`` is used.  The
    remaining parameters are passed to the selected subclass's ``from_params``.
    """
    registered_types = cls.list_available()
    type_name = params.pop_choice('type', registered_types, default_to_first_choice=True)
    similarity_class = cls.by_name(type_name)
    return similarity_class.from_params(params)
def main(serialization_directory: str, device: int, data: str, prefix: str, domain: str = None):
    """
    Run a trained model over an evaluation set and write its predictions and the gold
    annotations to two files inside ``serialization_directory``.

    serialization_directory : str, required.
        The directory containing ``config.json`` and ``model.tar.gz``.  The output files
        ``<prefix>_predictions.txt`` and ``<prefix>_gold.txt`` are written here too.
    device: int, required.
        The device to run the evaluation on; passed to ``load_archive`` as
        ``cuda_device`` and to ``move_to_device``.
    data: str, required.
        The data to evaluate on.  If empty or ``None``, the ``validation_data_path``
        from the original experiment's config is used.
    prefix: str, required.
        The prefix to prepend to the generated gold and prediction files, to distinguish
        different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
        The domain is also prepended to ``prefix``.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config["dataset_reader"])
    evaluation_data_path = data if data else config["validation_data_path"]

    archive = load_archive(os.path.join(serialization_directory, "model.tar.gz"),
                           cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory, prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory, prefix + "_gold.txt")

    # Open both outputs up front (so an unwritable location fails before the expensive
    # evaluation) but inside a ``with`` block, so the handles are closed even when
    # reading the data or running the model raises.
    with open(prediction_file_path, "w+") as prediction_file, \
            open(gold_file_path, "w+") as gold_file:
        # Load the evaluation data and index it.
        print("reading evaluation data from {}".format(evaluation_data_path))
        dataset = dataset_reader.read(evaluation_data_path)

        with torch.autograd.no_grad():
            loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=32)
            model_predictions: List[List[str]] = []
            for batch in Tqdm.tqdm(loader):
                batch = move_to_device(batch, device)
                result = model(**batch)
                predictions = model.decode(result)
                model_predictions.extend(predictions["tags"])

            # The sampler is sequential, so predictions line up with dataset order.
            for instance, prediction in zip(dataset, model_predictions):
                fields = instance.fields
                verb_index = fields["metadata"]["verb_index"]
                gold_tags = fields["metadata"]["gold_tags"]
                sentence = fields["metadata"]["words"]
                write_to_conll_eval_file(prediction_file,
                                         gold_file,
                                         verb_index,
                                         sentence,
                                         prediction,
                                         gold_tags)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'BidirectionalAttentionFlow':
    """
    Build a ``BidirectionalAttentionFlow`` model from its configuration.

    Parameters
    ----------
    vocab : ``Vocabulary``
        Passed to the text field embedder and to the model constructor.
    params : ``Params``
        Model options.  Required keys: ``text_field_embedder``, ``num_highway_layers``,
        ``phrase_layer``, ``similarity_function``, ``modeling_layer`` and
        ``span_end_encoder``.  Optional keys: ``dropout`` (default 0.2), ``initializer``,
        ``regularizer`` and ``mask_lstms`` (default ``True``).  Must be empty once all
        keys are popped.
    """
    def build_encoder(key: str):
        # Pops the named sub-config and turns it into a Seq2SeqEncoder.
        return Seq2SeqEncoder.from_params(params.pop(key))

    text_field_embedder = TextFieldEmbedder.from_params(vocab, params.pop("text_field_embedder"))
    highway_layer_count = params.pop_int("num_highway_layers")
    phrase_encoder = build_encoder("phrase_layer")
    attention_similarity = SimilarityFunction.from_params(params.pop("similarity_function"))
    modeling_encoder = build_encoder("modeling_layer")
    end_encoder = build_encoder("span_end_encoder")
    dropout_rate = params.pop_float('dropout', 0.2)

    initializer = InitializerApplicator.from_params(params.pop('initializer', []))
    regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))

    use_lstm_mask = params.pop_bool('mask_lstms', True)
    params.assert_empty(cls.__name__)

    return cls(vocab=vocab,
               text_field_embedder=text_field_embedder,
               num_highway_layers=highway_layer_count,
               phrase_layer=phrase_encoder,
               attention_similarity_function=attention_similarity,
               modeling_layer=modeling_encoder,
               span_end_encoder=end_encoder,
               dropout=dropout_rate,
               mask_lstms=use_lstm_mask,
               initializer=initializer,
               regularizer=regularizer)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'ModelMSMARCO':
    """
    Build a ``ModelMSMARCO`` model from its configuration.

    Parameters
    ----------
    vocab : ``Vocabulary``
        Passed to the text field embedder and to the model constructor.
    params : ``Params``
        Model options.  Required keys: ``text_field_embedder``, ``phrase_layer``,
        ``similarity_function``, ``residual_encoder``, ``span_start_encoder`` and
        ``span_end_encoder``.  Optional keys: ``dropout`` (default 0.2), ``initializer``,
        ``regularizer`` and ``mask_lstms`` (default ``True``).  Must be empty once all
        keys are popped.
    """
    def build_encoder(key: str):
        # Pops the named sub-config and turns it into a Seq2SeqEncoder.
        return Seq2SeqEncoder.from_params(params.pop(key))

    text_field_embedder = TextFieldEmbedder.from_params(vocab, params.pop("text_field_embedder"))
    phrase_encoder = build_encoder("phrase_layer")
    attention_similarity = SimilarityFunction.from_params(params.pop("similarity_function"))
    residual = build_encoder("residual_encoder")
    start_encoder = build_encoder("span_start_encoder")
    end_encoder = build_encoder("span_end_encoder")
    dropout_rate = params.pop_float('dropout', 0.2)

    initializer = InitializerApplicator.from_params(params.pop('initializer', []))
    regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))

    use_lstm_mask = params.pop_bool('mask_lstms', True)
    params.assert_empty(cls.__name__)

    return cls(vocab=vocab,
               text_field_embedder=text_field_embedder,
               phrase_layer=phrase_encoder,
               attention_similarity_function=attention_similarity,
               residual_encoder=residual,
               span_start_encoder=start_encoder,
               span_end_encoder=end_encoder,
               dropout=dropout_rate,
               mask_lstms=use_lstm_mask,
               initializer=initializer,
               regularizer=regularizer)
def create_serialization_dir(params: Params, serialization_dir: str, recover: bool) -> None:
    """
    This function creates the serialization directory if it doesn't exist.  If it already
    exists, then it verifies that we're recovering from a training with an identical
    configuration.

    Parameters
    ----------
    params: ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: ``str``
        The directory in which to save results and logs.
    recover: ``bool``
        If ``True``, we will try to recover from an existing serialization directory, and
        crash if the directory doesn't exist, or doesn't match the configuration we're given.

    Raises
    ------
    ConfigurationError
        If the directory exists and ``recover`` is ``False``; if it exists but holds no
        stored configuration; if the stored configuration differs from ``params``; or if
        ``recover`` is ``True`` but the directory does not exist.
    """
    if not os.path.exists(serialization_dir):
        if recover:
            raise ConfigurationError(f"--recover specified but serialization_dir ({serialization_dir}) "
                                     "does not exist.  There is nothing to recover from.")
        os.makedirs(serialization_dir)
        return

    if serialization_dir == '/output':
        # Special-casing the beaker output directory, which will already exist when training
        # starts.
        return

    if not recover:
        raise ConfigurationError(f"Serialization directory ({serialization_dir}) already exists.  "
                                 f"Specify --recover to recover training from existing output.")

    logger.info(f"Recovering from prior training at {serialization_dir}.")

    recovered_config_file = os.path.join(serialization_dir, CONFIG_NAME)
    if not os.path.exists(recovered_config_file):
        raise ConfigurationError("The serialization directory already exists but doesn't "
                                 "contain a config.json. You probably gave the wrong directory.")

    loaded_params = Params.from_file(recovered_config_file)

    # Check whether any of the training configuration differs from the configuration we are
    # resuming.  Every discrepancy is logged before failing, so the user sees them all.
    fail = False
    flat_params = params.as_flat_dict()
    flat_loaded = loaded_params.as_flat_dict()
    for key in flat_params.keys() - flat_loaded.keys():
        logger.error(f"Key '{key}' found in training configuration but not in the serialization "
                     f"directory we're recovering from.")
        fail = True
    for key in flat_loaded.keys() - flat_params.keys():
        logger.error(f"Key '{key}' found in the serialization directory we're recovering from "
                     f"but not in the training config.")
        fail = True
    for key in flat_params:
        # Only compare values for keys present on both sides.  Keys missing from the
        # stored config were reported above; indexing ``flat_loaded`` with them would
        # raise a KeyError instead of the intended ConfigurationError.
        if key in flat_loaded and flat_params[key] != flat_loaded[key]:
            logger.error(f"Value for '{key}' in training configuration does not match that the value in "
                         f"the serialization directory we're recovering from: "
                         f"{flat_params[key]} != {flat_loaded[key]}")
            fail = True
    if fail:
        raise ConfigurationError("Training configuration does not match the configuration we're "
                                 "recovering from.")
def find_learning_rate_model(params: Params,
                             serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs a learning rate search for ``num_batches`` batches and saves the resulting
    plot as ``lr-losses.png`` in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate at which the search starts.
    end_lr: ``float``
        Learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps: ``bool``
        Increase the learning rate linearly; if ``False``, exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by a
        multiple of the stopping factor. If ``None``, the search proceeds until ``end_lr``.
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.

    Raises
    ------
    ConfigurationError
        If the serialization directory exists and is not empty (after the optional
        forced removal), if ``datasets_for_vocab_creation`` names an unknown dataset,
        or if the configured trainer type is not ``"default"``.
    """
    if force and os.path.exists(serialization_dir):
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset_name in datasets_for_vocab_creation:
        if dataset_name not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset_name}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    def vocabulary_instances():
        # Lazily yields the instances of the datasets selected for vocabulary creation.
        for key, dataset in all_datasets.items():
            for instance in dataset:
                if key in datasets_for_vocab_creation:
                    yield instance

    vocab = Vocabulary.from_params(params.pop("vocabulary", {}), vocabulary_instances())

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")

    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    frozen_patterns = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(pattern, name) for pattern in frozen_patterns):
            parameter.requires_grad_(False)

    trainer_choice = trainer_params.pop("type", "default")
    if trainer_choice != "default":
        raise ConfigurationError("currently find-learning-rate only works with the default Trainer")

    trainer = Trainer.from_params(model=model,
                                  serialization_dir=serialization_dir,
                                  iterator=iterator,
                                  train_data=train_data,
                                  validation_data=None,
                                  params=trainer_params,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, raw_losses = search_learning_rate(trainer,
                                                      start_lr=start_lr,
                                                      end_lr=end_lr,
                                                      num_batches=num_batches,
                                                      linear_steps=linear_steps,
                                                      stopping_factor=stopping_factor)
    logger.info('Finished learning rate search.')

    smoothed_losses = _smooth(raw_losses, 0.98)
    plot_path = os.path.join(serialization_dir, 'lr-losses.png')
    _save_plot(learning_rates, smoothed_losses, plot_path)
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.

    Raises
    ------
    ValueError
        If ``evaluate_on_test`` is set together with a non-default trainer type.
    """
    create_serialization_dir(params, serialization_dir, recover, force)
    stdout_handler = prepare_global_logging(serialization_dir, file_friendly_logging)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params,  # pylint: disable=no-member
                                           serialization_dir,
                                           recover,
                                           cache_directory,
                                           cache_prefix)
        trainer = Trainer.from_params(
                model=pieces.model,
                serialization_dir=serialization_dir,
                iterator=pieces.iterator,
                train_data=pieces.train_dataset,
                validation_data=pieces.validation_dataset,
                params=pieces.params,
                validation_iterator=pieces.validation_iterator)

        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset

    else:
        if evaluate_on_test:
            raise ValueError("--evaluate-on-test only works with the default Trainer. "
                             "If you're using the CallbackTrainer you can use a callback "
                             "to evaluate at Events.TRAINING_END; otherwise you'll have "
                             "to run allennlp evaluate separately.")

        trainer = TrainerBase.from_params(params, serialization_dir, recover)
        # No test-set evaluation on this path; bind both names so that the evaluation
        # step below never refers to an undefined variable.
        evaluation_iterator = None
        evaluation_dataset = None

    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            # Use the module logger (as everywhere else in this function) rather than
            # the root logger, so the message carries this module's name.
            logger.info("Training interrupted by the user. Attempting to create "
                        "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(trainer.model, evaluation_dataset, evaluation_iterator,
                                cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
                                # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
                                batch_weight_key="")

        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif evaluation_dataset:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    cleanup_global_logging(stdout_handler)

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    The trained ``Model``.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)

    # TODO(mattg): pull this block out into a separate function (maybe just add this to
    # `prepare_environment`?)
    Tqdm.set_slower_interval(file_friendly_logging)
    stdout_log = os.path.join(serialization_dir, "stdout.log")
    stderr_log = os.path.join(serialization_dir, "stderr.log")
    # Mirror everything written to the standard streams into log files.
    sys.stdout = TeeLogger(stdout_log, sys.stdout, file_friendly_logging)  # type: ignore
    sys.stderr = TeeLogger(stderr_log, sys.stderr, file_friendly_logging)  # type: ignore

    file_handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(file_handler)

    # Persist the configuration before any of it is consumed by the pops below.
    config_snapshot = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as config_file:
        json.dump(config_snapshot, config_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset_name in datasets_for_vocab_creation:
        if dataset_name not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset_name}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))

    def vocabulary_instances():
        # Lazily yields the instances of the datasets selected for vocabulary creation.
        for key, dataset in all_datasets.items():
            for instance in dataset:
                if key in datasets_for_vocab_creation:
                    yield instance

    vocab = Vocabulary.from_params(params.pop("vocabulary", {}), vocabulary_instances())
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  params.pop("trainer"))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    metrics = trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data:
        if evaluate_on_test:
            test_metrics = evaluate(model, test_data, iterator,
                                    cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
            for metric_name, metric_value in test_metrics.items():
                metrics["test_" + metric_name] = metric_value
        else:
            logger.info("To evaluate on the test set after training, pass the "
                        "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    metrics_path = os.path.join(serialization_dir, "metrics.json")
    with open(metrics_path, "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
class ArchivalTest(AllenNlpTestCase):
    """
    Checks that training writes a ``model.tar.gz`` archive from which the model weights,
    vocabulary and configuration can be restored, and that extra files can be archived.
    """

    def setUp(self):
        super().setUp()

        tagger_config = {
                "type": "simple_tagger",
                "text_field_embedder": {
                        "tokens": {
                                "type": "embedding",
                                "embedding_dim": 5
                        }
                },
                "stacked_encoder": {
                        "type": "lstm",
                        "input_size": 5,
                        "hidden_size": 7,
                        "num_layers": 2
                }
        }
        self.params = Params({
                "model": tagger_config,
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
                "validation_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
                "iterator": {"type": "basic", "batch_size": 2},
                "trainer": {
                        "num_epochs": 2,
                        "optimizer": "adam",
                }
        })

    def test_archiving(self):
        # Snapshot the config first: training consumes ``self.params``.
        expected_config = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        trained_model = train_model(self.params, serialization_dir=self.TEST_DIR)

        # Restore the model from the archive that training produced.
        archive = load_archive(os.path.join(self.TEST_DIR, "model.tar.gz"))
        restored_model = archive.model

        # Both models must expose the same state entries with identical tensors.
        trained_state = trained_model.state_dict()
        restored_state = restored_model.state_dict()
        assert set(trained_state.keys()) == set(restored_state.keys())
        for name in trained_state:
            assert torch.equal(trained_state[name], restored_state[name])

        # The vocabularies must map tokens and indices identically.
        trained_vocab = trained_model.vocab
        restored_vocab = restored_model.vocab
        assert trained_vocab._token_to_index == restored_vocab._token_to_index  # pylint: disable=protected-access
        assert trained_vocab._index_to_token == restored_vocab._index_to_token  # pylint: disable=protected-access

        # The archived configuration must equal the one we trained with.
        assert archive.config.as_dict() == expected_config

    def test_extra_files(self):
        serialization_dir = os.path.join(self.TEST_DIR, 'serialization')

        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)

        # Re-archive the model, this time bundling the training data as an extra file.
        extra_files = {"train_data_path": 'tests/fixtures/data/sequence_tagging.tsv'}
        archive_model(serialization_dir=serialization_dir, files_to_archive=extra_files)

        archive = load_archive(os.path.join(serialization_dir, 'model.tar.gz'))
        restored_config = archive.config

        # The param in the data should have been replaced with a temporary path
        # (which we don't know, but we know what it ends with).
        assert restored_config.get('train_data_path').endswith('/fta/train_data_path')

        # The validation data path should be the same though.
        assert restored_config.get('validation_data_path') == 'tests/fixtures/data/sequence_tagging.tsv'
print(infos["extra.walkthrough"]) cnt = 0 while not all(dones) and cnt < 50: infos["gamefile"] = game_file commands = actor.act(obs, cumulative_rewards, dones, infos) obs, cumulative_rewards, dones, infos = env.step(commands) cnt += 1 infos["gamefile"] = game_file actor.act(obs, cumulative_rewards, dones, infos) print(get_sample_history_trace(actor.history, game_file)) if __name__ == "__main__": parser = ArgumentParser() parser.add_argument("game_file", type=str) args = parser.parse_args() params = Params.from_file("configs/config.jsonnet") agent = SimpleNet( config=params["network"], device="cpu", vocab_size=params["training"]["vocab_size"], ) agent.load_state_dict(torch.load(params["training"]["model_path"])) game_file = f"games/train/{args.game_file}" check_agent(game_file=game_file, agent_net=agent, train_params=params.pop("training"))
def ensure_model_can_train_save_and_load(
    self,
    param_file: str,
    tolerance: float = 1e-4,
    cuda_device: int = -1,
    gradients_to_ignore: Set[str] = None,
    overrides: str = "",
    disable_dropout: bool = True,
):
    """
    Train a model from `param_file`, reload it from the archive that training wrote, and
    check that the original and the reloaded model have the same parameters, see the
    same data, and produce matching outputs.

    # Parameters

    param_file : `str`
        Path to a training configuration file that we will use to train the model for this
        test.
    tolerance : `float`, optional (default=1e-4)
        Tolerance used when comparing the predictions of the originally-trained model with
        those of the model after saving and loading.
    cuda_device : `int`, optional (default=-1)
        The device to load the archived model on.
    gradients_to_ignore : `Set[str]`, optional (default=None)
        Parameter names to exclude from the gradient check, which verifies that gradients
        are computed for the parameters of the model.  Only use this when you are sure
        those parameters legitimately receive no gradient in a small test.
    overrides : `str`, optional (default = "")
        A JSON string that we will use to override values in the input parameter file.
    disable_dropout : `bool`, optional (default = True)
        If True we will set all dropout to 0 before checking gradients. (Otherwise, with
        small datasets, you may get zero gradients because of unlucky dropout.)

    # Returns

    The tuple `(model, loaded_model)`.
    """
    save_dir = self.TEST_DIR / "save_and_load_test"
    archive_file = save_dir / "model.tar.gz"
    model = train_model_from_file(param_file, save_dir, overrides=overrides)
    loaded_model = load_archive(archive_file, cuda_device=cuda_device).model

    # First we make sure that the state dict (the parameters) are the same for both models.
    original_state = model.state_dict()
    loaded_state = loaded_model.state_dict()
    assert original_state.keys() == loaded_state.keys()
    for name in original_state.keys():
        assert_allclose(
            original_state[name].cpu().numpy(),
            loaded_state[name].cpu().numpy(),
            err_msg=name,
        )

    params = Params.from_file(param_file, params_overrides=overrides)
    reader = DatasetReader.from_params(params["dataset_reader"])

    # Need to duplicate params because Iterator.from_params will consume.
    iterator_config = params["iterator"]
    iterator_config_copy = Params(copy.deepcopy(iterator_config.as_dict()))
    iterator = DataIterator.from_params(iterator_config)
    loaded_iterator = DataIterator.from_params(iterator_config_copy)

    # We'll check that even if we index the dataset with each model separately, we still get
    # the same result out.
    validation_path = params["validation_data_path"]

    print("Reading with original model")
    model_dataset = reader.read(validation_path)
    iterator.index_with(model.vocab)
    model_batch = next(iterator(model_dataset, shuffle=False))

    print("Reading with loaded model")
    loaded_dataset = reader.read(validation_path)
    loaded_iterator.index_with(loaded_model.vocab)
    loaded_batch = next(loaded_iterator(loaded_dataset, shuffle=False))

    # Check gradients are None for non-trainable parameters and check that
    # trainable parameters receive some gradient if they are trainable.
    self.check_model_computes_gradients_correctly(
        model, model_batch, gradients_to_ignore, disable_dropout
    )

    # The datasets themselves should be identical.
    assert model_batch.keys() == loaded_batch.keys()
    for field_name in model_batch.keys():
        self.assert_fields_equal(model_batch[field_name], loaded_batch[field_name], field_name, 1e-6)

    # Set eval mode, to turn off things like dropout, then get predictions.
    model.eval()
    loaded_model.eval()

    # Models with stateful RNNs need their states reset to have consistent
    # behavior after loading.
    for candidate in (model, loaded_model):
        for module in candidate.modules():
            if getattr(module, "stateful", False):
                module.reset_states()

    print("Predicting with original model")
    model_predictions = model(**model_batch)
    print("Predicting with loaded model")
    loaded_model_predictions = loaded_model(**loaded_batch)

    # Check loaded model's loss exists and we can compute gradients, for continuing training.
    loaded_model_loss = loaded_model_predictions["loss"]
    assert loaded_model_loss is not None
    loaded_model_loss.backward()

    # Both outputs should have the same keys and the values for these keys should be close.
    for output_name in model_predictions.keys():
        self.assert_fields_equal(
            model_predictions[output_name],
            loaded_model_predictions[output_name],
            name=output_name,
            tolerance=tolerance,
        )

    return model, loaded_model
def ensure_model_can_train_save_and_load(self,
                                         param_file: str,
                                         tolerance: float = 1e-4,
                                         cuda_device: int = -1):
    """
    Train a model from ``param_file``, load it back from the archive that training
    wrote, and check that the trained and the re-loaded model agree.

    Parameters
    ----------
    param_file : ``str``
        Path to the experiment configuration to train from.
    tolerance : ``float``, optional (default=1e-4)
        Tolerance handed to ``assert_fields_equal`` when comparing predictions.
    cuda_device : ``int``, optional (default=-1)
        Device passed to ``load_archive`` and to the data iterators.

    Returns
    -------
    A ``(model, loaded_model)`` tuple: the freshly trained model and the one
    restored from the archive.
    """
    output_dir = self.TEST_DIR / "save_and_load_test"
    archive_path = output_dir / "model.tar.gz"
    model = train_model_from_file(param_file, output_dir)
    loaded_model = load_archive(archive_path, cuda_device=cuda_device).model

    # Parameter names, then parameter values, must match between the two models.
    trained_state = model.state_dict()
    restored_state = loaded_model.state_dict()
    assert trained_state.keys() == restored_state.keys()
    for name in trained_state.keys():
        assert_allclose(trained_state[name].cpu().numpy(),
                        restored_state[name].cpu().numpy(),
                        err_msg=name)

    params = Params.from_file(param_file)
    reader = DatasetReader.from_params(params['dataset_reader'])

    # ``from_params`` consumes what it is given, so the second iterator is built
    # from a deep copy taken before the first one is constructed.
    iterator_params = params['iterator']
    iterator_params_copy = Params(copy.deepcopy(iterator_params.as_dict()))
    iterator = DataIterator.from_params(iterator_params)
    iterator2 = DataIterator.from_params(iterator_params_copy)

    def first_validation_batch(data_iterator, vocab):
        # Read the validation data afresh and index it with the given vocabulary.
        dataset = reader.read(params['validation_data_path'])
        data_iterator.index_with(vocab)
        return next(data_iterator(dataset, shuffle=False, cuda_device=cuda_device))

    # Each model indexes its own copy of the data; the batches should still match.
    model_batch = first_validation_batch(iterator, model.vocab)
    loaded_batch = first_validation_batch(iterator2, loaded_model.vocab)

    # Gradients must be None for non-trainable parameters and present for
    # trainable ones.
    self.check_model_computes_gradients_correctly(model, model_batch)

    # The two batches themselves have to be identical.
    assert model_batch.keys() == loaded_batch.keys()
    for field_name, field_value in model_batch.items():
        self.assert_fields_equal(field_value, loaded_batch[field_name], field_name, 1e-6)

    # Eval mode switches off things like dropout before predicting.
    model.eval()
    loaded_model.eval()

    # Stateful RNNs need their states reset to behave consistently after loading.
    for candidate in (model, loaded_model):
        for module in candidate.modules():
            if getattr(module, 'stateful', False):
                module.reset_states()

    model_predictions = model(**model_batch)
    loaded_model_predictions = loaded_model(**loaded_batch)

    # The loaded model must produce a loss we can backpropagate through, so that
    # training could be continued from it.
    loaded_model_loss = loaded_model_predictions["loss"]
    assert loaded_model_loss is not None
    loaded_model_loss.backward()

    # Every output of the trained model must be matched closely by the loaded one.
    for output_name, expected in model_predictions.items():
        self.assert_fields_equal(expected,
                                 loaded_model_predictions[output_name],
                                 name=output_name,
                                 tolerance=tolerance)

    return model, loaded_model
def test_train_model(self):
    """
    Exercise ``train_model`` against fresh, empty, non-empty and previously used
    serialization directories, with and without ``recover`` / ``force``.
    """

    def make_params():
        # A new ``Params`` object is built for every ``train_model`` call.
        return Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                        }
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                },
            },
            "dataset_reader": {
                "type": "sequence_tagging"
            },
            "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "data_loader": {
                "batch_size": 2
            },
            "trainer": {
                "num_epochs": 2,
                "optimizer": "adam"
            },
        })

    trained_dir = os.path.join(self.TEST_DIR, "test_train_model")
    train_model(make_params(), serialization_dir=trained_dir)

    # A directory that already exists but is empty is accepted.
    empty_dir = os.path.join(self.TEST_DIR, "empty_directory")
    assert not os.path.exists(empty_dir)
    os.makedirs(empty_dir)
    train_model(make_params(), serialization_dir=empty_dir)

    # A directory that already holds unrelated content is rejected.
    junk_dir = os.path.join(self.TEST_DIR, "non_empty_directory")
    assert not os.path.exists(junk_dir)
    os.makedirs(junk_dir)
    with open(os.path.join(junk_dir, "README.md"), "w") as readme:
        readme.write("TEST")
    with pytest.raises(ConfigurationError):
        train_model(make_params(), serialization_dir=junk_dir)

    # So is a directory that holds the results of an earlier run ...
    with pytest.raises(ConfigurationError):
        train_model(make_params(), serialization_dir=trained_dir)

    # ... unless --recover is given,
    train_model(make_params(), serialization_dir=trained_dir, recover=True)

    # ... or --force is given (the directory will be deleted).
    train_model(make_params(), serialization_dir=trained_dir, force=True)

    # --force and --recover are mutually exclusive.
    with pytest.raises(ConfigurationError):
        train_model(make_params(),
                    serialization_dir=trained_dir,
                    force=True,
                    recover=True)
def test_train_model_distributed_with_sharded_reader(self):
    """
    Train with a ``sharded`` dataset reader on two CUDA devices and check the
    files, vocabulary and per-worker logs that the run leaves behind.
    """
    config = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {
                        "type": "embedding",
                        "embedding_dim": 5
                    }
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            },
        },
        "dataset_reader": {
            "type": "sharded",
            "base_reader": {
                "type": "sequence_tagging"
            },
            "lazy": True,
        },
        "train_data_path": SEQUENCE_TAGGING_SHARDS_PATH,
        "validation_data_path": SEQUENCE_TAGGING_SHARDS_PATH,
        "data_loader": {
            "batch_size": 2
        },
        "trainer": {
            "num_epochs": 2,
            "optimizer": "adam"
        },
        "distributed": {
            "cuda_devices": [0, 1]
        },
    })

    out_dir = os.path.join(self.TEST_DIR, "test_distributed_train")
    train_model(config, serialization_dir=out_dir)

    # Each worker should have left its own stderr/stdout log next to the archive.
    produced_files = os.listdir(out_dir)
    for worker in (0, 1):
        for stream in ("stderr", "stdout"):
            assert f"{stream}_worker{worker}.log" in produced_files
    assert "model.tar.gz" in produced_files

    # The serialized model must be loadable.
    archive = load_archive(out_dir)
    assert archive.model

    # The vocabulary has to contain the tokens of every shard.
    tokens = archive.model.vocab._token_to_index["tokens"].keys()
    assert tokens == {
        "@@PADDING@@",
        "@@UNKNOWN@@",
        "are",
        ".",
        "animals",
        "plants",
        "vehicles",
        "cats",
        "dogs",
        "snakes",
        "birds",
        "ferns",
        "trees",
        "flowers",
        "vegetables",
        "cars",
        "buses",
        "planes",
        "rockets",
    }

    # TODO: This is somewhat brittle. Make these constants in trainer.py.
    train_early = "finishing training early!"
    validation_early = "finishing validation early!"
    train_complete = "completed its entire epoch (training)."
    validation_complete = "completed its entire epoch (validation)."
    early_markers = (train_early, validation_early)
    complete_markers = (train_complete, validation_complete)

    def read_stdout_log(worker):
        # Return the full text of the given worker's stdout log.
        with open(os.path.join(out_dir, f"stdout_worker{worker}.log")) as log_file:
            return log_file.read()

    # Three shards but only two workers: the first worker has to discard some data,
    # so it reports finishing early and never reports a complete epoch.
    worker0_log = read_stdout_log(0)
    for marker in early_markers:
        assert marker in worker0_log
    for marker in complete_markers:
        assert marker not in worker0_log

    # The second worker shows the opposite picture.
    worker1_log = read_stdout_log(1)
    for marker in early_markers:
        assert marker not in worker1_log
    for marker in complete_markers:
        assert marker in worker1_log
def from_params(cls, params: Params) -> 'Elmo':
    """
    Construct an instance of ``cls`` from a ``Params`` object.

    ``options_file``, ``weight_file`` and ``num_output_representations`` are popped
    without a default; ``requires_grad``, ``do_layer_norm``,
    ``keep_sentence_boundaries``, ``dropout`` and ``scalar_mix_parameters`` fall
    back to ``False``, ``False``, ``False``, ``0.5`` and ``None`` respectively.
    After all keys are popped, ``params.assert_empty`` is called.
    """
    # Add files to archive
    for file_key in ('options_file', 'weight_file'):
        params.add_file_to_archive(file_key)

    # Dict entries are evaluated top to bottom, so keys are popped in this order.
    constructor_kwargs = {
        'options_file': params.pop('options_file'),
        'weight_file': params.pop('weight_file'),
        'requires_grad': params.pop('requires_grad', False),
        'num_output_representations': params.pop('num_output_representations'),
        'do_layer_norm': params.pop_bool('do_layer_norm', False),
        'keep_sentence_boundaries': params.pop_bool('keep_sentence_boundaries', False),
        'dropout': params.pop_float('dropout', 0.5),
        'scalar_mix_parameters': params.pop('scalar_mix_parameters', None),
    }
    params.assert_empty(cls.__name__)

    return cls(**constructor_kwargs)
def test_can_build_from_params(self):
    """A reader built from empty params gets the expected tokenizer and indexer classes."""
    empty_params = Params({})
    reader = SquadReaderV2.from_params(empty_params)

    # pylint: disable=protected-access
    tokenizer = reader._tokenizer
    assert tokenizer.__class__.__name__ == 'WordTokenizer'

    tokens_indexer = reader._token_indexers["tokens"]
    assert tokens_indexer.__class__.__name__ == 'SingleIdTokenIndexer'
def debug_vocab(parameter_filename: str,
                serialization_dir: str,
                overrides: str = "",
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Build the vocabulary and model described by a parameter file and print a short
    summary of the model vocabulary's ``tokens`` and ``labels`` namespaces.

    Nothing is trained here: the datasets are read, a vocabulary is created from
    them, the model is constructed, and the vocabulary held by the model is
    inspected.

    Parameters
    ----------
    parameter_filename : ``str``
        A json parameter file specifying an AllenNLP experiment.
    serialization_dir : ``str``
        Directory passed along to ``prepare_global_logging``.
    overrides : ``str``
        A JSON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        Passed along to ``prepare_global_logging``.
    recover : ``bool``, optional (default=False)
        Not read by this function; kept so existing callers keep working.
    force : ``bool``, optional (default=False)
        Not read by this function; kept so existing callers keep working.

    Returns
    -------
    model : ``Model``
        The model built from the ``model`` section of the configuration.

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    params = Params.from_file(parameter_filename, overrides)

    prepare_global_logging(serialization_dir, file_friendly_logging)
    check_for_gpu(params.get('trainer').get('cuda_device', -1))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance
         for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Inspect the vocabulary the model actually holds, not the one we passed in.
    vocab = model.vocab
    vocab_namespace_dict = vocab._token_to_index  # pylint: disable=protected-access
    vocab_oov_token = vocab._oov_token  # pylint: disable=protected-access
    vocab_non_padded_namespaces = vocab._non_padded_namespaces  # this is a set

    # (namespace, its token -> index mapping, whether the OOV token is expected in it).
    # Both namespaces are looked up before anything is printed.
    namespace_reports = [
        ('tokens', vocab_namespace_dict['tokens'], True),
        ('labels', vocab_namespace_dict['labels'], False),
    ]

    print()
    print("Vocab's OOV token: " + vocab_oov_token)
    print("Non-padded namespaces in vocab: " + str(list(vocab_non_padded_namespaces)))

    for namespace, namespace_dict, oov_expected in namespace_reports:
        # A namespace counts as padded unless it matches a non-padded pattern.
        is_padded = not any(namespace_match(pattern, namespace)
                            for pattern in vocab_non_padded_namespaces)
        print()
        print(f"Number of words in vocab's {namespace} dict: " + str(len(namespace_dict)))
        print(f"{namespace} will return True for is_padded: " + str(is_padded))
        print(f"Vocab's OOV token is in its {namespace} dict (should be {oov_expected}): " +
              str(vocab_oov_token in namespace_dict))

    # The signature promises a ``Model``; hand back the one that was built.
    return model
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` names a dataset that was not loaded, or if
        ``create_serialization_dir`` rejects ``serialization_dir``.
    KeyboardInterrupt
        Re-raised after an attempt to archive the current best weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Snapshot the full configuration before the ``pop`` calls below consume it.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance
         for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            # Use the module logger like the rest of this function, not the root logger.
            logger.info("Training interrupted by the user. Attempting to create "
                        "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    # NOTE(review): presumably the same file as ``_DEFAULT_WEIGHTS`` used above - confirm
    # before replacing the literal.
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    # The trained model object is reused; only its weights are replaced.
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model, test_data, iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return best_model
import torch
import torch.optim as optim

# Sizes shared by the embedder and the two encoders below.
EMBEDDING_DIM = 128
HIDDEN_DIM = 256


def _build_encoder():
    # Wrap a bidirectional, batch-first LSTM over the token embeddings.
    lstm = torch.nn.LSTM(input_size=EMBEDDING_DIM,
                         hidden_size=HIDDEN_DIM,
                         bidirectional=True,
                         batch_first=True)
    return PytorchSeq2SeqWrapper(lstm)


reader = MultiwozReader()

print("Reading the delexiclaized data from training set and validation set...")
train_dataset = reader.read("./data/train_delex.json")
valid_dataset = reader.read("./data/valid_delex.json")

print("Building vocabulary from training set and validation set...")
vocab = Vocabulary.from_instances(train_dataset + valid_dataset)
print("Temporary vocabulary has been built.")

# Embedder configuration; the embedding width is the one the encoders consume.
params = Params({
    "token_embedders": {
        "tokens": {
            "type": "embedding",
            "embedding_dim": EMBEDDING_DIM
        }
    }
})
source_embedder = BasicTextFieldEmbedder.from_params(vocab, params=params)

# Two separately constructed encoders with the same configuration.
encoder1 = _build_encoder()
encoder2 = _build_encoder()
attention = DotProductAttention()

print("Use SPNet with default setting...")
model = SPNet(vocab, source_embedder, encoder1, encoder2, attention)
def from_params(cls, vocab: Vocabulary, params: Params) -> "ElmoTokenEmbedder":  # type: ignore
    """
    Construct an instance of ``cls`` from a ``Params`` object.

    ``options_file`` and ``weight_file`` are popped without a default. When
    ``namespace_to_cache`` is given, the tokens of that vocabulary namespace are
    passed on as ``vocab_to_cache``; otherwise ``vocab_to_cache`` is ``None``.
    After all keys are popped, ``params.assert_empty`` is called.
    """
    for file_key in ("options_file", "weight_file"):
        params.add_file_to_archive(file_key)

    # Dict entries are evaluated top to bottom, so keys are popped in this order.
    constructor_kwargs = {
        "options_file": params.pop("options_file"),
        "weight_file": params.pop("weight_file"),
        "requires_grad": params.pop("requires_grad", False),
        "do_layer_norm": params.pop_bool("do_layer_norm", False),
        "dropout": params.pop_float("dropout", 0.5),
    }

    namespace_to_cache = params.pop("namespace_to_cache", None)
    constructor_kwargs["vocab_to_cache"] = (
        None
        if namespace_to_cache is None
        else list(vocab.get_token_to_index_vocabulary(namespace_to_cache).keys())
    )

    constructor_kwargs["projection_dim"] = params.pop_int("projection_dim", None)
    constructor_kwargs["scalar_mix_parameters"] = params.pop("scalar_mix_parameters", None)
    params.assert_empty(cls.__name__)

    return cls(**constructor_kwargs)