def test_from_params(self):
    # pylint: disable=protected-access
    params = Params({})
    with pytest.raises(ConfigurationError):
        iterator = BucketIterator.from_params(params)

    sorting_keys = [("s1", "nt"), ("s2", "nt2")]
    params['sorting_keys'] = sorting_keys
    iterator = BucketIterator.from_params(params)
    assert iterator._sorting_keys == sorting_keys
    assert iterator._padding_noise == 0.1
    assert not iterator._biggest_batch_first
    assert iterator._batch_size == 32

    params = Params({
        "sorting_keys": sorting_keys,
        "padding_noise": 0.5,
        "biggest_batch_first": True,
        "batch_size": 100
    })
    iterator = BucketIterator.from_params(params)
    assert iterator._sorting_keys == sorting_keys
    assert iterator._padding_noise == 0.5
    assert iterator._biggest_batch_first
    assert iterator._batch_size == 100
def from_params(self, params: Params) -> PytorchSeq2VecWrapper:
    if not params.pop('batch_first', True):
        raise ConfigurationError("Our encoder semantics assumes batch is always first!")
    if self._module_class in self.PYTORCH_MODELS:
        params['batch_first'] = True
    module = self._module_class(**params.as_dict())
    return PytorchSeq2VecWrapper(module)
def setUp(self):
    super().setUp()
    self.params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
        "validation_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "num_epochs": 2,
            "optimizer": "adam"
        }
    })
def test_forward(self):
    batch = 16
    len1, len2 = 21, 24
    seq_len1 = torch.randint(low=len1 - 10, high=len1 + 1, size=(batch,)).long()
    seq_len2 = torch.randint(low=len2 - 10, high=len2 + 1, size=(batch,)).long()

    mask1 = []
    for w in seq_len1:
        mask1.append([1] * w.item() + [0] * (len1 - w.item()))
    mask1 = torch.FloatTensor(mask1)
    mask2 = []
    for w in seq_len2:
        mask2.append([1] * w.item() + [0] * (len2 - w.item()))
    mask2 = torch.FloatTensor(mask2)

    d = 200  # hidden dimension
    l = 20  # number of perspectives
    test1 = torch.randn(batch, len1, d)
    test2 = torch.randn(batch, len2, d)
    test1 = test1 * mask1.view(-1, len1, 1).expand(-1, len1, d)
    test2 = test2 * mask2.view(-1, len2, 1).expand(-1, len2, d)

    test1_fw, test1_bw = torch.split(test1, d // 2, dim=-1)
    test2_fw, test2_bw = torch.split(test2, d // 2, dim=-1)

    ml_fw = BiMpmMatching.from_params(Params({"is_forward": True, "num_perspectives": l}))
    ml_bw = BiMpmMatching.from_params(Params({"is_forward": False, "num_perspectives": l}))

    vecs_p_fw, vecs_h_fw = ml_fw(test1_fw, mask1, test2_fw, mask2)
    vecs_p_bw, vecs_h_bw = ml_bw(test1_bw, mask1, test2_bw, mask2)
    vecs_p, vecs_h = torch.cat(vecs_p_fw + vecs_p_bw, dim=2), torch.cat(vecs_h_fw + vecs_h_bw, dim=2)

    assert vecs_p.size() == torch.Size([batch, len1, 10 + 10 * l])
    assert vecs_h.size() == torch.Size([batch, len2, 10 + 10 * l])
    assert ml_fw.get_output_dim() == ml_bw.get_output_dim() == vecs_p.size(2) // 2 == vecs_h.size(2) // 2
def test_read_embedding_file_inside_archive(self):
    token2vec = {
        "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
        "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
        "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
        "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
    }
    vocab = Vocabulary()
    for token in token2vec:
        vocab.add_token_to_namespace(token)

    params = Params({
        'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
        'embedding_dim': 5
    })
    with pytest.raises(ValueError,
                       message="No ValueError when pretrained_file is a multi-file archive"):
        Embedding.from_params(vocab, params)

    for ext in ['.zip', '.tar.gz']:
        archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
        file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
        params = Params({'pretrained_file': file_uri, 'embedding_dim': 5})
        embeddings = Embedding.from_params(vocab, params).weight.data
        for tok, vec in token2vec.items():
            i = vocab.get_token_index(tok)
            assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
def test_from_params(self):
    # pylint: disable=protected-access
    params = Params({})
    iterator = BasicIterator.from_params(params)
    assert iterator._batch_size == 32  # default value

    params = Params({"batch_size": 10})
    iterator = BasicIterator.from_params(params)
    assert iterator._batch_size == 10
def from_params(cls, vocab: Vocabulary, params: Params) -> 'TokenCharactersEncoder':  # type: ignore
    # pylint: disable=arguments-differ
    embedding_params: Params = params.pop("embedding")
    # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
    # that to be "token_characters" by default.
    embedding_params.setdefault("vocab_namespace", "token_characters")
    embedding = Embedding.from_params(vocab, embedding_params)
    encoder_params: Params = params.pop("encoder")
    encoder = Seq2VecEncoder.from_params(encoder_params)
    dropout = params.pop_float("dropout", 0.0)
    params.assert_empty(cls.__name__)
    return cls(embedding, encoder, dropout)
def test_extras(self):
    # pylint: disable=unused-variable,arguments-differ
    from srl_model.common.registrable import Registrable

    class A(Registrable):
        pass

    @A.register("b")
    class B(A):
        def __init__(self, size: int, name: str) -> None:
            self.size = size
            self.name = name

    @A.register("c")
    class C(A):
        def __init__(self, size: int, name: str) -> None:
            self.size = size
            self.name = name

        # custom from params
        @classmethod
        def from_params(cls, params: Params, size: int) -> 'C':  # type: ignore
            name = params.pop('name')
            return cls(size=size, name=name)

    # Check that extras get passed, even though A doesn't need them.
    params = Params({"type": "b", "size": 10})
    b = A.from_params(params, name="extra")
    assert b.name == "extra"
    assert b.size == 10

    # Check that extra extras don't get passed.
    params = Params({"type": "b", "size": 10})
    b = A.from_params(params, name="extra", unwanted=True)
    assert b.name == "extra"
    assert b.size == 10

    # Now the same with a custom from_params.
    params = Params({"type": "c", "name": "extra_c"})
    c = A.from_params(params, size=20)
    assert c.name == "extra_c"
    assert c.size == 20

    # Check that extra extras don't get passed.
    params = Params({"type": "c", "name": "extra_c"})
    c = A.from_params(params, size=20, unwanted=True)
    assert c.name == "extra_c"
    assert c.size == 20
def test_mismatched_dimensions_raise_configuration_errors(self):
    params = Params.from_file(self.param_file)
    # Make the input_dim to the first feedforward_layer wrong - it should be 2.
    params["model"]["attend_feedforward"]["input_dim"] = 10
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=params.pop("model"))

    params = Params.from_file(self.param_file)
    # Make the projection output_dim of the last layer wrong - it should be
    # 3, equal to the number of classes.
    params["model"]["aggregate_feedforward"]["output_dim"] = 10
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=params.pop("model"))
def test_can_construct_from_params(self):
    params = Params({
        'embedding_dim': 5,
    })
    encoder = BagOfEmbeddingsEncoder.from_params(params)
    assert encoder.get_input_dim() == 5
    assert encoder.get_output_dim() == 5

    params = Params({
        'embedding_dim': 12,
        'averaged': True
    })
    encoder = BagOfEmbeddingsEncoder.from_params(params)
    assert encoder.get_input_dim() == 12
    assert encoder.get_output_dim() == 12
def test_search(self):
    beam_search = BeamSearch.from_params(Params({'beam_size': 4}))
    initial_state = SimpleDecoderState([0, 1, 2, 3],
                                       [[], [], [], []],
                                       [torch.Tensor([0.0]), torch.Tensor([0.0]),
                                        torch.Tensor([0.0]), torch.Tensor([0.0])],
                                       [-3, 1, -20, 5])
    decoder_step = SimpleDecoderStep(include_value_in_score=True)
    best_states = beam_search.search(5,
                                     initial_state,
                                     decoder_step,
                                     keep_final_unfinished_states=False)

    # Instance with batch index 2 needed too many steps to finish, and batch index 3 had no
    # path to get to a finished state.  (See the simple transition system definition; the goal
    # is to end up at 4, and actions either add one or two to the starting value.)
    assert len(best_states) == 2
    assert best_states[0][0].action_history[0] == [-1, 1, 3, 4]
    assert best_states[1][0].action_history[0] == [3, 4]

    best_states = beam_search.search(5,
                                     initial_state,
                                     decoder_step,
                                     keep_final_unfinished_states=True)

    # Now we're keeping final unfinished states, which allows a "best state" for the instances
    # that didn't have one before.  Our previous best states for the instances that finish
    # don't change, because the score for taking another step is always negative at these
    # values.
    assert len(best_states) == 4
    assert best_states[0][0].action_history[0] == [-1, 1, 3, 4]
    assert best_states[1][0].action_history[0] == [3, 4]
    assert best_states[2][0].action_history[0] == [-18, -16, -14, -12, -10]
    assert best_states[3][0].action_history[0] == [7, 9, 11, 13, 15]
def test_train_with_test_set(self):
    params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "dataset_reader": {"type": "lazy-test"},
        "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "test_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "evaluate_on_test": True,
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "num_epochs": 2,
            "optimizer": "adam"
        }
    })

    train_model(params, serialization_dir=os.path.join(self.TEST_DIR, 'lazy_test_set'))
def test_forward_gives_correct_output(self):
    params = Params({
        'input_dim': 2,
        'output_dims': 3,
        'pool_sizes': 4,
        'dropout': 0.0,
        'num_layers': 2
    })
    maxout = Maxout.from_params(params)

    constant_init = lambda tensor: torch.nn.init.constant_(tensor, 1.)
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(maxout)

    input_tensor = torch.FloatTensor([[-3, 1]])
    output = maxout(input_tensor).data.numpy()
    assert output.shape == (1, 3)

    # This output was checked by hand.
    # The output of the first maxout layer is [-1, -1, -1]: the matrix multiply gives us
    # [-2]*12, reshaping and maxing produces [-2, -2, -2], and the bias increments these values.
    # The second layer output is [-2, -2, -2]: the matrix multiply gives us [-3]*12, reshaping
    # and maxing produces [-3, -3, -3], and the bias increments these values.
    assert_almost_equal(output, [[-2, -2, -2]])
def fine_tune_model_from_file_paths(model_archive_path: str,
                                    config_file: str,
                                    serialization_dir: str,
                                    overrides: str = "",
                                    extend_vocab: bool = False,
                                    file_friendly_logging: bool = False) -> Model:
    """
    A wrapper around :func:`fine_tune_model` which loads the model archive from a file.

    Parameters
    ----------
    model_archive_path : ``str``
        Path to a saved model archive that is the result of running the ``train`` command.
    config_file : ``str``
        A configuration file specifying how to continue training.  The format is identical to
        the configuration file for the ``train`` command, but any contents in the ``model``
        section are ignored (as we are using the provided model archive instead).
    serialization_dir : ``str``
        The directory in which to save results and logs.  We just pass this along to
        :func:`fine_tune_model`.
    overrides : ``str``
        A JSON string that we will use to override values in the input parameter file.
    extend_vocab : ``bool``, optional (default=False)
        Whether to extend the model's vocabulary.  We just pass this along to
        :func:`fine_tune_model`.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  We just pass this
        along to :func:`fine_tune_model`.
    """
    # We don't need to pass in `cuda_device` here, because the trainer will call `model.cuda()` if
    # necessary.
    archive = load_archive(model_archive_path)
    params = Params.from_file(config_file, overrides)
    return fine_tune_model(model=archive.model,
                           params=params,
                           serialization_dir=serialization_dir,
                           extend_vocab=extend_vocab,
                           file_friendly_logging=file_friendly_logging)
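# Usage sketch (not part of the original module): the paths below are hypothetical
# placeholders; they only illustrate how fine_tune_model_from_file_paths might be called
# to continue training a model previously saved by the ``train`` command.
fine_tuned = fine_tune_model_from_file_paths(
    model_archive_path="path/to/model.tar.gz",      # archive produced by ``train`` (hypothetical)
    config_file="path/to/fine_tune.json",           # config for continued training (hypothetical)
    serialization_dir="path/to/fine_tune_output",   # where results and logs go (hypothetical)
    overrides="",
    extend_vocab=False,
    file_friendly_logging=False)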
def test_error_is_thrown_when_cuda_device_is_not_available(self):
    params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
        "validation_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "num_epochs": 2,
            "cuda_device": torch.cuda.device_count(),
            "optimizer": "adam"
        }
    })

    with pytest.raises(ConfigurationError,
                       message="Experiment specified a GPU but none is available;"
                               " if you want to run on CPU use the override"
                               " 'trainer.cuda_device=-1' in the json config file."):
        train_model(params, serialization_dir=os.path.join(self.TEST_DIR, 'test_train_model'))
def test_forward_works_with_projection_layer(self):
    params = Params({
        'options_file': self.FIXTURES_ROOT / 'elmo' / 'options.json',
        'weight_file': self.FIXTURES_ROOT / 'elmo' / 'lm_weights.hdf5',
        'projection_dim': 20
    })
    word1 = [0] * 50
    word2 = [0] * 50
    word1[0] = 6
    word1[1] = 5
    word1[2] = 4
    word1[3] = 3
    word2[0] = 3
    word2[1] = 2
    word2[2] = 1
    word2[3] = 0
    embedding_layer = ElmoTokenEmbedder.from_params(vocab=None, params=params)

    input_tensor = torch.LongTensor([[word1, word2]])
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 2, 20)

    input_tensor = torch.LongTensor([[[word1]]])
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 1, 1, 20)
def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    vocab.add_token_to_namespace(unicode_space)
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode('utf-8'))
    params = Params({
        'pretrained_file': embeddings_filename,
        'embedding_dim': 3,
    })
    embedding_layer = Embedding.from_params(vocab, params)
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
    assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)]
    assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
def test_can_construct_from_params(self):
    params = Params({
        'embedding_dim': 5,
        'num_filters': 4,
        'ngram_filter_sizes': [3, 5]
    })
    encoder = CnnEncoder.from_params(params)
    assert encoder.get_output_dim() == 8

    params = Params({
        'embedding_dim': 5,
        'num_filters': 4,
        'ngram_filter_sizes': [3, 5],
        'output_dim': 7
    })
    encoder = CnnEncoder.from_params(params)
    assert encoder.get_output_dim() == 7
def test_batch_predictions_are_consistent(self):
    # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
    # changing the amount of padding in the batch will result in small differences in the
    # output of the encoder.  Because BiDAF is so deep, these differences get magnified through
    # the network and make this test impossible.  So, we'll remove the CNN encoder entirely
    # from the model for this test.  If/when we fix the CNN encoder to work correctly with
    # masking, we can change this back to how the other models run this test, with just a
    # single line.
    # pylint: disable=protected-access,attribute-defined-outside-init

    # Save some state.
    saved_model = self.model
    saved_instances = self.instances

    # Modify the state, run the test with modified state.
    params = Params.from_file(self.param_file)
    reader = DatasetReader.from_params(params['dataset_reader'])
    reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
    self.instances = reader.read(self.FIXTURES_ROOT / 'data' / 'squad.json')
    vocab = Vocabulary.from_instances(self.instances)
    for instance in self.instances:
        instance.index_fields(vocab)
    del params['model']['text_field_embedder']['token_embedders']['token_characters']
    params['model']['phrase_layer']['input_size'] = 2
    self.model = Model.from_params(vocab=vocab, params=params['model'])

    self.ensure_batch_predictions_are_consistent()

    # Restore the state.
    self.model = saved_model
    self.instances = saved_instances
def test_forward_runs_with_non_bijective_mapping(self):
    elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
    options_file = str(elmo_fixtures_path / 'options.json')
    weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
    params = Params({
        "words": {
            "type": "embedding",
            "num_embeddings": 20,
            "embedding_dim": 2,
        },
        "elmo": {
            "type": "elmo_token_embedder",
            "options_file": options_file,
            "weight_file": weight_file
        },
        "embedder_to_indexer_map": {
            "words": ["words"],
            "elmo": ["elmo", "words"]
        }
    })
    token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
    inputs = {
        'words': (torch.rand(3, 6) * 20).long(),
        'elmo': (torch.rand(3, 6, 50) * 15).long(),
    }
    token_embedder(inputs)
def setUp(self):
    super(TestBasicTextFieldEmbedder, self).setUp()
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("1")
    self.vocab.add_token_to_namespace("2")
    self.vocab.add_token_to_namespace("3")
    self.vocab.add_token_to_namespace("4")
    params = Params({
        "words1": {
            "type": "embedding",
            "embedding_dim": 2
        },
        "words2": {
            "type": "embedding",
            "embedding_dim": 5
        },
        "words3": {
            "type": "embedding",
            "embedding_dim": 3
        }
    })
    self.token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params)
    self.inputs = {
        "words1": torch.LongTensor([[0, 2, 3, 5]]),
        "words2": torch.LongTensor([[1, 4, 3, 2]]),
        "words3": torch.LongTensor([[1, 5, 1, 2]])
    }
def test_forward_works_on_higher_order_input(self):
    params = Params({
        "words": {
            "type": "embedding",
            "num_embeddings": 20,
            "embedding_dim": 2,
        },
        "characters": {
            "type": "character_encoding",
            "embedding": {
                "embedding_dim": 4,
                "num_embeddings": 15,
            },
            "encoder": {
                "type": "cnn",
                "embedding_dim": 4,
                "num_filters": 10,
                "ngram_filter_sizes": [3],
            },
        }
    })
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params)
    inputs = {
        'words': (torch.rand(3, 4, 5, 6) * 20).long(),
        'characters': (torch.rand(3, 4, 5, 6, 7) * 15).long(),
    }
    assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)
def setUp(self):
    super(TestTokenCharactersEncoder, self).setUp()
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("1", "token_characters")
    self.vocab.add_token_to_namespace("2", "token_characters")
    self.vocab.add_token_to_namespace("3", "token_characters")
    self.vocab.add_token_to_namespace("4", "token_characters")
    params = Params({
        "embedding": {
            "embedding_dim": 2,
            "vocab_namespace": "token_characters"
        },
        "encoder": {
            "type": "cnn",
            "embedding_dim": 2,
            "num_filters": 4,
            "ngram_filter_sizes": [1, 2],
            "output_dim": 3
        }
    })
    self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab, params=deepcopy(params))
    self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
    constant_init = lambda tensor: torch.nn.init.constant_(tensor, 1.)
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(self.encoder)
    initializer(self.embedding)
    initializer(self.inner_encoder)
def train_model_from_file(parameter_filename: str,
                          serialization_dir: str,
                          overrides: str = "",
                          file_friendly_logging: bool = False,
                          recover: bool = False) -> Model:
    """
    A wrapper around :func:`train_model` which loads the params from a file.

    Parameters
    ----------
    parameter_filename : ``str``
        A json parameter file specifying an AllenNLP experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.  We just pass this along to
        :func:`train_model`.
    overrides : ``str``
        A JSON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  We just pass this
        along to :func:`train_model`.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the
        middle of a run.  For continuing training a model on new data, see the ``fine-tune``
        command.
    """
    # Load the experiment config from a file and pass it to ``train_model``.
    params = Params.from_file(parameter_filename, overrides)
    return train_model(params, serialization_dir, file_friendly_logging, recover)
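# Usage sketch (not part of the original module): the experiment file and serialization
# directory below are hypothetical placeholders, showing how train_model_from_file might
# be invoked to run an experiment defined in a JSON config.
trained_model = train_model_from_file(
    parameter_filename="path/to/experiment.json",   # experiment config (hypothetical)
    serialization_dir="path/to/output",             # where results and logs go (hypothetical)
    overrides="",
    file_friendly_logging=False,
    recover=False)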
def test_can_construct_from_params(self):
    params = Params({
        'tensor_1_dim': 4,
        'tensor_2_dim': 4,
        'combination': 'x,y,x*y,y-x'
    })
    linear = LinearSimilarity.from_params(params)
    assert list(linear._weight_vector.size()) == [16]
def test_can_init_linear(self):
    legacy_attention = MatrixAttention.from_params(Params({
        "type": "linear",
        "tensor_1_dim": 3,
        "tensor_2_dim": 3
    }))
    assert isinstance(legacy_attention, LinearMatrixAttention)
def test_from_params_requires_batch_first(self):
    params = Params({
        "type": "lstm",
        "batch_first": False,
    })
    with pytest.raises(ConfigurationError):
        # pylint: disable=unused-variable
        encoder = Seq2VecEncoder.from_params(params)
def test_model_load(self):
    params = Params.from_file(self.FIXTURES_ROOT / 'decomposable_attention' / 'experiment.json')
    model = Model.load(params,
                       serialization_dir=self.FIXTURES_ROOT / 'decomposable_attention' / 'serialization')

    assert isinstance(model, DecomposableAttention)
def test_can_build_from_params(self): params = Params({ "type": "legacy", 'similarity_function': { 'type': 'cosine' } }) attention = MatrixAttention.from_params(params) # pylint: disable=protected-access assert attention._similarity_function.__class__.__name__ == 'CosineSimilarity'
def test_can_build_from_params(self):
    params = Params({
        'similarity_function': {'type': 'cosine'},
        'normalize': False
    })
    attention = LegacyAttention.from_params(params)
    # pylint: disable=protected-access
    assert attention._similarity_function.__class__.__name__ == 'CosineSimilarity'
    assert attention._normalize is False