def test_regexes_with_backslashes(self):
    bad_regex = self.TEST_DIR / 'bad_regex.jsonnet'
    good_regex = self.TEST_DIR / 'good_regex.jsonnet'

    with open(bad_regex, 'w') as f:
        f.write(r'{"myRegex": "a\.b"}')

    with open(good_regex, 'w') as f:
        f.write(r'{"myRegex": "a\\.b"}')

    with pytest.raises(RuntimeError):
        Params.from_file(bad_regex)

    params = Params.from_file(good_regex)
    regex = params['myRegex']

    assert re.match(regex, "a.b")
    assert not re.match(regex, "a-b")

    # Check roundtripping
    good_regex2 = self.TEST_DIR / 'good_regex2.jsonnet'
    with open(good_regex2, 'w') as f:
        f.write(json.dumps(params.as_dict()))
    params2 = Params.from_file(good_regex2)

    assert params.as_dict() == params2.as_dict()
def prepare_environment(params: Params):
    """
    Sets random seeds for reproducible experiments. This may not work as expected
    if you use this from within a python project in which you have already imported Pytorch.
    If you use the scripts/run_model.py entry point to train models with this library,
    your experiments should be reasonably reproducible. If you are using this from your own
    project, you will want to call this function before importing Pytorch. Complete determinism
    is very difficult to achieve with libraries doing optimized linear algebra due to massively
    parallel execution, which is exacerbated by using GPUs.

    Parameters
    ----------
    params: Params object or dict, required.
        A ``Params`` object or dict holding the json parameters.
    """
    seed = params.pop_int("random_seed", 13370)
    numpy_seed = params.pop_int("numpy_seed", 1337)
    torch_seed = params.pop_int("pytorch_seed", 133)

    if seed is not None:
        random.seed(seed)
    if numpy_seed is not None:
        numpy.random.seed(numpy_seed)
    if torch_seed is not None:
        torch.manual_seed(torch_seed)
        # Seed all GPUs with the same seed if available.
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(torch_seed)

    log_pytorch_version_info()
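# Usage sketch (not part of the original source): pin all three seeds in the config so a
# run can be reproduced. The key names are the ones prepare_environment pops above; the
# seed values and the Params import path are assumptions for this example (the path
# mirrors the srl_model.common imports seen elsewhere in this repo).
from srl_model.common.params import Params  # assumed import path

seed_params = Params({"random_seed": 42, "numpy_seed": 42, "pytorch_seed": 42})
prepare_environment(seed_params)  # per the docstring, call this before importing Pytorch in your own project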
def test_from_params(self):
    # Save a vocab to check we can load it from_params.
    vocab_dir = self.TEST_DIR / 'vocab_save'
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")
    vocab.save_to_files(vocab_dir)

    params = Params({"directory_path": vocab_dir})
    vocab2 = Vocabulary.from_params(params)
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

    # Test case where we build a vocab from a dataset.
    vocab2 = Vocabulary.from_params(Params({}), self.dataset)
    assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                              1: '@@UNKNOWN@@',
                                                              2: 'a', 3: 'c', 4: 'b'}

    # Test that from_params raises when we have neither a dataset nor a vocab directory.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({}))

    # Test that from_params raises when there are any other dict keys
    # present apart from 'directory_path' and we aren't calling from_dataset.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({"directory_path": vocab_dir, "min_count": {'tokens': 2}}))
def remove_pretrained_embedding_params(params: Params):
    keys = params.keys()
    if 'pretrained_file' in keys:
        del params['pretrained_file']
    for value in params.values():
        if isinstance(value, Params):
            remove_pretrained_embedding_params(value)
def test_fine_tune_nograd_regex(self):
    original_model = load_archive(self.model_archive).model
    name_parameters_original = dict(original_model.named_parameters())
    regex_lists = [[],
                   [".*attend_feedforward.*", ".*token_embedder.*"],
                   [".*compare_feedforward.*"]]
    for regex_list in regex_lists:
        params = Params.from_file(self.config_file)
        params["trainer"]["no_grad"] = regex_list
        shutil.rmtree(self.serialization_dir, ignore_errors=True)
        tuned_model = fine_tune_model(model=original_model,
                                      params=params,
                                      serialization_dir=self.serialization_dir)
        # If the regex is matched, the parameter should have requires_grad False;
        # otherwise it should keep the same requires_grad as in the originally loaded model.
        for name, parameter in tuned_model.named_parameters():
            if any(re.search(regex, name) for regex in regex_list):
                assert not parameter.requires_grad
            else:
                assert parameter.requires_grad \
                    == name_parameters_original[name].requires_grad

    # If all parameters have requires_grad=False, then error.
    with pytest.raises(Exception) as _:
        params = Params.from_file(self.config_file)
        params["trainer"]["no_grad"] = ["*"]
        shutil.rmtree(self.serialization_dir, ignore_errors=True)
        tuned_model = fine_tune_model(model=original_model,
                                      params=params,
                                      serialization_dir=self.serialization_dir)
def test_known_configs(self):
    configs = os.listdir(self.PROJECT_ROOT / "training_config")

    # Our configs use environment variable substitution, and the _jsonnet parser
    # will fail if we don't pass it correct environment variables.
    forced_variables = [
            # constituency parser
            'PTB_TRAIN_PATH', 'PTB_DEV_PATH', 'PTB_TEST_PATH',

            # srl_elmo_5.5B
            'SRL_TRAIN_DATA_PATH', 'SRL_VALIDATION_DATA_PATH',

            # coref
            'COREF_TRAIN_DATA_PATH', 'COREF_DEV_DATA_PATH', 'COREF_TEST_DATA_PATH',

            # ner
            'NER_TRAIN_DATA_PATH', 'NER_TEST_A_PATH', 'NER_TEST_B_PATH'
    ]

    for var in forced_variables:
        os.environ[var] = os.environ.get(var) or str(self.TEST_DIR)

    for config in configs:
        try:
            Params.from_file(self.PROJECT_ROOT / "training_config" / config)
        except Exception as e:
            raise AssertionError(f"unable to load params for {config}, because {e}")

    for var in forced_variables:
        if os.environ[var] == str(self.TEST_DIR):
            del os.environ[var]
def make_vocab_from_params(params: Params, serialization_dir: str):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)

    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
def from_params(cls, params: Params) -> 'B':
    params.add_file_to_archive("filename")

    filename = params.pop("filename")
    c_params = params.pop("c")
    c = C.from_params(c_params)

    return cls(filename, c)
def from_params(cls, optimizer: torch.optim.Optimizer, params: Params):  # type: ignore
    # pylint: disable=arguments-differ
    scheduler_type = params.pop_choice("type", LearningRateScheduler.list_available())
    scheduler = LearningRateScheduler.by_name(scheduler_type)(optimizer, **params.as_dict())  # type: ignore
    if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
        return LearningRateWithMetricsWrapper(scheduler)
    else:
        return LearningRateWithoutMetricsWrapper(scheduler)
def test_as_ordered_dict(self):
    # keyD > keyC > keyE; keyDA > keyDB; next all other keys alphabetically.
    preference_orders = [["keyD", "keyC", "keyE"], ["keyDA", "keyDB"]]
    params = Params({"keyC": "valC",
                     "keyB": "valB",
                     "keyA": "valA",
                     "keyE": "valE",
                     "keyD": {"keyDB": "valDB", "keyDA": "valDA"}})
    ordered_params_dict = params.as_ordered_dict(preference_orders)
    expected_ordered_params_dict = OrderedDict({'keyD': {'keyDA': 'valDA', 'keyDB': 'valDB'},
                                                'keyC': 'valC',
                                                'keyE': 'valE',
                                                'keyA': 'valA',
                                                'keyB': 'valB'})
    assert json.dumps(ordered_params_dict) == json.dumps(expected_ordered_params_dict)
def test_no_metric_wrapper_can_support_none_for_metrics(self):
    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
    lrs = LearningRateScheduler.from_params(Optimizer.from_params(model.named_parameters(),
                                                                  Params({"type": "adam"})),
                                            Params({"type": "step", "step_size": 1}))
    lrs.step(None, None)
def test_reduce_on_plateau_error_throw_when_no_metrics_exist(self):
    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
    with self.assertRaises(ConfigurationError) as context:
        LearningRateScheduler.from_params(Optimizer.from_params(model.named_parameters(),
                                                                Params({"type": "adam"})),
                                          Params({"type": "reduce_on_plateau"})).step(None, None)
    self.assertTrue(
            'The reduce_on_plateau learning rate scheduler requires a validation metric'
            in str(context.exception))
def test_noam_learning_rate_schedule_does_not_crash(self):
    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
    lrs = LearningRateScheduler.from_params(Optimizer.from_params(model.named_parameters(),
                                                                  Params({"type": "adam"})),
                                            Params({"type": "noam",
                                                    "model_size": 10,
                                                    "warmup_steps": 2000}))
    lrs.step(None)
    lrs.step_batch(None)
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
def test_elmo_num_repr_set_flags_mismatch_throws_configuration_error(self):
    # pylint: disable=line-too-long
    params = Params.from_file(self.FIXTURES_ROOT / 'biattentive_classification_network' / 'elmo_experiment.json')

    # Elmo is specified in the model, with num_output_representations=2. Set
    # only one flag to true.
    tmp_params = deepcopy(params)
    tmp_params["model"]["use_input_elmo"] = False
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))

    tmp_params = deepcopy(params)
    tmp_params["model"]["use_input_elmo"] = True
    tmp_params["model"]["use_integrator_output_elmo"] = False
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))

    # Set num_output_representations to 1, and set both flags to True.
    tmp_params = deepcopy(params)
    tmp_params["model"]["elmo"]["num_output_representations"] = 1
    tmp_params["model"]["use_input_elmo"] = True
    tmp_params["model"]["use_integrator_output_elmo"] = True
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))
def test_mismatching_dimensions_throws_configuration_error(self):
    params = Params.from_file(self.param_file)
    # Make the encoder wrong - it should be 2 to match
    # the embedding dimension from the text_field_embedder.
    params["model"]["encoder"]["input_size"] = 10
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=params.pop("model"))
def setUp(self):
    super(TestTrainer, self).setUp()
    self.instances = SequenceTaggingDatasetReader().read(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    vocab = Vocabulary.from_instances(self.instances)
    self.vocab = vocab
    self.model_params = Params({
            "text_field_embedder": {
                    "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                    }
            },
            "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
            }
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
    self.iterator = BasicIterator(batch_size=2)
    self.iterator.index_with(vocab)
def test_regex_matches_are_initialized_correctly(self):
    class Net(torch.nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.linear_1_with_funky_name = torch.nn.Linear(5, 10)
            self.linear_2 = torch.nn.Linear(10, 5)
            self.conv = torch.nn.Conv1d(5, 5, 5)

        def forward(self, inputs):  # pylint: disable=arguments-differ
            pass

    # Make sure we handle regexes properly
    json_params = """{"initializer": [
    ["conv", {"type": "constant", "val": 5}],
    ["funky_na.*bi", {"type": "constant", "val": 7}]
    ]}
    """
    params = Params(json.loads(_jsonnet.evaluate_snippet("", json_params)))
    initializers = InitializerApplicator.from_params(params['initializer'])
    model = Net()
    initializers(model)

    for parameter in model.conv.parameters():
        assert torch.equal(parameter.data, torch.ones(parameter.size()) * 5)

    parameter = model.linear_1_with_funky_name.bias
    assert torch.equal(parameter.data, torch.ones(parameter.size()) * 7)
def _load(cls,
          config: Params,
          serialization_dir: str,
          weights_file: str = None,
          cuda_device: int = -1) -> 'Model':
    """
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.
    """
    weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, 'vocabulary')
    vocab = Vocabulary.from_files(vocab_dir)

    model_params = config.get('model')

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)
    model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(model_state)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model
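# Usage sketch (hypothetical paths, not from the original source): _load is an internal
# classmethod, and trained models are normally recovered via load_archive(...) as the
# fine-tuning tests in this section do. Calling it directly looks roughly like this:
config = Params.from_file("/tmp/my_run/config.json")  # example serialization directory
model = Model._load(config, serialization_dir="/tmp/my_run", cuda_device=-1)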
def test_mismatching_dimensions_throws_configuration_error(self):
    params = Params.from_file(self.param_file)
    # Make the phrase layer wrong - it should be 150 to match
    # the embedding + binary feature dimensions.
    params["model"]["encoder"]["input_size"] = 10
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=params.pop("model"))
def _load(cls,
          config: Params,
          serialization_dir: str,
          weights_file: str = None,
          cuda_device: int = -1) -> 'Model':
    """
    Ensembles don't have vocabularies or weights of their own, so they override _load.
    """
    model_params = config.get('model')

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=None, params=model_params)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model
def from_params(cls: Type[T], params: Params, **extras) -> T:
    """
    This is the automatic implementation of `from_params`. Any class that subclasses `FromParams`
    (or `Registrable`, which itself subclasses `FromParams`) gets this implementation for free.
    If you want your class to be instantiated from params in the "obvious" way -- pop off parameters
    and hand them to your constructor with the same names -- this provides that functionality.

    If you need more complex logic in your `from_params` method, you'll have to implement your own
    method that overrides this one.
    """
    # pylint: disable=protected-access
    from srl_model.common.registrable import Registrable  # import here to avoid circular imports

    logger.info(f"instantiating class {cls} from params {getattr(params, 'params', params)} "
                f"and extras {extras}")

    if params is None:
        return None

    registered_subclasses = Registrable._registry.get(cls)

    if registered_subclasses is not None:
        # We know ``cls`` inherits from Registrable, so we'll use a cast to make mypy happy.
        # We have to use a disable to make pylint happy.
        # pylint: disable=no-member
        as_registrable = cast(Type[Registrable], cls)
        default_to_first_choice = as_registrable.default_implementation is not None
        choice = params.pop_choice("type",
                                   choices=as_registrable.list_available(),
                                   default_to_first_choice=default_to_first_choice)
        subclass = registered_subclasses[choice]

        # We want to call subclass.from_params. It's possible that it's just the "free"
        # implementation here, in which case it accepts `**extras` and we are not able
        # to make any assumptions about what extra parameters it needs.
        #
        # It's also possible that it has a custom `from_params` method. In that case it
        # won't accept any **extra parameters and we'll need to filter them out.
        if not takes_arg(subclass.from_params, 'extras'):
            # Necessarily subclass.from_params is a custom implementation, so we need to
            # pass it only the args it's expecting.
            extras = {k: v for k, v in extras.items() if takes_arg(subclass.from_params, k)}

        return subclass.from_params(params=params, **extras)
    else:
        # This is not a base class, so convert our params and extras into a dict of kwargs.

        if cls.__init__ == object.__init__:
            # This class does not have an explicit constructor, so don't give it any kwargs.
            # Without this logic, create_kwargs will look at object.__init__ and see that
            # it takes *args and **kwargs and look for those.
            kwargs: Dict[str, Any] = {}
        else:
            # This class has a constructor, so create kwargs for it.
            kwargs = create_kwargs(cls, params, **extras)

        return cls(**kwargs)  # type: ignore
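# Illustrative sketch (not part of the original file): a concrete class whose constructor
# argument names match its config keys gets this automatic from_params for free. The
# Widget class is hypothetical, and the import paths are assumptions that mirror the
# srl_model.common layout used above.
from srl_model.common.from_params import FromParams  # assumed import path
from srl_model.common.params import Params           # assumed import path


class Widget(FromParams):
    def __init__(self, size: int, name: str = "default") -> None:
        self.size = size
        self.name = name


# "size" is popped and handed to the constructor; "name" falls back to its default.
widget = Widget.from_params(Params({"size": 3}))
assert widget.size == 3 and widget.name == "default"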
def test_optimizer_basic(self):
    optimizer_params = Params({"type": "sgd", "lr": 1})
    parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, optimizer_params)
    param_groups = optimizer.param_groups
    assert len(param_groups) == 1
    assert param_groups[0]['lr'] == 1
def test_can_optimise_model_with_dense_and_sparse_params(self):
    optimizer_params = Params({"type": "dense_sparse_adam"})
    parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, optimizer_params)
    iterator = BasicIterator(2)
    iterator.index_with(self.vocab)
    Trainer(self.model, optimizer, iterator, self.instances).train()
def test_from_params(self):
    optim = self._get_optimizer()
    sched = LearningRateScheduler.from_params(optim, Params({"type": "cosine", "t_max": 5})).lr_scheduler
    assert sched.t_max == 5
    assert sched._initialized is True

    # The learning rate should be unchanged after initializing the scheduler.
    assert optim.param_groups[0]["lr"] == 1.0

    with self.assertRaises(TypeError):
        # t_max is required.
        LearningRateScheduler.from_params(optim, Params({"type": "cosine"}))
def test_load_from_file(self):
    filename = self.FIXTURES_ROOT / 'bidaf' / 'experiment.json'
    params = Params.from_file(filename)
    assert "dataset_reader" in params
    assert "trainer" in params

    model_params = params.pop("model")
    assert model_params.pop("type") == "bidaf"
def test_env_var_substitution(self):
    substitutor = self.TEST_DIR / 'substitutor.jsonnet'
    key = 'TEST_ENV_VAR_SUBSTITUTION'

    assert os.environ.get(key) is None

    with open(substitutor, 'w') as f:
        f.write(f'{{"path": std.extVar("{key}")}}')

    # raises without environment variable set
    with pytest.raises(RuntimeError):
        Params.from_file(substitutor)

    os.environ[key] = "PERFECT"

    params = Params.from_file(substitutor)
    assert params['path'] == "PERFECT"

    del os.environ[key]
def test_as_flat_dict(self):
    params = Params({
            'a': 10,
            'b': {
                    'c': 20,
                    'd': 'stuff'
            }
    }).as_flat_dict()

    assert params == {'a': 10, 'b.c': 20, 'b.d': 'stuff'}
def test_fine_tune_does_not_expand_vocab_by_default(self):
    params = Params.from_file(self.config_file)
    # snli2 has a new token in it
    params["train_data_path"] = str(self.FIXTURES_ROOT / 'data' / 'snli2.jsonl')

    model = load_archive(self.model_archive).model

    # By default, no vocab expansion.
    fine_tune_model(model, params, self.serialization_dir)
def test_elmo_but_no_set_flags_throws_configuration_error(self):
    # pylint: disable=line-too-long
    params = Params.from_file(self.FIXTURES_ROOT / 'biattentive_classification_network' / 'elmo_experiment.json')
    # Elmo is specified in the model, but set both flags to false.
    params["model"]["use_input_elmo"] = False
    params["model"]["use_integrator_output_elmo"] = False
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=params.get("model"))