def make_vocab_from_params(params: Params, serialization_dir: str):
    """
    Build a vocabulary from the datasets described by ``params`` and save it under
    ``<serialization_dir>/vocabulary``.

    Parameters
    ----------
    params : ``Params``
        Experiment configuration.  The ``vocabulary`` and
        ``datasets_for_vocab_creation`` keys are popped here; ``params`` is also
        passed to ``prepare_environment`` and ``datasets_from_params``.
    serialization_dir : ``str``
        Directory (created if missing) that will hold the ``vocabulary`` directory.

    Raises
    ------
    ConfigurationError
        If the ``vocabulary`` directory already exists and contains entries, or if
        ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # ``os.listdir`` always returns a list (never ``None``), so check its truthiness:
    # only a directory that actually contains something must be rejected.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)

    # By default every loaded dataset contributes to the vocabulary.
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance
                 for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
def test_fine_tune_nograd_regex(self):
    """Fine-tuning must freeze exactly the parameters matched by ``trainer.no_grad``."""
    original_model = load_archive(self.model_archive).model
    baseline_parameters = dict(original_model.named_parameters())

    no_grad_cases = (
        [],
        [".*attend_feedforward.*", ".*token_embedder.*"],
        [".*compare_feedforward.*"],
    )
    for patterns in no_grad_cases:
        params = Params.from_file(self.config_file)
        params["trainer"]["no_grad"] = patterns
        shutil.rmtree(self.serialization_dir, ignore_errors=True)
        tuned_model = fine_tune_model(model=original_model,
                                      params=params,
                                      serialization_dir=self.serialization_dir)

        for name, parameter in tuned_model.named_parameters():
            if not any(re.search(pattern, name) for pattern in patterns):
                # No pattern matched: requires_grad must be the same as in the
                # originally loaded model.
                assert parameter.requires_grad \
                        == baseline_parameters[name].requires_grad
            else:
                # At least one pattern matched: the parameter must be frozen.
                assert not parameter.requires_grad

    # Freezing every parameter is expected to raise.
    # NOTE(review): "*" alone is not a valid regular expression, so the exception may
    # come from regex compilation rather than from the trainer - confirm the intent.
    with pytest.raises(Exception) as _:
        params = Params.from_file(self.config_file)
        params["trainer"]["no_grad"] = ["*"]
        shutil.rmtree(self.serialization_dir, ignore_errors=True)
        fine_tune_model(model=original_model,
                        params=params,
                        serialization_dir=self.serialization_dir)
def test_simple_tagger_constraint_type_deprecated(self):
    """The ``constraint_type`` key still configures the tagger but warns about deprecation."""
    text_field_embedder = {
            "token_embedders": {
                    "tokens": {
                            "type": "embedding",
                            "embedding_dim": 50
                    },
            }
    }
    encoder = {
            "type": "gru",
            "input_size": 50,
            "hidden_size": 10,
            "num_layers": 2,
            "dropout": 0.5,
            "bidirectional": True
    }
    params = Params({"model": {
            "type": "crf_tagger",
            "constraint_type": "IOB1",
            "text_field_embedder": text_field_embedder,
            "encoder": encoder}})

    with pytest.warns(DeprecationWarning):
        model = Model.from_params(vocab=self.vocab, params=params.pop("model"))

    # The label encoding is set on both the metric and the model itself.
    assert model._f1_metric is not None
    assert model._f1_metric._label_encoding == "IOB1"
    assert model.label_encoding == "IOB1"

    # The constraint mask must not be the all-ones (num_tags + 2) x (num_tags + 2) matrix.
    unconstrained_total = (model.num_tags + 2)**2
    assert model.crf._constraint_mask.sum().item() != unconstrained_total
def make_vocab_from_params(params: Params):
    """
    Create a vocabulary from the configured datasets and write it to the directory
    given by ``vocabulary.directory_path`` in ``params``.

    Raises
    ------
    ConfigurationError
        If ``vocabulary.directory_path`` is absent, or if
        ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    prepare_environment(params)

    vocab_dir = params.pop("vocabulary", {}).get('directory_path')
    if vocab_dir is None:
        raise ConfigurationError("To use `make-vocab` your configuration must contain a value "
                                 "at vocabulary.directory_path")
    os.makedirs(vocab_dir, exist_ok=True)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset in all_datasets:
            continue
        raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))

    # The vocabulary is built with an empty ``Params`` (not the popped ``vocabulary``
    # section) from a lazy stream of the selected datasets' instances.
    vocab = Vocabulary.from_params(
            Params({}),
            (instance
             for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation))

    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
def prepare_environment(params: Params):
    """
    Seed the Python, NumPy and PyTorch random number generators from ``params``.

    The seeds are popped from ``params`` under ``random_seed`` (default 13370),
    ``numpy_seed`` (default 1337) and ``pytorch_seed`` (default 133).  A seed that is
    ``None`` leaves the corresponding generator untouched.  When CUDA is available the
    PyTorch seed is applied to every GPU as well.

    Seeding may not behave as expected if PyTorch was already imported before this
    function runs, and complete determinism is hard to achieve with optimized,
    massively parallel linear algebra, especially on GPUs.

    Parameters
    ----------
    params: Params, required.
        Holds the json parameters; the three seed keys are removed from it.
    """
    python_seed = params.pop_int("random_seed", 13370)
    np_seed = params.pop_int("numpy_seed", 1337)
    pytorch_seed = params.pop_int("pytorch_seed", 133)

    if python_seed is not None:
        random.seed(python_seed)

    if np_seed is not None:
        numpy.random.seed(np_seed)

    if pytorch_seed is not None:
        torch.manual_seed(pytorch_seed)
        # Every GPU gets the same seed as the CPU generator.
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(pytorch_seed)

    log_pytorch_version_info()
def test_regexes_with_backslashes(self):
    """A single backslash in a jsonnet string is rejected; an escaped one loads and round-trips."""
    bad_path = self.TEST_DIR / 'bad_regex.jsonnet'
    good_path = self.TEST_DIR / 'good_regex.jsonnet'

    with open(bad_path, 'w') as bad_file:
        bad_file.write(r'{"myRegex": "a\.b"}')

    with open(good_path, 'w') as good_file:
        good_file.write(r'{"myRegex": "a\\.b"}')

    # The unescaped backslash cannot be parsed.
    with pytest.raises(RuntimeError):
        Params.from_file(bad_path)

    params = Params.from_file(good_path)
    pattern = params['myRegex']

    # The loaded pattern matches a literal dot only.
    assert re.match(pattern, "a.b")
    assert not re.match(pattern, "a-b")

    # Dumping the params as JSON and loading them again must give the same dict.
    roundtrip_path = self.TEST_DIR / 'good_regex2.jsonnet'
    serialized = json.dumps(params.as_dict())
    with open(roundtrip_path, 'w') as roundtrip_file:
        roundtrip_file.write(serialized)

    reloaded = Params.from_file(roundtrip_path)
    assert params.as_dict() == reloaded.as_dict()
def test_known_configs(self):
    """
    Every file in ``training_config`` must load through ``Params.from_file``.

    The configs use environment variable substitution, so the variables they need are
    temporarily set (to ``TEST_DIR``) when missing and removed again afterwards, even
    if loading one of the configs fails.
    """
    config_dir = self.PROJECT_ROOT / "training_config"
    configs = os.listdir(config_dir)

    # Our configs use environment variable substitution, and the _jsonnet parser
    # will fail if we don't pass it correct environment variables.
    forced_variables = [
            # constituency parser
            'PTB_TRAIN_PATH', 'PTB_DEV_PATH', 'PTB_TEST_PATH',

            # srl_elmo_5.5B
            'SRL_TRAIN_DATA_PATH', 'SRL_VALIDATION_DATA_PATH',

            # coref
            'COREF_TRAIN_DATA_PATH', 'COREF_DEV_DATA_PATH', 'COREF_TEST_DATA_PATH',

            # ner
            'NER_TRAIN_DATA_PATH', 'NER_TEST_A_PATH', 'NER_TEST_B_PATH'
    ]

    for var in forced_variables:
        os.environ[var] = os.environ.get(var) or str(self.TEST_DIR)

    try:
        for config in configs:
            try:
                Params.from_file(config_dir / config)
            except Exception as e:
                # Keep the original exception as the cause so the traceback stays useful.
                raise AssertionError(f"unable to load params for {config}, because {e}") from e
    finally:
        # Always undo the forced variables so a failing config does not leak
        # environment state into other tests.
        for var in forced_variables:
            if os.environ.get(var) == str(self.TEST_DIR):
                del os.environ[var]
def remove_pretrained_embedding_params(params: Params):
    """
    Delete the ``pretrained_file`` entry from ``params`` and, recursively, from every
    nested ``Params`` value.  ``params`` is modified in place.
    """
    if 'pretrained_file' in params.keys():
        del params['pretrained_file']

    for child in params.values():
        if not isinstance(child, Params):
            continue
        remove_pretrained_embedding_params(child)
def from_params(cls, params: Params) -> 'B':
    """
    Construct a ``B`` from ``params``: register ``filename`` for archiving, then pop
    ``filename`` and the nested ``c`` parameters (built with ``C.from_params``).
    """
    # The file must be registered for the archive before its key is popped.
    params.add_file_to_archive("filename")
    filename = params.pop("filename")

    nested_c = C.from_params(params.pop("c"))
    return cls(filename, nested_c)
def from_params(cls, optimizer: torch.optim.Optimizer, params: Params):  # type: ignore
    """
    Instantiate the scheduler named by ``params["type"]`` for ``optimizer`` and wrap it.

    The remaining entries of ``params`` are passed to the scheduler as keyword
    arguments.  A ``ReduceLROnPlateau`` scheduler is wrapped in
    ``LearningRateWithMetricsWrapper``; any other scheduler is wrapped in
    ``LearningRateWithoutMetricsWrapper``.
    """
    # pylint: disable=arguments-differ
    scheduler_name = params.pop_choice("type", LearningRateScheduler.list_available())
    scheduler_class = LearningRateScheduler.by_name(scheduler_name)
    scheduler = scheduler_class(optimizer, **params.as_dict())  # type: ignore

    is_plateau = isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)
    wrapper = LearningRateWithMetricsWrapper if is_plateau else LearningRateWithoutMetricsWrapper
    return wrapper(scheduler)
def test_as_ordered_dict(self):
    """``as_ordered_dict`` honours the preference orders, then sorts remaining keys alphabetically."""
    params = Params({"keyC": "valC", "keyB": "valB", "keyA": "valA", "keyE": "valE",
                     "keyD": {"keyDB": "valDB", "keyDA": "valDA"}})

    # keyD > keyC > keyE; keyDA > keyDB; Next all other keys alphabetically
    preference_orders = [["keyD", "keyC", "keyE"], ["keyDA", "keyDB"]]
    actual = params.as_ordered_dict(preference_orders)

    expected = OrderedDict({'keyD': {'keyDA': 'valDA', 'keyDB': 'valDB'},
                            'keyC': 'valC', 'keyE': 'valE',
                            'keyA': 'valA', 'keyB': 'valB'})

    # Comparing the JSON dumps makes the comparison sensitive to key order.
    actual_json = json.dumps(actual)
    expected_json = json.dumps(expected)
    assert actual_json == expected_json
def test_to_file(self):
    """``to_file`` works both with and without preference orders."""
    params = Params({"keyA": "valA", "keyB": "valB"})
    file_path = self.TEST_DIR / 'config.jsonnet'
    expected_json = json.dumps(OrderedDict({"keyB": "valB", "keyA": "valA"}))

    # With preference orders the keys are written in the requested order.
    params.to_file(file_path, [["keyB", "keyA"]])
    with open(file_path, "r") as handle:
        written = OrderedDict(json.load(handle))
    assert expected_json == json.dumps(written)

    # Without preference orders the call only has to succeed.
    params.to_file(file_path)
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    """
    Load the datasets, build and save the vocabulary, print dataset and vocabulary
    statistics, instantiate the model and log which of its parameters are frozen and
    which are tunable.  No training is performed here.

    Parameters
    ----------
    params : ``Params``
        Experiment configuration.  The ``vocabulary``, ``datasets_for_vocab_creation``,
        ``model`` and ``trainer`` keys are popped from it.
    serialization_dir : ``str``
        Directory (created if missing) in which the ``vocabulary`` directory is written.

    Raises
    ------
    ConfigurationError
        If the ``vocabulary`` directory already exists and contains entries, or if
        ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # ``os.listdir`` always returns a list (never ``None``), so check its truthiness:
    # only a directory that actually contains something must be rejected.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)

    # By default every loaded dataset contributes to the vocabulary.
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance
                 for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)

    # Index the selected instances so statistics can be printed for them.
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")

    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
            get_frozen_and_tunable_parameter_names(model)

    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)

    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
def test_from_params(self):
    """``RegularizerApplicator.from_params`` builds the regularizer configured for each regex."""
    config = [("conv", "l1"), ("linear", {"type": "l2", "alpha": 10})]
    params = Params({"regularizers": config})
    applicator = RegularizerApplicator.from_params(params.pop("regularizers"))

    conv_regularizer = None
    linear_regularizer = None
    # pylint: disable=protected-access
    for pattern, regularizer in applicator._regularizers:
        if pattern == "linear":
            linear_regularizer = regularizer
        elif pattern == "conv":
            conv_regularizer = regularizer

    assert isinstance(conv_regularizer, L1Regularizer)
    assert isinstance(linear_regularizer, L2Regularizer)
    assert linear_regularizer.alpha == 10
def _load(cls,
          config: Params,
          serialization_dir: str,
          weights_file: str = None,
          cuda_device: int = -1) -> 'Model':
    """
    Override of ``_load`` for ensembles, which have no vocabulary or weights of their
    own: the model is built from the ``model`` section of ``config`` with
    ``vocab=None`` and no weights are loaded.  ``serialization_dir`` and
    ``weights_file`` are not used here.
    """
    model_params = config.get('model')

    # The config describes how to train, including where pretrained embeddings come
    # from.  When loading we do not want the code to look for those files, so the
    # ``pretrained_file`` entries are stripped from the parameters first.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=None, params=model_params)

    # Put the model on the requested device (a negative id means CPU).
    if cuda_device < 0:
        model.cpu()
    else:
        model.cuda(cuda_device)

    return model
def _load(cls,
          config: Params,
          serialization_dir: str,
          weights_file: str = None,
          cuda_device: int = -1) -> 'Model':
    """
    Rebuild an already-trained model from its experiment configuration, the vocabulary
    saved in ``serialization_dir`` and a weights file (``_DEFAULT_WEIGHTS`` inside
    ``serialization_dir`` unless ``weights_file`` is given).
    """
    if not weights_file:
        weights_file = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # The vocabulary lives in the ``vocabulary`` directory of the serialization dir.
    vocab = Vocabulary.from_files(os.path.join(serialization_dir, 'vocabulary'))

    # The config describes how to train, including where pretrained embeddings come
    # from.  Those embeddings are already part of the stored weights, so the
    # ``pretrained_file`` entries are stripped to keep the code from looking for them.
    model_params = config.get('model')
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)

    state_dict = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(state_dict)

    # Put the model on the requested device (a negative id means CPU) so the
    # embeddings stay in sync with the weights.
    if cuda_device < 0:
        model.cpu()
    else:
        model.cuda(cuda_device)

    return model
def main(param_file: str, extra_beaker_commands: List[str]):
    """
    Build and push a Docker image for the current git commit, upload ``param_file`` as
    a Beaker dataset and launch a Beaker experiment that trains with it.

    Parameters
    ----------
    param_file : ``str``
        Path to the experiment configuration.  It is loaded to derive ``--env`` flags
        and uploaded so the experiment can read it at ``/config.json``.
    extra_beaker_commands : ``List[str]``
        Additional arguments inserted into the ``beaker experiment run`` command.

    Raises
    ------
    subprocess.CalledProcessError
        If one of the checked git, aws, docker or beaker commands fails.
    """
    ecr_repository = "896129387501.dkr.ecr.us-west-2.amazonaws.com"
    commit = subprocess.check_output(["git", "rev-parse", "HEAD"], universal_newlines=True).strip()
    image = f"{ecr_repository}/allennlp/allennlp:{commit}"
    overrides = ""

    # Reads params and sets environment.
    params = Params.from_file(param_file, overrides)
    flat_params = params.as_flat_dict()
    # One ``--env`` flag per flattened parameter, with dots in the key replaced.
    env = [f"--env={str(k).replace('.', '_')}={v}" for k, v in flat_params.items()]

    # If the git repository is dirty, add a random hash.
    result = subprocess.run('git diff-index --quiet HEAD --', shell=True)
    if result.returncode != 0:
        # ``random_int`` is not defined in this function; presumably a module-level
        # value - verify it exists in the enclosing file.
        dirty_hash = "%x" % random_int
        image += "-" + dirty_hash

    # Get temporary ecr login. For this command to work, you need the python awscli
    # package with a version more recent than 1.11.91.
    print("Generating ECR Login Command")
    login_command = subprocess.check_output('aws --region=us-west-2 ecr get-login --no-include-email',
                                            shell=True)

    print("Logging into ECR")
    subprocess.run(login_command, shell=True, check=True)

    print(f"Building the Docker image ({image})")
    subprocess.run(f'docker build -t {image} .', shell=True, check=True)

    print(f"Pushing the Docker image ({image})")
    subprocess.run(f'docker push {image}', shell=True, check=True)

    # Pass the path as a separate argument instead of interpolating it into a shell
    # string, so paths with spaces or shell metacharacters are handled correctly.
    config_dataset_id = subprocess.check_output(
            ['beaker', 'dataset', 'create', '--quiet', param_file],
            universal_newlines=True).strip()

    allennlp_command = [
            "python",
            "-m",
            "allennlp.run",
            "train",
            "/config.json",
            "-s",
            "/output",
            "--file-friendly-logging"
        ]

    # NOTE(review): an earlier TODO(michaels) said the env list had been removed because
    # it clutters the Beaker UI, but it is part of the command below - confirm which
    # behavior is intended.
    command = [
            '/usr/local/bin/beaker',
            'experiment',
            'run',
            '--result-path',
            '/output',
            "--source",
            f"{config_dataset_id}:/config.json"] + env + extra_beaker_commands + [image] + allennlp_command
    print(' '.join(command))
    subprocess.run(command, check=True)
def test_mismatching_contextualizer_unidirectionality_throws_configuration_error(self):
    """A contextualizer whose directionality differs from the model's is rejected."""
    params = Params.from_file(self.param_file)

    # Flip the contextualizer's ``bidirectional`` flag so that it no longer agrees
    # with ``self.bidirectional``.
    flipped = (not self.bidirectional)
    contextualizer_params = params["model"]["contextualizer"]
    contextualizer_params["bidirectional"] = flipped

    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=params.get("model"))
def test_mismatching_dimensions_throws_configuration_error(self):
    """An encoder input size that disagrees with the embedder output is rejected."""
    params = Params.from_file(self.param_file)

    # Deliberately wrong: per the fixture the encoder input size should be 2, the
    # embedding dimension produced by the text_field_embedder.
    encoder_params = params["model"]["encoder"]
    encoder_params["input_size"] = 10

    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=params.pop("model"))
def test_mismatching_dimensions_throws_configuration_error(self):
    """A wrongly sized phrase layer (encoder) must raise a ``ConfigurationError``."""
    params = Params.from_file(self.param_file)
    model_section = params["model"]

    # Deliberately wrong: per the fixture the size should be 150, matching the
    # embedding + binary feature dimensions.
    model_section["encoder"]["input_size"] = 10

    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=params.pop("model"))
def from_params(cls, params: Params) -> 'TokenCharactersIndexer':
    """
    Build the indexer from ``params``.

    Parameters
    ----------
    namespace : ``str``, optional (default=``token_characters``)
        The :class:`Vocabulary` namespace used to map the characters of each token to
        indices.
    character_tokenizer : ``Params``, optional (default=``Params({})``)
        Passed to :class:`CharacterTokenizer`, which splits tokens into characters
        and has options for byte encoding and other things.  By default unicode
        characters are used and casing is retained.

    Any other key left in ``params`` is rejected by ``params.assert_empty``.
    """
    namespace = params.pop('namespace', 'token_characters')
    tokenizer = CharacterTokenizer.from_params(params.pop('character_tokenizer', {}))

    # Nothing else may remain in the configuration.
    params.assert_empty(cls.__name__)
    return cls(namespace=namespace, character_tokenizer=tokenizer)
def test_elmo_but_no_set_flags_throws_configuration_error(self):
    """Configuring ELMo while disabling both of its usage flags must be rejected."""
    # pylint: disable=line-too-long
    config_path = self.FIXTURES_ROOT / 'biattentive_classification_network' / 'elmo_experiment.json'
    params = Params.from_file(config_path)

    # Elmo is specified in the model, but set both flags to false.
    for flag in ("use_input_elmo", "use_integrator_output_elmo"):
        params["model"][flag] = False

    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=params.get("model"))
def from_params(cls: Type[T], params: Params, **extras) -> T:
    """
    This is the automatic implementation of `from_params`. Any class that subclasses `FromParams`
    (or `Registrable`, which itself subclasses `FromParams`) gets this implementation for free.
    If you want your class to be instantiated from params in the "obvious" way -- pop off parameters
    and hand them to your constructor with the same names -- this provides that functionality.

    If you need more complex logic in your `from_params` method, you'll have to implement
    your own method that overrides this one.

    Parameters
    ----------
    params : ``Params``
        The parameters to construct from.  If ``None``, ``None`` is returned.
    **extras
        Additional keyword arguments that are not part of ``params``; they are passed
        on to the registered subclass's ``from_params`` (filtered to what it accepts)
        or to ``create_kwargs``.

    Returns
    -------
    An instance built either by the registered subclass chosen through the ``type``
    key of ``params`` (when ``cls`` has registered subclasses) or by ``cls`` itself,
    or ``None`` when ``params`` is ``None``.
    """
    # pylint: disable=protected-access
    from allennlp.common.registrable import Registrable  # import here to avoid circular imports

    logger.info(f"instantiating class {cls} from params {getattr(params, 'params', params)} "
                f"and extras {extras}")

    # Nothing to construct; note that this happens after the log call above.
    if params is None:
        return None

    registered_subclasses = Registrable._registry.get(cls)

    if registered_subclasses is not None:
        # We know ``cls`` inherits from Registrable, so we'll use a cast to make mypy happy.
        # We have to use a disable to make pylint happy.
        # pylint: disable=no-member
        as_registrable = cast(Type[Registrable], cls)
        # "type" may only be omitted when the class declares a default implementation.
        default_to_first_choice = as_registrable.default_implementation is not None
        choice = params.pop_choice("type",
                                   choices=as_registrable.list_available(),
                                   default_to_first_choice=default_to_first_choice)
        subclass = registered_subclasses[choice]

        # We want to call subclass.from_params. It's possible that it's just the "free"
        # implementation here, in which case it accepts `**extras` and we are not able
        # to make any assumptions about what extra parameters it needs.
        #
        # It's also possible that it has a custom `from_params` method. In that case it
        # won't accept any **extra parameters and we'll need to filter them out.
        if not takes_arg(subclass.from_params, 'extras'):
            # Necessarily subclass.from_params is a custom implementation, so we need to
            # pass it only the args it's expecting.
            extras = {k: v for k, v in extras.items() if takes_arg(subclass.from_params, k)}

        return subclass.from_params(params=params, **extras)
    else:
        # This is not a base class, so convert our params and extras into a dict of kwargs.

        if cls.__init__ == object.__init__:
            # This class does not have an explicit constructor, so don't give it any kwargs.
            # Without this logic, create_kwargs will look at object.__init__ and see that
            # it takes *args and **kwargs and look for those.
            kwargs: Dict[str, Any] = {}
        else:
            # This class has a constructor, so create kwargs for it.
            kwargs = create_kwargs(cls, params, **extras)

        return cls(**kwargs)  # type: ignore
def load(cls,
         config: Params,
         serialization_dir: str,
         weights_file: str = None,
         cuda_device: int = -1) -> 'Model':
    """
    Rebuild an already-trained model from its experiment configuration and the
    contents of its serialization directory.

    Parameters
    ----------
    config: Params
        The configuration that was used to train the model; its ``model`` section is
        read here.
    serialization_dir: str
        The directory holding the ``vocabulary`` directory and, unless
        ``weights_file`` is given, the default weights file (``_DEFAULT_WEIGHTS``).
    weights_file: str = None
        Path to the weights to load instead of the default file.
    cuda_device: int = -1
        Id of the GPU to place the model on; a negative value keeps it on the CPU.

    Returns
    -------
    model: Model
        The model described by the configuration, with the saved vocabulary and the
        stored weights loaded.
    """
    if not weights_file:
        weights_file = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # The vocabulary lives in the ``vocabulary`` directory of the serialization dir.
    vocab = Vocabulary.from_files(os.path.join(serialization_dir, 'vocabulary'))

    # The config describes how to train, including where pretrained embeddings come
    # from.  Those embeddings are already part of the stored weights, so the
    # ``pretrained_file`` entries are stripped to keep the code from looking for them.
    model_params = config.get('model')
    _remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab, model_params)

    state_dict = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(state_dict)

    # Put the model on the requested device so the embeddings stay in sync with the
    # weights.
    if cuda_device < 0:
        model.cpu()
    else:
        model.cuda(cuda_device)

    return model
def test_fine_tune_does_not_expand_vocab_by_default(self):
    """Fine-tuning on data containing a new token succeeds when the vocab is not extended."""
    # snli2 has a new token in it
    new_train_path = str(self.FIXTURES_ROOT / 'data' / 'snli2.jsonl')

    params = Params.from_file(self.config_file)
    params["train_data_path"] = new_train_path

    archived_model = load_archive(self.model_archive).model

    # By default, no vocab expansion.
    fine_tune_model(archived_model, params, self.serialization_dir)
def test_fine_tune_runtime_errors_with_vocab_expansion(self):
    """Extending the vocabulary during fine-tuning raises a ``RuntimeError``."""
    new_train_path = str(self.FIXTURES_ROOT / 'data' / 'snli2.jsonl')

    params = Params.from_file(self.config_file)
    params["train_data_path"] = new_train_path

    archived_model = load_archive(self.model_archive).model

    # If we do vocab expansion, we get a runtime error because of the embedding.
    with pytest.raises(RuntimeError):
        fine_tune_model(archived_model,
                        params,
                        self.serialization_dir,
                        extend_vocab=True)
def from_params(cls, params: Params) -> 'Initializer':  # type: ignore
    """
    Build an initializer either from its registered name given as a plain string
    (constructed with no arguments) or from a ``Params`` object whose ``type`` key
    selects the registered class that then consumes the remaining parameters.
    """
    # pylint: disable=arguments-differ
    if not isinstance(params, str):
        initializer_name = params.pop_choice("type", cls.list_available())
        return cls.by_name(initializer_name).from_params(params)

    # Just a string - corresponds to the name of an initializer.
    return cls.by_name(params)()
def test_load_from_file(self):
    """The bidaf experiment fixture loads and exposes the expected sections."""
    params = Params.from_file(self.FIXTURES_ROOT / 'bidaf' / 'experiment.json')

    for section in ("dataset_reader", "trainer"):
        assert section in params

    model_type = params.pop("model").pop("type")
    assert model_type == "bidaf"
def test_env_var_substitution(self):
    """
    ``std.extVar`` in a jsonnet config is resolved from the process environment:
    loading fails while the variable is unset and yields its value once it is set.
    """
    substitutor = self.TEST_DIR / 'substitutor.jsonnet'
    key = 'TEST_ENV_VAR_SUBSTITUTION'

    # The test is only meaningful if the variable starts out unset.
    assert os.environ.get(key) is None

    with open(substitutor, 'w') as f:
        f.write(f'{{"path": std.extVar("{key}")}}')

    # raises without environment variable set
    with pytest.raises(RuntimeError):
        Params.from_file(substitutor)

    os.environ[key] = "PERFECT"
    try:
        params = Params.from_file(substitutor)
        assert params['path'] == "PERFECT"
    finally:
        # Remove the variable even when the load or the assertion fails, so it cannot
        # leak into other tests.
        del os.environ[key]
def make_vocab_from_args(args: argparse.Namespace):
    """
    Load the parameter file named by ``args.param_path`` (applying ``args.overrides``)
    and hand the resulting params to ``make_vocab_from_params``.
    """
    params = Params.from_file(args.param_path, args.overrides)
    make_vocab_from_params(params)
def from_params(
    cls: Type[T],
    params: Params,
    constructor_to_call: Callable[..., T] = None,
    constructor_to_inspect: Callable[..., T] = None,
    **extras,
) -> T:
    """
    This is the automatic implementation of `from_params`. Any class that subclasses
    `FromParams` (or `Registrable`, which itself subclasses `FromParams`) gets this
    implementation for free.  If you want your class to be instantiated from params in the
    "obvious" way -- pop off parameters and hand them to your constructor with the same names --
    this provides that functionality.

    If you need more complex logic in your `from_params` method, you'll have to implement
    your own method that overrides this one.

    The `constructor_to_call` and `constructor_to_inspect` arguments deal with a bit of
    redirection that we do.  We allow you to register particular `@classmethods` on a class as
    the constructor to use for a registered name.  This lets you, e.g., have a single
    `Vocabulary` class that can be constructed in two different ways, with different names
    registered to each constructor.  In order to handle this, we need to know not just the class
    we're trying to construct (`cls`), but also what method we should inspect to find its
    arguments (`constructor_to_inspect`), and what method to call when we're done constructing
    arguments (`constructor_to_call`).  These two methods are the same when you've used a
    `@classmethod` as your constructor, but they are different when you use the default
    constructor (because you inspect `__init__`, but call `cls()`).

    A `params` value of `None` yields `None`, and a plain string is treated as
    `Params({"type": <string>})`.
    """

    from allennlp.common.registrable import Registrable  # import here to avoid circular imports

    logger.info(
        f"instantiating class {cls} from params {getattr(params, 'params', params)} "
        f"and extras {set(extras.keys())}"
    )

    # Nothing to construct; note that this happens after the log call above.
    if params is None:
        return None

    # A bare string is shorthand for "the registered type with this name".
    if isinstance(params, str):
        params = Params({"type": params})

    registered_subclasses = Registrable._registry.get(cls)

    # Dispatch to a registered subclass only when no constructor was chosen yet; a
    # caller that already resolved `constructor_to_call` goes to the `else` branch.
    if registered_subclasses is not None and not constructor_to_call:
        # We know `cls` inherits from Registrable, so we'll use a cast to make mypy happy.

        as_registrable = cast(Type[Registrable], cls)
        # "type" may only be omitted when the class declares a default implementation.
        default_to_first_choice = as_registrable.default_implementation is not None
        choice = params.pop_choice(
            "type",
            choices=as_registrable.list_available(),
            default_to_first_choice=default_to_first_choice,
        )
        subclass, constructor_name = as_registrable.resolve_class_name(choice)
        # See the docstring for an explanation of what's going on here.
        if not constructor_name:
            constructor_to_inspect = subclass.__init__
            constructor_to_call = subclass  # type: ignore
        else:
            constructor_to_inspect = getattr(subclass, constructor_name)
            constructor_to_call = constructor_to_inspect

        if hasattr(subclass, "from_params"):
            # We want to call subclass.from_params.
            extras = create_extras(subclass, extras)
            # mypy can't follow the typing redirection that we do, so we explicitly cast here.
            retyped_subclass = cast(Type[T], subclass)
            return retyped_subclass.from_params(
                params=params,
                constructor_to_call=constructor_to_call,
                constructor_to_inspect=constructor_to_inspect,
                **extras,
            )
        else:
            # In some rare cases, we get a registered subclass that does _not_ have a
            # from_params method (this happens with Activations, for instance, where we
            # register pytorch modules directly).  This is a bit of a hack to make those work,
            # instead of adding a `from_params` method for them somehow.  We just trust that
            # you've done the right thing in passing your parameters, and nothing else needs to
            # be recursively constructed.
            extras = create_extras(subclass, extras)
            constructor_args = {**params, **extras}
            return subclass(**constructor_args)  # type: ignore
    else:
        # This is not a base class, so convert our params and extras into a dict of kwargs.

        # See the docstring for an explanation of what's going on here.
        if not constructor_to_inspect:
            constructor_to_inspect = cls.__init__
        if not constructor_to_call:
            constructor_to_call = cls

        if constructor_to_inspect == object.__init__:
            # This class does not have an explicit constructor, so don't give it any kwargs.
            # Without this logic, create_kwargs will look at object.__init__ and see that
            # it takes *args and **kwargs and look for those.
            kwargs: Dict[str, Any] = {}
        else:
            # This class has a constructor, so create kwargs for it.
            kwargs = create_kwargs(constructor_to_inspect, cls, params, **extras)

        return constructor_to_call(**kwargs)  # type: ignore
class TestMakeVocabFromParams(AllenNlpTestCase):
    """
    Tests that inspect the log output of `make_vocab_from_params` to see which
    datasets were read, plus a few checks on `get_metrics` and on pickling a
    `ConfigurationError`.
    """

    @staticmethod
    def _captured_log_text(caplog):
        """Join every captured log record message into one newline-separated string."""
        return "\n".join(record.message for record in caplog.records)

    @pytest.mark.parametrize(
        "params",
        [
            Params({
                "dataset_reader": {
                    "type": "train-util-test-reader"
                },
                "train_data_path": "path-to-training-file",
                "validation_data_path": "path-to-validation-file",
                "test_data_path": "path-to-validation-file",
                "datasets_for_vocab_creation": [],
                "data_loader": {
                    "batch_size": 2
                },
            }),
            Params({
                "dataset_reader": {
                    "type": "train-util-test-reader"
                },
                "train_data_path": "path-to-training-file",
                "datasets_for_vocab_creation": [],
                "data_loader": {
                    "batch_size": 2
                },
            }),
            Params({
                "dataset_reader": {
                    "type": "train-util-test-reader"
                },
                "train_data_path": "path-to-training-file",
                "validation_data_path": "path-to-validation-file",
                "test_data_path": "path-to-validation-file",
                "vocabulary": {
                    "type": "empty"
                },
                "data_loader": {
                    "batch_size": 2
                },
            }),
        ],
    )
    def test_no_instances_read_for_vocab(self, caplog, params):
        """None of the parametrized configurations should log any dataset reading."""
        _ = make_vocab_from_params(params, str(self.TEST_DIR))
        captured = self._captured_log_text(caplog)
        for fragment in (
                "...train-util-test-reader reading from",
                "Reading training data",
                "Reading validation data",
                "Reading test data",
        ):
            assert fragment not in captured

    def test_only_train_read_for_vocab(self, caplog):
        """With only a training path configured, only the training data is read."""
        params = Params({
            "dataset_reader": {
                "type": "train-util-test-reader"
            },
            "train_data_path": "path-to-training-file",
            "data_loader": {
                "batch_size": 2
            },
        })
        _ = make_vocab_from_params(params, str(self.TEST_DIR))
        captured = self._captured_log_text(caplog)
        # (log fragment, whether it must appear in the captured output)
        expectations = [
            ("...train-util-test-reader reading from path-to-training-file", True),
            ("...train-util-test-reader reading from path-to-validation-file", False),
            ("...train-util-test-reader reading from path-to-test-file", False),
            ("Reading training data", True),
            ("Reading validation data", False),
            ("Reading test data", False),
        ]
        for fragment, should_appear in expectations:
            assert (fragment in captured) is should_appear

    def test_all_datasets_read_for_vocab(self, caplog):
        """With train, validation and test paths configured, all three are read."""
        params = Params({
            "dataset_reader": {
                "type": "train-util-test-reader"
            },
            "train_data_path": "path-to-training-file",
            "validation_data_path": "path-to-validation-file",
            "test_data_path": "path-to-test-file",
            "data_loader": {
                "batch_size": 2
            },
        })
        _ = make_vocab_from_params(params, str(self.TEST_DIR))
        captured = self._captured_log_text(caplog)
        for fragment in (
                "...train-util-test-reader reading from path-to-training-file",
                "...train-util-test-reader reading from path-to-validation-file",
                "...train-util-test-reader reading from path-to-test-file",
                "Reading training data",
                "Reading validation data",
                "Reading test data",
        ):
            assert fragment in captured

    def test_only_specified_datasets_read_for_vocab(self, caplog):
        """Only the datasets named in `datasets_for_vocab_creation` are read."""
        params = Params({
            "dataset_reader": {
                "type": "train-util-test-reader"
            },
            "train_data_path": "path-to-training-file",
            "validation_data_path": "path-to-validation-file",
            "test_data_path": "path-to-test-file",
            "datasets_for_vocab_creation": ["train", "validation"],
            "data_loader": {
                "batch_size": 2
            },
        })
        _ = make_vocab_from_params(params, str(self.TEST_DIR))
        captured = self._captured_log_text(caplog)
        # (log fragment, whether it must appear in the captured output)
        expectations = [
            ("...train-util-test-reader reading from path-to-training-file", True),
            ("...train-util-test-reader reading from path-to-validation-file", True),
            ("...train-util-test-reader reading from path-to-test-file", False),
            ("Reading training data", True),
            ("Reading validation data", True),
            ("Reading test data", False),
        ]
        for fragment, should_appear in expectations:
            assert (fragment in captured) is should_appear

    def test_using_seperate_validation_reader(self, caplog):
        """A configured `validation_dataset_reader` is announced in the log."""
        params = Params({
            "dataset_reader": {
                "type": "train-util-test-reader"
            },
            "validation_dataset_reader": {
                "type": "train-util-test-reader"
            },
            "train_data_path": "path-to-training-file",
            "validation_data_path": "path-to-validation-file",
            "data_loader": {
                "batch_size": 2
            },
        })
        _ = make_vocab_from_params(params, str(self.TEST_DIR))
        captured = self._captured_log_text(caplog)
        assert "Using a separate dataset reader to load validation and test data" in captured

    def test_invalid_datasets_for_vocab_creation(self):
        """Naming a dataset that has no configured path raises a ConfigurationError."""
        params = Params({
            "dataset_reader": {
                "type": "train-util-test-reader"
            },
            "train_data_path": "path-to-training-file",
            "validation_data_path": "path-to-validation-file",
            "datasets_for_vocab_creation": ["train", "validation", "test"],
            "data_loader": {
                "batch_size": 2
            },
        })
        expected_error = pytest.raises(
            ConfigurationError,
            match="invalid 'datasets_for_vocab_creation' test")
        with expected_error:
            make_vocab_from_params(params, str(self.TEST_DIR))

    def test_raise_error_if_directory_non_empty(self):
        """A pre-existing, non-empty `vocabulary` directory is rejected."""
        params = Params({
            "dataset_reader": {
                "type": "train-util-test-reader"
            },
            "train_data_path": "path-to-training-file",
            "validation_data_path": "path-to-validation-file",
            "data_loader": {
                "batch_size": 2
            },
        })
        # Put one file into the vocabulary directory so that it is non-empty.
        existing_vocab_dir = self.TEST_DIR / "vocabulary"
        os.makedirs(existing_vocab_dir)
        with open(existing_vocab_dir / "blah", "w") as handle:
            handle.write("BLAH!")

        expected_error = pytest.raises(
            ConfigurationError,
            match="The 'vocabulary' directory in the provided")
        with expected_error:
            make_vocab_from_params(params, str(self.TEST_DIR))

    def test_get_metrics(self):
        """`loss` is the average over batches; `batch_loss` appears only when given."""

        class FakeModel(Model):
            def forward(self, **kwargs):
                return {}

        model = FakeModel(None)
        cumulative_loss = 100.0
        latest_batch_loss = 10.0
        batch_count = 2
        expected_average = float(cumulative_loss / batch_count)

        with_batch = get_metrics(model, cumulative_loss, None, latest_batch_loss, None, batch_count)
        assert with_batch["loss"] == expected_average
        assert with_batch["batch_loss"] == latest_batch_loss

        without_batch = get_metrics(model, cumulative_loss, None, None, None, batch_count)
        assert without_batch["loss"] == expected_average
        assert "batch_loss" not in without_batch

    def test_exception_serialization(self):
        """A ConfigurationError keeps its message through a pickle round trip."""
        error = ConfigurationError("example")
        round_tripped = pickle.loads(pickle.dumps(error))
        assert vars(round_tripped) == {"message": "example"}
encoder_out = self.encoder(embeddings, mask) tag_logits = self.hidden2tag(encoder_out) output = {"tag_logits": tag_logits} if labels is not None: self.accuracy(tag_logits, labels, mask) output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask) return output def get_metrics(self, reset: bool = False) -> Dict[str, float]: return {"accuracy": self.accuracy.get_metric(reset)} # In practice you'd probably do this from the command line: # $ allennlp train tutorials/tagger/experiment.jsonnet -s /tmp/serialization_dir --include-package tutorials.tagger.config_allennlp # if __name__ == "__main__": params = Params.from_file('./character_experiment.jsonnet') serialization_dir = tempfile.mkdtemp() model = train_model(params, serialization_dir) # Make predictions predictor = SentenceTaggerPredictor(model, dataset_reader=PosDatasetReader()) tag_logits = predictor.predict("The dog ate the apple")['tag_logits'] print(tag_logits) tag_ids = np.argmax(tag_logits, axis=-1) print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids]) shutil.rmtree(serialization_dir)
def from_params(cls: Type[T], params: Params, **extras) -> T:
    """
    Default `from_params` implementation shared by every `FromParams` subclass
    (and therefore by every `Registrable`).

    For a registered base class, the `type` entry of `params` selects the
    subclass and construction is delegated to that subclass's own
    `from_params`. For any other class, constructor keyword arguments are
    built from `params` and `extras` and passed to `cls`. Override this method
    if your class needs more elaborate construction logic.
    """
    # pylint: disable=protected-access
    from allennlp.common.registrable import Registrable  # import here to avoid circular imports

    logger.info(
        f"instantiating class {cls} from params {getattr(params, 'params', params)} "
        f"and extras {set(extras.keys())}")

    if params is None:
        return None

    # A bare string is shorthand for `{"type": <string>}`.
    if isinstance(params, str):
        params = Params({"type": params})

    registered_subclasses = Registrable._registry.get(cls)

    if registered_subclasses is None:
        # This is not a base class, so convert our params and extras into a dict of kwargs.
        # A class without an explicit constructor gets no kwargs at all: otherwise
        # create_kwargs would inspect object.__init__, see *args / **kwargs, and look
        # for those.
        has_no_constructor = cls.__init__ == object.__init__
        kwargs: Dict[str, Any] = {} if has_no_constructor else create_kwargs(cls, params, **extras)
        return cls(**kwargs)  # type: ignore

    # We know ``cls`` inherits from Registrable, so a cast keeps mypy happy,
    # and the disable below keeps pylint happy.
    # pylint: disable=no-member
    as_registrable = cast(Type[Registrable], cls)
    has_default = as_registrable.default_implementation is not None
    choice = params.pop_choice(
        "type",
        choices=as_registrable.list_available(),
        default_to_first_choice=has_default)
    subclass = registered_subclasses[choice]

    # The subclass may use this "free" implementation, which accepts `**extras`,
    # or a custom `from_params` that does not. In the second case only the extras
    # that the custom method explicitly accepts may be forwarded.
    if not takes_arg(subclass.from_params, 'extras'):
        extras = {
            name: value
            for name, value in extras.items()
            if takes_arg(subclass.from_params, name)
        }

    return subclass.from_params(params=params, **extras)
) parser.add_argument( '-folder', dest='folder', help='folder location', type=str, ) parser.add_argument( '-no_tqdm', dest='no_tqdm', action='store_true', ) args = parser.parse_args() params = Params.from_file(args.params) train, val, test = VCR.splits( mode='rationale' if args.rationale else 'answer', embs_to_load=params['dataset_reader'].get('embs', 'bert_da'), only_use_relevant_dets=params['dataset_reader'].get( 'only_use_relevant_dets', True)) NUM_GPUS = torch.cuda.device_count() NUM_CPUS = multiprocessing.cpu_count() if NUM_GPUS == 0: raise ValueError("you need gpus!") def _to_gpu(td): if NUM_GPUS > 1: return td for k in td:
def from_params(cls, params: Params) -> "SpanExtractor":
    """Pop the `type` choice from `params` and delegate to that registered class's `from_params`."""
    extractor_type = params.pop_choice('type', cls.list_available())
    extractor_class = cls.by_name(extractor_type)
    return extractor_class.from_params(params)
def test_from_params_valid_vocab_extension_thoroughly(self):
    """
    Thorough check of a valid vocabulary extension.

    Extension is valid when every namespace shared by the saved vocabulary
    and the new instances has the same padding behaviour on both sides.

    Namespace padding used in this test:

        original_vocab:  tokens0 padded, tokens1 non-padded,
                         tokens2 padded, tokens3 non-padded
        instances:       tokens0 padded, tokens1 non-padded,
                         tokens4 padded, tokens5 non-padded

    Typical extension example (namespace tokens1):

        original index -> token:  0 -> apple, 1 -> bat, 2 -> cat
        tokens seen in instances: cat, an, apple, banana, atom, bat
        extended index -> token:  0 -> apple, 1 -> bat, 2 -> cat,
                                  3 -> an, 4 -> atom, 5 -> banana
    """
    vocab_dir = self.TEST_DIR / "vocab_save"

    original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
    tokens_by_namespace = {
        "tokens0": ["apple", "bat", "cat"],
        "tokens1": ["apple", "bat", "cat"],
        "tokens2": ["a", "b", "c"],
        "tokens3": ["p", "q"],
    }
    for namespace, tokens in tokens_by_namespace.items():
        for token in tokens:
            original_vocab.add_token_to_namespace(token, namespace=namespace)
    original_vocab.save_to_files(vocab_dir)

    shared_words = ["cat", "an", "apple", "banana", "atom", "bat"]
    text_field0 = TextField(
        [Token(word) for word in shared_words],
        {"tokens0": SingleIdTokenIndexer("tokens0")},
    )
    text_field1 = TextField(
        [Token(word) for word in shared_words],
        {"tokens1": SingleIdTokenIndexer("tokens1")},
    )
    text_field4 = TextField(
        [Token(word) for word in ["l", "m", "n", "o"]],
        {"tokens4": SingleIdTokenIndexer("tokens4")},
    )
    text_field5 = TextField(
        [Token(word) for word in ["x", "y", "z"]],
        {"tokens5": SingleIdTokenIndexer("tokens5")},
    )
    fields = {
        "text0": text_field0,
        "text1": text_field1,
        "text4": text_field4,
        "text5": text_field5,
    }
    instances = Batch([Instance(fields)])

    params = Params({
        "type": "extend",
        "directory": vocab_dir,
        "non_padded_namespaces": ["tokens1", "tokens5"],
    })
    extended_vocab = Vocabulary.from_params(params, instances=instances)

    # tokens0 and tokens1 are shared; tokens2 and tokens3 exist only in the saved
    # vocabulary; tokens4 and tokens5 exist only in the instances.
    extended_namespaces = set(extended_vocab._token_to_index)
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # The set of non-padded namespaces must stay consistent after extension.
    assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

    # tokens1: 3 original tokens + 6 instance tokens, 3 of which overlap -> 6.
    assert extended_vocab.get_vocab_size("tokens1") == 6
    # tokens0: the same 6 tokens plus 2 extra entries because it is padded.
    assert extended_vocab.get_vocab_size("tokens0") == 8

    # tokens2 and tokens3 were only in original_vocab, so their sizes are unchanged.
    for untouched in ("tokens2", "tokens3"):
        assert extended_vocab.get_vocab_size(untouched) == original_vocab.get_vocab_size(untouched)

    # tokens4 and tokens5 were only in the instances.
    assert extended_vocab.get_vocab_size("tokens4") == 6  # l,m,n,o + oov + padding
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x,y,z

    # Every token -> index mapping of original_vocab must survive the extension.
    for namespace, token2index in original_vocab._token_to_index.items():
        for token in token2index:
            expected_index = original_vocab.get_token_index(token, namespace)
            assert expected_index == extended_vocab.get_token_index(token, namespace)

    # And the same holds for every index -> token mapping.
    for namespace, index2token in original_vocab._index_to_token.items():
        for index in index2token:
            expected_token = original_vocab.get_token_from_index(index, namespace)
            assert expected_token == extended_vocab.get_token_from_index(index, namespace)
def main(param_file: str, args: argparse.Namespace):
    """
    Write a Beaker experiment spec for an AllenNLP training run and launch it.

    The Docker image is tagged with the current git commit (plus a random
    suffix when the working tree is dirty). Unless `args.blueprint` is given,
    the image is built and registered as a Beaker blueprint. The parameter
    file is uploaded as a Beaker dataset and mounted at `/config.json`.

    Parameters
    ----------
    param_file : ``str``
        Path to the experiment configuration; it is read with
        ``Params.from_file`` and its flattened entries become environment
        variables (dots in keys replaced by underscores).
    args : ``argparse.Namespace``
        Parsed command-line options. This function reads ``blueprint``,
        ``source`` (``datasetId:containerPath`` strings), ``env``
        (``KEY=VALUE`` strings), ``cpu``, ``memory``, ``gpu_count``, ``desc``,
        ``spec_output_path``, ``name`` and ``dry_run``.
    """
    commit = subprocess.check_output(["git", "rev-parse", "HEAD"],
                                     universal_newlines=True).strip()
    image = f"allennlp/sparc_rc:{commit}"
    overrides = ""

    # Reads params and sets environment.
    params = Params.from_file(param_file, overrides)
    env = {}
    for key, value in params.as_flat_dict().items():
        env[str(key).replace('.', '_')] = str(value)

    # If the git repository is dirty, add a random hash.
    # Argument lists (no shell) are used for every subprocess call so that paths
    # and names containing spaces or shell metacharacters are passed verbatim.
    result = subprocess.run(["git", "diff-index", "--quiet", "HEAD", "--"])
    if result.returncode != 0:
        # NOTE(review): `random_int` is not defined in this function; presumably
        # it is a module-level value -- confirm.
        dirty_hash = "%x" % random_int
        image += "-" + dirty_hash

    if args.blueprint:
        blueprint = args.blueprint
        print(f"Using the specified blueprint: {blueprint}")
    else:
        print(f"Building the Docker image ({image})...")
        subprocess.run(["docker", "build", "-t", image, "."], check=True)

        print("Create a Beaker blueprint...")
        blueprint = subprocess.check_output(
            ["beaker", "blueprint", "create", "--quiet", image],
            universal_newlines=True).strip()
        print(f" Blueprint created: {blueprint}")

    config_dataset_id = subprocess.check_output(
        ["beaker", "dataset", "create", "--quiet", param_file],
        universal_newlines=True).strip()

    allennlp_command = [
        "python", "-m", "allennlp.run", "train", "/config.json", "-s",
        "/output", "--file-friendly-logging", "--include-package",
        "reading_comprehension"
    ]

    dataset_mounts = []
    for source in args.source + [f"{config_dataset_id}:/config.json"]:
        # Split only on the first ':' so a container path may itself contain one.
        datasetId, containerPath = source.split(":", 1)
        dataset_mounts.append({
            "datasetId": datasetId,
            "containerPath": containerPath
        })

    for var in args.env:
        # Split only on the first '=' so values such as "A=b=c" are accepted.
        key, value = var.split("=", 1)
        env[key] = value

    requirements = {}
    if args.cpu:
        requirements["cpu"] = float(args.cpu)
    if args.memory:
        requirements["memory"] = args.memory
    if args.gpu_count:
        requirements["gpuCount"] = int(args.gpu_count)

    config_spec = {
        "description": args.desc,
        "blueprint": blueprint,
        "resultPath": "/output",
        "args": allennlp_command,
        "datasetMounts": dataset_mounts,
        "requirements": requirements,
        "env": env
    }
    config_task = {"spec": config_spec, "name": "training"}
    config = {"tasks": [config_task]}

    if args.spec_output_path:
        output_path = args.spec_output_path
    else:
        # Create the temporary spec file through a context manager so its file
        # descriptor is closed; the file itself is kept (delete=False).
        with tempfile.NamedTemporaryFile("w",
                                         suffix=".yaml",
                                         prefix="beaker-config-",
                                         delete=False) as placeholder:
            output_path = placeholder.name

    with open(output_path, "w") as output:
        output.write(json.dumps(config, indent=4))
    print(f"Beaker spec written to {output_path}.")

    experiment_command = [
        "beaker", "experiment", "create", "--file", output_path
    ]
    if args.name:
        experiment_command.append("--name")
        experiment_command.append(args.name.replace(" ", "-"))

    if args.dry_run:
        print(
            "This is a dry run (--dry-run). Launch your job with the following command:"
        )
        print(" " + " ".join(experiment_command))
    else:
        print("Running the experiment:")
        print(" " + " ".join(experiment_command))
        subprocess.run(experiment_command)
def load_archive(
    archive_file: str,
    cuda_device: int = -1,
    opt_level: str = None,
    overrides: str = "",
    weights_file: str = None,
) -> Archive:
    """
    Build an `Archive` (model plus config) from an archived `tar.gz` file or
    from an already-unpacked serialization directory.

    # Parameters

    archive_file : `str`
        The archive file (or directory) to load the model from. It is first
        resolved through `cached_path`.
    cuda_device : `int`, optional (default = -1)
        Passed through to `Model.load`. If `cuda_device` is >= 0, the model
        will be loaded onto the corresponding GPU. Otherwise it will be loaded
        onto the CPU.
    opt_level : `str`, optional, (default = `None`)
        Passed through to `Model.load`. Each `opt_level` establishes a set of
        properties that govern Amp's implementation of pure or mixed precision
        training. Must be a choice of `"O0"`, `"O1"`, `"O2"`, or `"O3"`.
        See the Apex [documentation](https://nvidia.github.io/apex/amp.html#opt-levels-and-properties)
        for more details. If `None`, defaults to the `opt_level` found in the
        model params. If `cuda_device==-1`, Amp is not used and this argument
        is ignored.
    overrides : `str`, optional (default = "")
        JSON overrides to apply to the unarchived `Params` object.
    weights_file : `str`, optional (default = None)
        The weights file to use. If unspecified, the weights found in the
        serialization directory are used.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file != archive_file:
        logger.info(
            f"loading archive file {archive_file} from cache at {resolved_archive_file}"
        )
    else:
        logger.info(f"loading archive file {archive_file}")

    # A directory is used as-is; anything else is treated as a gzipped tarball.
    serialization_dir = resolved_archive_file
    if not os.path.isdir(resolved_archive_file):
        tempdir = tempfile.mkdtemp()
        logger.info(
            f"extracting archive file {resolved_archive_file} to temp dir {tempdir}"
        )
        with tarfile.open(resolved_archive_file, "r:gz") as archive:
            archive.extractall(tempdir)
        # Cleanup is deferred to interpreter exit because the unarchived contents
        # may still be needed after this function returns.
        atexit.register(_cleanup_archive_dir, tempdir)
        serialization_dir = tempdir

    # Load config
    config_path = os.path.join(serialization_dir, CONFIG_NAME)
    config = Params.from_file(config_path, overrides)

    weights_path = weights_file
    if not weights_path:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)
        # Fallback for serialization directories.
        if not os.path.exists(weights_path):
            weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Model.load consumes the params it is given, so hand it a duplicate and
    # keep the original config intact for the returned Archive.
    model = Model.load(
        config.duplicate(),
        weights_file=weights_path,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        opt_level=opt_level,
    )

    return Archive(model=model, config=config)
def from_params(cls, params: Params):
    """Construct `cls`, passing every entry of `params.as_dict()` as a keyword argument."""
    constructor_kwargs = params.as_dict()
    return cls(**constructor_kwargs)
def load_archive(
    archive_file: Union[str, PathLike],
    cuda_device: int = -1,
    overrides: Union[str, Dict[str, Any]] = "",
    weights_file: str = None,
) -> Archive:
    """
    Build an `Archive` (model, config, dataset readers and optional meta) from
    an archived `tar.gz` file or from an unpacked serialization directory.

    When the archive has to be extracted, the temporary directory is removed
    again before this function returns, whether loading succeeded or not.

    # Parameters

    archive_file : `Union[str, PathLike]`
        The archive file to load the model from. It is first resolved through
        `cached_path`.
    cuda_device : `int`, optional (default = `-1`)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides : `Union[str, Dict[str, Any]]`, optional (default = `""`)
        JSON overrides to apply to the unarchived `Params` object.
    weights_file : `str`, optional (default = `None`)
        The weights file to use. If unspecified, the path returned by
        `get_weights_path` for the serialization directory is used.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file != archive_file:
        logger.info(
            f"loading archive file {archive_file} from cache at {resolved_archive_file}"
        )
    else:
        logger.info(f"loading archive file {archive_file}")

    meta: Optional[Meta] = None
    # Stays None when loading from a directory, so nothing is deleted in that case.
    tempdir = None
    try:
        if not os.path.isdir(resolved_archive_file):
            # cleanup=False: removal is handled by the `finally` clause below.
            with extracted_archive(resolved_archive_file, cleanup=False) as tempdir:
                serialization_dir = tempdir
        else:
            serialization_dir = resolved_archive_file

        weights_path = weights_file if weights_file else get_weights_path(serialization_dir)

        # Load config
        config_path = os.path.join(serialization_dir, CONFIG_NAME)
        config = Params.from_file(config_path, overrides)

        # Each loader consumes the params it receives, so both get their own duplicate.
        dataset_reader, validation_dataset_reader = _load_dataset_readers(
            config.duplicate(), serialization_dir)
        model = _load_model(config.duplicate(), weights_path, serialization_dir, cuda_device)

        # Meta is optional: it is read only when the file exists.
        meta_path = os.path.join(serialization_dir, META_NAME)
        if os.path.exists(meta_path):
            meta = Meta.from_path(meta_path)
    finally:
        if tempdir is not None:
            logger.info(
                f"removing temporary unarchived model dir at {tempdir}")
            shutil.rmtree(tempdir, ignore_errors=True)

    # Check version compatibility.
    if meta is not None:
        _check_version_compatibility(archive_file, meta)

    return Archive(
        model=model,
        config=config,
        dataset_reader=dataset_reader,
        validation_dataset_reader=validation_dataset_reader,
        meta=meta,
    )
mask = get_text_field_mask(sentence) embeddings = self.word_embeddings(sentence) encoder_out = self.encoder(embeddings, mask) tag_logits = self.hidden2tag(encoder_out) output = {"tag_logits": tag_logits} if labels is not None: self.accuracy(tag_logits, labels, mask) output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask) return output def get_metrics(self, reset: bool = False) -> Dict[str, float]: return {"accuracy": self.accuracy.get_metric(reset)} # In practice you'd probably do this from the command line: # $ allennlp train tutorials/tagger/experiment.jsonnet -s /tmp/serialization_dir --include-package tutorials.tagger.config_allennlp # if __name__ == "__main__": params = Params.from_file('./config.jsonnet') serialization_dir = tempfile.mkdtemp() model = train_model(params, serialization_dir) # Make predictions predictor = SentenceTaggerPredictor(model, dataset_reader=PosDatasetReader()) tag_logits = predictor.predict("骑着 狗 出去 逛 街")['tag_logits'] print(tag_logits) tag_ids = np.argmax(tag_logits, axis=-1) print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids]) shutil.rmtree(serialization_dir)
# Script entry point: read the UD English-EWT train/dev files, build a vocabulary
# from both, and assemble a POS tagger with its optimizer and batch iterator.
if __name__ == "__main__":
    reader = UDDatasetReader()
    train_dataset = reader.read('data/UD_English-EWT/en_ewt-ud-train.conllu')
    validation_dataset = reader.read(
        'data/UD_English-EWT/en_ewt-ud-dev.conllu')
    # The vocabulary is built from the training and validation instances together.
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 200
    # Encoder configuration: a registered 'lstm' encoder whose input size equals
    # the embedding dimension used for the token embedding below.
    model_params = Params({
        'type': 'lstm',
        'input_size': EMBEDDING_DIM,
        'hidden_size': HIDDEN_DIM
    })
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embedding = BasicTextFieldEmbedder({'tokens': token_embedding})
    lstm = Seq2SeqEncoder.from_params(model_params)
    model = POSTagger(word_embedding, lstm, vocab)
    optimizer = optim.Adam(model.parameters())
    # Batches of 64, sorted by the 'num_tokens' key of the 'sentence' field.
    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[('sentence', 'num_tokens')])
    iterator.index_with(vocab)
def create_serialization_dir(
        params: Params,
        serialization_dir: str,
        recover: bool,
        force: bool) -> None:
    """
    This function creates the serialization directory if it doesn't exist.  If it already exists
    and is non-empty, then it verifies that we're recovering from a training with an identical
    configuration.

    Parameters
    ----------
    params: ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: ``str``
        The directory in which to save results and logs.
    recover: ``bool``
        If ``True``, we will try to recover from an existing serialization directory, and crash if
        the directory doesn't exist, or doesn't match the configuration we're given.
    force: ``bool``
        If ``True``, we will overwrite the serialization directory if it already exists.

    Raises
    ------
    ConfigurationError
        If both ``recover`` and ``force`` are set; if the directory is non-empty and
        ``recover`` is not set; if the directory has no config file; if the stored
        configuration differs from ``params``; or if ``recover`` is set but there is
        nothing to recover from.
    """
    if recover and force:
        raise ConfigurationError("Illegal arguments: both force and recover are true.")

    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        if not recover:
            raise ConfigurationError(f"Serialization directory ({serialization_dir}) already exists and is "
                                     f"not empty. Specify --recover to recover training from existing output.")

        logger.info(f"Recovering from prior training at {serialization_dir}.")

        recovered_config_file = os.path.join(serialization_dir, CONFIG_NAME)
        if not os.path.exists(recovered_config_file):
            raise ConfigurationError("The serialization directory already exists but doesn't "
                                     "contain a config.json. You probably gave the wrong directory.")
        loaded_params = Params.from_file(recovered_config_file)

        # Check whether any of the training configuration differs from the configuration we are
        # resuming.  Every difference is logged before failing, so the user sees all of them.
        fail = False
        flat_params = params.as_flat_dict()
        flat_loaded = loaded_params.as_flat_dict()

        # Exclude some keys from being checked as matching config
        no_check_keys = ['trainer.cuda_device', 'train_data_path',
                         'validation_data_path', 'test_data_path']
        for key in no_check_keys:
            flat_params.pop(key, None)
            flat_loaded.pop(key, None)

        for key in flat_params.keys() - flat_loaded.keys():
            logger.error(f"Key '{key}' found in training configuration but not in the serialization "
                         f"directory we're recovering from.")
            fail = True
        for key in flat_loaded.keys() - flat_params.keys():
            logger.error(f"Key '{key}' found in the serialization directory we're recovering from "
                         f"but not in the training config.")
            fail = True
        for key in flat_params.keys():
            # Keys missing from the recovered config were already reported above.
            # Comparing only shared keys avoids a KeyError on `flat_loaded[key]`
            # while building the message below.
            if key not in flat_loaded:
                continue
            if flat_params[key] != flat_loaded[key]:
                logger.error(f"Value for '{key}' in training configuration does not match that the value in "
                             f"the serialization directory we're recovering from: "
                             f"{flat_params[key]} != {flat_loaded[key]}")
                fail = True
        if fail:
            raise ConfigurationError("Training configuration does not match the configuration we're "
                                     "recovering from.")
    else:
        # Reached when the directory is missing or exists but is empty.
        if recover:
            raise ConfigurationError(f"--recover specified but serialization_dir ({serialization_dir}) "
                                     "does not exist.  There is nothing to recover from.")
        os.makedirs(serialization_dir, exist_ok=True)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'Model':
    """Pop the `type` choice from `params` and delegate to that registered class's `from_params`."""
    model_type = params.pop_choice("type", cls.list_available())
    model_class = cls.by_name(model_type)
    return model_class.from_params(vocab, params)
def test_replace_none(self):
    """The string "None" reads back as `None` at the top level, in a list and in a nested dict."""
    raw_config = {"a": "None", "b": [1.0, "None", 2], "c": {"d": "None"}}
    params = Params(raw_config)

    top_level_value = params["a"]
    assert top_level_value is None

    list_entry = params["b"][1]
    assert list_entry is None

    nested_value = params["c"]["d"]
    assert nested_value is None
def test_locally_normalised_span_extractor_can_build_from_params(self):
    """The "self_attentive" type builds a `SelfAttentiveSpanExtractor`."""
    config = {"type": "self_attentive", "input_dim": 5}
    extractor = SpanExtractor.from_params(Params(config))
    assert isinstance(extractor, SelfAttentiveSpanExtractor)
def from_params(cls, params: Params) -> 'A':
    """Pop the `b` entry from `params`, build a `B` from it, and wrap it in `cls`."""
    return cls(B.from_params(params.pop("b")))
def test_valid_vocab_extension(self):
    """
    Extending a saved vocabulary with new instances keeps the existing
    indices and yields the expected sizes, for padded and non-padded
    namespaces alike.
    """
    vocab_dir = self.TEST_DIR / "vocab_save"

    # Test: padded/non-padded common namespaces are extending appropriately
    for non_padded_namespaces in ([], ["tokens"]):
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_tokens_to_namespace(["d", "a", "b"], namespace="tokens")
        new_tokens = [Token(text) for text in ["a", "d", "c", "e"]]
        text_field = TextField(new_tokens, {"tokens": SingleIdTokenIndexer("tokens")})

        # Start from a clean directory on every iteration before saving.
        vocab_dir = self.TEST_DIR / "vocab_save"
        shutil.rmtree(vocab_dir, ignore_errors=True)
        original_vocab.save_to_files(vocab_dir)

        instances = Batch([Instance({"text": text_field})])
        params = Params({
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": non_padded_namespaces,
        })
        extended_vocab = Vocabulary.from_params(params, instances=instances)

        # A padded namespace holds two extra entries ahead of the real tokens.
        if extended_vocab.is_padded("tokens"):
            extra_count = 2
        else:
            extra_count = 0

        # The original tokens keep their positions.
        for position, token in enumerate(["d", "a", "b"]):
            assert extended_vocab.get_token_index(token, "tokens") == position + extra_count
        # The new tokens should be present.
        for token in ("c", "e"):
            assert extended_vocab.get_token_index(token, "tokens")

        assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

    # Test: padded/non-padded non-common namespaces are extending appropriately
    for non_padded_namespaces in ([], ["tokens1"], ["tokens1", "tokens2"]):
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("a", namespace="tokens1")
        text_field = TextField(
            [Token(text) for text in ["b"]],
            {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch([Instance({"text": text_field})])

        vocab_dir = self.TEST_DIR / "vocab_save"
        shutil.rmtree(vocab_dir, ignore_errors=True)
        original_vocab.save_to_files(vocab_dir)

        params = Params({
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": non_padded_namespaces,
        })
        extended_vocab = Vocabulary.from_params(params, instances=instances)

        # Should have two namespaces
        assert len(extended_vocab._token_to_index) == 2

        # Each namespace holds one real token, plus two extras when padded.
        for namespace in ("tokens1", "tokens2"):
            if extended_vocab.is_padded(namespace):
                extra_count = 2
            else:
                extra_count = 0
            assert extended_vocab.get_vocab_size(namespace) == 1 + extra_count
def test_add_file_with_list_history_to_archive(self):
    """
    `add_file_to_archive` records the full key history, including list
    indices, for files referenced from nested params.
    """
    # Creates actual files since add_file_to_archive will throw an exception
    # if the file does not exist.  The directory is managed by a context
    # manager so it is removed again even when an assertion fails.
    with tempfile.TemporaryDirectory() as tempdir:
        my_file = os.path.join(tempdir, "my_file.txt")
        my_other_file = os.path.join(tempdir, "my_other_file.txt")
        for path in (my_file, my_other_file):
            with open(path, 'w'):
                pass

        # Some nested classes just to exercise the ``from_params``
        # and ``add_file_to_archive`` methods.
        class C:
            def __init__(self, c_file: str) -> None:
                self.c_file = c_file

            @classmethod
            def from_params(cls, params: Params) -> 'C':
                params.add_file_to_archive("c_file")
                c_file = params.pop("c_file")
                return cls(c_file)

        class B:
            def __init__(self, filename: str, c) -> None:
                self.filename = filename
                self.c_dict = {"here": c}

            @classmethod
            def from_params(cls, params: Params) -> 'B':
                params.add_file_to_archive("filename")
                filename = params.pop("filename")
                c_params = params.pop("c")
                c = C.from_params(c_params)
                return cls(filename, c)

        class A:
            def __init__(self, bs) -> None:
                self.bs = bs

            @classmethod
            def from_params(cls, params: Params) -> 'A':
                bs = params.pop("bs")
                return cls(bs=[B.from_params(b_params) for b_params in bs])

        params = Params({
            "a": {
                "bs": [
                    {
                        "filename": my_file,
                        "c": {
                            "c_file": my_other_file
                        },
                    },
                ],
            }
        })

        # Construct ``A`` from params but then just throw it away.
        A.from_params(params.pop("a"))

        assert params.files_to_archive == {
            "a.bs.0.filename": my_file,
            "a.bs.0.c.c_file": my_other_file
        }
def from_params(cls, params: Params, vocab: Optional[Vocabulary] = None):
    """
    Instantiate the registered subclass named by the ``"type"`` key of ``params``.

    # Parameters

    params : `Params`
        Must contain a ``"type"`` entry, which is popped and must be one of
        ``cls.list_available()``. Every remaining entry is passed to the
        chosen class as a keyword argument.
    vocab : `Optional[Vocabulary]`
        When given, it is stored in ``params`` under ``"vocabulary"`` and so
        forwarded to the constructor as the ``vocabulary`` keyword argument.

    # Returns

    Whatever calling the class returned by ``cls.by_name`` produces.
    """
    metric_type = params.pop_choice("type", cls.list_available())
    # Compare against None explicitly: a vocabulary object that happens to be
    # falsy must still be forwarded to the constructor.
    if vocab is not None:
        params["vocabulary"] = vocab
    return cls.by_name(metric_type)(**params.as_dict())  # type: ignore
def test_bad_unicode_environment_variables(self):
    """
    Check that ``Params.from_file`` does not fail when the environment
    contains a variable whose value is a lone surrogate character.
    """
    filename = self.FIXTURES_ROOT / 'bidaf' / 'experiment.json'
    os.environ['BAD_ENVIRONMENT_VARIABLE'] = "\udce2"
    try:
        Params.from_file(filename)
    finally:
        # Always remove the variable, even if loading fails, so it cannot
        # leak into tests that run afterwards in the same process.
        del os.environ['BAD_ENVIRONMENT_VARIABLE']
def create_kwargs(cls: Type[T], params: Params, **extras) -> Dict[str, Any]:
    """
    Given some class, a `Params` object, and potentially other keyword arguments,
    create a dict of keyword args suitable for passing to the class's constructor.

    The function does this by finding the class's constructor, matching the constructor
    arguments to entries in the `params` object, and instantiating values for the parameters
    using the type annotation and possibly a from_params method.

    Any values that are provided in the `extras` will just be used as is.
    For instance, you might provide an existing `Vocabulary` this way.

    # Parameters

    cls : `Type[T]`
        The class whose ``__init__`` signature drives the construction.
    params : `Params`
        Configuration to consume. Entries are popped as they are matched, and
        ``params.assert_empty`` is called once every constructor argument has
        been handled.
    extras
        Already-constructed objects, keyed by constructor argument name. They
        take precedence over anything in ``params``.

    # Returns

    A dict mapping each constructor argument name (except ``self``) to its value.

    # Raises

    ConfigurationError
        If a required ``from_params``-constructible argument is missing from
        ``params``, or if a module extracted from a ``_pretrained`` archive is
        not an instance of the annotated type.
    """
    # Get the signature of the constructor.
    from allennlp.models.archival import load_archive  # import here to avoid circular imports

    signature = inspect.signature(cls.__init__)
    kwargs: Dict[str, Any] = {}

    # Iterate over all the constructor parameters and their annotations.
    for name, param in signature.parameters.items():
        # Skip "self". You're not *required* to call the first parameter "self",
        # so in theory this logic is fragile, but if you don't call the self parameter
        # "self" you kind of deserve what happens.
        if name == "self":
            continue

        # If the annotation is a compound type like typing.Dict[str, int],
        # it will have an __origin__ field indicating `typing.Dict`
        # and an __args__ field indicating `(str, int)`. We capture both.
        annotation = remove_optional(param.annotation)
        origin = getattr(annotation, '__origin__', None)
        args = getattr(annotation, '__args__', [])

        # The parameter is optional if its default value is not the "no default" sentinel.
        default = param.default
        optional = default != _NO_DEFAULT

        # NOTE: the order of the branches below matters; the first matching
        # case wins, and each case pops the entry it consumes from `params`.

        # Some constructors expect extra non-parameter items, e.g. vocab: Vocabulary.
        # We check the provided `extras` for these and just use them if they exist.
        if name in extras:
            kwargs[name] = extras[name]

        # Next case is when argument should be loaded from pretrained archive.
        # The entry looks like {"_pretrained": {"archive_file": ..., "module_path": ...,
        # "freeze": ...}}, with "freeze" defaulting to True.
        elif name in params and isinstance(
                params.get(name), Params) and "_pretrained" in params.get(name):
            load_module_params = params.pop(name).pop("_pretrained")
            archive_file = load_module_params.pop("archive_file")
            module_path = load_module_params.pop("module_path")
            freeze = load_module_params.pop("freeze", True)
            archive = load_archive(archive_file)
            kwargs[name] = archive.extract_module(module_path, freeze)  # pylint: disable=no-member
            # The extracted module must match the constructor's annotation.
            if not isinstance(kwargs[name], annotation):
                raise ConfigurationError(
                    f"The module from model at {archive_file} at path {module_path} "
                    f"was expected of type {annotation} but is of type {type(kwargs[name])}"
                )

        # The next case is when the parameter type is itself constructible from_params.
        elif hasattr(annotation, 'from_params'):
            if name in params:
                # Our params have an entry for this, so we use that.
                subparams = params.pop(name)

                if takes_arg(annotation.from_params, 'extras'):
                    # If annotation.params accepts **extras, we need to pass them all along.
                    # For example, `BasicTextFieldEmbedder.from_params` requires a Vocabulary
                    # object, but `TextFieldEmbedder.from_params` does not.
                    subextras = extras
                else:
                    # Otherwise, only supply the ones that are actual args; any additional ones
                    # will cause a TypeError.
                    subextras = {
                        k: v
                        for k, v in extras.items()
                        if takes_arg(annotation.from_params, k)
                    }

                # In some cases we allow a string instead of a param dict, so
                # we need to handle that case separately.
                if isinstance(subparams, str):
                    kwargs[name] = annotation.by_name(subparams)()
                else:
                    kwargs[name] = annotation.from_params(params=subparams,
                                                          **subextras)
            elif not optional:
                # Not optional and not supplied, that's an error!
                raise ConfigurationError(
                    f"expected key {name} for {cls.__name__}")
            else:
                kwargs[name] = default

        # If the parameter type is a Python primitive, just pop it off
        # using the correct casting pop_xyz operation.
        elif annotation == str:
            kwargs[name] = (params.pop(name, default)
                            if optional else params.pop(name))
        elif annotation == int:
            kwargs[name] = (params.pop_int(name, default)
                            if optional else params.pop_int(name))
        elif annotation == bool:
            kwargs[name] = (params.pop_bool(name, default)
                            if optional else params.pop_bool(name))
        elif annotation == float:
            kwargs[name] = (params.pop_float(name, default)
                            if optional else params.pop_float(name))

        # This is special logic for handling types like Dict[str, TokenIndexer],
        # List[TokenIndexer], Tuple[TokenIndexer, Tokenizer], and Set[TokenIndexer],
        # which it creates by instantiating each value from_params and returning the resulting structure.
        # In each of these cases a missing entry is treated as an empty `Params`,
        # which yields an empty container, and all `extras` are forwarded unfiltered.
        elif origin in (Dict, dict) and len(args) == 2 and hasattr(
                args[-1], 'from_params'):
            value_cls = annotation.__args__[-1]

            value_dict = {}

            for key, value_params in params.pop(name, Params({})).items():
                value_dict[key] = value_cls.from_params(params=value_params,
                                                        **extras)

            kwargs[name] = value_dict

        elif origin in (List, list) and len(args) == 1 and hasattr(
                args[0], 'from_params'):
            value_cls = annotation.__args__[0]

            value_list = []

            for value_params in params.pop(name, Params({})):
                value_list.append(
                    value_cls.from_params(params=value_params, **extras))

            kwargs[name] = value_list

        elif origin in (Tuple, tuple) and all(
                hasattr(arg, 'from_params') for arg in args):
            value_list = []

            # Each position has its own class; `zip` stops at the shorter of the
            # annotated types and the supplied params.
            for value_cls, value_params in zip(annotation.__args__,
                                               params.pop(name, Params({}))):
                value_list.append(
                    value_cls.from_params(params=value_params, **extras))

            kwargs[name] = tuple(value_list)

        elif origin in (Set, set) and len(args) == 1 and hasattr(
                args[0], 'from_params'):
            value_cls = annotation.__args__[0]

            value_set = set()

            for value_params in params.pop(name, Params({})):
                value_set.add(
                    value_cls.from_params(params=value_params, **extras))

            kwargs[name] = value_set

        else:
            # Pass it on as is and hope for the best.   ¯\_(ツ)_/¯
            if optional:
                kwargs[name] = params.pop(name, default)
            else:
                kwargs[name] = params.pop(name)

    # Every entry should have been consumed by now; presumably this raises on
    # leftover keys (behavior of `assert_empty` is defined elsewhere).
    params.assert_empty(cls.__name__)
    return kwargs
def from_params(cls, params: Params) -> 'C':
    """
    Build an instance from ``params``: register the ``"c_file"`` entry for
    archiving, then pop it and hand its value to the constructor.
    """
    # Registration happens before the pop, in the same order as always.
    params.add_file_to_archive("c_file")
    return cls(params.pop("c_file"))
def run(
    config: str,
    name: str,
    allennlp_version: str,
    models_version: str,
    packages: str,
    gpus: int,
    workspace: str,
    user: str,
    include: Tuple[Tuple[str, str], ...],
    verbose: int,
    dry_run: bool,
    cluster: str,
):
    """
    Build a Docker image for a training config and submit it as a beaker experiment.

    Steps, all performed inside a temporary build-context directory:

    1. Write the training config to ``config.jsonnet``.
    2. Write the Dockerfile and the beaker config (``config.yml``).
    3. Copy every ``(path, dest)`` pair in ``include`` into the context.
    4. Run ``docker build``.
    5. Unless ``dry_run`` is set, publish the image with ``beaker image create``
       and submit the experiment with ``beaker experiment create``.

    With ``dry_run`` the function stops after the build and prints a
    ``docker run`` command for inspecting the image.
    """
    # The temp directory serves as the Docker build context and also holds
    # the temporary beaker config file.
    with TemporaryDirectory() as context_dir:
        # Training config goes into the context directory.
        Params.from_file(config).to_file(os.path.join(context_dir, "config.jsonnet"))

        # A fresh UUID keeps image names unique.
        image_id = str(uuid.uuid4())
        local_image_name = f"allennlp-beaker-{name}:{image_id}"
        beaker_image_name = f"allennlp-beaker-{name}-{image_id}"

        # The models version, when given, is listed ahead of the other packages.
        if models_version:
            packages = models_version + " " + packages
        packages = packages.strip()

        # Dockerfile; the extra steps are only appended when there are packages.
        with open(os.path.join(context_dir, "Dockerfile"), "w") as dockerfile:
            dockerfile.write(DOCKERFILE)
            if packages:
                dockerfile.write(DOCKERFILE_EXTRA_STEPS)

        # Beaker config.
        beaker_config_path = os.path.join(context_dir, "config.yml")
        with open(beaker_config_path, "w") as beaker_config:
            experiment_spec = create_beaker_config(
                name=name,
                image="/".join((user, beaker_image_name)),
                gpus=gpus,
                description=f"{allennlp_version} {packages}",
                cluster=cluster,
            )
            beaker_config.write(yaml.dump(experiment_spec))

        if verbose:
            click.echo("Beaker config:")
            for line in shell_out_command(["cat", beaker_config_path]):
                print(line)

        # Extra files or directories requested by the caller.
        for path, dest in include or ():
            dest = os.path.join(context_dir, dest)
            click.echo(f"Copying {path} to {dest}")
            copy = shutil.copytree if os.path.isdir(path) else shutil.copy
            copy(path, dest)

        # Docker build.
        styled_name = click.style(local_image_name, fg="green")
        click.echo(f"Building docker image with name {styled_name}...")

        build_args = ["docker", "build", "--build-arg", f"ALLENNLP={allennlp_version}"]
        if packages:
            build_args += ["--build-arg", f"PACKAGES={packages}"]
        build_args += ["-t", local_image_name, context_dir]

        if verbose:
            for line in shell_out_command(build_args):
                print(line)
        else:
            with click_spinner.spinner():
                # A zero-length deque drains the output without keeping it.
                deque(shell_out_command(build_args), maxlen=0)

        if dry_run:
            click.echo("Run the following to check the Docker image:\n")
            click.echo(
                f"  docker run --rm -it --entrypoint /bin/bash {local_image_name}"
            )
            return None

        # Publish the image to beaker.
        click.echo("Publishing image to beaker...")
        publish_cmd = [
            "beaker",
            "image",
            "create",
            "-n",
            beaker_image_name,
            local_image_name,
        ]
        with click_spinner.spinner():
            deque(shell_out_command(publish_cmd), maxlen=0)

        # Submit the experiment to beaker.
        click.echo("Submitting experiment to beaker...")
        cmds = ["beaker", "experiment", "create", "--name", name, "-f", beaker_config_path]
        if workspace:
            cmds += ["--workspace", workspace]
        echo_command_output(cmds)
def from_params(cls, params: Params) -> 'A':
    """
    Build an instance from ``params`` by popping the ``"bs"`` list and
    constructing one ``B`` from each of its entries, in order.
    """
    constructed = []
    for b_params in params.pop("bs"):
        constructed.append(B.from_params(b_params))
    return cls(bs=constructed)
def _load(
    cls,
    config: Params,
    serialization_dir: Union[str, PathLike],
    weights_file: Optional[Union[str, PathLike]] = None,
    cuda_device: int = -1,
) -> "Model":
    """
    Instantiates an already-trained model, based on the experiment configuration
    and some optional overrides.

    # Parameters

    config : `Params`
        The experiment configuration. Its ``"vocabulary"`` and ``"model"``
        sections are read.
    serialization_dir : `Union[str, PathLike]`
        Directory containing the ``vocabulary`` sub-directory and, unless
        ``weights_file`` is given, the default weights file.
    weights_file : `Optional[Union[str, PathLike]]`
        Path to the weights to load. Defaults to ``_DEFAULT_WEIGHTS`` inside
        ``serialization_dir``.
    cuda_device : `int`
        The model is moved to this GPU when non-negative, otherwise to the CPU.

    # Returns

    The model with the stored weights loaded.

    # Raises

    RuntimeError
        If the state dict has unexpected keys, or is missing keys that no
        module has authorized via ``authorized_missing_keys``.
    """
    weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, "vocabulary")
    # If the config specifies a vocabulary subclass, we need to use it.
    vocab_params = config.get("vocabulary", Params({}))
    vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
    vocab_class, _ = Vocabulary.resolve_class_name(vocab_choice)
    vocab = vocab_class.from_files(
        vocab_dir, vocab_params.get("padding_token"), vocab_params.get("oov_token")
    )

    model_params = config.get("model")

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings/weights from. We're now _loading_ the model, so those weights will already be
    # stored in our model. We don't need any pretrained weight file or initializers anymore,
    # and we don't want the code to look for it, so we remove it from the parameters here.
    remove_keys_from_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    # If vocab+embedding extension was done, the model initialized from from_params
    # and one defined by state dict in weights_file might not have same embedding shapes.
    # Eg. when model embedder module was transferred along with vocab extension, the
    # initialized embedding weight shape would be smaller than one in the state_dict.
    # So calling model embedding extension is required before load_state_dict.
    # If vocab and model embeddings are in sync, following would be just a no-op.
    model.extend_embedder_vocab()

    # Load state dict. We pass `strict=False` so PyTorch doesn't raise a RuntimeError
    # if the state dict is missing keys because we handle this case below.
    model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    missing_keys, unexpected_keys = model.load_state_dict(model_state, strict=False)

    # Modules might define a class variable called `authorized_missing_keys`,
    # a list of regex patterns, that tells us to ignore missing keys that match
    # any of the patterns.
    # We sometimes need this in order to load older models with newer versions of AllenNLP.

    def filter_out_authorized_missing_keys(module, prefix=""):
        nonlocal missing_keys
        for pat in getattr(module.__class__, "authorized_missing_keys", None) or []:
            # Only keys that belong to this module (i.e. start with its prefix)
            # can be authorized by it, and the pattern is matched against the
            # key relative to the module. Keys of other modules are left alone
            # so that genuinely missing weights are still reported.
            missing_keys = [
                k
                for k in missing_keys
                if not (k.startswith(prefix) and re.search(pat, k[len(prefix):]))
            ]
        # Recurse into sub-modules, extending the key prefix as we go.
        for name, child in module._modules.items():
            if child is not None:
                filter_out_authorized_missing_keys(child, prefix + name + ".")

    filter_out_authorized_missing_keys(model)

    if unexpected_keys or missing_keys:
        raise RuntimeError(
            f"Error loading state dict for {model.__class__.__name__}\n\t"
            f"Missing keys: {missing_keys}\n\t"
            f"Unexpected keys: {unexpected_keys}"
        )

    return model
def test_pop_choice(self):
    """
    Check ``Params.pop_choice``: a listed value is returned, an unlisted value
    raises ``ConfigurationError``, and a dotted class name is accepted unless
    ``allow_class_names=False`` is passed.
    """
    choices = ['my_model', 'other_model']

    # A value that is one of the choices is returned as is.
    params = Params({'model': 'my_model'})
    assert params.pop_choice('model', choices) == 'my_model'

    # A value outside the choices is rejected.
    params = Params({'model': 'non_existent_model'})
    with pytest.raises(ConfigurationError):
        params.pop_choice('model', choices)

    # A fully qualified class name is accepted even though it is not listed.
    # (Pass the real list of choices here, not the string 'choices'.)
    params = Params({'model': 'module.submodule.ModelName'})
    assert params.pop_choice('model', choices) == 'module.submodule.ModelName'

    # ... unless class names are explicitly disallowed.
    params = Params({'model': 'module.submodule.ModelName'})
    with pytest.raises(ConfigurationError):
        params.pop_choice('model', choices, allow_class_names=False)
def construct_arg(
    class_name: str,
    argument_name: str,
    popped_params: Params,
    annotation: Type,
    default: Any,
    **extras,
) -> Any:
    """
    Turn the already-popped configuration value for one constructor argument
    into the object that should be passed to the constructor, dispatching on
    the argument's type annotation.

    The first two parameters here are only used for logging if we encounter an error.

    # Parameters

    class_name : `str`
        Name of the class being constructed (used in error messages).
    argument_name : `str`
        Name of the argument being constructed (used in error messages).
    popped_params : `Params`
        The value popped from the config for this argument. Despite the
        annotation, the code also handles plain strings, numbers, dicts and
        iterables here.
    annotation : `Type`
        The argument's type annotation.
    default : `Any`
        The argument's default, or ``_NO_DEFAULT`` if it has none.
    extras
        Extra objects that may be forwarded to nested ``from_params`` calls.

    # Returns

    The constructed value.

    # Raises

    ConfigurationError
        If a required ``from_params`` argument is missing, or no member of a
        ``Union`` annotation could be constructed.
    TypeError
        If a primitive-typed argument received a value of the wrong type.
    """
    origin = getattr(annotation, "__origin__", None)
    args = getattr(annotation, "__args__", [])

    # The parameter is optional if its default value is not the "no default" sentinel.
    optional = default != _NO_DEFAULT

    if hasattr(annotation, "from_params"):
        # Identity check: nothing was supplied, so the default came back from the pop.
        if popped_params is default:
            return default
        elif popped_params is not None:
            # Our params have an entry for this, so we use that.

            subextras = create_extras(annotation, extras)

            # In some cases we allow a string instead of a param dict, so
            # we need to handle that case separately.
            if isinstance(popped_params, str):
                return annotation.by_name(popped_params)()
            else:
                # Plain dicts are wrapped so that `from_params` always receives a `Params`.
                if isinstance(popped_params, dict):
                    popped_params = Params(popped_params)
                return annotation.from_params(params=popped_params, **subextras)
        elif not optional:
            # Not optional and not supplied, that's an error!
            raise ConfigurationError(
                f"expected key {argument_name} for {class_name}")
        else:
            return default

    # If the parameter type is a Python primitive, just pop it off
    # using the correct casting pop_xyz operation.
    elif annotation in {int, bool}:
        # Exact type check (not isinstance): only real ints and bools are accepted,
        # and the value is then cast to the annotated type.
        if type(popped_params) in {int, bool}:
            return annotation(popped_params)
        else:
            raise TypeError(
                f"Expected {argument_name} to be a {annotation.__name__}.")
    elif annotation == str:
        # Strings are special because we allow casting from Path to str.
        if type(popped_params) == str or isinstance(popped_params, Path):
            return str(popped_params)  # type: ignore
        else:
            raise TypeError(f"Expected {argument_name} to be a string.")
    elif annotation == float:
        # Floats are special because in Python, you can put an int wherever you can put a float.
        # https://mypy.readthedocs.io/en/stable/duck_type_compatibility.html
        # Note that the value is returned unchanged, so an int stays an int.
        if type(popped_params) in {int, float}:
            return popped_params
        else:
            raise TypeError(f"Expected {argument_name} to be numeric.")

    # This is special logic for handling types like Dict[str, TokenIndexer],
    # List[TokenIndexer], Tuple[TokenIndexer, Tokenizer], and Set[TokenIndexer],
    # which it creates by instantiating each value from_params and returning the resulting structure.
    # Each element is built by a recursive call with `_NO_DEFAULT`, i.e. elements
    # are always treated as required, and the argument name is extended with the
    # key or index so error messages point at the offending element.
    elif origin in (Dict, dict) and len(args) == 2 and can_construct_from_params(
            args[-1]):
        value_cls = annotation.__args__[-1]

        value_dict = {}

        for key, value_params in popped_params.items():
            value_dict[key] = construct_arg(
                str(value_cls),
                argument_name + "." + key,
                value_params,
                value_cls,
                _NO_DEFAULT,
                **extras,
            )

        return value_dict

    elif origin in (List, list) and len(args) == 1 and can_construct_from_params(
            args[0]):
        value_cls = annotation.__args__[0]

        value_list = []

        for i, value_params in enumerate(popped_params):
            value = construct_arg(
                str(value_cls),
                argument_name + f".{i}",
                value_params,
                value_cls,
                _NO_DEFAULT,
                **extras,
            )
            value_list.append(value)

        return value_list

    elif origin in (Tuple, tuple) and all(
            can_construct_from_params(arg) for arg in args):
        value_list = []

        # Each position has its own annotated type; `zip` stops at the shorter of
        # the annotated types and the supplied values.
        for i, (value_cls, value_params) in enumerate(
                zip(annotation.__args__, popped_params)):
            value = construct_arg(
                str(value_cls),
                argument_name + f".{i}",
                value_params,
                value_cls,
                _NO_DEFAULT,
                **extras,
            )
            value_list.append(value)

        return tuple(value_list)

    elif origin in (Set, set) and len(args) == 1 and can_construct_from_params(
            args[0]):
        value_cls = annotation.__args__[0]

        value_set = set()

        for i, value_params in enumerate(popped_params):
            value = construct_arg(
                str(value_cls),
                argument_name + f".{i}",
                value_params,
                value_cls,
                _NO_DEFAULT,
                **extras,
            )
            value_set.add(value)

        return value_set

    elif origin == Union:
        # Storing this so we can recover it later if we need to.
        backup_params = deepcopy(popped_params)

        # We'll try each of the given types in the union sequentially, returning the first one that
        # succeeds.
        for arg_annotation in args:
            try:
                return construct_arg(
                    str(arg_annotation),
                    argument_name,
                    popped_params,
                    arg_annotation,
                    default,
                    **extras,
                )
            except (ValueError, TypeError, ConfigurationError, AttributeError):
                # Our attempt to construct the argument may have modified popped_params, so we
                # restore it here.
                popped_params = deepcopy(backup_params)

        # If none of them succeeded, we crash.
        raise ConfigurationError(
            f"Failed to construct argument {argument_name} with type {annotation}"
        )
    elif origin == Lazy:
        # Nothing was supplied: wrap the default so callers still get a `Lazy`.
        if popped_params is default:
            return Lazy(lambda **kwargs: default)
        value_cls = args[0]
        subextras = create_extras(value_cls, extras)

        def constructor(**kwargs):
            # If there are duplicate keys between subextras and kwargs, this will overwrite the ones
            # in subextras with what's in kwargs.  If an argument shows up twice, we should take it
            # from what's passed to Lazy.construct() instead of what we got from create_extras().
            # Almost certainly these will be identical objects, anyway.
            # We do this by constructing a new dictionary, instead of mutating subextras, just in
            # case this constructor is called multiple times.
            constructor_extras = {**subextras, **kwargs}
            # The params are deep-copied on every call, so the captured params are
            # never consumed and the constructor can be called repeatedly.
            return value_cls.from_params(params=deepcopy(popped_params),
                                         **constructor_extras)

        return Lazy(constructor)  # type: ignore

    else:
        # Pass it on as is and hope for the best.   ¯\_(ツ)_/¯
        # `Params` objects are converted to plain dicts first.
        if isinstance(popped_params, Params):
            return popped_params.as_dict(quiet=True)
        return popped_params
def pop_and_construct_arg(class_name: str, argument_name: str, annotation: Type,
                          default: Any, params: Params, **extras) -> Any:
    """
    Construct a single constructor argument for
    [`create_kwargs`](./from_params#create_kwargs).

    This is the inner step of the loop over a constructor's parameters. It
    receives one parameter's name, type annotation and default value, the full
    `Params` for the object being built (which may be mutated), and any
    `extras` the constructor might need. The value is resolved in this order:

    1. An entry in `extras` with no matching key in `params` is returned as is.
       If the key is in both, a warning is logged and `params` wins.
    2. A `params` entry that is a `Params` containing `"_pretrained"` is loaded
       from the referenced archive and type-checked against the annotation.
    3. Otherwise the value is popped from `params` and, unless it is `None`,
       passed to `construct_arg`.

    The annotation and default are taken separately, rather than as an
    `inspect.Parameter`, so that `Union` types can be handled by recursing with
    each member annotation in turn.
    """
    from allennlp.models.archival import load_archive  # import here to avoid circular imports

    # The parameter is called `argument_name` so it cannot collide with a
    # 'name' key in `extras`; inside the function the shorter name is fine.
    name = argument_name

    if name in extras:
        # Extras cover constructor inputs that are not configuration, e.g. vocab: Vocabulary.
        if name not in params:
            return extras[name]
        logger.warning(
            f"Parameter {name} for class {class_name} was found in both "
            "**extras and in params. Using the specification found in params, "
            "but you probably put a key in a config file that you didn't need, "
            "and if it is different from what we get from **extras, you might "
            "get unexpected behavior.")
    elif (name in params and isinstance(params.get(name), Params)
          and "_pretrained" in params.get(name)):
        # The argument is to be taken from a module inside a pretrained archive.
        pretrained_spec = params.pop(name).pop("_pretrained")
        archive_file = pretrained_spec.pop("archive_file")
        module_path = pretrained_spec.pop("module_path")
        freeze = pretrained_spec.pop("freeze", True)
        result = load_archive(archive_file).extract_module(module_path, freeze)
        if isinstance(result, annotation):
            return result
        raise ConfigurationError(
            f"The module from model at {archive_file} at path {module_path} "
            f"was expected of type {annotation} but is of type {type(result)}"
        )

    # Pop the raw value, falling back to the default only when one exists.
    if default != _NO_DEFAULT:
        raw_value = params.pop(name, default)
    else:
        raw_value = params.pop(name)

    if raw_value is not None:
        return construct_arg(class_name, name, raw_value, annotation, default,
                             **extras)

    # A `None` value is passed through, wrapped in `Lazy` when the annotation asks for one.
    if getattr(annotation, "__origin__", None) == Lazy:
        return Lazy(lambda **kwargs: None)
    return None