Example #1
def make_vocab_from_params(params: Params, serialization_dir: str):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
Example #2
 def test_fine_tune_nograd_regex(self):
     original_model = load_archive(self.model_archive).model
     name_parameters_original = dict(original_model.named_parameters())
     regex_lists = [[],
                    [".*attend_feedforward.*", ".*token_embedder.*"],
                    [".*compare_feedforward.*"]]
     for regex_list in regex_lists:
         params = Params.from_file(self.config_file)
         params["trainer"]["no_grad"] = regex_list
         shutil.rmtree(self.serialization_dir, ignore_errors=True)
         tuned_model = fine_tune_model(model=original_model,
                                       params=params,
                                       serialization_dir=self.serialization_dir)
         # If the regex is matched, the parameter should have requires_grad False;
         # otherwise it should keep the same requires_grad as in the originally
         # loaded model.
         for name, parameter in tuned_model.named_parameters():
             if any(re.search(regex, name) for regex in regex_list):
                 assert not parameter.requires_grad
             else:
                 assert parameter.requires_grad \
                 == name_parameters_original[name].requires_grad
     # If all parameters have requires_grad=False, then error.
     with pytest.raises(Exception) as _:
         params = Params.from_file(self.config_file)
         params["trainer"]["no_grad"] = ["*"]
         shutil.rmtree(self.serialization_dir, ignore_errors=True)
         tuned_model = fine_tune_model(model=original_model,
                                       params=params,
                                       serialization_dir=self.serialization_dir)
Example #3
 def test_simple_tagger_constraint_type_deprecated(self):
     params = Params({"model": {
             "type": "crf_tagger",
             "constraint_type": "IOB1",
             "text_field_embedder": {
                     "token_embedders": {
                             "tokens": {
                                     "type": "embedding",
                                     "embedding_dim": 50
                             },
                     }
             },
             "encoder": {
                     "type": "gru",
                     "input_size": 50,
                     "hidden_size": 10,
                     "num_layers": 2,
                     "dropout": 0.5,
                     "bidirectional": True
             }}})
     with pytest.warns(DeprecationWarning):
         model = Model.from_params(vocab=self.vocab,
                                   params=params.pop("model"))
     assert model._f1_metric is not None
     assert model._f1_metric._label_encoding == "IOB1"
     assert model.label_encoding == "IOB1"
     assert model.crf._constraint_mask.sum().item() != (model.num_tags + 2)**2
Example #4
def make_vocab_from_params(params: Params):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    vocab_dir = vocab_params.get('directory_path')
    if vocab_dir is None:
        raise ConfigurationError("To use `make-vocab` your configuration must contain a value "
                                 "at vocabulary.directory_path")

    os.makedirs(vocab_dir, exist_ok=True)

    all_datasets = datasets_from_params(params)

    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(Params({}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))

    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
Example #5
def prepare_environment(params: Params):
    """
    Sets random seeds for reproducible experiments. This may not work as expected
    if you use this from within a Python project in which you have already imported PyTorch.
    If you use the scripts/run_model.py entry point to train models with this library,
    your experiments should be reasonably reproducible. If you are using this from your own
    project, you will want to call this function before importing PyTorch. Complete determinism
    is very difficult to achieve with libraries doing optimized linear algebra due to massively
    parallel execution, which is exacerbated by using GPUs.

    Parameters
    ----------
    params: Params object or dict, required.
        A ``Params`` object or dict holding the json parameters.
    """
    seed = params.pop_int("random_seed", 13370)
    numpy_seed = params.pop_int("numpy_seed", 1337)
    torch_seed = params.pop_int("pytorch_seed", 133)

    if seed is not None:
        random.seed(seed)
    if numpy_seed is not None:
        numpy.random.seed(numpy_seed)
    if torch_seed is not None:
        torch.manual_seed(torch_seed)
        # Seed all GPUs with the same seed if available.
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(torch_seed)

    log_pytorch_version_info()
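A minimal usage sketch for the function above (assuming `allennlp` is installed and that the module-level imports it relies on, `random`, `numpy`, and `torch`, are present); the seed values are illustrative, not taken from any real configuration:

from allennlp.common.params import Params

# prepare_environment pops the three seed keys (falling back to its defaults)
# and seeds Python's random module, numpy, and torch before training starts.
params = Params({"random_seed": 42, "numpy_seed": 42, "pytorch_seed": 42})
prepare_environment(params)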
Example #6
    def test_regexes_with_backslashes(self):
        bad_regex = self.TEST_DIR / 'bad_regex.jsonnet'
        good_regex = self.TEST_DIR / 'good_regex.jsonnet'

        with open(bad_regex, 'w') as f:
            f.write(r'{"myRegex": "a\.b"}')

        with open(good_regex, 'w') as f:
            f.write(r'{"myRegex": "a\\.b"}')

        with pytest.raises(RuntimeError):
            Params.from_file(bad_regex)

        params = Params.from_file(good_regex)
        regex = params['myRegex']

        assert re.match(regex, "a.b")
        assert not re.match(regex, "a-b")

        # Check roundtripping
        good_regex2 = self.TEST_DIR / 'good_regex2.jsonnet'
        with open(good_regex2, 'w') as f:
            f.write(json.dumps(params.as_dict()))
        params2 = Params.from_file(good_regex2)

        assert params.as_dict() == params2.as_dict()
Example #7
    def test_known_configs(self):
        configs = os.listdir(self.PROJECT_ROOT / "training_config")

        # Our configs use environment variable substitution, and the _jsonnet parser
        # will fail if we don't pass it correct environment variables.
        forced_variables = [
            # constituency parser
            'PTB_TRAIN_PATH', 'PTB_DEV_PATH', 'PTB_TEST_PATH',

            # srl_elmo_5.5B
            'SRL_TRAIN_DATA_PATH', 'SRL_VALIDATION_DATA_PATH',

            # coref
            'COREF_TRAIN_DATA_PATH', 'COREF_DEV_DATA_PATH', 'COREF_TEST_DATA_PATH',

            # ner
            'NER_TRAIN_DATA_PATH', 'NER_TEST_A_PATH', 'NER_TEST_B_PATH'
        ]

        for var in forced_variables:
            os.environ[var] = os.environ.get(var) or str(self.TEST_DIR)

        for config in configs:
            try:
                Params.from_file(self.PROJECT_ROOT / "training_config" / config)
            except Exception as e:
                raise AssertionError(f"unable to load params for {config}, because {e}")

        for var in forced_variables:
            if os.environ[var] == str(self.TEST_DIR):
                del os.environ[var]
Example #8
def remove_pretrained_embedding_params(params: Params):
    keys = params.keys()
    if 'pretrained_file' in keys:
        del params['pretrained_file']
    for value in params.values():
        if isinstance(value, Params):
            remove_pretrained_embedding_params(value)
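A small sketch of the helper above on an illustrative (hypothetical) embedding config; nested `pretrained_file` entries are deleted in place so that loading a trained model does not go looking for embedding files:

from allennlp.common.params import Params

params = Params({"model": {"text_field_embedder": {"tokens": {
    "type": "embedding", "embedding_dim": 50,
    "pretrained_file": "/path/to/glove.txt"}}}})
remove_pretrained_embedding_params(params)
# No flattened key should mention 'pretrained_file' any more.
assert all("pretrained_file" not in key for key in params.as_flat_dict())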
Example #9
            def from_params(cls, params: Params) -> 'B':
                params.add_file_to_archive("filename")

                filename = params.pop("filename")
                c_params = params.pop("c")
                c = C.from_params(c_params)

                return cls(filename, c)
Example #10
    def from_params(cls, optimizer: torch.optim.Optimizer, params: Params):  # type: ignore
        # pylint: disable=arguments-differ
        scheduler = params.pop_choice("type", LearningRateScheduler.list_available())

        schedulers = LearningRateScheduler.by_name(scheduler)(optimizer, **params.as_dict())  # type: ignore
        if isinstance(schedulers, torch.optim.lr_scheduler.ReduceLROnPlateau):
            return LearningRateWithMetricsWrapper(schedulers)
        else:
            return LearningRateWithoutMetricsWrapper(schedulers)
Example #11
 def test_as_ordered_dict(self):
     # keyD > keyC > keyE; keyDA > keyDB; then all other keys alphabetically
     preference_orders = [["keyD", "keyC", "keyE"], ["keyDA", "keyDB"]]
     params = Params({"keyC": "valC", "keyB": "valB", "keyA": "valA", "keyE": "valE",
                      "keyD": {"keyDB": "valDB", "keyDA": "valDA"}})
     ordered_params_dict = params.as_ordered_dict(preference_orders)
     expected_ordered_params_dict = OrderedDict({'keyD': {'keyDA': 'valDA', 'keyDB': 'valDB'},
                                                 'keyC': 'valC', 'keyE': 'valE',
                                                 'keyA': 'valA', 'keyB': 'valB'})
     assert json.dumps(ordered_params_dict) == json.dumps(expected_ordered_params_dict)
Example #12
 def test_to_file(self):
     # Test to_file works with or without preference orders
     params_dict = {"keyA": "valA", "keyB": "valB"}
     expected_ordered_params_dict = OrderedDict({"keyB": "valB", "keyA": "valA"})
     params = Params(params_dict)
     file_path = self.TEST_DIR / 'config.jsonnet'
     # check with preference orders
     params.to_file(file_path, [["keyB", "keyA"]])
     with open(file_path, "r") as handle:
         ordered_params_dict = OrderedDict(json.load(handle))
     assert json.dumps(expected_ordered_params_dict) == json.dumps(ordered_params_dict)
     # check without preference orders doesn't give error
     params.to_file(file_path)
Example #13
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
Example #14
    def test_from_params(self):
        params = Params({"regularizers": [("conv", "l1"), ("linear", {"type": "l2", "alpha": 10})]})
        regularizer_applicator = RegularizerApplicator.from_params(params.pop("regularizers"))
        regularizers = regularizer_applicator._regularizers  # pylint: disable=protected-access

        conv = linear = None
        for regex, regularizer in regularizers:
            if regex == "conv":
                conv = regularizer
            elif regex == "linear":
                linear = regularizer

        assert isinstance(conv, L1Regularizer)
        assert isinstance(linear, L2Regularizer)
        assert linear.alpha == 10
Example #15
    def _load(cls,
              config: Params,
              serialization_dir: str,
              weights_file: str = None,
              cuda_device: int = -1) -> 'Model':
        """
        Ensembles don't have vocabularies or weights of their own, so they override _load.
        """
        model_params = config.get('model')

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab=None, params=model_params)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
Example #16
    def _load(cls,
              config: Params,
              serialization_dir: str,
              weights_file: str = None,
              cuda_device: int = -1) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.
        """
        weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, 'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model_params = config.get('model')

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab=vocab, params=model_params)
        model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
        model.load_state_dict(model_state)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
Example #17
def main(param_file: str, extra_beaker_commands: List[str]):
    ecr_repository = "896129387501.dkr.ecr.us-west-2.amazonaws.com"
    commit = subprocess.check_output(["git", "rev-parse", "HEAD"], universal_newlines=True).strip()
    image = f"{ecr_repository}/allennlp/allennlp:{commit}"
    overrides = ""

    # Reads params and sets environment.
    params = Params.from_file(param_file, overrides)
    flat_params = params.as_flat_dict()
    env = []
    for k, v in flat_params.items():
        k = str(k).replace('.', '_')
        env.append(f"--env={k}={v}")

    # If the git repository is dirty, add a random hash.
    result = subprocess.run('git diff-index --quiet HEAD --', shell=True)
    if result.returncode != 0:
        dirty_hash = "%x" % random_int
        image += "-" + dirty_hash

    # Get temporary ecr login. For this command to work, you need the python awscli
    # package with a version more recent than 1.11.91.
    print("Generating ECR Login Command")
    login_command = subprocess.check_output('aws --region=us-west-2 ecr get-login --no-include-email', shell=True)

    print("Logging into ECR")
    subprocess.run(login_command, shell=True, check=True)

    print(f"Building the Docker image ({image})")
    subprocess.run(f'docker build -t {image} .', shell=True, check=True)

    print(f"Pushing the Docker image ({image})")
    subprocess.run(f'docker push {image}', shell=True, check=True)

    config_dataset_id = subprocess.check_output(f'beaker dataset create --quiet {param_file}', shell=True, universal_newlines=True).strip()
    filename = os.path.basename(param_file)

    allennlp_command = [
            "python",
            "-m",
            "allennlp.run",
            "train",
            "/config.json",
            "-s",
            "/output",
            "--file-friendly-logging"
        ]

    # TODO(michaels): add back in the env list.
    # Presently this makes the Beaker UI unusably cluttered.
    command = [
            '/usr/local/bin/beaker',
            'experiment',
            'run',
            '--result-path',
            '/output',
            "--source",
            f"{config_dataset_id}:/config.json"] + env + extra_beaker_commands + [image] + allennlp_command
    print(' '.join(command))
    subprocess.run(command, check=True)
Example #18
 def test_mismatching_contextualizer_unidirectionality_throws_configuration_error(self):
     params = Params.from_file(self.param_file)
     # Make the contextualizer unidirectionality wrong - it should be
     # False to match the language model.
     params["model"]["contextualizer"]["bidirectional"] = (not self.bidirectional)
     with pytest.raises(ConfigurationError):
         Model.from_params(vocab=self.vocab, params=params.get("model"))
Example #19
 def test_mismatching_dimensions_throws_configuration_error(self):
     params = Params.from_file(self.param_file)
     # Make the encoder wrong - it should be 2 to match
     # the embedding dimension from the text_field_embedder.
     params["model"]["encoder"]["input_size"] = 10
     with pytest.raises(ConfigurationError):
         Model.from_params(vocab=self.vocab, params=params.pop("model"))
Example #20
 def test_mismatching_dimensions_throws_configuration_error(self):
     params = Params.from_file(self.param_file)
     # Make the phrase layer wrong - it should be 150 to match
     # the embedding + binary feature dimensions.
     params["model"]["encoder"]["input_size"] = 10
     with pytest.raises(ConfigurationError):
         Model.from_params(vocab=self.vocab, params=params.pop("model"))
Example #21
 def from_params(cls, params: Params) -> 'TokenCharactersIndexer':
     """
     Parameters
     ----------
     namespace : ``str``, optional (default=``token_characters``)
         We will use this namespace in the :class:`Vocabulary` to map the characters in each token
         to indices.
     character_tokenizer : ``Params``, optional (default=``Params({})``)
         We use a :class:`CharacterTokenizer` to handle splitting tokens into characters, as it has
         options for byte encoding and other things.  These parameters get passed to the character
         tokenizer.  The default is to use unicode characters and to retain casing.
     """
     namespace = params.pop('namespace', 'token_characters')
     character_tokenizer_params = params.pop('character_tokenizer', {})
     character_tokenizer = CharacterTokenizer.from_params(character_tokenizer_params)
     params.assert_empty(cls.__name__)
     return cls(namespace=namespace, character_tokenizer=character_tokenizer)
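A hedged usage sketch for the `from_params` above; with no keys supplied it falls back to the documented defaults, and the explicit `namespace` shown here is just the default spelled out:

from allennlp.common.params import Params

indexer = TokenCharactersIndexer.from_params(Params({"namespace": "token_characters"}))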
Example #22
 def test_elmo_but_no_set_flags_throws_configuration_error(self):
     # pylint: disable=line-too-long
     params = Params.from_file(self.FIXTURES_ROOT / 'biattentive_classification_network' / 'elmo_experiment.json')
     # Elmo is specified in the model, but set both flags to false.
     params["model"]["use_input_elmo"] = False
     params["model"]["use_integrator_output_elmo"] = False
     with pytest.raises(ConfigurationError):
         Model.from_params(vocab=self.vocab, params=params.get("model"))
Example #23
    def from_params(cls: Type[T], params: Params, **extras) -> T:
        """
        This is the automatic implementation of `from_params`. Any class that subclasses `FromParams`
        (or `Registrable`, which itself subclasses `FromParams`) gets this implementation for free.
        If you want your class to be instantiated from params in the "obvious" way -- pop off parameters
        and hand them to your constructor with the same names -- this provides that functionality.

        If you need more complex logic in your `from_params` method, you'll have to implement
        your own method that overrides this one.
        """
        # pylint: disable=protected-access
        from allennlp.common.registrable import Registrable  # import here to avoid circular imports

        logger.info(f"instantiating class {cls} from params {getattr(params, 'params', params)} "
                    f"and extras {extras}")

        if params is None:
            return None

        registered_subclasses = Registrable._registry.get(cls)

        if registered_subclasses is not None:
            # We know ``cls`` inherits from Registrable, so we'll use a cast to make mypy happy.
            # We have to use a disable to make pylint happy.
            # pylint: disable=no-member
            as_registrable = cast(Type[Registrable], cls)
            default_to_first_choice = as_registrable.default_implementation is not None
            choice = params.pop_choice("type",
                                       choices=as_registrable.list_available(),
                                       default_to_first_choice=default_to_first_choice)
            subclass = registered_subclasses[choice]

            # We want to call subclass.from_params. It's possible that it's just the "free"
            # implementation here, in which case it accepts `**extras` and we are not able
            # to make any assumptions about what extra parameters it needs.
            #
            # It's also possible that it has a custom `from_params` method. In that case it
            # won't accept any **extra parameters and we'll need to filter them out.
            if not takes_arg(subclass.from_params, 'extras'):
                # Necessarily subclass.from_params is a custom implementation, so we need to
                # pass it only the args it's expecting.
                extras = {k: v for k, v in extras.items() if takes_arg(subclass.from_params, k)}

            return subclass.from_params(params=params, **extras)
        else:
            # This is not a base class, so convert our params and extras into a dict of kwargs.

            if cls.__init__ == object.__init__:
                # This class does not have an explicit constructor, so don't give it any kwargs.
                # Without this logic, create_kwargs will look at object.__init__ and see that
                # it takes *args and **kwargs and look for those.
                kwargs: Dict[str, Any] = {}
            else:
                # This class has a constructor, so create kwargs for it.
                kwargs = create_kwargs(cls, params, **extras)

            return cls(**kwargs)  # type: ignore
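A minimal sketch of the "obvious" path described in the docstring: constructor arguments are popped from the `Params` object by name and type annotation (the `Gaussian` class here is hypothetical, not part of the library):

from allennlp.common.params import Params
from allennlp.common.from_params import FromParams

class Gaussian(FromParams):
    def __init__(self, mean: float, variance: float = 1.0) -> None:
        self.mean = mean
        self.variance = variance

# create_kwargs inspects __init__, pops "mean", and leaves "variance" at its default.
gaussian = Gaussian.from_params(Params({"mean": 0.5}))
assert gaussian.variance == 1.0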
Example #24
    def load(cls,
             config: Params,
             serialization_dir: str,
             weights_file: str = None,
             cuda_device: int = -1) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.

        Parameters
        ----------
        config: Params
            The configuration that was used to train the model. It should definitely
            have a `model` section, and should probably have a `trainer` section
            as well.
        serialization_dir: str = None
            The directory containing the serialized weights, parameters, and vocabulary
            of the model.
        weights_file: str = None
            By default we load the weights from `best.th` in the serialization
            directory, but you can override that value here.
        cuda_device: int = -1
            By default we load the model on the CPU, but if you want to load it
            for GPU usage you can specify the id of your GPU here


        Returns
        -------
        model: Model
            The model specified in the configuration, loaded with the serialized
            vocabulary and the trained weights.
        """
        weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, 'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model_params = config.get('model')

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        _remove_pretrained_embedding_params(model_params)
        model = Model.from_params(vocab, model_params)
        model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
        model.load_state_dict(model_state)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
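A hedged sketch of calling the classmethod above; it assumes the serialization directory holds the `config.json`, vocabulary, and weights that AllenNLP training writes out, and the path is illustrative:

import os
from allennlp.common.params import Params

serialization_dir = "/tmp/my_model"
config = Params.from_file(os.path.join(serialization_dir, "config.json"))
# Loads the vocabulary and best.th from the directory and keeps the model on the CPU.
model = Model.load(config, serialization_dir=serialization_dir, cuda_device=-1)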
Example #25
    def test_fine_tune_does_not_expand_vocab_by_default(self):
        params = Params.from_file(self.config_file)
        # snli2 has a new token in it
        params["train_data_path"] = str(self.FIXTURES_ROOT / 'data' / 'snli2.jsonl')

        model = load_archive(self.model_archive).model

        # By default, no vocab expansion.
        fine_tune_model(model, params, self.serialization_dir)
Example #26
    def test_fine_tune_runtime_errors_with_vocab_expansion(self):
        params = Params.from_file(self.config_file)
        params["train_data_path"] = str(self.FIXTURES_ROOT / 'data' / 'snli2.jsonl')

        model = load_archive(self.model_archive).model

        # If we do vocab expansion, we get a runtime error because of the embedding.
        with pytest.raises(RuntimeError):
            fine_tune_model(model, params, self.serialization_dir, extend_vocab=True)
Example #27
    def from_params(cls, params: Params) -> 'Initializer':   # type: ignore
        # pylint: disable=arguments-differ

        # Just a string - corresponds to the name of an initializer.
        if isinstance(params, str):
            return cls.by_name(params)()
        else:
            choice = params.pop_choice("type", cls.list_available())
            return cls.by_name(choice).from_params(params)
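A short sketch of the two accepted forms, assuming `"xavier_uniform"` is a registered initializer name:

from allennlp.common.params import Params

init_by_name = Initializer.from_params("xavier_uniform")
init_by_type = Initializer.from_params(Params({"type": "xavier_uniform"}))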
Example #28
    def test_load_from_file(self):
        filename = self.FIXTURES_ROOT / 'bidaf' / 'experiment.json'
        params = Params.from_file(filename)

        assert "dataset_reader" in params
        assert "trainer" in params

        model_params = params.pop("model")
        assert model_params.pop("type") == "bidaf"
Example #29
    def test_env_var_substitution(self):
        substitutor = self.TEST_DIR / 'substitutor.jsonnet'
        key = 'TEST_ENV_VAR_SUBSTITUTION'

        assert os.environ.get(key) is None

        with open(substitutor, 'w') as f:
            f.write(f'{{"path": std.extVar("{key}")}}')

        # raises without environment variable set
        with pytest.raises(RuntimeError):
            Params.from_file(substitutor)

        os.environ[key] = "PERFECT"

        params = Params.from_file(substitutor)
        assert params['path'] == "PERFECT"

        del os.environ[key]
Example #30
def make_vocab_from_args(args: argparse.Namespace):
    """
    Just converts from an ``argparse.Namespace`` object to params.
    """
    parameter_path = args.param_path
    overrides = args.overrides

    params = Params.from_file(parameter_path, overrides)

    make_vocab_from_params(params)
Example #31
    def from_params(
        cls: Type[T],
        params: Params,
        constructor_to_call: Callable[..., T] = None,
        constructor_to_inspect: Callable[..., T] = None,
        **extras,
    ) -> T:
        """
        This is the automatic implementation of `from_params`. Any class that subclasses
        `FromParams` (or `Registrable`, which itself subclasses `FromParams`) gets this
        implementation for free.  If you want your class to be instantiated from params in the
        "obvious" way -- pop off parameters and hand them to your constructor with the same names --
        this provides that functionality.

        If you need more complex logic in your `from_params` method, you'll have to implement
        your own method that overrides this one.

        The `constructor_to_call` and `constructor_to_inspect` arguments deal with a bit of
        redirection that we do.  We allow you to register particular `@classmethods` on a class as
        the constructor to use for a registered name.  This lets you, e.g., have a single
        `Vocabulary` class that can be constructed in two different ways, with different names
        registered to each constructor.  In order to handle this, we need to know not just the class
        we're trying to construct (`cls`), but also what method we should inspect to find its
        arguments (`constructor_to_inspect`), and what method to call when we're done constructing
        arguments (`constructor_to_call`).  These two methods are the same when you've used a
        `@classmethod` as your constructor, but they are `different` when you use the default
        constructor (because you inspect `__init__`, but call `cls()`).
        """

        from allennlp.common.registrable import Registrable  # import here to avoid circular imports

        logger.info(
            f"instantiating class {cls} from params {getattr(params, 'params', params)} "
            f"and extras {set(extras.keys())}")

        if params is None:
            return None

        if isinstance(params, str):
            params = Params({"type": params})

        registered_subclasses = Registrable._registry.get(cls)

        if registered_subclasses is not None and not constructor_to_call:
            # We know `cls` inherits from Registrable, so we'll use a cast to make mypy happy.

            as_registrable = cast(Type[Registrable], cls)
            default_to_first_choice = as_registrable.default_implementation is not None
            choice = params.pop_choice(
                "type",
                choices=as_registrable.list_available(),
                default_to_first_choice=default_to_first_choice,
            )
            subclass, constructor_name = as_registrable.resolve_class_name(
                choice)
            # See the docstring for an explanation of what's going on here.
            if not constructor_name:
                constructor_to_inspect = subclass.__init__
                constructor_to_call = subclass  # type: ignore
            else:
                constructor_to_inspect = getattr(subclass, constructor_name)
                constructor_to_call = constructor_to_inspect

            if hasattr(subclass, "from_params"):
                # We want to call subclass.from_params.
                extras = create_extras(subclass, extras)
                # mypy can't follow the typing redirection that we do, so we explicitly cast here.
                retyped_subclass = cast(Type[T], subclass)
                return retyped_subclass.from_params(
                    params=params,
                    constructor_to_call=constructor_to_call,
                    constructor_to_inspect=constructor_to_inspect,
                    **extras,
                )
            else:
                # In some rare cases, we get a registered subclass that does _not_ have a
                # from_params method (this happens with Activations, for instance, where we
                # register pytorch modules directly).  This is a bit of a hack to make those work,
                # instead of adding a `from_params` method for them somehow.  We just trust that
                # you've done the right thing in passing your parameters, and nothing else needs to
                # be recursively constructed.
                extras = create_extras(subclass, extras)
                constructor_args = {**params, **extras}
                return subclass(**constructor_args)  # type: ignore
        else:
            # This is not a base class, so convert our params and extras into a dict of kwargs.

            # See the docstring for an explanation of what's going on here.
            if not constructor_to_inspect:
                constructor_to_inspect = cls.__init__
            if not constructor_to_call:
                constructor_to_call = cls

            if constructor_to_inspect == object.__init__:
                # This class does not have an explicit constructor, so don't give it any kwargs.
                # Without this logic, create_kwargs will look at object.__init__ and see that
                # it takes *args and **kwargs and look for those.
                kwargs: Dict[str, Any] = {}
            else:
                # This class has a constructor, so create kwargs for it.
                kwargs = create_kwargs(constructor_to_inspect, cls, params,
                                       **extras)

            return constructor_to_call(**kwargs)  # type: ignore
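A hedged illustration of the constructor redirection described in the docstring; `Muppet`, `Gonzo`, and the registered name are hypothetical, but the `constructor=` argument to `register` is the mechanism being explained:

from typing import List
from allennlp.common.params import Params
from allennlp.common.registrable import Registrable

class Muppet(Registrable):
    pass

@Muppet.register("from_csv", constructor="from_csv")
class Gonzo(Muppet):
    def __init__(self, names: List[str]) -> None:
        self.names = names

    @classmethod
    def from_csv(cls, csv: str) -> "Gonzo":
        # from_params inspects this classmethod's signature and calls it.
        return cls(csv.split(","))

gonzo = Muppet.from_params(Params({"type": "from_csv", "csv": "kermit,piggy"}))
assert gonzo.names == ["kermit", "piggy"]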
Example #32
class TestMakeVocabFromParams(AllenNlpTestCase):
    @pytest.mark.parametrize(
        "params",
        [
            Params({
                "dataset_reader": {
                    "type": "train-util-test-reader"
                },
                "train_data_path": "path-to-training-file",
                "validation_data_path": "path-to-validation-file",
                "test_data_path": "path-to-validation-file",
                "datasets_for_vocab_creation": [],
                "data_loader": {
                    "batch_size": 2
                },
            }),
            Params({
                "dataset_reader": {
                    "type": "train-util-test-reader"
                },
                "train_data_path": "path-to-training-file",
                "datasets_for_vocab_creation": [],
                "data_loader": {
                    "batch_size": 2
                },
            }),
            Params({
                "dataset_reader": {
                    "type": "train-util-test-reader"
                },
                "train_data_path": "path-to-training-file",
                "validation_data_path": "path-to-validation-file",
                "test_data_path": "path-to-validation-file",
                "vocabulary": {
                    "type": "empty"
                },
                "data_loader": {
                    "batch_size": 2
                },
            }),
        ],
    )
    def test_no_instances_read_for_vocab(self, caplog, params):
        _ = make_vocab_from_params(params, str(self.TEST_DIR))
        log_messages = "\n".join([rec.message for rec in caplog.records])
        assert "...train-util-test-reader reading from" not in log_messages
        assert "Reading training data" not in log_messages
        assert "Reading validation data" not in log_messages
        assert "Reading test data" not in log_messages

    def test_only_train_read_for_vocab(self, caplog):
        params = Params({
            "dataset_reader": {
                "type": "train-util-test-reader"
            },
            "train_data_path": "path-to-training-file",
            "data_loader": {
                "batch_size": 2
            },
        })
        _ = make_vocab_from_params(params, str(self.TEST_DIR))
        log_messages = "\n".join([rec.message for rec in caplog.records])
        assert "...train-util-test-reader reading from path-to-training-file" in log_messages
        assert "...train-util-test-reader reading from path-to-validation-file" not in log_messages
        assert "...train-util-test-reader reading from path-to-test-file" not in log_messages
        assert "Reading training data" in log_messages
        assert "Reading validation data" not in log_messages
        assert "Reading test data" not in log_messages

    def test_all_datasets_read_for_vocab(self, caplog):
        params = Params({
            "dataset_reader": {
                "type": "train-util-test-reader"
            },
            "train_data_path": "path-to-training-file",
            "validation_data_path": "path-to-validation-file",
            "test_data_path": "path-to-test-file",
            "data_loader": {
                "batch_size": 2
            },
        })
        _ = make_vocab_from_params(params, str(self.TEST_DIR))
        log_messages = "\n".join([rec.message for rec in caplog.records])
        assert "...train-util-test-reader reading from path-to-training-file" in log_messages
        assert "...train-util-test-reader reading from path-to-validation-file" in log_messages
        assert "...train-util-test-reader reading from path-to-test-file" in log_messages
        assert "Reading training data" in log_messages
        assert "Reading validation data" in log_messages
        assert "Reading test data" in log_messages

    def test_only_specified_datasets_read_for_vocab(self, caplog):
        params = Params({
            "dataset_reader": {
                "type": "train-util-test-reader"
            },
            "train_data_path": "path-to-training-file",
            "validation_data_path": "path-to-validation-file",
            "test_data_path": "path-to-test-file",
            "datasets_for_vocab_creation": ["train", "validation"],
            "data_loader": {
                "batch_size": 2
            },
        })
        _ = make_vocab_from_params(params, str(self.TEST_DIR))
        log_messages = "\n".join([rec.message for rec in caplog.records])
        assert "...train-util-test-reader reading from path-to-training-file" in log_messages
        assert "...train-util-test-reader reading from path-to-validation-file" in log_messages
        assert "...train-util-test-reader reading from path-to-test-file" not in log_messages
        assert "Reading training data" in log_messages
        assert "Reading validation data" in log_messages
        assert "Reading test data" not in log_messages

    def test_using_seperate_validation_reader(self, caplog):
        params = Params({
            "dataset_reader": {
                "type": "train-util-test-reader"
            },
            "validation_dataset_reader": {
                "type": "train-util-test-reader"
            },
            "train_data_path": "path-to-training-file",
            "validation_data_path": "path-to-validation-file",
            "data_loader": {
                "batch_size": 2
            },
        })
        _ = make_vocab_from_params(params, str(self.TEST_DIR))
        log_messages = "\n".join([rec.message for rec in caplog.records])
        assert "Using a separate dataset reader to load validation and test data" in log_messages

    def test_invalid_datasets_for_vocab_creation(self):
        params = Params({
            "dataset_reader": {
                "type": "train-util-test-reader"
            },
            "train_data_path":
            "path-to-training-file",
            "validation_data_path":
            "path-to-validation-file",
            "datasets_for_vocab_creation": ["train", "validation", "test"],
            "data_loader": {
                "batch_size": 2
            },
        })
        with pytest.raises(ConfigurationError,
                           match="invalid 'datasets_for_vocab_creation' test"):
            make_vocab_from_params(params, str(self.TEST_DIR))

    def test_raise_error_if_directory_non_empty(self):
        params = Params({
            "dataset_reader": {
                "type": "train-util-test-reader"
            },
            "train_data_path": "path-to-training-file",
            "validation_data_path": "path-to-validation-file",
            "data_loader": {
                "batch_size": 2
            },
        })
        os.makedirs(self.TEST_DIR / "vocabulary")
        with open(self.TEST_DIR / "vocabulary" / "blah", "w") as random_file:
            random_file.write("BLAH!")
        with pytest.raises(ConfigurationError,
                           match="The 'vocabulary' directory in the provided"):
            make_vocab_from_params(params, str(self.TEST_DIR))

    def test_get_metrics(self):
        class FakeModel(Model):
            def forward(self, **kwargs):
                return {}

        model = FakeModel(None)
        total_loss = 100.0
        batch_loss = 10.0
        num_batches = 2
        metrics = get_metrics(model, total_loss, None, batch_loss, None,
                              num_batches)

        assert metrics["loss"] == float(total_loss / num_batches)
        assert metrics["batch_loss"] == batch_loss

        metrics = get_metrics(model, total_loss, None, None, None, num_batches)

        assert metrics["loss"] == float(total_loss / num_batches)
        assert "batch_loss" not in metrics

    def test_exception_serialization(self):
        e = ConfigurationError("example")
        assert {"message": "example"} == vars(pickle.loads(pickle.dumps(e)))
Example #33
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)
        output = {"tag_logits": tag_logits}

        if labels is not None:
            self.accuracy(tag_logits, labels, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}


# In practice you'd probably do this from the command line:
#   $ allennlp train tutorials/tagger/experiment.jsonnet -s /tmp/serialization_dir --include-package tutorials.tagger.config_allennlp
#
if __name__ == "__main__":
    params = Params.from_file('./character_experiment.jsonnet')
    serialization_dir = tempfile.mkdtemp()
    model = train_model(params, serialization_dir)

    # Make predictions
    predictor = SentenceTaggerPredictor(model, dataset_reader=PosDatasetReader())
    tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
    print(tag_logits)
    tag_ids = np.argmax(tag_logits, axis=-1)
    print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

    shutil.rmtree(serialization_dir)
Example #34
    def from_params(cls: Type[T], params: Params, **extras) -> T:
        """
        This is the automatic implementation of `from_params`. Any class that subclasses `FromParams`
        (or `Registrable`, which itself subclasses `FromParams`) gets this implementation for free.
        If you want your class to be instantiated from params in the "obvious" way -- pop off parameters
        and hand them to your constructor with the same names -- this provides that functionality.

        If you need more complex logic in your `from_params` method, you'll have to implement
        your own method that overrides this one.
        """
        # pylint: disable=protected-access
        from allennlp.common.registrable import Registrable  # import here to avoid circular imports

        logger.info(
            f"instantiating class {cls} from params {getattr(params, 'params', params)} "
            f"and extras {set(extras.keys())}")

        if params is None:
            return None

        if isinstance(params, str):
            params = Params({"type": params})

        registered_subclasses = Registrable._registry.get(cls)

        if registered_subclasses is not None:
            # We know ``cls`` inherits from Registrable, so we'll use a cast to make mypy happy.
            # We have to use a disable to make pylint happy.
            # pylint: disable=no-member
            as_registrable = cast(Type[Registrable], cls)
            default_to_first_choice = as_registrable.default_implementation is not None
            choice = params.pop_choice(
                "type",
                choices=as_registrable.list_available(),
                default_to_first_choice=default_to_first_choice)
            subclass = registered_subclasses[choice]

            # We want to call subclass.from_params. It's possible that it's just the "free"
            # implementation here, in which case it accepts `**extras` and we are not able
            # to make any assumptions about what extra parameters it needs.
            #
            # It's also possible that it has a custom `from_params` method. In that case it
            # won't accept any **extra parameters and we'll need to filter them out.
            if not takes_arg(subclass.from_params, 'extras'):
                # Necessarily subclass.from_params is a custom implementation, so we need to
                # pass it only the args it's expecting.
                extras = {
                    k: v
                    for k, v in extras.items()
                    if takes_arg(subclass.from_params, k)
                }

            return subclass.from_params(params=params, **extras)
        else:
            # This is not a base class, so convert our params and extras into a dict of kwargs.

            if cls.__init__ == object.__init__:
                # This class does not have an explicit constructor, so don't give it any kwargs.
                # Without this logic, create_kwargs will look at object.__init__ and see that
                # it takes *args and **kwargs and look for those.
                kwargs: Dict[str, Any] = {}
            else:
                # This class has a constructor, so create kwargs for it.
                kwargs = create_kwargs(cls, params, **extras)

            return cls(**kwargs)  # type: ignore
Example #35
)
parser.add_argument(
    '-folder',
    dest='folder',
    help='folder location',
    type=str,
)
parser.add_argument(
    '-no_tqdm',
    dest='no_tqdm',
    action='store_true',
)

args = parser.parse_args()

params = Params.from_file(args.params)
train, val, test = VCR.splits(
    mode='rationale' if args.rationale else 'answer',
    embs_to_load=params['dataset_reader'].get('embs', 'bert_da'),
    only_use_relevant_dets=params['dataset_reader'].get(
        'only_use_relevant_dets', True))
NUM_GPUS = torch.cuda.device_count()
NUM_CPUS = multiprocessing.cpu_count()
if NUM_GPUS == 0:
    raise ValueError("you need gpus!")


def _to_gpu(td):
    if NUM_GPUS > 1:
        return td
    for k in td:
Example #36
 def from_params(cls, params: Params) -> "SpanExtractor":
     choice = params.pop_choice('type', cls.list_available())
     return cls.by_name(choice).from_params(params)
Example #37
    def test_from_params_valid_vocab_extension_thoroughly(self):
        """
        Tests valid vocab extension thoroughly: vocab extension is valid when
        overlapping namespaces have the same padding behaviour (padded/non-padded).
        Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0     padded
            tokens1     non-padded
            tokens2     padded
            tokens3     non-padded
        instances namespaces
            tokens0     padded
            tokens1     non-padded
            tokens4     padded
            tokens5     non-padded
        Typical extension example (of the tokens1 namespace):
        -> original_vocab index2token
           apple          #0->apple
           bat            #1->bat
           cat            #2->cat
        -> Tokens to extend with: cat, an, apple, banana, atom, bat
        -> extended_vocab: index2token
           apple           #0->apple
           bat             #1->bat
           cat             #2->cat
           an              #3->an
           atom            #4->atom
           banana          #5->banana
        """

        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(
            non_padded_namespaces=["tokens1", "tokens3"])
        original_vocab.add_token_to_namespace("apple",
                                              namespace="tokens0")  # index:2
        original_vocab.add_token_to_namespace("bat",
                                              namespace="tokens0")  # index:3
        original_vocab.add_token_to_namespace("cat",
                                              namespace="tokens0")  # index:4

        original_vocab.add_token_to_namespace("apple",
                                              namespace="tokens1")  # index:0
        original_vocab.add_token_to_namespace("bat",
                                              namespace="tokens1")  # index:1
        original_vocab.add_token_to_namespace("cat",
                                              namespace="tokens1")  # index:2

        original_vocab.add_token_to_namespace("a",
                                              namespace="tokens2")  # index:0
        original_vocab.add_token_to_namespace("b",
                                              namespace="tokens2")  # index:1
        original_vocab.add_token_to_namespace("c",
                                              namespace="tokens2")  # index:2

        original_vocab.add_token_to_namespace("p",
                                              namespace="tokens3")  # index:0
        original_vocab.add_token_to_namespace("q",
                                              namespace="tokens3")  # index:1

        original_vocab.save_to_files(vocab_dir)

        text_field0 = TextField(
            [
                Token(t)
                for t in ["cat", "an", "apple", "banana", "atom", "bat"]
            ],
            {"tokens0": SingleIdTokenIndexer("tokens0")},
        )
        text_field1 = TextField(
            [
                Token(t)
                for t in ["cat", "an", "apple", "banana", "atom", "bat"]
            ],
            {"tokens1": SingleIdTokenIndexer("tokens1")},
        )
        text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                                {"tokens4": SingleIdTokenIndexer("tokens4")})
        text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                                {"tokens5": SingleIdTokenIndexer("tokens5")})
        instances = Batch([
            Instance({
                "text0": text_field0,
                "text1": text_field1,
                "text4": text_field4,
                "text5": text_field5,
            })
        ])

        params = Params({
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": ["tokens1", "tokens5"],
        })
        extended_vocab = Vocabulary.from_params(params, instances=instances)

        # Namespaces tokens0 and tokens1 are common to both; tokens2 and tokens3
        # appear only in the original vocab; tokens4 and tokens5 only in the instances.
        extended_namespaces = {*extended_vocab._token_to_index}
        assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

        # Check that _non_padded_namespaces list is consistent after extension
        assert extended_vocab._non_padded_namespaces == {
            "tokens1", "tokens3", "tokens5"
        }

        # original_vocab["tokens1"] has 3 tokens; the instances' "tokens1" namespace has 6 tokens, 3 overlapping
        assert extended_vocab.get_vocab_size("tokens1") == 6
        assert extended_vocab.get_vocab_size(
            "tokens0") == 8  # 2 extra (padding + OOV) because this namespace is padded

        # namespaces tokens2 and tokens3 were only in original_vocab,
        # so their token counts should be unchanged in extended_vocab
        assert extended_vocab.get_vocab_size(
            "tokens2") == original_vocab.get_vocab_size("tokens2")
        assert extended_vocab.get_vocab_size(
            "tokens3") == original_vocab.get_vocab_size("tokens3")

        # namespaces tokens4 and tokens5 were only in the instances,
        # so their token counts come entirely from the instance tokens
        assert extended_vocab.get_vocab_size(
            "tokens4") == 6  # l,m,n,o + oov + padding
        assert extended_vocab.get_vocab_size("tokens5") == 3  # x,y,z

        # Word2index mapping of all words in all namespaces of original_vocab
        # should be maintained in extended_vocab
        for namespace, token2index in original_vocab._token_to_index.items():
            for token, _ in token2index.items():
                vocab_index = original_vocab.get_token_index(token, namespace)
                extended_vocab_index = extended_vocab.get_token_index(
                    token, namespace)
                assert vocab_index == extended_vocab_index
        # And same for Index2Word mapping
        for namespace, index2token in original_vocab._index_to_token.items():
            for index, _ in index2token.items():
                vocab_token = original_vocab.get_token_from_index(
                    index, namespace)
                extended_vocab_token = extended_vocab.get_token_from_index(
                    index, namespace)
                assert vocab_token == extended_vocab_token
Example #38
def main(param_file: str, args: argparse.Namespace):
    commit = subprocess.check_output(["git", "rev-parse", "HEAD"],
                                     universal_newlines=True).strip()
    image = f"allennlp/sparc_rc:{commit}"
    overrides = ""

    # Reads params and sets environment.
    params = Params.from_file(param_file, overrides)
    flat_params = params.as_flat_dict()
    env = {}
    for k, v in flat_params.items():
        k = str(k).replace('.', '_')
        env[k] = str(v)

    # If the git repository is dirty, add a random hash.
    result = subprocess.run('git diff-index --quiet HEAD --', shell=True)
    if result.returncode != 0:
        dirty_hash = "%x" % random_int
        image += "-" + dirty_hash

    if args.blueprint:
        blueprint = args.blueprint
        print(f"Using the specified blueprint: {blueprint}")
    else:
        print(f"Building the Docker image ({image})...")
        subprocess.run(f'docker build -t {image} .', shell=True, check=True)

        print(f"Create a Beaker blueprint...")
        blueprint = subprocess.check_output(
            f'beaker blueprint create --quiet {image}',
            shell=True,
            universal_newlines=True).strip()
        print(f"  Blueprint created: {blueprint}")

    config_dataset_id = subprocess.check_output(
        f'beaker dataset create --quiet {param_file}',
        shell=True,
        universal_newlines=True).strip()

    allennlp_command = [
        "python", "-m", "allennlp.run", "train", "/config.json", "-s",
        "/output", "--file-friendly-logging", "--include-package",
        "reading_comprehension"
    ]

    dataset_mounts = []
    for source in args.source + [f"{config_dataset_id}:/config.json"]:
        datasetId, containerPath = source.split(":")
        dataset_mounts.append({
            "datasetId": datasetId,
            "containerPath": containerPath
        })

    for var in args.env:
        key, value = var.split("=")
        env[key] = value

    requirements = {}
    if args.cpu:
        requirements["cpu"] = float(args.cpu)
    if args.memory:
        requirements["memory"] = args.memory
    if args.gpu_count:
        requirements["gpuCount"] = int(args.gpu_count)
    config_spec = {
        "description": args.desc,
        "blueprint": blueprint,
        "resultPath": "/output",
        "args": allennlp_command,
        "datasetMounts": dataset_mounts,
        "requirements": requirements,
        "env": env
    }
    config_task = {"spec": config_spec, "name": "training"}

    config = {"tasks": [config_task]}

    output_path = args.spec_output_path if args.spec_output_path else tempfile.mkstemp(
        ".yaml", "beaker-config-")[1]
    with open(output_path, "w") as output:
        output.write(json.dumps(config, indent=4))
    print(f"Beaker spec written to {output_path}.")

    experiment_command = [
        "beaker", "experiment", "create", "--file", output_path
    ]
    if args.name:
        experiment_command.append("--name")
        experiment_command.append(args.name.replace(" ", "-"))

    if args.dry_run:
        print("This is a dry run (--dry-run).  Launch your job with the following command:")
        print("    " + " ".join(experiment_command))
    else:
        print("Running the experiment:")
        print("    " + " ".join(experiment_command))
        subprocess.run(experiment_command)
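
# Illustrative only: a minimal argument parser that would satisfy the attributes
# main() reads above. The flag names below are assumptions inferred from the code
# (args.blueprint, args.source, args.env, ...), not necessarily the original CLI.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Launch an AllenNLP training run on Beaker.")
    parser.add_argument("param_file", type=str, help="The model configuration file.")
    parser.add_argument("--name", type=str, help="A name for the experiment.")
    parser.add_argument("--desc", type=str, default="", help="A description for the experiment.")
    parser.add_argument("--blueprint", type=str, help="Reuse an existing Beaker blueprint instead of building one.")
    parser.add_argument("--source", action="append", default=[], help="DATASET_ID:CONTAINER_PATH dataset mounts.")
    parser.add_argument("--env", action="append", default=[], help="Extra KEY=VALUE environment variables.")
    parser.add_argument("--cpu", help="CPUs to reserve for this experiment (e.g., 0.5).")
    parser.add_argument("--memory", help="Memory to reserve for this experiment (e.g., 1GB).")
    parser.add_argument("--gpu-count", default=1, help="GPUs to reserve for this experiment.")
    parser.add_argument("--spec-output-path", help="Where to write the generated Beaker spec.")
    parser.add_argument("--dry-run", action="store_true", help="Write the spec but do not submit it.")
    args = parser.parse_args()
    main(args.param_file, args)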
Example #39
0
def load_archive(
    archive_file: str,
    cuda_device: int = -1,
    opt_level: str = None,
    overrides: str = "",
    weights_file: str = None,
) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    # Parameters

    archive_file : `str`
        The archive file to load the model from.
    cuda_device : `int`, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    opt_level : `str`, optional, (default = `None`)
        Each `opt_level` establishes a set of properties that govern Amp’s implementation of pure or mixed
        precision training. Must be a choice of `"O0"`, `"O1"`, `"O2"`, or `"O3"`.
        See the Apex [documentation](https://nvidia.github.io/apex/amp.html#opt-levels-and-properties) for
        more details. If `None`, defaults to the `opt_level` found in the model params. If `cuda_device==-1`,
        Amp is not used and this argument is ignored.
    overrides : `str`, optional (default = "")
        JSON overrides to apply to the unarchived `Params` object.
    weights_file : `str`, optional (default = None)
        The weights file to use.  If unspecified, weights.th in the archive_file will be used.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(
            f"loading archive file {archive_file} from cache at {resolved_archive_file}"
        )

    if os.path.isdir(resolved_archive_file):
        serialization_dir = resolved_archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info(
            f"extracting archive file {resolved_archive_file} to temp dir {tempdir}"
        )
        with tarfile.open(resolved_archive_file, "r:gz") as archive:
            archive.extractall(tempdir)
        # Postpone cleanup until exit in case the unarchived contents are needed outside
        # this function.
        atexit.register(_cleanup_archive_dir, tempdir)

        serialization_dir = tempdir

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME),
                              overrides)

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)
        # Fallback for serialization directories.
        if not os.path.exists(weights_path):
            weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(
        config.duplicate(),
        weights_file=weights_path,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        opt_level=opt_level,
    )

    return Archive(model=model, config=config)
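
# Usage sketch (hedged): "model.tar.gz" below is a placeholder for an existing
# trained archive; it is not produced by the function above.
archive = load_archive("model.tar.gz", cuda_device=-1)
model = archive.model    # the instantiated, weight-loaded Model
config = archive.config  # the Params the archive was trained with
print(type(model).__name__)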
Example #40
0
 def from_params(cls, params: Params):
     return cls(**params.as_dict())
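
# Standalone sketch of the same pattern with a hypothetical class (assumes
# `Params` is imported from allennlp.common): each key in the config block
# maps one-to-one onto a constructor keyword argument.
class WindowConfig:
    def __init__(self, size: int = 3, stride: int = 1) -> None:
        self.size = size
        self.stride = stride

    @classmethod
    def from_params(cls, params: Params) -> "WindowConfig":
        return cls(**params.as_dict())

window = WindowConfig.from_params(Params({"size": 5, "stride": 2}))
assert (window.size, window.stride) == (5, 2)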
Example #41
0
def load_archive(
    archive_file: Union[str, PathLike],
    cuda_device: int = -1,
    overrides: Union[str, Dict[str, Any]] = "",
    weights_file: str = None,
) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    # Parameters

    archive_file : `Union[str, PathLike]`
        The archive file to load the model from.
    cuda_device : `int`, optional (default = `-1`)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides : `Union[str, Dict[str, Any]]`, optional (default = `""`)
        JSON overrides to apply to the unarchived `Params` object.
    weights_file : `str`, optional (default = `None`)
        The weights file to use.  If unspecified, weights.th in the archive_file will be used.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(
            f"loading archive file {archive_file} from cache at {resolved_archive_file}"
        )

    meta: Optional[Meta] = None

    tempdir = None
    try:
        if os.path.isdir(resolved_archive_file):
            serialization_dir = resolved_archive_file
        else:
            with extracted_archive(resolved_archive_file,
                                   cleanup=False) as tempdir:
                serialization_dir = tempdir

        if weights_file:
            weights_path = weights_file
        else:
            weights_path = get_weights_path(serialization_dir)

        # Load config
        config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME),
                                  overrides)

        # Instantiate model and dataset readers. Use a duplicate of the config, as it will get consumed.
        dataset_reader, validation_dataset_reader = _load_dataset_readers(
            config.duplicate(), serialization_dir)
        model = _load_model(config.duplicate(), weights_path,
                            serialization_dir, cuda_device)

        # Load meta.
        meta_path = os.path.join(serialization_dir, META_NAME)
        if os.path.exists(meta_path):
            meta = Meta.from_path(meta_path)
    finally:
        if tempdir is not None:
            logger.info(
                f"removing temporary unarchived model dir at {tempdir}")
            shutil.rmtree(tempdir, ignore_errors=True)

    # Check version compatibility.
    if meta is not None:
        _check_version_compatibility(archive_file, meta)

    return Archive(
        model=model,
        config=config,
        dataset_reader=dataset_reader,
        validation_dataset_reader=validation_dataset_reader,
        meta=meta,
    )
Example #42
0
        mask = get_text_field_mask(sentence)
        embeddings = self.word_embeddings(sentence)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)
        output = {"tag_logits": tag_logits}

        if labels is not None:
            self.accuracy(tag_logits, labels, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}

# In practice you'd probably do this from the command line:
#   $ allennlp train tutorials/tagger/experiment.jsonnet -s /tmp/serialization_dir --include-package tutorials.tagger.config_allennlp
#
if __name__ == "__main__":
    params = Params.from_file('./config.jsonnet')
    serialization_dir = tempfile.mkdtemp()
    model = train_model(params, serialization_dir)

    # Make predictions
    predictor = SentenceTaggerPredictor(model, dataset_reader=PosDatasetReader())
    tag_logits = predictor.predict("骑着 狗  出去 逛 街")['tag_logits']
    print(tag_logits)
    tag_ids = np.argmax(tag_logits, axis=-1)
    print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

    shutil.rmtree(serialization_dir)
Example #43
0

if __name__ == "__main__":
    reader = UDDatasetReader()
    train_dataset = reader.read('data/UD_English-EWT/en_ewt-ud-train.conllu')
    validation_dataset = reader.read(
        'data/UD_English-EWT/en_ewt-ud-dev.conllu')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    EMBEDDING_DIM = 100
    HIDDEN_DIM = 200

    model_params = Params({
        'type': 'lstm',
        'input_size': EMBEDDING_DIM,
        'hidden_size': HIDDEN_DIM
    })

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embedding = BasicTextFieldEmbedder({'tokens': token_embedding})
    lstm = Seq2SeqEncoder.from_params(model_params)

    model = POSTagger(word_embedding, lstm, vocab)

    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[('sentence', 'num_tokens')])
    iterator.index_with(vocab)
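
    # Hedged continuation: the snippet above stops after building the iterator.
    # In the AllenNLP 0.x API it appears to target, training would typically
    # proceed with a Trainer roughly like this (epoch/patience values are
    # illustrative, not taken from the original).
    from allennlp.training.trainer import Trainer

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=40)
    trainer.train()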
Example #44
0
def create_serialization_dir(
        params: Params,
        serialization_dir: str,
        recover: bool,
        force: bool) -> None:
    """
    This function creates the serialization directory if it doesn't exist.  If it already exists
    and is non-empty, then it verifies that we're recovering from a training with an identical configuration.

    Parameters
    ----------
    params: ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: ``str``
        The directory in which to save results and logs.
    recover: ``bool``
        If ``True``, we will try to recover from an existing serialization directory, and crash if
        the directory doesn't exist, or doesn't match the configuration we're given.
    force: ``bool``
        If ``True``, we will overwrite the serialization directory if it already exists.
    """
    if recover and force:
        raise ConfigurationError("Illegal arguments: both force and recover are true.")

    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        if not recover:
            raise ConfigurationError(f"Serialization directory ({serialization_dir}) already exists and is "
                                     f"not empty. Specify --recover to recover training from existing output.")

        logger.info(f"Recovering from prior training at {serialization_dir}.")

        recovered_config_file = os.path.join(serialization_dir, CONFIG_NAME)
        if not os.path.exists(recovered_config_file):
            raise ConfigurationError("The serialization directory already exists but doesn't "
                                     "contain a config.json. You probably gave the wrong directory.")
        else:
            loaded_params = Params.from_file(recovered_config_file)

            # Check whether any of the training configuration differs from the configuration we are
            # resuming.  If so, warn the user that training may fail.
            fail = False
            flat_params = params.as_flat_dict()
            flat_loaded = loaded_params.as_flat_dict()

            # Exclude some keys from being checked as matching config
            no_check_keys = ['trainer.cuda_device', 'train_data_path', 'validation_data_path', 'test_data_path']  # TODO: make this list configurable (e.g. via overrides) instead of hard-coding it
            for key in no_check_keys:
                flat_params.pop(key, None)
                flat_loaded.pop(key, None)

            for key in flat_params.keys() - flat_loaded.keys():
                logger.error(f"Key '{key}' found in training configuration but not in the serialization "
                             f"directory we're recovering from.")
                fail = True
            for key in flat_loaded.keys() - flat_params.keys():
                logger.error(f"Key '{key}' found in the serialization directory we're recovering from "
                             f"but not in the training config.")
                fail = True
            for key in flat_params.keys():
                if flat_params.get(key, None) != flat_loaded.get(key, None):
                    logger.error(f"Value for '{key}' in training configuration does not match that the value in "
                                 f"the serialization directory we're recovering from: "
                                 f"{flat_params[key]} != {flat_loaded[key]}")
                    fail = True
            if fail:
                raise ConfigurationError("Training configuration does not match the configuration we're "
                                         "recovering from.")
    else:
        if recover:
            raise ConfigurationError(f"--recover specified but serialization_dir ({serialization_dir}) "
                                     "does not exist.  There is nothing to recover from.")
        os.makedirs(serialization_dir, exist_ok=True)
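
# Usage sketch (hedged): a throwaway directory, just to show the force semantics;
# passing recover=True together with force=True would raise ConfigurationError.
import os
import tempfile

demo_dir = os.path.join(tempfile.mkdtemp(), "demo_run")
create_serialization_dir(Params({}), demo_dir, recover=False, force=False)  # fresh directory
create_serialization_dir(Params({}), demo_dir, recover=False, force=True)   # wipe and recreate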
Example #45
0
 def from_params(cls, vocab: Vocabulary, params: Params) -> 'Model':
     choice = params.pop_choice("type", cls.list_available())
     return cls.by_name(choice).from_params(vocab, params)
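
# Sketch of the type dispatch this relies on, using a hypothetical Registrable
# subclass (the names below are illustrative, not real AllenNLP components).
from allennlp.common import Params
from allennlp.common.registrable import Registrable


class Animal(Registrable):
    pass


@Animal.register("dog")
class Dog(Animal):
    @classmethod
    def from_params(cls, params: Params) -> "Dog":
        params.assert_empty("Dog")
        return cls()


params = Params({"type": "dog"})
choice = params.pop_choice("type", Animal.list_available())
dog = Animal.by_name(choice).from_params(params)
assert isinstance(dog, Dog)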
Example #46
0
 def test_replace_none(self):
     params = Params({"a": "None", "b": [1.0, "None", 2], "c": {"d": "None"}})
     assert params["a"] is None
     assert params["b"][1] is None
     assert params["c"]["d"] is None
Example #47
0
 def test_locally_normalised_span_extractor_can_build_from_params(self):
     params = Params({"type": "self_attentive", "input_dim": 5})
     extractor = SpanExtractor.from_params(params)
     assert isinstance(extractor, SelfAttentiveSpanExtractor)
Example #48
0
 def from_params(cls, params: Params) -> 'A':
     b_params = params.pop("b")
     return cls(B.from_params(b_params))
Example #49
0
    def test_valid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / "vocab_save"
        # Test: padded/non-padded common namespaces are extending appropriately
        non_padded_namespaces_list = [[], ["tokens"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(
                non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_tokens_to_namespace(["d", "a", "b"],
                                                   namespace="tokens")
            text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                                   {"tokens": SingleIdTokenIndexer("tokens")})
            vocab_dir = self.TEST_DIR / "vocab_save"
            shutil.rmtree(vocab_dir, ignore_errors=True)
            original_vocab.save_to_files(vocab_dir)
            instances = Batch([Instance({"text": text_field})])
            params = Params({
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": non_padded_namespaces,
            })
            extended_vocab = Vocabulary.from_params(params,
                                                    instances=instances)

            extra_count = 2 if extended_vocab.is_padded("tokens") else 0
            assert extended_vocab.get_token_index("d",
                                                  "tokens") == 0 + extra_count
            assert extended_vocab.get_token_index("a",
                                                  "tokens") == 1 + extra_count
            assert extended_vocab.get_token_index("b",
                                                  "tokens") == 2 + extra_count

            assert extended_vocab.get_token_index(
                "c", "tokens")  # should be present
            assert extended_vocab.get_token_index(
                "e", "tokens")  # should be present

            assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

        # Test: padded/non-padded non-common namespaces are extending appropriately
        non_padded_namespaces_list = [[], ["tokens1"], ["tokens1", "tokens2"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(
                non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_token_to_namespace(
                "a", namespace="tokens1")  # index2
            text_field = TextField(
                [Token(t) for t in ["b"]],
                {"tokens2": SingleIdTokenIndexer("tokens2")})
            instances = Batch([Instance({"text": text_field})])
            vocab_dir = self.TEST_DIR / "vocab_save"
            shutil.rmtree(vocab_dir, ignore_errors=True)
            original_vocab.save_to_files(vocab_dir)

            params = Params({
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": non_padded_namespaces,
            })
            extended_vocab = Vocabulary.from_params(params,
                                                    instances=instances)

            # Should have two namespaces
            assert len(extended_vocab._token_to_index) == 2

            extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
            assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count

            extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
            assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
Example #50
0
    def test_add_file_with_list_history_to_archive(self):
        # Creates actual files since add_file_to_archive will throw an exception
        # if the file does not exist.
        tempdir = tempfile.mkdtemp()
        my_file = os.path.join(tempdir, "my_file.txt")
        my_other_file = os.path.join(tempdir, "my_other_file.txt")
        open(my_file, 'w').close()
        open(my_other_file, 'w').close()

        # Some nested classes just to exercise the ``from_params``
        # and ``add_file_to_archive`` methods.

        class C:
            def __init__(self, c_file: str) -> None:
                self.c_file = c_file

            @classmethod
            def from_params(cls, params: Params) -> 'C':
                params.add_file_to_archive("c_file")
                c_file = params.pop("c_file")

                return cls(c_file)

        class B:
            def __init__(self, filename: str, c) -> None:
                self.filename = filename
                self.c_dict = {"here": c}

            @classmethod
            def from_params(cls, params: Params) -> 'B':
                params.add_file_to_archive("filename")

                filename = params.pop("filename")
                c_params = params.pop("c")
                c = C.from_params(c_params)

                return cls(filename, c)

        class A:
            def __init__(self, bs) -> None:
                self.bs = bs

            @classmethod
            def from_params(cls, params: Params) -> 'A':
                bs = params.pop("bs")
                return cls(bs=[B.from_params(b_params) for b_params in bs])

        params = Params({
                "a": {
                        "bs": [
                                {
                                    "filename": my_file,
                                    "c": {
                                            "c_file": my_other_file
                                    },
                                },
                            ],
                }
        })

        # Construct ``A`` from params but then just throw it away.
        A.from_params(params.pop("a"))

        assert params.files_to_archive == {
                "a.bs.0.filename": my_file,
                "a.bs.0.c.c_file": my_other_file
        }
Example #51
0
 def from_params(cls, params: Params, vocab: Optional[Vocabulary] = None):
     metric_type = params.pop_choice("type", cls.list_available())
     if vocab:
         params["vocabulary"] = vocab
     return cls.by_name(metric_type)(**params.as_dict())  # type: ignore
Example #52
0
 def test_bad_unicode_environment_variables(self):
     filename = self.FIXTURES_ROOT / 'bidaf' / 'experiment.json'
     os.environ['BAD_ENVIRONMENT_VARIABLE'] = "\udce2"
     Params.from_file(filename)
     del os.environ['BAD_ENVIRONMENT_VARIABLE']
Example #53
0
def create_kwargs(cls: Type[T], params: Params, **extras) -> Dict[str, Any]:
    """
    Given some class, a `Params` object, and potentially other keyword arguments,
    create a dict of keyword args suitable for passing to the class's constructor.

    The function does this by finding the class's constructor, matching the constructor
    arguments to entries in the `params` object, and instantiating values for the parameters
    using the type annotation and possibly a from_params method.

    Any values that are provided in the `extras` will just be used as is.
    For instance, you might provide an existing `Vocabulary` this way.
    """
    # Get the signature of the constructor.
    from allennlp.models.archival import load_archive  # import here to avoid circular imports

    signature = inspect.signature(cls.__init__)
    kwargs: Dict[str, Any] = {}

    # Iterate over all the constructor parameters and their annotations.
    for name, param in signature.parameters.items():
        # Skip "self". You're not *required* to call the first parameter "self",
        # so in theory this logic is fragile, but if you don't call the self parameter
        # "self" you kind of deserve what happens.
        if name == "self":
            continue

        # If the annotation is a compound type like typing.Dict[str, int],
        # it will have an __origin__ field indicating `typing.Dict`
        # and an __args__ field indicating `(str, int)`. We capture both.
        annotation = remove_optional(param.annotation)
        origin = getattr(annotation, '__origin__', None)
        args = getattr(annotation, '__args__', [])

        # The parameter is optional if its default value is not the "no default" sentinel.
        default = param.default
        optional = default != _NO_DEFAULT

        # Some constructors expect extra non-parameter items, e.g. vocab: Vocabulary.
        # We check the provided `extras` for these and just use them if they exist.
        if name in extras:
            kwargs[name] = extras[name]
        # Next case is when argument should be loaded from pretrained archive.
        elif name in params and isinstance(
                params.get(name),
                Params) and "_pretrained" in params.get(name):
            load_module_params = params.pop(name).pop("_pretrained")
            archive_file = load_module_params.pop("archive_file")
            module_path = load_module_params.pop("module_path")
            freeze = load_module_params.pop("freeze", True)
            archive = load_archive(archive_file)
            kwargs[name] = archive.extract_module(module_path, freeze)  # pylint: disable=no-member
            if not isinstance(kwargs[name], annotation):
                raise ConfigurationError(
                    f"The module from model at {archive_file} at path {module_path} "
                    f"was expected of type {annotation} but is of type {type(kwargs[name])}"
                )
        # The next case is when the parameter type is itself constructible from_params.
        elif hasattr(annotation, 'from_params'):
            if name in params:
                # Our params have an entry for this, so we use that.
                subparams = params.pop(name)

                if takes_arg(annotation.from_params, 'extras'):
                    # If annotation.from_params accepts **extras, we need to pass them all along.
                    # For example, `BasicTextFieldEmbedder.from_params` requires a Vocabulary
                    # object, but `TextFieldEmbedder.from_params` does not.
                    subextras = extras
                else:
                    # Otherwise, only supply the ones that are actual args; any additional ones
                    # will cause a TypeError.
                    subextras = {
                        k: v
                        for k, v in extras.items()
                        if takes_arg(annotation.from_params, k)
                    }

                # In some cases we allow a string instead of a param dict, so
                # we need to handle that case separately.
                if isinstance(subparams, str):
                    kwargs[name] = annotation.by_name(subparams)()
                else:
                    kwargs[name] = annotation.from_params(params=subparams,
                                                          **subextras)
            elif not optional:
                # Not optional and not supplied, that's an error!
                raise ConfigurationError(
                    f"expected key {name} for {cls.__name__}")
            else:
                kwargs[name] = default

        # If the parameter type is a Python primitive, just pop it off
        # using the correct casting pop_xyz operation.
        elif annotation == str:
            kwargs[name] = (params.pop(name, default)
                            if optional else params.pop(name))
        elif annotation == int:
            kwargs[name] = (params.pop_int(name, default)
                            if optional else params.pop_int(name))
        elif annotation == bool:
            kwargs[name] = (params.pop_bool(name, default)
                            if optional else params.pop_bool(name))
        elif annotation == float:
            kwargs[name] = (params.pop_float(name, default)
                            if optional else params.pop_float(name))

        # This is special logic for handling types like Dict[str, TokenIndexer],
        # List[TokenIndexer], Tuple[TokenIndexer, Tokenizer], and Set[TokenIndexer],
        # which it creates by instantiating each value from_params and returning the resulting structure.
        elif origin in (Dict, dict) and len(args) == 2 and hasattr(
                args[-1], 'from_params'):
            value_cls = annotation.__args__[-1]

            value_dict = {}

            for key, value_params in params.pop(name, Params({})).items():
                value_dict[key] = value_cls.from_params(params=value_params,
                                                        **extras)

            kwargs[name] = value_dict

        elif origin in (List, list) and len(args) == 1 and hasattr(
                args[0], 'from_params'):
            value_cls = annotation.__args__[0]

            value_list = []

            for value_params in params.pop(name, Params({})):
                value_list.append(
                    value_cls.from_params(params=value_params, **extras))

            kwargs[name] = value_list

        elif origin in (Tuple, tuple) and all(
                hasattr(arg, 'from_params') for arg in args):
            value_list = []

            for value_cls, value_params in zip(annotation.__args__,
                                               params.pop(name, Params({}))):
                value_list.append(
                    value_cls.from_params(params=value_params, **extras))

            kwargs[name] = tuple(value_list)

        elif origin in (Set, set) and len(args) == 1 and hasattr(
                args[0], 'from_params'):
            value_cls = annotation.__args__[0]

            value_set = set()

            for value_params in params.pop(name, Params({})):
                value_set.add(
                    value_cls.from_params(params=value_params, **extras))

            kwargs[name] = value_set

        else:
            # Pass it on as is and hope for the best.   ¯\_(ツ)_/¯
            if optional:
                kwargs[name] = params.pop(name, default)
            else:
                kwargs[name] = params.pop(name)

    params.assert_empty(cls.__name__)
    return kwargs
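
# Sketch with a hypothetical class: create_kwargs matches config keys to constructor
# arguments via their type annotations. In AllenNLP this is normally driven by
# FromParams.from_params rather than called directly.
class SchedulerConfig:
    def __init__(self, warmup_steps: int, decay: float = 0.5) -> None:
        self.warmup_steps = warmup_steps
        self.decay = decay


kwargs = create_kwargs(SchedulerConfig, Params({"warmup_steps": 1000}))
assert kwargs == {"warmup_steps": 1000, "decay": 0.5}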
Example #54
0
            def from_params(cls, params: Params) -> 'C':
                params.add_file_to_archive("c_file")
                c_file = params.pop("c_file")

                return cls(c_file)
Example #55
0
def run(
    config: str,
    name: str,
    allennlp_version: str,
    models_version: str,
    packages: str,
    gpus: int,
    workspace: str,
    user: str,
    include: Tuple[Tuple[str, str], ...],
    verbose: int,
    dry_run: bool,
    cluster: str,
):
    # We create a temp directory to use as the context for the Docker build and
    # to hold a temporary Beaker config file.
    with TemporaryDirectory() as context_dir:
        # Write the training config to the context directory.
        training_config_path = os.path.join(context_dir, "config.jsonnet")
        params = Params.from_file(config)
        params.to_file(training_config_path)

        # Create a unique tag to use.
        image_id = str(uuid.uuid4())

        local_image_name = f"allennlp-beaker-{name}:{image_id}"
        beaker_image_name = f"allennlp-beaker-{name}-{image_id}"

        if models_version:
            packages = models_version + " " + packages
        packages = packages.strip()

        # Write the Dockerfile to the context directory.
        dockerfile_path = os.path.join(context_dir, "Dockerfile")
        with open(dockerfile_path, "w") as dockerfile:
            dockerfile.write(DOCKERFILE)
            if packages:
                dockerfile.write(DOCKERFILE_EXTRA_STEPS)

        # Write the beaker config to the context directory.
        beaker_config_path = os.path.join(context_dir, "config.yml")
        with open(beaker_config_path, "w") as beaker_config:
            beaker_config.write(
                yaml.dump(
                    create_beaker_config(
                        name=name,
                        image=user + "/" + beaker_image_name,
                        gpus=gpus,
                        description=f"{allennlp_version} {packages}",
                        cluster=cluster,
                    )
                )
            )

        if verbose:
            click.echo("Beaker config:")
            for line in shell_out_command(["cat", beaker_config_path]):
                print(line)

        # Copy any other include files.
        if include:
            for (path, dest) in include:
                dest = os.path.join(context_dir, dest)
                click.echo(f"Copying {path} to {dest}")
                if os.path.isdir(path):
                    shutil.copytree(path, dest)
                else:
                    shutil.copy(path, dest)

        # Build the Docker image.
        click.echo(
            "Building docker image with name "
            + click.style(local_image_name, fg="green")
            + "..."
        )
        build_args = [
            "docker",
            "build",
            "--build-arg",
            f"ALLENNLP={allennlp_version}",
        ]
        if packages:
            build_args.extend(["--build-arg", f"PACKAGES={packages}"])
        build_args.extend(["-t", local_image_name, context_dir])
        if verbose:
            for line in shell_out_command(build_args):
                print(line)
        else:
            with click_spinner.spinner():
                deque(shell_out_command(build_args), maxlen=0)

        if dry_run:
            click.echo("Run the following to check the Docker image:\n")
            click.echo(
                f"  docker run --rm -it --entrypoint /bin/bash {local_image_name}"
            )
            return None

        # Publish the image to beaker.
        click.echo("Publishing image to beaker...")
        with click_spinner.spinner():
            deque(
                shell_out_command(
                    [
                        "beaker",
                        "image",
                        "create",
                        "-n",
                        beaker_image_name,
                        local_image_name,
                    ]
                ),
                maxlen=0,
            )

        # Submit the experiment to beaker.
        click.echo("Submitting experiment to beaker...")
        cmds = [
            "beaker",
            "experiment",
            "create",
            "--name",
            name,
            "-f",
            beaker_config_path,
        ]
        if workspace:
            cmds.extend(["--workspace", workspace])
        echo_command_output(cmds)
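
# Hedged sketch of how run() might be exposed as a CLI with click; the option
# names and defaults below are assumptions for illustration, not taken from the
# original script.
import click


@click.command()
@click.argument("config")
@click.option("--name", required=True)
@click.option("--allennlp-version", default="allennlp")
@click.option("--models-version", default="")
@click.option("--packages", default="")
@click.option("--gpus", type=int, default=0)
@click.option("--workspace", default="")
@click.option("--user", required=True)
@click.option("--include", multiple=True, nargs=2, type=str)
@click.option("-v", "--verbose", count=True)
@click.option("--dry-run", is_flag=True)
@click.option("--cluster", default="")
def cli(**kwargs):
    run(**kwargs)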
Example #56
0
 def from_params(cls, params: Params) -> 'A':
     bs = params.pop("bs")
     return cls(bs=[B.from_params(b_params) for b_params in bs])
Example #57
0
    def _load(
        cls,
        config: Params,
        serialization_dir: Union[str, PathLike],
        weights_file: Optional[Union[str, PathLike]] = None,
        cuda_device: int = -1,
    ) -> "Model":
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.
        """
        weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

        # Load vocabulary from file
        vocab_dir = os.path.join(serialization_dir, "vocabulary")
        # If the config specifies a vocabulary subclass, we need to use it.
        vocab_params = config.get("vocabulary", Params({}))
        vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
        vocab_class, _ = Vocabulary.resolve_class_name(vocab_choice)
        vocab = vocab_class.from_files(
            vocab_dir, vocab_params.get("padding_token"), vocab_params.get("oov_token")
        )

        model_params = config.get("model")

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings/weights from. We're now _loading_ the model, so those weights will already be
        # stored in our model. We don't need any pretrained weight file or initializers anymore,
        # and we don't want the code to look for it, so we remove it from the parameters here.
        remove_keys_from_params(model_params)
        model = Model.from_params(vocab=vocab, params=model_params)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        # If vocab+embedding extension was done, the model initialized from from_params
        # and one defined by state dict in weights_file might not have same embedding shapes.
        # Eg. when model embedder module was transferred along with vocab extension, the
        # initialized embedding weight shape would be smaller than one in the state_dict.
        # So calling model embedding extension is required before load_state_dict.
        # If vocab and model embeddings are in sync, following would be just a no-op.
        model.extend_embedder_vocab()

        # Load state dict. We pass `strict=False` so PyTorch doesn't raise a RuntimeError
        # if the state dict is missing keys because we handle this case below.
        model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
        missing_keys, unexpected_keys = model.load_state_dict(model_state, strict=False)

        # Modules might define a class variable called `authorized_missing_keys`,
        # a list of regex patterns, that tells us to ignore missing keys that match
        # any of the patterns.
        # We sometimes need this in order to load older models with newer versions of AllenNLP.

        def filter_out_authorized_missing_keys(module, prefix=""):
            nonlocal missing_keys
            for pat in getattr(module.__class__, "authorized_missing_keys", None) or []:
                missing_keys = [
                    k
                    for k in missing_keys
                    if k.startswith(prefix) and re.search(pat[len(prefix) :], k) is None
                ]
            for name, child in module._modules.items():
                if child is not None:
                    filter_out_authorized_missing_keys(child, prefix + name + ".")

        filter_out_authorized_missing_keys(model)

        if unexpected_keys or missing_keys:
            raise RuntimeError(
                f"Error loading state dict for {model.__class__.__name__}\n\t"
                f"Missing keys: {missing_keys}\n\t"
                f"Unexpected keys: {unexpected_keys}"
            )

        return model
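
# Usage sketch (hedged): callers normally go through the public Model.load(...),
# which reads the model type from the config and dispatches to _load. The paths
# below are placeholders, and `Model`/`Params` are assumed to be imported.
config = Params.from_file("/path/to/run/config.json")
model = Model.load(config, serialization_dir="/path/to/run", cuda_device=-1)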
Example #58
0
    def test_pop_choice(self):
        choices = ['my_model', 'other_model']
        params = Params({'model': 'my_model'})
        assert params.pop_choice('model', choices) == 'my_model'

        params = Params({'model': 'non_existent_model'})
        with pytest.raises(ConfigurationError):
            params.pop_choice('model', choices)

        params = Params({'model': 'module.submodule.ModelName'})
        assert params.pop_choice('model', choices) == 'module.submodule.ModelName'

        params = Params({'model': 'module.submodule.ModelName'})
        with pytest.raises(ConfigurationError):
            params.pop_choice('model', choices, allow_class_names=False)
Example #59
0
def construct_arg(
    class_name: str,
    argument_name: str,
    popped_params: Params,
    annotation: Type,
    default: Any,
    **extras,
) -> Any:
    """
    The first two parameters here are only used for logging if we encounter an error.
    """
    origin = getattr(annotation, "__origin__", None)
    args = getattr(annotation, "__args__", [])

    # The parameter is optional if its default value is not the "no default" sentinel.
    optional = default != _NO_DEFAULT

    if hasattr(annotation, "from_params"):
        if popped_params is default:
            return default
        elif popped_params is not None:
            # Our params have an entry for this, so we use that.

            subextras = create_extras(annotation, extras)

            # In some cases we allow a string instead of a param dict, so
            # we need to handle that case separately.
            if isinstance(popped_params, str):
                return annotation.by_name(popped_params)()
            else:
                if isinstance(popped_params, dict):
                    popped_params = Params(popped_params)
                return annotation.from_params(params=popped_params,
                                              **subextras)
        elif not optional:
            # Not optional and not supplied, that's an error!
            raise ConfigurationError(
                f"expected key {argument_name} for {class_name}")
        else:
            return default

    # If the parameter type is a Python primitive, just pop it off
    # using the correct casting pop_xyz operation.
    elif annotation in {int, bool}:
        if type(popped_params) in {int, bool}:
            return annotation(popped_params)
        else:
            raise TypeError(
                f"Expected {argument_name} to be a {annotation.__name__}.")
    elif annotation == str:
        # Strings are special because we allow casting from Path to str.
        if type(popped_params) == str or isinstance(popped_params, Path):
            return str(popped_params)  # type: ignore
        else:
            raise TypeError(f"Expected {argument_name} to be a string.")
    elif annotation == float:
        # Floats are special because in Python, you can put an int wherever you can put a float.
        # https://mypy.readthedocs.io/en/stable/duck_type_compatibility.html
        if type(popped_params) in {int, float}:
            return popped_params
        else:
            raise TypeError(f"Expected {argument_name} to be numeric.")

    # This is special logic for handling types like Dict[str, TokenIndexer],
    # List[TokenIndexer], Tuple[TokenIndexer, Tokenizer], and Set[TokenIndexer],
    # which it creates by instantiating each value from_params and returning the resulting structure.
    elif origin in (Dict,
                    dict) and len(args) == 2 and can_construct_from_params(
                        args[-1]):
        value_cls = annotation.__args__[-1]

        value_dict = {}

        for key, value_params in popped_params.items():
            value_dict[key] = construct_arg(
                str(value_cls),
                argument_name + "." + key,
                value_params,
                value_cls,
                _NO_DEFAULT,
                **extras,
            )

        return value_dict

    elif origin in (List,
                    list) and len(args) == 1 and can_construct_from_params(
                        args[0]):
        value_cls = annotation.__args__[0]

        value_list = []

        for i, value_params in enumerate(popped_params):
            value = construct_arg(
                str(value_cls),
                argument_name + f".{i}",
                value_params,
                value_cls,
                _NO_DEFAULT,
                **extras,
            )
            value_list.append(value)

        return value_list

    elif origin in (Tuple, tuple) and all(
            can_construct_from_params(arg) for arg in args):
        value_list = []

        for i, (value_cls, value_params) in enumerate(
                zip(annotation.__args__, popped_params)):
            value = construct_arg(
                str(value_cls),
                argument_name + f".{i}",
                value_params,
                value_cls,
                _NO_DEFAULT,
                **extras,
            )
            value_list.append(value)

        return tuple(value_list)

    elif origin in (Set, set) and len(args) == 1 and can_construct_from_params(
            args[0]):
        value_cls = annotation.__args__[0]

        value_set = set()

        for i, value_params in enumerate(popped_params):
            value = construct_arg(
                str(value_cls),
                argument_name + f".{i}",
                value_params,
                value_cls,
                _NO_DEFAULT,
                **extras,
            )
            value_set.add(value)

        return value_set

    elif origin == Union:
        # Storing this so we can recover it later if we need to.
        backup_params = deepcopy(popped_params)

        # We'll try each of the given types in the union sequentially, returning the first one that
        # succeeds.
        for arg_annotation in args:
            try:
                return construct_arg(
                    str(arg_annotation),
                    argument_name,
                    popped_params,
                    arg_annotation,
                    default,
                    **extras,
                )
            except (ValueError, TypeError, ConfigurationError, AttributeError):
                # Our attempt to construct the argument may have modified popped_params, so we
                # restore it here.
                popped_params = deepcopy(backup_params)

        # If none of them succeeded, we crash.
        raise ConfigurationError(
            f"Failed to construct argument {argument_name} with type {annotation}"
        )
    elif origin == Lazy:
        if popped_params is default:
            return Lazy(lambda **kwargs: default)
        value_cls = args[0]
        subextras = create_extras(value_cls, extras)

        def constructor(**kwargs):
            # If there are duplicate keys between subextras and kwargs, this will overwrite the ones
            # in subextras with what's in kwargs.  If an argument shows up twice, we should take it
            # from what's passed to Lazy.construct() instead of what we got from create_extras().
            # Almost certainly these will be identical objects, anyway.
            # We do this by constructing a new dictionary, instead of mutating subextras, just in
            # case this constructor is called multiple times.
            constructor_extras = {**subextras, **kwargs}
            return value_cls.from_params(params=deepcopy(popped_params),
                                         **constructor_extras)

        return Lazy(constructor)  # type: ignore
    else:
        # Pass it on as is and hope for the best.   ¯\_(ツ)_/¯
        if isinstance(popped_params, Params):
            return popped_params.as_dict(quiet=True)
        return popped_params
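
# Sketch: calling construct_arg directly on a few primitive annotations; these
# values are illustrative (normally create_kwargs/pop_and_construct_arg call this).
from pathlib import Path

assert construct_arg("Demo", "hidden_size", 128, int, _NO_DEFAULT) == 128
assert construct_arg("Demo", "dropout", 1, float, 0.0) == 1
assert construct_arg("Demo", "label_file", Path("labels.txt"), str, _NO_DEFAULT) == "labels.txt"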
Example #60
0
def pop_and_construct_arg(class_name: str, argument_name: str,
                          annotation: Type, default: Any, params: Params,
                          **extras) -> Any:
    """
    Does the work of actually constructing an individual argument for
    [`create_kwargs`](./from_params#create_kwargs).

    Here we're in the inner loop of iterating over the parameters to a particular constructor,
    trying to construct just one of them.  The information we get for that parameter is its name,
    its type annotation, and its default value; we also get the full set of `Params` for
    constructing the object (which we may mutate), and any `extras` that the constructor might
    need.

    We take the type annotation and default value here separately, instead of using an
    `inspect.Parameter` object directly, so that we can handle `Union` types using recursion on
    this method, trying the different annotation types in the union in turn.
    """
    from allennlp.models.archival import load_archive  # import here to avoid circular imports

    # We used `argument_name` as the method argument to avoid conflicts with 'name' being a key in
    # `extras`, which isn't _that_ unlikely.  Now that we are inside the method, we can switch back
    # to using `name`.
    name = argument_name

    # Some constructors expect extra non-parameter items, e.g. vocab: Vocabulary.
    # We check the provided `extras` for these and just use them if they exist.
    if name in extras:
        if name not in params:
            return extras[name]
        else:
            logger.warning(
                f"Parameter {name} for class {class_name} was found in both "
                "**extras and in params. Using the specification found in params, "
                "but you probably put a key in a config file that you didn't need, "
                "and if it is different from what we get from **extras, you might "
                "get unexpected behavior.")
    # Next case is when argument should be loaded from pretrained archive.
    elif (name in params and isinstance(params.get(name), Params)
          and "_pretrained" in params.get(name)):
        load_module_params = params.pop(name).pop("_pretrained")
        archive_file = load_module_params.pop("archive_file")
        module_path = load_module_params.pop("module_path")
        freeze = load_module_params.pop("freeze", True)
        archive = load_archive(archive_file)
        result = archive.extract_module(module_path, freeze)
        if not isinstance(result, annotation):
            raise ConfigurationError(
                f"The module from model at {archive_file} at path {module_path} "
                f"was expected of type {annotation} but is of type {type(result)}"
            )
        return result

    popped_params = params.pop(
        name, default) if default != _NO_DEFAULT else params.pop(name)
    if popped_params is None:
        origin = getattr(annotation, "__origin__", None)
        if origin == Lazy:
            return Lazy(lambda **kwargs: None)
        return None

    return construct_arg(class_name, name, popped_params, annotation, default,
                         **extras)
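
# Sketch (illustrative names): popping and constructing constructor arguments one
# at a time, the way create_kwargs uses this helper.
params = Params({"num_layers": 3})
num_layers = pop_and_construct_arg("DemoEncoder", "num_layers", int, _NO_DEFAULT, params)
dropout = pop_and_construct_arg("DemoEncoder", "dropout", float, 0.1, params)
assert (num_layers, dropout) == (3, 0.1)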