def __init__(self,
                 archive_file: str,
                 dropout: float = None,
                 bos_eos_tokens: Tuple[str, str] = ("<S>", "</S>"),
                 remove_bos_eos: bool = True,
                 requires_grad: bool = False) -> None:
        super().__init__()

        overrides = {"model": {"contextualizer": {"return_all_layers": True}}}

        # Import here to avoid circular dependency.
        from allennlp.models.archival import load_archive
        # Load LM and the associated config.
        archive = load_archive(archive_file, overrides=json.dumps(overrides))
        self._lm: LanguageModel = archive.model
        self._lm.delete_softmax()
        config = archive.config
        dict_config = config.as_dict(quiet=True)

        # Extract the name of the tokens that the LM was trained on.
        text_field_embedder = dict_config["model"]["text_field_embedder"]
        token_names = list(text_field_embedder["token_embedders"].keys())
        if len(token_names) != 1:
            # We don't currently support embedding with language models trained with multiple
            # embedded indices.
            #
            # Note: We only care about embedded indices. This does not include "tokens" which
            # is just used to compute the loss in LanguageModel.
            raise ConfigurationError(
                f"LM from {archive_file} trained with multiple embedders!")
        if "embedder_to_indexer_map" in text_field_embedder:
            # Similarly we don't support multiple indexers per embedder.
            raise ConfigurationError(
                f"LM from {archive_file} trained with embedder_to_indexer_map!"
            )
        self._token_name = token_names[0]

        # TODO(brendanr): Find a way to remove this hack. The issue fundamentally is that the
        # BasicTextFieldEmbedder concatenates multiple embedded representations. When a
        # downstream model uses both, tokens and token characters, say, and only adds bos/eos
        # tokens to the token characters, the dimensions don't match. See:
        # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/text_field_embedders/basic_text_field_embedder.py#L109
        #
        # For the equivalent hack in the ELMo embedder see:
        # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/elmo.py#L590
        if bos_eos_tokens:
            dataset_reader_config = config.get("dataset_reader")
            if dataset_reader_config.get("type") == "multiprocess":
                dataset_reader_config = dataset_reader_config.get(
                    "base_reader")
            token_indexer_config = dataset_reader_config.get(
                "token_indexers").get(self._token_name)
            token_indexer: TokenIndexer = TokenIndexer.from_params(
                token_indexer_config)
            token_list = [Token(token) for token in bos_eos_tokens]
            # TODO(brendanr): Obtain these indices from the vocab once the
            # ELMoTokenCharactersIndexer adds the mappings.
            bos_eos_indices = token_indexer.tokens_to_indices(
                token_list, self._lm.vocab, "key")["key"]
            self._bos_indices = torch.LongTensor(bos_eos_indices[0])
            self._eos_indices = torch.LongTensor(bos_eos_indices[1])
        else:
            self._bos_indices = None
            self._eos_indices = None

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = lambda x: x

        self._remove_bos_eos = remove_bos_eos
        num_layers = self._lm.num_layers()
        # TODO(brendanr): Consider passing our LM as a custom module to `Elmo` instead.
        # See https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L76
        self._scalar_mix = ScalarMix(mixture_size=num_layers,
                                     do_layer_norm=False,
                                     trainable=True)

        # pylint: disable=protected-access
        character_dim = self._lm._text_field_embedder.get_output_dim()
        contextual_dim = self._lm._contextualizer.get_output_dim()

        if contextual_dim % character_dim != 0:
            raise ConfigurationError(
                f"The output dimensions for the text_field_embedder ({character_dim}) "
                f"and the contextualizer ({contextual_dim}) from the language model "
                f"loaded from {archive_file} are not compatible. Please check the config "
                "used to train that model and ensure that the output dimension of the "
                "text_field_embedder divides the output dimension of the contextualizer.")
        self._character_embedding_duplication_count = contextual_dim // character_dim

        for param in self._lm.parameters():
            param.requires_grad = requires_grad
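
# Usage sketch: the __init__ above appears to be AllenNLP's LanguageModelTokenEmbedder
# (the class name is not shown in this snippet). The sketch below shows how such an
# embedder might be constructed; the import path and the archive path are assumptions
# and depend on the AllenNLP version (newer releases moved this class into allennlp_models).
def _language_model_embedder_example():
    from allennlp.modules.token_embedders import LanguageModelTokenEmbedder

    return LanguageModelTokenEmbedder(
        archive_file="/path/to/trained_language_model.tar.gz",  # hypothetical archive
        dropout=0.2,
        bos_eos_tokens=("<S>", "</S>"),
        remove_bos_eos=True,
        requires_grad=False,  # keep the pretrained LM frozen
    )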
Example #2
    def __init__(self, transformer: OpenaiTransformer) -> None:
        super().__init__()

        self._transformer = transformer
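        # Learn a weighted average over all of the transformer's output layers
        # (no layer normalization applied before mixing).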
        self._scalar_mix = ScalarMix(transformer.num_output_layers,
                                     do_layer_norm=False)
Example #3
    def __init__(self,
                 vocab: Vocabulary,
                 pretrained_model: str = None,
                 requires_grad: bool = True,
                 top_layer_only: bool = True,
                 bert_weights_model: str = None,
                 per_choice_loss: bool = False,
                 layer_freeze_regexes: List[str] = None,
                 regularizer: Optional[RegularizerApplicator] = None,
                 use_comparative_bert: bool = True,
                 use_bilinear_classifier: bool = False,
                 train_comparison_layer: bool = False,
                 number_of_choices_compared: int = 0,
                 comparison_layer_hidden_size: int = -1,
                 comparison_layer_use_relu: bool = True) -> None:
        super().__init__(vocab, regularizer)

        self._use_comparative_bert = use_comparative_bert
        self._use_bilinear_classifier = use_bilinear_classifier
        self._train_comparison_layer = train_comparison_layer
        if train_comparison_layer:
            assert number_of_choices_compared > 1
            self._num_choices = number_of_choices_compared
            self._comparison_layer_hidden_size = comparison_layer_hidden_size
            self._comparison_layer_use_relu = comparison_layer_use_relu

        # Bert weights and config
        if bert_weights_model:
            logging.info(f"Loading BERT weights model from {bert_weights_model}")
            bert_model_loaded = load_archive(bert_weights_model)
            self._bert_model = bert_model_loaded.model._bert_model
        else:
            self._bert_model = BertModel.from_pretrained(pretrained_model)

        for param in self._bert_model.parameters():
            param.requires_grad = requires_grad
        #for name, param in self._bert_model.named_parameters():
        #    grad = requires_grad
        #    if layer_freeze_regexes and grad:
        #        grad = not any([bool(re.search(r, name)) for r in layer_freeze_regexes])
        #    param.requires_grad = grad

        bert_config = self._bert_model.config
        self._output_dim = bert_config.hidden_size
        self._dropout = torch.nn.Dropout(bert_config.hidden_dropout_prob)
        self._per_choice_loss = per_choice_loss

        # Bert Classifier selector
        final_output_dim = 1
        if not use_comparative_bert:
            if bert_weights_model and hasattr(bert_model_loaded.model, "_classifier"):
                self._classifier = bert_model_loaded.model._classifier
            else:
                self._classifier = Linear(self._output_dim, final_output_dim)
        else:
            if use_bilinear_classifier:
                self._classifier = Bilinear(self._output_dim, self._output_dim, final_output_dim)
            else:
                self._classifier = Linear(self._output_dim * 2, final_output_dim)
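        # (Re)initialize the classifier head using BERT's own weight-initialization scheme.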
        self._classifier.apply(self._bert_model.init_bert_weights)

        # Comparison layer setup
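        # Maps the comparison scores for all ordered choice pairs to a distribution
        # over the answer choices.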
        if self._train_comparison_layer:
            number_of_pairs = self._num_choices * (self._num_choices - 1)
            if self._comparison_layer_hidden_size == -1:
                self._comparison_layer_hidden_size = number_of_pairs * number_of_pairs

            self._comparison_layer_1 = Linear(number_of_pairs, self._comparison_layer_hidden_size)
            if self._comparison_layer_use_relu:
                self._comparison_layer_1_activation = torch.nn.LeakyReLU()
            else:
                self._comparison_layer_1_activation = torch.nn.Tanh()
            self._comparison_layer_2 = Linear(self._comparison_layer_hidden_size, self._num_choices)
            self._comparison_layer_2_activation = torch.nn.Softmax(dim=-1)

        # Scalar mix, if necessary
        self._all_layers = not top_layer_only
        if self._all_layers:
            if bert_weights_model and hasattr(bert_model_loaded.model, "_scalar_mix") \
                    and bert_model_loaded.model._scalar_mix is not None:
                self._scalar_mix = bert_model_loaded.model._scalar_mix
            else:
                num_layers = bert_config.num_hidden_layers
                initial_scalar_parameters = num_layers * [0.0]
                initial_scalar_parameters[-1] = 5.0  # Starts with most mass on last layer
                self._scalar_mix = ScalarMix(bert_config.num_hidden_layers,
                                             initial_scalar_parameters=initial_scalar_parameters,
                                             do_layer_norm=False)
        else:
            self._scalar_mix = None

        # Accuracy and loss setup
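        # With the comparison layer the model predicts one choice out of several
        # (categorical accuracy + cross-entropy); otherwise each choice gets an
        # independent binary score (boolean accuracy + BCE with logits).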
        if self._train_comparison_layer:
            self._accuracy = CategoricalAccuracy()
            self._loss = torch.nn.CrossEntropyLoss()
        else:
            self._accuracy = BooleanAccuracy()
            self._loss = torch.nn.BCEWithLogitsLoss()
        self._debug = -1
    def __init__(self, transformer):
        super(OpenaiTransformerEmbedder, self).__init__()

        self._transformer = transformer
        self._scalar_mix = ScalarMix(transformer.num_output_layers,
                                     do_layer_norm=False)
    def __init__(
        self,
        model_name: str,
        *,
        max_length: int = None,
        sub_module: str = None,
        train_parameters: bool = True,
        last_layer_only: bool = True,
        override_weights_file: Optional[str] = None,
        override_weights_strip_prefix: Optional[str] = None,
        gradient_checkpointing: Optional[bool] = None,
        masked_language_modeling: bool = True,
    ) -> None:
        TokenEmbedder.__init__(self)  # Call the base class constructor
        tokenizer = PretrainedTransformerTokenizer(model_name)
        self.masked_language_modeling = masked_language_modeling

        if self.masked_language_modeling:
            self.config = AutoConfig.from_pretrained(model_name,
                                                     output_hidden_states=True)
            # We only need access to the HF tokenizer if we are masked language modeling
            self.tokenizer = tokenizer.tokenizer
            # The only differences when masked language modeling are:
            # 1) `output_hidden_states` must be True to get access to token embeddings.
            # 2) We need to use `AutoModelForMaskedLM` to get the correct model
            self.transformer_model = AutoModelForMaskedLM.from_pretrained(
                model_name, config=self.config)
        # Everything after the if statement (including the else) is copied directly from:
        # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/pretrained_transformer_embedder.py
        else:
            from allennlp.common import cached_transformers

            self.transformer_model = cached_transformers.get(
                model_name, True, override_weights_file,
                override_weights_strip_prefix)
            self.config = self.transformer_model.config

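        # Gradient checkpointing trades extra compute for lower memory by recomputing
        # activations during the backward pass instead of storing them all.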
        if gradient_checkpointing is not None:
            self.transformer_model.config.update(
                {"gradient_checkpointing": gradient_checkpointing})

        if sub_module:
            assert hasattr(self.transformer_model, sub_module)
            self.transformer_model = getattr(self.transformer_model,
                                             sub_module)
        self._max_length = max_length

        # I'm not sure if this works for all models; open an issue on github if you find a case
        # where it doesn't work.
        self.output_dim = self.config.hidden_size

        self._scalar_mix: Optional[ScalarMix] = None
        if not last_layer_only:
            self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
            self.config.output_hidden_states = True

        self._num_added_start_tokens = len(
            tokenizer.single_sequence_start_tokens)
        self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
        self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

        if not train_parameters:
            for param in self.transformer_model.parameters():
                param.requires_grad = False
    def __init__(
        self,
        model_name: str,
        *,
        max_length: int = None,
        sub_module: str = None,
        train_parameters: bool = True,
        last_layer_only: bool = True,
        override_weights_file: Optional[str] = None,
        override_weights_strip_prefix: Optional[str] = None,
        gradient_checkpointing: Optional[bool] = None,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        transformer_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        super().__init__()
        from allennlp.common import cached_transformers

        self.transformer_model = cached_transformers.get(
            model_name,
            True,
            override_weights_file=override_weights_file,
            override_weights_strip_prefix=override_weights_strip_prefix,
            **(transformer_kwargs or {}),
        )

        if gradient_checkpointing is not None:
            self.transformer_model.config.update(
                {"gradient_checkpointing": gradient_checkpointing})

        self.config = self.transformer_model.config
        if sub_module:
            assert hasattr(self.transformer_model, sub_module)
            self.transformer_model = getattr(self.transformer_model,
                                             sub_module)
        self._max_length = max_length

        # I'm not sure if this works for all models; open an issue on github if you find a case
        # where it doesn't work.
        self.output_dim = self.config.hidden_size

        self._scalar_mix: Optional[ScalarMix] = None
        if not last_layer_only:
            self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
            self.config.output_hidden_states = True

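        # The tokenizer is used below to check whether the embedding matrix needs to be
        # resized to the tokenizer's vocabulary and to count the special tokens added
        # around a single sequence.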
        tokenizer = PretrainedTransformerTokenizer(
            model_name,
            tokenizer_kwargs=tokenizer_kwargs,
        )

        try:
            if self.transformer_model.get_input_embeddings(
            ).num_embeddings != len(tokenizer.tokenizer):
                self.transformer_model.resize_token_embeddings(
                    len(tokenizer.tokenizer))
        except NotImplementedError:
            # Can't resize for transformers models that don't implement base_model.get_input_embeddings()
            logger.warning(
                "Could not resize the token embedding matrix of the transformer model. "
                "This model does not support resizing.")

        self._num_added_start_tokens = len(
            tokenizer.single_sequence_start_tokens)
        self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
        self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

        self.train_parameters = train_parameters
        if not train_parameters:
            self.transformer_model.eval()
            for param in self.transformer_model.parameters():
                param.requires_grad = False
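
# Usage sketch: the __init__ above appears to match AllenNLP's PretrainedTransformerEmbedder
# (the class name is not shown in this snippet). The import path and the model name below
# are assumptions; any Hugging Face model name should work.
def _pretrained_transformer_embedder_example():
    from allennlp.modules.token_embedders import PretrainedTransformerEmbedder

    embedder = PretrainedTransformerEmbedder(
        "bert-base-uncased",     # Hugging Face model name (assumed example)
        max_length=512,
        last_layer_only=False,   # mix all hidden layers with a trainable ScalarMix
        train_parameters=False,  # freeze the transformer weights
    )
    return embedder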