Example 1
def __init__(self,
             config,
             transformer: PreTrainedModel = None,
             transformer_tokenizer: PreTrainedTokenizer = None
             ) -> None:
    super().__init__()
    self.encoder = TransformerEncoder(transformer,
                                      transformer_tokenizer,
                                      config.average_subwords,
                                      config.scalar_mix,
                                      None,  # No word_dropout since SA is predicting masked tokens
                                      config.transformer_hidden_dropout,
                                      config.layer_dropout,
                                      config.max_sequence_length)
    hidden_size = transformer.config.hidden_size
    # Structural attention layer producing arc and relation scores over the encoder states
    self.sa = StructuralAttentionLayer(hidden_size,
                                       config.n_mlp_arc,
                                       config.n_mlp_rel,
                                       config.mlp_dropout,
                                       config.n_rels,
                                       config.projection)
    if config.projection:
        # The SA layer projects its output, so the MLM head must match the projected size
        hidden_size = config.projection
    self.mlm = nn.Linear(hidden_size, transformer_tokenizer.vocab_size)
Example 2
def build_transformer(self, training=True):
    transformer = TransformerEncoder(self.config.transformer, self.transformer_tokenizer,
                                     self.config.average_subwords,
                                     self.config.scalar_mix, self.config.word_dropout,
                                     self.config.max_seq_len, self.config.ret_raw_hidden_states,
                                     training=training)
    transformer_layers = self.config.get('transformer_layers', None)
    if transformer_layers:
        # Drop the top `transformer_layers` layers of the underlying transformer encoder
        transformer.transformer.encoder.layer = transformer.transformer.encoder.layer[:-transformer_layers]
    return transformer
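
The layer slicing above relies on the Hugging Face convention that a BERT-style model exposes its blocks as ``encoder.layer`` (an ``nn.ModuleList``). Below is a minimal standalone sketch of the same idea, assuming a BERT-style checkpoint loaded with ``transformers``; the checkpoint name and the number of dropped layers are illustrative.

# Sketch of the layer-truncation idea above, assuming a BERT-style model from
# Hugging Face transformers whose `encoder.layer` is an nn.ModuleList.
from transformers import AutoModel

model = AutoModel.from_pretrained('bert-base-cased')  # illustrative checkpoint
drop_top = 2  # hypothetical number of top layers to remove
model.encoder.layer = model.encoder.layer[:-drop_top]
print(len(model.encoder.layer))  # 10 of the original 12 layers remain

Slicing an ``nn.ModuleList`` returns another ``nn.ModuleList``, so the assignment keeps the remaining layers registered as submodules.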
Example 3
    def __init__(
        self,
        config,
        pretrained_embed: torch.Tensor = None,
        transformer: PreTrainedModel = None,
        transformer_tokenizer: PreTrainedTokenizer = None,
    ):
        super(EncoderWithContextualLayer, self).__init__()

        self.secondary_encoder = config.get('secondary_encoder', None)
        self.config = config

        if not transformer:
            self.pad_index = config.pad_index
            self.unk_index = config.unk_index
            if config.word_dropout:
                oov = self.unk_index
                excludes = [self.pad_index]
                self.word_dropout = WordDropout(p=config.word_dropout,
                                                oov_token=oov,
                                                exclude_tokens=excludes)
            else:
                self.word_dropout = None
        if transformer:
            input_size = 0
            if self.config.transformer_lr:
                # Use the transformer's hidden states directly as the contextual representation
                hidden_size = transformer.config.hidden_size
            else:
                # Otherwise feed the transformer's outputs into a BiLSTM (built below)
                input_size = transformer.config.hidden_size
                hidden_size = config.n_lstm_hidden * 2
            if config.feat == 'pos':
                self.feat_embed = nn.Embedding(num_embeddings=config.n_feats,
                                               embedding_dim=config.n_embed)
                self.embed_dropout = IndependentDropout(p=config.embed_dropout)
                if self.config.transformer_lr:
                    hidden_size += config.n_embed
                else:
                    input_size += config.n_embed
            if not self.config.transformer_lr:
                self.lstm = VariationalLSTM(input_size=input_size,
                                            hidden_size=config.n_lstm_hidden,
                                            num_layers=config.n_lstm_layers,
                                            dropout=config.hidden_dropout,
                                            bidirectional=True)
        else:
            # the embedding layer
            input_size = config.n_embed
            self.word_embed = nn.Embedding(num_embeddings=config.n_words,
                                           embedding_dim=config.n_embed)
            if pretrained_embed is not None:
                if not isinstance(pretrained_embed, torch.Tensor):
                    pretrained_embed = torch.Tensor(pretrained_embed)
                self.pretrained = nn.Embedding.from_pretrained(
                    pretrained_embed)
                nn.init.zeros_(self.word_embed.weight)
            if config.feat == 'pos':
                self.feat_embed = nn.Embedding(num_embeddings=config.n_feats,
                                               embedding_dim=config.n_embed)
                self.embed_dropout = IndependentDropout(p=config.embed_dropout)
                input_size += config.n_embed

            # the word-lstm layer
            hidden_size = config.n_lstm_hidden * 2
            self.lstm = VariationalLSTM(input_size=input_size,
                                        hidden_size=config.n_lstm_hidden,
                                        num_layers=config.n_lstm_layers,
                                        dropout=config.hidden_dropout,
                                        bidirectional=True)
        self.hidden_size = hidden_size
        self.hidden_dropout = SharedDropout(p=config.hidden_dropout)
        if transformer:
            transformer = TransformerEncoder(
                transformer,
                transformer_tokenizer,
                config.average_subwords,
                word_dropout=config.word_dropout,
                max_sequence_length=config.max_sequence_length)
        self.transformer = transformer
Example 4
def build_model(self, training=True, **kwargs) -> torch.nn.Module:
    transformer = TransformerEncoder.build_transformer(config=self.config,
                                                       training=training)
    model = StructuralAttentionModel(self.config, transformer,
                                     self.transformer_tokenizer)
    return model
Example 5
    def __init__(self,
                 field: str,
                 transformer: str,
                 average_subwords=False,
                 scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
                 word_dropout: Optional[Union[float, Tuple[float, str]]] = None,
                 max_sequence_length=None,
                 truncate_long_sequences=False,
                 cls_is_bos=False,
                 sep_is_eos=False,
                 ret_token_span=True,
                 ret_subtokens=False,
                 ret_subtokens_group=False,
                 ret_prefix_mask=False,
                 ret_raw_hidden_states=False,
                 transformer_args: Dict[str, Any] = None,
                 use_fast=True,
                 do_basic_tokenize=True,
                 trainable=True) -> None:
        """A contextual word embedding builder which builds a
        :class:`~hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbeddingModule` and a
        :class:`~hanlp.transform.transformer_tokenizer.TransformerSequenceTokenizer`.

        Args:
            field: The field to work on. Usually some token fields.
            transformer:  An identifier of a ``PreTrainedModel``.
            average_subwords: ``True`` to average subword representations.
            scalar_mix: Layer attention.
            word_dropout: Dropout rate of randomly replacing a subword with MASK.
            max_sequence_length: The maximum sequence length. Sequence longer than this will be handled by sliding
                window.
            truncate_long_sequences: ``True`` to truncate sequences longer than ``max_sequence_length``
                instead of handling them with a sliding window.
            cls_is_bos: ``True`` means the first token of input is treated as [CLS] no matter what its surface form is.
                        ``False`` (default) means the first token is not [CLS] and will get its own
                        embedding rather than the embedding of [CLS].
            sep_is_eos: ``True`` means the last token of input is [SEP].
                        ``False`` means it's not but [SEP] will be appended,
                        ``None`` means it depends on whether ``input[-1] == [EOS]``.
            ret_token_span: ``True`` to return span of each token measured by subtoken offsets.
            ret_subtokens: ``True`` to return list of subtokens belonging to each token.
            ret_subtokens_group: ``True`` to return list of offsets of subtokens belonging to each token.
            ret_prefix_mask: ``True`` to generate a mask where each non-zero element corresponds to a prefix of a token.
            ret_raw_hidden_states: ``True`` to return hidden states of each layer.
            transformer_args: Extra arguments passed to the transformer.
            use_fast: Whether or not to try to load the fast version of the tokenizer.
            do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            trainable: ``False`` to use static embeddings.
        """
        super().__init__()
        self.truncate_long_sequences = truncate_long_sequences
        self.transformer_args = transformer_args
        self.trainable = trainable
        self.ret_subtokens_group = ret_subtokens_group
        self.ret_subtokens = ret_subtokens
        self.ret_raw_hidden_states = ret_raw_hidden_states
        self.sep_is_eos = sep_is_eos
        self.cls_is_bos = cls_is_bos
        self.max_sequence_length = max_sequence_length
        self.word_dropout = word_dropout
        self.scalar_mix = scalar_mix
        self.average_subwords = average_subwords
        self.transformer = transformer
        self.field = field
        self._transformer_tokenizer = TransformerEncoder.build_transformer_tokenizer(
            self.transformer,
            use_fast=use_fast,
            do_basic_tokenize=do_basic_tokenize)
        self._tokenizer_transform = TransformerSequenceTokenizer(
            self._transformer_tokenizer,
            field,
            truncate_long_sequences=truncate_long_sequences,
            ret_prefix_mask=ret_prefix_mask,
            ret_token_span=ret_token_span,
            cls_is_bos=cls_is_bos,
            sep_is_eos=sep_is_eos,
            ret_subtokens=ret_subtokens,
            ret_subtokens_group=ret_subtokens_group,
            max_seq_length=self.max_sequence_length)
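
The parameters documented above map directly onto the attributes stored in this constructor. Below is a minimal usage sketch, assuming the constructor belongs to HanLP's ``ContextualWordEmbedding`` class (the docstring points at that module); the field name, transformer identifier and hyperparameter values are illustrative, not prescriptive.

# Usage sketch under the assumption that this is the constructor of
# hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbedding;
# all concrete values below are illustrative.
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding

embed = ContextualWordEmbedding(
    field='token',                  # read tokens from the 'token' field of each sample
    transformer='bert-base-cased',  # identifier of a PreTrainedModel
    average_subwords=True,          # average subword vectors into one vector per token
    word_dropout=0.1,               # probability of replacing a subword with the mask token
    max_sequence_length=512,        # longer inputs are handled by a sliding window
)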