Example #1
    def reset_parameters(self):
        initialize = layers.get_initializer(self._hparams.initializer)
        if initialize is not None:
            # Do not re-initialize LayerNorm modules.
            for name, param in self.named_parameters():
                if name.split(
                        '.')[-1] == 'weight' and 'layer_norm' not in name:
                    initialize(param)
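
A quick illustration of which parameters the name filter above touches (the
parameter names are hypothetical):

    # A parameter is re-initialized only if its last name component is
    # 'weight' and the full name does not mention 'layer_norm'.
    for n in ['self_attns.0.Q_dense.weight',      # re-initialized
              'self_attns.0.Q_dense.bias',        # skipped: not a weight
              'self_attn_layer_norm.0.weight']:   # skipped: LayerNorm
        print(n, n.split('.')[-1] == 'weight' and 'layer_norm' not in n)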
Example #2

    def __init__(self, input_size: int, hparams=None):
        super().__init__(hparams=hparams)
        use_bias = self._hparams.use_bias

        self.Q_dense = nn.Linear(input_size,
                                 self._hparams.num_units,
                                 bias=use_bias)
        self.K_dense = nn.Linear(input_size,
                                 self._hparams.num_units,
                                 bias=use_bias)
        self.V_dense = nn.Linear(input_size,
                                 self._hparams.num_units,
                                 bias=use_bias)
        self.O_dense = nn.Linear(self._hparams.num_units,
                                 self._hparams.output_dim,
                                 bias=use_bias)

        if self._hparams.initializer:
            # TODO(haoransh): we may define kernel_initializer and bias
            #  initializer separately
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            for name, param in self.named_parameters():
                if name.split('.')[-1] == 'weight':
                    initialize(param)
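
A minimal construction sketch for the module above (the class name
MultiheadAttentionEncoder is an assumption, inferred from the identical
(input_size, hparams) signature used later on this page; the hparams values
are illustrative):

    attn = MultiheadAttentionEncoder(
        input_size=512,
        hparams={'num_units': 512,   # width of the Q/K/V projections
                 'output_dim': 512,  # width of the O projection
                 'use_bias': False})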
Example #3
    def __init__(self, hparams=None):
        super().__init__(hparams=hparams)
        self._input_size = self._hparams.dim
        self.self_attns = nn.ModuleList()
        if not self._hparams.use_bert_config:
            self.self_attn_layer_norm = nn.ModuleList()
        else:
            self.output_layer_norm = nn.ModuleList()
        self.poswise_networks = nn.ModuleList()
        self.poswise_layer_norm = nn.ModuleList()

        # eps for LayerNorm is configurable via hparams here
        # (PyTorch's nn.LayerNorm default is 1e-5).
        eps = self._hparams.eps

        self.initialize_blocks()

        self.embed_dropout = nn.Dropout(p=self._hparams.embedding_dropout)
        self.residual_dropout = nn.Dropout(p=self._hparams.residual_dropout)

        if self._hparams.use_bert_config:
            self.input_normalizer = nn.LayerNorm(self._input_size, eps=eps)
        else:
            self.final_layer_norm = nn.LayerNorm(self._input_size, eps=eps)

        if self._hparams.initializer:
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            # Do not re-initialize LayerNorm modules.
            for name, param in self.named_parameters():
                if name.split(
                        '.')[-1] == 'weight' and 'layer_norm' not in name:
                    initialize(param)
Example #4

    def __init__(self,
                 pretrained_model_name: Optional[str] = None,
                 cache_dir: Optional[str] = None,
                 hparams=None):

        super().__init__(hparams=hparams)

        # Create the underlying encoder
        encoder_hparams = dict_fetch(hparams, XLNetEncoder.default_hparams())

        self._encoder = XLNetEncoder(
            pretrained_model_name=pretrained_model_name,
            cache_dir=cache_dir,
            hparams=encoder_hparams)

        # TODO: The logic here is very similar to that in XLNetClassifier.
        #  We need to reduce the code redundancy.
        if self._hparams.use_projection:
            if self._hparams.regr_strategy == 'all_time':
                self.projection = nn.Linear(
                    self._encoder.output_size * self._hparams.max_seq_length,
                    self._encoder.output_size * self._hparams.max_seq_length)
            else:
                self.projection = nn.Linear(self._encoder.output_size,
                                            self._encoder.output_size)
        self.dropout = nn.Dropout(self._hparams.dropout)

        logit_kwargs = self._hparams.logit_layer_kwargs
        if logit_kwargs is None:
            logit_kwargs = {}
        elif not isinstance(logit_kwargs, HParams):
            raise ValueError("hparams['logit_layer_kwargs'] "
                             "must be a dict.")
        else:
            logit_kwargs = logit_kwargs.todict()

        if self._hparams.regr_strategy == 'all_time':
            self.hidden_to_logits = nn.Linear(
                self._encoder.output_size * self._hparams.max_seq_length,
                1, **logit_kwargs)
        else:
            self.hidden_to_logits = nn.Linear(
                self._encoder.output_size, 1, **logit_kwargs)

        if self._hparams.initializer:
            initialize = get_initializer(self._hparams.initializer)
            assert initialize is not None
            if self._hparams.use_projection:
                initialize(self.projection.weight)
                initialize(self.projection.bias)
            initialize(self.hidden_to_logits.weight)
            # `bias` may be None; truth-testing a tensor is ambiguous,
            # so compare against None explicitly.
            if self.hidden_to_logits.bias is not None:
                initialize(self.hidden_to_logits.bias)
        else:
            if self._hparams.use_projection:
                self.projection.apply(init_weights)
            self.hidden_to_logits.apply(init_weights)
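
Note how the 'all_time' strategy sizes both layers for flattened hidden
states: the encoder output of shape [batch, max_seq_length, output_size] is
reshaped to [batch, max_seq_length * output_size] before the projection and
logits layers. A sketch of the implied reshape on the forward side (the
variable names are hypothetical):

    # hidden: encoder output, [batch, max_seq_length, output_size]
    flat = hidden.reshape(hidden.size(0), -1)   # [batch, seq * hidden]
    logits = self.hidden_to_logits(flat)        # [batch, 1]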
Example #5
    def __init__(self,
                 pretrained_model_name: Optional[str] = None,
                 cache_dir: Optional[str] = None,
                 hparams=None):

        super().__init__(hparams=hparams)

        # Create the underlying encoder
        encoder_hparams = dict_fetch(hparams,
                                     self._ENCODER_CLASS.default_hparams())

        self._encoder = self._ENCODER_CLASS(
            pretrained_model_name=pretrained_model_name,
            cache_dir=cache_dir,
            hparams=encoder_hparams)

        # Create a dropout layer
        self._dropout_layer = nn.Dropout(self._hparams.dropout)

        # Create an additional classification layer if needed
        self.num_classes = self._hparams.num_classes
        if self.num_classes <= 0:
            self._logits_layer = None
        else:
            logit_kwargs = self._hparams.logit_layer_kwargs
            if logit_kwargs is None:
                logit_kwargs = {}
            elif not isinstance(logit_kwargs, HParams):
                raise ValueError("hparams['logit_layer_kwargs'] "
                                 "must be a dict.")
            else:
                logit_kwargs = logit_kwargs.todict()

            if self._hparams.clas_strategy == 'all_time':
                self._logits_layer = nn.Linear(
                    self._encoder.output_size *
                    self._hparams.max_seq_length,
                    self.num_classes,
                    **logit_kwargs)
            else:
                self._logits_layer = nn.Linear(
                    self._encoder.output_size, self.num_classes,
                    **logit_kwargs)

        if self._hparams.initializer:
            initialize = get_initializer(self._hparams.initializer)
            assert initialize is not None
            if self._logits_layer is not None:
                initialize(self._logits_layer.weight)
                # Truth-testing a multi-element tensor raises an error;
                # compare against None explicitly.
                if self._logits_layer.bias is not None:
                    initialize(self._logits_layer.bias)

        self.is_binary = ((self.num_classes == 1) or
                          (self.num_classes <= 0 and
                           self._hparams.encoder.dim == 1))
Example #6
def get_embedding(num_embeds: Optional[int] = None,
                  init_value: Optional[torch.Tensor] = None,
                  hparams=None):
    r"""Creates embedding variable if not exists.

    Args:
        hparams (dict or HParams, optional): Embedding hyperparameters. Missing
            hyperparameters are set to default values. See
            :func:`~texar.torch.modules.default_embedding_hparams`
            for all hyperparameters and default values.

            If :attr:`init_value` is given, :attr:`hparams["initializer"]`,
            and :attr:`hparams["dim"]` are ignored.
        init_value (Tensor or numpy array, optional): Initial values of the
            embedding variable. If not given, embedding is initialized as
            specified in :attr:`hparams["initializer"]`.
        num_embeds (int, optional): The number of embedding items
            (e.g., vocabulary size). Required if :attr:`init_value` is
            not provided.

    Returns:
        A 2D :tensor:`Tensor` of the same shape with :attr:`init_value` or of
        the shape ``[num_embeds, hparams["dim"]]``.
    """
    if hparams is None or isinstance(hparams, dict):
        hparams = HParams(hparams, default_embedding_hparams())
    if init_value is None:
        initializer = layers.get_initializer(
            getattr(hparams, "initializer", None))
        # TODO Shibiao: add regularizer
        dim = hparams["dim"]
        if not isinstance(hparams["dim"], (list, tuple)):
            dim = [dim]
        embedding = torch.empty(size=[num_embeds] + dim)
        # Use the initializer resolved above if any; otherwise fall back
        # to Xavier uniform initialization.
        if initializer:
            initializer(embedding)
        else:
            torch.nn.init.xavier_uniform_(embedding)
    else:
        if torch.is_tensor(init_value):
            embedding = init_value  # Do not copy the tensor.
        else:
            embedding = torch.tensor(init_value, dtype=torch.float)

    return embedding
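
A minimal usage sketch for get_embedding (the values are arbitrary; a dict
passed as hparams is merged with default_embedding_hparams()):

    # Create a [vocab_size, dim] table initialized per hparams.
    table = get_embedding(num_embeds=10000, hparams={'dim': 256})

    # Or wrap existing weights; tensors are used as-is, without copying.
    table = get_embedding(init_value=torch.randn(10000, 256))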
Example #7

    def __init__(self,
                 token_embedder: Optional[TokenEmbedder] = None,
                 token_pos_embedder: Optional[TokenPosEmbedder] = None,
                 vocab_size: Optional[int] = None,
                 output_layer: Optional[Union[nn.Module, torch.Tensor]] = None,
                 hparams=None):
        super().__init__(token_embedder,
                         token_pos_embedder,
                         input_time_major=False,
                         output_time_major=False,
                         hparams=hparams)

        if token_pos_embedder is None and token_embedder is not None:
            warnings.warn(
                "Transformer models cannot capture positional information if "
                "no positional embedding is provided.")

        self._input_size = self._hparams.dim
        self._output_layer, self._vocab_size = _make_output_layer(
            output_layer, vocab_size, self._input_size,
            self._hparams.output_layer_bias)

        self.self_attns = nn.ModuleList()
        self.self_attn_layer_norm = nn.ModuleList()
        self.enc_dec_attns = nn.ModuleList()
        self.end_dec_attn_layer_norm = nn.ModuleList()
        self.poswise_networks = nn.ModuleList()
        self.poswise_layer_norm = nn.ModuleList()

        self.initialize_blocks()

        self.final_layer_norm = nn.LayerNorm(self._input_size,
                                             eps=self._hparams.eps)

        self.embed_dropout = nn.Dropout(self._hparams.embedding_dropout)
        self.residual_dropout = nn.Dropout(self._hparams.residual_dropout)

        if self._hparams.initializer:
            # TODO: This might be different from what TensorFlow does
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            # Do not re-initialize LayerNorm modules.
            for name, param in self.named_parameters():
                if name.split(
                        ".")[-1] == "weight" and "layer_norm" not in name:
                    initialize(param)
Example #8
    def __init__(self,
                 input_size: int,
                 hparams=None,
                 stores_relative_position: bool = False):
        super().__init__(hparams=hparams)

        use_bias = self._hparams.use_bias
        self.is_decoder = self._hparams.is_decoder
        self.stores_rpr = stores_relative_position
        self.relative_attention_num_buckets = \
            self._hparams.relative_attention_num_buckets

        self.Q_dense = nn.Linear(input_size,
                                 self._hparams.num_units,
                                 bias=use_bias)
        self.K_dense = nn.Linear(input_size,
                                 self._hparams.num_units,
                                 bias=use_bias)
        self.V_dense = nn.Linear(input_size,
                                 self._hparams.num_units,
                                 bias=use_bias)
        self.O_dense = nn.Linear(self._hparams.num_units,
                                 self._hparams.output_dim,
                                 bias=use_bias)

        if self.stores_rpr:
            self.relative_attention_bias = nn.Embedding(
                self.relative_attention_num_buckets, self._hparams.num_heads)

        if self._hparams.initializer:
            # TODO(haoransh): we may define kernel_initializer and bias
            #  initializer separately
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            for name, param in self.named_parameters():
                if name.split('.')[-1] == 'weight':
                    initialize(param)
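
When stores_relative_position is enabled, the module owns a bucketed
relative-position bias table of shape [num_buckets, num_heads]. A T5-style
sketch of how such a table is typically applied to attention scores (the
bucket ids and variable names are assumptions, not part of the code above):

    # buckets: relative-position bucket ids, shape [query_len, key_len]
    bias = self.relative_attention_bias(buckets)  # [q, k, num_heads]
    bias = bias.permute(2, 0, 1).unsqueeze(0)     # [1, num_heads, q, k]
    scores = scores + bias                        # broadcast over batch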
Example #9

    def __init__(self,
                 token_embedder: Optional[TokenEmbedder] = None,
                 token_pos_embedder: Optional[TokenPosEmbedder] = None,
                 vocab_size: Optional[int] = None,
                 output_layer: Optional[Union[nn.Module, torch.Tensor]] = None,
                 hparams=None):
        super().__init__(token_embedder,
                         token_pos_embedder,
                         input_time_major=False,
                         output_time_major=False,
                         hparams=hparams)

        if token_pos_embedder is None and token_embedder is not None:
            warnings.warn(
                "Transformer models cannot capture positional information if "
                "no positional embedding is provided.")

        self._input_size = self._hparams.dim
        self._output_layer, self._vocab_size = _make_output_layer(
            output_layer, vocab_size, self._input_size,
            self._hparams.output_layer_bias)

        self.self_attns = nn.ModuleList()
        self.self_attn_layer_norm = nn.ModuleList()
        self.enc_dec_attns = nn.ModuleList()
        self.end_dec_attn_layer_norm = nn.ModuleList()
        self.poswise_networks = nn.ModuleList()
        self.poswise_layer_norm = nn.ModuleList()

        if self._hparams.use_gpt_config:
            # GPT-2 uses a LayerNorm eps of 1e-5.
            eps = 1e-5
        else:
            # BERT uses a LayerNorm eps of 1e-12.
            eps = 1e-12

        for _ in range(self._hparams.num_blocks):
            attn_module = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            if self._hparams.dim != attn_module.output_size:
                raise ValueError("The output dimension of "
                                 "MultiheadEncoder should be equal "
                                 "to the dim of TransformerDecoder")
            self.self_attns.append(attn_module)
            self.self_attn_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))

            attn_module = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            if self._hparams.dim != attn_module.output_size:
                raise ValueError("The output dimension of "
                                 "MultiheadEncoder should be equal "
                                 "to the dim of TransformerDecoder")
            self.enc_dec_attns.append(attn_module)
            self.end_dec_attn_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))

            poswise_network = FeedForwardNetwork(
                hparams=self._hparams.poswise_feedforward)
            if (poswise_network.hparams.layers[-1]['kwargs']['out_features'] !=
                    self._hparams.dim):
                raise ValueError("The output dimension of "
                                 "FeedForwardNetwork should be equal "
                                 "to the dim of TransformerDecoder")
            self.poswise_networks.append(poswise_network)
            self.poswise_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))

        self.final_layer_norm = nn.LayerNorm(self._input_size, eps=eps)
        self.embed_dropout = nn.Dropout(self._hparams.embedding_dropout)
        self.residual_dropout = nn.Dropout(self._hparams.residual_dropout)

        if self._hparams.initializer:
            # TODO: This might be different from what TensorFlow does
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            # Do not re-initialize LayerNorm modules.
            for name, param in self.named_parameters():
                if name.split(
                        ".")[-1] == "weight" and "layer_norm" not in name:
                    initialize(param)
Example #10

    def __init__(self, hparams=None):
        super().__init__(hparams=hparams)
        self._input_size = self._hparams.dim
        self.self_attns = nn.ModuleList()
        if not self._hparams.use_bert_config:
            self.self_attn_layer_norm = nn.ModuleList()
        self.poswise_networks = nn.ModuleList()
        self.poswise_layer_norm = nn.ModuleList()
        self.output_layer_norm = nn.ModuleList()

        if self._hparams.use_bert_config:
            # BERT's TensorFlow implementation uses a LayerNorm eps of 1e-12.
            eps = 1e-12
        else:
            # Match the original Transformer's eps of 1e-6
            # (PyTorch's nn.LayerNorm default is 1e-5, not 1e-6).
            eps = 1e-6

        for _ in range(self._hparams.num_blocks):
            mh_attn = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            self.self_attns.append(mh_attn)
            if not self._hparams.use_bert_config:
                self.self_attn_layer_norm.append(
                    nn.LayerNorm(self._input_size, eps=eps))
            if self._hparams.dim != mh_attn.hparams.output_dim:
                raise ValueError(
                    'The "dim" in the hparams of '
                    '"multihead_attention" should be equal to the '
                    '"dim" of TransformerEncoder')

            pw_net = FeedForwardNetwork(
                hparams=self._hparams['poswise_feedforward'])

            final_dim = pw_net.hparams.layers[-1]['kwargs']['out_features']
            if self._hparams.dim != final_dim:
                raise ValueError('The output dimension of '
                                 '"poswise_feedforward" should be equal '
                                 'to the "dim" of TransformerEncoder.')

            self.poswise_networks.append(pw_net)
            self.poswise_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))
            if self._hparams.use_bert_config:
                self.output_layer_norm.append(
                    nn.LayerNorm(self._input_size, eps=eps))

        self.embed_dropout = nn.Dropout(p=self._hparams.embedding_dropout)
        self.residual_dropout = nn.Dropout(p=self._hparams.residual_dropout)

        if self._hparams.use_bert_config:
            self.input_normalizer = nn.LayerNorm(self._input_size, eps=eps)
        else:
            self.final_layer_norm = nn.LayerNorm(self._input_size, eps=eps)

        if self._hparams.initializer:
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            # Do not re-initialize LayerNorm modules.
            for name, param in self.named_parameters():
                if name.split(
                        '.')[-1] == 'weight' and 'layer_norm' not in name:
                    initialize(param)
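
A minimal end-to-end sketch for the encoder above (the hparams values are
illustrative; the forward call follows texar-torch's TransformerEncoder,
which takes already-embedded inputs plus sequence lengths):

    import torch
    from texar.torch.modules import TransformerEncoder

    encoder = TransformerEncoder(hparams={'dim': 512, 'num_blocks': 6})
    inputs = torch.randn(2, 10, 512)  # [batch, time, dim] embeddings
    lengths = torch.tensor([10, 8])
    outputs = encoder(inputs=inputs, sequence_length=lengths)  # [2, 10, 512]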