Example #1
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.autograd import Variable

# PositionalEncoding is assumed to be imported from the surrounding module


def test_positional_encoding():
    # plot a handful of encoding dimensions over the first 100 positions
    plt.figure(figsize=(15, 5))
    pe = PositionalEncoding(20, 0)
    y = pe.forward(Variable(torch.zeros(1, 100, 20)))
    y_labels = y[0, :, 4:10].data.numpy()
    # y_labels = y[0, :, 10:14].data.numpy()
    print(y_labels)
    plt.plot(np.arange(100), y_labels)
    plt.legend(["dim %d" % p for p in range(4, 10)])
    # plt.legend(["dim %d" % p for p in range(10, 14)])
    plt.show()
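The PositionalEncoding(20, 0) call above takes (d_model, dropout) and is defined in that example's own repository. For reference, a minimal sketch of the standard sinusoidal encoding such a module typically implements, following the "Attention Is All You Need" formulation (the actual class in that repository may differ in details):

import math
import torch
import torch.nn as nn


class PositionalEncoding(nn.Module):
    """Sketch of a sinusoidal positional encoding added to the input embeddings."""

    def __init__(self, d_model, dropout=0.0, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # wavelengths form a geometric progression from 2*pi to 10000*2*pi
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions
        pe[:, 1::2] = torch.cos(position * div_term)  # odd dimensions
        self.register_buffer("pe", pe.unsqueeze(0))   # (1, max_len, d_model)

    def forward(self, x):
        # x: (batch, seq_len, d_model); positions are added, then dropout
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)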
Example #2
    def __init__(self,
                 in_dim,
                 out_dim,
                 N,
                 heads,
                 model_dim,
                 key_dim,
                 value_dim,
                 ff_dim,
                 max_len=10000,
                 batch_first=True):

        super().__init__()
        self.name = 'transformer'

        self.batch_first = batch_first
        self.model_dim = model_dim

        # define layers
        # embedding layers
        self.src_embed = nn.Linear(in_dim, model_dim)
        self.tgt_embed = nn.Linear(in_dim, model_dim)
        self.pos_enc = PositionalEncoding(model_dim, max_len)
        # encoder-decoder
        self.encoder = Encoder(N, heads, model_dim, key_dim, value_dim, ff_dim)
        self.decoder = Decoder(N, heads, model_dim, key_dim, value_dim, ff_dim)
        # final output layer
        self.fc = nn.Linear(model_dim, out_dim)

        # xavier initialization
        for p in self.parameters():
            if p.dim() > 1 and p.requires_grad:
                nn.init.xavier_uniform_(p)
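The constructor above only wires up the layers; the example stops before the rest of the class. A hypothetical forward sketch (an assumption, not from the source repository; the real Encoder/Decoder call signatures may differ) showing how these layers are typically composed:

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        # hypothetical composition of the layers defined in __init__ above;
        # the original forward method is not part of this example
        src = self.pos_enc(self.src_embed(src))  # embed source, add positions
        tgt = self.pos_enc(self.tgt_embed(tgt))  # embed target, add positions
        memory = self.encoder(src, src_mask)     # assumed signature
        out = self.decoder(tgt, memory, src_mask, tgt_mask)  # assumed signature
        return self.fc(out)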
Example #3
def create_model(seq_len, vocab_size, pad_id, N, d_model, d_ff, h, dropout):
    inp = Input((seq_len, ))
    embedding = Embedding(vocab_size, d_model, pad_id)(inp)
    encoding = PositionalEncoding(d_model)(inp)
    net = Add()([embedding, encoding])
    net = Dropout(dropout)(net)
    mask = Lambda(lambda t: create_padding_mask(t, pad_id),
                  name="input_mask")(inp)
    net = Encoder(N=N, d_model=d_model, d_ff=d_ff, h=h,
                  dropout=dropout)([net, mask])
    net = Flatten()(net)
    net = Dense(2, activation="softmax")(net)

    model = Model(inp, net)

    # NOTE: keras optimizers cannot be saved with optimizer state
    # need to use an optimizer from `tf.train`
    # NOTE: this seems to be a 1.0 thing, in 2.0 all tf.train optimizers are
    # dropped and the keras versions are the only implementations
    # NOTE: this is not recommended for training, the paper authors describe
    # a variable learning rate schedule, that still needs to be implemented.
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001,
                                       beta1=0.9,
                                       beta2=0.98,
                                       epsilon=1e-9)

    model.compile(optimizer=optimizer,
                  loss="categorical_crossentropy",
                  metrics=["acc"])

    return model
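The last NOTE above refers to the variable learning-rate schedule from "Attention Is All You Need" (Section 5.3): lrate = d_model^(-0.5) * min(step^(-0.5), step * warmup_steps^(-1.5)). A minimal sketch of that schedule as a plain Python helper (hypothetical; it is not part of the example above and would still need to be hooked into the optimizer, e.g. via a LearningRateScheduler-style callback or a custom training loop):

def noam_learning_rate(step, d_model=512, warmup_steps=4000):
    """Linear warmup followed by inverse-square-root decay, per the paper."""
    step = max(step, 1)  # guard against step 0
    return (d_model ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)

With the paper's defaults (d_model=512, warmup_steps=4000) the rate peaks at roughly 7e-4 around step 4000 and decays afterwards.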
Example #4
    def test_positional_encoding_sinusoid(self):
        """
        https://github.com/lilianweng/transformer-tensorflow/blob/master/transformer_test.py#L96
        """
        positional_encoding = PositionalEncoding(8)
        with self.test_session() as sess:
            encoding = sess.run(positional_encoding.call(self.raw_input),
                                feed_dict={self.raw_input: self.fake_data})
            assert encoding.shape == (4, 5, 8)

            np.testing.assert_array_equal(encoding[0], encoding[1])
            np.testing.assert_array_equal(encoding[0], encoding[2])
            np.testing.assert_array_equal(encoding[0], encoding[3])

            # single position
            np.testing.assert_array_equal(
                encoding[0][0],
                np.array([
                    np.sin(0),
                    np.cos(0),
                    np.sin(0),
                    np.cos(0),
                    np.sin(0),
                    np.cos(0),
                    np.sin(0),
                    np.cos(0)
                ]))

            # multiple positions in a single dimension
            # NOTE: / 6.0 instead of / 8.0 because of the difference in taking
            # `num_channels - 1` instead of `num_channels` in the exponent.
            # Only the first 3 values match exactly; the 4th does not, so it is
            # left commented out below (including it would fail the test).
            np.testing.assert_array_equal(
                encoding[0][:, 2][:3],
                np.array([
                    np.sin(0),
                    np.sin(1 / np.power(10000.0, 2.0 / 6.0)),
                    np.sin(2 / np.power(10000.0, 2.0 / 6.0)),
                    # np.sin(3 / np.power(10000.0, 2.0 / 6.0)),
                    # np.sin(4 / np.power(10000.0, 2.0 / 6.0)),
                ]).astype(np.float32))
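The NOTE in the test above concerns the exponent of the 10000 base: the expected values are computed with 2.0 / 6.0, whereas using num_channels directly would give 2.0 / 8.0. A standalone comparison of the sine values the two exponents produce for the first few positions (purely illustrative, not part of the test):

import numpy as np

positions = np.arange(4, dtype=np.float64)
print(np.sin(positions / np.power(10000.0, 2.0 / 6.0)))  # exponent used in the test
print(np.sin(positions / np.power(10000.0, 2.0 / 8.0)))  # exponent from num_channels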
Example #5
    def __init__(self, config):
        super(TransEncoder, self).__init__(config)

        self.config = config

        # w2s: LSTM encoder over word embeddings
        self.w2s = SequentialRepr(config, input_dim=config.embed_dim,
                                  mode="lstm")
        self.pe = PositionalEncoding(config.hidden_dim, config.dropout)

        # s2d: transformer built by make_model over the sentence-level sequence
        self.s2d = make_model(N=config.num_layers, d_model=config.hidden_dim,
                              dropout=config.dropout)

        self.layer_norm = LayerNorm(config.hidden_dim)

        self.add_att = AttNet(config, config.hidden_dim)
        self.fc = nn.Linear(config.hidden_dim * 2, config.num_classes)
Example #6
    def __init__(self,
                 in_dim,
                 out_dim,
                 N,
                 heads,
                 embed_dim,
                 model_dim,
                 key_dim,
                 value_dim,
                 ff_dim,
                 dropout=0.1,
                 max_len=10000,
                 batch_first=True,
                 pretrained_vec=None):

        super().__init__()
        self.name = 'transformer'

        self.batch_first = batch_first
        self.model_dim = model_dim
        self.embed_dim = embed_dim

        # define layers
        self.embedding = nn.Embedding(in_dim, embed_dim)
        # not training embedding layer if pretrained embedding is provided
        if pretrained_vec is not None:
            self.embedding = self.embedding.from_pretrained(pretrained_vec,
                                                            freeze=True)
        if embed_dim != model_dim:
            self.fc_in = nn.Linear(embed_dim, model_dim)
        self.pos_enc = PositionalEncoding(model_dim, max_len)
        self.encoder = Encoder(N,
                               heads,
                               model_dim,
                               key_dim,
                               value_dim,
                               ff_dim,
                               dropout=dropout)
        # final output layer
        self.fc = nn.Linear(model_dim, out_dim)

        # xavier initialization
        for p in self.parameters():
            if p.dim() > 1 and p.requires_grad:
                nn.init.xavier_uniform_(p)
Example #7
def make_model_elmo(N=6, d_model=1024, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), 
                             c(ff), dropout), N),
        nn.Sequential(Embedder(), c(position)),
        nn.Sequential(Embedder(), c(position)),
        generator=None)
    
    # This was important from their code. 
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
Example #8
    def __init__(self, config):
        super(TransEncoder, self).__init__(config)

        self.sent_repr_dim = config.hidden_dim
        self.w2s = SequentialRepr(config, input_dim=config.embed_dim,
                                  mode="lstm")
        # self.w2s_tl = SequentialRepr(config, input_dim=config.embed_dim,
        #                              mode="lstm")

        # self.s2d = SequentialRepr(config, input_dim=config.hidden_dim,
        #                           mode="lstm")

        self.pe = PositionalEncoding(self.sent_repr_dim, config.dropout)
        self.s2d = make_model(N=config.num_layers, d_model=self.sent_repr_dim,
                              dropout=config.dropout)

        self.satt_layer = AttNet(config, config.hidden_dim)
        self.datt_layer = AttNet(config, config.hidden_dim * 2)

        self.dropout = nn.Dropout(p=config.dropout)
        self.fc = nn.Linear(config.hidden_dim * 2, config.num_classes)
Example #9
    def __init__(self,
                 visual_model,
                 num_visual_features,
                 textual_features,
                 vocab_size,
                 pad_token_id,
                 max_len=49,
                 encoding_drop=0.1,
                 N=6,
                 heads=8,
                 attn_drop=0.1,
                 ff_drop=0.1,
                 d_ff=2048,
                 activation='GELU'):
        super().__init__()
        self.visual_backbone = visual_model
        self.th = Xcoder(True,
                         N,
                         textual_features,
                         h=heads,
                         d_ff=d_ff,
                         ff_drop=ff_drop,
                         attn_drop=attn_drop,
                         activation=activation)
        # visual features are collected by a hook registered on the backbone
        self.visual_features = []
        self.lin_projection = nn.Linear(num_visual_features, textual_features)
        self.embed = WordEmbedding(vocab_size,
                                   textual_features,
                                   padding_index=pad_token_id)
        self.pos_enc = PositionalEncoding(textual_features, max_len,
                                          encoding_drop)
        self._register_hook(self.visual_backbone,
                            partial(self.hook_function, self.visual_features))
        self.lin_out = nn.Linear(textual_features, vocab_size)
        # tie the output projection weights to the input embedding matrix
        self.lin_out.weight = self.embed.emb.weight
        self.pad_tok_id = pad_token_id
Example #10

(x_train, y_train), (x_test, y_test) = keras.datasets.reuters.load_data()
test_size = -1000  # -1 for all
(x_train, y_train) = (x_train[:test_size], y_train[:test_size])
(x_test, y_test) = (x_test[:int(test_size / 10)], y_test[:int(test_size / 10)])

x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen=MAX_LEN)

x = keras.layers.Input((MAX_LEN, ))
y = keras.layers.Embedding(MAX_WORD * 10, 64)(x)
y = PositionalEncoding()(K.concatenate([y, y, y]))
y = K.reshape(y, (-1, MAX_LEN))
y = MultiHeadAttention(8, 64)(y)
y = EncoderBlock()(y)  # (None , max_len, n_head, dim_k)
y = keras.layers.Flatten()(y)
y = keras.layers.Dense(46, activation='softmax')(y)

model = keras.Model(inputs=[x], outputs=[y])
model.compile(loss=keras.losses.sparse_categorical_crossentropy,
              optimizer=tf.train.AdamOptimizer(),
              metrics=[keras.metrics.sparse_categorical_accuracy])

model.fit(x_train, y_train, epochs=1)