import matplotlib.pyplot as plt
import numpy as np
import torch


def test_positional_encoding():
    # Plot a few embedding dimensions of the positional encoding over the
    # first 100 positions.
    plt.figure(figsize=(15, 5))
    pe = PositionalEncoding(20, 0)
    # torch.autograd.Variable is deprecated; plain tensors work directly.
    y = pe.forward(torch.zeros(1, 100, 20))
    y_labels = y[0, :, 4:10].detach().numpy()
    # y_labels = y[0, :, 10:14].detach().numpy()
    print(y_labels)
    plt.plot(np.arange(100), y_labels)
    plt.legend(["dim %d" % p for p in range(4, 10)])
    # plt.legend(["dim %d" % p for p in range(10, 14)])
    plt.show()

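`PositionalEncoding` is used throughout these snippets but never defined in them (and the constructor signatures differ between snippets). A minimal sketch, assuming the standard sinusoidal formulation from "Attention Is All You Need" with the `(d_model, dropout)` constructor matching the test above:

import math
import torch
import torch.nn as nn


class PositionalEncoding(nn.Module):
    """Sketch of the sinusoidal encoding (an assumption, not any of the
    repos' code): PE(pos, 2i)   = sin(pos / 10000^(2i/d_model)),
                  PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))."""

    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # non-trainable lookup table of shape (1, max_len, d_model)
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        # add the encoding for the first seq_len positions to the embeddings
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)
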
def __init__(self, in_dim, out_dim, N, heads, model_dim, key_dim, value_dim,
             ff_dim, max_len=10000, batch_first=True):
    super().__init__()
    self.name = 'transformer'
    self.batch_first = batch_first
    self.model_dim = model_dim

    # define layers
    # embedding layers
    self.src_embed = nn.Linear(in_dim, model_dim)
    self.tgt_embed = nn.Linear(in_dim, model_dim)
    self.pos_enc = PositionalEncoding(model_dim, max_len)

    # encoder-decoder
    self.encoder = Encoder(N, heads, model_dim, key_dim, value_dim, ff_dim)
    self.decoder = Decoder(N, heads, model_dim, key_dim, value_dim, ff_dim)

    # final output layer
    self.fc = nn.Linear(model_dim, out_dim)

    # Xavier initialization
    for p in self.parameters():
        if p.dim() > 1 and p.requires_grad:
            nn.init.xavier_uniform_(p)

def create_model(seq_len, vocab_size, pad_id, N, d_model, d_ff, h, dropout):
    inp = Input((seq_len,))
    embedding = Embedding(vocab_size, d_model, pad_id)(inp)
    encoding = PositionalEncoding(d_model)(inp)
    net = Add()([embedding, encoding])
    net = Dropout(dropout)(net)
    mask = Lambda(lambda t: create_padding_mask(t, pad_id),
                  name="input_mask")(inp)
    net = Encoder(N=N, d_model=d_model, d_ff=d_ff, h=h,
                  dropout=dropout)([net, mask])
    net = Flatten()(net)
    net = Dense(2, activation="softmax")(net)
    model = Model(inp, net)

    # NOTE: Keras optimizers cannot be saved with their optimizer state,
    # so an optimizer from `tf.train` is used instead.
    # NOTE: this is a TF 1.x issue; in TF 2.x the `tf.train` optimizers are
    # dropped and the Keras versions are the only implementations.
    # NOTE: a fixed learning rate is not recommended for training; the paper
    # authors describe a variable learning-rate schedule that still needs to
    # be implemented (see the sketch after this function).
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9,
                                       beta2=0.98, epsilon=1e-9)
    model.compile(optimizer=optimizer, loss="categorical_crossentropy",
                  metrics=["acc"])
    return model

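The schedule the last NOTE refers to is the warmup/decay rule from the paper: lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5). A minimal sketch of the rate computation only (the function name and the 4000-step warmup default are taken from the paper; how it is wired into the TF 1.x optimizer is left open):

def transformer_learning_rate(step, d_model, warmup_steps=4000):
    """lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5),
    Eq. 3 in "Attention Is All You Need"."""
    step = max(step, 1)  # guard against step 0
    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)


# With d_model=512 the rate increases linearly over the first 4000 steps,
# then decays proportionally to the inverse square root of the step number.
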
def test_positional_encoding_sinusoid(self):
    """
    https://github.com/lilianweng/transformer-tensorflow/blob/master/transformer_test.py#L96
    """
    positional_encoding = PositionalEncoding(8)
    with self.test_session() as sess:
        encoding = sess.run(positional_encoding.call(self.raw_input),
                            feed_dict={self.raw_input: self.fake_data})

    assert encoding.shape == (4, 5, 8)
    # The encoding depends only on position, never on the input values,
    # so every batch element receives the same table.
    np.testing.assert_array_equal(encoding[0], encoding[1])
    np.testing.assert_array_equal(encoding[0], encoding[2])
    np.testing.assert_array_equal(encoding[0], encoding[3])

    # single position
    np.testing.assert_array_equal(
        encoding[0][0],
        np.array([
            np.sin(0), np.cos(0), np.sin(0), np.cos(0),
            np.sin(0), np.cos(0), np.sin(0), np.cos(0)
        ]))

    # multiple positions in a single dimension
    # NOTE: / 6.0 instead of / 8.0 because this implementation derives the
    # exponent from `num_channels - 1` rather than `num_channels`.
    # Only the first 3 values match exactly; the 4th differs slightly in
    # float32 and would fail the test, so it is excluded.
    np.testing.assert_array_equal(
        encoding[0][:, 2][:3],
        np.array([
            np.sin(0),
            np.sin(1 / np.power(10000.0, 2.0 / 6.0)),
            np.sin(2 / np.power(10000.0, 2.0 / 6.0)),
            # np.sin(3 / np.power(10000.0, 2.0 / 6.0)),
            # np.sin(4 / np.power(10000.0, 2.0 / 6.0)),
        ]).astype(np.float32))

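For reference, a small NumPy sketch (an assumption, not the tested repo's code) of a table that reproduces the values this test checks, using interleaved sin/cos and tensor2tensor-style timescales; for 8 channels this gives the 2.0 / 6.0 exponent above at channel 2:

import numpy as np


def sinusoid_table(num_positions, num_channels):
    """Interleaved sin/cos with inv_timescale[i] = 10000^(-i / (T - 1)),
    where T = num_channels // 2; assumes num_channels is even and >= 4."""
    num_timescales = num_channels // 2
    i = np.arange(num_timescales, dtype=np.float64)
    inv_timescales = np.power(10000.0, -i / (num_timescales - 1))
    positions = np.arange(num_positions, dtype=np.float64)[:, None]
    angles = positions * inv_timescales[None, :]   # (positions, T)
    table = np.empty((num_positions, num_channels), dtype=np.float32)
    table[:, 0::2] = np.sin(angles)                # even channels: sin
    table[:, 1::2] = np.cos(angles)                # odd channels: cos
    return table


# sinusoid_table(5, 8)[0] is [sin(0), cos(0), ...] and column 2 equals
# sin(pos / 10000^(2.0 / 6.0)), matching the assertions above.
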
def __init__(self, config):
    super(TransEncoder, self).__init__(config)
    self.config = config
    # w2s: sequential (LSTM) representation over word embeddings
    self.w2s = SequentialRepr(config, input_dim=config.embed_dim, mode="lstm")
    self.pe = PositionalEncoding(config.hidden_dim, config.dropout)
    # s2d: transformer encoder over the sentence representations
    self.s2d = make_model(N=config.num_layers, d_model=config.hidden_dim,
                          dropout=config.dropout)
    self.layer_norm = LayerNorm(config.hidden_dim)
    self.add_att = AttNet(config, config.hidden_dim)
    self.fc = nn.Linear(config.hidden_dim * 2, config.num_classes)

def __init__(self, in_dim, out_dim, N, heads, embed_dim, model_dim, key_dim,
             value_dim, ff_dim, dropout=0.1, max_len=10000, batch_first=True,
             pretrained_vec=None):
    super().__init__()
    self.name = 'transformer'
    self.batch_first = batch_first
    self.model_dim = model_dim
    self.embed_dim = embed_dim

    # define layers
    self.embedding = nn.Embedding(in_dim, embed_dim)
    # do not train the embedding layer if a pretrained embedding is provided
    if pretrained_vec is not None:
        # from_pretrained is a classmethod, so call it on the class
        self.embedding = nn.Embedding.from_pretrained(pretrained_vec,
                                                      freeze=True)
    # project embeddings to the model dimension when the two differ
    if embed_dim != model_dim:
        self.fc_in = nn.Linear(embed_dim, model_dim)
    self.pos_enc = PositionalEncoding(model_dim, max_len)
    self.encoder = Encoder(N, heads, model_dim, key_dim, value_dim, ff_dim,
                           dropout=dropout)

    # final output layer
    self.fc = nn.Linear(model_dim, out_dim)

    # Xavier initialization (skips the frozen embedding, whose parameters
    # have requires_grad=False)
    for p in self.parameters():
        if p.dim() > 1 and p.requires_grad:
            nn.init.xavier_uniform_(p)

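A hypothetical instantiation of this encoder-only classifier, just to show how the constructor arguments map onto the layers; the class name and all sizes here are illustrative, not taken from the original repo:

# Hypothetical usage; TransformerClassifier and the sizes are assumptions.
model = TransformerClassifier(
    in_dim=30000,        # vocabulary size for nn.Embedding
    out_dim=2,           # number of output classes
    N=6, heads=8,
    embed_dim=300,       # e.g. 300-d vectors passed via pretrained_vec
    model_dim=512,       # fc_in projects 300 -> 512 since the dims differ
    key_dim=64, value_dim=64, ff_dim=2048,
    dropout=0.1)
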
def make_model_elmo(N=6, d_model=1024, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embedder(), c(position)),
        nn.Sequential(Embedder(), c(position)),
        generator=None)

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            # nn.init.xavier_uniform is deprecated; use the in-place variant
            nn.init.xavier_uniform_(p)
    return model

def __init__(self, config):
    super(TransEncoder, self).__init__(config)
    self.sent_repr_dim = config.hidden_dim
    # w2s: sequential (LSTM) representation over word embeddings
    self.w2s = SequentialRepr(config, input_dim=config.embed_dim, mode="lstm")
    # alternative representations, kept commented out for reference:
    # self.w2s_tl = SequentialRepr(config,
    #                              input_dim=config.embed_dim, mode="lstm")
    # self.s2d = SequentialRepr(config,
    #                           input_dim=config.hidden_dim, mode="lstm")
    self.pe = PositionalEncoding(self.sent_repr_dim, config.dropout)
    # s2d: transformer encoder over the sentence representations
    self.s2d = make_model(N=config.num_layers, d_model=self.sent_repr_dim,
                          dropout=config.dropout)
    self.satt_layer = AttNet(config, config.hidden_dim)
    self.datt_layer = AttNet(config, config.hidden_dim * 2)
    self.dropout = nn.Dropout(p=config.dropout)
    self.fc = nn.Linear(config.hidden_dim * 2, config.num_classes)

def __init__(self, visual_model, num_visual_features, textual_features,
             vocab_size, pad_token_id, max_len=49, encoding_drop=0.1, N=6,
             heads=8, attn_drop=0.1, ff_drop=0.1, d_ff=2048,
             activation='GELU'):
    super().__init__()
    self.visual_backbone = visual_model
    # text-side transformer stack
    self.th = Xcoder(True, N, textual_features, h=heads, d_ff=d_ff,
                     ff_drop=ff_drop, attn_drop=attn_drop,
                     activation=activation)
    self.visual_features = []
    # project backbone features to the textual dimension
    self.lin_projection = nn.Linear(num_visual_features, textual_features)
    self.embed = WordEmbedding(vocab_size, textual_features,
                               padding_index=pad_token_id)
    self.pos_enc = PositionalEncoding(textual_features, max_len,
                                      encoding_drop)
    # capture intermediate backbone activations via a forward hook
    self._register_hook(self.visual_backbone,
                        partial(self.hook_function, self.visual_features))
    # output projection, weight-tied to the input embedding
    self.lin_out = nn.Linear(textual_features, vocab_size)
    self.lin_out.weight = self.embed.emb.weight
    self.pad_tok_id = pad_token_id

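The `_register_hook` / `hook_function` helpers are not shown in the snippet. A minimal sketch of what they presumably do, using only PyTorch's public register_forward_hook API; the bodies below are assumptions, only the names and the partial(...) call pattern come from the code above:

class _HookMixin:
    # Hypothetical reconstruction of the two helpers used above.
    def _register_hook(self, module, hook_fn):
        # PyTorch calls hook_fn(module, inputs, output) after every
        # forward pass of `module`.
        module.register_forward_hook(hook_fn)

    def hook_function(self, storage, module, inputs, output):
        # partial(self.hook_function, self.visual_features) binds `storage`,
        # so PyTorch supplies the remaining (module, inputs, output) args.
        storage.clear()        # keep only the latest batch's features
        storage.append(output)
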
(x_train, y_train), (x_test, y_test) = keras.datasets.reuters.load_data()

# Take a smaller subset for a quick run; use -1 to keep (almost) all of it.
test_size = -1000
(x_train, y_train) = (x_train[:test_size], y_train[:test_size])
(x_test, y_test) = (x_test[:int(test_size / 10)], y_test[:int(test_size / 10)])

x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen=MAX_LEN)

x = keras.layers.Input((MAX_LEN,))
y = keras.layers.Embedding(MAX_WORD * 10, 64)(x)
y = PositionalEncoding()(K.concatenate([y, y, y]))
y = K.reshape(y, (-1, MAX_LEN))
y = MultiHeadAttention(8, 64)(y)
y = EncoderBlock()(y)  # (None, max_len, n_head, dim_k)
y = keras.layers.Flatten()(y)
y = keras.layers.Dense(46, activation='softmax')(y)  # 46 Reuters topics

model = keras.Model(inputs=[x], outputs=[y])
model.compile(loss=keras.losses.sparse_categorical_crossentropy,
              optimizer=tf.train.AdamOptimizer(),
              metrics=[keras.metrics.sparse_categorical_accuracy])
model.fit(x_train, y_train, epochs=1)