def __init__(self, in_features, hidden_features, head_num, attention_activation=None, feed_forward_activation=F.relu):
    """Encoder component.

    :param in_features: Length of the input features.
    :param hidden_features: Number of features inside feed-forward layer.
    :param head_num: Number of heads.
    :param attention_activation: Activation for attention layer.
    :param feed_forward_activation: Activation for feed-forward layer.
    """
    super(EncoderComponent, self).__init__()
    self.attention = AttentionWrapper(
        in_features,
        layer=MultiHeadAttention(
            in_features=in_features,
            head_num=head_num,
            activation=attention_activation,
        ),
    )
    self.feed_forward = BlockWrapper(
        in_features,
        layer=FeedForward(
            in_features=in_features,
            hidden_features=hidden_features,
            out_features=in_features,
            activation=feed_forward_activation,
        ),
    )
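# Hedged usage sketch (not from the original source): assumes EncoderComponent is the
# nn.Module built above and, as in the forward() shown at the end of this file, is
# called as component(x, mask=mask) on a (batch, seq_len, in_features) tensor.
import torch
import torch.nn.functional as F

encoder = EncoderComponent(
    in_features=64,
    hidden_features=256,
    head_num=8,
    attention_activation=None,
    feed_forward_activation=F.relu,
)
x = torch.randn(2, 10, 64)                     # (batch, seq_len, in_features)
mask = MultiHeadAttention.gen_history_mask(x)  # causal mask, as used further below
out = encoder(x, mask=mask)                    # expected shape: (2, 10, 64)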
def __init__(self, max_leaf, importer_size, item_size, dim, head_num=4, fusion_type="concat", act="relu", device="cpu"):
    super(AttentionalTreeEmbeddig, self).__init__()
    self.d = dim
    self.device = device
    if act == "relu":
        self.act = nn.LeakyReLU()
    elif act == "mish":
        self.act = Mish()
    self.fusion_type = fusion_type

    # embedding layers
    self.leaf_embedding = nn.Embedding(max_leaf, dim)
    self.user_embedding = nn.Embedding(importer_size, dim, padding_idx=0)
    self.user_embedding.weight.data[0] = torch.zeros(dim)
    self.item_embedding = nn.Embedding(item_size, dim, padding_idx=0)
    self.item_embedding.weight.data[0] = torch.zeros(dim)

    # attention layers
    self.attention_bolck = Attention(dim, dim, "sum").to(device)
    self.self_att = MultiHeadAttention(dim, head_num).to(device)
    self.fusion_att = FusionAttention(dim)

    # hidden & output layers
    self.fussionlayer = nn.Linear(dim * 3, dim)
    self.hidden = nn.Linear(dim, dim)
    self.output_cls_layer = nn.Linear(dim, 1)
    self.output_reg_layer = nn.Linear(dim, 1)
def test_history_only(self):
    x = torch.Tensor([[
        [0.2, 0.3, 0.4, 0.6, 0.5],
        [0.4, 0.7, 0.2, 0.6, 0.9],
        [0.3, 0.5, 0.8, 0.9, 0.1],
        [0.2, 0.3, 0.4, 0.6, 0.5],
        [0.1, 0.2, 0.3, 0.4, 0.5],
    ]])
    mask = MultiHeadAttention.gen_history_mask(x)
    y = ScaledDotProductAttention()(x, x, x, mask)[0]
    # Rows 0 and 3 of the input are identical, but the causal mask lets position 3
    # attend to earlier positions as well, so the two outputs must differ.
    self.assertFalse(y[0].allclose(y[3]), y)
    # Position 0 can only attend to itself, so its output equals its input row.
    self.assertTrue(y[0].allclose(x[0, 0]), y[0])
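# Hedged illustration (not from the original source): the test above relies on
# gen_history_mask producing a causal mask, so position i attends only to positions
# 0..i. Assuming a (batch, seq_len, seq_len) layout with 1 marking visible entries,
# the mask would match a lower-triangular matrix:
import torch

x = torch.zeros(1, 5, 5)                              # same shape as the test input
expected = torch.tril(torch.ones(5, 5)).unsqueeze(0)  # (1, seq_len, seq_len)
mask = MultiHeadAttention.gen_history_mask(x)
# If the layout assumption holds, torch.equal(mask.float(), expected) is True.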
def test_same_output_history_only(self):
    batch_size, seq_len, feature_dim, head_num = 7, 12, 16, 4
    weights = np.random.standard_normal((feature_dim, feature_dim * 4))
    bias = np.random.standard_normal((feature_dim * 4,))
    torch_net = self.get_torch_layer_with_weights(feature_dim, head_num, weights, bias)
    keras_net = self.get_keras_layer_with_weights(seq_len, feature_dim, head_num, weights, bias, True)
    allclose_count = 0
    for _ in range(100):
        x = np.random.standard_normal((batch_size, seq_len, feature_dim))
        y = keras_net.predict(x)
        x = torch.from_numpy(x)
        y_hat = torch_net(x, x, x, MultiHeadAttention.gen_history_mask(x))
        if np.allclose(y, y_hat.detach().numpy(), rtol=0.0, atol=1e-4):
            allclose_count += 1
    self.assertGreaterEqual(allclose_count, 98)
def __init__(self, opt):
    super(attnneudeftb, self).__init__()
    self.vocab_size = opt.vocab_size
    self.term_hidden_size = opt.term_size
    self.use_cuda = opt.cuda
    self.word_emb = nn.Embedding(opt.vocab_size, opt.term_size, padding_idx=0)
    if opt.init_emb is not None:
        self.word_emb.weight.data.copy_(opt.init_emb.data)
    self.multiheadattn = MultiHeadAttention(in_features=opt.term_size, head_num=opt.n_head)

    # knrm parameters
    tensor_mu = torch.FloatTensor(opt.mu)
    tensor_sigma = torch.FloatTensor(opt.sigma)
    if opt.cuda:
        tensor_mu = tensor_mu.cuda()
        tensor_sigma = tensor_sigma.cuda()
    self.mu = Variable(tensor_mu, requires_grad=False).view(1, 1, 1, opt.n_bins)
    self.sigma = Variable(tensor_sigma, requires_grad=False).view(1, 1, 1, opt.n_bins)

    # dense layers
    self.transform_dcq = nn.Linear(self.term_hidden_size, self.term_hidden_size)
    self.qddense = nn.Linear(opt.n_bins, 1, 1)
    self.qbdense = nn.Linear(opt.n_bins, 1, 1)
    self.qcqdense = nn.Linear(opt.n_bins, 1, 1)
    self.dcqdense = nn.Linear(opt.n_bins, 1, 1)
    self.bcqdense = nn.Linear(opt.n_bins, 1, 1)
    self.exp_combine = nn.Linear(2, 1)  # (d,cq), (b,cq)
    self.combine = nn.Linear(3, 1)      # qd, qe, qb
def get_torch_layer_with_weights(feature_dim, head_num, weights, bias):
    layer = MultiHeadAttention(feature_dim, head_num)
    # `weights` holds four (feature_dim, feature_dim) blocks side by side, in the order
    # query, key, value, output. PyTorch stores Linear weights as (out_features,
    # in_features), hence the transpose when copying each block in.
    layer.linear_q.weight = torch.nn.Parameter(
        torch.from_numpy(weights[:, :feature_dim]).transpose(1, 0))
    layer.linear_q.bias = torch.nn.Parameter(
        torch.from_numpy(bias[:feature_dim]))
    layer.linear_k.weight = torch.nn.Parameter(
        torch.from_numpy(weights[:, feature_dim:feature_dim * 2]).transpose(1, 0))
    layer.linear_k.bias = torch.nn.Parameter(
        torch.from_numpy(bias[feature_dim:feature_dim * 2]))
    layer.linear_v.weight = torch.nn.Parameter(
        torch.from_numpy(weights[:, feature_dim * 2:feature_dim * 3]).transpose(1, 0))
    layer.linear_v.bias = torch.nn.Parameter(
        torch.from_numpy(bias[feature_dim * 2:feature_dim * 3]))
    layer.linear_o.weight = torch.nn.Parameter(
        torch.from_numpy(weights[:, -feature_dim:]).transpose(1, 0))
    layer.linear_o.bias = torch.nn.Parameter(
        torch.from_numpy(bias[-feature_dim:]))
    return layer
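# Hedged sketch (not from the original source): illustrates the column layout the
# helper above assumes for `weights`: four (feature_dim, feature_dim) blocks in the
# order query, key, value, output, matching the slices copied into linear_q,
# linear_k, linear_v and linear_o.
import numpy as np

feature_dim = 16
weights = np.random.standard_normal((feature_dim, feature_dim * 4))
bias = np.random.standard_normal((feature_dim * 4,))
w_q, w_k, w_v, w_o = np.split(weights, 4, axis=1)  # each (feature_dim, feature_dim)
b_q, b_k, b_v, b_o = np.split(bias, 4)             # each (feature_dim,)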
def test_divisible(self):
    # in_features must be divisible by head_num; 73 is not divisible by 5.
    with self.assertRaises(ValueError):
        MultiHeadAttention(in_features=73, head_num=5)
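# Hedged follow-up (not from the original source): construction is expected to succeed
# when in_features is divisible by head_num, presumably giving each head
# in_features // head_num features.
layer = MultiHeadAttention(in_features=75, head_num=5)  # 75 % 5 == 0, so no ValueError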
def forward(self, x):
    # Apply the same causal (history-only) mask at every layer so each position
    # can only attend to itself and earlier positions.
    mask = MultiHeadAttention.gen_history_mask(x)
    for component in self.components:
        x = component(x, mask=mask)
    return x
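# Hedged check (not from the original source): with the history mask above, the output
# at position i should not depend on later positions. A standalone probe using only
# names that appear elsewhere in this file:
import torch

attn = MultiHeadAttention(in_features=8, head_num=2)
attn.eval()                        # disable any stochastic behaviour, if present
x = torch.randn(1, 6, 8)
y1 = attn(x, x, x, MultiHeadAttention.gen_history_mask(x))
x2 = x.clone()
x2[:, -1] = 0.0                    # perturb only the last position
y2 = attn(x2, x2, x2, MultiHeadAttention.gen_history_mask(x2))
# Earlier positions are expected to be unaffected by the change at the last position:
# torch.allclose(y1[:, :-1], y2[:, :-1]) should be True.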