def step(self, y_prev, mask, state, keys, values, key_mask, domain_keys,
         domain_annot):
    """Run one decoder step that blends a source and a domain context.

    The first GRU (``self.cell1``) consumes ``y_prev`` and ``state``. The
    resulting intermediate state attends once over ``keys``/``values`` and
    once over ``domain_keys``/``domain_annot`` (both with ``key_mask``).
    A gate produced by ``nn.feedforward`` mixes the two context vectors,
    and the second GRU (``self.cell2``) consumes the mixed context.

    Returns:
        ``(next_state, context)`` where ``context`` is the gated mix.
    """
    # Add a trailing axis so the mask can multiply the state tensors.
    step_mask = mask[:, None]

    # s_j^{\prime} = GRU^1(y_{j-1}, s_{j-1})
    _, gru1_state = self.cell1(y_prev, state, scope="gru1")
    # Where the mask is 0 the incoming state is carried over unchanged.
    hidden = (1.0 - step_mask) * state + step_mask * gru1_state

    # c_j = att(H, s_j^{\prime})
    src_weights = attention(hidden, keys, key_mask,
                            self.dim_hid, self.dim_key)
    src_context = T.sum(src_weights[:, :, None] * values, 0)

    # Second attention, over the domain keys/annotations.
    dom_weights = attention(hidden, domain_keys, key_mask,
                            self.dim_hid, self.dim_key,
                            scope="domain_context")
    dom_context = T.sum(dom_weights[:, :, None] * domain_annot, 0)

    # The gate is computed from the intermediate state and both contexts.
    gate_inputs = [hidden, src_context, dom_context]
    gate_sizes = [[self.dim_hid, self.dim_value, self.dim_value],
                  self.dim_value]
    gate = nn.feedforward(gate_inputs, gate_sizes, True,
                          scope="context_gate")
    context = gate * src_context + (1 - gate) * dom_context

    # s_j = GRU^2(c_j, s_j^{\prime})
    _, gru2_state = self.cell2(context, hidden, scope="gru2")
    next_state = (1.0 - step_mask) * state + step_mask * gru2_state

    return next_state, context
def step(self, y_prev, mask, state, keys, values, key_mask):
    """Run one attention + recurrent-cell decoder step.

    Attention weights are computed from the incoming ``state`` over
    ``keys``; the context is the weighted sum of ``values`` along axis 0.
    ``self.cell`` is then fed ``[y_prev, context]`` together with ``state``.

    Returns:
        ``(next_state, context)``.
    """
    # Add a trailing axis so the mask can multiply the state tensors.
    step_mask = mask[:, None]

    weights = attention(state, keys, key_mask, self.dim_hid, self.dim_key)
    context = T.sum(weights[:, :, None] * values, 0)

    _, candidate = self.cell([y_prev, context], state)
    # Where the mask is 0 the incoming state is carried over unchanged.
    next_state = (1.0 - step_mask) * state + step_mask * candidate

    return next_state, context
def attention_loop(inputs, mask, state, keys, values, key_mask):
    """Run one step that yields the attention weights and the new state.

    ``self`` is taken from the enclosing scope. The attention weights are
    computed from the incoming ``state``; ``self.cell`` receives
    ``[inputs, context]`` and ``state``.

    Returns:
        The list ``[alpha, next_state]``.
    """
    # Add a trailing axis so the mask can multiply the state tensors.
    step_mask = mask[:, None]

    weights = attention(state, keys, key_mask, self.dim_hid, self.dim_key)
    context = T.sum(weights[:, :, None] * values, 0)

    _, candidate = self.cell([inputs, context], state)
    # Where the mask is 0 the incoming state is carried over unchanged.
    next_state = (1.0 - step_mask) * state + step_mask * candidate

    return [weights, next_state]
def forward(self, y_seq, y_emb, mask, keys, key_mask, values, initial_state,
            domain_keys, domain_annot, tag_seq, keep_prob=1.0):
    """Decode a target sequence and score it, plus a domain classifier.

    Steps, in order:
      1. Shift ``y_emb`` by one position along axis 0 (first slot is zero).
      2. Run ``Decoder.scan`` to obtain ``states`` and ``contexts``.
      3. Attend over ``states`` with ``states[-1]`` as the query, pool the
         weighted states, and classify the pooled vector into
         ``self.dnum`` classes; ``tag_seq`` supplies the gold class ids.
      4. Compute word probabilities with ``self.prediction`` and the
         translation cost with ``self.get_cost``.

    Returns:
        ``(states, contexts, cost, domaincost, pred_tag, snt_cost)``.
    """
    # shift embedding: position j sees the embedding of position j - 1
    shifted_emb = T.set_subtensor(T.zeros_like(y_emb)[1:], y_emb[:-1])

    # feed
    states, contexts = Decoder.scan(self, shifted_emb, mask, keys, key_mask,
                                    values, initial_state, domain_keys,
                                    domain_annot)

    # NOTE(review): confirm that only the domain classifier below is meant
    # to live under the "DSAdec" variable scope.
    with ops.variable_scope("DSAdec"):
        # Zero the mask entry at row (sum(mask, 0) - 1) of every column.
        last_rows = T.cast(T.sum(mask, 0) - 1, 'int32')
        columns = T.arange(mask.shape[1])
        newmask = T.set_subtensor(mask[last_rows, columns], 0.0)

        domain_alpha = attention(states[-1], states, newmask,
                                 self.dim_hid, self.dim_hid)
        weighted_states = states * domain_alpha[:, :, None]
        # original note: batch * (shdim * 2)
        domain_context = T.sum(weighted_states, 0)

        # original note: batch * feadim1
        feature = nn.feedforward(domain_context,
                                 [self.dim_hid, self.feadim], True,
                                 activation=T.tanh, scope="feature")
        # original note: (batch, 4)
        dscores = nn.feedforward(feature, [self.feadim, self.dnum], True,
                                 activation=T.tanh, scope="score")

        dprobs = T.nnet.softmax(dscores)
        pred_tag = T.argmax(dprobs, 1)

        # Negative log-probability of the gold tag, averaged over entries.
        flat_tags = tag_seq.flatten()
        row_ids = T.arange(flat_tags.shape[0])
        domaincost = T.mean(-T.log(dprobs[row_ids, flat_tags]))

    # p(y_j) \propto f(y_{j-1}, s_{j}, c_{j})
    probs = self.prediction(shifted_emb, states, contexts, keep_prob)

    # compute cost
    cost, snt_cost = self.get_cost(y_seq, mask, probs, domain_alpha)

    return states, contexts, cost, domaincost, pred_tag, snt_cost
def step(self, y_prev, mask, state, *args):
    """Run one decoder step that attends over several sources.

    ``args`` must hold ``3 * self.n_src`` items laid out as all keys, then
    all values, then all masks (``n_src`` of each). The first GRU
    (``self.cell1``) produces an intermediate state, which attends over
    each source separately. The per-source contexts are then merged
    according to ``self.method``:

    * ``"attn"``   -- a second attention (scope ``"beta"``) over the
      stacked contexts.
    * ``"concat"`` -- concatenation along the last axis.

    The merged context is fed to the second GRU (``self.cell2``).

    Returns:
        ``(next_state, context)``.
    """
    n_src = self.n_src
    assert len(args) == self.n_src * 3

    src_keys = args[:n_src]
    src_values = args[n_src:2 * n_src]
    src_masks = args[2 * n_src:]

    # Add a trailing axis so the mask can multiply the state tensors.
    mask = mask[:, None]

    # s_j^{\prime} = GRU^1(y_{j-1}, s_{j-1})
    _, state_prime = self.cell1(y_prev, state, scope="gru1")
    # Where the mask is 0 the incoming state is carried over unchanged.
    state_prime = (1.0 - mask) * state + mask * state_prime

    # c_j = att(H, s_j^{\prime}), once per source.
    # enumerate(zip(...)) replaces itertools.izip/count, which only exists
    # on Python 2; the iteration order and stopping rule are the same.
    contexts = []
    sources = zip(src_keys, src_values, src_masks)
    for i, (src_key, src_val, src_mask) in enumerate(sources):
        alpha = attention(state_prime, src_key, src_mask,
                          self.dim_hid, self.dim_key,
                          scope='attn_alpha_%d' % i)
        context = theano.tensor.sum(alpha[:, :, None] * src_val, 0)
        contexts.append(context)

    # NOTE(review): if self.method is neither "attn" nor "concat",
    # `context` silently remains the last source's context from the loop
    # above -- confirm whether that is intended.
    if self.method == "attn":
        # Stack the per-source contexts along a new leading axis.
        stacked_shape = [n_src] + list(contexts[0].shape)
        contexts = T.reshape(T.concatenate(contexts, 0), stacked_shape)
        # NOTE(review): confirm that both calls below belong under the
        # "beta" variable scope.
        with ops.variable_scope("beta"):
            beta_keys = map_key(contexts, self.dim_value, self.dim_key)
            beta = attention(state_prime, beta_keys,
                             T.ones(contexts.shape[:2]),
                             self.dim_hid, self.dim_key, scope='beta')
            context = T.sum(beta[:, :, None] * contexts, 0)
    elif self.method == "concat":
        context = T.concatenate(contexts, -1)

    # s_j = GRU^2(c_j, s_j^{\prime})
    _, next_state = self.cell2(context, state_prime, scope="gru2")
    next_state = (1.0 - mask) * state + mask * next_state

    return next_state, context
def sampling_loop(inputs, state, keys, values, key_mask):
    """Sample the next word and advance the single-cell decoder one step.

    ``self``, ``target_embedding`` and ``target_bias`` are taken from the
    enclosing scope. The word distribution is computed from the incoming
    ``state`` and the attention context; the cell is advanced afterwards.

    Returns:
        The list ``[next_words, new_inputs, next_state]``.
    """
    weights = attention(state, keys, key_mask, self.dim_hid, self.dim_key)
    context = T.sum(weights[:, :, None] * values, 0)

    word_probs = self.prediction(inputs, state, context)
    # Draw from the distribution, then take the index of the drawn entry.
    drawn = ops.random.multinomial(word_probs)
    next_words = drawn.argmax(axis=1)

    # Embedding of the sampled words plus bias becomes the next input.
    looked_up = nn.embedding_lookup(target_embedding, next_words)
    new_inputs = looked_up + target_bias

    _, next_state = self.cell([inputs, context], state)

    return [next_words, new_inputs, next_state]
def attention_loop(inputs, mask, state, keys, values, key_mask):
    """Run one two-GRU step that yields attention weights and new state.

    ``self`` is taken from the enclosing scope. The intermediate state
    from the first GRU is used as-is (it is not masked here); only the
    final state is masked against the incoming ``state``.

    Returns:
        The list ``[alpha, next_state]``.
    """
    # Add a trailing axis so the mask can multiply the state tensors.
    step_mask = mask[:, None]

    # s_j^{\prime} = GRU^1(y_{j-1}, s_{j-1})
    _, hidden = self.cell1(inputs, state, scope="gru1")

    # c_j = att(H, s_j^{\prime})
    weights = attention(hidden, keys, key_mask, self.dim_hid, self.dim_key)
    context = T.sum(weights[:, :, None] * values, 0)

    # s_j = GRU^2(c_j, s_j^{\prime})
    _, candidate = self.cell2(context, hidden, scope="gru2")
    # Where the mask is 0 the incoming state is carried over unchanged.
    next_state = (1.0 - step_mask) * state + step_mask * candidate

    return [weights, next_state]
def sampling_loop(inputs, state, keys, values, key_mask):
    """Sample the next word and advance the two-GRU decoder one step.

    ``self``, ``target_embedding`` and ``target_bias`` are taken from the
    enclosing scope. Both GRUs run first; the word distribution is then
    computed from the updated state and the attention context.

    Returns:
        The list ``[next_words, new_inputs, next_state]``.
    """
    _, hidden = self.cell1(inputs, state, scope="gru1")

    weights = attention(hidden, keys, key_mask, self.dim_hid, self.dim_key)
    context = T.sum(weights[:, :, None] * values, 0)

    _, next_state = self.cell2(context, hidden, scope="gru2")

    # p(y_j) \propto f(y_{j-1}, c_j, s_j)
    word_probs = self.prediction(inputs, next_state, context)

    # Draw from the distribution, then take the index of the drawn entry.
    drawn = ops.random.multinomial(word_probs)
    next_words = drawn.argmax(axis=1)

    # Embedding of the sampled words plus bias becomes the next input.
    looked_up = nn.embedding_lookup(target_embedding, next_words)
    new_inputs = looked_up + target_bias

    return [next_words, new_inputs, next_state]