def ln_linear(inputs, size, bias, concat=False, dtype=None, scope=None):
    """Linear projection(s) followed by layer normalisation, summed together.

    :param inputs: a tensor, or a list/tuple of tensors
    :param size: ``(input_size, output_size)``; ``input_size`` may itself be
        a list/tuple with one entry per element of ``inputs``
    :param bias: when truthy, a ``"bias"`` variable of shape
        ``[output_size]`` is added to the sum (it is not layer-normalised)
    :param concat: when truthy, the inputs are concatenated along the last
        axis and projected with a single matrix; otherwise each input gets
        its own matrix and its own layer-norm parameters
    :param dtype: forwarded to ``get_variable``
    :param scope: forwarded to ``variable_scope``
    :returns: the single term if there is only one, else the sum of all terms
    :raises ValueError: if ``size`` is not a list or tuple
    :raises RuntimeError: if ``inputs`` and ``input_size`` differ in length
    """
    if not isinstance(size, (list, tuple)):
        raise ValueError("size argument must be (input_size, output_size)")

    in_dims, out_dim = size
    in_dims = in_dims if isinstance(in_dims, (list, tuple)) else [in_dims]
    tensors = inputs if isinstance(inputs, (list, tuple)) else [inputs]

    if len(tensors) != len(in_dims):
        raise RuntimeError("unmatched elements found: inputs and input_size")

    terms = []
    norm_shape = (out_dim,)

    with variable_scope(scope):
        if not concat:
            for idx, in_dim in enumerate(in_dims):
                weight = get_variable("matrix_%d" % idx, [in_dim, out_dim],
                                      dtype=dtype)
                projected = theano.dot(tensors[idx], weight)

                with variable_scope("layer_norm"):
                    gains = get_variable("gains_%d" % idx, shape=norm_shape,
                                         dtype=dtype,
                                         initializer=ones_initializer())
                    offsets = get_variable("biases_%d" % idx,
                                           shape=norm_shape, dtype=dtype,
                                           initializer=zeros_initializer())

                terms.append(layer_normalize(projected, gains, offsets))
        else:
            joined = theano.tensor.concatenate(tensors, -1)
            weight = get_variable("matrix", [sum(in_dims), out_dim],
                                  dtype=dtype)
            projected = theano.dot(joined, weight)

            with variable_scope("layer_norm"):
                # NOTE(review): this branch hands the initializers over
                # uncalled, while the per-input branch calls them first --
                # confirm which form get_variable expects.
                gains = get_variable("gains", shape=norm_shape, dtype=dtype,
                                     initializer=ones_initializer)
                offsets = get_variable("biases", shape=norm_shape,
                                       dtype=dtype,
                                       initializer=zeros_initializer)

            terms.append(layer_normalize(projected, gains, offsets))

        if bias:
            terms.append(get_variable("bias", [out_dim], dtype=dtype))

    if len(terms) == 1:
        return terms[0]

    return reduce(theano.tensor.add, terms)
def encoder(cell, inputs, mask, initial_state=None, dtype=None, scope=None):
    """Run ``gru_encoder`` over the inputs in both directions.

    The forward pass runs under the ``"forward"`` scope on the inputs as
    given. The backward pass runs under ``"backward"`` on ``inputs`` and
    ``mask`` reversed along their first axis, and its result is reversed
    back so both outputs are indexed the same way.

    :param cell: recurrent cell handed to ``gru_encoder`` for both passes
    :param inputs: input tensor (first axis is the one being reversed)
    :param mask: mask tensor, reversed together with ``inputs``
    :param initial_state: forwarded unchanged to both passes
    :param dtype: forwarded to ``gru_encoder``
    :param scope: outer variable scope, ``"encoder"`` when not given
    :returns: ``(forward_states, backward_states)``
    """
    with ops.variable_scope(scope or "encoder"):
        with ops.variable_scope("forward"):
            fd_states = gru_encoder(cell, inputs, mask, initial_state, dtype)

        with ops.variable_scope("backward"):
            flipped_states = gru_encoder(cell, inputs[::-1], mask[::-1],
                                         initial_state, dtype)
            # restore the original ordering of the first axis
            bd_states = flipped_states[::-1]

    return fd_states, bd_states
def decoder(cell, inputs, mask, initial_state, attention_states,
            attention_mask, attn_size, dtype=None, scope=None):
    """Scan an attention-based recurrent cell over ``inputs``.

    :param cell: recurrent cell; ``cell.input_size`` must unpack into two
        values and ``cell.output_size`` is used as the attention query size
    :param inputs: sequence scanned together with ``mask``
    :param mask: per-step mask; where it is 0 the previous state is kept
    :param initial_state: state fed to the first step
    :param attention_states: states being attended over
    :param attention_mask: mask passed to ``attention``
    :param attn_size: size of the attention hidden layer
    :param dtype: defaults to ``inputs.dtype`` (not used further here)
    :param scope: variable scope, ``"decoder"`` when not given
    :returns: ``(states, contexts)`` as produced by ``ops.scan``
    """
    _, states_size = cell.input_size
    output_size = cell.output_size
    dtype = dtype or inputs.dtype

    # Tensors the step needs besides the scanned sequences are handed over
    # through scan's non-sequence arguments instead of being closed over.
    def loop_fn(inputs, mask, state, attn_states, attn_mask, m_states):
        step_mask = mask[:, None]
        weights = attention(state, m_states, output_size, attn_size,
                            attn_mask)
        context = theano.tensor.sum(weights[:, :, None] * attn_states, 0)
        _, candidate = cell([inputs, context], state)
        # keep the old state wherever the mask is zero
        next_state = (1.0 - step_mask) * state + step_mask * candidate
        return [next_state, context]

    with ops.variable_scope(scope or "decoder"):
        mapped_states = map_attention_states(attention_states, states_size,
                                             attn_size)
        (states, contexts) = ops.scan(
            loop_fn,
            [inputs, mask],
            [initial_state, None],
            [attention_states, attention_mask, mapped_states])

    return states, contexts
def __call__(self, inputs, state, scope=None):
    """Advance the LSTM cell by one step.

    :param inputs: a tensor or list/tuple of tensors, one per entry of
        ``self.input_size``
    :param state: pair ``(c, h)`` of cell state and hidden state
    :param scope: variable scope, ``"lstm"`` when not given
    :returns: ``(new_h, (new_c, new_h))``
    :raises RuntimeError: if ``inputs`` and ``self.input_size`` differ in
        length
    """
    if not isinstance(inputs, (list, tuple)):
        inputs = [inputs]

    input_size = self.input_size
    output_size = self.output_size

    if len(inputs) != len(input_size):
        raise RuntimeError("unmatched elements: inputs and input_size")

    # one fused projection produces all four gate pre-activations
    gate_size = [list(input_size) + [output_size], 4 * output_size]

    with variable_scope(scope or "lstm"):
        prev_c, prev_h = state
        fused = linear(list(inputs[:]) + [prev_h], gate_size, True,
                       concat=True, scope="gates")
        in_gate, candidate, forget_gate, out_gate = theano.tensor.split(
            fused, [output_size] * 4, 4, -1)

        candidate = theano.tensor.tanh(candidate)
        in_gate = theano.tensor.nnet.sigmoid(in_gate)
        forget_gate = theano.tensor.nnet.sigmoid(forget_gate)
        out_gate = theano.tensor.nnet.sigmoid(out_gate)

        new_c = prev_c * forget_gate + in_gate * candidate
        # the cell state is gated directly, with no output activation
        new_h = new_c * out_gate

    return new_h, (new_c, new_h)
def attention(query, mapped_states, state_size, attn_size,
              attention_mask=None, scope=None):
    """Compute attention weights of ``query`` over ``mapped_states``.

    The query is projected to ``attn_size``, added to the already-mapped
    states, passed through ``tanh`` and projected to a scalar score. The
    scores are exponentiated, optionally multiplied by ``attention_mask``,
    and normalised by their sum over axis 0.

    :param query: query tensor, projected from ``state_size``
    :param mapped_states: states already mapped to ``attn_size``
    :param state_size: input width of the query projection
    :param attn_size: width of the attention hidden layer
    :param attention_mask: optional multiplicative mask on the exponentiated
        scores
    :param scope: variable scope, ``"attention"`` when not given
    :returns: the normalised weights
    """
    with ops.variable_scope(scope or "attention"):
        projected = nn.linear(query, [state_size, attn_size], False,
                              scope="query_w")
        # add a leading axis so the query broadcasts over the states
        hidden = theano.tensor.tanh(projected[None, :, :] + mapped_states)

        raw = nn.linear(hidden, [attn_size, 1], False, scope="attention_v")
        raw = raw.reshape([raw.shape[0], raw.shape[1]])

        weights = theano.tensor.exp(raw)
        if attention_mask is not None:
            weights = weights * attention_mask

        alpha = weights / theano.tensor.sum(weights, 0)

    return alpha
def build_attention(self, src_seq, src_mask, target_inputs, tgt_seq,
                    tgt_mask, keys, values, initial_state):
    """Compile a function returning the attention weights of every step.

    This graph is optional. It scans ``self.cell`` with attention over
    ``target_inputs`` and collects the per-step ``alpha``.

    :returns: a ``theano.function`` taking
        ``[src_seq, src_mask, tgt_seq, tgt_mask]`` and returning the
        stacked attention weights
    """
    def attention_loop(inputs, mask, state, keys, values, key_mask):
        step_mask = mask[:, None]
        alpha = attention(state, keys, key_mask, self.dim_hid, self.dim_key)
        context = T.sum(alpha[:, :, None] * values, 0)
        _, candidate = self.cell([inputs, context], state)
        # keep the old state wherever the mask is zero
        next_state = (1.0 - step_mask) * state + step_mask * candidate
        return [alpha, next_state]

    with ops.variable_scope("decoder"):
        scan_results, _ = theano.scan(
            attention_loop,
            [target_inputs, tgt_mask],
            [None, initial_state],
            [keys, values, src_mask])
        attention_score, _ = scan_results

    align = theano.function([src_seq, src_mask, tgt_seq, tgt_mask],
                            attention_score)

    return align
def attention(query, keys, key_mask, dim_query, dim_key, dtype=None,
              scope=None):
    """Content-based attention weights of ``query`` over ``keys``.

    ``e_i = v_a^T tanh(W query + key_i)`` and ``alpha = softmax({e_i})``,
    where the softmax is ``nn.masked_softmax`` applied with ``key_mask``.

    :param query: query tensor, (n_query, dim_query)
    :param keys: pre-mapped keys, (n_key, n_query, dim_key)
    :param key_mask: mask over the keys, transposed before use
    :param dim_query: input width of the query projection
    :param dim_key: width of the keys / attention hidden layer
    :param dtype: forwarded to the variable scope
    :param scope: variable scope, ``"attention"`` when not given
    :returns: attention weights, (n_key, n_query)
    """
    with ops.variable_scope(scope or "attention", dtype=dtype):
        # (n_query, dim_query) -> (n_query, dim_key)
        projected = nn.linear(query, [dim_query, dim_key], False,
                              scope="map-query")

        # broadcast the query over the keys: (n_key, n_query, dim_key)
        hidden = T.tanh(projected[None, :, :] + keys)

        # (n_key, n_query, 1) -> (n_key, n_query)
        energy = nn.linear(hidden, [dim_key, 1], False, scope="pre-alpha")
        energy = T.reshape(energy, energy.shape[:2])

        # the softmax is taken on the transposed energies and mask,
        # i.e. in (n_query, n_key) layout
        weights = nn.masked_softmax(energy.T, key_mask.T)

        # back to (n_key, n_query)
        alpha = weights.T

    return alpha
def map_attention_states(attention_states, input_size, attn_size, scope=None):
    """Project the attention states from ``input_size`` to ``attn_size``.

    The projection is a bias-free ``nn.linear`` named ``"attention_w"``.

    :param attention_states: states to project
    :param input_size: input width of the projection
    :param attn_size: output width of the projection
    :param scope: variable scope, ``"attention"`` when not given
    :returns: the projected states
    """
    with ops.variable_scope(scope or "attention"):
        projection_size = [input_size, attn_size]
        mapped_states = nn.linear(attention_states, projection_size, False,
                                  scope="attention_w")

    return mapped_states
def __call__(self, inputs, state, scope=None):
    """Advance the GRU cell by one step.

    :param inputs: a tensor or list/tuple of tensors, one per entry of
        ``self.input_size``
    :param state: previous state
    :param scope: variable scope, ``"gru_cell"`` when not given
    :returns: ``(new_state, new_state)``; the output and the state are the
        same tensor
    :raises RuntimeError: if ``inputs`` and ``self.input_size`` differ in
        length
    """
    if not isinstance(inputs, (list, tuple)):
        inputs = [inputs]

    input_size = self.input_size
    output_size = self.output_size

    if len(inputs) != len(input_size):
        raise RuntimeError("unmatched elements: inputs and input_size")

    gate_size = [list(input_size) + [output_size], output_size]

    with variable_scope(scope or "gru_cell"):
        gate_inputs = list(inputs[:]) + [state]
        reset = feedforward(gate_inputs, gate_size, False,
                            scope="reset_gate")
        update = feedforward(gate_inputs, gate_size, False,
                             scope="update_gate")

        # the candidate sees the state scaled by the reset gate
        candidate = feedforward(list(inputs[:]) + [reset * state], gate_size,
                                True, activation=theano.tensor.tanh,
                                scope="candidate")

        new_state = (1.0 - update) * state + update * candidate

    return new_state, new_state
def coarseattention(query, keys, key_mask, dim_query, dim_key, dtype=None,
                    scope=None):
    """Attention weights from the summed energies of several key sets.

    For every key set ``i`` an energy
    ``v_i^T tanh(W_i query + keys[i])`` is computed with its own
    ``"map-query_i"`` / ``"pre-alpha_i"`` parameters. The energies are
    added up and normalised with ``nn.masked_softmax``.

    :param query: query tensor, (n_query, dim_query)
    :param keys: sequence of key tensors
    :param key_mask: mask over the keys, transposed before use
    :param dim_query: input width of every query projection
    :param dim_key: sequence with the key width of each key set
    :param dtype: forwarded to the variable scope
    :param scope: variable scope, ``"coarseattention"`` when not given
    :returns: attention weights, (n_key, n_query)
    """
    with ops.variable_scope(scope or "coarseattention", dtype=dtype):
        energies = []

        for idx, key in enumerate(keys):
            width = dim_key[idx]

            # (n_query, dim_query) -> (n_query, dim_key)
            projected = nn.linear(query, [dim_query, width], False,
                                  scope="map-query_{}".format(idx))
            # (n_key, n_query, dim_key)
            hidden = T.tanh(projected[None, :, :] + key)
            # (n_key, n_query, 1)
            energies.append(nn.linear(hidden, [width, 1], False,
                                      scope="pre-alpha_{}".format(idx)))

        total = reduce(T.add, energies)

        # (n_key, n_query, 1) -> (n_key, n_query)
        total = T.reshape(total, total.shape[:2])

        # softmax in (n_query, n_key) layout, then transpose back
        weights = nn.masked_softmax(total.T, key_mask.T)
        alpha = weights.T

    return alpha
def build_sampling(self, src_seq, src_mask, target_embedding, target_bias,
                   keys, values, initial_state):
    """Compile a function that samples a target sequence word by word.

    This graph is optional. Each step runs ``cell1``, attention, ``cell2``
    and ``self.prediction``, draws the next word with
    ``ops.random.multinomial`` and feeds its embedding (plus
    ``target_bias``) into the next step.

    :returns: a ``theano.function`` taking ``[src_seq, src_mask, max_len]``
        and returning the sampled words; the scan updates are passed to it
    """
    max_len = T.iscalar()

    def sampling_loop(inputs, state, keys, values, key_mask):
        _, state_prime = self.cell1(inputs, state, scope="gru1")

        alpha = attention(state_prime, keys, key_mask, self.dim_hid,
                          self.dim_key)
        context = T.sum(alpha[:, :, None] * values, 0)

        _, next_state = self.cell2(context, state_prime, scope="gru2")

        # p(y_j) \propto f(y_{j-1}, c_j, s_j)
        probs = self.prediction(inputs, next_state, context)
        next_words = ops.random.multinomial(probs).argmax(axis=1)

        next_inputs = nn.embedding_lookup(target_embedding, next_words)
        next_inputs = next_inputs + target_bias

        return [next_words, next_inputs, next_state]

    with ops.variable_scope("decoder"):
        batch = src_seq.shape[1]
        # the first step is fed an all-zero input
        first_inputs = T.zeros([batch, self.dim_y], theano.config.floatX)

        outputs, updates = theano.scan(
            sampling_loop,
            [],
            [None, first_inputs, initial_state],
            [keys, values, src_mask],
            n_steps=max_len)

        sampled_words = outputs[0]

    sample = theano.function([src_seq, src_mask, max_len], sampled_words,
                             updates=updates)

    return sample
def build_attention(self, src_seq, src_mask, target_inputs, tgt_seq,
                    tgt_mask, keys, values, initial_state):
    """Compile a function returning the attention weights of every step.

    This graph is optional. It scans the two-cell decoder step
    (``cell1`` -> attention -> ``cell2``) over ``target_inputs`` and
    collects the per-step ``alpha``.

    :returns: a ``theano.function`` taking
        ``[src_seq, src_mask, tgt_seq, tgt_mask]`` and returning the
        stacked attention weights
    """
    def attention_loop(inputs, mask, state, keys, values, key_mask):
        step_mask = mask[:, None]

        # s_j^{\prime} = GRU^1(y_{j-1}, s_{j-1})
        _, state_prime = self.cell1(inputs, state, scope="gru1")

        # c_j = att(H, s_j^{\prime})
        alpha = attention(state_prime, keys, key_mask, self.dim_hid,
                          self.dim_key)
        context = T.sum(alpha[:, :, None] * values, 0)

        # s_j = GRU^2(c_j, s_j^{\prime})
        _, candidate = self.cell2(context, state_prime, scope="gru2")
        # keep the old state wherever the mask is zero
        next_state = (1.0 - step_mask) * state + step_mask * candidate

        return [alpha, next_state]

    with ops.variable_scope("decoder"):
        scan_results, _ = theano.scan(
            attention_loop,
            [target_inputs, tgt_mask],
            [None, initial_state],
            [keys, values, src_mask])
        attention_score, _ = scan_results

    align = theano.function([src_seq, src_mask, tgt_seq, tgt_mask],
                            attention_score)

    return align
def decoder(cell, inputs, mask, initial_state, attention_states,
            attention_mask, attn_size, mapped_states=None, dtype=None,
            scope=None):
    """Scan an attention-based recurrent cell over ``inputs``.

    :param cell: recurrent cell; ``cell.input_size`` must unpack into two
        values and ``cell.output_size`` is read as well
    :param inputs: sequence scanned together with ``mask``
    :param mask: per-step mask; where it is 0 the previous state is kept
    :param initial_state: state fed to the first step
    :param attention_states: states being attended over
    :param attention_mask: mask passed to ``attention``
    :param attn_size: size of the attention hidden layer
    :param mapped_states: pre-mapped attention states; when ``None`` they
        are obtained by calling ``attention`` with a ``None`` query
    :param dtype: defaults to ``inputs.dtype`` (not used further here)
    :param scope: variable scope, ``"decoder"`` when not given
    :returns: ``(states, contexts)`` as produced by ``ops.scan``
    """
    _, states_size = cell.input_size
    output_size = cell.output_size
    dtype = dtype or inputs.dtype
    att_size = [output_size, states_size, attn_size]

    def loop_fn(inputs, mask, state, attn_states, attn_mask, mapped_states):
        step_mask = mask[:, None]
        weights = attention(state, None, mapped_states, attn_mask, att_size)
        context = theano.tensor.sum(weights[:, :, None] * attn_states, 0)
        _, candidate = cell([inputs, context], state)
        # keep the old state wherever the mask is zero
        next_state = (1.0 - step_mask) * state + step_mask * candidate
        return [next_state, context]

    with ops.variable_scope(scope or "decoder"):
        if mapped_states is None:
            mapped_states = attention(None, attention_states, None, None,
                                      att_size)

        (states, contexts) = ops.scan(
            loop_fn,
            [inputs, mask],
            [initial_state, None],
            [attention_states, attention_mask, mapped_states])

    return states, contexts
def prediction(self, y_emb, state, context, keep_prob=1.0):
    r"""Readout layer followed by a softmax over the target vocabulary.

    p(y_j) \propto f(y_{j-1}, s_{j}, c_{j})

    The output projection reuses the ``"embedding"`` variable found under
    ``self.tiescope`` (transposed) instead of a separate logits matrix.

    :param y_emb: embedding of the previous target word
    :param state: decoder state
    :param context: attention context
    :param keep_prob: dropout is applied to the readout when this is
        below 1.0
    :return: softmax probabilities; 3-d logits are first flattened to
        ``(shape[0] * shape[1], -1)``
    """
    features = [state, y_emb, context]
    feature_sizes = [self.dim_hid, self.dim_y, self.dim_value]

    readout = nn.feedforward(features, [feature_sizes, self.dim_readout],
                             True, activation=T.tanh, scope="readout")

    if keep_prob < 1.0:
        readout = nn.dropout(readout, keep_prob=keep_prob)

    # reuse=True: the variable must already exist under self.tiescope
    with ops.variable_scope(self.tiescope, reuse=True):
        target_embedding = ops.get_variable(
            "embedding", [self.n_y_vocab, self.dim_readout])

    logits = T.dot(readout, target_embedding.T)

    if logits.ndim == 3:
        # collapse the two leading axes so softmax sees a matrix
        logits = logits.reshape([logits.shape[0] * logits.shape[1], -1])

    return T.nnet.softmax(logits)
def forward(self, y_seq, y_emb, mask, keys, key_mask, values, initial_state,
            domain_keys, domain_annot, tag_seq, keep_prob=1.0):
    """Build the training graph of the decoder plus the domain classifier.

    :param y_seq: target word indices, passed to ``self.get_cost``
    :param y_emb: target embeddings; shifted by one step before use
    :param mask: target mask
    :param keys: attention keys, forwarded to ``Decoder.scan``
    :param key_mask: mask for the attention keys
    :param values: attention values
    :param initial_state: initial decoder state
    :param domain_keys: forwarded to ``Decoder.scan``
    :param domain_annot: forwarded to ``Decoder.scan``
    :param tag_seq: gold domain tags, used to index ``dprobs``
    :param keep_prob: forwarded to ``self.prediction``
    :returns: ``(states, contexts, cost, domaincost, pred_tag, snt_cost)``
    """
    # NOTE(review): the extent of the "DSAdec" scope below was reconstructed
    # from a flattened source line -- confirm against the original layout.

    # shift the embeddings one step forward; step 0 becomes all zeros
    shifted = T.zeros_like(y_emb)
    y_emb = T.set_subtensor(shifted[1:], y_emb[:-1])

    # feed
    states, contexts = Decoder.scan(self, y_emb, mask, keys, key_mask,
                                    values, initial_state, domain_keys,
                                    domain_annot)

    with ops.variable_scope("DSAdec"):
        # clear, for every column, the mask entry at row sum(mask, 0) - 1
        # (presumably the last unmasked step -- TODO confirm)
        last_rows = T.cast(T.sum(mask, 0) - 1, 'int32')
        columns = T.arange(mask.shape[1])
        newmask = T.set_subtensor(mask[last_rows, columns], 0.0)

        # the final state queries all states
        domain_alpha = attention(states[-1], states, newmask, self.dim_hid,
                                 self.dim_hid)
        domain_context = T.sum(states * domain_alpha[:, :, None], 0)

        feature = nn.feedforward(domain_context,
                                 [self.dim_hid, self.feadim], True,
                                 activation=T.tanh, scope="feature")
        dscores = nn.feedforward(feature, [self.feadim, self.dnum], True,
                                 activation=T.tanh, scope="score")

        dprobs = T.nnet.softmax(dscores)
        pred_tag = T.argmax(dprobs, 1)

        # mean negative log-probability of the gold tags
        flat_tags = tag_seq.flatten()
        rows = T.arange(flat_tags.shape[0])
        domaincost = T.mean(-T.log(dprobs[rows, flat_tags]))

    # p(y_j) \propto f(y_{j-1}, s_{j}, c_{j})
    probs = self.prediction(y_emb, states, contexts, keep_prob)

    # compute cost
    cost, snt_cost = self.get_cost(y_seq, mask, probs, domain_alpha)

    return states, contexts, cost, domaincost, pred_tag, snt_cost
def __call__(self, inputs, state, c_inputs=None, scope=None):
    """Run the stacked cells one after another for a single step.

    Each cell receives the output of the cell below it (the first one
    receives ``inputs``). When ``c_inputs`` is given, it is appended to the
    input of every cell.

    The previous implementation tested ``inputs`` instead of the running
    input when deciding whether to wrap it in a list, so with ``c_inputs``
    every cell after the first was either fed the original ``inputs`` again
    or had ``list()`` applied to the output below it. It also evaluated
    ``c_inputs`` for truthiness, which is not meaningful for a single
    symbolic tensor.

    :param inputs: a tensor or list/tuple of tensors for the first cell
    :param state: indexable holding one state per cell
    :param c_inputs: optional extra input(s) (tensor or list/tuple) appended
        to every cell's input; ``None`` or an empty sequence disables this
    :param scope: variable scope, ``"multi_rnn_cell"`` when not given
    :returns: ``(output_of_last_cell, tuple_of_new_states)``
    """
    # Normalise the extra inputs once, without truth-testing a tensor.
    if c_inputs is None:
        extra_inputs = []
    elif isinstance(c_inputs, (list, tuple)):
        extra_inputs = list(c_inputs)
    else:
        extra_inputs = [c_inputs]

    with variable_scope(scope or "multi_rnn_cell"):
        cur_inp = inputs
        new_states = []

        for i, cell in enumerate(self._cells):
            with variable_scope("cell_%d" % i):
                if extra_inputs:
                    # wrap the running input (not the original one) so the
                    # output of the cell below is what gets passed upward
                    if not isinstance(cur_inp, (list, tuple)):
                        cur_inp = [cur_inp]
                    cur_inp = list(cur_inp) + extra_inputs

                cur_inp, new_state = cell(cur_inp, state[i])
                new_states.append(new_state)

    return cur_inp, tuple(new_states)
def forward(self, x_embedded, mask, initial_state=None, dtype=None,
            scope=None):
    """Encode the input in both directions with ``self.cell``.

    The backward pass runs on ``x_embedded`` and ``mask`` reversed along
    their first axis, and its states are reversed back afterwards.

    :param x_embedded: embedded input sequence
    :param mask: mask for the input sequence
    :param initial_state: forwarded unchanged to both passes
    :param dtype: used for the outer variable scope and ``gru_encoder``
    :param scope: outer variable scope, ``"encoder"`` when not given
    :returns: ``(forward_states, backward_states)``
    """
    cell = self.cell

    with ops.variable_scope(scope or "encoder", dtype=dtype):
        with ops.variable_scope("forward"):
            fd_states = gru_encoder(cell, x_embedded, mask, initial_state,
                                    dtype)

        with ops.variable_scope("backward"):
            flipped_states = gru_encoder(cell, x_embedded[::-1], mask[::-1],
                                         initial_state, dtype)
            # restore the original ordering of the first axis
            bd_states = flipped_states[::-1]

    return fd_states, bd_states
def domain_sensitive_attention(keys, key_mask, dim_key, dim_domain,
                               dtype=None, scope=None):
    """Query-free attention weights computed from the keys alone.

    The keys are projected to ``dim_domain`` (with bias), passed through
    ``tanh``, projected to a scalar energy and normalised with
    ``nn.masked_softmax``.

    :param keys: key tensor whose last axis has width ``dim_key``
    :param key_mask: mask over the keys, transposed before use
    :param dim_key: input width of the key projection
    :param dim_domain: width of the hidden layer
    :param dtype: forwarded to the variable scope
    :param scope: variable scope, ``"domain_sensitive_attention"`` when not
        given
    :returns: attention weights, (n_key, n_query)
    """
    with ops.variable_scope(scope or "domain_sensitive_attention",
                            dtype=dtype):
        hidden = T.tanh(nn.linear(keys, [dim_key, dim_domain], True,
                                  scope="map-key"))

        # (n_key, n_query, 1) -> (n_key, n_query)
        energy = nn.linear(hidden, [dim_domain, 1], False, scope="pre-alpha")
        energy = T.reshape(energy, energy.shape[:2])

        # softmax in (n_query, n_key) layout, then transpose back
        weights = nn.masked_softmax(energy.T, key_mask.T)
        alpha = weights.T

    return alpha
def step(self, y_prev, mask, state, *args):
    """One decoder step attending over ``self.n_src`` sources.

    ``args`` must hold ``3 * n_src`` items: first the keys of every source,
    then the values, then the masks.

    :param y_prev: previous target input
    :param mask: step mask; where it is 0 the previous state is kept
    :param state: previous decoder state
    :returns: ``(next_state, context)``
    :raises AssertionError: if ``args`` does not hold ``3 * n_src`` items
    """
    n_src = self.n_src
    assert len(args) == self.n_src * 3

    src_keys, src_values, src_masks = (args[:n_src],
                                       args[n_src:2 * n_src],
                                       args[2 * n_src:])

    mask = mask[:, None]

    # s_j^{\prime} = GRU^1(y_{j-1}, s_{j-1})
    _, proposal = self.cell1(y_prev, state, scope="gru1")
    state_prime = (1.0 - mask) * state + mask * proposal

    # c_j = att(H, s_j^{\prime}), one context per source
    contexts = []
    for i, (_key, _val, _mask) in enumerate(zip(src_keys, src_values,
                                                src_masks)):
        alpha = attention(state_prime, _key, _mask, self.dim_hid,
                          self.dim_key, scope='attn_alpha_%d' % i)
        context = theano.tensor.sum(alpha[:, :, None] * _val, 0)
        contexts.append(context)

    # NOTE(review): for any other value of self.method neither branch runs
    # and ``context`` keeps the value from the last loop iteration, i.e. the
    # context of the last source -- confirm this fall-through is intended.
    if self.method == "attn":
        # stack the per-source contexts and attend over them
        stacked_shape = [n_src] + list(contexts[0].shape)
        contexts = T.reshape(T.concatenate(contexts, 0), stacked_shape)

        with ops.variable_scope("beta"):
            beta_keys = map_key(contexts, self.dim_value, self.dim_key)
            beta = attention(state_prime, beta_keys,
                             T.ones(contexts.shape[:2]), self.dim_hid,
                             self.dim_key, scope='beta')
            context = T.sum(beta[:, :, None] * contexts, 0)
    elif self.method == "concat":
        context = T.concatenate(contexts, -1)

    # s_j = GRU^2(c_j, s_j^{\prime})
    _, candidate = self.cell2(context, state_prime, scope="gru2")
    next_state = (1.0 - mask) * state + mask * candidate

    return next_state, context
def linear(inputs, size, bias, concat=False, dtype=None, scope=None):
    """Sum of linear projections of one or more inputs, plus optional bias.

    :param inputs: a tensor, or a list/tuple of tensors
    :param size: ``(input_size, output_size)``; ``input_size`` may itself be
        a list/tuple with one entry per element of ``inputs``
    :param bias: when truthy, a ``"bias"`` variable of shape
        ``[output_size]`` is added to the sum
    :param concat: when truthy, the inputs are concatenated along the last
        axis and projected with a single ``"matrix"``; otherwise each input
        ``i`` is projected with its own ``"matrix_i"``
    :param dtype: forwarded to ``get_variable``
    :param scope: forwarded to ``variable_scope``
    :returns: the single term if there is only one, else the sum of all terms
    :raises ValueError: if ``size`` is not a list or tuple
    :raises RuntimeError: if ``inputs`` and ``input_size`` differ in length
    """
    if not isinstance(size, (list, tuple)):
        raise ValueError("size argument must be (input_size, output_size)")

    in_dims, out_dim = size
    in_dims = in_dims if isinstance(in_dims, (list, tuple)) else [in_dims]
    tensors = inputs if isinstance(inputs, (list, tuple)) else [inputs]

    if len(tensors) != len(in_dims):
        raise RuntimeError("unmatched elements found: inputs and input_size")

    terms = []

    with variable_scope(scope):
        if not concat:
            for idx, in_dim in enumerate(in_dims):
                weight = get_variable("matrix_%d" % idx, [in_dim, out_dim],
                                      dtype=dtype)
                terms.append(theano.dot(tensors[idx], weight))
        else:
            joined = theano.tensor.concatenate(tensors, -1)
            weight = get_variable("matrix", [sum(in_dims), out_dim],
                                  dtype=dtype)
            terms.append(theano.dot(joined, weight))

        if bias:
            terms.append(get_variable("bias", [out_dim], dtype=dtype))

    if len(terms) == 1:
        return terms[0]

    return reduce(theano.tensor.add, terms)
def step(self, y_prev, mask, state, src_words_keys, src_pos_keys,
         pos_words_keys, pos_pos_keys, values, key_mask):
    """One decoder step with coarse and fine attention and a tag predictor.

    The state update uses the context from ``coarseattention``. The
    returned context is recomputed from the ``fineattention`` weights,
    which additionally depend on the tag predicted from the new state.

    :param y_prev: previous target input
    :param mask: step mask; where it is 0 the previous state is kept
    :param state: previous decoder state
    :param src_words_keys: keys for the coarse and fine attention
    :param src_pos_keys: keys for the coarse and fine attention
    :param pos_words_keys: keys used by the fine attention only
    :param pos_pos_keys: keys used by the fine attention only
    :param values: attention values
    :param key_mask: mask over the attention keys
    :returns: ``(next_state, context, y_pos_state, prob)``
    """
    mask = mask[:, None]

    # s_j^{\prime} = GRU^1(y_{j-1}, s_{j-1})
    _, proposal = self.cell1(y_prev, state, scope="gru1")
    state_prime = (1.0 - mask) * state + mask * proposal

    # c_j = att(H, s_j^{\prime})
    coarse_keys = [src_words_keys, src_pos_keys]
    coarse_dims = [self.dim_key, self.dim_word2pos]
    alpha = coarseattention(state_prime, coarse_keys, key_mask,
                            self.dim_query, coarse_dims)
    coarse_context = T.sum(alpha[:, :, None] * values, 0)

    # s_j = GRU^2(c_j, s_j^{\prime})
    _, candidate = self.cell2(coarse_context, state_prime, scope="gru2")
    next_state = (1.0 - mask) * state + mask * candidate

    # y_pos_{j} = nn(s_{j})
    tempstate = nn.feedforward(next_state, [self.dim_hid, self.posnndim],
                               True, activation=T.nnet.relu,
                               scope="ttaggerstates")
    score = nn.linear(tempstate, [self.posnndim, self.n_y_tagvocab], True,
                      scope="ttaggerscores")
    prob = T.nnet.softmax(score)

    with ops.variable_scope("tgttag_embedding"):
        tgttag_embedding = ops.get_variable(
            "embedding", [self.n_y_tagvocab, self.poshdim])
        tgttag_bias = ops.get_variable("bias", [self.poshdim])

    # expected tag embedding under the predicted tag distribution
    y_pos_state = T.dot(prob, tgttag_embedding) + tgttag_bias
    posquery = T.concatenate([tempstate, y_pos_state], -1)

    fine_keys = [[src_words_keys, src_pos_keys],
                 [pos_words_keys, pos_pos_keys]]
    fine_dims = [[self.dim_key, self.dim_word2pos],
                 [self.dim_pos2word, self.dim_pos2pos]]
    beta = fineattention([state_prime, posquery], fine_keys, key_mask,
                         [self.dim_query, self.dim_posquery], fine_dims)

    # Only the fine attention weights are used for the returned context; an
    # adaptive or fixed mix with ``alpha`` was left disabled by the author.
    context = T.sum(beta[:, :, None] * values, 0)

    return next_state, context, y_pos_state, prob
def __init__(self, **option):
    """Build the training and decoding graphs and compile the functions.

    Keys read from ``option``:

    * ``"embdim"``: ``(source_dim, target_dim)`` embedding sizes
    * ``"hidden"``: ``(source_hidden, target_hidden, attention_hidden)``
    * ``"maxhid"``, ``"maxpart"``, ``"deephid"``: passed to the decoder as
      ``dim_maxout``, ``max_part`` and ``dim_readout``
    * ``"vocabulary"``: ``(svocab, tvocab)``, each a ``(word2id, id2word)``
      pair; the vocabulary sizes are the lengths of the ``id2word`` parts
    * ``"decoder"``: suffix of the decoder class name (``"GruSimple"`` and
      ``"GruCond"`` are the two values handled below)
    * ``"scope"``, ``"initializer"``, ``"regularizer"``, ``"keep_prob"``:
      optional; missing entries are filled in on ``option`` itself

    Attributes set: ``cost``, ``inputs``, ``outputs``, ``updates``,
    ``encode``, ``predict``, ``option`` and, for ``"GruSimple"``,
    ``generate``.
    """
    # NOTE(review): nesting below was reconstructed from flattened source
    # lines -- confirm scope extents against the original layout.

    # source and target embedding dim
    sedim, tedim = option["embdim"]
    # source, target and attention hidden dim
    shdim, thdim, ahdim = option["hidden"]
    # maxout hidden dim
    maxdim = option["maxhid"]
    # maxout part
    maxpart = option["maxpart"]
    # deepout hidden dim
    deephid = option["deephid"]
    svocab, tvocab = option["vocabulary"]
    sw2id, sid2w = svocab
    tw2id, tid2w = tvocab
    # source and target vocabulary size
    svsize, tvsize = len(sid2w), len(tid2w)

    # fill in defaults; this mutates the caller-visible option dict
    if "scope" not in option or option["scope"] is None:
        option["scope"] = "rnnsearch"

    if "initializer" not in option:
        option["initializer"] = None

    if "regularizer" not in option:
        option["regularizer"] = None

    if "keep_prob" not in option:
        option["keep_prob"] = 1.0

    dtype = theano.config.floatX
    initializer = option["initializer"]
    regularizer = option["regularizer"]
    # a falsy keep_prob (None, 0) is treated as "no dropout"
    keep_prob = option["keep_prob"] or 1.0

    scope = option["scope"]
    decoder_scope = "decoder"

    encoder = Encoder(sedim, shdim)
    # NOTE(review): eval() executes whatever string option["decoder"]
    # holds -- only safe while that value comes from trusted configuration.
    decoderType = eval("Decoder{}".format(option["decoder"]))
    decoder = decoderType(tedim, thdim, ahdim, 2 * shdim, dim_maxout=maxdim,
                          max_part=maxpart, dim_readout=deephid,
                          n_y_vocab=tvsize)

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        src_seq = T.imatrix("source_sequence")
        src_mask = T.matrix("source_sequence_mask")
        tgt_seq = T.imatrix("target_sequence")
        tgt_mask = T.matrix("target_sequence_mask")

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        with ops.variable_scope("target_embedding") as tgtembscope:
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
            # target_bias = ops.get_variable("bias", [tedim])
            # the decoder looks its output embedding up through this scope
            decoder.tiescope = tgtembscope

        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

        # only the source side has an embedding bias in this model
        source_inputs = source_inputs + source_bias

        if keep_prob < 1.0:
            source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
            target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        # compute initial state for decoder
        # first state of backward encoder
        final_state = r_states[0]

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim, thdim], True,
                                           scope="initial",
                                           activation=T.tanh)
            # keys for query
            mapped_keys = map_key(annotation, 2 * shdim, ahdim)

            # only the third of the four values returned by forward is kept
            _, _, cost,_ = decoder.forward(tgt_seq, target_inputs, tgt_mask,
                                           mapped_keys, src_mask, annotation,
                                           initial_state, keep_prob)

    training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
    training_outputs = [cost]

    # decoding graph; reuse=True so it shares the training variables
    with ops.variable_scope(scope, reuse=True):
        prev_words = T.ivector("prev_words")

        # disable dropout
        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        source_inputs = source_inputs + source_bias
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
        # target_inputs = target_inputs + target_bias

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        # decoder
        final_state = r_states[0]

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim, thdim], True,
                                           scope="initial",
                                           activation=T.tanh)
            mapped_keys = map_key(annotation, 2 * shdim, ahdim)

        prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
        # prev_inputs = prev_inputs + target_bias

        cond = T.neq(prev_words, 0)
        # zeros out embedding if y is 0, which indicates <s>
        prev_inputs = prev_inputs * cond[:, None]

        with ops.variable_scope(decoder_scope):
            # single decoding step with an all-ones mask
            mask = T.ones_like(prev_words, dtype=dtype)
            next_state, context = decoder.step(prev_inputs, mask,
                                               initial_state, mapped_keys,
                                               annotation, src_mask)
            # NOTE(review): any other option["decoder"] value leaves
            # ``probs`` (and ``predict`` further down) undefined.
            if option["decoder"] == "GruSimple":
                # predicts from the state before the step
                probs = decoder.prediction(prev_inputs, initial_state,
                                           context)
            elif option["decoder"] == "GruCond":
                # predicts from the state after the step
                probs = decoder.prediction(prev_inputs, next_state, context)

    # encoding
    encoding_inputs = [src_seq, src_mask]
    encoding_outputs = [annotation, initial_state, mapped_keys]
    encode = theano.function(encoding_inputs, encoding_outputs)

    if option["decoder"] == "GruSimple":
        # prediction and state generation are two separate functions here
        prediction_inputs = [prev_words, initial_state, annotation,
                             mapped_keys, src_mask]
        prediction_outputs = [probs, context]
        predict = theano.function(prediction_inputs, prediction_outputs)

        generation_inputs = [prev_words, initial_state, context]
        generation_outputs = next_state
        generate = theano.function(generation_inputs, generation_outputs)

        self.predict = predict
        self.generate = generate
    elif option["decoder"] == "GruCond":
        # a single function returns both probabilities and the next state
        prediction_inputs = [prev_words, initial_state, annotation,
                             mapped_keys, src_mask]
        prediction_outputs = [probs, next_state]
        predict = theano.function(prediction_inputs, prediction_outputs)
        self.predict = predict

    # optional graph (disabled: kept as a bare string, never executed)
    '''
    with ops.variable_scope(scope, reuse=True):
        sample = decoder.build_sampling(src_seq, src_mask,
                                        target_embedding, target_bias,
                                        mapped_keys, annotation,
                                        initial_state)
        align = decoder.build_attention(src_seq, src_mask, target_inputs,
                                        tgt_seq, tgt_mask, mapped_keys,
                                        annotation, initial_state)
        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim, thdim],
                                           True, scope="initial",
                                           activation=T.tanh)
            # keys for query
            mapped_keys = map_key(annotation, 2 * shdim, ahdim)

            _, _, _,snt_cost = decoder.forward(tgt_seq, target_inputs,
                                               tgt_mask, mapped_keys,
                                               src_mask, annotation,
                                               initial_state, 1.0)
        get_snt_cost = theano.function(training_inputs, snt_cost)
    '''

    self.cost = cost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.updates = []
    # self.align = align
    # self.sample = sample
    self.encode = encode
    # self.get_snt_cost = get_snt_cost
    self.option = option
def __init__(self, **option):
    """Build the training and decoding graphs and compile the functions.

    Keys read from ``option``:

    * ``"embdim"``: ``(source_dim, target_dim)`` embedding sizes
    * ``"hidden"``: ``(source_hidden, target_hidden, attention_hidden)``
    * ``"maxhid"``, ``"maxpart"``, ``"deephid"``: maxout width, maxout
      parts and deep-output width of the prediction layer
    * ``"vocabulary"``: ``(svocab, tvocab)``, each a ``(word2id, id2word)``
      pair; the vocabulary sizes are the lengths of the ``id2word`` parts
    * ``"scope"``, ``"initializer"``, ``"regularizer"``, ``"keep_prob"``:
      optional; missing entries are filled in on ``option`` itself

    Attributes set: ``cost``, ``inputs``, ``outputs``, ``encode``,
    ``predict``, ``generate`` and ``option``.
    """
    # NOTE(review): nesting below was reconstructed from flattened source
    # lines -- confirm scope extents against the original layout.

    # source and target embedding dim
    sedim, tedim = option["embdim"]
    # source, target and attention hidden dim
    shdim, thdim, ahdim = option["hidden"]
    # maxout hidden dim
    maxdim = option["maxhid"]
    # maxout part
    maxpart = option["maxpart"]
    # deepout hidden dim
    deephid = option["deephid"]
    svocab, tvocab = option["vocabulary"]
    sw2id, sid2w = svocab
    tw2id, tid2w = tvocab
    # source and target vocabulary size
    svsize, tvsize = len(sid2w), len(tid2w)

    # fill in defaults; this mutates the caller-visible option dict
    if "scope" not in option or option["scope"] is None:
        option["scope"] = "rnnsearch"

    if "initializer" not in option:
        option["initializer"] = None

    if "regularizer" not in option:
        option["regularizer"] = None

    if "keep_prob" not in option:
        option["keep_prob"] = 1.0

    dtype = theano.config.floatX
    scope = option["scope"]
    initializer = option["initializer"]
    regularizer = option["regularizer"]
    # a falsy keep_prob (None, 0) is treated as "no dropout"
    keep_prob = option["keep_prob"] or 1.0

    # maxout -> deep output -> softmax over the target vocabulary
    def prediction(prev_inputs, prev_state, context, keep_prob=1.0):
        features = [prev_state, prev_inputs, context]
        maxhid = nn.maxout(features, [[thdim, tedim, 2 * shdim], maxdim],
                           maxpart, True)
        readout = nn.linear(maxhid, [maxdim, deephid], False,
                            scope="deepout")

        if keep_prob < 1.0:
            readout = nn.dropout(readout, keep_prob=keep_prob)

        logits = nn.linear(readout, [deephid, tvsize], True, scope="logits")

        if logits.ndim == 3:
            # collapse the two leading axes so softmax sees a matrix
            new_shape = [logits.shape[0] * logits.shape[1], -1]
            logits = logits.reshape(new_shape)

        probs = theano.tensor.nnet.softmax(logits)

        return probs

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        # the misspelt "soruce" below is part of the runtime variable names
        src_seq = theano.tensor.imatrix("soruce_sequence")
        src_mask = theano.tensor.matrix("soruce_sequence_mask")
        tgt_seq = theano.tensor.imatrix("target_sequence")
        tgt_mask = theano.tensor.matrix("target_sequence_mask")

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        with ops.variable_scope("target_embedding"):
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
            target_bias = ops.get_variable("bias", [tedim])

        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

        source_inputs = source_inputs + source_bias
        target_inputs = target_inputs + target_bias

        if keep_prob < 1.0:
            source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
            target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

        cell = nn.rnn_cell.gru_cell([sedim, shdim])

        # NOTE(review): the wrapper is created without passing keep_prob --
        # confirm its default gives the intended dropout rate.
        if keep_prob < 1.0:
            cell = nn.rnn_cell.dropout_wrapper(cell)

        outputs = encoder(cell, source_inputs, src_mask)
        annotation = theano.tensor.concatenate(outputs, 2)

        # compute initial state for decoder
        # first state of backward encoder
        final_state = outputs[1][0]

        with ops.variable_scope("decoder"):
            initial_state = nn.feedforward(final_state, [shdim, thdim], True,
                                           scope="initial",
                                           activation=theano.tensor.tanh)

        cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

        if keep_prob < 1.0:
            cell = nn.rnn_cell.dropout_wrapper(cell)

        # run decoder
        decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                  initial_state, annotation, src_mask, ahdim)
        all_output, all_context = decoder_outputs

        # shift the target inputs one step forward; step 0 becomes zeros
        shift_inputs = theano.tensor.zeros_like(target_inputs)
        shift_inputs = theano.tensor.set_subtensor(shift_inputs[1:],
                                                   target_inputs[:-1])

        # prepend the initial state and drop the last one, so each step is
        # paired with the state that preceded it
        init_state = initial_state[None, :, :]
        all_states = theano.tensor.concatenate([init_state, all_output], 0)
        prev_states = all_states[:-1]

        with ops.variable_scope("decoder"):
            probs = prediction(shift_inputs, prev_states, all_context,
                               keep_prob=keep_prob)

        # compute cost: masked negative log-likelihood, summed over axis 0
        # and averaged over what remains
        idx = theano.tensor.arange(tgt_seq.flatten().shape[0])
        cost = -theano.tensor.log(probs[idx, tgt_seq.flatten()])
        cost = cost.reshape(tgt_seq.shape)
        cost = theano.tensor.sum(cost * tgt_mask, 0)
        cost = theano.tensor.mean(cost)

    training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
    training_outputs = [cost]

    # decoding graph; reuse=True so it shares the training variables
    with ops.variable_scope(scope, reuse=True):
        prev_words = theano.tensor.ivector("prev_words")

        # encoder, disable dropout
        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        source_inputs = source_inputs + source_bias

        cell = nn.rnn_cell.gru_cell([sedim, shdim])

        outputs = encoder(cell, source_inputs, src_mask)
        annotation = theano.tensor.concatenate(outputs, 2)

        # decoder
        final_state = outputs[1][0]

        with ops.variable_scope("decoder"):
            initial_state = nn.feedforward(final_state, [shdim, thdim], True,
                                           scope="initial",
                                           activation=theano.tensor.tanh)

        inputs = nn.embedding_lookup(target_embedding, prev_words)
        inputs = inputs + target_bias

        cond = theano.tensor.neq(prev_words, 0)
        # zeros out embedding if y is 0
        inputs = inputs * cond[:, None]

        cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

        # one decoding step: attention, cell update and prediction
        with ops.variable_scope("decoder"):
            mapped_states = map_attention_states(annotation, 2 * shdim,
                                                 ahdim)
            alpha = attention(initial_state, mapped_states, thdim, ahdim,
                              src_mask)
            context = theano.tensor.sum(alpha[:, :, None] * annotation, 0)
            output, next_state = cell([inputs, context], initial_state)
            # prediction uses its default keep_prob=1.0 here (no dropout)
            probs = prediction(inputs, initial_state, context)

    # encoding
    encoding_inputs = [src_seq, src_mask]
    encoding_outputs = [annotation, initial_state, mapped_states]
    encode = theano.function(encoding_inputs, encoding_outputs)

    # prediction returns the attention weights alongside the probabilities
    prediction_inputs = [prev_words, initial_state, annotation,
                         mapped_states, src_mask]
    prediction_outputs = [probs, context, alpha]
    predict = theano.function(prediction_inputs, prediction_outputs)

    # the next state is computed by a separate function from the context
    generation_inputs = [prev_words, initial_state, context]
    generation_outputs = next_state
    generate = theano.function(generation_inputs, generation_outputs)

    self.cost = cost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.encode = encode
    self.predict = predict
    self.generate = generate
    self.option = option
def __init__(self, **option):
    """Build the training/decoding graphs and compile the Theano functions.

    This variant adds a source-side tagger (scope "srcpostagger"): its
    softmax output is projected through a tag embedding and concatenated
    to the bidirectional encoder annotation. The decoder additionally
    returns a target-tag cost, and the training cost is
    ``transcost + lambda[0] * srcpos_cost + lambda[1] * tgtpos_cost``.

    Keyword options read here:
        embdim: ``(sedim, tedim, xposhdim, yposhdim)``.
        hidden: ``(shdim, thdim, ahdim, xposnn, yposnn, word2pos,
            pos2word, pos2pos)``.
        maxhid, maxpart, deephid: forwarded to the decoder constructor.
        vocabulary: ``((sw2id, sid2w), (tw2id, tid2w),
            (stag2id, ttag2id))``; only the sizes are used.
        decoder: suffix of the decoder class name; "GruSimple" and
            "GruCond" are the values handled when compiling ``predict``.
        lambda: indexable pair of cost weights.
        scope, initializer, regularizer, keep_prob: optional; they
            default to "rnnsearch", None, None and 1.0.

    Attributes set: ``cost``, ``inputs``, ``outputs``, ``updates``,
    ``encode``, ``option`` and ``predict`` (plus ``generate`` for
    "GruSimple").

    NOTE(review): the nesting of the variable scopes was reconstructed
    from whitespace-collapsed source -- in particular whether the four
    ``map_key`` calls belong inside "srcpostagger" -- confirm against a
    known-good copy before loading old checkpoints.
    """
    # source and target embedding dim
    sedim, tedim, xposhdim, yposhdim = option["embdim"]
    # source, target and attention hidden dim
    (shdim, thdim, ahdim, xposnn, yposnn,
     word2pos, pos2word, pos2pos) = option["hidden"]
    # maxout hidden dim
    maxdim = option["maxhid"]
    # maxout part
    maxpart = option["maxpart"]
    # deepout hidden dim
    deephid = option["deephid"]
    svocab, tvocab, tagvocab = option["vocabulary"]
    sw2id, sid2w = svocab
    tw2id, tid2w = tvocab
    stag2id, ttag2id = tagvocab
    # source and target vocabulary size
    svsize, tvsize = len(sid2w), len(tid2w)
    stagsize, ttagsize = len(stag2id), len(ttag2id)

    if "scope" not in option or option["scope"] is None:
        option["scope"] = "rnnsearch"

    if "initializer" not in option:
        option["initializer"] = None

    if "regularizer" not in option:
        option["regularizer"] = None

    if "keep_prob" not in option:
        option["keep_prob"] = 1.0

    dtype = theano.config.floatX
    initializer = option["initializer"]
    regularizer = option["regularizer"]
    # a falsy keep_prob (None or 0) means "no dropout"
    keep_prob = option["keep_prob"] or 1.0

    scope = option["scope"]
    decoder_scope = "decoder"

    encoder = Encoder(sedim, shdim)
    # NOTE(review): eval() on a configuration string; this is only safe
    # while option["decoder"] comes from trusted configuration.
    decoderType = eval("Decoder{}".format(option["decoder"]))
    decoder = decoderType(tedim, thdim, ahdim, 2 * shdim + xposhdim,
                          dim_maxout=maxdim, max_part=maxpart,
                          dim_readout=deephid, n_y_vocab=tvsize,
                          n_y_tagvocab=ttagsize, poshdim=yposhdim,
                          posnndim=yposnn, word2pos=word2pos,
                          pos2word=pos2word, pos2pos=pos2pos)

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        src_seq = T.imatrix("source_sequence")
        src_mask = T.matrix("source_sequence_mask")
        tgt_seq = T.imatrix("target_sequence")
        tgt_mask = T.matrix("target_sequence_mask")
        src_pos = T.imatrix("source_postag")
        tgt_pos = T.imatrix("target_postag")

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        with ops.variable_scope("target_embedding"):
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
            target_bias = ops.get_variable("bias", [tedim])

        with ops.variable_scope("srctag_embedding"):
            srctag_embedding = ops.get_variable("embedding",
                                                [stagsize, xposhdim])
            srctag_bias = ops.get_variable("bias", [xposhdim])

        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

        source_inputs = source_inputs + source_bias
        target_inputs = target_inputs + target_bias

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        with ops.variable_scope("srcpostagger"):
            tempstates = nn.feedforward(annotation, [shdim * 2, xposnn],
                                        True, scope="staggerstates",
                                        activation=T.nnet.relu)
            scores = nn.linear(tempstates, [xposnn, stagsize], True,
                               scope="staggerscores")
            # softmax works on a matrix: merge the two leading axes
            new_shape = [scores.shape[0] * scores.shape[1], -1]
            scores = scores.reshape(new_shape)
            srcposprobs = T.nnet.softmax(scores)
            # expected tag embedding under the predicted distribution
            srctaggerstates = (T.dot(srcposprobs, srctag_embedding) +
                               srctag_bias)
            srctaggerstates = srctaggerstates.reshape(
                [annotation.shape[0], annotation.shape[1], -1])

        # masked cross entropy of the source tagger
        idx = T.arange(src_pos.flatten().shape[0])
        ce = -T.log(srcposprobs[idx, src_pos.flatten()])
        ce = ce.reshape(src_pos.shape)
        ce = T.sum(ce * src_mask, 0)
        srcpos_cost = T.mean(ce)

        tempposkeys = T.concatenate([srctaggerstates, tempstates], -1)
        src_words_keys = map_key(annotation, 2 * shdim, ahdim,
                                 "srcwordkeys")
        src_pos_keys = map_key(tempposkeys, xposnn + xposhdim, word2pos,
                               "srcposkeys")
        pos_words_keys = map_key(annotation, 2 * shdim, pos2word,
                                 "pos2wordkeys")
        pos_pos_keys = map_key(tempposkeys, xposnn + xposhdim, pos2pos,
                               "pos2poskeys")

        annotation = T.concatenate([annotation, srctaggerstates], -1)

        # compute initial state for decoder
        # first state of backward encoder
        final_state = T.concatenate([r_states[0], srctaggerstates[0]], -1)

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state,
                                           [shdim + xposhdim, thdim],
                                           True, scope="initial",
                                           activation=T.tanh)
            _, _, transcost, _, tgtpos_cost = decoder.forward(
                tgt_seq, target_inputs, tgt_mask, src_words_keys,
                src_pos_keys, pos_words_keys, pos_pos_keys, src_mask,
                annotation, initial_state, tgt_pos, keep_prob)

    lambx = theano.shared(numpy.asarray(option["lambda"][0], dtype),
                          "lambdax")
    lamby = theano.shared(numpy.asarray(option["lambda"][1], dtype),
                          "lambday")

    totalcost = transcost + lambx * srcpos_cost + lamby * tgtpos_cost

    training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, src_pos,
                       tgt_pos]
    training_outputs = [srcpos_cost, tgtpos_cost, transcost, totalcost]

    # decoding graph: same scopes with reuse=True so parameters are shared
    with ops.variable_scope(scope, reuse=True):
        prev_words = T.ivector("prev_words")

        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        source_inputs = source_inputs + source_bias

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        with ops.variable_scope("srcpostagger"):
            tempstates = nn.feedforward(annotation, [shdim * 2, xposnn],
                                        True, scope="staggerstates",
                                        activation=T.nnet.relu)
            scores = nn.linear(tempstates, [xposnn, stagsize], True,
                               scope="staggerscores")
            new_shape = [scores.shape[0] * scores.shape[1], -1]
            scores = scores.reshape(new_shape)
            srcposprobs = T.nnet.softmax(scores)
            srctaggerstates = (T.dot(srcposprobs, srctag_embedding) +
                               srctag_bias)
            srctaggerstates = srctaggerstates.reshape(
                [annotation.shape[0], annotation.shape[1], -1])

        tempposkeys = T.concatenate([srctaggerstates, tempstates], -1)
        src_words_keys = map_key(annotation, 2 * shdim, ahdim,
                                 "srcwordkeys")
        src_pos_keys = map_key(tempposkeys, xposnn + xposhdim, word2pos,
                               "srcposkeys")
        pos_words_keys = map_key(annotation, 2 * shdim, pos2word,
                                 "pos2wordkeys")
        pos_pos_keys = map_key(tempposkeys, xposnn + xposhdim, pos2pos,
                               "pos2poskeys")

        annotation = T.concatenate([annotation, srctaggerstates], -1)

        # decoder
        final_state = T.concatenate([r_states[0], srctaggerstates[0]], -1)

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state,
                                           [shdim + xposhdim, thdim],
                                           True, scope="initial",
                                           activation=T.tanh)

        prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
        prev_inputs = prev_inputs + target_bias

        cond = T.neq(prev_words, 0)
        # zeros out embedding if y is 0, which indicates <s>
        prev_inputs = prev_inputs * cond[:, None]

        with ops.variable_scope(decoder_scope):
            mask = T.ones_like(prev_words, dtype=dtype)
            next_state, context, next_pos, tgtposprob = decoder.step(
                prev_inputs, mask, initial_state, src_words_keys,
                src_pos_keys, pos_words_keys, pos_pos_keys, annotation,
                src_mask)
            if option["decoder"] == "GruSimple":
                probs = decoder.prediction(prev_inputs, initial_state,
                                           context)
            elif option["decoder"] == "GruCond":
                probs = decoder.prediction(prev_inputs, next_state,
                                           context, next_pos)

    # encoding
    encoding_inputs = [src_seq, src_mask]
    encoding_outputs = [annotation, initial_state, src_words_keys,
                        src_pos_keys, pos_words_keys, pos_pos_keys,
                        srcposprobs]
    encode = theano.function(encoding_inputs, encoding_outputs)

    # Symbolic inputs of one decoding step: exactly the tensors that were
    # handed to decoder.step above (plus prev_words behind prev_inputs).
    step_inputs = [prev_words, initial_state, annotation, src_words_keys,
                   src_pos_keys, pos_words_keys, pos_pos_keys, src_mask]

    if option["decoder"] == "GruSimple":
        # The original listed an undefined name `mapped_keys` here, which
        # raised NameError; the four key tensors replace it.
        # NOTE(review): confirm this input list against DecoderGruSimple.
        prediction_outputs = [probs, context]
        predict = theano.function(step_inputs, prediction_outputs,
                                  on_unused_input='warn')

        generation_inputs = [prev_words, initial_state, context]
        generation_outputs = next_state
        generate = theano.function(generation_inputs, generation_outputs)

        self.predict = predict
        self.generate = generate
    elif option["decoder"] == "GruCond":
        prediction_outputs = [probs, next_state, tgtposprob]
        predict = theano.function(step_inputs, prediction_outputs,
                                  on_unused_input='warn')
        self.predict = predict

    self.cost = totalcost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.updates = []
    self.encode = encode
    self.option = option
def __init__(self, **option):
    """Build the training/decoding graphs and compile the Theano functions.

    A ``softdec.SoftDecoder`` first produces ``soft_states`` from the
    encoder annotation; the main ``decoder2.DecoderGruCond`` then attends
    over both the annotation and those soft states. The training cost is
    ``lambda * soft_cost + (1 - lambda) * mean(snt_cost)``.

    Keyword options read here:
        embdim: ``(sedim, tedim)``.
        hidden: ``(shdim, thdim, ahdim)``.
        maxhid, maxpart: read (so they must be present) but not used.
        deephid: readout dimension passed to both decoders.
        vocabulary: ``((sw2id, sid2w), (tw2id, tid2w))``; only the sizes
            are used.
        method: forwarded to ``decoder2.DecoderGruCond``.
        eosid, softk: forwarded to ``softdec.SoftDecoder``.
        lambda: interpolation weight of the soft-decoder cost.
        decoder: "GruCond" compiles ``predict``; "GruSimple" raises.
        scope, initializer, regularizer, keep_prob: optional; they
            default to "rnnsearch", None, None and 1.0.

    Attributes set: ``cost``, ``inputs``, ``outputs``, ``updates``,
    ``align``, ``sample``, ``encode``, ``get_snt_cost``, ``option`` and,
    for "GruCond", ``predict``.

    Raises:
        ValueError: if ``option["decoder"]`` is "GruSimple".

    NOTE(review): the nesting of the variable scopes was reconstructed
    from whitespace-collapsed source; confirm against a known-good copy
    before loading old checkpoints.
    """
    # source and target embedding dim
    sedim, tedim = option["embdim"]
    # source, target and attention hidden dim
    shdim, thdim, ahdim = option["hidden"]
    # maxout hidden dim
    maxdim = option["maxhid"]
    # maxout part
    maxpart = option["maxpart"]
    # deepout hidden dim
    deephid = option["deephid"]
    svocab, tvocab = option["vocabulary"]
    sw2id, sid2w = svocab
    tw2id, tid2w = tvocab
    # source and target vocabulary size
    svsize, tvsize = len(sid2w), len(tid2w)

    if "scope" not in option or option["scope"] is None:
        option["scope"] = "rnnsearch"

    if "initializer" not in option:
        option["initializer"] = None

    if "regularizer" not in option:
        option["regularizer"] = None

    if "keep_prob" not in option:
        option["keep_prob"] = 1.0

    dtype = theano.config.floatX
    initializer = option["initializer"]
    regularizer = option["regularizer"]
    # a falsy keep_prob (None or 0) means "no dropout"
    keep_prob = option["keep_prob"] or 1.0

    scope = option["scope"]
    decoder_scope = "decoder2"

    encoder = Encoder(sedim, shdim)
    import decoder2
    decoder = decoder2.DecoderGruCond(2, option['method'], tedim, thdim,
                                      ahdim, 2 * shdim + thdim,
                                      dim_readout=deephid,
                                      n_y_vocab=tvsize)

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        src_seq = T.imatrix("source_sequence")
        src_mask = T.matrix("source_sequence_mask")
        tgt_seq = T.imatrix("target_sequence")
        tgt_mask = T.matrix("target_sequence_mask")
        byseq = T.imatrix("backward_target_sequence")

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        with ops.variable_scope("target_embedding"):
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
            target_bias = ops.get_variable("bias", [tedim])

        source_inputs = (nn.embedding_lookup(source_embedding, src_seq) +
                         source_bias)
        target_inputs = (nn.embedding_lookup(target_embedding, tgt_seq) +
                         target_bias)
        by_inputs = (nn.embedding_lookup(target_embedding, byseq) +
                     target_bias)

        if keep_prob < 1.0:
            source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
            target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)
            by_inputs = nn.dropout(by_inputs, keep_prob=keep_prob)

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        annotation = nn.dropout(annotation, keep_prob=keep_prob)

        import softdec
        soft_decoder = softdec.SoftDecoder(option["eosid"],
                                           option["softk"], tedim, thdim,
                                           ahdim, 2 * shdim,
                                           dim_readout=deephid,
                                           n_y_vocab=tvsize)

        with ops.variable_scope('soft_decoder'):
            # the soft decoder starts from the last forward encoder state
            initial_state = nn.feedforward(states[-1], [shdim, thdim],
                                           True, scope='initial',
                                           activation=T.tanh)
            mapped_keys = map_key(annotation, 2 * shdim, ahdim)
            soft_states, _, _, soft_mask = soft_decoder.infer(
                mapped_keys, src_mask, annotation, initial_state,
                target_embedding, target_bias, keep_prob)

        # second pass over the same parameters, hence reuse=True
        with ops.variable_scope('soft_decoder', reuse=True):
            _, _, soft_cost, _ = soft_decoder.forward(
                byseq, by_inputs, tgt_mask, mapped_keys, src_mask,
                annotation, initial_state, keep_prob)

        # compute initial state for decoder
        # first state of backward encoder
        # initialize with only encoder state
        final_state = r_states[0]

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim, thdim],
                                           True, scope="initial",
                                           activation=T.tanh)
            # keys for query
            with ops.variable_scope('map-key-src'):
                mapped_keys_src = map_key(annotation, 2 * shdim, ahdim)
            with ops.variable_scope('map-key-soft'):
                mapped_keys_soft = map_key(soft_states, thdim, ahdim)

            _, _, _, snt_cost = decoder.forward(
                tgt_seq, target_inputs, tgt_mask,
                [mapped_keys_src, mapped_keys_soft],
                [src_mask, soft_mask],
                [annotation, soft_states],
                initial_state, keep_prob)

    true_cost = T.mean(snt_cost)
    lamb = theano.shared(numpy.asarray(option['lambda'], dtype), 'lambda')
    cost = lamb * soft_cost + (1 - lamb) * true_cost

    training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, byseq]
    training_outputs = [cost, soft_cost, true_cost]

    # The per-sentence cost function is not compiled in this variant; the
    # attribute is still set so every model exposes the same names.
    get_snt_cost = None

    # decoding graph: same scopes with reuse=True so parameters are shared
    with ops.variable_scope(scope, reuse=True):
        prev_words = T.ivector("prev_words")

        # disable dropout
        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        source_inputs = source_inputs + source_bias

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        with ops.variable_scope('soft_decoder'):
            initial_state = nn.feedforward(states[-1], [shdim, thdim],
                                           True, scope='initial',
                                           activation=T.tanh)
            mapped_keys = map_key(annotation, 2 * shdim, ahdim)
            # keep_prob fixed to 1.0: no dropout while decoding
            soft_states, _, _, soft_mask = soft_decoder.infer(
                mapped_keys, src_mask, annotation, initial_state,
                target_embedding, target_bias, 1.0)

        # decoder
        final_state = r_states[0]
        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim, thdim],
                                           True, scope="initial",
                                           activation=T.tanh)
            # keys for query
            with ops.variable_scope('map-key-src'):
                mapped_keys_src = map_key(annotation, 2 * shdim, ahdim)
            with ops.variable_scope('map-key-soft'):
                mapped_keys_soft = map_key(soft_states, thdim, ahdim)

        prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
        prev_inputs = prev_inputs + target_bias

        cond = T.neq(prev_words, 0)
        # zeros out embedding if y is 0, which indicates <s>
        prev_inputs = prev_inputs * cond[:, None]

        with ops.variable_scope(decoder_scope):
            mask = T.ones_like(prev_words, dtype=dtype)
            next_state, context = decoder.step(
                prev_inputs, mask, initial_state, mapped_keys_src,
                mapped_keys_soft, annotation, soft_states, src_mask,
                soft_mask)
            probs = decoder.prediction(prev_inputs, next_state, context)

    # encoding
    encoding_inputs = [src_seq, src_mask]
    encoding_outputs = [initial_state, annotation, soft_states,
                        mapped_keys_src, mapped_keys_soft, soft_mask]
    encode = theano.function(encoding_inputs, encoding_outputs)

    if option["decoder"] == "GruSimple":
        # The original kept a predict/generate recipe after this raise;
        # it could never execute and has been removed.
        raise ValueError(
            "the GruSimple decoder is not supported by this model")
    elif option["decoder"] == "GruCond":
        prediction_inputs = [prev_words, initial_state, annotation,
                             mapped_keys_src, src_mask, soft_states,
                             mapped_keys_soft, soft_mask]
        prediction_outputs = [probs, next_state]
        predict = theano.function(prediction_inputs, prediction_outputs)
        self.predict = predict

    self.cost = cost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.updates = []
    self.align = None
    self.sample = None
    self.encode = encode

    self.get_snt_cost = get_snt_cost
    self.option = option
def __init__(self, **option):
    """Build the training/decoding graphs and compile the Theano functions.

    This variant derives two sentence-level feature vectors from the
    encoder annotation (a "Specific" one and a shared one), trains a
    domain classifier on each, and uses the features to gate the
    annotation into ``domain_annotation`` and a shared ``annotation``
    that are both passed to the decoder. The training cost is
    ``cost + dcost + tgtdcost - lambda * adv_scost``.

    Keyword options read here:
        embdim: ``(sedim, tedim)``.
        hidden: ``(shdim, thdim, ahdim, domaindim, feadim)``.
        maxhid, maxpart, deephid: forwarded to the decoder constructor.
        vocabulary: ``((sw2id, sid2w), (tw2id, tid2w))``; only the sizes
            are used.
        dnum: number of domain classes scored by the classifiers.
        decoder: suffix of the decoder class name; "GruSimple" and
            "GruCond" are the values handled when compiling ``predict``.
        lambda: weight of the adversarial term.
        scope, initializer, regularizer, keep_prob: optional; they
            default to "rnnsearch", None, None and 1.0.

    Attributes set: ``tag_predict``, ``tgt_tag_predict``, ``cost_cla``,
    ``inputs_cla``, ``outputs_cla``, ``cost``, ``inputs``, ``outputs``,
    ``updates``, ``encode``, ``option`` and ``predict`` (plus
    ``generate`` for "GruSimple").

    NOTE(review): the nesting of the variable scopes was reconstructed
    from whitespace-collapsed source -- in particular whether ``dscores``
    belongs inside "Specific" -- confirm against a known-good copy before
    loading old checkpoints.
    """
    # source and target embedding dim
    sedim, tedim = option["embdim"]
    # source, target and attention hidden dim
    shdim, thdim, ahdim, domaindim, feadim = option["hidden"]
    # maxout hidden dim
    maxdim = option["maxhid"]
    # maxout part
    maxpart = option["maxpart"]
    # deepout hidden dim
    deephid = option["deephid"]
    svocab, tvocab = option["vocabulary"]
    sw2id, sid2w = svocab
    tw2id, tid2w = tvocab
    # source and target vocabulary size
    svsize, tvsize = len(sid2w), len(tid2w)
    dnum = option['dnum']

    if "scope" not in option or option["scope"] is None:
        option["scope"] = "rnnsearch"

    if "initializer" not in option:
        option["initializer"] = None

    if "regularizer" not in option:
        option["regularizer"] = None

    if "keep_prob" not in option:
        option["keep_prob"] = 1.0

    dtype = theano.config.floatX
    initializer = option["initializer"]
    regularizer = option["regularizer"]
    # a falsy keep_prob (None or 0) means "no dropout"
    keep_prob = option["keep_prob"] or 1.0

    scope = option["scope"]
    decoder_scope = "decoder"

    encoder = Encoder(sedim, shdim)
    # NOTE(review): eval() on a configuration string; this is only safe
    # while option["decoder"] comes from trusted configuration.
    decoderType = eval("Decoder{}".format(option["decoder"]))
    decoder = decoderType(tedim, thdim, ahdim, 2 * shdim, dnum=dnum,
                          dim_maxout=maxdim, max_part=maxpart,
                          dim_readout=deephid, dim_domain=domaindim,
                          feadim=feadim, n_y_vocab=tvsize)

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        src_seq = T.imatrix("source_sequence")
        src_mask = T.matrix("source_sequence_mask")
        tgt_seq = T.imatrix("target_sequence")
        tgt_mask = T.matrix("target_sequence_mask")
        tag_seq = T.imatrix("domain_tag")

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        # No target bias in this variant; the scope object is handed to
        # the decoder through its `tiescope` attribute.
        with ops.variable_scope("target_embedding") as tgtembscope:
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
        decoder.tiescope = tgtembscope

        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

        source_inputs = source_inputs + source_bias

        if keep_prob < 1.0:
            source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
            target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        with ops.variable_scope("Specific"):
            domain_alpha = domain_sensitive_attention(
                annotation, src_mask, shdim * 2, domaindim)
            # attention-weighted sum over axis 0 of the annotation
            domain_context = T.sum(annotation * domain_alpha[:, :, None],
                                   0)
            dfeature = nn.feedforward(domain_context,
                                      [shdim * 2, feadim], True,
                                      activation=T.tanh, scope="feature1")
            # scores over the dnum domain classes
            dscores = nn.feedforward(dfeature, [feadim, dnum], True,
                                     activation=T.tanh, scope="score")
            dprobs = T.nnet.softmax(dscores)
            dpred_tag = T.argmax(dprobs, 1)
            didx = T.arange(tag_seq.flatten().shape[0])
            dce = -T.log(dprobs[didx, tag_seq.flatten()])
            dcost = T.mean(dce)

        share_alpha = domain_sensitive_attention(
            annotation, src_mask, shdim * 2, domaindim)
        share_context = T.sum(annotation * share_alpha[:, :, None], 0)
        sfeature = nn.feedforward(share_context, [shdim * 2, feadim],
                                  True, activation=T.tanh,
                                  scope="feature1")

        with ops.variable_scope("Shared"):
            # scores over the dnum domain classes
            sscores = nn.feedforward(sfeature, [feadim, dnum], True,
                                     activation=T.tanh, scope="score")
            sprobs = T.nnet.softmax(sscores)
            spred_tag = T.argmax(sprobs, 1)
            sidx = T.arange(tag_seq.flatten().shape[0])
            sce = -T.log(sprobs[sidx, tag_seq.flatten()])
            scost = T.mean(sce)
            # -p * log(p) of the gold domain; subtracted (scaled by
            # lambda) from the final cost below
            adv_sce = (-sprobs[sidx, tag_seq.flatten()] *
                       T.log(sprobs[sidx, tag_seq.flatten()]))
            adv_scost = T.mean(adv_sce)

        domain_gate = nn.feedforward([dfeature, annotation],
                                     [[feadim, shdim * 2], shdim * 2],
                                     True, scope="domain_gate")
        domain_annotation = annotation * domain_gate
        domain_annotation = nn.dropout(domain_annotation,
                                       keep_prob=keep_prob)

        share_gate = nn.feedforward([sfeature, annotation],
                                    [[feadim, shdim * 2], shdim * 2],
                                    True, scope="share_gate")
        annotation = annotation * share_gate
        annotation = nn.dropout(annotation, keep_prob=keep_prob)

        # compute initial state for decoder
        # first state of backward encoder
        # The annotation is concat([states, r_states]) on the last axis,
        # so its second half at step 0 is the gated backward state.
        # `//` keeps the slice bound an integer; `/` on integer tensors is
        # deprecated in Theano and becomes true division under Python 3.
        half = annotation.shape[-1] // 2
        final_state = T.concatenate([annotation[0, :, half:],
                                     domain_annotation[0, :, half:]], -1)

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state,
                                           [shdim * 2, thdim], True,
                                           scope="initial",
                                           activation=T.tanh)
            # keys for query
            mapped_keys = map_key(annotation, 2 * shdim, ahdim,
                                  "semantic")
            mapped_domain_keys = map_key(domain_annotation, 2 * shdim,
                                         ahdim, "domain")

            _, _, cost, tgtdcost, tpred_tag, _ = decoder.forward(
                tgt_seq, target_inputs, tgt_mask, mapped_keys, src_mask,
                annotation, initial_state, mapped_domain_keys,
                domain_annotation, tag_seq, keep_prob)

    lamb = theano.shared(numpy.asarray(option["lambda"], dtype), "lambda")
    final_cost = cost + dcost + tgtdcost - lamb * adv_scost

    # domain prediction from the source sentence only
    tag_inputs = [src_seq, src_mask]
    tag_outputs = [dpred_tag, spred_tag]
    tag_predict = theano.function(tag_inputs, tag_outputs)
    self.tag_predict = tag_predict

    # domain prediction produced by the decoder
    tgt_tag_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
    tgt_tag_outputs = [tpred_tag]
    tgt_tag_predict = theano.function(tgt_tag_inputs, tgt_tag_outputs)
    self.tgt_tag_predict = tgt_tag_predict

    training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, tag_seq]
    training_outputs = [cost, dcost, adv_scost, tgtdcost]

    # separate objective exposed for the shared-feature classifier
    self.cost_cla = scost
    self.inputs_cla = [src_seq, src_mask, tag_seq]
    self.outputs_cla = [scost]

    # decoding graph: same scopes with reuse=True so parameters are shared
    with ops.variable_scope(scope, reuse=True):
        prev_words = T.ivector("prev_words")

        # disable dropout
        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        source_inputs = source_inputs + source_bias

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        with ops.variable_scope("Specific"):
            domain_alpha = domain_sensitive_attention(
                annotation, src_mask, shdim * 2, domaindim)
            domain_context = T.sum(annotation * domain_alpha[:, :, None],
                                   0)
            dfeature = nn.feedforward(domain_context,
                                      [shdim * 2, feadim], True,
                                      activation=T.tanh, scope="feature1")

        share_alpha = domain_sensitive_attention(
            annotation, src_mask, shdim * 2, domaindim)
        share_context = T.sum(annotation * share_alpha[:, :, None], 0)
        sfeature = nn.feedforward(share_context, [shdim * 2, feadim],
                                  True, activation=T.tanh,
                                  scope="feature1")

        domain_gate = nn.feedforward([dfeature, annotation],
                                     [[feadim, shdim * 2], shdim * 2],
                                     True, scope="domain_gate")
        domain_annotation = annotation * domain_gate

        share_gate = nn.feedforward([sfeature, annotation],
                                    [[feadim, shdim * 2], shdim * 2],
                                    True, scope="share_gate")
        annotation = annotation * share_gate

        # decoder
        # integer floor division for the slice bound (see training graph)
        half = annotation.shape[-1] // 2
        final_state = T.concatenate([annotation[0, :, half:],
                                     domain_annotation[0, :, half:]], -1)

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state,
                                           [shdim * 2, thdim], True,
                                           scope="initial",
                                           activation=T.tanh)
            mapped_keys = map_key(annotation, 2 * shdim, ahdim,
                                  "semantic")
            mapped_domain_keys = map_key(domain_annotation, 2 * shdim,
                                         ahdim, "domain")

        prev_inputs = nn.embedding_lookup(target_embedding, prev_words)

        cond = T.neq(prev_words, 0)
        # zeros out embedding if y is 0, which indicates <s>
        prev_inputs = prev_inputs * cond[:, None]

        with ops.variable_scope(decoder_scope):
            mask = T.ones_like(prev_words, dtype=dtype)
            next_state, context = decoder.step(
                prev_inputs, mask, initial_state, mapped_keys, annotation,
                src_mask, mapped_domain_keys, domain_annotation)
            if option["decoder"] == "GruSimple":
                probs = decoder.prediction(prev_inputs, initial_state,
                                           context)
            elif option["decoder"] == "GruCond":
                probs = decoder.prediction(prev_inputs, next_state,
                                           context)

    # encoding
    encoding_inputs = [src_seq, src_mask]
    encoding_outputs = [annotation, initial_state, mapped_keys,
                        mapped_domain_keys, domain_annotation]
    encode = theano.function(encoding_inputs, encoding_outputs)

    if option["decoder"] == "GruSimple":
        prediction_inputs = [prev_words, initial_state, annotation,
                             mapped_keys, src_mask]
        prediction_outputs = [probs, context]
        predict = theano.function(prediction_inputs, prediction_outputs)

        generation_inputs = [prev_words, initial_state, context]
        generation_outputs = next_state
        generate = theano.function(generation_inputs, generation_outputs)

        self.predict = predict
        self.generate = generate
    elif option["decoder"] == "GruCond":
        prediction_inputs = [prev_words, initial_state, annotation,
                             mapped_keys, src_mask, mapped_domain_keys,
                             domain_annotation]
        prediction_outputs = [probs, next_state]
        predict = theano.function(prediction_inputs, prediction_outputs)
        self.predict = predict

    self.cost = final_cost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.updates = []
    self.encode = encode
    self.option = option
def __init__(self, **option):
    """Build the model graphs and compile the Theano functions.

    Four graphs are built in order inside the same variable scope (the
    last three with ``reuse=True``): training, step-wise decoding,
    sampling and alignment.

    Required keys in ``option``:
        embdim: pair ``(sedim, tedim)``, source/target embedding dims.
        hidden: triple ``(shdim, thdim, ahdim)``, source, target and
            attention hidden dims.
        maxhid: maxout hidden dim.
        maxpart: maxout part.
        deephid: deepout hidden dim.
        vocabulary: pair ``(svocab, tvocab)``; each is itself a pair
            whose second element's ``len()`` is used as the vocabulary
            size (presumably ``(word2id, id2word)`` -- confirm against
            the caller).

    Optional keys (defaults are written back into ``option``):
        scope: variable scope name, ``"rnnsearch"`` if missing or None.
        initializer, regularizer: forwarded to ``ops.variable_scope``,
            default None.
        criterion: ``"mle"`` (default) or ``"mrt"``.
        keep_prob: dropout keep probability, default 1.0; forced to 1.0
            when ``criterion == "mrt"``.

    Attributes set:
        cost, inputs, outputs: symbolic training cost, the list of
            symbolic training inputs, and ``[cost]``.
        updates: always an empty list.
        encode(src_seq, src_mask)
            -> [annotation, initial_state, mapped_states]
        predict(prev_words, state, annotation, mapped_states, src_mask)
            -> [probs, context, alpha]
        generate(prev_words, state, context) -> next_state
        sample(src_seq, src_mask, max_len) -> sampled_words
        align(src_seq, src_mask, tgt_seq, tgt_mask) -> attention scores
        option: the (default-filled) option dict.

    Raises:
        ValueError: if ``criterion`` is neither ``"mle"`` nor ``"mrt"``.
    """
    # source and target embedding dim
    sedim, tedim = option["embdim"]
    # source, target and attention hidden dim
    shdim, thdim, ahdim = option["hidden"]
    # maxout hidden dim
    maxdim = option["maxhid"]
    # maxout part
    maxpart = option["maxpart"]
    # deepout hidden dim
    deephid = option["deephid"]
    svocab, tvocab = option["vocabulary"]
    # only the second element of each vocabulary pair is used here
    _, sid2w = svocab
    _, tid2w = tvocab
    # source and target vocabulary size
    svsize, tvsize = len(sid2w), len(tid2w)

    # fill in defaults; a scope that is present but None is also replaced
    if option.get("scope") is None:
        option["scope"] = "rnnsearch"
    option.setdefault("initializer", None)
    option.setdefault("regularizer", None)
    option.setdefault("criterion", "mle")
    option.setdefault("keep_prob", 1.0)

    dtype = theano.config.floatX
    scope = option["scope"]
    criterion = option["criterion"]
    initializer = option["initializer"]
    regularizer = option["regularizer"]
    keep_prob = option["keep_prob"] or 1.0

    # Fail early: any other value would skip creation of `loss`/`sharp`
    # but still take the MRT cost branch below, ending in a NameError
    # after part of the graph has already been built.
    if criterion not in ("mle", "mrt"):
        raise ValueError("unsupported criterion %r: expected \"mle\" or "
                         "\"mrt\"" % (criterion,))

    # MRT mode do not use dropout
    if criterion == "mrt":
        keep_prob = 1.0

    def prediction(prev_inputs, prev_state, context, keep_prob=1.0):
        # maxout -> deepout -> (optional dropout) -> logits -> softmax
        features = [prev_state, prev_inputs, context]
        maxhid = nn.maxout(features, [[thdim, tedim, 2 * shdim], maxdim],
                           maxpart, True)
        readout = nn.linear(maxhid, [maxdim, deephid], False,
                            scope="deepout")

        if keep_prob < 1.0:
            readout = nn.dropout(readout, keep_prob=keep_prob)

        logits = nn.linear(readout, [deephid, tvsize], True,
                           scope="logits")

        # collapse the two leading axes so softmax sees a matrix
        if logits.ndim == 3:
            new_shape = [logits.shape[0] * logits.shape[1], -1]
            logits = logits.reshape(new_shape)

        probs = theano.tensor.nnet.softmax(logits)

        return probs

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        src_seq = theano.tensor.imatrix("soruce_sequence")
        src_mask = theano.tensor.matrix("soruce_sequence_mask")
        tgt_seq = theano.tensor.imatrix("target_sequence")
        tgt_mask = theano.tensor.matrix("target_sequence_mask")

        if criterion == "mrt":
            loss = theano.tensor.vector("loss_score")
            sharp = theano.tensor.scalar("sharpness")

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        with ops.variable_scope("target_embedding"):
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
            target_bias = ops.get_variable("bias", [tedim])

        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

        source_inputs = source_inputs + source_bias
        target_inputs = target_inputs + target_bias

        if keep_prob < 1.0:
            source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
            target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

        cell = nn.rnn_cell.gru_cell([sedim, shdim])

        outputs = encoder(cell, source_inputs, src_mask)
        annotation = theano.tensor.concatenate(outputs, 2)

        # NOTE(review): unlike the other dropout sites this call is not
        # guarded by `keep_prob < 1.0`; presumably nn.dropout is an
        # identity at keep_prob == 1.0 -- confirm before changing.
        annotation = nn.dropout(annotation, keep_prob=keep_prob)

        # compute initial state for decoder
        # first state of backward encoder
        final_state = outputs[1][0]
        with ops.variable_scope("decoder"):
            initial_state = nn.feedforward(final_state, [shdim, thdim],
                                           True, scope="initial",
                                           activation=theano.tensor.tanh)

        # run decoder
        cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

        if criterion == "mrt":
            # In MRT training, shape of src_seq and src_mask are assumed
            # to have [len, 1]
            batch = tgt_seq.shape[1]
            with ops.variable_scope("decoder"):
                mapped_states = attention(None, annotation, None, None,
                                          [thdim, 2 * shdim, ahdim])
            # tile the single source sentence across the candidate batch
            b_src_mask = theano.tensor.repeat(src_mask, batch, 1)
            b_annotation = theano.tensor.repeat(annotation, batch, 1)
            b_mapped_states = theano.tensor.repeat(mapped_states,
                                                   batch, 1)
            b_initial_state = theano.tensor.repeat(initial_state,
                                                   batch, 0)

            decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                      b_initial_state, b_annotation,
                                      b_src_mask, ahdim,
                                      b_mapped_states)
        else:
            decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                      initial_state, annotation, src_mask,
                                      ahdim)

        all_output, all_context = decoder_outputs
        # shift target inputs one step along axis 0; step 0 is all zeros
        shift_inputs = theano.tensor.zeros_like(target_inputs)
        shift_inputs = theano.tensor.set_subtensor(shift_inputs[1:],
                                                   target_inputs[:-1])

        if criterion == "mrt":
            init_state = b_initial_state[None, :, :]
        else:
            init_state = initial_state[None, :, :]

        # state available *before* each step: initial state + all but last
        all_states = theano.tensor.concatenate([init_state, all_output], 0)
        prev_states = all_states[:-1]

        with ops.variable_scope("decoder"):
            probs = prediction(shift_inputs, prev_states, all_context,
                               keep_prob=keep_prob)

        # compute cost
        idx = theano.tensor.arange(tgt_seq.flatten().shape[0])
        ce = -theano.tensor.log(probs[idx, tgt_seq.flatten()])
        ce = ce.reshape(tgt_seq.shape)
        ce = theano.tensor.sum(ce * tgt_mask, 0)

        if criterion == "mle":
            cost = theano.tensor.mean(ce)
        else:
            # ce is positive here
            logp = -ce
            score = sharp * logp
            # safe softmax
            score = score - theano.tensor.max(score)
            score = theano.tensor.exp(score)
            qprob = score / theano.tensor.sum(score)
            risk = theano.tensor.sum(qprob * loss)
            cost = risk

    if criterion == "mle":
        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
    else:
        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, loss,
                           sharp]
    training_outputs = [cost]

    # decoding graph
    with ops.variable_scope(scope, reuse=True):
        prev_words = theano.tensor.ivector("prev_words")

        # disable dropout
        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        source_inputs = source_inputs + source_bias
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
        target_inputs = target_inputs + target_bias

        cell = nn.rnn_cell.gru_cell([sedim, shdim])
        outputs = encoder(cell, source_inputs, src_mask)
        annotation = theano.tensor.concatenate(outputs, 2)

        # decoder
        final_state = outputs[1][0]
        with ops.variable_scope("decoder"):
            initial_state = nn.feedforward(final_state, [shdim, thdim],
                                           True, scope="initial",
                                           activation=theano.tensor.tanh)

        inputs = nn.embedding_lookup(target_embedding, prev_words)
        inputs = inputs + target_bias

        cond = theano.tensor.neq(prev_words, 0)
        # zeros out embedding if y is 0
        inputs = inputs * cond[:, None]

        cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

        # encode -> prediction -> generation
        # prediction: prev_word + prev_state => context, next_word
        # generation: curr_word + context + prev_state => next_state
        # here, initial_state is merely a placeholder
        with ops.variable_scope("decoder"):
            # used in encoding
            mapped_states = attention(None, annotation, None, None,
                                      [thdim, 2 * shdim, ahdim])
            # used in prediction
            alpha = attention(initial_state, None, mapped_states,
                              src_mask, [thdim, 2 * shdim, ahdim])
            context = theano.tensor.sum(alpha[:, :, None] * annotation,
                                        0)
            probs = prediction(inputs, initial_state, context)
            # used in generation
            _, next_state = cell([inputs, context], initial_state)

    # encoding
    encoding_inputs = [src_seq, src_mask]
    encoding_outputs = [annotation, initial_state, mapped_states]
    encode = theano.function(encoding_inputs, encoding_outputs)

    prediction_inputs = [prev_words, initial_state, annotation,
                         mapped_states, src_mask]
    prediction_outputs = [probs, context, alpha]
    predict = theano.function(prediction_inputs, prediction_outputs)

    generation_inputs = [prev_words, initial_state, context]
    generation_outputs = next_state
    generate = theano.function(generation_inputs, generation_outputs)

    # sampling graph, this feature is optional
    with ops.variable_scope(scope, reuse=True):
        max_len = theano.tensor.iscalar()

        def sampling_loop(inputs, state, attn_states, attn_mask, m_states):
            alpha = attention(state, None, m_states, attn_mask,
                              [thdim, 2 * shdim, ahdim])
            context = theano.tensor.sum(alpha[:, :, None] * attn_states,
                                        0)
            probs = prediction(inputs, state, context)
            next_words = ops.random.multinomial(probs).argmax(axis=1)
            new_inputs = nn.embedding_lookup(target_embedding, next_words)
            new_inputs = new_inputs + target_bias
            _, next_state = cell([new_inputs, context], state)

            return [next_words, new_inputs, next_state]

        with ops.variable_scope("decoder"):
            batch = src_seq.shape[1]
            initial_inputs = theano.tensor.zeros([batch, tedim],
                                                 dtype=dtype)

            outputs_info = [None, initial_inputs, initial_state]
            nonseq = [annotation, src_mask, mapped_states]
            outputs, updates = theano.scan(sampling_loop, [],
                                           outputs_info,
                                           nonseq, n_steps=max_len)
            sampled_words = outputs[0]

    sampling_inputs = [src_seq, src_mask, max_len]
    sampling_outputs = sampled_words
    # the scan updates are passed on because the loop draws random samples
    sample = theano.function(sampling_inputs, sampling_outputs,
                             updates=updates)

    # attention graph, this feature is optional
    with ops.variable_scope(scope, reuse=True):
        def attention_loop(inputs, mask, state, attn_states, attn_mask,
                           m_states):
            mask = mask[:, None]
            alpha = attention(state, None, m_states, attn_mask,
                              [thdim, 2 * shdim, ahdim])
            context = theano.tensor.sum(alpha[:, :, None] * attn_states,
                                        0)
            _, next_state = cell([inputs, context], state)
            # keep the previous state wherever the mask is zero
            next_state = (1.0 - mask) * state + mask * next_state

            return [alpha, next_state]

        with ops.variable_scope("decoder"):
            seq = [target_inputs, tgt_mask]
            outputs_info = [None, initial_state]
            nonseq = [annotation, src_mask, mapped_states]
            # the scan updates are not used by the alignment function
            (alpha, state), _ = theano.scan(attention_loop, seq,
                                            outputs_info, nonseq)
            attention_score = alpha

    alignment_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
    alignment_outputs = attention_score
    align = theano.function(alignment_inputs, alignment_outputs)

    self.cost = cost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.updates = []
    self.align = align
    self.sample = sample
    self.encode = encode
    self.predict = predict
    self.generate = generate
    self.option = option