def step(self, y_prev, mask, state, *args):
    """Advance the decoder by one step, attending over ``self.n_src`` sources.

    Args:
        y_prev: input fed to the first GRU cell (``self.cell1``).
        mask: per-example step mask; it is indexed as ``mask[:, None]`` so
            that it broadcasts against the state.  Where the mask is 0 the
            previous ``state`` is carried through unchanged.
        state: previous decoder state.
        *args: exactly ``3 * self.n_src`` items laid out as
            ``keys..., values..., masks...`` (``n_src`` of each).

    Returns:
        The pair ``(next_state, context)``.

    Raises:
        AssertionError: if ``args`` does not hold ``3 * self.n_src`` items.
        ValueError: if ``self.method`` is neither ``"attn"`` nor
            ``"concat"``.
    """
    n_src = self.n_src
    assert len(args) == n_src * 3, \
        "expected %d source tensors, got %d" % (n_src * 3, len(args))
    # Fail early: an unknown method used to fall through silently and feed
    # the *last* source's context to the second GRU.
    if self.method not in ("attn", "concat"):
        raise ValueError("unsupported combination method: %r"
                         % (self.method,))

    src_keys = args[:n_src]
    src_values = args[n_src:2 * n_src]
    src_masks = args[2 * n_src:]
    mask = mask[:, None]

    # s_j^{\prime} = GRU^1(y_{j-1}, s_{j-1})
    _, state_prime = self.cell1(y_prev, state, scope="gru1")
    state_prime = (1.0 - mask) * state + mask * state_prime

    # c_j = att(H, s_j^{\prime}), one context per source
    contexts = []
    for i, (_key, _val, _mask) in enumerate(
            zip(src_keys, src_values, src_masks)):
        alpha = attention(state_prime, _key, _mask, self.dim_hid,
                          self.dim_key, scope='attn_alpha_%d' % i)
        contexts.append(theano.tensor.sum(alpha[:, :, None] * _val, 0))

    if self.method == "attn":
        # Stack the per-source contexts along a new leading axis and attend
        # over them with a second ("beta") attention.
        contexts = T.reshape(T.concatenate(contexts, 0),
                             [n_src] + list(contexts[0].shape))
        # NOTE(review): the original indentation was lost; only the
        # map_key call is assumed to sit inside the "beta" scope. Confirm
        # against existing checkpoints' variable names.
        with ops.variable_scope("beta"):
            beta_keys = map_key(contexts, self.dim_value, self.dim_key)
        beta = attention(state_prime, beta_keys,
                         T.ones(contexts.shape[:2]), self.dim_hid,
                         self.dim_key, scope='beta')
        context = T.sum(beta[:, :, None] * contexts, 0)
    else:  # "concat"
        context = T.concatenate(contexts, -1)

    # s_j = GRU^2(c_j, s_j^{\prime})
    _, next_state = self.cell2(context, state_prime, scope="gru2")
    next_state = (1.0 - mask) * state + mask * next_state
    return next_state, context
def __init__(self, **option):
    """Build the training and decoding graphs of the soft-decoder model.

    Required ``option`` keys read here: ``embdim``, ``hidden``, ``deephid``,
    ``vocabulary``, ``method``, ``eosid``, ``softk``, ``lambda`` and
    ``decoder``.  Optional keys ``scope``, ``initializer``, ``regularizer``
    and ``keep_prob`` are filled with defaults when absent.

    Sets ``cost``, ``inputs``, ``outputs``, ``updates``, ``align``,
    ``sample``, ``encode``, ``get_snt_cost`` and ``option`` on the instance,
    plus ``predict`` when ``option["decoder"] == "GruCond"``.

    Raises:
        ValueError: if ``option["decoder"] == "GruSimple"``.
    """
    # This model never supported the simple decoder; fail before building
    # any graph instead of after compiling the encoder function.
    if option["decoder"] == "GruSimple":
        raise ValueError("decoder 'GruSimple' is not supported by this model")

    # source and target embedding dim
    sedim, tedim = option["embdim"]
    # source, target and attention hidden dim
    shdim, thdim, ahdim = option["hidden"]
    # deepout hidden dim
    deephid = option["deephid"]

    svocab, tvocab = option["vocabulary"]
    _, sid2w = svocab
    _, tid2w = tvocab
    # source and target vocabulary size
    svsize, tvsize = len(sid2w), len(tid2w)

    if option.get("scope") is None:
        option["scope"] = "rnnsearch"
    option.setdefault("initializer", None)
    option.setdefault("regularizer", None)
    option.setdefault("keep_prob", 1.0)

    dtype = theano.config.floatX
    initializer = option["initializer"]
    regularizer = option["regularizer"]
    keep_prob = option["keep_prob"] or 1.0
    scope = option["scope"]
    decoder_scope = "decoder2"

    encoder = Encoder(sedim, shdim)
    import decoder2
    decoder = decoder2.DecoderGruCond(2, option['method'], tedim, thdim,
                                      ahdim, 2 * shdim + thdim,
                                      dim_readout=deephid, n_y_vocab=tvsize)

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        src_seq = T.imatrix("source_sequence")
        src_mask = T.matrix("source_sequence_mask")
        tgt_seq = T.imatrix("target_sequence")
        tgt_mask = T.matrix("target_sequence_mask")
        byseq = T.imatrix("backward_target_sequence")

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        with ops.variable_scope("target_embedding"):
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
            target_bias = ops.get_variable("bias", [tedim])

        source_inputs = nn.embedding_lookup(source_embedding,
                                            src_seq) + source_bias
        target_inputs = nn.embedding_lookup(target_embedding,
                                            tgt_seq) + target_bias
        by_inputs = nn.embedding_lookup(target_embedding,
                                        byseq) + target_bias

        if keep_prob < 1.0:
            source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
            target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)
            by_inputs = nn.dropout(by_inputs, keep_prob=keep_prob)

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)
        annotation = nn.dropout(annotation, keep_prob=keep_prob)

        import softdec
        soft_decoder = softdec.SoftDecoder(option["eosid"], option["softk"],
                                           tedim, thdim, ahdim, 2 * shdim,
                                           dim_readout=deephid,
                                           n_y_vocab=tvsize)
        with ops.variable_scope('soft_decoder'):
            initial_state = nn.feedforward(states[-1], [shdim, thdim], True,
                                           scope='initial',
                                           activation=T.tanh)
            mapped_keys = map_key(annotation, 2 * shdim, ahdim)
            soft_states, _, _, soft_mask = soft_decoder.infer(
                mapped_keys, src_mask, annotation, initial_state,
                target_embedding, target_bias, keep_prob)

        # Second pass over the same scope with reuse=True, scoring byseq.
        with ops.variable_scope('soft_decoder', reuse=True):
            _, _, soft_cost, _ = soft_decoder.forward(
                byseq, by_inputs, tgt_mask, mapped_keys, src_mask,
                annotation, initial_state, keep_prob)

        # compute initial state for decoder
        # first state of backward encoder
        # initialize with only encoder state
        final_state = r_states[0]

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim, thdim], True,
                                           scope="initial",
                                           activation=T.tanh)
            # keys for query
            with ops.variable_scope('map-key-src'):
                mapped_keys_src = map_key(annotation, 2 * shdim, ahdim)
            with ops.variable_scope('map-key-soft'):
                mapped_keys_soft = map_key(soft_states, thdim, ahdim)

            _, _, _, snt_cost = decoder.forward(
                tgt_seq, target_inputs, tgt_mask,
                [mapped_keys_src, mapped_keys_soft],
                [src_mask, soft_mask],
                [annotation, soft_states],
                initial_state, keep_prob)

        true_cost = T.mean(snt_cost)
        lamb = theano.shared(numpy.asarray(option['lambda'], dtype),
                             'lambda')
        # Interpolate the soft-decoder cost and the main decoder cost.
        cost = lamb * soft_cost + (1 - lamb) * true_cost

    training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, byseq]
    training_outputs = [cost, soft_cost, true_cost]
    # Per-sentence cost function is not compiled for this model.
    get_snt_cost = None

    # decoding graph
    with ops.variable_scope(scope, reuse=True):
        prev_words = T.ivector("prev_words")

        # disable dropout
        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        source_inputs = source_inputs + source_bias

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        with ops.variable_scope('soft_decoder'):
            initial_state = nn.feedforward(states[-1], [shdim, thdim], True,
                                           scope='initial',
                                           activation=T.tanh)
            mapped_keys = map_key(annotation, 2 * shdim, ahdim)
            soft_states, _, _, soft_mask = soft_decoder.infer(
                mapped_keys, src_mask, annotation, initial_state,
                target_embedding, target_bias, 1.0)

        # decoder
        final_state = r_states[0]
        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim, thdim], True,
                                           scope="initial",
                                           activation=T.tanh)
            # keys for query
            with ops.variable_scope('map-key-src'):
                mapped_keys_src = map_key(annotation, 2 * shdim, ahdim)
            with ops.variable_scope('map-key-soft'):
                mapped_keys_soft = map_key(soft_states, thdim, ahdim)

        prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
        prev_inputs = prev_inputs + target_bias

        cond = T.neq(prev_words, 0)
        # zeros out embedding if y is 0, which indicates <s>
        prev_inputs = prev_inputs * cond[:, None]

        with ops.variable_scope(decoder_scope):
            mask = T.ones_like(prev_words, dtype=dtype)
            next_state, context = decoder.step(
                prev_inputs, mask, initial_state,
                mapped_keys_src, mapped_keys_soft,
                annotation, soft_states,
                src_mask, soft_mask)
            probs = decoder.prediction(prev_inputs, next_state, context)

    # encoding
    encoding_inputs = [src_seq, src_mask]
    encoding_outputs = [
        initial_state, annotation, soft_states, mapped_keys_src,
        mapped_keys_soft, soft_mask
    ]
    encode = theano.function(encoding_inputs, encoding_outputs)

    if option["decoder"] == "GruCond":
        prediction_inputs = [
            prev_words, initial_state, annotation, mapped_keys_src,
            src_mask, soft_states, mapped_keys_soft, soft_mask
        ]
        prediction_outputs = [probs, next_state]
        self.predict = theano.function(prediction_inputs,
                                       prediction_outputs)

    self.cost = cost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.updates = []
    self.align = None
    self.sample = None
    self.encode = encode
    self.get_snt_cost = get_snt_cost
    self.option = option
def __init__(self, **option):
    """Build the training and decoding graphs of the baseline model.

    Required ``option`` keys read here: ``embdim``, ``hidden``, ``maxhid``,
    ``maxpart``, ``deephid``, ``vocabulary`` and ``decoder``.  Optional keys
    ``scope``, ``initializer``, ``regularizer`` and ``keep_prob`` are filled
    with defaults when absent.

    Sets ``cost``, ``inputs``, ``outputs``, ``updates``, ``encode`` and
    ``option`` on the instance; ``predict`` for the ``"GruSimple"`` and
    ``"GruCond"`` decoders, and additionally ``generate`` for
    ``"GruSimple"``.
    """
    # source and target embedding dim
    sedim, tedim = option["embdim"]
    # source, target and attention hidden dim
    shdim, thdim, ahdim = option["hidden"]
    # maxout hidden dim
    maxdim = option["maxhid"]
    # maxout part
    maxpart = option["maxpart"]
    # deepout hidden dim
    deephid = option["deephid"]

    svocab, tvocab = option["vocabulary"]
    _, sid2w = svocab
    _, tid2w = tvocab
    # source and target vocabulary size
    svsize, tvsize = len(sid2w), len(tid2w)

    if option.get("scope") is None:
        option["scope"] = "rnnsearch"
    option.setdefault("initializer", None)
    option.setdefault("regularizer", None)
    option.setdefault("keep_prob", 1.0)

    dtype = theano.config.floatX
    initializer = option["initializer"]
    regularizer = option["regularizer"]
    keep_prob = option["keep_prob"] or 1.0
    scope = option["scope"]
    decoder_scope = "decoder"
    decoder_kind = option["decoder"]

    encoder = Encoder(sedim, shdim)
    # NOTE(review): eval() on a configuration value -- only safe while
    # option["decoder"] comes from trusted configuration.
    decoderType = eval("Decoder{}".format(decoder_kind))
    decoder = decoderType(tedim, thdim, ahdim, 2 * shdim,
                          dim_maxout=maxdim, max_part=maxpart,
                          dim_readout=deephid, n_y_vocab=tvsize)

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        src_seq = T.imatrix("source_sequence")
        src_mask = T.matrix("source_sequence_mask")
        tgt_seq = T.imatrix("target_sequence")
        tgt_mask = T.matrix("target_sequence_mask")

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        # No target bias is created here; the embedding scope is handed to
        # the decoder through its ``tiescope`` attribute.
        with ops.variable_scope("target_embedding") as tgtembscope:
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
            decoder.tiescope = tgtembscope

        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
        source_inputs = source_inputs + source_bias

        if keep_prob < 1.0:
            source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
            target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        # compute initial state for decoder
        # first state of backward encoder
        final_state = r_states[0]

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim, thdim], True,
                                           scope="initial",
                                           activation=T.tanh)
            # keys for query
            mapped_keys = map_key(annotation, 2 * shdim, ahdim)

            _, _, cost, _ = decoder.forward(tgt_seq, target_inputs, tgt_mask,
                                            mapped_keys, src_mask,
                                            annotation, initial_state,
                                            keep_prob)

    training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
    training_outputs = [cost]

    # decoding graph
    with ops.variable_scope(scope, reuse=True):
        prev_words = T.ivector("prev_words")

        # disable dropout
        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        source_inputs = source_inputs + source_bias

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        # decoder
        final_state = r_states[0]
        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim, thdim], True,
                                           scope="initial",
                                           activation=T.tanh)
            mapped_keys = map_key(annotation, 2 * shdim, ahdim)

        prev_inputs = nn.embedding_lookup(target_embedding, prev_words)

        cond = T.neq(prev_words, 0)
        # zeros out embedding if y is 0, which indicates <s>
        prev_inputs = prev_inputs * cond[:, None]

        with ops.variable_scope(decoder_scope):
            mask = T.ones_like(prev_words, dtype=dtype)
            next_state, context = decoder.step(prev_inputs, mask,
                                               initial_state, mapped_keys,
                                               annotation, src_mask)
            if decoder_kind == "GruSimple":
                probs = decoder.prediction(prev_inputs, initial_state,
                                           context)
            elif decoder_kind == "GruCond":
                probs = decoder.prediction(prev_inputs, next_state, context)

    # encoding
    encoding_inputs = [src_seq, src_mask]
    encoding_outputs = [annotation, initial_state, mapped_keys]
    encode = theano.function(encoding_inputs, encoding_outputs)

    if decoder_kind == "GruSimple":
        prediction_inputs = [prev_words, initial_state, annotation,
                             mapped_keys, src_mask]
        prediction_outputs = [probs, context]
        self.predict = theano.function(prediction_inputs,
                                       prediction_outputs)

        generation_inputs = [prev_words, initial_state, context]
        generation_outputs = next_state
        self.generate = theano.function(generation_inputs,
                                        generation_outputs)
    elif decoder_kind == "GruCond":
        prediction_inputs = [prev_words, initial_state, annotation,
                             mapped_keys, src_mask]
        prediction_outputs = [probs, next_state]
        self.predict = theano.function(prediction_inputs,
                                       prediction_outputs)

    # The optional sampling / alignment / per-sentence-cost graphs are
    # disabled in this model, so align, sample and get_snt_cost are not set.
    self.cost = cost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.updates = []
    self.encode = encode
    self.option = option
def __init__(self, **option):
    """Build the training and decoding graphs of the POS-tag-aware model.

    Required ``option`` keys read here: ``embdim`` (4 values), ``hidden``
    (8 values), ``maxhid``, ``maxpart``, ``deephid``, ``vocabulary``
    (source, target and tag vocabularies), ``lambda`` (two weights) and
    ``decoder``.  Optional keys ``scope``, ``initializer``, ``regularizer``
    and ``keep_prob`` are filled with defaults when absent.

    The training cost is
    ``transcost + lambda[0] * srcpos_cost + lambda[1] * tgtpos_cost``.

    Sets ``cost``, ``inputs``, ``outputs``, ``updates``, ``encode`` and
    ``option`` on the instance; ``predict`` for the ``"GruSimple"`` and
    ``"GruCond"`` decoders, and additionally ``generate`` for
    ``"GruSimple"``.
    """
    # source and target embedding dim
    sedim, tedim, xposhdim, yposhdim = option["embdim"]
    # source, target and attention hidden dim
    (shdim, thdim, ahdim, xposnn, yposnn,
     word2pos, pos2word, pos2pos) = option["hidden"]
    # maxout hidden dim
    maxdim = option["maxhid"]
    # maxout part
    maxpart = option["maxpart"]
    # deepout hidden dim
    deephid = option["deephid"]

    svocab, tvocab, tagvocab = option["vocabulary"]
    _, sid2w = svocab
    _, tid2w = tvocab
    stag2id, ttag2id = tagvocab
    # source and target vocabulary size
    svsize, tvsize = len(sid2w), len(tid2w)
    stagsize, ttagsize = len(stag2id), len(ttag2id)

    if option.get("scope") is None:
        option["scope"] = "rnnsearch"
    option.setdefault("initializer", None)
    option.setdefault("regularizer", None)
    option.setdefault("keep_prob", 1.0)

    dtype = theano.config.floatX
    initializer = option["initializer"]
    regularizer = option["regularizer"]
    keep_prob = option["keep_prob"] or 1.0
    scope = option["scope"]
    decoder_scope = "decoder"
    decoder_kind = option["decoder"]

    encoder = Encoder(sedim, shdim)
    # NOTE(review): eval() on a configuration value -- only safe while
    # option["decoder"] comes from trusted configuration.
    decoderType = eval("Decoder{}".format(decoder_kind))
    decoder = decoderType(tedim, thdim, ahdim, 2 * shdim + xposhdim,
                          dim_maxout=maxdim, max_part=maxpart,
                          dim_readout=deephid, n_y_vocab=tvsize,
                          n_y_tagvocab=ttagsize, poshdim=yposhdim,
                          posnndim=yposnn, word2pos=word2pos,
                          pos2word=pos2word, pos2pos=pos2pos)

    # NOTE(review): the original indentation was lost.  In both graphs the
    # "srcpostagger" scope is assumed to end before the map_key calls;
    # confirm against existing checkpoints' variable names.

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        src_seq = T.imatrix("source_sequence")
        src_mask = T.matrix("source_sequence_mask")
        tgt_seq = T.imatrix("target_sequence")
        tgt_mask = T.matrix("target_sequence_mask")
        src_pos = T.imatrix("source_postag")
        tgt_pos = T.imatrix("target_postag")

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        with ops.variable_scope("target_embedding"):
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
            target_bias = ops.get_variable("bias", [tedim])

        with ops.variable_scope("srctag_embedding"):
            srctag_embedding = ops.get_variable("embedding",
                                                [stagsize, xposhdim])
            srctag_bias = ops.get_variable("bias", [xposhdim])

        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
        source_inputs = source_inputs + source_bias
        target_inputs = target_inputs + target_bias

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        with ops.variable_scope("srcpostagger"):
            tempstates = nn.feedforward(annotation, [shdim * 2, xposnn],
                                        True, scope="staggerstates",
                                        activation=T.nnet.relu)
            scores = nn.linear(tempstates, [xposnn, stagsize], True,
                               scope="staggerscores")
            # Flatten the two leading axes so softmax sees a 2-D matrix.
            new_shape = [scores.shape[0] * scores.shape[1], -1]
            scores = scores.reshape(new_shape)
            srcposprobs = T.nnet.softmax(scores)
            # Expected tag embedding under the predicted tag distribution.
            srctaggerstates = T.dot(srcposprobs,
                                    srctag_embedding) + srctag_bias
            srctaggerstates = srctaggerstates.reshape(
                [annotation.shape[0], annotation.shape[1], -1])

            # Masked cross-entropy of the source tagger.
            idx = T.arange(src_pos.flatten().shape[0])
            ce = -T.log(srcposprobs[idx, src_pos.flatten()])
            ce = ce.reshape(src_pos.shape)
            ce = T.sum(ce * src_mask, 0)
            srcpos_cost = T.mean(ce)

        tempposkeys = T.concatenate([srctaggerstates, tempstates], -1)

        src_words_keys = map_key(annotation, 2 * shdim, ahdim,
                                 "srcwordkeys")
        src_pos_keys = map_key(tempposkeys, xposnn + xposhdim, word2pos,
                               "srcposkeys")
        pos_words_keys = map_key(annotation, 2 * shdim, pos2word,
                                 "pos2wordkeys")
        pos_pos_keys = map_key(tempposkeys, xposnn + xposhdim, pos2pos,
                               "pos2poskeys")

        annotation = T.concatenate([annotation, srctaggerstates], -1)

        # compute initial state for decoder
        # first state of backward encoder
        final_state = T.concatenate([r_states[0], srctaggerstates[0]], -1)

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state,
                                           [shdim + xposhdim, thdim], True,
                                           scope="initial",
                                           activation=T.tanh)
            _, _, transcost, _, tgtpos_cost = decoder.forward(
                tgt_seq, target_inputs, tgt_mask, src_words_keys,
                src_pos_keys, pos_words_keys, pos_pos_keys, src_mask,
                annotation, initial_state, tgt_pos, keep_prob)

        lambx = theano.shared(numpy.asarray(option["lambda"][0], dtype),
                              "lambdax")
        lamby = theano.shared(numpy.asarray(option["lambda"][1], dtype),
                              "lambday")
        totalcost = transcost + lambx * srcpos_cost + lamby * tgtpos_cost

    training_inputs = [
        src_seq, src_mask, tgt_seq, tgt_mask, src_pos, tgt_pos
    ]
    training_outputs = [srcpos_cost, tgtpos_cost, transcost, totalcost]

    # decoding graph
    with ops.variable_scope(scope, reuse=True):
        prev_words = T.ivector("prev_words")

        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        source_inputs = source_inputs + source_bias

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        with ops.variable_scope("srcpostagger"):
            tempstates = nn.feedforward(annotation, [shdim * 2, xposnn],
                                        True, scope="staggerstates",
                                        activation=T.nnet.relu)
            scores = nn.linear(tempstates, [xposnn, stagsize], True,
                               scope="staggerscores")
            new_shape = [scores.shape[0] * scores.shape[1], -1]
            scores = scores.reshape(new_shape)
            srcposprobs = T.nnet.softmax(scores)
            srctaggerstates = T.dot(srcposprobs,
                                    srctag_embedding) + srctag_bias
            srctaggerstates = srctaggerstates.reshape(
                [annotation.shape[0], annotation.shape[1], -1])

        tempposkeys = T.concatenate([srctaggerstates, tempstates], -1)

        src_words_keys = map_key(annotation, 2 * shdim, ahdim,
                                 "srcwordkeys")
        src_pos_keys = map_key(tempposkeys, xposnn + xposhdim, word2pos,
                               "srcposkeys")
        pos_words_keys = map_key(annotation, 2 * shdim, pos2word,
                                 "pos2wordkeys")
        pos_pos_keys = map_key(tempposkeys, xposnn + xposhdim, pos2pos,
                               "pos2poskeys")

        annotation = T.concatenate([annotation, srctaggerstates], -1)

        # decoder
        final_state = T.concatenate([r_states[0], srctaggerstates[0]], -1)
        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state,
                                           [shdim + xposhdim, thdim], True,
                                           scope="initial",
                                           activation=T.tanh)

        prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
        prev_inputs = prev_inputs + target_bias

        cond = T.neq(prev_words, 0)
        # zeros out embedding if y is 0, which indicates <s>
        prev_inputs = prev_inputs * cond[:, None]

        with ops.variable_scope(decoder_scope):
            mask = T.ones_like(prev_words, dtype=dtype)
            next_state, context, next_pos, tgtposprob = decoder.step(
                prev_inputs, mask, initial_state, src_words_keys,
                src_pos_keys, pos_words_keys, pos_pos_keys, annotation,
                src_mask)
            if decoder_kind == "GruSimple":
                probs = decoder.prediction(prev_inputs, initial_state,
                                           context)
            elif decoder_kind == "GruCond":
                probs = decoder.prediction(prev_inputs, next_state, context,
                                           next_pos)

    # encoding
    encoding_inputs = [src_seq, src_mask]
    encoding_outputs = [
        annotation, initial_state, src_words_keys, src_pos_keys,
        pos_words_keys, pos_pos_keys, srcposprobs
    ]
    encode = theano.function(encoding_inputs, encoding_outputs)

    # Both decoder kinds are driven by the same four key tensors.  The
    # GruSimple branch used to list an undefined ``mapped_keys`` here and
    # could only fail.
    key_inputs = [src_words_keys, src_pos_keys, pos_words_keys,
                  pos_pos_keys]

    if decoder_kind == "GruSimple":
        prediction_inputs = ([prev_words, initial_state, annotation]
                             + key_inputs + [src_mask])
        prediction_outputs = [probs, context]
        self.predict = theano.function(prediction_inputs,
                                       prediction_outputs,
                                       on_unused_input='warn')

        generation_inputs = [prev_words, initial_state, context]
        generation_outputs = next_state
        self.generate = theano.function(generation_inputs,
                                        generation_outputs)
    elif decoder_kind == "GruCond":
        prediction_inputs = ([prev_words, initial_state, annotation]
                             + key_inputs + [src_mask])
        prediction_outputs = [probs, next_state, tgtposprob]
        self.predict = theano.function(prediction_inputs,
                                       prediction_outputs,
                                       on_unused_input='warn')

    self.cost = totalcost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.updates = []
    self.encode = encode
    self.option = option
def __init__(self, **option):
    """Build the training and decoding graphs of the domain-aware model.

    Required ``option`` keys read here: ``embdim``, ``hidden`` (5 values),
    ``maxhid``, ``maxpart``, ``deephid``, ``vocabulary``, ``dnum``,
    ``lambda`` and ``decoder``.  Optional keys ``scope``, ``initializer``,
    ``regularizer`` and ``keep_prob`` are filled with defaults when absent.

    The training cost is
    ``cost + dcost + tgtdcost - lambda * adv_scost``.

    Sets ``cost``, ``inputs``, ``outputs``, ``updates``, ``encode``,
    ``option``, ``tag_predict``, ``tgt_tag_predict``, ``cost_cla``,
    ``inputs_cla`` and ``outputs_cla`` on the instance; ``predict`` for the
    ``"GruSimple"`` and ``"GruCond"`` decoders, and additionally
    ``generate`` for ``"GruSimple"``.
    """
    # source and target embedding dim
    sedim, tedim = option["embdim"]
    # source, target and attention hidden dim
    shdim, thdim, ahdim, domaindim, feadim = option["hidden"]
    # maxout hidden dim
    maxdim = option["maxhid"]
    # maxout part
    maxpart = option["maxpart"]
    # deepout hidden dim
    deephid = option["deephid"]

    svocab, tvocab = option["vocabulary"]
    _, sid2w = svocab
    _, tid2w = tvocab
    # source and target vocabulary size
    svsize, tvsize = len(sid2w), len(tid2w)
    dnum = option['dnum']

    if option.get("scope") is None:
        option["scope"] = "rnnsearch"
    option.setdefault("initializer", None)
    option.setdefault("regularizer", None)
    option.setdefault("keep_prob", 1.0)

    dtype = theano.config.floatX
    initializer = option["initializer"]
    regularizer = option["regularizer"]
    keep_prob = option["keep_prob"] or 1.0
    scope = option["scope"]
    decoder_scope = "decoder"
    decoder_kind = option["decoder"]

    encoder = Encoder(sedim, shdim)
    # NOTE(review): eval() on a configuration value -- only safe while
    # option["decoder"] comes from trusted configuration.
    decoderType = eval("Decoder{}".format(decoder_kind))
    decoder = decoderType(tedim, thdim, ahdim, 2 * shdim, dnum=dnum,
                          dim_maxout=maxdim, max_part=maxpart,
                          dim_readout=deephid, dim_domain=domaindim,
                          feadim=feadim, n_y_vocab=tvsize)

    # NOTE(review): the original indentation was lost.  The "Specific" scope
    # is assumed to end before ``share_alpha`` and the "Shared" scope before
    # ``domain_gate``; confirm against existing checkpoints' variable names.

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        src_seq = T.imatrix("source_sequence")
        src_mask = T.matrix("source_sequence_mask")
        tgt_seq = T.imatrix("target_sequence")
        tgt_mask = T.matrix("target_sequence_mask")
        tag_seq = T.imatrix("domain_tag")

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        # No target bias is created here; the embedding scope is handed to
        # the decoder through its ``tiescope`` attribute.
        with ops.variable_scope("target_embedding") as tgtembscope:
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
            decoder.tiescope = tgtembscope

        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
        source_inputs = source_inputs + source_bias

        if keep_prob < 1.0:
            source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
            target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        with ops.variable_scope("Specific"):
            domain_alpha = domain_sensitive_attention(
                annotation, src_mask, shdim * 2, domaindim)
            domain_context = T.sum(annotation * domain_alpha[:, :, None], 0)
            dfeature = nn.feedforward(domain_context, [shdim * 2, feadim],
                                      True, activation=T.tanh,
                                      scope="feature1")
            dscores = nn.feedforward(dfeature, [feadim, dnum], True,
                                     activation=T.tanh, scope="score")
            dprobs = T.nnet.softmax(dscores)
            dpred_tag = T.argmax(dprobs, 1)
            didx = T.arange(tag_seq.flatten().shape[0])
            dce = -T.log(dprobs[didx, tag_seq.flatten()])
            dcost = T.mean(dce)

        share_alpha = domain_sensitive_attention(annotation, src_mask,
                                                 shdim * 2, domaindim)
        share_context = T.sum(annotation * share_alpha[:, :, None], 0)
        sfeature = nn.feedforward(share_context, [shdim * 2, feadim], True,
                                  activation=T.tanh, scope="feature1")

        with ops.variable_scope("Shared"):
            sscores = nn.feedforward(sfeature, [feadim, dnum], True,
                                     activation=T.tanh, scope="score")
            sprobs = T.nnet.softmax(sscores)
            spred_tag = T.argmax(sprobs, 1)
            sidx = T.arange(tag_seq.flatten().shape[0])
            gold_prob = sprobs[sidx, tag_seq.flatten()]
            sce = -T.log(gold_prob)
            scost = T.mean(sce)
            # -p * log(p) of the gold tag; subtracted from the final cost
            # with weight ``lambda`` below.
            adv_sce = -gold_prob * T.log(gold_prob)
            adv_scost = T.mean(adv_sce)

        domain_gate = nn.feedforward([dfeature, annotation],
                                     [[feadim, shdim * 2], shdim * 2], True,
                                     scope="domain_gate")
        domain_annotation = annotation * domain_gate
        domain_annotation = nn.dropout(domain_annotation,
                                       keep_prob=keep_prob)

        share_gate = nn.feedforward([sfeature, annotation],
                                    [[feadim, shdim * 2], shdim * 2], True,
                                    scope="share_gate")
        annotation = annotation * share_gate
        annotation = nn.dropout(annotation, keep_prob=keep_prob)

        # compute initial state for decoder
        # first state of backward encoder
        # batch * shdim
        # Floor division keeps the slice bound an integer tensor: ``/`` on
        # integer tensors is deprecated in Theano and is true division on
        # Python 3.
        half = annotation.shape[-1] // 2
        final_state = T.concatenate([
            annotation[0, :, half:],
            domain_annotation[0, :, half:]
        ], -1)

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim * 2, thdim],
                                           True, scope="initial",
                                           activation=T.tanh)
            # keys for query
            mapped_keys = map_key(annotation, 2 * shdim, ahdim, "semantic")
            mapped_domain_keys = map_key(domain_annotation, 2 * shdim,
                                         ahdim, "domain")

            _, _, cost, tgtdcost, tpred_tag, _ = decoder.forward(
                tgt_seq, target_inputs, tgt_mask, mapped_keys, src_mask,
                annotation, initial_state, mapped_domain_keys,
                domain_annotation, tag_seq, keep_prob)

        lamb = theano.shared(numpy.asarray(option["lambda"], dtype),
                             "lambda")
        final_cost = cost + dcost + tgtdcost - lamb * adv_scost

        tag_inputs = [src_seq, src_mask]
        tag_outputs = [dpred_tag, spred_tag]
        self.tag_predict = theano.function(tag_inputs, tag_outputs)

        tgt_tag_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        tgt_tag_outputs = [tpred_tag]
        self.tgt_tag_predict = theano.function(tgt_tag_inputs,
                                               tgt_tag_outputs)

    training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, tag_seq]
    training_outputs = [cost, dcost, adv_scost, tgtdcost]

    self.cost_cla = scost
    self.inputs_cla = [src_seq, src_mask, tag_seq]
    self.outputs_cla = [scost]

    # decoding graph
    with ops.variable_scope(scope, reuse=True):
        prev_words = T.ivector("prev_words")

        # disable dropout
        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        source_inputs = source_inputs + source_bias

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        with ops.variable_scope("Specific"):
            domain_alpha = domain_sensitive_attention(
                annotation, src_mask, shdim * 2, domaindim)
            domain_context = T.sum(annotation * domain_alpha[:, :, None], 0)
            dfeature = nn.feedforward(domain_context, [shdim * 2, feadim],
                                      True, activation=T.tanh,
                                      scope="feature1")

        share_alpha = domain_sensitive_attention(annotation, src_mask,
                                                 shdim * 2, domaindim)
        share_context = T.sum(annotation * share_alpha[:, :, None], 0)
        sfeature = nn.feedforward(share_context, [shdim * 2, feadim], True,
                                  activation=T.tanh, scope="feature1")

        domain_gate = nn.feedforward([dfeature, annotation],
                                     [[feadim, shdim * 2], shdim * 2], True,
                                     scope="domain_gate")
        domain_annotation = annotation * domain_gate
        share_gate = nn.feedforward([sfeature, annotation],
                                    [[feadim, shdim * 2], shdim * 2], True,
                                    scope="share_gate")
        annotation = annotation * share_gate

        # decoder
        half = annotation.shape[-1] // 2
        final_state = T.concatenate([
            annotation[0, :, half:],
            domain_annotation[0, :, half:]
        ], -1)

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim * 2, thdim],
                                           True, scope="initial",
                                           activation=T.tanh)
            mapped_keys = map_key(annotation, 2 * shdim, ahdim, "semantic")
            mapped_domain_keys = map_key(domain_annotation, 2 * shdim,
                                         ahdim, "domain")

        prev_inputs = nn.embedding_lookup(target_embedding, prev_words)

        cond = T.neq(prev_words, 0)
        # zeros out embedding if y is 0, which indicates <s>
        prev_inputs = prev_inputs * cond[:, None]

        with ops.variable_scope(decoder_scope):
            mask = T.ones_like(prev_words, dtype=dtype)
            next_state, context = decoder.step(prev_inputs, mask,
                                               initial_state, mapped_keys,
                                               annotation, src_mask,
                                               mapped_domain_keys,
                                               domain_annotation)
            if decoder_kind == "GruSimple":
                probs = decoder.prediction(prev_inputs, initial_state,
                                           context)
            elif decoder_kind == "GruCond":
                probs = decoder.prediction(prev_inputs, next_state, context)

    # encoding
    encoding_inputs = [src_seq, src_mask]
    encoding_outputs = [
        annotation, initial_state, mapped_keys, mapped_domain_keys,
        domain_annotation
    ]
    encode = theano.function(encoding_inputs, encoding_outputs)

    if decoder_kind == "GruSimple":
        prediction_inputs = [
            prev_words, initial_state, annotation, mapped_keys, src_mask
        ]
        prediction_outputs = [probs, context]
        self.predict = theano.function(prediction_inputs,
                                       prediction_outputs)

        generation_inputs = [prev_words, initial_state, context]
        generation_outputs = next_state
        self.generate = theano.function(generation_inputs,
                                        generation_outputs)
    elif decoder_kind == "GruCond":
        prediction_inputs = [
            prev_words, initial_state, annotation, mapped_keys, src_mask,
            mapped_domain_keys, domain_annotation
        ]
        prediction_outputs = [probs, next_state]
        self.predict = theano.function(prediction_inputs,
                                       prediction_outputs)

    self.cost = final_cost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.updates = []
    self.encode = encode
    self.option = option