def network(self, input, keep_prob=0.5, reuse=None):
    """Build the VGG-16 style graph under the 'network' scope.

    The input has a scalar offset subtracted and is concatenated with two
    more shifted copies of itself along axis 3.  It then goes through five
    conv/pool stages (conv1_x and conv2_x are created with
    ``trainable=False``), an optional max-out step, and three fully
    connected layers.

    :param input: image tensor.  Presumably NHWC with a single channel,
        since it is tiled three times along axis 3 -- TODO confirm.
    :param keep_prob: keep probability passed to ``nn.dropout`` after fc6
        and fc7.
    :param reuse: forwarded to ``tf.variable_scope('network', ...)``.
    :return: output of layer ``fc8`` (``self.label_dim`` features, built
        with ``relu=False``).
    """
    # Scalar offset subtracted from the input before tiling.
    # NOTE(review): this is not a per-channel ImageNet mean -- confirm that
    # 24 is the intended dataset mean.
    input_mean = 24

    def pool_(x):
        return nn.max_pool(x, 2, 2)

    def max_out_(x):
        return nn.max_out(x, 16)

    def conv_(x, output_depth, name, trainable=True):
        return nn.conv(x, 3, output_depth, 1, self.weight_decay,
                       name=name, trainable=trainable)

    def fc_(x, features, name, relu=True):
        return nn.fc(x, features, self.weight_decay, name, relu=relu)

    with tf.variable_scope('network', reuse=reuse):
        # Three independent subtractions, so the graph has the same ops
        # as three explicit ``input - 24`` expressions.
        input = tf.concat([input - input_mean for _ in range(3)], axis=3)

        conv_1_1 = conv_(input, 64, 'conv1_1', trainable=False)
        conv_1_2 = conv_(conv_1_1, 64, 'conv1_2', trainable=False)
        pool_1 = pool_(conv_1_2)

        conv_2_1 = conv_(pool_1, 128, 'conv2_1', trainable=False)
        conv_2_2 = conv_(conv_2_1, 128, 'conv2_2', trainable=False)
        pool_2 = pool_(conv_2_2)

        conv_3_1 = conv_(pool_2, 256, 'conv3_1')
        conv_3_2 = conv_(conv_3_1, 256, 'conv3_2')
        conv_3_3 = conv_(conv_3_2, 256, 'conv3_3')
        pool_3 = pool_(conv_3_3)

        conv_4_1 = conv_(pool_3, 512, 'conv4_1')
        conv_4_2 = conv_(conv_4_1, 512, 'conv4_2')
        conv_4_3 = conv_(conv_4_2, 512, 'conv4_3')
        pool_4 = pool_(conv_4_3)

        conv_5_1 = conv_(pool_4, 512, 'conv5_1')
        conv_5_2 = conv_(conv_5_1, 512, 'conv5_2')
        conv_5_3 = conv_(conv_5_2, 512, 'conv5_3')
        pool_5 = pool_(conv_5_3)

        # The max-out step is only inserted when the model asks for it;
        # either way the result is flattened for the dense layers.
        features = max_out_(pool_5) if self.maxout else pool_5
        flattened = tf.contrib.layers.flatten(features)

        fc_6 = nn.dropout(fc_(flattened, 4096, 'fc6'), keep_prob)
        fc_7 = nn.dropout(fc_(fc_6, 4096, 'fc7'), keep_prob)
        fc_8 = fc_(fc_7, self.label_dim, 'fc8', relu=False)
        return fc_8
def network(self, input, keep_prob=0.5, reuse=None):
    """Assemble the VGG-16 style graph configured by ``self.config``.

    ``config.mean`` is subtracted from the input three times and the three
    results are concatenated along axis 3.  Thirteen 3x3 convolutions in
    five pooled stages follow, then fc6/fc7 (each followed by
    ``nn.dropout``) and fc8.

    :param input: image tensor; presumably NHWC with one channel, since it
        is tiled along axis 3 -- TODO confirm.
    :param keep_prob: keep probability passed to ``nn.dropout``.
    :param reuse: forwarded to ``tf.variable_scope("network", ...)``.
    :return: output of ``fc8`` (``config.label_dim`` features, built with
        ``relu=False``).
    """
    # (output depth, layer names) per stage; every stage ends in a pool.
    stage_layout = (
        (64, ("conv1_1", "conv1_2")),
        (128, ("conv2_1", "conv2_2")),
        (256, ("conv3_1", "conv3_2", "conv3_3")),
        (512, ("conv4_1", "conv4_2", "conv4_3")),
        (512, ("conv5_1", "conv5_2", "conv5_3")),
    )

    with tf.variable_scope("network", reuse=reuse):

        def pool_(x):
            return nn.max_pool(x, 2, 2)

        def max_out_(x):
            # Not used by this variant (the max-out branch is disabled).
            return nn.max_out(x, 16)

        config = self.config

        def conv_(x, output_depth, name, stride=1, padding="SAME",
                  relu=True, filter_size=3):
            return conv(x, filter_size, output_depth, stride,
                        name=name, padding=padding, relu=relu)

        def fc_(x, features, name, relu=True):
            return fc(x, features, name, relu=relu)

        VGG_MEAN = [config.mean, config.mean, config.mean]
        input = tf.concat([input - channel_mean for channel_mean in VGG_MEAN],
                          axis=3)

        net = input
        for depth, layer_names in stage_layout:
            for layer_name in layer_names:
                net = conv_(net, depth, layer_name)
            net = pool_(net)

        # i.e. assume self.maxout=False
        flattened = tf.contrib.layers.flatten(net)

        fc_6 = nn.dropout(fc_(flattened, 4096, "fc6"), keep_prob)
        fc_7 = nn.dropout(fc_(fc_6, 4096, "fc7"), keep_prob)
        return fc_(fc_7, config.label_dim, "fc8", relu=False)
def prediction(self, y_emb, state, context, y_pos, keep_prob=1.0):
    r"""Turn decoder features into output-vocabulary probabilities.

    enhanced state -> readout -> softmax
    p(y_j) \propto f(y_{j-1}, s_{j}, c_{j})

    :param y_emb: feature paired with size ``self.dim_y`` in the readout.
    :param state: feature paired with size ``self.dim_hid``; it is first
        combined with ``y_pos`` in the "enhancedstate" layer.
    :param context: feature paired with size ``self.dim_value``.
    :param y_pos: feature paired with size ``self.poshdim``.
    :param keep_prob: dropout is applied to the readout only when this is
        below 1.0.
    :return: result of ``T.nnet.softmax``.  3-d logits are first reshaped
        to 2-d by merging their first two axes.
    """
    enhanced_state = nn.feedforward(
        [state, y_pos],
        [[self.dim_hid, self.poshdim], self.dim_hid],
        True,
        activation=T.tanh,
        scope="enhancedstate",
    )

    readout_inputs = [enhanced_state, y_emb, context, y_pos]
    readout_sizes = [
        [self.dim_hid, self.dim_y, self.dim_value, self.poshdim],
        self.dim_readout,
    ]
    readout = nn.feedforward(readout_inputs, readout_sizes, True,
                             activation=T.tanh, scope="readout")

    if keep_prob < 1.0:
        readout = nn.dropout(readout, keep_prob=keep_prob)

    logits = nn.linear(readout, [self.dim_readout, self.n_y_vocab], True,
                       scope="logits")

    # The softmax below is fed a 2-d tensor, so merge the leading two axes.
    if logits.ndim == 3:
        merged_rows = logits.shape[0] * logits.shape[1]
        logits = logits.reshape([merged_rows, -1])

    return T.nnet.softmax(logits)
def prediction(self, y_emb, state, context, keep_prob=1.0):
    r"""Turn decoder features into output-vocabulary probabilities.

    readout -> softmax
    p(y_j) \propto f(y_{j-1}, s_{j}, c_{j})

    The output projection is not a separate linear layer: the variable
    named "embedding" under ``self.tiescope`` is fetched with
    ``reuse=True`` and its transpose is used as the projection matrix.

    :param y_emb: feature paired with size ``self.dim_y`` in the readout.
    :param state: feature paired with size ``self.dim_hid``.
    :param context: feature paired with size ``self.dim_value``.
    :param keep_prob: dropout is applied to the readout only when this is
        below 1.0.
    :return: result of ``T.nnet.softmax``.  3-d logits are first reshaped
        to 2-d by merging their first two axes.
    """
    readout_sizes = [[self.dim_hid, self.dim_y, self.dim_value],
                     self.dim_readout]
    readout = nn.feedforward([state, y_emb, context], readout_sizes, True,
                             activation=T.tanh, scope="readout")

    if keep_prob < 1.0:
        readout = nn.dropout(readout, keep_prob=keep_prob)

    # Tied output weights: look up the existing embedding variable.
    with ops.variable_scope(self.tiescope, reuse=True):
        tied_embedding = ops.get_variable(
            "embedding", [self.n_y_vocab, self.dim_readout])
    projection = tied_embedding.T
    logits = T.dot(readout, projection)

    if logits.ndim == 3:
        merged_rows = logits.shape[0] * logits.shape[1]
        logits = logits.reshape([merged_rows, -1])

    return T.nnet.softmax(logits)
def prediction(self, y_emb, state, context, keep_prob=1.0):
    r"""Turn decoder features into output-vocabulary probabilities.

    maxout -> readout -> softmax
    p(y_j) \propto f(y_{j-1}, s_{j-1}, c_{j})

    :param y_emb: feature paired with size ``self.dim_y`` in the maxout
        layer.
    :param state: feature paired with size ``self.dim_hid``.
    :param context: feature paired with size ``self.dim_value``.
    :param keep_prob: dropout is applied to the readout only when this is
        below 1.0.
    :return: result of ``T.nnet.softmax``.  3-d logits are first reshaped
        to 2-d by merging their first two axes.
    """
    maxout_sizes = [[self.dim_hid, self.dim_y, self.dim_value],
                    self.dim_maxout]
    maxhid = nn.maxout([state, y_emb, context], maxout_sizes,
                       self.max_part, True)

    readout = nn.linear(maxhid, [self.dim_maxout, self.dim_readout], False,
                        scope="readout")
    if keep_prob < 1.0:
        readout = nn.dropout(readout, keep_prob=keep_prob)

    logits = nn.linear(readout, [self.dim_readout, self.n_y_vocab], True,
                       scope="logits")
    if logits.ndim == 3:
        merged_rows = logits.shape[0] * logits.shape[1]
        logits = logits.reshape([merged_rows, -1])

    return T.nnet.softmax(logits)
def F(inputs, d, activation=tf.nn.relu, kernel_initializer=None, scope=None,
      use_bias=True, input_keep_prob=1.0, wd=0.0, is_train=None):
    """Apply dropout to ``inputs`` and project them with a dense layer.

    Args:
        inputs: tensor handed to ``dropout`` and then ``tf.layers.dense``.
        d: number of output units of the dense layer.
        activation: activation passed to ``tf.layers.dense``.
        kernel_initializer: initializer passed to ``tf.layers.dense``.
        scope: variable scope name; "projection" when falsy.
        use_bias: forwarded to ``tf.layers.dense``.
        input_keep_prob: keep probability forwarded to ``dropout``.
        wd: when truthy, ``add_wd(wd)`` is called inside the scope.
        is_train: forwarded to ``dropout``.

    Returns:
        The output of the dense layer.
    """
    dropped = dropout(inputs, input_keep_prob, is_train)
    dense_kwargs = dict(
        activation=activation,
        use_bias=use_bias,
        kernel_initializer=kernel_initializer,
    )
    with tf.variable_scope(scope or "projection"):
        projected = tf.layers.dense(dropped, d, **dense_kwargs)
        # Register weight decay while still inside the layer's scope.
        if wd:
            add_wd(wd)
    return projected
def siamese(self, input1, input2, keep_prob=0.5,
            weight_decay=weight_decay_factor, reuse=None):
    """Build a two-branch head on top of ``self.vgg`` features.

    Both inputs are passed through ``self.vgg`` (third argument ``True``),
    their first return values are concatenated along axis 1, and the
    result goes through ``fc8`` (4096 units, with dropout) and ``fc9``
    (2 units, ``relu=False``).

    :param input1: first input handed to ``self.vgg``.
    :param input2: second input handed to ``self.vgg``.
    :param keep_prob: forwarded to ``self.vgg`` and to ``nn.dropout``.
    :param weight_decay: weight-decay value handed to ``nn.fc``.
    :param reuse: forwarded to ``tf.variable_scope('network', ...)``.
    :return: output of layer ``fc9``.
    """
    def fc_(x, features, name, relu=True):
        return nn.fc(x, features, weight_decay, name, relu=relu)

    # Only the first element of each branch's result is used.
    feat1 = self.vgg(input1, keep_prob, True)[0]
    feat2 = self.vgg(input2, keep_prob, True)[0]

    with tf.variable_scope('network', reuse=reuse):
        fc_combined = tf.concat((feat1, feat2), 1)
        hidden = fc_(fc_combined, 4096, 'fc8')
        fc_8 = nn.dropout(hidden, keep_prob)
        return fc_(fc_8, 2, 'fc9', relu=False)
def prediction(prev_inputs, prev_state, context, keep_prob=1.0):
    """Map decoder features to output-vocabulary probabilities.

    maxout -> "deepout" linear layer -> optional dropout -> "logits"
    linear layer -> softmax.  The layer sizes (``thdim``, ``tedim``,
    ``shdim``, ``maxdim``, ``maxpart``, ``deephid``, ``tvsize``) are free
    names resolved from the enclosing scope.

    :param prev_inputs: feature paired with size ``tedim``.
    :param prev_state: feature paired with size ``thdim``.
    :param context: feature paired with size ``2 * shdim``.
    :param keep_prob: dropout is applied to the readout only when this is
        below 1.0.
    :return: result of ``theano.tensor.nnet.softmax``.  3-d logits are
        first reshaped to 2-d by merging their first two axes.
    """
    maxout_sizes = [[thdim, tedim, 2 * shdim], maxdim]
    maxhid = nn.maxout([prev_state, prev_inputs, context], maxout_sizes,
                       maxpart, True)

    readout = nn.linear(maxhid, [maxdim, deephid], False, scope="deepout")
    if keep_prob < 1.0:
        readout = nn.dropout(readout, keep_prob=keep_prob)

    logits = nn.linear(readout, [deephid, tvsize], True, scope="logits")
    if logits.ndim == 3:
        merged_rows = logits.shape[0] * logits.shape[1]
        logits = logits.reshape([merged_rows, -1])

    return theano.tensor.nnet.softmax(logits)
def cudnn_rnn(rnn_type, inputs, length, hidden_size, num_layers=1,
              dropout_keep_prob=1.0, concat=True, initial_state=None,
              kernel_initializer=tf.random_normal_initializer(stddev=0.1),
              wd=0.0, is_train=False, scope=None):
    """Run a CudnnGRU or CudnnLSTM layer over ``inputs``.

    ``inputs`` goes through ``dropout``, is transposed with permutation
    ``[1, 0, 2]`` before the layer and transposed back afterwards.

    Args:
        rnn_type: string ending in 'gru' or 'lstm'.  If it contains 'bi'
            the layer is built with ``direction="bidirectional"``.
        inputs: 3-d tensor (it is transposed with a rank-3 permutation).
        length: not referenced in the body.
        hidden_size: ``num_units`` of the layer.
        num_layers: ``num_layers`` of the layer.
        dropout_keep_prob: keep probability for the input ``dropout``; the
            layer itself is built with ``dropout=1-dropout_keep_prob``.
        concat: not referenced in the body.
        initial_state: not referenced in the body.
        kernel_initializer: not referenced in the body.
        wd: when truthy, ``add_wd(wd)`` is called inside the scope.
        is_train: forwarded to ``dropout``.
        scope: variable scope name; 'cudnn_rnn' when falsy.

    Returns:
        ``(outputs, output_h)`` where ``output_h`` is always ``None``.

    Raises:
        NotImplementedError: if ``rnn_type`` ends in neither 'gru' nor
            'lstm'.
    """
    with tf.variable_scope(scope or 'cudnn_rnn'):
        if 'bi' in rnn_type:
            direction = "bidirectional"
        else:
            direction = "unidirectional"

        # Unused, but kept: the lookup fails early on inputs whose static
        # shape cannot be listed.
        input_size = inputs.get_shape().as_list()[-1]

        if rnn_type.endswith('gru'):
            layer_cls = CudnnGRU
        elif rnn_type.endswith('lstm'):
            layer_cls = CudnnLSTM
        else:
            raise NotImplementedError("{} is not supported.".format(rnn_type))

        rnn = layer_cls(num_layers=num_layers,
                        num_units=hidden_size,
                        input_mode='linear_input',
                        direction=direction,
                        dropout=1-dropout_keep_prob,
                        name='rnn')

        inputs = dropout(inputs, dropout_keep_prob, is_train)
        swapped_inputs = tf.transpose(inputs, [1, 0, 2])
        swapped_outputs = rnn(swapped_inputs)[0]
        outputs = tf.transpose(swapped_outputs, [1, 0, 2])  # [N, JX, 2*d]
        output_h = None

        if wd:
            add_wd(wd)

    return outputs, output_h
def __init__(self, **option):
    """Build the training and decoding Theano graphs and compile functions.

    Keys read from ``option``: "embdim", "hidden", "maxhid", "maxpart",
    "deephid" and "vocabulary" (required); "scope", "initializer",
    "regularizer" and "keep_prob" (optional, defaults are filled in).

    Attributes set on ``self``: ``cost``, ``inputs``, ``outputs`` (symbolic
    training graph) and ``encode``, ``predict``, ``generate`` (compiled
    ``theano.function`` objects), plus ``option``.
    """
    # source and target embedding dim
    sedim, tedim = option["embdim"]
    # source, target and attention hidden dim
    shdim, thdim, ahdim = option["hidden"]
    # maxout hidden dim
    maxdim = option["maxhid"]
    # maxout part
    maxpart = option["maxpart"]
    # deepout hidden dim
    deephid = option["deephid"]
    svocab, tvocab = option["vocabulary"]
    sw2id, sid2w = svocab
    tw2id, tid2w = tvocab
    # source and target vocabulary size
    svsize, tvsize = len(sid2w), len(tid2w)

    # Fill in defaults for the optional settings.
    if "scope" not in option or option["scope"] is None:
        option["scope"] = "rnnsearch"

    if "initializer" not in option:
        option["initializer"] = None

    if "regularizer" not in option:
        option["regularizer"] = None

    if "keep_prob" not in option:
        option["keep_prob"] = 1.0

    dtype = theano.config.floatX

    scope = option["scope"]
    initializer = option["initializer"]
    regularizer = option["regularizer"]
    # A falsy keep_prob (None or 0) is treated as "no dropout".
    keep_prob = option["keep_prob"] or 1.0

    # Output layer shared by the training and decoding graphs; the sizes
    # are taken from the enclosing scope.
    def prediction(prev_inputs, prev_state, context, keep_prob=1.0):
        features = [prev_state, prev_inputs, context]
        maxhid = nn.maxout(features, [[thdim, tedim, 2 * shdim], maxdim],
                           maxpart, True)
        readout = nn.linear(maxhid, [maxdim, deephid], False,
                            scope="deepout")

        if keep_prob < 1.0:
            readout = nn.dropout(readout, keep_prob=keep_prob)

        logits = nn.linear(readout, [deephid, tvsize], True,
                           scope="logits")

        # Merge the first two axes so the softmax sees a 2-d tensor.
        if logits.ndim == 3:
            new_shape = [logits.shape[0] * logits.shape[1], -1]
            logits = logits.reshape(new_shape)

        probs = theano.tensor.nnet.softmax(logits)

        return probs

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        src_seq = theano.tensor.imatrix("soruce_sequence")
        src_mask = theano.tensor.matrix("soruce_sequence_mask")
        tgt_seq = theano.tensor.imatrix("target_sequence")
        tgt_mask = theano.tensor.matrix("target_sequence_mask")

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        with ops.variable_scope("target_embedding"):
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
            target_bias = ops.get_variable("bias", [tedim])

        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

        source_inputs = source_inputs + source_bias
        target_inputs = target_inputs + target_bias

        if keep_prob < 1.0:
            source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
            target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

        cell = nn.rnn_cell.gru_cell([sedim, shdim])

        # NOTE(review): the wrapper is called without keep_prob, so it
        # uses whatever default dropout_wrapper defines -- confirm.
        if keep_prob < 1.0:
            cell = nn.rnn_cell.dropout_wrapper(cell)

        outputs = encoder(cell, source_inputs, src_mask)
        # Join the encoder outputs along axis 2.
        annotation = theano.tensor.concatenate(outputs, 2)

        # compute initial state for decoder
        # first state of backward encoder
        final_state = outputs[1][0]

        with ops.variable_scope("decoder"):
            initial_state = nn.feedforward(final_state, [shdim, thdim],
                                           True, scope="initial",
                                           activation=theano.tensor.tanh)

        cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

        if keep_prob < 1.0:
            cell = nn.rnn_cell.dropout_wrapper(cell)

        # run decoder
        decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                  initial_state, annotation, src_mask,
                                  ahdim)
        all_output, all_context = decoder_outputs

        # Shift the target embeddings by one position along the first
        # axis: position 0 is zeros, position t holds target_inputs[t-1].
        shift_inputs = theano.tensor.zeros_like(target_inputs)
        shift_inputs = theano.tensor.set_subtensor(shift_inputs[1:],
                                                   target_inputs[:-1])

        # States preceding each position: the initial state followed by
        # all decoder outputs except the last.
        init_state = initial_state[None, :, :]
        all_states = theano.tensor.concatenate([init_state, all_output], 0)
        prev_states = all_states[:-1]

        with ops.variable_scope("decoder"):
            probs = prediction(shift_inputs, prev_states, all_context,
                               keep_prob=keep_prob)

        # compute cost
        # Negative log-probability of every target token, masked, summed
        # over axis 0 and then averaged.
        idx = theano.tensor.arange(tgt_seq.flatten().shape[0])
        cost = -theano.tensor.log(probs[idx, tgt_seq.flatten()])
        cost = cost.reshape(tgt_seq.shape)
        cost = theano.tensor.sum(cost * tgt_mask, 0)
        cost = theano.tensor.mean(cost)

    training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
    training_outputs = [cost]

    # decoding graph
    # Rebuilt under the same scope with reuse=True and without dropout.
    with ops.variable_scope(scope, reuse=True):
        prev_words = theano.tensor.ivector("prev_words")

        # encoder, disable dropout
        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        source_inputs = source_inputs + source_bias

        cell = nn.rnn_cell.gru_cell([sedim, shdim])

        outputs = encoder(cell, source_inputs, src_mask)
        annotation = theano.tensor.concatenate(outputs, 2)

        # decoder
        final_state = outputs[1][0]

        with ops.variable_scope("decoder"):
            initial_state = nn.feedforward(final_state, [shdim, thdim],
                                           True, scope="initial",
                                           activation=theano.tensor.tanh)

        inputs = nn.embedding_lookup(target_embedding, prev_words)
        inputs = inputs + target_bias

        cond = theano.tensor.neq(prev_words, 0)
        # zeros out embedding if y is 0
        inputs = inputs * cond[:, None]

        cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

        # One decoding step: attention weights -> context -> next state
        # and next-word probabilities.
        with ops.variable_scope("decoder"):
            mapped_states = map_attention_states(annotation, 2 * shdim,
                                                 ahdim)
            alpha = attention(initial_state, mapped_states, thdim, ahdim,
                              src_mask)
            context = theano.tensor.sum(alpha[:, :, None] * annotation, 0)
            output, next_state = cell([inputs, context], initial_state)
            probs = prediction(inputs, initial_state, context)

    # encoding
    encoding_inputs = [src_seq, src_mask]
    encoding_outputs = [annotation, initial_state, mapped_states]
    encode = theano.function(encoding_inputs, encoding_outputs)

    # initial_state, annotation and mapped_states are expressions of the
    # encoder graph that are passed here as function inputs.
    prediction_inputs = [prev_words, initial_state, annotation,
                         mapped_states, src_mask]
    prediction_outputs = [probs, context, alpha]
    predict = theano.function(prediction_inputs, prediction_outputs)

    generation_inputs = [prev_words, initial_state, context]
    generation_outputs = next_state
    generate = theano.function(generation_inputs, generation_outputs)

    self.cost = cost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.encode = encode
    self.predict = predict
    self.generate = generate
    self.option = option
def __init__(self, **option):
    """Build the training and decoding Theano graphs with a soft decoder.

    Keys read from ``option``: "embdim", "hidden", "maxhid", "maxpart",
    "deephid", "vocabulary", "method", "eosid", "softk", "lambda" and
    "decoder" (required); "scope", "initializer", "regularizer" and
    "keep_prob" (optional, defaults are filled in).

    The training cost is ``lamb * soft_cost + (1 - lamb) * true_cost``
    where ``lamb`` is a shared variable initialised from
    ``option['lambda']``.

    Attributes set on ``self``: ``cost``, ``inputs``, ``outputs``,
    ``updates``, ``align``, ``sample``, ``encode``, ``get_snt_cost``,
    ``option``, and ``predict`` when ``option["decoder"] == "GruCond"``.

    Raises:
        ValueError: if ``option["decoder"] == "GruSimple"``.
    """
    # source and target embedding dim
    sedim, tedim = option["embdim"]
    # source, target and attention hidden dim
    shdim, thdim, ahdim = option["hidden"]
    # maxout hidden dim
    maxdim = option["maxhid"]
    # maxout part
    maxpart = option["maxpart"]
    # deepout hidden dim
    deephid = option["deephid"]
    svocab, tvocab = option["vocabulary"]
    sw2id, sid2w = svocab
    tw2id, tid2w = tvocab
    # source and target vocabulary size
    svsize, tvsize = len(sid2w), len(tid2w)

    # Fill in defaults for the optional settings.
    if "scope" not in option or option["scope"] is None:
        option["scope"] = "rnnsearch"

    if "initializer" not in option:
        option["initializer"] = None

    if "regularizer" not in option:
        option["regularizer"] = None

    if "keep_prob" not in option:
        option["keep_prob"] = 1.0

    dtype = theano.config.floatX

    initializer = option["initializer"]
    regularizer = option["regularizer"]
    # A falsy keep_prob (None or 0) is treated as "no dropout".
    keep_prob = option["keep_prob"] or 1.0

    scope = option["scope"]
    decoder_scope = "decoder2"

    encoder = Encoder(sedim, shdim)
    import decoder2
    decoder = decoder2.DecoderGruCond(2, option['method'], tedim, thdim,
                                      ahdim, 2 * shdim + thdim,
                                      dim_readout=deephid,
                                      n_y_vocab=tvsize)

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        src_seq = T.imatrix("source_sequence")
        src_mask = T.matrix("source_sequence_mask")
        tgt_seq = T.imatrix("target_sequence")
        tgt_mask = T.matrix("target_sequence_mask")
        byseq = T.imatrix("backward_target_sequence")

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        with ops.variable_scope("target_embedding"):
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
            target_bias = ops.get_variable("bias", [tedim])

        source_inputs = nn.embedding_lookup(source_embedding, src_seq) + source_bias
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq) + target_bias
        by_inputs = nn.embedding_lookup(target_embedding, byseq) + target_bias

        if keep_prob < 1.0:
            source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
            target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)
            by_inputs = nn.dropout(by_inputs, keep_prob=keep_prob)

        states, r_states = encoder.forward(source_inputs, src_mask)
        # Join both encoder state sequences along axis 2.
        annotation = T.concatenate([states, r_states], 2)

        annotation = nn.dropout(annotation, keep_prob=keep_prob)

        import softdec
        soft_decoder = softdec.SoftDecoder(option["eosid"], option["softk"],
                                           tedim, thdim, ahdim, 2 * shdim,
                                           dim_readout=deephid,
                                           n_y_vocab=tvsize)
        # The soft decoder starts from the last element of `states`.
        with ops.variable_scope('soft_decoder'):
            initial_state = nn.feedforward(states[-1], [shdim, thdim], True,
                                           scope='initial',
                                           activation=T.tanh)
            mapped_keys = map_key(annotation, 2 * shdim, ahdim)
            soft_states, _, _, soft_mask = soft_decoder.infer(
                mapped_keys, src_mask, annotation, initial_state,
                target_embedding, target_bias, keep_prob)

        # Same scope with reuse=True: only soft_cost is kept from this
        # pass over byseq.
        with ops.variable_scope('soft_decoder', reuse=True):
            _, _, soft_cost, _ = soft_decoder.forward(
                byseq, by_inputs, tgt_mask, mapped_keys, src_mask,
                annotation, initial_state, keep_prob)

        # compute initial state for decoder
        # first state of backward encoder
        # initialize with only encoder state
        final_state = r_states[0]

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim, thdim],
                                           True, scope="initial",
                                           activation=T.tanh)
            # keys for query
            with ops.variable_scope('map-key-src'):
                mapped_keys_src = map_key(annotation, 2 * shdim, ahdim)
            with ops.variable_scope('map-key-soft'):
                mapped_keys_soft = map_key(soft_states, thdim, ahdim)

            # The decoder receives two memories: the source annotation
            # and the soft decoder states, each with its keys and mask.
            _, _, _, snt_cost = decoder.forward(
                tgt_seq, target_inputs, tgt_mask,
                [mapped_keys_src, mapped_keys_soft],
                [src_mask, soft_mask],
                [annotation, soft_states],
                initial_state, keep_prob)

        ce = snt_cost
        true_cost = T.mean(ce)
        lamb = theano.shared(numpy.asarray(option['lambda'], dtype),
                             'lambda')
        # Interpolate the soft-decoder cost and the main decoder cost.
        cost = lamb * soft_cost + (1 - lamb) * true_cost

    training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, byseq]
    training_outputs = [cost, soft_cost, true_cost]

    # Compiling a per-sentence cost function is disabled; the attribute
    # is kept and set to None.
    get_snt_cost = None

    # decoding graph
    # Rebuilt under the same scope with reuse=True and without dropout.
    with ops.variable_scope(scope, reuse=True):
        prev_words = T.ivector("prev_words")

        # disable dropout
        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        source_inputs = source_inputs + source_bias
        # NOTE(review): target_inputs is not referenced again in the
        # decoding graph.
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
        target_inputs = target_inputs + target_bias

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        with ops.variable_scope('soft_decoder'):
            initial_state = nn.feedforward(states[-1], [shdim, thdim], True,
                                           scope='initial',
                                           activation=T.tanh)
            mapped_keys = map_key(annotation, 2 * shdim, ahdim)
            soft_states, soft_contexts, soft_probs, soft_mask = soft_decoder.infer(
                mapped_keys, src_mask, annotation, initial_state,
                target_embedding, target_bias, 1.0)

        # decoder
        final_state = r_states[0]
        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim, thdim],
                                           True, scope="initial",
                                           activation=T.tanh)
            # keys for query
            with ops.variable_scope('map-key-src'):
                mapped_keys_src = map_key(annotation, 2 * shdim, ahdim)
            with ops.variable_scope('map-key-soft'):
                mapped_keys_soft = map_key(soft_states, thdim, ahdim)

        prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
        prev_inputs = prev_inputs + target_bias

        cond = T.neq(prev_words, 0)
        # zeros out embedding if y is 0, which indicates <s>
        prev_inputs = prev_inputs * cond[:, None]

        # Single decoding step with an all-ones mask.
        with ops.variable_scope(decoder_scope):
            mask = T.ones_like(prev_words, dtype=dtype)
            next_state, context = decoder.step(
                prev_inputs, mask, initial_state,
                *[
                    mapped_keys_src, mapped_keys_soft, annotation,
                    soft_states, src_mask, soft_mask
                ])
            probs = decoder.prediction(prev_inputs, next_state, context)

    # encoding
    encoding_inputs = [src_seq, src_mask]
    encoding_outputs = [
        initial_state, annotation, soft_states, mapped_keys_src,
        mapped_keys_soft, soft_mask
    ]
    encode = theano.function(encoding_inputs, encoding_outputs)

    if option["decoder"] == "GruSimple":
        raise ValueError()
        # NOTE(review): the rest of this branch follows an unconditional
        # raise and is unreachable.
        prediction_inputs = [
            prev_words, initial_state, annotation, mapped_keys, src_mask
        ]
        prediction_outputs = [probs, context]
        predict = theano.function(prediction_inputs, prediction_outputs)

        generation_inputs = [prev_words, initial_state, context]
        generation_outputs = next_state
        generate = theano.function(generation_inputs, generation_outputs)

        self.predict = predict
        self.generate = generate
    elif option["decoder"] == "GruCond":
        # Prediction returns the next state directly, so no separate
        # `generate` function is compiled in this branch.
        prediction_inputs = [
            prev_words, initial_state, annotation, mapped_keys_src,
            src_mask, soft_states, mapped_keys_soft, soft_mask
        ]
        prediction_outputs = [probs, next_state]
        predict = theano.function(prediction_inputs, prediction_outputs)
        self.predict = predict
    # NOTE(review): for any other "decoder" value self.predict is never
    # assigned here.

    self.cost = cost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.updates = []
    self.align = None
    self.sample = None
    self.encode = encode

    self.get_snt_cost = get_snt_cost
    self.option = option
def build(self):
    """Build the full graph: placeholders, model, loss, accuracy, train op.

    Reads hyper-parameters from ``self.params`` and the vocabulary size
    from ``self.words.vocab_size``.  Stores the placeholders on ``self``
    as ``x``, ``xm``, ``q``, ``y``, ``fc``, ``is_training`` and the result
    tensors as ``total_loss``, ``num_corrects``, ``accuracy``, ``opt_op``.
    Also sets ``self.att`` to a constant 0.
    """
    params = self.params
    # V: embedding size, d: hidden size, A: vocabulary size.
    V, d, A = params.embed_size, params.hidden_size, self.words.vocab_size

    # initialize self
    # placeholders
    input = tf.placeholder('int32', shape=[self.params.batch_size, self.params.max_fact_count, self.params.max_sent_size], name='x')  # [num_batch, fact_count, sentence_len]
    question = tf.placeholder('int32', shape=[self.params.batch_size, self.params.max_ques_size], name='q')  # [num_batch, question_len]
    answer = tf.placeholder('int32', shape=[self.params.batch_size], name='y')  # [num_batch] - one word answer
    fact_counts = tf.placeholder('int64', shape=[self.params.batch_size], name='fc')
    input_mask = tf.placeholder('float32', shape=[self.params.batch_size, self.params.max_fact_count, self.params.max_sent_size,self.params.embed_size], name='xm')
    is_training = tf.placeholder(tf.bool)
    self.att = tf.constant(0.)

    # Prepare parameters
    # The same GRU cell object is used by every recurrent part below.
    gru = tf.nn.rnn_cell.GRUCell(self.params.hidden_size)
    l = self.positional_encoding()
    # NOTE(review): `1 / 2` is 0 under Python 2 integer division (range
    # becomes 1) and 0.5 under Python 3 (range becomes sqrt(3)) --
    # confirm which is intended.
    embedding = weight('embedding', [self.words.vocab_size, self.params.embed_size], init='uniform', range=3 ** (1 / 2))

    with tf.name_scope('SentenceReader'):
        input_list = tf.unstack(tf.transpose(input))  # L x [F, N]
        input_embed = []
        for facts in input_list:
            facts = tf.unstack(facts)
            embed = tf.stack([tf.nn.embedding_lookup(embedding, w) for w in facts])  # [F, N, V]
            input_embed.append(embed)

        # apply positional encoding
        input_embed = tf.transpose(tf.stack(input_embed), [2, 1, 0, 3])  # [N, F, L, V]
        encoded = l * input_embed * input_mask
        # Sum over the word axis to get one vector per fact.
        facts = tf.reduce_sum(encoded, 2)  # [N, F, V]

    # dropout time
    facts = dropout(facts, params.keep_prob, is_training)

    with tf.name_scope('InputFusion'):
        # Bidirectional RNN
        with tf.variable_scope('Forward'):
            forward_states, _ = tf.nn.dynamic_rnn(gru, facts, fact_counts, dtype=tf.float32)

        with tf.variable_scope('Backward'):
            facts_reverse = tf.reverse_sequence(facts, fact_counts, 1)
            # NOTE(review): the backward outputs are not reversed back
            # before being added to the forward outputs -- confirm.
            backward_states, _ = tf.nn.dynamic_rnn(gru, facts_reverse, fact_counts, dtype=tf.float32)

        # Use forward and backward states both
        facts = forward_states + backward_states  # [N, F, d]

    with tf.variable_scope('Question'):
        tf.logging.info(question)
        ques_list = tf.unstack(tf.transpose(question))
        tf.logging.info(ques_list)
        # Stacked in time-major order to match time_major=True below.
        ques_embed = tf.stack([tf.nn.embedding_lookup(embedding, w) for w in ques_list])
        tf.logging.info(ques_embed)
        initial_state = gru.zero_state(self.params.batch_size, dtype=tf.float32)
        # Only the final state is kept as the question vector.
        _, question_vec = tf.nn.dynamic_rnn(gru, ques_embed,initial_state=initial_state, dtype=tf.float32,time_major=True)

    # Episodic Memory
    with tf.variable_scope('Episodic'):
        episode = EpisodeModule(self.params.hidden_size, question_vec, facts, is_training, self.params.batch_norm)
        # The memory starts out as a copy of the question vector.
        memory = tf.identity(question_vec)

        for t in range(params.memory_step):
            with tf.variable_scope('Layer%d' % t) as scope:
                if params.memory_update == 'gru':
                    memory = gru(episode.new(memory), memory)[0]
                else:
                    # ReLU update
                    c = episode.new(memory)
                    concated = tf.concat([memory, c, question_vec],1)

                    w_t = weight('w_t', [3 * d, d])
                    z = tf.matmul(concated, w_t)
                    # Batch norm replaces the bias term when enabled.
                    if params.batch_norm:
                        z = batch_norm(z, is_training)
                    else:
                        b_t = bias('b_t', d)
                        z = z + b_t
                    memory = tf.nn.relu(z)  # [N, d]

                scope.reuse_variables()

    # Regularizations
    if params.batch_norm:
        memory = batch_norm(memory, is_training=is_training)
    memory = dropout(memory, params.keep_prob, is_training)

    with tf.name_scope('Answer'):
        # Answer module : feed-forward version (for it is one word answer)
        w_a = weight('w_a', [d, A], init='xavier')
        logits = tf.matmul(memory, w_a)  # [N, A]

    with tf.name_scope('Loss'):
        # Cross-Entropy loss
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=answer)
        loss = tf.reduce_mean(cross_entropy)
        # Adds the weighted sum of everything in the 'l2' collection.
        total_loss = loss + params.weight_decay * tf.add_n(tf.get_collection('l2'))

    with tf.variable_scope('Accuracy'):
        # Accuracy
        predicts = tf.cast(tf.argmax(logits, 1), 'int32')
        corrects = tf.equal(predicts, answer)
        num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
        accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))

    # Training
    optimizer = tf.train.AdamOptimizer(params.learning_rate)
    opt_op = optimizer.minimize(total_loss, global_step=self.global_step)

    # placeholders
    self.x = input
    self.xm = input_mask
    self.q = question
    self.y = answer
    self.fc = fact_counts
    self.is_training = is_training

    # tensors
    self.total_loss = total_loss
    self.num_corrects = num_corrects
    self.accuracy = accuracy
    self.opt_op = opt_op
def model_spec(x, keep_prob=0.5, deterministic=False, init=False,
               use_weight_normalization=False, use_batch_normalization=False,
               use_mean_only_batch_normalization=False):
    """Build the convolutional classifier and return its 10-unit output.

    Layer order: gaussian noise, conv1-3 (96 filters), max-pool + dropout,
    conv4-6 (192 filters), max-pool + dropout, conv7 (192 filters,
    ``pad='VALID'``), Nin1, Nin2 (192 units), global average pooling and a
    dense layer with 10 units and no nonlinearity.

    Args:
        x: input tensor passed to ``nn.gaussian_noise``.
        keep_prob: keep probability for both ``nn.dropout`` layers.
        deterministic: forwarded to the noise, dropout and every
            parameterised layer.
        init: forwarded to every parameterised layer.
        use_weight_normalization: forwarded to every parameterised layer.
        use_batch_normalization: forwarded to every parameterised layer.
        use_mean_only_batch_normalization: forwarded to every
            parameterised layer.

    Returns:
        The output of the final ``nn.dense`` layer.
    """
    # Keyword arguments that every parameterised layer receives unchanged.
    shared = dict(
        init=init,
        use_weight_normalization=use_weight_normalization,
        use_batch_normalization=use_batch_normalization,
        use_mean_only_batch_normalization=use_mean_only_batch_normalization,
        deterministic=deterministic,
    )

    def conv(net, num_filters, name, **extra):
        layer_kwargs = dict(shared, name=name, nonlinearity=nn.lRelu, **extra)
        return nn.conv2d(net, num_filters=num_filters, **layer_kwargs)

    def nin(net, name):
        layer_kwargs = dict(shared, name=name, nonlinearity=nn.lRelu)
        return nn.NiN(net, num_units=192, **layer_kwargs)

    def pool_then_drop(net, pool_name, drop_name):
        pooled = tf.nn.max_pool(net, ksize=[1, 2, 2, 1],
                                strides=[1, 2, 2, 1], padding='SAME',
                                name=pool_name)
        return nn.dropout(pooled, keep_prob=keep_prob,
                          deterministic=deterministic, name=drop_name)

    x = nn.gaussian_noise(x, deterministic=deterministic,
                          name='gaussian_noise')

    for layer_name in ('conv1', 'conv2', 'conv3'):
        x = conv(x, 96, layer_name)
    x = pool_then_drop(x, 'max_pool_1', 'drop1')

    for layer_name in ('conv4', 'conv5', 'conv6'):
        x = conv(x, 192, layer_name)
    x = pool_then_drop(x, 'max_pool_2', 'drop2')

    x = conv(x, 192, 'conv7', pad='VALID')
    for layer_name in ('Nin1', 'Nin2'):
        x = nin(x, layer_name)

    x = nn.globalAvgPool(x, name='Globalavgpool1')

    dense_kwargs = dict(shared, name='output_dense', nonlinearity=None)
    return nn.dense(x, num_units=10, **dense_kwargs)
def __init__(self, **option):
    """Build the training, decoding, sampling and alignment graphs.

    Options read from ``option`` (defaults are written back into it):
        embdim: (source embedding dim, target embedding dim).
        hidden: (source hidden dim, target hidden dim, attention hidden dim).
        maxhid: maxout hidden dim.
        maxpart: maxout part.
        deephid: deepout hidden dim.
        vocabulary: ((sw2id, sid2w), (tw2id, tid2w)); only the lengths of
            the two id-to-word tables are used here.
        scope: variable scope name; "rnnsearch" when missing or None.
        initializer, regularizer: passed to the top-level variable scope;
            default None.
        criterion: "mle" (default) or "mrt".
        keep_prob: dropout keep probability; missing or falsy means 1.0,
            and it is forced to 1.0 when criterion is "mrt".

    Attributes set: ``cost``, ``inputs``, ``outputs``, ``updates`` (empty
    list), ``align``, ``sample``, ``encode``, ``predict``, ``generate`` and
    ``option``.

    Raises:
        ValueError: if ``criterion`` is neither "mle" nor "mrt".
    """
    # source and target embedding dim
    sedim, tedim = option["embdim"]
    # source, target and attention hidden dim
    shdim, thdim, ahdim = option["hidden"]
    # maxout hidden dim
    maxdim = option["maxhid"]
    # maxout part
    maxpart = option["maxpart"]
    # deepout hidden dim
    deephid = option["deephid"]

    svocab, tvocab = option["vocabulary"]
    _, sid2w = svocab
    _, tid2w = tvocab
    # source and target vocabulary size
    svsize, tvsize = len(sid2w), len(tid2w)

    if option.get("scope") is None:
        option["scope"] = "rnnsearch"
    option.setdefault("initializer", None)
    option.setdefault("regularizer", None)
    option.setdefault("criterion", "mle")
    option.setdefault("keep_prob", 1.0)

    criterion = option["criterion"]
    # Every branch below treats "anything but mle" as MRT, which relies on
    # the `loss` / `sharp` inputs that only exist for "mrt". Reject other
    # values here instead of failing later on an undefined name.
    if criterion not in ("mle", "mrt"):
        raise ValueError("unknown criterion: %r (expected 'mle' or 'mrt')"
                         % (criterion,))

    dtype = theano.config.floatX
    scope = option["scope"]
    initializer = option["initializer"]
    regularizer = option["regularizer"]
    keep_prob = option["keep_prob"] or 1.0

    # MRT mode do not use dropout
    if criterion == "mrt":
        keep_prob = 1.0

    def prediction(prev_inputs, prev_state, context, keep_prob=1.0):
        # maxout -> deepout readout -> (optional dropout) -> softmax
        features = [prev_state, prev_inputs, context]
        maxhid = nn.maxout(features, [[thdim, tedim, 2 * shdim], maxdim],
                           maxpart, True)
        readout = nn.linear(maxhid, [maxdim, deephid], False,
                            scope="deepout")

        if keep_prob < 1.0:
            readout = nn.dropout(readout, keep_prob=keep_prob)

        logits = nn.linear(readout, [deephid, tvsize], True, scope="logits")

        # Merge the two leading axes so softmax sees a matrix.
        if logits.ndim == 3:
            new_shape = [logits.shape[0] * logits.shape[1], -1]
            logits = logits.reshape(new_shape)

        probs = theano.tensor.nnet.softmax(logits)

        return probs

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        src_seq = theano.tensor.imatrix("soruce_sequence")
        src_mask = theano.tensor.matrix("soruce_sequence_mask")
        tgt_seq = theano.tensor.imatrix("target_sequence")
        tgt_mask = theano.tensor.matrix("target_sequence_mask")

        if criterion == "mrt":
            loss = theano.tensor.vector("loss_score")
            sharp = theano.tensor.scalar("sharpness")

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        with ops.variable_scope("target_embedding"):
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
            target_bias = ops.get_variable("bias", [tedim])

        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

        source_inputs = source_inputs + source_bias
        target_inputs = target_inputs + target_bias

        if keep_prob < 1.0:
            source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
            target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

        cell = nn.rnn_cell.gru_cell([sedim, shdim])

        outputs = encoder(cell, source_inputs, src_mask)
        annotation = theano.tensor.concatenate(outputs, 2)

        # Unlike the embedding dropout above, this call is not guarded by
        # `keep_prob < 1.0`; kept as in the original.
        annotation = nn.dropout(annotation, keep_prob=keep_prob)

        # compute initial state for decoder
        # first state of backward encoder
        final_state = outputs[1][0]
        with ops.variable_scope("decoder"):
            initial_state = nn.feedforward(final_state, [shdim, thdim],
                                           True, scope="initial",
                                           activation=theano.tensor.tanh)

        # run decoder
        cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

        if criterion == "mrt":
            # In MRT training, shape of src_seq and src_mask are assumed
            # to have [len, 1]
            batch = tgt_seq.shape[1]
            with ops.variable_scope("decoder"):
                mapped_states = attention(None, annotation, None, None,
                                          [thdim, 2 * shdim, ahdim])
            b_src_mask = theano.tensor.repeat(src_mask, batch, 1)
            b_annotation = theano.tensor.repeat(annotation, batch, 1)
            b_mapped_states = theano.tensor.repeat(mapped_states, batch, 1)
            b_initial_state = theano.tensor.repeat(initial_state, batch, 0)

            decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                      b_initial_state, b_annotation,
                                      b_src_mask, ahdim, b_mapped_states)
        else:
            decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                      initial_state, annotation, src_mask,
                                      ahdim)

        all_output, all_context = decoder_outputs

        # Shift the target embeddings one step along the first axis, so the
        # first step sees zeros.
        shift_inputs = theano.tensor.zeros_like(target_inputs)
        shift_inputs = theano.tensor.set_subtensor(shift_inputs[1:],
                                                   target_inputs[:-1])

        if criterion == "mrt":
            init_state = b_initial_state[None, :, :]
        else:
            init_state = initial_state[None, :, :]

        all_states = theano.tensor.concatenate([init_state, all_output], 0)
        prev_states = all_states[:-1]

        with ops.variable_scope("decoder"):
            probs = prediction(shift_inputs, prev_states, all_context,
                               keep_prob=keep_prob)

        # compute cost
        idx = theano.tensor.arange(tgt_seq.flatten().shape[0])
        ce = -theano.tensor.log(probs[idx, tgt_seq.flatten()])
        ce = ce.reshape(tgt_seq.shape)
        ce = theano.tensor.sum(ce * tgt_mask, 0)

        if criterion == "mle":
            cost = theano.tensor.mean(ce)
        else:
            # ce is positive here
            logp = -ce
            score = sharp * logp
            # safe softmax
            score = score - theano.tensor.max(score)
            score = theano.tensor.exp(score)
            qprob = score / theano.tensor.sum(score)
            risk = theano.tensor.sum(qprob * loss)
            cost = risk

    if criterion == "mle":
        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
    else:
        training_inputs = [
            src_seq, src_mask, tgt_seq, tgt_mask, loss, sharp
        ]
    training_outputs = [cost]

    # decoding graph
    with ops.variable_scope(scope, reuse=True):
        prev_words = theano.tensor.ivector("prev_words")

        # disable dropout
        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        source_inputs = source_inputs + source_bias
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
        target_inputs = target_inputs + target_bias

        cell = nn.rnn_cell.gru_cell([sedim, shdim])
        outputs = encoder(cell, source_inputs, src_mask)
        annotation = theano.tensor.concatenate(outputs, 2)

        # decoder
        final_state = outputs[1][0]
        with ops.variable_scope("decoder"):
            initial_state = nn.feedforward(final_state, [shdim, thdim],
                                           True, scope="initial",
                                           activation=theano.tensor.tanh)

        inputs = nn.embedding_lookup(target_embedding, prev_words)
        inputs = inputs + target_bias

        cond = theano.tensor.neq(prev_words, 0)
        # zeros out embedding if y is 0
        inputs = inputs * cond[:, None]

        # This decoder cell is also the one captured by the sampling and
        # attention loops defined further down.
        cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

        # encode -> prediction -> generation
        # prediction: prev_word + prev_state => context, next_word
        # generation: curr_word + context + prev_state => next_state
        # here, initial_state is merely a placeholder
        with ops.variable_scope("decoder"):
            # used in encoding
            mapped_states = attention(None, annotation, None, None,
                                      [thdim, 2 * shdim, ahdim])
            # used in prediction
            alpha = attention(initial_state, None, mapped_states, src_mask,
                              [thdim, 2 * shdim, ahdim])
            context = theano.tensor.sum(alpha[:, :, None] * annotation, 0)
            probs = prediction(inputs, initial_state, context)
            # used in generation
            _, next_state = cell([inputs, context], initial_state)

    # encoding
    encoding_inputs = [src_seq, src_mask]
    encoding_outputs = [annotation, initial_state, mapped_states]
    encode = theano.function(encoding_inputs, encoding_outputs)

    prediction_inputs = [
        prev_words, initial_state, annotation, mapped_states, src_mask
    ]
    prediction_outputs = [probs, context, alpha]
    predict = theano.function(prediction_inputs, prediction_outputs)

    generation_inputs = [prev_words, initial_state, context]
    generation_outputs = next_state
    generate = theano.function(generation_inputs, generation_outputs)

    # sampling graph, this feature is optional
    with ops.variable_scope(scope, reuse=True):
        max_len = theano.tensor.iscalar()

        def sampling_loop(inputs, state, attn_states, attn_mask, m_states):
            # One scan step: attend, predict, sample a word, advance state.
            alpha = attention(state, None, m_states, attn_mask,
                              [thdim, 2 * shdim, ahdim])
            context = theano.tensor.sum(alpha[:, :, None] * attn_states, 0)
            probs = prediction(inputs, state, context)
            next_words = ops.random.multinomial(probs).argmax(axis=1)
            new_inputs = nn.embedding_lookup(target_embedding, next_words)
            new_inputs = new_inputs + target_bias
            _, next_state = cell([new_inputs, context], state)

            return [next_words, new_inputs, next_state]

        with ops.variable_scope("decoder"):
            batch = src_seq.shape[1]
            initial_inputs = theano.tensor.zeros([batch, tedim],
                                                 dtype=dtype)

            outputs_info = [None, initial_inputs, initial_state]
            nonseq = [annotation, src_mask, mapped_states]
            outputs, updates = theano.scan(sampling_loop, [], outputs_info,
                                           nonseq, n_steps=max_len)
            sampled_words = outputs[0]

    sampling_inputs = [src_seq, src_mask, max_len]
    sampling_outputs = sampled_words
    # The scan updates are passed on because the loop draws random samples.
    sample = theano.function(sampling_inputs, sampling_outputs,
                             updates=updates)

    # attention graph, this feature is optional
    with ops.variable_scope(scope, reuse=True):
        def attention_loop(inputs, mask, state, attn_states, attn_mask,
                           m_states):
            # One scan step over the target sequence; where mask is 0 the
            # previous state is carried over unchanged.
            mask = mask[:, None]
            alpha = attention(state, None, m_states, attn_mask,
                              [thdim, 2 * shdim, ahdim])
            context = theano.tensor.sum(alpha[:, :, None] * attn_states, 0)
            _, next_state = cell([inputs, context], state)
            next_state = (1.0 - mask) * state + mask * next_state

            return [alpha, next_state]

        with ops.variable_scope("decoder"):
            seq = [target_inputs, tgt_mask]
            outputs_info = [None, initial_state]
            nonseq = [annotation, src_mask, mapped_states]
            # The scan updates are not used for the alignment function.
            (alpha, state), _ = theano.scan(attention_loop, seq,
                                            outputs_info, nonseq)
            attention_score = alpha

    alignment_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
    alignment_outputs = attention_score
    align = theano.function(alignment_inputs, alignment_outputs)

    self.cost = cost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.updates = []
    self.align = align
    self.sample = sample
    self.encode = encode
    self.predict = predict
    self.generate = generate
    self.option = option
def __init__(self, **option):
    """Build the training and step-wise decoding graphs for the model.

    Options read from ``option`` (defaults are written back into it):
        embdim: (source embedding dim, target embedding dim).
        hidden: (source hidden dim, target hidden dim, attention hidden dim).
        maxhid, maxpart, deephid: maxout hidden dim, maxout part and
            deepout hidden dim, passed to the decoder constructor.
        vocabulary: ((sw2id, sid2w), (tw2id, tid2w)); only the lengths of
            the two id-to-word tables are used here.
        decoder: suffix of the decoder class name ("Decoder" + value);
            "GruSimple" and "GruCond" additionally select which compiled
            step functions are exposed.
        scope: variable scope name; "rnnsearch" when missing or None.
        initializer, regularizer: passed to the top-level variable scope;
            default None.
        keep_prob: dropout keep probability; missing or falsy means 1.0.

    Attributes set: ``cost``, ``inputs``, ``outputs``, ``updates`` (empty
    list), ``encode``, ``option``; ``predict`` for "GruSimple" and
    "GruCond"; ``generate`` for "GruSimple" only.
    """
    # source and target embedding dim
    sedim, tedim = option["embdim"]
    # source, target and attention hidden dim
    shdim, thdim, ahdim = option["hidden"]
    # maxout hidden dim, maxout part, deepout hidden dim
    maxdim = option["maxhid"]
    maxpart = option["maxpart"]
    deephid = option["deephid"]

    (_, sid2w), (_, tid2w) = option["vocabulary"]
    # source and target vocabulary size
    svsize = len(sid2w)
    tvsize = len(tid2w)

    if option.get("scope") is None:
        option["scope"] = "rnnsearch"
    for optional_key in ("initializer", "regularizer"):
        option.setdefault(optional_key, None)
    option.setdefault("keep_prob", 1.0)

    dtype = theano.config.floatX
    initializer = option["initializer"]
    regularizer = option["regularizer"]
    keep_prob = option["keep_prob"] or 1.0
    scope = option["scope"]
    decoder_scope = "decoder"

    encoder = Encoder(sedim, shdim)
    # NOTE(review): eval() resolves the decoder class from an option string;
    # this is only safe while option["decoder"] comes from trusted
    # configuration.
    decoder_cls = eval("Decoder{}".format(option["decoder"]))
    decoder = decoder_cls(tedim, thdim, ahdim, 2 * shdim,
                          dim_maxout=maxdim, max_part=maxpart,
                          dim_readout=deephid, n_y_vocab=tvsize)

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        src_seq = T.imatrix("source_sequence")
        src_mask = T.matrix("source_sequence_mask")
        tgt_seq = T.imatrix("target_sequence")
        tgt_mask = T.matrix("target_sequence_mask")

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        with ops.variable_scope("target_embedding") as tie_scope:
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
            # target_bias = ops.get_variable("bias", [tedim])
            # Hand the target-embedding scope to the decoder.
            decoder.tiescope = tie_scope

        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
        source_inputs = source_inputs + source_bias

        if keep_prob < 1.0:
            source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
            target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

        fwd_states, bwd_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([fwd_states, bwd_states], 2)

        # compute initial state for decoder
        # first state of backward encoder
        final_state = bwd_states[0]

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim, thdim],
                                           True, scope="initial",
                                           activation=T.tanh)
            # keys for query
            mapped_keys = map_key(annotation, 2 * shdim, ahdim)

            # Only the third of the four values returned by forward() is
            # used: the training cost.
            _, _, cost, _ = decoder.forward(tgt_seq, target_inputs,
                                            tgt_mask, mapped_keys, src_mask,
                                            annotation, initial_state,
                                            keep_prob)

    training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
    training_outputs = [cost]

    # decoding graph
    with ops.variable_scope(scope, reuse=True):
        prev_words = T.ivector("prev_words")

        # disable dropout
        source_inputs = (nn.embedding_lookup(source_embedding, src_seq)
                         + source_bias)
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
        # target_inputs = target_inputs + target_bias

        fwd_states, bwd_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([fwd_states, bwd_states], 2)

        # decoder
        final_state = bwd_states[0]
        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim, thdim],
                                           True, scope="initial",
                                           activation=T.tanh)
            mapped_keys = map_key(annotation, 2 * shdim, ahdim)

        prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
        # prev_inputs = prev_inputs + target_bias

        # zeros out embedding if y is 0, which indicates <s>
        is_real_word = T.neq(prev_words, 0)
        prev_inputs = prev_inputs * is_real_word[:, None]

        with ops.variable_scope(decoder_scope):
            step_mask = T.ones_like(prev_words, dtype=dtype)
            next_state, context = decoder.step(prev_inputs, step_mask,
                                               initial_state, mapped_keys,
                                               annotation, src_mask)
            # GruSimple predicts from the incoming state, GruCond from the
            # freshly computed one.
            if option["decoder"] == "GruSimple":
                probs = decoder.prediction(prev_inputs, initial_state,
                                           context)
            elif option["decoder"] == "GruCond":
                probs = decoder.prediction(prev_inputs, next_state, context)

    # encoding
    encode = theano.function([src_seq, src_mask],
                             [annotation, initial_state, mapped_keys])

    # For any other decoder name no step function is compiled and neither
    # self.predict nor self.generate is set.
    if option["decoder"] == "GruSimple":
        predict = theano.function(
            [prev_words, initial_state, annotation, mapped_keys, src_mask],
            [probs, context])
        generate = theano.function([prev_words, initial_state, context],
                                   next_state)
        self.predict = predict
        self.generate = generate
    elif option["decoder"] == "GruCond":
        predict = theano.function(
            [prev_words, initial_state, annotation, mapped_keys, src_mask],
            [probs, next_state])
        self.predict = predict

    # optional graph (disabled)
    # with ops.variable_scope(scope, reuse=True):
    #     sample = decoder.build_sampling(src_seq, src_mask,
    #                                     target_embedding, target_bias,
    #                                     mapped_keys, annotation,
    #                                     initial_state)
    #     align = decoder.build_attention(src_seq, src_mask, target_inputs,
    #                                     tgt_seq, tgt_mask, mapped_keys,
    #                                     annotation, initial_state)
    #     with ops.variable_scope(decoder_scope):
    #         initial_state = nn.feedforward(final_state, [shdim, thdim],
    #                                        True, scope="initial",
    #                                        activation=T.tanh)
    #         # keys for query
    #         mapped_keys = map_key(annotation, 2 * shdim, ahdim)
    #         _, _, _, snt_cost = decoder.forward(tgt_seq, target_inputs,
    #                                             tgt_mask, mapped_keys,
    #                                             src_mask, annotation,
    #                                             initial_state, 1.0)
    #     get_snt_cost = theano.function(training_inputs, snt_cost)

    self.cost = cost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.updates = []
    # self.align = align
    # self.sample = sample
    self.encode = encode
    # self.get_snt_cost = get_snt_cost
    self.option = option
def __init__(self, **option):
    """Build the domain-aware training and step-wise decoding graphs.

    Options read from ``option`` (defaults are written back into it):
        embdim: (source embedding dim, target embedding dim).
        hidden: (source hidden dim, target hidden dim, attention hidden dim,
            domain dim, feature dim).
        maxhid, maxpart, deephid: maxout hidden dim, maxout part and
            deepout hidden dim, passed to the decoder constructor.
        vocabulary: ((sw2id, sid2w), (tw2id, tid2w)); only the lengths of
            the two id-to-word tables are used here.
        dnum: output size of the two domain classifiers.
        lambda: weight of the adversarial term in the final cost.
        decoder: suffix of the decoder class name ("Decoder" + value);
            "GruSimple" and "GruCond" additionally select which compiled
            step functions are exposed.
        scope: variable scope name; "rnnsearch" when missing or None.
        initializer, regularizer: passed to the top-level variable scope;
            default None.
        keep_prob: dropout keep probability; missing or falsy means 1.0.

    Attributes set: ``tag_predict``, ``tgt_tag_predict``, ``cost_cla``,
    ``inputs_cla``, ``outputs_cla``, ``cost``, ``inputs``, ``outputs``,
    ``updates`` (empty list), ``encode``, ``option``; ``predict`` for
    "GruSimple" and "GruCond"; ``generate`` for "GruSimple" only.
    """
    # source and target embedding dim
    sedim, tedim = option["embdim"]
    # source, target and attention hidden dim
    shdim, thdim, ahdim, domaindim, feadim = option["hidden"]
    # maxout hidden dim
    maxdim = option["maxhid"]
    # maxout part
    maxpart = option["maxpart"]
    # deepout hidden dim
    deephid = option["deephid"]

    svocab, tvocab = option["vocabulary"]
    _, sid2w = svocab
    _, tid2w = tvocab
    # source and target vocabulary size
    svsize, tvsize = len(sid2w), len(tid2w)
    dnum = option['dnum']

    if option.get("scope") is None:
        option["scope"] = "rnnsearch"
    option.setdefault("initializer", None)
    option.setdefault("regularizer", None)
    option.setdefault("keep_prob", 1.0)

    dtype = theano.config.floatX
    initializer = option["initializer"]
    regularizer = option["regularizer"]
    keep_prob = option["keep_prob"] or 1.0
    scope = option["scope"]
    decoder_scope = "decoder"

    encoder = Encoder(sedim, shdim)
    # NOTE(review): eval() resolves the decoder class from an option string;
    # this is only safe while option["decoder"] comes from trusted
    # configuration.
    decoderType = eval("Decoder{}".format(option["decoder"]))
    decoder = decoderType(tedim, thdim, ahdim, 2 * shdim, dnum=dnum,
                          dim_maxout=maxdim, max_part=maxpart,
                          dim_readout=deephid, dim_domain=domaindim,
                          feadim=feadim, n_y_vocab=tvsize)

    # training graph
    with ops.variable_scope(scope, initializer=initializer,
                            regularizer=regularizer, dtype=dtype):
        src_seq = T.imatrix("source_sequence")
        src_mask = T.matrix("source_sequence_mask")
        tgt_seq = T.imatrix("target_sequence")
        tgt_mask = T.matrix("target_sequence_mask")
        tag_seq = T.imatrix("domain_tag")

        # nsrc_mask = T.set_subtensor(
        #     src_mask[T.cast(T.sum(src_mask, 0) - 1, 'int32'),
        #              T.arange(src_mask.shape[1])], 0.0)

        with ops.variable_scope("source_embedding"):
            source_embedding = ops.get_variable("embedding",
                                                [svsize, sedim])
            source_bias = ops.get_variable("bias", [sedim])

        with ops.variable_scope("target_embedding") as tgtembscope:
            target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
            # target_bias = ops.get_variable("bias", [tedim])
            decoder.tiescope = tgtembscope

        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

        source_inputs = source_inputs + source_bias

        if keep_prob < 1.0:
            source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
            target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        # NOTE(review): the "Specific" / "Shared" scope boundaries below
        # mirror the decoding graph, which must find the same variables
        # under reuse=True - confirm against the trained checkpoints.
        with ops.variable_scope("Specific"):
            domain_alpha = domain_sensitive_attention(
                annotation, src_mask, shdim * 2, domaindim)
            # domain_alpha = attention(r_states[0], annotation, nsrc_mask,
            #                          shdim,
            #                          shdim * 2)
            domain_context = T.sum(annotation * domain_alpha[:, :, None], 0)
            dfeature = nn.feedforward(domain_context, [shdim * 2, feadim],
                                      True, activation=T.tanh,
                                      scope="feature1")

            dscores = nn.feedforward(dfeature, [feadim, dnum], True,
                                     activation=T.tanh, scope="score")
            # (batch, 2)
            dprobs = T.nnet.softmax(dscores)
            dpred_tag = T.argmax(dprobs, 1)
            didx = T.arange(tag_seq.flatten().shape[0])
            dce = -T.log(dprobs[didx, tag_seq.flatten()])
            dcost = T.mean(dce)

        share_alpha = domain_sensitive_attention(annotation, src_mask,
                                                 shdim * 2, domaindim)
        # share_alpha = attention(r_states[0], annotation, nsrc_mask,
        #                         shdim,
        #                         shdim * 2)
        share_context = T.sum(annotation * share_alpha[:, :, None], 0)
        sfeature = nn.feedforward(share_context, [shdim * 2, feadim], True,
                                  activation=T.tanh, scope="feature1")

        with ops.variable_scope("Shared"):
            sscores = nn.feedforward(sfeature, [feadim, dnum], True,
                                     activation=T.tanh, scope="score")
            # (batch, 2)
            sprobs = T.nnet.softmax(sscores)
            spred_tag = T.argmax(sprobs, 1)
            sidx = T.arange(tag_seq.flatten().shape[0])
            sce = -T.log(sprobs[sidx, tag_seq.flatten()])
            scost = T.mean(sce)
            # -p * log(p) of the gold tag; subtracted (scaled by lambda)
            # from the final cost below.
            adv_sce = -sprobs[sidx, tag_seq.flatten()] * T.log(
                sprobs[sidx, tag_seq.flatten()])
            adv_scost = T.mean(adv_sce)

        domain_gate = nn.feedforward([dfeature, annotation],
                                     [[feadim, shdim * 2], shdim * 2],
                                     True, scope="domain_gate")
        domain_annotation = annotation * domain_gate
        domain_annotation = nn.dropout(domain_annotation,
                                       keep_prob=keep_prob)

        share_gate = nn.feedforward([sfeature, annotation],
                                    [[feadim, shdim * 2], shdim * 2],
                                    True, scope="share_gate")
        annotation = annotation * share_gate
        annotation = nn.dropout(annotation, keep_prob=keep_prob)

        # compute initial state for decoder
        # first state of backward encoder
        # batch * shdim
        # Floor division: the result is a slice bound and must stay an
        # integer (plain `/` is true division on Python 3).
        half = annotation.shape[-1] // 2
        final_state = T.concatenate([
            annotation[0, :, half:],
            domain_annotation[0, :, half:]
        ], -1)

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim * 2, thdim],
                                           True, scope="initial",
                                           activation=T.tanh)
            # keys for query
            mapped_keys = map_key(annotation, 2 * shdim, ahdim, "semantic")
            mapped_domain_keys = map_key(domain_annotation, 2 * shdim,
                                         ahdim, "domain")

            _, _, cost, tgtdcost, tpred_tag, _ = decoder.forward(
                tgt_seq, target_inputs, tgt_mask, mapped_keys, src_mask,
                annotation, initial_state, mapped_domain_keys,
                domain_annotation, tag_seq, keep_prob)

        lamb = theano.shared(numpy.asarray(option["lambda"], dtype),
                             "lambda")
        # cwscost *= lamb
        final_cost = cost + dcost + tgtdcost - lamb * adv_scost

    tag_inputs = [src_seq, src_mask]
    tag_outputs = [dpred_tag, spred_tag]
    tag_predict = theano.function(tag_inputs, tag_outputs)
    self.tag_predict = tag_predict

    tgt_tag_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
    tgt_tag_outputs = [tpred_tag]
    tgt_tag_predict = theano.function(tgt_tag_inputs, tgt_tag_outputs)
    self.tgt_tag_predict = tgt_tag_predict

    training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, tag_seq]
    training_outputs = [cost, dcost, adv_scost, tgtdcost]

    # Cost, inputs and outputs of the classifier in the "Shared" scope.
    self.cost_cla = scost
    self.inputs_cla = [src_seq, src_mask, tag_seq]
    self.outputs_cla = [scost]

    # decoding graph
    with ops.variable_scope(scope, reuse=True):
        prev_words = T.ivector("prev_words")

        # disable dropout
        source_inputs = nn.embedding_lookup(source_embedding, src_seq)
        source_inputs = source_inputs + source_bias

        states, r_states = encoder.forward(source_inputs, src_mask)
        annotation = T.concatenate([states, r_states], 2)

        with ops.variable_scope("Specific"):
            domain_alpha = domain_sensitive_attention(
                annotation, src_mask, shdim * 2, domaindim)
            # domain_alpha = attention(r_states[0], annotation, nsrc_mask,
            #                          shdim,
            #                          shdim * 2)
            domain_context = T.sum(annotation * domain_alpha[:, :, None], 0)
            dfeature = nn.feedforward(domain_context, [shdim * 2, feadim],
                                      True, activation=T.tanh,
                                      scope="feature1")

        share_alpha = domain_sensitive_attention(annotation, src_mask,
                                                 shdim * 2, domaindim)
        # share_alpha = attention(r_states[0], annotation, nsrc_mask,
        #                         shdim,
        #                         shdim * 2)
        share_context = T.sum(annotation * share_alpha[:, :, None], 0)
        sfeature = nn.feedforward(share_context, [shdim * 2, feadim], True,
                                  activation=T.tanh, scope="feature1")

        domain_gate = nn.feedforward([dfeature, annotation],
                                     [[feadim, shdim * 2], shdim * 2],
                                     True, scope="domain_gate")
        domain_annotation = annotation * domain_gate

        share_gate = nn.feedforward([sfeature, annotation],
                                    [[feadim, shdim * 2], shdim * 2],
                                    True, scope="share_gate")
        annotation = annotation * share_gate

        # decoder
        # Floor division keeps the slice bound an integer (see above).
        half = annotation.shape[-1] // 2
        final_state = T.concatenate([
            annotation[0, :, half:],
            domain_annotation[0, :, half:]
        ], -1)

        with ops.variable_scope(decoder_scope):
            initial_state = nn.feedforward(final_state, [shdim * 2, thdim],
                                           True, scope="initial",
                                           activation=T.tanh)
            mapped_keys = map_key(annotation, 2 * shdim, ahdim, "semantic")
            mapped_domain_keys = map_key(domain_annotation, 2 * shdim,
                                         ahdim, "domain")

        prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
        # prev_inputs = prev_inputs + target_bias

        cond = T.neq(prev_words, 0)
        # zeros out embedding if y is 0, which indicates <s>
        prev_inputs = prev_inputs * cond[:, None]

        with ops.variable_scope(decoder_scope):
            mask = T.ones_like(prev_words, dtype=dtype)
            next_state, context = decoder.step(prev_inputs, mask,
                                               initial_state, mapped_keys,
                                               annotation, src_mask,
                                               mapped_domain_keys,
                                               domain_annotation)
            # GruSimple predicts from the incoming state, GruCond from the
            # freshly computed one.
            if option["decoder"] == "GruSimple":
                probs = decoder.prediction(prev_inputs, initial_state,
                                           context)
            elif option["decoder"] == "GruCond":
                probs = decoder.prediction(prev_inputs, next_state, context)

    # encoding
    encoding_inputs = [src_seq, src_mask]
    encoding_outputs = [
        annotation, initial_state, mapped_keys, mapped_domain_keys,
        domain_annotation
    ]
    encode = theano.function(encoding_inputs, encoding_outputs)

    # For any other decoder name no step function is compiled and neither
    # self.predict nor self.generate is set.
    if option["decoder"] == "GruSimple":
        prediction_inputs = [
            prev_words, initial_state, annotation, mapped_keys, src_mask
        ]
        prediction_outputs = [probs, context]
        predict = theano.function(prediction_inputs, prediction_outputs)

        generation_inputs = [prev_words, initial_state, context]
        generation_outputs = next_state
        generate = theano.function(generation_inputs, generation_outputs)

        self.predict = predict
        self.generate = generate
    elif option["decoder"] == "GruCond":
        prediction_inputs = [
            prev_words, initial_state, annotation, mapped_keys, src_mask,
            mapped_domain_keys, domain_annotation
        ]
        prediction_outputs = [probs, next_state]
        predict = theano.function(prediction_inputs, prediction_outputs)
        self.predict = predict

    self.cost = final_cost
    self.inputs = training_inputs
    self.outputs = training_outputs
    self.updates = []
    # self.align = align
    # self.sample = sample
    self.encode = encode
    # self.get_snt_cost = get_snt_cost
    self.option = option