def construct_model(vocab_size, embedding_dim, ngram_order, hidden_dims,
                    activations):
    # Construct the model
    x = tensor.lmatrix('features')
    y = tensor.lvector('targets')

    lookup = LookupTable(length=vocab_size, dim=embedding_dim, name='lookup')
    hidden = MLP(activations=activations + [None],
                 dims=[ngram_order * embedding_dim] + hidden_dims +
                 [vocab_size])

    embeddings = lookup.apply(x)
    embeddings = embeddings.flatten(ndim=2)  # Concatenate the n-gram embeddings

    presoft = hidden.apply(embeddings)
    cost = Softmax().categorical_cross_entropy(y, presoft)

    # Initialize parameters
    lookup.weights_init = IsotropicGaussian(0.001)
    hidden.weights_init = IsotropicGaussian(0.01)
    hidden.biases_init = Constant(0.001)
    lookup.initialize()
    hidden.initialize()

    return cost
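# Usage sketch (added, not from the original source): driving construct_model
# with hypothetical hyperparameters. Assumes the same Blocks imports as the
# function itself (LookupTable, MLP, Softmax, IsotropicGaussian, Constant).
from blocks.bricks import Rectifier

cost = construct_model(vocab_size=10000, embedding_dim=100, ngram_order=6,
                       hidden_dims=[256], activations=[Rectifier()])
# `activations` needs one entry per hidden layer; the trailing None inside
# construct_model keeps the output layer linear for the softmax cost.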
    feedback_brick=feedback,
    name="readout")

generator = SequenceGenerator(readout=readout,
                              transition=transition,
                              attention=attention,
                              name="generator")
generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.0)
generator.push_initialization_config()
generator.transition.biases_init = IsotropicGaussian(0.01, 1)
generator.transition.push_initialization_config()
generator.initialize()

lookup.weights_init = IsotropicGaussian(0.001)
lookup.biases_init = Constant(0.0)
lookup.initialize()

# Seed the generator's recurrent states with shared variables
states = [state for state in generator.transition.apply.outputs
          if state != "step"]
states = {name: shared_floatx_zeros((batch_size, hidden_size_recurrent))
          for name in states}

cost_matrix = generator.cost_matrix(x, attended=context, **states)
cost = cost_matrix.mean() + 0.0 * start_flag
cost.name = "nll"
def __init__(self, config, vocab_size):
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.imatrix('answer')
    answer_mask = tensor.imatrix('answer_mask')

    bricks = []

    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    answer_mask = answer_mask.dimshuffle(1, 0)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    embed.weights_init = IsotropicGaussian(0.01)

    # Calculate question encoding (concatenate layer1)
    qembed = embed.apply(question)
    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size,
        question_mask.astype(theano.config.floatX),
        config.question_lstm_size, config.question_skip_connections, 'q')
    bricks = bricks + qlstms
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1)
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                  axis=1)
    qenc.name = 'qenc'

    # Calculate context encoding (concatenate layer1)
    cembed = embed.apply(context)
    cqembed = tensor.concatenate(
        [cembed,
         tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0)],
        axis=2)
    clstms, chidden_list = make_bidir_lstm_stack(
        cqembed, config.embed_size + qenc_dim,
        context_mask.astype(theano.config.floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + clstms
    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2: fwd & bwd
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'

    # Attention mechanism: bilinear, answer start
    attention_clinear_1 = Linear(input_dim=cenc_dim, output_dim=qenc_dim,
                                 name='attc_1')
    bricks += [attention_clinear_1]
    att_start = qenc[None, :, :] * attention_clinear_1.apply(
        cenc.reshape((cenc.shape[0] * cenc.shape[1],
                      cenc.shape[2]))).reshape(
                          # reshape back using the Linear's output dim
                          (cenc.shape[0], cenc.shape[1], qenc_dim))
    att_start = att_start.sum(axis=2)
    att_start = tensor.nnet.softmax(att_start.T).T

    # Attention mechanism: bilinear, answer end
    attention_clinear_2 = Linear(input_dim=cenc_dim, output_dim=qenc_dim,
                                 name='attc_2')
    bricks += [attention_clinear_2]
    att_end = qenc[None, :, :] * attention_clinear_2.apply(
        cenc.reshape((cenc.shape[0] * cenc.shape[1],
                      cenc.shape[2]))).reshape(
                          (cenc.shape[0], cenc.shape[1], qenc_dim))
    att_end = att_end.sum(axis=2)
    att_end = tensor.nnet.softmax(att_end.T).T

    # Cumulative sums over time via triangular 0/1 matrices:
    # att_start becomes P(start <= i), att_end becomes P(end >= i)
    att_start = tensor.dot(
        tensor.le(
            tensor.tile(tensor.arange(context.shape[0])[None, :],
                        (context.shape[0], 1)),
            tensor.tile(tensor.arange(context.shape[0])[:, None],
                        (1, context.shape[0]))),
        att_start)
    att_end = tensor.dot(
        tensor.ge(
            tensor.tile(tensor.arange(context.shape[0])[None, :],
                        (context.shape[0], 1)),
            tensor.tile(tensor.arange(context.shape[0])[:, None],
                        (1, context.shape[0]))),
        att_end)

    # Combine attention from the left and right
    att_weights = att_start * att_end

    att_target = tensor.eq(
        tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)),
        tensor.tile(context[:, None, :],
                    (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1)

    self.predictions = tensor.gt(att_weights, 0.25) * context

    att_target = att_target / (att_target.sum(axis=0) + 0.00001)
    att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001)

    # cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) *
    #         context_mask).sum() / context_mask.sum()
    cost = (((att_weights - att_target) ** 2) *
            context_mask).sum() / context_mask.sum()

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'
    att_start.name = 'att_start'
    att_end.name = 'att_end'
    att_weights.name = 'att_weights'
    att_target.name = 'att_target'
    self.predictions.name = 'pred'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]
    self.analyse_vars = [cost, self.predictions, att_start, att_end,
                         att_weights, att_target]

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
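# Illustration (added, not in the original): the tensor.le/tensor.ge dot
# products above are prefix/suffix sums over time, realised as a dot with a
# triangular 0/1 matrix. A small numpy check of the same trick:
import numpy as np

L = 4
idx = np.arange(L)
p = np.array([0.1, 0.2, 0.3, 0.4])
lower = (idx[None, :] <= idx[:, None]).astype('float64')
assert np.allclose(lower.dot(p), np.cumsum(p))           # tensor.le: att_start
upper = (idx[None, :] >= idx[:, None]).astype('float64')
assert np.allclose(upper.dot(p), np.cumsum(p[::-1])[::-1])  # tensor.ge: att_end
# Multiplying the two gives, per position i, P(start <= i) * P(end >= i),
# i.e. the probability that token i lies inside the predicted span.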
def __init__(self, config, vocab_size):
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.imatrix('answer')
    answer_mask = tensor.imatrix('answer_mask')

    bricks = []

    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    answer_mask = answer_mask.dimshuffle(1, 0)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    embed.weights_init = IsotropicGaussian(0.01)

    # Calculate question encoding (concatenate layer1)
    qembed = embed.apply(question)
    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size,
        question_mask.astype(theano.config.floatX),
        config.question_lstm_size, config.question_skip_connections, 'q')
    bricks = bricks + qlstms
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1)
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                  axis=1)
    qenc.name = 'qenc'

    # Calculate context encoding (concatenate layer1)
    cembed = embed.apply(context)
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size,
        context_mask.astype(theano.config.floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + clstms
    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2: fwd & bwd
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'

    # Attention mechanism: MLP
    attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                        activations=config.attention_mlp_activations[1:] +
                        [Identity()],
                        name='attention_mlp')
    attention_qlinear = Linear(input_dim=qenc_dim,
                               output_dim=config.attention_mlp_hidden[0],
                               name='attq')
    attention_clinear = Linear(input_dim=cenc_dim,
                               output_dim=config.attention_mlp_hidden[0],
                               use_bias=False, name='attc')
    bricks += [attention_mlp, attention_qlinear, attention_clinear]
    layer1 = Tanh().apply(
        attention_clinear.apply(
            cenc.reshape((cenc.shape[0] * cenc.shape[1],
                          cenc.shape[2]))).reshape(
                              (cenc.shape[0], cenc.shape[1],
                               config.attention_mlp_hidden[0])) +
        attention_qlinear.apply(qenc)[None, :, :])
    layer1.name = 'layer1'
    att_weights = attention_mlp.apply(
        layer1.reshape((layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
    att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
    att_weights = tensor.nnet.sigmoid(att_weights.T).T
    att_weights.name = 'att_weights'

    att_target = tensor.eq(
        tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)),
        tensor.tile(context[:, None, :],
                    (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1)

    cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) *
            context_mask).sum() / context_mask.sum()
    self.predictions = tensor.gt(att_weights, 0.1) * context

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
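# make_bidir_lstm_stack is called throughout these models but is not defined
# in the excerpts. Below is a minimal sketch of what it presumably does,
# inferred from its call sites (the brick names match the explicitly built
# '%s_fwd_lstm_in_%d_%d' Linears in the ranking model further down, and the
# signature matches the comment in the sorting model). An assumption, not
# the original helper.
def make_bidir_lstm_stack_sketch(seq, seq_dim, mask, sizes, skip=True,
                                 name=''):
    bricks = []
    curr_dim = [seq_dim]
    curr_hidden = [seq]
    hidden_list = []
    for k, dim in enumerate(sizes):
        fwd_lstm_ins = [Linear(input_dim=d, output_dim=4 * dim,
                               name='%s_fwd_lstm_in_%d_%d' % (name, k, l))
                        for l, d in enumerate(curr_dim)]
        fwd_lstm = LSTM(dim=dim, activation=Tanh(),
                        name='%s_fwd_lstm_%d' % (name, k))
        bwd_lstm_ins = [Linear(input_dim=d, output_dim=4 * dim,
                               name='%s_bwd_lstm_in_%d_%d' % (name, k, l))
                        for l, d in enumerate(curr_dim)]
        bwd_lstm = LSTM(dim=dim, activation=Tanh(),
                        name='%s_bwd_lstm_%d' % (name, k))
        bricks = bricks + [fwd_lstm, bwd_lstm] + fwd_lstm_ins + bwd_lstm_ins
        # Each direction sums the projections of all its inputs
        fwd_tmp = sum(x.apply(v) for x, v in zip(fwd_lstm_ins, curr_hidden))
        bwd_tmp = sum(x.apply(v) for x, v in zip(bwd_lstm_ins, curr_hidden))
        fwd_hidden, _ = fwd_lstm.apply(fwd_tmp, mask=mask)
        bwd_hidden, _ = bwd_lstm.apply(bwd_tmp[::-1], mask=mask[::-1])
        hidden_list = hidden_list + [fwd_hidden, bwd_hidden]
        if skip:
            curr_hidden = [seq, fwd_hidden, bwd_hidden[::-1]]
            curr_dim = [seq_dim, dim, dim]
        else:
            curr_hidden = [fwd_hidden, bwd_hidden[::-1]]
            curr_dim = [dim, dim]
    return bricks, hidden_list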
def __init__(self, config, vocab_size):
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.imatrix('answer')
    answer_mask = tensor.imatrix('answer_mask')
    ans_indices = tensor.imatrix('ans_indices')  # n_steps * n_samples
    ans_indices_mask = tensor.imatrix('ans_indices_mask')

    context_bag = tensor.eq(context[:, :, None],
                            tensor.arange(vocab_size)).sum(axis=1).clip(0, 1)

    bricks = []

    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    answer_mask = answer_mask.dimshuffle(1, 0)
    ans_indices = ans_indices.dimshuffle(1, 0)
    ans_indices_mask = ans_indices_mask.dimshuffle(1, 0)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    embed.weights_init = IsotropicGaussian(0.01)
    # embeddings_initial_value = init_embedding_table(
    #     filename='embeddings/vocab_embeddings.txt')
    # embed.weights_init = Constant(embeddings_initial_value)

    # Calculate question encoding (concatenate layer1)
    qembed = embed.apply(question)
    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size,
        question_mask.astype(theano.config.floatX),
        config.question_lstm_size, config.question_skip_connections, 'q')
    bricks = bricks + qlstms
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1)
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                  axis=1)
    qenc.name = 'qenc'
    # embed size: 200, lstm_size: 256
    # qenc: length * batch_size * (2*lstm_size)

    # Calculate context encoding (concatenate layer1)
    cembed = embed.apply(context)
    # length * batch_size * (embed + 2*lstm_size); this is what goes into
    # the encoder
    cqembed = tensor.concatenate(
        [cembed,
         tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0)],
        axis=2)
    clstms, chidden_list = make_bidir_lstm_stack(
        cqembed, config.embed_size + qenc_dim,
        context_mask.astype(theano.config.floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + clstms
    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2: fwd & bwd
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'  # cenc: length * batch_size * (2*lstm_size)

    # Pointer-network decoder LSTM and attention parameters
    params = init_params(data_dim=config.decoder_data_dim,
                         lstm_dim=config.decoder_lstm_output_dim)
    tparams = init_tparams(params)
    add_role(tparams['lstm_de_W'], WEIGHT)
    add_role(tparams['lstm_de_U'], WEIGHT)
    add_role(tparams['lstm_de_b'], BIAS)
    add_role(tparams['ptr_v'], WEIGHT)
    add_role(tparams['ptr_W1'], WEIGHT)
    add_role(tparams['ptr_W2'], WEIGHT)
    self.theano_params = tparams.values()

    # n_steps = length, n_samples = batch_size
    n_steps = ans_indices.shape[0]
    n_samples = ans_indices.shape[1]
    preds, generations = ptr_network(
        tparams, cqembed, context_mask.astype(theano.config.floatX),
        ans_indices, ans_indices_mask.astype(theano.config.floatX),
        config.decoder_lstm_output_dim, cenc)
    self.generations = generations

    idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'),
                             tensor.ones((n_samples,), dtype='int64'))
    idx_samples = tensor.outer(tensor.ones((n_steps,), dtype='int64'),
                               tensor.arange(n_samples, dtype='int64'))
    probs = preds[idx_steps, ans_indices, idx_samples]
    # Guard against log(0); float16 needs a larger epsilon
    off = 1e-8
    if probs.dtype == 'float16':
        off = 1e-6
    probs += off

    cost = -tensor.log(probs)
    cost *= ans_indices_mask
    cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0)
    cost = cost.mean()

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
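# Illustration (added): the idx_steps/idx_samples trick above gathers, for
# every (step, sample) pair, the probability the decoder assigned to the gold
# context position, i.e. probs[t, s] = preds[t, ans_indices[t, s], s].
# The same fancy indexing in numpy:
import numpy as np

n_steps, n_positions, n_samples = 2, 5, 3
preds = np.random.rand(n_steps, n_positions, n_samples)
ans = np.array([[1, 4, 0], [2, 2, 3]])  # gold positions, (n_steps, n_samples)
idx_steps = np.outer(np.arange(n_steps), np.ones(n_samples, dtype='int64'))
idx_samples = np.outer(np.ones(n_steps, dtype='int64'), np.arange(n_samples))
gathered = preds[idx_steps, ans, idx_samples]
assert gathered[1, 2] == preds[1, ans[1, 2], 2]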
def __init__(self, config, vocab_size):
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    answer = tensor.imatrix('answer')
    answer_mask = tensor.imatrix('answer_mask')
    better = tensor.imatrix('better')
    better_mask = tensor.imatrix('better_mask')
    worse = tensor.imatrix('worse')
    worse_mask = tensor.imatrix('worse_mask')
    b_left = tensor.imatrix('b_left')
    b_left_mask = tensor.imatrix('b_left_mask')
    b_right = tensor.imatrix('b_right')
    b_right_mask = tensor.imatrix('b_right_mask')
    w_left = tensor.imatrix('w_left')
    w_left_mask = tensor.imatrix('w_left_mask')
    w_right = tensor.imatrix('w_right')
    w_right_mask = tensor.imatrix('w_right_mask')

    bricks = []

    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    better = better.dimshuffle(1, 0)
    better_mask = better_mask.dimshuffle(1, 0)
    worse = worse.dimshuffle(1, 0)
    worse_mask = worse_mask.dimshuffle(1, 0)
    b_left = b_left.dimshuffle(1, 0)
    b_left_mask = b_left_mask.dimshuffle(1, 0)
    b_right = b_right.dimshuffle(1, 0)
    b_right_mask = b_right_mask.dimshuffle(1, 0)
    w_left = w_left.dimshuffle(1, 0)
    w_left_mask = w_left_mask.dimshuffle(1, 0)
    w_right = w_right.dimshuffle(1, 0)
    w_right_mask = w_right_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    answer_mask = answer_mask.dimshuffle(1, 0)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    embed.weights_init = IsotropicGaussian(0.01)

    # Calculate question encoding (concatenate layer1)
    qembed = embed.apply(question)
    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size,
        question_mask.astype(theano.config.floatX),
        config.question_lstm_size, config.question_skip_connections, 'q')
    bricks = bricks + qlstms
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1)
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                  axis=1)
    qenc.name = 'qenc'

    # Candidate encoders
    candidates_hidden_list = []
    candidate_fwd_lstm_ins = Linear(input_dim=config.embed_size,
                                    output_dim=4 * config.ctx_lstm_size[0],
                                    name='candidate_fwd_lstm_in_0_0')
    candidate_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(),
                              name='candidate_fwd_lstm_0')
    candidate_bwd_lstm_ins = Linear(input_dim=config.embed_size,
                                    output_dim=4 * config.ctx_lstm_size[0],
                                    name='candidate_bwd_lstm_in_0_0')
    candidate_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(),
                              name='candidate_bwd_lstm_0')
    # Add encoding bricks for initialization
    bricks = bricks + [candidate_fwd_lstm, candidate_bwd_lstm,
                       candidate_fwd_lstm_ins, candidate_bwd_lstm_ins]

    # Compute the "better" candidate encoding
    better_embed = embed.apply(better)
    better_fwd_tmp = candidate_fwd_lstm_ins.apply(better_embed)
    better_bwd_tmp = candidate_bwd_lstm_ins.apply(better_embed)
    better_fwd_hidden, _ = candidate_fwd_lstm.apply(
        better_fwd_tmp, mask=better_mask.astype(theano.config.floatX))
    better_bwd_hidden, _ = candidate_bwd_lstm.apply(
        better_bwd_tmp[::-1],
        mask=better_mask.astype(theano.config.floatX)[::-1])
    better_hidden_list = [better_fwd_hidden, better_bwd_hidden]
    better_enc_dim = 2 * sum(config.ctx_lstm_size)
    # Concatenate the last states of the fwd and bwd LSTMs: (bs, 2*dim)
    better_enc = tensor.concatenate([h[-1, :, :] for h in better_hidden_list],
                                    axis=1)
    better_enc.name = 'better_enc'
    candidates_hidden_list = candidates_hidden_list + [better_fwd_hidden,
                                                       better_bwd_hidden]

    # Compute the "worse" candidate encoding
    worse_embed = embed.apply(worse)
    worse_fwd_tmp = candidate_fwd_lstm_ins.apply(worse_embed)
    worse_bwd_tmp = candidate_bwd_lstm_ins.apply(worse_embed)
    worse_fwd_hidden, _ = candidate_fwd_lstm.apply(
        worse_fwd_tmp, mask=worse_mask.astype(theano.config.floatX))
    worse_bwd_hidden, _ = candidate_bwd_lstm.apply(
        worse_bwd_tmp[::-1],
        mask=worse_mask.astype(theano.config.floatX)[::-1])
    worse_hidden_list = [worse_fwd_hidden, worse_bwd_hidden]
    worse_enc_dim = 2 * sum(config.ctx_lstm_size)
    worse_enc = tensor.concatenate([h[-1, :, :] for h in worse_hidden_list],
                                   axis=1)
    worse_enc.name = 'worse_enc'
    candidates_hidden_list = candidates_hidden_list + [worse_fwd_hidden,
                                                       worse_bwd_hidden]

    # Left-context encoders
    left_context_hidden_list = []
    left_context_fwd_lstm_ins = Linear(input_dim=config.embed_size,
                                       output_dim=4 * config.ctx_lstm_size[0],
                                       name='left_context_fwd_lstm_in_0_0')
    left_context_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0],
                                 activation=Tanh(),
                                 name='left_context_fwd_lstm_0')
    left_context_bwd_lstm_ins = Linear(input_dim=config.embed_size,
                                       output_dim=4 * config.ctx_lstm_size[0],
                                       name='left_context_bwd_lstm_in_0_0')
    left_context_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0],
                                 activation=Tanh(),
                                 name='left_context_bwd_lstm_0')
    bricks = bricks + [left_context_fwd_lstm, left_context_bwd_lstm,
                       left_context_fwd_lstm_ins, left_context_bwd_lstm_ins]

    # Right-context encoders
    right_context_hidden_list = []
    right_context_fwd_lstm_ins = Linear(
        input_dim=config.embed_size, output_dim=4 * config.ctx_lstm_size[0],
        name='right_context_fwd_lstm_in_0_0')
    right_context_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0],
                                  activation=Tanh(),
                                  name='right_context_fwd_lstm_0')
    right_context_bwd_lstm_ins = Linear(
        input_dim=config.embed_size, output_dim=4 * config.ctx_lstm_size[0],
        name='right_context_bwd_lstm_in_0_0')
    right_context_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0],
                                  activation=Tanh(),
                                  name='right_context_bwd_lstm_0')
    bricks = bricks + [right_context_fwd_lstm, right_context_bwd_lstm,
                       right_context_fwd_lstm_ins, right_context_bwd_lstm_ins]

    # Left-half encodings
    better_left_embed = embed.apply(b_left)
    better_left_fwd_tmp = left_context_fwd_lstm_ins.apply(better_left_embed)
    better_left_bwd_tmp = left_context_bwd_lstm_ins.apply(better_left_embed)
    better_left_fwd_hidden, _ = left_context_fwd_lstm.apply(
        better_left_fwd_tmp, mask=b_left_mask.astype(theano.config.floatX))
    better_left_bwd_hidden, _ = left_context_bwd_lstm.apply(
        better_left_bwd_tmp[::-1],
        mask=b_left_mask.astype(theano.config.floatX)[::-1])
    better_left_hidden_list = [better_left_fwd_hidden, better_left_bwd_hidden]
    better_left_enc_dim = 2 * sum(config.ctx_lstm_size)
    better_left_enc = tensor.concatenate(
        [h[-1, :, :] for h in better_left_hidden_list], axis=1)
    better_left_enc.name = 'better_left_enc'
    left_context_hidden_list = left_context_hidden_list + [
        better_left_fwd_hidden, better_left_bwd_hidden]

    worse_left_embed = embed.apply(w_left)
    worse_left_fwd_tmp = left_context_fwd_lstm_ins.apply(worse_left_embed)
    worse_left_bwd_tmp = left_context_bwd_lstm_ins.apply(worse_left_embed)
    worse_left_fwd_hidden, _ = left_context_fwd_lstm.apply(
        worse_left_fwd_tmp, mask=w_left_mask.astype(theano.config.floatX))
    worse_left_bwd_hidden, _ = left_context_bwd_lstm.apply(
        worse_left_bwd_tmp[::-1],
        mask=w_left_mask.astype(theano.config.floatX)[::-1])
    worse_left_hidden_list = [worse_left_fwd_hidden, worse_left_bwd_hidden]
    worse_left_enc_dim = 2 * sum(config.ctx_lstm_size)
    worse_left_enc = tensor.concatenate(
        [h[-1, :, :] for h in worse_left_hidden_list], axis=1)
    worse_left_enc.name = 'worse_left_enc'
    left_context_hidden_list = left_context_hidden_list + [
        worse_left_fwd_hidden, worse_left_bwd_hidden]

    # Right-half encodings
    better_right_embed = embed.apply(b_right)
    better_right_fwd_tmp = right_context_fwd_lstm_ins.apply(better_right_embed)
    better_right_bwd_tmp = right_context_bwd_lstm_ins.apply(better_right_embed)
    better_right_fwd_hidden, _ = right_context_fwd_lstm.apply(
        better_right_fwd_tmp, mask=b_right_mask.astype(theano.config.floatX))
    better_right_bwd_hidden, _ = right_context_bwd_lstm.apply(
        better_right_bwd_tmp[::-1],
        mask=b_right_mask.astype(theano.config.floatX)[::-1])
    better_right_hidden_list = [better_right_fwd_hidden,
                                better_right_bwd_hidden]
    better_right_enc_dim = 2 * sum(config.ctx_lstm_size)
    better_right_enc = tensor.concatenate(
        [h[-1, :, :] for h in better_right_hidden_list], axis=1)
    better_right_enc.name = 'better_right_enc'
    right_context_hidden_list = right_context_hidden_list + [
        better_right_fwd_hidden, better_right_bwd_hidden]

    worse_right_embed = embed.apply(w_right)
    worse_right_fwd_tmp = right_context_fwd_lstm_ins.apply(worse_right_embed)
    worse_right_bwd_tmp = right_context_bwd_lstm_ins.apply(worse_right_embed)
    worse_right_fwd_hidden, _ = right_context_fwd_lstm.apply(
        worse_right_fwd_tmp, mask=w_right_mask.astype(theano.config.floatX))
    worse_right_bwd_hidden, _ = right_context_bwd_lstm.apply(
        worse_right_bwd_tmp[::-1],
        mask=w_right_mask.astype(theano.config.floatX)[::-1])
    worse_right_hidden_list = [worse_right_fwd_hidden, worse_right_bwd_hidden]
    worse_right_enc_dim = 2 * sum(config.ctx_lstm_size)
    worse_right_enc = tensor.concatenate(
        [h[-1, :, :] for h in worse_right_hidden_list], axis=1)
    worse_right_enc.name = 'worse_right_enc'
    right_context_hidden_list = right_context_hidden_list + [
        worse_right_fwd_hidden, worse_right_bwd_hidden]

    # F1 prediction MLP; the four Linear projections each take a quarter of
    # the MLP's input (output_dim must be an integer)
    prediction_mlp = MLP(dims=config.prediction_mlp_hidden + [1],
                         activations=config.prediction_mlp_activations[1:] +
                         [Identity()],
                         name='prediction_mlp')
    prediction_qlinear = Linear(
        input_dim=qenc_dim,
        output_dim=config.prediction_mlp_hidden[0] // 4, name='preq')
    prediction_cand_linear = Linear(
        input_dim=worse_enc_dim,
        output_dim=config.prediction_mlp_hidden[0] // 4,
        use_bias=False, name='precand')
    prediction_left_half_linear = Linear(
        input_dim=better_left_enc_dim,
        output_dim=config.prediction_mlp_hidden[0] // 4,
        use_bias=False, name='preleft')
    prediction_right_half_linear = Linear(
        input_dim=better_right_enc_dim,
        output_dim=config.prediction_mlp_hidden[0] // 4,
        use_bias=False, name='preright')
    bricks += [prediction_mlp, prediction_qlinear, prediction_cand_linear,
               prediction_left_half_linear, prediction_right_half_linear]

    better_layer1 = Tanh(name='tan1').apply(tensor.concatenate(
        [prediction_cand_linear.apply(better_enc),
         prediction_qlinear.apply(qenc),
         prediction_left_half_linear.apply(better_left_enc),
         prediction_right_half_linear.apply(better_right_enc)], axis=1))
    better_layer1.name = 'better_layer1'

    worse_layer1 = Tanh(name='tan2').apply(tensor.concatenate(
        [prediction_cand_linear.apply(worse_enc),
         prediction_qlinear.apply(qenc),
         prediction_left_half_linear.apply(worse_left_enc),
         prediction_right_half_linear.apply(worse_right_enc)], axis=1))
    worse_layer1.name = 'worse_layer1'

    better_pred_weights = Tanh(name='rec1').apply(
        prediction_mlp.apply(better_layer1))  # batch_size
    worse_pred_weights = Tanh(name='rec2').apply(
        prediction_mlp.apply(worse_layer1))   # batch_size

    # Cost: max(0, -score_better + score_worse + margin)
    margin = config.margin
    conditions = tensor.lt(better_pred_weights,
                           worse_pred_weights + margin).astype(
                               theano.config.floatX)
    self.predictions = conditions
    cost = (-better_pred_weights + worse_pred_weights + margin) * conditions
    cost = cost.mean()

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + candidates_hidden_list,
                           config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
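# Worked example (added): the ranking cost above is a margin loss,
# max(0, score_worse - score_better + margin), implemented by multiplying
# with the 0/1 indicator `conditions` instead of tensor.maximum.
import numpy as np

margin = 0.1
better_scores = np.array([0.9, 0.2, 0.5])
worse_scores = np.array([0.1, 0.4, 0.45])
conditions = (better_scores < worse_scores + margin).astype('float64')
cost = (-better_scores + worse_scores + margin) * conditions
# Pair 0 is already separated by more than the margin, so it contributes 0.
assert np.allclose(cost, [0.0, 0.3, 0.05])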
def __init__(self, config, vocab_size):
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.imatrix('answer')
    answer_mask = tensor.imatrix('answer_mask')

    bricks = []

    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    answer_mask = answer_mask.dimshuffle(1, 0)

    context_bag = to_bag(context, vocab_size)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='embed')
    embed.weights_init = IsotropicGaussian(0.01)
    # embeddings_initial_value = init_embedding_table(
    #     filename='embeddings/vocab_embeddings.txt')
    # embed.weights_init = Constant(embeddings_initial_value)

    # Calculate context encoding (concatenate layer1)
    cembed = embed.apply(context)
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size,
        context_mask.astype(theano.config.floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + clstms
    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2: fwd & bwd
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'

    # Build the generator (decoder) bricks
    transition = GatedRecurrent(activation=Tanh(),
                                dim=config.generator_lstm_size,
                                name="transition")
    attention = SequenceContentAttention(
        state_names=transition.apply.states,
        attended_dim=cenc_dim,
        match_dim=config.generator_lstm_size,
        name="attention")
    readout = Readout(
        readout_dim=vocab_size,
        source_names=[transition.apply.states[0],
                      attention.take_glimpses.outputs[0]],
        emitter=MaskedSoftmaxEmitter(context_bag=context_bag, name='emitter'),
        feedback_brick=LookupFeedback(vocab_size, config.feedback_size),
        name="readout")
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  attention=attention,
                                  name="generator")

    cost = generator.cost(answer,
                          answer_mask.astype(theano.config.floatX),
                          attended=cenc,
                          attended_mask=context_mask.astype(
                              theano.config.floatX),
                          name="cost")
    self.predictions = generator.generate(
        n_steps=7, batch_size=config.batch_size,
        attended=cenc,
        attended_mask=context_mask.astype(theano.config.floatX),
        iterate=True)[1]

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, chidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # Initialize new stuff manually (change!)
    generator.weights_init = IsotropicGaussian(0.01)
    generator.biases_init = Constant(0)
    generator.push_allocation_config()
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    generator.initialize()

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
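# to_bag is not defined in these excerpts. A plausible sketch (an assumption,
# mirroring the inline bag-of-words computation used in the pointer-network
# model above): each batch column of a time-major int matrix becomes a 0/1
# vocabulary-sized indicator of the tokens it contains.
def to_bag_sketch(seq, vocab_size):
    # seq: (length, batch) int matrix -> (batch, vocab_size) 0/1 bag
    return tensor.eq(seq[:, :, None],
                     tensor.arange(vocab_size)).sum(axis=0).clip(0, 1)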
def __init__(self, config, vocab_size):
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.imatrix('answer')
    answer_mask = tensor.imatrix('answer_mask')
    ans_indices = tensor.imatrix('ans_indices')  # n_steps * n_samples
    ans_indices_mask = tensor.imatrix('ans_indices_mask')

    bricks = []

    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    answer_mask = answer_mask.dimshuffle(1, 0)
    ans_indices = ans_indices.dimshuffle(1, 0)
    ans_indices_mask = ans_indices_mask.dimshuffle(1, 0)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='embed')
    embed.weights_init = IsotropicGaussian(0.01)
    # embed.weights_init = Constant(init_embedding_table(
    #     filename='embeddings/vocab_embeddings.txt'))

    # One-directional LSTM encoding
    q_lstm_ins = Linear(input_dim=config.embed_size,
                        output_dim=4 * config.pre_lstm_size,
                        name='q_lstm_in')
    q_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(), name='q_lstm')
    c_lstm_ins = Linear(input_dim=config.embed_size,
                        output_dim=4 * config.pre_lstm_size,
                        name='c_lstm_in')
    c_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(), name='c_lstm')
    bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins]

    q_tmp = q_lstm_ins.apply(embed.apply(question))
    c_tmp = c_lstm_ins.apply(embed.apply(context))
    q_hidden, _ = q_lstm.apply(
        q_tmp, mask=question_mask.astype(theano.config.floatX))  # lq, bs, dim
    c_hidden, _ = c_lstm.apply(
        c_tmp, mask=context_mask.astype(theano.config.floatX))   # lc, bs, dim

    # Attention mechanism: bilinear over the question
    attention_question = Linear(input_dim=config.pre_lstm_size,
                                output_dim=config.pre_lstm_size,
                                name='att_question')
    bricks += [attention_question]
    att_weights_question = q_hidden[None, :, :, :] * attention_question.apply(
        c_hidden.reshape((c_hidden.shape[0] * c_hidden.shape[1],
                          c_hidden.shape[2]))).reshape(
            (c_hidden.shape[0], c_hidden.shape[1],
             c_hidden.shape[2]))[:, None, :, :]       # --> lc,lq,bs,dim
    att_weights_question = att_weights_question.sum(
        axis=3)                        # sum over the hidden dim --> lc,lq,bs
    att_weights_question = att_weights_question.dimshuffle(0, 2, 1)  # lc,bs,lq
    att_weights_question = att_weights_question.reshape(
        (att_weights_question.shape[0] * att_weights_question.shape[1],
         att_weights_question.shape[2]))              # --> lc*bs,lq
    att_weights_question = tensor.nnet.softmax(
        att_weights_question)  # softmax over the question length
    att_weights_question = att_weights_question.reshape(
        (c_hidden.shape[0], q_hidden.shape[1],
         q_hidden.shape[0]))                          # --> lc,bs,lq
    att_weights_question = att_weights_question.dimshuffle(0, 2, 1)  # lc,lq,bs

    question_context_attention = att_weights_question.dimshuffle(2, 1, 0)
    question_context_attention.name = "question_context_attention"
    self.analyse_vars = [question_context_attention]

    attended_question = tensor.sum(
        q_hidden[None, :, :, :] * att_weights_question[:, :, :, None],
        axis=1)               # sum over the question length --> lc,bs,dim
    attended_question.name = 'attended_question'

    # Match LSTM
    cqembed = tensor.concatenate([c_hidden, attended_question], axis=2)
    mlstms, mhidden_list = make_bidir_lstm_stack(
        cqembed, 2 * config.pre_lstm_size,
        context_mask.astype(theano.config.floatX),
        config.match_lstm_size, config.match_skip_connections, 'match')
    bricks = bricks + mlstms
    if config.match_skip_connections:
        menc_dim = 2 * sum(config.match_lstm_size)
        menc = tensor.concatenate(mhidden_list, axis=2)
    else:
        menc_dim = 2 * config.match_lstm_size[-1]
        menc = tensor.concatenate(mhidden_list[-2:], axis=2)
    menc.name = 'menc'

    # Pointer-network decoder LSTM and attention parameters
    params = init_params(data_dim=config.decoder_data_dim,
                         lstm_dim=config.decoder_lstm_output_dim)
    tparams = init_tparams(params)
    add_role(tparams['lstm_de_W'], WEIGHT)
    add_role(tparams['lstm_de_U'], WEIGHT)
    add_role(tparams['lstm_de_b'], BIAS)
    add_role(tparams['ptr_b1'], BIAS)
    add_role(tparams['ptr_b2'], BIAS)
    add_role(tparams['ptr_v'], WEIGHT)
    add_role(tparams['ptr_W1'], WEIGHT)
    add_role(tparams['ptr_W2'], WEIGHT)
    self.theano_params = tparams.values()

    # n_steps = length, n_samples = batch_size
    n_steps = ans_indices.shape[0]
    n_samples = ans_indices.shape[1]
    preds, generations = ptr_network(
        tparams, cqembed, context_mask.astype(theano.config.floatX),
        ans_indices, ans_indices_mask.astype(theano.config.floatX),
        config.decoder_lstm_output_dim, menc)
    self.generations = generations

    idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'),
                             tensor.ones((n_samples,), dtype='int64'))
    idx_samples = tensor.outer(tensor.ones((n_steps,), dtype='int64'),
                               tensor.arange(n_samples, dtype='int64'))
    probs = preds[idx_steps, ans_indices, idx_samples]
    # Guard against log(0); float16 needs a larger epsilon
    off = 1e-8
    if probs.dtype == 'float16':
        off = 1e-6
    probs += off

    cost = -tensor.log(probs)
    cost *= ans_indices_mask
    cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0)
    cost = cost.mean()

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, mhidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
def _embed(self, sample_num, dim, name, *args, **kwargs):
    embed = LookupTable(sample_num, dim, name=name)
    embed.weights_init = IsotropicGaussian(std=1 / numpy.sqrt(dim))
    embed.initialize()
    return embed
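# Usage note (added): inside the owning class one might write, with
# hypothetical sizes,
#     word_embed = self._embed(sample_num=10000, dim=64, name='word_embed')
#     word_vectors = word_embed.apply(tensor.imatrix('words'))
# Drawing weights with std 1/sqrt(dim) keeps the expected L2 norm of each
# embedding vector near 1 regardless of the embedding dimension.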
def __init__(self, config, vocab_size):
    unsorted = tensor.imatrix('unsorted')
    unsorted_mask = tensor.imatrix('unsorted_mask')
    answer = tensor.imatrix('answer')
    answer_mask = tensor.imatrix('answer_mask')

    bricks = []

    unsorted = unsorted.dimshuffle(1, 0)
    unsorted_mask = unsorted_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    answer_mask = answer_mask.dimshuffle(1, 0)

    # Embed the unsorted sequence
    embed = LookupTable(vocab_size, config.embed_size, name='embed')
    embed.weights_init = IsotropicGaussian(0.01)

    # make_bidir_lstm_stack(seq, seq_dim, mask, sizes, skip=True, name='')
    unsorted_embed = embed.apply(unsorted)
    unsorted_lstms, unsorted_hidden_list = make_bidir_lstm_stack(
        unsorted_embed, config.embed_size,
        unsorted_mask.astype(theano.config.floatX),
        config.lstm_size, config.match_skip_connections, 'u')  # lu,bs,lstm_dim
    bricks = bricks + unsorted_lstms

    unsorted_enc_dim = 2 * sum(config.lstm_size)
    # Concatenate fwd & bwd LSTM hidden states
    unsorted_enc = tensor.concatenate(unsorted_hidden_list, axis=2)
    unsorted_enc.name = 'unsorted_enc'

    # Pointer-network decoder LSTM and attention parameters
    params = init_params(data_dim=config.decoder_data_dim,
                         lstm_dim=config.decoder_lstm_output_dim)
    tparams = init_tparams(params)
    add_role(tparams['lstm_de_W'], WEIGHT)
    add_role(tparams['lstm_de_U'], WEIGHT)
    add_role(tparams['lstm_de_b'], BIAS)
    add_role(tparams['ptr_b1'], BIAS)
    add_role(tparams['ptr_b2'], BIAS)
    add_role(tparams['ptr_v'], WEIGHT)
    add_role(tparams['ptr_W1'], WEIGHT)
    add_role(tparams['ptr_W2'], WEIGHT)
    self.theano_params = tparams.values()

    # n_steps = length, n_samples = batch_size
    n_steps = answer.shape[0]
    n_samples = answer.shape[1]
    preds, generations = ptr_network(
        tparams, unsorted_embed,
        unsorted_mask.astype(theano.config.floatX), answer,
        answer_mask.astype(theano.config.floatX),
        config.decoder_lstm_output_dim, unsorted_enc)
    self.generations = generations

    idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'),
                             tensor.ones((n_samples,), dtype='int64'))
    idx_samples = tensor.outer(tensor.ones((n_steps,), dtype='int64'),
                               tensor.arange(n_samples, dtype='int64'))
    probs = preds[idx_steps, answer, idx_samples]
    # Guard against log(0); float16 needs a larger epsilon
    off = 1e-8
    if probs.dtype == 'float16':
        off = 1e-6
    probs += off

    cost = -tensor.log(probs)
    cost *= answer_mask
    cost = cost.sum(axis=0) / answer_mask.sum(axis=0)
    cost = cost.mean()

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, unsorted_hidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
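# init_params/init_tparams are not shown in these excerpts. A sketch of the
# usual Theano pattern they presumably follow; the parameter names are taken
# from the add_role calls above, while the shapes and initialization scheme
# are assumptions.
from collections import OrderedDict

def init_params_sketch(data_dim, lstm_dim):
    def ortho(n, m):
        # Orthogonal-ish initialization via SVD of a Gaussian matrix
        W = numpy.random.randn(n, m)
        u, _, v = numpy.linalg.svd(W, full_matrices=False)
        return (u if n >= m else v).astype(theano.config.floatX)

    params = OrderedDict()
    # Decoder LSTM: input, recurrent and bias parameters (4 gates stacked)
    params['lstm_de_W'] = ortho(data_dim, 4 * lstm_dim)
    params['lstm_de_U'] = ortho(lstm_dim, 4 * lstm_dim)
    params['lstm_de_b'] = numpy.zeros((4 * lstm_dim,),
                                      dtype=theano.config.floatX)
    # Pointer attention: v^T tanh(W1 e_j + W2 d_i)
    params['ptr_W1'] = ortho(data_dim, lstm_dim)
    params['ptr_W2'] = ortho(lstm_dim, lstm_dim)
    params['ptr_v'] = (0.01 * numpy.random.randn(lstm_dim)).astype(
        theano.config.floatX)
    return params

def init_tparams_sketch(params):
    # Wrap every numpy array in a named Theano shared variable
    return OrderedDict((name, theano.shared(value, name=name))
                       for name, value in params.items())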
def __init__(self, config, vocab_size):
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.imatrix('answer')
    answer_mask = tensor.imatrix('answer_mask')

    bricks = []

    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    answer_mask = answer_mask.dimshuffle(1, 0)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    embed.weights_init = IsotropicGaussian(0.01)

    # Calculate question encoding (concatenate layer1)
    qembed = embed.apply(question)
    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size,
        question_mask.astype(theano.config.floatX),
        config.question_lstm_size, config.question_skip_connections, 'q')
    bricks = bricks + qlstms
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1)
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                  axis=1)
    qenc.name = 'qenc'

    # Calculate context encoding (concatenate layer1)
    cembed = embed.apply(context)
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size,
        context_mask.astype(theano.config.floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + clstms
    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2: fwd & bwd
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'

    # Attention mechanism: MLP over the forward context states
    attention_mlp_fwd = MLP(
        dims=config.attention_mlp_hidden + [1],
        activations=config.attention_mlp_activations[1:] + [Identity()],
        name='attention_mlp_fwd')
    attention_qlinear_fwd = Linear(input_dim=qenc_dim,
                                   output_dim=config.attention_mlp_hidden[0],
                                   name='attq_fwd')
    attention_clinear_fwd = Linear(input_dim=cenc_dim // 2,
                                   output_dim=config.attention_mlp_hidden[0],
                                   use_bias=False, name='attc_fwd')
    bricks += [attention_mlp_fwd, attention_qlinear_fwd, attention_clinear_fwd]
    layer1_fwd = Tanh(name='tanh_fwd')
    layer1_fwd = layer1_fwd.apply(
        attention_clinear_fwd.apply(
            cenc[:, :, :cenc_dim // 2].reshape(
                (cenc.shape[0] * cenc.shape[1],
                 cenc.shape[2] // 2))).reshape(
                     (cenc.shape[0], cenc.shape[1],
                      config.attention_mlp_hidden[0])) +
        attention_qlinear_fwd.apply(qenc)[None, :, :])
    att_weights_fwd = attention_mlp_fwd.apply(
        layer1_fwd.reshape((layer1_fwd.shape[0] * layer1_fwd.shape[1],
                            layer1_fwd.shape[2])))
    att_weights_fwd = att_weights_fwd.reshape(
        (layer1_fwd.shape[0], layer1_fwd.shape[1]))
    att_weights_fwd = tensor.nnet.softmax(att_weights_fwd.T)
    att_weights_fwd.name = 'att_weights_fwd'
    attended_fwd = tensor.sum(cenc[:, :, :cenc_dim // 2] *
                              att_weights_fwd.T[:, :, None], axis=0)
    attended_fwd.name = 'attended_fwd'

    # Attention mechanism: MLP over the backward context states
    attention_mlp_bwd = MLP(
        dims=config.attention_mlp_hidden + [1],
        activations=config.attention_mlp_activations[1:] + [Identity()],
        name='attention_mlp_bwd')
    attention_qlinear_bwd = Linear(input_dim=qenc_dim,
                                   output_dim=config.attention_mlp_hidden[0],
                                   name='attq_bwd')
    attention_clinear_bwd = Linear(input_dim=cenc_dim // 2,
                                   output_dim=config.attention_mlp_hidden[0],
                                   use_bias=False, name='attc_bwd')
    bricks += [attention_mlp_bwd, attention_qlinear_bwd, attention_clinear_bwd]
    layer1_bwd = Tanh(name='tanh_bwd')
    layer1_bwd = layer1_bwd.apply(
        attention_clinear_bwd.apply(
            cenc[:, :, cenc_dim // 2:].reshape(
                (cenc.shape[0] * cenc.shape[1],
                 cenc.shape[2] // 2))).reshape(
                     (cenc.shape[0], cenc.shape[1],
                      config.attention_mlp_hidden[0])) +
        attention_qlinear_bwd.apply(qenc)[None, :, :])
    att_weights_bwd = attention_mlp_bwd.apply(
        layer1_bwd.reshape((layer1_bwd.shape[0] * layer1_bwd.shape[1],
                            layer1_bwd.shape[2])))
    att_weights_bwd = att_weights_bwd.reshape(
        (layer1_bwd.shape[0], layer1_bwd.shape[1]))
    att_weights_bwd = tensor.nnet.softmax(att_weights_bwd.T)
    att_weights_bwd.name = 'att_weights_bwd'
    attended_bwd = tensor.sum(cenc[:, :, cenc_dim // 2:] *
                              att_weights_bwd.T[:, :, None], axis=0)
    attended_bwd.name = 'attended_bwd'

    ctx_question = tensor.concatenate([attended_fwd, attended_bwd, qenc],
                                      axis=1)
    ctx_question.name = 'ctx_question'

    answer_bag = to_bag(answer, vocab_size)
    answer_bag = tensor.set_subtensor(answer_bag[:, 0:3], 0)
    relevant_items = answer_bag.sum(axis=1, dtype=theano.config.floatX)

    def createSequences(j, index, c_enc, c_enc_dim, c_context, c_window_size):
        sequence = tensor.concatenate(
            [c_context[j:j + index, :],
             tensor.zeros((c_window_size - index, c_context.shape[1]))],
            axis=0)
        enc = tensor.concatenate(
            [c_enc[j + index - 1, :, :],
             c_enc[j, :, :-1],
             tensor.tile(c_window_size[None, None], (c_enc.shape[1], 1))],
            axis=1)
        return enc, sequence

    def createTargetValues(j, index, c_context, c_vocab_size):
        sequence_bag = to_bag(c_context[j:j + index, :], c_vocab_size)
        sequence_bag = tensor.set_subtensor(sequence_bag[:, 0:3], 0)
        selected_items = sequence_bag.sum(axis=1, dtype=theano.config.floatX)
        tp = (sequence_bag * answer_bag).sum(axis=1,
                                             dtype=theano.config.floatX)
        precision = tp / (selected_items + 0.00001)
        recall = tp / (relevant_items + 0.00001)
        # precision = tensor.set_subtensor(
        #     precision[tensor.isnan(precision)], 0.0)
        # recall = tensor.set_subtensor(recall[tensor.isnan(recall)], 1.0)
        macroF1 = (2 * (precision * recall)) / (precision + recall + 0.00001)
        # macroF1 = tensor.set_subtensor(macroF1[tensor.isnan(macroF1)], 0.0)
        return macroF1

    window_size = 3
    senc = []
    sequences = []
    pred_targets = []
    for i in range(1, window_size + 1):
        (all_enc, all_sequence), _ = theano.scan(
            fn=createSequences,
            sequences=tensor.arange(cenc.shape[0] - i + 1),
            non_sequences=[i, cenc, cenc_dim, context, window_size])
        all_macroF1, _ = theano.scan(
            fn=createTargetValues,
            sequences=tensor.arange(cenc.shape[0] - i + 1),
            non_sequences=[i, context, vocab_size])
        senc.append(all_enc)
        sequences.append(all_sequence)
        pred_targets.append(all_macroF1)

    senc = tensor.concatenate(senc, axis=0)
    sequences = tensor.concatenate(sequences, axis=0)
    pred_targets = tensor.concatenate(pred_targets, axis=0)

    # F1 prediction: bilinear
    prediction_linear = Linear(input_dim=2 * cenc_dim,
                               output_dim=cenc_dim + qenc_dim,
                               name='pred_linear')
    bricks += [prediction_linear]
    pred_weights = ctx_question[None, :, :] * prediction_linear.apply(
        senc.reshape((senc.shape[0] * senc.shape[1],
                      senc.shape[2]))).reshape(
                          # reshape back using the Linear's output dim
                          (senc.shape[0], senc.shape[1],
                           cenc_dim + qenc_dim))
    pred_weights = pred_weights.sum(axis=2)
    pred_weights = tensor.nnet.sigmoid(pred_weights.T).T
    pred_weights.name = 'pred_weights'

    pred_targets = pred_targets / (pred_targets.sum(axis=0) + 0.00001)
    pred_weights = pred_weights / (pred_weights.sum(axis=0) + 0.00001)

    cost = tensor.nnet.binary_crossentropy(pred_weights, pred_targets).mean()
    self.predictions = sequences[pred_weights.argmax(axis=0), :,
                                 tensor.arange(sequences.shape[2])].T

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
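# Worked example (added): how createTargetValues scores a candidate window
# against the answer bag-of-words, mirrored here with plain numpy indicator
# vectors over a toy vocabulary of five tokens.
import numpy as np

answer_bag = np.array([0, 1, 1, 0, 1], dtype='float64')    # gold tokens
sequence_bag = np.array([0, 1, 0, 0, 1], dtype='float64')  # window tokens
tp = (sequence_bag * answer_bag).sum()          # 2 shared tokens
precision = tp / (sequence_bag.sum() + 1e-5)    # ~= 2/2
recall = tp / (answer_bag.sum() + 1e-5)         # ~= 2/3
f1 = 2 * precision * recall / (precision + recall + 1e-5)  # ~= 0.8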
readout = Readout(readout_dim=readout_size,
                  source_names=source_names +
                  [attention.take_glimpses.outputs[0]],
                  emitter=emitter,
                  name="readout")

generator = SequenceGenerator(readout=readout,
                              attention=attention,
                              transition=transition,
                              name="generator")
generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.001)
generator.push_initialization_config()

lookup.weights_init = IsotropicGaussian(0.01)
lookup.biases_init = Constant(0.001)
lookup.initialize()

# generator.transition.weights_init = initialization.Identity(0.98)
# generator.transition.biases_init = IsotropicGaussian(0.01, 0.9)
generator.transition.push_initialization_config()
generator.initialize()

cost_matrix = generator.cost_matrix(x, x_mask,
                                    attended=embed,
                                    attended_mask=context_mask)
cost = cost_matrix.sum(axis=0).mean()
cost.name = "nll"
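# Training sketch (added, not from the original source): a typical Blocks
# setup that would consume the `cost` defined above. GradientDescent and
# Adam are standard blocks.algorithms classes; keyword names follow recent
# Blocks versions, and the surrounding variables are assumed in scope.
from blocks.algorithms import GradientDescent, Adam
from blocks.graph import ComputationGraph

cg = ComputationGraph([cost])
algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=Adam())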
def _build_lookup(self, name, word_num, dim=1, *args, **kwargs):
    lookup = LookupTable(length=word_num, dim=dim, name=name)
    lookup.weights_init = Constant(1. / word_num ** 0.25)
    lookup.initialize()
    return lookup
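# Usage note (added): a constant-initialized lookup, e.g. a per-word scalar
# bias table. With hypothetical sizes,
#     id_bias = self._build_lookup('id_bias', word_num=50000, dim=1)
# every entry starts at the same value 1 / 50000**0.25 ~= 0.067, leaving it
# to training to differentiate the words.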
def __init__(self, config, vocab_size):
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.imatrix('answer')
    answer_mask = tensor.imatrix('answer_mask')
    ans_indices = tensor.imatrix('ans_indices')  # n_steps * n_samples
    ans_indices_mask = tensor.imatrix('ans_indices_mask')

    bricks = []

    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    answer_mask = answer_mask.dimshuffle(1, 0)
    ans_indices = ans_indices.dimshuffle(1, 0)
    ans_indices_mask = ans_indices_mask.dimshuffle(1, 0)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='embed')
    # embed.weights_init = IsotropicGaussian(0.01)
    embed.weights_init = Constant(
        init_embedding_table(filename='embeddings/vocab_embeddings.txt'))

    # One-directional LSTM encoding
    q_lstm_ins = Linear(input_dim=config.embed_size,
                        output_dim=4 * config.pre_lstm_size,
                        name='q_lstm_in')
    q_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(), name='q_lstm')
    c_lstm_ins = Linear(input_dim=config.embed_size,
                        output_dim=4 * config.pre_lstm_size,
                        name='c_lstm_in')
    c_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(), name='c_lstm')
    bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins]

    q_tmp = q_lstm_ins.apply(embed.apply(question))
    c_tmp = c_lstm_ins.apply(embed.apply(context))
    q_hidden, _ = q_lstm.apply(
        q_tmp, mask=question_mask.astype(theano.config.floatX))  # lq, bs, dim
    c_hidden, _ = c_lstm.apply(
        c_tmp, mask=context_mask.astype(theano.config.floatX))   # lc, bs, dim

    # Attention mechanism: bilinear over the question
    attention_question = Linear(input_dim=config.pre_lstm_size,
                                output_dim=config.pre_lstm_size,
                                name='att_question')
    bricks += [attention_question]
    att_weights_question = q_hidden[None, :, :, :] * attention_question.apply(
        c_hidden.reshape((c_hidden.shape[0] * c_hidden.shape[1],
                          c_hidden.shape[2]))).reshape(
            (c_hidden.shape[0], c_hidden.shape[1],
             c_hidden.shape[2]))[:, None, :, :]       # --> lc,lq,bs,dim
    att_weights_question = att_weights_question.sum(
        axis=3)                        # sum over the hidden dim --> lc,lq,bs
    att_weights_question = att_weights_question.dimshuffle(0, 2, 1)  # lc,bs,lq
    att_weights_question = att_weights_question.reshape(
        (att_weights_question.shape[0] * att_weights_question.shape[1],
         att_weights_question.shape[2]))              # --> lc*bs,lq
    att_weights_question = tensor.nnet.softmax(
        att_weights_question)  # softmax over the question length
    att_weights_question = att_weights_question.reshape(
        (c_hidden.shape[0], q_hidden.shape[1],
         q_hidden.shape[0]))                          # --> lc,bs,lq
    att_weights_question = att_weights_question.dimshuffle(0, 2, 1)  # lc,lq,bs

    attended_question = tensor.sum(
        q_hidden[None, :, :, :] * att_weights_question[:, :, :, None],
        axis=1)               # sum over the question length --> lc,bs,dim
    attended_question.name = 'attended_question'

    # Match LSTM
    cqembed = tensor.concatenate([c_hidden, attended_question], axis=2)
    mlstms, mhidden_list = make_bidir_lstm_stack(
        cqembed, 2 * config.pre_lstm_size,
        context_mask.astype(theano.config.floatX),
        config.match_lstm_size, config.match_skip_connections, 'match')
    bricks = bricks + mlstms
    if config.match_skip_connections:
        menc_dim = 2 * sum(config.match_lstm_size)
        menc = tensor.concatenate(mhidden_list, axis=2)
    else:
        menc_dim = 2 * config.match_lstm_size[-1]
        menc = tensor.concatenate(mhidden_list[-2:], axis=2)
    menc.name = 'menc'

    # Attention mechanism: MLP for the answer start
    attention_mlp_start = MLP(
        dims=config.attention_mlp_hidden + [1],
        activations=config.attention_mlp_activations[1:] + [Identity()],
        name='attention_mlp_start')
    attention_clinear_start = Linear(input_dim=menc_dim,
                                     output_dim=config.attention_mlp_hidden[0],
                                     name='attm_start')  # Wym
    bricks += [attention_mlp_start, attention_clinear_start]
    layer1_start = Tanh(name='layer1_start')
    layer1_start = layer1_start.apply(
        attention_clinear_start.apply(
            menc.reshape((menc.shape[0] * menc.shape[1],
                          menc.shape[2]))).reshape(
                              (menc.shape[0], menc.shape[1],
                               config.attention_mlp_hidden[0])))
    att_weights_start = attention_mlp_start.apply(
        layer1_start.reshape((layer1_start.shape[0] * layer1_start.shape[1],
                              layer1_start.shape[2])))
    att_weights_start = att_weights_start.reshape(
        (layer1_start.shape[0], layer1_start.shape[1]))
    att_weights_start = tensor.nnet.softmax(att_weights_start.T).T
    attended = tensor.sum(menc * att_weights_start[:, :, None], axis=0)
    attended.name = 'attended'

    # Attention mechanism: MLP for the answer end
    attention_mlp_end = MLP(
        dims=config.attention_mlp_hidden + [1],
        activations=config.attention_mlp_activations[1:] + [Identity()],
        name='attention_mlp_end')
    attention_qlinear_end = Linear(input_dim=menc_dim,
                                   output_dim=config.attention_mlp_hidden[0],
                                   name='atts_end')  # Wum
    attention_clinear_end = Linear(input_dim=menc_dim,
                                   output_dim=config.attention_mlp_hidden[0],
                                   use_bias=False, name='attm_end')  # Wym
    bricks += [attention_mlp_end, attention_qlinear_end, attention_clinear_end]
    layer1_end = Tanh(name='layer1_end')
    layer1_end = layer1_end.apply(
        attention_clinear_end.apply(
            menc.reshape((menc.shape[0] * menc.shape[1],
                          menc.shape[2]))).reshape(
                              (menc.shape[0], menc.shape[1],
                               config.attention_mlp_hidden[0])) +
        attention_qlinear_end.apply(attended)[None, :, :])
    att_weights_end = attention_mlp_end.apply(
        layer1_end.reshape((layer1_end.shape[0] * layer1_end.shape[1],
                            layer1_end.shape[2])))
    att_weights_end = att_weights_end.reshape(
        (layer1_end.shape[0], layer1_end.shape[1]))
    att_weights_end = tensor.nnet.softmax(att_weights_end.T).T

    # Cumulative sums over time via triangular 0/1 matrices: the start
    # weights accumulate from the left, the end weights from the right
    att_weights_start = tensor.dot(
        tensor.le(
            tensor.tile(tensor.arange(context.shape[0])[None, :],
                        (context.shape[0], 1)),
            tensor.tile(tensor.arange(context.shape[0])[:, None],
                        (1, context.shape[0]))),
        att_weights_start)
    att_weights_end = tensor.dot(
        tensor.ge(
            tensor.tile(tensor.arange(context.shape[0])[None, :],
                        (context.shape[0], 1)),
            tensor.tile(tensor.arange(context.shape[0])[:, None],
                        (1, context.shape[0]))),
        att_weights_end)

    # Combine attention from the left and right
    att_weights = att_weights_start * att_weights_end
    # att_weights = tensor.minimum(att_weights_start, att_weights_end)

    att_target = tensor.zeros((ans_indices.shape[1], context.shape[0]),
                              dtype=theano.config.floatX)
    att_target = tensor.set_subtensor(
        att_target[tensor.arange(ans_indices.shape[1]), ans_indices], 1)
    att_target = att_target.dimshuffle(1, 0)
    # att_target = tensor.eq(
    #     tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)),
    #     tensor.tile(context[:, None, :],
    #                 (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1)

    self.predictions = tensor.gt(att_weights, 0.25) * context

    att_target = att_target / (att_target.sum(axis=0) + 0.00001)
    # att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001)

    cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) *
            context_mask).sum() / context_mask.sum()

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, mhidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'
    att_weights_start.name = 'att_weights_start'
    att_weights_end.name = 'att_weights_end'
    att_weights.name = 'att_weights'
    att_target.name = 'att_target'
    self.predictions.name = 'pred'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]
    self.analyse_vars = [cost, self.predictions, att_weights_start,
                         att_weights_end, att_weights, att_target]

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
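# Illustration (added): the set_subtensor call above builds a 0/1 target of
# shape (context_length, batch) marking the answer-span positions. The same
# broadcasting in numpy, for two samples and a context of length five:
import numpy as np

ans_indices = np.array([[1, 3],
                        [2, 4]])      # (n_steps, n_samples): span positions
att_target = np.zeros((2, 5))         # (n_samples, ctx_len)
att_target[np.arange(2), ans_indices] = 1  # broadcasts over the steps axis
att_target = att_target.T             # -> (ctx_len, n_samples)
# Sample 0 now has ones at positions {1, 2}; sample 1 at {3, 4}.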