def get_presoft(h, args):
    """Apply the linear output layer to ``h`` and return the result.

    Parameters
    ----------
    h : Theano variable holding the recurrent states fed to the output
        layer (its last dimension must match the input size chosen below).
    args : options object; the fields read here are ``dataset``,
        ``skip_connections``, ``skip_output``, ``rnn_type``, ``layers`` and
        ``state_dim``.

    Returns
    -------
    The output-layer activation, named ``'presoft'``.  A ``Tanh`` is applied
    on top of the linear layer when ``has_indices(args.dataset)`` is false.
    """
    output_size = get_output_size(args.dataset)

    # The output layer sees the states of every layer when skip connections
    # or skip outputs are enabled, or for the "clockwork" / "soft" RNN types;
    # otherwise it only sees a single layer's state.
    use_all_states = (args.skip_connections or args.skip_output or
                      args.rnn_type in ["clockwork", "soft"])

    # Branch on truthiness instead of multiplying by the flag: the arithmetic
    # form silently produced a wrong size for truthy non-bool option values.
    if use_all_states:
        input_dim = args.layers * args.state_dim
    else:
        input_dim = args.state_dim

    output_layer = Linear(input_dim=input_dim,
                          output_dim=output_size,
                          name="output_layer")
    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    presoft = output_layer.apply(h)
    if not has_indices(args.dataset):
        presoft = Tanh().apply(presoft)
    presoft.name = 'presoft'
    return presoft
def get_presoft(h, args):
    """Build the ``output_layer`` brick, apply it to ``h`` and return it.

    Parameters
    ----------
    h : Theano variable with the recurrent states to project.
    args : options object providing ``dataset``, ``skip_connections``,
        ``skip_output``, ``rnn_type``, ``layers`` and ``state_dim``.

    Returns
    -------
    The projected variable, named ``'presoft'``; it is additionally passed
    through ``Tanh`` when ``has_indices(args.dataset)`` is false.
    """
    output_size = get_output_size(args.dataset)

    # All layers' states are used with skip connections / skip outputs and
    # for the "clockwork" and "soft" RNN types.
    use_all_states = (args.skip_connections or args.skip_output or
                      args.rnn_type in ["clockwork", "soft"])

    # Select the width by truthiness; multiplying by the flag (as before)
    # is only correct when the flag is exactly 0/1/True/False.
    input_dim = (args.layers * args.state_dim if use_all_states
                 else args.state_dim)

    output_layer = Linear(input_dim=input_dim,
                          output_dim=output_size,
                          name="output_layer")
    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    presoft = output_layer.apply(h)
    if not has_indices(args.dataset):
        presoft = Tanh().apply(presoft)
    presoft.name = 'presoft'
    return presoft
def __init__(self, config, vocab_size, id_to_vocab, logger):
    """Build the attention-based reader graph and initialise its bricks.

    ``config`` supplies the layer sizes, skip-connection flags, MLP
    definitions, ``n_entities``, noise/dropout rates and initialisation
    schemes.  ``id_to_vocab`` and ``logger`` are accepted but not used here.

    Sets ``self.sgd_cost``, ``self.monitor_vars`` and
    ``self.monitor_vars_valid``.
    """
    float_x = theano.config.floatX

    # Symbolic inputs.
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.ivector('answer')
    candidates = tensor.imatrix('candidates')
    candidates_mask = tensor.imatrix('candidates_mask')

    # Swap the two axes of the sequence inputs and of their masks
    # (presumably batch-major -> time-major; confirm against the data stream).
    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)

    # A single lookup table embeds both question and context tokens.
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    qembed = embed.apply(question)
    cembed = embed.apply(context)

    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size, question_mask.astype(float_x),
        config.question_lstm_size, config.question_skip_connections, 'q')
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size, context_mask.astype(float_x),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = [embed] + qlstms + clstms

    # Question encoding: last time step of the selected hidden sequences
    # (all of them with skip connections, otherwise the last two).
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        q_states = qhidden_list
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        q_states = qhidden_list[-2:]
    qenc = tensor.concatenate([h[-1, :, :] for h in q_states], axis=1)
    qenc.name = 'qenc'

    # Context encoding: the selected hidden sequences joined on the last axis.
    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)
        c_states = chidden_list
    else:
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        c_states = chidden_list[-2:]
    cenc = tensor.concatenate(c_states, axis=2)
    cenc.name = 'cenc'

    # Attention: tanh(W_c * cenc + W_q * qenc) followed by an MLP with a
    # single linear output per (step, example).
    att_hidden = config.attention_mlp_hidden
    attention_mlp = MLP(
        dims=att_hidden + [1],
        activations=config.attention_mlp_activations[1:] + [Identity()],
        name='attention_mlp')
    attention_qlinear = Linear(input_dim=qenc_dim,
                               output_dim=att_hidden[0],
                               name='attq')
    attention_clinear = Linear(input_dim=cenc_dim,
                               output_dim=att_hidden[0],
                               use_bias=False,
                               name='attc')
    bricks += [attention_mlp, attention_qlinear, attention_clinear]

    # Linear bricks work on 2-D input, so the first two axes of cenc are
    # merged for the projection and split again afterwards.
    cenc_flat = cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2]))
    ctx_proj = attention_clinear.apply(cenc_flat).reshape(
        (cenc.shape[0], cenc.shape[1], att_hidden[0]))
    q_proj = attention_qlinear.apply(qenc)[None, :, :]
    layer1 = Tanh().apply(ctx_proj + q_proj)
    layer1.name = 'layer1'

    layer1_flat = layer1.reshape(
        (layer1.shape[0] * layer1.shape[1], layer1.shape[2]))
    att_weights = attention_mlp.apply(layer1_flat)
    att_weights.name = 'att_weights_0'
    att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
    att_weights.name = 'att_weights'

    # Softmax is taken over the first axis of att_weights (hence the double
    # transpose); the weighted context states are then summed over that axis.
    att_norm = tensor.nnet.softmax(att_weights.T).T
    attended = tensor.sum(cenc * att_norm[:, :, None], axis=0)
    attended.name = 'attended'

    # Output MLP over [attended context, question encoding].
    out_mlp = MLP(
        dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden
        + [config.n_entities],
        activations=config.out_mlp_activations + [Identity()],
        name='out_mlp')
    bricks += [out_mlp]
    probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
    probs.name = 'probs'

    # Entities that are not listed among an example's (unmasked) candidates
    # get their score replaced by -1000.
    entity_ids = tensor.arange(config.n_entities, dtype='int32')
    valid_candidates = tensor.switch(candidates_mask, candidates,
                                     -tensor.ones_like(candidates))
    is_candidate = tensor.eq(entity_ids[None, None, :],
                             valid_candidates[:, :, None]).sum(axis=1)
    probs = tensor.switch(is_candidate, probs,
                          -1000 * tensor.ones_like(probs))

    # Prediction, cost and error rate.
    pred = probs.argmax(axis=1)
    cost = Softmax().categorical_cross_entropy(answer, probs).mean()
    error_rate = tensor.neq(answer, pred).mean()

    # Regularised copy of the graph: weight noise and/or dropout on the
    # LSTM hidden states.
    cg = ComputationGraph([cost, error_rate])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    cost_reg, error_rate_reg = cg.outputs

    # Regularised and plain variables deliberately share the same names.
    cost_reg.name = 'cost'
    cost.name = 'cost'
    error_rate_reg.name = 'error_rate'
    error_rate.name = 'error_rate'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg], [error_rate_reg]]
    self.monitor_vars_valid = [[cost], [error_rate]]

    # Initialise every brick with the schemes from the configuration.
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
def __init__(self, config, vocab_size):
    """Build the graph that scores each context position as answer / not.

    A question encoding and a per-step context encoding are combined by an
    attention MLP; a sigmoid of its output is trained with binary
    cross-entropy against a 0/1 target that marks context positions whose
    token also occurs in ``answer``.

    ``config`` supplies ``embed_size``, the question/context LSTM sizes and
    skip-connection flags, the attention MLP definition, ``w_noise``,
    ``dropout`` and the initialisation schemes.

    Sets ``self.predictions``, ``self.sgd_cost``, ``self.monitor_vars`` and
    ``self.monitor_vars_valid``.
    """
    float_x = theano.config.floatX

    # Symbolic inputs, with their two axes swapped
    # (presumably batch-major -> time-major; confirm against the data stream).
    question = tensor.imatrix('question').dimshuffle(1, 0)
    question_mask = tensor.imatrix('question_mask').dimshuffle(1, 0)
    context = tensor.imatrix('context').dimshuffle(1, 0)
    context_mask = tensor.imatrix('context_mask').dimshuffle(1, 0)
    answer = tensor.imatrix('answer').dimshuffle(1, 0)

    # Shared embedding for question and context; it keeps its own
    # initialisation scheme and is therefore not added to ``bricks``.
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    embed.weights_init = IsotropicGaussian(0.01)

    # Question encoding: last time step of the selected hidden sequences.
    qembed = embed.apply(question)
    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size, question_mask.astype(float_x),
        config.question_lstm_size, config.question_skip_connections, 'q')
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                  axis=1)
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                  axis=1)
    qenc.name = 'qenc'

    # Context encoding: hidden sequences joined on the last axis.
    cembed = embed.apply(context)
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size, context_mask.astype(float_x),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2: forward and backward
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        # Fixed: this used config.question_lstm_size[-1], which gives the
        # wrong width whenever the context and question LSTM sizes differ.
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'

    bricks = qlstms + clstms

    # Attention MLP on tanh(W_c * cenc + W_q * qenc).
    attention_mlp = MLP(
        dims=config.attention_mlp_hidden + [1],
        activations=config.attention_mlp_activations[1:] + [Identity()],
        name='attention_mlp')
    attention_qlinear = Linear(input_dim=qenc_dim,
                               output_dim=config.attention_mlp_hidden[0],
                               name='attq')
    attention_clinear = Linear(input_dim=cenc_dim,
                               output_dim=config.attention_mlp_hidden[0],
                               use_bias=False,
                               name='attc')
    bricks += [attention_mlp, attention_qlinear, attention_clinear]

    # The first two axes of cenc are merged for the 2-D Linear brick and
    # restored afterwards.
    layer1 = Tanh().apply(
        attention_clinear.apply(
            cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2]))
        ).reshape((cenc.shape[0], cenc.shape[1],
                   config.attention_mlp_hidden[0]))
        + attention_qlinear.apply(qenc)[None, :, :])
    layer1.name = 'layer1'

    att_weights = attention_mlp.apply(
        layer1.reshape((layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
    att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
    att_weights = tensor.nnet.sigmoid(att_weights.T).T
    att_weights.name = 'att_weights'

    # Target is 1 where the context token equals any answer token: every
    # (context position, answer position) pair is compared and the matches
    # are collapsed over the answer axis.
    att_target = tensor.eq(
        tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)),
        tensor.tile(context[:, None, :], (1, answer.shape[0], 1))
    ).sum(axis=1).clip(0, 1)

    # Cross-entropy averaged over the unmasked context positions.
    cost = (tensor.nnet.binary_crossentropy(att_weights, att_target)
            * context_mask).sum() / context_mask.sum()

    # Context token ids kept where the score exceeds 0.1, zero elsewhere.
    self.predictions = tensor.gt(att_weights, 0.1) * context

    # Regularised copy of the graph: weight noise and/or dropout.
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    [cost_reg] = cg.outputs

    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    # NOTE(review): validation also monitors the regularised cost, not
    # ``cost`` -- kept as in the original; confirm this is intended.
    self.monitor_vars_valid = [[cost_reg]]

    # Initialise bricks; ``embed`` uses the scheme assigned above.
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
def __init__(self, config, vocab_size):
    """Build the pairwise ranking graph ("better" vs "worse" candidate).

    The question, both candidates and each candidate's left/right context
    are encoded with bidirectional LSTMs.  A prediction MLP scores each
    candidate, and a margin hinge loss pushes the "better" score above the
    "worse" one.

    ``config`` supplies ``embed_size``, ``question_lstm_size``,
    ``question_skip_connections``, ``ctx_lstm_size`` (only its first entry
    is used for the candidate/context encoders), the prediction MLP
    definition, ``margin``, ``w_noise``, ``dropout`` and the initialisation
    schemes.

    Sets ``self.predictions``, ``self.sgd_cost``, ``self.monitor_vars`` and
    ``self.monitor_vars_valid``.

    Raises
    ------
    ValueError
        If ``config.prediction_mlp_hidden[0]`` is not divisible by 4 (the
        four projected encodings are concatenated to exactly that width).
    """
    float_x = theano.config.floatX

    def make_input(name):
        """Return the int matrix ``name`` and its mask, both axes swapped."""
        tokens = tensor.imatrix(name).dimshuffle(1, 0)
        mask = tensor.imatrix(name + '_mask').dimshuffle(1, 0)
        return tokens, mask

    # The unused 'answer' / 'answer_mask' inputs of the original were dropped.
    question, question_mask = make_input('question')
    better, better_mask = make_input('better')
    worse, worse_mask = make_input('worse')
    b_left, b_left_mask = make_input('b_left')
    b_right, b_right_mask = make_input('b_right')
    w_left, w_left_mask = make_input('w_left')
    w_right, w_right_mask = make_input('w_right')

    # Shared embedding; it keeps its own initialisation scheme and is
    # therefore not added to ``bricks``.
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    embed.weights_init = IsotropicGaussian(0.01)

    # Question encoding: last time step of the selected hidden sequences.
    qembed = embed.apply(question)
    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size, question_mask.astype(float_x),
        config.question_lstm_size, config.question_skip_connections, 'q')
    bricks = [] + qlstms
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                  axis=1)
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                  axis=1)
    qenc.name = 'qenc'

    lstm_dim = config.ctx_lstm_size[0]
    # Fixed: the encoders below are single-layer, so each encoding is
    # 2 * ctx_lstm_size[0] wide (forward + backward), not
    # 2 * sum(ctx_lstm_size).  The two agree for a one-element ctx_lstm_size.
    enc_dim = 2 * lstm_dim

    def make_encoder(prefix):
        """Create one single-layer bidirectional encoder's bricks."""
        fwd_in = Linear(input_dim=config.embed_size, output_dim=4 * lstm_dim,
                        name=prefix + '_fwd_lstm_in_0_0')
        fwd_lstm = LSTM(dim=lstm_dim, activation=Tanh(),
                        name=prefix + '_fwd_lstm_0')
        bwd_in = Linear(input_dim=config.embed_size, output_dim=4 * lstm_dim,
                        name=prefix + '_bwd_lstm_in_0_0')
        bwd_lstm = LSTM(dim=lstm_dim, activation=Tanh(),
                        name=prefix + '_bwd_lstm_0')
        return fwd_in, fwd_lstm, bwd_in, bwd_lstm

    def encode(encoder, tokens, mask, name):
        """Return (last-state encoding named ``name``, [fwd, bwd] states)."""
        fwd_in, fwd_lstm, bwd_in, bwd_lstm = encoder
        embedded = embed.apply(tokens)
        float_mask = mask.astype(float_x)
        fwd_hidden, _ = fwd_lstm.apply(fwd_in.apply(embedded),
                                       mask=float_mask)
        # The backward LSTM reads the sequence (and its mask) reversed.
        bwd_hidden, _ = bwd_lstm.apply(bwd_in.apply(embedded)[::-1],
                                       mask=float_mask[::-1])
        hidden = [fwd_hidden, bwd_hidden]
        enc = tensor.concatenate([h[-1, :, :] for h in hidden], axis=1)
        enc.name = name
        return enc, hidden

    def encoder_bricks(encoder):
        """Order the bricks as the original did: LSTMs first, then inputs."""
        fwd_in, fwd_lstm, bwd_in, bwd_lstm = encoder
        return [fwd_lstm, bwd_lstm, fwd_in, bwd_in]

    # One encoder per role; each is shared by the better and worse inputs.
    candidate_encoder = make_encoder('candidate')
    left_encoder = make_encoder('left_context')
    right_encoder = make_encoder('right_context')
    bricks = (bricks + encoder_bricks(candidate_encoder)
              + encoder_bricks(left_encoder)
              + encoder_bricks(right_encoder))

    better_enc, better_hidden = encode(
        candidate_encoder, better, better_mask, 'better_enc')
    worse_enc, worse_hidden = encode(
        candidate_encoder, worse, worse_mask, 'worse_enc')
    candidates_hidden_list = better_hidden + worse_hidden

    better_left_enc, _ = encode(
        left_encoder, b_left, b_left_mask, 'better_left_enc')
    worse_left_enc, _ = encode(
        left_encoder, w_left, w_left_mask, 'worse_left_enc')
    better_right_enc, _ = encode(
        right_encoder, b_right, b_right_mask, 'better_right_enc')
    worse_right_enc, _ = encode(
        right_encoder, w_right, w_right_mask, 'worse_right_enc')

    # Prediction MLP: four projections, each a quarter of the first hidden
    # layer, concatenated and passed through tanh.
    first_hidden = config.prediction_mlp_hidden[0]
    if first_hidden % 4 != 0:
        raise ValueError(
            "config.prediction_mlp_hidden[0] must be divisible by 4, got %r"
            % (first_hidden,))
    # Fixed: the original used `/ 4.0`, which passes a float as a layer size.
    part_dim = first_hidden // 4

    prediction_mlp = MLP(
        dims=config.prediction_mlp_hidden + [1],
        activations=config.prediction_mlp_activations[1:] + [Identity()],
        name='prediction_mlp')
    prediction_qlinear = Linear(input_dim=qenc_dim, output_dim=part_dim,
                                name='preq')
    prediction_cand_linear = Linear(input_dim=enc_dim, output_dim=part_dim,
                                    use_bias=False, name='precand')
    prediction_left_half_linear = Linear(input_dim=enc_dim,
                                         output_dim=part_dim,
                                         use_bias=False, name='preleft')
    prediction_right_half_linear = Linear(input_dim=enc_dim,
                                          output_dim=part_dim,
                                          use_bias=False, name='preright')
    bricks += [prediction_mlp, prediction_qlinear, prediction_cand_linear,
               prediction_left_half_linear, prediction_right_half_linear]

    better_layer1 = Tanh('tan1').apply(tensor.concatenate(
        [prediction_cand_linear.apply(better_enc),
         prediction_qlinear.apply(qenc),
         prediction_left_half_linear.apply(better_left_enc),
         prediction_right_half_linear.apply(better_right_enc)], axis=1))
    better_layer1.name = 'better_layer1'
    worse_layer1 = Tanh('tan2').apply(tensor.concatenate(
        [prediction_cand_linear.apply(worse_enc),
         prediction_qlinear.apply(qenc),
         prediction_left_half_linear.apply(worse_left_enc),
         prediction_right_half_linear.apply(worse_right_enc)], axis=1))
    worse_layer1.name = 'worse_layer1'

    better_pred_weights = Tanh('rec1').apply(
        prediction_mlp.apply(better_layer1))
    worse_pred_weights = Tanh('rec2').apply(
        prediction_mlp.apply(worse_layer1))

    # Hinge loss: max(0, -score_better + score_worse + margin).
    # ``conditions`` is 1 where the margin is violated.
    margin = config.margin
    conditions = tensor.lt(better_pred_weights,
                           worse_pred_weights + margin).astype(float_x)
    self.predictions = conditions
    cost = (-better_pred_weights + worse_pred_weights + margin) * conditions
    cost = cost.mean()

    # Regularised copy of the graph.  As in the original, dropout covers the
    # question and candidate hidden states only, not the left/right contexts.
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + candidates_hidden_list,
                           config.dropout)
    [cost_reg] = cg.outputs

    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # Initialise bricks; ``embed`` uses the scheme assigned above.
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
def __init__(self, config, vocab_size):
    # Builds the attention-based reader graph and then scores four
    # multiple-choice answers (ans1..ans4) against it.
    #
    # NOTE(review): the multiple-choice scoring section below looks like
    # unfinished work -- it references names that are not assigned in this
    # block and calls a compiled function with no arguments.  See the
    # individual notes; the code is left exactly as found.
    question = tensor.imatrix('question')  # set up 32-bit integer matrices
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.ivector('answer')
    candidates = tensor.imatrix('candidates')
    candidates_mask = tensor.imatrix('candidates_mask')

    # and the multiple choice answers:
    ans1 = tensor.ivector('ans1')
    ans1_mask = tensor.ivector('ans1_mask')
    ans2 = tensor.ivector('ans2')
    ans2_mask = tensor.ivector('ans2_mask')
    ans3 = tensor.ivector('ans3')
    ans3_mask = tensor.ivector('ans3_mask')
    ans4 = tensor.ivector('ans4')
    ans4_mask = tensor.ivector('ans4_mask')

    bricks = []

    # swaps the 1st and 2nd dimensions of each matrix
    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)

    # Embed questions and context (one lookup table is shared by all inputs)
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    bricks.append(embed)

    qembed = embed.apply(question)
    cembed = embed.apply(context)
    # NOTE(review): a1embed..a4embed are not used again in this block.
    a1embed = embed.apply(ans1)
    a2embed = embed.apply(ans2)
    a3embed = embed.apply(ans3)
    a4embed = embed.apply(ans4)

    qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX),
                                                 config.question_lstm_size, config.question_skip_connections, 'q')
    clstms, chidden_list = make_bidir_lstm_stack(cembed, config.embed_size, context_mask.astype(theano.config.floatX),
                                                 config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + qlstms + clstms

    # Calculate question encoding: last time step of the selected hidden
    # sequences, concatenated
    if config.question_skip_connections:
        qenc_dim = 2*sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1)
    else:
        qenc_dim = 2*config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1)
    qenc.name = 'qenc'

    # Calculate context encoding: hidden sequences concatenated on axis 2
    if config.ctx_skip_connections:
        cenc_dim = 2*sum(config.ctx_lstm_size)
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        cenc_dim = 2*config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'

    # Attention mechanism MLP
    attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                        activations=config.attention_mlp_activations[1:] + [Identity()],
                        name='attention_mlp')
    attention_qlinear = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq')
    attention_clinear = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc')
    bricks += [attention_mlp, attention_qlinear, attention_clinear]

    # cenc is flattened to 2-D for the Linear brick and reshaped back before
    # the question projection is added to every step.
    layer1 = Tanh().apply(attention_clinear.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
                                    .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0]))
                         + attention_qlinear.apply(qenc)[None, :, :])
    layer1.name = 'layer1'
    att_weights = attention_mlp.apply(layer1.reshape((layer1.shape[0]*layer1.shape[1], layer1.shape[2])))
    att_weights.name = 'att_weights_0'
    att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
    att_weights.name = 'att_weights'

    # Softmax over the first axis of att_weights, then a weighted sum of the
    # context encoding over that axis.
    attended = tensor.sum(cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0)
    attended.name = 'attended'

    # Now we can calculate our output
    out_mlp = MLP(dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden + [config.n_entities],
                  activations=config.out_mlp_activations + [Identity()],
                  name='out_mlp')
    bricks += [out_mlp]
    probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
    probs.name = 'probs'

    # not needed anymore, since we're not only looking at entities
    # is_candidate = tensor.eq(tensor.arange(config.n_entities, dtype='int32')[None, None, :],
    #                          tensor.switch(candidates_mask, candidates, -tensor.ones_like(candidates))[:, :, None]).sum(axis=1)
    # probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs))

    # Calculate prediction, cost and error rate
    # Worked example left by the author for the scoring idea below:
    # vocab = tensor.arange(10)
    # probs = numpy.asarray([0, 0.8, 0, 0.2], dtype=numpy.float32)
    # context = numpy.asarray([3, 2, 8, 1], dtype=numpy.int32)
    # ans3 = numpy.asarray([2, 8, 1], dtype=numpy.int32)
    # ans1 = numpy.asarray([1, 3, 4], dtype=numpy.int32)
    # ans2 = numpy.asarray([1, 1, 4], dtype=numpy.int32)

    # convert probs vector to one that's the same size as vocab, with all zeros except probs:
    # probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs))
    # NOTE(review): vocab_size is passed to zeros_like as the template value;
    # if it is a plain int this yields a scalar, not a vocab-sized vector --
    # confirm the intent (the example above suggests a vector of zeros).
    probsPadded = tensor.zeros_like(vocab_size, dtype=numpy.float32)
    # NOTE(review): the index here is cembed (the embedded context), whereas
    # the example above indexes by context token ids -- confirm.
    probsSubset = probsPadded[cembed] #TODO this should be masked
    b = tensor.set_subtensor(probsSubset, probs)

    # get the similarity score of each (masked) answer with the context probs:
    # NOTE(review): a1enc..a4enc are not assigned anywhere in this block
    # (only a1embed..a4embed are); unless they exist at module level this
    # raises NameError.  Only ans1 is masked; ans2..ans4 are summed unmasked.
    ans1probs = b[a1enc]
    ans1score = tensor.switch(ans1_mask, ans1probs, tensor.zeros_like(ans1probs)).sum()
    ans2probs = b[a2enc]
    ans2score = ans2probs.sum()
    ans3probs = b[a3enc]
    ans3score = ans3probs.sum()
    ans4probs = b[a4enc]
    ans4score = ans4probs.sum()

    # and pick the best one:
    allans = tensor.stacklists([ans1score, ans2score, ans3score, ans4score])
    pred = tensor.argmax(allans)

    # NOTE(review): debugging leftover? The compiled function is called with
    # no arguments and its result `out` is never used.
    cg = ComputationGraph([ans1probs, ans1score, ans2probs, ans2score, ans3probs, ans3score, ans4probs, ans4score, allans, pred])
    f = cg.get_theano_function()
    out = f()

    #pred = probs.argmax(axis=1)
    #print "pred"
    #print pred
    # TODO CHANGE THIS!
    cost = Softmax().categorical_cross_entropy(answer, probs).mean()
    error_rate = tensor.neq(answer, pred).mean()

    # Apply weight noise and dropout to a copy of the graph
    cg = ComputationGraph([cost, error_rate])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    [cost_reg, error_rate_reg] = cg.outputs

    # Other stuff: regularised and plain variables share the same names
    cost_reg.name = cost.name = 'cost'
    error_rate_reg.name = error_rate.name = 'error_rate'

    self.probs = probs
    self.probs.name = "probs"
    self.cost = cost
    self.cost.name = "cost"

    # NOTE(review): sgd_cost is disabled here, so no training cost is
    # exposed on the instance by this block.
    # self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg], [error_rate_reg]]
    self.monitor_vars_valid = [[cost], [error_rate]]

    # Initialize bricks with the schemes from the configuration
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
def get_prediction_function():
    """Compile and return a function mapping inputs to output scores.

    The graph is rebuilt from the module-level ``bricks`` list and
    ``config``.  The returned Theano function takes
    ``(question, question_mask, context, context_mask)`` as int matrices
    and returns the output MLP activations (``probs``).

    Side effects: compiles a set of inspection functions for intermediate
    values and stores them in the module-level globals ``_qembed``,
    ``_cembed``, ``_qhidden``, ``_chidden``, ``_qenc``, ``_cenc``,
    ``_attention_clinear``, ``_attention_qlinear`` and ``_attended``.

    NOTE(review): the bricks are picked from ``bricks`` by hard-coded
    positions (-5, 4, 11, -2, -1); the names in the comments next to them
    come from the original code and depend on how ``bricks`` was built.
    """
    global _qembed, _cembed, _qhidden, _chidden, _qenc, _cenc
    global _attention_clinear, _attention_qlinear, _attended

    float_x = theano.config.floatX

    # Symbolic inputs.  The compiled functions take them as declared here;
    # the axis swap is applied inside the graph.
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')

    # Embed questions and context.
    embed = bricks[-5]
    qembed = embed.apply(question.dimshuffle(1, 0))
    cembed = embed.apply(context.dimshuffle(1, 0))
    _qembed = theano.function([question], qembed)
    _cembed = theano.function([context], cembed)

    # NOTE(review): here make_bidir_lstm_stack's result is used directly as
    # the list of hidden-state variables -- confirm it matches the helper's
    # return value in this module.
    qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size,
        question_mask.dimshuffle(1, 0).astype(float_x),
        config.question_lstm_size, config.question_skip_connections, 'q')
    chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size,
        context_mask.dimshuffle(1, 0).astype(float_x),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    _qhidden = theano.function([question, question_mask], qhidden_list)
    _chidden = theano.function([context, context_mask], chidden_list)

    # Question encoding: last time step of the selected hidden sequences.
    if config.question_skip_connections:
        q_states = qhidden_list
    else:
        q_states = qhidden_list[-2:]
    qenc = tensor.concatenate([h[-1, :, :] for h in q_states], axis=1)
    qenc.name = 'qenc'

    # Context encoding: selected hidden sequences joined on the last axis.
    if config.ctx_skip_connections:
        c_states = chidden_list
    else:
        c_states = chidden_list[-2:]
    cenc = tensor.concatenate(c_states, axis=2)
    cenc.name = 'cenc'

    _qenc = theano.function([question, question_mask], qenc)
    _cenc = theano.function([context, context_mask], cenc)

    # Attention mechanism.
    attention_mlp = bricks[-2]      # attention_mlp
    attention_qlinear = bricks[4]   # attq
    attention_clinear = bricks[11]  # attc

    # Each projection is built once and shared between the attention layer
    # and its inspection function.  cenc is flattened to 2-D for the linear
    # brick and reshaped back afterwards.
    ctx_proj = attention_clinear.apply(
        cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2]))
    ).reshape((cenc.shape[0], cenc.shape[1],
               config.attention_mlp_hidden[0]))
    q_proj = attention_qlinear.apply(qenc)[None, :, :]
    _attention_clinear = theano.function([context, context_mask], ctx_proj)
    _attention_qlinear = theano.function([question, question_mask], q_proj)

    layer1 = Tanh().apply(ctx_proj + q_proj)
    layer1.name = 'layer1'

    att_weights = attention_mlp.apply(
        layer1.reshape((layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
    att_weights.name = 'att_weights_0'
    att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
    att_weights.name = 'att_weights'

    # Softmax over the first axis of att_weights, then a weighted sum of the
    # context encoding over that axis.
    attended = tensor.sum(
        cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0)
    attended.name = 'attended'
    _attended = theano.function(
        [question, question_mask, context, context_mask], attended)

    # Output MLP over [attended context, question encoding].
    out_mlp = bricks[-1]  # out_mlp
    probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
    probs.name = 'probs'

    f = theano.function(
        [question, question_mask, context, context_mask], probs)
    return f
def __init__(self, config, vocab_size):
    """Build the span-scoring training graph and initialize its bricks.

    The question and the context are embedded with a shared lookup table and
    encoded with ``make_bidir_lstm_stack``.  A bilinear attention over the
    context encoding yields ``attended``.  Every context window of length
    1..3 is then enumerated with ``theano.scan``; for each window an
    F1-style overlap score against the answer bag is computed as the target,
    and an MLP is trained to reproduce the (normalized) scores with a binary
    cross-entropy cost.

    Parameters:
        config: configuration object; the attributes read here are
            ``embed_size``, ``question_lstm_size``, ``ctx_lstm_size``,
            ``question_skip_connections``, ``ctx_skip_connections``,
            ``prediction_mlp_hidden``, ``prediction_mlp_activations``,
            ``w_noise``, ``dropout``, ``weights_init`` and ``biases_init``.
        vocab_size: vocabulary size, used for the lookup table and ``to_bag``.

    Attributes set on ``self``:
        predictions: the window selected by ``pred_weights.argmax(axis=0)``.
        sgd_cost: the (possibly noise/dropout-regularized) cost.
        monitor_vars, monitor_vars_valid: ``[[cost_reg]]``.
    """
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.imatrix('answer')
    answer_mask = tensor.imatrix('answer_mask')

    bricks = []

    # Swap the two axes of every input matrix.
    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    # NOTE(review): answer_mask is declared and transposed but never used
    # in the graph below.
    answer_mask = answer_mask.dimshuffle(1, 0)

    # Embed questions and context.  The lookup table is deliberately kept
    # out of `bricks` so it retains its own weights_init (see the
    # initialization at the end).
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    embed.weights_init = IsotropicGaussian(0.01)

    # Calculate question encoding (concatenate layer1)
    qembed = embed.apply(question)
    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size,
        question_mask.astype(theano.config.floatX),
        config.question_lstm_size, config.question_skip_connections, 'q')
    bricks = bricks + qlstms
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                  axis=1)
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                  axis=1)
    qenc.name = 'qenc'

    # Calculate context encoding (concatenate layer1)
    cembed = embed.apply(context)
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size,
        context_mask.astype(theano.config.floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + clstms
    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2 : fw & bw
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        # Fixed: the context width must come from the context LSTM sizes
        # (the question sizes were used here by mistake).
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'

    # Attention mechanism Bilinear
    attention_clinear = Linear(input_dim=cenc_dim, output_dim=qenc_dim,
                               name='attc')
    bricks += [attention_clinear]
    # Fixed: the projection has width qenc_dim (the Linear's output_dim), so
    # that is the last dimension to restore; using cenc.shape[2] only worked
    # when both encodings happened to have the same width.
    att_weights = qenc[None, :, :] * attention_clinear.apply(
        cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2]))
    ).reshape((cenc.shape[0], cenc.shape[1], qenc_dim))
    att_weights = att_weights.sum(axis=2)
    att_weights = tensor.nnet.softmax(att_weights.T).T
    att_weights.name = 'att_weights'

    # NOTE(review): att_weights has already been through softmax above, so
    # it is normalized a second time here.  Left unchanged to preserve the
    # existing model behaviour -- confirm whether this is intended.
    attended = tensor.sum(
        cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0)
    attended.name = 'attended'

    # Answer bag with columns 0..2 zeroed (presumably special tokens --
    # TODO confirm against to_bag / the vocabulary).
    answer_bag = to_bag(answer, vocab_size)
    answer_bag = tensor.set_subtensor(answer_bag[:, 0:3], 0)
    relevant_items = answer_bag.sum(axis=1, dtype=theano.config.floatX)

    def createSequences(j, index, c_enc, c_enc_dim, c_context,
                        c_window_size):
        """Scan step: window of length `index` starting at position `j`."""
        # Window tokens, zero-padded along axis 0 up to c_window_size rows.
        sequence = tensor.concatenate([
            c_context[j:j + index, :],
            tensor.zeros((c_window_size - index, c_context.shape[1]))
        ], axis=0)
        # First half of the features at the window's last position, second
        # half (minus its final column) at the window's first position, plus
        # one column holding the window-size constant.  Assuming c_enc's
        # last axis has width c_enc_dim, the result is c_enc_dim wide again.
        # `//` keeps the slice bounds integral under true division.
        enc = tensor.concatenate([
            c_enc[j + index - 1, :, :c_enc_dim // 2],
            c_enc[j, :, c_enc_dim // 2:-1],
            tensor.tile(c_window_size[None, None], (c_enc.shape[1], 1))
        ], axis=1)
        return enc, sequence

    def createTargetValues(j, index, c_context, c_vocab_size):
        """Scan step: F1-style overlap of window (j, index) with the answer."""
        sequence_bag = to_bag(c_context[j:j + index, :], c_vocab_size)
        sequence_bag = tensor.set_subtensor(sequence_bag[:, 0:3], 0)
        selected_items = sequence_bag.sum(axis=1,
                                          dtype=theano.config.floatX)
        tp = (sequence_bag * answer_bag).sum(axis=1,
                                             dtype=theano.config.floatX)
        # A small constant is added to every denominator, presumably to
        # avoid division by zero.
        precision = tp / (selected_items + 0.00001)
        recall = tp / (relevant_items + 0.00001)
        macroF1 = (2 * (precision * recall)) / (precision + recall + 0.00001)
        return macroF1

    window_size = 3
    senc = []
    sequences = []
    pred_targets = []
    # One pair of scans per window length i; each scan visits every start
    # position j for which the window fits into the context.
    for i in range(1, window_size + 1):
        (all_enc, all_sequence), _ = theano.scan(
            fn=createSequences,
            sequences=tensor.arange(cenc.shape[0] - i + 1),
            non_sequences=[i, cenc, cenc_dim, context, window_size])
        (all_macroF1), _ = theano.scan(
            fn=createTargetValues,
            sequences=tensor.arange(cenc.shape[0] - i + 1),
            non_sequences=[i, context, vocab_size])
        senc.append(all_enc)
        sequences.append(all_sequence)
        pred_targets.append(all_macroF1)

    senc = tensor.concatenate(senc, axis=0)
    sequences = tensor.concatenate(sequences, axis=0)
    pred_targets = tensor.concatenate(pred_targets, axis=0)

    # F1 prediction MLP
    prediction_mlp = MLP(
        dims=config.prediction_mlp_hidden + [1],
        activations=config.prediction_mlp_activations[1:] + [Identity()],
        name='prediction_mlp')
    prediction_qlinear = Linear(
        input_dim=qenc_dim,
        output_dim=config.prediction_mlp_hidden[0],
        name='preq')
    prediction_clinear = Linear(
        input_dim=cenc_dim,
        output_dim=config.prediction_mlp_hidden[0],
        use_bias=False,
        name='prec')
    prediction_slinear = Linear(
        input_dim=cenc_dim,
        output_dim=config.prediction_mlp_hidden[0],
        use_bias=False,
        name='pres')
    bricks += [
        prediction_mlp, prediction_qlinear, prediction_clinear,
        prediction_slinear
    ]
    layer1 = Tanh().apply(
        prediction_clinear.apply(attended)[None, :, :] +
        prediction_qlinear.apply(qenc)[None, :, :] +
        prediction_slinear.apply(
            senc.reshape((senc.shape[0] * senc.shape[1], senc.shape[2]))
        ).reshape((senc.shape[0], senc.shape[1],
                   config.prediction_mlp_hidden[0])))
    layer1.name = 'layer1'
    pred_weights = prediction_mlp.apply(
        layer1.reshape(
            (layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
    pred_weights = pred_weights.reshape((layer1.shape[0], layer1.shape[1]))
    pred_weights = tensor.nnet.sigmoid(pred_weights.T).T

    # Normalize targets and predictions by their sums over axis 0.
    pred_targets = pred_targets / (pred_targets.sum(axis=0) + 0.00001)
    pred_weights = pred_weights / (pred_weights.sum(axis=0) + 0.00001)

    cost = tensor.nnet.binary_crossentropy(pred_weights, pred_targets).mean()
    self.predictions = sequences[pred_weights.argmax(axis=0), :,
                                 tensor.arange(sequences.shape[2])].T

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
def __init__(self, config, vocab_size):
    """Assemble the attention-based entity classifier and initialize it.

    Question and context are embedded with one shared lookup table, encoded
    with ``make_bidir_lstm_stack``, combined through an MLP attention over
    the context encoding, and fed to an output MLP producing one score per
    entity (``config.n_entities``).  Scores of entities that are not among
    the (masked) candidates are replaced by -1000 before the softmax cost.

    Sets ``self.sgd_cost``, ``self.monitor_vars`` and
    ``self.monitor_vars_valid``.
    """
    # Symbolic inputs; the four text matrices have their two axes swapped.
    question = tensor.imatrix('question').dimshuffle(1, 0)
    question_mask = tensor.imatrix('question_mask').dimshuffle(1, 0)
    context = tensor.imatrix('context').dimshuffle(1, 0)
    context_mask = tensor.imatrix('context_mask').dimshuffle(1, 0)
    answer = tensor.ivector('answer')
    candidates = tensor.imatrix('candidates')
    candidates_mask = tensor.imatrix('candidates_mask')

    float_type = theano.config.floatX

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    qembed = embed.apply(question)
    cembed = embed.apply(context)

    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size, question_mask.astype(float_type),
        config.question_lstm_size, config.question_skip_connections, 'q')
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size, context_mask.astype(float_type),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')

    # Every brick collected here is initialized by the loop at the end.
    bricks = [embed] + qlstms + clstms

    # Question encoding: last slice along axis 0 of each selected state.
    if config.question_skip_connections:
        q_states = qhidden_list
        qenc_dim = 2 * sum(config.question_lstm_size)
    else:
        q_states = qhidden_list[-2:]
        qenc_dim = 2 * config.question_lstm_size[-1]
    qenc = tensor.concatenate([state[-1, :, :] for state in q_states],
                              axis=1)
    qenc.name = 'qenc'

    # Context encoding: selected states concatenated on axis 2.
    if config.ctx_skip_connections:
        c_states = chidden_list
        cenc_dim = 2 * sum(config.ctx_lstm_size)
    else:
        c_states = chidden_list[-2:]
        cenc_dim = 2 * config.ctx_lstm_size[-1]
    cenc = tensor.concatenate(c_states, axis=2)
    cenc.name = 'cenc'

    # Attention mechanism MLP
    att_hidden = config.attention_mlp_hidden[0]
    attention_mlp = MLP(
        dims=config.attention_mlp_hidden + [1],
        activations=config.attention_mlp_activations[1:] + [Identity()],
        name='attention_mlp')
    attention_qlinear = Linear(input_dim=qenc_dim, output_dim=att_hidden,
                               name='attq')
    attention_clinear = Linear(input_dim=cenc_dim, output_dim=att_hidden,
                               use_bias=False, name='attc')
    bricks.extend([attention_mlp, attention_qlinear, attention_clinear])

    # Project the context (flattened to 2-D for the Linear brick, then
    # restored to 3-D) and add the question projection along a new axis 0.
    ctx_flat = cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2]))
    ctx_proj = attention_clinear.apply(ctx_flat).reshape(
        (cenc.shape[0], cenc.shape[1], att_hidden))
    q_proj = attention_qlinear.apply(qenc)[None, :, :]
    layer1 = Tanh().apply(ctx_proj + q_proj)
    layer1.name = 'layer1'

    layer1_flat = layer1.reshape(
        (layer1.shape[0] * layer1.shape[1], layer1.shape[2]))
    att_weights = attention_mlp.apply(layer1_flat)
    att_weights.name = 'att_weights_0'
    att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
    att_weights.name = 'att_weights'

    # Softmax over axis 0 of att_weights, then weighted sum of cenc.
    normalized = tensor.nnet.softmax(att_weights.T).T
    attended = tensor.sum(cenc * normalized[:, :, None], axis=0)
    attended.name = 'attended'

    # Now we can calculate our output
    out_mlp = MLP(
        dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden
        + [config.n_entities],
        activations=config.out_mlp_activations + [Identity()],
        name='out_mlp')
    bricks.append(out_mlp)
    probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
    probs.name = 'probs'

    # Masked-out candidate slots become -1 so they match no entity id; the
    # sum over axis 1 counts how often each entity id occurs per row.
    entity_ids = tensor.arange(config.n_entities, dtype='int32')
    visible_candidates = tensor.switch(
        candidates_mask, candidates, -tensor.ones_like(candidates))
    is_candidate = tensor.eq(
        entity_ids[None, None, :],
        visible_candidates[:, :, None]).sum(axis=1)
    probs = tensor.switch(is_candidate, probs,
                          -1000 * tensor.ones_like(probs))

    # Calculate prediction, cost and error rate
    pred = probs.argmax(axis=1)
    cost = Softmax().categorical_cross_entropy(answer, probs).mean()
    error_rate = tensor.neq(answer, pred).mean()

    # Apply weight noise and dropout
    cg = ComputationGraph([cost, error_rate])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    cost_reg, error_rate_reg = cg.outputs

    # Regularized and plain variables share the same monitoring names.
    cost_reg.name = 'cost'
    cost.name = 'cost'
    error_rate_reg.name = 'error_rate'
    error_rate.name = 'error_rate'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg], [error_rate_reg]]
    self.monitor_vars_valid = [[cost], [error_rate]]

    # Initialize bricks
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()