class ShallowEnergyComputer(Initializable, Feedforward):
    """Energy computer built from a tanh followed by a bias-free linear map.

    The brick owns two children, a ``Tanh`` and a ``Linear`` created with
    ``use_bias=False``.  Its ``input_dim`` and ``output_dim`` are simply
    forwarded to the second child (``children[1]``).
    """

    @lazy()
    def __init__(self, **kwargs):
        super(ShallowEnergyComputer, self).__init__(**kwargs)
        self.tanh = Tanh()
        self.linear = Linear(use_bias=False)
        self.children = [self.tanh, self.linear]

    @application
    def apply(self, *args):
        """Pass ``args`` through the tanh child, then the linear child."""
        activated = self.tanh.apply(*pack(args))
        return self.linear.apply(*pack(activated))

    @property
    def input_dim(self):
        """Input dimension of the second child."""
        second_child = self.children[1]
        return second_child.input_dim

    @input_dim.setter
    def input_dim(self, value):
        second_child = self.children[1]
        second_child.input_dim = value

    @property
    def output_dim(self):
        """Output dimension of the second child."""
        second_child = self.children[1]
        return second_child.output_dim

    @output_dim.setter
    def output_dim(self, value):
        second_child = self.children[1]
        second_child.output_dim = value
def __init__(self, config):
    """Assemble the symbolic training graph described by ``config``.

    An int matrix named ``'bytes'`` is looked up in
    ``config.embedding_matrix``.  One ``GatedRecurrent`` layer is created
    per entry of ``config.hidden_dims``; each layer emits a tanh prediction
    of its own target from the preceding hidden state, contributes two cost
    terms, and hands the thresholded prediction error (values in
    {-1, 0, 1}) to the next layer as that layer's target.  The hidden
    states of all layers are then projected, summed and (optionally) passed
    through an MLP to score the output symbols.

    Attributes set on ``self``: ``sgd_cost``, ``monitor_vars``, ``out``,
    ``pred`` and ``states``.
    """
    floatX = theano.config.floatX
    detach = theano.gradient.disconnected_grad

    inp = tensor.imatrix('bytes')

    embed = theano.shared(config.embedding_matrix.astype(floatX),
                          name='embedding_matrix')
    in_repr = embed[inp.flatten(), :].reshape(
        (inp.shape[0], inp.shape[1], config.repr_dim))
    in_repr.name = 'in_repr'

    bricks = []
    states = []

    # Construct predictive GRU hierarchy
    hidden = []
    costs = []
    next_target = in_repr.dimshuffle(1, 0, 2)
    layer_specs = zip(config.hidden_dims, config.cost_factors,
                      config.hidden_q)
    for i, (hdim, cf, q) in enumerate(layer_specs):
        init_state = theano.shared(
            numpy.zeros((config.num_seqs, hdim)).astype(floatX),
            name='st0_%d' % i)

        in_proj = Linear(input_dim=config.repr_dim, output_dim=3 * hdim,
                         name="lstm_in_%d" % i)
        recurrent = GatedRecurrent(dim=hdim,
                                   activation=config.activation_function,
                                   name="lstm_rec_%d" % i)
        out_proj = Linear(input_dim=hdim, output_dim=config.repr_dim,
                          name='lstm_out_%d' % i)
        squash = Tanh('lstm_out_tanh_%d' % i)
        bricks.extend([in_proj, recurrent, out_proj, squash])

        # Every layer but the first also receives the hidden states of the
        # layer below through an extra projection.
        lower_proj = None
        if i > 0:
            lower_proj = Linear(input_dim=config.hidden_dims[i - 1],
                                output_dim=3 * hdim,
                                name='lstm_in2_%d' % i)
            bricks.append(lower_proj)

        next_target = tensor.cast(next_target, dtype=floatX)
        inter = in_proj.apply(detach(next_target))
        if lower_proj is not None:
            inter = inter + lower_proj.apply(detach(hidden[-1][:-1, :, :]))

        # The first hdim columns are the inputs, the remaining ones the
        # gate inputs of the recurrent brick.
        new_hidden = recurrent.apply(inputs=inter[:, :, :hdim],
                                     gate_inputs=inter[:, :, hdim:],
                                     states=init_state)
        states.append((init_state, new_hidden[-1, :, :]))

        # Prepend the initial state so that dropping the last step yields
        # the state available *before* each target is seen.
        full_hidden = tensor.concatenate(
            [init_state[None, :, :], new_hidden], axis=0)
        hidden.append(full_hidden)
        pred = squash.apply(out_proj.apply(full_hidden[:-1, :, :]))

        scale = numpy.float32(cf)
        costs.append(scale * (-next_target * pred).sum(axis=2).mean())
        costs.append(scale * q * abs(pred).sum(axis=2).mean())

        diff = next_target - pred
        next_target = tensor.ge(diff, 0.5) - tensor.le(diff, -0.5)

    # Construct output from hidden states
    hidden = [s.dimshuffle(1, 0, 2) for s in hidden]

    out_dims = config.out_hidden + [config.io_dim]
    out_parts = []
    for i, (dim, state) in enumerate(zip(config.hidden_dims, hidden)):
        pred_linear = Linear(input_dim=dim, output_dim=out_dims[0],
                             name='pred_linear_%d' % i)
        bricks.append(pred_linear)
        out_parts.append(pred_linear.apply(detach(state)))

    # Do prediction and calculate cost
    out = sum(out_parts)

    if len(out_dims) > 1:
        out = config.out_hidden_act[0](name='out_act0').apply(out)
        mlp_activations = [
            act(name='out_act%d' % i)
            for i, act in enumerate(config.out_hidden_act[1:])
        ] + [Identity()]
        mlp = MLP(dims=out_dims, activations=mlp_activations,
                  name='out_mlp')
        bricks.append(mlp)
        flat_out = out.reshape((inp.shape[0] * (inp.shape[1] + 1), -1))
        out = mlp.apply(flat_out).reshape(
            (inp.shape[0], inp.shape[1] + 1, -1))

    pred = out.argmax(axis=2)

    # The last output step has no target in ``inp`` and is left out.
    scored = out[:, :-1, :].reshape(
        (inp.shape[0] * inp.shape[1], config.io_dim))
    cost = Softmax().categorical_cross_entropy(inp.flatten(), scored).mean()
    error_rate = tensor.neq(inp.flatten(), pred[:, :-1].flatten()).mean()

    sgd_cost = cost + sum(costs)

    # Initialize all bricks
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()

    # apply noise
    cg = ComputationGraph([sgd_cost, cost, error_rate] + costs)
    if config.weight_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.weight_noise)
    sgd_cost, cost, error_rate = cg.outputs[:3]
    costs = cg.outputs[3:]

    # put stuff into self that is useful for training or extensions
    self.sgd_cost = sgd_cost

    sgd_cost.name = 'sgd_cost'
    for i, layer_cost in enumerate(costs):
        layer_cost.name = 'pred_cost_%d' % i
    cost.name = 'cost'
    error_rate.name = 'error_rate'
    self.monitor_vars = [costs, [cost], [error_rate]]

    self.out = out[:, 1:, :]
    self.pred = pred[:, 1:]

    self.states = states
class FRNNEmitter(AbstractEmitter, Initializable, Random):
    """An RNN emitter for the case of real outputs.

    A frame of ``frame_size`` values is produced chunk by chunk, each chunk
    holding ``frnn_step_size`` values.  For every chunk a small recurrent
    state parametrises a ``k``-component mixture (means, scales and mixture
    weights); the state is then advanced with the chunk that was just
    produced (``emit``) or with the matching chunk of the target
    (``cost``).

    Parameters
    ----------
    mlp : brick
        Applied to the readouts; its ``output_dim`` is the input dimension
        of the initial-state projection.
    target_size : int
        Size of the last axis the stacked mixture parameters are reshaped
        to before being handed to ``FRNN_NLL``.
    frame_size : int
        Number of values in one emitted frame.
    k : int
        Number of mixture components.
    frnn_hidden_size : int
        Dimension of the recurrent state.
    frnn_step_size : int
        Number of frame values produced per recurrent step.
    const : float
        Constant added to the scales and to the mixture weights.
    """

    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size,
                 frnn_step_size, const=1e-5, **kwargs):
        super(FRNNEmitter, self).__init__(**kwargs)
        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim
        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size
        if self.last_steps != 0:
            self.number_of_steps += 1

        self.mu = MLP(activations=[Identity()],
                      dims=[frnn_hidden_size, k * frnn_step_size],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[frnn_hidden_size, k * frnn_step_size],
                         name=self.name + "_sigma")
        self.coeff = MLP(activations=[Identity()],
                         dims=[frnn_hidden_size, k],
                         name=self.name + "_coeff")
        self.coeff2 = NDimensionalSoftmax()

        self.frnn_initial_state = Linear(
            input_dim=self.input_dim,
            output_dim=frnn_hidden_size,
            name="frnn_initial_state")

        self.frnn_activation = Tanh(name="frnn_activation")

        self.frnn_linear_transition_state = Linear(
            input_dim=frnn_hidden_size,
            output_dim=frnn_hidden_size,
            name="frnn_linear_transition_state")

        self.frnn_linear_transition_input = Linear(
            input_dim=self.frnn_step_size,
            output_dim=frnn_hidden_size,
            name="frnn_linear_transition_input")

        self.children = [
            self.mlp,
            self.mu,
            self.sigma,
            self.coeff,
            self.coeff2,
            self.frnn_initial_state,
            self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input,
        ]

    def _mixture_parameters(self, state):
        """Return the raw (mu, sigma, coeff) predicted from ``state``."""
        mu = self.mu.apply(state)
        sigma = self.sigma.apply(state) + self.const
        coeff = self.coeff2.apply(self.coeff.apply(state),
                                  extra_ndim=state.ndim - 2) + self.const
        return mu, sigma, coeff

    def _next_state(self, state, chunk):
        """Advance the recurrent state with one chunk of frame values."""
        return self.frnn_activation.apply(
            self.frnn_linear_transition_state.apply(state) +
            self.frnn_linear_transition_input.apply(chunk))

    def _crop_to_frame(self, params):
        """Keep the first ``frame_size`` entries of the next-to-last axis."""
        leading = [slice(0, None)] * (params.ndim - 2)
        return params[tuple(leading + [slice(0, self.frame_size),
                                       slice(0, None)])]

    @application
    def emit(self, readouts):
        """Sample one frame for every readout.

        Returns a variable whose last axis has ``frame_size`` entries, laid
        out chunk after chunk (chunk ``i`` occupies positions
        ``i * frnn_step_size`` onwards).
        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))
        step = self.frnn_step_size

        results = []
        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            mu, sigma, coeff = self._mixture_parameters(state)

            # Shape of one emitted chunk: that of coeff with the last axis
            # replaced by the step size.
            shape_result = tensor.set_subtensor(coeff.shape[-1], step)
            ndim_result = coeff.ndim

            mu = mu.reshape((-1, step, self.k))
            sigma = sigma.reshape((-1, step, self.k))
            coeff = coeff.reshape((-1, self.k))

            sample_coeff = self.theano_rng.multinomial(pvals=coeff,
                                                       dtype=coeff.dtype)
            idx = predict(sample_coeff, axis=-1)
            # idx = predict(coeff, axis=-1) would pick the most likely
            # component instead of a sampled one.

            # Keep only the parameters of the selected component.
            mu = mu[tensor.arange(mu.shape[0]), :, idx]
            sigma = sigma[tensor.arange(sigma.shape[0]), :, idx]

            epsilon = self.theano_rng.normal(size=mu.shape, avg=0.0,
                                             std=1.0, dtype=mu.dtype)

            result = mu + sigma * epsilon
            result = result.reshape(shape_result, ndim=ndim_result)
            results.append(result)

            # The state after the last chunk is never used.
            if not last_iteration:
                state = self._next_state(state, result)

        # Stack as (..., number_of_steps, step) so that flattening yields a
        # chunk-major frame.  With axis=-1 the layout was step-major, which
        # neither matches the contiguous chunks fed to the recurrence nor
        # lets the truncation below drop the padding of the last chunk.
        results = tensor.stack(results, axis=-2)
        results = tensor.flatten(results, outdim=results.ndim - 1)

        # truncate if not good size
        if self.last_steps != 0:
            leading = [slice(0, None)] * (results.ndim - 1)
            results = results[tuple(leading + [slice(0, self.frame_size)])]

        return results

    @application
    def cost(self, readouts, outputs):
        """Return ``FRNN_NLL`` of ``outputs`` under the predicted mixtures.

        The recurrence is teacher-forced: after chunk ``i`` has been
        parametrised, the state is advanced with chunk ``i`` of
        ``outputs``.
        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))
        step = self.frnn_step_size
        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            freq_mu, freq_sigma, freq_coeff = self._mixture_parameters(state)

            # mu, sigma: shape (-1, step, k); coeff: shape (-1, k)
            mus.append(freq_mu.reshape((-1, step, self.k)))
            sigmas.append(freq_sigma.reshape((-1, step, self.k)))
            coeffs.append(freq_coeff.reshape((-1, self.k)))

            if not last_iteration:
                # Feed chunk i of the target.  The offset used to be the
                # constant ``frnn_step_size``, which fed the second chunk
                # at every step.
                index = i * step
                leading = [slice(0, None)] * (inputs.ndim - 1)
                freq_inputs = inputs[
                    tuple(leading + [slice(index, index + step)])]
                state = self._next_state(state, freq_inputs)

        # Stack mus/sigmas as (-1, number_of_steps, step, k) so that the
        # reshape below is chunk-major, the same layout ``repeat`` gives
        # the mixture weights.  Stacking on axis=-2 interleaved the chunks
        # and paired every position with the wrong weights.
        mus = tensor.stack(mus, axis=-3)
        sigmas = tensor.stack(sigmas, axis=-3)
        coeffs = tensor.stack(coeffs, axis=-2)

        mus = mus.reshape((-1, step * self.number_of_steps, self.k))
        sigmas = sigmas.reshape((-1, step * self.number_of_steps, self.k))
        coeffs = coeffs.repeat(step, axis=-2)

        # Drop the padding positions of an incomplete last chunk.
        mus = self._crop_to_frame(mus)
        sigmas = self._crop_to_frame(sigmas)
        coeffs = self._crop_to_frame(coeffs)

        mu = mus.reshape((-1, self.target_size))
        sigma = sigmas.reshape((-1, self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL(y=outputs, mu=mu, sig=sigma, coeff=coeff,
                        frame_size=self.frame_size, k=self.k)

    @application
    def initial_outputs(self, batch_size):
        """Return an all-zero ``(batch_size, frame_size)`` frame."""
        return tensor.zeros((batch_size, self.frame_size), dtype=floatX)

    def get_dim(self, name):
        # modification here to ensure the right dim.
        if name == "outputs":
            return self.frame_size
        return super(FRNNEmitter, self).get_dim(name)
class Decoder(Initializable):
    """Sequence decoder conditioned on a fixed representation.

    The representation is forked into the initial state and the contexts of
    a ``GatedRecurrentWithContext`` transition, which drives a
    ``SequenceGenerator`` with a softmax readout over ``vocab_size``
    symbols.

    Parameters
    ----------
    vocab_size : int
        Readout dimension, also passed to the feedback lookup.
    embedding_dim : int
        Dimension passed to the feedback lookup.
    state_dim : int
        Dimension of the recurrent transition and of the merged readout
        sources.
    representation_dim : int
        Dimension of the representation the decoder is conditioned on.
    """

    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        # The merged sources go through Bias -> Maxout(2) -> Linear, so the
        # first Linear must take half the merged dimension.  The merged
        # dimension used to be hard-coded to 1000, which only matched
        # ``state_dim / 2`` when state_dim == 1000; it now follows
        # state_dim.  ``//`` keeps the size an int under Python 3 as well.
        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=state_dim).apply,
                 Maxout(num_pieces=2).apply,
                 Linear(input_dim=state_dim // 2, output_dim=100,
                        use_bias=False).apply,
                 Linear(input_dim=100).apply]),
            merged_dim=state_dim)

        self.transition = GatedRecurrentWithContext(Tanh(), dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork(
            [name for name in (self.transition.apply.contexts +
                               self.transition.apply.states)
             if name != 'readout_context'],
            prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            fork_inputs=[name for name in self.transition.apply.sequences
                         if name != 'mask'],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]

    def _push_allocation_config(self):
        self.fork.input_dim = self.representation_dim
        self.fork.output_dims = [self.state_dim
                                 for _ in self.fork.output_names]

    @application(inputs=['representation', 'target_sentence_mask',
                         'target_sentence'],
                 outputs=['cost'])
    def cost(self, representation, target_sentence, target_sentence_mask):
        """Return the masked generation cost divided by the batch size.

        ``target_sentence`` and ``target_sentence_mask`` are transposed
        before use; the divisor is the second axis of the transposed mask.
        """
        target_sentence = target_sentence.dimshuffle(1, 0)
        target_sentence_mask = target_sentence_mask.T

        # The initial state and contexts, all functions of the
        # representation.  Contexts get a broadcastable leading axis, the
        # states are used as they are.
        forked = self.fork.apply(representation, as_dict=True)
        contexts = {
            key: (value.dimshuffle('x', 0, 1)
                  if key not in self.transition.apply.states else value)
            for key, value in forked.items()
        }
        contexts['states'] = self.tanh.apply(contexts['states'])
        cost = self.sequence_generator.cost(**merge(
            contexts,
            {'mask': target_sentence_mask,
             'outputs': target_sentence,
             'readout_context': representation.dimshuffle('x', 0, 1)}
        ))

        return ((cost * target_sentence_mask).sum() /
                target_sentence_mask.shape[1])
class FRNNEmitter(AbstractEmitter, Initializable, Random):
    """An RNN emitter for the case of real outputs.

    A frame of ``frame_size`` values is produced chunk by chunk, each chunk
    holding ``frnn_step_size`` values.  For every chunk a small recurrent
    state parametrises a ``k``-component mixture (means, scales and mixture
    weights); the state is then advanced with the chunk that was just
    produced (``emit``) or with the matching chunk of the target
    (``cost``).

    Parameters
    ----------
    mlp : brick
        Applied to the readouts; its ``output_dim`` is the input dimension
        of the initial-state projection.
    target_size : int
        Size of the last axis the stacked mixture parameters are reshaped
        to before being handed to ``FRNN_NLL``.
    frame_size : int
        Number of values in one emitted frame.
    k : int
        Number of mixture components.
    frnn_hidden_size : int
        Dimension of the recurrent state.
    frnn_step_size : int
        Number of frame values produced per recurrent step.
    const : float
        Constant added to the scales and to the mixture weights.
    """

    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size,
                 frnn_step_size, const=1e-5, **kwargs):
        super(FRNNEmitter, self).__init__(**kwargs)
        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim
        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size
        if self.last_steps != 0:
            self.number_of_steps += 1

        self.mu = MLP(activations=[Identity()],
                      dims=[frnn_hidden_size, k * frnn_step_size],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[frnn_hidden_size, k * frnn_step_size],
                         name=self.name + "_sigma")
        self.coeff = MLP(activations=[Identity()],
                         dims=[frnn_hidden_size, k],
                         name=self.name + "_coeff")
        self.coeff2 = NDimensionalSoftmax()

        self.frnn_initial_state = Linear(
            input_dim=self.input_dim,
            output_dim=frnn_hidden_size,
            name="frnn_initial_state")

        self.frnn_activation = Tanh(name="frnn_activation")

        self.frnn_linear_transition_state = Linear(
            input_dim=frnn_hidden_size,
            output_dim=frnn_hidden_size,
            name="frnn_linear_transition_state")

        self.frnn_linear_transition_input = Linear(
            input_dim=self.frnn_step_size,
            output_dim=frnn_hidden_size,
            name="frnn_linear_transition_input")

        self.children = [
            self.mlp,
            self.mu,
            self.sigma,
            self.coeff,
            self.coeff2,
            self.frnn_initial_state,
            self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input,
        ]

    def _mixture_parameters(self, state):
        """Return the raw (mu, sigma, coeff) predicted from ``state``."""
        mu = self.mu.apply(state)
        sigma = self.sigma.apply(state) + self.const
        coeff = self.coeff2.apply(self.coeff.apply(state),
                                  extra_ndim=state.ndim - 2) + self.const
        return mu, sigma, coeff

    def _next_state(self, state, chunk):
        """Advance the recurrent state with one chunk of frame values."""
        return self.frnn_activation.apply(
            self.frnn_linear_transition_state.apply(state) +
            self.frnn_linear_transition_input.apply(chunk))

    def _crop_to_frame(self, params):
        """Keep the first ``frame_size`` entries of the next-to-last axis."""
        leading = [slice(0, None)] * (params.ndim - 2)
        return params[tuple(leading + [slice(0, self.frame_size),
                                       slice(0, None)])]

    @application
    def emit(self, readouts):
        """Sample one frame for every readout.

        Returns a variable whose last axis has ``frame_size`` entries, laid
        out chunk after chunk (chunk ``i`` occupies positions
        ``i * frnn_step_size`` onwards).
        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))
        step = self.frnn_step_size

        results = []
        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            mu, sigma, coeff = self._mixture_parameters(state)

            # Shape of one emitted chunk: that of coeff with the last axis
            # replaced by the step size.
            shape_result = tensor.set_subtensor(coeff.shape[-1], step)
            ndim_result = coeff.ndim

            mu = mu.reshape((-1, step, self.k))
            sigma = sigma.reshape((-1, step, self.k))
            coeff = coeff.reshape((-1, self.k))

            sample_coeff = self.theano_rng.multinomial(pvals=coeff,
                                                       dtype=coeff.dtype)
            idx = predict(sample_coeff, axis=-1)
            # idx = predict(coeff, axis=-1) would pick the most likely
            # component instead of a sampled one.

            # Keep only the parameters of the selected component.
            mu = mu[tensor.arange(mu.shape[0]), :, idx]
            sigma = sigma[tensor.arange(sigma.shape[0]), :, idx]

            epsilon = self.theano_rng.normal(size=mu.shape, avg=0.0,
                                             std=1.0, dtype=mu.dtype)

            result = mu + sigma * epsilon
            result = result.reshape(shape_result, ndim=ndim_result)
            results.append(result)

            # The state after the last chunk is never used.
            if not last_iteration:
                state = self._next_state(state, result)

        # Stack as (..., number_of_steps, step) so that flattening yields a
        # chunk-major frame.  With axis=-1 the layout was step-major, which
        # neither matches the contiguous chunks fed to the recurrence nor
        # lets the truncation below drop the padding of the last chunk.
        results = tensor.stack(results, axis=-2)
        results = tensor.flatten(results, outdim=results.ndim - 1)

        # truncate if not good size
        if self.last_steps != 0:
            leading = [slice(0, None)] * (results.ndim - 1)
            results = results[tuple(leading + [slice(0, self.frame_size)])]

        return results

    @application
    def cost(self, readouts, outputs):
        """Return ``FRNN_NLL`` of ``outputs`` under the predicted mixtures.

        The recurrence is teacher-forced: after chunk ``i`` has been
        parametrised, the state is advanced with chunk ``i`` of
        ``outputs``.
        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))
        step = self.frnn_step_size
        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            freq_mu, freq_sigma, freq_coeff = self._mixture_parameters(state)

            # mu, sigma: shape (-1, step, k); coeff: shape (-1, k)
            mus.append(freq_mu.reshape((-1, step, self.k)))
            sigmas.append(freq_sigma.reshape((-1, step, self.k)))
            coeffs.append(freq_coeff.reshape((-1, self.k)))

            if not last_iteration:
                # Feed chunk i of the target.  The offset used to be the
                # constant ``frnn_step_size``, which fed the second chunk
                # at every step.
                index = i * step
                leading = [slice(0, None)] * (inputs.ndim - 1)
                freq_inputs = inputs[
                    tuple(leading + [slice(index, index + step)])]
                state = self._next_state(state, freq_inputs)

        # Stack mus/sigmas as (-1, number_of_steps, step, k) so that the
        # reshape below is chunk-major, the same layout ``repeat`` gives
        # the mixture weights.  Stacking on axis=-2 interleaved the chunks
        # and paired every position with the wrong weights.
        mus = tensor.stack(mus, axis=-3)
        sigmas = tensor.stack(sigmas, axis=-3)
        coeffs = tensor.stack(coeffs, axis=-2)

        mus = mus.reshape((-1, step * self.number_of_steps, self.k))
        sigmas = sigmas.reshape((-1, step * self.number_of_steps, self.k))
        coeffs = coeffs.repeat(step, axis=-2)

        # Drop the padding positions of an incomplete last chunk.
        mus = self._crop_to_frame(mus)
        sigmas = self._crop_to_frame(sigmas)
        coeffs = self._crop_to_frame(coeffs)

        mu = mus.reshape((-1, self.target_size))
        sigma = sigmas.reshape((-1, self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL(y=outputs, mu=mu, sig=sigma, coeff=coeff,
                        frame_size=self.frame_size, k=self.k)

    @application
    def initial_outputs(self, batch_size):
        """Return an all-zero ``(batch_size, frame_size)`` frame."""
        return tensor.zeros((batch_size, self.frame_size), dtype=floatX)

    def get_dim(self, name):
        # modification here to ensure the right dim.
        if name == 'outputs':
            return self.frame_size
        return super(FRNNEmitter, self).get_dim(name)
def __init__(self, config, vocab_size):
    """Build the symbolic graph of the span-scoring QA model.

    The question and the context are embedded with a shared lookup table
    and encoded by two bidirectional LSTM stacks.  Two attention mechanisms
    (one per half of the context encoding) summarise the context given the
    question.  Every context window of 1 to 3 tokens is then scored
    against that summary and trained towards its normalised F1 overlap
    with the answer.

    Attributes set on ``self``: ``predictions``, ``sgd_cost``,
    ``monitor_vars`` and ``monitor_vars_valid``.

    Parameters
    ----------
    config : object
        Supplies the sizes, activations, initialisation schemes and
        regularisation strengths read below.
    vocab_size : int
        Size of the lookup table and of the bag-of-words vectors.
    """
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.imatrix('answer')
    answer_mask = tensor.imatrix('answer_mask')

    bricks = []

    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    answer_mask = answer_mask.dimshuffle(1, 0)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size,
                        name='question_embed')
    embed.weights_init = IsotropicGaussian(0.01)

    # Calculate question encoding (concatenate layer1)
    qembed = embed.apply(question)
    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size,
        question_mask.astype(theano.config.floatX),
        config.question_lstm_size, config.question_skip_connections, 'q')
    bricks = bricks + qlstms
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                  axis=1)
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                  axis=1)
    qenc.name = 'qenc'

    # Calculate context encoding (concatenate layer1)
    cembed = embed.apply(context)
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size,
        context_mask.astype(theano.config.floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + clstms
    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2 : fw & bw
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        # Size of the last *context* layer.  This used to read
        # ``config.question_lstm_size``, which is only right when both
        # stacks end with the same size.
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'

    # Integer division: the value is used as a brick dimension and as a
    # slice bound, so it must stay an int under Python 3 as well.
    half_cenc_dim = cenc_dim // 2

    # Attention mechanism MLP fwd
    attention_mlp_fwd = MLP(
        dims=config.attention_mlp_hidden + [1],
        activations=config.attention_mlp_activations[1:] + [Identity()],
        name='attention_mlp_fwd')
    attention_qlinear_fwd = Linear(
        input_dim=qenc_dim,
        output_dim=config.attention_mlp_hidden[0],
        name='attq_fwd')
    attention_clinear_fwd = Linear(
        input_dim=half_cenc_dim,
        output_dim=config.attention_mlp_hidden[0],
        use_bias=False,
        name='attc_fwd')
    bricks += [
        attention_mlp_fwd, attention_qlinear_fwd, attention_clinear_fwd
    ]
    layer1_fwd = Tanh(name='tanh_fwd')
    layer1_fwd = layer1_fwd.apply(
        attention_clinear_fwd.apply(
            cenc[:, :, :half_cenc_dim].reshape(
                (cenc.shape[0] * cenc.shape[1], cenc.shape[2] // 2))
        ).reshape((cenc.shape[0], cenc.shape[1],
                   config.attention_mlp_hidden[0])) +
        attention_qlinear_fwd.apply(qenc)[None, :, :])
    att_weights_fwd = attention_mlp_fwd.apply(
        layer1_fwd.reshape((layer1_fwd.shape[0] * layer1_fwd.shape[1],
                            layer1_fwd.shape[2])))
    att_weights_fwd = att_weights_fwd.reshape(
        (layer1_fwd.shape[0], layer1_fwd.shape[1]))
    # softmax normalises rows, hence the transpose before and after.
    att_weights_fwd = tensor.nnet.softmax(att_weights_fwd.T)
    att_weights_fwd.name = 'att_weights_fwd'

    attended_fwd = tensor.sum(
        cenc[:, :, :half_cenc_dim] * att_weights_fwd.T[:, :, None],
        axis=0)
    attended_fwd.name = 'attended_fwd'

    # Attention mechanism MLP bwd
    attention_mlp_bwd = MLP(
        dims=config.attention_mlp_hidden + [1],
        activations=config.attention_mlp_activations[1:] + [Identity()],
        name='attention_mlp_bwd')
    attention_qlinear_bwd = Linear(
        input_dim=qenc_dim,
        output_dim=config.attention_mlp_hidden[0],
        name='attq_bwd')
    attention_clinear_bwd = Linear(
        input_dim=half_cenc_dim,
        output_dim=config.attention_mlp_hidden[0],
        use_bias=False,
        name='attc_bwd')
    bricks += [
        attention_mlp_bwd, attention_qlinear_bwd, attention_clinear_bwd
    ]
    layer1_bwd = Tanh(name='tanh_bwd')
    layer1_bwd = layer1_bwd.apply(
        attention_clinear_bwd.apply(
            cenc[:, :, half_cenc_dim:].reshape(
                (cenc.shape[0] * cenc.shape[1], cenc.shape[2] // 2))
        ).reshape((cenc.shape[0], cenc.shape[1],
                   config.attention_mlp_hidden[0])) +
        attention_qlinear_bwd.apply(qenc)[None, :, :])
    att_weights_bwd = attention_mlp_bwd.apply(
        layer1_bwd.reshape((layer1_bwd.shape[0] * layer1_bwd.shape[1],
                            layer1_bwd.shape[2])))
    att_weights_bwd = att_weights_bwd.reshape(
        (layer1_bwd.shape[0], layer1_bwd.shape[1]))
    att_weights_bwd = tensor.nnet.softmax(att_weights_bwd.T)
    att_weights_bwd.name = 'att_weights_bwd'

    attended_bwd = tensor.sum(
        cenc[:, :, half_cenc_dim:] * att_weights_bwd.T[:, :, None],
        axis=0)
    attended_bwd.name = 'attended_bwd'

    # Last axis: half_cenc_dim + half_cenc_dim + qenc_dim.
    ctx_question = tensor.concatenate([attended_fwd, attended_bwd, qenc],
                                      axis=1)
    ctx_question.name = 'ctx_question'

    # Bag of words of the answer; the first three vocabulary ids are
    # zeroed out (presumably reserved tokens -- TODO confirm).
    answer_bag = to_bag(answer, vocab_size)
    answer_bag = tensor.set_subtensor(answer_bag[:, 0:3], 0)
    relevant_items = answer_bag.sum(axis=1, dtype=theano.config.floatX)

    def createSequences(j, index, c_enc, c_enc_dim, c_context,
                        c_window_size):
        """Return the encoding and zero-padded tokens of one window.

        The window starts at position ``j`` and holds ``index`` tokens.
        Its encoding concatenates the context encoding at the window's last
        position, the encoding at its first position without the final
        feature, and one column filled with ``c_window_size``.
        """
        sequence = tensor.concatenate([
            c_context[j:j + index, :],
            tensor.zeros((c_window_size - index, c_context.shape[1]))
        ], axis=0)
        enc = tensor.concatenate([
            c_enc[j + index - 1, :, :],
            c_enc[j, :, :-1],
            tensor.tile(c_window_size[None, None], (c_enc.shape[1], 1))
        ], axis=1)
        return enc, sequence

    def createTargetValues(j, index, c_context, c_vocab_size):
        """Return the F1 overlap between one window and the answer bag."""
        sequence_bag = to_bag(c_context[j:j + index, :], c_vocab_size)
        sequence_bag = tensor.set_subtensor(sequence_bag[:, 0:3], 0)
        selected_items = sequence_bag.sum(axis=1,
                                          dtype=theano.config.floatX)
        tp = (sequence_bag * answer_bag).sum(axis=1,
                                             dtype=theano.config.floatX)
        # The small constants guard against division by zero.
        precision = tp / (selected_items + 0.00001)
        recall = tp / (relevant_items + 0.00001)
        macroF1 = (2 * (precision * recall)) / (precision + recall +
                                                0.00001)
        return macroF1

    window_size = 3
    senc = []
    sequences = []
    pred_targets = []
    for i in range(1, window_size + 1):
        (all_enc, all_sequence), _ = theano.scan(
            fn=createSequences,
            sequences=tensor.arange(cenc.shape[0] - i + 1),
            non_sequences=[i, cenc, cenc_dim, context, window_size])
        (all_macroF1), _ = theano.scan(
            fn=createTargetValues,
            sequences=tensor.arange(cenc.shape[0] - i + 1),
            non_sequences=[i, context, vocab_size])
        senc.append(all_enc)
        sequences.append(all_sequence)
        pred_targets.append(all_macroF1)

    senc = tensor.concatenate(senc, axis=0)
    sequences = tensor.concatenate(sequences, axis=0)
    pred_targets = tensor.concatenate(pred_targets, axis=0)

    # F1 prediction Bilinear
    prediction_linear = Linear(input_dim=2 * cenc_dim,
                               output_dim=cenc_dim + qenc_dim,
                               name='pred_linear')
    bricks += [prediction_linear]
    # The projection has cenc_dim + qenc_dim features (the width of
    # ctx_question).  The reshape used to restore senc.shape[2], i.e.
    # 2 * cenc_dim, which is only valid when qenc_dim == cenc_dim.
    pred_weights = ctx_question[None, :, :] * prediction_linear.apply(
        senc.reshape((senc.shape[0] * senc.shape[1], senc.shape[2]))
    ).reshape((senc.shape[0], senc.shape[1], cenc_dim + qenc_dim))
    pred_weights = pred_weights.sum(axis=2)
    pred_weights = tensor.nnet.sigmoid(pred_weights.T).T
    pred_weights.name = 'pred_weights'

    # Normalise targets and scores over the candidate windows.
    pred_targets = pred_targets / (pred_targets.sum(axis=0) + 0.00001)
    pred_weights = pred_weights / (pred_weights.sum(axis=0) + 0.00001)

    cost = tensor.nnet.binary_crossentropy(pred_weights,
                                           pred_targets).mean()
    # Tokens of the best-scoring window of every batch element.
    self.predictions = sequences[pred_weights.argmax(axis=0), :,
                                 tensor.arange(sequences.shape[2])].T

    # Apply dropout
    cg = ComputationGraph([cost])

    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
class Decoder(Initializable):
    """Sequence decoder conditioned on a fixed representation.

    The representation is forked into the initial state and the contexts of
    a ``GatedRecurrentWithContext`` transition, which drives a
    ``SequenceGenerator`` with a softmax readout over ``vocab_size``
    symbols.

    Parameters
    ----------
    vocab_size : int
        Readout dimension, also passed to the feedback lookup.
    embedding_dim : int
        Dimension passed to the feedback lookup.
    state_dim : int
        Dimension of the recurrent transition and of the merged readout
        sources.
    representation_dim : int
        Dimension of the representation the decoder is conditioned on.
    """

    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        # The merged sources go through Bias -> Maxout(2) -> Linear, so the
        # first Linear must take half the merged dimension.  The merged
        # dimension used to be hard-coded to 1000, which only matched
        # ``state_dim / 2`` when state_dim == 1000; it now follows
        # state_dim.  ``//`` keeps the size an int under Python 3 as well.
        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence([
                Bias(dim=state_dim).apply,
                Maxout(num_pieces=2).apply,
                Linear(input_dim=state_dim // 2, output_dim=100,
                       use_bias=False).apply,
                Linear(input_dim=100).apply
            ]),
            merged_dim=state_dim)

        self.transition = GatedRecurrentWithContext(Tanh(), dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork([
            name for name in (self.transition.apply.contexts +
                              self.transition.apply.states)
            if name != 'readout_context'
        ], prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            fork_inputs=[
                name for name in self.transition.apply.sequences
                if name != 'mask'
            ],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]

    def _push_allocation_config(self):
        self.fork.input_dim = self.representation_dim
        self.fork.output_dims = [
            self.state_dim for _ in self.fork.output_names
        ]

    @application(
        inputs=['representation', 'target_sentence_mask', 'target_sentence'],
        outputs=['cost'])
    def cost(self, representation, target_sentence, target_sentence_mask):
        """Return the masked generation cost divided by the batch size.

        ``target_sentence`` and ``target_sentence_mask`` are transposed
        before use; the divisor is the second axis of the transposed mask.
        """
        target_sentence = target_sentence.dimshuffle(1, 0)
        target_sentence_mask = target_sentence_mask.T

        # The initial state and contexts, all functions of the
        # representation.  Contexts get a broadcastable leading axis, the
        # states are used as they are.
        forked = self.fork.apply(representation, as_dict=True)
        contexts = {
            key: (value.dimshuffle('x', 0, 1)
                  if key not in self.transition.apply.states else value)
            for key, value in forked.items()
        }
        contexts['states'] = self.tanh.apply(contexts['states'])
        cost = self.sequence_generator.cost(**merge(
            contexts, {
                'mask': target_sentence_mask,
                'outputs': target_sentence,
                'readout_context': representation.dimshuffle('x', 0, 1)
            }))

        return ((cost * target_sentence_mask).sum() /
                target_sentence_mask.shape[1])
def __init__(self, config, vocab_size):
    """Build the symbolic cost graph of the attention model and initialize it.

    Parameters
    ----------
    config : object
        Reads ``embed_size``, ``question_lstm_size``,
        ``question_skip_connections``, ``ctx_lstm_size``,
        ``ctx_skip_connections``, ``attention_mlp_hidden``,
        ``attention_mlp_activations``, ``w_noise``, ``dropout``,
        ``weights_init`` and ``biases_init``.
    vocab_size : int
        Number of rows of the shared embedding lookup table.

    Sets ``self.predictions``, ``self.sgd_cost``, ``self.monitor_vars``,
    ``self.monitor_vars_valid`` and ``self.analyse_vars``.
    """
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.imatrix('answer')
    # NOTE: answer_mask is declared and transposed but not used below.
    answer_mask = tensor.imatrix('answer_mask')

    bricks = []

    # Swap the two axes of every input (presumably batch-major ->
    # time-major; confirm against the data stream).
    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    answer_mask = answer_mask.dimshuffle(1, 0)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    embed.weights_init = IsotropicGaussian(0.01)

    # Calculate question encoding (concatenate layer1)
    qembed = embed.apply(question)
    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size,
        question_mask.astype(theano.config.floatX),
        config.question_lstm_size, config.question_skip_connections, 'q')
    bricks = bricks + qlstms
    # Only the last entry along the first axis of each hidden sequence is
    # kept for the question encoding.
    if config.question_skip_connections:
        qenc_dim = 2*sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                  axis=1)
    else:
        qenc_dim = 2*config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                  axis=1)
    qenc.name = 'qenc'

    # Calculate context encoding (concatenate layer1)
    cembed = embed.apply(context)
    # The question encoding is repeated along the first axis of the
    # context embedding and appended to every context position.
    cqembed = tensor.concatenate(
        [cembed,
         tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0)],
        axis=2)
    clstms, chidden_list = make_bidir_lstm_stack(
        cqembed, config.embed_size + qenc_dim,
        context_mask.astype(theano.config.floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + clstms
    if config.ctx_skip_connections:
        cenc_dim = 2*sum(config.ctx_lstm_size)  # 2 : fw & bw
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        # The context encoding comes from the context stack, so its width
        # must be derived from ctx_lstm_size (not question_lstm_size).
        cenc_dim = 2*config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'

    # Attention mechanism MLP start
    attention_mlp_start = MLP(
        dims=config.attention_mlp_hidden + [1],
        activations=config.attention_mlp_activations[1:] + [Identity()],
        name='attention_mlp_start')
    attention_qlinear_start = Linear(
        input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0],
        name='attq_start')  # Wum
    attention_clinear_start = Linear(
        input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0],
        use_bias=False, name='attc_start')  # Wym
    bricks += [attention_mlp_start, attention_qlinear_start,
               attention_clinear_start]
    layer1_start = Tanh(name='layer1_start')
    # The Linear brick is applied to a 2-D view of cenc (first two axes
    # merged), reshaped back, then the projected question encoding is
    # added to every position through a new leading axis.
    layer1_start = layer1_start.apply(
        attention_clinear_start.apply(
            cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
        .reshape((cenc.shape[0], cenc.shape[1],
                  config.attention_mlp_hidden[0]))
        + attention_qlinear_start.apply(qenc)[None, :, :])
    att_weights_start = attention_mlp_start.apply(
        layer1_start.reshape((layer1_start.shape[0]*layer1_start.shape[1],
                              layer1_start.shape[2])))
    att_weights_start = att_weights_start.reshape(
        (layer1_start.shape[0], layer1_start.shape[1]))
    # Softmax on the transpose, i.e. normalized along the first axis.
    att_weights_start = tensor.nnet.softmax(att_weights_start.T).T

    attended = tensor.sum(cenc * att_weights_start[:, :, None], axis=0)
    attended.name = 'attended'

    # Attention mechanism MLP end
    attention_mlp_end = MLP(
        dims=config.attention_mlp_hidden + [1],
        activations=config.attention_mlp_activations[1:] + [Identity()],
        name='attention_mlp_end')
    # This brick is applied to `attended`, a weighted sum of cenc, so its
    # input width is cenc_dim (it was declared with qenc_dim, which only
    # worked when both widths coincided).
    attention_qlinear_end = Linear(
        input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0],
        name='attq_end')  # Wum
    attention_clinear_end = Linear(
        input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0],
        use_bias=False, name='attc_end')  # Wym
    bricks += [attention_mlp_end, attention_qlinear_end,
               attention_clinear_end]
    layer1_end = Tanh(name='layer1_end')
    layer1_end = layer1_end.apply(
        attention_clinear_end.apply(
            cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
        .reshape((cenc.shape[0], cenc.shape[1],
                  config.attention_mlp_hidden[0]))
        + attention_qlinear_end.apply(attended)[None, :, :])
    att_weights_end = attention_mlp_end.apply(
        layer1_end.reshape((layer1_end.shape[0]*layer1_end.shape[1],
                            layer1_end.shape[2])))
    att_weights_end = att_weights_end.reshape(
        (layer1_end.shape[0], layer1_end.shape[1]))
    att_weights_end = tensor.nnet.softmax(att_weights_end.T).T

    # Entry (i, j) of the first matrix is (j <= i), so the product is a
    # running sum of the start weights along the first axis; the second
    # uses (j >= i), giving the sum of the end weights from i onwards.
    att_weights_start = tensor.dot(
        tensor.le(
            tensor.tile(theano.tensor.arange(context.shape[0])[None, :],
                        (context.shape[0], 1)),
            tensor.tile(theano.tensor.arange(context.shape[0])[:, None],
                        (1, context.shape[0]))),
        att_weights_start)
    att_weights_end = tensor.dot(
        tensor.ge(
            tensor.tile(theano.tensor.arange(context.shape[0])[None, :],
                        (context.shape[0], 1)),
            tensor.tile(theano.tensor.arange(context.shape[0])[:, None],
                        (1, context.shape[0]))),
        att_weights_end)

    # add attention from left and right
    #att_weights = att_weights_start * att_weights_end
    att_weights = tensor.minimum(att_weights_start, att_weights_end)

    # A context position is a target when its token equals any token of
    # the answer (count of matches clipped to 0/1).
    att_target = tensor.eq(
        tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)),
        tensor.tile(context[:, None, :], (1, answer.shape[0], 1))
    ).sum(axis=1).clip(0, 1)

    self.predictions = tensor.gt(att_weights, 0.5) * context

    # Normalize along the first axis; the epsilon avoids division by zero.
    att_target = att_target / (att_target.sum(axis=0) + 0.00001)
    att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001)

    cost = (tensor.nnet.binary_crossentropy(att_weights, att_target)
            * context_mask).sum() / context_mask.sum()

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'
    att_weights_start.name = 'att_weights_start'
    att_weights_end.name = 'att_weights_end'
    att_target.name = 'att_target'
    att_weights.name = 'att_weights'
    self.predictions.name = 'pred'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]
    self.analyse_vars = [cost, self.predictions, att_weights_start,
                         att_weights_end, att_weights, att_target]

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
def __init__(self, config, vocab_size):
    """Assemble the symbolic graph of the match-LSTM attention model.

    Parameters
    ----------
    config : object
        Reads ``embed_size``, ``pre_lstm_size``, ``match_lstm_size``,
        ``match_skip_connections``, ``attention_mlp_hidden``,
        ``attention_mlp_activations``, ``w_noise``, ``dropout``,
        ``weights_init`` and ``biases_init``.
    vocab_size : int
        Number of rows of the embedding lookup table.

    Sets ``self.predictions``, ``self.sgd_cost``, ``self.monitor_vars``,
    ``self.monitor_vars_valid`` and ``self.analyse_vars``.
    """
    floatX = theano.config.floatX

    def merge_leading_axes(var):
        # View a 3-D variable as 2-D by folding axis 1 into axis 0.
        return var.reshape((var.shape[0] * var.shape[1], var.shape[2]))

    def softmax_over_first_axis(scores):
        # The softmax is taken on the transpose and transposed back.
        return tensor.nnet.softmax(scores.T).T

    # Symbolic inputs, each with its two axes swapped right away.
    question = tensor.imatrix('question').dimshuffle(1, 0)
    question_mask = tensor.imatrix('question_mask').dimshuffle(1, 0)
    context = tensor.imatrix('context').dimshuffle(1, 0)
    context_mask = tensor.imatrix('context_mask').dimshuffle(1, 0)
    # answer, answer_mask and ans_indices_mask are declared but do not
    # feed the graph built below.
    answer = tensor.imatrix('answer').dimshuffle(1, 0)
    answer_mask = tensor.imatrix('answer_mask').dimshuffle(1, 0)
    # Source note on 'ans_indices': n_steps * n_samples (unclear which
    # side of the transpose this describes -- confirm).
    ans_indices = tensor.imatrix('ans_indices').dimshuffle(1, 0)
    ans_indices_mask = tensor.imatrix('ans_indices_mask').dimshuffle(1, 0)

    # Embed questions and context; the table is filled from a file.
    # (Alternative kept for reference: IsotropicGaussian(0.01).)
    embed = LookupTable(vocab_size, config.embed_size, name='embed')
    embedding_values = init_embedding_table(
        filename='embeddings/vocab_embeddings.txt')
    embed.weights_init = Constant(embedding_values)

    # one directional LSTM encoding
    lstm_input_dim = 4 * config.pre_lstm_size
    q_lstm_ins = Linear(input_dim=config.embed_size,
                        output_dim=lstm_input_dim, name='q_lstm_in')
    q_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(),
                  name='q_lstm')
    c_lstm_ins = Linear(input_dim=config.embed_size,
                        output_dim=lstm_input_dim, name='c_lstm_in')
    c_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(),
                  name='c_lstm')
    bricks = [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins]

    q_tmp = q_lstm_ins.apply(embed.apply(question))
    c_tmp = c_lstm_ins.apply(embed.apply(context))
    q_hidden, _q_cells = q_lstm.apply(
        q_tmp, mask=question_mask.astype(floatX))  # lq, bs, dim
    c_hidden, _c_cells = c_lstm.apply(
        c_tmp, mask=context_mask.astype(floatX))  # lc, bs, dim

    # Attention mechanism Bilinear question
    attention_question = Linear(input_dim=config.pre_lstm_size,
                                output_dim=config.pre_lstm_size,
                                name='att_question')
    bricks.append(attention_question)
    projected_context = attention_question.apply(
        merge_leading_axes(c_hidden)).reshape(
            (c_hidden.shape[0], c_hidden.shape[1], c_hidden.shape[2]))
    pairwise = (q_hidden[None, :, :, :]
                * projected_context[:, None, :, :])  # --> lc,lq,bs,dim
    question_scores = pairwise.sum(axis=3)  # sum over dim --> lc,lq,bs
    question_scores = question_scores.dimshuffle(0, 2, 1)  # --> lc,bs,lq
    flat_scores = merge_leading_axes(question_scores)  # --> lc*bs,lq
    # softmax over axis 1 -> length of question
    flat_probs = tensor.nnet.softmax(flat_scores)  # --> lc*bs,lq
    question_probs = flat_probs.reshape(
        (c_hidden.shape[0], q_hidden.shape[1],
         q_hidden.shape[0]))  # --> lc,bs,lq
    question_probs = question_probs.dimshuffle(0, 2, 1)  # --> lc,lq,bs
    # sum over axis 1 -> length of question --> lc,bs,dim
    attended_question = tensor.sum(
        q_hidden[None, :, :, :] * question_probs[:, :, :, None], axis=1)
    attended_question.name = 'attended_question'

    # Match LSTM
    cqembed = tensor.concatenate([c_hidden, attended_question], axis=2)
    mlstms, mhidden_list = make_bidir_lstm_stack(
        cqembed, 2 * config.pre_lstm_size, context_mask.astype(floatX),
        config.match_lstm_size, config.match_skip_connections, 'match')
    bricks = bricks + mlstms
    if config.match_skip_connections:
        menc_parts = mhidden_list
        menc_dim = 2 * sum(config.match_lstm_size)
    else:
        menc_parts = mhidden_list[-2:]
        menc_dim = 2 * config.match_lstm_size[-1]
    menc = tensor.concatenate(menc_parts, axis=2)
    menc.name = 'menc'

    att_hidden_dim = config.attention_mlp_hidden[0]
    menc_flat_shape = (menc.shape[0], menc.shape[1], att_hidden_dim)

    # Attention mechanism MLP start
    attention_mlp_start = MLP(
        dims=config.attention_mlp_hidden + [1],
        activations=config.attention_mlp_activations[1:] + [Identity()],
        name='attention_mlp_start')
    attention_clinear_start = Linear(input_dim=menc_dim,
                                     output_dim=att_hidden_dim,
                                     name='attm_start')  # Wym
    bricks.extend([attention_mlp_start, attention_clinear_start])
    tanh_start = Tanh(name='layer1_start')
    layer1_start = tanh_start.apply(
        attention_clinear_start.apply(
            merge_leading_axes(menc)).reshape(menc_flat_shape))
    start_scores = attention_mlp_start.apply(
        merge_leading_axes(layer1_start))
    start_scores = start_scores.reshape(
        (layer1_start.shape[0], layer1_start.shape[1]))
    start_probs = softmax_over_first_axis(start_scores)

    attended = tensor.sum(menc * start_probs[:, :, None], axis=0)
    attended.name = 'attended'

    # Attention mechanism MLP end
    attention_mlp_end = MLP(
        dims=config.attention_mlp_hidden + [1],
        activations=config.attention_mlp_activations[1:] + [Identity()],
        name='attention_mlp_end')
    attention_qlinear_end = Linear(input_dim=menc_dim,
                                   output_dim=att_hidden_dim,
                                   name='atts_end')  # Wum
    attention_clinear_end = Linear(input_dim=menc_dim,
                                   output_dim=att_hidden_dim,
                                   use_bias=False,
                                   name='attm_end')  # Wym
    bricks.extend([attention_mlp_end, attention_qlinear_end,
                   attention_clinear_end])
    tanh_end = Tanh(name='layer1_end')
    layer1_end = tanh_end.apply(
        attention_clinear_end.apply(
            merge_leading_axes(menc)).reshape(menc_flat_shape)
        + attention_qlinear_end.apply(attended)[None, :, :])
    end_scores = attention_mlp_end.apply(merge_leading_axes(layer1_end))
    end_scores = end_scores.reshape(
        (layer1_end.shape[0], layer1_end.shape[1]))
    end_probs = softmax_over_first_axis(end_scores)

    # column_ids[i, j] == j and row_ids[i, j] == i, so (j <= i) yields a
    # running sum of the start weights and (j >= i) the sum of the end
    # weights from position i onwards.
    n_context = context.shape[0]
    column_ids = tensor.tile(theano.tensor.arange(n_context)[None, :],
                             (n_context, 1))
    row_ids = tensor.tile(theano.tensor.arange(n_context)[:, None],
                          (1, n_context))
    att_weights_start = tensor.dot(tensor.le(column_ids, row_ids),
                                   start_probs)
    att_weights_end = tensor.dot(tensor.ge(column_ids, row_ids),
                                 end_probs)

    # add attention from left and right
    # (Alternative kept for reference:
    #  tensor.minimum(att_weights_start, att_weights_end).)
    att_weights = att_weights_start * att_weights_end

    # Indicator target: ones at the positions listed in ans_indices,
    # built with one row per column of ans_indices and then transposed.
    n_samples = ans_indices.shape[1]
    blank_target = tensor.zeros((n_samples, context.shape[0]),
                                dtype=theano.config.floatX)
    marked_target = tensor.set_subtensor(
        blank_target[tensor.arange(n_samples), ans_indices], 1)
    att_target = marked_target.dimshuffle(1, 0)

    self.predictions = tensor.gt(att_weights, 0.25) * context

    # Only the target is normalized here; the epsilon guards against a
    # zero column sum.
    att_target = att_target / (att_target.sum(axis=0) + 0.00001)

    cross_entropy = tensor.nnet.binary_crossentropy(att_weights,
                                                    att_target)
    cost = (cross_entropy * context_mask).sum() / context_mask.sum()

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, mhidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'
    att_weights_start.name = 'att_weights_start'
    att_weights_end.name = 'att_weights_end'
    att_weights.name = 'att_weights'
    att_target.name = 'att_target'
    self.predictions.name = 'pred'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]
    self.analyse_vars = [
        cost, self.predictions, att_weights_start, att_weights_end,
        att_weights, att_target
    ]

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()