def create_rnn(hidden_dim, vocab_dim, mode="rnn"):
    """Build a character-level recurrent language model graph.

    Parameters
    ----------
    hidden_dim : int
        Dimension of the recurrent state.
    vocab_dim : int
        Size of the character vocabulary.
    mode : str
        "lstm" for an LSTM transition; anything else builds a SimpleRecurrent.

    Returns
    -------
    (cg, layers, y_hat, cost)
        ``cg`` is the ComputationGraph of the mean cross-entropy cost,
        ``layers`` is the tuple (x, W, H, S, A, y).
    """
    # Symbolic inputs: integer matrices of (time, batch) character ids.
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')
    # Blocks' LSTM consumes inputs of width 4 * dim (the four gate
    # pre-activations are stacked), so the lookup table must be wider in
    # lstm mode.  This was the bug hinted at by the old commented-out
    # "dim = hidden_dim*4" line.
    lookup_dim = hidden_dim * 4 if mode == "lstm" else hidden_dim
    W = LookupTable(
        name="W1",
        dim=lookup_dim,
        length=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0))
    if mode == "lstm":
        # Long Short Term Memory
        H = LSTM(
            hidden_dim,
            name='H',
            weights_init=initialization.IsotropicGaussian(0.01),
            biases_init=initialization.Constant(0.0))
    else:
        # recurrent history weight
        H = SimpleRecurrent(
            name="H",
            dim=hidden_dim,
            activation=Tanh(),
            weights_init=initialization.IsotropicGaussian(0.01))
    S = Linear(
        name="W2",
        input_dim=hidden_dim,
        output_dim=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0))
    A = NDimensionalSoftmax(name="softmax")
    initLayers([W, H, S])
    activations = W.apply(x)
    hiddens = H.apply(activations)
    # LSTM.apply returns (states, cells); only the states feed the output
    # projection (this was the bug hinted at by the old "#[0]" comment).
    if mode == "lstm":
        hiddens = hiddens[0]
    activations2 = S.apply(hiddens)
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()
    cg = ComputationGraph(cost)
    layers = (x, W, H, S, A, y)
    return cg, layers, y_hat, cost
def softmax_layer(h, y, frame_length, hidden_size):
    """Project hidden states onto frame logits and attach a softmax cost.

    Returns
    -------
    (y_hat, cost)
        The softmax output and the mean categorical cross-entropy
        against ``y``.
    """
    projection = Linear(name="hidden_to_output",
                        input_dim=hidden_size,
                        output_dim=frame_length)
    initialize([projection])
    logits = projection.apply(h)
    logits.name = "linear_output"
    softmax = NDimensionalSoftmax()
    y_hat = softmax.apply(logits, extra_ndim=1)
    y_hat.name = "y_hat"
    cost = softmax.categorical_cross_entropy(y, logits, extra_ndim=1).mean()
    cost.name = "cost"
    return y_hat, cost
class NewSoftmaxEmitter(AbstractEmitter, Initializable, Random):
    """A softmax emitter for the case of integer outputs.

    Interprets readout elements as energies corresponding to their indices.

    Parameters
    ----------
    initial_output : int or a scalar :class:`~theano.Variable`
        The initial output.
    """
    def __init__(self, initial_output=0, **kwargs):
        super(NewSoftmaxEmitter, self).__init__(**kwargs)
        self.initial_output = initial_output
        self.softmax = NDimensionalSoftmax()
        self.children = [self.softmax]
        self.name = 'newbidirectional'

    @application
    def probs(self, readouts):
        # Softmax over the last axis; any leading axes beyond the first two
        # are flattened away by NDimensionalSoftmax.
        return self.softmax.apply(readouts, extra_ndim=readouts.ndim - 2)

    @application
    def emitProbs(self, readouts):
        # NOTE(review): the multinomial sample is computed but discarded and
        # only the flattened probabilities are returned — confirm intended.
        probs = self.probs(readouts)
        batch_size = probs.shape[0]
        self.pvals_flat = probs.reshape((batch_size, -1))
        generated = self.theano_rng.multinomial(pvals=self.pvals_flat)
        return self.pvals_flat

    @application
    def emit(self, readouts):
        # Sample an index from the softmax distribution; also return the
        # probability the first batch element assigned to that index.
        probs = self.probs(readouts)
        batch_size = probs.shape[0]
        self.pvals_flat = probs.reshape((batch_size, -1))
        generated = self.theano_rng.multinomial(pvals=self.pvals_flat)
        winning_index = generated.reshape(probs.shape).argmax(axis=-1)
        return winning_index, self.pvals_flat[0][winning_index]

    @application
    def cost(self, readouts, outputs):
        # WARNING: unfortunately this application method works
        # just fine when `readouts` and `outputs` have
        # different dimensions. Be careful!
        return self.softmax.categorical_cross_entropy(
            outputs, readouts, extra_ndim=readouts.ndim - 2)

    @application
    def initial_outputs(self, batch_size):
        return self.initial_output * tensor.ones((batch_size,), dtype='int64')

    def get_dim(self, name):
        if name == 'outputs':
            return 0
        # Fixed: previously called super(SoftmaxEmitter, self), which raises
        # a NameError (or binds the wrong class) — this class is
        # NewSoftmaxEmitter.
        return super(NewSoftmaxEmitter, self).get_dim(name)
def softmax_layer(h, y, vocab_size, hidden_size):
    """Output layer: linear projection of ``h`` followed by a softmax.

    Returns the prediction ``y_hat`` and the mean cross-entropy ``cost``.
    """
    output_brick = Linear(name='hidden_to_output',
                          input_dim=hidden_size,
                          output_dim=vocab_size)
    initialize([output_brick])
    scores = output_brick.apply(h)
    scores.name = 'linear_output'
    softmax_brick = NDimensionalSoftmax()
    y_hat = softmax_brick.apply(scores, extra_ndim=1)
    y_hat.name = 'y_hat'
    cost = softmax_brick.categorical_cross_entropy(
        y, scores, extra_ndim=1).mean()
    cost.name = 'cost'
    return y_hat, cost
class SoftmaxEmitter(AbstractEmitter, Initializable, Random):
    """A softmax emitter for the case of integer outputs.

    Readout entries are treated as unnormalized energies; the index of a
    sampled entry is the emitted integer.

    Parameters
    ----------
    initial_output : int or a scalar :class:`~theano.Variable`
        The initial output.
    """
    def __init__(self, initial_output=0, **kwargs):
        super(SoftmaxEmitter, self).__init__(**kwargs)
        self.initial_output = initial_output
        self.softmax = NDimensionalSoftmax()
        self.children = [self.softmax]

    @application
    def probs(self, readouts):
        return self.softmax.apply(readouts, extra_ndim=readouts.ndim - 2)

    @application
    def emit(self, readouts):
        # Draw one multinomial sample per batch row and return its index.
        distribution = self.probs(readouts)
        flattened = distribution.reshape((distribution.shape[0], -1))
        sample = self.theano_rng.multinomial(pvals=flattened)
        return sample.reshape(distribution.shape).argmax(axis=-1)

    @application
    def cost(self, readouts, outputs):
        # WARNING: unfortunately this application method works
        # just fine when `readouts` and `outputs` have
        # different dimensions. Be careful!
        return self.softmax.categorical_cross_entropy(
            outputs, readouts, extra_ndim=readouts.ndim - 2)

    @application
    def costs(self, readouts):
        return -self.softmax.log_probabilities(
            readouts, extra_ndim=readouts.ndim - 2)

    @application
    def initial_outputs(self, batch_size):
        return tensor.ones((batch_size,), dtype='int64') * self.initial_output

    def get_dim(self, name):
        if name == 'outputs':
            return 0
        return super(SoftmaxEmitter, self).get_dim(name)
def softmax_layer(h, y, vocab_size, hidden_size):
    """Attach a linear + softmax readout on top of hidden states ``h``.

    Returns
    -------
    y_hat
        Softmax probabilities, named 'y_hat'.
    cost
        Mean categorical cross-entropy versus ``y``, named 'cost'.
    """
    to_output = Linear(name='hidden_to_output',
                       input_dim=hidden_size,
                       output_dim=vocab_size)
    initialize([to_output])
    pre_activation = to_output.apply(h)
    pre_activation.name = 'linear_output'
    nd_softmax = NDimensionalSoftmax()
    y_hat = nd_softmax.apply(pre_activation, extra_ndim=1)
    y_hat.name = 'y_hat'
    cost = nd_softmax.categorical_cross_entropy(
        y, pre_activation, extra_ndim=1).mean()
    cost.name = 'cost'
    return y_hat, cost
def create_rnn(hidden_dim, vocab_dim, mode="rnn"):
    """Build a character-level recurrent language model graph.

    Parameters
    ----------
    hidden_dim : int
        Dimension of the recurrent state.
    vocab_dim : int
        Size of the character vocabulary.
    mode : str
        "lstm" for an LSTM transition; anything else builds a SimpleRecurrent.

    Returns
    -------
    (cg, layers, y_hat, cost) where ``layers`` is (x, W, H, S, A, y).
    """
    # input: (time, batch) integer character ids
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')
    # Blocks' LSTM consumes inputs of width 4 * dim (stacked gate
    # pre-activations), so the lookup must be wider in lstm mode — this is
    # the fix hinted at by the old commented-out "dim = hidden_dim*4".
    lookup_dim = hidden_dim * 4 if mode == "lstm" else hidden_dim
    W = LookupTable(name="W1",
                    dim=lookup_dim,
                    length=vocab_dim,
                    weights_init=initialization.IsotropicGaussian(0.01),
                    biases_init=initialization.Constant(0))
    if mode == "lstm":
        # Long Short Term Memory
        H = LSTM(hidden_dim,
                 name='H',
                 weights_init=initialization.IsotropicGaussian(0.01),
                 biases_init=initialization.Constant(0.0))
    else:
        # recurrent history weight
        H = SimpleRecurrent(
            name="H",
            dim=hidden_dim,
            activation=Tanh(),
            weights_init=initialization.IsotropicGaussian(0.01))
    S = Linear(name="W2",
               input_dim=hidden_dim,
               output_dim=vocab_dim,
               weights_init=initialization.IsotropicGaussian(0.01),
               biases_init=initialization.Constant(0))
    A = NDimensionalSoftmax(name="softmax")
    initLayers([W, H, S])
    activations = W.apply(x)
    hiddens = H.apply(activations)
    # LSTM.apply returns (states, cells); only the states go forward — the
    # old "#[0]" comment marked exactly this problem.
    if mode == "lstm":
        hiddens = hiddens[0]
    activations2 = S.apply(hiddens)
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()
    cg = ComputationGraph(cost)
    layers = (x, W, H, S, A, y)
    return cg, layers, y_hat, cost
class GMMMLP(Initializable):
    """An MLP brick that branches out to the sigma, mu and mixture weights
    of a GMM.

    Parameters
    ----------
    mlp : MLP brick
        The main MLP to wrap around.
    dim : int
        Output dimension of the mu/sigma heads.
    k : int
        Number of mixture components.
    const : float
        Small constant added to the mixture weights for stability.
    """
    def __init__(self, mlp, dim, k, const=1e-5, **kwargs):
        super(GMMMLP, self).__init__(**kwargs)
        self.dim = dim
        self.const = const
        self.k = k
        input_dim = mlp.output_dim
        self.mu = MLP(activations=[Identity()],
                      dims=[input_dim, dim],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[input_dim, dim],
                         name=self.name + "_sigma")
        self.coeff = MLP(activations=[Identity()],
                         dims=[input_dim, k],
                         name=self.name + "_coeff")
        self.coeff2 = NDimensionalSoftmax()
        self.mlp = mlp
        self.children = [self.mlp, self.mu, self.sigma,
                         self.coeff, self.coeff2]

    @application
    def apply(self, inputs):
        # One shared forward pass, then three independent heads.
        state = self.mlp.apply(inputs)
        raw_coeff = self.coeff.apply(state)
        mixture = self.coeff2.apply(
            raw_coeff, extra_ndim=state.ndim - 2) + self.const
        return self.mu.apply(state), self.sigma.apply(state), mixture

    @property
    def output_dim(self):
        return self.dim
class GMMMLP(Initializable):
    """Wraps an MLP with three heads producing the mu, sigma and mixture
    coefficients of a Gaussian mixture model.

    Parameters
    ----------
    mlp : MLP brick
        The main MLP to wrap around.
    dim : int
        Output dimension of the mu/sigma heads.
    k : int
        Number of mixture components.
    const : float
        Small constant added to the (softmax-normalized) mixture weights.
    """
    def __init__(self, mlp, dim, k, const=1e-5, **kwargs):
        super(GMMMLP, self).__init__(**kwargs)
        self.dim = dim
        self.const = const
        self.k = k
        input_dim = mlp.output_dim
        self.mu = MLP(activations=[Identity()],
                      dims=[input_dim, dim],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[input_dim, dim],
                         name=self.name + "_sigma")
        self.coeff = MLP(activations=[Identity()],
                         dims=[input_dim, k],
                         name=self.name + "_coeff")
        self.coeff2 = NDimensionalSoftmax()
        self.mlp = mlp
        children = [self.mlp, self.mu, self.sigma]
        children.append(self.coeff)
        children.append(self.coeff2)
        self.children = children

    @application
    def apply(self, inputs):
        state = self.mlp.apply(inputs)
        mean = self.mu.apply(state)
        scale = self.sigma.apply(state)
        # Normalize the mixture logits across components, then add `const`
        # so no component weight is exactly zero.
        weights = self.coeff2.apply(self.coeff.apply(state),
                                    extra_ndim=state.ndim - 2) + self.const
        return mean, scale, weights

    @property
    def output_dim(self):
        return self.dim
def __init__(self, input1_size, input2_size, lookup1_dim=200,
             lookup2_dim=200, hidden_size=512):
    """Build the duration+syllable -> pitch sequence model.

    Parameters
    ----------
    input1_size : int
        Vocabulary size of the duration input (also the output size).
    input2_size : int
        Vocabulary size of the syllable input.
    lookup1_dim, lookup2_dim : int
        Embedding dimensions of the two lookup tables.
    hidden_size : int
        LSTM state dimension.
    """
    self.hidden_size = hidden_size
    self.input1_size = input1_size
    self.input2_size = input2_size
    self.lookup1_dim = lookup1_dim
    self.lookup2_dim = lookup2_dim

    x1 = tensor.lmatrix('durations')
    x2 = tensor.lmatrix('syllables')
    y = tensor.lmatrix('pitches')

    lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size,
                          name='lookup1',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup1.initialize()
    lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size,
                          name='lookup2',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup2.initialize()

    # Fixed: Blocks' LSTM consumes inputs of width 4 * dim (the four gate
    # pre-activations are stacked), so the merge must emit that width —
    # previously it emitted hidden_size and the graph failed at runtime.
    merge = Merge(['lookup1', 'lookup2'],
                  [self.lookup1_dim, self.lookup2_dim],
                  self.hidden_size * 4,
                  weights_init=initialization.Uniform(width=0.01),
                  biases_init=Constant(0))
    merge.initialize()

    recurrent_block = LSTM(dim=self.hidden_size, activation=Tanh(),
                           weights_init=initialization.Uniform(width=0.01))
    recurrent_block.initialize()

    linear = Linear(input_dim=self.hidden_size,
                    output_dim=self.input1_size,
                    weights_init=initialization.Uniform(width=0.01),
                    biases_init=Constant(0))
    linear.initialize()
    softmax = NDimensionalSoftmax()

    l1 = lookup1.apply(x1)
    l2 = lookup2.apply(x2)
    m = merge.apply(l1, l2)
    # Fixed: LSTM.apply returns (states, cells); only the hidden states are
    # projected to the output vocabulary. Previously the whole pair was
    # handed to linear.apply.
    h, _ = recurrent_block.apply(m)
    a = linear.apply(h)

    y_hat = softmax.apply(a, extra_ndim=1)
    self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean()

    self.ComputationGraph = ComputationGraph(self.Cost)
    self.Model = Model(y_hat)
def softmax_output_layer(x, h, y, in_size, out_size, hidden_size, pred):
    """Build the softmax readout for prediction head ``pred``.

    Depending on the module-level ``connect_h_to_o`` flag, either every
    recurrent layer's hidden states (concatenated) or only the top layer
    feeds the output projection.  ``single_dim_out`` (module-level) selects
    the softmax's ``extra_ndim``.  Returns (y_hat, cost).
    """
    layer_name = 'hidden_to_output' + str(pred)
    if connect_h_to_o:
        # All recurrent layers feed the output projection.
        hiddens = T.concatenate(list(h), axis=2)
        hidden_to_output = Linear(name=layer_name,
                                  input_dim=hidden_size * len(h),
                                  output_dim=out_size)
    else:
        # Only the last recurrent layer reaches the output.
        hiddens = h[-1]
        hidden_to_output = Linear(name=layer_name,
                                  input_dim=hidden_size,
                                  output_dim=out_size)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(hiddens)
    linear_output.name = 'linear_output'
    softmax = NDimensionalSoftmax()
    extra_ndim = 1 if single_dim_out else 2
    y_hat = softmax.apply(linear_output, extra_ndim=extra_ndim)
    cost = softmax.categorical_cross_entropy(
        y, linear_output, extra_ndim=extra_ndim).mean()
    return y_hat, cost
class LanguageModel(Initializable):
    """The dictionary-equipped language model.

    Parameters
    ----------
    emb_dim: int
        The dimension of word embeddings (including for def model if
        standalone)
    dim : int
        The dimension of the RNNs states (including for def model if
        standalone)
    num_input_words : int
        The size of the LM's input vocabulary.
    num_output_words : int
        The size of the LM's output vocabulary.
    vocab
        The vocabulary object.
    retrieval
        The dictionary retrieval algorithm. If `None`, the language model
        does not use any dictionary.
    def_reader: either 'LSTM' or 'mean'
    standalone_def_rnn : bool
        If `True`, a standalone RNN with separate word embeddings is used
        to embed definition. If `False` the language model is reused.
    disregard_word_embeddings : bool
        If `True`, the word embeddings are not used, only the information
        from the definitions is used.
    compose_type : str
        If 'sum', the definition and word embeddings are averaged
        If 'fully_connected_linear', a learned perceptron compose the 2
        embeddings linearly
        If 'fully_connected_relu', ...
        If 'fully_connected_tanh', ...
    """
    def __init__(self, emb_dim, emb_def_dim, dim, num_input_words,
                 def_num_input_words, num_output_words, vocab,
                 retrieval=None, def_reader='LSTM',
                 standalone_def_lookup=True, standalone_def_rnn=True,
                 disregard_word_embeddings=False, compose_type='sum',
                 very_rare_threshold=[10], cache_size=0, **kwargs):
        # NOTE(review): `very_rare_threshold=[10]` is a mutable default
        # argument — shared across instances; consider a None sentinel.
        # TODO(tombosc): document
        # Zero-valued dims/vocab sizes fall back to sensible defaults.
        if emb_dim == 0:
            emb_dim = dim
        if emb_def_dim == 0:
            emb_def_dim = emb_dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        # Sharing the main lookup requires identical vocabulary sizes.
        if (num_input_words != def_num_input_words) and (not standalone_def_lookup):
            raise NotImplementedError()

        self._very_rare_threshold = very_rare_threshold
        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab
        self._retrieval = retrieval
        self._disregard_word_embeddings = disregard_word_embeddings
        self._compose_type = compose_type

        self._word_to_id = WordToIdOp(self._vocab)
        self._word_to_count = WordToCountOp(self._vocab)

        children = []
        self._cache = None
        if cache_size > 0:
            #TODO(tombosc) do we implement cache as LookupTable or theano matrix?
            #self._cache = theano.shared(np.zeros((def_num_input_words, emb_dim)))
            self._cache = LookupTable(cache_size, emb_dim,
                                      name='cache_def_embeddings')
            children.append(self._cache)

        if self._retrieval:
            self._retrieve = RetrievalOp(retrieval)

        self._main_lookup = LookupTable(self._num_input_words, emb_dim,
                                        name='main_lookup')
        # The fork projects embeddings to the 4 * dim gate inputs of the LSTM.
        self._main_fork = Linear(emb_dim, 4 * dim, name='main_fork')
        self._main_rnn = DebugLSTM(
            dim, name='main_rnn')  # TODO(tombosc): use regular LSTM?
        children.extend([self._main_lookup, self._main_fork, self._main_rnn])
        if self._retrieval:
            if standalone_def_lookup:
                lookup = None
            else:
                if emb_dim != emb_def_dim:
                    raise ValueError("emb_dim != emb_def_dim: cannot share lookup")
                lookup = self._main_lookup
            if def_reader == 'LSTM':
                if standalone_def_rnn:
                    fork_and_rnn = None
                else:
                    # Reuse the LM's own fork/RNN to read definitions.
                    fork_and_rnn = (self._main_fork, self._main_rnn)
                self._def_reader = LSTMReadDefinitions(def_num_input_words,
                                                       emb_def_dim, dim,
                                                       vocab, lookup,
                                                       fork_and_rnn,
                                                       cache=self._cache)
            elif def_reader == 'mean':
                self._def_reader = MeanPoolReadDefinitions(
                    def_num_input_words, emb_def_dim, dim, vocab, lookup,
                    translate=(emb_def_dim != dim), normalize=False)
            else:
                raise Exception("def reader not understood")
            self._combiner = MeanPoolCombiner(dim=dim, emb_dim=emb_dim,
                                              compose_type=compose_type)
            children.extend([self._def_reader, self._combiner])

        self._pre_softmax = Linear(dim, self._num_output_words)
        self._softmax = NDimensionalSoftmax()
        children.extend([self._pre_softmax, self._softmax])

        super(LanguageModel, self).__init__(children=children, **kwargs)

    def _push_initialization_config(self):
        # Cache embeddings start at zero; they are filled during training.
        super(LanguageModel, self)._push_initialization_config()
        if self._cache:
            self._cache.weights_init = Constant(0.)

    def set_def_embeddings(self, embeddings):
        # Overwrite the definition-reader lookup weights in place.
        self._def_reader._def_lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def get_def_embeddings_params(self):
        return self._def_reader._def_lookup.parameters[0]

    def get_cache_params(self):
        return self._cache.W

    def add_perplexity_measure(self, application_call, minus_logs, mask, name):
        # Per-sequence negative log-likelihood, plus a monitored perplexity
        # aggregated over the masked positions.
        costs = (minus_logs * mask).sum(axis=0)
        perplexity = tensor.exp(costs.sum() / mask.sum())
        perplexity.tag.aggregation_scheme = Perplexity(
            costs.sum(), mask.sum())
        full_name = "perplexity_" + name
        application_call.add_auxiliary_variable(perplexity, name=full_name)
        return costs

    @application
    def apply(self, application_call, words, mask):
        """Compute the log-likelihood for a batch of sequences.

        words
            An integer matrix of shape (B, T), where T is the number of
            time steps, B is the batch size. Note that this order of the
            axes is different from what all RNN bricks consume, hence the
            axes are transposed below.
        mask
            A float32 matrix of shape (B, T). Zeros indicate the padding.
        """
        if self._retrieval:
            defs, def_mask, def_map = self._retrieve(words)
            def_embeddings = self._def_reader.apply(defs, def_mask)
            # Auxiliary variable for debugging
            application_call.add_auxiliary_variable(
                def_embeddings.shape[0], name="num_definitions")

        word_ids = self._word_to_id(words)

        # shortlisting: ids past the vocabulary limits are mapped to UNK
        input_word_ids = (
            tensor.lt(word_ids, self._num_input_words) * word_ids +
            tensor.ge(word_ids, self._num_input_words) * self._vocab.unk)
        output_word_ids = (
            tensor.lt(word_ids, self._num_output_words) * word_ids +
            tensor.ge(word_ids, self._num_output_words) * self._vocab.unk)

        application_call.add_auxiliary_variable(unk_ratio(
            input_word_ids, mask, self._vocab.unk), name='unk_ratio')

        # Run the main rnn with combined inputs
        word_embs = self._main_lookup.apply(input_word_ids)
        application_call.add_auxiliary_variable(masked_root_mean_square(
            word_embs, mask), name='word_emb_RMS')

        if self._retrieval:
            rnn_inputs, updated, positions = self._combiner.apply(
                word_embs, mask, def_embeddings, def_map)
        else:
            rnn_inputs = word_embs

        updates = []
        if self._cache:
            # NOTE(review): `positions`/`updated` only exist when
            # self._retrieval is set — the cache appears to presuppose
            # retrieval; confirm against callers.
            flat_word_ids = word_ids.flatten()
            flat_word_ids_to_update = flat_word_ids[positions]
            # computing updates for cache
            updates = [
                (self._cache.W,
                 tensor.set_subtensor(
                     self._cache.W[flat_word_ids_to_update], updated))]

        application_call.add_auxiliary_variable(masked_root_mean_square(
            word_embs, mask), name='main_rnn_in_RMS')

        # Transpose to (T, B, dim) as the RNN brick expects.
        main_rnn_states = self._main_rnn.apply(tensor.transpose(
            self._main_fork.apply(rnn_inputs), (1, 0, 2)), mask=mask.T)[0]

        # The first token is not predicted
        logits = self._pre_softmax.apply(main_rnn_states[:-1])
        targets = output_word_ids.T[1:]
        out_softmax = self._softmax.apply(logits, extra_ndim=1)
        application_call.add_auxiliary_variable(out_softmax.copy(),
                                                name="proba_out")
        minus_logs = self._softmax.categorical_cross_entropy(
            targets, logits, extra_ndim=1)

        targets_mask = mask.T[1:]
        costs = self.add_perplexity_measure(application_call, minus_logs,
                                            targets_mask, "")

        missing_embs = tensor.eq(input_word_ids,
                                 self._vocab.unk).astype('int32')  # (bs, L)
        # Perplexities restricted to positions with/without a word embedding.
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * missing_embs.T[:-1],
                                    "after_mis_word_embs")
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * (1 - missing_embs.T[:-1]),
                                    "after_word_embs")

        word_counts = self._word_to_count(words)
        very_rare_masks = []
        for threshold in self._very_rare_threshold:
            # Positions whose word occurs fewer than `threshold` times.
            very_rare_mask = tensor.lt(word_counts,
                                       threshold).astype('int32')
            very_rare_mask = targets_mask * (very_rare_mask.T[:-1])
            very_rare_masks.append(very_rare_mask)
            self.add_perplexity_measure(application_call, minus_logs,
                                        very_rare_mask,
                                        "after_very_rare_" + str(threshold))

        if self._retrieval:
            # Positions for which at least one definition was retrieved.
            has_def = tensor.zeros_like(output_word_ids)
            has_def = tensor.inc_subtensor(
                has_def[def_map[:, 0], def_map[:, 1]], 1)
            mask_targets_has_def = has_def.T[:-1] * targets_mask  # (L-1, bs)
            self.add_perplexity_measure(application_call, minus_logs,
                                        mask_targets_has_def,
                                        "after_def_embs")

            for thresh, very_rare_mask in zip(self._very_rare_threshold,
                                              very_rare_masks):
                self.add_perplexity_measure(
                    application_call, minus_logs,
                    very_rare_mask * mask_targets_has_def,
                    "after_def_very_rare_" + str(thresh))

            application_call.add_auxiliary_variable(
                mask_targets_has_def.T, name='mask_def_emb')

        return costs, updates
# NOTE(review): script fragment — `rnn`, `lookup_input`, `linear_input`,
# `x`, `y`, `hidden_layer_dim` and `charset_size` are defined earlier,
# outside this excerpt.
rnn.initialize()
linear_output = Linear(
    name='linear_output',
    input_dim=hidden_layer_dim,
    output_dim=charset_size,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_output.initialize()
softmax = NDimensionalSoftmax(name='ndim_softmax')

# Wire the graph: lookup -> linear -> RNN -> output projection -> softmax.
activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)
cost = softmax.categorical_cross_entropy(
    y, activation_output, extra_ndim=1).mean()

from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Adam

cg = ComputationGraph([cost])
step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95),
              StepClipping(1.0)]
# NOTE(review): the statement below is truncated in the original source.
algorithm = GradientDescent(
    cost=cost, parameters=cg.parameters,
class ActorCriticReadout(SoftmaxReadout):
    """Actor-critic readout: combines an actor's softmax policy with a
    critic that predicts Q-values, and produces the joint training cost.

    Params
    ------
    bos_token : int
        The token used to pad critic input. Critic needs to do at least
        one extra step compared to the actor in order to get the first
        glimpse of the ground-truth sequence before predicting the actual
        values.
    """
    def __init__(self, reward_brick, compute_targets, compute_policy,
                 solve_bellman, freeze_actor, freeze_critic,
                 critic_uses_actor_states, critic_uses_groundtruth,
                 critic=None, critic_burnin_steps=None,
                 critic_policy_t=None, entropy_reward_coof=None,
                 cross_entropy_reward_coof=None, discount=None,
                 value_penalty=None, value_softmax=False,
                 same_value_for_wrong=False, accumulate_outputs=False,
                 use_value_biases=None, actor_grad_estimate=None,
                 bos_token=None, **kwargs):
        super(ActorCriticReadout, self).__init__(**kwargs)
        self.reward_brick = reward_brick
        self.critic = critic
        self.freeze_actor = freeze_actor
        self.freeze_critic = freeze_critic
        self.critic_uses_actor_states = critic_uses_actor_states
        # None-valued flags fall back to documented defaults below.
        self.critic_uses_groundtruth = (
            critic_uses_groundtruth
            if critic_uses_groundtruth is not None else True)
        self.critic_burnin_steps = (
            critic_burnin_steps if critic_burnin_steps is not None else 0)
        # Learns a per-example additive bias for all Q-values from the
        # attended sequence.
        self.value_summand = Linear(output_dim=1, name='summand')
        self.softmax_t = 1.
        self.critic_policy_t = (
            critic_policy_t if critic_policy_t is not None else 1.0)
        self.epsilon = 0.
        self.discount = (discount if discount is not None else 1.)
        self.entropy_reward_coof = (
            entropy_reward_coof if entropy_reward_coof is not None else 0.)
        self.cross_entropy_reward_coof = (
            cross_entropy_reward_coof
            if cross_entropy_reward_coof is not None else 0.)
        self.value_penalty = value_penalty
        self.value_softmax = value_softmax
        self.same_value_for_wrong = same_value_for_wrong
        self.compute_targets = compute_targets
        self.compute_policy = compute_policy
        self.solve_bellman = solve_bellman
        self.accumulate_outputs = accumulate_outputs
        self.use_value_biases = (
            use_value_biases if use_value_biases is not None else True)
        self.actor_grad_estimate = (
            actor_grad_estimate if actor_grad_estimate else 'all_actions')
        self.bos_token = bos_token
        self.softmax = NDimensionalSoftmax()
        self.children += [reward_brick, self.value_summand, self.softmax]
        if self.critic:
            self.children.append(self.critic)
        self.costs.inputs += ['attended', 'attended_mask']

    def _push_allocation_config(self):
        super(ActorCriticReadout, self)._push_allocation_config()
        self.value_summand.input_dim = self.get_dim('attended')

    @application
    def scores(self, **inputs):
        # Temperature-scaled log-probabilities of the actor policy.
        merged = self.merge(**dict_subset(inputs, self.merge_names))
        return self.softmax.log_probabilities(
            merged * self.softmax_t, extra_ndim=merged.ndim - 2)

    @application
    def costs(self, application_call, prediction, prediction_mask,
              groundtruth, groundtruth_mask, **inputs):
        """Build the combined actor + critic training cost.

        Also registers a large number of auxiliary variables for
        monitoring and debugging.
        """
        def _prediction_subtensor(data):
            # Select, at each (time, batch) position, the component of
            # `data` indexed by the predicted token.
            if data.ndim != 3:
                raise ValueError
            flat_data = data.reshape(
                (data.shape[0] * data.shape[1], data.shape[2]))
            flat_data = flat_data[tensor.arange(flat_data.shape[0]),
                                  prediction.flatten()]
            return flat_data.reshape(
                (prediction.shape[0], prediction.shape[1]))

        attended = disconnected_grad(inputs.pop('attended'))
        attended_mask = disconnected_grad(inputs.pop('attended_mask'))

        # Compute the rewards
        rewards = self.reward_brick.apply(prediction, prediction_mask,
                                          groundtruth,
                                          groundtruth_mask)[:, :, 0]
        future_rewards = rewards[::-1].cumsum(axis=0)[::-1]

        # Compute the critic outputs
        if self.critic:
            # Pad the prediction with one BOS step so the critic sees the
            # ground truth one step ahead of the actor.
            padding = tensor.repeat(
                tensor.fill(prediction[0:1], self.bos_token), 1, axis=0)
            mask_padding = tensor.repeat(
                tensor.fill(prediction_mask[0:1], 1.), 1, axis=0)
            padded_prediction = tensor.concatenate([padding, prediction])
            padded_prediction_mask = tensor.concatenate(
                [mask_padding, prediction_mask])
            if self.critic_uses_groundtruth:
                critic_context = groundtruth
                critic_context_mask = groundtruth_mask
            else:
                critic_context = tensor.zeros_like(groundtruth[0:1])
                critic_context_mask = tensor.zeros_like(
                    groundtruth_mask[0:1])
            critic_kwargs = dict(prediction=padded_prediction,
                                 prediction_mask=padded_prediction_mask,
                                 groundtruth=critic_context,
                                 groundtruth_mask=critic_context_mask,
                                 inputs=critic_context,
                                 inputs_mask=critic_context_mask)
            if self.critic_uses_actor_states:
                extra_inputs = disconnected_grad(inputs['states'])
                # We don't have the very last hidden state of the actor in
                # extra_inputs. We have to add something instead for the
                # shapes to match. It doesn't matter at all what exactly
                # we add.
                critic_kwargs['extra_inputs'] = tensor.concatenate(
                    [extra_inputs, tensor.zeros_like(extra_inputs[0:1])])
            critic_cg = ComputationGraph(self.critic.costs(**critic_kwargs))
            outputs, = VariableFilter(
                applications=[self.critic.generator.readout.all_outputs],
                roles=[OUTPUT])(critic_cg)
            # The first subtensor should be discarded, because it was
            # outputted for the padding. In addition to that Q-values from
            # the first 'critic_burnin_steps' will be ignored, see later
            # in the code.
            outputs = outputs[1:]
        else:
            outputs = self.merge(**dict_subset(inputs, self.merge_names))
        prediction_outputs = _prediction_subtensor(outputs)

        # Compute Q adjustments
        adjustments = outputs
        prediction_adjustments = prediction_outputs
        if self.accumulate_outputs:
            prediction_adjustments = prediction_outputs.cumsum(axis=0)
            adjustments = tensor.inc_subtensor(
                adjustments[1:], prediction_adjustments[:-1][:, :, None])

        # Compute shared additive biases for all Q values
        if self.use_value_biases:
            value_biases = (
                self.value_summand.apply(attended)[:, :, 0] *
                attended_mask).sum(axis=0)
        else:
            value_biases = tensor.zeros_like(adjustments[0, :, 0])
        values = adjustments + value_biases[None, :, None]
        prediction_values = prediction_adjustments + value_biases[None, :]

        rolled_prediction_mask = tensor.roll(prediction_mask, -1, axis=0)
        rolled_prediction_mask = tensor.set_subtensor(
            rolled_prediction_mask[-1], 0)

        # Compute probabilities
        logs = self.scores(use_epsilon=False, **inputs)
        probs = tensor.exp(logs)
        if not self.compute_policy:
            raise NotImplementedError("Not supported any more")
        prediction_logs = _prediction_subtensor(logs)

        # Compute value targets (one-step Bellman backup of the expected
        # next-step value plus the immediate reward).
        value_targets = (disconnected_grad(probs) * values).sum(axis=-1)
        value_targets = tensor.roll(value_targets, -1, axis=0)
        value_targets = (self.discount * value_targets *
                         rolled_prediction_mask + rewards)
        value_targets = value_targets.astype(theano.config.floatX)

        total_costs = 0

        # Compute critic cost
        if not self.compute_targets:
            logger.debug("Using given targets")
            value_targets = tensor.matrix('value_targets')
        if self.solve_bellman == 'no':
            logger.debug("Not solving Bellman, just predicting the rewards")
            value_targets = rewards.copy(name='value_targets')
        elif self.solve_bellman == 'without_dp':
            future_rewards = rewards[::-1].cumsum(axis=0)[::-1]
            logger.debug("Solving Bellman, but without DP")
            value_targets = future_rewards
        elif self.solve_bellman is not True:
            raise ValueError()
        critic_costs_per_char = (
            (prediction_values - value_targets) ** 2) * prediction_mask
        critic_costs = critic_costs_per_char[
            self.critic_burnin_steps:].sum(axis=0)
        if not self.freeze_critic:
            total_costs += critic_costs

        # Compute critic Monte-Carlo cost
        critic_monte_carlo_costs = (
            (((prediction_values - future_rewards) ** 2) *
             prediction_mask)[self.critic_burnin_steps:].sum(axis=0))

        # Value penalty
        if self.value_penalty:
            logger.debug("Use value penalty")
            value_deviations = (
                values - values.mean(axis=-1, keepdims=True)) ** 2
            if not self.freeze_critic:
                total_costs += (
                    self.value_penalty *
                    (value_deviations.sum(axis=-1) *
                     prediction_mask)[self.critic_burnin_steps:].sum(axis=0))

        # Compute actor cost
        if self.critic:
            # The actor cost will be minimized, that's why values
            # must be negated.
            est_name = self.actor_grad_estimate
            if est_name == 'all_actions':
                disadvantages = disconnected_grad(
                    values.max(axis=-1)[:, :, None] - values)
                actor_costs = ((probs * disadvantages).sum(axis=-1) *
                               prediction_mask)
                actor_costs = actor_costs[self.critic_burnin_steps:]
            elif est_name.startswith('1_action'):
                # Here we do not provide a target for the first step for
                # the reason we lack an estimate of the value of the
                # initial state. This is how our critic works.
                # Hopefully the network won't unlearn
                # to produce a BOS first.
                future_reward_estimate = (
                    future_rewards if est_name.endswith('unbiased')
                    else prediction_values)
                weights = -disconnected_grad(
                    future_reward_estimate[1:] + rewards[:-1] -
                    prediction_values[:-1])
                actor_costs = ((prediction_logs[1:] * weights) *
                               prediction_mask[1:])
                actor_costs = actor_costs[self.critic_burnin_steps + 1:]
            else:
                raise ValueError
            actor_costs = actor_costs.sum(axis=0)

            actor_entropies = (probs * -logs).sum(axis=-1) * prediction_mask
            actor_entropies = actor_entropies[
                self.critic_burnin_steps:].sum(axis=0)
            # Cross-entropy between the actor policy and the softmax of
            # the (temperature-scaled) critic values.
            critic_policy = disconnected_grad(
                self.softmax.apply(self.critic_policy_t * values,
                                   extra_ndim=1))
            critic_cross_entropies = (
                (critic_policy * -logs).sum(axis=-1) * prediction_mask)
            critic_cross_entropies = critic_cross_entropies[
                self.critic_burnin_steps:].sum(axis=0)
            actor_costs_with_penalties = (
                actor_costs -
                self.entropy_reward_coof * actor_entropies -
                self.cross_entropy_reward_coof * critic_cross_entropies)
            if not self.freeze_actor:
                total_costs += actor_costs_with_penalties
            else:
                total_costs += disconnected_grad(actor_costs_with_penalties)

        # Add auxiliary variables for intermediate steps of the computation
        application_call.add_auxiliary_variable(rewards, name='rewards')
        application_call.add_auxiliary_variable(value_biases,
                                                name='value_biases')
        application_call.add_auxiliary_variable(values.copy(), name='values')
        application_call.add_auxiliary_variable(outputs.copy(),
                                                name='outputs')
        application_call.add_auxiliary_variable(prediction_values,
                                                name='prediction_values')
        application_call.add_auxiliary_variable(prediction_outputs,
                                                name='prediction_outputs')
        application_call.add_auxiliary_variable(value_targets.copy(),
                                                name='value_targets')
        application_call.add_auxiliary_variable(probs.copy(), name='probs')
        application_call.add_auxiliary_variable(prediction_logs,
                                                name='prediction_log_probs')

        # Compute some statistics for debugging
        last_character_mask = prediction_mask - rolled_prediction_mask
        last_character_costs = (critic_costs_per_char *
                                last_character_mask).sum(axis=0)
        mean2_output = (
            ((prediction_outputs ** 2) * prediction_mask).sum() /
            prediction_mask.sum()) ** 0.5
        max_output = abs(prediction_outputs * prediction_mask).max()
        expected_reward = (probs[0] * values[0]).sum(axis=-1)
        application_call.add_auxiliary_variable(last_character_costs,
                                                name='last_character_costs')
        application_call.add_auxiliary_variable(critic_costs.mean(),
                                                name='mean_critic_cost')
        application_call.add_auxiliary_variable(
            critic_monte_carlo_costs.mean(),
            name='mean_critic_monte_carlo_cost')
        if self.critic:
            application_call.add_auxiliary_variable(actor_costs.mean(),
                                                    name='mean_actor_cost')
            application_call.add_auxiliary_variable(
                actor_entropies.mean(), name='mean_actor_entropy')
        application_call.add_auxiliary_variable(expected_reward.mean(),
                                                name='mean_expected_reward')
        application_call.add_auxiliary_variable(mean2_output,
                                                name='mean2_output')
        application_call.add_auxiliary_variable(max_output,
                                                name='max_output')
        return total_costs
def __init__(self, input1_size, input2_size, lookup1_dim=200,
             lookup2_dim=200, hidden_size=512):
    """Build the duration+syllable -> pitch sequence model.

    Parameters
    ----------
    input1_size : int
        Vocabulary size of the duration input (also the output size).
    input2_size : int
        Vocabulary size of the syllable input.
    lookup1_dim, lookup2_dim : int
        Embedding dimensions of the two lookup tables.
    hidden_size : int
        LSTM state dimension.
    """
    self.hidden_size = hidden_size
    self.input1_size = input1_size
    self.input2_size = input2_size
    self.lookup1_dim = lookup1_dim
    self.lookup2_dim = lookup2_dim

    x1 = tensor.lmatrix('durations')
    x2 = tensor.lmatrix('syllables')
    y = tensor.lmatrix('pitches')

    lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size,
                          name='lookup1',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup1.initialize()
    lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size,
                          name='lookup2',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup2.initialize()

    # Fixed: Blocks' LSTM expects inputs of width 4 * dim (stacked gate
    # pre-activations); the merge previously emitted hidden_size, which
    # fails with a dimension mismatch at graph evaluation.
    merge = Merge(['lookup1', 'lookup2'],
                  [self.lookup1_dim, self.lookup2_dim],
                  self.hidden_size * 4,
                  weights_init=initialization.Uniform(width=0.01),
                  biases_init=Constant(0))
    merge.initialize()

    recurrent_block = LSTM(
        dim=self.hidden_size, activation=Tanh(),
        weights_init=initialization.Uniform(width=0.01))
    recurrent_block.initialize()

    linear = Linear(input_dim=self.hidden_size,
                    output_dim=self.input1_size,
                    weights_init=initialization.Uniform(width=0.01),
                    biases_init=Constant(0))
    linear.initialize()
    softmax = NDimensionalSoftmax()

    l1 = lookup1.apply(x1)
    l2 = lookup2.apply(x2)
    m = merge.apply(l1, l2)
    # Fixed: LSTM.apply returns (states, cells); previously the whole
    # pair was handed to linear.apply.
    h, _ = recurrent_block.apply(m)
    a = linear.apply(h)

    y_hat = softmax.apply(a, extra_ndim=1)
    self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean()

    self.ComputationGraph = ComputationGraph(self.Cost)
    self.Model = Model(y_hat)
class FRNNEmitter(AbstractEmitter, Initializable, Random):
    """An RNN emitter for the case of real outputs.

    Emits a real-valued frame of ``frame_size`` values per readout. The
    frame is produced in chunks of ``frnn_step_size`` by a small recurrent
    net (the "FRNN"): at each step, MLPs map the FRNN state to the
    parameters of a ``k``-component Gaussian mixture from which the chunk
    is sampled (``emit``) or scored (``cost``); the sampled chunk (or, in
    ``cost``, the ground-truth chunk) is fed through a linear transition to
    produce the next FRNN state.

    Parameters
    ----------
    mlp : brick
        Maps readouts into the FRNN input space; its ``output_dim``
        determines this emitter's input dimension.
    target_size : int
        Flattened target size used when reshaping for the NLL cost.
    frame_size : int
        Number of real values emitted per readout.
    k : int
        Number of Gaussian mixture components.
    frnn_hidden_size : int
        Dimension of the FRNN hidden state.
    frnn_step_size : int
        Number of values generated per FRNN step.
    const : float
        Small constant added to sigma and to the mixture weights for
        numerical stability.
    """

    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size,
                 frnn_step_size, const=1e-5, **kwargs):
        super(FRNNEmitter, self).__init__(**kwargs)

        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim

        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size

        if self.last_steps != 0:
            self.number_of_steps += 1

        # Per-step mixture-parameter heads: means, scales (SoftPlus keeps
        # them positive) and mixture logits (softmax applied via coeff2).
        self.mu = MLP(activations=[Identity()],
                      dims=[frnn_hidden_size, k * frnn_step_size],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[frnn_hidden_size, k * frnn_step_size],
                         name=self.name + "_sigma")

        self.coeff = MLP(activations=[Identity()],
                         dims=[frnn_hidden_size, k],
                         name=self.name + "_coeff")
        self.coeff2 = NDimensionalSoftmax()

        # Projects the (MLP-transformed) readout to the first FRNN state.
        self.frnn_initial_state = Linear(
            input_dim=self.input_dim, output_dim=frnn_hidden_size,
            name="frnn_initial_state"
        )

        # self.frnn_hidden = Linear(
        #     input_dim=frnn_hidden_size,
        #     output_dim=frnn_hidden_size,
        #     activation=Tanh(),
        #     name="frnn_hidden")

        self.frnn_activation = Tanh(name="frnn_activation")

        # State transition: next_state = tanh(W_s state + W_i chunk).
        self.frnn_linear_transition_state = Linear(
            input_dim=frnn_hidden_size, output_dim=frnn_hidden_size,
            name="frnn_linear_transition_state"
        )

        self.frnn_linear_transition_input = Linear(
            input_dim=self.frnn_step_size, output_dim=frnn_hidden_size,
            name="frnn_linear_transition_input"
        )

        # self.frnn_linear_transition_output = Linear (
        #     input_dim = frnn_hidden_size,
        #     output_dim = self.rnn_hidden_dim,
        #     name="frnn_linear_transition_output")

        self.children = [
            self.mlp,
            self.mu,
            self.sigma,
            self.coeff,
            self.coeff2,
            self.frnn_initial_state,
            self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input,
        ]

    @application
    def emit(self, readouts):
        """Sample one frame per readout.

        For each FRNN step: predict mixture parameters from the state,
        sample a component index from the mixture weights, sample the
        chunk from the chosen Gaussian, and feed the chunk back through
        the transition to get the next state. Returns the concatenated
        chunks truncated to ``frame_size``.
        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        results = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            mu = self.mu.apply(state)
            sigma = self.sigma.apply(state) + self.const
            coeff = self.coeff2.apply(self.coeff.apply(state),
                                      extra_ndim=state.ndim - 2) + self.const

            # Remember the incoming shape so the sampled chunk can be
            # restored to it (with the last axis set to frnn_step_size).
            shape_result = coeff.shape
            shape_result = tensor.set_subtensor(shape_result[-1],
                                                self.frnn_step_size)
            ndim_result = coeff.ndim

            mu = mu.reshape((-1, self.frnn_step_size, self.k))
            sigma = sigma.reshape((-1, self.frnn_step_size, self.k))
            coeff = coeff.reshape((-1, self.k))

            # Pick one mixture component per row.
            sample_coeff = self.theano_rng.multinomial(pvals=coeff,
                                                       dtype=coeff.dtype)
            idx = predict(sample_coeff, axis=-1)
            # idx = predict(coeff, axis = -1) use this line for using most likely coeff.

            # shapes (ls*bs)*(fs)
            mu = mu[tensor.arange(mu.shape[0]), :, idx]
            sigma = sigma[tensor.arange(sigma.shape[0]), :, idx]

            # Reparameterized Gaussian sample for the selected component.
            epsilon = self.theano_rng.normal(size=mu.shape, avg=0.0, std=1.0,
                                             dtype=mu.dtype)

            result = mu + sigma * epsilon  # *0.6 #reduce variance.
            result = result.reshape(shape_result, ndim=ndim_result)
            results.append(result)

            # if the total size does not correspond to the frame_size,
            # this removes the need for padding
            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state)
                    + self.frnn_linear_transition_input.apply(result)
                )

        results = tensor.stack(results, axis=-1)
        results = tensor.flatten(results, outdim=results.ndim - 1)

        # truncate if not good size
        if self.last_steps != 0:
            results = results[tuple([slice(0, None)] * (results.ndim - 1)
                                    + [slice(0, self.frame_size)])]

        return results

    @application
    def cost(self, readouts, outputs):
        """Negative log-likelihood of ``outputs`` under the step-wise GMMs.

        Runs the FRNN with teacher forcing: the ground-truth chunk of
        ``outputs`` for each step drives the state transition. Collects
        per-step mixture parameters, aligns them with the frame and
        delegates to ``FRNN_NLL``.
        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            freq_mu = self.mu.apply(state)
            freq_sigma = self.sigma.apply(state) + self.const
            freq_coeff = self.coeff2.apply(self.coeff.apply(state),
                                           extra_ndim=state.ndim - 2) + self.const

            freq_mu = freq_mu.reshape((-1, self.frnn_step_size, self.k))
            freq_sigma = freq_sigma.reshape((-1, self.frnn_step_size, self.k))
            freq_coeff = freq_coeff.reshape((-1, self.k))
            # mu,sigma: shape (-1,fs,k)
            # coeff: shape (-1,k)

            mus.append(freq_mu)
            sigmas.append(freq_sigma)
            coeffs.append(freq_coeff)

            # FIX: the teacher-forcing offset must advance with the step;
            # it was previously the constant `self.frnn_step_size`, so every
            # step consumed the same slice of the targets.
            index = i * self.frnn_step_size

            # Ground-truth chunk for this step drives the transition.
            # NOTE(review): on the final partial step (last_steps != 0) this
            # slice is shorter than frnn_step_size; it is only consumed when
            # not last_iteration, so the transition always sees full chunks.
            freq_inputs = inputs[
                tuple([slice(0, None)] * (inputs.ndim - 1)
                      + [slice(index, index + self.frnn_step_size)])
            ]

            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state)
                    + self.frnn_linear_transition_input.apply(freq_inputs)
                )

        mus = tensor.stack(mus, axis=-2)
        sigmas = tensor.stack(sigmas, axis=-2)
        coeffs = tensor.stack(coeffs, axis=-2)

        mus = mus.reshape((-1, self.frnn_step_size * self.number_of_steps,
                           self.k))
        sigmas = sigmas.reshape((-1,
                                 self.frnn_step_size * self.number_of_steps,
                                 self.k))
        # One mixture-weight vector per step, repeated over the step's values.
        coeffs = coeffs.repeat(self.frnn_step_size, axis=-2)

        # Drop the padding introduced by the rounded-up step count.
        mus = mus[tuple([slice(0, None)] * (mus.ndim - 2)
                        + [slice(0, self.frame_size)] + [slice(0, None)])]
        sigmas = sigmas[tuple([slice(0, None)] * (sigmas.ndim - 2)
                              + [slice(0, self.frame_size)]
                              + [slice(0, None)])]
        coeffs = coeffs[tuple([slice(0, None)] * (coeffs.ndim - 2)
                              + [slice(0, self.frame_size)]
                              + [slice(0, None)])]

        # actually prob not necessary
        mu = mus.reshape((-1, self.target_size))
        sigma = sigmas.reshape((-1, self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL(y=outputs, mu=mu, sig=sigma, coeff=coeff,
                        frame_size=self.frame_size, k=self.k)

    @application
    def initial_outputs(self, batch_size):
        """All-zero initial frame, shape (batch_size, frame_size)."""
        return tensor.zeros((batch_size, self.frame_size), dtype=floatX)

    def get_dim(self, name):
        """Report frame_size for 'outputs'; defer everything else."""
        # modification here to ensure the right dim.
        if name == "outputs":
            return self.frame_size

        return super(FRNNEmitter, self).get_dim(name)
def __init__(self, input_sources_list, input_sources_vocab_size_list,
             output_source, output_source_vocab_size,
             lookup_dim=200, hidden_size=256, recurrent_stack_size=1):
    """Build a multi-source recurrent language model.

    Each input source gets its own lookup table; the embeddings are merged
    into a single hidden representation, passed through a linear layer and
    a stack of SimpleRecurrent blocks, then projected to the output
    vocabulary with a softmax cross-entropy cost.

    Parameters
    ----------
    input_sources_list : list of str
        Names of the symbolic input sources (one lmatrix each).
    input_sources_vocab_size_list : list of int
        Vocabulary size per input source.
    output_source : str
        Name of the symbolic target source.
    output_source_vocab_size : int
        Vocabulary size of the target.
    lookup_dim : int
        Embedding dimension for every lookup table.
    hidden_size : int
        Dimension of the merged/hidden representation.
    recurrent_stack_size : int
        Number of stacked SimpleRecurrent layers.
    """
    self.InputSources = input_sources_list
    self.InputSourcesVocab = input_sources_vocab_size_list
    self.OutputSource = output_source
    self.OutputSourceVocab = output_source_vocab_size

    # Symbolic (presumably batch x time) integer matrices, one per source.
    inputs = [tensor.lmatrix(source) for source in input_sources_list]
    output = tensor.lmatrix(output_source)

    # get_lookups is defined elsewhere on this class; it builds one
    # LookupTable per input source.
    lookups = self.get_lookups(lookup_dim, input_sources_vocab_size_list)
    for lookup in lookups:
        lookup.initialize()

    # Merge all embeddings into one hidden_size vector per position.
    merge = Merge([lookup.name for lookup in lookups],
                  [lookup.dim for lookup in lookups],
                  hidden_size,
                  weights_init=initialization.Uniform(width=0.01),
                  biases_init=Constant(0))
    merge.initialize()

    linear0 = Linear(input_dim=hidden_size, output_dim=hidden_size,
                     weights_init=initialization.Uniform(width=0.01),
                     biases_init=Constant(0), name='linear0')
    linear0.initialize()

    recurrent_blocks = []

    for i in range(recurrent_stack_size):
        recurrent_blocks.append(SimpleRecurrent(
            dim=hidden_size, activation=Tanh(),
            weights_init=initialization.Uniform(width=0.01),
            use_bias=False))

    # Name each layer before initializing so parameter names are unique.
    for i, recurrent_block in enumerate(recurrent_blocks):
        recurrent_block.name = 'recurrent' + str(i + 1)
        recurrent_block.initialize()

    # Output projection onto the target vocabulary.
    linear_out = Linear(input_dim=hidden_size,
                        output_dim=output_source_vocab_size,
                        weights_init=initialization.Uniform(width=0.01),
                        biases_init=Constant(0), name='linear_out')
    linear_out.initialize()
    softmax = NDimensionalSoftmax(name='softmax')

    # Wire the graph: embed each source, merge, project, run the stack.
    lookup_outputs = [lookup.apply(input)
                      for lookup, input in zip(lookups, inputs)]

    m = merge.apply(*lookup_outputs)
    r = linear0.apply(m)
    for block in recurrent_blocks:
        r = block.apply(r)
    a = linear_out.apply(r)

    self.Cost = softmax.categorical_cross_entropy(output, a,
                                                  extra_ndim=1).mean()
    self.Cost.name = 'cost'

    y_hat = softmax.apply(a, extra_ndim=1)
    y_hat.name = 'y_hat'

    self.ComputationGraph = ComputationGraph(self.Cost)

    # Filled in later by the training/prediction setup.
    self.Function = None
    self.MainLoop = None
    self.Model = Model(y_hat)
weights_init=initialization.Uniform(width=0.01))  # (closes an RNN brick constructor begun above this chunk)
rnn.initialize()

# Output projection: RNN state -> durations-vocabulary logits.
linear_output = Linear(name='linear_output',
                       input_dim=hidden_layer_dim,
                       output_dim=train_dataset.durations_vocab_size(),
                       weights_init=initialization.Uniform(width=0.01),
                       biases_init=Constant(0))
linear_output.initialize()
softmax = NDimensionalSoftmax(name='ndim_softmax')

# Wire the graph: embed -> linear -> RNN -> output projection -> softmax.
activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)

cost = softmax.categorical_cross_entropy(y, activation_output,
                                         extra_ndim=1).mean()

from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Adam

cg = ComputationGraph([cost])

# NOTE(review): Adam is imported above but RMSProp + StepClipping are what
# is actually used (presumably imported earlier in the file) -- confirm
# which optimizer is intended.
step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95),
              StepClipping(1.0)]
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules),
                            on_unused_sources='ignore')
class FRNNEmitter(AbstractEmitter, Initializable, Random):
    """An RNN emitter for the case of real outputs.

    Emits a real-valued frame of ``frame_size`` values per readout, in
    chunks of ``frnn_step_size``: at each step, MLPs map a small recurrent
    ("FRNN") state to the parameters of a ``k``-component Gaussian mixture
    from which the chunk is sampled (``emit``) or scored (``cost``); the
    chunk then drives a linear state transition.

    Parameters
    ----------
    mlp : brick mapping readouts into the FRNN input space.
    target_size : int, flattened target size used by the NLL cost.
    frame_size : int, number of real values emitted per readout.
    k : int, number of Gaussian mixture components.
    frnn_hidden_size : int, dimension of the FRNN hidden state.
    frnn_step_size : int, number of values generated per FRNN step.
    const : float, stability constant added to sigma / mixture weights.
    """

    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size, \
            frnn_step_size, const=1e-5, **kwargs):
        super(FRNNEmitter, self).__init__(**kwargs)

        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim

        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size

        if self.last_steps != 0:
            self.number_of_steps += 1

        # Mixture-parameter heads: means, positive scales, mixture logits.
        self.mu = MLP(activations=[Identity()],
            dims=[frnn_hidden_size, k*frnn_step_size],
            name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
            dims=[frnn_hidden_size, k*frnn_step_size],
            name=self.name + "_sigma")

        self.coeff = MLP(activations=[Identity()],
            dims=[frnn_hidden_size, k],
            name=self.name + "_coeff")

        self.coeff2 = NDimensionalSoftmax()

        # Projects the (MLP-transformed) readout to the first FRNN state.
        self.frnn_initial_state = Linear(
            input_dim = self.input_dim,
            output_dim=frnn_hidden_size,
            name="frnn_initial_state")

        #self.frnn_hidden = Linear(
        #    input_dim=frnn_hidden_size,
        #    output_dim=frnn_hidden_size,
        #    activation=Tanh(),
        #    name="frnn_hidden")

        self.frnn_activation = Tanh(
            name="frnn_activation")

        # State transition: next_state = tanh(W_s state + W_i chunk).
        self.frnn_linear_transition_state = Linear (
            input_dim = frnn_hidden_size,
            output_dim= frnn_hidden_size,
            name="frnn_linear_transition_state")

        self.frnn_linear_transition_input = Linear (
            input_dim = self.frnn_step_size,
            output_dim = frnn_hidden_size,
            name="frnn_linear_transition_input")

        #self.frnn_linear_transition_output = Linear (
        #    input_dim = frnn_hidden_size,
        #    output_dim = self.rnn_hidden_dim,
        #    name="frnn_linear_transition_output")

        self.children = [self.mlp,self.mu,self.sigma,self.coeff,
            self.coeff2,self.frnn_initial_state,self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input]

    @application
    def emit(self,readouts):
        """Sample one frame per readout from the step-wise GMMs.

        (Docstring note: despite the text below, there is no
        keep_parameters argument -- only the sampled frame is returned.)

        keep_parameters is True if mu,sigma,coeffs must be stacked and
        returned if false, only the result is given, the others will be
        empty list.
        """
        # initial state
        state = self.frnn_initial_state.apply(\
            self.mlp.apply(readouts))

        results = []

        for i in range(self.number_of_steps):
            last_iteration = (i == self.number_of_steps - 1)

            # First generating distribution parameters and sampling.
            mu = self.mu.apply(state)
            sigma = self.sigma.apply(state) + self.const
            coeff = self.coeff2.apply(self.coeff.apply(state),\
                extra_ndim=state.ndim - 2) + self.const

            # Remember the incoming shape so the sampled chunk can be
            # restored to it (last axis set to frnn_step_size).
            shape_result = coeff.shape
            shape_result = tensor.set_subtensor(shape_result[-1],self.frnn_step_size)
            ndim_result = coeff.ndim

            mu = mu.reshape((-1, self.frnn_step_size,self.k))
            sigma = sigma.reshape((-1, self.frnn_step_size,self.k))
            coeff = coeff.reshape((-1, self.k))

            # Pick one mixture component per row.
            sample_coeff = self.theano_rng.multinomial(pvals = coeff, dtype=coeff.dtype)
            idx = predict(sample_coeff, axis = -1)
            #idx = predict(coeff, axis = -1) use this line for using most likely coeff.

            #shapes (ls*bs)*(fs)
            mu = mu[tensor.arange(mu.shape[0]), :,idx]
            sigma = sigma[tensor.arange(sigma.shape[0]), :,idx]

            # Reparameterized Gaussian sample for the chosen component.
            epsilon = self.theano_rng.normal(
                size=mu.shape, avg=0., std=1., dtype=mu.dtype)

            result = mu + sigma*epsilon#*0.6 #reduce variance.
            result = result.reshape(shape_result, ndim = ndim_result)
            results.append(result)

            # if the total size does not correspond to the frame_size,
            #this removes the need for padding
            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state) +
                    self.frnn_linear_transition_input.apply(result))

        results = tensor.stack(results,axis=-1)
        results = tensor.flatten(results,outdim=results.ndim-1)

        # truncate if not good size
        if self.last_steps != 0:
            results = results[tuple([slice(0,None)] * \
                (results.ndim-1) +[slice(0,self.frame_size)])]

        return results

    @application
    def cost(self, readouts, outputs):
        """Negative log-likelihood of ``outputs`` under the step-wise GMMs,
        with teacher forcing (ground-truth chunks drive the transition)."""
        # initial state
        state = self.frnn_initial_state.apply(\
            self.mlp.apply(readouts))

        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = (i == self.number_of_steps - 1)

            # First generating distribution parameters and sampling.
            freq_mu = self.mu.apply(state)
            freq_sigma = self.sigma.apply(state) + self.const
            freq_coeff = self.coeff2.apply(self.coeff.apply(state),\
                extra_ndim=state.ndim - 2) + self.const

            freq_mu = freq_mu.reshape((-1,self.frnn_step_size,self.k))
            freq_sigma = freq_sigma.reshape((-1,self.frnn_step_size,self.k))
            freq_coeff = freq_coeff.reshape((-1,self.k))
            #mu,sigma: shape (-1,fs,k)
            #coeff: shape (-1,k)

            mus.append(freq_mu)
            sigmas.append(freq_sigma)
            coeffs.append(freq_coeff)

            # NOTE(review): this offset never advances with i, so every
            # step reads the same slice of the targets; it looks like it
            # should be i * self.frnn_step_size -- confirm.
            index = self.frnn_step_size
            freq_inputs = inputs[tuple([slice(0,None)] * \
                (inputs.ndim-1) +[slice(index,index+self.frnn_step_size)])]

            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state) +
                    self.frnn_linear_transition_input.apply(freq_inputs))

        mus = tensor.stack(mus,axis=-2)
        sigmas = tensor.stack(sigmas,axis=-2)
        coeffs = tensor.stack(coeffs,axis=-2)

        mus = mus.reshape((-1,self.frnn_step_size*self.number_of_steps,self.k))
        sigmas = sigmas.reshape((-1,self.frnn_step_size*self.number_of_steps,self.k))
        # One weight vector per step, repeated over that step's values.
        coeffs = coeffs.repeat(self.frnn_step_size,axis=-2)

        # Drop padding introduced by the rounded-up step count.
        mus = mus[tuple([slice(0,None)] * \
            (mus.ndim-2) +[slice(0,self.frame_size)] + [slice(0,None)])]
        sigmas = sigmas[tuple([slice(0,None)] * \
            (sigmas.ndim-2) +[slice(0,self.frame_size)] + [slice(0,None)])]
        coeffs = coeffs[tuple([slice(0,None)] * \
            (coeffs.ndim-2) +[slice(0,self.frame_size)] + [slice(0,None)])]

        # actually prob not necessary
        mu = mus.reshape((-1,self.target_size))
        sigma = sigmas.reshape((-1,self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL (y=outputs, mu=mu, sig=sigma, coeff=coeff,\
            frame_size=self.frame_size,k=self.k)

    @application
    def initial_outputs(self, batch_size):
        """All-zero initial frame, shape (batch_size, frame_size)."""
        return tensor.zeros((batch_size, self.frame_size), dtype=floatX)

    def get_dim(self, name):
        """Report frame_size for 'outputs'; defer everything else."""
        # modification here to ensure the right dim.
        if name == 'outputs':
            return self.frame_size

        return super(FRNNEmitter, self).get_dim(name)
class Seq2Seq(Initializable):
    """ seq2seq model

    Builds an encoder/decoder LSTM pair over a shared lookup table. The
    ``apply`` method currently runs only the encoder and scores next-token
    prediction with a softmax; the decoder bricks are constructed (and
    trained through any graph that uses them) but not used in ``apply``.

    Parameters
    ----------
    emb_dim: int
        The dimension of word embeddings (including for def model if standalone)
    dim : int
        The dimension of the RNNs states (including for def model if standalone)
    num_input_words : int
        The size of the LM's input vocabulary.
    num_output_words : int
        The size of the LM's output vocabulary.
    vocab
        The vocabulary object.
    """

    def __init__(self, emb_dim, dim, num_input_words,
                 num_output_words, vocab, **kwargs):
        # 0 means "use the default": dim for emb_dim, full vocab for counts.
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if num_output_words == 0:
            num_output_words = vocab.size()

        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab

        self._word_to_id = WordToIdOp(self._vocab)

        children = []

        self._main_lookup = LookupTable(self._num_input_words, emb_dim,
                                        name='main_lookup')
        # Forks project embeddings to the 4*dim gate inputs an LSTM expects.
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        self._decoder_fork = Linear(emb_dim, 4 * dim, name='decoder_fork')
        self._decoder_rnn = LSTM(dim, name='decoder_rnn')
        children.extend([self._main_lookup,
                         self._encoder_fork, self._encoder_rnn,
                         self._decoder_fork, self._decoder_rnn])
        self._pre_softmax = Linear(dim, self._num_output_words)
        self._softmax = NDimensionalSoftmax()
        children.extend([self._pre_softmax, self._softmax])

        # FIX: was super(LanguageModel, self) -- a copy-paste from another
        # model class; it must name this class for the MRO walk.
        super(Seq2Seq, self).__init__(children=children, **kwargs)

    def set_def_embeddings(self, embeddings):
        """Overwrite the lookup-table weights with `embeddings`."""
        self._main_lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def get_def_embeddings_params(self):
        """Return the shared variable holding the embedding matrix."""
        return self._main_lookup.parameters[0]

    def add_perplexity_measure(self, application_call, minus_logs, mask, name):
        """Attach exp(masked mean NLL) as an auxiliary variable; return the
        per-sequence summed costs."""
        costs = (minus_logs * mask).sum(axis=0)
        perplexity = tensor.exp(costs.sum() / mask.sum())
        perplexity.tag.aggregation_scheme = Perplexity(
            costs.sum(), mask.sum())
        application_call.add_auxiliary_variable(perplexity, name=name)
        return costs

    @application
    def apply(self, application_call, words, mask):
        """Compute the log-likelihood for a batch of sequences.

        words
            An integer matrix of shape (B, T), where T is the number of time
            step, B is the batch size. Note that this order of the axis is
            different from what all RNN bricks consume, hence and the axis
            should be transposed at some point.
        mask
            A float32 matrix of shape (B, T). Zeros indicate the padding.
        """
        word_ids = self._word_to_id(words)

        # shortlisting: map out-of-shortlist ids to UNK for input/output.
        input_word_ids = (
            tensor.lt(word_ids, self._num_input_words) * word_ids
            + tensor.ge(word_ids, self._num_input_words) * self._vocab.unk)
        output_word_ids = (
            tensor.lt(word_ids, self._num_output_words) * word_ids
            + tensor.ge(word_ids, self._num_output_words) * self._vocab.unk)

        application_call.add_auxiliary_variable(
            unk_ratio(input_word_ids, mask, self._vocab.unk),
            name='unk_ratio')

        # Run the main rnn with combined inputs; transpose to the
        # (T, B, ...) layout the RNN bricks consume.
        rnn_inputs = self._main_lookup.apply(input_word_ids)
        encoder_rnn_states = self._encoder_rnn.apply(
            tensor.transpose(self._encoder_fork.apply(rnn_inputs), (1, 0, 2)),
            mask=mask.T)[0]

        # The first token is not predicted.
        # FIX: was `main_rnn_states`, a name never defined in this method;
        # the encoder states are the only RNN states computed here.
        logits = self._pre_softmax.apply(encoder_rnn_states[:-1])
        targets = output_word_ids.T[1:]
        out_softmax = self._softmax.apply(logits, extra_ndim=1)
        application_call.add_auxiliary_variable(
            out_softmax.copy(), name="proba_out")
        minus_logs = self._softmax.categorical_cross_entropy(
            targets, logits, extra_ndim=1)

        targets_mask = mask.T[1:]
        costs = self.add_perplexity_measure(application_call, minus_logs,
                                            targets_mask,
                                            "perplexity")

        missing_embs = tensor.eq(input_word_ids,
                                 self._vocab.unk).astype('int32')  # (bs, L)
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * missing_embs.T[:-1],
                                    "perplexity_after_mis_word_embs")
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * (1 - missing_embs.T[:-1]),
                                    "perplexity_after_word_embs")

        # FIX: removed a copy-pasted tail that referenced attributes this
        # class never defines (`_word_to_count`, `_very_rare_threshold`,
        # `_retrieval`, `def_map`) and an undefined `updates` in the return
        # -- it raised NameError/AttributeError on every call, so no caller
        # can depend on it. The method now returns the per-sequence costs.
        return costs
parser.add_argument('-temperature', type=float, default=1.0,
                    help='temperature of sampling')
args = parser.parse_args()

# Define primetext
ix_to_char, char_to_ix, vocab_size = get_metadata(hdf5_file)
if args.primetext and len(args.primetext) > 0:
    # Keep only characters that exist in the model's vocabulary, then turn
    # the primetext into a (len, 1) column of character ids.
    primetext = ''.join(
        [ch for ch in args.primetext if ch in char_to_ix.keys()])
    x_curr = numpy.expand_dims(
        numpy.array([char_to_ix[ch] for ch in primetext], dtype='uint8'),
        axis=1)
else:
    # No primetext given: seed sampling from a batch of the dev stream.
    dev_stream = get_stream(hdf5_file, 'dev', batch_size)
    x_curr, y_curr = dev_stream.get_epoch_iterator().next()  # Python 2 iterator API
    x_curr = x_curr[:, -1].reshape(seq_length, 1)

print 'Loading model from {0}...'.format(args.model)
main_loop = load(args.model)
print 'Model loaded. Building prediction function...'
model = main_loop.model
# NOTE(review): relies on the saved model exposing exactly (y, x) in this
# order -- confirm against the training script that built it.
y, x = model.inputs
softmax = NDimensionalSoftmax()
# Fish the pre-softmax activations out of the saved graph by name.
linear_output = [
    v for v in model.variables if v.name == 'linear_output'][0]
y_hat = softmax.apply(linear_output, extra_ndim=1)
predict = theano.function([x], y_hat)
print 'Starting sampling'
sample_string = sample(args.length, x_curr, predict, ix_to_char,
                       seed=args.seed, temperature=args.temperature)
class ExtractiveQAModel(Initializable):
    """The dictionary-equipped extractive QA model.

    Encodes context and question with a shared LSTM encoder, aligns them
    with a (co)attention softmax over an affinity matrix, runs a
    bidirectional LSTM over the combined representation and reads out
    answer-begin / answer-end logits.

    Parameters
    ----------
    dim : int
        The default dimensionality for the components.
    emb_dim : int
        The dimensionality for the embeddings. If 0, `dim` is used.
    coattention : bool
        Use the coattention mechanism.
    num_input_words : int
        The number of input words. If 0, `vocab.size()` is used.
    vocab
        The vocabulary object.
    use_definitions : bool
        Triggers the use of definitions.
    reuse_word_embeddings : bool
    compose_type : str
    """

    def __init__(self, dim, emb_dim, readout_dims, num_input_words,
                 def_num_input_words, vocab, use_definitions, def_word_gating,
                 compose_type, coattention, def_reader, reuse_word_embeddings,
                 random_unk, **kwargs):
        self._vocab = vocab
        # 0 means "use the default" for these sizes.
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        self._coattention = coattention
        self._num_input_words = num_input_words
        self._use_definitions = use_definitions
        self._random_unk = random_unk
        self._reuse_word_embeddings = reuse_word_embeddings

        lookup_num_words = num_input_words
        if reuse_word_embeddings:
            lookup_num_words = max(num_input_words, def_num_input_words)
        if random_unk:
            lookup_num_words = vocab.size()

        # Dima: we can have slightly less copy-paste here if we
        # copy the RecurrentFromFork class from my other projects.
        children = []
        self._lookup = LookupTable(lookup_num_words, emb_dim)
        # Fork projects embeddings to the 4*dim gate inputs the LSTM expects.
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        self._question_transform = Linear(dim, dim, name='question_transform')
        # With coattention the bidir input carries one extra dim-block.
        self._bidir_fork = Linear(3 * dim if coattention else 2 * dim,
                                  4 * dim, name='bidir_fork')
        self._bidir = Bidirectional(LSTM(dim), name='bidir')
        children.extend([
            self._lookup, self._encoder_fork, self._encoder_rnn,
            self._question_transform, self._bidir, self._bidir_fork
        ])

        # Begin/end readouts: MLPs ending in a single logit per position.
        activations = [Rectifier()] * len(readout_dims) + [None]
        readout_dims = [2 * dim] + readout_dims + [1]
        self._begin_readout = MLP(activations, readout_dims,
                                  name='begin_readout')
        self._end_readout = MLP(activations, readout_dims, name='end_readout')
        self._softmax = NDimensionalSoftmax()
        children.extend(
            [self._begin_readout, self._end_readout, self._softmax])

        if self._use_definitions:
            # A potential bug here: we pass the same vocab to the def reader.
            # If a different token is reserved for UNK in text and in the definitions,
            # we can be screwed.
            # NOTE(review): eval() on a config string -- acceptable only for
            # trusted configuration; never feed it untrusted input.
            def_reader_class = eval(def_reader)
            def_reader_kwargs = dict(
                num_input_words=def_num_input_words,
                dim=dim,
                emb_dim=emb_dim,
                vocab=vocab,
                lookup=self._lookup if reuse_word_embeddings else None)
            if def_reader_class == MeanPoolReadDefinitions:
                def_reader_kwargs.update(dict(normalize=True,
                                              translate=False))
            self._def_reader = def_reader_class(**def_reader_kwargs)
            self._combiner = MeanPoolCombiner(dim=dim, emb_dim=emb_dim,
                                              def_word_gating=def_word_gating,
                                              compose_type=compose_type)
            children.extend([self._def_reader, self._combiner])

        super(ExtractiveQAModel, self).__init__(children=children, **kwargs)

        # create default input variables
        self.contexts = tensor.lmatrix('contexts')
        self.context_mask = tensor.matrix('contexts_mask')
        self.questions = tensor.lmatrix('questions')
        self.question_mask = tensor.matrix('questions_mask')
        self.answer_begins = tensor.lvector('answer_begins')
        self.answer_ends = tensor.lvector('answer_ends')
        input_vars = [
            self.contexts, self.context_mask, self.questions,
            self.question_mask, self.answer_begins, self.answer_ends
        ]
        if self._use_definitions:
            self.defs = tensor.lmatrix('defs')
            self.def_mask = tensor.matrix('def_mask')
            self.contexts_def_map = tensor.lmatrix('contexts_def_map')
            self.questions_def_map = tensor.lmatrix('questions_def_map')
            input_vars.extend([
                self.defs, self.def_mask, self.contexts_def_map,
                self.questions_def_map
            ])
        self.input_vars = OrderedDict([(var.name, var) for var in input_vars])

    def set_embeddings(self, embeddings):
        """Overwrite the lookup-table weights with `embeddings`."""
        self._lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def embeddings_var(self):
        """Return the shared variable holding the embedding matrix."""
        return self._lookup.parameters[0]

    def def_reading_parameters(self):
        """Parameters of the definition-reading path (reader + combiner),
        excluding shared lookup parameters when embeddings are reused."""
        # NOTE(review): .values() followed by list.extend implies Python 2
        # (dict.values() returning a list) -- needs list() wrappers on py3.
        parameters = Selector(self._def_reader).get_parameters().values()
        parameters.extend(Selector(self._combiner).get_parameters().values())
        if self._reuse_word_embeddings:
            lookup_parameters = Selector(
                self._lookup).get_parameters().values()
            parameters = [p for p in parameters
                          if p not in lookup_parameters]
        return parameters

    @application
    def _encode(self, application_call, text, mask, def_embs=None,
                def_map=None, text_name=None):
        """Embed `text`, optionally mix in definition embeddings, and run
        the shared LSTM encoder. Returns batch-major encoded states."""
        if not self._random_unk:
            # Map out-of-shortlist word ids to UNK.
            text = (tensor.lt(text, self._num_input_words) * text
                    + tensor.ge(text, self._num_input_words)
                    * self._vocab.unk)
        if text_name:
            application_call.add_auxiliary_variable(
                unk_ratio(text, mask, self._vocab.unk),
                name='{}_unk_ratio'.format(text_name))
        embs = self._lookup.apply(text)
        if self._random_unk:
            # Keep OOV embeddings in the graph but stop their gradients.
            embs = (tensor.lt(text, self._num_input_words)[:, :, None] * embs
                    + tensor.ge(text, self._num_input_words)[:, :, None]
                    * disconnected_grad(embs))
        if def_embs:
            embs = self._combiner.apply(embs, mask, def_embs, def_map)
        add_role(embs, EMBEDDINGS)
        # flip01 swaps batch/time so the RNN sees time-major input.
        encoded = flip01(
            self._encoder_rnn.apply(
                self._encoder_fork.apply(flip01(embs)),
                mask=mask.T)[0])
        return encoded

    @application
    def apply(self, application_call, contexts, contexts_mask, questions,
              questions_mask, answer_begins, answer_ends, defs=None,
              def_mask=None, contexts_def_map=None, questions_def_map=None):
        """Return per-example begin + end cross-entropy costs; attaches
        attention maps, predictions and exact-match as auxiliaries."""
        def_embs = None
        if self._use_definitions:
            def_embs = self._def_reader.apply(defs, def_mask)

        context_enc = self._encode(contexts, contexts_mask, def_embs,
                                   contexts_def_map, 'context')
        question_enc_pre = self._encode(questions, questions_mask, def_embs,
                                        questions_def_map, 'question')
        question_enc = tensor.tanh(
            self._question_transform.apply(question_enc_pre))

        # should be (batch size, context length, question_length)
        affinity = tensor.batched_dot(context_enc, flip12(question_enc))
        # Push padded positions to a large negative value before softmax.
        affinity_mask = contexts_mask[:, :, None] * questions_mask[:, None, :]
        affinity = affinity * affinity_mask - 1000.0 * (1 - affinity_mask)

        # soft-aligns every position in the context to positions in the question
        d2q_att_weights = self._softmax.apply(affinity, extra_ndim=1)
        application_call.add_auxiliary_variable(d2q_att_weights.copy(),
                                               name='d2q_att_weights')

        # soft-aligns every position in the question to positions in the document
        q2d_att_weights = self._softmax.apply(flip12(affinity), extra_ndim=1)
        application_call.add_auxiliary_variable(q2d_att_weights.copy(),
                                               name='q2d_att_weights')

        # question encoding "in the view of the document"
        question_enc_informed = tensor.batched_dot(q2d_att_weights,
                                                   context_enc)
        question_enc_concatenated = tensor.concatenate(
            [question_enc, question_enc_informed], 2)

        # document encoding "in the view of the question"
        context_enc_informed = tensor.batched_dot(d2q_att_weights,
                                                  question_enc_concatenated)

        if self._coattention:
            context_enc_concatenated = tensor.concatenate(
                [context_enc, context_enc_informed], 2)
        else:
            # Fallback: append the final question state to every position.
            question_repr_repeated = tensor.repeat(question_enc[:, [-1], :],
                                                   context_enc.shape[1],
                                                   axis=1)
            context_enc_concatenated = tensor.concatenate(
                [context_enc, question_repr_repeated], 2)

        # note: forward and backward LSTMs share the
        # input weights in the current impl
        bidir_states = flip01(
            self._bidir.apply(self._bidir_fork.apply(
                flip01(context_enc_concatenated)),
                mask=contexts_mask.T)[0])

        begin_readouts = self._begin_readout.apply(bidir_states)[:, :, 0]
        begin_readouts = begin_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        begin_costs = self._softmax.categorical_cross_entropy(
            answer_begins, begin_readouts)

        end_readouts = self._end_readout.apply(bidir_states)[:, :, 0]
        end_readouts = end_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        end_costs = self._softmax.categorical_cross_entropy(
            answer_ends, end_readouts)

        predicted_begins = begin_readouts.argmax(axis=-1)
        predicted_ends = end_readouts.argmax(axis=-1)
        exact_match = (tensor.eq(predicted_begins, answer_begins)
                       * tensor.eq(predicted_ends, answer_ends))
        application_call.add_auxiliary_variable(predicted_begins,
                                               name='predicted_begins')
        application_call.add_auxiliary_variable(predicted_ends,
                                               name='predicted_ends')
        application_call.add_auxiliary_variable(exact_match,
                                               name='exact_match')

        return begin_costs + end_costs

    def apply_with_default_vars(self):
        """Call `apply` on the default symbolic inputs built in __init__."""
        return self.apply(*self.input_vars.values())
class ESIM(Initializable):
    """
    ESIM model based on https://github.com/NYU-MLL/multiNLI/blob/master/python/models/esim.py

    Enhanced Sequential Inference Model for NLI: embeds premise (s1) and
    hypothesis (s2), encodes each with a BiLSTM, soft-aligns them via an
    attention matrix E, composes the aligned representations, re-encodes
    with a second BiLSTM, pools, and classifies into 3 classes.
    Optionally augments embeddings with dictionary-definition readers.
    """

    # seq_length, emb_dim, hidden_dim
    def __init__(
            self,
            dim,                       # LSTM hidden dim (per direction)
            emb_dim,                   # word-embedding dimensionality
            vocab,                     # vocabulary object with .size(), .unk
            def_emb_translate_dim=-1,  # <0 means: use emb_dim
            def_dim=-1,                # <0 means: use emb_dim
            encoder='bilstm',          # only 'bilstm' is implemented
            bn=True,                   # batch-normalized MLP heads
            def_reader=None,           # optional definition reader brick
            def_combiner=None,         # optional definition combiner brick
            dropout=0.5,
            num_input_words=-1,        # shortlist size; <=0 means full vocab
            # Others
            **kwargs):
        self._dropout = dropout
        self._vocab = vocab
        self._emb_dim = emb_dim
        self._def_reader = def_reader
        self._def_combiner = def_combiner

        if encoder != 'bilstm':
            raise NotImplementedError()

        if def_emb_translate_dim < 0:
            self.def_emb_translate_dim = emb_dim
        else:
            self.def_emb_translate_dim = def_emb_translate_dim

        if def_dim < 0:
            self._def_dim = emb_dim
        else:
            self._def_dim = def_dim

        if num_input_words > 0:
            logger.info("Restricting vocab to " + str(num_input_words))
            self._num_input_words = num_input_words
        else:
            self._num_input_words = vocab.size()

        children = []

        # Optional projection of embeddings before the definition combiner.
        if self.def_emb_translate_dim != self._emb_dim:
            self._translate_pre_def = Linear(
                input_dim=emb_dim, output_dim=def_emb_translate_dim)
            children.append(self._translate_pre_def)
        else:
            self._translate_pre_def = None

        ## Embedding
        self._lookup = LookupTable(self._num_input_words, emb_dim,
                                   weights_init=GlorotUniform())
        children.append(self._lookup)

        if def_reader:
            self._final_emb_dim = self._def_dim
            self._def_reader = def_reader
            self._def_combiner = def_combiner
            children.extend([self._def_reader, self._def_combiner])
        else:
            self._final_emb_dim = self._emb_dim

        ## BiLSTM
        # Forks map the embedding dim to 4*dim (LSTM gate inputs).
        self._hyp_bidir_fork = Linear(
            self._def_dim if def_reader else self._emb_dim, 4 * dim,
            name='hyp_bidir_fork')
        self._hyp_bidir = Bidirectional(LSTM(dim), name='hyp_bidir')
        self._prem_bidir_fork = Linear(
            self._def_dim if def_reader else self._emb_dim, 4 * dim,
            name='prem_bidir_fork')
        self._prem_bidir = Bidirectional(LSTM(dim), name='prem_bidir')
        children.extend([self._hyp_bidir_fork, self._hyp_bidir])
        children.extend([self._prem_bidir, self._prem_bidir_fork])

        ## BiLSTM no. 2 (encoded attentioned embeddings)
        # Input is the 8*dim composed vector [a; a~; a-a~; a*a~].
        self._hyp_bidir_fork2 = Linear(8 * dim, 4 * dim,
                                       name='hyp_bidir_fork2')
        self._hyp_bidir2 = Bidirectional(LSTM(dim), name='hyp_bidir2')
        self._prem_bidir_fork2 = Linear(8 * dim, 4 * dim,
                                        name='prem_bidir_fork2')
        self._prem_bidir2 = Bidirectional(LSTM(dim), name='prem_bidir2')
        children.extend([self._hyp_bidir_fork2, self._hyp_bidir2])
        children.extend([self._prem_bidir2, self._prem_bidir_fork2])

        self._rnns = [
            self._prem_bidir2, self._hyp_bidir2, self._prem_bidir,
            self._hyp_bidir
        ]

        ## MLP
        if bn:
            self._mlp = BatchNormalizedMLP([Tanh()], [8 * dim, dim],
                                           conserve_memory=False, name="mlp")
            self._pred = BatchNormalizedMLP([Softmax()], [dim, 3],
                                            conserve_memory=False,
                                            name="pred_mlp")
        else:
            self._mlp = MLP([Tanh()], [8 * dim, dim], name="mlp")
            self._pred = MLP([Softmax()], [dim, 3], name="pred_mlp")
        children.append(self._mlp)
        children.append(self._pred)

        ## Softmax
        self._ndim_softmax = NDimensionalSoftmax()
        children.append(self._ndim_softmax)

        super(ESIM, self).__init__(children=children, **kwargs)

    def get_embeddings_lookups(self):
        """Return the lookup brick(s) holding the word embeddings."""
        return [self._lookup]

    def set_embeddings(self, embeddings):
        """Overwrite the word-embedding matrix with a numpy array."""
        self._lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def get_def_embeddings_lookups(self):
        """Return the lookup brick(s) used by the definition reader."""
        return [self._def_reader._def_lookup]

    def set_def_embeddings(self, embeddings):
        """Overwrite the definition-reader embedding matrix."""
        self._def_reader._def_lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    @application
    def apply(self, application_call, s1_preunk, s1_mask, s2_preunk, s2_mask,
              def_mask=None, defs=None, s1_def_map=None, s2_def_map=None,
              train_phase=True):
        """Build the ESIM graph and return (batch, 3) class probabilities.

        s1_preunk / s2_preunk are assumed (batch, seq_len) word-id matrices
        before UNK-mapping; s*_mask are matching 0/1 float masks -- TODO
        confirm shapes against caller.
        """
        # Shortlist words (sometimes we want smaller vocab, especially when dict is small)
        s1 = (tensor.lt(s1_preunk, self._num_input_words) * s1_preunk +
              tensor.ge(s1_preunk, self._num_input_words) * self._vocab.unk)
        s2 = (tensor.lt(s2_preunk, self._num_input_words) * s2_preunk +
              tensor.ge(s2_preunk, self._num_input_words) * self._vocab.unk)

        ### Embed ###
        s1_emb = self._lookup.apply(s1)
        s2_emb = self._lookup.apply(s2)
        # 1 * s1_emb makes a copy so the auxiliary variable has its own node.
        application_call.add_auxiliary_variable(1 * s1_emb,
                                                name='s1_word_embeddings')

        if self._def_reader:
            assert defs is not None
            def_embs = self._def_reader.apply(defs, def_mask)
            if self._translate_pre_def:
                logger.info("Translate pre def")
                # Flatten (batch, seq, emb) -> (batch*seq, emb) for Linear,
                # then restore the sequence layout.
                s1_emb = s1_emb.reshape(
                    (s1_emb.shape[0] * s1_emb.shape[1], s1_emb.shape[2]))
                s2_emb = s2_emb.reshape(
                    (s2_emb.shape[0] * s2_emb.shape[1], s2_emb.shape[2]))
                s1_emb = self._translate_pre_def.apply(s1_emb)
                s2_emb = self._translate_pre_def.apply(s2_emb)
                s1_emb = s1_emb.reshape(
                    (s1_preunk.shape[0], s1_preunk.shape[1], -1))
                s2_emb = s2_emb.reshape(
                    (s2_preunk.shape[0], s2_preunk.shape[1], -1))
            s1_emb = self._def_combiner.apply(s1_emb, s1_mask, def_embs,
                                              s1_def_map, word_ids=s1,
                                              train_phase=train_phase,
                                              call_name="s1")
            s2_emb = self._def_combiner.apply(s2_emb, s2_mask, def_embs,
                                              s2_def_map, word_ids=s2,
                                              train_phase=train_phase,
                                              call_name="s2")
        else:
            if train_phase and self._dropout > 0:
                s1_emb = apply_dropout(s1_emb, drop_prob=self._dropout)
                s2_emb = apply_dropout(s2_emb, drop_prob=self._dropout)

        ### Encode ###
        # TODO: Share this bilstm?
        s1_bilstm, _ = self._prem_bidir.apply(
            flip01(self._prem_bidir_fork.apply(s1_emb)),
            mask=s1_mask.T)  # (batch_size, n_seq, 2 * dim)
        s2_bilstm, _ = self._hyp_bidir.apply(
            flip01(self._hyp_bidir_fork.apply(s2_emb)),
            mask=s2_mask.T)  # (batch_size, n_seq, 2 * dim)
        s1_bilstm = flip01(s1_bilstm)
        s2_bilstm = flip01(s2_bilstm)

        ### Attention ###
        # Compute E matrix (eq. 11)
        # E_ij = <s1[i], s2[j]>
        # each call computes E[i, :]
        def compute_e_row(s2_i, s1_bilstm, s1_mask):
            b_size = s1_bilstm.shape[0]
            # s2_i is (batch_size, emb_dim)
            # s1_bilstm is (batch_size, seq_len, emb_dim)
            # s1_mask is (batch_size, seq_len)
            # s2_i = s2_i.reshape((s2_i.shape[0], s2_i.shape[1], 1))
            s2_i = s2_i.reshape((b_size, s2_i.shape[1], 1))
            s2_i = T.repeat(s2_i, 2, axis=2)
            # s2_i is (batch_size, emb_dim, 2)
            assert s1_bilstm.ndim == 3
            assert s2_i.ndim == 3
            # NOTE(review): the repeat to width 2 followed by taking
            # column 0 looks redundant -- only the first column is used.
            score = T.batched_dot(s1_bilstm, s2_i)  # (batch_size, seq_len, 1)
            score = score[:, :, 0].reshape(
                (b_size, -1))  # (batch_size, seq_len)
            return score  # E[i, :]

        # NOTE: No point in masking here
        E, _ = theano.scan(compute_e_row,
                           sequences=[s1_bilstm.transpose(1, 0, 2)],
                           non_sequences=[s2_bilstm, s2_mask])
        # (seq_len, batch_size, seq_len)
        E = E.dimshuffle(1, 0, 2)
        assert E.ndim == 3

        s2s_att_weights = self._ndim_softmax.apply(E, extra_ndim=1)
        application_call.add_auxiliary_variable(s2s_att_weights.copy(),
                                                name='s2s_att_weights')
        # (batch_size, seq_len, seq_len)

        ### Compute tilde vectors (eq. 12 and 13) ###
        def compute_tilde_vector(e_i, s, s_mask):
            # e_i is (batch_size, seq_len)
            # s_mask is (batch_size, seq_len)
            # s_tilde_i = \sum e_ij b_j, (batch_size, seq_len)
            score = masked_softmax(e_i, s_mask, axis=1)
            score = score.dimshuffle(0, 1, "x")
            s_tilde_i = (score *
                         (s * s_mask.dimshuffle(0, 1, "x"))).sum(axis=1)
            return s_tilde_i

        # (batch_size, seq_len, def_dim)
        s1_tilde, _ = theano.scan(compute_tilde_vector,
                                  sequences=[E.dimshuffle(1, 0, 2)],
                                  non_sequences=[s2_bilstm, s2_mask])
        s1_tilde = s1_tilde.dimshuffle(1, 0, 2)
        s2_tilde, _ = theano.scan(compute_tilde_vector,
                                  sequences=[E.dimshuffle(2, 0, 1)],
                                  non_sequences=[s1_bilstm, s1_mask])
        s2_tilde = s2_tilde.dimshuffle(1, 0, 2)

        ### Compose (eq. 14 and 15) ###
        # (batch_size, seq_len, 8 * dim)
        s1_comp = T.concatenate(
            [s1_bilstm, s1_tilde, s1_bilstm - s1_tilde,
             s1_bilstm * s1_tilde], axis=2)
        s2_comp = T.concatenate(
            [s2_bilstm, s2_tilde, s2_bilstm - s2_tilde,
             s2_bilstm * s2_tilde], axis=2)

        ### Encode (eq. 16 and 17) ###
        # (batch_size, seq_len, 8 * dim)
        # TODO: Share this bilstm?
        s1_comp_bilstm, _ = self._prem_bidir2.apply(
            self._prem_bidir_fork2.apply(flip01(s1_comp)),
            mask=s1_mask.T)  # (batch_size, n_seq, 2 * dim)
        s2_comp_bilstm, _ = self._hyp_bidir2.apply(
            self._hyp_bidir_fork2.apply(flip01(s2_comp)),
            mask=s2_mask.T)  # (batch_size, n_seq, 2 * dim)
        s1_comp_bilstm = flip01(s1_comp_bilstm)
        s2_comp_bilstm = flip01(s2_comp_bilstm)

        ### Pooling Layer ###
        # Masked mean and masked max over the time axis.
        s1_comp_bilstm_ave = (s1_mask.dimshuffle(0, 1, "x") * s1_comp_bilstm).sum(axis=1) \
            / s1_mask.sum(axis=1).dimshuffle(0, "x")
        s1_comp_bilstm_max = T.max(
            ((1 - s1_mask.dimshuffle(0, 1, "x")) * -10000) + \
            (s1_mask.dimshuffle(0, 1, "x")) * s1_comp_bilstm,
            axis=1)
        s2_comp_bilstm_ave = (s2_mask.dimshuffle(0, 1, "x") * s2_comp_bilstm).sum(axis=1) \
            / s2_mask.sum(axis=1).dimshuffle(0, "x")
        # (batch_size, dim)
        s2_comp_bilstm_max = T.max(
            ((1 - s2_mask.dimshuffle(0, 1, "x")) * -10000) + \
            (s2_mask.dimshuffle(0, 1, "x")) * s2_comp_bilstm,
            axis=1)

        ### Final classifier ###
        # MLP layer
        # (batch_size, 8 * dim)
        m = T.concatenate([
            s1_comp_bilstm_ave, s1_comp_bilstm_max, s2_comp_bilstm_ave,
            s2_comp_bilstm_max
        ], axis=1)

        pre_logits = self._mlp.apply(m)
        if train_phase:
            pre_logits = apply_dropout(pre_logits, drop_prob=self._dropout)

        # Get prediction
        self.logits = self._pred.apply(pre_logits)
        return self.logits
class MinRiskInitialContextSequenceGenerator(InitialContextSequenceGenerator):
    """Sequence generator with a minimum-risk (expected-cost) objective.

    Extends ``InitialContextSequenceGenerator`` with an ``expected_cost``
    application that, given sampled target sequences and their external
    scores (e.g. sentence-level metrics), computes the model's smoothed
    expected score over the samples.
    """

    def __init__(self, *args, **kwargs):
        self.softmax = NDimensionalSoftmax()
        super(MinRiskInitialContextSequenceGenerator,
              self).__init__(*args, **kwargs)
        # Register the softmax brick as a child so it participates in the
        # brick hierarchy (it has no parameters of its own).
        self.children.append(self.softmax)

    @application
    def probs(self, readouts):
        """Normalize readouts into word probabilities over the last axis."""
        return self.softmax.apply(readouts, extra_ndim=readouts.ndim - 2)

    # TODO: check where 'target_samples_mask' is used -- do we need a mask for context features (probably not)
    # Note: the @application decorator inspects the arguments, and transparently adds args ('application_call')
    @application(inputs=[
        'representation', 'source_sentence_mask', 'target_samples_mask',
        'target_samples', 'scores'
    ],
                 outputs=['cost'])
    def expected_cost(self, application_call, representation,
                      source_sentence_mask, target_samples,
                      target_samples_mask, scores,
                      smoothing_constant=0.005, **kwargs):
        """
        emulate the process in sequence_generator.cost_matrix, but compute
        log probabilities instead of costs

        for each sample, we need its probability according to the model
        (these could actually be passed from the sampling model, which
        could be more efficient)

        Parameters
        ----------
        representation : attended source representation
        source_sentence_mask : (batch, source_time) mask -- transposed below
        target_samples : (batch, time) sampled target sequences
        target_samples_mask : matching 0/1 mask
        scores : per-sample scores; `sequence_probs` is reshaped to
            ``scores.shape``, so samples are assumed grouped per source
            sentence -- TODO confirm
        smoothing_constant : float
            Temperature-like sharpening constant for the sample distribution.

        Returns
        -------
        cost : scalar, sum over the batch of expected scores
        """
        # Transpose everything (note we can use transpose here only if it's
        # 2d, otherwise we need dimshuffle)
        source_sentence_mask = source_sentence_mask.T

        # make samples (time, batch)
        samples = target_samples.T
        samples_mask = target_samples_mask.T

        # we need this to set the 'attended' kwarg
        keywords = {
            'mask': target_samples_mask,
            'outputs': target_samples,
            'attended': representation,
            'attended_mask': source_sentence_mask
        }

        batch_size = samples.shape[1]

        # Prepare input for the iterative part
        states = dict_subset(keywords, self._state_names, must_have=False)

        # masks in context are optional (e.g. `attended_mask`)
        # add the initial state context features
        contexts = dict_subset(keywords, self._context_names, must_have=False)
        contexts['initial_state_context'] = kwargs['initial_state_context']

        feedback = self.readout.feedback(samples)
        inputs = self.fork.apply(feedback, as_dict=True)

        # Run the recurrent network
        results = self.transition.apply(mask=samples_mask,
                                        return_initial_states=True,
                                        as_dict=True,
                                        **dict_union(inputs, states,
                                                     contexts))

        # Separate the deliverables. The last states are discarded: they
        # are not used to predict any output symbol. The initial glimpses
        # are discarded because they are not used for prediction.
        # Remember, glimpses are computed _before_ output stage, states are
        # computed after.
        states = {name: results[name][:-1] for name in self._state_names}
        glimpses = {name: results[name][1:] for name in self._glimpse_names}

        # Compute the cost
        feedback = tensor.roll(feedback, 1, 0)
        feedback = tensor.set_subtensor(
            feedback[0],
            self.readout.feedback(self.readout.initial_outputs(batch_size)))

        readouts = self.readout.readout(feedback=feedback,
                                        **dict_union(states, glimpses,
                                                     contexts))

        word_probs = self.probs(readouts)
        word_probs = tensor.log(word_probs)

        # Note: converting the samples to one-hot wastes space, but it gets
        # the job done
        # TODO: this may be the op that sometimes causes out-of-memory
        one_hot_samples = tensor.eye(word_probs.shape[-1])[samples]
        # BUG FIX: `astype` returns a new variable rather than casting in
        # place; the previous code discarded the result, leaving the
        # one-hot tensor at float64 under the default Theano config.
        one_hot_samples = one_hot_samples.astype('float32')
        actual_probs = word_probs * one_hot_samples

        # reshape to (batch, time, prob), then sum over the batch dimension
        # to get sequence-level probability
        actual_probs = actual_probs.dimshuffle(1, 0, 2)
        # we are first summing over vocabulary (only one non-zero cell per
        # row)
        sequence_probs = actual_probs.sum(axis=2)
        sequence_probs = sequence_probs * target_samples_mask
        # now sum over time dimension
        sequence_probs = sequence_probs.sum(axis=1)

        # reshape and do exp() to get the true probs back
        # sequence_probs = tensor.exp(sequence_probs.reshape(scores.shape))
        sequence_probs = sequence_probs.reshape(scores.shape)

        # Note that the smoothing constant can be set by user
        sequence_distributions = (
            tensor.exp(sequence_probs * smoothing_constant) /
            tensor.exp(sequence_probs * smoothing_constant).sum(
                axis=1, keepdims=True))

        # the following lines are done explicitly for code clarity
        # -- first get sequence expectation, then sum up the expectations
        # for every seq in the minibatch
        expected_scores = (sequence_distributions * scores).sum(axis=1)
        expected_scores = expected_scores.sum(axis=0)

        return expected_scores
ix_to_char, char_to_ix, vocab_size = get_metadata(hdf5_file) if args.primetext and len(args.primetext) > 0: primetext = ''.join( [ch for ch in args.primetext if ch in char_to_ix.keys()]) x_curr = numpy.expand_dims(numpy.array( [char_to_ix[ch] for ch in primetext], dtype='uint8'), axis=1) else: dev_stream = get_stream(hdf5_file, 'dev', batch_size) x_curr, y_curr = dev_stream.get_epoch_iterator().next() x_curr = x_curr[:, -1].reshape(seq_length, 1) print 'Loading model from {0}...'.format(args.model) main_loop = load(args.model) print 'Model loaded. Building prediction function...' model = main_loop.model y, x = model.inputs softmax = NDimensionalSoftmax() linear_output = [v for v in model.variables if v.name == 'linear_output'][0] y_hat = softmax.apply(linear_output, extra_ndim=1) predict = theano.function([x], y_hat) print 'Starting sampling' sample_string = sample(args.length, x_curr, predict, ix_to_char, seed=args.seed, temperature=args.temperature)