def __init__(self, emb_dim, dim, num_input_words, num_output_words, vocab,
             **kwargs):
    """Create the lookup, encoder/decoder and output bricks.

    Parameters
    ----------
    emb_dim : int
        Dimension of the lookup table rows. ``0`` means "use `dim`".
    dim : int
        Dimension given to both LSTMs; each fork projects to ``4 * dim``.
    num_input_words : int
        Number of rows of the lookup table. ``0`` means ``vocab.size()``.
    num_output_words : int
        Output dimension of the pre-softmax layer. ``0`` means
        ``vocab.size()``.
    vocab
        Vocabulary object; it must provide ``size()`` and is handed to
        ``WordToIdOp``.
    **kwargs
        Forwarded to the parent constructor.
    """
    # A value of zero is a placeholder that is resolved from the other
    # arguments.
    emb_dim = dim if emb_dim == 0 else emb_dim
    num_input_words = vocab.size() if num_input_words == 0 else num_input_words
    num_output_words = (vocab.size() if num_output_words == 0
                        else num_output_words)

    self._num_input_words = num_input_words
    self._num_output_words = num_output_words
    self._vocab = vocab
    self._word_to_id = WordToIdOp(self._vocab)

    # Embeddings followed by an encoder and a decoder (fork + LSTM each).
    self._main_lookup = LookupTable(self._num_input_words, emb_dim,
                                    name='main_lookup')
    self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
    self._encoder_rnn = LSTM(dim, name='encoder_rnn')
    self._decoder_fork = Linear(emb_dim, 4 * dim, name='decoder_fork')
    self._decoder_rnn = LSTM(dim, name='decoder_rnn')

    # Output projection and softmax.
    self._pre_softmax = Linear(dim, self._num_output_words)
    self._softmax = NDimensionalSoftmax()

    bricks = [
        self._main_lookup,
        self._encoder_fork,
        self._encoder_rnn,
        self._decoder_fork,
        self._decoder_rnn,
        self._pre_softmax,
        self._softmax,
    ]
    super(LanguageModel, self).__init__(children=bricks, **kwargs)
def softmax_layer(h, y, x_mask, y_mask, lens, vocab_size, hidden_size,
                  boosting):
    """Project `h` to vocabulary scores and compute a masked cost.

    Parameters
    ----------
    h
        Hidden states fed to the output projection.
    y
        Integer targets for the cross entropy.
    x_mask
        Accepted for interface compatibility; not used here.
    y_mask
        Mask multiplied into the per-entry cost; its sum is the normaliser.
    lens
        Lengths used only when `boosting` is true; they are tiled to
        ``y.shape[0]`` rows and divide the per-entry cost.
    vocab_size : int
        Output dimension of the projection.
    hidden_size : int
        Input dimension of the projection.
    boosting : bool
        Whether to divide each entry's cost by the tiled lengths.

    Returns
    -------
    tuple
        ``(linear_output, cost)``: the pre-softmax scores and the scalar
        cost named ``'cost'``.
    """
    projection = Linear(name='hidden_to_output',
                        input_dim=hidden_size,
                        output_dim=vocab_size)
    initialize([projection])

    linear_output = projection.apply(h)
    linear_output.name = 'linear_output'

    # Cross entropy is taken on the pre-softmax scores; padded entries are
    # zeroed out by the target mask.
    entry_cost = NDimensionalSoftmax().categorical_cross_entropy(
        y, linear_output, extra_ndim=1)
    entry_cost = entry_cost * y_mask

    if boosting:
        # Boosting step: every entry is divided by the matching length.
        tiled_lens = T.tile(lens, (y.shape[0], 1))
        entry_cost = entry_cost / tiled_lens

    # Average over the unmasked entries only.
    cost = entry_cost.sum() / y_mask.sum()
    cost.name = 'cost'
    return (linear_output, cost)
class ShallowFusionReadout(Readout):
    """Readout that adds weighted language-model scores to its own output.

    Parameters
    ----------
    lm_costs_name : str
        Name of the keyword argument of :meth:`readout` that carries the
        language-model costs.
    lm_weight
        Factor applied to the (negated) language-model costs.
    normalize_am_weights : bool
        Apply a log-softmax to the parent readout before combining.
    normalize_lm_weights : bool
        Apply a log-softmax to the negated language-model costs before
        combining.
    normalize_tot_weights : bool
        Apply a log-softmax to the combined scores.
    am_beta : float
        Factor applied to the parent readout.
    **kwargs
        Forwarded to ``Readout``.
    """

    def __init__(self, lm_costs_name, lm_weight,
                 normalize_am_weights=False,
                 normalize_lm_weights=False,
                 normalize_tot_weights=True,
                 am_beta=1.0, **kwargs):
        super(ShallowFusionReadout, self).__init__(**kwargs)
        self.lm_costs_name = lm_costs_name
        self.lm_weight = lm_weight
        self.am_beta = am_beta
        self.normalize_am_weights = normalize_am_weights
        self.normalize_lm_weights = normalize_lm_weights
        self.normalize_tot_weights = normalize_tot_weights
        self.softmax = NDimensionalSoftmax()
        self.children += [self.softmax]

    def _log_normalize(self, scores):
        """Return the log-softmax of `scores` over its last axis."""
        return self.softmax.log_probabilities(
            scores, extra_ndim=scores.ndim - 2)

    @application
    def readout(self, **kwargs):
        """Combine the parent readout with the language-model scores.

        The entry named ``self.lm_costs_name`` is removed from `kwargs`
        before the remaining arguments are passed to the parent readout.
        """
        # Costs are negated so that larger means better, like the readout.
        lm_scores = -kwargs.pop(self.lm_costs_name)
        if self.normalize_lm_weights:
            lm_scores = self._log_normalize(lm_scores)

        am_scores = self.am_beta * super(
            ShallowFusionReadout, self).readout(**kwargs)
        if self.normalize_am_weights:
            am_scores = self._log_normalize(am_scores)

        combined = am_scores + self.lm_weight * lm_scores
        if self.normalize_tot_weights:
            combined = self._log_normalize(combined)
        return combined
def create_rnn(hidden_dim, vocab_dim, mode="rnn"):
    """Build a lookup -> recurrent -> linear -> softmax model and its cost.

    Parameters
    ----------
    hidden_dim : int
        Dimension of the recurrent layer.
    vocab_dim : int
        Number of lookup rows and output dimension of the final layer.
    mode : str
        ``"lstm"`` selects an ``LSTM``; any other value selects a
        ``SimpleRecurrent`` layer with a ``Tanh`` activation.

    Returns
    -------
    tuple
        ``(cg, layers, y_hat, cost)`` where ``cg`` is the computation graph
        of the cost and ``layers`` is ``(x, W, H, S, A, y)``.
    """
    use_lstm = mode == "lstm"

    # input
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')

    # Blocks' LSTM expects its inputs to already hold the four gate
    # pre-activations, i.e. 4 * dim features, so the lookup has to be four
    # times wider in that mode.
    lookup_dim = 4 * hidden_dim if use_lstm else hidden_dim
    W = LookupTable(
        name="W1",
        dim=lookup_dim,
        length=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0))

    if use_lstm:
        # Long Short Term Memory
        H = LSTM(
            hidden_dim,
            name='H',
            weights_init=initialization.IsotropicGaussian(0.01),
            biases_init=initialization.Constant(0.0))
    else:
        # recurrent history weight
        H = SimpleRecurrent(
            name="H",
            dim=hidden_dim,
            activation=Tanh(),
            weights_init=initialization.IsotropicGaussian(0.01))

    S = Linear(
        name="W2",
        input_dim=hidden_dim,
        output_dim=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0))

    A = NDimensionalSoftmax(name="softmax")

    initLayers([W, H, S])

    activations = W.apply(x)
    hiddens = H.apply(activations)
    if use_lstm:
        # LSTM.apply returns [states, cells]; only the states feed the
        # output layer.
        hiddens = hiddens[0]
    activations2 = S.apply(hiddens)
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()

    cg = ComputationGraph(cost)

    layers = (x, W, H, S, A, y)
    return cg, layers, y_hat, cost
def softmax_layer(h, y, frame_length, hidden_size):
    """Project `h` to `frame_length` scores; return probabilities and cost.

    Parameters
    ----------
    h
        Hidden states fed to the output projection.
    y
        Targets for the cross entropy.
    frame_length : int
        Output dimension of the projection.
    hidden_size : int
        Input dimension of the projection.

    Returns
    -------
    tuple
        ``(y_hat, cost)``: the softmax output named ``"y_hat"`` and the mean
        cross entropy named ``"cost"``.
    """
    output_brick = Linear(name="hidden_to_output",
                          input_dim=hidden_size,
                          output_dim=frame_length)
    initialize([output_brick])

    linear_output = output_brick.apply(h)
    linear_output.name = "linear_output"

    softmax = NDimensionalSoftmax()

    y_hat = softmax.apply(linear_output, extra_ndim=1)
    y_hat.name = "y_hat"

    # The cost is computed from the pre-softmax scores, not from y_hat.
    token_costs = softmax.categorical_cross_entropy(
        y, linear_output, extra_ndim=1)
    cost = token_costs.mean()
    cost.name = "cost"

    return y_hat, cost
class NewSoftmaxEmitter(AbstractEmitter, Initializable, Random):
    """A softmax emitter for the case of integer outputs.

    Interprets readout elements as energies corresponding to their indices.
    Besides sampling, it exposes the flattened probabilities it sampled
    from (see :meth:`emitProbs` and :meth:`emit`).

    Parameters
    ----------
    initial_output : int or a scalar :class:`~theano.Variable`
        The initial output.

    Notes
    -----
    The brick name is always overwritten with ``'newbidirectional'``.
    :meth:`emitProbs` and :meth:`emit` store the flattened probabilities on
    the instance as ``pvals_flat``.

    """

    def __init__(self, initial_output=0, **kwargs):
        super(NewSoftmaxEmitter, self).__init__(**kwargs)
        self.initial_output = initial_output
        self.softmax = NDimensionalSoftmax()
        self.children = [self.softmax]
        self.name = 'newbidirectional'

    @application
    def probs(self, readouts):
        """Return the softmax of `readouts` over its last axis."""
        return self.softmax.apply(readouts, extra_ndim=readouts.ndim - 2)

    @application
    def emitProbs(self, readouts):
        """Return the probabilities reshaped to ``(batch_size, -1)``."""
        probs = self.probs(readouts)
        batch_size = probs.shape[0]
        self.pvals_flat = probs.reshape((batch_size, -1))
        # The draw is not used by this method. It is kept so the method
        # still requests a random stream exactly as before.
        # NOTE(review): confirm whether this call can be dropped.
        generated = self.theano_rng.multinomial(pvals=self.pvals_flat)
        return self.pvals_flat

    @application
    def emit(self, readouts):
        """Sample outputs and return them with looked-up probabilities.

        Returns
        -------
        tuple
            ``(winning_index, self.pvals_flat[0][winning_index])``.
        """
        probs = self.probs(readouts)
        batch_size = probs.shape[0]
        self.pvals_flat = probs.reshape((batch_size, -1))
        generated = self.theano_rng.multinomial(pvals=self.pvals_flat)
        winning_index = generated.reshape(probs.shape).argmax(axis=-1)
        # NOTE(review): the probabilities are read from row 0 only, for
        # every sampled index -- confirm this is intended for batches > 1.
        return winning_index, self.pvals_flat[0][winning_index]

    @application
    def cost(self, readouts, outputs):
        """Return the cross entropy of `outputs` under `readouts`."""
        # WARNING: unfortunately this application method works
        # just fine when `readouts` and `outputs` have
        # different dimensions. Be careful!
        return self.softmax.categorical_cross_entropy(
            outputs, readouts, extra_ndim=readouts.ndim - 2)

    @application
    def initial_outputs(self, batch_size):
        """Return `batch_size` copies of the initial output as int64."""
        return self.initial_output * tensor.ones((batch_size,), dtype='int64')

    def get_dim(self, name):
        """Return 0 for ``'outputs'``; defer to the parents otherwise."""
        if name == 'outputs':
            return 0
        # Must name this class: super(SoftmaxEmitter, self) fails because
        # this class does not derive from SoftmaxEmitter.
        return super(NewSoftmaxEmitter, self).get_dim(name)
def softmax_layer(h, y, vocab_size, hidden_size):
    """Map hidden states to vocabulary probabilities and a mean cost.

    Parameters
    ----------
    h
        Hidden states fed to the output projection.
    y
        Targets for the cross entropy.
    vocab_size : int
        Output dimension of the projection.
    hidden_size : int
        Input dimension of the projection.

    Returns
    -------
    tuple
        ``(y_hat, cost)``: the softmax output named ``'y_hat'`` and the mean
        cross entropy named ``'cost'``.
    """
    to_vocab = Linear(name='hidden_to_output',
                      input_dim=hidden_size,
                      output_dim=vocab_size)
    initialize([to_vocab])

    linear_output = to_vocab.apply(h)
    linear_output.name = 'linear_output'

    softmax_brick = NDimensionalSoftmax()
    y_hat = softmax_brick.apply(linear_output, extra_ndim=1)
    y_hat.name = 'y_hat'

    # Cross entropy works on the pre-softmax scores.
    cross_entropy = softmax_brick.categorical_cross_entropy(
        y, linear_output, extra_ndim=1)
    cost = cross_entropy.mean()
    cost.name = 'cost'
    return y_hat, cost
def softmax_layer(h, y, vocab_size, hidden_size):
    """Build the output layer: linear projection, softmax and mean cost.

    Parameters
    ----------
    h
        Hidden states to project.
    y
        Targets used by the cross entropy.
    vocab_size : int
        Number of output units.
    hidden_size : int
        Number of input units.

    Returns
    -------
    tuple
        ``(y_hat, cost)``; both variables are named after their role
        (``'y_hat'`` and ``'cost'``).
    """
    linear = Linear(
        name='hidden_to_output',
        input_dim=hidden_size,
        output_dim=vocab_size,
    )
    initialize([linear])

    scores = linear.apply(h)
    scores.name = 'linear_output'

    nd_softmax = NDimensionalSoftmax()

    # Probabilities for the caller...
    y_hat = nd_softmax.apply(scores, extra_ndim=1)
    y_hat.name = 'y_hat'

    # ...and the training cost, computed from the raw scores.
    cost = nd_softmax.categorical_cross_entropy(
        y, scores, extra_ndim=1).mean()
    cost.name = 'cost'

    return y_hat, cost
class SoftmaxEmitter(AbstractEmitter, Initializable, Random):
    """Emitter that samples integer outputs from a softmax over readouts.

    Readout elements are treated as energies associated with their indices.

    Parameters
    ----------
    initial_output : int or a scalar :class:`~theano.Variable`
        The initial output.

    """

    def __init__(self, initial_output=0, **kwargs):
        super(SoftmaxEmitter, self).__init__(**kwargs)
        self.initial_output = initial_output
        self.softmax = NDimensionalSoftmax()
        self.children = [self.softmax]

    @application
    def probs(self, readouts):
        """Return the softmax of `readouts` over its last axis."""
        extra = readouts.ndim - 2
        return self.softmax.apply(readouts, extra_ndim=extra)

    @application
    def emit(self, readouts):
        """Draw one index per leading position from the softmax."""
        distribution = self.probs(readouts)
        # The multinomial sampler is fed a two-dimensional matrix, so the
        # probabilities are reshaped to (batch_size, -1) for the draw and
        # the result is reshaped back afterwards.
        flat = distribution.reshape((distribution.shape[0], -1))
        draw = self.theano_rng.multinomial(pvals=flat)
        return draw.reshape(distribution.shape).argmax(axis=-1)

    @application
    def cost(self, readouts, outputs):
        """Return the cross entropy of `outputs` under `readouts`."""
        # WARNING: this application method does not complain when
        # `readouts` and `outputs` have different dimensions. Be careful!
        extra = readouts.ndim - 2
        return self.softmax.categorical_cross_entropy(
            outputs, readouts, extra_ndim=extra)

    @application
    def costs(self, readouts):
        """Return the negative log-probabilities of all outputs."""
        extra = readouts.ndim - 2
        return -self.softmax.log_probabilities(readouts, extra_ndim=extra)

    @application
    def initial_outputs(self, batch_size):
        """Return `batch_size` copies of the initial output as int64."""
        ones = tensor.ones((batch_size,), dtype='int64')
        return self.initial_output * ones

    def get_dim(self, name):
        """Return 0 for ``'outputs'``; defer to the parents otherwise."""
        if name != 'outputs':
            return super(SoftmaxEmitter, self).get_dim(name)
        return 0
def create_rnn(hidden_dim, vocab_dim, mode="rnn"):
    """Build a lookup -> recurrent -> linear -> softmax model and its cost.

    Parameters
    ----------
    hidden_dim : int
        Dimension of the recurrent layer.
    vocab_dim : int
        Number of lookup rows and output dimension of the final layer.
    mode : str
        ``"lstm"`` selects an ``LSTM``; any other value selects a
        ``SimpleRecurrent`` layer with a ``Tanh`` activation.

    Returns
    -------
    tuple
        ``(cg, layers, y_hat, cost)`` where ``cg`` is the computation graph
        of the cost and ``layers`` is ``(x, W, H, S, A, y)``.
    """
    is_lstm = mode == "lstm"

    # input
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')

    # An LSTM brick takes inputs that already contain the pre-activations
    # of its four gates (4 * dim features), so the embedding is widened
    # accordingly; the simple recurrent layer takes `hidden_dim` features.
    W = LookupTable(
        name="W1",
        dim=4 * hidden_dim if is_lstm else hidden_dim,
        length=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0))

    if is_lstm:
        # Long Short Term Memory
        H = LSTM(hidden_dim,
                 name='H',
                 weights_init=initialization.IsotropicGaussian(0.01),
                 biases_init=initialization.Constant(0.0))
    else:
        # recurrent history weight
        H = SimpleRecurrent(
            name="H",
            dim=hidden_dim,
            activation=Tanh(),
            weights_init=initialization.IsotropicGaussian(0.01))

    S = Linear(name="W2",
               input_dim=hidden_dim,
               output_dim=vocab_dim,
               weights_init=initialization.IsotropicGaussian(0.01),
               biases_init=initialization.Constant(0))

    A = NDimensionalSoftmax(name="softmax")

    initLayers([W, H, S])

    activations = W.apply(x)
    if is_lstm:
        # LSTM.apply yields both states and cells; the cells are not used.
        hiddens, _cells = H.apply(activations)
    else:
        hiddens = H.apply(activations)
    activations2 = S.apply(hiddens)
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()

    cg = ComputationGraph(cost)

    layers = (x, W, H, S, A, y)
    return cg, layers, y_hat, cost
def rating_cost(pred_score, true_ratings, input_masks, output_masks, D, d,
                std=1.0, alpha=0.01):
    """Compute a rating likelihood cost with an added ordinal term.

    The scores are cumulatively summed along axis 2 and passed through a
    softmax. The result feeds a negative log-likelihood against
    `true_ratings` and two ordinal costs (one per direction along axis 2).

    Parameters
    ----------
    pred_score
        Predicted scores; indexed with three axes in this function.
    true_ratings
        Target ratings with the same three-axis layout.
        NOTE(review): presumably one-hot along axis 2 -- confirm.
    input_masks, output_masks
        Accepted for interface compatibility; not used here.
    D, d
        Both terms of the cost are scaled by ``D / (D - d + 1e-6)``.
    std : float
        Weight of the negative log-likelihood term.
    alpha : float
        Weight of the ordinal term.

    Returns
    -------
    tuple
        ``(cost, nll, nll_item_ratings, cost_ordinal_1N, cost_ordinal_N1,
        prob_item_ratings)`` where ``cost`` is the mean of ``nll``.
    """
    cumsum = T.extra_ops.cumsum

    def reversed_cumsum(values):
        # Cumulative sum along axis 2, accumulated from the last entry.
        return cumsum(values[:, :, ::-1], axis=2)[:, :, ::-1]

    pred_score_cum = cumsum(pred_score, axis=2)
    prob_item_ratings = NDimensionalSoftmax(name='rating_cost_sf').apply(
        pred_score_cum, extra_ndim=1)
    log_prob = T.log(prob_item_ratings)

    accu_prob_1N = cumsum(prob_item_ratings, axis=2)
    accu_prob_N1 = reversed_cumsum(prob_item_ratings)
    mask1N = reversed_cumsum(true_ratings)
    maskN1 = cumsum(true_ratings, axis=2)

    # Ordinal costs, one for each direction along the rating axis.
    cost_ordinal_1N = -T.sum((log_prob - T.log(accu_prob_1N)) * mask1N,
                             axis=2)
    cost_ordinal_N1 = -T.sum((log_prob - T.log(accu_prob_N1)) * maskN1,
                             axis=2)
    cost_ordinal = cost_ordinal_1N + cost_ordinal_N1

    nll_item_ratings = -(true_ratings * log_prob).sum(axis=2)

    # Both terms are rescaled by D / (D - d); the epsilon guards D == d.
    denominator = D - d + 1e-6
    likelihood_term = (std * nll_item_ratings.sum(axis=1) * 1.0 * D
                       / denominator)
    ordinal_term = alpha * cost_ordinal.sum(axis=1) * 1.0 * D / denominator
    nll = likelihood_term + ordinal_term

    cost = T.mean(nll)
    return (cost, nll, nll_item_ratings, cost_ordinal_1N, cost_ordinal_N1,
            prob_item_ratings)
def __init__(self, lm_costs_name, lm_weight,
             normalize_am_weights=False,
             normalize_lm_weights=False,
             normalize_tot_weights=True,
             am_beta=1.0,
             **kwargs):
    """Store the fusion settings and register a softmax child brick.

    Parameters
    ----------
    lm_costs_name : str
        Name under which the language-model costs are looked up.
    lm_weight
        Weight of the language-model scores.
    normalize_am_weights : bool
        Flag stored as ``self.normalize_am_weights``.
    normalize_lm_weights : bool
        Flag stored as ``self.normalize_lm_weights``.
    normalize_tot_weights : bool
        Flag stored as ``self.normalize_tot_weights``.
    am_beta : float
        Value stored as ``self.am_beta``.
    **kwargs
        Forwarded to the parent constructor.
    """
    super(ShallowFusionReadout, self).__init__(**kwargs)

    # Language-model side.
    self.lm_costs_name = lm_costs_name
    self.lm_weight = lm_weight

    # Normalisation switches and scaling.
    self.normalize_am_weights = normalize_am_weights
    self.normalize_lm_weights = normalize_lm_weights
    self.normalize_tot_weights = normalize_tot_weights
    self.am_beta = am_beta

    # The softmax is added to the children the parent already created.
    self.softmax = NDimensionalSoftmax()
    self.children += [self.softmax]
def __init__(self, input1_size, input2_size, lookup1_dim=200,
             lookup2_dim=200, hidden_size=512):
    """Build the two-input LSTM model, its cost and its computation graph.

    Two integer matrices (``'durations'`` and ``'syllables'``) are embedded
    by separate lookup tables, merged, run through an LSTM and projected to
    `input1_size` scores. The targets are the ``'pitches'`` matrix.

    Parameters
    ----------
    input1_size : int
        Length of the first lookup table and output dimension of the final
        linear layer.
    input2_size : int
        Length of the second lookup table.
    lookup1_dim, lookup2_dim : int
        Embedding dimensions of the two lookup tables.
    hidden_size : int
        Output dimension of the merge and dimension of the LSTM.

    The results are stored as ``self.Cost``, ``self.ComputationGraph`` and
    ``self.Model``.
    """
    self.hidden_size = hidden_size
    self.input1_size = input1_size
    self.input2_size = input2_size
    self.lookup1_dim = lookup1_dim
    self.lookup2_dim = lookup2_dim

    def small_uniform():
        # Every brick gets its own initializer instance.
        return initialization.Uniform(width=0.01)

    x1 = tensor.lmatrix('durations')
    x2 = tensor.lmatrix('syllables')
    y = tensor.lmatrix('pitches')

    lookup1 = LookupTable(dim=self.lookup1_dim,
                          length=self.input1_size,
                          name='lookup1',
                          weights_init=small_uniform(),
                          biases_init=Constant(0))
    lookup1.initialize()

    lookup2 = LookupTable(dim=self.lookup2_dim,
                          length=self.input2_size,
                          name='lookup2',
                          weights_init=small_uniform(),
                          biases_init=Constant(0))
    lookup2.initialize()

    merge = Merge(['lookup1', 'lookup2'],
                  [self.lookup1_dim, self.lookup2_dim],
                  self.hidden_size,
                  weights_init=small_uniform(),
                  biases_init=Constant(0))
    merge.initialize()

    recurrent_block = LSTM(dim=self.hidden_size,
                           activation=Tanh(),
                           weights_init=small_uniform())
    recurrent_block.initialize()

    linear = Linear(input_dim=self.hidden_size,
                    output_dim=self.input1_size,
                    weights_init=small_uniform(),
                    biases_init=Constant(0))
    linear.initialize()

    softmax = NDimensionalSoftmax()

    # Forward pass.
    embedded = merge.apply(lookup1.apply(x1), lookup2.apply(x2))
    h = recurrent_block.apply(embedded)
    a = linear.apply(h)

    # NOTE(review): an earlier comment here recorded "ValueError: x must be
    # 1-d or 2-d tensor of floats. Got TensorType(float64, 3D)"; it is
    # unclear whether that error still occurs.
    y_hat = softmax.apply(a, extra_ndim=1)

    self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean()
    self.ComputationGraph = ComputationGraph(self.Cost)
    self.Model = Model(y_hat)
def softmax_output_layer(x, h, y, in_size, out_size, hidden_size, pred):
    """Create a softmax output layer on top of the hidden layers `h`.

    Reads the module-level flags ``connect_h_to_o`` (feed the concatenation
    of all hidden layers rather than only the last one) and
    ``single_dim_out`` (selects ``extra_ndim`` 1 instead of 2).

    Parameters
    ----------
    x, in_size
        Accepted for interface compatibility; not used here.
    h : sequence
        Hidden layer outputs; concatenated along axis 2 or reduced to the
        last element depending on ``connect_h_to_o``.
    y
        Targets for the cross entropy.
    out_size : int
        Output dimension of the projection.
    hidden_size : int
        Dimension of a single hidden layer.
    pred
        Suffix appended (via ``str``) to the name of the linear brick.

    Returns
    -------
    tuple
        ``(y_hat, cost)``: the softmax output and the mean cross entropy.
    """
    brick_name = 'hidden_to_output' + str(pred)

    if connect_h_to_o:
        input_dim = hidden_size * len(h)
    else:
        input_dim = hidden_size
    hidden_to_output = Linear(name=brick_name,
                              input_dim=input_dim,
                              output_dim=out_size)

    if connect_h_to_o:
        hiddens = T.concatenate(list(h), axis=2)
    else:
        hiddens = h[-1]

    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(hiddens)
    linear_output.name = 'linear_output'

    if single_dim_out:
        extra_ndim = 1
    else:
        extra_ndim = 2

    softmax = NDimensionalSoftmax()
    y_hat = softmax.apply(linear_output, extra_ndim=extra_ndim)
    per_item = softmax.categorical_cross_entropy(
        y, linear_output, extra_ndim=extra_ndim)
    cost = per_item.mean()
    return y_hat, cost
def softmax_layer(self, h, y):
    """Predict the next word from the hidden states and compute the loss.

    :param h The hidden state sequence
    :param y The target words
    :return ``(y_hat, cost)`` where ``y_hat`` holds the log-probabilities
        of the projected hidden states and ``cost`` is the mean cross
        entropy against ``y``
    """
    projection = Linear(name='hidden_to_output',
                        input_dim=self.hidden_size,
                        output_dim=self.vocab_size)
    # The second argument depends on the fan-in and fan-out of the layer;
    # how `initialize` uses it is defined elsewhere.
    init_value = sqrt(6.0 / (self.hidden_size + self.vocab_size))
    initialize(projection, init_value)

    linear_output = projection.apply(h)
    linear_output.name = 'linear_output'

    lm_softmax = NDimensionalSoftmax(name="lm_softmax")

    # Note: y_hat contains log-probabilities, not probabilities.
    y_hat = lm_softmax.log_probabilities(linear_output, extra_ndim=1)
    y_hat.name = 'y_hat'

    cross_entropy = lm_softmax.categorical_cross_entropy(
        y, linear_output, extra_ndim=1)
    cost = cross_entropy.mean()
    cost.name = 'cost'

    return y_hat, cost
def __init__(self, mlp, dim, k, const=1e-5, **kwargs):
    """Wrap `mlp` with three output heads: mu, sigma and coefficients.

    Parameters
    ----------
    mlp
        The wrapped brick; its ``output_dim`` is the input dimension of
        every head.
    dim : int
        Output dimension of the ``mu`` and ``sigma`` heads.
    k : int
        Output dimension of the coefficient head.
    const : float
        Value stored as ``self.const``.
    **kwargs
        Forwarded to the parent constructor.
    """
    super(GMMMLP, self).__init__(**kwargs)
    self.dim = dim
    self.const = const
    self.k = k

    input_dim = mlp.output_dim

    def make_head(activation, output_dim, suffix):
        # One single-layer MLP per head, named after this brick.
        return MLP(activations=[activation],
                   dims=[input_dim, output_dim],
                   name=self.name + suffix)

    self.mu = make_head(Identity(), dim, "_mu")
    self.sigma = make_head(SoftPlus(), dim, "_sigma")
    self.coeff = make_head(Identity(), k, "_coeff")
    self.coeff2 = NDimensionalSoftmax()
    self.mlp = mlp

    self.children = [
        self.mlp,
        self.mu,
        self.sigma,
        self.coeff,
        self.coeff2,
    ]
class GMMMLP(Initializable):
    """Brick wrapping an MLP and branching into GMM parameter heads.

    The wrapped MLP output is fed to three single-layer heads producing
    ``mu``, ``sigma`` (through a SoftPlus) and mixture coefficients
    (through a softmax).

    Parameters
    ----------
    mlp: MLP brick
        the main mlp to wrap around.
    dim: int
        output dimension of the ``mu`` and ``sigma`` heads.
    k: int
        output dimension of the coefficient head.
    const: float
        constant added to the coefficients after the softmax.

    """

    def __init__(self, mlp, dim, k, const=1e-5, **kwargs):
        super(GMMMLP, self).__init__(**kwargs)
        self.dim = dim
        self.const = const
        self.k = k

        input_dim = mlp.output_dim
        mu_name = self.name + "_mu"
        sigma_name = self.name + "_sigma"
        coeff_name = self.name + "_coeff"

        self.mu = MLP(activations=[Identity()],
                      dims=[input_dim, dim],
                      name=mu_name)
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[input_dim, dim],
                         name=sigma_name)
        self.coeff = MLP(activations=[Identity()],
                         dims=[input_dim, k],
                         name=coeff_name)
        self.coeff2 = NDimensionalSoftmax()
        self.mlp = mlp

        self.children = [self.mlp, self.mu, self.sigma, self.coeff,
                         self.coeff2]

    @application
    def apply(self, inputs):
        """Return ``(mu, sigma, coeff)`` computed from `inputs`."""
        state = self.mlp.apply(inputs)
        mu = self.mu.apply(state)
        sigma = self.sigma.apply(state)

        # Softmax over the last axis, then shifted by the constant.
        coeff_scores = self.coeff.apply(state)
        coeff = self.coeff2.apply(coeff_scores,
                                  extra_ndim=state.ndim - 2) + self.const
        return mu, sigma, coeff

    @property
    def output_dim(self):
        """The dimension given as `dim` at construction."""
        return self.dim
class GMMMLP(Initializable):
    """An MLP brick with extra heads for the parameters of a GMM.

    Parameters
    ----------
    mlp: MLP brick
        the main mlp to wrap around; its ``output_dim`` feeds every head.
    dim: int
        output dim of the ``mu`` and ``sigma`` heads.
    k: int
        output dim of the coefficient head.
    const: float
        value added to the softmax output of the coefficient head.

    """

    def __init__(self, mlp, dim, k, const=1e-5, **kwargs):
        super(GMMMLP, self).__init__(**kwargs)
        self.dim = dim
        self.const = const
        self.k = k

        input_dim = mlp.output_dim

        def head(activation, output_dim, suffix):
            # Single-layer MLP reading the wrapped MLP's output.
            return MLP(activations=[activation],
                       dims=[input_dim, output_dim],
                       name=self.name + suffix)

        self.mu = head(Identity(), dim, "_mu")
        self.sigma = head(SoftPlus(), dim, "_sigma")
        self.coeff = head(Identity(), k, "_coeff")
        self.coeff2 = NDimensionalSoftmax()
        self.mlp = mlp
        self.children = [
            self.mlp,
            self.mu,
            self.sigma,
            self.coeff,
            self.coeff2,
        ]

    @application
    def apply(self, inputs):
        """Run the wrapped MLP and return ``(mu, sigma, coeff)``."""
        state = self.mlp.apply(inputs)
        normalized = self.coeff2.apply(self.coeff.apply(state),
                                       extra_ndim=state.ndim - 2)
        # `const` is added after the softmax.
        coeff = normalized + self.const
        return self.mu.apply(state), self.sigma.apply(state), coeff

    @property
    def output_dim(self):
        """The dimension given as `dim` at construction."""
        return self.dim
def __init__(self, mlp, dim, k, const=1e-5, **kwargs):
    """Create the mu, sigma and coefficient heads around `mlp`.

    Parameters
    ----------
    mlp
        Wrapped brick; ``mlp.output_dim`` is the input size of each head.
    dim : int
        Output size of the ``mu`` and ``sigma`` heads.
    k : int
        Output size of the coefficient head.
    const : float
        Value stored as ``self.const``.
    **kwargs
        Forwarded to the parent constructor.
    """
    super(GMMMLP, self).__init__(**kwargs)
    self.dim, self.const, self.k = dim, const, k

    input_dim = mlp.output_dim
    prefix = self.name

    # Heads reading the wrapped MLP's output.
    self.mu = MLP(
        activations=[Identity()],
        dims=[input_dim, dim],
        name=prefix + "_mu",
    )
    self.sigma = MLP(
        activations=[SoftPlus()],
        dims=[input_dim, dim],
        name=prefix + "_sigma",
    )
    self.coeff = MLP(
        activations=[Identity()],
        dims=[input_dim, k],
        name=prefix + "_coeff",
    )
    self.coeff2 = NDimensionalSoftmax()
    self.mlp = mlp

    self.children = [self.mlp, self.mu, self.sigma, self.coeff, self.coeff2]
def __init__(self, visual_dim, textual_dim, output_dim, hidden_size,
             init_ranges, **kwargs):
    """Create the expert MLPs, the manager fork and auxiliary bricks.

    The bricks are not stored as attributes here. They are passed to the
    parent constructor as the default ``children``, in the order
    ``[visual_mlp, textual_mlp, fork, bn, softmax]``.

    Parameters
    ----------
    visual_dim, textual_dim : int
        Input dimensions of the two expert MLPs; their sum is the input
        dimension of the manager and of the batch normalization.
    output_dim : int
        Output dimension of the experts and number of fork outputs.
    hidden_size
        Passed to both ``MLPGenreClassifier`` bricks.
    init_ranges : sequence of 5 values
        ``(visual_range, textual_range, linear_range_1, linear_range_2,
        linear_range_3)``. The first two are unpacked but not used here.
    **kwargs
        Forwarded to the parent constructor; ``use_bias`` defaults to
        ``False`` and ``children`` to the bricks built here.
    """
    # Unpacking also enforces that exactly five ranges were given.
    (_visual_range, _textual_range, linear_range_1, linear_range_2,
     linear_range_3) = init_ranges
    manager_dim = visual_dim + textual_dim

    def expert(input_dim, brick_name):
        # Each expert receives its own list of linear ranges.
        return MLPGenreClassifier(
            input_dim, output_dim, hidden_size,
            [linear_range_1, linear_range_2, linear_range_3],
            name=brick_name)

    visual_mlp = expert(visual_dim, 'visual_mlp')
    textual_mlp = expert(textual_dim, 'textual_mlp')

    bn = BatchNormalization(input_dim=manager_dim, name='bn3')

    # The manager is a single bias-free linear layer with two outputs; it
    # serves as the prototype for every output of the fork.
    manager_linear = Linear(
        manager_dim, 2,
        name='linear_output',
        use_bias=False,
        weights_init=initialization.Uniform(width=linear_range_1))
    manager_mlp = Sequence([manager_linear.apply], name='manager_mlp')

    output_names = ['linear_' + str(i) for i in range(output_dim)]
    fork = Fork(input_dim=manager_dim,
                output_dims=[2] * output_dim,
                prototype=manager_mlp,
                output_names=output_names)

    children = [visual_mlp, textual_mlp, fork, bn, NDimensionalSoftmax()]
    kwargs.setdefault('use_bias', False)
    kwargs.setdefault('children', children)
    super(MoEClassifier, self).__init__(**kwargs)
def __init__(self, config_dict, init_type="xavier", **kwargs):
    """Create the bricks of the character-level RNN language model.

    Parameters
    ----------
    config_dict : dict
        Must contain ``batch_size``, ``num_subwords``, ``num_words``,
        ``subword_embedding_size``, ``input_vocab_size``,
        ``output_vocab_size``, ``subword_RNN_hidden_state_size``,
        ``table_width``, ``max_out_dim`` and ``max_out_K``. Every value is
        copied to the attribute of the same name.
    init_type : str
        ``"xavier"`` selects ``XavierInitializationOriginal`` for the input
        projection and the LSTM; anything else selects
        ``IsotropicGaussian``.
    **kwargs
        Forwarded to the parent constructor.
    """
    super(CharRNNModel, self).__init__(**kwargs)

    # Copy the configuration onto the instance, one attribute per key.
    for key in ("batch_size",
                "num_subwords",
                "num_words",
                "subword_embedding_size",
                "input_vocab_size",
                "output_vocab_size",
                "subword_RNN_hidden_state_size",
                "table_width",
                "max_out_dim",
                "max_out_K"):
        setattr(self, key, config_dict[key])

    self.lookup = LookupTable(length=self.input_vocab_size,
                              dim=self.subword_embedding_size,
                              name="input_lookup")
    self.lookup.weights_init = Uniform(width=self.table_width)
    self.lookup.biases_init = Constant(0)

    if init_type == "xavier":
        linear_init = XavierInitializationOriginal(
            self.subword_embedding_size,
            self.subword_RNN_hidden_state_size)
        lstm_init = XavierInitializationOriginal(
            self.subword_embedding_size,
            self.subword_RNN_hidden_state_size)
    else:
        # default is gaussian
        linear_init = IsotropicGaussian()
        lstm_init = IsotropicGaussian()

    # The `inputs` are then split in this order: input gates, forget
    # gates, cells and output gates -- hence the factor of four.
    gate_inputs_dim = self.subword_RNN_hidden_state_size * 4
    self.linear_forward = Linear(
        input_dim=self.subword_embedding_size,
        output_dim=gate_inputs_dim,
        name="linear_forward",
        weights_init=linear_init,
        biases_init=Constant(0.0))

    self.language_model = LSTM(
        dim=self.subword_RNN_hidden_state_size,
        activation=Tanh(),
        name="language_model_RNN",
        weights_init=lstm_init,
        biases_init=Constant(0.0))

    self.max_out = LinearMaxout(
        self.subword_RNN_hidden_state_size,
        self.max_out_dim,
        self.max_out_K,
        name="max_out",
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0.0))

    self.softmax_linear = Linear(
        self.max_out_dim,
        self.output_vocab_size,
        name="soft_max_linear",
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0.0))

    self.softmax = NDimensionalSoftmax()

    self.children = [
        self.lookup,
        self.linear_forward,
        self.language_model,
        self.max_out,
        self.softmax_linear,
        self.softmax,
    ]
class LanguageModel(Initializable):
    """The dictionary-equipped language model.

    Parameters
    ----------
    emb_dim : int
        The dimension of word embeddings. ``0`` means "use `dim`".
    emb_def_dim : int
        The dimension of the definition word embeddings. ``0`` means
        "use `emb_dim`".
    dim : int
        The dimension of the RNN states.
    num_input_words : int
        The size of the LM's input vocabulary. ``0`` means
        ``vocab.size()``.
    def_num_input_words : int
        The size of the definition reader's input vocabulary. ``0`` means
        "use `num_input_words`".
    num_output_words : int
        The size of the LM's output vocabulary.
    vocab
        The vocabulary object.
    retrieval
        The dictionary retrieval algorithm. If `None`, the language model
        does not use any dictionary.
    def_reader : str
        Either ``'LSTM'`` or ``'mean'``.
    standalone_def_lookup : bool
        If `True`, the definition reader gets no shared lookup table. If
        `False`, the main lookup is shared, which requires
        ``emb_dim == emb_def_dim`` and equal input vocabulary sizes.
    standalone_def_rnn : bool
        If `True`, a standalone RNN is used to embed definitions. If
        `False`, the main fork and RNN of the language model are reused.
    disregard_word_embeddings : bool
        Stored on the instance; not read by this class.
    compose_type : str
        Passed to ``MeanPoolCombiner``.
    very_rare_threshold : list of int
        Count thresholds; an extra perplexity is reported for each one,
        restricted to positions following a word whose count is below
        the threshold.
    cache_size : int
        If positive, a lookup table with that many rows is created as a
        cache of definition embeddings.

    """

    def __init__(self, emb_dim, emb_def_dim, dim, num_input_words,
                 def_num_input_words, num_output_words, vocab,
                 retrieval=None, def_reader='LSTM',
                 standalone_def_lookup=True, standalone_def_rnn=True,
                 disregard_word_embeddings=False, compose_type='sum',
                 very_rare_threshold=[10], cache_size=0, **kwargs):
        # TODO(tombosc): document
        # Zero-valued sizes are placeholders resolved from other arguments.
        if emb_dim == 0:
            emb_dim = dim
        if emb_def_dim == 0:
            emb_def_dim = emb_dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        if (num_input_words != def_num_input_words) and (
                not standalone_def_lookup):
            raise NotImplementedError()

        # Copied so that instances never share (or mutate) the default list
        # of the signature.
        self._very_rare_threshold = list(very_rare_threshold)
        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab
        self._retrieval = retrieval
        self._disregard_word_embeddings = disregard_word_embeddings
        self._compose_type = compose_type

        self._word_to_id = WordToIdOp(self._vocab)
        self._word_to_count = WordToCountOp(self._vocab)

        children = []
        self._cache = None
        if cache_size > 0:
            # TODO(tombosc) do we implement cache as LookupTable or theano
            # matrix?
            self._cache = LookupTable(cache_size, emb_dim,
                                      name='cache_def_embeddings')
            children.append(self._cache)

        if self._retrieval:
            self._retrieve = RetrievalOp(retrieval)

        self._main_lookup = LookupTable(self._num_input_words, emb_dim,
                                        name='main_lookup')
        self._main_fork = Linear(emb_dim, 4 * dim, name='main_fork')
        self._main_rnn = DebugLSTM(
            dim, name='main_rnn')  # TODO(tombosc): use regular LSTM?
        children.extend([self._main_lookup, self._main_fork, self._main_rnn])

        if self._retrieval:
            if standalone_def_lookup:
                lookup = None
            else:
                if emb_dim != emb_def_dim:
                    raise ValueError(
                        "emb_dim != emb_def_dim: cannot share lookup")
                lookup = self._main_lookup

            if def_reader == 'LSTM':
                if standalone_def_rnn:
                    fork_and_rnn = None
                else:
                    fork_and_rnn = (self._main_fork, self._main_rnn)
                self._def_reader = LSTMReadDefinitions(
                    def_num_input_words, emb_def_dim, dim, vocab, lookup,
                    fork_and_rnn, cache=self._cache)
            elif def_reader == 'mean':
                self._def_reader = MeanPoolReadDefinitions(
                    def_num_input_words, emb_def_dim, dim, vocab, lookup,
                    translate=(emb_def_dim != dim), normalize=False)
            else:
                raise Exception("def reader not understood")

            self._combiner = MeanPoolCombiner(dim=dim, emb_dim=emb_dim,
                                              compose_type=compose_type)

            children.extend([self._def_reader, self._combiner])

        self._pre_softmax = Linear(dim, self._num_output_words)
        self._softmax = NDimensionalSoftmax()
        children.extend([self._pre_softmax, self._softmax])

        super(LanguageModel, self).__init__(children=children, **kwargs)

    def _push_initialization_config(self):
        """Push the configuration, then make the cache start at zero."""
        super(LanguageModel, self)._push_initialization_config()
        if self._cache:
            self._cache.weights_init = Constant(0.)

    def set_def_embeddings(self, embeddings):
        """Overwrite the definition lookup weights with `embeddings`."""
        self._def_reader._def_lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def get_def_embeddings_params(self):
        """Return the weight parameter of the definition lookup."""
        return self._def_reader._def_lookup.parameters[0]

    def get_cache_params(self):
        """Return the weight matrix of the cache lookup table."""
        return self._cache.W

    def add_perplexity_measure(self, application_call, minus_logs, mask,
                               name):
        """Attach a masked perplexity as an auxiliary variable.

        Parameters
        ----------
        application_call
            The application call that receives the auxiliary variable.
        minus_logs
            Negative log-likelihoods.
        mask
            Mask selecting the entries that count.
        name : str
            Suffix of the variable name ``"perplexity_" + name``.

        Returns
        -------
        The masked costs summed over axis 0.
        """
        costs = (minus_logs * mask).sum(axis=0)
        perplexity = tensor.exp(costs.sum() / mask.sum())
        perplexity.tag.aggregation_scheme = Perplexity(costs.sum(),
                                                       mask.sum())
        full_name = "perplexity_" + name
        application_call.add_auxiliary_variable(perplexity, name=full_name)
        return costs

    @application
    def apply(self, application_call, words, mask):
        """Compute the log-likelihood for a batch of sequences.

        words
            An integer matrix of shape (B, T), where T is the number of
            time steps and B is the batch size. Note that this order of
            the axes is different from what all RNN bricks consume, hence
            the axes are transposed before the RNN is applied.
        mask
            A float32 matrix of shape (B, T). Zeros indicate the padding.

        Returns the per-sequence costs and a list of cache updates. The
        list is empty unless both the cache and the retrieval are enabled.

        """
        if self._retrieval:
            defs, def_mask, def_map = self._retrieve(words)
            def_embeddings = self._def_reader.apply(defs, def_mask)

            # Auxiliary variable for debugging
            application_call.add_auxiliary_variable(
                def_embeddings.shape[0], name="num_definitions")

        word_ids = self._word_to_id(words)

        # shortlisting: ids outside the vocabulary are mapped to `unk`
        input_word_ids = (
            tensor.lt(word_ids, self._num_input_words) * word_ids +
            tensor.ge(word_ids, self._num_input_words) * self._vocab.unk)
        output_word_ids = (
            tensor.lt(word_ids, self._num_output_words) * word_ids +
            tensor.ge(word_ids, self._num_output_words) * self._vocab.unk)

        application_call.add_auxiliary_variable(
            unk_ratio(input_word_ids, mask, self._vocab.unk),
            name='unk_ratio')

        # Run the main rnn with combined inputs
        word_embs = self._main_lookup.apply(input_word_ids)
        application_call.add_auxiliary_variable(
            masked_root_mean_square(word_embs, mask), name='word_emb_RMS')

        updates = []
        if self._retrieval:
            rnn_inputs, updated, positions = self._combiner.apply(
                word_embs, mask, def_embeddings, def_map)
            # The cache can only be refreshed from the combiner's outputs,
            # which exist only when retrieval is enabled; without this
            # nesting a cache without retrieval raised a NameError.
            if self._cache:
                flat_word_ids = word_ids.flatten()
                flat_word_ids_to_update = flat_word_ids[positions]
                # computing updates for cache
                updates = [(self._cache.W,
                            tensor.set_subtensor(
                                self._cache.W[flat_word_ids_to_update],
                                updated))]
        else:
            rnn_inputs = word_embs

        # NOTE(review): this measures `word_embs`, exactly like
        # 'word_emb_RMS' above; given the name, `rnn_inputs` may have been
        # intended -- confirm before changing the monitored value.
        application_call.add_auxiliary_variable(
            masked_root_mean_square(word_embs, mask),
            name='main_rnn_in_RMS')

        main_rnn_states = self._main_rnn.apply(
            tensor.transpose(self._main_fork.apply(rnn_inputs), (1, 0, 2)),
            mask=mask.T)[0]

        # The first token is not predicted
        logits = self._pre_softmax.apply(main_rnn_states[:-1])
        targets = output_word_ids.T[1:]
        out_softmax = self._softmax.apply(logits, extra_ndim=1)
        application_call.add_auxiliary_variable(out_softmax.copy(),
                                                name="proba_out")
        minus_logs = self._softmax.categorical_cross_entropy(
            targets, logits, extra_ndim=1)

        targets_mask = mask.T[1:]
        costs = self.add_perplexity_measure(application_call, minus_logs,
                                            targets_mask, "")

        # Perplexities of the tokens that follow a word with / without an
        # input embedding.
        missing_embs = tensor.eq(input_word_ids,
                                 self._vocab.unk).astype('int32')  # (bs, L)
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * missing_embs.T[:-1],
                                    "after_mis_word_embs")
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * (1 - missing_embs.T[:-1]),
                                    "after_word_embs")

        word_counts = self._word_to_count(words)
        very_rare_masks = []
        for threshold in self._very_rare_threshold:
            very_rare_mask = tensor.lt(word_counts,
                                       threshold).astype('int32')
            very_rare_mask = targets_mask * (very_rare_mask.T[:-1])
            very_rare_masks.append(very_rare_mask)
            self.add_perplexity_measure(application_call, minus_logs,
                                        very_rare_mask,
                                        "after_very_rare_" + str(threshold))

        if self._retrieval:
            has_def = tensor.zeros_like(output_word_ids)
            has_def = tensor.inc_subtensor(
                has_def[def_map[:, 0], def_map[:, 1]], 1)
            mask_targets_has_def = has_def.T[:-1] * targets_mask  # (L-1, bs)
            self.add_perplexity_measure(application_call, minus_logs,
                                        mask_targets_has_def,
                                        "after_def_embs")

            for thresh, very_rare_mask in zip(self._very_rare_threshold,
                                              very_rare_masks):
                self.add_perplexity_measure(
                    application_call, minus_logs,
                    very_rare_mask * mask_targets_has_def,
                    "after_def_very_rare_" + str(thresh))

            application_call.add_auxiliary_variable(
                mask_targets_has_def.T, name='mask_def_emb')

        return costs, updates
def __init__(self, emb_dim, emb_def_dim, dim, num_input_words,
             def_num_input_words, num_output_words, vocab,
             retrieval=None, def_reader='LSTM',
             standalone_def_lookup=True, standalone_def_rnn=True,
             disregard_word_embeddings=False, compose_type='sum',
             very_rare_threshold=[10], cache_size=0, **kwargs):
    """Create the bricks of the dictionary-equipped language model.

    Parameters
    ----------
    emb_dim : int
        Dimension of word embeddings. ``0`` means "use `dim`".
    emb_def_dim : int
        Dimension of definition word embeddings. ``0`` means
        "use `emb_dim`".
    dim : int
        Dimension of the RNN states.
    num_input_words : int
        Size of the input vocabulary. ``0`` means ``vocab.size()``.
    def_num_input_words : int
        Size of the definition reader's input vocabulary. ``0`` means
        "use `num_input_words`".
    num_output_words : int
        Size of the output vocabulary.
    vocab
        The vocabulary object.
    retrieval
        Dictionary retrieval algorithm; when falsy, no definition reader or
        combiner is created.
    def_reader : str
        ``'LSTM'`` or ``'mean'``.
    standalone_def_lookup : bool
        If `False`, the main lookup is shared with the definition reader.
    standalone_def_rnn : bool
        If `False`, the main fork and RNN are shared with the LSTM
        definition reader.
    disregard_word_embeddings : bool
        Stored on the instance.
    compose_type : str
        Passed to ``MeanPoolCombiner``.
    very_rare_threshold : list of int
        Count thresholds stored for later use; copied on entry.
    cache_size : int
        If positive, a lookup table of that many rows is created as a
        cache of definition embeddings.
    **kwargs
        Forwarded to the parent constructor.

    Raises
    ------
    NotImplementedError
        If the lookup is shared but the two input vocabulary sizes differ.
    ValueError
        If the lookup is shared but ``emb_dim != emb_def_dim``.
    Exception
        If `def_reader` is neither ``'LSTM'`` nor ``'mean'``.
    """
    # TODO(tombosc): document
    # Zero-valued sizes are placeholders resolved from other arguments.
    if emb_dim == 0:
        emb_dim = dim
    if emb_def_dim == 0:
        emb_def_dim = emb_dim
    if num_input_words == 0:
        num_input_words = vocab.size()
    if def_num_input_words == 0:
        def_num_input_words = num_input_words

    if (num_input_words != def_num_input_words) and (
            not standalone_def_lookup):
        raise NotImplementedError()

    # Copied so that instances never share (or mutate) the default list of
    # the signature.
    self._very_rare_threshold = list(very_rare_threshold)
    self._num_input_words = num_input_words
    self._num_output_words = num_output_words
    self._vocab = vocab
    self._retrieval = retrieval
    self._disregard_word_embeddings = disregard_word_embeddings
    self._compose_type = compose_type

    self._word_to_id = WordToIdOp(self._vocab)
    self._word_to_count = WordToCountOp(self._vocab)

    children = []
    self._cache = None
    if cache_size > 0:
        # TODO(tombosc) do we implement cache as LookupTable or theano
        # matrix?
        self._cache = LookupTable(cache_size, emb_dim,
                                  name='cache_def_embeddings')
        children.append(self._cache)

    if self._retrieval:
        self._retrieve = RetrievalOp(retrieval)

    self._main_lookup = LookupTable(self._num_input_words, emb_dim,
                                    name='main_lookup')
    self._main_fork = Linear(emb_dim, 4 * dim, name='main_fork')
    self._main_rnn = DebugLSTM(
        dim, name='main_rnn')  # TODO(tombosc): use regular LSTM?
    children.extend([self._main_lookup, self._main_fork, self._main_rnn])

    if self._retrieval:
        if standalone_def_lookup:
            lookup = None
        else:
            if emb_dim != emb_def_dim:
                raise ValueError(
                    "emb_dim != emb_def_dim: cannot share lookup")
            lookup = self._main_lookup

        if def_reader == 'LSTM':
            if standalone_def_rnn:
                fork_and_rnn = None
            else:
                fork_and_rnn = (self._main_fork, self._main_rnn)
            self._def_reader = LSTMReadDefinitions(
                def_num_input_words, emb_def_dim, dim, vocab, lookup,
                fork_and_rnn, cache=self._cache)
        elif def_reader == 'mean':
            self._def_reader = MeanPoolReadDefinitions(
                def_num_input_words, emb_def_dim, dim, vocab, lookup,
                translate=(emb_def_dim != dim), normalize=False)
        else:
            raise Exception("def reader not understood")

        self._combiner = MeanPoolCombiner(dim=dim, emb_dim=emb_dim,
                                          compose_type=compose_type)

        children.extend([self._def_reader, self._combiner])

    self._pre_softmax = Linear(dim, self._num_output_words)
    self._softmax = NDimensionalSoftmax()
    children.extend([self._pre_softmax, self._softmax])

    super(LanguageModel, self).__init__(children=children, **kwargs)
class MinRiskInitialContextSequenceGenerator(InitialContextSequenceGenerator):
    """Sequence generator exposing an expected-score (minimum risk) cost."""

    def __init__(self, *args, **kwargs):
        """Create the softmax brick and register it as a child."""
        self.softmax = NDimensionalSoftmax()
        super(MinRiskInitialContextSequenceGenerator,
              self).__init__(*args, **kwargs)
        self.children.append(self.softmax)

    @application
    def probs(self, readouts):
        """Return the softmax of `readouts` over its last axis."""
        return self.softmax.apply(readouts, extra_ndim=readouts.ndim - 2)

    # TODO: check where 'target_samples_mask' is used -- do we need a mask
    # for context features (probably not)
    # Note: the @application decorator inspects the arguments, and
    # transparently adds args ('application_call')
    @application(inputs=[
        'representation', 'source_sentence_mask', 'target_samples_mask',
        'target_samples', 'scores'
    ], outputs=['cost'])
    def expected_cost(self, application_call, representation,
                      source_sentence_mask, target_samples,
                      target_samples_mask, scores, smoothing_constant=0.005,
                      **kwargs):
        """Return the expected score of the samples under the model.

        Emulates the process in sequence_generator.cost_matrix, but computes
        log probabilities instead of costs. For each sample we need its
        probability according to the model (these could actually be passed
        from the sampling model, which could be more efficient).

        `kwargs` must contain 'initial_state_context'; it is added to the
        contexts given to the transition and the readout.

        The per-sequence log-probabilities are scaled by
        `smoothing_constant`, normalised with a softmax over axis 1 of a
        tensor shaped like `scores`, multiplied by `scores` and summed.
        """
        # Transpose everything (note we can use transpose here only if it's
        # 2d, otherwise we need dimshuffle)
        source_sentence_mask = source_sentence_mask.T

        # make samples (time, batch)
        samples = target_samples.T
        samples_mask = target_samples_mask.T

        # we need this to set the 'attended' kwarg
        keywords = {
            'mask': target_samples_mask,
            'outputs': target_samples,
            'attended': representation,
            'attended_mask': source_sentence_mask
        }

        batch_size = samples.shape[1]

        # Prepare input for the iterative part
        states = dict_subset(keywords, self._state_names, must_have=False)
        # masks in context are optional (e.g. `attended_mask`)
        contexts = dict_subset(keywords, self._context_names, must_have=False)
        # add the initial state context features
        contexts['initial_state_context'] = kwargs['initial_state_context']

        feedback = self.readout.feedback(samples)
        inputs = self.fork.apply(feedback, as_dict=True)

        # Run the recurrent network
        results = self.transition.apply(
            mask=samples_mask, return_initial_states=True, as_dict=True,
            **dict_union(inputs, states, contexts))

        # Separate the deliverables. The last states are discarded: they
        # are not used to predict any output symbol. The initial glimpses
        # are discarded because they are not used for prediction.
        # Remember, glimpses are computed _before_ output stage, states are
        # computed after.
        states = {name: results[name][:-1] for name in self._state_names}
        glimpses = {name: results[name][1:] for name in self._glimpse_names}

        # Compute the cost
        feedback = tensor.roll(feedback, 1, 0)
        feedback = tensor.set_subtensor(
            feedback[0],
            self.readout.feedback(self.readout.initial_outputs(batch_size)))
        readouts = self.readout.readout(
            feedback=feedback, **dict_union(states, glimpses, contexts))

        # Take the log-softmax directly instead of log(softmax(x)): an
        # underflowing probability would give -inf, and -inf * 0 in the
        # one-hot product below is NaN.
        word_log_probs = self.softmax.log_probabilities(
            readouts, extra_ndim=readouts.ndim - 2)

        # Note: converting the samples to one-hot wastes space, but it gets
        # the job done
        # TODO: this may be the op that sometimes causes out-of-memory
        # (The former `one_hot_samples.astype('float32')` statement threw its
        # result away, so it had no effect and has been removed.)
        one_hot_samples = tensor.eye(word_log_probs.shape[-1])[samples]
        actual_probs = word_log_probs * one_hot_samples

        # reshape to (batch, time, prob), then sum over the batch dimension
        # to get sequence-level probability
        actual_probs = actual_probs.dimshuffle(1, 0, 2)
        # we are first summing over vocabulary (only one non-zero cell per row)
        sequence_probs = actual_probs.sum(axis=2)
        sequence_probs = sequence_probs * target_samples_mask
        # now sum over time dimension
        sequence_probs = sequence_probs.sum(axis=1)

        # The values stay in log space; they are only reshaped like `scores`.
        sequence_probs = sequence_probs.reshape(scores.shape)

        # Note that the smoothing constant can be set by user
        smoothed = tensor.exp(sequence_probs * smoothing_constant)
        sequence_distributions = smoothed / smoothed.sum(axis=1, keepdims=True)

        # the following lines are done explicitly for code clarity
        # -- first get sequence expectation, then sum up the expectations for
        # every seq in the minibatch
        expected_scores = (sequence_distributions * scores).sum(axis=1)
        expected_scores = expected_scores.sum(axis=0)

        return expected_scores
def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size,
             frnn_step_size, const=1e-5, **kwargs):
    """Store the emitter settings and build its child bricks.

    The frame is consumed in chunks of `frnn_step_size`; when the frame
    size is not a multiple of the step size one extra step is counted and
    the remainder is kept in `last_steps`.
    """
    super(FRNNEmitter, self).__init__(**kwargs)

    self.mlp = mlp
    self.target_size = target_size
    self.frame_size = frame_size
    self.k = k
    self.frnn_hidden_size = frnn_hidden_size
    self.const = const
    self.input_dim = self.mlp.output_dim
    self.frnn_step_size = frnn_step_size

    # Round the step count up when the division is not exact.
    full_steps = frame_size // frnn_step_size
    remainder = frame_size % frnn_step_size
    self.last_steps = remainder
    if remainder != 0:
        full_steps += 1
    self.number_of_steps = full_steps

    prefix = self.name

    # Mixture parameters, all read from the FRNN hidden state.
    self.mu = MLP(
        activations=[Identity()],
        dims=[frnn_hidden_size, k * frnn_step_size],
        name=prefix + "_mu")
    self.sigma = MLP(
        activations=[SoftPlus()],
        dims=[frnn_hidden_size, k * frnn_step_size],
        name=prefix + "_sigma")
    self.coeff = MLP(
        activations=[Identity()],
        dims=[frnn_hidden_size, k],
        name=prefix + "_coeff")
    self.coeff2 = NDimensionalSoftmax()

    # Bricks of the inner (frame-level) recurrence.
    self.frnn_initial_state = Linear(
        input_dim=self.input_dim,
        output_dim=frnn_hidden_size,
        name="frnn_initial_state")
    self.frnn_activation = Tanh(name="frnn_activation")
    self.frnn_linear_transition_state = Linear(
        input_dim=frnn_hidden_size,
        output_dim=frnn_hidden_size,
        name="frnn_linear_transition_state")
    self.frnn_linear_transition_input = Linear(
        input_dim=self.frnn_step_size,
        output_dim=frnn_hidden_size,
        name="frnn_linear_transition_input")

    self.children = [
        self.mlp,
        self.mu,
        self.sigma,
        self.coeff,
        self.coeff2,
        self.frnn_initial_state,
        self.frnn_activation,
        self.frnn_linear_transition_state,
        self.frnn_linear_transition_input,
    ]
def __init__(self, emb_dim, dim, num_input_words, num_output_words, vocab,
             proximity_coef=0, proximity_distance='l2', encoder='lstm',
             decoder='lstm', shared_rnn=False, translate_layer=None,
             word_dropout=0., tied_in_out=False, vocab_keys=None, seed=0,
             reconstruction_coef=1., provide_targets=False, **kwargs):
    """Validate the configuration and build the encoder/decoder bricks.

    Parameters
    ----------
    emb_dim : int
        Embedding size. If 0, `dim` is used.
    dim : int
        Hidden size of the encoder and decoder.
    num_input_words, num_output_words : int
        Vocabulary sizes. If 0, `vocab.size()` is used.
    vocab
        Vocabulary object.
    proximity_coef, proximity_distance
        Proximity penalty weight and distance ('l1', 'l2' or 'cos').
    encoder : str or None
        One of None, 'lstm', 'bilstm', 'mean', 'weighted_mean',
        'max_bilstm', 'bilstm_sum', 'max_bilstm_sum'.
    decoder : str
        One of 'skip-gram', 'lstm', 'lstm_c'.
    shared_rnn : bool
        Reuse the encoder fork and RNN as the decoder (both must be 'lstm').
    translate_layer : str or list of str, optional
        Either a single activation name or a list of activation names for
        an MLP ('sigmoid', 'tanh' or 'linear').
    word_dropout : float
        Must lie in [0, 1].
    tied_in_out : bool
        If True no separate pre-softmax layer is created; requires equal
        input and output vocabulary sizes.
    vocab_keys
        Key vocabulary; required when there is no encoder.
    seed
        Accepted but not used in the constructor.
    reconstruction_coef, provide_targets
        Stored on the instance; `provide_targets` also adds a separate
        target lookup table.

    Raises
    ------
    ValueError
        On any inconsistent combination of the options above.
    """
    if emb_dim == 0:
        emb_dim = dim
    if num_input_words == 0:
        num_input_words = vocab.size()
    if num_output_words == 0:
        num_output_words = vocab.size()

    self._word_dropout = word_dropout
    self._tied_in_out = tied_in_out

    if not encoder:
        if proximity_coef:
            raise ValueError("Err: meaningless penalty term (no encoder)")
        if not vocab_keys:
            raise ValueError("Err: specify a key vocabulary (no encoder)")

    if tied_in_out and num_input_words != num_output_words:
        raise ValueError("Can't tie in and out embeddings. Different "
                         "vocabulary size")
    # Checked before the generic test below: in the original order this
    # branch could never be reached, because decoder != 'lstm' already
    # raised first.
    if shared_rnn and decoder == 'lstm_c':
        raise ValueError(
            "can't share RNN because the decoder takes different "
            "inputs")
    if shared_rnn and (encoder != 'lstm' or decoder != 'lstm'):
        raise ValueError(
            "can't share RNN because either encoder or decoder "
            "is not an RNN")
    if word_dropout < 0 or word_dropout > 1:
        raise ValueError("invalid value for word dropout",
                         str(word_dropout))
    if proximity_distance not in ['l1', 'l2', 'cos']:
        raise ValueError(
            "unrecognized distance: {}".format(proximity_distance))
    if proximity_coef and emb_dim != dim and not translate_layer:
        raise ValueError(
            """if proximity penalisation, emb_dim should equal dim or
            there should be a translate layer""")
    if encoder not in [
            None, 'lstm', 'bilstm', 'mean', 'weighted_mean', 'max_bilstm',
            'bilstm_sum', 'max_bilstm_sum'
    ]:
        raise ValueError('encoder not recognized')
    if decoder not in ['skip-gram', 'lstm', 'lstm_c']:
        raise ValueError('decoder not recognized')

    self._proximity_distance = proximity_distance
    self._decoder = decoder
    self._encoder = encoder
    self._num_input_words = num_input_words
    self._num_output_words = num_output_words
    self._vocab = vocab
    self._proximity_coef = proximity_coef
    self._reconstruction_coef = reconstruction_coef
    self._provide_targets = provide_targets

    self._word_to_id = WordToIdOp(self._vocab)
    if vocab_keys:
        self._key_to_id = WordToIdOp(vocab_keys)

    children = []

    if encoder or (not encoder and decoder in ['lstm', 'lstm_c']):
        self._main_lookup = LookupTable(self._num_input_words, emb_dim,
                                        name='main_lookup')
        children.append(self._main_lookup)
    if provide_targets:
        # this is useful to simulate Hill's baseline without pretrained
        # embeddings in the encoder, only as targets for the encoder.
        self._target_lookup = LookupTable(self._num_input_words, emb_dim,
                                          name='target_lookup')
        children.append(self._target_lookup)

    if not encoder:
        self._key_lookup = LookupTable(vocab_keys.size(), emb_dim,
                                       name='key_lookup')
        children.append(self._key_lookup)
    elif encoder == 'lstm':
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        children.extend([self._encoder_fork, self._encoder_rnn])
    elif encoder in ['bilstm', 'max_bilstm']:
        # dim is the dim of the concatenated vector, so each direction
        # gets half of it. Floor division keeps the size an int on
        # Python 3 as well (true division would hand LSTM a float).
        self._encoder_fork = Linear(emb_dim, 2 * dim, name='encoder_fork')
        self._encoder_rnn = Bidirectional(
            LSTM(dim // 2, name='encoder_rnn'))
        children.extend([self._encoder_fork, self._encoder_rnn])
    elif encoder in ['bilstm_sum', 'max_bilstm_sum']:
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = BidirectionalSum(LSTM(dim, name='encoder_rnn'))
        children.extend([self._encoder_fork, self._encoder_rnn])
    elif encoder == 'mean':
        pass
    elif encoder == 'weighted_mean':
        self._encoder_w = MLP([Logistic()], [dim, 1],
                              name="encoder_weights")
        children.extend([self._encoder_w])
    else:
        raise NotImplementedError()

    if decoder in ['lstm', 'lstm_c']:
        dim_after_translate = emb_dim
        if shared_rnn:
            self._decoder_fork = self._encoder_fork
            self._decoder_rnn = self._encoder_rnn
        else:
            if decoder == 'lstm_c':
                dim_2 = dim + emb_dim
            else:
                dim_2 = dim
            self._decoder_fork = Linear(dim_2, 4 * dim,
                                        name='decoder_fork')
            self._decoder_rnn = LSTM(dim, name='decoder_rnn')
        # NOTE(review): placement after the if/else was reconstructed from
        # whitespace-collapsed text -- confirm against the original file.
        children.extend([self._decoder_fork, self._decoder_rnn])
    elif decoder == 'skip-gram':
        dim_after_translate = emb_dim

    self._translate_layer = None
    activations = {'sigmoid': Logistic(), 'tanh': Tanh(), 'linear': None}

    if translate_layer:
        if isinstance(translate_layer, str):
            translate_layer = [translate_layer]
        assert isinstance(translate_layer, list)
        activations_translate = [activations[a] for a in translate_layer]
        dims_translate = [dim] * len(translate_layer) + [dim_after_translate]
        self._translate_layer = MLP(activations_translate, dims_translate,
                                    name="translate_layer")
        children.append(self._translate_layer)

    if not self._tied_in_out:
        self._pre_softmax = Linear(emb_dim, self._num_output_words)
        children.append(self._pre_softmax)

    if decoder in ['lstm', 'lstm_c']:
        self._softmax = NDimensionalSoftmax()
    elif decoder in ['skip-gram']:
        self._softmax = Softmax()
    children.append(self._softmax)

    super(Seq2Seq, self).__init__(children=children, **kwargs)
class ExtractiveQAModel(Initializable):
    """The dictionary-equipped extractive QA model.

    Parameters
    ----------
    dim : int
        The default dimensionality for the components.
    emb_dim : int
        The dimensionality for the embeddings. If 0, `dim` is used.
    readout_dims : list of int
        Hidden layer sizes of the begin/end readout MLPs.
    num_input_words : int
        The number of input words. If 0, `vocab.size()` is used.
    def_num_input_words : int
        The number of input words for the definition reader. If 0,
        `num_input_words` is used.
    vocab
        The vocabulary object.
    use_definitions : bool
        Triggers the use of definitions.
    def_word_gating
        Forwarded to `MeanPoolCombiner`.
    compose_type : str
        Forwarded to `MeanPoolCombiner`.
    coattention : bool
        Use the coattention mechanism.
    def_reader : str
        Name of the definition reader class (resolved with `eval`).
    reuse_word_embeddings : bool
        Share the main lookup table with the definition reader.
    random_unk : bool
        If True the lookup covers the whole vocabulary and gradients are
        blocked for words with an id >= `num_input_words`; otherwise those
        words are mapped to `vocab.unk`.

    """
    def __init__(self, dim, emb_dim, readout_dims,
                 num_input_words, def_num_input_words, vocab,
                 use_definitions, def_word_gating, compose_type,
                 coattention, def_reader, reuse_word_embeddings,
                 random_unk, **kwargs):
        self._vocab = vocab
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        self._coattention = coattention
        self._num_input_words = num_input_words
        self._use_definitions = use_definitions
        self._random_unk = random_unk
        self._reuse_word_embeddings = reuse_word_embeddings

        lookup_num_words = num_input_words
        if reuse_word_embeddings:
            lookup_num_words = max(num_input_words, def_num_input_words)
        if random_unk:
            lookup_num_words = vocab.size()

        # Dima: we can have slightly less copy-paste here if we
        # copy the RecurrentFromFork class from my other projects.
        children = []
        self._lookup = LookupTable(lookup_num_words, emb_dim)
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        self._question_transform = Linear(dim, dim,
                                          name='question_transform')
        self._bidir_fork = Linear(3 * dim if coattention else 2 * dim,
                                  4 * dim, name='bidir_fork')
        self._bidir = Bidirectional(LSTM(dim), name='bidir')
        children.extend([
            self._lookup, self._encoder_fork, self._encoder_rnn,
            self._question_transform, self._bidir, self._bidir_fork
        ])

        activations = [Rectifier()] * len(readout_dims) + [None]
        readout_dims = [2 * dim] + readout_dims + [1]
        self._begin_readout = MLP(activations, readout_dims,
                                  name='begin_readout')
        self._end_readout = MLP(activations, readout_dims,
                                name='end_readout')
        self._softmax = NDimensionalSoftmax()
        children.extend(
            [self._begin_readout, self._end_readout, self._softmax])

        if self._use_definitions:
            # A potential bug here: we pass the same vocab to the def reader.
            # If a different token is reserved for UNK in text and in the
            # definitions, we can be screwed.
            #
            # SECURITY NOTE(review): `eval` executes arbitrary code. This is
            # only acceptable while `def_reader` comes from trusted
            # configuration; consider an explicit name -> class mapping.
            def_reader_class = eval(def_reader)
            def_reader_kwargs = dict(
                num_input_words=def_num_input_words,
                dim=dim,
                emb_dim=emb_dim,
                vocab=vocab,
                lookup=self._lookup if reuse_word_embeddings else None)
            if def_reader_class == MeanPoolReadDefinitions:
                def_reader_kwargs.update(
                    dict(normalize=True, translate=False))
            self._def_reader = def_reader_class(**def_reader_kwargs)
            self._combiner = MeanPoolCombiner(
                dim=dim, emb_dim=emb_dim,
                def_word_gating=def_word_gating,
                compose_type=compose_type)
            children.extend([self._def_reader, self._combiner])

        super(ExtractiveQAModel, self).__init__(children=children, **kwargs)

        # create default input variables
        self.contexts = tensor.lmatrix('contexts')
        self.context_mask = tensor.matrix('contexts_mask')
        self.questions = tensor.lmatrix('questions')
        self.question_mask = tensor.matrix('questions_mask')
        self.answer_begins = tensor.lvector('answer_begins')
        self.answer_ends = tensor.lvector('answer_ends')
        input_vars = [
            self.contexts, self.context_mask,
            self.questions, self.question_mask,
            self.answer_begins, self.answer_ends
        ]
        if self._use_definitions:
            self.defs = tensor.lmatrix('defs')
            self.def_mask = tensor.matrix('def_mask')
            self.contexts_def_map = tensor.lmatrix('contexts_def_map')
            self.questions_def_map = tensor.lmatrix('questions_def_map')
            input_vars.extend([
                self.defs, self.def_mask,
                self.contexts_def_map, self.questions_def_map
            ])
        self.input_vars = OrderedDict(
            [(var.name, var) for var in input_vars])

    def set_embeddings(self, embeddings):
        """Overwrite the lookup table with `embeddings` cast to floatX."""
        self._lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def embeddings_var(self):
        """Return the first parameter of the lookup table."""
        return self._lookup.parameters[0]

    def def_reading_parameters(self):
        """Return the parameters of the definition reader and combiner.

        When the word embeddings are shared with the definition reader,
        the lookup table parameters are left out of the result.
        """
        # `values()` is a view on Python 3 and has no `extend`; materialise
        # it so the method works on both Python 2 and Python 3.
        parameters = list(
            Selector(self._def_reader).get_parameters().values())
        parameters.extend(
            Selector(self._combiner).get_parameters().values())
        if self._reuse_word_embeddings:
            lookup_parameters = list(
                Selector(self._lookup).get_parameters().values())
            parameters = [p for p in parameters
                          if p not in lookup_parameters]
        return parameters

    @application
    def _encode(self, application_call, text, mask,
                def_embs=None, def_map=None, text_name=None):
        """Embed `text`, optionally mix in definitions, and run the encoder.

        If `text_name` is given, the UNK ratio of the text is recorded as an
        auxiliary variable named '<text_name>_unk_ratio'.
        """
        if not self._random_unk:
            # Map every id outside the lookup range to the UNK id.
            text = (
                tensor.lt(text, self._num_input_words) * text
                + tensor.ge(text, self._num_input_words) * self._vocab.unk)
        if text_name:
            application_call.add_auxiliary_variable(
                unk_ratio(text, mask, self._vocab.unk),
                name='{}_unk_ratio'.format(text_name))
        embs = self._lookup.apply(text)
        if self._random_unk:
            # Out-of-range words keep their embedding but get no gradient.
            embs = (
                tensor.lt(text, self._num_input_words)[:, :, None] * embs
                + tensor.ge(text, self._num_input_words)[:, :, None]
                * disconnected_grad(embs))
        # Explicit None test: the truth value of a symbolic variable is not
        # a reliable way to ask "was this argument given".
        if def_embs is not None:
            embs = self._combiner.apply(embs, mask, def_embs, def_map)
        add_role(embs, EMBEDDINGS)
        encoded = flip01(
            self._encoder_rnn.apply(
                self._encoder_fork.apply(flip01(embs)),
                mask=mask.T)[0])
        return encoded

    @application
    def apply(self, application_call,
              contexts, contexts_mask, questions, questions_mask,
              answer_begins, answer_ends,
              defs=None, def_mask=None,
              contexts_def_map=None, questions_def_map=None):
        """Return the sum of the begin and end cross-entropy costs.

        Attention weights, predicted begin/end positions and the
        exact-match indicator are attached as auxiliary variables.
        """
        def_embs = None
        if self._use_definitions:
            def_embs = self._def_reader.apply(defs, def_mask)

        context_enc = self._encode(contexts, contexts_mask,
                                   def_embs, contexts_def_map, 'context')
        question_enc_pre = self._encode(questions, questions_mask,
                                        def_embs, questions_def_map,
                                        'question')
        question_enc = tensor.tanh(
            self._question_transform.apply(question_enc_pre))

        # should be (batch size, context length, question_length)
        affinity = tensor.batched_dot(context_enc, flip12(question_enc))
        affinity_mask = contexts_mask[:, :, None] * questions_mask[:, None, :]
        # Masked-out pairs get a large negative score before the softmax.
        affinity = affinity * affinity_mask - 1000.0 * (1 - affinity_mask)
        # soft-aligns every position in the context to positions in the
        # question
        d2q_att_weights = self._softmax.apply(affinity, extra_ndim=1)
        application_call.add_auxiliary_variable(
            d2q_att_weights.copy(), name='d2q_att_weights')
        # soft-aligns every position in the question to positions in the
        # document
        q2d_att_weights = self._softmax.apply(flip12(affinity), extra_ndim=1)
        application_call.add_auxiliary_variable(
            q2d_att_weights.copy(), name='q2d_att_weights')

        # question encoding "in the view of the document"
        question_enc_informed = tensor.batched_dot(
            q2d_att_weights, context_enc)
        question_enc_concatenated = tensor.concatenate(
            [question_enc, question_enc_informed], 2)
        # document encoding "in the view of the question"
        context_enc_informed = tensor.batched_dot(
            d2q_att_weights, question_enc_concatenated)

        if self._coattention:
            context_enc_concatenated = tensor.concatenate(
                [context_enc, context_enc_informed], 2)
        else:
            question_repr_repeated = tensor.repeat(
                question_enc[:, [-1], :], context_enc.shape[1], axis=1)
            context_enc_concatenated = tensor.concatenate(
                [context_enc, question_repr_repeated], 2)

        # note: forward and backward LSTMs share the
        # input weights in the current impl
        bidir_states = flip01(
            self._bidir.apply(
                self._bidir_fork.apply(flip01(context_enc_concatenated)),
                mask=contexts_mask.T)[0])

        begin_readouts = self._begin_readout.apply(bidir_states)[:, :, 0]
        begin_readouts = begin_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        begin_costs = self._softmax.categorical_cross_entropy(
            answer_begins, begin_readouts)

        end_readouts = self._end_readout.apply(bidir_states)[:, :, 0]
        end_readouts = end_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        end_costs = self._softmax.categorical_cross_entropy(
            answer_ends, end_readouts)

        predicted_begins = begin_readouts.argmax(axis=-1)
        predicted_ends = end_readouts.argmax(axis=-1)
        exact_match = (tensor.eq(predicted_begins, answer_begins)
                       * tensor.eq(predicted_ends, answer_ends))
        application_call.add_auxiliary_variable(
            predicted_begins, name='predicted_begins')
        application_call.add_auxiliary_variable(
            predicted_ends, name='predicted_ends')
        application_call.add_auxiliary_variable(
            exact_match, name='exact_match')

        return begin_costs + end_costs

    def apply_with_default_vars(self):
        """Apply the model to the default input variables."""
        return self.apply(*self.input_vars.values())
def __init__(self, input_sources_list, input_sources_vocab_size_list,
             output_source, output_source_vocab_size,
             lookup_dim=200, hidden_size=256, recurrent_stack_size=1):
    """Build the graph: lookups -> merge -> linear -> RNN stack -> softmax.

    One long-matrix input variable is created per input source and one for
    the output source. The mean categorical cross-entropy is stored in
    `self.Cost`; the softmax output (named 'y_hat') backs `self.Model`.
    """
    self.InputSources = input_sources_list
    self.InputSourcesVocab = input_sources_vocab_size_list
    self.OutputSource = output_source
    self.OutputSourceVocab = output_source_vocab_size

    # Symbolic inputs and target.
    source_vars = [tensor.lmatrix(name) for name in input_sources_list]
    target_var = tensor.lmatrix(output_source)

    # One lookup table per input source.
    tables = self.get_lookups(lookup_dim, input_sources_vocab_size_list)
    for table in tables:
        table.initialize()

    merger = Merge(
        [table.name for table in tables],
        [table.dim for table in tables],
        hidden_size,
        weights_init=initialization.Uniform(width=0.01),
        biases_init=Constant(0))
    merger.initialize()

    input_projection = Linear(
        input_dim=hidden_size,
        output_dim=hidden_size,
        weights_init=initialization.Uniform(width=0.01),
        biases_init=Constant(0),
        name='linear0')
    input_projection.initialize()

    # Create the whole stack first, then name and initialize each layer.
    rnn_stack = [
        SimpleRecurrent(
            dim=hidden_size,
            activation=Tanh(),
            weights_init=initialization.Uniform(width=0.01),
            use_bias=False)
        for _ in range(recurrent_stack_size)
    ]
    for position, layer in enumerate(rnn_stack, 1):
        layer.name = 'recurrent' + str(position)
        layer.initialize()

    output_projection = Linear(
        input_dim=hidden_size,
        output_dim=output_source_vocab_size,
        weights_init=initialization.Uniform(width=0.01),
        biases_init=Constant(0),
        name='linear_out')
    output_projection.initialize()

    softmax_brick = NDimensionalSoftmax(name='softmax')

    # Forward pass.
    embedded = [table.apply(var)
                for table, var in zip(tables, source_vars)]
    hidden = input_projection.apply(merger.apply(*embedded))
    for layer in rnn_stack:
        hidden = layer.apply(hidden)
    logits = output_projection.apply(hidden)

    self.Cost = softmax_brick.categorical_cross_entropy(
        target_var, logits, extra_ndim=1).mean()
    self.Cost.name = 'cost'

    prediction = softmax_brick.apply(logits, extra_ndim=1)
    prediction.name = 'y_hat'

    self.ComputationGraph = ComputationGraph(self.Cost)
    self.Function = None
    self.MainLoop = None
    self.Model = Model(prediction)
# Recurrent hidden layer with tanh activation.
rnn = SimpleRecurrent(
    name='hidden',
    dim=hidden_layer_dim,
    activation=Tanh(),
    weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

# Projection from the hidden state to one score per charset entry.
linear_output = Linear(
    name='linear_output',
    input_dim=hidden_layer_dim,
    output_dim=charset_size,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_output.initialize()

softmax = NDimensionalSoftmax(name='ndim_softmax')

# Forward pass: lookup -> linear input -> RNN -> linear output.
# `lookup_input`, `linear_input`, `x` and `y` are defined before this excerpt.
activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)

# Mean cross-entropy between the targets `y` and the pre-softmax scores.
cost = softmax.categorical_cross_entropy(y, activation_output, extra_ndim=1).mean()

# NOTE(review): GradientDescent and Adam are imported but not used within
# this excerpt; RMSProp and StepClipping are expected to be imported
# elsewhere in the file.
from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Adam
cg = ComputationGraph([cost])

# RMSProp updates followed by step clipping at 1.0.
step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95), StepClipping(1.0)]
parser.add_argument('-temperature', type=float, default=1.0, help='temperature of sampling')
args = parser.parse_args()

# Define primetext
ix_to_char, char_to_ix, vocab_size = get_metadata(hdf5_file)
if args.primetext and len(args.primetext) > 0:
    # Drop characters that are not in the vocabulary, then encode the rest
    # as a column of character indices.
    primetext = ''.join(
        [ch for ch in args.primetext if ch in char_to_ix.keys()])
    x_curr = numpy.expand_dims(
        numpy.array([char_to_ix[ch] for ch in primetext], dtype='uint8'), axis=1)
else:
    # No prime text: seed from the first batch of the 'dev' stream, keeping
    # only its last column.
    dev_stream = get_stream(hdf5_file, 'dev', batch_size)
    x_curr, y_curr = dev_stream.get_epoch_iterator().next()
    x_curr = x_curr[:, -1].reshape(seq_length, 1)

print 'Loading model from {0}...'.format(args.model)
main_loop = load(args.model)
print 'Model loaded. Building prediction function...'
model = main_loop.model
y, x = model.inputs

# Rebuild the output distribution from the variable named 'linear_output'
# found in the loaded model.
softmax = NDimensionalSoftmax()
linear_output = [
    v for v in model.variables if v.name == 'linear_output'][0]
y_hat = softmax.apply(linear_output, extra_ndim=1)
predict = theano.function([x], y_hat)

print 'Starting sampling'
sample_string = sample(args.length, x_curr, predict, ix_to_char, seed=args.seed, temperature=args.temperature)
class CharRNNModel(Initializable):
    """
    A model for testing that the components of my more complex models work.
    This is just a model that predicts one character at a time using a LSTM layer
    """

    def __init__(self, config_dict, init_type="xavier", **kwargs):
        """Read the sizes from `config_dict` and create the child bricks.

        `config_dict` must provide: batch_size, num_subwords, num_words,
        subword_embedding_size, input_vocab_size, output_vocab_size,
        subword_RNN_hidden_state_size, table_width, max_out_dim, max_out_K.
        With `init_type == "xavier"` the linear and LSTM weights use
        `XavierInitializationOriginal`; any other value selects
        `IsotropicGaussian`.
        """
        super(CharRNNModel, self).__init__(**kwargs)

        self.batch_size = config_dict["batch_size"]
        self.num_subwords = config_dict["num_subwords"]
        self.num_words = config_dict["num_words"]
        self.subword_embedding_size = config_dict["subword_embedding_size"]
        self.input_vocab_size = config_dict["input_vocab_size"]
        self.output_vocab_size = config_dict["output_vocab_size"]
        self.subword_RNN_hidden_state_size = config_dict["subword_RNN_hidden_state_size"]
        self.table_width = config_dict["table_width"]
        self.max_out_dim = config_dict["max_out_dim"]
        self.max_out_K = config_dict["max_out_K"]

        self.lookup = LookupTable(
            length=self.input_vocab_size,
            dim=self.subword_embedding_size,
            name="input_lookup",
        )
        self.lookup.weights_init = Uniform(width=self.table_width)
        self.lookup.biases_init = Constant(0)

        if init_type == "xavier":
            linear_init = XavierInitializationOriginal(
                self.subword_embedding_size, self.subword_RNN_hidden_state_size)
            lstm_init = XavierInitializationOriginal(
                self.subword_embedding_size, self.subword_RNN_hidden_state_size)
        else:  # default is gaussian
            linear_init = IsotropicGaussian()
            lstm_init = IsotropicGaussian()

        # The `inputs` are then split in this order:
        # Input gates, forget gates, cells and output gates
        self.linear_forward = Linear(
            input_dim=self.subword_embedding_size,
            output_dim=self.subword_RNN_hidden_state_size * 4,
            name="linear_forward",
            weights_init=linear_init,
            biases_init=Constant(0.0),
        )

        self.language_model = LSTM(
            dim=self.subword_RNN_hidden_state_size,
            activation=Tanh(),
            name="language_model_RNN",
            weights_init=lstm_init,
            biases_init=Constant(0.0),
        )

        self.max_out = LinearMaxout(
            self.subword_RNN_hidden_state_size,
            self.max_out_dim,
            self.max_out_K,
            name="max_out",
            weights_init=IsotropicGaussian(),
            biases_init=Constant(0.0),
        )

        self.softmax_linear = Linear(
            self.max_out_dim,
            self.output_vocab_size,
            name="soft_max_linear",
            weights_init=IsotropicGaussian(),
            biases_init=Constant(0.0),
        )

        self.softmax = NDimensionalSoftmax()

        self.children = [
            self.lookup,
            self.linear_forward,
            self.language_model,
            self.max_out,
            self.softmax_linear,
            self.softmax,
        ]

    @application(inputs=["features", "features_mask", "targets", "targets_mask"],
                 outputs=["cost"])
    def apply(self, features, features_mask, targets, targets_mask):
        """Return the cross-entropy averaged over unmasked target positions."""
        subword_embeddings = self.lookup.apply(features)
        # [0] = hidden states, [1] = cells
        sentence_embeddings = self.language_model.apply(
            self.linear_forward.apply(subword_embeddings), mask=features_mask
        )[0]

        linear_output = self.softmax_linear.apply(
            self.max_out.apply(sentence_embeddings))

        # Keep the per-position costs so the mask can remove padded
        # positions. Taking `.mean()` first (as before) collapsed the cost to
        # a scalar, which made the masked average below a no-op.
        position_costs = self.softmax.categorical_cross_entropy(
            targets, linear_output, extra_ndim=1)
        cost = (position_costs * targets_mask).sum() / targets_mask.sum()
        cost.name = "cost"
        return cost
def __init__(self, initial_output=0, **kwargs):
    """Create the softmax brick and append it to the `children` kwarg.

    The list under `kwargs['children']` (created when absent) is extended
    in place before being forwarded to the parent constructor.
    """
    softmax_brick = NDimensionalSoftmax()
    self.initial_output = initial_output
    self.softmax = softmax_brick

    # Reuse the caller's list when one was passed, otherwise start a new one.
    child_list = kwargs.setdefault('children', [])
    child_list.append(softmax_brick)

    super(SoftmaxEmitter, self).__init__(**kwargs)
def __init__(self, initial_output=0, **kwargs):
    """Initialise the parent, then install the softmax as the only child."""
    super(SoftmaxEmitter, self).__init__(**kwargs)

    softmax_brick = NDimensionalSoftmax()
    self.initial_output = initial_output
    self.softmax = softmax_brick
    # Replaces whatever children the parent constructor set up.
    self.children = [softmax_brick]
def __init__(
        self,
        dim,
        emb_dim,
        vocab,
        def_emb_translate_dim=-1,
        def_dim=-1,
        encoder='bilstm',
        bn=True,
        def_reader=None,
        def_combiner=None,
        dropout=0.5,
        num_input_words=-1,
        # Others
        **kwargs):
    """Assemble the ESIM bricks: lookup, two BiLSTM levels, MLP and softmax.

    Negative `def_emb_translate_dim` / `def_dim` fall back to `emb_dim`; a
    non-positive `num_input_words` falls back to `vocab.size()`. Only the
    'bilstm' encoder is implemented.
    """
    self._dropout = dropout
    self._vocab = vocab
    self._emb_dim = emb_dim
    self._def_reader = def_reader
    self._def_combiner = def_combiner

    if encoder != 'bilstm':
        raise NotImplementedError()

    self.def_emb_translate_dim = (
        emb_dim if def_emb_translate_dim < 0 else def_emb_translate_dim)
    self._def_dim = emb_dim if def_dim < 0 else def_dim

    if num_input_words > 0:
        logger.info("Restricting vocab to " + str(num_input_words))
        self._num_input_words = num_input_words
    else:
        self._num_input_words = vocab.size()

    bricks = []

    # Optional projection applied when the translate size differs from
    # the embedding size.
    if self.def_emb_translate_dim == self._emb_dim:
        self._translate_pre_def = None
    else:
        self._translate_pre_def = Linear(input_dim=emb_dim,
                                         output_dim=def_emb_translate_dim)
        bricks.append(self._translate_pre_def)

    ## Embedding
    self._lookup = LookupTable(self._num_input_words, emb_dim,
                               weights_init=GlorotUniform())
    bricks.append(self._lookup)

    if def_reader:
        self._final_emb_dim = self._def_dim
        self._def_reader = def_reader
        self._def_combiner = def_combiner
        bricks += [self._def_reader, self._def_combiner]
    else:
        self._final_emb_dim = self._emb_dim

    ## BiLSTM
    first_level_input = self._def_dim if def_reader else self._emb_dim
    self._hyp_bidir_fork = Linear(first_level_input, 4 * dim,
                                  name='hyp_bidir_fork')
    self._hyp_bidir = Bidirectional(LSTM(dim), name='hyp_bidir')
    self._prem_bidir_fork = Linear(first_level_input, 4 * dim,
                                   name='prem_bidir_fork')
    self._prem_bidir = Bidirectional(LSTM(dim), name='prem_bidir')
    bricks += [self._hyp_bidir_fork, self._hyp_bidir]
    bricks += [self._prem_bidir, self._prem_bidir_fork]

    ## BiLSTM no. 2 (encoded attentioned embeddings)
    self._hyp_bidir_fork2 = Linear(8 * dim, 4 * dim,
                                   name='hyp_bidir_fork2')
    self._hyp_bidir2 = Bidirectional(LSTM(dim), name='hyp_bidir2')
    self._prem_bidir_fork2 = Linear(8 * dim, 4 * dim,
                                    name='prem_bidir_fork2')
    self._prem_bidir2 = Bidirectional(LSTM(dim), name='prem_bidir2')
    bricks += [self._hyp_bidir_fork2, self._hyp_bidir2]
    bricks += [self._prem_bidir2, self._prem_bidir_fork2]

    self._rnns = [
        self._prem_bidir2,
        self._hyp_bidir2,
        self._prem_bidir,
        self._hyp_bidir,
    ]

    ## MLP
    if bn:
        mlp_class = BatchNormalizedMLP
        mlp_options = {'conserve_memory': False}
    else:
        mlp_class = MLP
        mlp_options = {}
    self._mlp = mlp_class([Tanh()], [8 * dim, dim], name="mlp",
                          **mlp_options)
    self._pred = mlp_class([Softmax()], [dim, 3], name="pred_mlp",
                           **mlp_options)
    bricks.append(self._mlp)
    bricks.append(self._pred)

    ## Softmax
    self._ndim_softmax = NDimensionalSoftmax()
    bricks.append(self._ndim_softmax)

    super(ESIM, self).__init__(children=bricks, **kwargs)
# `linear_input`, `lookup_input`, `x`, `y` and `train_dataset` are defined
# before this excerpt.
linear_input.initialize()

# Recurrent hidden layer with tanh activation.
rnn = SimpleRecurrent(name='hidden',
                      dim=hidden_layer_dim,
                      activation=Tanh(),
                      weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

# Projection from the hidden state to one score per duration-vocabulary entry.
linear_output = Linear(name='linear_output',
                       input_dim=hidden_layer_dim,
                       output_dim=train_dataset.durations_vocab_size(),
                       weights_init=initialization.Uniform(width=0.01),
                       biases_init=Constant(0))
linear_output.initialize()

softmax = NDimensionalSoftmax(name='ndim_softmax')

# Forward pass: lookup -> linear input -> RNN -> linear output.
activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)

# Mean cross-entropy between the targets `y` and the pre-softmax scores.
cost = softmax.categorical_cross_entropy(y, activation_output, extra_ndim=1).mean()

# NOTE(review): GradientDescent and Adam are imported but not used within
# this excerpt; RMSProp and StepClipping are expected to be imported
# elsewhere in the file.
from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Adam
cg = ComputationGraph([cost])

# RMSProp updates followed by step clipping at 1.0.
step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95), StepClipping(1.0)]
def __init__(self, initial_output=0, **kwargs):
    """Create the softmax brick and register it ahead of any given children.

    Parameters
    ----------
    initial_output
        Stored on the instance as `initial_output`.
    **kwargs
        Forwarded to the parent constructor. An optional `children`
        entry is merged with the softmax brick rather than forwarded
        as-is.
    """
    self.initial_output = initial_output
    self.softmax = NDimensionalSoftmax()
    # `pop` rather than `get`: leaving 'children' inside kwargs would pass
    # the keyword twice to the parent constructor and raise TypeError.
    # `list(...)` also accepts a tuple and avoids aliasing the caller's list.
    children = [self.softmax] + list(kwargs.pop('children', []))
    super(SoftmaxEmitter, self).__init__(children=children, **kwargs)
def __init__(self, dim, emb_dim, readout_dims, num_input_words,
             def_num_input_words, vocab, use_definitions, def_word_gating,
             compose_type, coattention, def_reader, reuse_word_embeddings,
             random_unk, **kwargs):
    """Build the bricks and default input variables of the QA model.

    Parameters
    ----------
    dim : int
        Size used for the LSTM bricks and most linear bricks.
    emb_dim : int
        Embedding size; ``0`` means "use ``dim``".
    readout_dims : list of int
        Hidden sizes of the begin/end readout MLPs. ``2 * dim`` is
        prepended and ``1`` appended before the MLPs are built.
    num_input_words : int
        Number of words in the lookup table; ``0`` means ``vocab.size()``.
    def_num_input_words : int
        Number of words passed to the definition reader; ``0`` means
        "same as ``num_input_words``".
    vocab
        Vocabulary object; only ``vocab.size()`` is called here, and the
        object is forwarded to the definition reader.
    use_definitions : bool
        When true, a definition reader and a combiner are created and the
        definition-related input variables are added.
    def_word_gating, compose_type
        Forwarded unchanged to ``MeanPoolCombiner``.
    coattention : bool
        When true the bidirectional fork takes ``3 * dim`` inputs instead
        of ``2 * dim``.
    def_reader : str
        Name of the definition reader class; resolved with ``eval``.
    reuse_word_embeddings : bool
        When true the main lookup table is shared with the definition
        reader and sized to cover both vocab limits.
    random_unk : bool
        When true the lookup table is sized to ``vocab.size()``.
    **kwargs
        Forwarded to the parent constructor.
    """
    self._vocab = vocab
    # A value of 0 means "not specified": fall back to a derived default.
    if emb_dim == 0:
        emb_dim = dim
    if num_input_words == 0:
        num_input_words = vocab.size()
    if def_num_input_words == 0:
        def_num_input_words = num_input_words

    self._coattention = coattention
    self._num_input_words = num_input_words
    self._use_definitions = use_definitions
    self._random_unk = random_unk
    self._reuse_word_embeddings = reuse_word_embeddings

    # Size of the lookup table: `random_unk` takes precedence over
    # `reuse_word_embeddings` because it is applied last.
    lookup_num_words = num_input_words
    if reuse_word_embeddings:
        lookup_num_words = max(num_input_words, def_num_input_words)
    if random_unk:
        lookup_num_words = vocab.size()

    # Dima: we can have slightly less copy-paste here if we
    # copy the RecurrentFromFork class from my other projects.
    children = []
    self._lookup = LookupTable(lookup_num_words, emb_dim)
    # Each fork outputs 4 * dim values.
    self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
    self._encoder_rnn = LSTM(dim, name='encoder_rnn')
    self._question_transform = Linear(dim, dim, name='question_transform')
    self._bidir_fork = Linear(3 * dim if coattention else 2 * dim, 4 * dim, name='bidir_fork')
    self._bidir = Bidirectional(LSTM(dim), name='bidir')
    children.extend([
        self._lookup, self._encoder_fork, self._encoder_rnn,
        self._question_transform, self._bidir, self._bidir_fork
    ])

    # The same `Rectifier` instance is repeated for every hidden layer and
    # the same `activations` list is shared by both readout MLPs; the final
    # layer has no activation.
    activations = [Rectifier()] * len(readout_dims) + [None]
    readout_dims = [2 * dim] + readout_dims + [1]
    self._begin_readout = MLP(activations, readout_dims, name='begin_readout')
    self._end_readout = MLP(activations, readout_dims, name='end_readout')
    self._softmax = NDimensionalSoftmax()
    children.extend(
        [self._begin_readout, self._end_readout, self._softmax])

    if self._use_definitions:
        # A potential bug here: we pass the same vocab to the def reader.
        # If a different token is reserved for UNK in the text and in the
        # definitions, unknown words will be handled incorrectly.
        # NOTE(review): `eval` executes an arbitrary expression; this is only
        # safe while `def_reader` comes from trusted configuration.
        def_reader_class = eval(def_reader)
        def_reader_kwargs = dict(
            num_input_words=def_num_input_words, dim=dim, emb_dim=emb_dim, vocab=vocab,
            lookup=self._lookup if reuse_word_embeddings else None)
        # The mean-pooling reader gets two extra fixed options.
        if def_reader_class == MeanPoolReadDefinitions:
            def_reader_kwargs.update(dict(normalize=True, translate=False))
        self._def_reader = def_reader_class(**def_reader_kwargs)
        self._combiner = MeanPoolCombiner(dim=dim, emb_dim=emb_dim, def_word_gating=def_word_gating, compose_type=compose_type)
        children.extend([self._def_reader, self._combiner])

    super(ExtractiveQAModel, self).__init__(children=children, **kwargs)

    # create default input variables
    self.contexts = tensor.lmatrix('contexts')
    self.context_mask = tensor.matrix('contexts_mask')
    self.questions = tensor.lmatrix('questions')
    self.question_mask = tensor.matrix('questions_mask')
    self.answer_begins = tensor.lvector('answer_begins')
    self.answer_ends = tensor.lvector('answer_ends')
    input_vars = [
        self.contexts, self.context_mask, self.questions, self.question_mask,
        self.answer_begins, self.answer_ends
    ]
    if self._use_definitions:
        self.defs = tensor.lmatrix('defs')
        self.def_mask = tensor.matrix('def_mask')
        self.contexts_def_map = tensor.lmatrix('contexts_def_map')
        self.questions_def_map = tensor.lmatrix('questions_def_map')
        input_vars.extend([
            self.defs, self.def_mask, self.contexts_def_map, self.questions_def_map
        ])
    # Keyed by the variable *name* given above (e.g. 'contexts_mask'), in
    # creation order.
    self.input_vars = OrderedDict([(var.name, var) for var in input_vars])
def fit(self, trainset, retrain=True):
    """Train the NADE collaborative-filtering model with early stopping.

    Hyper-parameters are read from attributes of ``self``. Datasets are
    obtained from ``self.load_dataset``.

    Parameters
    ----------
    trainset
        Ignored: it is immediately replaced by the result of
        ``self.load_dataset(which_set=['train'], ...)``. Kept for interface
        compatibility.
    retrain : bool, optional
        Unused by this method. Kept for interface compatibility.

    Returns
    -------
    None
        Results are stored on ``self``: ``new_items``, ``f_monitor_best``,
        ``best_valid_error``, ``best_epoch``, ``best_model`` and
        ``best_polyak``.
    """
    batch_size = self.batch_size
    n_iter = self.n_iter
    look_ahead = self.look_ahead
    lr = self.lr
    b1 = self.b1
    b2 = self.b2
    epsilon = self.epsilon
    hidden_size = self.hidden_size
    activation_function = self.activation_function
    drop_rate = self.drop_rate
    weight_decay = self.weight_decay
    optimizer = self.optimizer
    std = self.std
    alpha = self.alpha
    polyak_mu = self.polyak_mu
    rating_category = self.rating_category
    item_num = self.item_num
    user_num = self.user_num

    # ---- Data -----------------------------------------------------------
    sources = ('input_ratings', 'output_ratings', 'input_masks',
               'output_masks')
    trainset = self.load_dataset(which_set=['train'], sources=sources)
    validset = self.load_dataset(which_set=['valid'], sources=sources)

    train_loop_stream = ForceFloatX(data_stream=MovieLensTransformer(
        data_stream=Trainer_MovieLensTransformer(data_stream=DataStream(
            dataset=trainset,
            iteration_scheme=ShuffledScheme(trainset.num_examples,
                                            batch_size)))))
    valid_monitor_stream = ForceFloatX(data_stream=MovieLensTransformer(
        data_stream=DataStream(
            dataset=validset,
            iteration_scheme=ShuffledScheme(validset.num_examples,
                                            batch_size))))

    # ---- Output-bias initialisation from rating frequencies --------------
    # NOTE(review): frequencies are accumulated from the *validation* stream's
    # input ratings - confirm this is intended rather than the training one.
    rating_freq = np.zeros((user_num, rating_category))
    init_b = np.zeros((user_num, rating_category))
    for batch in valid_monitor_stream.get_epoch_iterator():
        inp_r, out_r, inp_m, out_m = batch
        rating_freq += inp_r.sum(axis=0)

    # The model output is cumulatively summed over the rating axis before the
    # softmax (see `y_cum` below), so the bias stores successive differences
    # of the log-frequencies.
    log_rating_freq = np.log(rating_freq + 1e-8)
    log_rating_freq_diff = np.diff(log_rating_freq, axis=1)
    init_b[:, 1:] = log_rating_freq_diff
    init_b[:, 0] = log_rating_freq[:, 0]

    # Rows with no rating at all; their predictions are forced to 3 below.
    new_items = np.where(rating_freq.sum(axis=1) == 0)[0]
    self.new_items = new_items

    # ---- Symbolic inputs ----------------------------------------------
    input_ratings = T.tensor3(name='input_ratings',
                              dtype=theano.config.floatX)
    output_ratings = T.tensor3(name='output_ratings',
                               dtype=theano.config.floatX)
    input_masks = T.matrix(name='input_masks', dtype=theano.config.floatX)
    output_masks = T.matrix(name='output_masks', dtype=theano.config.floatX)

    # Reverse cumulative sum over the rating axis.
    input_ratings_cum = T.extra_ops.cumsum(input_ratings[:, :, ::-1],
                                           axis=2)[:, :, ::-1]

    # ---- Model ----------------------------------------------------------
    if activation_function == 'reclin':
        act = Rectifier
    elif activation_function == 'tanh':
        act = Tanh
    elif activation_function == 'sigmoid':
        act = Logistic
    else:
        act = Softplus
    layers_act = [act('layer_%d' % i) for i in range(len(hidden_size))]

    NADE_CF_model = tabula_NADE(activations=layers_act,
                                input_dim0=user_num,
                                input_dim1=rating_category,
                                other_dims=hidden_size,
                                batch_size=batch_size,
                                weights_init=Uniform(std=0.05),
                                biases_init=Constant(0.0))
    NADE_CF_model.push_initialization_config()

    # Override the weight initialisation of every linear layer with a
    # uniform distribution whose width depends on the layer's fan-in/out.
    dims = [user_num] + hidden_size + [user_num]
    linear_layers = [
        layer for layer in NADE_CF_model.children if 'linear' in layer.name
    ]
    assert len(linear_layers) == len(dims) - 1
    for layer, H1, H2 in zip(linear_layers, dims[:-1], dims[1:]):
        width = 2 * np.sqrt(6) / np.sqrt(H1 + H2)
        layer.weights_init = Uniform(width=width)
    NADE_CF_model.initialize()
    # Last parameter of the last child receives the frequency-based bias.
    NADE_CF_model.children[-1].parameters[-1].set_value(
        init_b.astype(theano.config.floatX))

    y = NADE_CF_model.apply(input_ratings_cum)
    y_cum = T.extra_ops.cumsum(y, axis=2)
    predicted_ratings = NDimensionalSoftmax().apply(y_cum, extra_ndim=1)

    d = input_masks.sum(axis=1)
    D = (input_masks + output_masks).sum(axis=1)
    (cost, nll, nll_item_ratings, cost_ordinal_1N, cost_ordinal_N1,
     prob_item_ratings) = rating_cost(y, output_ratings, input_masks,
                                      output_masks, D, d, alpha=alpha,
                                      std=std)
    cost.name = 'cost'

    # ---- Regularisation ---------------------------------------------------
    cg = ComputationGraph(cost)
    if weight_decay > 0.0:
        all_weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        l2_weights = T.sum([(W**2).sum() for W in all_weights])
        l2_cost = cost + weight_decay * l2_weights
        l2_cost.name = 'l2_decay_' + cost.name
        cg = ComputationGraph(l2_cost)
    if drop_rate > 0.0:
        dropped_layer = VariableFilter(
            roles=[INPUT], bricks=NADE_CF_model.children)(cg.variables)
        dropped_layer = [
            layer for layer in dropped_layer if 'linear' in layer.name
        ]
        # The first matching input is excluded from dropout.
        dropped_layer = dropped_layer[1:]
        cg_dropout = apply_dropout(cg, dropped_layer, drop_rate)
    else:
        cg_dropout = cg
    training_cost = cg_dropout.outputs[0]

    # ---- Optimiser --------------------------------------------------------
    lr0 = T.scalar(name='learning_rate', dtype=theano.config.floatX)
    input_list = [input_ratings, input_masks, output_ratings, output_masks]
    if optimizer == 'Adam':
        f_get_grad, f_update_parameters, shared_gradients = Adam_optimizer(
            input_list, training_cost, cg_dropout.parameters, lr0, b1, b2,
            epsilon)
    elif optimizer == 'Adadelta':
        # Adadelta receives the numeric `lr`, not the symbolic `lr0`, and
        # its update function is called without arguments below.
        f_get_grad, f_update_parameters, shared_gradients = (
            Adadelta_optimizer(input_list, training_cost,
                               cg_dropout.parameters, lr, epsilon))
    else:
        f_get_grad, f_update_parameters, shared_gradients = SGD_optimizer(
            input_list, training_cost, cg_dropout.parameters, lr0, b1)

    param_list = []
    for brick in NADE_CF_model.children:
        param_list.extend(brick.parameters)
    f_update_polyak, shared_polyak = polyak(param_list, mu=polyak_mu)

    f_monitor = theano.function(inputs=[input_ratings],
                                outputs=[predicted_ratings])

    # ---- Training loop with early stopping --------------------------------
    nb_of_epocs_without_improvement = 0
    # `np.inf` rather than the `np.Inf` alias, which NumPy 2.0 removed.
    best_valid_error = np.inf
    epoch = 0
    best_model = cp.deepcopy(NADE_CF_model)
    best_polyak = cp.deepcopy(shared_polyak)
    start_training_time = t.time()
    lr_tracer = []
    # Rating values 1..rating_category, used to turn class probabilities
    # into an expected rating.
    rate_score = np.arange(1, rating_category + 1, dtype=np.float32)
    best_epoch = -1
    while epoch < n_iter and nb_of_epocs_without_improvement < look_ahead:
        print('Epoch {0}'.format(epoch))
        epoch += 1
        start_time_epoch = t.time()
        cost_train = []
        squared_error_train = []
        n_sample_train = []
        train_time = 0
        for batch in train_loop_stream.get_epoch_iterator():
            inp_r, out_r, inp_m, out_m = batch
            train_t = t.time()
            cost_value = f_get_grad(inp_r, inp_m, out_r, out_m)
            train_time += t.time() - train_t
            if optimizer == 'Adadelta':
                f_update_parameters()
            else:
                f_update_parameters(lr)
            f_update_polyak()

            # Training RMSE is measured after the parameter update.
            pred_ratings = f_monitor(inp_r)
            true_r = out_r.argmax(axis=2) + 1
            pred_r = (pred_ratings[0] *
                      rate_score[np.newaxis, np.newaxis, :]).sum(axis=2)
            pred_r[:, new_items] = 3
            mask = out_r.sum(axis=2)
            se = np.sum(np.square(true_r - pred_r) * mask)
            n = np.sum(mask)
            squared_error_train.append(se)
            n_sample_train.append(n)
            cost_train.append(cost_value)

        cost_train = np.array(cost_train).mean()
        squared_error_ = np.array(squared_error_train).sum()
        n_samples = np.array(n_sample_train).sum()
        train_RMSE = np.sqrt(squared_error_ / (n_samples * 1.0 + 1e-8))

        print('\tTraining ...')
        print('Train :', "RMSE: {0:.6f}".format(train_RMSE),
              " Cost Error: {0:.6f}".format(cost_train),
              "Train Time: {0:.6f}".format(train_time),
              get_done_text(start_time_epoch))

        print('\tValidating ...')
        start_time = t.time()
        squared_error_valid = []
        n_sample_valid = []
        valid_time = 0
        for batch in valid_monitor_stream.get_epoch_iterator():
            inp_r, out_r, inp_m, out_m = batch
            valid_t = t.time()
            pred_ratings = f_monitor(inp_r)
            valid_time += t.time() - valid_t
            true_r = out_r.argmax(axis=2) + 1
            pred_r = (pred_ratings[0] *
                      rate_score[np.newaxis, np.newaxis, :]).sum(axis=2)
            pred_r[:, new_items] = 3
            mask = out_r.sum(axis=2)
            se = np.sum(np.square(true_r - pred_r) * mask)
            n = np.sum(mask)
            squared_error_valid.append(se)
            n_sample_valid.append(n)

        squared_error_ = np.array(squared_error_valid).sum()
        n_samples = np.array(n_sample_valid).sum()
        valid_RMSE = np.sqrt(squared_error_ / (n_samples * 1.0 + 1e-8))
        print('Validation:', " RMSE: {0:.6f}".format(valid_RMSE),
              "Valid Time: {0:.6f}".format(valid_time),
              get_done_text(start_time))

        if valid_RMSE < best_valid_error:
            best_epoch = epoch
            nb_of_epocs_without_improvement = 0
            best_valid_error = valid_RMSE
            # Drop the previous snapshots before copying the new ones.
            del best_model
            del best_polyak
            gc.collect()
            best_model = cp.deepcopy(NADE_CF_model)
            best_polyak = cp.deepcopy(shared_polyak)
            print('\n\n Got a good one')
        else:
            nb_of_epocs_without_improvement += 1
            if optimizer == 'Adadelta':
                pass
            elif (nb_of_epocs_without_improvement == look_ahead
                  and lr > 1e-5):
                # Instead of stopping, reset patience and shrink the
                # learning rate while it is still above 1e-5.
                nb_of_epocs_without_improvement = 0
                lr /= 4
                print("learning rate is now %s" % lr)
        lr_tracer.append(lr)

    print('\n### Training, n_layers=%d' % (len(hidden_size)),
          get_done_text(start_training_time))

    # ---- Prediction function for the best snapshot -------------------------
    best_y = best_model.apply(input_ratings_cum)
    best_y_cum = T.extra_ops.cumsum(best_y, axis=2)
    best_predicted_ratings = NDimensionalSoftmax().apply(best_y_cum,
                                                         extra_ndim=1)
    self.f_monitor_best = theano.function(inputs=[input_ratings],
                                          outputs=[best_predicted_ratings])
    self.best_valid_error = best_valid_error
    self.best_epoch = best_epoch
    self.best_model = best_model
    self.best_polyak = best_polyak
MaxPooling((2, 2), name='MaxPol1'), Convolutional(filter_size=(1, 1), num_filters=1024, name='Convx3'), Rectifier(), MaxPooling((2, 2), name='MaxPol2'), Convolutional(filter_size=(1, 1), num_filters=2, name='Convx4'), Rectifier(), ]) conv_sequence1 = ConvolutionalSequence(conv_layers1, num_channels=512, image_size=(10, 10), weights_init=Orthogonal(), use_bias=False, name='ConvSeq3') conv_sequence1.initialize() out_soft1 = Flattener(name='Flatt1').apply(conv_sequence1.apply(out5)) predict1 = NDimensionalSoftmax(name='Soft1').apply(out_soft1) cost1 = CategoricalCrossEntropy(name='Cross1').apply( y.flatten(), predict1).copy(name='cost1') #SECOND SOFTMAX conv_layers2 = list([ MaxPooling((2, 2), name='MaxPol2'), Convolutional(filter_size=(1, 1), num_filters=128, name='Convx21'), Rectifier(), MaxPooling((2, 2), name='MaxPol11'), Convolutional(filter_size=(1, 1), num_filters=1024, name='Convx31'), Rectifier(), MaxPooling((2, 2), name='MaxPol21'), Convolutional(filter_size=(1, 1), num_filters=2, name='Convx41'), Rectifier(), ])
H1 = dims[i] H2 = dims[i + 1] width = 2 * np.sqrt(6) / np.sqrt(H1 + H2) # std = np.sqrt(2. / dim) linear_layers[i].weights_init = Uniform(width=width) # NADE_CF_model.children[0].weights_init = Constant(1) # NADE_CF_model.children[0].biases_init = Constant(1.5) # NADE_CF_model.children[1].weights_init = Constant(2) # NADE_CF_model.children[1].biases_init = Constant(2.5) NADE_CF_model.initialize() NADE_CF_model.children[-1].parameters[-1].set_value( init_b.astype(theano.config.floatX)) y = NADE_CF_model.apply(input_ratings_cum) y_cum = T.extra_ops.cumsum(y, axis=2) predicted_ratings = NDimensionalSoftmax().apply(y_cum, extra_ndim=1) d = input_masks.sum(axis=1) D = (input_masks + output_masks).sum(axis=1) # ratings = T.tensor3(name="ratings", dtype=theano.config.floatX) cost, nll, nll_item_ratings, cost_ordinal_1N, cost_ordinal_N1, prob_item_ratings = rating_cost( y, output_ratings, input_masks, output_masks, D, d, alpha=alpha, std=std) cost.name = 'cost' cg = ComputationGraph(cost)
# ******************* Model ******************* recognizer = SimpleSpeechRecognizer(transition=transition, dims_transition=conf.dims_transition, num_features=num_features, num_classes=num_classes) #recognizer = SpeechRecognizer( # num_features=num_features, dims_bottom=[], # dims_bidir=conf.dims_transition, dims_top=[num_classes], # bidir_trans=GatedRecurrent, bottom_activation=None) # ******************* output ******************* y_hat = recognizer.apply(x,x_m) y_hat.name = 'outputs' y_hat_softmax = NDimensionalSoftmax().apply(y_hat, extra_ndim = y_hat.ndim - 2) y_hat_softmax.name = 'outputs_softmax' # there is a cost function for monitoring and for training, because one is more stable to compute # gradients and seems also to be more memory efficient, but does not compute the true cost. if conf.task=='CTC': cost_train = ctc.pseudo_cost(y, y_hat, y_m, x_m).mean() cost_train.name = "cost_train" cost_monitor = ctc.cost(y, y_hat_softmax, y_m, x_m).mean() cost_monitor.name = "cost_monitor" elif conf.task=='framewise': cost_train = categorical_crossentropy_batch().apply(y_hat_softmax, y, x_m) cost_train.name='cost' cost_monitor = cost_train else:
class FRNNEmitter(AbstractEmitter, Initializable, Random):
    """An RNN emitter for the case of real outputs.

    Each frame is produced in ``number_of_steps`` chunks of
    ``frnn_step_size`` values. For every chunk a small recurrent state
    yields the parameters of a ``k``-component mixture (``mu``, ``sigma``,
    ``coeff``); the state is then advanced using the chunk just produced
    (``emit``) or the corresponding ground-truth chunk (``cost``).

    Parameters
    ----------
    mlp : brick
        Applied to the readouts; its ``output_dim`` is the input size of
        the initial-state projection.
    target_size : int
        Last dimension used when flattening ``mu``/``sigma``/``coeff`` for
        the cost.
    frame_size : int
        Number of values in one emitted frame.
    k : int
        Number of mixture components.
    frnn_hidden_size : int
        Size of the inner recurrent state.
    frnn_step_size : int
        Number of values produced per inner step.
    const : float, optional
        Added to ``sigma`` and to the mixture coefficients.

    Notes
    -----
    NOTE(review): ``emit`` stacks the chunks on the last axis before
    flattening, and ``cost`` stacks ``mu``/``sigma`` on axis ``-2`` but
    repeats ``coeff`` after stacking. When ``number_of_steps > 1`` these
    layouts look inconsistent with each other - TODO confirm.
    """

    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size,
                 frnn_step_size, const=1e-5, **kwargs):
        super(FRNNEmitter, self).__init__(**kwargs)

        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim

        self.frnn_step_size = frnn_step_size

        # One extra (partial) step is added if the division is not exact;
        # `last_steps` is the size of that remainder.
        self.number_of_steps, self.last_steps = divmod(frame_size,
                                                       frnn_step_size)
        if self.last_steps != 0:
            self.number_of_steps += 1

        self.mu = MLP(activations=[Identity()],
                      dims=[frnn_hidden_size, k * frnn_step_size],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[frnn_hidden_size, k * frnn_step_size],
                         name=self.name + "_sigma")

        # `coeff` produces unnormalised scores, `coeff2` normalises them.
        self.coeff = MLP(activations=[Identity()],
                         dims=[frnn_hidden_size, k],
                         name=self.name + "_coeff")
        self.coeff2 = NDimensionalSoftmax()

        self.frnn_initial_state = Linear(
            input_dim=self.input_dim,
            output_dim=frnn_hidden_size,
            name="frnn_initial_state")

        self.frnn_activation = Tanh(name="frnn_activation")

        self.frnn_linear_transition_state = Linear(
            input_dim=frnn_hidden_size,
            output_dim=frnn_hidden_size,
            name="frnn_linear_transition_state")

        self.frnn_linear_transition_input = Linear(
            input_dim=self.frnn_step_size,
            output_dim=frnn_hidden_size,
            name="frnn_linear_transition_input")

        self.children = [
            self.mlp,
            self.mu,
            self.sigma,
            self.coeff,
            self.coeff2,
            self.frnn_initial_state,
            self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input,
        ]

    @application
    def emit(self, readouts):
        """Sample one frame given the readouts.

        Parameters
        ----------
        readouts : tensor
            Passed through ``self.mlp`` and the initial-state projection.

        Returns
        -------
        tensor
            Sampled values, truncated to ``frame_size`` on the last axis
            when ``frame_size`` is not a multiple of ``frnn_step_size``.
        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        results = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            mu = self.mu.apply(state)
            sigma = self.sigma.apply(state) + self.const
            coeff = self.coeff2.apply(self.coeff.apply(state),
                                      extra_ndim=state.ndim - 2) + self.const

            # Remember the leading shape so that the sampled chunk can be
            # reshaped back after the flattening done below.
            shape_result = coeff.shape
            shape_result = tensor.set_subtensor(shape_result[-1],
                                                self.frnn_step_size)
            ndim_result = coeff.ndim

            mu = mu.reshape((-1, self.frnn_step_size, self.k))
            sigma = sigma.reshape((-1, self.frnn_step_size, self.k))
            coeff = coeff.reshape((-1, self.k))

            # Pick one mixture component per row by sampling.
            sample_coeff = self.theano_rng.multinomial(pvals=coeff,
                                                       dtype=coeff.dtype)
            idx = predict(sample_coeff, axis=-1)
            # idx = predict(coeff, axis = -1) use this line for using most
            # likely coeff.

            # shapes (ls*bs)*(fs)
            mu = mu[tensor.arange(mu.shape[0]), :, idx]
            sigma = sigma[tensor.arange(sigma.shape[0]), :, idx]

            epsilon = self.theano_rng.normal(size=mu.shape,
                                             avg=0.0,
                                             std=1.0,
                                             dtype=mu.dtype)

            result = mu + sigma * epsilon  # *0.6 #reduce variance.
            result = result.reshape(shape_result, ndim=ndim_result)
            results.append(result)

            # The state is not advanced after the last chunk, so a partial
            # final chunk never has to be padded.
            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state) +
                    self.frnn_linear_transition_input.apply(result))

        results = tensor.stack(results, axis=-1)
        results = tensor.flatten(results, outdim=results.ndim - 1)

        # truncate if not good size
        if self.last_steps != 0:
            results = results[tuple([slice(0, None)] * (results.ndim - 1) +
                                    [slice(0, self.frame_size)])]

        return results

    @application
    def cost(self, readouts, outputs):
        """Return ``FRNN_NLL`` of ``outputs`` under the predicted mixture.

        Parameters
        ----------
        readouts : tensor
            Passed through ``self.mlp`` and the initial-state projection.
        outputs : tensor
            Ground-truth frames; also used chunk by chunk as the input of
            the inner recurrence (teacher forcing).
        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # Distribution parameters for the current chunk.
            freq_mu = self.mu.apply(state)
            freq_sigma = self.sigma.apply(state) + self.const
            freq_coeff = self.coeff2.apply(self.coeff.apply(state),
                                           extra_ndim=state.ndim - 2) + self.const

            freq_mu = freq_mu.reshape((-1, self.frnn_step_size, self.k))
            freq_sigma = freq_sigma.reshape((-1, self.frnn_step_size, self.k))
            freq_coeff = freq_coeff.reshape((-1, self.k))
            # mu,sigma: shape (-1,fs,k)
            # coeff: shape (-1,k)

            mus.append(freq_mu)
            sigmas.append(freq_sigma)
            coeffs.append(freq_coeff)

            if not last_iteration:
                # Advance the state with the i-th ground-truth chunk. The
                # offset must depend on `i`; a constant offset would feed
                # the same slice at every step.
                index = i * self.frnn_step_size
                freq_inputs = inputs[
                    tuple([slice(0, None)] * (inputs.ndim - 1) +
                          [slice(index, index + self.frnn_step_size)])]
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state) +
                    self.frnn_linear_transition_input.apply(freq_inputs))

        mus = tensor.stack(mus, axis=-2)
        sigmas = tensor.stack(sigmas, axis=-2)
        coeffs = tensor.stack(coeffs, axis=-2)

        mus = mus.reshape((-1, self.frnn_step_size * self.number_of_steps,
                           self.k))
        sigmas = sigmas.reshape((-1,
                                 self.frnn_step_size * self.number_of_steps,
                                 self.k))
        # One coefficient vector per step is shared by all values of the step.
        coeffs = coeffs.repeat(self.frnn_step_size, axis=-2)

        # Truncate to `frame_size` along the second-to-last axis.
        mus = mus[tuple([slice(0, None)] * (mus.ndim - 2) +
                        [slice(0, self.frame_size)] + [slice(0, None)])]
        sigmas = sigmas[tuple([slice(0, None)] * (sigmas.ndim - 2) +
                              [slice(0, self.frame_size)] + [slice(0, None)])]
        coeffs = coeffs[tuple([slice(0, None)] * (coeffs.ndim - 2) +
                              [slice(0, self.frame_size)] + [slice(0, None)])]
        # actually prob not necessary

        mu = mus.reshape((-1, self.target_size))
        sigma = sigmas.reshape((-1, self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL(y=outputs, mu=mu, sig=sigma, coeff=coeff,
                        frame_size=self.frame_size, k=self.k)

    @application
    def initial_outputs(self, batch_size):
        """Return an all-zero ``(batch_size, frame_size)`` tensor."""
        return tensor.zeros((batch_size, self.frame_size), dtype=floatX)

    def get_dim(self, name):
        """Return ``frame_size`` for "outputs", else defer to the parent."""
        # modification here to ensure the right dim.
        if name == "outputs":
            return self.frame_size
        return super(FRNNEmitter, self).get_dim(name)
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent and add a softmax child brick.

    The softmax brick is assigned to ``self.softmax`` before the parent
    constructor runs and is appended to ``self.children`` afterwards.
    """
    softmax_brick = NDimensionalSoftmax()
    self.softmax = softmax_brick
    super(MinRiskInitialContextSequenceGenerator, self).__init__(
        *args, **kwargs)
    self.children.append(self.softmax)
class ActorCriticReadout(SoftmaxReadout):
    """Actor-critic readout.

    ``scores`` gives the actor's log-probabilities; ``costs`` builds the
    sum of the critic cost, an optional value penalty and, when a critic
    brick is given, the actor cost.

    Parameters
    ----------
    reward_brick : brick
        Its ``apply`` is called with the prediction, the groundtruth and
        their masks; channel 0 of the result is used as the reward.
    compute_targets : bool
        When false, the value targets are replaced by a fresh symbolic
        matrix named 'value_targets'.
    compute_policy : bool
        Must be true; otherwise ``costs`` raises ``NotImplementedError``.
    solve_bellman : {True, 'no', 'without_dp'}
        Selects the value targets; any other value raises ``ValueError``.
    freeze_actor : bool
        When true the actor cost is added with its gradient disconnected.
    freeze_critic : bool
        When true the critic cost and the value penalty are not added.
    critic_uses_actor_states : bool
        When true the actor's 'states' input is passed to the critic as
        'extra_inputs'.
    critic_uses_groundtruth : bool or None
        ``None`` means ``True``. When false the critic receives zeros in
        place of the groundtruth.
    critic : brick, optional
        When given, its readout outputs are used instead of the merged
        actor inputs.
    critic_burnin_steps : int, optional
        Number of leading steps excluded from the summed costs. ``None``
        means 0.
    critic_policy_t : float, optional
        Multiplier applied to the values before the critic-policy softmax.
        ``None`` means 1.0.
    entropy_reward_coof, cross_entropy_reward_coof : float, optional
        Weights of the entropy and cross-entropy terms subtracted from the
        actor cost. ``None`` means 0.
    discount : float, optional
        Multiplier of the next-step value in the targets. ``None`` means 1.
    value_penalty : float, optional
        Weight of the squared deviation of the values from their mean.
    value_softmax, same_value_for_wrong : bool
        Stored on the instance; not read by the methods of this class.
    accumulate_outputs : bool
        When true the outputs are accumulated over time (``cumsum``).
    use_value_biases : bool or None
        ``None`` means ``True``. Enables the additive bias computed from
        ``attended``.
    actor_grad_estimate : str, optional
        'all_actions' (default when falsy) or a name starting with
        '1_action'.
    bos_token : int
        The token used to pad critic input. Critic needs to do
        at least one extra step compared to the actor in order
        to get the first glimpse of the ground-truth sequence
        before predicting the actual values.
    """

    def __init__(self, reward_brick, compute_targets, compute_policy, solve_bellman, freeze_actor, freeze_critic, critic_uses_actor_states, critic_uses_groundtruth, critic=None, critic_burnin_steps=None, critic_policy_t=None, entropy_reward_coof=None, cross_entropy_reward_coof=None, discount=None, value_penalty=None, value_softmax=False, same_value_for_wrong=False, accumulate_outputs=False, use_value_biases=None, actor_grad_estimate=None, bos_token=None, **kwargs):
        """Store the configuration and register the child bricks."""
        super(ActorCriticReadout, self).__init__(**kwargs)
        self.reward_brick = reward_brick
        self.critic = critic
        self.freeze_actor = freeze_actor
        self.freeze_critic = freeze_critic
        self.critic_uses_actor_states = critic_uses_actor_states
        # For the optional arguments below, `None` selects the default.
        self.critic_uses_groundtruth = (critic_uses_groundtruth if critic_uses_groundtruth is not None else True)
        self.critic_burnin_steps = (critic_burnin_steps if critic_burnin_steps is not None else 0)
        # The input dimension is filled in by `_push_allocation_config`.
        self.value_summand = Linear(output_dim=1, name='summand')
        self.softmax_t = 1.
        self.critic_policy_t = (critic_policy_t if critic_policy_t is not None else 1.0)
        self.epsilon = 0.
        self.discount = (discount if discount is not None else 1.)
        self.entropy_reward_coof = (entropy_reward_coof if entropy_reward_coof is not None else 0.)
        self.cross_entropy_reward_coof = (cross_entropy_reward_coof if cross_entropy_reward_coof is not None else 0.)
        self.value_penalty = value_penalty
        self.value_softmax = value_softmax
        self.same_value_for_wrong = same_value_for_wrong
        self.compute_targets = compute_targets
        self.compute_policy = compute_policy
        self.solve_bellman = solve_bellman
        self.accumulate_outputs = accumulate_outputs
        self.use_value_biases = (use_value_biases if use_value_biases is not None else True)
        # Any falsy value (not only None) selects 'all_actions'.
        self.actor_grad_estimate = (actor_grad_estimate if actor_grad_estimate else 'all_actions')
        self.bos_token = bos_token
        self.softmax = NDimensionalSoftmax()
        self.children += [reward_brick, self.value_summand, self.softmax]
        if self.critic:
            self.children.append(self.critic)
        # `costs` additionally needs the attended sequence and its mask.
        self.costs.inputs += ['attended', 'attended_mask']

    def _push_allocation_config(self):
        """Set the input size of `value_summand` to the 'attended' dim."""
        super(ActorCriticReadout, self)._push_allocation_config()
        self.value_summand.input_dim = self.get_dim('attended')

    @application
    def scores(self, **inputs):
        """Return log-probabilities of the merged inputs.

        Only the entries of `inputs` named in `self.merge_names` are used;
        the merged result is scaled by `self.softmax_t` before the
        log-softmax.
        """
        merged = self.merge(**dict_subset(inputs, self.merge_names))
        return self.softmax.log_probabilities(merged * self.softmax_t, extra_ndim=merged.ndim - 2)

    @application
    def costs(self, application_call, prediction, prediction_mask, groundtruth, groundtruth_mask, **inputs):
        """Build the total cost and attach monitoring variables.

        `inputs` must contain 'attended' and 'attended_mask' (both are
        popped and used with disconnected gradients) plus the entries
        required by `scores`. Returns `total_costs`, summed over axis 0.
        NOTE(review): axis 0 is presumably time and axis 1 the batch -
        confirm against the caller.
        """
        def _prediction_subtensor(data):
            # For a 3-D `data`, select along the last axis the entry indexed
            # by `prediction`; the result has the 2-D shape of `prediction`.
            if data.ndim != 3:
                raise ValueError
            flat_data = data.reshape(
                (data.shape[0] * data.shape[1], data.shape[2]))
            flat_data = flat_data[tensor.arange(flat_data.shape[0]), prediction.flatten()]
            return flat_data.reshape(
                (prediction.shape[0], prediction.shape[1]))

        attended = disconnected_grad(inputs.pop('attended'))
        attended_mask = disconnected_grad(inputs.pop('attended_mask'))

        # Compute the rewards
        rewards = self.reward_brick.apply(prediction, prediction_mask, groundtruth, groundtruth_mask)[:, :, 0]
        # Reverse cumulative sum along axis 0: the reward from each step on.
        future_rewards = rewards[::-1].cumsum(axis=0)[::-1]

        # Compute the critic outputs
        if self.critic:
            # Prepend one step filled with `bos_token` (mask 1) to the
            # prediction before it is given to the critic.
            padding = tensor.repeat(tensor.fill(prediction[0:1], self.bos_token), 1, axis=0)
            mask_padding = tensor.repeat(tensor.fill(prediction_mask[0:1], 1.), 1, axis=0)
            padded_prediction = tensor.concatenate([padding, prediction])
            padded_prediction_mask = tensor.concatenate(
                [mask_padding, prediction_mask])
            if self.critic_uses_groundtruth:
                critic_context = groundtruth
                critic_context_mask = groundtruth_mask
            else:
                critic_context = tensor.zeros_like(groundtruth[0:1])
                critic_context_mask = tensor.zeros_like(groundtruth_mask[0:1])
            critic_kwargs = dict(prediction=padded_prediction, prediction_mask=padded_prediction_mask, groundtruth=critic_context, groundtruth_mask=critic_context_mask, inputs=critic_context, inputs_mask=critic_context_mask)
            if self.critic_uses_actor_states:
                extra_inputs = disconnected_grad(inputs['states'])
                # We don't have the very last hidden state of the actor
                # in extra_inputs. We have to add something instead for the shapes
                # to match. It doesn't matter at all, what exactly we add.
                critic_kwargs['extra_inputs'] = tensor.concatenate(
                    [extra_inputs, tensor.zeros_like(extra_inputs[0:1])])
            # The critic's cost graph is built only to extract the outputs
            # of its readout from it.
            critic_cg = ComputationGraph(self.critic.costs(**critic_kwargs))
            outputs, = VariableFilter(
                applications=[self.critic.generator.readout.all_outputs],
                roles=[OUTPUT])(critic_cg)
            # The first subtensor should be discarded, because it was outputted
            # for the padding. In addition to that Q-values from the first
            # 'critic_burnin_steps' will be ignored, see later in the code.
            outputs = outputs[1:]
        else:
            outputs = self.merge(**dict_subset(inputs, self.merge_names))
        prediction_outputs = _prediction_subtensor(outputs)

        # Compute Q adjustments
        adjustments = outputs
        prediction_adjustments = prediction_outputs
        if self.accumulate_outputs:
            # Add to every step the running sum of the outputs selected at
            # the previous steps.
            prediction_adjustments = prediction_outputs.cumsum(axis=0)
            adjustments = tensor.inc_subtensor(
                adjustments[1:], prediction_adjustments[:-1][:, :, None])

        # Compute shared additive biases for all Q values
        if self.use_value_biases:
            value_biases = (self.value_summand.apply(attended)[:, :, 0] * attended_mask).sum(axis=0)
        else:
            value_biases = tensor.zeros_like(adjustments[0, :, 0])
        values = adjustments + value_biases[None, :, None]
        prediction_values = prediction_adjustments + value_biases[None, :]

        # Mask shifted by one step along axis 0, with the last step zeroed.
        rolled_prediction_mask = tensor.roll(prediction_mask, -1, axis=0)
        rolled_prediction_mask = tensor.set_subtensor(
            rolled_prediction_mask[-1], 0)

        # Compute probabilities
        logs = self.scores(use_epsilon=False, **inputs)
        probs = tensor.exp(logs)
        if not self.compute_policy:
            raise NotImplementedError("Not supported any more")
        prediction_logs = _prediction_subtensor(logs)

        # Compute value targets
        # Expected value under the (gradient-disconnected) policy, shifted
        # so that each step sees the next step's expectation.
        value_targets = (disconnected_grad(probs) * values).sum(axis=-1)
        value_targets = tensor.roll(value_targets, -1, axis=0)
        value_targets = (
            self.discount * value_targets * rolled_prediction_mask + rewards)
        value_targets = value_targets.astype(theano.config.floatX)

        total_costs = 0

        # Compute critic cost
        if not self.compute_targets:
            logger.debug("Using given targets")
            value_targets = tensor.matrix('value_targets')
        if self.solve_bellman == 'no':
            logger.debug("Not solving Bellman, just predicting the rewards")
            value_targets = rewards.copy(name='value_targets')
        elif self.solve_bellman == 'without_dp':
            future_rewards = rewards[::-1].cumsum(axis=0)[::-1]
            logger.debug("Solving Bellman, but without DP")
            value_targets = future_rewards
        elif self.solve_bellman is not True:
            raise ValueError()
        critic_costs_per_char = (
            (prediction_values - value_targets)**2) * prediction_mask
        critic_costs = critic_costs_per_char[self.critic_burnin_steps:].sum(
            axis=0)
        if not self.freeze_critic:
            total_costs += critic_costs

        # Compute critic Monte-Carlo cost
        # Only monitored (see the auxiliary variables below), never added
        # to `total_costs`.
        critic_monte_carlo_costs = (
            (((prediction_values - future_rewards)**2) * prediction_mask)[self.critic_burnin_steps:].sum(axis=0))

        # Value penalty
        if self.value_penalty:
            logger.debug("Use value penalty")
            value_deviations = (values - values.mean(axis=-1, keepdims=True))**2
            if not self.freeze_critic:
                total_costs += (
                    self.value_penalty * (value_deviations.sum(axis=-1) * prediction_mask)[self.critic_burnin_steps:].sum(axis=0))

        # Compute actor cost
        if self.critic:
            # The actor cost will be minimized, that's why values
            # must be negated.
            est_name = self.actor_grad_estimate
            if est_name == 'all_actions':
                # Gap between the best value and each action's value,
                # weighted by the policy probabilities.
                disadvantages = disconnected_grad(
                    values.max(axis=-1)[:, :, None] - values)
                actor_costs = ((probs * disadvantages).sum(axis=-1) * prediction_mask)
                actor_costs = actor_costs[self.critic_burnin_steps:]
            elif est_name.startswith('1_action'):
                # Here we do not provide a target for the first step for
                # the reason we lack an estimate of the value of the initial state.
                # This is how our critic works.
                # Hopefully the network won't unlearn
                # to produce a BOS first.
                future_reward_estimate = (future_rewards if est_name.endswith('unbiased') else prediction_values)
                weights = -disconnected_grad(future_reward_estimate[1:] + rewards[:-1] - prediction_values[:-1])
                actor_costs = ((prediction_logs[1:] * weights) * prediction_mask[1:])
                actor_costs = actor_costs[self.critic_burnin_steps + 1:]
            else:
                raise ValueError
            actor_costs = actor_costs.sum(axis=0)

            actor_entropies = (probs * -logs).sum(axis=-1) * prediction_mask
            actor_entropies = actor_entropies[self.critic_burnin_steps:].sum(
                axis=0)
            # Softmax over the scaled values, treated as a constant.
            critic_policy = disconnected_grad(
                self.softmax.apply(self.critic_policy_t * values, extra_ndim=1))
            critic_cross_entropies = ((critic_policy * -logs).sum(axis=-1) * prediction_mask)
            critic_cross_entropies = critic_cross_entropies[
                self.critic_burnin_steps:].sum(axis=0)
            actor_costs_with_penalties = (
                actor_costs - self.entropy_reward_coof * actor_entropies - self.cross_entropy_reward_coof * critic_cross_entropies)
            if not self.freeze_actor:
                total_costs += actor_costs_with_penalties
            else:
                # Keeps the value in the total but blocks its gradient.
                total_costs += disconnected_grad(actor_costs_with_penalties)

        # Add auxiliary variables for intermediate steps of the computation
        application_call.add_auxiliary_variable(rewards, name='rewards')
        application_call.add_auxiliary_variable(value_biases, name='value_biases')
        application_call.add_auxiliary_variable(values.copy(), name='values')
        application_call.add_auxiliary_variable(outputs.copy(), name='outputs')
        application_call.add_auxiliary_variable(prediction_values, name='prediction_values')
        application_call.add_auxiliary_variable(prediction_outputs, name='prediction_outputs')
        application_call.add_auxiliary_variable(value_targets.copy(), name='value_targets')
        application_call.add_auxiliary_variable(probs.copy(), name='probs')
        application_call.add_auxiliary_variable(prediction_logs, name='prediction_log_probs')

        # Compute some statistics for debugging
        # The mask difference is non-zero only where the mask is 1 at a step
        # and the shifted mask is 0.
        last_character_mask = prediction_mask - rolled_prediction_mask
        last_character_costs = (critic_costs_per_char * last_character_mask).sum(axis=0)
        mean2_output = (((prediction_outputs**2) * prediction_mask).sum() / prediction_mask.sum())**0.5
        max_output = abs(prediction_outputs * prediction_mask).max()
        expected_reward = (probs[0] * values[0]).sum(axis=-1)
        application_call.add_auxiliary_variable(last_character_costs, name='last_character_costs')
        application_call.add_auxiliary_variable(critic_costs.mean(), name='mean_critic_cost')
        application_call.add_auxiliary_variable(
            critic_monte_carlo_costs.mean(), name='mean_critic_monte_carlo_cost')
        # Actor statistics exist only when a critic was given.
        if self.critic:
            application_call.add_auxiliary_variable(actor_costs.mean(), name='mean_actor_cost')
            application_call.add_auxiliary_variable(actor_entropies.mean(), name='mean_actor_entropy')
        application_call.add_auxiliary_variable(expected_reward.mean(), name='mean_expected_reward')
        application_call.add_auxiliary_variable(mean2_output, name='mean2_output')
        application_call.add_auxiliary_variable(max_output, name='max_output')

        return total_costs
def __init__(self, reward_brick, compute_targets, compute_policy,
             solve_bellman, freeze_actor, freeze_critic,
             critic_uses_actor_states, critic_uses_groundtruth,
             critic=None, critic_burnin_steps=None, critic_policy_t=None,
             entropy_reward_coof=None, cross_entropy_reward_coof=None,
             discount=None, value_penalty=None, value_softmax=False,
             same_value_for_wrong=False, accumulate_outputs=False,
             use_value_biases=None, actor_grad_estimate=None,
             bos_token=None, **kwargs):
    """Store the actor-critic configuration and register child bricks.

    Most arguments are kept verbatim as attributes of the same name.
    Arguments documented with a default below are replaced by that
    default when they are passed as ``None``.

    Parameters
    ----------
    reward_brick
        Brick stored as ``self.reward_brick`` and registered as a child.
    compute_targets, compute_policy, solve_bellman
        Stored unchanged; they are not used in this constructor.
    freeze_actor, freeze_critic
        Stored unchanged.
    critic_uses_actor_states
        Stored unchanged.
    critic_uses_groundtruth
        Stored unchanged, except that ``None`` becomes ``True``.
    critic
        Optional critic. When truthy it is also registered as a child.
    critic_burnin_steps
        Defaults to ``0``.
    critic_policy_t
        Defaults to ``1.0``.
    entropy_reward_coof, cross_entropy_reward_coof
        Both default to ``0.``.
    discount
        Defaults to ``1.``.
    value_penalty, value_softmax, same_value_for_wrong, accumulate_outputs
        Stored unchanged.
    use_value_biases
        Stored unchanged, except that ``None`` becomes ``True``.
    actor_grad_estimate
        Any falsy value (not only ``None``) becomes ``'all_actions'``.
    bos_token
        Stored unchanged.
    **kwargs
        Forwarded to the parent class constructor.

    """
    def resolve(value, default):
        # Only ``None`` triggers the default; 0 / False / '' are kept.
        return default if value is None else value

    super(ActorCriticReadout, self).__init__(**kwargs)

    # Collaborating bricks.
    self.reward_brick = reward_brick
    self.critic = critic
    self.value_summand = Linear(output_dim=1, name='summand')
    self.softmax = NDimensionalSoftmax()

    # Options stored exactly as given.
    self.freeze_actor = freeze_actor
    self.freeze_critic = freeze_critic
    self.critic_uses_actor_states = critic_uses_actor_states
    self.value_penalty = value_penalty
    self.value_softmax = value_softmax
    self.same_value_for_wrong = same_value_for_wrong
    self.compute_targets = compute_targets
    self.compute_policy = compute_policy
    self.solve_bellman = solve_bellman
    self.accumulate_outputs = accumulate_outputs
    self.bos_token = bos_token

    # Options where ``None`` selects a default.
    self.critic_uses_groundtruth = resolve(critic_uses_groundtruth, True)
    self.critic_burnin_steps = resolve(critic_burnin_steps, 0)
    self.critic_policy_t = resolve(critic_policy_t, 1.0)
    self.discount = resolve(discount, 1.)
    self.entropy_reward_coof = resolve(entropy_reward_coof, 0.)
    self.cross_entropy_reward_coof = resolve(cross_entropy_reward_coof, 0.)
    self.use_value_biases = resolve(use_value_biases, True)
    # Deliberately truthiness-based: any falsy value selects the default.
    self.actor_grad_estimate = actor_grad_estimate or 'all_actions'

    # Constants that are not configurable through the constructor.
    self.softmax_t = 1.
    self.epsilon = 0.

    # Register children in a fixed order; the critic comes last and only
    # when one was supplied.
    owned_bricks = [reward_brick, self.value_summand, self.softmax]
    if self.critic:
        owned_bricks.append(self.critic)
    self.children += owned_bricks

    # The cost application additionally takes the attended sequence and
    # its mask as inputs.
    self.costs.inputs += ['attended', 'attended_mask']