def softmax_layer(h, y, x_mask, y_mask, lens, vocab_size, hidden_size, boosting):
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_size,
                              output_dim=vocab_size)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = 'linear_output'
    softmax = NDimensionalSoftmax()
    #y_hat = softmax.apply(linear_output, extra_ndim=1)
    #y_hat.name = 'y_hat'
    cost_a = softmax.categorical_cross_entropy(y, linear_output, extra_ndim=1)  # produces correct average
    cost_a = cost_a * y_mask

    if boosting:  # boosting step, must divide by length here
        lensMat = T.tile(lens, (y.shape[0], 1))
        cost_a = cost_a / lensMat

    # only count cost of correctly masked entries
    cost = cost_a.sum() / y_mask.sum()
    cost.name = 'cost'

    return (linear_output, cost)
def create_rnn(hidden_dim, vocab_dim, mode="rnn"):
    # input
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')

    #
    W = LookupTable(
        name="W1",
        #dim=hidden_dim*4,
        dim=hidden_dim,
        length=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0))

    if mode == "lstm":
        # Long Short Term Memory
        H = LSTM(
            hidden_dim,
            name='H',
            weights_init=initialization.IsotropicGaussian(0.01),
            biases_init=initialization.Constant(0.0))
    else:
        # recurrent history weight
        H = SimpleRecurrent(
            name="H",
            dim=hidden_dim,
            activation=Tanh(),
            weights_init=initialization.IsotropicGaussian(0.01))

    #
    S = Linear(
        name="W2",
        input_dim=hidden_dim,
        output_dim=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0))

    A = NDimensionalSoftmax(name="softmax")

    initLayers([W, H, S])
    activations = W.apply(x)
    hiddens = H.apply(activations)  #[0]
    activations2 = S.apply(hiddens)
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()

    cg = ComputationGraph(cost)
    #print VariableFilter(roles=[WEIGHT])(cg.variables)
    #W1, H, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)

    layers = (x, W, H, S, A, y)

    return cg, layers, y_hat, cost
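A minimal training sketch (my addition, not from the quoted project) showing how the graph returned by create_rnn above might be hooked into a Blocks optimizer; the dimensions and learning rate are placeholders.

# Hypothetical usage of create_rnn; assumes the imports and the initLayers
# helper used by the snippet above. Sizes and step rule are illustrative only.
from blocks.algorithms import GradientDescent, Scale

cg, layers, y_hat, cost = create_rnn(hidden_dim=256, vocab_dim=100, mode="lstm")
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.01))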
def softmax_layer(h, y, frame_length, hidden_size):
    hidden_to_output = Linear(name="hidden_to_output", input_dim=hidden_size,
                              output_dim=frame_length)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = "linear_output"
    softmax = NDimensionalSoftmax()
    y_hat = softmax.apply(linear_output, extra_ndim=1)
    y_hat.name = "y_hat"
    cost = softmax.categorical_cross_entropy(y, linear_output,
                                             extra_ndim=1).mean()
    cost.name = "cost"
    return y_hat, cost
class NewSoftmaxEmitter(AbstractEmitter, Initializable, Random):
    """A softmax emitter for the case of integer outputs.

    Interprets readout elements as energies corresponding to their indices.

    Parameters
    ----------
    initial_output : int or a scalar :class:`~theano.Variable`
        The initial output.

    """
    def __init__(self, initial_output=0, **kwargs):
        super(NewSoftmaxEmitter, self).__init__(**kwargs)
        self.initial_output = initial_output
        self.softmax = NDimensionalSoftmax()
        self.children = [self.softmax]
        self.name = 'newbidirectional'

    @application
    def probs(self, readouts):
        return self.softmax.apply(readouts, extra_ndim=readouts.ndim - 2)

    @application
    def emitProbs(self, readouts):
        probs = self.probs(readouts)
        batch_size = probs.shape[0]
        self.pvals_flat = probs.reshape((batch_size, -1))
        generated = self.theano_rng.multinomial(pvals=self.pvals_flat)
        return self.pvals_flat

    @application
    def emit(self, readouts):
        probs = self.probs(readouts)
        batch_size = probs.shape[0]
        self.pvals_flat = probs.reshape((batch_size, -1))
        generated = self.theano_rng.multinomial(pvals=self.pvals_flat)
        winning_index = generated.reshape(probs.shape).argmax(axis=-1)
        return winning_index, self.pvals_flat[0][winning_index]

    @application
    def cost(self, readouts, outputs):
        # WARNING: unfortunately this application method works
        # just fine when `readouts` and `outputs` have
        # different dimensions. Be careful!
        return self.softmax.categorical_cross_entropy(
            outputs, readouts, extra_ndim=readouts.ndim - 2)

    @application
    def initial_outputs(self, batch_size):
        return self.initial_output * tensor.ones((batch_size,), dtype='int64')

    def get_dim(self, name):
        if name == 'outputs':
            return 0
        # was `super(SoftmaxEmitter, ...)`, which is the wrong class here
        return super(NewSoftmaxEmitter, self).get_dim(name)
def softmax_layer(h, y, vocab_size, hidden_size):
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_size,
                              output_dim=vocab_size)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = 'linear_output'
    softmax = NDimensionalSoftmax()
    y_hat = softmax.apply(linear_output, extra_ndim=1)
    y_hat.name = 'y_hat'
    cost = softmax.categorical_cross_entropy(
        y, linear_output, extra_ndim=1).mean()
    cost.name = 'cost'
    return y_hat, cost
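A small usage sketch (my addition) of the softmax_layer above, assuming the same Blocks/Theano imports and the project's initialize helper; h plays the role of an RNN brick's hidden-state sequence.

# Hypothetical call site for softmax_layer; `h` would normally come from an
# RNN brick with shape (time, batch, hidden_size), `y` holds integer targets
# of shape (time, batch). The sizes below are placeholders.
from theano import tensor

h = tensor.tensor3('hidden_states')
y = tensor.lmatrix('targets')
y_hat, cost = softmax_layer(h, y, vocab_size=50, hidden_size=128)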
class SoftmaxEmitter(AbstractEmitter, Initializable, Random):
    """A softmax emitter for the case of integer outputs.

    Interprets readout elements as energies corresponding to their indices.

    Parameters
    ----------
    initial_output : int or a scalar :class:`~theano.Variable`
        The initial output.

    """
    def __init__(self, initial_output=0, **kwargs):
        super(SoftmaxEmitter, self).__init__(**kwargs)
        self.initial_output = initial_output
        self.softmax = NDimensionalSoftmax()
        self.children = [self.softmax]

    @application
    def probs(self, readouts):
        return self.softmax.apply(readouts, extra_ndim=readouts.ndim - 2)

    @application
    def emit(self, readouts):
        probs = self.probs(readouts)
        batch_size = probs.shape[0]
        pvals_flat = probs.reshape((batch_size, -1))
        generated = self.theano_rng.multinomial(pvals=pvals_flat)
        return generated.reshape(probs.shape).argmax(axis=-1)

    @application
    def cost(self, readouts, outputs):
        # WARNING: unfortunately this application method works
        # just fine when `readouts` and `outputs` have
        # different dimensions. Be careful!
        return self.softmax.categorical_cross_entropy(
            outputs, readouts, extra_ndim=readouts.ndim - 2)

    @application
    def costs(self, readouts):
        return -self.softmax.log_probabilities(
            readouts, extra_ndim=readouts.ndim - 2)

    @application
    def initial_outputs(self, batch_size):
        return self.initial_output * tensor.ones((batch_size,), dtype='int64')

    def get_dim(self, name):
        if name == 'outputs':
            return 0
        return super(SoftmaxEmitter, self).get_dim(name)
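For orientation (my addition, not part of the quoted source): in Blocks a SoftmaxEmitter is usually handed to a Readout inside a SequenceGenerator; a rough sketch of that wiring, with made-up sizes, might look like this.

# Hypothetical wiring of SoftmaxEmitter into a Blocks SequenceGenerator;
# the vocabulary size (100), feedback dim (64) and state dim (64) are invented.
from blocks.bricks import Tanh
from blocks.bricks.recurrent import GatedRecurrent
from blocks.bricks.sequence_generators import (
    SequenceGenerator, Readout, LookupFeedback)

generator = SequenceGenerator(
    Readout(readout_dim=100, source_names=['states'],
            emitter=SoftmaxEmitter(name='emitter'),
            feedback_brick=LookupFeedback(100, 64),
            name='readout'),
    GatedRecurrent(dim=64, activation=Tanh(), name='transition'),
    name='generator')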
def __init__(self, input1_size, input2_size, lookup1_dim=200, lookup2_dim=200,
             hidden_size=512):
    self.hidden_size = hidden_size
    self.input1_size = input1_size
    self.input2_size = input2_size
    self.lookup1_dim = lookup1_dim
    self.lookup2_dim = lookup2_dim

    x1 = tensor.lmatrix('durations')
    x2 = tensor.lmatrix('syllables')
    y = tensor.lmatrix('pitches')

    lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size,
                          name='lookup1',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup1.initialize()
    lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size,
                          name='lookup2',
                          weights_init=initialization.Uniform(width=0.01),
                          biases_init=Constant(0))
    lookup2.initialize()

    merge = Merge(['lookup1', 'lookup2'],
                  [self.lookup1_dim, self.lookup2_dim],
                  self.hidden_size,
                  weights_init=initialization.Uniform(width=0.01),
                  biases_init=Constant(0))
    merge.initialize()

    recurrent_block = LSTM(dim=self.hidden_size, activation=Tanh(),
                           weights_init=initialization.Uniform(width=0.01))
    #RecurrentStack([LSTM(dim=self.hidden_size, activation=Tanh())] * 3)
    recurrent_block.initialize()

    linear = Linear(input_dim=self.hidden_size, output_dim=self.input1_size,
                    weights_init=initialization.Uniform(width=0.01),
                    biases_init=Constant(0))
    linear.initialize()
    softmax = NDimensionalSoftmax()

    l1 = lookup1.apply(x1)
    l2 = lookup2.apply(x2)
    m = merge.apply(l1, l2)
    h = recurrent_block.apply(m)
    a = linear.apply(h)

    y_hat = softmax.apply(a, extra_ndim=1)  # ValueError: x must be 1-d or 2-d tensor of floats. Got TensorType(float64, 3D)

    self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean()

    self.ComputationGraph = ComputationGraph(self.Cost)

    self.Model = Model(y_hat)
def softmax_output_layer(x, h, y, in_size, out_size, hidden_size, pred):
    if connect_h_to_o:
        hidden_to_output = Linear(name='hidden_to_output' + str(pred),
                                  input_dim=hidden_size * len(h),
                                  output_dim=out_size)
        hiddens = T.concatenate([hidden for hidden in h], axis=2)
    else:
        hidden_to_output = Linear(name='hidden_to_output' + str(pred),
                                  input_dim=hidden_size,
                                  output_dim=out_size)
        hiddens = h[-1]
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(hiddens)
    linear_output.name = 'linear_output'
    softmax = NDimensionalSoftmax()
    extra_ndim = 1 if single_dim_out else 2
    y_hat = softmax.apply(linear_output, extra_ndim=extra_ndim)
    cost = softmax.categorical_cross_entropy(y, linear_output,
                                             extra_ndim=extra_ndim).mean()
    return y_hat, cost
def softmax_layer(self, h, y):
    """
    Perform Softmax over the hidden state in order to predict the next word
    in the sequence and compute the loss.

    :param h: The hidden state sequence
    :param y: The target words
    """
    hidden_to_output = Linear(name='hidden_to_output',
                              input_dim=self.hidden_size,
                              output_dim=self.vocab_size)
    initialize(hidden_to_output,
               sqrt(6.0 / (self.hidden_size + self.vocab_size)))

    linear_output = hidden_to_output.apply(h)
    linear_output.name = 'linear_output'
    softmax = NDimensionalSoftmax(name="lm_softmax")
    y_hat = softmax.log_probabilities(linear_output, extra_ndim=1)
    y_hat.name = 'y_hat'

    cost = softmax.categorical_cross_entropy(y, linear_output,
                                             extra_ndim=1).mean()
    cost.name = 'cost'
    return y_hat, cost
class CharRNNModel(Initializable):
    """
    A model for testing that the components of my more complex models work.
    This is just a model that predicts one character at a time using a LSTM layer
    """
    def __init__(self, config_dict, init_type="xavier", **kwargs):

        super(CharRNNModel, self).__init__(**kwargs)

        self.batch_size = config_dict["batch_size"]
        self.num_subwords = config_dict["num_subwords"]
        self.num_words = config_dict["num_words"]
        self.subword_embedding_size = config_dict["subword_embedding_size"]
        self.input_vocab_size = config_dict["input_vocab_size"]
        self.output_vocab_size = config_dict["output_vocab_size"]
        self.subword_RNN_hidden_state_size = config_dict["subword_RNN_hidden_state_size"]
        self.table_width = config_dict["table_width"]
        self.max_out_dim = config_dict["max_out_dim"]
        self.max_out_K = config_dict["max_out_K"]

        self.lookup = LookupTable(length=self.input_vocab_size,
                                  dim=self.subword_embedding_size,
                                  name="input_lookup")
        self.lookup.weights_init = Uniform(width=self.table_width)
        self.lookup.biases_init = Constant(0)

        if init_type == "xavier":
            linear_init = XavierInitializationOriginal(
                self.subword_embedding_size,
                self.subword_RNN_hidden_state_size)
            lstm_init = XavierInitializationOriginal(
                self.subword_embedding_size,
                self.subword_RNN_hidden_state_size)
        else:  # default is gaussian
            linear_init = IsotropicGaussian()
            lstm_init = IsotropicGaussian()

        # The `inputs` are then split in this order: Input gates, forget gates,
        # cells and output gates
        self.linear_forward = Linear(
            input_dim=self.subword_embedding_size,
            output_dim=self.subword_RNN_hidden_state_size * 4,
            name="linear_forward",
            weights_init=linear_init,
            biases_init=Constant(0.0))

        self.language_model = LSTM(
            dim=self.subword_RNN_hidden_state_size,
            activation=Tanh(),
            name="language_model_RNN",
            weights_init=lstm_init,
            biases_init=Constant(0.0))

        self.max_out = LinearMaxout(
            self.subword_RNN_hidden_state_size,
            self.max_out_dim,
            self.max_out_K,
            name="max_out",
            weights_init=IsotropicGaussian(),
            biases_init=Constant(0.0))

        self.softmax_linear = Linear(
            self.max_out_dim,
            self.output_vocab_size,
            name="soft_max_linear",
            weights_init=IsotropicGaussian(),
            biases_init=Constant(0.0))

        self.softmax = NDimensionalSoftmax()

        self.children = [
            self.lookup,
            self.linear_forward,
            self.language_model,
            self.max_out,
            self.softmax_linear,
            self.softmax,
        ]

    @application(inputs=["features", "features_mask", "targets", "targets_mask"],
                 outputs=["cost"])
    def apply(self, features, features_mask, targets, targets_mask):

        subword_embeddings = self.lookup.apply(features)
        sentence_embeddings = self.language_model.apply(
            self.linear_forward.apply(subword_embeddings),
            mask=features_mask)[0]  # [0] = hidden states, [1] = cells

        linear_output = self.softmax_linear.apply(
            self.max_out.apply(sentence_embeddings))
        cost = self.softmax.categorical_cross_entropy(
            targets, linear_output, extra_ndim=1).mean()
        cost.name = "cost"
        return ((cost * targets_mask).sum()) / targets_mask.sum()
linear_output = Linear(name='linear_output', input_dim=hidden_layer_dim,
                       output_dim=train_dataset.durations_vocab_size(),
                       weights_init=initialization.Uniform(width=0.01),
                       biases_init=Constant(0))
linear_output.initialize()

softmax = NDimensionalSoftmax(name='ndim_softmax')

activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)

cost = softmax.categorical_cross_entropy(y, activation_output,
                                         extra_ndim=1).mean()

from blocks.graph import ComputationGraph
# RMSProp, StepClipping and CompositeRule are used below, so import them too
from blocks.algorithms import (GradientDescent, Adam, RMSProp,
                               StepClipping, CompositeRule)

cg = ComputationGraph([cost])

step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95),
              StepClipping(1.0)]
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules),
                            on_unused_sources='ignore')

from blocks.extensions import Timing, FinishAfter, Printing, ProgressBar
from blocks.extensions.monitoring import TrainingDataMonitoring
class ExtractiveQAModel(Initializable):
    """The dictionary-equipped extractive QA model.

    Parameters
    ----------
    dim : int
        The default dimensionality for the components.
    emb_dim : int
        The dimensionality for the embeddings. If 0, `dim` is used.
    coattention : bool
        Use the coattention mechanism.
    num_input_words : int
        The number of input words. If 0, `vocab.size()` is used.
    vocab
        The vocabulary object.
    use_definitions : bool
        Triggers the use of definitions.
    reuse_word_embeddings : bool
    compose_type : str

    """

    def __init__(self, dim, emb_dim, readout_dims, num_input_words,
                 def_num_input_words, vocab, use_definitions, def_word_gating,
                 compose_type, coattention, def_reader, reuse_word_embeddings,
                 random_unk, **kwargs):
        self._vocab = vocab
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        self._coattention = coattention
        self._num_input_words = num_input_words
        self._use_definitions = use_definitions
        self._random_unk = random_unk
        self._reuse_word_embeddings = reuse_word_embeddings

        lookup_num_words = num_input_words
        if reuse_word_embeddings:
            lookup_num_words = max(num_input_words, def_num_input_words)
        if random_unk:
            lookup_num_words = vocab.size()

        # Dima: we can have slightly less copy-paste here if we
        # copy the RecurrentFromFork class from my other projects.
        children = []
        self._lookup = LookupTable(lookup_num_words, emb_dim)
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        self._question_transform = Linear(dim, dim, name='question_transform')
        self._bidir_fork = Linear(3 * dim if coattention else 2 * dim,
                                  4 * dim, name='bidir_fork')
        self._bidir = Bidirectional(LSTM(dim), name='bidir')
        children.extend([self._lookup, self._encoder_fork, self._encoder_rnn,
                         self._question_transform, self._bidir,
                         self._bidir_fork])

        activations = [Rectifier()] * len(readout_dims) + [None]
        readout_dims = [2 * dim] + readout_dims + [1]
        self._begin_readout = MLP(activations, readout_dims,
                                  name='begin_readout')
        self._end_readout = MLP(activations, readout_dims, name='end_readout')
        self._softmax = NDimensionalSoftmax()
        children.extend(
            [self._begin_readout, self._end_readout, self._softmax])

        if self._use_definitions:
            # A potential bug here: we pass the same vocab to the def reader.
            # If a different token is reserved for UNK in text and in the
            # definitions, we can be screwed.
            def_reader_class = eval(def_reader)
            def_reader_kwargs = dict(
                num_input_words=def_num_input_words,
                dim=dim,
                emb_dim=emb_dim,
                vocab=vocab,
                lookup=self._lookup if reuse_word_embeddings else None)
            if def_reader_class == MeanPoolReadDefinitions:
                def_reader_kwargs.update(dict(normalize=True, translate=False))
            self._def_reader = def_reader_class(**def_reader_kwargs)
            self._combiner = MeanPoolCombiner(dim=dim,
                                              emb_dim=emb_dim,
                                              def_word_gating=def_word_gating,
                                              compose_type=compose_type)
            children.extend([self._def_reader, self._combiner])

        super(ExtractiveQAModel, self).__init__(children=children, **kwargs)

        # create default input variables
        self.contexts = tensor.lmatrix('contexts')
        self.context_mask = tensor.matrix('contexts_mask')
        self.questions = tensor.lmatrix('questions')
        self.question_mask = tensor.matrix('questions_mask')
        self.answer_begins = tensor.lvector('answer_begins')
        self.answer_ends = tensor.lvector('answer_ends')
        input_vars = [
            self.contexts, self.context_mask, self.questions,
            self.question_mask, self.answer_begins, self.answer_ends
        ]
        if self._use_definitions:
            self.defs = tensor.lmatrix('defs')
            self.def_mask = tensor.matrix('def_mask')
            self.contexts_def_map = tensor.lmatrix('contexts_def_map')
            self.questions_def_map = tensor.lmatrix('questions_def_map')
            input_vars.extend([
                self.defs, self.def_mask, self.contexts_def_map,
                self.questions_def_map
            ])
        self.input_vars = OrderedDict([(var.name, var) for var in input_vars])

    def set_embeddings(self, embeddings):
        self._lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def embeddings_var(self):
        return self._lookup.parameters[0]

    def def_reading_parameters(self):
        parameters = Selector(self._def_reader).get_parameters().values()
        parameters.extend(Selector(self._combiner).get_parameters().values())
        if self._reuse_word_embeddings:
            lookup_parameters = Selector(
                self._lookup).get_parameters().values()
            parameters = [p for p in parameters if p not in lookup_parameters]
        return parameters

    @application
    def _encode(self, application_call, text, mask, def_embs=None,
                def_map=None, text_name=None):
        if not self._random_unk:
            text = (tensor.lt(text, self._num_input_words) * text +
                    tensor.ge(text, self._num_input_words) * self._vocab.unk)
        if text_name:
            application_call.add_auxiliary_variable(
                unk_ratio(text, mask, self._vocab.unk),
                name='{}_unk_ratio'.format(text_name))
        embs = self._lookup.apply(text)
        if self._random_unk:
            embs = (tensor.lt(text, self._num_input_words)[:, :, None] * embs +
                    tensor.ge(text, self._num_input_words)[:, :, None] *
                    disconnected_grad(embs))
        if def_embs:
            embs = self._combiner.apply(embs, mask, def_embs, def_map)
        add_role(embs, EMBEDDINGS)
        encoded = flip01(
            self._encoder_rnn.apply(self._encoder_fork.apply(flip01(embs)),
                                    mask=mask.T)[0])
        return encoded

    @application
    def apply(self, application_call, contexts, contexts_mask, questions,
              questions_mask, answer_begins, answer_ends, defs=None,
              def_mask=None, contexts_def_map=None, questions_def_map=None):
        def_embs = None
        if self._use_definitions:
            def_embs = self._def_reader.apply(defs, def_mask)

        context_enc = self._encode(contexts, contexts_mask, def_embs,
                                   contexts_def_map, 'context')
        question_enc_pre = self._encode(questions, questions_mask, def_embs,
                                        questions_def_map, 'question')
        question_enc = tensor.tanh(
            self._question_transform.apply(question_enc_pre))

        # should be (batch size, context length, question_length)
        affinity = tensor.batched_dot(context_enc, flip12(question_enc))
        affinity_mask = contexts_mask[:, :, None] * questions_mask[:, None, :]
        affinity = affinity * affinity_mask - 1000.0 * (1 - affinity_mask)
        # soft-aligns every position in the context to positions in the question
        d2q_att_weights = self._softmax.apply(affinity, extra_ndim=1)
        application_call.add_auxiliary_variable(d2q_att_weights.copy(),
                                                name='d2q_att_weights')
        # soft-aligns every position in the question to positions in the document
        q2d_att_weights = self._softmax.apply(flip12(affinity), extra_ndim=1)
        application_call.add_auxiliary_variable(q2d_att_weights.copy(),
                                                name='q2d_att_weights')

        # question encoding "in the view of the document"
        question_enc_informed = tensor.batched_dot(q2d_att_weights,
                                                   context_enc)
        question_enc_concatenated = tensor.concatenate(
            [question_enc, question_enc_informed], 2)
        # document encoding "in the view of the question"
        context_enc_informed = tensor.batched_dot(d2q_att_weights,
                                                  question_enc_concatenated)

        if self._coattention:
            context_enc_concatenated = tensor.concatenate(
                [context_enc, context_enc_informed], 2)
        else:
            question_repr_repeated = tensor.repeat(question_enc[:, [-1], :],
                                                   context_enc.shape[1],
                                                   axis=1)
            context_enc_concatenated = tensor.concatenate(
                [context_enc, question_repr_repeated], 2)

        # note: forward and backward LSTMs share the
        # input weights in the current impl
        bidir_states = flip01(
            self._bidir.apply(self._bidir_fork.apply(
                flip01(context_enc_concatenated)),
                mask=contexts_mask.T)[0])

        begin_readouts = self._begin_readout.apply(bidir_states)[:, :, 0]
        begin_readouts = begin_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        begin_costs = self._softmax.categorical_cross_entropy(
            answer_begins, begin_readouts)

        end_readouts = self._end_readout.apply(bidir_states)[:, :, 0]
        end_readouts = end_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        end_costs = self._softmax.categorical_cross_entropy(
            answer_ends, end_readouts)

        predicted_begins = begin_readouts.argmax(axis=-1)
        predicted_ends = end_readouts.argmax(axis=-1)
        exact_match = (tensor.eq(predicted_begins, answer_begins) *
                       tensor.eq(predicted_ends, answer_ends))
        application_call.add_auxiliary_variable(predicted_begins,
                                                name='predicted_begins')
        application_call.add_auxiliary_variable(predicted_ends,
                                                name='predicted_ends')
        application_call.add_auxiliary_variable(exact_match,
                                                name='exact_match')

        return begin_costs + end_costs

    def apply_with_default_vars(self):
        return self.apply(*self.input_vars.values())
class LanguageModel(Initializable):
    """The dictionary-equipped language model.

    Parameters
    ----------
    emb_dim : int
        The dimension of word embeddings (including for def model if standalone)
    dim : int
        The dimension of the RNNs states (including for def model if standalone)
    num_input_words : int
        The size of the LM's input vocabulary.
    num_output_words : int
        The size of the LM's output vocabulary.
    vocab
        The vocabulary object.
    retrieval
        The dictionary retrieval algorithm. If `None`, the language model
        does not use any dictionary.
    def_reader : either 'LSTM' or 'mean'
    standalone_def_rnn : bool
        If `True`, a standalone RNN with separate word embeddings is used
        to embed definition. If `False` the language model is reused.
    disregard_word_embeddings : bool
        If `True`, the word embeddings are not used, only the information
        from the definitions is used.
    compose_type : str
        If 'sum', the definition and word embeddings are averaged
        If 'fully_connected_linear', a learned perceptron composes the 2
        embeddings linearly
        If 'fully_connected_relu', ...
        If 'fully_connected_tanh', ...

    """
    def __init__(self, emb_dim, emb_def_dim, dim, num_input_words,
                 def_num_input_words, num_output_words, vocab,
                 retrieval=None, def_reader='LSTM',
                 standalone_def_lookup=True, standalone_def_rnn=True,
                 disregard_word_embeddings=False, compose_type='sum',
                 very_rare_threshold=[10], cache_size=0, **kwargs):
        # TODO(tombosc): document
        if emb_dim == 0:
            emb_dim = dim
        if emb_def_dim == 0:
            emb_def_dim = emb_dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        if (num_input_words != def_num_input_words) and (not standalone_def_lookup):
            raise NotImplementedError()

        self._very_rare_threshold = very_rare_threshold
        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab
        self._retrieval = retrieval
        self._disregard_word_embeddings = disregard_word_embeddings
        self._compose_type = compose_type

        self._word_to_id = WordToIdOp(self._vocab)
        self._word_to_count = WordToCountOp(self._vocab)

        children = []
        self._cache = None
        if cache_size > 0:
            #TODO(tombosc) do we implement cache as LookupTable or theano matrix?
            #self._cache = theano.shared(np.zeros((def_num_input_words, emb_dim)))
            self._cache = LookupTable(cache_size, emb_dim,
                                      name='cache_def_embeddings')
            children.append(self._cache)

        if self._retrieval:
            self._retrieve = RetrievalOp(retrieval)

        self._main_lookup = LookupTable(self._num_input_words, emb_dim,
                                        name='main_lookup')
        self._main_fork = Linear(emb_dim, 4 * dim, name='main_fork')
        self._main_rnn = DebugLSTM(dim, name='main_rnn')  # TODO(tombosc): use regular LSTM?
        children.extend([self._main_lookup, self._main_fork, self._main_rnn])

        if self._retrieval:
            if standalone_def_lookup:
                lookup = None
            else:
                if emb_dim != emb_def_dim:
                    raise ValueError(
                        "emb_dim != emb_def_dim: cannot share lookup")
                lookup = self._main_lookup

            if def_reader == 'LSTM':
                if standalone_def_rnn:
                    fork_and_rnn = None
                else:
                    fork_and_rnn = (self._main_fork, self._main_rnn)
                self._def_reader = LSTMReadDefinitions(def_num_input_words,
                                                       emb_def_dim, dim, vocab,
                                                       lookup, fork_and_rnn,
                                                       cache=self._cache)
            elif def_reader == 'mean':
                self._def_reader = MeanPoolReadDefinitions(
                    def_num_input_words, emb_def_dim, dim, vocab, lookup,
                    translate=(emb_def_dim != dim), normalize=False)
            else:
                raise Exception("def reader not understood")

            self._combiner = MeanPoolCombiner(dim=dim, emb_dim=emb_dim,
                                              compose_type=compose_type)

            children.extend([self._def_reader, self._combiner])

        self._pre_softmax = Linear(dim, self._num_output_words)
        self._softmax = NDimensionalSoftmax()
        children.extend([self._pre_softmax, self._softmax])

        super(LanguageModel, self).__init__(children=children, **kwargs)

    def _push_initialization_config(self):
        super(LanguageModel, self)._push_initialization_config()
        if self._cache:
            self._cache.weights_init = Constant(0.)

    def set_def_embeddings(self, embeddings):
        self._def_reader._def_lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def get_def_embeddings_params(self):
        return self._def_reader._def_lookup.parameters[0]

    def get_cache_params(self):
        return self._cache.W

    def add_perplexity_measure(self, application_call, minus_logs, mask, name):
        costs = (minus_logs * mask).sum(axis=0)
        perplexity = tensor.exp(costs.sum() / mask.sum())
        perplexity.tag.aggregation_scheme = Perplexity(costs.sum(), mask.sum())
        full_name = "perplexity_" + name
        application_call.add_auxiliary_variable(perplexity, name=full_name)
        return costs

    @application
    def apply(self, application_call, words, mask):
        """Compute the log-likelihood for a batch of sequences.

        words
            An integer matrix of shape (B, T), where T is the number of time
            steps, B is the batch size. Note that this order of the axes is
            different from what all RNN bricks consume, hence the axes should
            be transposed at some point.
        mask
            A float32 matrix of shape (B, T). Zeros indicate the padding.

        """
        if self._retrieval:
            defs, def_mask, def_map = self._retrieve(words)
            def_embeddings = self._def_reader.apply(defs, def_mask)

            # Auxililary variable for debugging
            application_call.add_auxiliary_variable(def_embeddings.shape[0],
                                                    name="num_definitions")

        word_ids = self._word_to_id(words)

        # shortlisting
        input_word_ids = (
            tensor.lt(word_ids, self._num_input_words) * word_ids +
            tensor.ge(word_ids, self._num_input_words) * self._vocab.unk)
        output_word_ids = (
            tensor.lt(word_ids, self._num_output_words) * word_ids +
            tensor.ge(word_ids, self._num_output_words) * self._vocab.unk)

        application_call.add_auxiliary_variable(
            unk_ratio(input_word_ids, mask, self._vocab.unk),
            name='unk_ratio')

        # Run the main rnn with combined inputs
        word_embs = self._main_lookup.apply(input_word_ids)
        application_call.add_auxiliary_variable(
            masked_root_mean_square(word_embs, mask), name='word_emb_RMS')

        if self._retrieval:
            rnn_inputs, updated, positions = self._combiner.apply(
                word_embs, mask, def_embeddings, def_map)
        else:
            rnn_inputs = word_embs

        updates = []
        if self._cache:
            flat_word_ids = word_ids.flatten()
            flat_word_ids_to_update = flat_word_ids[positions]
            # computing updates for cache
            updates = [
                (self._cache.W,
                 tensor.set_subtensor(self._cache.W[flat_word_ids_to_update],
                                      updated))
            ]

        application_call.add_auxiliary_variable(
            masked_root_mean_square(word_embs, mask), name='main_rnn_in_RMS')

        main_rnn_states = self._main_rnn.apply(
            tensor.transpose(self._main_fork.apply(rnn_inputs), (1, 0, 2)),
            mask=mask.T)[0]

        # The first token is not predicted
        logits = self._pre_softmax.apply(main_rnn_states[:-1])
        targets = output_word_ids.T[1:]
        out_softmax = self._softmax.apply(logits, extra_ndim=1)
        application_call.add_auxiliary_variable(out_softmax.copy(),
                                                name="proba_out")
        minus_logs = self._softmax.categorical_cross_entropy(targets, logits,
                                                             extra_ndim=1)

        targets_mask = mask.T[1:]
        costs = self.add_perplexity_measure(application_call, minus_logs,
                                            targets_mask, "")

        missing_embs = tensor.eq(input_word_ids,
                                 self._vocab.unk).astype('int32')  # (bs, L)
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * missing_embs.T[:-1],
                                    "after_mis_word_embs")
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * (1 - missing_embs.T[:-1]),
                                    "after_word_embs")

        word_counts = self._word_to_count(words)
        very_rare_masks = []
        for threshold in self._very_rare_threshold:
            very_rare_mask = tensor.lt(word_counts, threshold).astype('int32')
            very_rare_mask = targets_mask * (very_rare_mask.T[:-1])
            very_rare_masks.append(very_rare_mask)
            self.add_perplexity_measure(application_call, minus_logs,
                                        very_rare_mask,
                                        "after_very_rare_" + str(threshold))

        if self._retrieval:
            has_def = tensor.zeros_like(output_word_ids)
            has_def = tensor.inc_subtensor(
                has_def[def_map[:, 0], def_map[:, 1]], 1)
            mask_targets_has_def = has_def.T[:-1] * targets_mask  # (L-1, bs)
            self.add_perplexity_measure(application_call, minus_logs,
                                        mask_targets_has_def,
                                        "after_def_embs")

            for thresh, very_rare_mask in zip(self._very_rare_threshold,
                                              very_rare_masks):
                self.add_perplexity_measure(
                    application_call, minus_logs,
                    very_rare_mask * mask_targets_has_def,
                    "after_def_very_rare_" + str(thresh))

            application_call.add_auxiliary_variable(mask_targets_has_def.T,
                                                    name='mask_def_emb')

        return costs, updates
linear_output = Linear(name='linear_output', input_dim=hidden_layer_dim,
                       output_dim=charset_size,
                       weights_init=initialization.Uniform(width=0.01),
                       biases_init=Constant(0))
linear_output.initialize()

softmax = NDimensionalSoftmax(name='ndim_softmax')

activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)

cost = softmax.categorical_cross_entropy(y, activation_output,
                                         extra_ndim=1).mean()

from blocks.graph import ComputationGraph
# RMSProp, StepClipping and CompositeRule are used below, so import them too
from blocks.algorithms import (GradientDescent, Adam, RMSProp,
                               StepClipping, CompositeRule)

cg = ComputationGraph([cost])

step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95),
              StepClipping(1.0)]
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules))
class Seq2Seq(Initializable):
    """seq2seq model

    Parameters
    ----------
    emb_dim : int
        The dimension of word embeddings (including for def model if standalone)
    dim : int
        The dimension of the RNNs states (including for def model if standalone)
    num_input_words : int
        The size of the LM's input vocabulary.
    num_output_words : int
        The size of the LM's output vocabulary.
    vocab
        The vocabulary object.

    """
    def __init__(self, emb_dim, dim, num_input_words, num_output_words,
                 vocab, **kwargs):
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if num_output_words == 0:
            num_output_words = vocab.size()

        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab

        self._word_to_id = WordToIdOp(self._vocab)

        children = []

        self._main_lookup = LookupTable(self._num_input_words, emb_dim,
                                        name='main_lookup')
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        self._decoder_fork = Linear(emb_dim, 4 * dim, name='decoder_fork')
        self._decoder_rnn = LSTM(dim, name='decoder_rnn')
        children.extend([self._main_lookup,
                         self._encoder_fork, self._encoder_rnn,
                         self._decoder_fork, self._decoder_rnn])
        self._pre_softmax = Linear(dim, self._num_output_words)
        self._softmax = NDimensionalSoftmax()
        children.extend([self._pre_softmax, self._softmax])

        # was `super(LanguageModel, ...)`, which is the wrong class here
        super(Seq2Seq, self).__init__(children=children, **kwargs)

    def set_def_embeddings(self, embeddings):
        self._main_lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def get_def_embeddings_params(self):
        return self._main_lookup.parameters[0]

    def add_perplexity_measure(self, application_call, minus_logs, mask, name):
        costs = (minus_logs * mask).sum(axis=0)
        perplexity = tensor.exp(costs.sum() / mask.sum())
        perplexity.tag.aggregation_scheme = Perplexity(costs.sum(), mask.sum())
        application_call.add_auxiliary_variable(perplexity, name=name)
        return costs

    @application
    def apply(self, application_call, words, mask):
        """Compute the log-likelihood for a batch of sequences.

        words
            An integer matrix of shape (B, T), where T is the number of time
            steps, B is the batch size. Note that this order of the axes is
            different from what all RNN bricks consume, hence the axes should
            be transposed at some point.
        mask
            A float32 matrix of shape (B, T). Zeros indicate the padding.

        """
        word_ids = self._word_to_id(words)

        # shortlisting
        input_word_ids = (
            tensor.lt(word_ids, self._num_input_words) * word_ids +
            tensor.ge(word_ids, self._num_input_words) * self._vocab.unk)
        output_word_ids = (
            tensor.lt(word_ids, self._num_output_words) * word_ids +
            tensor.ge(word_ids, self._num_output_words) * self._vocab.unk)

        application_call.add_auxiliary_variable(
            unk_ratio(input_word_ids, mask, self._vocab.unk),
            name='unk_ratio')

        # Run the main rnn with combined inputs
        rnn_inputs = self._main_lookup.apply(input_word_ids)

        encoder_rnn_states = self._encoder_rnn.apply(
            tensor.transpose(self._encoder_fork.apply(rnn_inputs), (1, 0, 2)),
            mask=mask.T)[0]

        # The first token is not predicted
        # (was `main_rnn_states`, which is undefined in this class)
        logits = self._pre_softmax.apply(encoder_rnn_states[:-1])
        targets = output_word_ids.T[1:]
        out_softmax = self._softmax.apply(logits, extra_ndim=1)
        application_call.add_auxiliary_variable(out_softmax.copy(),
                                                name="proba_out")
        minus_logs = self._softmax.categorical_cross_entropy(targets, logits,
                                                             extra_ndim=1)

        targets_mask = mask.T[1:]
        costs = self.add_perplexity_measure(application_call, minus_logs,
                                            targets_mask, "perplexity")

        missing_embs = tensor.eq(input_word_ids,
                                 self._vocab.unk).astype('int32')  # (bs, L)
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * missing_embs.T[:-1],
                                    "perplexity_after_mis_word_embs")
        self.add_perplexity_measure(application_call, minus_logs,
                                    targets_mask * (1 - missing_embs.T[:-1]),
                                    "perplexity_after_word_embs")

        word_counts = self._word_to_count(words)
        very_rare_masks = []
        for threshold in self._very_rare_threshold:
            very_rare_mask = tensor.lt(word_counts, threshold).astype('int32')
            very_rare_mask = targets_mask * (very_rare_mask.T[:-1])
            very_rare_masks.append(very_rare_mask)
            self.add_perplexity_measure(
                application_call, minus_logs, very_rare_mask,
                "perplexity_after_very_rare_" + str(threshold))

        if self._retrieval:
            has_def = tensor.zeros_like(output_word_ids)
            has_def = tensor.inc_subtensor(
                has_def[def_map[:, 0], def_map[:, 1]], 1)
            mask_targets_has_def = has_def.T[:-1] * targets_mask  # (L-1, bs)
            self.add_perplexity_measure(application_call, minus_logs,
                                        mask_targets_has_def,
                                        "perplexity_after_def_embs")

            for thresh, very_rare_mask in zip(self._very_rare_threshold,
                                              very_rare_masks):
                self.add_perplexity_measure(
                    application_call, minus_logs,
                    very_rare_mask * mask_targets_has_def,
                    "perplexity_after_def_very_rare_" + str(thresh))

            application_call.add_auxiliary_variable(mask_targets_has_def.T,
                                                    name='mask_def_emb')

        return costs, updates
def __init__(self, input_sources_list, input_sources_vocab_size_list,
             output_source, output_source_vocab_size,
             lookup_dim=200, hidden_size=256, recurrent_stack_size=1):

    self.InputSources = input_sources_list
    self.InputSourcesVocab = input_sources_vocab_size_list
    self.OutputSource = output_source
    self.OutputSourceVocab = output_source_vocab_size

    inputs = [tensor.lmatrix(source) for source in input_sources_list]
    output = tensor.lmatrix(output_source)

    lookups = self.get_lookups(lookup_dim, input_sources_vocab_size_list)

    for lookup in lookups:
        lookup.initialize()

    merge = Merge([lookup.name for lookup in lookups],
                  [lookup.dim for lookup in lookups],
                  hidden_size,
                  weights_init=initialization.Uniform(width=0.01),
                  biases_init=Constant(0))
    merge.initialize()

    linear0 = Linear(input_dim=hidden_size, output_dim=hidden_size,
                     weights_init=initialization.Uniform(width=0.01),
                     biases_init=Constant(0), name='linear0')
    linear0.initialize()

    recurrent_blocks = []

    for i in range(recurrent_stack_size):
        recurrent_blocks.append(SimpleRecurrent(
            dim=hidden_size, activation=Tanh(),
            weights_init=initialization.Uniform(width=0.01),
            use_bias=False))

    for i, recurrent_block in enumerate(recurrent_blocks):
        recurrent_block.name = 'recurrent' + str(i + 1)
        recurrent_block.initialize()

    linear_out = Linear(input_dim=hidden_size,
                        output_dim=output_source_vocab_size,
                        weights_init=initialization.Uniform(width=0.01),
                        biases_init=Constant(0), name='linear_out')
    linear_out.initialize()

    softmax = NDimensionalSoftmax(name='softmax')

    lookup_outputs = [lookup.apply(input)
                      for lookup, input in zip(lookups, inputs)]

    m = merge.apply(*lookup_outputs)
    r = linear0.apply(m)
    for block in recurrent_blocks:
        r = block.apply(r)
    a = linear_out.apply(r)

    self.Cost = softmax.categorical_cross_entropy(output, a,
                                                  extra_ndim=1).mean()
    self.Cost.name = 'cost'

    y_hat = softmax.apply(a, extra_ndim=1)
    y_hat.name = 'y_hat'

    self.ComputationGraph = ComputationGraph(self.Cost)

    self.Function = None
    self.MainLoop = None
    self.Model = Model(y_hat)