def make_bidir_lstm_stack(seq, seq_dim, mask, sizes, skip=True, name=''):
    bricks = []
    curr_dim = [seq_dim]
    curr_hidden = [seq]
    hidden_list = []
    for k, dim in enumerate(sizes):
        fwd_lstm_ins = [Linear(input_dim=d, output_dim=4*dim,
                               name='%s_fwd_lstm_in_%d_%d'%(name,k,l))
                        for l, d in enumerate(curr_dim)]
        fwd_lstm = LSTM(dim=dim, activation=Tanh(), name='%s_fwd_lstm_%d'%(name,k))

        bwd_lstm_ins = [Linear(input_dim=d, output_dim=4*dim,
                               name='%s_bwd_lstm_in_%d_%d'%(name,k,l))
                        for l, d in enumerate(curr_dim)]
        bwd_lstm = LSTM(dim=dim, activation=Tanh(), name='%s_bwd_lstm_%d'%(name,k))

        bricks = bricks + [fwd_lstm, bwd_lstm] + fwd_lstm_ins + bwd_lstm_ins

        fwd_tmp = sum(x.apply(v) for x, v in zip(fwd_lstm_ins, curr_hidden))
        bwd_tmp = sum(x.apply(v) for x, v in zip(bwd_lstm_ins, curr_hidden))
        fwd_hidden, _ = fwd_lstm.apply(fwd_tmp, mask=mask)
        bwd_hidden, _ = bwd_lstm.apply(bwd_tmp[::-1], mask=mask[::-1])
        hidden_list = hidden_list + [fwd_hidden, bwd_hidden]
        if skip:
            curr_hidden = [seq, fwd_hidden, bwd_hidden[::-1]]
            curr_dim = [seq_dim, dim, dim]
        else:
            curr_hidden = [fwd_hidden, bwd_hidden[::-1]]
            curr_dim = [dim, dim]
    return bricks, hidden_list
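# --- Hedged illustration (not part of the snippet above) ----------------------
# A self-contained sketch of the single bidirectional layer that
# make_bidir_lstm_stack builds on each iteration: one LSTM reads the sequence
# forward, a second reads the time-reversed sequence, and the re-flipped
# backward states are concatenated with the forward ones. All names and
# dimensions here are illustrative assumptions, not taken from the original.
import numpy as np
import theano
import theano.tensor as T

from blocks.bricks import Linear, Tanh
from blocks.bricks.recurrent import LSTM
from blocks.initialization import Constant, IsotropicGaussian

input_dim, dim = 8, 16
seq = T.tensor3('seq')    # (time, batch, input_dim)
mask = T.matrix('mask')   # (time, batch), 1.0 where a timestep is valid

fwd_in = Linear(input_dim, 4 * dim, name='fwd_in')
bwd_in = Linear(input_dim, 4 * dim, name='bwd_in')
fwd = LSTM(dim=dim, activation=Tanh(), name='fwd')
bwd = LSTM(dim=dim, activation=Tanh(), name='bwd')
for brick in (fwd_in, bwd_in, fwd, bwd):
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.)
    brick.initialize()

h_fwd, _ = fwd.apply(fwd_in.apply(seq), mask=mask)
# run the backward LSTM on the reversed sequence, then flip its states back
h_bwd, _ = bwd.apply(bwd_in.apply(seq[::-1]), mask=mask[::-1])
h = T.concatenate([h_fwd, h_bwd[::-1]], axis=2)   # (time, batch, 2 * dim)

f = theano.function([seq, mask], h)
print(f(np.ones((5, 2, input_dim), dtype=theano.config.floatX),
        np.ones((5, 2), dtype=theano.config.floatX)).shape)   # (5, 2, 32)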
def lstm_layer(in_size, dim, x, h, n, task, first_layer=False): if connect_h_to_h == 'all-previous': if first_layer: lstm_input = x linear = Linear(input_dim=in_size, output_dim=dim * 4, name='linear' + str(n) + '-' + str(task)) elif connect_x_to_h: lstm_input = T.concatenate([x] + [hidden for hidden in h], axis=2) linear = Linear(input_dim=in_size + dim * (n), output_dim=dim * 4, name='linear' + str(n) + '-' + str(task)) else: lstm_input = T.concatenate([hidden for hidden in h], axis=2) linear = Linear(input_dim=dim * (n + 1), output_dim=dim * 4, name='linear' + str(n) + '-' + str(task)) elif connect_h_to_h == 'two-previous': if first_layer: lstm_input = x linear = Linear(input_dim=in_size, output_dim=dim * 4, name='linear' + str(n) + '-' + str(task)) elif connect_x_to_h: lstm_input = T.concatenate([x] + h[max(0, n - 2):n], axis=2) linear = Linear(input_dim=in_size + dim * 2 if n > 1 else in_size + dim, output_dim=dim * 4, name='linear' + str(n) + '-' + str(task)) else: lstm_input = T.concatenate(h[max(0, n - 2):n], axis=2) linear = Linear(input_dim=dim * 2 if n > 1 else dim, output_dim=dim * 4, name='linear' + str(n) + '-' + str(task)) elif connect_h_to_h == 'one-previous': if first_layer: lstm_input = x linear = Linear(input_dim=in_size, output_dim=dim * 4, name='linear' + str(n) + '-' + str(task)) elif connect_x_to_h: lstm_input = T.concatenate([x] + [h[n - 1]], axis=2) linear = Linear(input_dim=in_size + dim, output_dim=dim * 4, name='linear' + str(n) + '-' + str(task)) else: lstm_input = h[n - 1] linear = Linear(input_dim=dim, output_dim=dim * 4, name='linear' + str(n) + '-' + str(task)) lstm = LSTM(dim=dim, name=layer_models[n] + str(n) + '-' + str(task)) initialize([linear, lstm]) if layer_models[n] == 'lstm': return lstm.apply(linear.apply(lstm_input)) elif layer_models[n] == 'mt_lstm': return lstm.apply(linear.apply(lstm_input), time_scale=layer_resolutions[n], time_offset=layer_execution_time_offset[n])
def lstm_layer(dim, h, n, x_mask, first, **kwargs):
    linear = Linear(input_dim=dim, output_dim=dim * 4, name='linear' + str(n))
    lstm = LSTM(dim=dim, activation=Rectifier(), name='lstm' + str(n))
    initialize([linear, lstm])
    applyLin = linear.apply(h)
    if first:
        lstmApply = lstm.apply(applyLin, mask=x_mask, **kwargs)[0]
    else:
        lstmApply = lstm.apply(applyLin, **kwargs)[0]
    return lstmApply
class questionEncoder: def __init__(self, word_dim, hidden_dim): self.forward_lstm= LSTM(hidden_dim, name='question_forward_lstm', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) self.backward_lstm= LSTM(hidden_dim, name='question_backward_lstm', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) self.x_to_h_forward = Linear(word_dim, hidden_dim * 4, name='word_x_to_h_forward', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) self.x_to_h_backward = Linear(word_dim, hidden_dim * 4, name='word_x_to_h_backward', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) self.forward_lstm.initialize() self.backward_lstm.initialize() self.x_to_h_forward.initialize() self.x_to_h_backward.initialize() # variable question length # words: batch_size x q x word_dim # words_reverse: be the reverse sentence of words # padding with 0 to max length q # mask: batch_size def apply(self, words, words_reverse, mask_, batch_size): mask = mask_.flatten() # batch_size x q x hidden_dim Wx = self.x_to_h_forward.apply(words) Wx_r = self.x_to_h_backward.apply(words_reverse) # q x batch_size x hidden_dim Wx = Wx.swapaxes(0, 1) Wx_r = Wx_r.swapaxes(0, 1) # q x batch_size x hidden_dim hf, cf = self.forward_lstm.apply(Wx) hb, cb = self.backward_lstm.apply(Wx_r) for i in range(batch_size): T.set_subtensor(hb[0:mask[i]+1, i, :], hb[0:mask[i]+1, i, :][::-1]) # q x batch_size x (2 x hidden_dim) h = T.concatenate([hf, hb], axis=2) # batch_size x hidden_dim y_q = hf[mask, range(batch_size), :] y_1 = hb[0, range(batch_size), :] return h.swapaxes(0, 1), y_q, y_1
def example4():
    """LSTM -> Crashes when initializing the LSTM."""
    x = tensor.tensor3('x')
    dim = 3

    # gate_inputs = theano.function([x], x * 4)
    gate_inputs = Linear(input_dim=dim, output_dim=dim * 4, name="linear",
                         weights_init=initialization.Identity(),
                         biases_init=Constant(2))
    lstm = LSTM(dim=dim, activation=Tanh(),
                weights_init=IsotropicGaussian(), biases_init=Constant(0))

    gate_inputs.initialize()
    hg = gate_inputs.apply(x)
    # print(gate_inputs.parameters)
    # print(gate_inputs.parameters[1].get_value())

    lstm.initialize()
    h, cells = lstm.apply(hg)
    print(lstm.parameters)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print(f(4 * np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print("Good Job!")

    # lstm_output =

    # Initial State
    h0 = tensor.matrix('h0')
    c = tensor.matrix('cells')
    h, c1 = lstm.apply(inputs=x, states=h0, cells=c)
    # lstm.apply(states=h0, cells=cells, inputs=gate_inputs)

    f = theano.function([x, h0, c], h)
    print("a")
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX),
            np.ones((1, 3), dtype=theano.config.floatX),
            np.ones((1, 3), dtype=theano.config.floatX)))
def __init__(self, input_size, hidden_size, output_size): self.input_size = input_size self.hidden_size = hidden_size self.output_size = output_size x = tensor.tensor3('x', dtype=floatX) y = tensor.tensor3('y', dtype=floatX) x_to_lstm = Linear(name="x_to_lstm", input_dim=input_size, output_dim=4 * hidden_size, weights_init=IsotropicGaussian(), biases_init=Constant(0)) lstm = LSTM(dim=hidden_size, name="lstm", weights_init=IsotropicGaussian(), biases_init=Constant(0)) lstm_to_output = Linear(name="lstm_to_output", input_dim=hidden_size, output_dim=output_size, weights_init=IsotropicGaussian(), biases_init=Constant(0)) x_transform = x_to_lstm.apply(x) h, c = lstm.apply(x_transform) y_hat = lstm_to_output.apply(h) y_hat = Logistic(name="y_hat").apply(y_hat) self.cost = BinaryCrossEntropy(name="cost").apply(y, y_hat) x_to_lstm.initialize() lstm.initialize() lstm_to_output.initialize() self.computation_graph = ComputationGraph(self.cost)
def create_model(self):
    input_dim = self.input_dim
    x = self.x

    x_to_h = Linear(input_dim, input_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(input_dim, name='lstm',
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))
    h_to_o = Linear(input_dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)

    self.x_to_h = x_to_h
    self.lstm = lstm
    self.h_to_o = h_to_o

    h, c = lstm.apply(x_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    probs = h_to_o.apply(h[-1])

    return probs
def apply(self, input_, target):
    x_to_h = Linear(name='x_to_h',
                    input_dim=self.dims[0],
                    output_dim=self.dims[1] * 4)
    pre_rnn = x_to_h.apply(input_)
    pre_rnn.name = 'pre_rnn'
    rnn = LSTM(activation=Tanh(), dim=self.dims[1], name=self.name)
    h, _ = rnn.apply(pre_rnn)
    h.name = 'h'
    h_to_y = Linear(name='h_to_y',
                    input_dim=self.dims[1],
                    output_dim=self.dims[2])
    y_hat = h_to_y.apply(h)
    y_hat.name = 'y_hat'

    cost = SquaredError().apply(target, y_hat)
    cost.name = 'MSE'

    self.outputs = {}
    self.outputs['y_hat'] = y_hat
    self.outputs['cost'] = cost
    self.outputs['pre_rnn'] = pre_rnn
    self.outputs['h'] = h

    # Initialization
    for brick in (rnn, x_to_h, h_to_y):
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0)
        brick.initialize()
class CoreNetwork(BaseRecurrent, Initializable):
    def __init__(self, input_dim, dim, **kwargs):
        super(CoreNetwork, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.dim = dim
        self.lstm = LSTM(dim=dim, name=self.name + '_lstm',
                         weights_init=self.weights_init,
                         biases_init=self.biases_init)
        self.proj = Linear(input_dim=input_dim, output_dim=dim * 4,
                           name=self.name + '_proj',
                           weights_init=self.weights_init,
                           biases_init=self.biases_init)
        self.children = [self.lstm, self.proj]

    def get_dim(self, name):
        if name == 'inputs':
            return self.input_dim
        elif name in ['state', 'cell']:
            return self.dim
        else:
            raise ValueError

    @recurrent(sequences=['inputs'], states=['state', 'cell'], contexts=[],
               outputs=['state', 'cell'])
    def apply(self, inputs, state, cell):
        state, cell = self.lstm.apply(self.proj.apply(inputs), state, cell,
                                      iterate=False)
        return state, cell
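# --- Hedged illustration (not part of the class above) ------------------------
# CoreNetwork.apply calls the LSTM brick with iterate=False, i.e. it performs a
# single transition from explicit states/cells instead of scanning over a time
# axis. A standalone sketch of that call pattern (dimensions and names are
# illustrative assumptions only):
import numpy as np
import theano
import theano.tensor as T

from blocks.bricks import Linear, Tanh
from blocks.bricks.recurrent import LSTM
from blocks.initialization import Constant, IsotropicGaussian

input_dim, dim = 6, 4
x = T.matrix('x')     # (batch, input_dim) -- a single time step
h0 = T.matrix('h0')   # (batch, dim) previous hidden state
c0 = T.matrix('c0')   # (batch, dim) previous cell state

proj = Linear(input_dim, 4 * dim, name='proj',
              weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.))
lstm = LSTM(dim=dim, activation=Tanh(), name='lstm',
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.))
proj.initialize()
lstm.initialize()

h1, c1 = lstm.apply(proj.apply(x), states=h0, cells=c0, iterate=False)
step = theano.function([x, h0, c0], [h1, c1])

batch = 3
outs = step(np.zeros((batch, input_dim), dtype=theano.config.floatX),
            np.zeros((batch, dim), dtype=theano.config.floatX),
            np.zeros((batch, dim), dtype=theano.config.floatX))
print([o.shape for o in outs])   # [(3, 4), (3, 4)]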
def lstm_layer(in_dim, h, h_dim, n, pref=""):
    linear = Linear(input_dim=in_dim, output_dim=h_dim * 4,
                    name='linear' + str(n) + pref)
    lstm = LSTM(dim=h_dim, name='lstm' + str(n) + pref)
    initialize([linear, lstm])
    return lstm.apply(linear.apply(h))[0]
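# --- Hedged illustration (not part of the snippet above) ----------------------
# lstm_layer relies on an initialize(...) helper that is not shown in these
# snippets. A plausible minimal stand-in, together with the way two such
# Linear -> LSTM layers are typically stacked, could look like this (the helper
# and all dimensions are assumptions, not the original code):
import theano.tensor as T

from blocks.bricks import Linear
from blocks.bricks.recurrent import LSTM
from blocks.initialization import Constant, IsotropicGaussian

def initialize(bricks, std=0.01):
    # hypothetical stand-in for the unshown helper
    for brick in bricks:
        brick.weights_init = IsotropicGaussian(std)
        brick.biases_init = Constant(0.)
        brick.initialize()

x = T.tensor3('x')   # (time, batch, 32)

# first layer: project the input to 4*dim gate pre-activations, then scan
linear0 = Linear(input_dim=32, output_dim=64 * 4, name='linear0')
lstm0 = LSTM(dim=64, name='lstm0')
# second layer: reads the first layer's hidden states
linear1 = Linear(input_dim=64, output_dim=64 * 4, name='linear1')
lstm1 = LSTM(dim=64, name='lstm1')
initialize([linear0, lstm0, linear1, lstm1])

h0 = lstm0.apply(linear0.apply(x))[0]    # (time, batch, 64) hidden states
h1 = lstm1.apply(linear1.apply(h0))[0]   # stacked second layer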
def main(max_seq_length, lstm_dim, batch_size, num_batches, num_epochs): dataset_train = IterableDataset(generate_data(max_seq_length, batch_size, num_batches)) dataset_test = IterableDataset(generate_data(max_seq_length, batch_size, 100)) stream_train = DataStream(dataset=dataset_train) stream_test = DataStream(dataset=dataset_test) x = T.tensor3('x') y = T.matrix('y') # we need to provide data for the LSTM layer of size 4 * ltsm_dim, see # LSTM layer documentation for the explanation x_to_h = Linear(1, lstm_dim * 4, name='x_to_h', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) lstm = LSTM(lstm_dim, name='lstm', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) h_to_o = Linear(lstm_dim, 1, name='h_to_o', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) x_transform = x_to_h.apply(x) h, c = lstm.apply(x_transform) # only values of hidden units of the last timeframe are used for # the classification y_hat = h_to_o.apply(h[-1]) y_hat = Logistic().apply(y_hat) cost = BinaryCrossEntropy().apply(y, y_hat) cost.name = 'cost' lstm.initialize() x_to_h.initialize() h_to_o.initialize() cg = ComputationGraph(cost) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam()) test_monitor = DataStreamMonitoring(variables=[cost], data_stream=stream_test, prefix="test") train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train", after_epoch=True) main_loop = MainLoop(algorithm, stream_train, extensions=[test_monitor, train_monitor, FinishAfter(after_n_epochs=num_epochs), Printing(), ProgressBar()]) main_loop.run() print 'Learned weights:' for layer in (x_to_h, lstm, h_to_o): print "Layer '%s':" % layer.name for param in layer.parameters: print param.name, ': ', param.get_value() print
class LinearLSTM(Initializable): def __init__(self, input_dim, output_dim, lstm_dim, print_intermediate=False, print_attrs=['__str__'], **kwargs): super(LinearLSTM, self).__init__(**kwargs) self.x_to_h = Linear(input_dim, lstm_dim * 4, name='x_to_h', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) self.lstm = LSTM(lstm_dim, name='lstm', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) self.h_to_o = Linear(lstm_dim, output_dim, name='h_to_o', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) self.children = [self.x_to_h, self.lstm, self.h_to_o] self.print_intermediate = print_intermediate self.print_attrs = print_attrs @application def apply(self, source): x_linear = self.x_to_h.apply( source.reshape( (source.shape[1], source.shape[0], source.shape[2]))) x_linear.name = 'x_linear' if self.print_intermediate: x_linear = Print(message='x_linear info', attrs=self.print_attrs)(x_linear) h, c = self.lstm.apply(x_linear) if self.print_intermediate: h = Print(message="hidden states info", attrs=self.print_attrs)(h) y_hat = self.h_to_o.apply(h) y_hat.name = 'y_hat' if self.print_intermediate: y_hat = Print(message="y_hat info", attrs=self.print_attrs)(y_hat) return y_hat def initialize(self): for child in self.children: child.initialize() def reset_allocation(self): for child in self.children: child.allocated = False
class seqDecoder: def __init__(self, feature_dim, memory_dim, fc1_dim, fc2_dim): self.W = Linear(input_dim=feature_dim, output_dim=memory_dim * 4, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), use_bias=False, name='seqDecoder_W') self.GRU_A = LSTM(feature_dim, name='seqDecoder_A', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) self.GRU_B = LSTM(memory_dim, name='seqDecoder_B', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) self.W.initialize() self.GRU_A.initialize() self.GRU_B.initialize() self.fc1 = Linear(input_dim=memory_dim, output_dim=fc1_dim, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name='fc1') self.fc2 = Linear(input_dim=fc1_dim, output_dim=fc2_dim, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name='fc2') self.fc1.initialize() self.fc2.initialize() # A: the encoding of GRU_A, # B: the encoding of GRU_B # padding: the tensor constant def apply(self, output_length, A, B, padding): A_, garbage = self.GRU_A.apply(padding, states=A) WA_ = self.W.apply(A_) # output_length x batch_size x output_dim B_, garbage = self.GRU_B.apply(WA_, states=B) # batch_size x output_length x output_dim B_ = B_.swapaxes(0,1) fc1_r = relu(self.fc1.apply(B_)) fc2_r = relu(self.fc2.apply(fc1_r)) return fc2_r
def construct_model(activation_function, r_dim, hidden_dim, out_dim): # Construct the model r = tensor.fmatrix('r') x = tensor.fmatrix('x') y = tensor.ivector('y') nx = x.shape[0] nj = x.shape[1] # also is r.shape[0] nr = r.shape[1] # r is nj x nr # x is nx x nj # y is nx # Get a representation of r of size r_dim r = DAE(r) # r is now nj x r_dim # r_rep is nx x nj x r_dim r_rep = r[None, :, :].repeat(axis=0, repeats=nx) # x3 is nx x nj x 1 x3 = x[:, :, None] # concat is nx x nj x (r_dim + 1) concat = tensor.concatenate([r_rep, x3], axis=2) # Change concat from Batch x Time x Features to T X B x F rnn_input = concat.dimshuffle(1, 0, 2) linear = Linear(input_dim=r_dim + 1, output_dim=4 * hidden_dim, name="input_linear") lstm = LSTM(dim=hidden_dim, activation=activation_function, name="hidden_recurrent") top_linear = Linear(input_dim=hidden_dim, output_dim=out_dim, name="out_linear") pre_rnn = linear.apply(rnn_input) states = lstm.apply(pre_rnn)[0] activations = top_linear.apply(states) activations = tensor.mean(activations, axis=0) cost = Softmax().categorical_cross_entropy(y, activations) pred = activations.argmax(axis=1) error_rate = tensor.neq(y, pred).mean() # Initialize parameters for brick in (linear, lstm, top_linear): brick.weights_init = IsotropicGaussian(0.1) brick.biases_init = Constant(0.) brick.initialize() return cost, error_rate
def create_rnn(hidden_dim, vocab_dim,mode="rnn"): # input x = tensor.imatrix('inchar') y = tensor.imatrix('outchar') # W = LookupTable( name = "W1", #dim = hidden_dim*4, dim = hidden_dim, length = vocab_dim, weights_init = initialization.IsotropicGaussian(0.01), biases_init = initialization.Constant(0) ) if mode == "lstm": # Long Short Term Memory H = LSTM( hidden_dim, name = 'H', weights_init = initialization.IsotropicGaussian(0.01), biases_init = initialization.Constant(0.0) ) else: # recurrent history weight H = SimpleRecurrent( name = "H", dim = hidden_dim, activation = Tanh(), weights_init = initialization.IsotropicGaussian(0.01) ) # S = Linear( name = "W2", input_dim = hidden_dim, output_dim = vocab_dim, weights_init = initialization.IsotropicGaussian(0.01), biases_init = initialization.Constant(0) ) A = NDimensionalSoftmax( name = "softmax" ) initLayers([W,H,S]) activations = W.apply(x) hiddens = H.apply(activations)#[0] activations2 = S.apply(hiddens) y_hat = A.apply(activations2, extra_ndim=1) cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean() cg = ComputationGraph(cost) #print VariableFilter(roles=[WEIGHT])(cg.variables) #W1,H,W2 = VariableFilter(roles=[WEIGHT])(cg.variables) layers = (x, W, H, S, A, y) return cg, layers, y_hat, cost
def add_lstm(input_dim, input_var):
    linear = Linear(input_dim=input_dim, output_dim=input_dim * 4,
                    name="linear_layer")
    lstm = LSTM(dim=input_dim, name="lstm_layer")

    testing_init(linear)
    # linear.initialize()
    default_init(lstm)

    h = linear.apply(input_var)
    return lstm.apply(h)
def create_model(self):
    input_dim = self.input_dim
    x = self.x
    y = self.y
    p = self.p
    mask = self.mask
    hidden_dim = self.hidden_dim
    embedding_dim = self.embedding_dim

    lookup = LookupTable(self.dict_size, embedding_dim,
                         weights_init=IsotropicGaussian(0.001),
                         name='LookupTable')
    x_to_h = Linear(embedding_dim, hidden_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(0.001),
                    biases_init=Constant(0.0))
    lstm = LSTM(hidden_dim, name='lstm',
                weights_init=IsotropicGaussian(0.001),
                biases_init=Constant(0.0))
    h_to_o = MLP([Logistic()], [hidden_dim, 1],
                 weights_init=IsotropicGaussian(0.001),
                 biases_init=Constant(0),
                 name='h_to_o')

    lookup.initialize()
    x_to_h.initialize()
    lstm.initialize()
    h_to_o.initialize()

    embed = lookup.apply(x).reshape(
        (x.shape[0], x.shape[1], self.embedding_dim))
    embed.name = "embed_vec"
    x_transform = x_to_h.apply(embed.transpose(1, 0, 2))
    x_transform.name = "Transformed X"

    self.lookup = lookup
    self.x_to_h = x_to_h
    self.lstm = lstm
    self.h_to_o = h_to_o

    #if mask is None:
    h, c = lstm.apply(x_transform)
    #else:
    #h, c = lstm.apply(x_transform, mask=mask)
    h.name = "hidden_state"
    c.name = "cell state"

    # only values of hidden units of the last timeframe are used for
    # the classification
    indices = T.sum(mask, axis=0) - 1
    rel_hid = h[indices, T.arange(h.shape[1])]
    out = self.h_to_o.apply(rel_hid)

    probs = out
    return probs
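# --- Hedged illustration (not part of the model above) ------------------------
# The indices/rel_hid lines above use a common trick: summing the mask over the
# time axis gives each sequence's length, and subtracting one indexes the last
# valid hidden state per batch element. A tiny self-contained check of that
# indexing (shapes and values are illustrative; the cast is only needed when
# the mask is a float tensor):
import numpy as np
import theano
import theano.tensor as T

h = T.tensor3('h')        # (time, batch, dim) hidden states
mask = T.matrix('mask')   # (time, batch), 1 where a timestep is valid

lengths = T.sum(mask, axis=0)               # per-sequence lengths
indices = lengths.astype('int64') - 1       # index of the last valid step
last_h = h[indices, T.arange(h.shape[1])]   # (batch, dim)

f = theano.function([h, mask], last_h)
hv = np.arange(2 * 3 * 4, dtype=theano.config.floatX).reshape(2, 3, 4)
mv = np.array([[1, 1, 1],
               [0, 1, 1]], dtype=theano.config.floatX)
print(f(hv, mv))   # row 0 taken at t=0; rows 1 and 2 taken at t=1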
def lstm_layer(self, h, n):
    """
    Performs the LSTM update for a batch of word sequences

    :param h The word embeddings for this update
    :param n The number of layers of the LSTM
    """
    # Maps the word embedding to a dimensionality to be used in the LSTM
    linear = Linear(input_dim=self.hidden_size,
                    output_dim=self.hidden_size * 4,
                    name='linear_lstm' + str(n))
    initialize(linear, sqrt(6.0 / (5 * self.hidden_size)))
    lstm = LSTM(dim=self.hidden_size, name='lstm' + str(n))
    initialize(lstm, 0.08)
    return lstm.apply(linear.apply(h))
def __init__(self, input1_size, input2_size, lookup1_dim=200, lookup2_dim=200, hidden_size=512): self.hidden_size = hidden_size self.input1_size = input1_size self.input2_size = input2_size self.lookup1_dim = lookup1_dim self.lookup2_dim = lookup2_dim x1 = tensor.lmatrix('durations') x2 = tensor.lmatrix('syllables') y = tensor.lmatrix('pitches') lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size, name='lookup1', weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) lookup1.initialize() lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size, name='lookup2', weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) lookup2.initialize() merge = Merge(['lookup1', 'lookup2'], [self.lookup1_dim, self.lookup2_dim], self.hidden_size, weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) merge.initialize() recurrent_block = LSTM(dim=self.hidden_size, activation=Tanh(), weights_init=initialization.Uniform(width=0.01)) #RecurrentStack([LSTM(dim=self.hidden_size, activation=Tanh())] * 3) recurrent_block.initialize() linear = Linear(input_dim=self.hidden_size, output_dim=self.input1_size, weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) linear.initialize() softmax = NDimensionalSoftmax() l1 = lookup1.apply(x1) l2 = lookup2.apply(x2) m = merge.apply(l1, l2) h = recurrent_block.apply(m) a = linear.apply(h) y_hat = softmax.apply(a, extra_ndim=1) # ValueError: x must be 1-d or 2-d tensor of floats. Got TensorType(float64, 3D) self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean() self.ComputationGraph = ComputationGraph(self.Cost) self.Model = Model(y_hat)
class Encoder(Initializable): def __init__(self, image_feature_dim, embedding_dim, **kwargs): super(Encoder, self).__init__(**kwargs) self.image_embedding = Linear( input_dim=image_feature_dim , output_dim=embedding_dim # , weights_init=IsotropicGaussian(0.02) # , biases_init=Constant(0.) , name="image_embedding" ) self.to_inputs = Linear( input_dim=embedding_dim , output_dim=embedding_dim*4 # gate_inputs = vstack(input, forget, cell, hidden) # , weights_init=IsotropicGaussian(0.02) # , biases_init=Constant(0.) , name="to_inputs" ) # Don't think this dim has to also be dimension, more arbitrary self.transition = LSTM( dim=embedding_dim, name="transition") self.children = [ self.image_embedding , self.to_inputs , self.transition ] @application(inputs=['image_vects', 'word_vects'], outputs=['image_embedding', 'sentence_embedding']) def apply(self, image_vects, word_vects): image_embedding = self.image_embedding.apply(image_vects) # inputs = word_vects inputs = self.to_inputs.apply(word_vects) inputs = inputs.dimshuffle(1, 0, 2) hidden, cells = self.transition.apply(inputs=inputs, mask=None) # the last hidden state represents the accumulation of all the words (i.e. the sentence) # grab all batches, grab the last value representing accumulation of the sequence, grab all features sentence_embedding = hidden[-1] # sentence_embedding = inputs.mean(axis=0) return image_embedding, sentence_embedding
class Encoder(Initializable): def __init__(self, image_feature_dim, embedding_dim, **kwargs): super(Encoder, self).__init__(**kwargs) self.image_embedding = Linear( input_dim=image_feature_dim , output_dim=embedding_dim , name="image_embedding" ) self.to_inputs = Linear( input_dim=embedding_dim , output_dim=embedding_dim*4 # times 4 cuz vstack(input, forget, cell, hidden) , name="to_inputs" ) self.transition = LSTM( dim=embedding_dim, name="transition") self.children = [ self.image_embedding , self.to_inputs , self.transition ] @application( inputs=['image_vects', 'word_vects'] , outputs=['image_embedding', 'sentence_embedding'] ) def apply(self, image_vects, word_vects): image_embedding = self.image_embedding.apply(image_vects) inputs = self.to_inputs.apply(word_vects) # shuffle dimensions to correspond to (sequence, batch, features) inputs = inputs.dimshuffle(1, 0, 2) hidden, cells = self.transition.apply(inputs=inputs, mask=None) # last hidden state represents the accumulation of word embeddings # (i.e. the sentence embedding) sentence_embedding = hidden[-1] return image_embedding, sentence_embedding
def build_theano_functions(self):
    #import pdb ; pdb.set_trace()
    x = T.fmatrix('x')
    s = T.fvector('s')
    mu = T.fvector('mu')
    mu = T.reshape(mu, (self.number_of_mix, 1))
    pi = T.fvector('pi')

    lstm = LSTM(
        dim=self.input_dim/4,
        weights_init=IsotropicGaussian(0.5),
        biases_init=Constant(1))
    lstm.initialize()
    h, c = lstm.apply(x)
    h = h[0][0][-1]

    LL = T.sum(pi*(1./(T.sqrt(2.*np.pi)*s))*T.exp(\
        -0.5*(h-mu)**2/T.reshape(s,(self.number_of_mix,1))**2.).sum(axis=1))

    cost = -T.log(LL)

    #cg = ComputationGraph(cost)
    #self.cg = cg
    #parameters = cg.parameters
    model = Model(cost)
    self.model = model
    parameters = model.parameters

    grads = T.grad(cost, parameters)
    updates = []
    for i in range(len(grads)):
        updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))

    gradf = theano.function([x, s, mu, pi], [cost], updates=updates)
    f = theano.function([x], [h])

    return gradf, f
class LSTMReadDefinitions(Initializable): """ Converts definition into embeddings. Parameters ---------- num_input_words: int, default: -1 If non zero will (a bit confusing name) restrict dynamically vocab. WARNING: it assumes word ids are monotonical with frequency! emb_dim : int Dimensionality of word embeddings dim : int Dimensionality of the def rnn. lookup: None or LookupTable fork_and_rnn: None or tuple (Linear, RNN) """ def __init__(self, num_input_words, emb_dim, dim, vocab, lookup=None, fork_and_rnn=None, **kwargs): if num_input_words > 0: logger.info("Restricting def vocab to " + str(num_input_words)) self._num_input_words = num_input_words else: self._num_input_words = vocab.size() self._vocab = vocab children = [] if lookup is None: self._def_lookup = LookupTable(self._num_input_words, emb_dim, name='def_lookup') else: self._def_lookup = lookup if fork_and_rnn is None: self._def_fork = Linear(emb_dim, 4 * dim, name='def_fork') self._def_rnn = LSTM(dim, name='def_rnn') else: self._def_fork, self._def_rnn = fork_and_rnn children.extend([self._def_lookup, self._def_fork, self._def_rnn]) super(LSTMReadDefinitions, self).__init__(children=children, **kwargs) @application def apply(self, application_call, defs, def_mask): """ Returns vector per each word in sequence using the dictionary based lookup """ # Short listing defs = (T.lt(defs, self._num_input_words) * defs + T.ge(defs, self._num_input_words) * self._vocab.unk) application_call.add_auxiliary_variable(unk_ratio( defs, def_mask, self._vocab.unk), name='def_unk_ratio') embedded_def_words = self._def_lookup.apply(defs) def_embeddings = self._def_rnn.apply(T.transpose( self._def_fork.apply(embedded_def_words), (1, 0, 2)), mask=def_mask.T)[0][-1] return def_embeddings
class Rnn(Initializable, BaseRecurrent): def __init__(self, dims=(88, 100, 100), **kwargs): super(Rnn, self).__init__(**kwargs) self.dims = dims self.input_transform = Linear(input_dim=dims[0], output_dim=dims[1], weights_init=IsotropicGaussian(0.01), # biases_init=Constant(0.0), use_bias=False, name="input_transfrom") self.gru_layer = SimpleRecurrent(dim=dims[1], activation=Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=True, name="gru_rnn_layer") # TODO: find a way to automatically set the output dim in case of lstm vs normal rnn self.linear_trans = Linear(input_dim=dims[1], output_dim=dims[2] * 4, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=False, name="h2h_transform") self.lstm_layer = LSTM(dim=dims[2], activation=Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=True, name="lstm_rnn_layer") self.out_transform = MLP(activations=[Sigmoid()], dims=[dims[2], dims[0]], weights_init=IsotropicGaussian(0.01), use_bias=True, biases_init=Constant(0.0), name="out_layer") self.children = [self.input_transform, self.gru_layer, self.linear_trans, self.lstm_layer, self.out_transform] # @recurrent(sequences=['inputs', 'input_mask'], contexts=[], # states=['gru_state', 'lstm_state', 'lstm_cells'], # outputs=['gru_state', 'lstm_state', 'lstm_cells']) def rnn_apply(self, inputs, mask=None, gru_state=None, lstm_state=None, lstm_cells=None): input_transform = self.input_transform.apply(inputs) gru_state = self.gru_layer.apply( inputs=input_transform, # update_inputs=input_transform, # reset_inputs=input_transform, states=gru_state, mask=mask, iterate=False) lstm_transform = self.linear_trans.apply(gru_state) lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform, states=lstm_state, cells=lstm_cells, mask=mask, iterate=False) return gru_state, lstm_state, lstm_cells @recurrent(sequences=[], contexts=[], states=['inputs', 'gru_state', 'lstm_state', 'lstm_cells'], outputs=['inputs', 'gru_state', 'lstm_state', 'lstm_cells']) def rnn_generate(self, inputs=None, gru_state=None, lstm_state=None, lstm_cells=None): output = self.apply(inputs=inputs, gru_state=gru_state, lstm_state=lstm_state, lstm_cells=lstm_cells, iterate=False) return output, gru_state, lstm_state, lstm_cells @recurrent(sequences=['inputs', 'mask'], contexts=[], states=['gru_state', 'lstm_state', 'lstm_cells'], outputs=['output', 'gru_state', 'lstm_state', 'lstm_cells']) def apply(self, inputs, mask, gru_state=None, lstm_state=None, lstm_cells=None): # input_transform = self.input_transform.apply(inputs) # gru_state = self.gru_layer.apply( # inputs=input_transform, # mask=mask, # states=gru_state, # iterate=False) # lstm_transform = self.linear_trans.apply(gru_state) # lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform, states=lstm_state, # cells=lstm_cells, # mask=mask, iterate=False) gru_state, lstm_state, lstm_cells = self.rnn_apply(inputs=inputs, mask=mask, gru_state=gru_state, lstm_state=lstm_state, lstm_cells=lstm_cells) output = 1.17 * self.out_transform.apply(lstm_state) * mask[:, None] return output, gru_state, lstm_state, lstm_cells def get_dim(self, name): dims = dict(zip(['outputs', 'gru_state', 'lstm_state'], self.dims)) dims['lstm_cells'] = dims['lstm_state'] return dims.get(name, None) or super(Rnn, self).get_dim(name)
k = k, const = 0.00001) bricks = [mlp_x, transition, mlp_gmm] for brick in bricks: brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.) brick.initialize() ############## # Test model ############## x_g = mlp_x.apply(x) h = transition.apply(x_g) mu, sigma, coeff = mlp_gmm.apply(h[-2]) #from theano import function #x_tr, x_mask_tr, y_tr = next(data_stream.get_epoch_iterator()) #print function([x], x_g)(x_tr).shape #print function([x], h)(x_tr)[-2].shape #print function([x], mu)(x_tr).shape from play.utils import GMM cost = GMM(y, mu, sigma, coeff) cost = cost*x_mask cost = cost.sum()/x_mask.sum() cost.name = 'sequence_log_likelihood' cg = ComputationGraph(cost)
def rf_lstm_experiment(data_name, exp_network, in_dim, out_dim, num_layers, start_neurons, num_neurons, batch_size, num_epochs): """LSTM Experiment.""" # load dataset train_set = IterableDataset( ds.transform_sequence(data_name, "train", batch_size)) test_set = IterableDataset( ds.transform_sequence(data_name, "test", batch_size)) stream_train = DataStream(dataset=train_set) stream_test = DataStream(dataset=test_set) methods = ['sgd', 'momentum', 'adagrad', 'rmsprop'] for n_layers in xrange(1, num_layers + 1): for n_neurons in xrange(start_neurons, num_neurons + 5, 5): for method in methods: X = T.tensor3("features") y = T.matrix("targets") x_to_h = Linear(in_dim, n_neurons * 4, name='x_to_h', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) lstm = LSTM(n_neurons, name='lstm', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) h_to_o = nc.setup_ff_network(n_neurons, out_dim, n_layers - 1, n_neurons) X_trans = x_to_h.apply(X) h, c = lstm.apply(X_trans) y_hat = h_to_o.apply(h[-1]) cost, cg = nc.create_cg_and_cost(y, y_hat, "none") lstm.initialize() x_to_h.initialize() h_to_o.initialize() algorithm = nc.setup_algorithms(cost, cg, method, type="RNN") test_monitor = DataStreamMonitoring(variables=[cost], data_stream=stream_test, prefix="test") train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train", after_epoch=True) main_loop = MainLoop( algorithm=algorithm, data_stream=stream_train, extensions=[ test_monitor, train_monitor, FinishAfter(after_n_epochs=num_epochs), Printing(), ProgressBar() ]) main_loop.run() # Saving results exp_id = ds.create_exp_id(exp_network, n_layers, n_neurons, batch_size, num_epochs, method, "none") # prepare related functions predict = theano.function([X], y_hat) # prepare related data train_features, train_targets = ds.get_iter_data(train_set) test_features, test_targets = ds.get_iter_data(test_set) # Prediction of result train_predicted = gen_prediction(predict, train_features) test_predicted = gen_prediction(predict, test_features) # Get cost cost = ds.get_cost_data(test_monitor, train_set.num_examples, num_epochs) # logging ds.save_experiment(train_targets, train_predicted, test_targets, test_predicted, cost, exp_network, n_layers, n_neurons, batch_size, num_epochs, method, "none", exp_id, "../results/")
class ExtractiveQAModel(Initializable): """The dictionary-equipped extractive QA model. Parameters ---------- dim : int The default dimensionality for the components. emd_dim : int The dimensionality for the embeddings. If 0, `dim` is used. coattention : bool Use the coattention mechanism. num_input_words : int The number of input words. If 0, `vocab.size()` is used. The vocabulary object. use_definitions : bool Triggers the use of definitions. reuse_word_embeddings : bool compose_type : str """ def __init__(self, dim, emb_dim, readout_dims, num_input_words, def_num_input_words, vocab, use_definitions, def_word_gating, compose_type, coattention, def_reader, reuse_word_embeddings, random_unk, **kwargs): self._vocab = vocab if emb_dim == 0: emb_dim = dim if num_input_words == 0: num_input_words = vocab.size() if def_num_input_words == 0: def_num_input_words = num_input_words self._coattention = coattention self._num_input_words = num_input_words self._use_definitions = use_definitions self._random_unk = random_unk self._reuse_word_embeddings = reuse_word_embeddings lookup_num_words = num_input_words if reuse_word_embeddings: lookup_num_words = max(num_input_words, def_num_input_words) if random_unk: lookup_num_words = vocab.size() # Dima: we can have slightly less copy-paste here if we # copy the RecurrentFromFork class from my other projects. children = [] self._lookup = LookupTable(lookup_num_words, emb_dim) self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork') self._encoder_rnn = LSTM(dim, name='encoder_rnn') self._question_transform = Linear(dim, dim, name='question_transform') self._bidir_fork = Linear(3 * dim if coattention else 2 * dim, 4 * dim, name='bidir_fork') self._bidir = Bidirectional(LSTM(dim), name='bidir') children.extend([ self._lookup, self._encoder_fork, self._encoder_rnn, self._question_transform, self._bidir, self._bidir_fork ]) activations = [Rectifier()] * len(readout_dims) + [None] readout_dims = [2 * dim] + readout_dims + [1] self._begin_readout = MLP(activations, readout_dims, name='begin_readout') self._end_readout = MLP(activations, readout_dims, name='end_readout') self._softmax = NDimensionalSoftmax() children.extend( [self._begin_readout, self._end_readout, self._softmax]) if self._use_definitions: # A potential bug here: we pass the same vocab to the def reader. # If a different token is reserved for UNK in text and in the definitions, # we can be screwed. 
def_reader_class = eval(def_reader) def_reader_kwargs = dict( num_input_words=def_num_input_words, dim=dim, emb_dim=emb_dim, vocab=vocab, lookup=self._lookup if reuse_word_embeddings else None) if def_reader_class == MeanPoolReadDefinitions: def_reader_kwargs.update(dict(normalize=True, translate=False)) self._def_reader = def_reader_class(**def_reader_kwargs) self._combiner = MeanPoolCombiner(dim=dim, emb_dim=emb_dim, def_word_gating=def_word_gating, compose_type=compose_type) children.extend([self._def_reader, self._combiner]) super(ExtractiveQAModel, self).__init__(children=children, **kwargs) # create default input variables self.contexts = tensor.lmatrix('contexts') self.context_mask = tensor.matrix('contexts_mask') self.questions = tensor.lmatrix('questions') self.question_mask = tensor.matrix('questions_mask') self.answer_begins = tensor.lvector('answer_begins') self.answer_ends = tensor.lvector('answer_ends') input_vars = [ self.contexts, self.context_mask, self.questions, self.question_mask, self.answer_begins, self.answer_ends ] if self._use_definitions: self.defs = tensor.lmatrix('defs') self.def_mask = tensor.matrix('def_mask') self.contexts_def_map = tensor.lmatrix('contexts_def_map') self.questions_def_map = tensor.lmatrix('questions_def_map') input_vars.extend([ self.defs, self.def_mask, self.contexts_def_map, self.questions_def_map ]) self.input_vars = OrderedDict([(var.name, var) for var in input_vars]) def set_embeddings(self, embeddings): self._lookup.parameters[0].set_value( embeddings.astype(theano.config.floatX)) def embeddings_var(self): return self._lookup.parameters[0] def def_reading_parameters(self): parameters = Selector(self._def_reader).get_parameters().values() parameters.extend(Selector(self._combiner).get_parameters().values()) if self._reuse_word_embeddings: lookup_parameters = Selector( self._lookup).get_parameters().values() parameters = [p for p in parameters if p not in lookup_parameters] return parameters @application def _encode(self, application_call, text, mask, def_embs=None, def_map=None, text_name=None): if not self._random_unk: text = (tensor.lt(text, self._num_input_words) * text + tensor.ge(text, self._num_input_words) * self._vocab.unk) if text_name: application_call.add_auxiliary_variable( unk_ratio(text, mask, self._vocab.unk), name='{}_unk_ratio'.format(text_name)) embs = self._lookup.apply(text) if self._random_unk: embs = (tensor.lt(text, self._num_input_words)[:, :, None] * embs + tensor.ge(text, self._num_input_words)[:, :, None] * disconnected_grad(embs)) if def_embs: embs = self._combiner.apply(embs, mask, def_embs, def_map) add_role(embs, EMBEDDINGS) encoded = flip01( self._encoder_rnn.apply(self._encoder_fork.apply(flip01(embs)), mask=mask.T)[0]) return encoded @application def apply(self, application_call, contexts, contexts_mask, questions, questions_mask, answer_begins, answer_ends, defs=None, def_mask=None, contexts_def_map=None, questions_def_map=None): def_embs = None if self._use_definitions: def_embs = self._def_reader.apply(defs, def_mask) context_enc = self._encode(contexts, contexts_mask, def_embs, contexts_def_map, 'context') question_enc_pre = self._encode(questions, questions_mask, def_embs, questions_def_map, 'question') question_enc = tensor.tanh( self._question_transform.apply(question_enc_pre)) # should be (batch size, context length, question_length) affinity = tensor.batched_dot(context_enc, flip12(question_enc)) affinity_mask = contexts_mask[:, :, None] * questions_mask[:, None, :] affinity = affinity * 
affinity_mask - 1000.0 * (1 - affinity_mask) # soft-aligns every position in the context to positions in the question d2q_att_weights = self._softmax.apply(affinity, extra_ndim=1) application_call.add_auxiliary_variable(d2q_att_weights.copy(), name='d2q_att_weights') # soft-aligns every position in the question to positions in the document q2d_att_weights = self._softmax.apply(flip12(affinity), extra_ndim=1) application_call.add_auxiliary_variable(q2d_att_weights.copy(), name='q2d_att_weights') # question encoding "in the view of the document" question_enc_informed = tensor.batched_dot(q2d_att_weights, context_enc) question_enc_concatenated = tensor.concatenate( [question_enc, question_enc_informed], 2) # document encoding "in the view of the question" context_enc_informed = tensor.batched_dot(d2q_att_weights, question_enc_concatenated) if self._coattention: context_enc_concatenated = tensor.concatenate( [context_enc, context_enc_informed], 2) else: question_repr_repeated = tensor.repeat(question_enc[:, [-1], :], context_enc.shape[1], axis=1) context_enc_concatenated = tensor.concatenate( [context_enc, question_repr_repeated], 2) # note: forward and backward LSTMs share the # input weights in the current impl bidir_states = flip01( self._bidir.apply(self._bidir_fork.apply( flip01(context_enc_concatenated)), mask=contexts_mask.T)[0]) begin_readouts = self._begin_readout.apply(bidir_states)[:, :, 0] begin_readouts = begin_readouts * contexts_mask - 1000.0 * ( 1 - contexts_mask) begin_costs = self._softmax.categorical_cross_entropy( answer_begins, begin_readouts) end_readouts = self._end_readout.apply(bidir_states)[:, :, 0] end_readouts = end_readouts * contexts_mask - 1000.0 * (1 - contexts_mask) end_costs = self._softmax.categorical_cross_entropy( answer_ends, end_readouts) predicted_begins = begin_readouts.argmax(axis=-1) predicted_ends = end_readouts.argmax(axis=-1) exact_match = (tensor.eq(predicted_begins, answer_begins) * tensor.eq(predicted_ends, answer_ends)) application_call.add_auxiliary_variable(predicted_begins, name='predicted_begins') application_call.add_auxiliary_variable(predicted_ends, name='predicted_ends') application_call.add_auxiliary_variable(exact_match, name='exact_match') return begin_costs + end_costs def apply_with_default_vars(self): return self.apply(*self.input_vars.values())
# RECURRENT LAYERS rec_mask = conv_out_mask.dimshuffle(1, 0) rec_in = conv_out[:, :, :, 0].dimshuffle(2, 0, 1) rec_in_dim = conv_out_channels rb = [] for i, p in enumerate(recs): # RNN bricks if p["type"] == "lstm": pre_rec = Linear(input_dim=rec_in_dim, output_dim=4 * p["dim"], name="rnn_linear%d" % i) rec = LSTM(activation=Tanh(), dim=p["dim"], name="rnn%d" % i) rb = rb + [pre_rec, rec] rnn_in = pre_rec.apply(rec_in) rec_out, _ = rec.apply(inputs=rnn_in, mask=rec_mask) dropout_b = [rec] rec_out_dim = p["dim"] elif p["type"] == "simple": pre_rec = Linear(input_dim=rec_in_dim, output_dim=p["dim"], name="rnn_linear%d" % i) rec = SimpleRecurrent(activation=Tanh(), dim=p["dim"], name="rnn%d" % i) rb = rb + [pre_rec, rec] rnn_in = pre_rec.apply(rec_in) rec_out = rec.apply(inputs=rnn_in, mask=rec_mask) dropout_b = [rec] rec_out_dim = p["dim"] elif p["type"] == "blstm": pre_frec = Linear(input_dim=rec_in_dim, output_dim=4 * p["dim"], name="frnn_linear%d" % i) pre_brec = Linear(input_dim=rec_in_dim, output_dim=4 * p["dim"], name="brnn_linear%d" % i)
class LanguageModel(Initializable): """ This takes the word embeddings from LSTMCompositionalLayer and creates sentence embeddings using a LSTM compositional_layer_type can be: 1) 'BidirectionalLSTMCompositionalLayer' 2) 'UnidirectionalLSTMCompositionalLayer' 3) 'BaselineLSTMCompositionalLayer' Input is a 3d tensor with the dimensions of (num_words, num_subwords, batch_size) and a 3d tensor a mask of size (num_words, num_subwords, batch_size) All hidden state sizes are the same as the subword embedding size This returns a 3d tensor with dimensions of (num_words = num RNN states, batch_size, sentence embedding size = LM_RNN_hidden_state_size = subword_RNN_hidden_state_size * 2) """ def __init__(self, batch_size, num_subwords, num_words, subword_embedding_size, input_vocab_size, subword_RNN_hidden_state_size, LM_RNN_hidden_state_size, table_width=0.08, compositional_layer_type='BidirectionalLSTMCompositionalLayer', init_type='xavier', **kwargs): super(LanguageModel, self).__init__(**kwargs) self.batch_size = batch_size self.num_subwords = num_subwords # number of subwords which make up a word self.num_words = num_words # number of words in the sentence self.subword_embedding_size = subword_embedding_size self.input_vocab_size = input_vocab_size self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size #i.e. word embedding size self.LM_RNN_hidden_state_size = LM_RNN_hidden_state_size #i.e sentence embedding size self.table_width = table_width self.name = 'Language_Model' if init_type == 'xavier': linear_init = XavierInitializationOriginal(self.subword_RNN_hidden_state_size, self.LM_RNN_hidden_state_size) lstm_init = XavierInitializationOriginal(self.subword_RNN_hidden_state_size, self.LM_RNN_hidden_state_size) else: # default is gaussian linear_init = IsotropicGaussian() lstm_init = IsotropicGaussian() self.compositional_layer = None self.linear = None if compositional_layer_type == 'BidirectionalLSTMCompositionalLayer': self.compositional_layer = BidirectionalLSTMCompositionalLayer(self.batch_size, self.num_subwords, self.num_words, self.subword_embedding_size, self.input_vocab_size, self.subword_RNN_hidden_state_size, self.table_width, init_type=init_type, name='compositional_layer') if init_type == 'xavier': linear_init = XavierInitializationOriginal(self.subword_RNN_hidden_state_size * 2, self.LM_RNN_hidden_state_size) lstm_init = XavierInitializationOriginal(self.subword_RNN_hidden_state_size * 2, self.LM_RNN_hidden_state_size) else: # default is gaussian linear_init = IsotropicGaussian() lstm_init = IsotropicGaussian() self.linear = Linear(input_dim=self.subword_RNN_hidden_state_size * 2, # 2 * for the bidirectional output_dim=self.LM_RNN_hidden_state_size * 4, name='linear', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) elif compositional_layer_type == 'UnidirectionalLSTMCompositionalLayer': self.compositional_layer = LSTMCompositionalLayer(self.batch_size, self.num_subwords, self.num_words, self.subword_embedding_size, self.input_vocab_size, self.subword_RNN_hidden_state_size, self.table_width, init_type=init_type, name='compositional_layer') self.linear = Linear(input_dim=self.subword_RNN_hidden_state_size, output_dim=self.LM_RNN_hidden_state_size * 4, name='linear', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) elif compositional_layer_type == 'BaselineLSTMCompositionalLayer': self.compositional_layer = BaselineLSTMCompositionalLayer(self.batch_size, self.num_subwords, self.num_words, self.subword_embedding_size, self.input_vocab_size, 
self.subword_RNN_hidden_state_size, self.table_width, init_type=init_type, name='compositional_layer') self.linear = Linear(input_dim=self.subword_RNN_hidden_state_size, output_dim=self.LM_RNN_hidden_state_size * 4, name='linear', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) else: print('ERROR: compositional_layer_type = ' + compositional_layer_type + ' is invalid') sys.exit() # has one RNN which reads the word embeddings into a sentence embedding, or partial sentence embeddings self.language_model_RNN = LSTM( dim=self.LM_RNN_hidden_state_size, activation=Identity(), name='language_model_RNN', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) self.children = [self.compositional_layer, self.linear, self.language_model_RNN] @application(inputs=['subword_id_input_', 'subword_id_input_mask_'], outputs=['sentence_embeddings', 'word_embeddings_mask']) def apply(self, subword_id_input_, subword_id_input_mask_): """ subword_id_input_ is a 3d tensor with the dimensions of shape = (num_words, num_subwords, batch_size). It is expected as a dtype=uint16 or equivalent subword_id_input_mask_ is a 3d tensor with the dimensions of shape = (num_words, num_subwords, batch_size). It is expected as a dtype=uint8 or equivalent and has binary values of 1 when there is data and zero otherwise. Returned is a 3d tensor of size (num_words = num RNN states, batch_size, sentence embedding size) Also returned is a 1d tensor of size (batch_size) describing if the sentence is valid of empty in the batch """ word_embeddings, word_embeddings_mask = self.compositional_layer.apply(subword_id_input_, subword_id_input_mask_) sentence_embeddings = self.language_model_RNN.apply( self.linear.apply(word_embeddings), mask=word_embeddings_mask)[0] #[0] = hidden states, [1] = cells # sentence_embeddings_mask = word_embeddings_mask.max(axis=0).T return sentence_embeddings, word_embeddings_mask
def construct_model(input_dim, out_dim): # Construct the model r = tensor.fmatrix('r') x = tensor.fmatrix('x') y = tensor.ivector('y') nx = x.shape[0] nj = x.shape[1] # also is r.shape[0] nr = r.shape[1] # r is nj x nr # x is nx x nj # y is nx # r_rep is nx x nj x nr r_rep = r[None, :, :].repeat(axis=0, repeats=nx) # x3 is nx x nj x 1 x3 = x[:, :, None] # concat is nx x nj x (nr + 1) concat = tensor.concatenate([r_rep, x3], axis=2) # Change concat from Batch x Time x Features to T X B x F mlp_input = concat.dimshuffle(1, 0, 2) if use_ensembling: # Split time dimension into batches of size num_feats # Join that dimension with the B dimension ens_shape = (num_feats, mlp_input.shape[0]/num_feats, mlp_input.shape[1]) mlp_input = mlp_input.reshape(ens_shape + (input_dim+1,)) mlp_input = mlp_input.reshape((ens_shape[0], ens_shape[1] * ens_shape[2], input_dim+1)) mlp = MLP(dims=[input_dim+1] + mlp_hidden_dims, activations=[activation_function for _ in mlp_hidden_dims], name='mlp') lstm_bot_linear = Linear(input_dim=mlp_hidden_dims[-1], output_dim=4 * lstm_hidden_dim, name="lstm_input_linear") lstm = LSTM(dim=lstm_hidden_dim, activation=activation_function, name="hidden_recurrent") lstm_top_linear = Linear(input_dim=lstm_hidden_dim, output_dim=out_dim, name="out_linear") rnn_input = mlp.apply(mlp_input) pre_rnn = lstm_bot_linear.apply(rnn_input) states = lstm.apply(pre_rnn)[0] activations = lstm_top_linear.apply(states) if use_ensembling: activations = activations.reshape(ens_shape + (out_dim,)) # Unsplit batches (ensembling) activations = tensor.mean(activations, axis=1) # Mean over time activations = tensor.mean(activations, axis=0) cost = Softmax().categorical_cross_entropy(y, activations) pred = activations.argmax(axis=1) error_rate = tensor.neq(y, pred).mean() # Initialize parameters for brick in (mlp, lstm_bot_linear, lstm, lstm_top_linear): brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.) brick.initialize() # apply noise cg = ComputationGraph([cost, error_rate]) noise_vars = VariableFilter(roles=[WEIGHT])(cg) apply_noise(cg, noise_vars, noise_std) apply_dropout(cg, [rnn_input], dropout) [cost_reg, error_rate_reg] = cg.outputs return cost_reg, error_rate_reg, cost, error_rate
iteration = 100  # number of epochs of gradient descent
lr = 0.2  # learning rate

print "Building Model"
# Symbolic variables
x = tensor.tensor3('x', dtype=floatX)
target = tensor.tensor3('target', dtype=floatX)

# Build the model
linear = Linear(input_dim=n_u, output_dim=4 * n_h, name="first_layer")
lstm = LSTM(dim=n_h, activation=Tanh())
linear2 = Linear(input_dim=n_h, output_dim=n_y, name="output_layer")
sigm = Sigmoid()

x_transform = linear.apply(x)
h = lstm.apply(x_transform)[0]
predict = sigm.apply(linear2.apply(h))

# only for generation B x h_dim
h_initial = tensor.tensor3('h_initial', dtype=floatX)
h_testing = lstm.apply(x_transform, states=h_initial, iterate=False)[0]
y_hat_testing = linear2.apply(h_testing)
y_hat_testing = sigm.apply(y_hat_testing)
y_hat_testing.name = 'y_hat_testing'

# Cost function
cost = SquaredError().apply(predict, target)

# Initialization
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') ans_indices = tensor.imatrix('ans_indices') # n_steps * n_samples ans_indices_mask = tensor.imatrix('ans_indices_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) ans_indices = ans_indices.dimshuffle(1, 0) ans_indices_mask = ans_indices_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='embed') embed.weights_init = IsotropicGaussian(0.01) # embed.weights_init = Constant(init_embedding_table(filename='embeddings/vocab_embeddings.txt')) # one directional LSTM encoding q_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4 * config.pre_lstm_size, name='q_lstm_in') q_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(), name='q_lstm') c_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4 * config.pre_lstm_size, name='c_lstm_in') c_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(), name='c_lstm') bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins] q_tmp = q_lstm_ins.apply(embed.apply(question)) c_tmp = c_lstm_ins.apply(embed.apply(context)) q_hidden, _ = q_lstm.apply(q_tmp, mask=question_mask.astype( theano.config.floatX)) # lq, bs, dim c_hidden, _ = c_lstm.apply(c_tmp, mask=context_mask.astype( theano.config.floatX)) # lc, bs, dim # Attention mechanism Bilinear question attention_question = Linear(input_dim=config.pre_lstm_size, output_dim=config.pre_lstm_size, name='att_question') bricks += [attention_question] att_weights_question = q_hidden[ None, :, :, :] * attention_question.apply( c_hidden.reshape( (c_hidden.shape[0] * c_hidden.shape[1], c_hidden.shape[2]))).reshape( (c_hidden.shape[0], c_hidden.shape[1], c_hidden.shape[2]))[:, None, :, :] # --> lc,lq,bs,dim att_weights_question = att_weights_question.sum( axis=3) # sum over axis 3 -> dimensions --> lc,lq,bs att_weights_question = att_weights_question.dimshuffle( 0, 2, 1) # --> lc,bs,lq att_weights_question = att_weights_question.reshape( (att_weights_question.shape[0] * att_weights_question.shape[1], att_weights_question.shape[2])) # --> lc*bs,lq att_weights_question = tensor.nnet.softmax( att_weights_question ) # softmax over axis 1 -> length of question # --> lc*bs,lq att_weights_question = att_weights_question.reshape( (c_hidden.shape[0], q_hidden.shape[1], q_hidden.shape[0])) # --> lc,bs,lq att_weights_question = att_weights_question.dimshuffle( 0, 2, 1) # --> lc,lq,bs question_context_attention = att_weights_question.dimshuffle(2, 1, 0) question_context_attention.name = "question_context_attention" self.analyse_vars = [question_context_attention] attended_question = tensor.sum( q_hidden[None, :, :, :] * att_weights_question[:, :, :, None], axis=1) # sum over axis 1 -> length of question --> lc,bs,dim attended_question.name = 'attended_question' # Match LSTM cqembed = tensor.concatenate([c_hidden, attended_question], axis=2) mlstms, mhidden_list = make_bidir_lstm_stack( cqembed, 2 * config.pre_lstm_size, context_mask.astype(theano.config.floatX), config.match_lstm_size, config.match_skip_connections, 'match') bricks = bricks + mlstms if config.match_skip_connections: 
menc_dim = 2 * sum(config.match_lstm_size) menc = tensor.concatenate(mhidden_list, axis=2) else: menc_dim = 2 * config.match_lstm_size[-1] menc = tensor.concatenate(mhidden_list[-2:], axis=2) menc.name = 'menc' #pointer networks decoder LSTM and Attention parameters params = init_params(data_dim=config.decoder_data_dim, lstm_dim=config.decoder_lstm_output_dim) tparams = init_tparams(params) self.theano_params = [] add_role(tparams['lstm_de_W'], WEIGHT) add_role(tparams['lstm_de_U'], WEIGHT) add_role(tparams['lstm_de_b'], BIAS) add_role(tparams['ptr_b1'], BIAS) add_role(tparams['ptr_b2'], BIAS) add_role(tparams['ptr_v'], WEIGHT) add_role(tparams['ptr_W1'], WEIGHT) add_role(tparams['ptr_W2'], WEIGHT) self.theano_params = tparams.values() #n_steps = length , n_samples = batch_size n_steps = ans_indices.shape[0] n_samples = ans_indices.shape[1] preds, generations = ptr_network( tparams, cqembed, context_mask.astype(theano.config.floatX), ans_indices, ans_indices_mask.astype(theano.config.floatX), config.decoder_lstm_output_dim, menc) self.generations = generations idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'), tensor.ones((n_samples, ), dtype='int64')) idx_samples = tensor.outer(tensor.ones((n_steps, ), dtype='int64'), tensor.arange(n_samples, dtype='int64')) probs = preds[idx_steps, ans_indices, idx_samples] # probs *= y_mask off = 1e-8 if probs.dtype == 'float16': off = 1e-6 # probs += (1 - y_mask) # change unmasked position to 1, since log(1) = 0 probs += off # probs_printed = theano.printing.Print('this is probs')(probs) cost = -tensor.log(probs) cost *= ans_indices_mask cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0) cost = cost.mean() # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, mhidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' # self.predictions.name = 'pred' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # self.analyse_vars= [cost, self.predictions, att_weights_start, att_weights_end, att_weights, att_target] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
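The cost above gathers, for every decoding step and batch element, the probability assigned to the target position via advanced indexing with idx_steps / ans_indices / idx_samples, then takes a masked mean of the negative log (off is a small epsilon for numerical stability). A minimal numpy sketch of the same gather-and-mask pattern, with invented shapes and values:

import numpy as np

n_steps, n_positions, n_samples = 3, 5, 2
# preds: probability over positions, per step and per sample
preds = np.random.dirichlet(np.ones(n_positions), size=(n_steps, n_samples))
preds = preds.transpose(0, 2, 1)                     # (n_steps, n_positions, n_samples)
ans_indices = np.random.randint(0, n_positions, size=(n_steps, n_samples))
ans_mask = np.ones((n_steps, n_samples))
ans_mask[2, 1] = 0.                                  # pretend one answer is shorter

idx_steps = np.arange(n_steps)[:, None] * np.ones(n_samples, dtype=int)
idx_samples = np.ones((n_steps, 1), dtype=int) * np.arange(n_samples)
probs = preds[idx_steps, ans_indices, idx_samples]   # (n_steps, n_samples)

cost = -np.log(probs + 1e-8) * ans_mask              # masked negative log-likelihood
cost = (cost.sum(axis=0) / ans_mask.sum(axis=0)).mean()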
def main(max_seq_length, lstm_dim, batch_size, num_batches, num_epochs): dataset_train = IterableDataset( generate_data(max_seq_length, batch_size, num_batches)) dataset_test = IterableDataset( generate_data(max_seq_length, batch_size, 100)) stream_train = DataStream(dataset=dataset_train) stream_test = DataStream(dataset=dataset_test) x = T.tensor3('x') y = T.matrix('y') # we need to provide data for the LSTM layer of size 4 * lstm_dim, see # LSTM layer documentation for the explanation x_to_h = Linear(1, lstm_dim * 4, name='x_to_h', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) lstm = LSTM(lstm_dim, name='lstm', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) h_to_o = Linear(lstm_dim, 1, name='h_to_o', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) x_transform = x_to_h.apply(x) h, c = lstm.apply(x_transform) # only values of hidden units of the last timeframe are used for # the classification y_hat = h_to_o.apply(h[-1]) y_hat = Logistic().apply(y_hat) cost = BinaryCrossEntropy().apply(y, y_hat) cost.name = 'cost' lstm.initialize() x_to_h.initialize() h_to_o.initialize() cg = ComputationGraph(cost) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam()) test_monitor = DataStreamMonitoring(variables=[cost], data_stream=stream_test, prefix="test") train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train", after_epoch=True) main_loop = MainLoop(algorithm, stream_train, extensions=[ test_monitor, train_monitor, FinishAfter(after_n_epochs=num_epochs), Printing(), ProgressBar() ]) main_loop.run() print('Learned weights:') for layer in (x_to_h, lstm, h_to_o): print("Layer '%s':" % layer.name) for param in layer.parameters: print(param.name, ': ', param.get_value()) print() return main_loop
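The comment above notes that the Linear brick feeding the LSTM must have output_dim = 4 * lstm_dim: the Blocks LSTM expects the pre-activations of its four gates concatenated along the last axis. A minimal numpy sketch of one step under that layout (peephole connections and biases omitted for brevity; the slice order matches the LSTM unit test further below):

import numpy as np

def lstm_step(x4, h_prev, c_prev, W_state):
    """x4: (batch, 4*dim) input transform; W_state: (dim, 4*dim) recurrent weights."""
    dim = h_prev.shape[1]
    act = np.dot(h_prev, W_state) + x4
    sigmoid = lambda z: 1. / (1. + np.exp(-z))
    i = sigmoid(act[:, 0 * dim:1 * dim])                  # input gate
    f = sigmoid(act[:, 1 * dim:2 * dim])                  # forget gate
    c = f * c_prev + i * np.tanh(act[:, 2 * dim:3 * dim]) # cell candidate
    o = sigmoid(act[:, 3 * dim:4 * dim])                  # output gate
    return o * np.tanh(c), c

dim, batch = 3, 2
h = np.zeros((batch, dim)); c = np.zeros((batch, dim))
x4 = np.random.randn(batch, 4 * dim)
W_state = np.random.randn(dim, 4 * dim)
h, c = lstm_step(x4, h, c, W_state)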
def main(): x = T.tensor3('features') m = T.matrix('features_mask') y = T.imatrix('targets') #rnn = SimpleRecurrent( #dim = 50, #activation=Tanh(), #weights_init = Uniform(std=0.01), #biases_init = Constant(0.) #) #rnn = GatedRecurrent( #dim = 50, #activation=Tanh(), #weights_init = Uniform(std=0.01), #biases_init = Constant(0.) #) embedding_size = 300 #glove_version = "vectors.6B.100d.txt" glove_version = "glove.6B.300d.txt" #fork = Fork(weights_init=IsotropicGaussian(0.02), #biases_init=Constant(0.), #input_dim=embedding_size, #output_dims=[embedding_size]*3, #output_names=['inputs', 'reset_inputs', 'update_inputs'] #) rnn = LSTM( dim = embedding_size, activation=Tanh(), weights_init = IsotropicGaussian(std=0.02), ) rnn.initialize() #fork.initialize() wstd = 0.02 score_layer = Linear( input_dim = 128, output_dim = 1, weights_init = IsotropicGaussian(std=wstd), biases_init = Constant(0.), name="linear2") score_layer.initialize() gloveMapping = Linear( input_dim = embedding_size, output_dim = embedding_size, weights_init = IsotropicGaussian(std=wstd), biases_init = Constant(0.0), name="gloveMapping" ) gloveMapping.initialize() o = gloveMapping.apply(x) o = Rectifier(name="rectivfyglove").apply(o) forget_bias = np.zeros((embedding_size*4), dtype=theano.config.floatX) forget_bias[embedding_size:embedding_size*2] = 4.0 toLSTM = Linear( input_dim = embedding_size, output_dim = embedding_size*4, weights_init = IsotropicGaussian(std=wstd), biases_init = Constant(forget_bias), #biases_init = Constant(0.0), name="ToLSTM" ) toLSTM.initialize() rnn_states, rnn_cells = rnn.apply(toLSTM.apply(o) * T.shape_padright(m), mask=m) #inputs, reset_inputs, update_inputs = fork.apply(x) #rnn_states = rnn.apply(inputs=inputs, reset_inputs=reset_inputs, update_inputs=update_inputs, mask=m) #rnn_out = rnn_states[:, -1, :] rnn_out = (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1) / m.sum(axis=1).dimshuffle(0, 'x') #rnn_out = (rnn_states).mean(axis=1)# / m.sum(axis=1) hidden = Linear( input_dim = embedding_size, output_dim = 128, weights_init = Uniform(std=0.01), biases_init = Constant(0.)) hidden.initialize() o = hidden.apply(rnn_out) o = Rectifier().apply(o) hidden = Linear( input_dim = 128, output_dim = 128, weights_init = IsotropicGaussian(std=0.02), biases_init = Constant(0.), name="hiddenmap2") hidden.initialize() o = hidden.apply(o) o = Rectifier(name="rec2").apply(o) o = score_layer.apply(o) probs = Sigmoid().apply(o) cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean() cost.name = 'cost' misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean() misclassification.name = 'misclassification' #print (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1).shape.eval( #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX), #m : np.ones((45, 111), dtype=theano.config.floatX)}) #print (m).sum(axis=1).shape.eval({ #m : np.ones((45, 111), dtype=theano.config.floatX)}) #print (m).shape.eval({ #m : np.ones((45, 111), dtype=theano.config.floatX)}) #raw_input() # ================= cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost = cost, params=params, step_rule = CompositeRule([ StepClipping(threshold=10), AdaM(), #AdaDelta(), ]) ) # ======== print "setting up data" train_dataset = IMDBText('train') test_dataset = IMDBText('test') batch_size = 16 n_train = train_dataset.num_examples train_stream = DataStream( dataset=train_dataset, iteration_scheme=ShuffledScheme( examples=n_train, batch_size=batch_size) ) glove = GloveTransformer(glove_version, 
data_stream=train_stream) train_padded = Padding( data_stream=glove, mask_sources=('features',) #mask_sources=[] ) test_stream = DataStream( dataset=test_dataset, iteration_scheme=ShuffledScheme( examples=n_train, batch_size=batch_size) ) glove = GloveTransformer(glove_version, data_stream=test_stream) test_padded = Padding( data_stream=glove, mask_sources=('features',) #mask_sources=[] ) print "setting up model" #import ipdb #ipdb.set_trace() lstm_norm = rnn.W_state.norm(2) lstm_norm.name = "lstm_norm" pre_norm= gloveMapping.W.norm(2) pre_norm.name = "pre_norm" #====== model = Model(cost) extensions = [] extensions.append(EpochProgress(batch_per_epoch=train_dataset.num_examples // batch_size + 1)) extensions.append(TrainingDataMonitoring( [cost, misclassification, lstm_norm, pre_norm], prefix='train', after_epoch=True )) extensions.append(DataStreamMonitoring( [cost, misclassification], data_stream=test_padded, prefix='test', after_epoch=True )) extensions.append(Timing()) extensions.append(Printing()) extensions.append(Plot("norms", channels=[['train_lstm_norm', 'train_pre_norm']], after_epoch=True)) extensions.append(Plot("result", channels=[['train_cost', 'train_misclassification']], after_epoch=True)) main_loop = MainLoop( model=model, data_stream=train_padded, algorithm=algorithm, extensions=extensions) main_loop.run()
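rnn_out above is a masked mean-pooling over time: padded positions are zeroed by the mask before summing, and the sum is divided by the true sequence length. A numpy sketch of the same computation with invented shapes (the (batch, time, dim) axis order here is purely for illustration):

import numpy as np

batch, length, dim = 2, 4, 3
states = np.random.randn(batch, length, dim)
mask = np.array([[1, 1, 1, 0],
                 [1, 1, 0, 0]], dtype='float32')   # 1 = real token, 0 = padding

pooled = (states * mask[:, :, None]).sum(axis=1) / mask.sum(axis=1)[:, None]
assert pooled.shape == (batch, dim)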
def build_theano_functions(self): x = T.fmatrix('time_sequence') x = x.reshape((self.batch_dim, self.sequence_dim, self.time_dim)) y = x[:,1:self.sequence_dim,:] x = x[:,:self.sequence_dim-1,:] # if we try to include the spectrogram features spec_dims = 0 if self.image_size is not None : print "Convolution activated" self.init_conv() spec = T.ftensor4('spectrogram') spec_features, spec_dims = self.conv.build_conv_layers(spec) print "Conv final dims =", spec_dims spec_dims = np.prod(spec_dims) spec_features = spec_features.reshape( (self.batch_dim, self.sequence_dim-1, spec_dims)) x = T.concatenate([x, spec_features], axis=2) layers_input = [x] dims =np.array([self.time_dim + spec_dims]) for dim in self.lstm_layers_dim : dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)) : # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear(dims[layer], dims[layer+1]*4, weights_init=Orthogonal(self.orth_scale), biases_init=Constant(0), name="linear"+str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X sequence X time lstm = LSTM( dim=dims[layer+1], weights_init=IsotropicGaussian(mean=0.,std=0.5), biases_init=Constant(1), name="lstm"+str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value( self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # this is where Alex Graves' paper starts print "Last linear transform dim :", dims[1:].sum() output_transform = Linear(dims[1:].sum(), self.output_dim, weights_init=Orthogonal(self.orth_scale), use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1 : print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else : y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2)) # transforms to find each gmm params (mu, pi, sig) # small hack to softmax a 3D tensor pis = T.reshape( T.nnet.softmax( T.reshape(y_hat[:,:,:self.gmm_dim], ((self.sequence_dim-1)*self.batch_dim, self.gmm_dim))), (self.batch_dim, (self.sequence_dim-1), self.gmm_dim)) sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+1e-6 mus = y_hat[:,:,self.gmm_dim*2:] pis = pis[:,:,:,np.newaxis] mus = mus[:,:,:,np.newaxis] sig = sig[:,:,:,np.newaxis] y = y[:,:,np.newaxis,:] y = T.patternbroadcast(y, (False, False, True, False)) mus = T.patternbroadcast(mus, (False, False, False, True)) sig = T.patternbroadcast(sig, (False, False, False, True)) # sum likelihood with targets # see blog for this crazy Pr() = sum log sum prod # axes :: (batch, sequence, mixture, time) expo_term = -0.5*((y-mus)**2)/sig**2 coeff = T.log(T.maximum(1./(T.sqrt(2.*np.pi)*sig), EPS)) #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig)) sequences = coeff + expo_term log_sequences = T.log(pis + EPS) + T.sum(sequences, axis=3, keepdims=True) log_sequences_max = T.max(log_sequences, axis=2, keepdims=True) LL = -(log_sequences_max + T.log(EPS + T.sum(T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))).mean() LL.name = "summed_likelihood" model = Model(LL) self.model = model parameters = model.parameters algorithm = GradientDescent( cost=LL, parameters=model.parameters, step_rule=Adam()) f = theano.function([x],[pis, sig, mus]) return algorithm, f
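The likelihood above is the log of a diagonal Gaussian mixture computed with the log-sum-exp trick: the per-example maximum over mixture components is subtracted before exponentiating, so the sum stays numerically stable. A numpy sketch of the same negative log-likelihood with invented shapes (EPS plays the same role as in the code above):

import numpy as np

EPS = 1e-8
batch, seq, mixtures, time = 2, 5, 3, 4
y   = np.random.randn(batch, seq, 1, time)                 # targets
mus = np.random.randn(batch, seq, mixtures, 1)             # component means
sig = np.abs(np.random.randn(batch, seq, mixtures, 1)) + 1e-6
pis = np.random.dirichlet(np.ones(mixtures), size=(batch, seq))[:, :, :, None]

expo = -0.5 * ((y - mus) ** 2) / sig ** 2
coeff = np.log(np.maximum(1. / (np.sqrt(2. * np.pi) * sig), EPS))
log_comp = np.log(pis + EPS) + (coeff + expo).sum(axis=3, keepdims=True)
m = log_comp.max(axis=2, keepdims=True)                    # log-sum-exp shift
nll = -(m + np.log(EPS + np.exp(log_comp - m).sum(axis=2, keepdims=True))).mean()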
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') better = tensor.imatrix('better') better_mask = tensor.imatrix('better_mask') worse = tensor.imatrix('worse') worse_mask = tensor.imatrix('worse_mask') b_left = tensor.imatrix('b_left') b_left_mask = tensor.imatrix('b_left_mask') b_right = tensor.imatrix('b_right') b_right_mask = tensor.imatrix('b_right_mask') w_left = tensor.imatrix('w_left') w_left_mask = tensor.imatrix('w_left_mask') w_right = tensor.imatrix('w_right') w_right_mask = tensor.imatrix('w_right_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) better = better.dimshuffle(1, 0) better_mask = better_mask.dimshuffle(1, 0) worse = worse.dimshuffle(1, 0) worse_mask = worse_mask.dimshuffle(1, 0) b_left = b_left.dimshuffle(1, 0) b_left_mask = b_left_mask.dimshuffle(1, 0) b_right = b_right.dimshuffle(1, 0) b_right_mask = b_right_mask.dimshuffle(1, 0) w_left = w_left.dimshuffle(1, 0) w_left_mask = w_left_mask.dimshuffle(1, 0) w_right = w_right.dimshuffle(1, 0) w_right_mask = w_right_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2*sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1) else: qenc_dim = 2*config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # candidate encoders candidates_hidden_list = [] candidate_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='candidate_fwd_lstm_in_0_0') candidate_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='candidate_fwd_lstm_0') candidate_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='candidate_bwd_lstm_in_0_0') candidate_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='candidate_bwd_lstm_0') #adding encoding bricks for initialization bricks = bricks + [candidate_fwd_lstm, candidate_bwd_lstm, candidate_fwd_lstm_ins, candidate_bwd_lstm_ins] #computing better encoding better_embed = embed.apply(better) better_fwd_tmp = candidate_fwd_lstm_ins.apply(better_embed) better_bwd_tmp = candidate_bwd_lstm_ins.apply(better_embed) better_fwd_hidden, _ = candidate_fwd_lstm.apply(better_fwd_tmp, mask=better_mask.astype(theano.config.floatX)) better_bwd_hidden, _ = candidate_bwd_lstm.apply(better_bwd_tmp[::-1], mask=better_mask.astype(theano.config.floatX)[::-1]) better_hidden_list = [better_fwd_hidden, better_bwd_hidden] better_enc_dim = 2*sum(config.ctx_lstm_size) better_enc = tensor.concatenate([h[-1,:,:] for h in better_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size better_enc.name = 'better_enc' candidates_hidden_list = candidates_hidden_list + [better_fwd_hidden, better_bwd_hidden] #computing worse encoding worse_embed = embed.apply(worse) 
worse_fwd_tmp = candidate_fwd_lstm_ins.apply(worse_embed) worse_bwd_tmp = candidate_bwd_lstm_ins.apply(worse_embed) worse_fwd_hidden, _ = candidate_fwd_lstm.apply(worse_fwd_tmp, mask=worse_mask.astype(theano.config.floatX)) worse_bwd_hidden, _ = candidate_bwd_lstm.apply(worse_bwd_tmp[::-1], mask=worse_mask.astype(theano.config.floatX)[::-1]) worse_hidden_list = [worse_fwd_hidden, worse_bwd_hidden] worse_enc_dim = 2*sum(config.ctx_lstm_size) worse_enc = tensor.concatenate([h[-1,:,:] for h in worse_hidden_list], axis=1) worse_enc.name = 'worse_enc' candidates_hidden_list = candidates_hidden_list + [worse_fwd_hidden, worse_bwd_hidden] #left encoders left_context_hidden_list = [] left_context_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='left_context_fwd_lstm_in_0_0') left_context_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='left_context_fwd_lstm_0') left_context_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='left_context_bwd_lstm_in_0_0') left_context_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='left_context_bwd_lstm_0') #adding encoding bricks for initialization bricks = bricks + [left_context_fwd_lstm, left_context_bwd_lstm, left_context_fwd_lstm_ins, left_context_bwd_lstm_ins] #right encoders right_context_hidden_list = [] right_context_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='right_context_fwd_lstm_in_0_0') right_context_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='right_context_fwd_lstm_0') right_context_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='right_context_bwd_lstm_in_0_0') right_context_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='right_context_bwd_lstm_0') #adding encoding bricks for initialization bricks = bricks + [right_context_fwd_lstm, right_context_bwd_lstm, right_context_fwd_lstm_ins, right_context_bwd_lstm_ins] #left half encodings better_left_embed = embed.apply(b_left) better_left_fwd_tmp = left_context_fwd_lstm_ins.apply(better_left_embed) better_left_bwd_tmp = left_context_bwd_lstm_ins.apply(better_left_embed) better_left_fwd_hidden, _ = left_context_fwd_lstm.apply(better_left_fwd_tmp, mask=b_left_mask.astype(theano.config.floatX)) better_left_bwd_hidden, _ = left_context_bwd_lstm.apply(better_left_bwd_tmp[::-1], mask=b_left_mask.astype(theano.config.floatX)[::-1]) better_left_hidden_list = [better_left_fwd_hidden, better_left_bwd_hidden] better_left_enc_dim = 2*sum(config.ctx_lstm_size) better_left_enc = tensor.concatenate([h[-1,:,:] for h in better_left_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size better_left_enc.name = 'better_left_enc' left_context_hidden_list = left_context_hidden_list + [better_left_fwd_hidden, better_left_bwd_hidden] worse_left_embed = embed.apply(w_left) worse_left_fwd_tmp = left_context_fwd_lstm_ins.apply(worse_left_embed) worse_left_bwd_tmp = left_context_bwd_lstm_ins.apply(worse_left_embed) worse_left_fwd_hidden, _ = left_context_fwd_lstm.apply(worse_left_fwd_tmp, mask=w_left_mask.astype(theano.config.floatX)) worse_left_bwd_hidden, _ = left_context_bwd_lstm.apply(worse_left_bwd_tmp[::-1], mask=w_left_mask.astype(theano.config.floatX)[::-1]) worse_left_hidden_list = [worse_left_fwd_hidden, worse_left_bwd_hidden] worse_left_enc_dim = 2*sum(config.ctx_lstm_size) worse_left_enc = tensor.concatenate([h[-1,:,:] 
for h in worse_left_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size worse_left_enc.name = 'worse_left_enc' left_context_hidden_list = left_context_hidden_list + [worse_left_fwd_hidden, worse_left_bwd_hidden] #right half encoding better_right_embed = embed.apply(b_right) better_right_fwd_tmp = right_context_fwd_lstm_ins.apply(better_right_embed) better_right_bwd_tmp = right_context_bwd_lstm_ins.apply(better_right_embed) better_right_fwd_hidden, _ = right_context_fwd_lstm.apply(better_right_fwd_tmp, mask=b_right_mask.astype(theano.config.floatX)) better_right_bwd_hidden, _ = right_context_bwd_lstm.apply(better_right_bwd_tmp[::-1], mask=b_right_mask.astype(theano.config.floatX)[::-1]) better_right_hidden_list = [better_right_fwd_hidden, better_right_bwd_hidden] better_right_enc_dim = 2*sum(config.ctx_lstm_size) better_right_enc = tensor.concatenate([h[-1,:,:] for h in better_right_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size better_right_enc.name = 'better_right_enc' right_context_hidden_list = right_context_hidden_list + [better_right_fwd_hidden, better_right_bwd_hidden] worse_right_embed = embed.apply(w_right) worse_right_fwd_tmp = right_context_fwd_lstm_ins.apply(worse_right_embed) worse_right_bwd_tmp = right_context_bwd_lstm_ins.apply(worse_right_embed) worse_right_fwd_hidden, _ = right_context_fwd_lstm.apply(worse_right_fwd_tmp, mask=w_right_mask.astype(theano.config.floatX)) worse_right_bwd_hidden, _ = right_context_bwd_lstm.apply(worse_right_bwd_tmp[::-1], mask=w_right_mask.astype(theano.config.floatX)[::-1]) worse_right_hidden_list = [worse_right_fwd_hidden, worse_right_bwd_hidden] worse_right_enc_dim = 2*sum(config.ctx_lstm_size) worse_right_enc = tensor.concatenate([h[-1,:,:] for h in worse_right_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size worse_right_enc.name = 'worse_right_enc' right_context_hidden_list = right_context_hidden_list + [worse_right_fwd_hidden, worse_right_bwd_hidden] # F1 prediction MLP prediction_mlp = MLP(dims=config.prediction_mlp_hidden + [1], activations=config.prediction_mlp_activations[1:] + [Identity()], name='prediction_mlp') prediction_qlinear = Linear(input_dim=qenc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, name='preq') prediction_cand_linear = Linear(input_dim=worse_enc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, use_bias=False, name='precand') prediction_left_half_linear = Linear(input_dim=better_left_enc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, use_bias=False, name='preleft') prediction_right_half_linear = Linear(input_dim=better_right_enc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, use_bias=False, name='preright') bricks += [prediction_mlp, prediction_qlinear, prediction_cand_linear, prediction_left_half_linear, prediction_right_half_linear] better_layer1 = Tanh('tan1').apply(tensor.concatenate([prediction_cand_linear.apply(better_enc), prediction_qlinear.apply(qenc), prediction_left_half_linear.apply(better_left_enc), prediction_right_half_linear.apply(better_right_enc)],axis=1)) better_layer1.name = 'better_layer1' worse_layer1 = Tanh('tan2').apply(tensor.concatenate([prediction_cand_linear.apply(worse_enc), prediction_qlinear.apply(qenc), prediction_left_half_linear.apply(worse_left_enc), prediction_right_half_linear.apply(worse_right_enc)],axis=1)) worse_layer1.name = 'worse_layer1' better_pred_weights = Tanh('rec1').apply(prediction_mlp.apply(better_layer1)) #batch_size worse_pred_weights = 
Tanh('rec2').apply(prediction_mlp.apply(worse_layer1)) #batch_size # numpy.set_printoptions(edgeitems=500) # better_pred_weights = theano.printing.Print('better')(better_pred_weights) # worse_pred_weights = theano.printing.Print('better')(worse_pred_weights) # #cost : max(0,- score-better + score-worse + margin) margin = config.margin conditions = tensor.lt(better_pred_weights, worse_pred_weights + margin).astype(theano.config.floatX) self.predictions = conditions cost = (-better_pred_weights + worse_pred_weights + margin) * conditions cost = cost.mean() # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + candidates_hidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
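The training criterion above is a margin ranking loss, max(0, margin - score_better + score_worse), written with an explicit indicator (the tensor.lt mask) so that the same condition doubles as the prediction. A tiny numpy check that the two forms agree (scores and margin invented):

import numpy as np

margin = 0.5
s_better = np.array([2.0, 0.1, 1.0])
s_worse  = np.array([1.0, 0.3, 0.8])

violations = (s_better < s_worse + margin).astype('float32')
cost = ((-s_better + s_worse + margin) * violations).mean()
assert np.isclose(cost, np.maximum(0., margin - s_better + s_worse).mean())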
class Model(Initializable): @lazy() def __init__(self, config, **kwargs): super(Model, self).__init__(**kwargs) self.config = config self.pre_context_embedder = ContextEmbedder( config.pre_embedder, name='pre_context_embedder') self.post_context_embedder = ContextEmbedder( config.post_embedder, name='post_context_embedder') in1 = 2 + sum(x[2] for x in config.pre_embedder.dim_embeddings) self.input_to_rec = MLP(activations=[Tanh()], dims=[in1, config.hidden_state_dim], name='input_to_rec') self.rec = LSTM(dim=config.hidden_state_dim, name='recurrent') in2 = config.hidden_state_dim + sum( x[2] for x in config.post_embedder.dim_embeddings) self.rec_to_output = MLP(activations=[Tanh()], dims=[in2, 2], name='rec_to_output') self.sequences = ['latitude', 'latitude_mask', 'longitude'] self.context = self.pre_context_embedder.inputs + self.post_context_embedder.inputs self.inputs = self.sequences + self.context self.children = [ self.pre_context_embedder, self.post_context_embedder, self.input_to_rec, self.rec, self.rec_to_output ] self.initial_state_ = shared_floatx_zeros((config.hidden_state_dim, ), name="initial_state") self.initial_cells = shared_floatx_zeros((config.hidden_state_dim, ), name="initial_cells") def _push_initialization_config(self): for mlp in [self.input_to_rec, self.rec_to_output]: mlp.weights_init = self.config.weights_init mlp.biases_init = self.config.biases_init self.rec.weights_init = self.config.weights_init def get_dim(self, name): return self.rec.get_dim(name) @application def initial_state(self, *args, **kwargs): return self.rec.initial_state(*args, **kwargs) @recurrent(states=['states', 'cells'], outputs=['destination', 'states', 'cells'], sequences=['latitude', 'longitude', 'latitude_mask']) def predict_all(self, latitude, longitude, latitude_mask, **kwargs): latitude = (latitude - data.train_gps_mean[0]) / data.train_gps_std[0] longitude = (longitude - data.train_gps_mean[1]) / data.train_gps_std[1] pre_emb = tuple(self.pre_context_embedder.apply(**kwargs)) latitude = tensor.shape_padright(latitude) longitude = tensor.shape_padright(longitude) itr = self.input_to_rec.apply( tensor.concatenate(pre_emb + (latitude, longitude), axis=1)) itr = itr.repeat(4, axis=1) (next_states, next_cells) = self.rec.apply(itr, kwargs['states'], kwargs['cells'], mask=latitude_mask, iterate=False) post_emb = tuple(self.post_context_embedder.apply(**kwargs)) rto = self.rec_to_output.apply( tensor.concatenate(post_emb + (next_states, ), axis=1)) rto = (rto * data.train_gps_std) + data.train_gps_mean return (rto, next_states, next_cells) @predict_all.property('contexts') def predict_all_inputs(self): return self.context @application(outputs=['destination']) def predict(self, latitude, longitude, latitude_mask, **kwargs): latitude = latitude.T longitude = longitude.T latitude_mask = latitude_mask.T res = self.predict_all(latitude, longitude, latitude_mask, **kwargs)[0] return res[-1] @predict.property('inputs') def predict_inputs(self): return self.inputs @application(outputs=['cost_matrix']) def cost_matrix(self, latitude, longitude, latitude_mask, **kwargs): latitude = latitude.T longitude = longitude.T latitude_mask = latitude_mask.T res = self.predict_all(latitude, longitude, latitude_mask, **kwargs)[0] target = tensor.concatenate( (kwargs['destination_latitude'].dimshuffle('x', 0, 'x'), kwargs['destination_longitude'].dimshuffle('x', 0, 'x')), axis=2) target = target.repeat(latitude.shape[0], axis=0) ce = error.erdist(target.reshape((-1, 2)), res.reshape((-1, 2))) ce = 
ce.reshape(latitude.shape) return ce * latitude_mask @cost_matrix.property('inputs') def cost_matrix_inputs(self): return self.inputs + ['destination_latitude', 'destination_longitude'] @application(outputs=['cost']) def cost(self, latitude_mask, **kwargs): return self.cost_matrix(latitude_mask=latitude_mask, ** kwargs).sum() / latitude_mask.sum() @cost.property('inputs') def cost_inputs(self): return self.inputs + ['destination_latitude', 'destination_longitude'] @application(outputs=['cost']) def valid_cost(self, **kwargs): # Only works when batch_size is 1. return self.cost_matrix(**kwargs)[-1, 0] @valid_cost.property('inputs') def valid_cost_inputs(self): return self.inputs + ['destination_latitude', 'destination_longitude']
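A note on itr.repeat(4, axis=1) in predict_all above: the MLP emits hidden_state_dim features while the Blocks LSTM step expects an input of size 4 * hidden_state_dim (one slice per gate), so each feature is duplicated four times to match the expected width; the other snippets in this collection instead use a Linear with output_dim = 4 * dim for this step. A numpy illustration of the resulting shape:

import numpy as np

hidden_state_dim, batch = 3, 2
itr = np.random.randn(batch, hidden_state_dim)   # MLP output, (batch, dim)
lstm_input = itr.repeat(4, axis=1)               # each feature duplicated 4x
assert lstm_input.shape == (batch, 4 * hidden_state_dim)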
def lstm_layer(dim, h, n): linear = Linear(input_dim=dim, output_dim=dim * 4, name='linear' + str(n)) lstm = LSTM(dim=dim, name='lstm' + str(n)) initialize([linear, lstm]) return lstm.apply(linear.apply(h))[0]
def main(): x = T.imatrix('features') m = T.matrix('features_mask') y = T.imatrix('targets') #x_int = x.astype(dtype='int32').T x_int = x.T train_dataset = IMDB('train') n_voc = len(train_dataset.dict.keys()) n_h = 2 lookup = LookupTable( length=n_voc+2, dim = n_h*4, weights_init = Uniform(std=0.01), biases_init = Constant(0.) ) lookup.initialize() #rnn = SimpleRecurrent( #dim = n_h, #activation=Tanh(), #weights_init = Uniform(std=0.01), #biases_init = Constant(0.) #) rnn = LSTM( dim = n_h, activation=Tanh(), weights_init = Uniform(std=0.01), biases_init = Constant(0.) ) rnn.initialize() score_layer = Linear( input_dim = n_h, output_dim = 1, weights_init = Uniform(std=0.01), biases_init = Constant(0.)) score_layer.initialize() embedding = lookup.apply(x_int) * T.shape_padright(m.T) rnn_states, rnn_cells = rnn.apply(embedding, mask=m.T) rnn_out_mean_pooled = rnn_states[-1] #rnn_out_mean_pooled = rnn_states.mean() probs = Sigmoid().apply( score_layer.apply(rnn_out_mean_pooled)) cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean() cost.name = 'cost' misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean() misclassification.name = 'misclassification' # ================= cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost = cost, params=params, step_rule = CompositeRule([ StepClipping(threshold=10), Adam(), #AdaDelta(), ]) ) # ======== test_dataset = IMDB('test') batch_size = 64 n_train = train_dataset.num_examples train_stream = DataStream( dataset=train_dataset, iteration_scheme=ShuffledScheme( examples=n_train, batch_size=batch_size) ) train_padded = Padding( data_stream=train_stream, mask_sources=('features',) #mask_sources=[] ) test_stream = DataStream( dataset=test_dataset, iteration_scheme=ShuffledScheme( examples=test_dataset.num_examples, batch_size=batch_size) ) test_padded = Padding( data_stream=test_stream, mask_sources=('features',) #mask_sources=[] ) #import ipdb #ipdb.set_trace() #====== model = Model(cost) extensions = [] extensions.append(EpochProgress(batch_per_epoch=train_dataset.num_examples // batch_size + 1)) extensions.append(TrainingDataMonitoring( [cost, misclassification], prefix='train', after_epoch=True )) extensions.append(DataStreamMonitoring( [cost, misclassification], data_stream=test_padded, prefix='test', after_epoch=True )) extensions.append(Timing()) extensions.append(Printing()) main_loop = MainLoop( model=model, data_stream=train_padded, algorithm=algorithm, extensions=extensions) main_loop.run()
def build_theano_functions(self): x = T.ftensor3('x') # shape of input : batch X time X value y = T.ftensor4('y') layers_input = [x] dims = np.array([self.time_dim]) for dim in self.lstm_layers_dim: dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)): # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear( dims[layer], dims[layer + 1] * 4, weights_init=Orthogonal(self.orth_scale), #weights_init=IsotropicGaussian(mean=1.,std=1), biases_init=Constant(0), name="linear" + str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X time X value lstm = LSTM(dim=dims[layer + 1], weights_init=IsotropicGaussian(mean=0., std=0.5), biases_init=Constant(1), name="lstm" + str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value( self.orth_scale * Orthogonal().generate(np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # this is where Alex Graves' paper starts print "Last linear transform dim :", dims[1:].sum() output_transform = Linear( dims[1:].sum(), self.output_dim, weights_init=Orthogonal(self.orth_scale), #weights_init=IsotropicGaussian(mean=0., std=1), use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1: print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else: y_hat = output_transform.apply( T.concatenate(layers_input[1:], axis=2)) # transforms to find each gmm params (mu, pi, sig) # small hack to softmax a 3D tensor #pis = T.reshape( # T.nnet.softmax( # T.nnet.sigmoid( # T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim)))), # (self.batch_dim, self.time_dim, self.gmm_dim)) pis = T.reshape( T.nnet.softmax( T.reshape(y_hat[:, :, :self.gmm_dim], (self.sequence_dim * self.batch_dim, self.gmm_dim))), (self.batch_dim, self.sequence_dim, self.gmm_dim)) sig = T.exp(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 1e-6 #sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1 #mus = 2.*T.tanh(y_hat[:,:,self.gmm_dim*2:]) mus = y_hat[:, :, self.gmm_dim * 2:] pis = pis[:, :, :, np.newaxis] mus = mus[:, :, :, np.newaxis] sig = sig[:, :, :, np.newaxis] #y = y[:,:,np.newaxis,:] y = T.patternbroadcast(y, (False, False, True, False)) mus = T.patternbroadcast(mus, (False, False, False, True)) sig = T.patternbroadcast(sig, (False, False, False, True)) # sum likelihood with targets # see blog for this crazy Pr() = sum log sum prod # axes :: (batch, sequence, mixture, time) expo_term = -0.5 * ((y - mus)**2) / sig**2 coeff = T.log(T.maximum(1. / (T.sqrt(2. 
* np.pi) * sig), EPS)) #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig)) sequences = coeff + expo_term log_sequences = T.log(pis + EPS) + T.sum( sequences, axis=3, keepdims=True) log_sequences_max = T.max(log_sequences, axis=2, keepdims=True) LL = -(log_sequences_max + T.log(EPS + T.sum( T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True)) ).mean() model = Model(LL) self.model = model parameters = model.parameters grads = T.grad(LL, parameters) updates = [] lr = T.scalar('lr') for i in range(len(grads)): #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]])) updates.append( tuple([parameters[i], parameters[i] - lr * grads[i]])) #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)) if self.debug: gradf = theano.function([x, y, lr], [LL, pis, mus, sig], updates=updates) else: #gradf = theano.function([x, y, z],[zLL],updates=updates) gradf = theano.function([x, y, lr], [LL], updates=updates) f = theano.function([x], [pis, sig, mus]) return gradf, f
def main(num_epochs=100): x = tensor.matrix('features') m = tensor.matrix('features_mask') x_int = x.astype(dtype='int32').T train_dataset = TextFile('inspirational.txt') train_dataset.indexables[0] = numpy.array(sorted( train_dataset.indexables[0], key=len )) n_voc = len(train_dataset.dict.keys()) init_probs = numpy.array( [sum(filter(lambda idx:idx == w, [s[0] for s in train_dataset.indexables[ train_dataset.sources.index('features')]] )) for w in xrange(n_voc)], dtype=theano.config.floatX ) init_probs = init_probs / init_probs.sum() n_h = 100 linear_embedding = LookupTable( length=n_voc, dim=4*n_h, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) linear_embedding.initialize() rnn = LSTM( dim=n_h, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) rnn.initialize() score_layer = Linear( input_dim=n_h, output_dim=n_voc, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) score_layer.initialize() embedding = (linear_embedding.apply(x_int[:-1]) * tensor.shape_padright(m.T[1:])) rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:]) probs = softmax( sequence_map(score_layer.apply, rnn_out[0], mask=m.T[1:])[0] ) idx_mask = m.T[1:].nonzero() cost = CategoricalCrossEntropy().apply( x_int[1:][idx_mask[0], idx_mask[1]], probs[idx_mask[0], idx_mask[1]] ) cost.name = 'cost' misclassification = MisclassificationRate().apply( x_int[1:][idx_mask[0], idx_mask[1]], probs[idx_mask[0], idx_mask[1]] ) misclassification.name = 'misclassification' cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost=cost, params=params, step_rule=CompositeRule( [StepClipping(10.), Adam()]) ) train_data_stream = Padding( data_stream=DataStream( dataset=train_dataset, iteration_scheme=BatchwiseShuffledScheme( examples=train_dataset.num_examples, batch_size=10, ) ), mask_sources=('features',) ) model = Model(cost) extensions = [] extensions.append(Timing()) extensions.append(FinishAfter(after_n_epochs=num_epochs)) extensions.append(TrainingDataMonitoring( [cost, misclassification], prefix='train', after_epoch=True)) batch_size = 10 length = 30 trng = MRG_RandomStreams(18032015) u = trng.uniform(size=(length, batch_size, n_voc)) gumbel_noise = -tensor.log(-tensor.log(u)) init_samples = (tensor.log(init_probs).dimshuffle(('x', 0)) + gumbel_noise[0]).argmax(axis=-1) init_states = rnn.initial_state('states', batch_size) init_cells = rnn.initial_state('cells', batch_size) def sampling_step(g_noise, states, cells, samples_step): embedding_step = linear_embedding.apply(samples_step) next_states, next_cells = rnn.apply(inputs=embedding_step, states=states, cells=cells, iterate=False) probs_step = softmax(score_layer.apply(next_states)) next_samples = (tensor.log(probs_step) + g_noise).argmax(axis=-1) return next_states, next_cells, next_samples [_, _, samples], _ = theano.scan( fn=sampling_step, sequences=[gumbel_noise[1:]], outputs_info=[init_states, init_cells, init_samples] ) sampling = theano.function([], samples.owner.inputs[0].T) plotters = [] plotters.append(Plotter( channels=[['train_cost', 'train_misclassification']], titles=['Costs'])) extensions.append(PlotManager('Language modelling example', plotters=plotters, after_epoch=True, after_training=True)) extensions.append(Printing()) extensions.append(PrintSamples(sampler=sampling, voc=train_dataset.inv_dict)) main_loop = MainLoop(model=model, data_stream=train_data_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
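The sampler above uses the Gumbel-max trick: with u ~ Uniform(0, 1) and g = -log(-log(u)), argmax(log p + g) is an exact sample from the categorical distribution p, which is why the scan step adds gumbel_noise to the log-probabilities before taking argmax. A quick numpy check on a toy distribution (sizes invented):

import numpy as np

rng = np.random.RandomState(0)
p = np.array([0.2, 0.5, 0.3])
u = rng.uniform(size=(100000, 3))
gumbel = -np.log(-np.log(u))
samples = (np.log(p) + gumbel).argmax(axis=-1)
freqs = np.bincount(samples, minlength=3) / float(len(samples))
# freqs is close to p, up to sampling noise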
class Seq2Seq(Initializable): """ seq2seq model Parameters ---------- emb_dim: int The dimension of word embeddings (including for def model if standalone) dim : int The dimension of the RNNs states (including for def model if standalone) num_input_words : int The size of the LM's input vocabulary. num_output_words : int The size of the LM's output vocabulary. vocab The vocabulary object. """ def __init__(self, emb_dim, dim, num_input_words, num_output_words, vocab, proximity_coef=0, proximity_distance='l2', encoder='lstm', decoder='lstm', shared_rnn=False, translate_layer=None, word_dropout=0., tied_in_out=False, vocab_keys=None, seed=0, reconstruction_coef=1., provide_targets=False, **kwargs): """ translate_layer: either a string containing the activation function to use either a list containg the list of activations for a MLP """ if emb_dim == 0: emb_dim = dim if num_input_words == 0: num_input_words = vocab.size() if num_output_words == 0: num_output_words = vocab.size() self._word_dropout = word_dropout self._tied_in_out = tied_in_out if not encoder: if proximity_coef: raise ValueError("Err: meaningless penalty term (no encoder)") if not vocab_keys: raise ValueError("Err: specify a key vocabulary (no encoder)") if tied_in_out and num_input_words != num_output_words: raise ValueError("Can't tie in and out embeddings. Different " "vocabulary size") if shared_rnn and (encoder != 'lstm' or decoder != 'lstm'): raise ValueError( "can't share RNN because either encoder or decoder" "is not an RNN") if shared_rnn and decoder == 'lstm_c': raise ValueError( "can't share RNN because the decoder takes different" "inputs") if word_dropout < 0 or word_dropout > 1: raise ValueError("invalid value for word dropout", str(word_dropout)) if proximity_distance not in ['l1', 'l2', 'cos']: raise ValueError( "unrecognized distance: {}".format(proximity_distance)) if proximity_coef and emb_dim != dim and not translate_layer: raise ValueError( """if proximity penalisation, emb_dim should equal dim or there should be a translate layer""") if encoder not in [ None, 'lstm', 'bilstm', 'mean', 'weighted_mean', 'max_bilstm', 'bilstm_sum', 'max_bilstm_sum' ]: raise ValueError('encoder not recognized') if decoder not in ['skip-gram', 'lstm', 'lstm_c']: raise ValueError('decoder not recognized') self._proximity_distance = proximity_distance self._decoder = decoder self._encoder = encoder self._num_input_words = num_input_words self._num_output_words = num_output_words self._vocab = vocab self._proximity_coef = proximity_coef self._reconstruction_coef = reconstruction_coef self._provide_targets = provide_targets self._word_to_id = WordToIdOp(self._vocab) if vocab_keys: self._key_to_id = WordToIdOp(vocab_keys) children = [] if encoder or (not encoder and decoder in ['lstm', 'lstm_c']): self._main_lookup = LookupTable(self._num_input_words, emb_dim, name='main_lookup') children.append(self._main_lookup) if provide_targets: # this is useful to simulate Hill's baseline without pretrained embeddings # in the encoder, only as targets for the encoder. 
self._target_lookup = LookupTable(self._num_input_words, emb_dim, name='target_lookup') children.append(self._target_lookup) if not encoder: self._key_lookup = LookupTable(vocab_keys.size(), emb_dim, name='key_lookup') children.append(self._key_lookup) elif encoder == 'lstm': self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork') self._encoder_rnn = LSTM(dim, name='encoder_rnn') children.extend([self._encoder_fork, self._encoder_rnn]) elif encoder in ['bilstm', 'max_bilstm']: # dim is the dim of the concatenated vector self._encoder_fork = Linear(emb_dim, 2 * dim, name='encoder_fork') self._encoder_rnn = Bidirectional(LSTM(dim / 2, name='encoder_rnn')) children.extend([self._encoder_fork, self._encoder_rnn]) elif encoder in ['bilstm_sum', 'max_bilstm_sum']: self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork') self._encoder_rnn = BidirectionalSum(LSTM(dim, name='encoder_rnn')) children.extend([self._encoder_fork, self._encoder_rnn]) elif encoder == 'mean': pass elif encoder == 'weighted_mean': self._encoder_w = MLP([Logistic()], [dim, 1], name="encoder_weights") children.extend([self._encoder_w]) else: raise NotImplementedError() if decoder in ['lstm', 'lstm_c']: dim_after_translate = emb_dim if shared_rnn: self._decoder_fork = self._encoder_fork self._decoder_rnn = self._encoder_rnn else: if decoder == 'lstm_c': dim_2 = dim + emb_dim else: dim_2 = dim self._decoder_fork = Linear(dim_2, 4 * dim, name='decoder_fork') self._decoder_rnn = LSTM(dim, name='decoder_rnn') children.extend([self._decoder_fork, self._decoder_rnn]) elif decoder == 'skip-gram': dim_after_translate = emb_dim self._translate_layer = None activations = {'sigmoid': Logistic(), 'tanh': Tanh(), 'linear': None} if translate_layer: if type(translate_layer) == str: translate_layer = [translate_layer] assert (type(translate_layer) == list) activations_translate = [activations[a] for a in translate_layer] dims_translate = [ dim, ] * len(translate_layer) + [dim_after_translate] self._translate_layer = MLP(activations_translate, dims_translate, name="translate_layer") children.append(self._translate_layer) if not self._tied_in_out: self._pre_softmax = Linear(emb_dim, self._num_output_words) children.append(self._pre_softmax) if decoder in ['lstm', 'lstm_c']: self._softmax = NDimensionalSoftmax() elif decoder in ['skip-gram']: self._softmax = Softmax() children.append(self._softmax) super(Seq2Seq, self).__init__(children=children, **kwargs) def _allocate(self): pass def _initialize(self): pass def get_embeddings_entries(self): return self._vocab.words def set_def_embeddings(self, embeddings, lookup='main'): if lookup == 'main': self._main_lookup.parameters[0].set_value( embeddings.astype(floatX)) elif lookup == 'target': self._target_lookup.parameters[0].set_value( embeddings.astype(floatX)) else: raise ValueError('Requested embedding not understood') def get_def_embeddings_params(self, lookup='main'): if lookup == 'main': return self._main_lookup.parameters[0] elif lookup == 'key': return self._key_lookup.parameters[0] elif lookup == 'target': return self._target_lookup.parameters[0] else: raise ValueError('Requested embedding not understood') def add_perplexity_measure(self, application_call, minus_logs, mask, name): sum_ce = (minus_logs * mask).sum() perplexity = T.exp(sum_ce / mask.sum()) perplexity.tag.aggregation_scheme = Perplexity(sum_ce, mask.sum()) application_call.add_auxiliary_variable(perplexity, name=name) return sum_ce / mask.sum() @application def apply(self, application_call, words, mask, 
keys=None, n_identical_keys=None, train_phase=True): """Compute the log-likelihood for a batch of sequences. words An integer matrix of shape (B, T), where T is the number of time step, B is the batch size. Note that this order of the axis is different from what all RNN bricks consume, hence and the axis should be transposed at some point. mask A float32 matrix of shape (B, T). Zeros indicate the padding. keys An integer matrix of shape (B). It contains the words that are defined in the corresponding rows in words. """ if not keys and self._proximity_coef != 0: raise ValueError( "Err: should provide keys when using penalty term") if not self._encoder and not keys: raise ValueError("Err: should provide keys (no encoder)") word_ids = self._word_to_id(words) if keys: key_ids = self._word_to_id(keys) # dropout unk = self._vocab.unk if self._word_dropout > 0 and train_phase: dropout_mask = T.ones_like(word_ids, dtype=int) dropout_mask = get_dropout_mask(dropout_mask, self._word_dropout) # this gives a matrix of 0 (dropped word) and ones (kept words) # replace 0s by unk token and 1s by word ids word_ids_dropped = (T.eq(dropout_mask, 1) * word_ids + T.eq(dropout_mask, 0) * unk) word_ids_in = word_ids_dropped else: word_ids_in = word_ids # shortlisting # input_word_ids uses word dropout input_word_ids = ( T.lt(word_ids_in, self._num_input_words) * word_ids_in + T.ge(word_ids_in, self._num_input_words) * unk) output_word_ids = (T.lt(word_ids, self._num_output_words) * word_ids + T.ge(word_ids, self._num_output_words) * unk) if self._encoder or self._decoder != 'skip-gram': input_embeddings = self._main_lookup.apply(input_word_ids) # Encoder if self._encoder == 'lstm' or 'bilstm' in self._encoder: encoder_rnn_states = self._encoder_rnn.apply(T.transpose( self._encoder_fork.apply(input_embeddings), (1, 0, 2)), mask=mask.T)[0] if self._encoder in ['lstm', 'bilstm', 'bilstm_sum']: gen_embeddings = encoder_rnn_states[-1] elif self._encoder in ['max_bilstm', 'max_bilstm_sum']: mask_bc = T.addbroadcast(mask.dimshuffle(0, 1, 'x'), 2) # (bs,L,dim) gen_embeddings = (input_embeddings * mask_bc + (1 - mask_bc) * -10**8).max(axis=1) else: raise ValueError("encoder {} apply not specific".format( self._encoder)) elif self._encoder == 'mean': mask_bc = T.addbroadcast(mask.dimshuffle(0, 1, 'x'), 2) gen_embeddings = (input_embeddings * mask_bc).mean(axis=1) elif self._encoder == 'weighted_mean': mask_bc = T.addbroadcast(mask.dimshuffle(0, 1, 'x'), 2) weights = self._encoder_w.apply(input_embeddings) weights = T.addbroadcast(weights, 2) weights = weights * mask_bc gen_embeddings = (input_embeddings * weights).mean(axis=1) elif not self._encoder: gen_embeddings = self._key_lookup.apply(key_ids) else: raise NotImplementedError() # Optional translation layer if self._translate_layer: in_decoder = self._translate_layer.apply(gen_embeddings) else: in_decoder = gen_embeddings # (bs, dim) application_call.add_auxiliary_variable(in_decoder.copy(), name="embeddings") # Decoder if self._decoder in ['lstm', 'lstm_c']: if self._decoder == 'lstm_c': tiled_in_decoder = T.tile(in_decoder.dimshuffle(0, 'x', 1), (input_embeddings.shape[1], 1)) input_embeddings = T.concatenate( [input_embeddings, tiled_in_decoder], axis=2) decoded = self._decoder_rnn.apply( inputs=T.transpose(self._decoder_fork.apply(input_embeddings), (1, 0, 2)), mask=mask.T, states=in_decoder)[0] # size (L, bs, dim) n_dim_decoded = 3 elif self._decoder == 'skip-gram': decoded = in_decoder # size (bs, dim) n_dim_decoded = 2 else: raise NotImplementedError() # we 
ignore the <bos> token targets = output_word_ids.T[1:] # (L-1, bs) targets_mask = mask.T[1:] # (L-1,bs) # Compute log probabilities if n_dim_decoded == 2: # Case where we have only one distrib for all timesteps: skip-gram if self._tied_in_out: W_out = self.get_def_embeddings_params().transpose( ) # (dim, V) logits = T.dot(decoded, W_out) # (bs, dim) x (dim,V) = (bs,V) else: logits = self._pre_softmax.apply(decoded) # (bs, V) size_batch, length_sentence = output_word_ids.shape normalized_logits = self._softmax.log_probabilities( logits) # (bs, V) indices = (targets.T + T.addbroadcast( (T.arange(size_batch) * logits.shape[1]).dimshuffle( 0, 'x'), 1)).flatten() # (bs*L) minus_logs = -normalized_logits.flatten()[indices].reshape( (size_batch, length_sentence - 1)).T # (L-1, bs) elif n_dim_decoded == 3: # Case where decoding is time dependent: recurrent decoders if self._tied_in_out: raise NotImplementedError() # TODO: implement... seems annoying because we need to replace # in the already implemented block code else: logits = self._pre_softmax.apply(decoded[:-1]) # (L-1, bs, V) minus_logs = self._softmax.categorical_cross_entropy( targets, logits, extra_ndim=1) avg_CE = self.add_perplexity_measure(application_call, minus_logs, targets_mask, "perplexity") costs = self._reconstruction_coef * avg_CE if self._proximity_coef > 0: if not self._encoder: key_ids = self._key_to_id(keys) else: key_ids = self._word_to_id(keys) # shortlist: if we don't use all the input embeddings, we need to shortlist # so that there isn't a key error key_ids = (T.lt(key_ids, self._num_input_words) * key_ids + T.ge(key_ids, self._num_input_words) * unk) if self._provide_targets: key_embeddings = self._target_lookup.apply( key_ids) #(bs, emb_dim) else: key_embeddings = self._main_lookup.apply( key_ids) #(bs, emb_dim) # don't penalize on UNK: mask = T.neq(key_ids, unk) * T.lt(key_ids, self._num_input_words) # average over dimension, and then manual averaging using the mask eps = T.constant(10**-6) if self._proximity_distance in ['l1', 'l2']: if self._proximity_distance == 'l1': diff_embeddings = T.abs_(key_embeddings - in_decoder) else: diff_embeddings = (key_embeddings - in_decoder)**2 mask = mask.reshape((-1, 1)) sum_proximity_term = T.sum( T.mean(diff_embeddings * mask, axis=1)) proximity_term = sum_proximity_term / (T.sum(mask) + eps) elif self._proximity_distance == 'cos': # numerator # TODO: debug mask = mask.reshape((-1, 1)) # (bs, 1) masked_keys = key_embeddings * mask masked_gen = in_decoder * mask dot_product_vector = T.sum(masked_keys * masked_gen, axis=1) #(bs) # denominator product_sqr_norms = T.sum((masked_keys)**2, axis=1) * T.sum( (masked_gen)**2, axis=1) denominator = T.sqrt(product_sqr_norms + eps) #(bs) proximity_term = -T.sum(dot_product_vector / denominator) / ( T.sum(mask) + eps) application_call.add_auxiliary_variable(proximity_term.copy(), name="proximity_term") costs = costs + self._proximity_coef * proximity_term return costs
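Word dropout in apply() above keeps each input id with probability 1 - word_dropout and replaces dropped positions with the unk id, exactly as in the T.eq(dropout_mask, ...) expression. A numpy sketch of that masking pattern, with invented ids and rate:

import numpy as np

rng = np.random.RandomState(0)
word_ids = np.array([[12, 7, 51, 3],
                     [ 4, 9,  2, 0]])
unk = 1
word_dropout = 0.25

keep = (rng.uniform(size=word_ids.shape) >= word_dropout).astype(int)
word_ids_in = keep * word_ids + (1 - keep) * unk   # dropped positions become unk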
class TestLSTM(unittest.TestCase): def setUp(self): self.lstm = LSTM(dim=3, weights_init=Constant(2), biases_init=Constant(0)) self.lstm.initialize() def test_one_step(self): h0 = tensor.matrix('h0') c0 = tensor.matrix('c0') x = tensor.matrix('x') h1, c1 = self.lstm.apply(x, h0, c0, iterate=False) next_h = theano.function(inputs=[x, h0, c0], outputs=[h1]) h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]], dtype=theano.config.floatX) c0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]], dtype=theano.config.floatX) x_val = 0.1 * numpy.array([range(12), range(12, 24)], dtype=theano.config.floatX) W_state_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX) W_cell_to_in = 2 * numpy.ones((3,), dtype=theano.config.floatX) W_cell_to_out = 2 * numpy.ones((3,), dtype=theano.config.floatX) W_cell_to_forget = 2 * numpy.ones((3,), dtype=theano.config.floatX) # omitting biases because they are zero activation = numpy.dot(h0_val, W_state_val) + x_val def sigmoid(x): return 1. / (1. + numpy.exp(-x)) i_t = sigmoid(activation[:, :3] + c0_val * W_cell_to_in) f_t = sigmoid(activation[:, 3:6] + c0_val * W_cell_to_forget) next_cells = f_t * c0_val + i_t * numpy.tanh(activation[:, 6:9]) o_t = sigmoid(activation[:, 9:12] + next_cells * W_cell_to_out) h1_val = o_t * numpy.tanh(next_cells) assert_allclose(h1_val, next_h(x_val, h0_val, c0_val)[0], rtol=1e-6) def test_many_steps(self): x = tensor.tensor3('x') mask = tensor.matrix('mask') h, c = self.lstm.apply(x, mask=mask, iterate=True) calc_h = theano.function(inputs=[x, mask], outputs=[h]) x_val = (0.1 * numpy.asarray( list(itertools.islice(itertools.permutations(range(12)), 0, 24)), dtype=theano.config.floatX)) x_val = numpy.ones((24, 4, 12), dtype=theano.config.floatX) * x_val[:, None, :] mask_val = numpy.ones((24, 4), dtype=theano.config.floatX) mask_val[12:24, 3] = 0 h_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX) c_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX) W_state_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX) W_cell_to_in = 2 * numpy.ones((3,), dtype=theano.config.floatX) W_cell_to_out = 2 * numpy.ones((3,), dtype=theano.config.floatX) W_cell_to_forget = 2 * numpy.ones((3,), dtype=theano.config.floatX) def sigmoid(x): return 1. / (1. + numpy.exp(-x)) for i in range(1, 25): activation = numpy.dot(h_val[i-1], W_state_val) + x_val[i-1] i_t = sigmoid(activation[:, :3] + c_val[i-1] * W_cell_to_in) f_t = sigmoid(activation[:, 3:6] + c_val[i-1] * W_cell_to_forget) c_val[i] = f_t * c_val[i-1] + i_t * numpy.tanh(activation[:, 6:9]) o_t = sigmoid(activation[:, 9:12] + c_val[i] * W_cell_to_out) h_val[i] = o_t * numpy.tanh(c_val[i]) h_val[i] = (mask_val[i - 1, :, None] * h_val[i] + (1 - mask_val[i - 1, :, None]) * h_val[i - 1]) c_val[i] = (mask_val[i - 1, :, None] * c_val[i] + (1 - mask_val[i - 1, :, None]) * c_val[i - 1]) h_val = h_val[1:] assert_allclose(h_val, calc_h(x_val, mask_val)[0], rtol=1e-04) # Also test that initial state is a parameter initial1, initial2 = VariableFilter(roles=[INITIAL_STATE])( ComputationGraph(h)) assert is_shared_variable(initial1) assert is_shared_variable(initial2) assert {initial1.name, initial2.name} == { 'initial_state', 'initial_cells'}
def __init__(self):
    inp = tensor.tensor3('input')
    inp = inp.dimshuffle(1, 0, 2)
    target = tensor.matrix('target')
    target = target.reshape((target.shape[0],))
    product = tensor.lvector('product')
    missing = tensor.eq(inp, 0)
    train_input_mean = 1470614.1
    train_input_std = 3256577.0

    # Replace missing values by the mean of the neighbouring time steps
    trans_1 = tensor.concatenate(
        (inp[1:, :, :], tensor.zeros((1, inp.shape[1], inp.shape[2]))),
        axis=0)
    trans_2 = tensor.concatenate(
        (tensor.zeros((1, inp.shape[1], inp.shape[2])), inp[:-1, :, :]),
        axis=0)
    inp = tensor.switch(missing, (trans_1 + trans_2) / 2, inp)

    lookup = LookupTable(length=352, dim=4 * hidden_dim)
    product_embed = lookup.apply(product)

    salut = tensor.concatenate((inp, missing), axis=2)
    linear = Linear(input_dim=input_dim + 1, output_dim=4 * hidden_dim,
                    name="lstm_in")
    inter = linear.apply(salut)
    inter = inter + product_embed[None, :, :]

    lstm = LSTM(dim=hidden_dim, activation=activation_function, name="lstm")
    hidden, cells = lstm.apply(inter)

    linear2 = Linear(input_dim=hidden_dim, output_dim=out_dim,
                     name="output_linear")
    pred = linear2.apply(hidden[-1]) * train_input_std + train_input_mean
    pred = pred.reshape((product.shape[0],))

    cost = tensor.mean(abs((pred - target) / target))

    # Initialize all bricks
    for brick in [linear, linear2, lstm, lookup]:
        brick.weights_init = IsotropicGaussian(0.1)
        brick.biases_init = Constant(0.)
        brick.initialize()

    # Apply noise and dropout
    cg = ComputationGraph([cost])
    if w_noise_std > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, w_noise_std)
    if i_dropout > 0:
        cg = apply_dropout(cg, [hidden], i_dropout)
    [cost_reg] = cg.outputs
    # adding a tiny constant makes cost_reg a distinct graph node from cost
    cost_reg += 1e-20

    if cost_reg is not cost:
        self.cost = cost
        self.cost_reg = cost_reg
        cost_reg.name = 'cost_reg'
        cost.name = 'cost'
        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost, cost_reg]]
    else:
        self.cost = cost
        cost.name = 'cost'
        self.sgd_cost = cost
        self.monitor_vars = [[cost]]

    self.pred = pred
    pred.name = 'pred'
class impatientLayer:
    # both visual and word features live in the joint space of dim: feature_dim
    # hidden_dim: dim of m
    # output_dim: final joint document-query representation dim
    def __init__(self, feature_dim, hidden_dim, output_dim):
        self.image_embed = Linear(input_dim=feature_dim,
                                  output_dim=hidden_dim,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  use_bias=False,
                                  name='image_embed')
        self.word_embed = Linear(input_dim=feature_dim,
                                 output_dim=hidden_dim,
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0),
                                 use_bias=False,
                                 name='word_embed')
        self.r_embed = Linear(input_dim=feature_dim,
                              output_dim=hidden_dim,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0),
                              use_bias=False,
                              name='r_embed')
        self.m_to_s = Linear(input_dim=hidden_dim,
                             output_dim=1,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='m_to_s')
        self.attention_dist = Softmax(name='attention_dist_softmax')
        self.r_to_r = Linear(input_dim=feature_dim,
                             output_dim=feature_dim,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='r_to_r')
        # self.r_to_g = Linear(input_dim=feature_dim,
        #                      output_dim=output_dim,
        #                      weights_init=IsotropicGaussian(0.01),
        #                      biases_init=Constant(0),
        #                      use_bias=False,
        #                      name='r_to_g')
        self.image_embed.initialize()
        self.word_embed.initialize()
        self.r_embed.initialize()
        self.m_to_s.initialize()
        self.r_to_r.initialize()
        # self.r_to_g.initialize()

        # the sequence to sequence LSTM
        self.seq = LSTM(output_dim,
                        name='rewatcher_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq_embed = Linear(feature_dim, output_dim * 4,
                                name='rewatcher_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)
        self.seq.initialize()
        self.seq_embed.initialize()

    # doc: row major, batch_size x doc_length x feature_dim
    # query: row major, batch_size x q x feature_dim
    # mask_: mask of query, batch_size (length of a sentence - 1)
    def apply(self, doc, query, mask_, batch_size):
        mask = mask_.flatten()
        # batch_size x doc_length x hidden_dim
        att1 = self.image_embed.apply(doc)

        # y_q_i: the ith token of the question, batch_size x feature_dim
        # r_1: r_{m-1}, batch_size x feature_dim
        # y_d: document, batch_size x doc_length x feature_dim
        # y_d_m: d-to-m, batch_size x doc_length x hidden_dim
        def one_step(y_q_i, r_1, y_d, y_d_m):
            # batch_size x hidden_dim
            att2 = self.r_embed.apply(r_1)
            # batch_size x hidden_dim
            att3 = self.word_embed.apply(y_q_i)
            # batch_size x doc_length x hidden_dim
            att = y_d_m + att2.dimshuffle(0, 'x', 1) + att3.dimshuffle(0, 'x', 1)
            m = T.tanh(att)
            # batch_size x doc_length x 1
            s = self.m_to_s.apply(m)
            # batch_size x doc_length
            s = s.reshape((s.shape[0], s.shape[1]))
            s = self.attention_dist.apply(s)
            y_d_s = y_d.swapaxes(1, 2)
            # return batch_size x feature_dim
            return T.batched_dot(y_d_s, s) + T.tanh(self.r_to_r.apply(r_1))

        # query: batch_size x q x feature_dim
        # r: q x batch_size x feature_dim
        r, updates = theano.scan(fn=one_step,
                                 sequences=[query.swapaxes(0, 1)],
                                 outputs_info=T.zeros_like(doc[:, 0, :]),
                                 non_sequences=[doc, att1],
                                 n_steps=query.shape[1],
                                 name='impatient layer')
        # for the sequence encoder
        # q x batch_size x output_dim
        Wr = self.seq_embed.apply(r)
        # q x batch_size x output_dim
        seq_r, garbage = self.seq.apply(Wr)
        # r_q: batch_size x feature_dim, seq_r_q: batch_size x output_dim
        r_q = r[mask, T.arange(batch_size), :]
        seq_r_q = seq_r[mask, T.arange(batch_size), :]
        return r_q, seq_r_q
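# --- Hedged usage sketch for impatientLayer above; the sizes and the variable
# names (doc, query, q_mask) are assumptions, not taken from the original code.
feature_dim, hidden_dim, output_dim = 300, 256, 256
rewatcher = impatientLayer(feature_dim, hidden_dim, output_dim)

doc = T.tensor3('doc')        # batch_size x doc_length x feature_dim
query = T.tensor3('query')    # batch_size x q x feature_dim
q_mask = T.lvector('q_mask')  # batch_size; index of the last query token
batch_size = query.shape[0]

# r_q: attention-weighted document read at the last query token,
# seq_r_q: the LSTM encoding of the sequence of reads at that position.
r_q, seq_r_q = rewatcher.apply(doc, query, q_mask, batch_size)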
y_len = y_mask.sum(axis=0)

# inputt     : T x B
# input_mask : T x B
# y          : L x B
# y_mask     : L x B

# Linear bricks in
input_to_h = LookupTable(num_input_classes, h_dim, name='lookup')
h = input_to_h.apply(inputt)
# h : T x B x h_dim

# RNN bricks
pre_lstm = Linear(input_dim=h_dim, output_dim=4 * rec_dim, name='LSTM_linear')
lstm = LSTM(activation=Tanh(), dim=rec_dim, name="rnn")
rnn_out, _ = lstm.apply(pre_lstm.apply(h), mask=input_mask)

# Linear bricks out
rec_to_o = Linear(name='rec_to_o',
                  input_dim=rec_dim,
                  output_dim=num_output_classes + 1)
y_hat_pre = rec_to_o.apply(rnn_out)
# y_hat_pre : T x B x C+1

# y_hat : T x B x C+1
y_hat = tensor.nnet.softmax(
    y_hat_pre.reshape((-1, num_output_classes + 1))
).reshape((y_hat_pre.shape[0], y_hat_pre.shape[1], -1))
y_hat.name = 'y_hat'

y_hat_mask = input_mask
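# --- Hedged sketch: the fragment above never initializes its bricks. Following
# the convention used elsewhere in this collection (Gaussian weights, zero
# biases -- both values assumed, not taken from this snippet), one way is:
for brick in [input_to_h, pre_lstm, lstm, rec_to_o]:
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.)
    brick.initialize()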