def example(): """ Simple reccurent example. Taken from : https://github.com/mdda/pycon.sg-2015_deep-learning/blob/master/ipynb/blocks-recurrent-docs.ipynb """ x = tensor.tensor3('x') rnn = SimpleRecurrent(dim=3, activation=Identity(), weights_init=initialization.Identity()) rnn.initialize() h = rnn.apply(x) f = theano.function([x], h) print(f(np.ones((3, 1, 3), dtype=theano.config.floatX))) doubler = Linear( input_dim=3, output_dim=3, weights_init=initialization.Identity(2), biases_init=initialization.Constant(0)) doubler.initialize() h_doubler = rnn.apply(doubler.apply(x)) f = theano.function([x], h_doubler) print(f(np.ones((3, 1, 3), dtype=theano.config.floatX))) #Initial State h0 = tensor.matrix('h0') h = rnn.apply(inputs=x, states=h0) f = theano.function([x, h0], h) print(f(np.ones((3, 1, 3), dtype=theano.config.floatX), np.ones((1, 3), dtype=theano.config.floatX)))
def example2(): """GRU""" x = tensor.tensor3('x') dim = 3 fork = Fork(input_dim=dim, output_dims=[dim, dim * 2], name='fork', output_names=["linear", "gates"], weights_init=initialization.Identity(), biases_init=Constant(0)) gru = GatedRecurrent(dim=dim, weights_init=initialization.Identity(), biases_init=Constant(0)) fork.initialize() gru.initialize() linear, gate_inputs = fork.apply(x) h = gru.apply(linear, gate_inputs) f = theano.function([x], h) print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX))) doubler = Linear(input_dim=dim, output_dim=dim, weights_init=initialization.Identity(2), biases_init=initialization.Constant(0)) doubler.initialize() lin, gate = fork.apply(doubler.apply(x)) h_doubler = gru.apply(lin, gate) f = theano.function([x], h_doubler) print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
def __init__(self, input_size, hidden_size, output_size): self.input_size = input_size self.hidden_size = hidden_size self.output_size = output_size x = tensor.tensor3('x', dtype=floatX) y = tensor.tensor3('y', dtype=floatX) x_to_lstm = Linear(name="x_to_lstm", input_dim=input_size, output_dim=4 * hidden_size, weights_init=IsotropicGaussian(), biases_init=Constant(0)) lstm = LSTM(dim=hidden_size, name="lstm", weights_init=IsotropicGaussian(), biases_init=Constant(0)) lstm_to_output = Linear(name="lstm_to_output", input_dim=hidden_size, output_dim=output_size, weights_init=IsotropicGaussian(), biases_init=Constant(0)) x_transform = x_to_lstm.apply(x) h, c = lstm.apply(x_transform) y_hat = lstm_to_output.apply(h) y_hat = Logistic(name="y_hat").apply(y_hat) self.cost = BinaryCrossEntropy(name="cost").apply(y, y_hat) x_to_lstm.initialize() lstm.initialize() lstm_to_output.initialize() self.computation_graph = ComputationGraph(self.cost)
def main(max_seq_length, lstm_dim, batch_size, num_batches, num_epochs):
    dataset_train = IterableDataset(generate_data(max_seq_length, batch_size,
                                                  num_batches))
    dataset_test = IterableDataset(generate_data(max_seq_length, batch_size,
                                                 100))

    stream_train = DataStream(dataset=dataset_train)
    stream_test = DataStream(dataset=dataset_test)

    x = T.tensor3('x')
    y = T.matrix('y')

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(1, lstm_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(lstm_dim, name='lstm',
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))
    h_to_o = Linear(lstm_dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    h, c = lstm.apply(x_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    y_hat = Logistic().apply(y_hat)

    cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstm.initialize()
    x_to_h.initialize()
    h_to_o.initialize()

    cg = ComputationGraph(cost)

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Adam())
    test_monitor = DataStreamMonitoring(variables=[cost],
                                        data_stream=stream_test,
                                        prefix="test")
    train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train",
                                           after_epoch=True)

    main_loop = MainLoop(algorithm, stream_train,
                         extensions=[test_monitor, train_monitor,
                                     FinishAfter(after_n_epochs=num_epochs),
                                     Printing(), ProgressBar()])
    main_loop.run()

    print 'Learned weights:'
    for layer in (x_to_h, lstm, h_to_o):
        print "Layer '%s':" % layer.name
        for param in layer.parameters:
            print param.name, ': ', param.get_value()
        print
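The main() above calls a generate_data helper that is not included in this snippet. The following is a purely hypothetical stand-in (the task and array shapes are assumptions inferred from how x, y and IterableDataset are used): each entry along the first axis is one batch, x is (sequence_length, batch_size, 1) and y is (batch_size, 1).

from collections import OrderedDict
import numpy as np

def generate_data(max_seq_length, batch_size, num_batches):
    # Hypothetical task: label whether the mean of the sequence exceeds 0.5.
    x = np.random.uniform(
        size=(num_batches, max_seq_length, batch_size, 1)).astype('float32')
    y = (x.mean(axis=1) > 0.5).astype('float32')  # one label per sequence
    return OrderedDict([('x', x), ('y', y)])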
def example(): """ Simple reccurent example. Taken from : https://github.com/mdda/pycon.sg-2015_deep-learning/blob/master/ipynb/blocks-recurrent-docs.ipynb """ x = tensor.tensor3('x') rnn = SimpleRecurrent(dim=3, activation=Identity(), weights_init=initialization.Identity()) rnn.initialize() h = rnn.apply(x) f = theano.function([x], h) print(f(np.ones((3, 1, 3), dtype=theano.config.floatX))) doubler = Linear(input_dim=3, output_dim=3, weights_init=initialization.Identity(2), biases_init=initialization.Constant(0)) doubler.initialize() h_doubler = rnn.apply(doubler.apply(x)) f = theano.function([x], h_doubler) print(f(np.ones((3, 1, 3), dtype=theano.config.floatX))) #Initial State h0 = tensor.matrix('h0') h = rnn.apply(inputs=x, states=h0) f = theano.function([x, h0], h) print( f(np.ones((3, 1, 3), dtype=theano.config.floatX), np.ones((1, 3), dtype=theano.config.floatX)))
class AttentionEUTHM(EUTHM): ''' EUTHM with user attention ''' def __init__(self, config, dataset, *args, **kwargs): super(EUTHM, self).__init__(config, dataset) def _get_doc_embed(self, *args, **kwargs): text_vec = self._get_text_vec() user_vec = self.user_embed.apply(self.user) text_vec = tensor.concatenate([ text_vec, user_vec[None, :, :][tensor.zeros( shape=(text_vec.shape[0], ), dtype='int32')] ], axis=2) return self._encode_text_vec(text_vec) def _build_bricks(self, *args, **kwargs): super(AttentionEUTHM, self)._build_bricks() self.mlstm_ins = Linear(input_dim=self.config.word_embed_dim + self.config.user_embed_dim, output_dim=4 * self.config.lstm_dim, name='mlstm_in') self.mlstm_ins.weights_init = IsotropicGaussian( std=numpy.sqrt(2) / numpy.sqrt(self.config.word_embed_dim + self.config.user_embed_dim + self.config.lstm_dim)) self.mlstm_ins.biases_init = Constant(0) self.mlstm_ins.initialize()
def lllistool(i, inp, func): if func == LSTM: NUMS[i+1] *= 4 sdim = DIMS[i] if func == SimpleRecurrent or func == LSTM: sdim = DIMS[i] + DIMS[i+1] l = Linear(input_dim=DIMS[i], output_dim=DIMS[i+1] * NUMS[i+1], weights_init=IsotropicGaussian(std=sdim**(-0.5)), biases_init=IsotropicGaussian(std=sdim**(-0.5)), name='Lin{}'.format(i)) l.initialize() if func == SimpleRecurrent: gong = func(dim=DIMS[i+1], activation=Rectifier(), weights_init=IsotropicGaussian(std=sdim**(-0.5))) gong.initialize() ret = gong.apply(l.apply(inp)) elif func == LSTM: gong = func(dim=DIMS[i+1], activation=Tanh(), weights_init=IsotropicGaussian(std=sdim**(-0.5))) gong.initialize() print(inp) ret, _ = gong.apply( l.apply(inp), T.zeros((inp.shape[1], DIMS[i+1])), T.zeros((inp.shape[1], DIMS[i+1])), ) elif func == SequenceGenerator: gong = func( readout=None, transition=SimpleRecurrent(dim=100, activation=Rectifier(), weights_init=IsotropicGaussian(std=0.1))) ret = None elif func == None: ret = l.apply(inp) else: gong = func() ret = gong.apply(l.apply(inp)) return ret
class embeddingLayer: def __init__(self, word_dim, visual_dim, joint_dim): self.word_embed = Linear(word_dim, joint_dim, name='word_to_joint', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) self.visual_embed = Linear(visual_dim, joint_dim, name='visual_to_joint', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) self.word_embed.initialize() self.visual_embed.initialize() # words: batch_size x q x word_dim # video: batch_size x video_length x visual_dim def apply(self, words, video, u1, u2): w = self.word_embed.apply(words) v = self.visual_embed.apply(video) w = T.tanh(w) v = T.tanh(v) u = T.concatenate([u1, u2], axis=1) u = self.word_embed.apply(u) return w, v, u def apply_sentence(self, words, u1, u2): w = self.word_embed.apply(words) w = T.tanh(w) u = T.concatenate([u1, u2], axis=1) u = self.word_embed.apply(u) return w, u
def example2(): """GRU""" x = tensor.tensor3('x') dim = 3 fork = Fork(input_dim=dim, output_dims=[dim, dim*2],name='fork',output_names=["linear","gates"], weights_init=initialization.Identity(),biases_init=Constant(0)) gru = GatedRecurrent(dim=dim, weights_init=initialization.Identity(),biases_init=Constant(0)) fork.initialize() gru.initialize() linear, gate_inputs = fork.apply(x) h = gru.apply(linear, gate_inputs) f = theano.function([x], h) print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX))) doubler = Linear( input_dim=dim, output_dim=dim, weights_init=initialization.Identity(2), biases_init=initialization.Constant(0)) doubler.initialize() lin, gate = fork.apply(doubler.apply(x)) h_doubler = gru.apply(lin,gate) f = theano.function([x], h_doubler) print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
def decoder_network(latent_sample, latent_dim=J):
    # bernoulli case
    hidden2 = get_typical_layer(latent_sample, latent_dim, 500, Logistic())

    hidden2_to_output = Linear(name="last", input_dim=500, output_dim=784)
    hidden2_to_output.weights_init = IsotropicGaussian(0.01)
    hidden2_to_output.biases_init = Constant(0)
    hidden2_to_output.initialize()

    return Logistic().apply(hidden2_to_output.apply(hidden2))
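decoder_network relies on a get_typical_layer helper that is not shown in this snippet; a plausible sketch of it (an assumption, modelled on how hidden2_to_output is built above) is a Linear brick followed by the supplied activation:

from blocks.bricks import Linear
from blocks.initialization import IsotropicGaussian, Constant

def get_typical_layer(input_, input_dim, output_dim, activation):
    # Hypothetical helper: Linear transform initialized like the layer above,
    # followed by the given activation brick.
    layer = Linear(name="typical_%d_%d" % (input_dim, output_dim),
                   input_dim=input_dim, output_dim=output_dim,
                   weights_init=IsotropicGaussian(0.01),
                   biases_init=Constant(0))
    layer.initialize()
    return activation.apply(layer.apply(input_))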
def create_model(self): input_dim = self.input_dim x = self.x y = self.y p = self.p mask = self.mask hidden_dim = self.hidden_dim embedding_dim = self.embedding_dim lookup = LookupTable(self.dict_size, embedding_dim, weights_init=IsotropicGaussian(0.001), name='LookupTable') x_to_h = Linear(embedding_dim, hidden_dim * 4, name='x_to_h', weights_init=IsotropicGaussian(0.001), biases_init=Constant(0.0)) lstm = LSTM(hidden_dim, name='lstm', weights_init=IsotropicGaussian(0.001), biases_init=Constant(0.0)) h_to_o = MLP([Logistic()], [hidden_dim, 1], weights_init=IsotropicGaussian(0.001), biases_init=Constant(0), name='h_to_o') lookup.initialize() x_to_h.initialize() lstm.initialize() h_to_o.initialize() embed = lookup.apply(x).reshape( (x.shape[0], x.shape[1], self.embedding_dim)) embed.name = "embed_vec" x_transform = x_to_h.apply(embed.transpose(1, 0, 2)) x_transform.name = "Transformed X" self.lookup = lookup self.x_to_h = x_to_h self.lstm = lstm self.h_to_o = h_to_o #if mask is None: h, c = lstm.apply(x_transform) #else: #h, c = lstm.apply(x_transform, mask=mask) h.name = "hidden_state" c.name = "cell state" # only values of hidden units of the last timeframe are used for # the classification indices = T.sum(mask, axis=0) - 1 rel_hid = h[indices, T.arange(h.shape[1])] out = self.h_to_o.apply(rel_hid) probs = out return probs
class MyRecurrent(Brick): def __init__(self, recurrent, dims, activations=[Identity(), Identity()], **kwargs): super(MyRecurrent, self).__init__(**kwargs) self.dims = dims self.recurrent = recurrent self.activations = activations if isinstance(self.recurrent, (SimpleRecurrent, SimpleRecurrentBatchNorm)): output_dim = dims[1] elif isinstance(self.recurrent, (LSTM, LSTMBatchNorm)): output_dim = 4 * dims[1] else: raise NotImplementedError self.input_trans = Linear(name='input_trans', input_dim=dims[0], output_dim=output_dim, weights_init=NormalizedInitialization(), biases_init=Constant(0)) self.output_trans = Linear(name='output_trans', input_dim=dims[1], output_dim=dims[2], weights_init=NormalizedInitialization(), biases_init=Constant(0)) self.children = ( [self.input_trans, self.recurrent, self.output_trans] + self.activations) def _initialize(self): self.input_trans.initialize() self.output_trans.initialize() #self.recurrent.initialize() @application def apply(self, input_, input_mask=None, *args, **kwargs): input_recurrent = self.input_trans.apply(input_) try: input_recurrent = self.activations[0].apply(input_recurrent, input_mask=input_mask) except TypeError: input_recurrent = self.activations[0].apply(input_recurrent) output_recurrent = self.recurrent.apply(inputs=input_recurrent, mask=input_mask) if isinstance(self.recurrent, (LSTM, LSTMBatchNorm)): output_recurrent = output_recurrent[0] output = self.output_trans.apply(output_recurrent) try: output = self.activations[1].apply(output, input_mask=input_mask) except TypeError: output = self.activations[1].apply(output) return output
class questionEncoder:
    def __init__(self, word_dim, hidden_dim):
        self.forward_lstm = LSTM(hidden_dim,
                                 name='question_forward_lstm',
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0))
        self.backward_lstm = LSTM(hidden_dim,
                                  name='question_backward_lstm',
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0))
        self.x_to_h_forward = Linear(word_dim, hidden_dim * 4,
                                     name='word_x_to_h_forward',
                                     weights_init=IsotropicGaussian(0.01),
                                     biases_init=Constant(0))
        self.x_to_h_backward = Linear(word_dim, hidden_dim * 4,
                                      name='word_x_to_h_backward',
                                      weights_init=IsotropicGaussian(0.01),
                                      biases_init=Constant(0))

        self.forward_lstm.initialize()
        self.backward_lstm.initialize()
        self.x_to_h_forward.initialize()
        self.x_to_h_backward.initialize()

    # variable question length
    # words: batch_size x q x word_dim
    # words_reverse: the reversed sentence of words,
    #                padded with 0 to max length q
    # mask: batch_size
    def apply(self, words, words_reverse, mask_, batch_size):
        mask = mask_.flatten()
        # batch_size x q x hidden_dim
        Wx = self.x_to_h_forward.apply(words)
        Wx_r = self.x_to_h_backward.apply(words_reverse)
        # q x batch_size x hidden_dim
        Wx = Wx.swapaxes(0, 1)
        Wx_r = Wx_r.swapaxes(0, 1)
        # q x batch_size x hidden_dim
        hf, cf = self.forward_lstm.apply(Wx)
        hb, cb = self.backward_lstm.apply(Wx_r)
        for i in range(batch_size):
            # set_subtensor is not in-place: keep the returned variable,
            # otherwise the reversal of the backward states is silently lost.
            hb = T.set_subtensor(hb[0:mask[i] + 1, i, :],
                                 hb[0:mask[i] + 1, i, :][::-1])
        # q x batch_size x (2 x hidden_dim)
        h = T.concatenate([hf, hb], axis=2)
        # batch_size x hidden_dim
        y_q = hf[mask, range(batch_size), :]
        y_1 = hb[0, range(batch_size), :]
        return h.swapaxes(0, 1), y_q, y_1
def lllistool(i, inp, func):
    l = Linear(input_dim=DIMS[i], output_dim=DIMS[i+1] * NUMS[i+1],
               weights_init=IsotropicGaussian(std=DIMS[i]**(-0.5)),
               biases_init=IsotropicGaussian(std=DIMS[i]**(-0.5)),
               name='Lin{}'.format(i))
    l.initialize()
    func.name = 'Fun{}'.format(i)
    if func == SimpleRecurrent:
        gong = func(dim=DIMS[i+1], activation=Rectifier(),
                    weights_init=IsotropicGaussian(
                        std=(DIMS[i] + DIMS[i+1])**(-0.5)))
    else:
        gong = func()
    ret = gong.apply(l.apply(inp))
    return ret
def apply_layer(self, layer_type, input_, in_dim, out_dim, layer_name):
    # Since we pass this path twice (clean and corr encoder), we
    # want to make sure that parameters of both layers are shared.
    layer = self.shareds.get(layer_name)
    if layer is None:
        if layer_type == "fc":
            linear = Linear(use_bias=False, name=layer_name,
                            input_dim=in_dim, output_dim=out_dim, seed=1)
            linear.weights_init = Glorot(self.rng, in_dim, out_dim)
            linear.initialize()
            layer = linear
        self.shareds[layer_name] = layer
    return layer.apply(input_)
def example4(): """LSTM -> Plante lors de l'initialisation du lstm.""" x = tensor.tensor3('x') dim = 3 # gate_inputs = theano.function([x],x*4) gate_inputs = Linear(input_dim=dim, output_dim=dim * 4, name="linear", weights_init=initialization.Identity(), biases_init=Constant(2)) lstm = LSTM(dim=dim, activation=Tanh(), weights_init=IsotropicGaussian(), biases_init=Constant(0)) gate_inputs.initialize() hg = gate_inputs.apply(x) #print(gate_inputs.parameters) #print(gate_inputs.parameters[1].get_value()) lstm.initialize() h, cells = lstm.apply(hg) print(lstm.parameters) f = theano.function([x], h) print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX))) print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX))) print(f(4 * np.ones((dim, 1, dim), dtype=theano.config.floatX))) print("Good Job!") # lstm_output = #Initial State h0 = tensor.matrix('h0') c = tensor.matrix('cells') h, c1 = lstm.apply( inputs=x, states=h0, cells=c) # lstm.apply(states=h0,cells=cells,inputs=gate_inputs) f = theano.function([x, h0, c], h) print("a") print( f(np.ones((3, 1, 3), dtype=theano.config.floatX), np.ones((1, 3), dtype=theano.config.floatX), np.ones((1, 3), dtype=theano.config.floatX)))
def __init__(self, input1_size, input2_size, lookup1_dim=200, lookup2_dim=200, hidden_size=512): self.hidden_size = hidden_size self.input1_size = input1_size self.input2_size = input2_size self.lookup1_dim = lookup1_dim self.lookup2_dim = lookup2_dim x1 = tensor.lmatrix('durations') x2 = tensor.lmatrix('syllables') y = tensor.lmatrix('pitches') lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size, name='lookup1', weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) lookup1.initialize() lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size, name='lookup2', weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) lookup2.initialize() merge = Merge(['lookup1', 'lookup2'], [self.lookup1_dim, self.lookup2_dim], self.hidden_size, weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) merge.initialize() recurrent_block = LSTM(dim=self.hidden_size, activation=Tanh(), weights_init=initialization.Uniform(width=0.01)) #RecurrentStack([LSTM(dim=self.hidden_size, activation=Tanh())] * 3) recurrent_block.initialize() linear = Linear(input_dim=self.hidden_size, output_dim=self.input1_size, weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) linear.initialize() softmax = NDimensionalSoftmax() l1 = lookup1.apply(x1) l2 = lookup2.apply(x2) m = merge.apply(l1, l2) h = recurrent_block.apply(m) a = linear.apply(h) y_hat = softmax.apply(a, extra_ndim=1) # ValueError: x must be 1-d or 2-d tensor of floats. Got TensorType(float64, 3D) self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean() self.ComputationGraph = ComputationGraph(self.Cost) self.Model = Model(y_hat)
def test_linear():
    x = tensor.matrix()

    linear = Linear(input_dim=16, output_dim=8, weights_init=Constant(2),
                    biases_init=Constant(1))
    y = linear.apply(x)
    linear.initialize()
    x_val = numpy.ones((4, 16), dtype=theano.config.floatX)
    assert_allclose(
        y.eval({x: x_val}),
        x_val.dot(2 * numpy.ones((16, 8))) + numpy.ones((4, 8)))

    linear = Linear(input_dim=16, output_dim=8, weights_init=Constant(2),
                    use_bias=False)
    y = linear.apply(x)
    linear.initialize()
    x_val = numpy.ones((4, 16), dtype=theano.config.floatX)
    assert_allclose(y.eval({x: x_val}), x_val.dot(2 * numpy.ones((16, 8))))
def get_presoft(h, args):
    output_size = get_output_size(args.dataset)

    # If args.skip_connections: dim = args.layers * args.state_dim
    # else: dim = args.state_dim
    use_all_states = args.skip_connections or args.skip_output or (
        args.rnn_type in ["clockwork", "soft"])
    output_layer = Linear(
        input_dim=use_all_states * args.layers * args.state_dim +
        (1 - use_all_states) * args.state_dim,
        output_dim=output_size, name="output_layer")

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    presoft = output_layer.apply(h)
    if not has_indices(args.dataset):
        presoft = Tanh().apply(presoft)
    presoft.name = 'presoft'

    return presoft
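A worked example of the input_dim arithmetic above (added for illustration): since use_all_states behaves as 0 or 1 in the expression, the Linear input collapses to either layers * state_dim or state_dim.

layers, state_dim = 3, 100  # example values, not taken from args
for use_all_states in (True, False):
    input_dim = (use_all_states * layers * state_dim
                 + (1 - use_all_states) * state_dim)
    print(use_all_states, input_dim)  # True -> 300, False -> 100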
def apply_layer(self, layer_type, input_, in_dim, out_dim, layer_name): # Since we pass this path twice (clean and corr encoder),we # want to make sure that parameters of both layers are shared. layer = self.shareds.get(layer_name) if layer is not None: return layer else: if layer_type == 'fc': linear = Linear(use_bias=False, name=layer_name, input_dim=in_dim, output_dim=out_dim, seed=1) layer = linear.apply(input_) linear.weights_init = IsotropicGaussian(1.0 / np.sqrt(in_dim)) linear.initialize() self.shareds[layer_name] = layer return layer
def example4(): """LSTM -> Plante lors de l'initialisation du lstm.""" x = tensor.tensor3('x') dim=3 # gate_inputs = theano.function([x],x*4) gate_inputs = Linear(input_dim=dim,output_dim=dim*4, name="linear",weights_init=initialization.Identity(), biases_init=Constant(2)) lstm = LSTM(dim=dim,activation=Tanh(), weights_init=IsotropicGaussian(), biases_init=Constant(0)) gate_inputs.initialize() hg = gate_inputs.apply(x) #print(gate_inputs.parameters) #print(gate_inputs.parameters[1].get_value()) lstm.initialize() h, cells = lstm.apply(hg) print(lstm.parameters) f = theano.function([x], h) print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX))) print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX))) print(f(4*np.ones((dim, 1, dim), dtype=theano.config.floatX))) print("Good Job!") # lstm_output = #Initial State h0 = tensor.matrix('h0') c = tensor.matrix('cells') h,c1 = lstm.apply(inputs=x, states=h0, cells=c) # lstm.apply(states=h0,cells=cells,inputs=gate_inputs) f = theano.function([x, h0, c], h) print("a") print(f(np.ones((3, 1, 3), dtype=theano.config.floatX), np.ones((1, 3), dtype=theano.config.floatX), np.ones((1, 3), dtype=theano.config.floatX)))
class TextRNN(object): def __init__(self, dim_in, dim_hidden, dim_out, **kwargs): self.dim_in = dim_in self.dim_hidden = dim_hidden self.dim_out = dim_out self.input_layer = Linear(input_dim=self.dim_in, output_dim=self.dim_hidden, weights_init=initialization.IsotropicGaussian(), biases_init=initialization.Constant(0)) self.input_layer.initialize() sparse_init = initialization.Sparse(num_init=15, weights_init=initialization.IsotropicGaussian()) self.recurrent_layer = SimpleRecurrent( dim=self.dim_hidden, activation=Tanh(), name="first_recurrent_layer", weights_init=sparse_init, biases_init=initialization.Constant(0.01)) ''' self.recurrent_layer = LSTM(dim=self.dim_hidden, activation=Tanh(), weights_init=initialization.IsotropicGaussian(std=0.001), biases_init=initialization.Constant(0.01)) ''' self.recurrent_layer.initialize() self.output_layer = Linear(input_dim=self.dim_hidden, output_dim=self.dim_out, weights_init=initialization.Uniform(width=0.01), biases_init=initialization.Constant(0.01)) self.output_layer.initialize() self.children = [self.input_layer, self.recurrent_layer, self.output_layer] ''' @recurrent(sequences=['inputs'], states=['states'], contexts=[], outputs=['states', 'output']) ''' def run(self, inputs): output = self.output_layer.apply( self.recurrent_layer.apply(self.input_layer.apply(inputs)) ) return output
class seqDecoder: def __init__(self, feature_dim, memory_dim, fc1_dim, fc2_dim): self.W = Linear(input_dim=feature_dim, output_dim=memory_dim * 4, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), use_bias=False, name='seqDecoder_W') self.GRU_A = LSTM(feature_dim, name='seqDecoder_A', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) self.GRU_B = LSTM(memory_dim, name='seqDecoder_B', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) self.W.initialize() self.GRU_A.initialize() self.GRU_B.initialize() self.fc1 = Linear(input_dim=memory_dim, output_dim=fc1_dim, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name='fc1') self.fc2 = Linear(input_dim=fc1_dim, output_dim=fc2_dim, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name='fc2') self.fc1.initialize() self.fc2.initialize() # A: the encoding of GRU_A, # B: the encoding of GRU_B # padding: the tensor constant def apply(self, output_length, A, B, padding): A_, garbage = self.GRU_A.apply(padding, states=A) WA_ = self.W.apply(A_) # output_length x batch_size x output_dim B_, garbage = self.GRU_B.apply(WA_, states=B) # batch_size x output_length x output_dim B_ = B_.swapaxes(0,1) fc1_r = relu(self.fc1.apply(B_)) fc2_r = relu(self.fc2.apply(fc1_r)) return fc2_r
def __init__(self): srng = MRG_RandomStreams(seed=123) X = T.matrix('features') self.X = X #drop = Dropout(p_drop=0.5) #o = drop.apply(X) o = (X - 128) / 128.0 self.scaled = o #n_hidden = 64 n_hidden = 2048 * 2 n_zs = 1024 self.n_zs = n_zs self.n_hidden = n_hidden l = Linear(input_dim=32 * 32 * 3, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() o = l.apply(o) o = Rectifier().apply(o) l = Linear(input_dim=n_hidden, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() o = l.apply(o) o = Rectifier().apply(o) l = Linear(input_dim=n_hidden, output_dim=n_zs, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() mu_encoder = l.apply(o) l = Linear(input_dim=n_hidden, output_dim=n_zs, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() log_sigma_encoder = l.apply(o) eps = srng.normal(log_sigma_encoder.shape) z = eps * T.exp(log_sigma_encoder) + mu_encoder z_to_h1_decode = Linear(input_dim=n_zs, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) z_to_h1_decode.initialize() h1_decode_to_h_decode = Linear(input_dim=n_hidden, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) h1_decode_to_h_decode.initialize() h_decode_produce = Linear(input_dim=n_hidden, output_dim=32 * 32 * 3, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="linear4") h_decode_produce.initialize() #o = h_decode_produce.apply(h_decoder) h_decode_produce = Linear(input_dim=n_hidden, output_dim=32 * 32 * 3, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="linear4") h_decode_produce.initialize() #self.produced = Sigmoid().apply(o) seq = Sequence([ z_to_h1_decode.apply, Rectifier().apply, h1_decode_to_h_decode.apply, Rectifier().apply, h_decode_produce.apply, Sigmoid().apply ]) seq.initialize() self.produced = seq.apply(z) self.cost = T.mean(T.sqr(self.produced - self.scaled)) #self.cost = T.sum(T.nnet.binary_crossentropy(self.produced, self.scaled)) #T.sum(T.sqr(self.produced - self.scaled)) self.cost.name = "cost" self.variational_cost = - 0.5 * T.mean(1 + 2*log_sigma_encoder - mu_encoder * mu_encoder\ - T.exp(2 * log_sigma_encoder)) + self.cost self.variational_cost.name = "variational_cost" self.Z = T.matrix('z') self.sampled = seq.apply(self.Z) cg = ComputationGraph([self.variational_cost]) bricks = [ get_brick(var) for var in cg.variables + cg.scan_variables if get_brick(var) ] for i, b in enumerate(bricks): b.name = b.name + "_" + str(i)
## Let's initialize the variables lookup.allocate() #print("lookup.parameters=", lookup.parameters) # ('lookup.parameters=', [W]) #lookup.weights_init = FUNCTION #lookup.initialize() #lookup.params[0].set_value( np.random.normal( scale = 0.1, size=(vocab_size, embedding_dim) ).astype(np.float32) ) #lookup.params[0].set_value( embedding ) # See : https://github.com/mila-udem/blocks/blob/master/tests/bricks/test_lookup.py #lookup.W.set_value(numpy.arange(15).reshape(5, 3).astype(theano.config.floatX)) lookup.W.set_value( embedding.astype(theano.config.floatX) ) rnn.initialize() gather.initialize() ## Now for the application of these units # Define the shape of x specifically... :: the data has format (batch, features). x.tag.test_value = np.random.randint(vocab_size, size=batch_of_sentences ).astype(np.int32) x_extra.tag.test_value = np.zeros( (max_sentence_length, mini_batch_size, 1) ).astype(np.float32) x_mask.tag.test_value = np.random.choice( [0.0, 1.0], size=batch_of_sentences ).astype(np.float32) print("x shape", x.shape.tag.test_value) # array([29, 16])) word_embedding = lookup.apply(x) print("word_embedding shape", word_embedding.shape.tag.test_value) # array([ 29, 16, 100])) print("x_extra shape", x_extra.shape.tag.test_value) # array([ 29, 16, 1]))
def build_model_vanilla(vocab_size, args, dtype=floatX): logger.info('Building model ...') # Parameters for the model context = args.context state_dim = args.state_dim layers = args.layers skip_connections = args.skip_connections # Symbolic variables # In both cases: Time X Batch x = tensor.lmatrix('features') y = tensor.lmatrix('targets') # Build the model output_names = [] output_dims = [] for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if d == 0 or skip_connections: output_names.append("inputs" + suffix) output_dims.append(state_dim) lookup = LookupTable(length=vocab_size, dim=state_dim) lookup.weights_init = initialization.IsotropicGaussian(0.1) lookup.biases_init = initialization.Constant(0) fork = Fork(output_names=output_names, input_dim=args.mini_batch_size, output_dims=output_dims, prototype=FeedforwardSequence( [lookup.apply])) transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh()) for _ in range(layers)] rnn = RecurrentStack(transitions, skip_connections=skip_connections) # If skip_connections: dim = layers * state_dim # else: dim = state_dim output_layer = Linear( input_dim=skip_connections * layers * state_dim + (1 - skip_connections) * state_dim, output_dim=vocab_size, name="output_layer") # Return list of 3D Tensor, one for each layer # (Time X Batch X embedding_dim) pre_rnn = fork.apply(x) # Give a name to the input of each layer if skip_connections: for t in range(len(pre_rnn)): pre_rnn[t].name = "pre_rnn_" + str(t) else: pre_rnn.name = "pre_rnn" # Prepare inputs for the RNN kwargs = OrderedDict() init_states = {} for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if skip_connections: kwargs['inputs' + suffix] = pre_rnn[d] elif d == 0: kwargs['inputs'] = pre_rnn init_states[d] = theano.shared( numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX), name='state0_%d' % d) kwargs['states' + suffix] = init_states[d] # Apply the RNN to the inputs h = rnn.apply(low_memory=True, **kwargs) # We have # h = [state, state_1, state_2 ...] if layers > 1 # h = state if layers == 1 # If we have skip connections, concatenate all the states # Else only consider the state of the highest layer last_states = {} if layers > 1: # Save all the last states for d in range(layers): last_states[d] = h[d][-1, :, :] if skip_connections: h = tensor.concatenate(h, axis=2) else: h = h[-1] else: last_states[0] = h[-1, :, :] h.name = "hidden_state" # The updates of the hidden states updates = [] for d in range(layers): updates.append((init_states[d], last_states[d])) presoft = output_layer.apply(h[context:, :, :]) # Define the cost # Compute the probability distribution time, batch, feat = presoft.shape presoft.name = 'presoft' cross_entropy = Softmax().categorical_cross_entropy( y[context:, :].flatten(), presoft.reshape((batch * time, feat))) cross_entropy = cross_entropy / tensor.log(2) cross_entropy.name = "cross_entropy" # TODO: add regularisation for the cost # the log(1) is here in order to differentiate the two variables # for monitoring cost = cross_entropy + tensor.log(1) cost.name = "regularized_cost" # Initialize the model logger.info('Initializing...') fork.initialize() rnn.weights_init = initialization.Orthogonal() rnn.biases_init = initialization.Constant(0) rnn.initialize() output_layer.weights_init = initialization.IsotropicGaussian(0.1) output_layer.biases_init = initialization.Constant(0) output_layer.initialize() return cost, cross_entropy, updates
x = tensor.tensor4('features') y = tensor.lmatrix('targets') l1 = Linear(name='l1', input_dim=784, output_dim=500, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l2 = Linear(name='l2', input_dim=500, output_dim=500, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l3 = Linear(name='l3', input_dim=500, output_dim=500, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l4 = Linear(name='l4', input_dim=500, output_dim=500, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l5 = Linear(name='l5', input_dim=500, output_dim=10, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l1.initialize() l2.initialize() l3.initialize() l4.initialize() l5.initialize() a1 = l1.apply(x.flatten(2)/255) a1.name = 'a1' n1, M1, S1 = normalize(a1, output_dim = 500) o1 = Logistic().apply(n1) a2 = l2.apply(o1) n2, M2, S2 = normalize(a2, output_dim = 500) o2 = Logistic().apply(n2) a3 = l3.apply(o2)
x = tensor.lmatrix('syllables')
y = tensor.lmatrix('durations')

lookup_input = LookupTable(
    name='lookup_input',
    length=train_dataset.syllables_vocab_size() + 1,
    dim=hidden_layer_dim,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
lookup_input.initialize()

linear_input = Linear(
    name='linear_input',
    input_dim=hidden_layer_dim,
    output_dim=hidden_layer_dim,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_input.initialize()

rnn = SimpleRecurrent(
    name='hidden',
    dim=hidden_layer_dim,
    activation=Tanh(),
    weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

linear_output = Linear(
    name='linear_output',
    input_dim=hidden_layer_dim,
    output_dim=train_dataset.durations_vocab_size(),
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_output.initialize()

softmax = NDimensionalSoftmax(name='ndim_softmax')
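The snippet above only defines the bricks; a hedged sketch of how they would typically be wired into a cost, following the pattern of the other LSTM/softmax snippets in this collection (the intermediate variable names and time-major shapes are assumptions):

# Forward pass: lookup -> linear -> recurrent -> output layer.
embedded = lookup_input.apply(x)        # (time, batch, hidden_layer_dim)
pre_rnn = linear_input.apply(embedded)
hidden = rnn.apply(pre_rnn)
presoft = linear_output.apply(hidden)

# NDimensionalSoftmax handles the extra time dimension via extra_ndim=1.
y_hat = softmax.apply(presoft, extra_ndim=1)
cost = softmax.categorical_cross_entropy(y, presoft, extra_ndim=1).mean()
cost.name = 'cost'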
def build_model_soft(vocab_size, args, dtype=floatX): logger.info('Building model ...') # Parameters for the model context = args.context state_dim = args.state_dim layers = args.layers skip_connections = args.skip_connections # Symbolic variables # In both cases: Time X Batch x = tensor.lmatrix('features') y = tensor.lmatrix('targets') # Build the model output_names = [] output_dims = [] for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if d == 0 or skip_connections: output_names.append("inputs" + suffix) output_dims.append(state_dim) lookup = LookupTable(length=vocab_size, dim=state_dim) lookup.weights_init = initialization.IsotropicGaussian(0.1) lookup.biases_init = initialization.Constant(0) fork = Fork(output_names=output_names, input_dim=args.mini_batch_size, output_dims=output_dims, prototype=FeedforwardSequence( [lookup.apply])) transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())] # Build the MLP dims = [2 * state_dim] activations = [] for i in range(args.mlp_layers): activations.append(Rectifier()) dims.append(state_dim) # Activation of the last layer of the MLP if args.mlp_activation == "logistic": activations.append(Logistic()) elif args.mlp_activation == "rectifier": activations.append(Rectifier()) elif args.mlp_activation == "hard_logistic": activations.append(HardLogistic()) else: assert False # Output of MLP has dimension 1 dims.append(1) for i in range(layers - 1): mlp = MLP(activations=activations, dims=dims, weights_init=initialization.IsotropicGaussian(0.1), biases_init=initialization.Constant(0), name="mlp_" + str(i)) transitions.append( SoftGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh())) rnn = RecurrentStack(transitions, skip_connections=skip_connections) # dim = layers * state_dim output_layer = Linear( input_dim=layers * state_dim, output_dim=vocab_size, name="output_layer") # Return list of 3D Tensor, one for each layer # (Time X Batch X embedding_dim) pre_rnn = fork.apply(x) # Give a name to the input of each layer if skip_connections: for t in range(len(pre_rnn)): pre_rnn[t].name = "pre_rnn_" + str(t) else: pre_rnn.name = "pre_rnn" # Prepare inputs for the RNN kwargs = OrderedDict() init_states = {} for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if skip_connections: kwargs['inputs' + suffix] = pre_rnn[d] elif d == 0: kwargs['inputs' + suffix] = pre_rnn init_states[d] = theano.shared( numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX), name='state0_%d' % d) kwargs['states' + suffix] = init_states[d] # Apply the RNN to the inputs h = rnn.apply(low_memory=True, **kwargs) # Now we have: # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...] # Extract gate_values gate_values = h[2::2] new_h = [h[0]] new_h.extend(h[1::2]) h = new_h # Now we have: # h = [state, state_1, state_2, ...] 
# gate_values = [gate_value_1, gate_value_2, gate_value_3] for i, gate_value in enumerate(gate_values): gate_value.name = "gate_value_" + str(i) # Save all the last states last_states = {} for d in range(layers): last_states[d] = h[d][-1, :, :] # Concatenate all the states if layers > 1: h = tensor.concatenate(h, axis=2) h.name = "hidden_state" # The updates of the hidden states updates = [] for d in range(layers): updates.append((init_states[d], last_states[d])) presoft = output_layer.apply(h[context:, :, :]) # Define the cost # Compute the probability distribution time, batch, feat = presoft.shape presoft.name = 'presoft' cross_entropy = Softmax().categorical_cross_entropy( y[context:, :].flatten(), presoft.reshape((batch * time, feat))) cross_entropy = cross_entropy / tensor.log(2) cross_entropy.name = "cross_entropy" # TODO: add regularisation for the cost # the log(1) is here in order to differentiate the two variables # for monitoring cost = cross_entropy + tensor.log(1) cost.name = "regularized_cost" # Initialize the model logger.info('Initializing...') fork.initialize() rnn.weights_init = initialization.Orthogonal() rnn.biases_init = initialization.Constant(0) rnn.initialize() output_layer.weights_init = initialization.IsotropicGaussian(0.1) output_layer.biases_init = initialization.Constant(0) output_layer.initialize() return cost, cross_entropy, updates, gate_values
class ETHM(EUTHM): '''Model with only textual-hashtag information''' def __init__(self, config, dataset, *args, **kwargs): super(ETHM, self).__init__(config, dataset) def _build_model(self, *args, **kwargs): # Define inputs self._define_inputs() self._build_bricks() self._set_OV_value() # Transpose text self.text = self.text.dimshuffle(1, 0) self.text_mask = self.text_mask.dimshuffle(1, 0) self.sparse_word = self.sparse_word.dimshuffle(1, 0) self.sparse_word_mask = self.sparse_word_mask.dimshuffle(1, 0) # Turn word, and hashtag into vector representation text_vec = self.word_embed.apply(self.text) # Apply word and hashtag word and url text_vec = self._apply_hashtag_word(text_vec) text_vec = self._apply_sparse_word(text_vec) # Encode text mlstm_hidden, mlstm_cell = self.mlstm.apply( inputs=self.mlstm_ins.apply(text_vec), mask=self.text_mask.astype(theano.config.floatX)) text_encodes = mlstm_hidden[-1] input_vec = text_encodes self._get_cost(input_vec, None, None) def _define_inputs(self, *args, **kwargs): self.hashtag = tensor.ivector('hashtag') self.text = tensor.imatrix('text') self.text_mask = tensor.matrix('text_mask', dtype=theano.config.floatX) self.hashtag_word = tensor.ivector('hashtag_word') self.hashtag_sparse_mask = tensor.vector('hashtag_word_sparse_mask', dtype=theano.config.floatX) self.hashtag_word_left_idx = tensor.ivector( 'hashtag_word_idx_left_idx') self.hashtag_word_right_idx = tensor.ivector( 'hashtag_word_idx_right_idx') self.sparse_word = tensor.imatrix('sparse_word') self.sparse_word_sparse_mask = tensor.vector( 'sparse_word_sparse_mask', dtype=theano.config.floatX) self.sparse_word_mask = tensor.matrix('sparse_word_mask', dtype=theano.config.floatX) self.sparse_word_left_idx = tensor.ivector('sparse_word_idx_left_idx') self.sparse_word_right_idx = tensor.ivector( 'sparse_word_idx_right_idx') def _build_bricks(self, *args, **kwargs): # Build lookup tables self.word_embed = self._embed(len(self.dataset.word2index), self.config.word_embed_dim, name='word_embed') self.hashtag_embed = self._embed(len(self.dataset.hashtag2index), self.config.lstm_dim, name='hashtag_embed') # Build text encoder self.mlstm_ins = Linear(input_dim=self.config.word_embed_dim, output_dim=4 * self.config.lstm_dim, name='mlstm_in') self.mlstm_ins.weights_init = IsotropicGaussian( std=numpy.sqrt(2) / numpy.sqrt(self.config.word_embed_dim + self.config.lstm_dim)) self.mlstm_ins.biases_init = Constant(0) self.mlstm_ins.initialize() self.mlstm = MLSTM(self.config.lstm_time, self.config.lstm_dim, shared=False) self.mlstm.weights_init = IsotropicGaussian( std=numpy.sqrt(2) / numpy.sqrt(self.config.word_embed_dim + self.config.lstm_dim)) self.mlstm.biases_init = Constant(0) self.mlstm.initialize() self.hashtag2word = MLP( activations=[Tanh('hashtag2word_tanh')], dims=[self.config.lstm_dim, self.config.word_embed_dim], name='hashtag2word_mlp') self.hashtag2word.weights_init = IsotropicGaussian( std=1 / numpy.sqrt(self.config.word_embed_dim)) self.hashtag2word.biases_init = Constant(0) self.hashtag2word.initialize() self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias') self.hashtag2word_bias.biases_init = Constant(0) self.hashtag2word_bias.initialize() #Build character embedding self.char_embed = self._embed(len(self.dataset.char2index), self.config.char_embed_dim, name='char_embed') # Build sparse word encoder self.rnn_ins = Linear(input_dim=self.config.char_embed_dim, output_dim=self.config.word_embed_dim, name='rnn_in') self.rnn_ins.weights_init = IsotropicGaussian( std=numpy.sqrt(2) / 
numpy.sqrt(self.config.char_embed_dim + self.config.word_embed_dim)) self.rnn_ins.biases_init = Constant(0) self.rnn_ins.initialize() self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim, activation=Tanh()) self.rnn.weights_init = IsotropicGaussian( std=1 / numpy.sqrt(self.config.word_embed_dim)) self.rnn.initialize() def _apply_dropout(self, outputs, *args, **kwargs): variables = [self.word_embed.W, self.hashtag_embed.W] cgs = ComputationGraph(outputs) cg_dropouts = apply_dropout(cgs, variables, drop_prob=self.config.dropout_prob, seed=123).outputs return cg_dropouts def _apply_reg(self, cost, params=None, *args, **kwargs): try: if self.config.l2_norm > 0: cost = cost + self.config.l2_norm * theano_expressions.l2_norm( tensors=[self.hashtag_embed.W, self.word_embed.W])**2 else: pass except Exception: pass return cost
def run(epochs=1, corpus="data/", HIDDEN_DIMS=100, path="./"):
    brown = BrownDataset(corpus)

    INPUT_DIMS = brown.get_vocabulary_size()
    OUTPUT_DIMS = brown.get_vocabulary_size()

    # These are theano variables
    x = tensor.lmatrix('context')
    y = tensor.ivector('output')

    # Construct the graph
    input_to_hidden = LookupTable(name='input_to_hidden', length=INPUT_DIMS,
                                  dim=HIDDEN_DIMS)

    # Compute the weight matrix for every word in the context and then compute
    # the average.
    h = tensor.mean(input_to_hidden.apply(x), axis=1)

    hidden_to_output = Linear(name='hidden_to_output', input_dim=HIDDEN_DIMS,
                              output_dim=OUTPUT_DIMS)
    y_hat = Softmax().apply(hidden_to_output.apply(h))

    # And initialize with random variables and set the bias vector to 0
    weights = IsotropicGaussian(0.01)
    input_to_hidden.weights_init = hidden_to_output.weights_init = weights
    input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
    input_to_hidden.initialize()
    hidden_to_output.initialize()

    # And now the cost function
    cost = CategoricalCrossEntropy().apply(y, y_hat)
    cg = ComputationGraph(cost)

    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 0.01 * (W1 ** 2).sum() + 0.01 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'

    mini_batch = SequentialScheme(brown.num_instances(), 512)
    data_stream = DataStream.default_stream(brown, iteration_scheme=mini_batch)

    # Now we tie up loose ends and construct the algorithm for the training
    # and define what happens in the main loop.
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    extensions = [
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs),
        Printing(),
        # TrainingDataMonitoring(variables=[cost]),
        SaveWeights(layers=[input_to_hidden, hidden_to_output],
                    prefixes=['%sfirst' % path, '%ssecond' % path]),
        # Plot(
        #     'Word Embeddings',
        #     channels=[
        #         [
        #             'cost_with_regularization'
        #         ]
        #     ])
    ]

    logger.info("Starting main loop...")
    main = MainLoop(data_stream=data_stream, algorithm=algorithm,
                    extensions=extensions)
    main.run()

    pickle.dump(cg, open('%scg.pickle' % path, 'wb'))
cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
cost.name = 'cost_with_regularization'

# Define a multi-layer network; the layer-to-layer computation was defined above.
# The activations list gives the nonlinearity of each layer: every MLP layer
# first applies a linear transformation and then a nonlinear one to its result.
# x is the input of the MLP.
mlp = MLP(activations=[Rectifier(), Softmax()], dims=[784, 100, 10]).apply(x)

# After defining the whole network, set the initial values of the parameters
# of its linear transformations.
input_to_hidden.weights_init = IsotropicGaussian(0.01)
input_to_hidden.biases_init = Constant(0)
hidden_to_output.weights_init = IsotropicGaussian(0.01)
hidden_to_output.biases_init = Constant(0)

# Run the initialization; this step is mandatory, otherwise the settings above
# have no effect.
input_to_hidden.initialize()
hidden_to_output.initialize()

print W1.get_value()

# Start training the model, here with the built-in MNIST dataset; to use a
# different dataset, the data has to be preprocessed with fuel first.
mnist = MNIST(("train",))

# Define the iteration scheme: mini-batch training with 256 examples per
# batch, which gives the data stream data_stream.
data_stream = Flatten(DataStream.default_stream(
    mnist,
    iteration_scheme=SequentialScheme(mnist.num_examples, batch_size=256)))

# Define how the cost is optimized; SGD is used here.
#algorithm = GradientDescent(cost = cost, parameters = [cg.parameters], step_rule = Scale(learning_rate = 0.01))
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.01))  # use gradient descent to compute the updates

print "------"
class impatientLayer: # both visual and word feature are in the joint space # of dim: feature_dim # hidden_dim: dim of m # output_dim: final joint document query representation dim def __init__(self, feature_dim, hidden_dim, output_dim): self.image_embed = Linear(input_dim=feature_dim, output_dim=hidden_dim, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), use_bias=False, name='image_embed') self.word_embed = Linear(input_dim=feature_dim, output_dim=hidden_dim, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), use_bias=False, name='word_embed') self.r_embed = Linear(input_dim=feature_dim, output_dim=hidden_dim, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), use_bias=False, name='r_embed') self.m_to_s = Linear(input_dim=hidden_dim, output_dim=1, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), use_bias=False, name='m_to_s') self.attention_dist = Softmax(name='attention_dist_softmax') self.r_to_r = Linear(input_dim=feature_dim, output_dim=feature_dim, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), use_bias=False, name='r_to_r') # self.r_to_g = Linear(input_dim=feature_dim, # output_dim=output_dim, # weights_init=IsotropicGaussian(0.01), # biases_init=Constant(0), # use_bias=False, # name='r_to_g') self.image_embed.initialize() self.word_embed.initialize() self.r_embed.initialize() self.m_to_s.initialize() self.r_to_r.initialize() # self.r_to_g.initialize() # the sequence to sequence LSTM self.seq = LSTM(output_dim, name='rewatcher_seq', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) self.seq_embed = Linear(feature_dim, output_dim * 4, name='rewatcher_seq_embed', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), use_bias=False) self.seq.initialize() self.seq_embed.initialize() # doc: row major batch_size x doc_length x feature_dim # query: row major batch_size x q x feature_dim # mask: mask of query batch_size # mask: length of a sentence - 1 def apply(self, doc, query, mask_, batch_size): # batch_size x doc_length x hidden_dim mask = mask_.flatten() att1 = self.image_embed.apply(doc) # y_q_i: the ith token of question # batch_size x feature_dim # r_1: r_m_1 # batch_size x feature_dim # y_d: document # batch_size x doc_length x feature_dim # y_d_m: d-to-m # batch_size x doc_length x hidden_dim def one_step(y_q_i, r_1, y_d, y_d_m): # batch_size x hidden_dim att2 = self.r_embed.apply(r_1) # batch_size x hidden_dim att3 = self.word_embed.apply(y_q_i) att = y_d_m + att2.dimshuffle(0, 'x', 1) + att3.dimshuffle(0, 'x', 1) # batch_size x doc_length x hidden_dim m = T.tanh(att) # batch_size x doc_length x 1 s = self.m_to_s.apply(m) # batch_size x doc_length s = s.reshape((s.shape[0], s.shape[1])) s = self.attention_dist.apply(s) y_d_s = y_d.swapaxes(1, 2) # return batch_size x feature_dim return T.batched_dot(y_d_s, s) + T.tanh(self.r_to_r.apply(r_1)) # query: batch_size x q x feature_dim # r: q x batch_size x feature_dim r, updates = theano.scan(fn=one_step, sequences=[query.swapaxes(0,1)], outputs_info=T.zeros_like(doc[:, 0, :]), non_sequences=[doc, att1], n_steps=query.shape[1], name='impatient layer') # for the sequence encoder # q x batch_size x output_dim Wr = self.seq_embed.apply(r) # q x batch_size x output_dim seq_r, garbage = self.seq.apply(Wr) # batch_size x feature_dim r_q = r[mask, T.arange(batch_size), :] seq_r_q = seq_r[mask, T.arange(batch_size), :] # batch_size x output_dim return r_q, seq_r_q
def shroom_mlp(shrooms_train, shrooms_test, num_epochs, hidden_dims,
               activation_function):
    # These are theano variables
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')

    # Construct the graph
    input_to_hidden = Linear(name='input_to_hidden', input_dim=117,
                             output_dim=hidden_dims)
    h = activation_function.apply(input_to_hidden.apply(x))
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_dims,
                              output_dim=2)
    y_hat = Softmax().apply(hidden_to_output.apply(h))

    # And initialize with random variables and set the bias vector to 0
    weights = IsotropicGaussian(0.01)
    input_to_hidden.weights_init = hidden_to_output.weights_init = weights
    input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
    input_to_hidden.initialize()
    hidden_to_output.initialize()

    # And now the cost function
    cost = CategoricalCrossEntropy().apply(y, y_hat)
    cg = ComputationGraph(cost)

    # Not needed for now:
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 0.01 * (W1 ** 2).sum() + 0.01 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'

    error_rate = MisclassificationRate().apply(y.argmax(axis=1), y_hat)

    # The data streams give us access to our corpus and allow us to perform a
    # mini-batch training.
    data_stream = Flatten(DataStream.default_stream(
        shrooms_train,
        iteration_scheme=SequentialScheme(shrooms_train.num_examples,
                                          batch_size=128)))

    test_data_stream = Flatten(DataStream.default_stream(
        shrooms_test,
        iteration_scheme=SequentialScheme(shrooms_test.num_examples,
                                          batch_size=1000)))

    extensions = [
        ProgressBar(),
        PlotWeights(after_epoch=True,
                    folder="results_logistic_40_interpolation",
                    computation_graph=cg,
                    folder_per_layer=True,
                    dpi=150),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring(variables=[cost, error_rate],
                             data_stream=test_data_stream,
                             prefix="test"),
        Printing()
    ]

    # Now we tie up loose ends and construct the algorithm for the training
    # and define what happens in the main loop.
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    main = MainLoop(data_stream=data_stream, algorithm=algorithm,
                    extensions=extensions)
    main.run()

    return 0
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs, test_cost, experiment_path, initialization, init_width, weight_noise, z_prob, z_prob_states, z_prob_cells, drop_prob_igates, ogates_zoneout, batch_size, stoch_depth, share_mask, gaussian_drop, rnn_type, num_layers, norm_cost_coeff, penalty, testing, seq_len, decrease_lr_after_epoch, lr_decay, **kwargs): print '.. PTB experiment' print '.. arguments:', ' '.join(sys.argv) t0 = time.time() ########################################### # # LOAD DATA # ########################################### def onehot(x, numclasses=None): """ Convert integer encoding for class-labels (starting with 0 !) to one-hot encoding. The output is an array whose shape is the shape of the input array plus an extra dimension, containing the 'one-hot'-encoded labels. """ if x.shape == (): x = x[None] if numclasses is None: numclasses = x.max() + 1 result = numpy.zeros(list(x.shape) + [numclasses], dtype="int") z = numpy.zeros(x.shape, dtype="int") for c in range(numclasses): z *= 0 z[numpy.where(x == c)] = 1 result[..., c] += z return result.astype(theano.config.floatX) alphabetsize = 10000 data = np.load('penntree_char_and_word.npz') trainset = data['train_words'] validset = data['valid_words'] testset = data['test_words'] if testing: trainset = trainset[:3000] validset = validset[:3000] if share_mask: if not z_prob: raise ValueError('z_prob must be provided when using share_mask') if z_prob_cells or z_prob_states: raise ValueError( 'z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)' ) z_prob_cells = z_prob # we don't want to actually use these masks, so this is to debug z_prob_states = None else: if z_prob: raise ValueError('z_prob is only used with share_mask') z_prob_cells = z_prob_cells or '1' z_prob_states = z_prob_states or '1' # rng = np.random.RandomState(seed) ########################################### # # MAKE STREAMS # ########################################### def prep_dataset(dataset): dataset = dataset[:(len(dataset) - (len(dataset) % (seq_len * batch_size)))] dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2)) stream = DataStream( IndexableDataset(indexables=OrderedDict([('data', dataset)])), iteration_scheme=SequentialExampleScheme(dataset.shape[0])) stream = Transpose(stream, [(1, 0)]) stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells, drop_prob_igates, layer_size, num_layers, False, stoch_depth, share_mask, gaussian_drop, alphabetsize) stream.sources = ('data', ) * 3 + stream.sources + ( 'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates') return (stream, ) train_stream, = prep_dataset(trainset) valid_stream, = prep_dataset(validset) test_stream, = prep_dataset(testset) #################### data = train_stream.get_epoch_iterator(as_dict=True).next() #################### ########################################### # # BUILD MODEL # ########################################### print '.. 
building model' x = T.tensor3('data') y = x zoneouts_states = T.tensor3('zoneouts_states') zoneouts_cells = T.tensor3('zoneouts_cells') zoneouts_igates = T.tensor3('zoneouts_igates') x.tag.test_value = data['data'] zoneouts_states.tag.test_value = data['zoneouts_states'] zoneouts_cells.tag.test_value = data['zoneouts_cells'] zoneouts_igates.tag.test_value = data['zoneouts_igates'] if init_width and not initialization == 'uniform': raise ValueError('Width is only for uniform init, whassup?') if initialization == 'glorot': weights_init = NormalizedInitialization() elif initialization == 'uniform': weights_init = Uniform(width=init_width) elif initialization == 'ortho': weights_init = OrthogonalInitialization() else: raise ValueError('No such initialization') if rnn_type.lower() == 'lstm': in_to_hids = [ Linear(layer_size if l > 0 else alphabetsize, layer_size * 4, name='in_to_hid%d' % l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers) ] recurrent_layers = [ DropLSTM(dim=layer_size, weights_init=weights_init, activation=Tanh(), model_type=6, name='rnn%d' % l, ogates_zoneout=ogates_zoneout) for l in range(num_layers) ] elif rnn_type.lower() == 'gru': in_to_hids = [ Linear(layer_size if l > 0 else alphabetsize, layer_size * 3, name='in_to_hid%d' % l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers) ] recurrent_layers = [ DropGRU(dim=layer_size, weights_init=weights_init, activation=Tanh(), name='rnn%d' % l) for l in range(num_layers) ] elif rnn_type.lower() == 'srnn': # FIXME!!! make ReLU in_to_hids = [ Linear(layer_size if l > 0 else alphabetsize, layer_size, name='in_to_hid%d' % l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers) ] recurrent_layers = [ DropSimpleRecurrent(dim=layer_size, weights_init=weights_init, activation=Rectifier(), name='rnn%d' % l) for l in range(num_layers) ] else: raise NotImplementedError hid_to_out = Linear(layer_size, alphabetsize, name='hid_to_out', weights_init=weights_init, biases_init=Constant(0.0)) for layer in in_to_hids: layer.initialize() for layer in recurrent_layers: layer.initialize() hid_to_out.initialize() layer_input = x #in_to_hid.apply(x) init_updates = OrderedDict() for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)): rnn_embedding = in_to_hid.apply(layer_input) if rnn_type.lower() == 'lstm': states_init = theano.shared( np.zeros((batch_size, layer_size), dtype=floatX)) cells_init = theano.shared( np.zeros((batch_size, layer_size), dtype=floatX)) states_init.name, cells_init.name = "states_init", "cells_init" states, cells = layer.apply( rnn_embedding, zoneouts_states[:, :, l * layer_size:(l + 1) * layer_size], zoneouts_cells[:, :, l * layer_size:(l + 1) * layer_size], zoneouts_igates[:, :, l * layer_size:(l + 1) * layer_size], states_init, cells_init) init_updates.update([(states_init, states[-1]), (cells_init, cells[-1])]) elif rnn_type.lower() in ['gru', 'srnn']: # untested! 
states_init = theano.shared( np.zeros((batch_size, layer_size), dtype=floatX)) states_init.name = "states_init" states = layer.apply(rnn_embedding, zoneouts_states, zoneouts_igates, states_init) init_updates.update([(states_init, states[-1])]) else: raise NotImplementedError layer_input = states y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1])) shape_ = y_hat_pre_softmax.shape y_hat = Softmax().apply(y_hat_pre_softmax.reshape((-1, alphabetsize))) #################### ########################################### # # SET UP COSTS AND MONITORS # ########################################### cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)), y_hat).copy('cost') bpc = (cost / np.log(2.0)).copy(name='bpr') perp = T.exp(cost).copy(name='perp') cost_train = cost.copy(name='train_cost') cg_train = ComputationGraph([cost_train]) ########################################### # # NORM STABILIZER # ########################################### norm_cost = 0. def _magnitude(x, axis=-1): return T.sqrt( T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny)) if penalty == 'cells': assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables) for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables): norms = _magnitude(cell) norm_cost += T.mean( T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) elif penalty == 'hids': for l in range(num_layers): assert 'rnn%d_apply_states' % l in [ o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables) ] for output in VariableFilter(roles=[OUTPUT])(cg_train.variables): for l in range(num_layers): if output.name == 'rnn%d_apply_states' % l: norms = _magnitude(output) norm_cost += T.mean( T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) norm_cost.name = 'norm_cost' #cost_valid = cost_train cost_train += norm_cost_coeff * norm_cost cost_train = cost_train.copy( 'cost_train') #should this be cost_train.outputs[0]? no. 
cg_train = ComputationGraph([cost_train]) ########################################### # # WEIGHT NOISE # ########################################### if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg_train.variables) cg_train = apply_noise(cg_train, weights, weight_noise) cost_train = cg_train.outputs[0].copy(name='cost_train') model = Model(cost_train) learning_rate = float(learning_rate) clipping = StepClipping(threshold=np.cast[floatX](clipping)) if algorithm == 'adam': adam = Adam(learning_rate=learning_rate) learning_rate = adam.learning_rate step_rule = CompositeRule([adam, clipping]) elif algorithm == 'rms_prop': rms_prop = RMSProp(learning_rate=learning_rate) learning_rate = rms_prop.learning_rate step_rule = CompositeRule([clipping, rms_prop]) elif algorithm == 'momentum': sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum) learning_rate = sgd_momentum.learning_rate step_rule = CompositeRule([clipping, sgd_momentum]) elif algorithm == 'sgd': sgd = Scale(learning_rate=learning_rate) learning_rate = sgd.learning_rate step_rule = CompositeRule([clipping, sgd]) else: raise NotImplementedError algorithm = GradientDescent(step_rule=step_rule, cost=cost_train, parameters=cg_train.parameters) # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)}) algorithm.add_updates(init_updates) def cond_number(x): _, _, sing_vals = T.nlinalg.svd(x, True, True) sing_mags = abs(sing_vals) return T.max(sing_mags) / T.min(sing_mags) def rms(x): return (x * x).mean().sqrt() whysplode_cond = [] whysplode_rms = [] for i, p in enumerate(init_updates): v = p.get_value() if p.get_value().shape == 2: whysplode_cond.append( cond_number(p).copy( 'ini%d:%s_cond(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) whysplode_rms.append( rms(p).copy('ini%d:%s_rms(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) for i, p in enumerate(cg_train.parameters): v = p.get_value() if p.get_value().shape == 2: whysplode_cond.append( cond_number(p).copy( 'ini%d:%s_cond(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) whysplode_rms.append( rms(p).copy('ini%d:%s_rms(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) observed_vars = [ cost_train, cost, bpc, perp, learning_rate, aggregation.mean( algorithm.total_gradient_norm).copy("gradient_norm_mean") ] # + whysplode_rms parameters = model.get_parameter_dict() for name, param in parameters.iteritems(): observed_vars.append(param.norm(2).copy(name=name + "_norm")) observed_vars.append( algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm")) train_monitor = TrainingDataMonitoring(variables=observed_vars, prefix="train", after_epoch=True) dev_inits = [p.clone() for p in init_updates] cg_dev = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace( zip(init_updates.keys(), dev_inits)) dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3] dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:])) dev_monitor = DataStreamMonitoring(variables=[dev_cost, dev_bpc, dev_perp], data_stream=valid_stream, prefix="dev", updates=dev_init_updates) # noone does this if 'load_path' in kwargs: with open(kwargs['load_path']) as f: loaded = np.load(f) model = Model(cost_train) params_dicts = model.get_parameter_dict() params_names = params_dicts.keys() for param_name in params_names: param = params_dicts[param_name] # '/f_6_.W' --> 'f_6_.W' slash_index = param_name.find('/') param_name = param_name[slash_index + 1:] if param.get_value().shape == loaded[param_name].shape: 
print 'Found: ' + param_name param.set_value(loaded[param_name]) else: print 'Not found: ' + param_name extensions = [] extensions.extend( [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor]) if test_cost: test_inits = [p.clone() for p in init_updates] cg_test = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace( zip(init_updates.keys(), test_inits)) test_cost, test_bpc, test_perp = cg_test.outputs[:3] test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:])) test_monitor = DataStreamMonitoring( variables=[test_cost, test_bpc, test_perp], data_stream=test_stream, prefix="test", updates=test_init_updates) extensions.extend([test_monitor]) if not os.path.exists(experiment_path): os.makedirs(experiment_path) log_path = os.path.join(experiment_path, 'log.txt') fh = logging.FileHandler(filename=log_path) fh.setLevel(logging.DEBUG) logger.addHandler(fh) extensions.append( SaveParams('dev_cost', model, experiment_path, every_n_epochs=1)) extensions.append(SaveLog(every_n_epochs=1)) extensions.append(ProgressBar()) extensions.append(Printing()) class RollsExtension(TrainingExtension): """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """ def __init__(self, shvars): self.shvars = shvars def before_epoch(self): for v in self.shvars: v.set_value(np.roll(v.get_value(), 1, 0)) extensions.append( RollsExtension(init_updates.keys() + dev_init_updates.keys() + (test_init_updates.keys() if test_cost else []))) class LearningRateSchedule(TrainingExtension): """ Lets you set a number to divide learning rate by each epoch + when to start doing that """ def __init__(self): self.epoch_number = 0 def after_epoch(self): self.epoch_number += 1 if self.epoch_number > decrease_lr_after_epoch: learning_rate.set_value(learning_rate.get_value() / lr_decay) if bool(lr_decay) != bool(decrease_lr_after_epoch): raise ValueError( 'Need to define both lr_decay and decrease_lr_after_epoch') if lr_decay and decrease_lr_after_epoch: extensions.append(LearningRateSchedule()) main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) t1 = time.time() print "Building time: %f" % (t1 - t0) main_loop.run() print "Execution time: %f" % (time.time() - t1)
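# Illustrative sketch, not taken from the original code: the norm-stabilizer
# penalty used as norm_cost in train() above, pulled out on its own. For a
# (time, batch, dim) tensor of hidden states it penalizes changes in the L2 norm
# of the state between consecutive time steps; the default coefficient here is
# an assumption.
import numpy
import theano
import theano.tensor as T


def norm_stabilizer(states, coeff=50.0):
    """states: (time, batch, dim) symbolic tensor of RNN hidden states."""
    norms = T.sqrt(T.maximum(T.sqr(states).sum(axis=-1),
                             numpy.finfo(theano.config.floatX).tiny))
    return coeff * T.mean(
        T.sum((norms[1:] - norms[:-1]) ** 2, axis=0) / (states.shape[0] - 1))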
lookup_input = LookupTable(
    name='lookup_input',
    length=charset_size + 1,
    dim=hidden_layer_dim,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
lookup_input.initialize()

linear_input = Linear(
    name='linear_input',
    input_dim=hidden_layer_dim,
    output_dim=hidden_layer_dim,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_input.initialize()

rnn = SimpleRecurrent(
    name='hidden',
    dim=hidden_layer_dim,
    activation=Tanh(),
    weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

linear_output = Linear(
    name='linear_output',
    input_dim=hidden_layer_dim,
    output_dim=charset_size,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_output.initialize()
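# Illustrative sketch, not taken from the original code: how these four bricks
# would typically be chained into a character-level RNN graph. The symbolic
# variable name below is an assumption; the original only defines and
# initializes the bricks.
from theano import tensor

chars = tensor.lmatrix('features')            # (time, batch) of character ids
embedded = lookup_input.apply(chars)          # (time, batch, hidden_layer_dim)
hidden = rnn.apply(linear_input.apply(embedded))
char_scores = linear_output.apply(hidden)     # (time, batch, charset_size)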
def rf_lstm_experiment(data_name, exp_network, in_dim, out_dim, num_layers, start_neurons, num_neurons, batch_size, num_epochs): """LSTM Experiment.""" # load dataset train_set = IterableDataset( ds.transform_sequence(data_name, "train", batch_size)) test_set = IterableDataset( ds.transform_sequence(data_name, "test", batch_size)) stream_train = DataStream(dataset=train_set) stream_test = DataStream(dataset=test_set) methods = ['sgd', 'momentum', 'adagrad', 'rmsprop'] for n_layers in xrange(1, num_layers + 1): for n_neurons in xrange(start_neurons, num_neurons + 5, 5): for method in methods: X = T.tensor3("features") y = T.matrix("targets") x_to_h = Linear(in_dim, n_neurons * 4, name='x_to_h', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) lstm = LSTM(n_neurons, name='lstm', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) h_to_o = nc.setup_ff_network(n_neurons, out_dim, n_layers - 1, n_neurons) X_trans = x_to_h.apply(X) h, c = lstm.apply(X_trans) y_hat = h_to_o.apply(h[-1]) cost, cg = nc.create_cg_and_cost(y, y_hat, "none") lstm.initialize() x_to_h.initialize() h_to_o.initialize() algorithm = nc.setup_algorithms(cost, cg, method, type="RNN") test_monitor = DataStreamMonitoring(variables=[cost], data_stream=stream_test, prefix="test") train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train", after_epoch=True) main_loop = MainLoop( algorithm=algorithm, data_stream=stream_train, extensions=[ test_monitor, train_monitor, FinishAfter(after_n_epochs=num_epochs), Printing(), ProgressBar() ]) main_loop.run() # Saving results exp_id = ds.create_exp_id(exp_network, n_layers, n_neurons, batch_size, num_epochs, method, "none") # prepare related functions predict = theano.function([X], y_hat) # prepare related data train_features, train_targets = ds.get_iter_data(train_set) test_features, test_targets = ds.get_iter_data(test_set) # Prediction of result train_predicted = gen_prediction(predict, train_features) test_predicted = gen_prediction(predict, test_features) # Get cost cost = ds.get_cost_data(test_monitor, train_set.num_examples, num_epochs) # logging ds.save_experiment(train_targets, train_predicted, test_targets, test_predicted, cost, exp_network, n_layers, n_neurons, batch_size, num_epochs, method, "none", exp_id, "../results/")
print "vocab size:", VOCAB_DIM EMBEDDING_DIM = 100 Xs = tensor.imatrix("context") y = tensor.ivector('center') w1 = LookupTable(name="w1", length=VOCAB_DIM, dim=EMBEDDING_DIM) w2 = Linear(name='w2', input_dim=EMBEDDING_DIM, output_dim=VOCAB_DIM) hidden = tensor.mean(w1.apply(Xs), axis=1) y_hat = Softmax().apply(w2.apply(hidden)) w1.weights_init = w2.weights_init = IsotropicGaussian(0.01) w1.biases_init = w2.biases_init = Constant(0) w1.initialize() w2.initialize() cost = CategoricalCrossEntropy().apply(y, y_hat) cg = ComputationGraph(cost) W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum() cost.name = "loss" # # the actual training of the model # main = MainLoop(data_stream = DataStream.default_stream( dataset,
def main(): x = T.tensor3('features') m = T.matrix('features_mask') y = T.imatrix('targets') #rnn = SimpleRecurrent( #dim = 50, #activation=Tanh(), #weights_init = Uniform(std=0.01), #biases_init = Constant(0.) #) #rnn = GatedRecurrent( #dim = 50, #activation=Tanh(), #weights_init = Uniform(std=0.01), #biases_init = Constant(0.) #) embedding_size = 300 #glove_version = "vectors.6B.100d.txt" glove_version = "glove.6B.300d.txt" #fork = Fork(weights_init=IsotropicGaussian(0.02), #biases_init=Constant(0.), #input_dim=embedding_size, #output_dims=[embedding_size]*3, #output_names=['inputs', 'reset_inputs', 'update_inputs'] #) rnn = LSTM( dim = embedding_size, activation=Tanh(), weights_init = IsotropicGaussian(std=0.02), ) rnn.initialize() #fork.initialize() wstd = 0.02 score_layer = Linear( input_dim = 128, output_dim = 1, weights_init = IsotropicGaussian(std=wstd), biases_init = Constant(0.), name="linear2") score_layer.initialize() gloveMapping = Linear( input_dim = embedding_size, output_dim = embedding_size, weights_init = IsotropicGaussian(std=wstd), biases_init = Constant(0.0), name="gloveMapping" ) gloveMapping.initialize() o = gloveMapping.apply(x) o = Rectifier(name="rectivfyglove").apply(o) forget_bias = np.zeros((embedding_size*4), dtype=theano.config.floatX) forget_bias[embedding_size:embedding_size*2] = 4.0 toLSTM = Linear( input_dim = embedding_size, output_dim = embedding_size*4, weights_init = IsotropicGaussian(std=wstd), biases_init = Constant(forget_bias), #biases_init = Constant(0.0), name="ToLSTM" ) toLSTM.initialize() rnn_states, rnn_cells = rnn.apply(toLSTM.apply(o) * T.shape_padright(m), mask=m) #inputs, reset_inputs, update_inputs = fork.apply(x) #rnn_states = rnn.apply(inputs=inputs, reset_inputs=reset_inputs, update_inputs=update_inputs, mask=m) #rnn_out = rnn_states[:, -1, :] rnn_out = (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1) / m.sum(axis=1).dimshuffle(0, 'x') #rnn_out = (rnn_states).mean(axis=1)# / m.sum(axis=1) hidden = Linear( input_dim = embedding_size, output_dim = 128, weights_init = Uniform(std=0.01), biases_init = Constant(0.)) hidden.initialize() o = hidden.apply(rnn_out) o = Rectifier().apply(o) hidden = Linear( input_dim = 128, output_dim = 128, weights_init = IsotropicGaussian(std=0.02), biases_init = Constant(0.), name="hiddenmap2") hidden.initialize() o = hidden.apply(o) o = Rectifier(name="rec2").apply(o) o = score_layer.apply(o) probs = Sigmoid().apply(o) cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean() cost.name = 'cost' misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean() misclassification.name = 'misclassification' #print (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1).shape.eval( #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX), #m : np.ones((45, 111), dtype=theano.config.floatX)}) #print (m).sum(axis=1).shape.eval({ #m : np.ones((45, 111), dtype=theano.config.floatX)}) #print (m).shape.eval({ #m : np.ones((45, 111), dtype=theano.config.floatX)}) #raw_input() # ================= cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost = cost, params=params, step_rule = CompositeRule([ StepClipping(threshold=10), AdaM(), #AdaDelta(), ]) ) # ======== print "setting up data" train_dataset = IMDBText('train') test_dataset = IMDBText('test') batch_size = 16 n_train = train_dataset.num_examples train_stream = DataStream( dataset=train_dataset, iteration_scheme=ShuffledScheme( examples=n_train, batch_size=batch_size) ) glove = GloveTransformer(glove_version, 
data_stream=train_stream) train_padded = Padding( data_stream=glove, mask_sources=('features',) #mask_sources=[] ) test_stream = DataStream( dataset=test_dataset, iteration_scheme=ShuffledScheme( examples=n_train, batch_size=batch_size) ) glove = GloveTransformer(glove_version, data_stream=test_stream) test_padded = Padding( data_stream=glove, mask_sources=('features',) #mask_sources=[] ) print "setting up model" #import ipdb #ipdb.set_trace() lstm_norm = rnn.W_state.norm(2) lstm_norm.name = "lstm_norm" pre_norm= gloveMapping.W.norm(2) pre_norm.name = "pre_norm" #====== model = Model(cost) extensions = [] extensions.append(EpochProgress(batch_per_epoch=train_dataset.num_examples // batch_size + 1)) extensions.append(TrainingDataMonitoring( [cost, misclassification, lstm_norm, pre_norm], prefix='train', after_epoch=True )) extensions.append(DataStreamMonitoring( [cost, misclassification], data_stream=test_padded, prefix='test', after_epoch=True )) extensions.append(Timing()) extensions.append(Printing()) extensions.append(Plot("norms", channels=[['train_lstm_norm', 'train_pre_norm']], after_epoch=True)) extensions.append(Plot("result", channels=[['train_cost', 'train_misclassification']], after_epoch=True)) main_loop = MainLoop( model=model, data_stream=train_padded, algorithm=algorithm, extensions=extensions) main_loop.run()
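# Illustrative sketch, not taken from the original code: what
# Padding(..., mask_sources=('features',)) produces conceptually. Each
# variable-length sequence in a batch is zero-padded to the longest one and a
# binary 'features_mask' source marks the real (non-padded) positions; that
# mask is what rnn.apply(..., mask=m) consumes above. Hand-rolled with numpy
# for two toy sequences of length 3 and 5 with 2 features each:
import numpy as np

seqs = [np.ones((3, 2), dtype='float32'), np.ones((5, 2), dtype='float32')]
max_len = max(len(s) for s in seqs)
features = np.zeros((len(seqs), max_len, seqs[0].shape[1]), dtype='float32')
mask = np.zeros((len(seqs), max_len), dtype='float32')
for i, s in enumerate(seqs):
    features[i, :len(s)] = s
    mask[i, :len(s)] = 1.0
print features.shape, mask.shape    # (2, 5, 2) (2, 5)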
def train(self): x = self.sharedBatch['x'] x.name = 'x_myinput' xmini = self.sharedBatch['xmini'] xmini.name = 'xmini_myinput' y = self.sharedBatch['y'] y.name = 'y_myinput' # we need to provide data for the LSTM layer of size 4 * ltsm_dim, see # LSTM layer documentation for the explanation x_to_h = Linear(self.input_dimx, self.dim, name='x_to_h', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) xmini_to_h = Linear(self.input_dimxmini, self.mini_dim, name='xmini_to_h', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) rnnwmini = RNNwMini(dim=self.dim, mini_dim=self.mini_dim, summary_dim=self.summary_dim) h_to_o = Linear(self.summary_dim, 1, name='h_to_o', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) x_transform = x_to_h.apply(x) xmini_transform = xmini_to_h.apply(xmini) h = rnnwmini.apply(x=x_transform, xmini=xmini_transform) # only values of hidden units of the last timeframe are used for # the classification y_hat = h_to_o.apply(h[-1]) #y_hat = Logistic().apply(y_hat) cost = SquaredError().apply(y, y_hat) cost.name = 'cost' rnnwmini.initialize() x_to_h.initialize() xmini_to_h.initialize() h_to_o.initialize() self.f = theano.function(inputs=[], outputs=y_hat) #print("self.f === ") #print(self.f()) #print(self.f().shape) #print("====") self.cg = ComputationGraph(cost) m = Model(cost) algorithm = GradientDescent(cost=cost, parameters=self.cg.parameters, step_rule=RMSProp(learning_rate=0.01), on_unused_sources='ignore') valid_monitor = DataStreamMonitoringShared( variables=[cost], data_stream=self.stream_valid_int, prefix="valid", sharedBatch=self.sharedBatch, sharedData=self.sharedData) train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train", after_epoch=True) sharedVarMonitor = SwitchSharedReferences(self.sharedBatch, self.sharedData) tBest = self.track_best('valid_cost', self.cg) self.tracker = tBest[0] extensions = [sharedVarMonitor, valid_monitor] + tBest if self.debug: extensions.append(Printing()) self.algorithm = algorithm self.extensions = extensions self.model = m self.mainloop = MainLoop(self.algorithm, self.stream_train_int, extensions=self.extensions, model=self.model) self.main_loop(True)
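# Illustrative sketch, not taken from the original code: train() above feeds
# data through shared variables (self.sharedBatch) instead of through function
# inputs, which is why theano.function is compiled with inputs=[]. You
# overwrite the shared storage and then call the function with no arguments.
# The same pattern in miniature:
import numpy as np
import theano

x_shared = theano.shared(np.zeros((4, 3), dtype=theano.config.floatX), name='x')
f = theano.function(inputs=[], outputs=(x_shared ** 2).sum())
x_shared.set_value(np.ones((4, 3), dtype=theano.config.floatX))
print f()    # 12.0 -- the function reads whatever currently sits in x_shared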
def main(num_epochs=100): x = tensor.matrix('features') m = tensor.matrix('features_mask') x_int = x.astype(dtype='int32').T train_dataset = TextFile('inspirational.txt') train_dataset.indexables[0] = numpy.array(sorted( train_dataset.indexables[0], key=len )) n_voc = len(train_dataset.dict.keys()) init_probs = numpy.array( [sum(filter(lambda idx:idx == w, [s[0] for s in train_dataset.indexables[ train_dataset.sources.index('features')]] )) for w in xrange(n_voc)], dtype=theano.config.floatX ) init_probs = init_probs / init_probs.sum() n_h = 100 linear_embedding = LookupTable( length=n_voc, dim=n_h, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) linear_embedding.initialize() lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX) lstm_biases[n_h:(2 * n_h)] = 4. rnn = SimpleRecurrent( dim=n_h, activation=Tanh(), weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) rnn.initialize() score_layer = Linear( input_dim=n_h, output_dim=n_voc, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) score_layer.initialize() embedding = (linear_embedding.apply(x_int[:-1]) * tensor.shape_padright(m.T[1:])) rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:]) probs = softmax( sequence_map(score_layer.apply, rnn_out, mask=m.T[1:])[0] ) idx_mask = m.T[1:].nonzero() cost = CategoricalCrossEntropy().apply( x_int[1:][idx_mask[0], idx_mask[1]], probs[idx_mask[0], idx_mask[1]] ) cost.name = 'cost' misclassification = MisclassificationRate().apply( x_int[1:][idx_mask[0], idx_mask[1]], probs[idx_mask[0], idx_mask[1]] ) misclassification.name = 'misclassification' cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost=cost, params=params, step_rule=Adam() ) train_data_stream = Padding( data_stream=DataStream( dataset=train_dataset, iteration_scheme=BatchwiseShuffledScheme( examples=train_dataset.num_examples, batch_size=10, ) ), mask_sources=('features',) ) model = Model(cost) extensions = [] extensions.append(Timing()) extensions.append(FinishAfter(after_n_epochs=num_epochs)) extensions.append(TrainingDataMonitoring( [cost, misclassification], prefix='train', after_epoch=True)) batch_size = 10 length = 30 trng = MRG_RandomStreams(18032015) u = trng.uniform(size=(length, batch_size, n_voc)) gumbel_noise = -tensor.log(-tensor.log(u)) init_samples = (tensor.log(init_probs).dimshuffle(('x', 0)) + gumbel_noise[0]).argmax(axis=-1) init_states = rnn.initial_state('states', batch_size) def sampling_step(g_noise, states, samples_step): embedding_step = linear_embedding.apply(samples_step) next_states = rnn.apply(inputs=embedding_step, states=states, iterate=False) probs_step = softmax(score_layer.apply(next_states)) next_samples = (tensor.log(probs_step) + g_noise).argmax(axis=-1) return next_states, next_samples [_, samples], _ = theano.scan( fn=sampling_step, sequences=[gumbel_noise[1:]], outputs_info=[init_states, init_samples] ) sampling = theano.function([], samples.owner.inputs[0].T) plotters = [] plotters.append(Plotter( channels=[['train_cost', 'train_misclassification']], titles=['Costs'])) extensions.append(PlotManager('Language modelling example', plotters=plotters, after_epoch=True, after_training=True)) extensions.append(Printing()) extensions.append(PrintSamples(sampler=sampling, voc=train_dataset.inv_dict)) main_loop = MainLoop(model=model, data_stream=train_data_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
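# Illustrative sketch, not taken from the original code: the sampler above
# relies on the Gumbel-max trick -- adding Gumbel(0, 1) noise to
# log-probabilities and taking the argmax draws an index from the categorical
# distribution defined by those probabilities. A quick numpy check:
import numpy as np

probs = np.array([0.2, 0.5, 0.3])
u = np.random.uniform(size=(100000, 3))
gumbel = -np.log(-np.log(u))
samples = (np.log(probs) + gumbel).argmax(axis=-1)
print np.bincount(samples) / float(len(samples))   # roughly [0.2, 0.5, 0.3]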
def main(): x = T.tensor3('features') #m = T.matrix('features_mask') y = T.imatrix('targets') #x = x+m.mean()*0 embedding_size = 300 glove_version = "glove.6B.300d.txt" #embedding_size = 50 #glove_version = "vectors.6B.50d.txt" wstd = 0.02 #vaguely normalize x = x / 3.0 - .5 #gloveMapping = Linear( #input_dim = embedding_size, #output_dim = 128, #weights_init = Orthogonal(), #biases_init = Constant(0.0), #name="gloveMapping" #) #gloveMapping.initialize() #o = gloveMapping.apply(x) #o = Rectifier(name="gloveRec").apply(o) o = x input_dim = 300 gru = GatedRecurrentFull( hidden_dim=input_dim, activation=Tanh(), #activation=bricks.Identity(), gate_activation=Sigmoid(), state_to_state_init=IsotropicGaussian(0.02), state_to_reset_init=IsotropicGaussian(0.02), state_to_update_init=IsotropicGaussian(0.02), input_to_state_transform=Linear(input_dim=input_dim, output_dim=input_dim, weights_init=IsotropicGaussian(0.02), biases_init=Constant(0.0)), input_to_update_transform=Linear(input_dim=input_dim, output_dim=input_dim, weights_init=IsotropicGaussian(0.02), biases_init=Constant(0.0)), input_to_reset_transform=Linear(input_dim=input_dim, output_dim=input_dim, weights_init=IsotropicGaussian(0.02), biases_init=Constant(0.0))) gru.initialize() rnn_in = o.dimshuffle(1, 0, 2) #rnn_in = o #rnn_out = gru.apply(rnn_in, mask=m.T) rnn_out = gru.apply(rnn_in) state_to_state = gru.rnn.state_to_state state_to_state.name = "state_to_state" #o = rnn_out[-1, :, :] o = rnn_out[-1] #o = rnn_out[:, -1, :] #o = rnn_out.mean(axis=1) #print rnn_last_out.eval({ #x: np.ones((3, 101, 300), dtype=theano.config.floatX), #m: np.ones((3, 101), dtype=theano.config.floatX)}) #raw_input() #o = rnn_out.mean(axis=1) score_layer = Linear(input_dim=300, output_dim=1, weights_init=IsotropicGaussian(std=wstd), biases_init=Constant(0.), use_bias=True, name="linear_score") score_layer.initialize() o = score_layer.apply(o) probs = Sigmoid().apply(o) cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean() cost.name = 'cost' misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean() misclassification.name = 'misclassification' #print rnn_in.shape.eval( #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX), #}) #print rnn_out.shape.eval( #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX), #m : np.ones((45, 111), dtype=theano.config.floatX)}) #print (m).sum(axis=1).shape.eval({ #m : np.ones((45, 111), dtype=theano.config.floatX)}) #print (m).shape.eval({ #m : np.ones((45, 111), dtype=theano.config.floatX)}) #raw_input() # ================= cg = ComputationGraph([cost]) #cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5) params = cg.parameters for p in params: p.name += "___" + p.tag.annotations[0].name algorithm = GradientDescent( cost=cg.outputs[0], params=params, step_rule=CompositeRule([ StepClipping(threshold=4), AdaM(), #NAG(lr=0.1, momentum=0.9), #AdaDelta(), ])) #algorithm.initialize() print params f = theano.function([x, y], algorithm.cost) ipdb.set_trace() print "making plots" #theano.printing.pydotprint(algorithm.cost, outfile='unopt.png') theano.printing.pydotprint(f, outfile='opt.png', scan_graphs=True)
def main(): nvis, nhid, nlat, learn_prior = 784, 200, 100, False theano_rng = MRG_RandomStreams(134663) # Initialize prior prior_mu = shared_floatx(numpy.zeros(nlat), name='prior_mu') prior_log_sigma = shared_floatx(numpy.zeros(nlat), name='prior_log_sigma') if learn_prior: add_role(prior_mu, PARAMETER) add_role(prior_log_sigma, PARAMETER) # Initialize encoding network encoding_network = MLP(activations=[Rectifier()], dims=[nvis, nhid], weights_init=IsotropicGaussian(std=0.001), biases_init=Constant(0)) encoding_network.initialize() encoding_parameter_mapping = Fork( output_names=['mu_phi', 'log_sigma_phi'], input_dim=nhid, output_dims=dict(mu_phi=nlat, log_sigma_phi=nlat), prototype=Linear(), weights_init=IsotropicGaussian(std=0.001), biases_init=Constant(0)) encoding_parameter_mapping.initialize() # Initialize decoding network decoding_network = MLP(activations=[Rectifier()], dims=[nlat, nhid], weights_init=IsotropicGaussian(std=0.001), biases_init=Constant(0)) decoding_network.initialize() decoding_parameter_mapping = Linear( input_dim=nhid, output_dim=nvis, name='mu_theta', weights_init=IsotropicGaussian(std=0.001), biases_init=Constant(0)) decoding_parameter_mapping.initialize() # Encode / decode x = tensor.matrix('features') h_phi = encoding_network.apply(x) mu_phi, log_sigma_phi = encoding_parameter_mapping.apply(h_phi) epsilon = theano_rng.normal(size=mu_phi.shape, dtype=mu_phi.dtype) epsilon.name = 'epsilon' z = mu_phi + epsilon * tensor.exp(log_sigma_phi) z.name = 'z' h_theta = decoding_network.apply(z) mu_theta = decoding_parameter_mapping.apply(h_theta) # Compute cost kl_term = ( prior_log_sigma - log_sigma_phi + 0.5 * ( tensor.exp(2 * log_sigma_phi) + (mu_phi - prior_mu) ** 2 ) / tensor.exp(2 * prior_log_sigma) - 0.5 ).sum(axis=1) kl_term.name = 'kl_term' kl_term_mean = kl_term.mean() kl_term_mean.name = 'avg_kl_term' reconstruction_term = - ( x * tensor.nnet.softplus(-mu_theta) + (1 - x) * tensor.nnet.softplus(mu_theta)).sum(axis=1) reconstruction_term.name = 'reconstruction_term' reconstruction_term_mean = -reconstruction_term.mean() reconstruction_term_mean.name = 'avg_reconstruction_term' cost = -(reconstruction_term - kl_term).mean() cost.name = 'nll_upper_bound' # Datasets and data streams mnist_train = MNIST( 'train', start=0, stop=50000, binary=True, sources=('features',)) train_loop_stream = DataStream( dataset=mnist_train, iteration_scheme=SequentialScheme(mnist_train.num_examples, 100)) train_monitor_stream = DataStream( dataset=mnist_train, iteration_scheme=SequentialScheme(mnist_train.num_examples, 500)) mnist_valid = MNIST( 'train', start=50000, stop=60000, binary=True, sources=('features',)) valid_monitor_stream = DataStream( dataset=mnist_valid, iteration_scheme=SequentialScheme(mnist_valid.num_examples, 500)) mnist_test = MNIST('test', binary=True, sources=('features',)) test_monitor_stream = DataStream( dataset=mnist_test, iteration_scheme=SequentialScheme(mnist_test.num_examples, 500)) # Get parameters computation_graph = ComputationGraph([cost]) params = VariableFilter(roles=[PARAMETER])(computation_graph.variables) # Training loop step_rule = RMSProp(learning_rate=1e-3, decay_rate=0.95) algorithm = GradientDescent(cost=cost, params=params, step_rule=step_rule) monitored_quantities = [cost, reconstruction_term_mean, kl_term_mean] main_loop = MainLoop( model=None, data_stream=train_loop_stream, algorithm=algorithm, extensions=[ Timing(), FinishAfter(after_n_epochs=200), DataStreamMonitoring( monitored_quantities, train_monitor_stream, prefix="train"), 
DataStreamMonitoring( monitored_quantities, valid_monitor_stream, prefix="valid"), DataStreamMonitoring( monitored_quantities, test_monitor_stream, prefix="test"), Printing()]) main_loop.run()
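# Illustrative sketch, not taken from the original code: the kl_term above is
# the closed-form KL divergence between the diagonal Gaussian posterior
# q(z|x) = N(mu_phi, exp(log_sigma_phi)^2) and the prior
# N(prior_mu, exp(prior_log_sigma)^2), summed over latent dimensions. The same
# expression as a stand-alone helper:
from theano import tensor


def gaussian_kl(mu_q, log_sigma_q, mu_p, log_sigma_p):
    """KL(N(mu_q, sigma_q^2) || N(mu_p, sigma_p^2)), summed over the last axis."""
    return (log_sigma_p - log_sigma_q
            + 0.5 * (tensor.exp(2 * log_sigma_q) + (mu_q - mu_p) ** 2)
            / tensor.exp(2 * log_sigma_p)
            - 0.5).sum(axis=1)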
def __init__(self): srng = MRG_RandomStreams(seed=123) X = T.matrix('features') self.X = X #drop = Dropout(p_drop=0.5) #o = drop.apply(X) o = (X - 128) / 128.0 self.scaled = o #n_hidden = 64 n_hidden = 2048 * 2 n_zs = 1024 self.n_zs = n_zs self.n_hidden = n_hidden l = Linear(input_dim=32*32*3, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() o = l.apply(o) o = Rectifier().apply(o) l = Linear(input_dim=n_hidden, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() o = l.apply(o) o = Rectifier().apply(o) l = Linear(input_dim=n_hidden, output_dim=n_zs, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() mu_encoder = l.apply(o) l = Linear(input_dim=n_hidden, output_dim=n_zs, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() log_sigma_encoder = l.apply(o) eps = srng.normal(log_sigma_encoder.shape) z = eps * T.exp(log_sigma_encoder) + mu_encoder z_to_h1_decode = Linear(input_dim=n_zs, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) z_to_h1_decode.initialize() h1_decode_to_h_decode = Linear(input_dim=n_hidden, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) h1_decode_to_h_decode.initialize() h_decode_produce = Linear(input_dim=n_hidden, output_dim=32*32*3, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="linear4") h_decode_produce.initialize() #o = h_decode_produce.apply(h_decoder) h_decode_produce = Linear(input_dim=n_hidden, output_dim=32*32*3, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="linear4") h_decode_produce.initialize() #self.produced = Sigmoid().apply(o) seq = Sequence([z_to_h1_decode.apply, Rectifier().apply, h1_decode_to_h_decode.apply, Rectifier().apply, h_decode_produce.apply, Sigmoid().apply]) seq.initialize() self.produced = seq.apply(z) self.cost = T.mean(T.sqr(self.produced - self.scaled)) #self.cost = T.sum(T.nnet.binary_crossentropy(self.produced, self.scaled)) #T.sum(T.sqr(self.produced - self.scaled)) self.cost.name = "cost" self.variational_cost = - 0.5 * T.mean(1 + 2*log_sigma_encoder - mu_encoder * mu_encoder\ - T.exp(2 * log_sigma_encoder)) + self.cost self.variational_cost.name = "variational_cost" self.Z = T.matrix('z') self.sampled = seq.apply(self.Z) cg = ComputationGraph([self.variational_cost]) bricks = [get_brick(var) for var in cg.variables + cg.scan_variables if get_brick(var)] for i, b in enumerate(bricks): b.name = b.name + "_" + str(i)
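# Illustrative sketch, not taken from the original code: once the autoencoder
# class above is trained, new samples can be drawn by pushing noise through
# `self.sampled`. Here `ae` is assumed to be a trained instance of that class.
import numpy as np
import theano

sample_fn = theano.function([ae.Z], ae.sampled)
z = np.random.randn(16, ae.n_zs).astype(theano.config.floatX)
images = sample_fn(z)    # (16, 32*32*3), values in (0, 1) from the final Sigmoid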
def build_model_hard(vocab_size, args, dtype=floatX): logger.info('Building model ...') # Parameters for the model context = args.context state_dim = args.state_dim layers = args.layers skip_connections = args.skip_connections # Symbolic variables # In both cases: Time X Batch x = tensor.lmatrix('features') y = tensor.lmatrix('targets') # Build the model output_names = [] output_dims = [] for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if d == 0 or skip_connections: output_names.append("inputs" + suffix) output_dims.append(state_dim) lookup = LookupTable(length=vocab_size, dim=state_dim) lookup.weights_init = initialization.IsotropicGaussian(0.1) lookup.biases_init = initialization.Constant(0) fork = Fork(output_names=output_names, input_dim=args.mini_batch_size, output_dims=output_dims, prototype=FeedforwardSequence([lookup.apply])) transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())] for i in range(layers - 1): mlp = MLP(activations=[Logistic()], dims=[2 * state_dim, 1], weights_init=initialization.IsotropicGaussian(0.1), biases_init=initialization.Constant(0), name="mlp_" + str(i)) transitions.append( HardGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh())) rnn = RecurrentStack(transitions, skip_connections=skip_connections) # dim = layers * state_dim output_layer = Linear(input_dim=layers * state_dim, output_dim=vocab_size, name="output_layer") # Return list of 3D Tensor, one for each layer # (Time X Batch X embedding_dim) pre_rnn = fork.apply(x) # Give a name to the input of each layer if skip_connections: for t in range(len(pre_rnn)): pre_rnn[t].name = "pre_rnn_" + str(t) else: pre_rnn.name = "pre_rnn" # Prepare inputs for the RNN kwargs = OrderedDict() init_states = {} for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if skip_connections: kwargs['inputs' + suffix] = pre_rnn[d] elif d == 0: kwargs['inputs' + suffix] = pre_rnn init_states[d] = theano.shared(numpy.zeros( (args.mini_batch_size, state_dim)).astype(floatX), name='state0_%d' % d) kwargs['states' + suffix] = init_states[d] # Apply the RNN to the inputs h = rnn.apply(low_memory=True, **kwargs) # Now we have correctly: # h = [state_1, state_2, state_3 ...] # Save all the last states last_states = {} for d in range(layers): last_states[d] = h[d][-1, :, :] # Concatenate all the states if layers > 1: h = tensor.concatenate(h, axis=2) h.name = "hidden_state" # The updates of the hidden states updates = [] for d in range(layers): updates.append((init_states[d], last_states[d])) presoft = output_layer.apply(h[context:, :, :]) # Define the cost # Compute the probability distribution time, batch, feat = presoft.shape presoft.name = 'presoft' cross_entropy = Softmax().categorical_cross_entropy( y[context:, :].flatten(), presoft.reshape((batch * time, feat))) cross_entropy = cross_entropy / tensor.log(2) cross_entropy.name = "cross_entropy" # TODO: add regularisation for the cost # the log(1) is here in order to differentiate the two variables # for monitoring cost = cross_entropy + tensor.log(1) cost.name = "regularized_cost" # Initialize the model logger.info('Initializing...') fork.initialize() rnn.weights_init = initialization.Orthogonal() rnn.biases_init = initialization.Constant(0) rnn.initialize() output_layer.weights_init = initialization.IsotropicGaussian(0.1) output_layer.biases_init = initialization.Constant(0) output_layer.initialize() return cost, cross_entropy, updates
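# Illustrative sketch, not taken from the original code: build_model_hard
# returns the graph pieces but no training loop, so a caller would typically
# wire them up along the lines of the other scripts in this collection.
# `train_stream` and `args.epochs` are assumptions, and the step rule is only
# one plausible choice.
cost, cross_entropy, updates = build_model_hard(vocab_size, args)
cg = ComputationGraph([cost])
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule([StepClipping(threshold=1.0),
                                                     Adam()]))
# carry the last hidden states over to the next batch
algorithm.add_updates(updates)
main_loop = MainLoop(model=Model(cost), data_stream=train_stream,
                     algorithm=algorithm,
                     extensions=[FinishAfter(after_n_epochs=args.epochs),
                                 TrainingDataMonitoring([cost, cross_entropy],
                                                        prefix="train",
                                                        after_epoch=True),
                                 Printing()])
main_loop.run()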
class EUTHM(UTHM): ''' UTH model with extend information ''' def __init__(self, config, dataset, *args, **kwargs): super(EUTHM, self).__init__(config, dataset) def _define_inputs(self, *args, **kwargs): super(EUTHM, self)._define_inputs() self.user_word = tensor.ivector('user_word') self.user_word_sparse_mask = tensor.vector('user_word_sparse_mask', dtype=theano.config.floatX) self.user_word_left_idx = tensor.ivector('user_word_idx_left_idx') self.user_word_right_idx = tensor.ivector('user_word_idx_right_idx') self.hashtag_word = tensor.ivector('hashtag_word') self.hashtag_sparse_mask = tensor.vector('hashtag_word_sparse_mask', dtype=theano.config.floatX) self.hashtag_word_left_idx = tensor.ivector( 'hashtag_word_idx_left_idx') self.hashtag_word_right_idx = tensor.ivector( 'hashtag_word_idx_right_idx') self.sparse_word = tensor.imatrix('sparse_word') self.sparse_word_sparse_mask = tensor.vector( 'sparse_word_sparse_mask', dtype=theano.config.floatX) self.sparse_word_mask = tensor.matrix('sparse_word_mask', dtype=theano.config.floatX) self.sparse_word_left_idx = tensor.ivector('sparse_word_idx_left_idx') self.sparse_word_right_idx = tensor.ivector( 'sparse_word_idx_right_idx') def _build_bricks(self, *args, **kwargs): # Build lookup tables super(EUTHM, self)._build_bricks() self.user2word = MLP( activations=[Tanh('user2word_tanh')], dims=[self.config.user_embed_dim, self.config.word_embed_dim], name='user2word_mlp') self.user2word.weights_init = IsotropicGaussian( std=1 / numpy.sqrt(self.config.word_embed_dim)) self.user2word.biases_init = Constant(0) self.user2word.initialize() self.hashtag2word = MLP( activations=[Tanh('hashtag2word_tanh')], dims=[ self.config.user_embed_dim + self.config.word_embed_dim, self.config.word_embed_dim ], name='hashtag2word_mlp') self.hashtag2word.weights_init = IsotropicGaussian( std=1 / numpy.sqrt(self.config.word_embed_dim)) self.hashtag2word.biases_init = Constant(0) self.hashtag2word.initialize() self.user2word_bias = Bias(dim=1, name='user2word_bias') self.user2word_bias.biases_init = Constant(0) self.user2word_bias.initialize() self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias') self.hashtag2word_bias.biases_init = Constant(0) self.hashtag2word_bias.initialize() #Build character embedding self.char_embed = self._embed(len(self.dataset.char2index), self.config.char_embed_dim, name='char_embed') # Build sparse word encoder self.rnn_ins = Linear(input_dim=self.config.char_embed_dim, output_dim=self.config.word_embed_dim, name='rnn_in') self.rnn_ins.weights_init = IsotropicGaussian( std=numpy.sqrt(2) / numpy.sqrt(self.config.char_embed_dim + self.config.word_embed_dim)) self.rnn_ins.biases_init = Constant(0) self.rnn_ins.initialize() self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim, activation=Tanh()) self.rnn.weights_init = IsotropicGaussian( std=1 / numpy.sqrt(self.config.word_embed_dim)) self.rnn.initialize() def _set_OV_value(self, *args, **kwargs): '''Train a <unk> representation''' tensor.set_subtensor( self.char_embed.W[self.dataset.char2index['<unk>']], numpy.zeros(self.config.char_embed_dim, dtype=theano.config.floatX)) def _get_text_vec(self, *args, **kwargs): # Transpose text self.text = self.text.dimshuffle(1, 0) self.text_mask = self.text_mask.dimshuffle(1, 0) self.sparse_word = self.sparse_word.dimshuffle(1, 0) self.sparse_word_mask = self.sparse_word_mask.dimshuffle(1, 0) # Turn word, user and hashtag into vector representation text_vec = self.word_embed.apply(self.text) # Apply user word, hashtag word and url text_vec = 
self._apply_user_word(text_vec) text_vec = self._apply_hashtag_word(text_vec) text_vec = self._apply_sparse_word(text_vec) return text_vec @abstractmethod def _apply_user_word(self, text_vec, *args, **kwargs): ''' Replace @a with transformed author vector :param text_vec: :param args: :param kwargs: :return: ''' user_word_vec = self.user2word.apply(self.user_embed.apply(self.user_word)) + \ self.user2word_bias.parameters[0][0] text_vec = tensor.set_subtensor( text_vec[self.user_word_right_idx, self.user_word_left_idx], text_vec[self.user_word_right_idx, self.user_word_left_idx] * (1 - self.user_word_sparse_mask[:, None]) + user_word_vec * self.user_word_sparse_mask[:, None]) return text_vec @abstractmethod def _apply_hashtag_word(self, text_vec, *args, **kwargs): ''' Replace #h with transformed hashtag vector :param text_vec: :param args: :param kwargs: :return: ''' hashtag_word_vec = self.hashtag2word.apply(self.hashtag_embed.apply(self.hashtag_word)) +\ self.hashtag2word_bias.parameters[0][0] text_vec = tensor.set_subtensor( text_vec[self.hashtag_word_right_idx, self.hashtag_word_left_idx], text_vec[self.hashtag_word_right_idx, self.hashtag_word_left_idx] * (1 - self.hashtag_sparse_mask[:, None]) + hashtag_word_vec * self.hashtag_sparse_mask[:, None]) return text_vec @abstractmethod def _apply_sparse_word(self, text_vec, *args, **kwargs): ''' Replace sparse word encoding with character embedding. (maybe lstm) :param text_vec: :param args: :param kwargs: :return: ''' sparse_word_vec = self.char_embed.apply(self.sparse_word) sparse_word_hiddens = self.rnn.apply( inputs=self.rnn_ins.apply(sparse_word_vec), mask=self.sparse_word_mask) tmp = sparse_word_hiddens[-1] text_vec = tensor.set_subtensor( text_vec[self.sparse_word_right_idx, self.sparse_word_left_idx], text_vec[self.sparse_word_right_idx, self.sparse_word_left_idx] * (1 - self.sparse_word_sparse_mask[:, None]) + tmp * self.sparse_word_sparse_mask[:, None]) return text_vec
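# Illustrative sketch, not taken from the original code: the three _apply_*
# methods above share one pattern -- overwrite a handful of (row, column)
# positions in the (time, batch, dim) text tensor with a replacement vector,
# gated by a per-position mask. Reduced to its essentials (all names here are
# illustrative):
from theano import tensor


def blend_positions(text_vec, rows, cols, replacement, mask):
    """Blend `replacement` into text_vec[rows, cols] where mask == 1."""
    old = text_vec[rows, cols]
    new = old * (1 - mask[:, None]) + replacement * mask[:, None]
    return tensor.set_subtensor(text_vec[rows, cols], new)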
def main(max_seq_length, lstm_dim, batch_size, num_batches, num_epochs):
    dataset_train = IterableDataset(
        generate_data(max_seq_length, batch_size, num_batches))
    dataset_test = IterableDataset(
        generate_data(max_seq_length, batch_size, 100))

    stream_train = DataStream(dataset=dataset_train)
    stream_test = DataStream(dataset=dataset_test)

    x = T.tensor3('x')
    y = T.matrix('y')

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(1, lstm_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(lstm_dim, name='lstm',
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))
    h_to_o = Linear(lstm_dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    h, c = lstm.apply(x_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    y_hat = Logistic().apply(y_hat)

    cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstm.initialize()
    x_to_h.initialize()
    h_to_o.initialize()

    cg = ComputationGraph(cost)

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Adam())
    test_monitor = DataStreamMonitoring(variables=[cost],
                                        data_stream=stream_test,
                                        prefix="test")
    train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train",
                                           after_epoch=True)

    main_loop = MainLoop(algorithm, stream_train,
                         extensions=[test_monitor, train_monitor,
                                     FinishAfter(after_n_epochs=num_epochs),
                                     Printing(), ProgressBar()])
    main_loop.run()

    print('Learned weights:')
    for layer in (x_to_h, lstm, h_to_o):
        print("Layer '%s':" % layer.name)
        for param in layer.parameters:
            print(param.name, ': ', param.get_value())
        print()

    return main_loop
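# NOTE: generate_data is not defined in this file. Given how it is consumed
# above (IterableDataset + DataStream with no iteration scheme, a tensor3 'x'
# and a matrix 'y'), it is assumed to return one pre-batched array per source:
# 'x' shaped (num_batches, max_seq_length, batch_size, 1) and 'y' shaped
# (num_batches, batch_size, 1). The stand-in below is a hypothetical sketch
# only; the task encoded by the original generate_data may differ.

def generate_data(max_seq_length, batch_size, num_batches):
    """Hypothetical sketch: label each random sequence by whether its mean
    exceeds 0.5."""
    x = np.random.uniform(
        size=(num_batches, max_seq_length, batch_size, 1)
    ).astype(theano.config.floatX)
    y = (x.mean(axis=1) > 0.5).astype(theano.config.floatX)
    return {'x': x, 'y': y}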
def main():
    x = T.tensor3('features')
    #m = T.matrix('features_mask')
    y = T.imatrix('targets')
    #x = x + m.mean() * 0

    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    #embedding_size = 50
    #glove_version = "vectors.6B.50d.txt"
    wstd = 0.02

    #gloveMapping = Linear(
    #    input_dim=embedding_size,
    #    output_dim=128,
    #    weights_init=Orthogonal(),
    #    biases_init=Constant(0.0),
    #    name="gloveMapping")
    #gloveMapping.initialize()
    #o = gloveMapping.apply(x)
    #o = Rectifier(name="gloveRec").apply(o)

    # vaguely normalize the embeddings; keep x itself as the symbolic input
    o = x / 3.0 - .5

    input_dim = 300
    gru = GatedRecurrentFull(
        hidden_dim=input_dim,
        activation=Tanh(),
        #activation=bricks.Identity(),
        gate_activation=Sigmoid(),
        state_to_state_init=IsotropicGaussian(0.02),
        state_to_reset_init=IsotropicGaussian(0.02),
        state_to_update_init=IsotropicGaussian(0.02),
        input_to_state_transform=Linear(
            input_dim=input_dim,
            output_dim=input_dim,
            weights_init=IsotropicGaussian(0.02),
            biases_init=Constant(0.0)),
        input_to_update_transform=Linear(
            input_dim=input_dim,
            output_dim=input_dim,
            weights_init=IsotropicGaussian(0.02),
            biases_init=Constant(0.0)),
        input_to_reset_transform=Linear(
            input_dim=input_dim,
            output_dim=input_dim,
            weights_init=IsotropicGaussian(0.02),
            biases_init=Constant(0.0))
    )
    gru.initialize()

    # put the time axis first for the recurrent brick
    rnn_in = o.dimshuffle(1, 0, 2)
    #rnn_in = o
    #rnn_out = gru.apply(rnn_in, mask=m.T)
    rnn_out = gru.apply(rnn_in)
    state_to_state = gru.rnn.state_to_state
    state_to_state.name = "state_to_state"

    # use the last timestep's hidden state as the sequence representation
    #o = rnn_out[-1, :, :]
    o = rnn_out[-1]
    #o = rnn_out[:, -1, :]
    #o = rnn_out.mean(axis=1)

    #print rnn_last_out.eval({
    #    x: np.ones((3, 101, 300), dtype=theano.config.floatX),
    #    m: np.ones((3, 101), dtype=theano.config.floatX)})
    #raw_input()
    #o = rnn_out.mean(axis=1)

    score_layer = Linear(
        input_dim=300,
        output_dim=1,
        weights_init=IsotropicGaussian(std=wstd),
        biases_init=Constant(0.),
        use_bias=True,
        name="linear_score")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    cost = - (y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    #print rnn_in.shape.eval(
    #    {x: np.ones((45, 111, embedding_size), dtype=theano.config.floatX)})
    #print rnn_out.shape.eval(
    #    {x: np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    #     m: np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).sum(axis=1).shape.eval({
    #    m: np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).shape.eval({
    #    m: np.ones((45, 111), dtype=theano.config.floatX)})
    #raw_input()

    # =================

    cg = ComputationGraph([cost])
    #cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5)
    params = cg.parameters
    # tag each parameter name with the brick it belongs to
    for p in params:
        p.name += "___" + p.tag.annotations[0].name

    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=4),
            AdaM(),
            #NAG(lr=0.1, momentum=0.9),
            #AdaDelta(),
        ])
    )
    #algorithm.initialize()

    print params
    f = theano.function([x, y], algorithm.cost)

    ipdb.set_trace()
    print "making plots"
    #theano.printing.pydotprint(algorithm.cost, outfile='unopt.png')
    theano.printing.pydotprint(f, outfile='opt.png', scan_graphs=True)
def main(num_epochs=100):
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')
    y = tensor.imatrix('targets')

    x_int = x.astype(dtype='int32').T - 2

    train_dataset = IMDB()
    idx_sort = numpy.argsort(
        [len(s) for s in train_dataset.indexables[
            train_dataset.sources.index('features')]]
    )
    n_voc = len(train_dataset.dict.keys())
    for idx in xrange(len(train_dataset.sources)):
        train_dataset.indexables[idx] = train_dataset.indexables[idx][idx_sort]

    n_h = 10
    linear_embedding = LookupTable(
        length=n_voc,
        dim=4 * n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()

    lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX)
    lstm_biases[n_h:(2 * n_h)] = 4.

    rnn = Bidirectional(LSTM(
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    ))
    rnn.initialize()

    score_layer = Linear(
        input_dim=2 * n_h,
        output_dim=1,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()

    embedding = linear_embedding.apply(x_int) * tensor.shape_padright(m.T)
    rnn_out = rnn.apply(embedding)
    rnn_out_mean_pooled = tensor.mean(rnn_out[0], axis=0)

    probs = Sigmoid().apply(
        score_layer.apply(rnn_out_mean_pooled))

    cost = - (y * tensor.log(probs)
              + (1 - y) * tensor.log(1 - probs)).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5)
                         + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule(
            components=[StepClipping(threshold=10.),
                        Adam()]
        )
    )

    n_train = int(numpy.floor(.8 * train_dataset.num_examples))
    n_valid = int(numpy.floor(.1 * train_dataset.num_examples))

    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(100),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    valid_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(100, 110),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    test_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(110, 120),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification], test_data_stream, prefix='test'))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification], valid_data_stream, prefix='valid'))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification], prefix='train', after_epoch=True))

    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification',
                   'valid_cost', 'valid_misclassification']],
        titles=['Costs']))
    extensions.append(PlotManager('IMDB classification example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())

    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()