def __init__(self, rng, input, n_in, n_out, mu_activation, n_components):
    """Build the symbolic outputs of a mixture-density output layer.

    Three projections of ``input`` are created, none of them with a bias
    term: component means (``self.mu``), component scales (``self.sigma``)
    and mixing coefficients (``self.mixing``).

    :param rng: random generator forwarded to ``u.init_weight``.
    :param input: symbolic input, contracted along its axis 1
        (presumably shaped ``(batch, n_in)`` -- confirm with callers).
    :param n_in: size of the contracted input dimension.
    :param n_out: second dimension of the mean weight tensor.
    :param mu_activation: callable applied to the mean projection.
    :param n_components: number of mixture components.
    """
    self.input = input

    # Means: contract input axis 1 with axis 0 of a 3-D weight tensor.
    mu_weights = u.init_weight((n_in, n_out, n_components), rng=rng, name='whid', sample='glorot')
    self.W_mu = mu_weights
    mu_pre = T.tensordot(input, mu_weights, axes=[[1], [0]])
    self.mu = mu_activation(mu_pre)

    # Scales and mixing coefficients share the same weight shape.
    per_component_shape = (n_in, n_components)
    self.W_sigma = u.init_weight(per_component_shape, rng=rng, name='W_sigma', sample='glorot')
    self.W_mixing = u.init_weight(per_component_shape, rng=rng, name='W_mixing', sample='glorot')

    sigma_pre = T.dot(input, self.W_sigma)
    mixing_pre = T.dot(input, self.W_mixing)
    self.sigma = T.nnet.softplus(sigma_pre)
    self.mixing = T.nnet.softmax(mixing_pre)

    # Trainable parameters of the layer.
    self.params = [self.W_mu, self.W_sigma, self.W_mixing]
def __init__(self,rng,params,cost_function='mse',optimizer = RMSprop):
    """Build and compile a two-layer denoising auto-encoder with tied weights.

    ``params`` must provide ``lr``, ``batch_size``, ``n_output`` and
    ``corruption_level``.  During training (``is_train != 0``) Gaussian
    noise (std 0.03) is added to the input; the noise is gated by a
    binomial mask that is shared by each group of three consecutive
    input values.  The decoder reuses the transposed encoder weights and
    only owns its biases.

    Compiled callables:
      * ``self.train(X, Y, is_train)`` -> regularised cost (applies updates)
      * ``self.predictions(X, is_train)`` -> reconstruction
      * ``self.mid_layer(X, is_train)`` -> output of the second encoder layer

    :param rng: random stream used for the noise and the weight init.
    :param params: dict of hyper-parameters (see above).
    :param cost_function: name forwarded to ``get_err_fn``.
    :param optimizer: optimizer class called as ``optimizer(cost, params, lr=lr)``.
    """
    lr = params["lr"]
    batch_size = params["batch_size"]
    n_output = params['n_output']
    corruption_level = params["corruption_level"]

    X = T.matrix(name="input", dtype=dtype)
    Y = T.matrix(name="output", dtype=dtype)
    is_train = T.iscalar('is_train')  # pseudo boolean for switching between training and prediction

    # Floor division keeps the shape entries integral on Python 3 as well.
    n_triplets = n_output // 3

    # One keep/drop decision per triplet, broadcast over its 3 values.
    bin_noise = rng.binomial(size=(batch_size, n_triplets, 1), n=1, p=1 - corruption_level, dtype=theano.config.floatX)
    bin_noise_3d = T.concatenate((bin_noise, bin_noise, bin_noise), axis=2)
    noise = rng.normal(size=(batch_size, n_output), std=0.03, avg=0.0, dtype=theano.config.floatX)
    noise_bin = T.reshape(noise, (batch_size, n_triplets, 3)) * bin_noise_3d
    X_train = T.reshape(noise_bin, (batch_size, n_output)) + X
    X_tilde = T.switch(T.neq(is_train, 0), X_train, X)

    # Encoder weights; the decoder uses their transposes (tied weights).
    W_1_e = u.init_weight(shape=(n_output, 1024), rng=rng, name="w_hid", sample="glorot")
    b_1_e = u.init_bias(1024, rng)
    W_2_e = u.init_weight(shape=(1024, 2048), rng=rng, name="w_hid", sample="glorot")
    b_2_e = u.init_bias(2048, rng)

    W_2_d = W_2_e.T
    b_2_d = u.init_bias(1024, rng)
    W_1_d = W_1_e.T
    b_1_d = u.init_bias(n_output, rng)

    h_1_e = HiddenLayer(rng, X_tilde, 0, 0, W=W_1_e, b=b_1_e, activation=nn.relu)
    h_2_e = HiddenLayer(rng, h_1_e.output, 0, 0, W=W_2_e, b=b_2_e, activation=nn.relu)
    h_2_d = HiddenLayer(rng, h_2_e.output, 0, 0, W=W_2_d, b=b_2_d, activation=u.do_nothing)
    h_1_d = LogisticRegression(rng, h_2_d.output, 0, 0, W=W_1_d, b=b_1_d)

    self.output = h_1_d.y_pred

    # Decoder weights are views of the encoder weights, so only the
    # decoder biases are added as extra trainable parameters.
    self.params = h_1_e.params + h_2_e.params
    self.params.append(b_2_d)
    self.params.append(b_1_d)

    cost = get_err_fn(self, cost_function, Y)

    # L2 penalty over every parameter tensor.  The previous code summed
    # param[0] and param[1], i.e. only the first two rows/elements of
    # each tensor, which left almost all weights unregularised.
    L2_reg = 0.0001
    L2_sqr = theano.shared(0.)
    for param in self.params:
        L2_sqr += T.sum(param ** 2)
    cost += L2_reg * L2_sqr

    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y, is_train], outputs=cost, updates=_optimizer.getUpdates(), allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X, is_train], outputs=self.output, allow_input_downcast=True)
    self.mid_layer = theano.function(inputs=[X, is_train], outputs=h_2_e.output, allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self,rng, params,cost_function='mse',optimizer = RMSprop):
    """Assemble and compile a three-layer stacked LSTM sequence regressor.

    ``params`` must provide ``batch_size``, ``seq_length``, ``lr``,
    ``n_hidden`` and ``n_output``; the input width is also read from
    ``n_output``.  A dropout layer (prob 0.5) sits between the first and
    second LSTM layers and receives one slice of a pre-sampled binomial
    mask per time step.

    Compiled callables:
      * ``self.train(X, Y, is_train)`` -> cost (applies optimizer updates)
      * ``self.predictions(X, is_train)`` -> outputs, batch-major

    :param rng: random stream used for weight init and the dropout mask.
    :param params: dict of hyper-parameters (see above).
    :param cost_function: name forwarded to ``get_err_fn``.
    :param optimizer: optimizer class called as ``optimizer(cost, params, lr=lr)``.
    """
    lr = params['lr']
    n_batch = params['batch_size']
    n_steps = params["seq_length"]

    self.n_in = params['n_output']
    self.n_lstm = params['n_hidden']
    self.n_out = params['n_output']

    # Read-out projection first, then the recurrent stack.
    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng, name='W_hy', sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    lstm_a = LSTMLayer(rng, 0, self.n_in, self.n_lstm)
    lstm_b = LSTMLayer(rng, 1, self.n_lstm, self.n_lstm)
    lstm_c = LSTMLayer(rng, 2, self.n_lstm, self.n_lstm)

    self.params = lstm_a.params + lstm_b.params + lstm_c.params
    self.params += [self.W_hy, self.b_y]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    is_train = T.iscalar('is_train')  # pseudo boolean: training vs prediction

    def step_lstm(x_t, mask_t, h_prev_1, c_prev_1, h_prev_2, c_prev_2, h_prev_3, c_prev_3):
        # One time step through the three layers plus the linear read-out.
        h_1, c_1, out_1 = lstm_a.run(x_t, h_prev_1, c_prev_1)
        dropped = DropoutLayer(rng, input=out_1, prob=0.5, is_train=is_train, mask=mask_t)
        h_2, c_2, out_2 = lstm_b.run(dropped.output, h_prev_2, c_prev_2)
        h_3, c_3, out_3 = lstm_c.run(out_2, h_prev_3, c_prev_3)
        y_t = T.dot(out_3, self.W_hy) + self.b_y
        return [h_1, c_1, h_2, c_2, h_3, c_3, y_t]

    def zero_state():
        # Zero-filled shared variable of shape (batch, n_lstm).
        return shared(np.zeros(shape=(n_batch, self.n_lstm), dtype=dtype))

    # Hidden/cell state pairs for layers 1..3, in scan order.
    initial_states = [zero_state() for _ in range(6)]

    keep_prob = 0.5
    mask = rng.binomial(size=(n_steps, n_batch, self.n_lstm), p=keep_prob, dtype=X.dtype)

    # scan iterates over the leading axis, so move time to the front.
    scan_outputs, _ = theano.scan(fn=step_lstm,
                                  sequences=[X.dimshuffle(1, 0, 2), mask],
                                  outputs_info=initial_states + [None])
    y_vals = scan_outputs[-1]

    self.output = y_vals.dimshuffle(1, 0, 2)

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y, is_train], outputs=cost, updates=_optimizer.getUpdates(), allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X, is_train], outputs=y_vals.dimshuffle(1, 0, 2), allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self,rng, params,cost_function='mse',optimizer = RMSprop):
    """Assemble and compile a three-layer stacked LSTM over 1024-wide inputs.

    ``params`` must provide ``batch_size``, ``seq_length``, ``lr``,
    ``n_hidden`` and ``n_output``.  The input width is fixed at 1024.
    A dropout layer (prob 0.5) sits between the first and second LSTM
    layers and receives one slice of a pre-sampled binomial mask per
    time step.

    Compiled callables:
      * ``self.train(X, Y, is_train)`` -> cost (applies optimizer updates)
      * ``self.predictions(X, is_train)`` -> outputs, batch-major

    :param rng: random stream used for weight init and the dropout mask.
    :param params: dict of hyper-parameters (see above).
    :param cost_function: name forwarded to ``get_err_fn``.
    :param optimizer: optimizer class called as ``optimizer(cost, params, lr=lr)``.
    """
    lr = params['lr']
    n_batch = params['batch_size']
    n_steps = params["seq_length"]

    self.n_in = 1024
    self.n_lstm = params['n_hidden']
    self.n_out = params['n_output']

    # Read-out projection first, then the recurrent stack.
    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng, name='W_hy', sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    bottom = LSTMLayer(rng, 0, self.n_in, self.n_lstm)
    middle = LSTMLayer(rng, 1, self.n_lstm, self.n_lstm)
    top = LSTMLayer(rng, 2, self.n_lstm, self.n_lstm)

    self.params = bottom.params + middle.params + top.params
    self.params.extend([self.W_hy, self.b_y])

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    is_train = T.iscalar('is_train')  # pseudo boolean: training vs prediction

    def step_lstm(x_t, mask_t, h1_prev, c1_prev, h2_prev, c2_prev, h3_prev, c3_prev):
        # One time step: bottom LSTM -> dropout -> middle -> top -> read-out.
        h1, c1, y1 = bottom.run(x_t, h1_prev, c1_prev)
        y1_dropped = DropoutLayer(rng, input=y1, prob=0.5, is_train=is_train, mask=mask_t).output
        h2, c2, y2 = middle.run(y1_dropped, h2_prev, c2_prev)
        h3, c3, y3 = top.run(y2, h3_prev, c3_prev)
        return [h1, c1, h2, c2, h3, c3, T.dot(y3, self.W_hy) + self.b_y]

    state_shape = (n_batch, self.n_lstm)
    # Zero hidden/cell state pairs for the three layers, in scan order.
    recurrent_init = []
    for _layer in range(3):
        recurrent_init.append(shared(np.zeros(shape=state_shape, dtype=dtype)))  # hidden state
        recurrent_init.append(shared(np.zeros(shape=state_shape, dtype=dtype)))  # cell state

    mask = rng.binomial(size=(n_steps,) + state_shape, p=0.5, dtype=X.dtype)

    # scan iterates over the leading axis, so move time to the front.
    scan_outputs, _ = theano.scan(fn=step_lstm,
                                  sequences=[X.dimshuffle(1, 0, 2), mask],
                                  outputs_info=recurrent_init + [None])
    y_vals = scan_outputs[6]

    self.output = y_vals.dimshuffle(1, 0, 2)

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y, is_train], outputs=cost, updates=_optimizer.getUpdates(), allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X, is_train], outputs=y_vals.dimshuffle(1, 0, 2), allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self, rng,input, n_in, n_out,W=None,b=None):
    """Linear output layer computing ``y_pred = dot(input, W) + b``.

    :param rng: random generator forwarded to the ``u`` initialisers.
    :param input: symbolic input of the layer.
    :param n_in: number of input units (only used when ``W`` is created).
    :param n_out: number of output units (only used when ``W``/``b`` are created).
    :param W: optional existing weight variable; created when ``None``.
    :param b: optional existing bias variable; created when ``None``.
    """
    # Identity checks: ``== None`` is not a reliable test on array-like
    # objects.  W and b are defaulted independently, so a caller-supplied
    # bias is no longer overwritten and a missing bias is no longer left
    # as None when only W is passed.
    if W is None:
        W = u.init_weight(shape=(n_in, n_out), rng=rng, name='W_xreg', sample='glorot')
    if b is None:
        b = u.init_bias(n_out, rng=rng)

    self.W = W
    self.b = b
    self.y_pred = T.dot(input, self.W) + self.b
    self.params = [self.W, self.b]
    self.input = input
def __init__(self,rng, params,cost_function='mse',optimizer = RMSprop):
    """Assemble and compile a single-layer LSTM with caller-supplied state.

    ``params`` must provide ``batch_size``, ``seq_length``, ``lr``,
    ``n_hidden`` and ``n_output``; the input width is fixed at 48.  The
    initial hidden and cell states are function inputs, and the final
    states are returned so that the caller can feed them back in.
    Small Gaussian noise (std 0.0002) is added to the input when
    ``is_train != 0``.

    Compiled callables:
      * ``self.train(X, Y, is_train, H, C)`` -> ``[cost, last_h, last_c]``
      * ``self.predictions(X, is_train, H, C)`` -> ``[output, last_h, last_c]``

    :param rng: random stream used for weight init and the input noise.
    :param params: dict of hyper-parameters (see above).
    :param cost_function: name forwarded to ``get_err_fn``.
    :param optimizer: optimizer class called as ``optimizer(cost, params, lr=lr)``.
    """
    lr = params['lr']
    n_batch = params['batch_size']
    n_steps = params["seq_length"]

    self.n_in = 48
    self.n_lstm = params['n_hidden']
    self.n_out = params['n_output']

    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng, name='W_hy', sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    layer1 = LSTMLayer(rng, 0, self.n_in, self.n_lstm)
    # self.params is the same list object as layer1.params, so the
    # read-out parameters are added to that list in place.
    self.params = layer1.params
    self.params.extend([self.W_hy, self.b_y])

    def step_lstm(x_t, h_prev, c_prev):
        # One LSTM step followed by the linear read-out.
        h_now, c_now, lstm_out = layer1.run(x_t, h_prev, c_prev)
        return [h_now, c_now, T.dot(lstm_out, self.W_hy) + self.b_y]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    is_train = T.iscalar('is_train')  # pseudo boolean: training vs prediction
    H = T.matrix(name="H", dtype=dtype)  # initial hidden state
    C = T.matrix(name="C", dtype=dtype)  # initial cell state

    noise = rng.normal(size=(n_batch, n_steps, self.n_in), std=0.0002, avg=0.0, dtype=theano.config.floatX)
    noisy_input = noise + X
    X_tilde = T.switch(T.neq(is_train, 0), noisy_input, X)

    # scan iterates over the leading axis, so move time to the front.
    (h_seq, c_seq, y_vals), _ = theano.scan(fn=step_lstm,
                                            sequences=[X_tilde.dimshuffle(1, 0, 2)],
                                            outputs_info=[H, C, None])

    self.output = y_vals.dimshuffle(1, 0, 2)
    last_h = h_seq[-1]
    last_c = c_seq[-1]

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y, is_train, H, C], outputs=[cost, last_h, last_c], updates=_optimizer.getUpdates(), allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X, is_train, H, C], outputs=[self.output, last_h, last_c], allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self,rng, n_in, n_lstm, n_out, lr=0.00001, batch_size=64, output_activation=theano.tensor.nnet.relu,cost_function='mse',optimizer = RMSprop):
    """Assemble and compile a single-layer GRU sequence regressor.

    Compiled callables:
      * ``self.train(X, Y)`` -> cost (applies optimizer updates)
      * ``self.predictions(X)`` -> outputs, batch-major

    :param rng: random generator forwarded to the initialisers.
    :param n_in: input width.
    :param n_lstm: number of recurrent units.
    :param n_out: output width.
    :param lr: learning rate handed to the optimizer.
    :param batch_size: fixed batch size (shapes the initial hidden state).
    :param output_activation: accepted but not used; the read-out is
        always ``tanh``.
    :param cost_function: name forwarded to ``get_err_fn``.
    :param optimizer: optimizer class called as ``optimizer(cost, params, lr=lr)``.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out

    # Reset gate.  NOTE(review): W_xr is registered under the name
    # 'W_xi'; kept unchanged because the name is a runtime string.
    self.W_xr = init_weight((self.n_in, self.n_lstm), rng=rng, name='W_xi', sample='glorot')
    self.W_hr = init_weight((self.n_lstm, self.n_lstm), rng=rng, name='W_hr', sample='glorot')
    self.b_r = init_bias(self.n_lstm, rng=rng, sample='zero')
    # Update gate.
    self.W_xz = init_weight((self.n_in, self.n_lstm), rng=rng, name='W_xz', sample='glorot')
    self.W_hz = init_weight((self.n_lstm, self.n_lstm), rng=rng, name='W_hz', sample='glorot')
    self.b_z = init_bias(self.n_lstm, rng=rng, sample='zero')
    # Candidate state.
    self.W_xh = init_weight((self.n_in, self.n_lstm), rng=rng, name='W_xh', sample='glorot')
    self.W_hh = init_weight((self.n_lstm, self.n_lstm), rng=rng, name='W_hh', sample='glorot')
    self.b_h = init_bias(self.n_lstm, rng=rng, sample='zero')
    # Read-out.
    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng, name='W_hy', sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    self.one_mat = T.ones((batch_size, n_lstm), dtype=dtype)

    self.params = [self.W_xr, self.W_hr, self.b_r,
                   self.W_xz, self.W_hz, self.b_z,
                   self.W_xh, self.W_hh, self.b_h,
                   self.W_hy, self.b_y]

    def step_lstm(x_t, h_tm1):
        # One GRU step followed by a tanh read-out.
        r_t = T.nnet.sigmoid(T.dot(x_t, self.W_xr) + T.dot(h_tm1, self.W_hr) + self.b_r)
        z_t = T.nnet.sigmoid(T.dot(x_t, self.W_xz) + T.dot(h_tm1, self.W_hz) + self.b_z)
        h_t = T.tanh(T.dot(x_t, self.W_xh) + T.dot((r_t * h_tm1), self.W_hh) + self.b_h)
        hh_t = z_t * h_t + (1 - z_t) * h_tm1
        y_t = T.tanh(T.dot(hh_t, self.W_hy) + self.b_y)
        return [hh_t, y_t]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # initial hidden state

    # scan iterates over the leading axis, so move time to the front.
    [h_vals, y_vals], _ = theano.scan(fn=step_lstm,
                                      sequences=X.dimshuffle(1, 0, 2),
                                      outputs_info=[h0, None])

    self.output = y_vals.dimshuffle(1, 0, 2)
    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y], outputs=cost, updates=_optimizer.getUpdates(), allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X], outputs=y_vals.dimshuffle(1, 0, 2), allow_input_downcast=True)

    # Three gates (input weights, recurrent weights, bias) plus the
    # read-out weights and bias.  The read-out bias (n_out) was
    # previously missing from the count.
    self.n_param = (n_lstm * n_lstm * 3 + n_in * n_lstm * 3 + n_lstm * 3
                    + n_lstm * n_out + n_out)
def __init__(self, rng, input, n_in, n_out,W=None,b=None,activation=T.tanh):
    """Fully connected layer computing ``activation(dot(input, W) + b)``.

    :param rng: random generator forwarded to the ``u`` initialisers.
    :param input: symbolic input of the layer.
    :param n_in: number of input units (only used when ``W`` is created).
    :param n_out: number of output units (only used when ``W``/``b`` are created).
    :param W: optional existing weight variable; created when ``None``.
    :param b: optional existing bias variable; created when ``None``.
    :param activation: callable applied to the affine output.
    """
    self.input = input

    # Identity checks: ``== None`` is not a reliable test on array-like
    # objects.  W and b are defaulted independently, so a caller-supplied
    # bias is no longer overwritten and a missing bias is no longer left
    # as None when only W is passed.
    if W is None:
        W = u.init_weight(shape=[n_in, n_out], rng=rng, name="w_hid", sample="glorot")
    if b is None:
        b = u.init_bias(n_out, rng)

    self.W = W
    self.b = b

    lin_output = T.dot(input, self.W) + self.b
    self.output = activation(lin_output)

    # parameters of the model
    self.params = [self.W, self.b]
def __init__(self, rng, input,filter_shape,input_shape,border_mode,subsample, activation=nn.relu,W=None,b=None,only_conv=0):
    """2-D convolution layer with optional bias and activation.

    :param rng: random generator forwarded to the ``u`` initialisers.
    :param input: symbolic 4-D input.
    :param filter_shape: filter shape; index 0 is the number of filters,
        indices 2 and 3 are the filter rows and columns.
    :param input_shape: ``(samples, channels, rows, cols)``.
    :param border_mode: border mode; ``'same'`` is translated to
        Theano's ``'half'``, any other value is passed through.
    :param subsample: ``(row_stride, col_stride)``.
    :param activation: called as ``activation(x, 0)`` on the biased output.
    :param W: optional existing filter variable; created when ``None``.
    :param b: optional existing bias variable; created when ``None``.
    :param only_conv: when non-zero, ``self.output`` is the raw
        convolution with no bias and no activation.
    """
    self.input = input
    nb_filter = filter_shape[0]

    # Identity checks: ``== None`` is not a reliable test on array-like
    # objects.  W and b are defaulted independently, so a caller-supplied
    # bias is no longer overwritten and a missing bias is no longer left
    # as None when only W is passed.
    if W is None:
        W = u.init_weight(filter_shape, rng=rng, name="w_conv", sample='glorot')
    if b is None:
        b = u.init_bias(nb_filter, rng=rng)
    self.W = W
    self.b = b

    b_mode = 'half' if border_mode == 'same' else border_mode

    conv_out = conv2d(
        input=input,
        filters=self.W,
        filter_shape=filter_shape,
        input_shape=input_shape,
        border_mode=b_mode,
        subsample=subsample,
    )

    if border_mode == 'same':
        # For even filter sizes, crop the result to
        # ceil(input_size / stride) rows / columns.
        if filter_shape[2] % 2 == 0:
            conv_out = conv_out[:, :, :(input.shape[2] + subsample[0] - 1) // subsample[0], :]
        if filter_shape[3] % 2 == 0:
            conv_out = conv_out[:, :, :, :(input.shape[3] + subsample[1] - 1) // subsample[1]]

    if only_conv == 0:
        # One bias value per filter, broadcast over batch, rows and columns.
        output = conv_out + b.dimshuffle('x', 0, 'x', 'x')
        self.output = activation(output, 0)
    else:
        self.output = conv_out

    # parameters of the model
    self.params = [self.W, self.b]

    # Static output shape, using the caller's border_mode (not 'half').
    rows = u.conv_output_length(input_shape[2], filter_shape[2], border_mode, subsample[0])
    cols = u.conv_output_length(input_shape[3], filter_shape[3], border_mode, subsample[1])
    self.output_shape = (input_shape[0], nb_filter, rows, cols)
def __init__(self, n_in, n_lstm, n_out, lr=0.05, batch_size=64, output_activation=theano.tensor.nnet.relu,cost_function='mse',optimizer = RMSprop):
    """Assemble and compile a single-layer LSTM sequence model.

    Compiled callables:
      * ``self.train(X, Y)`` -> cost (applies optimizer updates)
      * ``self.predictions(X)`` -> outputs, batch-major

    :param n_in: input width.
    :param n_lstm: number of recurrent units.
    :param n_out: output width.
    :param lr: learning rate handed to the optimizer.
    :param batch_size: fixed batch size (shapes the initial states).
    :param output_activation: accepted but not used; the read-out is
        always ``tanh``.
    :param cost_function: ``'mse'``, ``'cxe'``; any other value selects
        the negative log-likelihood.
    :param optimizer: optimizer class called as ``optimizer(cost, params, lr=lr)``.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out

    # Input gate.
    self.W_xi = init_weight((self.n_in, self.n_lstm), 'W_xi', 'glorot')
    self.W_hi = init_weight((self.n_lstm, self.n_lstm), 'W_hi', 'ortho')
    self.b_i = init_bias(self.n_lstm, sample='zero')
    # Forget gate (its bias is initialised with sample='one').
    self.W_xf = init_weight((self.n_in, self.n_lstm), 'W_xf', 'glorot')
    self.W_hf = init_weight((self.n_lstm, self.n_lstm), 'W_hf', 'ortho')
    self.b_f = init_bias(self.n_lstm, sample='one')
    # Cell candidate.
    self.W_xc = init_weight((self.n_in, self.n_lstm), 'W_xc', 'glorot')
    self.W_hc = init_weight((self.n_lstm, self.n_lstm), 'W_hc', 'ortho')
    self.b_c = init_bias(self.n_lstm, sample='zero')
    # Output gate.
    self.W_xo = init_weight((self.n_in, self.n_lstm), 'W_xo', 'glorot')
    self.W_ho = init_weight((self.n_lstm, self.n_lstm), 'W_ho', 'ortho')
    self.b_o = init_bias(self.n_lstm, sample='zero')
    # Read-out.
    self.W_hy = init_weight((self.n_lstm, self.n_out), 'W_hy', 'glorot')
    self.b_y = init_bias(self.n_out, sample='zero')

    self.params = [self.W_xi, self.W_hi, self.b_i,
                   self.W_xf, self.W_hf, self.b_f,
                   self.W_xc, self.W_hc, self.b_c,
                   self.W_xo, self.W_ho, self.b_o,
                   self.W_hy, self.b_y]

    def step_lstm(x_t, h_tm1, c_tm1):
        # One LSTM step followed by a tanh read-out.
        i_t = T.nnet.sigmoid(T.dot(x_t, self.W_xi) + T.dot(h_tm1, self.W_hi) + self.b_i)
        f_t = T.nnet.sigmoid(T.dot(x_t, self.W_xf) + T.dot(h_tm1, self.W_hf) + self.b_f)
        c_t = f_t * c_tm1 + i_t * T.tanh(T.dot(x_t, self.W_xc) + T.dot(h_tm1, self.W_hc) + self.b_c)
        o_t = T.nnet.sigmoid(T.dot(x_t, self.W_xo) + T.dot(h_tm1, self.W_ho) + self.b_o)
        h_t = o_t * T.tanh(c_t)
        y_t = T.tanh(T.dot(h_t, self.W_hy) + self.b_y)
        return [h_t, c_t, y_t]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector (should be 0 when X is not null)
    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # initial hidden state
    c0 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # initial cell state

    # scan iterates over the leading axis, so move time to the front.
    [h_vals, c_vals, y_vals], _ = theano.scan(fn=step_lstm,
                                              sequences=X.dimshuffle(1, 0, 2),
                                              outputs_info=[h0, c0, None])

    self.output = y_vals.dimshuffle(1, 0, 2)

    if cost_function == 'mse':
        cost = T.mean((self.output - Y) ** 2)
    elif cost_function == 'cxe':
        cost = T.mean(T.nnet.binary_crossentropy(self.output, Y))
    else:
        cost = -T.mean(Y * T.log(self.output) + (1. - Y) * T.log(1. - self.output))

    _optimizer = optimizer(cost, self.params, lr=lr)

    # The former nested reset() helper was removed: it was never called
    # and only rebound its own local h0/c0 names.

    self.train = theano.function(inputs=[X, Y], outputs=cost, updates=_optimizer.getUpdates(), allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X], outputs=y_vals.dimshuffle(1, 0, 2), allow_input_downcast=True)

    # Four gates (input weights, recurrent weights, bias) plus the
    # read-out weights and bias.  The previous formula counted only
    # three gate biases and omitted the read-out bias.
    self.n_param = (4 * (n_in * n_lstm + n_lstm * n_lstm + n_lstm)
                    + n_lstm * n_out + n_out)
def __init__(self,rng,params,cost_function='mse',optimizer = RMSprop):
    """Assemble and compile a CNN feature extractor feeding a GRU.

    ``params`` must provide ``lr``, ``n_hidden``, ``n_output``,
    ``batch_size`` and ``seq_length``.  Every frame of every sequence is
    reshaped to a single-channel 120x60 image, passed through three
    conv+pool stages and a 1024-unit dense layer, then the per-frame
    features are regrouped into sequences and run through a GRU with a
    linear read-out.

    Compiled callables:
      * ``self.train(X, Y, is_train)`` -> cost (applies optimizer updates)
      * ``self.predictions(X, is_train)`` -> outputs, batch-major

    :param rng: random generator used by all layers.
    :param params: dict of hyper-parameters (see above).
    :param cost_function: name forwarded to ``get_err_fn``.
    :param optimizer: optimizer class called as ``optimizer(cost, params, lr=lr)``.
    """
    lr = params["lr"]
    n_lstm = params['n_hidden']
    n_out = params['n_output']
    batch_size = params["batch_size"]
    sequence_length = params["seq_length"]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    is_train = T.iscalar('is_train')  # pseudo boolean: training vs prediction

    # Settings shared by all convolution stages.
    stride = (1, 1)
    drop_prob = 0.5
    conv_border = "valid"
    pooling = (2, 2)
    n_frames = batch_size * sequence_length

    # Stage 1: conv + pool + dropout.
    # Image layout is (samples, channels, rows, cols).
    frame_shape = (n_frames, 1, 120, 60)
    frames = X.reshape(frame_shape)
    c1 = ConvLayer(rng, frames, (64, 1, 9, 9), frame_shape, conv_border, stride, activation=nn.relu)
    p1 = PoolLayer(c1.output, pool_size=pooling, input_shape=c1.output_shape)
    dl1 = DropoutLayer(rng, input=p1.output, prob=drop_prob)
    # At prediction time the activations are scaled by the retain
    # probability instead of being dropped.
    scaled_p1 = p1.output * (1. - drop_prob)
    d1_output = T.switch(T.neq(is_train, 0), dl1.output, scaled_p1)

    # Stage 2: conv + pool.
    c2 = ConvLayer(rng, d1_output, (128, p1.output_shape[1], 3, 3), p1.output_shape, conv_border, stride, activation=nn.relu)
    p2 = PoolLayer(c2.output, pool_size=pooling, input_shape=c2.output_shape)

    # Stage 3: conv + pool.
    c3 = ConvLayer(rng, p2.output, (128, p2.output_shape[1], 3, 3), p2.output_shape, conv_border, stride, activation=nn.relu)
    p3 = PoolLayer(c3.output, pool_size=pooling, input_shape=c3.output_shape)

    # Stage 4: dense layer over the flattened feature maps.
    feature_dims = p3.output_shape[1:]
    flat_width = feature_dims[0]
    for dim in feature_dims[1:]:
        flat_width = flat_width * dim
    h1 = HiddenLayer(rng, p3.output.flatten(2), flat_width, 1024, activation=nn.relu)

    n_in = 1024
    rnn_input = h1.output.reshape((batch_size, sequence_length, n_in))

    # Stage 5: GRU.
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out

    in_to_hidden = (self.n_in, self.n_lstm)
    hidden_to_hidden = (self.n_lstm, self.n_lstm)

    self.W_xr = init_weight(in_to_hidden, rng=rng, name='W_xi', sample='glorot')
    self.W_hr = init_weight(hidden_to_hidden, rng=rng, name='W_hr', sample='glorot')
    self.b_r = init_bias(self.n_lstm, rng=rng, sample='zero')

    self.W_xz = init_weight(in_to_hidden, rng=rng, name='W_xz', sample='glorot')
    self.W_hz = init_weight(hidden_to_hidden, rng=rng, name='W_hz', sample='glorot')
    self.b_z = init_bias(self.n_lstm, rng=rng, sample='zero')

    self.W_xh = init_weight(in_to_hidden, rng=rng, name='W_xh', sample='glorot')
    self.W_hh = init_weight(hidden_to_hidden, rng=rng, name='W_hh', sample='glorot')
    self.b_h = init_bias(self.n_lstm, rng=rng, sample='zero')

    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng, name='W_hy', sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    gru_params = [self.W_xr, self.W_hr, self.b_r,
                  self.W_xz, self.W_hz, self.b_z,
                  self.W_xh, self.W_hh, self.b_h,
                  self.W_hy, self.b_y]

    def step_lstm(x_t, h_prev):
        # One GRU step followed by a linear read-out.
        reset_gate = T.nnet.sigmoid(T.dot(x_t, self.W_xr) + T.dot(h_prev, self.W_hr) + self.b_r)
        update_gate = T.nnet.sigmoid(T.dot(x_t, self.W_xz) + T.dot(h_prev, self.W_hz) + self.b_z)
        candidate = T.tanh(T.dot(x_t, self.W_xh) + T.dot((reset_gate * h_prev), self.W_hh) + self.b_h)
        h_new = update_gate * candidate + (1 - update_gate) * h_prev
        y_t = T.dot(h_new, self.W_hy) + self.b_y
        return [h_new, y_t]

    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # initial hidden state

    # (batch_size, sequence_length, n_in) -> (sequence_length, batch_size, n_in)
    # because scan iterates over the leading axis.
    [h_vals, y_vals], _ = theano.scan(fn=step_lstm,
                                      sequences=rnn_input.dimshuffle(1, 0, 2),
                                      outputs_info=[h0, None])

    self.output = y_vals.dimshuffle(1, 0, 2)

    # CNN and dense parameters first, then the GRU parameters.
    self.params = c1.params + c2.params + c3.params + h1.params + gru_params

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y, is_train], outputs=cost, updates=_optimizer.getUpdates(), allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X, is_train], outputs=self.output, allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self, n_in, n_lstm, n_out, lr=0.05, batch_size=64, single_output=True, output_activation=theano.tensor.tanh,cost_function='nll',optimizer = RMSprop):
    """Assemble and compile dense -> peephole LSTM -> dense sequence model.

    The input goes through two tanh dense layers (128 units each), an
    LSTM whose gates also read the cell state through full weight
    matrices, a tanh read-out of width 512, and two further tanh dense
    layers (256 units, then ``n_out``).

    Compiled callables:
      * ``self.train(X, Y)`` -> cost (applies optimizer updates)
      * ``self.predictions(X)`` -> outputs, batch-major

    :param n_in: input width.
    :param n_lstm: number of recurrent units.
    :param n_out: output width.
    :param lr: learning rate handed to the optimizer.
    :param batch_size: fixed batch size (shapes the initial states).
    :param single_output: accepted but not used.
    :param output_activation: accepted but not used; ``tanh`` is hard-coded.
    :param cost_function: name forwarded to ``get_err_fn``.
    :param optimizer: optimizer class called as ``optimizer(cost, params, lr=lr)``.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out
    self.n_fc1 = 512
    self.n_fc2 = 256
    self.n_prefc1 = 128
    self.n_prefc2 = 128

    # Dense layers in front of the LSTM.
    self.W_prefc1 = init_weight((self.n_in, self.n_prefc1), 'W_prefc1', 'glorot')
    self.b_prefc1 = init_bias(self.n_prefc1, sample='zero')
    self.W_prefc2 = init_weight((self.n_prefc1, self.n_prefc2), 'W_prefc2', 'glorot')
    self.b_prefc2 = init_bias(self.n_prefc2, sample='zero')

    # Dense layers behind the LSTM read-out.
    self.W_fc1 = init_weight((self.n_fc1, self.n_fc2), 'W_fc1', 'glorot')
    self.b_fc1 = init_bias(self.n_fc2, sample='zero')
    self.W_fc2 = init_weight((self.n_fc2, self.n_out), 'W_fc2', 'glorot')
    self.b_fc2 = init_bias(self.n_out, sample='zero')

    # Input gate.
    self.W_xi = init_weight((self.n_prefc2, self.n_lstm), 'W_xi', 'glorot')
    self.W_hi = init_weight((self.n_lstm, self.n_lstm), 'W_hi', 'glorot')
    self.W_ci = init_weight((self.n_lstm, self.n_lstm), 'W_ci', 'glorot')
    self.b_i = init_bias(self.n_lstm, sample='zero')
    # Forget gate.
    self.W_xf = init_weight((self.n_prefc2, self.n_lstm), 'W_xf', 'glorot')
    self.W_hf = init_weight((self.n_lstm, self.n_lstm), 'W_hf', 'glorot')
    self.W_cf = init_weight((self.n_lstm, self.n_lstm), 'W_cf', 'glorot')
    self.b_f = init_bias(self.n_lstm, sample='zero')
    # Cell candidate.
    self.W_xc = init_weight((self.n_prefc2, self.n_lstm), 'W_xc', 'glorot')
    self.W_hc = init_weight((self.n_lstm, self.n_lstm), 'W_hc', 'glorot')
    self.b_c = shared(np.zeros(n_lstm, dtype=dtype))
    # Output gate.
    self.W_xo = init_weight((self.n_prefc2, self.n_lstm), 'W_xo', 'glorot')
    self.W_ho = init_weight((self.n_lstm, self.n_lstm), 'W_ho', 'glorot')
    self.W_co = init_weight((self.n_lstm, self.n_lstm), 'W_co', 'glorot')
    self.b_o = init_bias(self.n_lstm, sample='zero')
    # LSTM read-out.
    self.W_hy = init_weight((self.n_lstm, self.n_fc1), 'W_hy', 'glorot')
    self.b_y = init_bias(self.n_fc1, sample='zero')

    self.params = [self.W_xi, self.W_hi, self.W_ci, self.b_i,
                   self.W_xf, self.W_hf, self.W_cf, self.b_f,
                   self.W_xc, self.W_hc, self.b_c,
                   self.W_xo, self.W_ho, self.W_co, self.b_o,
                   self.W_hy, self.b_y,
                   self.W_fc1, self.b_fc1, self.W_fc2, self.b_fc2,
                   self.W_prefc1, self.b_prefc1, self.W_prefc2, self.b_prefc2]

    def step_lstm(x_t, h_tm1, c_tm1):
        # One LSTM step; i/f gates read the previous cell state, the
        # output gate reads the new one.
        i_t = T.nnet.sigmoid(T.dot(x_t, self.W_xi) + T.dot(h_tm1, self.W_hi) + T.dot(c_tm1, self.W_ci) + self.b_i)
        f_t = T.nnet.sigmoid(T.dot(x_t, self.W_xf) + T.dot(h_tm1, self.W_hf) + T.dot(c_tm1, self.W_cf) + self.b_f)
        c_t = f_t * c_tm1 + i_t * T.tanh(T.dot(x_t, self.W_xc) + T.dot(h_tm1, self.W_hc) + self.b_c)
        o_t = T.nnet.sigmoid(T.dot(x_t, self.W_xo) + T.dot(h_tm1, self.W_ho) + T.dot(c_t, self.W_co) + self.b_o)
        h_t = o_t * T.tanh(c_t)
        y_t = T.tanh(T.dot(h_t, self.W_hy) + self.b_y)
        return [h_t, c_t, y_t]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector (should be 0 when X is not null)
    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # initial hidden state
    c0 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # initial cell state

    # Dense layers applied to the time-major input.
    prefc1_out = T.tanh(T.dot(X.dimshuffle(1, 0, 2), self.W_prefc1) + self.b_prefc1)
    prefc2_out = T.tanh(T.dot(prefc1_out, self.W_prefc2) + self.b_prefc2)

    [h_vals, c_vals, y_vals], _ = theano.scan(fn=step_lstm,
                                              sequences=prefc2_out,
                                              outputs_info=[h0, c0, None])

    # Dense layers applied to the LSTM read-out.
    fc1_out = T.tanh(T.dot(y_vals, self.W_fc1) + self.b_fc1)
    fc2_out = T.tanh(T.dot(fc1_out, self.W_fc2) + self.b_fc2)

    self.output = fc2_out.dimshuffle(1, 0, 2)

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y], outputs=cost, updates=_optimizer.getUpdates(), allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X], outputs=self.output, allow_input_downcast=True)

    # Parameter count derived from the shapes declared above.  The
    # previous formula described a plain LSTM on the raw input and
    # ignored the dense layers and the peephole matrices.
    n_prefc = (n_in * self.n_prefc1 + self.n_prefc1
               + self.n_prefc1 * self.n_prefc2 + self.n_prefc2)
    n_gates = 4 * (self.n_prefc2 * n_lstm + n_lstm * n_lstm + n_lstm)
    n_peephole = 3 * n_lstm * n_lstm  # W_ci, W_cf, W_co
    n_readout = n_lstm * self.n_fc1 + self.n_fc1
    n_fc = (self.n_fc1 * self.n_fc2 + self.n_fc2
            + self.n_fc2 * n_out + n_out)
    self.n_param = n_prefc + n_gates + n_peephole + n_readout + n_fc
def __init__(self, n_in, n_lstm, n_out, lr=0.05, batch_size=64, single_output=True, output_activation=theano.tensor.nnet.relu,cost_function='mse',optimizer = RMSprop):
    """Assemble and compile a GRU followed by three tanh dense layers.

    The GRU has a tanh read-out of width 256, followed by dense layers
    of 256, 256 and ``n_out`` units.

    Compiled callables:
      * ``self.train(X, Y)`` -> cost (applies optimizer updates)
      * ``self.predictions(X)`` -> outputs, batch-major

    :param n_in: input width.
    :param n_lstm: number of recurrent units.
    :param n_out: output width.
    :param lr: learning rate handed to the optimizer.
    :param batch_size: fixed batch size (shapes the initial hidden state).
    :param single_output: accepted but not used.
    :param output_activation: accepted but not used; ``tanh`` is hard-coded.
    :param cost_function: name forwarded to ``get_err_fn``.
    :param optimizer: optimizer class called as ``optimizer(cost, params, lr=lr)``.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out
    self.n_fc1 = 256
    self.n_fc2 = 256
    self.n_fc3 = 256

    # Dense layers behind the GRU read-out.
    self.W_fc1 = init_weight((self.n_fc1, self.n_fc2), 'W_fc1', 'glorot')
    self.b_fc1 = init_bias(self.n_fc2, sample='zero')
    self.W_fc2 = init_weight((self.n_fc2, self.n_fc3), 'W_fc2', 'glorot')
    self.b_fc2 = init_bias(self.n_fc3, sample='zero')
    self.W_fc3 = init_weight((self.n_fc3, self.n_out), 'w_fc3', 'glorot')
    self.b_fc3 = init_bias(self.n_out, sample='zero')

    # Reset gate.
    self.W_xr = init_weight((self.n_in, self.n_lstm), 'W_xr', 'glorot')
    self.W_hr = init_weight((self.n_lstm, self.n_lstm), 'W_hr', 'ortho')
    self.b_r = init_bias(self.n_lstm, sample='zero')
    # Update gate.
    self.W_xz = init_weight((self.n_in, self.n_lstm), 'W_xz', 'glorot')
    self.W_hz = init_weight((self.n_lstm, self.n_lstm), 'W_hz', 'ortho')
    self.b_z = init_bias(self.n_lstm, sample='zero')
    # Candidate state.
    self.W_xh = init_weight((self.n_in, self.n_lstm), 'W_xh', 'glorot')
    self.W_hh = init_weight((self.n_lstm, self.n_lstm), 'W_hh', 'ortho')
    self.b_h = init_bias(self.n_lstm, sample='zero')
    # GRU read-out.
    self.W_hy = init_weight((self.n_lstm, self.n_fc1), 'W_hy', 'glorot')
    self.b_y = init_bias(self.n_fc1, sample='zero')

    self.params = [self.W_xr, self.W_hr, self.b_r,
                   self.W_xz, self.W_hz, self.b_z,
                   self.W_xh, self.W_hh, self.b_h,
                   self.W_hy, self.b_y,
                   self.W_fc1, self.b_fc1,
                   self.W_fc2, self.b_fc2,
                   self.W_fc3, self.b_fc3]

    def step_lstm(x_t, h_tm1):
        # One GRU step followed by a tanh read-out.
        r_t = T.nnet.sigmoid(T.dot(x_t, self.W_xr) + T.dot(h_tm1, self.W_hr) + self.b_r)
        z_t = T.nnet.sigmoid(T.dot(x_t, self.W_xz) + T.dot(h_tm1, self.W_hz) + self.b_z)
        h_t = T.tanh(T.dot(x_t, self.W_xh) + T.dot((r_t * h_tm1), self.W_hh) + self.b_h)
        hh_t = z_t * h_t + (1 - z_t) * h_tm1
        y_t = T.tanh(T.dot(hh_t, self.W_hy) + self.b_y)
        return [hh_t, y_t]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # initial hidden state

    # scan iterates over the leading axis, so move time to the front.
    [h_vals, y_vals], _ = theano.scan(fn=step_lstm,
                                      sequences=X.dimshuffle(1, 0, 2),
                                      outputs_info=[h0, None])

    # Dense layers applied to the GRU read-out.
    fc1_out = T.tanh(T.dot(y_vals, self.W_fc1) + self.b_fc1)
    fc2_out = T.tanh(T.dot(fc1_out, self.W_fc2) + self.b_fc2)
    fc3_out = T.tanh(T.dot(fc2_out, self.W_fc3) + self.b_fc3)

    self.output = fc3_out.dimshuffle(1, 0, 2)

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y], outputs=cost, updates=_optimizer.getUpdates(), allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X], outputs=self.output, allow_input_downcast=True)

    # Parameter count derived from the shapes declared above.  The
    # previous formula used the LSTM factor 4 and ignored the dense
    # layers entirely.
    n_gru = 3 * (n_in * n_lstm + n_lstm * n_lstm + n_lstm)
    n_readout = n_lstm * self.n_fc1 + self.n_fc1
    n_fc = (self.n_fc1 * self.n_fc2 + self.n_fc2
            + self.n_fc2 * self.n_fc3 + self.n_fc3
            + self.n_fc3 * n_out + n_out)
    self.n_param = n_gru + n_readout + n_fc
def __init__(self, n_in, n_lstm, n_out, lr=0.05, batch_size=64, single_output=True, output_activation=theano.tensor.nnet.relu, cost_function='nll'):
    """Assemble and compile an LSTM whose gates also read the cell state.

    Compiled callables:
      * ``self.train(X, Y)`` -> cost (applies RMSprop updates)
      * ``self.predictions(X)`` -> the full output sequence, batch-major
        (regardless of ``single_output``)
      * ``self.debug(X, Y)`` -> shapes of X, Y, the scan output and the
        cross-entropy

    :param n_in: input width.
    :param n_lstm: number of recurrent units.
    :param n_out: output width.
    :param lr: learning rate handed to RMSprop.
    :param batch_size: fixed batch size (shapes the initial states).
    :param single_output: when true, ``self.output`` (and so the cost)
        uses only the last time step; otherwise the whole sequence.
    :param output_activation: callable applied to the read-out.
    :param cost_function: ``'mse'``, ``'cxe'``; any other value selects
        the negative log-likelihood.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out

    # Input gate.
    self.W_xi = init_weight((self.n_in, self.n_lstm), 'W_xi')
    self.W_hi = init_weight((self.n_lstm, self.n_lstm), 'W_hi', 'svd')
    self.W_ci = init_weight((self.n_lstm, self.n_lstm), 'W_ci', 'svd')
    self.b_i = shared(np.cast[dtype](np.random.uniform(-0.5, .5, size=n_lstm)))
    # Forget gate.
    self.W_xf = init_weight((self.n_in, self.n_lstm), 'W_xf')
    self.W_hf = init_weight((self.n_lstm, self.n_lstm), 'W_hf', 'svd')
    self.W_cf = init_weight((self.n_lstm, self.n_lstm), 'W_cf', 'svd')
    self.b_f = shared(np.cast[dtype](np.random.uniform(0, 1., size=n_lstm)))
    # Cell candidate.
    self.W_xc = init_weight((self.n_in, self.n_lstm), 'W_xc')
    self.W_hc = init_weight((self.n_lstm, self.n_lstm), 'W_hc', 'svd')
    self.b_c = shared(np.zeros(n_lstm, dtype=dtype))
    # Output gate.
    self.W_xo = init_weight((self.n_in, self.n_lstm), 'W_xo')
    self.W_ho = init_weight((self.n_lstm, self.n_lstm), 'W_ho', 'svd')
    self.W_co = init_weight((self.n_lstm, self.n_lstm), 'W_co', 'svd')
    self.b_o = shared(np.cast[dtype](np.random.uniform(-0.5, .5, size=n_lstm)))
    # Read-out.
    self.W_hy = init_weight((self.n_lstm, self.n_out), 'W_hy')
    self.b_y = shared(np.zeros(n_out, dtype=dtype))

    # W_xo was previously missing from this list, so the optimizer
    # never updated the input-to-output-gate weights.
    self.params = [
        self.W_xi, self.W_hi, self.W_ci, self.b_i,
        self.W_xf, self.W_hf, self.W_cf, self.b_f,
        self.W_xc, self.W_hc, self.b_c,
        self.W_xo, self.W_ho, self.W_co, self.b_o,
        self.W_hy, self.b_y,
    ]

    def step_lstm(x_t, h_tm1, c_tm1):
        # One LSTM step; i/f gates read the previous cell state, the
        # output gate reads the new one.
        i_t = T.nnet.sigmoid(T.dot(x_t, self.W_xi) + T.dot(h_tm1, self.W_hi) + T.dot(c_tm1, self.W_ci) + self.b_i)
        f_t = T.nnet.sigmoid(T.dot(x_t, self.W_xf) + T.dot(h_tm1, self.W_hf) + T.dot(c_tm1, self.W_cf) + self.b_f)
        c_t = f_t * c_tm1 + i_t * T.tanh(T.dot(x_t, self.W_xc) + T.dot(h_tm1, self.W_hc) + self.b_c)
        o_t = T.nnet.sigmoid(T.dot(x_t, self.W_xo) + T.dot(h_tm1, self.W_ho) + T.dot(c_t, self.W_co) + self.b_o)
        h_t = o_t * T.tanh(c_t)
        y_t = output_activation(T.dot(h_t, self.W_hy) + self.b_y)
        return [h_t, c_t, y_t]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector (should be 0 when X is not null)
    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # initial hidden state
    c0 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # initial cell state

    # scan iterates over the leading axis, so move time to the front.
    [h_vals, c_vals, y_vals], _ = theano.scan(fn=step_lstm,
                                              sequences=X.dimshuffle(1, 0, 2),
                                              outputs_info=[h0, c0, None])

    if single_output:
        self.output = y_vals[-1]
    else:
        self.output = y_vals.dimshuffle(1, 0, 2)

    cxe = T.mean(T.nnet.binary_crossentropy(self.output, Y))
    nll = -T.mean(Y * T.log(self.output) + (1. - Y) * T.log(1. - self.output))
    mse = T.mean((self.output - Y) ** 2)

    if cost_function == 'mse':
        cost = mse
    elif cost_function == 'cxe':
        cost = cxe
    else:
        cost = nll

    optimizer = RMSprop(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y], outputs=cost, updates=optimizer.getUpdates(), allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X], outputs=y_vals.dimshuffle(1, 0, 2), allow_input_downcast=True)
    self.debug = theano.function(inputs=[X, Y], outputs=[X.shape, Y.shape, y_vals.shape, cxe.shape])
def __init__(self, n_in, n_lstm, n_out, lr=0.00001, batch_size=64,
             output_activation=theano.tensor.nnet.relu, cost_function='mse',
             optimizer=RMSprop):
    """Build a bidirectional GRU and compile its Theano functions.

    Two independent GRUs are run, one over ``X_f`` and one over ``X_b``;
    their hidden states are projected and summed into a tanh output.

    Args:
        n_in: Size of each input vector.
        n_lstm: Number of hidden units per direction.
        n_out: Size of each output vector.
        lr: Learning rate handed to the optimizer.
        batch_size: Number of rows of the zero initial hidden state.
        output_activation: Accepted for interface compatibility; the output
            non-linearity is hard-wired to tanh.
        cost_function: Name forwarded to ``get_err_fn``.
        optimizer: Optimizer class; called as ``optimizer(cost, params,
            lr=lr)`` and must expose ``getUpdates()``.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out

    # Forward-direction GRU.
    self.W_xr_1 = init_weight((self.n_in, self.n_lstm), 'W_xr_1', 'glorot')
    self.W_hr_1 = init_weight((self.n_lstm, self.n_lstm), 'W_hr_1', 'ortho')
    self.b_r_1 = init_bias(self.n_lstm, sample='zero')
    self.W_xz_1 = init_weight((self.n_in, self.n_lstm), 'W_xz_1', 'glorot')
    self.W_hz_1 = init_weight((self.n_lstm, self.n_lstm), 'W_hz_1', 'ortho')
    self.b_z_1 = init_bias(self.n_lstm, sample='zero')
    self.W_xh_1 = init_weight((self.n_in, self.n_lstm), 'W_xh_1', 'glorot')
    self.W_hh_1 = init_weight((self.n_lstm, self.n_lstm), 'W_hh_1', 'ortho')
    self.b_h_1 = init_bias(self.n_lstm, sample='zero')

    # Backward-direction GRU.
    self.W_xr_2 = init_weight((self.n_in, self.n_lstm), 'W_xr', 'glorot')
    self.W_hr_2 = init_weight((self.n_lstm, self.n_lstm), 'W_hr', 'ortho')
    self.b_r_2 = init_bias(self.n_lstm, sample='zero')
    self.W_xz_2 = init_weight((self.n_in, self.n_lstm), 'W_xz', 'glorot')
    self.W_hz_2 = init_weight((self.n_lstm, self.n_lstm), 'W_hz', 'ortho')
    self.b_z_2 = init_bias(self.n_lstm, sample='zero')
    self.W_xh_2 = init_weight((self.n_in, self.n_lstm), 'W_xh', 'glorot')
    self.W_hh_2 = init_weight((self.n_lstm, self.n_lstm), 'W_hh', 'ortho')
    self.b_h_2 = init_bias(self.n_lstm, sample='zero')

    # Output projection. The parameter list and the output expression use
    # W_hy_f / W_hy_b / b_y, which were never created (AttributeError on
    # construction). The pre-existing W_hy_2 / b_y_2 are kept and reused as
    # the backward projection and the shared output bias.
    self.W_hy_2 = init_weight((self.n_lstm, self.n_out), 'W_hy', 'glorot')
    self.b_y_2 = init_bias(self.n_out, sample='zero')
    self.W_hy_f = init_weight((self.n_lstm, self.n_out), 'W_hy_f', 'glorot')
    self.W_hy_b = self.W_hy_2
    self.b_y = self.b_y_2

    self.params = [
        self.W_xr_1, self.W_hr_1, self.b_r_1,
        self.W_xz_1, self.W_hz_1, self.b_z_1,
        self.W_xh_1, self.W_hh_1, self.b_h_1,
        self.W_xr_2, self.W_hr_2, self.b_r_2,
        self.W_xz_2, self.W_hz_2, self.b_z_2,
        self.W_xh_2, self.W_hh_2, self.b_h_2,
        self.W_hy_f, self.W_hy_b, self.b_y,
    ]

    def f_step_lstm(x_t, h_tm1_1):
        """One forward-direction GRU step; returns the new hidden state."""
        r_t_1 = T.nnet.sigmoid(
            T.dot(x_t, self.W_xr_1) + T.dot(h_tm1_1, self.W_hr_1) +
            self.b_r_1)
        z_t_1 = T.nnet.sigmoid(
            T.dot(x_t, self.W_xz_1) + T.dot(h_tm1_1, self.W_hz_1) +
            self.b_z_1)
        h_t_1 = T.tanh(
            T.dot(x_t, self.W_xh_1) +
            T.dot((r_t_1 * h_tm1_1), self.W_hh_1) + self.b_h_1)
        return z_t_1 * h_t_1 + (1 - z_t_1) * h_tm1_1

    def b_step_lstm(x_t, h_tm1_2):
        """One backward-direction GRU step; returns the new hidden state."""
        r_t_2 = T.nnet.sigmoid(
            T.dot(x_t, self.W_xr_2) + T.dot(h_tm1_2, self.W_hr_2) +
            self.b_r_2)
        z_t_2 = T.nnet.sigmoid(
            T.dot(x_t, self.W_xz_2) + T.dot(h_tm1_2, self.W_hz_2) +
            self.b_z_2)
        h_t_2 = T.tanh(
            T.dot(x_t, self.W_xh_2) +
            T.dot((r_t_2 * h_tm1_2), self.W_hh_2) + self.b_h_2)
        return z_t_2 * h_t_2 + (1 - z_t_2) * h_tm1_2

    X_f = T.tensor3()  # batch of sequence of vector (forward input)
    X_b = T.tensor3()  # batch of sequence of vector (backward input)
    Y = T.tensor3()  # batch of sequence of vector (should be 0 when X is not null)
    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm),
                         dtype=dtype))  # initial hidden state

    # A GRU carries a single recurrent state, so each scan gets exactly one
    # outputs_info entry. Passing [h0, c0] made scan call the step function
    # with an extra argument it does not accept.
    h_f, _ = theano.scan(fn=f_step_lstm,
                         sequences=X_f.dimshuffle(1, 0, 2),
                         outputs_info=[h0])
    h_b, _ = theano.scan(fn=b_step_lstm,
                         sequences=X_b.dimshuffle(1, 0, 2),
                         outputs_info=[h0])

    # Scan results are time-major, so re-aligning the backward pass with the
    # forward one means flipping axis 0. The former h_b[:, ::-1] flipped the
    # batch axis instead.
    # NOTE(review): assumes the caller supplies X_b reversed in time.
    h_b = h_b[::-1]

    y_vals = T.tanh(
        T.dot(h_f, self.W_hy_f) + T.dot(h_b, self.W_hy_b) + self.b_y)
    self.output = y_vals.dimshuffle(1, 0, 2)

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X_f, X_b, Y],
                                 outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X_f, X_b],
                                       outputs=y_vals.dimshuffle(1, 0, 2),
                                       allow_input_downcast=True)
    # NOTE(review): this formula matches a single-direction GRU without the
    # output bias; it is left untouched in case callers log it.
    self.n_param = (n_lstm * n_lstm * 3 + n_in * n_lstm * 3 +
                    n_lstm * n_out + n_lstm * 3)
def __init__(self, rng, params, cost_function='mse', optimizer=RMSprop):
    """Build a three-layer LSTM with a mixture-density output layer.

    Args:
        rng: Random source used for weight initialisation and for sampling
            the dropout mask, the input noise and the mixture samples.
        params: Dict read for ``'batch_size'``, ``'seq_length'``, ``'lr'``,
            ``'n_hidden'`` and ``'n_output'``.
        cost_function: Accepted for interface compatibility; the cost is
            always the mixture negative log-likelihood ``nll``.
        optimizer: Optimizer class; called as ``optimizer(cost, params,
            lr=lr)`` and must expose ``getUpdates()``.

    The compiled ``train`` and ``predictions`` functions take the initial
    hidden/cell state of every layer as inputs and return the final ones.
    """
    batch_size = params['batch_size']
    sequence_length = params["seq_length"]
    lr = params['lr']

    self.n_in = 48
    self.n_lstm = params['n_hidden']
    self.n_out = params['n_output']
    n_fc = 512
    n_rows = batch_size * sequence_length

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    # pseudo boolean for switching between training and prediction
    is_train = T.iscalar('is_train')

    self.W_hy = init_weight((self.n_lstm, n_fc), rng=rng, name='W_hy',
                            sample='glorot')
    self.b_y = init_bias(n_fc, rng=rng, sample='zero')

    self.numOfLayers = 3
    lstm_a = LSTMLayer(rng, 0, self.n_in, self.n_lstm)
    lstm_b = LSTMLayer(rng, 1, self.n_lstm, self.n_lstm)
    lstm_c = LSTMLayer(rng, 2, self.n_lstm, self.n_lstm)

    self.params = (lstm_a.params + lstm_b.params + lstm_c.params +
                   [self.W_hy, self.b_y])

    def step_lstm(x_t, mask, h_tm1_1, c_tm1_1, h_tm1_2, c_tm1_2,
                  h_tm1_3, c_tm1_3):
        """Run one time step through the stack; dropout after layer 1."""
        h_a, c_a, y_a = lstm_a.run(x_t, h_tm1_1, c_tm1_1)
        dropped = DropoutLayer(rng, input=y_a, prob=0.5, is_train=is_train,
                               mask=mask)
        h_b, c_b, y_b = lstm_b.run(dropped.output, h_tm1_2, c_tm1_2)
        h_c, c_c, y_c = lstm_c.run(y_b, h_tm1_3, c_tm1_3)
        projected = T.dot(y_c, self.W_hy) + self.b_y
        return [h_a, c_a, h_b, c_b, h_c, c_c, projected]

    # Initial hidden (h0_*) and cell (c0_*) states, supplied by the caller.
    h0_1 = T.matrix(name="h0_1", dtype=dtype)
    c0_1 = T.matrix(name="c0_1", dtype=dtype)
    h0_2 = T.matrix(name="h0_2", dtype=dtype)
    c0_2 = T.matrix(name="c0_2", dtype=dtype)
    h0_3 = T.matrix(name="h0_3", dtype=dtype)
    c0_3 = T.matrix(name="c0_3", dtype=dtype)
    initial_states = [h0_1, c0_1, h0_2, c0_2, h0_3, c0_3]

    # One dropout mask slice per time step, fed to scan as a sequence.
    p_1 = 0.5
    mask = rng.binomial(size=(sequence_length, batch_size, self.n_lstm),
                        p=p_1, dtype=X.dtype)

    # Gaussian input noise is only added when is_train is non-zero.
    noise = rng.normal(size=(batch_size, sequence_length, self.n_in),
                       std=0.008, avg=0.0, dtype=theano.config.floatX)
    X_tilde = T.switch(T.neq(is_train, 0), noise + X, X)

    [h_t_1, c_t_1, h_t_2, c_t_2, h_t_3, c_t_3, y_vals], _ = theano.scan(
        fn=step_lstm,
        sequences=[X_tilde.dimshuffle(1, 0, 2), mask],
        outputs_info=initial_states + [None])

    # Flatten (batch, time) into rows for the mixture-density layer.
    mdn_input = T.reshape(y_vals.dimshuffle(1, 0, 2), (n_rows, n_fc))
    Y_ll = T.reshape(Y, (n_rows, params['n_output']))
    mdn = MDNoutputLayer(rng=rng,
                         input=mdn_input,
                         n_in=n_fc,
                         n_out=params['n_output'],
                         mu_activation=do_nothing,
                         n_components=5)
    self.params = self.params + mdn.params

    cost = nll(mu=mdn.mu, sigma=mdn.sigma, mixing=mdn.mixing, y=Y_ll)
    _optimizer = optimizer(cost, self.params, lr=lr)

    # Sampling from the GMM: draw a one-hot component per row, then sample
    # from that component's Gaussian.
    component = rng.multinomial(pvals=mdn.mixing)
    component_mean = T.sum(mdn.mu * component.dimshuffle(0, 'x', 1), axis=2)
    component_std = T.sum(mdn.sigma * component, axis=1, keepdims=True)
    samples = rng.normal(size=(n_rows, params['n_output']),
                         avg=component_mean, std=component_std)
    self.output = T.reshape(
        samples, (batch_size, sequence_length, params['n_output']))

    final_states = [h_t_1[-1], c_t_1[-1], h_t_2[-1], c_t_2[-1],
                    h_t_3[-1], c_t_3[-1]]
    self.train = theano.function(
        inputs=[X, Y, is_train] + initial_states,
        outputs=[cost] + final_states,
        updates=_optimizer.getUpdates(),
        allow_input_downcast=True)
    self.predictions = theano.function(
        inputs=[X, is_train] + initial_states,
        outputs=[self.output] + final_states,
        allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self, rng, n_in, n_lstm, n_out, lr=0.00001, batch_size=64,
             output_activation=theano.tensor.nnet.relu, cost_function='mse',
             optimizer=RMSprop):
    """Build a single-layer GRU with a tanh read-out and compile it.

    Args:
        rng: Random source forwarded to the weight initialisers.
        n_in: Size of each input vector.
        n_lstm: Number of hidden units.
        n_out: Size of each output vector.
        lr: Learning rate handed to the optimizer.
        batch_size: Number of rows of the zero initial hidden state.
        output_activation: Accepted but not used; the read-out is tanh.
        cost_function: Name forwarded to ``get_err_fn``.
        optimizer: Optimizer class; called as ``optimizer(cost, params,
            lr=lr)`` and must expose ``getUpdates()``.

    ``self.n_param`` is computed from a closed-form expression that does not
    include the output bias.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out

    input_shape = (self.n_in, self.n_lstm)
    recurrent_shape = (self.n_lstm, self.n_lstm)

    # Reset gate.
    self.W_xr = init_weight(input_shape, rng=rng, name='W_xi',
                            sample='glorot')
    self.W_hr = init_weight(recurrent_shape, rng=rng, name='W_hr',
                            sample='glorot')
    self.b_r = init_bias(self.n_lstm, rng=rng, sample='zero')

    # Update gate.
    self.W_xz = init_weight(input_shape, rng=rng, name='W_xz',
                            sample='glorot')
    self.W_hz = init_weight(recurrent_shape, rng=rng, name='W_hz',
                            sample='glorot')
    self.b_z = init_bias(self.n_lstm, rng=rng, sample='zero')

    # Candidate state.
    self.W_xh = init_weight(input_shape, rng=rng, name='W_xh',
                            sample='glorot')
    self.W_hh = init_weight(recurrent_shape, rng=rng, name='W_hh',
                            sample='glorot')
    self.b_h = init_bias(self.n_lstm, rng=rng, sample='zero')

    # Read-out.
    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng, name='W_hy',
                            sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    self.one_mat = T.ones((batch_size, n_lstm), dtype=dtype)

    self.params = [
        self.W_xr, self.W_hr, self.b_r,
        self.W_xz, self.W_hz, self.b_z,
        self.W_xh, self.W_hh, self.b_h,
        self.W_hy, self.b_y,
    ]

    def step_lstm(x_t, h_tm1):
        """One GRU step: returns the new hidden state and the output."""
        reset = T.nnet.sigmoid(
            T.dot(x_t, self.W_xr) + T.dot(h_tm1, self.W_hr) + self.b_r)
        update = T.nnet.sigmoid(
            T.dot(x_t, self.W_xz) + T.dot(h_tm1, self.W_hz) + self.b_z)
        candidate = T.tanh(
            T.dot(x_t, self.W_xh) + T.dot((reset * h_tm1), self.W_hh) +
            self.b_h)
        h_new = update * candidate + (1 - update) * h_tm1
        y_t = T.tanh(T.dot(h_new, self.W_hy) + self.b_y)
        return [h_new, y_t]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm),
                         dtype=dtype))  # initial hidden state

    # scan walks the leading axis, so move time to the front.
    time_major = X.dimshuffle(1, 0, 2)
    [h_vals, y_vals], _ = theano.scan(fn=step_lstm,
                                      sequences=time_major,
                                      outputs_info=[h0, None])

    self.output = y_vals.dimshuffle(1, 0, 2)
    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y],
                                 outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X],
                                       outputs=y_vals.dimshuffle(1, 0, 2),
                                       allow_input_downcast=True)
    self.n_param = (n_lstm * n_lstm * 3 + n_in * n_lstm * 3 +
                    n_lstm * n_out + n_lstm * 3)
def __init__(self, n_in, n_lstm, n_out, lr=0.05, batch_size=64,
             single_output=True,
             output_activation=theano.tensor.nnet.relu,
             cost_function='nll', optimizer=RMSprop):
    """Build a peephole LSTM followed by two tanh dense layers.

    Args:
        n_in: Size of each input vector.
        n_lstm: Number of LSTM units.
        n_out: Size of each output vector.
        lr: Learning rate handed to the optimizer.
        batch_size: Number of rows of the zero initial hidden/cell states.
        single_output: Accepted but not used by this constructor.
        output_activation: Accepted but not used; all read-outs are tanh.
        cost_function: Name forwarded to ``get_err_fn``.
        optimizer: Optimizer class; called as ``optimizer(cost, params,
            lr=lr)`` and must expose ``getUpdates()``.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out
    self.nzeros_fc1 = 500
    self.n_fc1 = 1024
    self.n_fc2 = 512

    input_shape = (self.n_in, self.n_lstm)
    recurrent_shape = (self.n_lstm, self.n_lstm)

    # Dense head (created first, as in the LSTM weights' original order).
    self.W_fc1 = init_weight((self.n_fc1, self.n_fc2), 'W_fc1')
    self.b_fc1 = init_bias(self.n_fc2, sample='zero')
    self.W_fc2 = init_weight((self.n_fc2, self.n_out), 'W_fc2')
    self.b_fc2 = init_bias(self.n_out, sample='zero')

    # Input gate.
    self.W_xi = init_weight(input_shape, 'W_xi')
    self.W_hi = init_weight(recurrent_shape, 'W_hi', 'svd')
    self.W_ci = init_weight(recurrent_shape, 'W_ci', 'svd')
    self.b_i = init_bias(self.n_lstm, sample='zero')

    # Forget gate (bias initialised with the 'one' scheme).
    self.W_xf = init_weight(input_shape, 'W_xf')
    self.W_hf = init_weight(recurrent_shape, 'W_hf', 'svd')
    self.W_cf = init_weight(recurrent_shape, 'W_cf', 'svd')
    self.b_f = init_bias(self.n_lstm, sample='one')

    # Cell candidate.
    self.W_xc = init_weight(input_shape, 'W_xc')
    self.W_hc = init_weight(recurrent_shape, 'W_hc', 'svd')
    self.b_c = init_bias(self.n_lstm, sample='zero')

    # Output gate.
    self.W_xo = init_weight(input_shape, 'W_xo')
    self.W_ho = init_weight(recurrent_shape, 'W_ho', 'svd')
    self.W_co = init_weight(recurrent_shape, 'W_co', 'svd')
    self.b_o = init_bias(self.n_lstm, sample='zero')

    # LSTM read-out into the first dense layer's input width.
    self.W_hy = init_weight((self.n_lstm, self.n_fc1), 'W_hy')
    self.b_y = init_bias(self.n_fc1, sample='zero')

    self.params = [
        self.W_xi, self.W_hi, self.W_ci, self.b_i,
        self.W_xf, self.W_hf, self.W_cf, self.b_f,
        self.W_xc, self.W_hc, self.b_c,
        self.W_xo, self.W_ho, self.W_co, self.b_o,
        self.W_hy, self.b_y,
        self.W_fc1, self.b_fc1, self.W_fc2, self.b_fc2,
    ]

    def step_lstm(x_t, h_tm1, c_tm1):
        """Advance the LSTM by one time step."""
        in_gate = T.nnet.sigmoid(
            T.dot(x_t, self.W_xi) + T.dot(h_tm1, self.W_hi) +
            T.dot(c_tm1, self.W_ci) + self.b_i)
        forget_gate = T.nnet.sigmoid(
            T.dot(x_t, self.W_xf) + T.dot(h_tm1, self.W_hf) +
            T.dot(c_tm1, self.W_cf) + self.b_f)
        c_t = forget_gate * c_tm1 + in_gate * T.tanh(
            T.dot(x_t, self.W_xc) + T.dot(h_tm1, self.W_hc) + self.b_c)
        out_gate = T.nnet.sigmoid(
            T.dot(x_t, self.W_xo) + T.dot(h_tm1, self.W_ho) +
            T.dot(c_t, self.W_co) + self.b_o)
        h_t = out_gate * T.tanh(c_t)
        y_t = T.tanh(T.dot(h_t, self.W_hy) + self.b_y)
        return [h_t, c_t, y_t]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector (should be 0 when X is not null)
    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm),
                         dtype=dtype))  # initial hidden state
    c0 = shared(np.zeros(shape=(batch_size, self.n_lstm),
                         dtype=dtype))  # initial cell state

    time_major = X.dimshuffle(1, 0, 2)
    [h_vals, c_vals, y_vals], _ = theano.scan(fn=step_lstm,
                                              sequences=time_major,
                                              outputs_info=[h0, c0, None])

    # The LSTM read-out is multiplied element-wise with the (time-major)
    # input before the dense layers.
    # NOTE(review): presumably n_in equals n_fc1 so the shapes line up;
    # confirm against the caller.
    fc_in = y_vals * X.dimshuffle(1, 0, 2)

    # Hidden layers.
    fc1_out = T.tanh(T.dot(fc_in, self.W_fc1) + self.b_fc1)
    fc2_out = T.tanh(T.dot(fc1_out, self.W_fc2) + self.b_fc2)

    self.output = fc2_out.dimshuffle(1, 0, 2)
    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y],
                                 outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X],
                                       outputs=self.output,
                                       allow_input_downcast=True)
    self.n_param = (n_lstm * n_lstm * 4 + n_in * n_lstm * 4 +
                    n_lstm * n_out + n_lstm * 3)
def __init__(self, rng, layer_id, n_in, n_lstm):
    """Create the weights and biases of one peephole LSTM layer.

    Args:
        rng: Random source forwarded to ``init_weight`` / ``init_bias``.
        layer_id: Identifier appended (as a string) to every parameter name.
        n_in: Size of the layer's input vectors.
        n_lstm: Number of LSTM units.

    All parameters are collected, gate by gate, in ``self.params``.
    """
    suffix = str(layer_id)
    self.n_in = n_in
    self.n_lstm = n_lstm

    input_shape = (self.n_in, self.n_lstm)
    recurrent_shape = (self.n_lstm, self.n_lstm)

    def weight(shape, stem, sample='glorot'):
        """Initialise a weight matrix named ``stem + suffix``."""
        return init_weight(shape, rng=rng, name=stem + suffix, sample=sample)

    def bias(stem, sample='zero'):
        """Initialise a length-n_lstm bias named ``stem + suffix``."""
        return init_bias(self.n_lstm, rng=rng, sample=sample,
                         name=stem + suffix)

    # Input gate.
    self.W_xi = weight(input_shape, 'W_xi_')
    self.W_hi = weight(recurrent_shape, 'W_hi_')
    self.W_ci = weight(recurrent_shape, 'W_ci_')
    self.b_i = bias('b_i_')

    # Forget gate (bias uses the 'one' scheme).
    self.W_xf = weight(input_shape, 'W_xf_')
    self.W_hf = weight(recurrent_shape, 'W_hf_')
    self.W_cf = weight(recurrent_shape, 'W_cf_')
    self.b_f = bias('b_f_', sample='one')

    # Cell candidate (the recurrent matrix uses the 'ortho' scheme).
    self.W_xc = weight(input_shape, 'W_xc_')
    self.W_hc = weight(recurrent_shape, 'W_hc_', sample='ortho')
    self.b_c = bias('b_c_')

    # Output gate.
    self.W_xo = weight(input_shape, 'W_xo_')
    self.W_ho = weight(recurrent_shape, 'W_ho_')
    self.W_co = weight(recurrent_shape, 'W_co_')
    self.b_o = bias('b_o_')

    self.params = [
        self.W_xi, self.W_hi, self.W_ci, self.b_i,
        self.W_xf, self.W_hf, self.W_cf, self.b_f,
        self.W_xc, self.W_hc, self.b_c,
        self.W_xo, self.W_ho, self.W_co, self.b_o,
    ]
def __init__(self, rng, params, cost_function='mse', optimizer=RMSprop):
    """Build a three-conv CNN feeding a three-layer LSTM and compile it.

    Args:
        rng: Random source used for weight initialisation and dropout.
        params: Dict read for ``'lr'``, ``'n_hidden'``, ``'n_output'``,
            ``'batch_size'`` and ``'seq_length'``.
        cost_function: Name forwarded to ``get_err_fn``.
        optimizer: Optimizer class; called as ``optimizer(cost, params,
            lr=lr)`` and must expose ``getUpdates()``.

    Every frame is reshaped to a 1x120x60 image, passed through the CNN and
    a 1024-unit hidden layer, then regrouped into sequences for the LSTMs.
    """
    lr = params["lr"]
    n_lstm = params['n_hidden']
    n_out = params['n_output']
    batch_size = params["batch_size"]
    sequence_length = params["seq_length"]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    # pseudo boolean for switching between training and prediction
    is_train = T.iscalar('is_train')

    # CNN global parameters.
    subsample = (1, 1)
    p_1 = 0.5
    border_mode = "valid"
    cnn_batch_size = batch_size * sequence_length
    pool_size = (2, 2)

    # Layer 1: conv + pool + dropout.
    # input_shape = (samples, channels, rows, cols)
    input_shape = (cnn_batch_size, 1, 120, 60)
    frames = X.reshape(input_shape)
    c1 = ConvLayer(rng, frames, (64, 1, 9, 9), input_shape, border_mode,
                   subsample, activation=nn.relu)
    p1 = PoolLayer(c1.output, pool_size=pool_size,
                   input_shape=c1.output_shape)
    dl1 = DropoutLayer(rng, input=p1.output, prob=p_1, is_train=is_train)
    # Outside training the pooled activations are scaled by the retain
    # probability instead of being dropped.
    retain_prob = 1. - p_1
    test_output = p1.output * retain_prob
    d1_output = T.switch(T.neq(is_train, 0), dl1.output, test_output)

    # Layer 2: conv + pool.
    c2 = ConvLayer(rng, d1_output, (128, p1.output_shape[1], 3, 3),
                   p1.output_shape, border_mode, subsample,
                   activation=nn.relu)
    p2 = PoolLayer(c2.output, pool_size=pool_size,
                   input_shape=c2.output_shape)

    # Layer 3: conv + pool.
    c3 = ConvLayer(rng, p2.output, (128, p2.output_shape[1], 3, 3),
                   p2.output_shape, border_mode, subsample,
                   activation=nn.relu)
    p3 = PoolLayer(c3.output, pool_size=pool_size,
                   input_shape=c3.output_shape)

    # Layer 4: fully connected hidden layer on the flattened feature maps.
    flat_size = reduce(lambda x, y: x * y, p3.output_shape[1:])
    x_flat = p3.output.flatten(2)
    h1 = HiddenLayer(rng, x_flat, flat_size, 1024, activation=nn.relu)
    n_in = 1024
    rnn_input = h1.output.reshape((batch_size, sequence_length, n_in))

    # Layer 5: stacked LSTM.
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out
    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng, name='W_hy',
                            sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    lstm_a = LSTMLayer(rng, 0, self.n_in, self.n_lstm)
    lstm_b = LSTMLayer(rng, 1, self.n_lstm, self.n_lstm)
    lstm_c = LSTMLayer(rng, 2, self.n_lstm, self.n_lstm)

    self.params = (lstm_a.params + lstm_b.params + lstm_c.params +
                   [self.W_hy, self.b_y])

    def step_lstm(x_t, mask, h_tm1_1, c_tm1_1, h_tm1_2, c_tm1_2,
                  h_tm1_3, c_tm1_3):
        """Run one time step through the stack; dropout after layer 1."""
        h_a, c_a, y_a = lstm_a.run(x_t, h_tm1_1, c_tm1_1)
        dropped = DropoutLayer(rng, input=y_a, prob=0.5, is_train=is_train,
                               mask=mask)
        h_b, c_b, y_b = lstm_b.run(dropped.output, h_tm1_2, c_tm1_2)
        h_c, c_c, y_c = lstm_c.run(y_b, h_tm1_3, c_tm1_3)
        y = T.dot(y_c, self.W_hy) + self.b_y
        return [h_a, c_a, h_b, c_b, h_c, c_c, y]

    def zero_state():
        """Fresh shared all-zero (batch_size, n_lstm) state."""
        return shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))

    h0_1 = zero_state()  # initial hidden state, layer 1
    c0_1 = zero_state()  # initial cell state, layer 1
    h0_2 = zero_state()  # initial hidden state, layer 2
    c0_2 = zero_state()  # initial cell state, layer 2
    h0_3 = zero_state()  # initial hidden state, layer 3
    c0_3 = zero_state()  # initial cell state, layer 3

    # One dropout mask slice per time step, fed to scan as a sequence.
    p_1 = 0.5
    mask = rng.binomial(size=(sequence_length, batch_size, self.n_lstm),
                        p=p_1, dtype=X.dtype)

    # dimshuffle(1, 0, 2): (batch_size, sequence_length, n_in) becomes
    # (sequence_length, batch_size, n_in) so scan iterates over time.
    [h_t_1, c_t_1, h_t_2, c_t_2, h_t_3, c_t_3, y_vals], _ = theano.scan(
        fn=step_lstm,
        sequences=[rnn_input.dimshuffle(1, 0, 2), mask],
        outputs_info=[h0_1, c0_1, h0_2, c0_2, h0_3, c0_3, None])

    self.output = y_vals.dimshuffle(1, 0, 2)

    # CNN and hidden-layer parameters go in front of the recurrent ones.
    self.params = (c1.params + c2.params + c3.params + h1.params +
                   self.params)

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y, is_train],
                                 outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X, is_train],
                                       outputs=self.output,
                                       allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self, rng, params, cost_function='mse', optimizer=RMSprop):
    """Build a one-layer LSTM with a mixture-density output and L2 penalty.

    Args:
        rng: Random source used for weight initialisation and for sampling
            the input noise and the mixture samples.
        params: Dict read for ``'batch_size'``, ``'seq_length'``, ``'lr'``,
            ``'n_hidden'`` and ``'n_output'``.
        cost_function: Accepted for interface compatibility; the cost is
            always the mixture negative log-likelihood plus the L2 term.
        optimizer: Optimizer class; called as ``optimizer(cost, params,
            lr=lr)`` and must expose ``getUpdates()``.

    The compiled ``train`` and ``predictions`` functions take the initial
    hidden state ``H`` and cell state ``C`` as inputs and return the final
    ones alongside the cost / sampled output.
    """
    batch_size = params['batch_size']
    sequence_length = params["seq_length"]
    lr = params['lr']

    self.n_in = 48
    self.n_lstm = params['n_hidden']
    self.n_out = params['n_output']
    n_fc = 512

    self.W_hy = init_weight((self.n_lstm, n_fc), rng=rng, name='W_hy',
                            sample='glorot')
    self.b_y = init_bias(n_fc, rng=rng, sample='zero')

    self.numOfLayers = 1
    layer1 = LSTMLayer(rng, 0, self.n_in, self.n_lstm)

    # Build a new list: assigning layer1.params directly and appending to it
    # would also grow the layer's own parameter list.
    self.params = layer1.params + [self.W_hy, self.b_y]

    def step_lstm(x_t, h_tm1_1, c_tm1_1):
        """One LSTM step followed by the linear projection to n_fc."""
        [h_t_1, c_t_1, y_t_1] = layer1.run(x_t, h_tm1_1, c_tm1_1)
        y = T.dot(y_t_1, self.W_hy) + self.b_y
        return [h_t_1, c_t_1, y]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    # pseudo boolean for switching between training and prediction
    is_train = T.iscalar('is_train')

    H = T.matrix(name="H", dtype=dtype)  # initial hidden state
    C = T.matrix(name="C", dtype=dtype)  # initial cell state

    # Gaussian input noise is only added when is_train is non-zero.
    noise = rng.normal(size=(batch_size, sequence_length, self.n_in),
                       std=0.008, avg=0.0, dtype=theano.config.floatX)
    X_train = noise + X
    X_tilde = T.switch(T.neq(is_train, 0), X_train, X)

    [h_t_1, c_t_1, y_vals], _ = theano.scan(
        fn=step_lstm,
        sequences=[X_tilde.dimshuffle(1, 0, 2)],
        outputs_info=[H, C, None])

    # Flatten (batch, time) into rows for the mixture-density layer.
    mdn_input = y_vals.dimshuffle(1, 0, 2)
    mdn_input = T.reshape(mdn_input, (batch_size * sequence_length, n_fc))
    Y_ll = T.reshape(Y, (batch_size * sequence_length, params['n_output']))
    mdn = MDNoutputLayer(rng=rng,
                         input=mdn_input,
                         n_in=n_fc,
                         n_out=params['n_output'],
                         mu_activation=do_nothing,
                         n_components=240)
    self.params = self.params + mdn.params

    cost = nll(mu=mdn.mu, sigma=mdn.sigma, mixing=mdn.mixing, y=Y_ll)

    # L2 penalty over every parameter tensor in full. The earlier code
    # summed param[0] ** 2 and param[1] ** 2, i.e. only the first two slices
    # along the leading axis of each parameter.
    L2_reg = 0.0001
    L2_sqr = sum(T.sum(param ** 2) for param in self.params)
    cost = cost + L2_reg * L2_sqr

    _optimizer = optimizer(cost, self.params, lr=lr)

    # Sampling from the GMM: draw a one-hot component per row, then sample
    # from that component's Gaussian.
    component = rng.multinomial(pvals=mdn.mixing)
    component_mean = T.sum(mdn.mu * component.dimshuffle(0, 'x', 1), axis=2)
    component_std = T.sum(mdn.sigma * component, axis=1, keepdims=True)
    samples = rng.normal(
        size=(batch_size * sequence_length, params['n_output']),
        avg=component_mean, std=component_std)
    self.output = T.reshape(
        samples, (batch_size, sequence_length, params['n_output']))

    self.train = theano.function(
        inputs=[X, Y, is_train, H, C],
        outputs=[cost, h_t_1[-1], c_t_1[-1]],
        updates=_optimizer.getUpdates(),
        allow_input_downcast=True)
    self.predictions = theano.function(
        inputs=[X, is_train, H, C],
        outputs=[self.output, h_t_1[-1], c_t_1[-1]],
        allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self, rng, params, cost_function='mse', optimizer=RMSprop):
    """Build a four-conv CNN feeding a single LSTM layer and compile it.

    Args:
        rng: Random source used for weight initialisation and dropout.
        params: Dict read for ``'lr'``, ``'batch_size'``, ``'seq_length'``,
            ``'n_hidden'`` and ``'n_output'``.
        cost_function: Name forwarded to ``get_err_fn``.
        optimizer: Optimizer class; called as ``optimizer(cost, params,
            lr=lr)`` and must expose ``getUpdates()``.

    Every frame is reshaped to a 3x112x112 image. The compiled functions
    take the initial hidden state ``H`` and cell state ``C`` as inputs and
    return the final ones alongside the cost / output.
    """
    lr = params["lr"]
    batch_size = params["batch_size"]
    sequence_length = params["seq_length"]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    # pseudo boolean for switching between training and prediction
    is_train = T.iscalar('is_train')

    # CNN global parameters.
    subsample = (1, 1)
    p_1 = 0.5
    border_mode = "same"
    cnn_batch_size = batch_size * sequence_length
    pool_size = (2, 2)

    # Layer 1: conv + pool + dropout.
    # input_shape = (samples, channels, rows, cols)
    input_shape = (cnn_batch_size, 3, 112, 112)
    frames = X.reshape(input_shape)
    c1 = ConvLayer(rng, frames, (64, 3, 9, 9), input_shape, border_mode,
                   subsample, activation=nn.relu)
    p1 = PoolLayer(c1.output, pool_size=pool_size,
                   input_shape=c1.output_shape)
    dl1 = DropoutLayer(rng, input=p1.output, prob=p_1, is_train=is_train)

    # Layer 2: conv + pool.
    c2 = ConvLayer(rng, dl1.output, (128, p1.output_shape[1], 3, 3),
                   p1.output_shape, border_mode, subsample,
                   activation=nn.relu)
    p2 = PoolLayer(c2.output, pool_size=pool_size,
                   input_shape=c2.output_shape)

    # Layer 3: conv + pool.
    c3 = ConvLayer(rng, p2.output, (128, p2.output_shape[1], 3, 3),
                   p2.output_shape, border_mode, subsample,
                   activation=nn.relu)
    p3 = PoolLayer(c3.output, pool_size=pool_size,
                   input_shape=c3.output_shape)

    # Layer 4: conv + pool.
    c4 = ConvLayer(rng, p3.output, (64, p3.output_shape[1], 3, 3),
                   p3.output_shape, border_mode, subsample,
                   activation=nn.relu)
    p4 = PoolLayer(c4.output, pool_size=pool_size,
                   input_shape=c4.output_shape)

    # Layer 5: fully connected hidden layer on the flattened feature maps.
    flat_size = reduce(lambda x, y: x * y, p4.output_shape[1:])
    x_flat = p4.output.flatten(2)
    h1 = HiddenLayer(rng, x_flat, flat_size, 1024, activation=nn.relu)

    # Layer 6: regression layer.
    lreg = LogisticRegression(rng, h1.output, 1024, 2048)

    # LSTM parameters.
    self.n_in = 2048
    self.n_lstm = params['n_hidden']
    self.n_out = params['n_output']
    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng, name='W_hy',
                            sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    lstm = LSTMLayer(rng, 0, self.n_in, self.n_lstm)

    self.params = (c1.params + c2.params + c3.params + c4.params +
                   h1.params + lreg.params + lstm.params +
                   [self.W_hy, self.b_y])

    def step_lstm(x_t, h_tm1_1, c_tm1_1):
        """One LSTM step followed by the linear output projection."""
        h_new, c_new, y_lstm = lstm.run(x_t, h_tm1_1, c_tm1_1)
        y = T.dot(y_lstm, self.W_hy) + self.b_y
        return [h_new, c_new, y]

    H = T.matrix(name="H", dtype=dtype)  # initial hidden state
    C = T.matrix(name="C", dtype=dtype)  # initial cell state

    # Regroup per-frame features into sequences, then make them time-major
    # for scan.
    rnn_input = lreg.y_pred.reshape(
        (batch_size, sequence_length, self.n_in))
    [h_t_1, c_t_1, y_vals], _ = theano.scan(
        fn=step_lstm,
        sequences=[rnn_input.dimshuffle(1, 0, 2)],
        outputs_info=[H, C, None])

    self.output = y_vals.dimshuffle(1, 0, 2)

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    final_states = [h_t_1[-1], c_t_1[-1]]
    self.train = theano.function(
        inputs=[X, Y, is_train, H, C],
        outputs=[cost] + final_states,
        updates=_optimizer.getUpdates(),
        allow_input_downcast=True)
    self.predictions = theano.function(
        inputs=[X, is_train, H, C],
        outputs=[self.output] + final_states,
        allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self, rng, params, cost_function='mse', optimizer=RMSprop):
    """Build a three-conv CNN feeding a single GRU layer and compile it.

    Args:
        rng: Random source used for weight initialisation and dropout.
        params: Dict read for ``'lr'``, ``'n_hidden'``, ``'n_output'``,
            ``'batch_size'`` and ``'seq_length'``.
        cost_function: Name forwarded to ``get_err_fn``.
        optimizer: Optimizer class; called as ``optimizer(cost, params,
            lr=lr)`` and must expose ``getUpdates()``.

    Every frame is reshaped to a 1x120x60 image, passed through the CNN and
    a 1024-unit hidden layer, then regrouped into sequences for the GRU.
    """
    lr = params["lr"]
    n_lstm = params['n_hidden']
    n_out = params['n_output']
    batch_size = params["batch_size"]
    sequence_length = params["seq_length"]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    # pseudo boolean for switching between training and prediction
    is_train = T.iscalar('is_train')

    # CNN global parameters.
    subsample = (1, 1)
    p_1 = 0.5
    border_mode = "valid"
    cnn_batch_size = batch_size * sequence_length
    pool_size = (2, 2)

    # Layer 1: conv + pool + dropout.
    # input_shape = (samples, channels, rows, cols)
    input_shape = (cnn_batch_size, 1, 120, 60)
    frames = X.reshape(input_shape)
    c1 = ConvLayer(rng, frames, (64, 1, 9, 9), input_shape, border_mode,
                   subsample, activation=nn.relu)
    p1 = PoolLayer(c1.output, pool_size=pool_size,
                   input_shape=c1.output_shape)
    dl1 = DropoutLayer(rng, input=p1.output, prob=p_1)
    # Outside training the pooled activations are scaled by the retain
    # probability instead of being dropped.
    retain_prob = 1. - p_1
    test_output = p1.output * retain_prob
    d1_output = T.switch(T.neq(is_train, 0), dl1.output, test_output)

    # Layer 2: conv + pool.
    c2 = ConvLayer(rng, d1_output, (128, p1.output_shape[1], 3, 3),
                   p1.output_shape, border_mode, subsample,
                   activation=nn.relu)
    p2 = PoolLayer(c2.output, pool_size=pool_size,
                   input_shape=c2.output_shape)

    # Layer 3: conv + pool.
    c3 = ConvLayer(rng, p2.output, (128, p2.output_shape[1], 3, 3),
                   p2.output_shape, border_mode, subsample,
                   activation=nn.relu)
    p3 = PoolLayer(c3.output, pool_size=pool_size,
                   input_shape=c3.output_shape)

    # Layer 4: fully connected hidden layer on the flattened feature maps.
    flat_size = reduce(lambda x, y: x * y, p3.output_shape[1:])
    x_flat = p3.output.flatten(2)
    h1 = HiddenLayer(rng, x_flat, flat_size, 1024, activation=nn.relu)
    n_in = 1024
    rnn_input = h1.output.reshape((batch_size, sequence_length, n_in))

    # Layer 5: GRU.
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out
    gru_in_shape = (self.n_in, self.n_lstm)
    gru_rec_shape = (self.n_lstm, self.n_lstm)

    # Reset gate.
    self.W_xr = init_weight(gru_in_shape, rng=rng, name='W_xi',
                            sample='glorot')
    self.W_hr = init_weight(gru_rec_shape, rng=rng, name='W_hr',
                            sample='glorot')
    self.b_r = init_bias(self.n_lstm, rng=rng, sample='zero')

    # Update gate.
    self.W_xz = init_weight(gru_in_shape, rng=rng, name='W_xz',
                            sample='glorot')
    self.W_hz = init_weight(gru_rec_shape, rng=rng, name='W_hz',
                            sample='glorot')
    self.b_z = init_bias(self.n_lstm, rng=rng, sample='zero')

    # Candidate state.
    self.W_xh = init_weight(gru_in_shape, rng=rng, name='W_xh',
                            sample='glorot')
    self.W_hh = init_weight(gru_rec_shape, rng=rng, name='W_hh',
                            sample='glorot')
    self.b_h = init_bias(self.n_lstm, rng=rng, sample='zero')

    # Linear read-out.
    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng, name='W_hy',
                            sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    self.params = [
        self.W_xr, self.W_hr, self.b_r,
        self.W_xz, self.W_hz, self.b_z,
        self.W_xh, self.W_hh, self.b_h,
        self.W_hy, self.b_y,
    ]

    def step_lstm(x_t, h_tm1):
        """One GRU step: returns the new hidden state and linear output."""
        reset = T.nnet.sigmoid(
            T.dot(x_t, self.W_xr) + T.dot(h_tm1, self.W_hr) + self.b_r)
        update = T.nnet.sigmoid(
            T.dot(x_t, self.W_xz) + T.dot(h_tm1, self.W_hz) + self.b_z)
        candidate = T.tanh(
            T.dot(x_t, self.W_xh) + T.dot((reset * h_tm1), self.W_hh) +
            self.b_h)
        h_new = update * candidate + (1 - update) * h_tm1
        y_t = T.dot(h_new, self.W_hy) + self.b_y
        return [h_new, y_t]

    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm),
                         dtype=dtype))  # initial hidden state

    # dimshuffle(1, 0, 2): (batch_size, sequence_length, n_in) becomes
    # (sequence_length, batch_size, n_in) so scan iterates over time.
    [h_vals, y_vals], _ = theano.scan(
        fn=step_lstm,
        sequences=rnn_input.dimshuffle(1, 0, 2),
        outputs_info=[h0, None])

    self.output = y_vals.dimshuffle(1, 0, 2)

    # CNN and hidden-layer parameters go in front of the recurrent ones.
    self.params = (c1.params + c2.params + c3.params + h1.params +
                   self.params)

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y, is_train],
                                 outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X, is_train],
                                       outputs=self.output,
                                       allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self, n_in, n_lstm, n_out, lr=0.05, batch_size=64,
             single_output=True,
             output_activation=theano.tensor.nnet.relu,
             cost_function='nll'):
    """Build a single-layer peephole LSTM and compile its Theano functions.

    Args:
        n_in: Size of each input vector.
        n_lstm: Number of LSTM units.
        n_out: Size of each output vector.
        lr: Learning rate handed to RMSprop.
        batch_size: Number of rows of the zero-initialised hidden and cell
            states fed to the scan.
        single_output: When true, ``self.output`` is the output of the last
            time step only; otherwise it is the whole sequence, batch-major.
        output_activation: Function applied to the output projection.
        cost_function: ``'mse'`` or ``'cxe'``; any other value selects the
            hand-written negative log-likelihood.

    Sets ``self.train``, ``self.predictions`` and ``self.debug`` to compiled
    Theano functions.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out

    # Input gate.
    self.W_xi = init_weight((self.n_in, self.n_lstm), 'W_xi')
    self.W_hi = init_weight((self.n_lstm, self.n_lstm), 'W_hi', 'svd')
    self.W_ci = init_weight((self.n_lstm, self.n_lstm), 'W_ci', 'svd')
    self.b_i = shared(
        np.cast[dtype](np.random.uniform(-0.5, .5, size=n_lstm)))

    # Forget gate.
    self.W_xf = init_weight((self.n_in, self.n_lstm), 'W_xf')
    self.W_hf = init_weight((self.n_lstm, self.n_lstm), 'W_hf', 'svd')
    self.W_cf = init_weight((self.n_lstm, self.n_lstm), 'W_cf', 'svd')
    self.b_f = shared(
        np.cast[dtype](np.random.uniform(0, 1., size=n_lstm)))

    # Cell candidate.
    self.W_xc = init_weight((self.n_in, self.n_lstm), 'W_xc')
    self.W_hc = init_weight((self.n_lstm, self.n_lstm), 'W_hc', 'svd')
    self.b_c = shared(np.zeros(n_lstm, dtype=dtype))

    # Output gate.
    self.W_xo = init_weight((self.n_in, self.n_lstm), 'W_xo')
    self.W_ho = init_weight((self.n_lstm, self.n_lstm), 'W_ho', 'svd')
    self.W_co = init_weight((self.n_lstm, self.n_lstm), 'W_co', 'svd')
    self.b_o = shared(
        np.cast[dtype](np.random.uniform(-0.5, .5, size=n_lstm)))

    # Output projection.
    self.W_hy = init_weight((self.n_lstm, self.n_out), 'W_hy')
    self.b_y = shared(np.zeros(n_out, dtype=dtype))

    # W_xo used to be missing from this list, which meant the output gate's
    # input weights never received any update from the optimizer.
    self.params = [
        self.W_xi, self.W_hi, self.W_ci, self.b_i,
        self.W_xf, self.W_hf, self.W_cf, self.b_f,
        self.W_xc, self.W_hc, self.b_c,
        self.W_xo, self.W_ho, self.W_co, self.b_o,
        self.W_hy, self.b_y,
    ]

    def step_lstm(x_t, h_tm1, c_tm1):
        """Advance the LSTM by one time step."""
        i_t = T.nnet.sigmoid(
            T.dot(x_t, self.W_xi) + T.dot(h_tm1, self.W_hi) +
            T.dot(c_tm1, self.W_ci) + self.b_i)
        f_t = T.nnet.sigmoid(
            T.dot(x_t, self.W_xf) + T.dot(h_tm1, self.W_hf) +
            T.dot(c_tm1, self.W_cf) + self.b_f)
        c_t = f_t * c_tm1 + i_t * T.tanh(
            T.dot(x_t, self.W_xc) + T.dot(h_tm1, self.W_hc) + self.b_c)
        # The output gate peeks at the *updated* cell state c_t.
        o_t = T.nnet.sigmoid(
            T.dot(x_t, self.W_xo) + T.dot(h_tm1, self.W_ho) +
            T.dot(c_t, self.W_co) + self.b_o)
        h_t = o_t * T.tanh(c_t)
        y_t = output_activation(T.dot(h_t, self.W_hy) + self.b_y)
        return [h_t, c_t, y_t]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector (should be 0 when X is not null)
    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm),
                         dtype=dtype))  # initial hidden state
    c0 = shared(np.zeros(shape=(batch_size, self.n_lstm),
                         dtype=dtype))  # initial cell state

    # scan iterates over the leading axis, so put time first.
    [h_vals, c_vals, y_vals], _ = theano.scan(
        fn=step_lstm,
        sequences=X.dimshuffle(1, 0, 2),
        outputs_info=[h0, c0, None])

    if single_output:
        self.output = y_vals[-1]
    else:
        self.output = y_vals.dimshuffle(1, 0, 2)

    cxe = T.mean(T.nnet.binary_crossentropy(self.output, Y))
    nll = -T.mean(Y * T.log(self.output) +
                  (1. - Y) * T.log(1. - self.output))
    mse = T.mean((self.output - Y) ** 2)

    if cost_function == 'mse':
        cost = mse
    elif cost_function == 'cxe':
        cost = cxe
    else:
        cost = nll

    optimizer = RMSprop(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y],
                                 outputs=cost,
                                 updates=optimizer.getUpdates(),
                                 allow_input_downcast=True)
    # Predictions always return the full sequence, whatever single_output is.
    self.predictions = theano.function(inputs=[X],
                                       outputs=y_vals.dimshuffle(1, 0, 2),
                                       allow_input_downcast=True)
    self.debug = theano.function(
        inputs=[X, Y],
        outputs=[X.shape, Y.shape, y_vals.shape, cxe.shape])
def __init__(self, n_in, n_lstm, n_out, lr=0.05, batch_size=64,
             output_activation=theano.tensor.nnet.relu,
             cost_function='mse', optimizer=RMSprop):
    """Build a two-layer LSTM followed by three tanh dense layers.

    Both LSTM layers feed the cell state into their gates through the
    ``W_c*`` matrices.  Sets ``self.train`` and ``self.predictions``.

    Args:
        n_in: Size of each input vector.
        n_lstm: Number of units in each LSTM layer.
        n_out: Size of the LSTM read-out and of the final output.
        lr: Learning rate handed to ``optimizer``.
        batch_size: Number of rows of the zero initial states.
        output_activation: Unused; kept for interface compatibility.
        cost_function: Name forwarded to ``get_err_fn``.
        optimizer: Class called as ``optimizer(cost, params, lr=lr)`` and
            queried with ``getUpdates()``.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out
    self.n_fc1 = 256
    self.n_fc2 = 256
    self.n_fc3 = 256

    # Dense head.
    # NOTE(review): W_fc1 has n_fc1 (=256) rows but is multiplied with the
    # LSTM read-out whose last dimension is n_out, so the shapes only line
    # up when n_out == 256 -- confirm the intended sizes.
    self.W_fc1 = init_weight((self.n_fc1, self.n_fc2), 'W_fc1', 'glorot')
    self.b_fc1 = init_bias(self.n_fc2, sample='zero')
    self.W_fc2 = init_weight((self.n_fc2, self.n_fc3), 'W_fc2', 'glorot')
    self.b_fc2 = init_bias(self.n_fc3, sample='zero')
    self.W_fc3 = init_weight((self.n_fc3, self.n_out), 'w_fc3', 'glorot')
    self.b_fc3 = init_bias(self.n_out, sample='zero')

    # First LSTM layer.
    self.W_xi_1 = init_weight((self.n_in, self.n_lstm), 'W_xi_1', 'glorot')
    self.W_hi_1 = init_weight((self.n_lstm, self.n_lstm), 'W_hi_1', 'glorot')
    self.W_ci_1 = init_weight((self.n_lstm, self.n_lstm), 'W_ci_1', 'glorot')
    self.b_i_1 = init_bias(self.n_lstm, sample='zero')
    self.W_xf_1 = init_weight((self.n_in, self.n_lstm), 'W_xf_1', 'glorot')
    self.W_hf_1 = init_weight((self.n_lstm, self.n_lstm), 'W_hf_1', 'glorot')
    self.W_cf_1 = init_weight((self.n_lstm, self.n_lstm), 'W_cf_1', 'glorot')
    self.b_f_1 = init_bias(self.n_lstm, sample='one')
    self.W_xc_1 = init_weight((self.n_in, self.n_lstm), 'W_xc_1', 'glorot')
    self.W_hc_1 = init_weight((self.n_lstm, self.n_lstm), 'W_hc_1', 'glorot')
    self.b_c_1 = init_bias(self.n_lstm, sample='zero')
    self.W_xo_1 = init_weight((self.n_in, self.n_lstm), 'W_xo_1', 'glorot')
    self.W_ho_1 = init_weight((self.n_lstm, self.n_lstm), 'W_ho_1', 'glorot')
    self.W_co_1 = init_weight((self.n_lstm, self.n_lstm), 'W_co_1', 'glorot')
    self.b_o_1 = init_bias(self.n_lstm, sample='zero')

    # Second LSTM layer (its input is the first layer's hidden state).
    self.W_xi_2 = init_weight((self.n_lstm, self.n_lstm), 'W_xi_2', 'glorot')
    self.W_hi_2 = init_weight((self.n_lstm, self.n_lstm), 'W_hi_2', 'glorot')
    self.W_ci_2 = init_weight((self.n_lstm, self.n_lstm), 'W_ci_2', 'glorot')
    self.b_i_2 = init_bias(self.n_lstm, sample='zero')
    self.W_xf_2 = init_weight((self.n_lstm, self.n_lstm), 'W_xf_2', 'glorot')
    self.W_hf_2 = init_weight((self.n_lstm, self.n_lstm), 'W_hf_2', 'glorot')
    self.W_cf_2 = init_weight((self.n_lstm, self.n_lstm), 'W_cf_2', 'glorot')
    self.b_f_2 = init_bias(self.n_lstm, sample='one')
    self.W_xc_2 = init_weight((self.n_lstm, self.n_lstm), 'W_xc_2', 'glorot')
    self.W_hc_2 = init_weight((self.n_lstm, self.n_lstm), 'W_hc_2', 'glorot')
    self.b_c_2 = init_bias(self.n_lstm, sample='zero')
    self.W_xo_2 = init_weight((self.n_lstm, self.n_lstm), 'W_xo_2', 'glorot')
    self.W_ho_2 = init_weight((self.n_lstm, self.n_lstm), 'W_ho_2', 'glorot')
    self.W_co_2 = init_weight((self.n_lstm, self.n_lstm), 'W_co_2', 'glorot')
    self.b_o_2 = init_bias(self.n_lstm, sample='zero')
    self.W_hy_2 = init_weight((self.n_lstm, self.n_out), 'W_hy_2', 'glorot')
    self.b_y_2 = init_bias(self.n_out, sample='zero')

    self.params = [
        self.W_xi_1, self.W_hi_1, self.W_ci_1, self.b_i_1,
        self.W_xf_1, self.W_hf_1, self.W_cf_1, self.b_f_1,
        self.W_xc_1, self.W_hc_1, self.b_c_1,
        self.W_xo_1, self.W_ho_1, self.W_co_1, self.b_o_1,
        self.W_xi_2, self.W_hi_2, self.W_ci_2, self.b_i_2,
        self.W_xf_2, self.W_hf_2, self.W_cf_2, self.b_f_2,
        self.W_xc_2, self.W_hc_2, self.b_c_2,
        self.W_xo_2, self.W_ho_2, self.W_co_2, self.b_o_2,
        self.W_hy_2, self.b_y_2,
        self.W_fc1, self.b_fc1,
        self.W_fc2, self.b_fc2,
        self.W_fc3, self.b_fc3,
    ]

    def step_lstm(x_t, h_tm1, c_tm1, h_tm2, c_tm2):
        """One time step through both layers.

        ``h_tm1``/``c_tm1`` are the previous states of layer 1 and
        ``h_tm2``/``c_tm2`` those of layer 2.
        """
        i_t_1 = T.nnet.sigmoid(
            T.dot(x_t, self.W_xi_1) + T.dot(h_tm1, self.W_hi_1)
            + T.dot(c_tm1, self.W_ci_1) + self.b_i_1)
        f_t_1 = T.nnet.sigmoid(
            T.dot(x_t, self.W_xf_1) + T.dot(h_tm1, self.W_hf_1)
            + T.dot(c_tm1, self.W_cf_1) + self.b_f_1)
        c_t_1 = f_t_1 * c_tm1 + i_t_1 * T.tanh(
            T.dot(x_t, self.W_xc_1) + T.dot(h_tm1, self.W_hc_1) + self.b_c_1)
        o_t_1 = T.nnet.sigmoid(
            T.dot(x_t, self.W_xo_1) + T.dot(h_tm1, self.W_ho_1)
            + T.dot(c_t_1, self.W_co_1) + self.b_o_1)
        h_t_1 = o_t_1 * T.tanh(c_t_1)

        i_t_2 = T.nnet.sigmoid(
            T.dot(h_t_1, self.W_xi_2) + T.dot(h_tm2, self.W_hi_2)
            + T.dot(c_tm2, self.W_ci_2) + self.b_i_2)
        f_t_2 = T.nnet.sigmoid(
            T.dot(h_t_1, self.W_xf_2) + T.dot(h_tm2, self.W_hf_2)
            + T.dot(c_tm2, self.W_cf_2) + self.b_f_2)
        c_t_2 = f_t_2 * c_tm2 + i_t_2 * T.tanh(
            T.dot(h_t_1, self.W_xc_2) + T.dot(h_tm2, self.W_hc_2) + self.b_c_2)
        o_t_2 = T.nnet.sigmoid(
            T.dot(h_t_1, self.W_xo_2) + T.dot(h_tm2, self.W_ho_2)
            + T.dot(c_t_2, self.W_co_2) + self.b_o_2)
        h_t_2 = o_t_2 * T.tanh(c_t_2)
        y_t_2 = T.tanh(T.dot(h_t_2, self.W_hy_2) + self.b_y_2)
        return [h_t_1, c_t_1, h_t_2, c_t_2, y_t_2]

    X = T.tensor3()  # batch of sequences of vectors
    Y = T.tensor3()  # batch of target sequences
    h0_1 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # layer 1 hidden state
    c0_1 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # layer 1 cell state
    h0_2 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # layer 2 hidden state
    c0_2 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # layer 2 cell state

    # scan iterates over the first axis, so time is moved to the front.
    [h_vals_1, c_vals_1, h_vals_2, c_vals_2, y_vals], _ = theano.scan(
        fn=step_lstm,
        sequences=X.dimshuffle(1, 0, 2),
        outputs_info=[h0_1, c0_1, h0_2, c0_2, None])

    # Dense head on top of every time step.
    fc1_out = T.tanh(T.dot(y_vals, self.W_fc1) + self.b_fc1)
    fc2_out = T.tanh(T.dot(fc1_out, self.W_fc2) + self.b_fc2)
    fc3_out = T.tanh(T.dot(fc2_out, self.W_fc3) + self.b_fc3)
    self.output = fc3_out.dimshuffle(1, 0, 2)

    # get_err_fn receives self; presumably it reads self.output -- confirm.
    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y], outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    # Predict with the full network (dense head included).  This used to
    # return the pre-head LSTM read-out, which is not what self.output holds.
    self.predictions = theano.function(inputs=[X], outputs=self.output,
                                       allow_input_downcast=True)
    # NOTE(review): this closed form omits the W_c* matrices, the dense
    # head and the output bias; it is kept unchanged for compatibility.
    self.n_param = (n_lstm * n_lstm * 4 + n_in * n_lstm * 4
                    + n_lstm * n_out + n_lstm * 3) * 2
def __init__(self, rng, params, cost_function='mse', optimizer=RMSprop):
    """Build a CNN front-end followed by a three-layer LSTM stack.

    Every frame of every sequence goes through three conv+pool stages and
    one dense layer; the resulting 1024-d vectors are regrouped into
    sequences and fed to three stacked ``LSTMLayer`` objects, with dropout
    between the first and second.  Sets ``self.train``,
    ``self.predictions`` and ``self.n_param``.

    Args:
        rng: Random generator handed to the layers and used for the mask.
        params: Dict read for ``"lr"``, ``'n_hidden'``, ``'n_output'``,
            ``"batch_size"`` and ``"seq_length"``.
        cost_function: Name forwarded to ``get_err_fn``.
        optimizer: Class called as ``optimizer(cost, params, lr=lr)`` and
            queried with ``getUpdates()``.
    """
    lr = params["lr"]
    n_lstm = params['n_hidden']
    n_out = params['n_output']
    batch_size = params["batch_size"]
    sequence_length = params["seq_length"]

    X = T.tensor3()  # batch of sequences of vectors
    Y = T.tensor3()  # batch of target sequences
    is_train = T.iscalar('is_train')  # pseudo boolean: training vs. prediction

    # Settings shared by all convolutional stages.
    conv_stride = (1, 1)
    drop_prob = 0.5
    padding = "valid"
    pooling = (2, 2)
    n_frames = batch_size * sequence_length

    # Stage 1: conv + pool + dropout.  Frames are (samples, channels, rows, cols).
    frame_shape = (n_frames, 1, 120, 60)
    frames = X.reshape(frame_shape)
    c1 = ConvLayer(rng, frames, (64, 1, 9, 9), frame_shape, padding,
                   conv_stride, activation=nn.relu)
    p1 = PoolLayer(c1.output, pool_size=pooling, input_shape=c1.output_shape)
    dl1 = DropoutLayer(rng, input=p1.output, prob=drop_prob, is_train=is_train)
    # Outside training the pooled activations are scaled by the keep probability.
    scaled_p1 = p1.output * (1. - drop_prob)
    d1_output = T.switch(T.neq(is_train, 0), dl1.output, scaled_p1)

    # Stage 2: conv + pool.
    c2 = ConvLayer(rng, d1_output, (128, p1.output_shape[1], 3, 3),
                   p1.output_shape, padding, conv_stride, activation=nn.relu)
    p2 = PoolLayer(c2.output, pool_size=pooling, input_shape=c2.output_shape)

    # Stage 3: conv + pool.
    c3 = ConvLayer(rng, p2.output, (128, p2.output_shape[1], 3, 3),
                   p2.output_shape, padding, conv_stride, activation=nn.relu)
    p3 = PoolLayer(c3.output, pool_size=pooling, input_shape=c3.output_shape)

    # Stage 4: dense layer on the flattened feature maps.
    flat_size = reduce(lambda a, b: a * b, p3.output_shape[1:])
    h1 = HiddenLayer(rng, p3.output.flatten(2), flat_size, 1024,
                     activation=nn.relu)
    feature_size = 1024
    rnn_input = h1.output.reshape((batch_size, sequence_length, feature_size))

    # Stage 5: recurrent stack and linear read-out.
    self.n_in = feature_size
    self.n_lstm = n_lstm
    self.n_out = n_out
    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng, name='W_hy',
                            sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    layer1 = LSTMLayer(rng, 0, self.n_in, self.n_lstm)
    layer2 = LSTMLayer(rng, 1, self.n_lstm, self.n_lstm)
    layer3 = LSTMLayer(rng, 2, self.n_lstm, self.n_lstm)
    recurrent_params = (layer1.params + layer2.params + layer3.params
                        + [self.W_hy, self.b_y])

    def step_lstm(x_t, mask, h_prev_1, c_prev_1, h_prev_2, c_prev_2,
                  h_prev_3, c_prev_3):
        """Run one time step through the three layers and the read-out."""
        h_new_1, c_new_1, out_1 = layer1.run(x_t, h_prev_1, c_prev_1)
        dropped = DropoutLayer(rng, input=out_1, prob=0.5,
                               is_train=is_train, mask=mask)
        h_new_2, c_new_2, out_2 = layer2.run(dropped.output, h_prev_2, c_prev_2)
        h_new_3, c_new_3, out_3 = layer3.run(out_2, h_prev_3, c_prev_3)
        y = T.dot(out_3, self.W_hy) + self.b_y
        return [h_new_1, c_new_1, h_new_2, c_new_2, h_new_3, c_new_3, y]

    # Zero initial hidden/cell state for each of the three layers.
    initial_states = [
        shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))
        for _ in range(6)
    ]

    # One dropout mask per time step, consumed by scan as a second sequence.
    mask = rng.binomial(size=(sequence_length, batch_size, self.n_lstm),
                        p=0.5, dtype=X.dtype)

    # (batch, time, feature) -> (time, batch, feature): scan walks axis 0.
    scan_outputs, _ = theano.scan(
        fn=step_lstm,
        sequences=[rnn_input.dimshuffle(1, 0, 2), mask],
        outputs_info=initial_states + [None])
    y_vals = scan_outputs[-1]

    self.output = y_vals.dimshuffle(1, 0, 2)
    self.params = c1.params + c2.params + c3.params + h1.params + recurrent_params

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y, is_train], outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X, is_train],
                                       outputs=self.output,
                                       allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self, rng, params, cost_function='mse', optimizer=RMSprop):
    """Build a CNN front-end followed by one LSTM layer with external state.

    The initial hidden and cell states are symbolic inputs (``H``, ``C``)
    and the last states are returned by both compiled functions, so the
    caller can carry state from one call to the next.  An L2 penalty over
    all parameters is added to the cost.  Sets ``self.train``,
    ``self.predictions`` and ``self.n_param``.

    Args:
        rng: Random generator handed to the layers.
        params: Dict read for ``"lr"``, ``'n_hidden'``, ``'n_output'``,
            ``"batch_size"`` and ``"seq_length"``.
        cost_function: Name forwarded to ``get_err_fn``.
        optimizer: Class called as ``optimizer(cost, params, lr=lr)`` and
            queried with ``getUpdates()``.
    """
    lr = params["lr"]
    n_lstm = params['n_hidden']
    n_out = params['n_output']
    batch_size = params["batch_size"]
    sequence_length = params["seq_length"]

    X = T.tensor3()  # batch of sequences of vectors
    Y = T.tensor3()  # batch of target sequences
    is_train = T.iscalar('is_train')  # pseudo boolean: training vs. prediction

    # CNN global parameters.
    subsample = (1, 1)
    p_1 = 0.5
    border_mode = "valid"
    cnn_batch_size = batch_size * sequence_length
    pool_size = (2, 2)

    # Layer 1: conv + pool + dropout.
    filter_shape = (64, 1, 9, 9)
    input_shape = (cnn_batch_size, 1, 120, 60)  # (samples, channels, rows, cols)
    cnn_input = X.reshape(input_shape)
    c1 = ConvLayer(rng, cnn_input, filter_shape, input_shape, border_mode,
                   subsample, activation=nn.relu)
    p1 = PoolLayer(c1.output, pool_size=pool_size, input_shape=c1.output_shape)
    dl1 = DropoutLayer(rng, input=p1.output, prob=p_1, is_train=is_train)

    # Layer 2: conv + pool.
    filter_shape = (128, p1.output_shape[1], 3, 3)
    c2 = ConvLayer(rng, dl1.output, filter_shape, p1.output_shape,
                   border_mode, subsample, activation=nn.relu)
    p2 = PoolLayer(c2.output, pool_size=pool_size, input_shape=c2.output_shape)

    # Layer 3: conv + pool.
    filter_shape = (128, p2.output_shape[1], 3, 3)
    c3 = ConvLayer(rng, p2.output, filter_shape, p2.output_shape,
                   border_mode, subsample, activation=nn.relu)
    p3 = PoolLayer(c3.output, pool_size=pool_size, input_shape=c3.output_shape)

    # Layer 4: dense layer on the flattened feature maps.
    n_in = reduce(lambda x, y: x * y, p3.output_shape[1:])
    x_flat = p3.output.flatten(2)
    h1 = HiddenLayer(rng, x_flat, n_in, 1024, activation=nn.relu)
    n_in = 1024
    rnn_input = h1.output.reshape((batch_size, sequence_length, n_in))

    # Layer 5: LSTM and linear read-out.
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out
    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng, name='W_hy',
                            sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    layer1 = LSTMLayer(rng, 0, self.n_in, self.n_lstm)
    # Copy the list: appending to layer1.params directly would silently add
    # the read-out parameters to the layer object's own parameter list.
    self.params = list(layer1.params)
    self.params.append(self.W_hy)
    self.params.append(self.b_y)

    def step_lstm(x_t, h_tm1, c_tm1):
        """One time step: LSTM layer followed by the linear read-out."""
        [h_t, c_t, y_t] = layer1.run(x_t, h_tm1, c_tm1)
        y = T.dot(y_t, self.W_hy) + self.b_y
        return [h_t, c_t, y]

    H = T.matrix(name="H", dtype=dtype)  # initial hidden state
    C = T.matrix(name="C", dtype=dtype)  # initial cell state

    # (batch, time, feature) -> (time, batch, feature): scan walks axis 0.
    [h_t, c_t, y_vals], _ = theano.scan(
        fn=step_lstm,
        sequences=[rnn_input.dimshuffle(1, 0, 2)],
        outputs_info=[H, C, None])

    self.output = y_vals.dimshuffle(1, 0, 2)
    self.params = c1.params + c2.params + c3.params + h1.params + self.params

    cost = get_err_fn(self, cost_function, Y)
    # L2 penalty over every trainable parameter.
    L2_reg = 0.0001
    L2_sqr = theano.shared(0.)
    for param in self.params:
        L2_sqr += (T.sum(param ** 2))
    cost += L2_reg * L2_sqr

    _optimizer = optimizer(cost, self.params, lr=lr)

    # Both functions also return the final hidden and cell state.
    self.train = theano.function(inputs=[X, Y, is_train, H, C],
                                 outputs=[cost, h_t[-1], c_t[-1]],
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X, is_train, H, C],
                                       outputs=[self.output, h_t[-1], c_t[-1]],
                                       allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self, n_in, n_lstm, n_out, lr=0.05, batch_size=64,
             output_activation=theano.tensor.nnet.relu,
             cost_function='mse', optimizer=RMSprop):
    """Build a two-layer LSTM followed by three tanh dense layers.

    Both LSTM layers feed the cell state into their gates through the
    ``W_c*`` matrices.  Sets ``self.train`` and ``self.predictions``.

    Args:
        n_in: Size of each input vector.
        n_lstm: Number of units in each LSTM layer.
        n_out: Size of the LSTM read-out and of the final output.
        lr: Learning rate handed to ``optimizer``.
        batch_size: Number of rows of the zero initial states.
        output_activation: Unused; kept for interface compatibility.
        cost_function: Name forwarded to ``get_err_fn``.
        optimizer: Class called as ``optimizer(cost, params, lr=lr)`` and
            queried with ``getUpdates()``.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out
    self.n_fc1 = 256
    self.n_fc2 = 256
    self.n_fc3 = 256

    # Dense head.
    # NOTE(review): W_fc1 has n_fc1 (=256) rows but is multiplied with the
    # LSTM read-out whose last dimension is n_out, so the shapes only line
    # up when n_out == 256 -- confirm the intended sizes.
    self.W_fc1 = init_weight((self.n_fc1, self.n_fc2), 'W_fc1', 'glorot')
    self.b_fc1 = init_bias(self.n_fc2, sample='zero')
    self.W_fc2 = init_weight((self.n_fc2, self.n_fc3), 'W_fc2', 'glorot')
    self.b_fc2 = init_bias(self.n_fc3, sample='zero')
    self.W_fc3 = init_weight((self.n_fc3, self.n_out), 'w_fc3', 'glorot')
    self.b_fc3 = init_bias(self.n_out, sample='zero')

    # First LSTM layer.
    self.W_xi_1 = init_weight((self.n_in, self.n_lstm), 'W_xi_1', 'glorot')
    self.W_hi_1 = init_weight((self.n_lstm, self.n_lstm), 'W_hi_1', 'glorot')
    self.W_ci_1 = init_weight((self.n_lstm, self.n_lstm), 'W_ci_1', 'glorot')
    self.b_i_1 = init_bias(self.n_lstm, sample='zero')
    self.W_xf_1 = init_weight((self.n_in, self.n_lstm), 'W_xf_1', 'glorot')
    self.W_hf_1 = init_weight((self.n_lstm, self.n_lstm), 'W_hf_1', 'glorot')
    self.W_cf_1 = init_weight((self.n_lstm, self.n_lstm), 'W_cf_1', 'glorot')
    self.b_f_1 = init_bias(self.n_lstm, sample='one')
    self.W_xc_1 = init_weight((self.n_in, self.n_lstm), 'W_xc_1', 'glorot')
    self.W_hc_1 = init_weight((self.n_lstm, self.n_lstm), 'W_hc_1', 'glorot')
    self.b_c_1 = init_bias(self.n_lstm, sample='zero')
    self.W_xo_1 = init_weight((self.n_in, self.n_lstm), 'W_xo_1', 'glorot')
    self.W_ho_1 = init_weight((self.n_lstm, self.n_lstm), 'W_ho_1', 'glorot')
    self.W_co_1 = init_weight((self.n_lstm, self.n_lstm), 'W_co_1', 'glorot')
    self.b_o_1 = init_bias(self.n_lstm, sample='zero')

    # Second LSTM layer (its input is the first layer's hidden state).
    self.W_xi_2 = init_weight((self.n_lstm, self.n_lstm), 'W_xi_2', 'glorot')
    self.W_hi_2 = init_weight((self.n_lstm, self.n_lstm), 'W_hi_2', 'glorot')
    self.W_ci_2 = init_weight((self.n_lstm, self.n_lstm), 'W_ci_2', 'glorot')
    self.b_i_2 = init_bias(self.n_lstm, sample='zero')
    self.W_xf_2 = init_weight((self.n_lstm, self.n_lstm), 'W_xf_2', 'glorot')
    self.W_hf_2 = init_weight((self.n_lstm, self.n_lstm), 'W_hf_2', 'glorot')
    self.W_cf_2 = init_weight((self.n_lstm, self.n_lstm), 'W_cf_2', 'glorot')
    self.b_f_2 = init_bias(self.n_lstm, sample='one')
    self.W_xc_2 = init_weight((self.n_lstm, self.n_lstm), 'W_xc_2', 'glorot')
    self.W_hc_2 = init_weight((self.n_lstm, self.n_lstm), 'W_hc_2', 'glorot')
    self.b_c_2 = init_bias(self.n_lstm, sample='zero')
    self.W_xo_2 = init_weight((self.n_lstm, self.n_lstm), 'W_xo_2', 'glorot')
    self.W_ho_2 = init_weight((self.n_lstm, self.n_lstm), 'W_ho_2', 'glorot')
    self.W_co_2 = init_weight((self.n_lstm, self.n_lstm), 'W_co_2', 'glorot')
    self.b_o_2 = init_bias(self.n_lstm, sample='zero')
    self.W_hy_2 = init_weight((self.n_lstm, self.n_out), 'W_hy_2', 'glorot')
    self.b_y_2 = init_bias(self.n_out, sample='zero')

    self.params = [
        self.W_xi_1, self.W_hi_1, self.W_ci_1, self.b_i_1,
        self.W_xf_1, self.W_hf_1, self.W_cf_1, self.b_f_1,
        self.W_xc_1, self.W_hc_1, self.b_c_1,
        self.W_xo_1, self.W_ho_1, self.W_co_1, self.b_o_1,
        self.W_xi_2, self.W_hi_2, self.W_ci_2, self.b_i_2,
        self.W_xf_2, self.W_hf_2, self.W_cf_2, self.b_f_2,
        self.W_xc_2, self.W_hc_2, self.b_c_2,
        self.W_xo_2, self.W_ho_2, self.W_co_2, self.b_o_2,
        self.W_hy_2, self.b_y_2,
        self.W_fc1, self.b_fc1,
        self.W_fc2, self.b_fc2,
        self.W_fc3, self.b_fc3,
    ]

    def step_lstm(x_t, h_tm1, c_tm1, h_tm2, c_tm2):
        """One time step through both layers.

        ``h_tm1``/``c_tm1`` are the previous states of layer 1 and
        ``h_tm2``/``c_tm2`` those of layer 2.
        """
        i_t_1 = T.nnet.sigmoid(
            T.dot(x_t, self.W_xi_1) + T.dot(h_tm1, self.W_hi_1)
            + T.dot(c_tm1, self.W_ci_1) + self.b_i_1)
        f_t_1 = T.nnet.sigmoid(
            T.dot(x_t, self.W_xf_1) + T.dot(h_tm1, self.W_hf_1)
            + T.dot(c_tm1, self.W_cf_1) + self.b_f_1)
        c_t_1 = f_t_1 * c_tm1 + i_t_1 * T.tanh(
            T.dot(x_t, self.W_xc_1) + T.dot(h_tm1, self.W_hc_1) + self.b_c_1)
        o_t_1 = T.nnet.sigmoid(
            T.dot(x_t, self.W_xo_1) + T.dot(h_tm1, self.W_ho_1)
            + T.dot(c_t_1, self.W_co_1) + self.b_o_1)
        h_t_1 = o_t_1 * T.tanh(c_t_1)

        i_t_2 = T.nnet.sigmoid(
            T.dot(h_t_1, self.W_xi_2) + T.dot(h_tm2, self.W_hi_2)
            + T.dot(c_tm2, self.W_ci_2) + self.b_i_2)
        f_t_2 = T.nnet.sigmoid(
            T.dot(h_t_1, self.W_xf_2) + T.dot(h_tm2, self.W_hf_2)
            + T.dot(c_tm2, self.W_cf_2) + self.b_f_2)
        c_t_2 = f_t_2 * c_tm2 + i_t_2 * T.tanh(
            T.dot(h_t_1, self.W_xc_2) + T.dot(h_tm2, self.W_hc_2) + self.b_c_2)
        o_t_2 = T.nnet.sigmoid(
            T.dot(h_t_1, self.W_xo_2) + T.dot(h_tm2, self.W_ho_2)
            + T.dot(c_t_2, self.W_co_2) + self.b_o_2)
        h_t_2 = o_t_2 * T.tanh(c_t_2)
        y_t_2 = T.tanh(T.dot(h_t_2, self.W_hy_2) + self.b_y_2)
        return [h_t_1, c_t_1, h_t_2, c_t_2, y_t_2]

    X = T.tensor3()  # batch of sequences of vectors
    Y = T.tensor3()  # batch of target sequences
    h0_1 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # layer 1 hidden state
    c0_1 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # layer 1 cell state
    h0_2 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # layer 2 hidden state
    c0_2 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # layer 2 cell state

    # scan iterates over the first axis, so time is moved to the front.
    [h_vals_1, c_vals_1, h_vals_2, c_vals_2, y_vals], _ = theano.scan(
        fn=step_lstm,
        sequences=X.dimshuffle(1, 0, 2),
        outputs_info=[h0_1, c0_1, h0_2, c0_2, None])

    # Dense head on top of every time step.
    fc1_out = T.tanh(T.dot(y_vals, self.W_fc1) + self.b_fc1)
    fc2_out = T.tanh(T.dot(fc1_out, self.W_fc2) + self.b_fc2)
    fc3_out = T.tanh(T.dot(fc2_out, self.W_fc3) + self.b_fc3)
    self.output = fc3_out.dimshuffle(1, 0, 2)

    # get_err_fn receives self; presumably it reads self.output -- confirm.
    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X, Y], outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    # Predict with the full network (dense head included).  This used to
    # return the pre-head LSTM read-out, which is not what self.output holds.
    self.predictions = theano.function(inputs=[X], outputs=self.output,
                                       allow_input_downcast=True)
    # NOTE(review): this closed form omits the W_c* matrices, the dense
    # head and the output bias; it is kept unchanged for compatibility.
    self.n_param = (n_lstm * n_lstm * 4 + n_in * n_lstm * 4
                    + n_lstm * n_out + n_lstm * 3) * 2
def __init__(self, n_in, n_lstm, n_out, lr=0.05, batch_size=64,
             output_activation=theano.tensor.nnet.relu,
             cost_function='mse', optimizer=RMSprop):
    """Build a single-layer LSTM with a tanh read-out.

    Sets ``self.train``, ``self.predictions`` and ``self.n_param``.

    Args:
        n_in: Size of each input vector.
        n_lstm: Number of LSTM units.
        n_out: Size of each output vector.
        lr: Learning rate handed to ``optimizer``.
        batch_size: Number of rows of the zero initial states.
        output_activation: Unused; kept for interface compatibility.
        cost_function: ``'mse'`` or ``'cxe'``; any other value selects the
            hand-written negative log-likelihood.
        optimizer: Class called as ``optimizer(cost, params, lr=lr)`` and
            queried with ``getUpdates()``.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out

    # Input gate.
    self.W_xi = init_weight((self.n_in, self.n_lstm), 'W_xi', 'glorot')
    self.W_hi = init_weight((self.n_lstm, self.n_lstm), 'W_hi', 'ortho')
    self.b_i = init_bias(self.n_lstm, sample='zero')
    # Forget gate (bias initialised with the 'one' scheme).
    self.W_xf = init_weight((self.n_in, self.n_lstm), 'W_xf', 'glorot')
    self.W_hf = init_weight((self.n_lstm, self.n_lstm), 'W_hf', 'ortho')
    self.b_f = init_bias(self.n_lstm, sample='one')
    # Cell candidate.
    self.W_xc = init_weight((self.n_in, self.n_lstm), 'W_xc', 'glorot')
    self.W_hc = init_weight((self.n_lstm, self.n_lstm), 'W_hc', 'ortho')
    self.b_c = init_bias(self.n_lstm, sample='zero')
    # Output gate.
    self.W_xo = init_weight((self.n_in, self.n_lstm), 'W_xo', 'glorot')
    self.W_ho = init_weight((self.n_lstm, self.n_lstm), 'W_ho', 'ortho')
    self.b_o = init_bias(self.n_lstm, sample='zero')
    # Read-out.
    self.W_hy = init_weight((self.n_lstm, self.n_out), 'W_hy', 'glorot')
    self.b_y = init_bias(self.n_out, sample='zero')

    self.params = [
        self.W_xi, self.W_hi, self.b_i,
        self.W_xf, self.W_hf, self.b_f,
        self.W_xc, self.W_hc, self.b_c,
        self.W_xo, self.W_ho, self.b_o,
        self.W_hy, self.b_y,
    ]

    def step_lstm(x_t, h_tm1, c_tm1):
        """One time step: returns [hidden state, cell state, output]."""
        i_t = T.nnet.sigmoid(
            T.dot(x_t, self.W_xi) + T.dot(h_tm1, self.W_hi) + self.b_i)
        f_t = T.nnet.sigmoid(
            T.dot(x_t, self.W_xf) + T.dot(h_tm1, self.W_hf) + self.b_f)
        c_t = f_t * c_tm1 + i_t * T.tanh(
            T.dot(x_t, self.W_xc) + T.dot(h_tm1, self.W_hc) + self.b_c)
        o_t = T.nnet.sigmoid(
            T.dot(x_t, self.W_xo) + T.dot(h_tm1, self.W_ho) + self.b_o)
        h_t = o_t * T.tanh(c_t)
        y_t = T.tanh(T.dot(h_t, self.W_hy) + self.b_y)
        return [h_t, c_t, y_t]

    X = T.tensor3()  # batch of sequences of vectors
    Y = T.tensor3()  # batch of target sequences
    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # initial hidden state
    c0 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # initial cell state

    # scan iterates over the first axis, so time is moved to the front.
    [h_vals, c_vals, y_vals], _ = theano.scan(
        fn=step_lstm,
        sequences=X.dimshuffle(1, 0, 2),
        outputs_info=[h0, c0, None])

    self.output = y_vals.dimshuffle(1, 0, 2)

    if cost_function == 'mse':
        cost = T.mean((self.output - Y) ** 2)
    elif cost_function == 'cxe':
        cost = T.mean(T.nnet.binary_crossentropy(self.output, Y))
    else:
        cost = -T.mean(Y * T.log(self.output)
                       + (1. - Y) * T.log(1. - self.output))

    _optimizer = optimizer(cost, self.params, lr=lr)

    # The former nested reset() only rebound local names and was never
    # exposed or called, so it has been dropped.

    self.train = theano.function(inputs=[X, Y], outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X],
                                       outputs=y_vals.dimshuffle(1, 0, 2),
                                       allow_input_downcast=True)
    # Four gates, each with input weights, recurrent weights and a bias,
    # plus the read-out matrix and its bias.
    self.n_param = (4 * (n_in * n_lstm + n_lstm * n_lstm + n_lstm)
                    + n_lstm * n_out + n_out)
def __init__(self, n_in, n_lstm, n_out, lr=0.00001, batch_size=64,
             output_activation=theano.tensor.nnet.relu,
             cost_function='mse', optimizer=RMSprop):
    """Build a bidirectional GRU with a shared tanh read-out.

    Two independent GRUs process ``X_f`` and ``X_b``; their hidden
    sequences are combined per time step through ``W_hy_f``/``W_hy_b``.
    Sets ``self.train``, ``self.predictions`` and ``self.n_param``.

    Args:
        n_in: Size of each input vector.
        n_lstm: Number of hidden units in each direction.
        n_out: Size of each output vector.
        lr: Learning rate handed to ``optimizer``.
        batch_size: Number of rows of the zero initial hidden state.
        output_activation: Unused; kept for interface compatibility.
        cost_function: Name forwarded to ``get_err_fn``.
        optimizer: Class called as ``optimizer(cost, params, lr=lr)`` and
            queried with ``getUpdates()``.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out

    # Forward-direction GRU.
    self.W_xr_1 = init_weight((self.n_in, self.n_lstm), 'W_xr_1', 'glorot')
    self.W_hr_1 = init_weight((self.n_lstm, self.n_lstm), 'W_hr_1', 'ortho')
    self.b_r_1 = init_bias(self.n_lstm, sample='zero')
    self.W_xz_1 = init_weight((self.n_in, self.n_lstm), 'W_xz_1', 'glorot')
    self.W_hz_1 = init_weight((self.n_lstm, self.n_lstm), 'W_hz_1', 'ortho')
    self.b_z_1 = init_bias(self.n_lstm, sample='zero')
    self.W_xh_1 = init_weight((self.n_in, self.n_lstm), 'W_xh_1', 'glorot')
    self.W_hh_1 = init_weight((self.n_lstm, self.n_lstm), 'W_hh_1', 'ortho')
    self.b_h_1 = init_bias(self.n_lstm, sample='zero')

    # Backward-direction GRU.
    self.W_xr_2 = init_weight((self.n_in, self.n_lstm), 'W_xr', 'glorot')
    self.W_hr_2 = init_weight((self.n_lstm, self.n_lstm), 'W_hr', 'ortho')
    self.b_r_2 = init_bias(self.n_lstm, sample='zero')
    self.W_xz_2 = init_weight((self.n_in, self.n_lstm), 'W_xz', 'glorot')
    self.W_hz_2 = init_weight((self.n_lstm, self.n_lstm), 'W_hz', 'ortho')
    self.b_z_2 = init_bias(self.n_lstm, sample='zero')
    self.W_xh_2 = init_weight((self.n_in, self.n_lstm), 'W_xh', 'glorot')
    self.W_hh_2 = init_weight((self.n_lstm, self.n_lstm), 'W_hh', 'ortho')
    self.b_h_2 = init_bias(self.n_lstm, sample='zero')

    # Read-out: one projection per direction and a shared bias.  These
    # were referenced by the parameter list and the output expression but
    # never created, so construction failed with AttributeError.
    self.W_hy_f = init_weight((self.n_lstm, self.n_out), 'W_hy_f', 'glorot')
    self.W_hy_b = init_weight((self.n_lstm, self.n_out), 'W_hy_b', 'glorot')
    self.b_y = init_bias(self.n_out, sample='zero')

    self.params = [
        self.W_xr_1, self.W_hr_1, self.b_r_1,
        self.W_xz_1, self.W_hz_1, self.b_z_1,
        self.W_xh_1, self.W_hh_1, self.b_h_1,
        self.W_xr_2, self.W_hr_2, self.b_r_2,
        self.W_xz_2, self.W_hz_2, self.b_z_2,
        self.W_xh_2, self.W_hh_2, self.b_h_2,
        self.W_hy_f, self.W_hy_b, self.b_y,
    ]

    def f_step_lstm(x_t, h_tm1_1):
        """One forward-direction GRU step; returns the new hidden state."""
        r_t_1 = T.nnet.sigmoid(T.dot(x_t, self.W_xr_1)
                               + T.dot(h_tm1_1, self.W_hr_1) + self.b_r_1)
        z_t_1 = T.nnet.sigmoid(T.dot(x_t, self.W_xz_1)
                               + T.dot(h_tm1_1, self.W_hz_1) + self.b_z_1)
        h_t_1 = T.tanh(T.dot(x_t, self.W_xh_1)
                       + T.dot(r_t_1 * h_tm1_1, self.W_hh_1) + self.b_h_1)
        return z_t_1 * h_t_1 + (1 - z_t_1) * h_tm1_1

    def b_step_lstm(x_t, h_tm1_2):
        """One backward-direction GRU step; returns the new hidden state."""
        r_t_2 = T.nnet.sigmoid(T.dot(x_t, self.W_xr_2)
                               + T.dot(h_tm1_2, self.W_hr_2) + self.b_r_2)
        z_t_2 = T.nnet.sigmoid(T.dot(x_t, self.W_xz_2)
                               + T.dot(h_tm1_2, self.W_hz_2) + self.b_z_2)
        h_t_2 = T.tanh(T.dot(x_t, self.W_xh_2)
                       + T.dot(r_t_2 * h_tm1_2, self.W_hh_2) + self.b_h_2)
        return z_t_2 * h_t_2 + (1 - z_t_2) * h_tm1_2

    X_f = T.tensor3()  # batch of sequences, forward direction
    X_b = T.tensor3()  # batch of sequences, backward direction
    Y = T.tensor3()  # batch of target sequences
    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # initial hidden state

    # A GRU carries a single recurrent state, so scan gets exactly one
    # entry in outputs_info and returns a single variable.
    h_f, _ = theano.scan(fn=f_step_lstm,
                         sequences=X_f.dimshuffle(1, 0, 2),
                         outputs_info=[h0])
    h_b, _ = theano.scan(fn=b_step_lstm,
                         sequences=X_b.dimshuffle(1, 0, 2),
                         outputs_info=[h0])
    # Scan output is (time, batch, unit): flip the time axis so both
    # directions are aligned step by step.
    # NOTE(review): assumes the caller feeds X_b as the time-reversed
    # X_f -- confirm against the training loop.
    h_b = h_b[::-1]

    y_vals = T.tanh(T.dot(h_f, self.W_hy_f) + T.dot(h_b, self.W_hy_b)
                    + self.b_y)
    self.output = y_vals.dimshuffle(1, 0, 2)

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X_f, X_b, Y], outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X_f, X_b],
                                       outputs=y_vals.dimshuffle(1, 0, 2),
                                       allow_input_downcast=True)
    # Two directions of three gates each, two read-out matrices, one bias.
    self.n_param = (2 * 3 * (n_in * n_lstm + n_lstm * n_lstm + n_lstm)
                    + 2 * n_lstm * n_out + n_out)
def __init__(self, rng, params, cost_function='mse', optimizer=RMSprop):
    """Build a single LSTM layer over 2048-d inputs with external state.

    During training Gaussian noise is added to the input.  The initial
    hidden and cell states are symbolic inputs (``H``, ``C``) and the last
    states are returned by both compiled functions.  Sets ``self.train``,
    ``self.predictions`` and ``self.n_param``.

    Args:
        rng: Random generator handed to the layers and used for the noise.
        params: Dict read for ``'batch_size'``, ``"seq_length"``,
            ``'lr'``, ``'n_hidden'`` and ``'n_output'``.
        cost_function: Name forwarded to ``get_err_fn``.
        optimizer: Class called as ``optimizer(cost, params, lr=lr)`` and
            queried with ``getUpdates()``.
    """
    batch_size = params['batch_size']
    sequence_length = params["seq_length"]
    lr = params['lr']

    self.n_in = 2048
    self.n_lstm = params['n_hidden']
    self.n_out = params['n_output']

    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng, name='W_hy',
                            sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    layer1 = LSTMLayer(rng, 0, self.n_in, self.n_lstm)
    # Copy the list: appending to layer1.params directly would silently add
    # the read-out parameters to the layer object's own parameter list.
    self.params = list(layer1.params)
    self.params.append(self.W_hy)
    self.params.append(self.b_y)

    def step_lstm(x_t, h_tm1_1, c_tm1_1):
        """One time step: LSTM layer followed by the linear read-out."""
        [h_t_1, c_t_1, y_t_1] = layer1.run(x_t, h_tm1_1, c_tm1_1)
        y = T.dot(y_t_1, self.W_hy) + self.b_y
        return [h_t_1, c_t_1, y]

    X = T.tensor3()  # batch of sequences of vectors
    Y = T.tensor3()  # batch of target sequences
    is_train = T.iscalar('is_train')  # pseudo boolean: training vs. prediction
    H = T.matrix(name="H", dtype=dtype)  # initial hidden state
    C = T.matrix(name="C", dtype=dtype)  # initial cell state

    # Additive input noise, applied only when is_train is non-zero.
    noise = rng.normal(size=(batch_size, sequence_length, self.n_in),
                       std=0.0002, avg=0.0, dtype=theano.config.floatX)
    X_train = noise + X
    X_tilde = T.switch(T.neq(is_train, 0), X_train, X)

    # (batch, time, feature) -> (time, batch, feature): scan walks axis 0.
    [h_t_1, c_t_1, y_vals], _ = theano.scan(
        fn=step_lstm,
        sequences=[X_tilde.dimshuffle(1, 0, 2)],
        outputs_info=[H, C, None])

    self.output = y_vals.dimshuffle(1, 0, 2)

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    # Both functions also return the final hidden and cell state.
    self.train = theano.function(inputs=[X, Y, is_train, H, C],
                                 outputs=[cost, h_t_1[-1], c_t_1[-1]],
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(
        inputs=[X, is_train, H, C],
        outputs=[self.output, h_t_1[-1], c_t_1[-1]],
        allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self, rng, params, cost_function='mse', optimizer=RMSprop):
    """Build a CNN front-end followed by one LSTM layer with external state.

    The initial hidden and cell states are symbolic inputs (``H``, ``C``)
    and the last states are returned by both compiled functions, so the
    caller can carry state from one call to the next.  An L2 penalty over
    all parameters is added to the cost.  Sets ``self.train``,
    ``self.predictions`` and ``self.n_param``.

    Args:
        rng: Random generator handed to the layers.
        params: Dict read for ``"lr"``, ``'n_hidden'``, ``'n_output'``,
            ``"batch_size"`` and ``"seq_length"``.
        cost_function: Name forwarded to ``get_err_fn``.
        optimizer: Class called as ``optimizer(cost, params, lr=lr)`` and
            queried with ``getUpdates()``.
    """
    lr = params["lr"]
    n_lstm = params['n_hidden']
    n_out = params['n_output']
    batch_size = params["batch_size"]
    sequence_length = params["seq_length"]

    X = T.tensor3()  # batch of sequences of vectors
    Y = T.tensor3()  # batch of target sequences
    is_train = T.iscalar('is_train')  # pseudo boolean: training vs. prediction

    # CNN global parameters.
    subsample = (1, 1)
    p_1 = 0.5
    border_mode = "valid"
    cnn_batch_size = batch_size * sequence_length
    pool_size = (2, 2)

    # Layer 1: conv + pool + dropout.
    filter_shape = (64, 1, 9, 9)
    input_shape = (cnn_batch_size, 1, 120, 60)  # (samples, channels, rows, cols)
    cnn_input = X.reshape(input_shape)
    c1 = ConvLayer(rng, cnn_input, filter_shape, input_shape, border_mode,
                   subsample, activation=nn.relu)
    p1 = PoolLayer(c1.output, pool_size=pool_size, input_shape=c1.output_shape)
    dl1 = DropoutLayer(rng, input=p1.output, prob=p_1, is_train=is_train)

    # Layer 2: conv + pool.
    filter_shape = (128, p1.output_shape[1], 3, 3)
    c2 = ConvLayer(rng, dl1.output, filter_shape, p1.output_shape,
                   border_mode, subsample, activation=nn.relu)
    p2 = PoolLayer(c2.output, pool_size=pool_size, input_shape=c2.output_shape)

    # Layer 3: conv + pool.
    filter_shape = (128, p2.output_shape[1], 3, 3)
    c3 = ConvLayer(rng, p2.output, filter_shape, p2.output_shape,
                   border_mode, subsample, activation=nn.relu)
    p3 = PoolLayer(c3.output, pool_size=pool_size, input_shape=c3.output_shape)

    # Layer 4: dense layer on the flattened feature maps.
    n_in = reduce(lambda x, y: x * y, p3.output_shape[1:])
    x_flat = p3.output.flatten(2)
    h1 = HiddenLayer(rng, x_flat, n_in, 1024, activation=nn.relu)
    n_in = 1024
    rnn_input = h1.output.reshape((batch_size, sequence_length, n_in))

    # Layer 5: LSTM and linear read-out.
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out
    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng, name='W_hy',
                            sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    layer1 = LSTMLayer(rng, 0, self.n_in, self.n_lstm)
    # Copy the list: appending to layer1.params directly would silently add
    # the read-out parameters to the layer object's own parameter list.
    self.params = list(layer1.params)
    self.params.append(self.W_hy)
    self.params.append(self.b_y)

    def step_lstm(x_t, h_tm1, c_tm1):
        """One time step: LSTM layer followed by the linear read-out."""
        [h_t, c_t, y_t] = layer1.run(x_t, h_tm1, c_tm1)
        y = T.dot(y_t, self.W_hy) + self.b_y
        return [h_t, c_t, y]

    H = T.matrix(name="H", dtype=dtype)  # initial hidden state
    C = T.matrix(name="C", dtype=dtype)  # initial cell state

    # dimshuffle(1, 0, 2) turns (batch_size, sequence_length, n_in) into
    # (sequence_length, batch_size, n_in) so that scan walks over time.
    [h_t, c_t, y_vals], _ = theano.scan(
        fn=step_lstm,
        sequences=[rnn_input.dimshuffle(1, 0, 2)],
        outputs_info=[H, C, None])

    self.output = y_vals.dimshuffle(1, 0, 2)
    self.params = c1.params + c2.params + c3.params + h1.params + self.params

    cost = get_err_fn(self, cost_function, Y)
    # L2 penalty over every trainable parameter.
    L2_reg = 0.0001
    L2_sqr = theano.shared(0.)
    for param in self.params:
        L2_sqr += (T.sum(param ** 2))
    cost += L2_reg * L2_sqr

    _optimizer = optimizer(cost, self.params, lr=lr)

    # Both functions also return the final hidden and cell state.
    self.train = theano.function(inputs=[X, Y, is_train, H, C],
                                 outputs=[cost, h_t[-1], c_t[-1]],
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X, is_train, H, C],
                                       outputs=[self.output, h_t[-1], c_t[-1]],
                                       allow_input_downcast=True)
    self.n_param = count_params(self.params)
def __init__(self, rng, params, cost_function="mse", optimizer=RMSprop):
    """Build a stacked convolutional-LSTM graph and compile train/predict functions.

    Each time step runs pool -> CLSTM -> pool -> dropout -> CLSTM -> pool ->
    CLSTM -> pool; the per-step outputs are flattened and passed through a
    dense layer and a ``LogisticRegression`` layer.

    Args:
        rng: Random generator forwarded to the layers / initialisers and
            used to sample the dropout mask.
        params: Dict providing "lr", "n_hidden", "n_output", "batch_size"
            and "seq_length".
        cost_function: Cost identifier forwarded to ``get_err_fn``.
        optimizer: Callable invoked as ``optimizer(cost, params, lr=lr)``;
            the result must expose ``getUpdates()``.

    Sets ``self.output``, ``self.params``, ``self.train``,
    ``self.predictions`` and ``self.n_param``.
    """
    # `reduce` is a builtin on Python 2 only; this import works on 2 and 3.
    from functools import reduce

    lr = params["lr"]
    # Read but not used below (the output layer size is hard-coded to 42);
    # the lookups are kept so a missing key still raises KeyError.
    n_lstm = params["n_hidden"]
    n_out = params["n_output"]
    batch_size = params["batch_size"]
    sequence_length = params["seq_length"]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    # pseudo boolean for switching between training and prediction
    is_train = T.iscalar("is_train")

    # CNN global parameters.
    subsample = (1, 1)
    p_1 = 0.5
    border_mode = "same"
    pool_size = (2, 2)

    # Kernel shapes per CLSTM layer: "filter_shape_i" is applied to the layer
    # input, "s_filter_shape_i" to the recurrent state.
    f_dict = dict()
    in_channels = 1
    for idx, (n_filters, kernel) in enumerate([(64, 9), (128, 3), (128, 3)]):
        f_dict["filter_shape_" + str(idx)] = (n_filters, in_channels, kernel, kernel)
        f_dict["s_filter_shape_" + str(idx)] = (n_filters, n_filters, kernel, kernel)
        in_channels = n_filters

    # Split X into frames and go time-major so scan iterates over time.
    input_shape = (batch_size, sequence_length, 1, 120, 60)
    frames = X.reshape(input_shape).dimshuffle(1, 0, 2, 3, 4)
    input_shape = (batch_size, 1, 120, 60)  # (samples, channels, rows, cols)

    # Walk the layer plan once to derive the spatial size seen by every
    # CLSTM layer ("l") after each pooling step ("p"); "d" (dropout) does
    # not change the shape.
    s_dict = dict()
    layer_list = ["p", "l", "p", "d", "l", "p", "l", "p"]
    rows = input_shape[2]
    cols = input_shape[3]
    counter = 0
    outputs_info = []
    for layer in layer_list:
        if layer == "l":
            s_index = str(counter)
            if counter == 0:
                pre_nfilter = input_shape[1]
            else:
                pre_nfilter = f_dict["filter_shape_" + str(counter - 1)][0]
            i_shape = (batch_size, pre_nfilter, rows, cols)
            s_shape = (batch_size, f_dict["s_filter_shape_" + s_index][0], rows, cols)
            s_dict["i_shape_" + s_index] = i_shape
            s_dict["s_shape_" + s_index] = s_shape
            # Zero initial hidden and cell state for this layer.
            outputs_info.append(shared(np.zeros(shape=s_shape, dtype=dtype)))
            outputs_info.append(shared(np.zeros(shape=s_shape, dtype=dtype)))
            counter += 1
        if layer == "p":
            # Floor division keeps the sizes integral on Python 3 as well.
            rows = rows // 2
            cols = cols // 2

    # NOTE(review): `pre_nfilter` is the *input* channel count of the last
    # CLSTM layer; it equals that layer's filter count (128) only for the
    # shapes configured above — confirm before changing the filter table.
    s_dict["final_shape"] = (batch_size, sequence_length, pre_nfilter, rows, cols)
    outputs_info.append(None)  # slot for the per-step network output
    outputs_info = tuple(outputs_info)

    p_dict = dict()

    def add_weight(prefix, s_index, shape):
        # Register one glorot-initialised kernel under "<prefix><layer index>".
        name = prefix + s_index
        p_dict[name] = u.init_weight(shape, rng=rng, name=name, sample="glorot")

    def add_bias(prefix, s_index, **kwargs):
        # Register one bias with one entry per filter of the layer.
        name = prefix + s_index
        p_dict[name] = u.init_bias(
            f_dict["filter_shape_" + s_index][0], rng=rng, name=name, **kwargs)

    # The creation order below is kept exactly as before so that a seeded
    # `rng` yields the same initial values.
    for index in range(counter):
        s_index = str(index)
        x_shape = f_dict["filter_shape_" + s_index]
        h_shape = f_dict["s_filter_shape_" + s_index]
        # input gate
        add_weight("W_xi_", s_index, x_shape)
        add_weight("W_hi_", s_index, h_shape)
        add_weight("W_ci_", s_index, h_shape)
        add_bias("b_i_", s_index)
        # forget gate
        add_weight("W_xf_", s_index, x_shape)
        add_weight("W_hf_", s_index, h_shape)
        add_weight("W_cf_", s_index, h_shape)
        add_bias("b_f_", s_index, sample="one")
        # cell candidate
        add_weight("W_xc_", s_index, x_shape)
        add_weight("W_hc_", s_index, h_shape)
        add_bias("b_c_", s_index)
        # output gate
        add_weight("W_xo_", s_index, x_shape)
        add_weight("W_ho_", s_index, h_shape)
        add_weight("W_co_", s_index, h_shape)
        add_bias("b_o_", s_index)

    def step_lstm(x_t, mask, h_tm1_1, c_tm1_1, h_tm1_2, c_tm1_2, h_tm1_3, c_tm1_3):
        # NOTE(review): `i_shape_0` is the shape *after* this first pooling,
        # not the shape of x_t — confirm how PoolLayer uses `input_shape`.
        p1 = PoolLayer(x_t, pool_size=pool_size, input_shape=s_dict["i_shape_0"])
        layer_1 = CLSTMLayer(rng, 0, p_dict, f_dict, s_dict, p1.output,
                             h_tm1_1, c_tm1_1, border_mode, subsample)
        [h_t_1, c_t_1, y_t_1] = layer_1.output

        p2 = PoolLayer(y_t_1, pool_size=pool_size, input_shape=layer_1.yt_shape)
        dl1 = DropoutLayer(rng, input=p2.output, prob=p_1, is_train=is_train, mask=mask)
        layer_2 = CLSTMLayer(rng, 1, p_dict, f_dict, s_dict, dl1.output,
                             h_tm1_2, c_tm1_2, border_mode, subsample)
        [h_t_2, c_t_2, y_t_2] = layer_2.output

        # Pool layer 2's output using layer 2's own output shape (this used
        # to pass layer_1.yt_shape, unlike the other two pooling calls).
        p3 = PoolLayer(y_t_2, pool_size=pool_size, input_shape=layer_2.yt_shape)
        layer_3 = CLSTMLayer(rng, 2, p_dict, f_dict, s_dict, p3.output,
                             h_tm1_3, c_tm1_3, border_mode, subsample)
        [h_t_3, c_t_3, y_t_3] = layer_3.output

        p4 = PoolLayer(y_t_3, pool_size=pool_size, input_shape=layer_3.yt_shape)
        return [h_t_1, c_t_1, h_t_2, c_t_2, h_t_3, c_t_3, p4.output]

    # One dropout mask per time step, shaped like the input of CLSTM layer 1
    # (i.e. the tensor the dropout layer is applied to).
    mask_shape_1 = (sequence_length,) + tuple(s_dict["i_shape_1"])
    mask = rng.binomial(size=mask_shape_1, p=p_1, dtype=frames.dtype)

    [h_t_1, c_t_1, h_t_2, c_t_2, h_t_3, c_t_3, y_vals], _ = theano.scan(
        fn=step_lstm, outputs_info=outputs_info, sequences=[frames, mask]
    )

    # Back to batch-major, then flatten everything after (batch, time).
    hidden_input = y_vals.dimshuffle(1, 0, 2, 3, 4)
    n_in = reduce(lambda x, y: x * y, s_dict["final_shape"][2:])
    x_flat = hidden_input.flatten(3)
    h1 = HiddenLayer(rng, x_flat, n_in, 1024, activation=nn.relu)
    n_in = 1024
    lreg = LogisticRegression(rng, h1.output, n_in, 42)
    self.output = lreg.y_pred

    # list(...) because dict.values() is a view without append() on Python 3.
    self.params = list(p_dict.values())
    self.params.extend([h1.params[0], h1.params[1], lreg.params[0], lreg.params[1]])

    cost = get_err_fn(self, cost_function, Y)
    # L2 penalty over every trainable parameter.
    L2_reg = 0.0001
    L2_sqr = theano.shared(0.0)
    for param in self.params:
        L2_sqr += T.sum(param ** 2)
    cost += L2_reg * L2_sqr

    _optimizer = optimizer(cost, self.params, lr=lr)
    self.train = theano.function(
        inputs=[X, Y, is_train], outputs=cost, updates=_optimizer.getUpdates(),
        allow_input_downcast=True
    )
    self.predictions = theano.function(
        inputs=[X, is_train], outputs=self.output, allow_input_downcast=True
    )
    self.n_param = count_params(self.params)
def __init__(self, n_in, n_lstm, n_out, lr=0.05, batch_size=64,
             output_activation=theano.tensor.nnet.relu,
             cost_function='mse', optimizer=RMSprop):
    """Build a bidirectional LSTM graph and compile its train/predict functions.

    Two independent LSTMs are scanned over ``X_f`` and ``X_b``; their hidden
    states are combined per time step through a tanh read-out.

    Args:
        n_in: Size of each input vector.
        n_lstm: Number of hidden units in each direction.
        n_out: Size of each output vector.
        lr: Learning rate forwarded to ``optimizer``.
        batch_size: Fixed batch size; it determines the shape of the shared
            zero initial states.
        output_activation: Accepted for interface compatibility; not used.
        cost_function: Cost identifier forwarded to ``get_err_fn``.
        optimizer: Callable invoked as ``optimizer(cost, params, lr=lr)``;
            the result must expose ``getUpdates()``.

    Sets ``self.output``, ``self.params``, ``self.train``,
    ``self.predictions`` and ``self.n_param``.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out

    # Forward weights
    self.W_xi_f = init_weight((self.n_in, self.n_lstm), 'W_xif', 'glorot')
    self.W_hi_f = init_weight((self.n_lstm, self.n_lstm), 'W_hif', 'ortho')
    self.b_i_f = init_bias(self.n_lstm, sample='zero')
    self.W_xf_f = init_weight((self.n_in, self.n_lstm), 'W_xf', 'glorot')
    self.W_hf_f = init_weight((self.n_lstm, self.n_lstm), 'W_hf', 'ortho')
    self.b_f_f = init_bias(self.n_lstm, sample='one')
    self.W_xc_f = init_weight((self.n_in, self.n_lstm), 'W_xcf', 'glorot')
    self.W_hc_f = init_weight((self.n_lstm, self.n_lstm), 'W_hcf', 'ortho')
    self.b_c_f = init_bias(self.n_lstm, sample='zero')
    self.W_xo_f = init_weight((self.n_in, self.n_lstm), 'W_xof', 'glorot')
    self.W_ho_f = init_weight((self.n_lstm, self.n_lstm), 'W_hof', 'ortho')
    self.b_o_f = init_bias(self.n_lstm, sample='zero')
    self.W_hy_f = init_weight((self.n_lstm, self.n_out), 'W_hyf', 'glorot')

    # Backward weights
    self.W_xi_b = init_weight((self.n_in, self.n_lstm), 'W_xib', 'glorot')
    self.W_hi_b = init_weight((self.n_lstm, self.n_lstm), 'W_hib', 'ortho')
    self.b_i_b = init_bias(self.n_lstm, sample='zero')
    self.W_xf_b = init_weight((self.n_in, self.n_lstm), 'W_xfb', 'glorot')
    self.W_hf_b = init_weight((self.n_lstm, self.n_lstm), 'W_hfb', 'ortho')
    self.b_f_b = init_bias(self.n_lstm, sample='one')
    self.W_xc_b = init_weight((self.n_in, self.n_lstm), 'W_xcb', 'glorot')
    self.W_hc_b = init_weight((self.n_lstm, self.n_lstm), 'W_hcb', 'ortho')
    self.b_c_b = init_bias(self.n_lstm, sample='zero')
    self.W_xo_b = init_weight((self.n_in, self.n_lstm), 'W_xob', 'glorot')
    self.W_ho_b = init_weight((self.n_lstm, self.n_lstm), 'W_hob', 'ortho')
    self.b_o_b = init_bias(self.n_lstm, sample='zero')
    self.W_hy_b = init_weight((self.n_lstm, self.n_out), 'W_hyb', 'glorot')

    self.b_y = init_bias(self.n_out, sample='zero')

    # Gate parameters per direction, in the order expected by make_step.
    forward_gates = [
        self.W_xi_f, self.W_hi_f, self.b_i_f,
        self.W_xf_f, self.W_hf_f, self.b_f_f,
        self.W_xc_f, self.W_hc_f, self.b_c_f,
        self.W_xo_f, self.W_ho_f, self.b_o_f,
    ]
    backward_gates = [
        self.W_xi_b, self.W_hi_b, self.b_i_b,
        self.W_xf_b, self.W_hf_b, self.b_f_b,
        self.W_xc_b, self.W_hc_b, self.b_c_b,
        self.W_xo_b, self.W_ho_b, self.b_o_b,
    ]
    self.params = (forward_gates + [self.W_hy_f]
                   + backward_gates + [self.W_hy_b, self.b_y])

    def make_step(W_xi, W_hi, b_i, W_xf, W_hf, b_f,
                  W_xc, W_hc, b_c, W_xo, W_ho, b_o):
        # Return the scan step of one LSTM direction bound to its weights.
        def step(x_t, h_tm1, c_tm1):
            i_t = T.nnet.sigmoid(T.dot(x_t, W_xi) + T.dot(h_tm1, W_hi) + b_i)
            f_t = T.nnet.sigmoid(T.dot(x_t, W_xf) + T.dot(h_tm1, W_hf) + b_f)
            c_t = f_t * c_tm1 + i_t * T.tanh(
                T.dot(x_t, W_xc) + T.dot(h_tm1, W_hc) + b_c)
            o_t = T.nnet.sigmoid(T.dot(x_t, W_xo) + T.dot(h_tm1, W_ho) + b_o)
            h_t = o_t * T.tanh(c_t)
            return [h_t, c_t]
        return step

    X_f = T.tensor3()  # batch of sequence of vector (forward direction)
    # presumably the time-reversed copy of X_f — confirm against the caller
    X_b = T.tensor3()
    Y = T.tensor3()  # batch of sequence of vector

    h0 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # initial hidden state
    c0 = shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))  # initial cell state

    # scan iterates over the leading axis, so both inputs go time-major:
    # (batch, time, n_in) -> (time, batch, n_in).
    [h_f, _c_f], _ = theano.scan(fn=make_step(*forward_gates),
                                 sequences=X_f.dimshuffle(1, 0, 2),
                                 outputs_info=[h0, c0])
    [h_b, _c_b], _ = theano.scan(fn=make_step(*backward_gates),
                                 sequences=X_b.dimshuffle(1, 0, 2),
                                 outputs_info=[h0, c0])

    # h_b is (time, batch, n_lstm): realign it with h_f by reversing the
    # time axis (axis 0). The previous `h_b[:, ::-1]` reversed the batch
    # axis instead, pairing forward and backward states of different samples.
    h_b = h_b[::-1]

    y_vals = T.tanh(
        T.dot(h_f, self.W_hy_f) + T.dot(h_b, self.W_hy_b) + self.b_y)
    # Back to batch-major for the caller.
    self.output = y_vals.dimshuffle(1, 0, 2)

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X_f, X_b, Y],
                                 outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X_f, X_b],
                                       outputs=self.output,
                                       allow_input_downcast=True)
    # NOTE(review): formula kept unchanged for callers; it does not appear
    # to account for both directions or for all biases — confirm.
    self.n_param = n_lstm * n_lstm * 4 + n_in * n_lstm * 4 + n_lstm * n_out + n_lstm * 3
def __init__(self, n_in, n_lstm, n_out, lr=0.05, batch_size=64,
             output_activation=theano.tensor.nnet.relu,
             cost_function='mse', optimizer=RMSprop):
    """Construct a bidirectional LSTM and compile train/predict functions.

    A forward LSTM is scanned over ``X_f`` and a backward LSTM over ``X_b``;
    the two hidden-state sequences are summed through separate read-out
    matrices and squashed with tanh.

    Args:
        n_in: Size of each input vector.
        n_lstm: Number of hidden units in each direction.
        n_out: Size of each output vector.
        lr: Learning rate forwarded to ``optimizer``.
        batch_size: Fixed batch size; it determines the shape of the shared
            zero initial states.
        output_activation: Accepted for interface compatibility; not used.
        cost_function: Cost identifier forwarded to ``get_err_fn``.
        optimizer: Callable invoked as ``optimizer(cost, params, lr=lr)``;
            the result must expose ``getUpdates()``.

    Sets ``self.output``, ``self.params``, ``self.train``,
    ``self.predictions`` and ``self.n_param``.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out

    in_shape = (self.n_in, self.n_lstm)      # input -> hidden matrices
    rec_shape = (self.n_lstm, self.n_lstm)   # hidden -> hidden matrices
    out_shape = (self.n_lstm, self.n_out)    # hidden -> output matrices

    # Forward weights
    self.W_xi_f = init_weight(in_shape, 'W_xif', 'glorot')
    self.W_hi_f = init_weight(rec_shape, 'W_hif', 'ortho')
    self.b_i_f = init_bias(self.n_lstm, sample='zero')
    self.W_xf_f = init_weight(in_shape, 'W_xf', 'glorot')
    self.W_hf_f = init_weight(rec_shape, 'W_hf', 'ortho')
    self.b_f_f = init_bias(self.n_lstm, sample='one')
    self.W_xc_f = init_weight(in_shape, 'W_xcf', 'glorot')
    self.W_hc_f = init_weight(rec_shape, 'W_hcf', 'ortho')
    self.b_c_f = init_bias(self.n_lstm, sample='zero')
    self.W_xo_f = init_weight(in_shape, 'W_xof', 'glorot')
    self.W_ho_f = init_weight(rec_shape, 'W_hof', 'ortho')
    self.b_o_f = init_bias(self.n_lstm, sample='zero')
    self.W_hy_f = init_weight(out_shape, 'W_hyf', 'glorot')

    # Backward weights
    self.W_xi_b = init_weight(in_shape, 'W_xib', 'glorot')
    self.W_hi_b = init_weight(rec_shape, 'W_hib', 'ortho')
    self.b_i_b = init_bias(self.n_lstm, sample='zero')
    self.W_xf_b = init_weight(in_shape, 'W_xfb', 'glorot')
    self.W_hf_b = init_weight(rec_shape, 'W_hfb', 'ortho')
    self.b_f_b = init_bias(self.n_lstm, sample='one')
    self.W_xc_b = init_weight(in_shape, 'W_xcb', 'glorot')
    self.W_hc_b = init_weight(rec_shape, 'W_hcb', 'ortho')
    self.b_c_b = init_bias(self.n_lstm, sample='zero')
    self.W_xo_b = init_weight(in_shape, 'W_xob', 'glorot')
    self.W_ho_b = init_weight(rec_shape, 'W_hob', 'ortho')
    self.b_o_b = init_bias(self.n_lstm, sample='zero')
    self.W_hy_b = init_weight(out_shape, 'W_hyb', 'glorot')

    self.b_y = init_bias(self.n_out, sample='zero')

    # Gate parameters of each direction, ordered (i, f, c, o) x (W_x, W_h, b).
    gates_f = [self.W_xi_f, self.W_hi_f, self.b_i_f,
               self.W_xf_f, self.W_hf_f, self.b_f_f,
               self.W_xc_f, self.W_hc_f, self.b_c_f,
               self.W_xo_f, self.W_ho_f, self.b_o_f]
    gates_b = [self.W_xi_b, self.W_hi_b, self.b_i_b,
               self.W_xf_b, self.W_hf_b, self.b_f_b,
               self.W_xc_b, self.W_hc_b, self.b_c_b,
               self.W_xo_b, self.W_ho_b, self.b_o_b]
    self.params = gates_f + [self.W_hy_f] + gates_b + [self.W_hy_b, self.b_y]

    def lstm_step_for(W_xi, W_hi, b_i, W_xf, W_hf, b_f,
                      W_xc, W_hc, b_c, W_xo, W_ho, b_o):
        # Build the scan step of a single LSTM direction from its weights.
        def step(x_t, h_tm1, c_tm1):
            i_t = T.nnet.sigmoid(T.dot(x_t, W_xi) + T.dot(h_tm1, W_hi) + b_i)
            f_t = T.nnet.sigmoid(T.dot(x_t, W_xf) + T.dot(h_tm1, W_hf) + b_f)
            c_t = f_t * c_tm1 + i_t * T.tanh(T.dot(x_t, W_xc) + T.dot(h_tm1, W_hc) + b_c)
            o_t = T.nnet.sigmoid(T.dot(x_t, W_xo) + T.dot(h_tm1, W_ho) + b_o)
            h_t = o_t * T.tanh(c_t)
            return [h_t, c_t]
        return step

    X_f = T.tensor3()  # batch of sequence of vector (forward direction)
    # presumably the time-reversed copy of X_f — confirm against the caller
    X_b = T.tensor3()
    Y = T.tensor3()  # batch of sequence of vector

    state_shape = (batch_size, self.n_lstm)
    h0 = shared(np.zeros(shape=state_shape, dtype=dtype))  # initial hidden state
    c0 = shared(np.zeros(shape=state_shape, dtype=dtype))  # initial cell state

    # scan walks the leading axis, hence the switch to time-major layout:
    # (batch, time, n_in) -> (time, batch, n_in).
    [h_f, _c_f], _ = theano.scan(fn=lstm_step_for(*gates_f),
                                 sequences=X_f.dimshuffle(1, 0, 2),
                                 outputs_info=[h0, c0])
    [h_b, _c_b], _ = theano.scan(fn=lstm_step_for(*gates_b),
                                 sequences=X_b.dimshuffle(1, 0, 2),
                                 outputs_info=[h0, c0])

    # The scan output is (time, batch, n_lstm). Reverse the time axis
    # (axis 0) so backward states line up with forward ones; the previous
    # `h_b[:, ::-1]` flipped the batch axis and mixed up samples.
    h_b = h_b[::-1]

    y_vals = T.tanh(T.dot(h_f, self.W_hy_f) + T.dot(h_b, self.W_hy_b) + self.b_y)
    # Back to batch-major for the caller.
    self.output = y_vals.dimshuffle(1, 0, 2)

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(inputs=[X_f, X_b, Y], outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X_f, X_b], outputs=self.output,
                                       allow_input_downcast=True)
    # NOTE(review): formula kept unchanged for callers; it does not appear
    # to account for both directions or for all biases — confirm.
    self.n_param = n_lstm * n_lstm * 4 + n_in * n_lstm * 4 + n_lstm * n_out + n_lstm * 3
def __init__(self, n_in, n_lstm, n_out, lr=0.05, batch_size=64,
             single_output=True,
             output_activation=theano.tensor.nnet.relu,
             cost_function='mse', optimizer=RMSprop):
    """Build a GRU followed by three tanh dense layers and compile it.

    The recurrent part uses reset (r) and update (z) gates; its per-step
    read-out is passed through three fully connected tanh layers.

    Args:
        n_in: Size of each input vector.
        n_lstm: Number of recurrent hidden units.
        n_out: Size of each output vector.
        lr: Learning rate forwarded to ``optimizer``.
        batch_size: Fixed batch size; it determines the shape of the shared
            zero initial state.
        single_output: Accepted but not used.
        output_activation: Accepted but not used.
        cost_function: Cost identifier forwarded to ``get_err_fn``.
        optimizer: Callable invoked as ``optimizer(cost, params, lr=lr)``;
            the result must expose ``getUpdates()``.

    Sets ``self.output``, ``self.params``, ``self.train``,
    ``self.predictions`` and ``self.n_param``.
    """
    self.n_in = n_in
    self.n_lstm = n_lstm
    self.n_out = n_out
    self.n_fc1 = 256
    self.n_fc2 = 256
    self.n_fc3 = 256

    # Fully connected head: n_fc1 -> n_fc2 -> n_fc3 -> n_out.
    self.W_fc1 = init_weight((self.n_fc1, self.n_fc2), 'W_fc1', 'glorot')
    self.b_fc1 = init_bias(self.n_fc2, sample='zero')
    self.W_fc2 = init_weight((self.n_fc2, self.n_fc3), 'W_fc2', 'glorot')
    self.b_fc2 = init_bias(self.n_fc3, sample='zero')
    self.W_fc3 = init_weight((self.n_fc3, self.n_out), 'w_fc3', 'glorot')
    self.b_fc3 = init_bias(self.n_out, sample='zero')

    # Reset gate.
    self.W_xr = init_weight((self.n_in, self.n_lstm), 'W_xr', 'glorot')
    self.W_hr = init_weight((self.n_lstm, self.n_lstm), 'W_hr', 'ortho')
    self.b_r = init_bias(self.n_lstm, sample='zero')
    # Update gate.
    self.W_xz = init_weight((self.n_in, self.n_lstm), 'W_xz', 'glorot')
    self.W_hz = init_weight((self.n_lstm, self.n_lstm), 'W_hz', 'ortho')
    self.b_z = init_bias(self.n_lstm, sample='zero')
    # Candidate state.
    self.W_xh = init_weight((self.n_in, self.n_lstm), 'W_xh', 'glorot')
    self.W_hh = init_weight((self.n_lstm, self.n_lstm), 'W_hh', 'ortho')
    self.b_h = init_bias(self.n_lstm, sample='zero')
    # Recurrent read-out feeding the first dense layer.
    self.W_hy = init_weight((self.n_lstm, self.n_fc1), 'W_hy', 'glorot')
    self.b_y = init_bias(self.n_fc1, sample='zero')

    recurrent_params = [
        self.W_xr, self.W_hr, self.b_r,
        self.W_xz, self.W_hz, self.b_z,
        self.W_xh, self.W_hh, self.b_h,
        self.W_hy, self.b_y,
    ]
    dense_params = [
        self.W_fc1, self.b_fc1,
        self.W_fc2, self.b_fc2,
        self.W_fc3, self.b_fc3,
    ]
    self.params = recurrent_params + dense_params

    def gru_step(x_t, h_prev):
        reset = T.nnet.sigmoid(
            T.dot(x_t, self.W_xr) + T.dot(h_prev, self.W_hr) + self.b_r)
        update = T.nnet.sigmoid(
            T.dot(x_t, self.W_xz) + T.dot(h_prev, self.W_hz) + self.b_z)
        candidate = T.tanh(
            T.dot(x_t, self.W_xh) + T.dot((reset * h_prev), self.W_hh) + self.b_h)
        # Blend the candidate with the previous state.
        h_new = update * candidate + (1 - update) * h_prev
        readout = T.tanh(T.dot(h_new, self.W_hy) + self.b_y)
        return [h_new, readout]

    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector

    initial_state = shared(
        np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))

    # scan iterates over the leading axis, so X is made time-major first.
    [h_vals, y_vals], _ = theano.scan(fn=gru_step,
                                      sequences=X.dimshuffle(1, 0, 2),
                                      outputs_info=[initial_state, None])

    # Dense head applied to the read-out of every time step.
    fc1_out = T.tanh(T.dot(y_vals, self.W_fc1) + self.b_fc1)
    fc2_out = T.tanh(T.dot(fc1_out, self.W_fc2) + self.b_fc2)
    fc3_out = T.tanh(T.dot(fc2_out, self.W_fc3) + self.b_fc3)

    # Back to batch-major for the caller.
    self.output = fc3_out.dimshuffle(1, 0, 2)

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=lr)

    self.train = theano.function(
        inputs=[X, Y],
        outputs=cost,
        updates=_optimizer.getUpdates(),
        allow_input_downcast=True,
    )
    self.predictions = theano.function(
        inputs=[X],
        outputs=self.output,
        allow_input_downcast=True,
    )
    # NOTE(review): this formula looks carried over from an LSTM model and
    # may not match the parameters created above — confirm before relying on it.
    self.n_param = n_lstm * n_lstm * 4 + n_in * n_lstm * 4 + n_lstm * n_out + n_lstm * 3