def __init__(self, rng, n_in, n_out, n_h, f_act=leaky_relu, f_out=softmax, orth_init=True, dropout_rate=0, obj='c'):
    '''
    :param rng: Numpy RandomState
    :param n_in: Input dimension (int)
    :param n_out: Output dimension (int)
    :param n_h: Hidden dimension (int)
    :param f_act: Hidden-to-hidden activation function
    :param f_out: Output activation function
    :param orth_init: if True, initialize the transition matrix to be orthogonal (bool)
    :param dropout_rate: dropout rate (float)
    :param obj: objective type, 'c' for classification with cross-entropy loss,
                'r' for regression with MSE loss (['c', 'r'])
    '''
    if orth_init:
        Whh_ = rvs(rng, n_h)  # random orthogonal transition matrix
    else:
        Whh_ = rng.uniform(-np.sqrt(6. / (n_h + n_h)), np.sqrt(6. / (n_h + n_h)), (n_h, n_h))

    Whi_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)), np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
    bh_ = np.zeros(n_h)
    Woh_ = rng.uniform(-np.sqrt(6. / (n_out + n_h)), np.sqrt(6. / (n_h + n_out)), (n_out, n_h))
    bo_ = np.zeros(n_out)
    h0_ = rng.uniform(-np.sqrt(3. / (2. * n_h)), np.sqrt(3. / (2. * n_h)), n_h)

    # Theano: create shared variables
    Whh = theano.shared(name='Whh', value=Whh_.astype(theano.config.floatX))
    Whi = theano.shared(name='Whi', value=Whi_.astype(theano.config.floatX))
    bh = theano.shared(name='bh', value=bh_.astype(theano.config.floatX))
    Woh = theano.shared(name='Woh', value=Woh_.astype(theano.config.floatX))
    bo = theano.shared(name='bo', value=bo_.astype(theano.config.floatX))
    h0 = theano.shared(name='h0', value=h0_.astype(theano.config.floatX))

    self.p = [Whh, Whi, Woh, bh, bo, h0]

    seq_len = T.iscalar('seq_len')
    self.seq_len = seq_len
    self.dropout_rate = dropout_rate
    self.x = T.vector()
    x_scan = T.reshape(self.x, [seq_len, n_in], ndim=2)

    if dropout_rate > 0:
        np.random.seed(int(time.time()))

        # for training
        def masked_forward_prop_step(x_t, h_t_prev):
            h_t = f_act(Whi.dot(x_t) + Whh.dot(h_t_prev) + bh)
            o_t = Woh.dot(h_t) + bo
            # The numpy mask is drawn once, when scan traces this step function,
            # so the same mask is reused at every timestep of the compiled graph.
            mask = np.random.binomial(np.ones(n_h, dtype=int), 1 - dropout_rate)
            masked_h_t = h_t * T.cast(mask, theano.config.floatX)
            return [o_t, masked_h_t]

        # for testing
        def forward_prop_step(x_t, h_t_prev):
            h_t = f_act(Whi.dot(x_t) + Whh.dot(h_t_prev) + bh)
            o_t = Woh.dot(h_t) + bo
            h_t = (1.0 - dropout_rate) * h_t
            return [o_t, h_t]

        [o_train, _], _ = theano.scan(masked_forward_prop_step,
                                      sequences=[x_scan],
                                      outputs_info=[None, h0],
                                      n_steps=seq_len)
        [o_test, _], _ = theano.scan(forward_prop_step,
                                     sequences=[x_scan],
                                     outputs_info=[None, h0],
                                     n_steps=seq_len)
    else:
        def forward_prop_step(x_t, h_t_prev):
            h_t = f_act(Whi.dot(x_t) + Whh.dot(h_t_prev) + bh)
            o_t = Woh.dot(h_t) + bo
            return [o_t, h_t]

        [o_train, _], _ = theano.scan(forward_prop_step,
                                      sequences=[x_scan],
                                      outputs_info=[None, h0],
                                      n_steps=seq_len)
        o_test = o_train

    if obj == 'c':  # classification task
        self.y = T.bscalar('y')
        self.o_train = f_out(o_train[-1])
        self.o_test = f_out(o_test[-1])
        # cost (used to compute gradients) is taken on the dropout/training output
        self.cost = T.nnet.categorical_crossentropy(self.o_train, T.eye(n_out)[self.y])
        # accuracy and prediction use the rescaled test-time output
        self.accuracy = T.switch(T.eq(T.argmax(self.o_test), self.y), 1., 0.)
        self.prediction = T.argmax(self.o_test)
    elif obj == 'r':  # regression task
        self.y = T.dscalar('y')
        self.o_train = o_train[-1]
        self.o_test = o_test[-1]
        # cost (used to compute gradients) is taken on the dropout/training output
        self.cost = (self.o_train[0] - self.y)**2
        # accuracy and prediction use the rescaled test-time output
        self.accuracy = (self.o_test[0] - self.y)**2
        self.prediction = self.o_test[0]

    # Track the singular values of the transition matrix Whh
    _, self.Sigma, _ = T.nlinalg.SVD(full_matrices=1, compute_uv=1)(self.p[0])
    self.max_singular = T.max(self.Sigma)
    self.min_singular = T.min(self.Sigma)

    self.optimiser = sgd_optimizer(self, 'RNN')
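
# Usage sketch (illustrative only; the enclosing class name `RNN` and the toy
# dimensions are assumptions, inferred from the 'RNN' tag passed to
# sgd_optimizer). A sequence is fed as a flat vector plus its length, matching
# the T.reshape(self.x, [seq_len, n_in]) above:
#
#   rng = np.random.RandomState(1234)
#   model = RNN(rng, n_in=1, n_out=10, n_h=64, dropout_rate=0.0, obj='c')
#   predict = theano.function([model.x, model.seq_len], model.prediction,
#                             allow_input_downcast=True)
#   seq = np.random.randn(20).astype(theano.config.floatX)  # 20 steps, n_in = 1
#   print(predict(seq, 20))
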
def __init__(self, rng, n_in, n_out, n_h, n_layers, f_act=leaky_relu, obj='single', dropout_rate=0):
    '''
    :param rng: Numpy RandomState
    :param n_in: Input dimension (int)
    :param n_out: Output dimension (int)
    :param n_h: Hidden dimension (int)
    :param n_layers: Number of hidden layers (int)
    :param f_act: Hidden-to-hidden activation function
    :param obj: 'single' for single-label classification (softmax output),
                'multi' for multi-label classification (sigmoid output)
    :param dropout_rate: dropout rate (float)
    '''
    if obj == 'single':
        f_out = softmax
    elif obj == 'multi':
        f_out = sigmoid

    self.x = T.vector()

    # construct hidden layers
    assert n_layers >= 1
    first_hiddenLayer = HiddenLayer(
        rng=rng,
        input=self.x,
        predict_input=self.x,
        n_in=n_in,
        n_out=n_h,
        activation=f_act,
        dropout_rate=dropout_rate,
        nametag='0'
    )
    self.hidden_layers = [first_hiddenLayer]
    self.p = first_hiddenLayer.params[:]

    for i in range(n_layers - 1):
        cur_hiddenLayer = ResNetLayer(
            rng=rng,
            input=self.hidden_layers[-1].output,
            predict_input=self.hidden_layers[-1].predict_output,
            n_h=n_h,
            activation=f_act,
            dropout_rate=dropout_rate,
            nametag=str(i + 1)
        )
        self.hidden_layers.append(cur_hiddenLayer)
        self.p.extend(cur_hiddenLayer.params[:])

    # output layer and its params
    self.outputLayer = HiddenLayer(
        rng=rng,
        input=self.hidden_layers[-1].output,
        predict_input=self.hidden_layers[-1].predict_output,
        n_in=n_h,
        n_out=n_out,
        activation=f_out,
        dropout_rate=0,
        nametag='o'
    )
    self.p.extend(self.outputLayer.params[:])
    self.n_layers = n_layers + 1
    self.obj = obj

    if obj == 'single':
        self.y = T.bscalar('y')
        self.o = self.outputLayer.output
        self.cost = T.nnet.categorical_crossentropy(self.o, T.eye(n_out)[self.y])
        self.accuracy = T.switch(T.eq(T.argmax(self.o), self.y), 1., 0.)
        self.prediction = T.argmax(self.o)
    elif obj == 'multi':
        self.y = T.bvector('y')
        self.o = self.outputLayer.output
        self.cost = T.nnet.binary_crossentropy(self.o, self.y).mean()
        self.prediction = T.argsort(self.o)
        # top-1 / top-3 / top-5 accuracy: fraction of the k highest-scoring
        # outputs that are true labels
        self.accuracy = self.y[T.argmax(self.o)]
        self.accuracy3 = (1.0 / 3.0) * (self.y[self.prediction[-3]] + self.y[self.prediction[-2]]
                                        + self.y[self.prediction[-1]])
        self.accuracy5 = (1.0 / 5.0) * (self.y[self.prediction[-5]] + self.y[self.prediction[-4]]
                                        + self.y[self.prediction[-3]] + self.y[self.prediction[-2]]
                                        + self.y[self.prediction[-1]])

    self.optimiser = sgd_optimizer(self, 'ResNet')
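
# Usage sketch (illustrative only; the enclosing class name `ResNet` and the toy
# dimensions are assumptions, inferred from the 'ResNet' tag passed to
# sgd_optimizer). With obj='multi', y is a binary label vector and
# accuracy3/accuracy5 measure how many of the top-3/top-5 scoring outputs are
# true labels:
#
#   rng = np.random.RandomState(1234)
#   model = ResNet(rng, n_in=784, n_out=20, n_h=128, n_layers=3, obj='multi')
#   acc3 = theano.function([model.x, model.y], model.accuracy3,
#                          allow_input_downcast=True)
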
def __init__(self, rng, n_in, n_out, n_h, n_r, f_act=leaky_relu, f_out=softmax, obj='c'):
    '''
    :param rng: Numpy RandomState
    :param n_in: Input dimension (int)
    :param n_out: Output dimension (int)
    :param n_h: Hidden dimension (int)
    :param n_r: Number of reflection vectors (int)
    :param f_act: Hidden-to-hidden activation function
    :param f_out: Output activation function
    :param obj: objective type, 'c' for classification with cross-entropy loss,
                'r' for regression with MSE loss (['c', 'r'])
    '''
    # Reflection vectors: lower-triangular with unit-norm columns
    U_ = np.tril(rng.normal(0, 0.01, (n_h, n_r)))
    norms = np.linalg.norm(U_, axis=0)
    U_ = 1. / norms * U_

    Whi_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)), np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
    bh_ = np.zeros(n_h)
    Woh_ = rng.uniform(-np.sqrt(6. / (n_out + n_h)), np.sqrt(6. / (n_h + n_out)), (n_out, n_h))
    bo_ = np.zeros(n_out)
    h0_ = rng.uniform(-np.sqrt(3. / (2. * n_h)), np.sqrt(3. / (2. * n_h)), n_h)

    # Theano: create shared variables
    Whi = theano.shared(name='Whi', value=Whi_.astype(theano.config.floatX))
    U = theano.shared(name='U', value=U_.astype(theano.config.floatX))
    bh = theano.shared(name='bh', value=bh_.astype(theano.config.floatX))
    Woh = theano.shared(name='Woh', value=Woh_.astype(theano.config.floatX))
    bo = theano.shared(name='bo', value=bo_.astype(theano.config.floatX))
    h0 = theano.shared(name='h0', value=h0_.astype(theano.config.floatX))

    self.p = [U, Whi, Woh, bh, bo, h0]

    seq_len = T.iscalar('seq_len')
    self.seq_len = seq_len
    self.x = T.vector()
    #x_scan = T.shape_padright(self.x)
    x_scan = T.reshape(self.x, [seq_len, n_in], ndim=2)

    if n_h != n_r:
        # Number of reflection vectors is less than the hidden dimension
        def forward_prop_step(x_t, h_t_prev):
            h_t = f_act(Whi.dot(x_t) + H_wy(U, h_t_prev) + bh)
            o_t = Woh.dot(h_t) + bo
            return [o_t, h_t]
    else:
        def forward_prop_step(x_t, h_t_prev):
            h_t_prev = T.set_subtensor(h_t_prev[-1], h_t_prev[-1] * U[-1, -1])
            h_t = f_act(Whi.dot(x_t) + H_wy(U[:, :-1], h_t_prev) + bh)
            o_t = Woh.dot(h_t) + bo
            return [o_t, h_t]

    ## For-loop version (when n_r < n_h):
    # def forward_prop_step(x_t, h_t_prev):
    #     Wh = h_t_prev
    #     for i in range(n_r):
    #         Wh -= 2. * U[:, n_r - i - 1] * T.dot(U[:, n_r - i - 1], Wh)
    #     h_t = f_act(Whi.dot(x_t) + Wh + bh)
    #     o_t = Woh.dot(h_t) + bo
    #     return [o_t, h_t]

    [o_scan, _], _ = theano.scan(forward_prop_step,
                                 sequences=[x_scan],
                                 outputs_info=[None, h0],
                                 n_steps=seq_len)

    if obj == 'c':  # classification task
        self.y = T.bscalar('y')
        self.o = f_out(o_scan[-1])
        # cross-entropy cost on the final output (used to compute gradients)
        self.cost = T.nnet.categorical_crossentropy(self.o, T.eye(n_out)[self.y])
        self.accuracy = T.switch(T.eq(T.argmax(self.o), self.y), 1., 0.)
        self.prediction = T.argmax(self.o)
    elif obj == 'r':  # regression task
        self.y = T.dscalar('y')
        self.o = o_scan[-1]
        # squared-error cost on the final output (used to compute gradients)
        self.cost = (self.o[0] - self.y)**2
        self.accuracy = (self.o[0] - self.y)**2
        self.prediction = self.o[0]

    self.optimiser = sgd_optimizer(self, 'oRNN')
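
# Note on the transition operator (illustrative sketch; H_wy is defined
# elsewhere in this repository). Each unit-norm column u_i of U parameterises a
# Householder reflection I - 2*u_i*u_i^T, and H_wy(U, h) applies their product
# to h, so the implicit hidden-to-hidden matrix is orthogonal by construction.
# A quick numpy check of the equivalent explicit product:
#
#   W = np.eye(n_h)
#   for i in range(n_r):
#       u = U_[:, i]
#       W = W.dot(np.eye(n_h) - 2.0 * np.outer(u, u))
#   assert np.allclose(W.T.dot(W), np.eye(n_h), atol=1e-6)
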
def __init__(self, rng, n_in, n_out, n_h, n_r, margin=1.0, sig_mean=1.0, f_act=leaky_relu, f_out=softmax, obj='c'):
    '''
    :param rng: Numpy RandomState
    :param n_in: Input dimension (int)
    :param n_out: Output dimension (int)
    :param n_h: Hidden dimension (int)
    :param n_r: Number of reflection vectors (int)
    :param margin: half-width of the allowed singular-value range (float)
    :param sig_mean: centre of the allowed singular-value range (float)
    :param f_act: Hidden-to-hidden activation function
    :param f_out: Output activation function
    :param obj: objective type, 'c' for classification with cross-entropy loss,
                'r' for regression with MSE loss (['c', 'r'])
    '''
    # Left and right reflection vectors: lower-triangular with unit-norm columns
    U_ = np.tril(rng.normal(0, 0.01, (n_h, n_r)))
    norms_U_ = np.linalg.norm(U_, axis=0)
    U_ = 1. / norms_U_ * U_
    V_ = np.tril(rng.normal(0, 0.01, (n_h, n_r)))
    norms_V_ = np.linalg.norm(V_, axis=0)
    V_ = 1. / norms_V_ * V_
    #Sig_ = np.ones(n_h)
    P_ = np.zeros(n_h)  # pre-sigmoid parameters of the singular values

    Whi_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)), np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
    bh_ = np.zeros(n_h)
    Woh_ = rng.uniform(-np.sqrt(6. / (n_out + n_h)), np.sqrt(6. / (n_h + n_out)), (n_out, n_h))
    bo_ = np.zeros(n_out)
    h0_ = rng.uniform(-np.sqrt(3. / (2. * n_h)), np.sqrt(3. / (2. * n_h)), n_h)

    # Theano: create shared variables
    Whi = theano.shared(name='Whi', value=Whi_.astype(theano.config.floatX))
    U = theano.shared(name='U', value=U_.astype(theano.config.floatX))
    V = theano.shared(name='V', value=V_.astype(theano.config.floatX))
    #Sig = theano.shared(name='Sig', value=Sig_.astype(theano.config.floatX))
    P = theano.shared(name='P', value=P_.astype(theano.config.floatX))
    bh = theano.shared(name='bh', value=bh_.astype(theano.config.floatX))
    Woh = theano.shared(name='Woh', value=Woh_.astype(theano.config.floatX))
    bo = theano.shared(name='bo', value=bo_.astype(theano.config.floatX))
    h0 = theano.shared(name='h0', value=h0_.astype(theano.config.floatX))

    #self.p = [U, V, Sig, Whi, Woh, bh, bo, h0]
    self.p = [U, V, P, Whi, Woh, bh, bo, h0]

    seq_len = T.iscalar('seq_len')
    self.seq_len = seq_len
    self.x = T.vector()
    #x_scan = T.shape_padright(self.x)
    x_scan = T.reshape(self.x, [seq_len, n_in], ndim=2)

    if n_h != n_r:
        # Number of reflection vectors is less than the hidden dimension
        def forward_prop_step(x_t, h_t_prev):
            Sig = 2 * margin * (sigmoid(P) - 0.5) + sig_mean
            h_t = f_act(Whi.dot(x_t) + svd_H_wy(U, V, Sig, h_t_prev) + bh)
            o_t = Woh.dot(h_t) + bo
            return [o_t, h_t]
    else:
        def forward_prop_step(x_t, h_t_prev):
            Sig = 2 * margin * (sigmoid(P) - 0.5) + sig_mean
            Hu1SigHv1 = T.set_subtensor(Sig[-1], Sig[-1] * U[-1, -1] * V[-1, -1])
            h_t = f_act(Whi.dot(x_t)
                        + svd_H_wy(U[:, :-1], V[:, :-1], Hu1SigHv1, h_t_prev) + bh)
            o_t = Woh.dot(h_t) + bo
            return [o_t, h_t]

    [o_scan, _], _ = theano.scan(forward_prop_step,
                                 sequences=[x_scan],
                                 outputs_info=[None, h0],
                                 n_steps=seq_len)

    if obj == 'c':  # classification task
        self.y = T.bscalar('y')
        self.o = f_out(o_scan[-1])
        # cross-entropy cost on the final output (used to compute gradients)
        self.cost = T.nnet.categorical_crossentropy(self.o, T.eye(n_out)[self.y])
        self.accuracy = T.switch(T.eq(T.argmax(self.o), self.y), 1., 0.)
        self.prediction = T.argmax(self.o)
    elif obj == 'r':  # regression task
        self.y = T.dscalar('y')
        self.o = o_scan[-1]
        # squared-error cost on the final output (used to compute gradients)
        self.cost = (self.o[0] - self.y)**2
        self.accuracy = (self.o[0] - self.y)**2
        self.prediction = self.o[0]

    self.max_singular = 2 * margin * (sigmoid(T.max(self.p[2])) - 0.5) + sig_mean
    self.min_singular = 2 * margin * (sigmoid(T.min(self.p[2])) - 0.5) + sig_mean

    self.optimiser = sgd_optimizer(self, 'svdRNN')
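
# Usage sketch (illustrative only; the enclosing class name `svdRNN` and the toy
# dimensions are assumptions, inferred from the 'svdRNN' tag passed to
# sgd_optimizer). Sig = 2*margin*(sigmoid(P) - 0.5) + sig_mean keeps every
# singular value of the transition operator inside
# (sig_mean - margin, sig_mean + margin), i.e. (0, 2) with the defaults:
#
#   model = svdRNN(np.random.RandomState(0), n_in=1, n_out=10, n_h=64, n_r=16)
#   sv_range = theano.function([], [model.min_singular, model.max_singular])
#   print(sv_range())  # both close to sig_mean = 1.0 at initialisation
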
def __init__(self, rng, n_in, n_out, n_h, dropout=0, sigma_g=sigmoid, sigma_c=hyperbolic_tangent,
             sigma_h=hyperbolic_tangent, sigma_y=softmax, dropout_rate=0, obj='c'):
    '''
    :param rng: Numpy RandomState
    :param n_in: Input dimension (int)
    :param n_out: Output dimension (int)
    :param n_h: Hidden dimension (int)
    :param sigma_g, sigma_c, sigma_h, sigma_y: activation functions
    :param dropout_rate: dropout rate (float)
    :param obj: objective type, 'c' for classification with cross-entropy loss,
                'r' for regression with MSE loss (['c', 'r'])
    '''
    # Gate parameters: W* act on the input, U* on the previous hidden state
    Wf_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)), np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
    Uf_ = rng.uniform(-np.sqrt(6. / (n_h + n_h)), np.sqrt(6. / (n_h + n_h)), (n_h, n_h))
    bf_ = np.zeros(n_h)
    Wi_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)), np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
    Ui_ = rng.uniform(-np.sqrt(6. / (n_h + n_h)), np.sqrt(6. / (n_h + n_h)), (n_h, n_h))
    bi_ = np.zeros(n_h)
    Wo_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)), np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
    Uo_ = rng.uniform(-np.sqrt(6. / (n_h + n_h)), np.sqrt(6. / (n_h + n_h)), (n_h, n_h))
    bo_ = np.zeros(n_h)
    Wc_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)), np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
    Uc_ = rng.uniform(-np.sqrt(6. / (n_h + n_h)), np.sqrt(6. / (n_h + n_h)), (n_h, n_h))
    bc_ = np.zeros(n_h)
    Wy_ = rng.uniform(-np.sqrt(6. / (n_out + n_h)), np.sqrt(6. / (n_out + n_h)), (n_out, n_h))
    by_ = np.zeros(n_out)
    h0_ = rng.uniform(-np.sqrt(3. / (2. * n_h)), np.sqrt(3. / (2. * n_h)), n_h)
    c0_ = rng.uniform(-np.sqrt(3. / (2. * n_h)), np.sqrt(3. / (2. * n_h)), n_h)

    # Theano: create shared variables
    Wf = theano.shared(name='Wf', value=Wf_.astype(theano.config.floatX))
    Uf = theano.shared(name='Uf', value=Uf_.astype(theano.config.floatX))
    bf = theano.shared(name='bf', value=bf_.astype(theano.config.floatX))
    Wi = theano.shared(name='Wi', value=Wi_.astype(theano.config.floatX))
    Ui = theano.shared(name='Ui', value=Ui_.astype(theano.config.floatX))
    bi = theano.shared(name='bi', value=bi_.astype(theano.config.floatX))
    Wo = theano.shared(name='Wo', value=Wo_.astype(theano.config.floatX))
    Uo = theano.shared(name='Uo', value=Uo_.astype(theano.config.floatX))
    bo = theano.shared(name='bo', value=bo_.astype(theano.config.floatX))
    Wc = theano.shared(name='Wc', value=Wc_.astype(theano.config.floatX))
    Uc = theano.shared(name='Uc', value=Uc_.astype(theano.config.floatX))
    bc = theano.shared(name='bc', value=bc_.astype(theano.config.floatX))
    Wy = theano.shared(name='Wy', value=Wy_.astype(theano.config.floatX))
    by = theano.shared(name='by', value=by_.astype(theano.config.floatX))
    h0 = theano.shared(name='h0', value=h0_.astype(theano.config.floatX))
    c0 = theano.shared(name='c0', value=c0_.astype(theano.config.floatX))

    self.p = [Wf, Uf, bf, Wi, Ui, bi, Wo, Uo, bo, Wc, Uc, bc, Wy, by, c0, h0]

    seq_len = T.iscalar('seq_len')
    self.seq_len = seq_len
    self.x = T.vector()
    x_scan = T.reshape(self.x, [seq_len, n_in], ndim=2)

    if dropout_rate > 0:
        np.random.seed(int(time.time()))

        # for training
        def masked_forward_prop_step(x_t, h_t_prev, c_t_prev):
            f_t = sigma_g(Wf.dot(x_t) + Uf.dot(h_t_prev) + bf)
            i_t = sigma_g(Wi.dot(x_t) + Ui.dot(h_t_prev) + bi)
            o_t = sigma_g(Wo.dot(x_t) + Uo.dot(h_t_prev) + bo)
            c_t = i_t * sigma_c(Wc.dot(x_t) + Uc.dot(h_t_prev) + bc)
            c_t += c_t_prev * f_t
            h_t = o_t * sigma_h(c_t)
            y_t = Wy.dot(h_t) + by
            # The numpy mask is drawn once, when scan traces this step function,
            # so the same mask is reused at every timestep of the compiled graph.
            mask = np.random.binomial(np.ones(n_h, dtype=int), 1.0 - dropout_rate)
            masked_h_t = h_t * T.cast(mask, theano.config.floatX)
            return [y_t, masked_h_t, c_t]

        # for testing
        def forward_prop_step(x_t, h_t_prev, c_t_prev):
            f_t = sigma_g(Wf.dot(x_t) + Uf.dot(h_t_prev) + bf)
            i_t = sigma_g(Wi.dot(x_t) + Ui.dot(h_t_prev) + bi)
            o_t = sigma_g(Wo.dot(x_t) + Uo.dot(h_t_prev) + bo)
            c_t = i_t * sigma_c(Wc.dot(x_t) + Uc.dot(h_t_prev) + bc)
            c_t += c_t_prev * f_t
            h_t = o_t * sigma_h(c_t)
            h_t = (1.0 - dropout_rate) * h_t
            y_t = Wy.dot(h_t) + by
            return [y_t, h_t, c_t]

        [o_train, _, _], _ = theano.scan(masked_forward_prop_step,
                                         sequences=[x_scan],
                                         outputs_info=[None, h0, c0],
                                         n_steps=seq_len)
        [o_test, _, _], _ = theano.scan(forward_prop_step,
                                        sequences=[x_scan],
                                        outputs_info=[None, h0, c0],
                                        n_steps=seq_len)
    else:
        def forward_prop_step(x_t, h_t_prev, c_t_prev):
            f_t = sigma_g(Wf.dot(x_t) + Uf.dot(h_t_prev) + bf)
            i_t = sigma_g(Wi.dot(x_t) + Ui.dot(h_t_prev) + bi)
            o_t = sigma_g(Wo.dot(x_t) + Uo.dot(h_t_prev) + bo)
            c_t = i_t * sigma_c(Wc.dot(x_t) + Uc.dot(h_t_prev) + bc)
            c_t += c_t_prev * f_t
            h_t = o_t * sigma_h(c_t)
            y_t = Wy.dot(h_t) + by
            return [y_t, h_t, c_t]

        [o_train, _, _], _ = theano.scan(forward_prop_step,
                                         sequences=[x_scan],
                                         outputs_info=[None, h0, c0],
                                         n_steps=seq_len)
        o_test = o_train

    if obj == 'c':  # classification task
        self.y = T.bscalar('y')
        self.o_train = sigma_y(o_train[-1])
        self.o_test = sigma_y(o_test[-1])
        # cost (used to compute gradients) is taken on the dropout/training output
        self.cost = T.nnet.categorical_crossentropy(self.o_train, T.eye(n_out)[self.y])
        # accuracy and prediction use the rescaled test-time output
        self.accuracy = T.switch(T.eq(T.argmax(self.o_test), self.y), 1., 0.)
        self.prediction = T.argmax(self.o_test)
    elif obj == 'r':  # regression task
        self.y = T.dscalar('y')
        self.o_train = o_train[-1]
        self.o_test = o_test[-1]
        # cost (used to compute gradients) is taken on the dropout/training output
        self.cost = (self.o_train[0] - self.y)**2
        # accuracy and prediction use the rescaled test-time output
        self.accuracy = (self.o_test[0] - self.y)**2
        self.prediction = self.o_test[0]

    self.optimiser = sgd_optimizer(self, 'LSTM')
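
# Usage sketch (illustrative only; the enclosing class name `LSTM` and the toy
# dimensions are assumptions, inferred from the 'LSTM' tag passed to
# sgd_optimizer). The cost is defined on a single sequence, so a gradient
# function takes the flat input vector, its length, and the label:
#
#   model = LSTM(np.random.RandomState(0), n_in=1, n_out=10, n_h=64, obj='c')
#   grad_fn = theano.function([model.x, model.y, model.seq_len],
#                             T.grad(model.cost, model.p),
#                             allow_input_downcast=True)
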