def set_weights(self, wts=None, bs=None, init_method=None, scale_factor=None, seed=None):
    ''' Initializes the weights and biases of the neural network

    Parameters:
    -----------
    param: wts - weights
    type: np.ndarray, optional

    param: bs - biases
    type: np.ndarray, optional

    param: init_method - calls some pre-specified weight initialization routines
    type: string, optional

    param: scale_factor - additional hyperparameter for weight initialization
    type: float, optional

    param: seed - seeds the random number generator
    type: int, optional
    '''
    # with tied weights, the encoding and decoding matrices are simply
    # transposes of one another
    if self.tied_wts:

        if seed is not None:
            np.random.seed(seed=seed)

        # weights and biases
        if wts is None and bs is None:
            wts = [None]
            bs = [None, None]

            if init_method == 'gauss':
                # single encoding matrix of size (visible x hidden); the
                # decoding matrix is its transpose
                wts[0] = scale_factor * \
                    np.random.randn(self.num_nodes[0], self.num_nodes[1])
                bs[0] = np.zeros(self.num_nodes[1])
                bs[1] = np.zeros(self.num_nodes[0])

            if init_method == 'fan-io':
                v = np.sqrt(
                    1. * scale_factor / (self.num_nodes[0] + self.num_nodes[1] + 1))
                wts[0] = scale_factor * v * \
                    np.random.rand(self.num_nodes[0], self.num_nodes[1]) - v
                bs[0] = np.zeros(self.num_nodes[1])
                bs[1] = np.zeros(self.num_nodes[0])
        else:
            assert isinstance(wts, list)
            assert isinstance(bs, list)

        self.wts_ = [
            theano.shared(nu.floatX(wt), borrow=True) for wt in wts]
        self.bs_ = [theano.shared(nu.floatX(b), borrow=True) for b in bs]

    # if encoding and decoding matrices are distinct, just default back to
    # the normal case
    else:
        super(Autoencoder, self).set_weights(
            init_method=init_method, scale_factor=scale_factor, seed=seed)
def compile_multilayer_functions(self, wts=None, bs=None):
    ''' compiles prediction and scoring functions for testing

    Parameters:
    -----------
    param: wts - weights
    type: numpy ndarray matrix, optional

    param: bs - biases
    type: numpy ndarray matrix, optional
    '''
    if wts is None and bs is None:
        wts = self.wts_
        bs = self.bs_
    else:
        wts = [nu.floatX(w) for w in wts]
        bs = [nu.floatX(b) for b in bs]

    X = T.matrix()
    y = T.matrix()

    # output probabilities
    y_prob = self.fprop(X, wts, bs)
    self.predict_prob = theano.function(
        inputs=[X], outputs=[y_prob], mode='FAST_RUN',
        allow_input_downcast=True)

    # predicted labels
    y_pred = T.argmax(y_prob, axis=1)
    self.predict_label = theano.function(
        inputs=[X], outputs=[y_pred], mode='FAST_RUN',
        allow_input_downcast=True)

    # evaluation loss
    eval_loss = self.compute_eval_loss(X, y, wts, bs)
    self.compute_test_loss = theano.function(
        inputs=[X, y], outputs=eval_loss, mode='FAST_RUN',
        allow_input_downcast=True)

    # scoring (classification accuracy)
    self.score = theano.function(
        inputs=[X, y],
        outputs=1.0 - T.mean(T.neq(y_pred, T.argmax(y, axis=1))),
        mode='FAST_RUN', allow_input_downcast=True)
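# Usage sketch (not part of the library): how the compiled prediction and
# scoring functions above might be called on a held-out set. `net`, `X_te`,
# and `y_te` are hypothetical stand-ins for a trained model instance and
# one-hot encoded test data.
import numpy as np

X_te = np.random.rand(64, 784)                   # stand-in test inputs
y_te = np.eye(10)[np.random.randint(0, 10, 64)]  # stand-in one-hot labels

net.compile_multilayer_functions()               # uses net.wts_, net.bs_
probs = net.predict_prob(X_te)[0]                # per-class probabilities
labels = net.predict_label(X_te)[0]              # argmax class indices
print(net.compute_test_loss(X_te, y_te))
print(net.score(X_te, y_te))                     # classification accuracy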
def set_weights(self, wts=None, bs=None, init_method=None, scale_factor=None, seed=None):
    ''' Initializes the weights and biases of the neural network

    Parameters:
    -----------
    param: wts - weights
    type: np.ndarray, optional

    param: bs - biases
    type: np.ndarray, optional

    param: init_method - calls some pre-specified weight initialization routines
    type: string

    param: scale_factor - additional hyperparameter for weight initialization
    type: float, optional

    param: seed - seeds the random number generator
    type: int, optional
    '''
    if seed is not None:
        np.random.seed(seed=seed)
        self.srng.seed(seed)

    if wts is None and bs is None:
        wts = (len(self.num_nodes) - 1) * [None]
        bs = (len(self.num_nodes) - 1) * [None]

        if init_method == 'gauss':
            for i, (n1, n2) in enumerate(
                    zip(self.num_nodes[:-1], self.num_nodes[1:])):
                wts[i] = scale_factor * 1. / \
                    np.sqrt(n2) * np.random.randn(n1, n2)
                bs[i] = np.zeros(n2)

        elif init_method == 'fan-io':
            for i, (n1, n2) in enumerate(
                    zip(self.num_nodes[:-1], self.num_nodes[1:])):
                v = scale_factor * np.sqrt(6. / (n1 + n2 + 1))
                wts[i] = 2.0 * v * np.random.rand(n1, n2) - v
                bs[i] = np.zeros(n2)

        else:
            sys.exit(ne.weight_error())
    else:
        # this scenario occurs most often when doing unsupervised pre-training
        # to initialize the weights
        assert isinstance(wts, list)
        assert isinstance(bs, list)

    self.wts_ = [theano.shared(nu.floatX(wt), borrow=True) for wt in wts]
    self.bs_ = [theano.shared(nu.floatX(b), borrow=True) for b in bs]
def momentum(params, grad_params, learn_rate=1., alpha=0.9):
    ''' standard sgd with 'momentum'

    Parameters:
    ----------
    param: params - model parameters
    type: list of theano shared variables

    param: grad_params - derivative of the loss with respect to the model parameters
    type: list of theano variables

    param: learn_rate - learning rate for sgd with momentum
    type: float

    param: alpha - viscosity term for momentum
    type: float
    '''
    updates = []

    for param, grad_param in zip(params, grad_params):

        # velocity term with viscosity
        velocity = theano.shared(nu.floatX(np.zeros(param.get_value().shape)))
        velocity_ = alpha * velocity - learn_rate * grad_param

        # parameter update
        param_ = param + velocity_

        # collected updates of both the parameter and velocity
        updates.append((velocity, velocity_))
        updates.append((param, param_))

    return updates
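# Minimal sketch of how the update lists returned by these optimizers plug
# into a compiled theano training step; assumes it runs alongside the
# optimizer definitions above. The single-layer softmax model and
# squared-error loss below are placeholders, not part of the library.
import numpy as np
import theano
import theano.tensor as T

X = T.matrix('X')
y = T.matrix('y')
W = theano.shared(np.random.randn(784, 10).astype(theano.config.floatX))
b = theano.shared(np.zeros(10, dtype=theano.config.floatX))

y_hat = T.nnet.softmax(T.dot(X, W) + b)
loss = T.mean(T.sum((y_hat - y)**2, axis=1))  # placeholder loss

params = [W, b]
grads = [T.grad(loss, p) for p in params]

# momentum() returns a list of (shared variable, update expression) pairs,
# which is exactly what theano.function expects for `updates`
train_step = theano.function(inputs=[X, y], outputs=loss,
                             updates=momentum(params, grads,
                                              learn_rate=0.1, alpha=0.9),
                             allow_input_downcast=True)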
def rmsprop(params, grad_params, learn_rate=None, rho=None, eps=1e-6, max_norm=False, c=3):
    ''' Geoff Hinton's "RMSprop" algorithm - RPROP for mini-batches

    Parameters:
    ----------
    param: params - model parameters
    type: list of theano shared variables

    param: grad_params - derivative of the loss with respect to the model parameters
    type: list of theano variables

    param: learn_rate - learning rate for rmsprop
    type: float

    param: rho - decay rate for the moving average of squared gradients
    type: float

    param: eps - fudge factor [need ref here]
    type: float

    param: max_norm - flag to indicate that weights will be L2-norm constrained
    type: boolean

    param: c - L2-norm constraint for max_norm regularization
    type: float
    '''
    updates = []

    for param, grad_param in zip(params, grad_params):

        # accumulated gradient (initial value)
        acc_grad_param = theano.shared(
            nu.floatX(np.zeros(param.get_value().shape)))
        acc_grad_param_ = rho * acc_grad_param + (1 - rho) * grad_param**2

        # parameter update
        param_ = param - learn_rate * grad_param / \
            T.sqrt(acc_grad_param_ + eps)

        # there's probably a better way to check if this is a weight matrix...
        if max_norm and param.get_value().ndim == 2:
            param_ = maxnorm(param_, c)

        # collected updates of both the parameter and the accumulated gradient
        updates.append((acc_grad_param, acc_grad_param_))
        updates.append((param, param_))

    return updates
def adagrad(params, grad_params, learn_rate=1., eps=1e-6, max_norm=False, c=5):
    ''' adaptive gradient method - typically works better than vanilla SGD and
    has some nice theoretical guarantees

    Parameters:
    ----------
    param: params - model parameters
    type: list of theano shared variables

    param: grad_params - derivative of the loss with respect to the model parameters
    type: list of theano variables

    param: learn_rate - 'master' learning rate for the adagrad algorithm
    type: float

    param: eps - fudge factor [need ref here]
    type: float

    param: max_norm - flag to indicate that weights will be L2-norm constrained
    type: boolean

    param: c - L2-norm constraint for max_norm regularization
    type: float

    Returns:
    --------
    updates - list of (shared variable, update expression) pairs for the
              parameters and their accumulated gradients
    '''
    updates = []

    for param, grad_param in zip(params, grad_params):

        # accumulated gradient
        acc_grad_param = theano.shared(
            nu.floatX(np.zeros(param.get_value().shape)))
        acc_grad_param_ = acc_grad_param + grad_param**2

        # parameter update
        param_ = param - learn_rate * grad_param / \
            T.sqrt(acc_grad_param_ + eps)

        # there's probably a better way to check if this is a weight matrix...
        if max_norm and param.get_value().ndim == 2:
            param_ = maxnorm(param_, c)

        # collected updates of both the parameter and the accumulated gradient
        updates.append((acc_grad_param, acc_grad_param_))
        updates.append((param, param_))

    return updates
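# The maxnorm() helper used by rmsprop and adagrad above is defined elsewhere
# in the library; a minimal sketch of the usual L2 max-norm constraint is
# given here for reference. Constraining the columns (incoming weights of
# each hidden unit) is an assumption about the convention used.
import theano.tensor as T

def maxnorm(param, c):
    ''' rescale any column of the weight matrix whose L2 norm exceeds c
    (sketch only - the library's own implementation may differ) '''
    col_norms = T.sqrt(T.sum(T.sqr(param), axis=0, keepdims=True))
    desired = T.clip(col_norms, 0, c)
    return param * (desired / (1e-7 + col_norms))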
def check_gradients(self, X_in, Y_in, wts=None, bs=None):
    ''' this seems like overkill, but I suppose it doesn't hurt to have it in here...'''

    # assume that if it's not provided, they will be shared variables - this is
    # probably dangerous, but this is a debugging tool anyway, so...whatever
    if wts is None and bs is None:
        wts = self.wts_
        bs = self.bs_
    else:
        wts = [theano.shared(nu.floatX(w), borrow=True) for w in wts]
        bs = [theano.shared(nu.floatX(b), borrow=True) for b in bs]

    X = T.matrix()   # inputs
    Y = T.matrix()   # labels
    v = T.vector()   # vector of biases and weights
    i = T.lscalar()  # index

    # 1. compile the numerical gradient function
    def compute_numerical_gradient(v, i, X, Y, eps=1e-4):

        # perturb the input
        v_plus = T.inc_subtensor(v[i], eps)
        v_minus = T.inc_subtensor(v[i], -1.0 * eps)

        # roll it back into the weight matrices and bias vectors
        wts_plus, bs_plus = nu.t_reroll(v_plus, self.num_nodes)
        wts_minus, bs_minus = nu.t_reroll(v_minus, self.num_nodes)

        # compute the loss for both sides, and then compute the numerical
        # gradient
        loss_plus = self.compute_optim_loss(X, Y, wts=wts_plus, bs=bs_plus)
        loss_minus = self.compute_optim_loss(X, Y, wts_minus, bs_minus)

        # ( E(weights[i]+eps) - E(weights[i]-eps) )/(2*eps)
        return 1.0 * (loss_plus - loss_minus) / (2 * eps)

    compute_ngrad = theano.function(
        inputs=[v, i, X, Y], outputs=compute_numerical_gradient(v, i, X, Y))

    # 2. compile backprop (theano's autodiff)
    optim_loss = self.compute_optim_loss(X, Y, wts=wts, bs=bs)

    # all model parameters in a list
    params = [p for param in [wts, bs] for p in param]

    # gradient of each model param w.r.t training loss
    grad_params = [T.grad(optim_loss, param) for param in params]

    # gradient of the full weight vector
    grad_w = nu.t_unroll(grad_params[:len(wts)], grad_params[len(wts):])

    compute_bgrad = theano.function(inputs=[X, Y], outputs=grad_w)

    # compute the mean difference between the numerical and exact gradients
    v0 = nu.unroll([wt.get_value() for wt in wts],
                   [b.get_value() for b in bs])

    # get the indices of the weights/biases we want to check
    idxs = np.random.permutation(self.num_params)[:(self.num_params / 5)]
    ngrad = [None] * len(idxs)

    for j, idx in enumerate(idxs):
        ngrad[j] = compute_ngrad(v0, idx, X_in, Y_in)
    bgrad = compute_bgrad(X_in, Y_in)[idxs]

    cerr = np.mean(np.abs(ngrad - bgrad))
    assert cerr < 1e-10
def fullbatch_optimize(self, X_tr, y_tr, X_val=None, y_val=None, num_epochs=None, **optim_params):
    ''' Full-batch optimization using scipy's L-BFGS-B and CG

    Parameters:
    -----------
    param: X_tr - training data
    type: theano matrix

    param: y_tr - training labels
    type: theano matrix

    param: num_epochs - the number of full runs through the dataset
    type: int

    param: **optim_params
    type: dictionary of optimization parameters
    '''
    X = T.matrix('X')  # input variable
    y = T.matrix('y')  # output variable
    w = T.vector('w')  # weight vector

    # reshape the vector w into weight and bias matrices, and set up the
    # theano graph to compute the loss and gradient
    wts, bs = nu.t_reroll(w, self.num_nodes)

    optim_loss = self.compute_optim_loss(X, y, wts=wts, bs=bs)

    params = [p for param in [wts, bs] for p in param]
    grad_params = [T.grad(optim_loss, param) for param in params]
    grad_w = nu.t_unroll(grad_params[:len(wts)], grad_params[len(wts):])

    compute_loss_grad_from_vector = theano.function(
        inputs=[w, X, y], outputs=[optim_loss, grad_w],
        allow_input_downcast=True)

    compute_loss_from_vector = theano.function(
        inputs=[w, X, y], outputs=[optim_loss],
        allow_input_downcast=True)

    # initialize the weight vector and perform full-batch optimization
    wts0 = [wt.get_value() for wt in self.wts_]
    bs0 = [b.get_value() for b in self.bs_]
    w0 = nu.unroll(wts0, bs0)

    # print 'Checking gradients...'
    # self.check_gradients(X_tr,y_tr,wts0,bs0)

    # print 'Pre-training loss:',compute_loss_from_vector(w0,X_tr,y_tr)

    try:
        optim_method = optim_params.pop('optim_method')
    except KeyError:
        sys.exit(ne.method_err())

    # very annoying.
    if optim_method == 'L-BFGS-B' and theano.config.floatX == 'float32':
        sys.exit('Sorry, L-BFGS-B only works with float64')

    wf = sp.optimize.minimize(compute_loss_grad_from_vector, w0,
                              args=(X_tr, y_tr), method=optim_method,
                              jac=True, options={'maxiter': num_epochs})

    # re-roll back into weights and biases
    wts, bs = nu.reroll(wf.x, self.num_nodes)

    self.wts_ = [theano.shared(nu.floatX(wt)) for wt in wts]
    self.bs_ = [theano.shared(nu.floatX(b)) for b in bs]
def shared_dataset(self, X, y):
    ''' As per the deep learning tutorial, loading the data all at once (if possible)
    into the GPU will significantly speed things up '''

    return theano.shared(nu.floatX(X)), theano.shared(nu.floatX(y))
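# Sketch of the mini-batch pattern that shared_dataset() enables: keep the
# data in GPU-resident shared variables and slice batches with `givens`, as
# in the deep learning tutorial. The tiny linear model, loss, and batch size
# below are placeholders to keep the example self-contained.
import numpy as np
import theano
import theano.tensor as T

X_tr = np.random.rand(1024, 784)
y_tr = np.eye(10)[np.random.randint(0, 10, 1024)]
X_sh = theano.shared(X_tr.astype(theano.config.floatX))
y_sh = theano.shared(y_tr.astype(theano.config.floatX))

X, y = T.matrix('X'), T.matrix('y')
W = theano.shared(np.zeros((784, 10), dtype=theano.config.floatX))
loss = T.mean(T.sum((T.dot(X, W) - y)**2, axis=1))
updates = [(W, W - 0.01 * T.grad(loss, W))]

idx = T.lscalar('idx')  # mini-batch index
batch_size = 128

train_batch = theano.function(
    inputs=[idx], outputs=loss, updates=updates,
    givens={X: X_sh[idx * batch_size:(idx + 1) * batch_size],
            y: y_sh[idx * batch_size:(idx + 1) * batch_size]})

for i in range(X_tr.shape[0] // batch_size):
    train_batch(i)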
def fullbatch_optimize(self, X_tr, y_tr, X_val=None, y_val=None, num_epochs=None, **optim_params):
    ''' Full-batch optimization using scipy's L-BFGS-B and CG; this function is
    duplicated for autoencoders, since the option of tied weights makes the
    unrolling/rerolling a bit different

    Parameters:
    -----------
    param: X_tr - training data
    type: theano matrix

    param: y_tr - training labels
    type: theano matrix

    param: num_epochs - the number of full runs through the dataset
    type: int
    '''
    X = T.matrix('X')  # input variable
    y = T.matrix('y')  # output variable
    w = T.vector('w')  # weight vector

    # reshape w into wts/biases, taking note of whether tied weights are
    # being used or not
    wts, bs = nu.t_reroll_ae(w, self.num_nodes, self.tied_wts)

    # get the loss
    optim_loss = self.compute_optim_loss(X, y, wts=wts, bs=bs)

    # compute grad

    # all model parameters in a list
    params = [p for param in [wts, bs] for p in param]

    # gradient of each model param w.r.t training loss
    grad_params = [T.grad(optim_loss, param) for param in params]

    # gradient of the full weight vector
    grad_w = nu.t_unroll_ae(
        grad_params[:len(wts)], grad_params[len(wts):], self.tied_wts)

    compute_loss_grad_from_vector = theano.function(
        inputs=[w, X, y], outputs=[optim_loss, grad_w],
        allow_input_downcast=True)

    compute_loss_from_vector = theano.function(
        inputs=[w, X, y], outputs=[optim_loss],
        allow_input_downcast=True)

    # initial value for the weight vector
    wts0 = [wt.get_value() for wt in self.wts_]
    bs0 = [b.get_value() for b in self.bs_]
    w0 = nu.unroll_ae(wts0, bs0, self.tied_wts)

    # print 'Checking gradients for fun...'
    # self.check_gradients(X_tr,y_tr,wts0,bs0)

    # print 'Pre-training loss:',compute_loss_from_vector(w0,X_tr,y_tr)

    try:
        optim_method = optim_params.pop('optim_method')
    except KeyError:
        sys.exit(ne.method_err())

    # very annoying.
    if optim_method == 'L-BFGS-B' and theano.config.floatX == 'float32':
        sys.exit('Sorry, L-BFGS-B only works with float64')

    # scipy optimizer
    wf = sp.optimize.minimize(compute_loss_grad_from_vector, w0,
                              args=(X_tr, y_tr), method=optim_method,
                              jac=True, options={'maxiter': num_epochs})

    # print 'Post-training loss',compute_loss_from_vector(wf.x,X_tr,y_tr)

    # re-roll this back into weights and biases
    wts, bs = nu.reroll_ae(wf.x, self.num_nodes, self.tied_wts)

    self.wts_ = [theano.shared(nu.floatX(wt)) for wt in wts]
    self.bs_ = [theano.shared(nu.floatX(b)) for b in bs]
X = T.matrix('X')
y = T.matrix('y')

# define the architecture
d = 784
k = 10
h1 = 625; activ1 = na.reLU
h2 = 625; activ2 = na.reLU
activ3 = na.softmax

# initialize weights and biases
wts = [None, None, None]
bs = [None, None, None]
np.random.seed(1234)
wts[0] = theano.shared(nu.floatX(0.01 * np.random.randn(d, h1)), borrow=True)
wts[1] = theano.shared(nu.floatX(0.01 * np.random.randn(h1, h2)), borrow=True)
wts[2] = theano.shared(nu.floatX(0.01 * np.random.randn(h2, k)), borrow=True)
bs[0] = theano.shared(nu.floatX(np.zeros(h1)), borrow=True)
bs[1] = theano.shared(nu.floatX(np.zeros(h2)), borrow=True)
bs[2] = theano.shared(nu.floatX(np.zeros(k)), borrow=True)

# forward propagation
act1 = activ1(T.dot(X, wts[0]) + bs[0])
compute_act1 = theano.function(inputs=[X], outputs=act1,
                               allow_input_downcast=True, mode='FAST_RUN')

act2 = activ2(T.dot(act1, wts[1]) + bs[1])
compute_act2 = theano.function(inputs=[X], outputs=act2,
                               allow_input_downcast=True, mode='FAST_RUN')

y_pred = activ3(T.dot(act2, wts[2]) + bs[2])
compute_y_pred = theano.function(inputs=[X], outputs=y_pred,
                                 allow_input_downcast=True, mode='FAST_RUN')

X_tr = np.random.randn(128, 784)
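# Continuation of the snippet above (sketch only): exercise the compiled
# forward-propagation functions on the random batch and check output shapes.
print(compute_act1(X_tr).shape)    # expected: (128, 625)
print(compute_act2(X_tr).shape)    # expected: (128, 625)
print(compute_y_pred(X_tr).shape)  # expected: (128, 10)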