def exe(self, mainloop):
    """
    Rescale selected entries of ``mainloop.updates`` to a maximum norm.

    An entry is selected when the string form of its key contains one of
    ``self.keys`` and none of ``self.waivers``.  With ``self.is_vector`` set
    the norm is taken along axis 0, otherwise over the whole tensor.  The
    norm is clipped to ``[0, self.weight_norm]`` and the update is scaled by
    ``clipped_norm / (1e-7 + norm)``.
    """
    for k, _ in mainloop.updates.items():
        name = str(k)
        for key in self.keys:
            if key not in name:
                continue
            if any(waiver in name for waiver in self.waivers):
                continue
            current = mainloop.updates[k]
            if self.is_vector:
                norms = T.sqrt(T.sqr(current).sum(axis=0))
            else:
                norms = T.sqrt(T.sqr(current).sum())
            target_norms = T.clip(norms, 0, self.weight_norm)
            # 1e-7 keeps the ratio finite when the norm is zero.
            mainloop.updates[k] = current * (target_norms / (1e-7 + norms))
def cost(self):
    """
    Build the symbolic training cost for the loss named by ``self.loss``.

    Handled values are ``'ce'``/``'priori'`` (cross-entropy), ``'entropy'``
    and ``'sse'``; any other value trips the assertion at the end.

    :rtype: (theano.Variable | None, dict[theano.Variable,theano.Variable] | None)
    :returns: cost, known_grads. ``known_grads`` is non-None only on the
      sparse-COO cross-entropy path, where it maps ``self.z`` to the gradient
      returned by the native op.
    """
    known_grads = None
    if self.loss == 'ce' or self.loss == 'priori':
        if self.attrs.get("target", "").endswith("[sparse:coo]"):
            # Sparse targets: self.y is expected to be a 3-tuple that is passed
            # element-wise to the native op together with a mask from network.j.
            assert isinstance(self.y, tuple)
            assert len(self.y) == 3
            from NativeOp import crossentropy_softmax_and_gradient_z_sparse
            y_mask = self.network.j[self.attrs.get("target", "").replace("[sparse:coo]", "[sparse:coo:2:0]")]
            ce, grad_z = crossentropy_softmax_and_gradient_z_sparse(
                self.z, self.index, self.y[0], self.y[1], self.y[2], y_mask)
            return self.norm * T.sum(ce), {self.z: grad_z}
        if self.y_data_flat.type == T.ivector().type:
            # Use crossentropy_softmax_1hot to have a more stable and more optimized gradient calculation.
            # Theano fails to use it automatically; I guess our self.i indexing is too confusing.
            #idx = self.index.flatten().dimshuffle(0,'x').repeat(self.y_m.shape[1],axis=1) # faster than line below
            #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m * idx, y_idx=self.y_data_flat * self.index.flatten())
            nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
            #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)
            #nll = -T.log(T.nnet.softmax(self.y_m)[self.i,self.y_data_flat[self.i]])
            #z_c = T.exp(self.z[:,self.y])
            #nll = -T.log(z_c / T.sum(z_c,axis=2,keepdims=True))
            #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)
            #nll = T.set_subtensor(nll[self.j], T.constant(0.0))
        else:
            # Non-ivector targets: dot product of clipped log-probabilities with the targets.
            nll = -T.dot(T.log(T.clip(self.p_y_given_x[self.i], 1.e-38, 1.e20)), self.y_data_flat[self.i].T)
        return self.norm * T.sum(nll), known_grads
    elif self.loss == 'entropy':
        # The trailing TB / TBD / B comments look like shape notes
        # (presumably time, batch, dim) -- TODO confirm.
        h_e = T.exp(self.y_m) #(TB)
        pcx = T.clip((h_e / T.sum(h_e, axis=1, keepdims=True)).reshape((self.index.shape[0],self.index.shape[1],self.attrs['n_out'])), 1.e-6, 1.e6) # TBD
        ee = -T.sum(pcx[self.i] * T.log(pcx[self.i])) # TB
        #nll, pcxs = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y[self.i])
        nll, _ = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat) # TB
        ce = nll.reshape(self.index.shape) * self.index # TB
        y = self.y_data_flat.reshape(self.index.shape) * self.index # TB
        # f is 1 where any masked target along axis 0 is > 0; it selects the
        # cross-entropy term, and (1-f) selects the entropy term.
        f = T.any(T.gt(y,0), axis=0) # B
        return T.sum(f * T.sum(ce, axis=0) + (1-f) * T.sum(ee, axis=0)), known_grads
        #return T.sum(T.switch(T.gt(T.sum(y,axis=0),0), T.sum(ce, axis=0), -T.sum(ee, axis=0))), known_grads
        #return T.switch(T.gt(T.sum(self.y_m[self.i]),0), T.sum(nll), -T.sum(pcx * T.log(pcx))), known_grads
    elif self.loss == 'priori':
        # NOTE(review): unreachable -- 'priori' is already handled by the first
        # branch of this if/elif chain.
        pcx = self.p_y_given_x[self.i, self.y_data_flat[self.i]]
        pcx = T.clip(pcx, 1.e-38, 1.e20)  # For pcx near zero, the gradient will likely explode.
        return -T.sum(T.log(pcx)), known_grads
    elif self.loss == 'sse':
        if self.y_data_flat.dtype.startswith('int'):
            # Integer targets: build a one-hot comparison against arange(n_out)
            # and take the mean squared difference to the predictions.
            y_f = T.cast(T.reshape(self.y_data_flat, (self.y_data_flat.shape[0] * self.y_data_flat.shape[1]), ndim=1), 'int32')
            y_oh = T.eq(T.shape_padleft(T.arange(self.attrs['n_out']), y_f.ndim), T.shape_padright(y_f, 1))
            return T.mean(T.sqr(self.p_y_given_x[self.i] - y_oh[self.i])), known_grads
        else:
            #return T.sum(T.sum(T.sqr(self.y_m - self.y.reshape(self.y_m.shape)), axis=1)[self.i]), known_grads
            return T.sum(T.sqr(self.y_m[self.i] - self.y_data_flat.reshape(self.y_m.shape)[self.i])), known_grads
            #return T.sum(T.sum(T.sqr(self.z - (self.y.reshape((self.index.shape[0], self.index.shape[1], self.attrs['n_out']))[:self.z.shape[0]])), axis=2).flatten()[self.i]), known_grads
            #y_z = T.set_subtensor(T.zeros((self.index.shape[0],self.index.shape[1],self.attrs['n_out']), dtype='float32')[:self.z.shape[0]], self.z).flatten()
            #return T.sum(T.sqr(y_z[self.i] - self.y[self.i])), known_grads
            #return T.sum(T.sqr(self.y_m - self.y[:self.z.shape[0]*self.index.shape[1]]).flatten()[self.i]), known_grads
    else:
        assert False, "unknown loss: %s" % self.loss
def get_constraint_updates(self):
    """
    Build the dictionary of post-step constraint updates.

    Constraints are applied in order, each one acting on the result of any
    earlier constraint on the same parameter:

    1. ``scalar_lambd``: replace ``lambd`` by its mean, broadcast to shape.
    2. ``wv_norm`` in ('unit', 'max_unit'): normalise ``Wv`` along axis 0,
       either to exactly unit norm or to at most unit norm.
    3. ``scalar_norms`` is floored at 1.0.
    4. ``clip_max`` / ``clip_min``: element-wise upper / lower bounds for the
       parameters named by the dictionary keys.

    :returns: OrderedDict mapping parameter variable -> constrained expression
    """
    constraint_updates = OrderedDict()

    if self.flags['scalar_lambd']:
        constraint_updates[self.lambd] = T.mean(self.lambd) * T.ones_like(self.lambd)

    # constrain filters to have unit norm
    if self.flags['wv_norm'] in ('unit', 'max_unit'):
        wv = constraint_updates.get(self.Wv, self.Wv)
        wv_norm = T.sqrt(T.sum(wv**2, axis=0))
        if self.flags['wv_norm'] == 'unit':
            constraint_updates[self.Wv] = wv / wv_norm
        elif self.flags['wv_norm'] == 'max_unit':
            constraint_updates[self.Wv] = wv / wv_norm * T.minimum(wv_norm, 1.0)

    # NOTE(review): applied regardless of the wv_norm flag -- confirm this
    # matches the intended nesting.
    constraint_updates[self.scalar_norms] = T.maximum(1.0, self.scalar_norms)

    ## clip parameters to maximum values (if applicable)
    for (k, v) in self.clip_max.items():
        assert k in [param.name for param in self.params()]
        param = getattr(self, k)
        # The dictionary is keyed by the variable, not by its name: look up
        # with the variable so an earlier constraint is not thrown away.
        current = constraint_updates.get(param, param)
        constraint_updates[param] = T.minimum(current, v)

    ## clip parameters to minimum values (if applicable)
    for (k, v) in self.clip_min.items():
        assert k in [param.name for param in self.params()]
        param = getattr(self, k)
        current = constraint_updates.get(param, param)
        constraint_updates[param] = T.maximum(current, v)

    return constraint_updates
def custom_loss(y_true, y_pred):
    """
    Squared log error per element plus a penalty on the log of the totals.

    Both inputs are clipped from below at 0.001 before taking ``log(. + 1)``.
    The first term is the mean over the last axis of the squared difference
    of the element-wise logs; the second is ``CMC_PENALTY`` times the squared
    difference of the logs of the clipped sums.
    """
    epsilon = 0.001
    pred_floor = T.clip(y_pred, epsilon, np.inf)
    true_floor = T.clip(y_true, epsilon, np.inf)

    elementwise = T.square(T.log(pred_floor + 1.) - T.log(true_floor + 1.))
    totals = T.square(T.log(T.sum(pred_floor) + 1) - T.log(T.sum(true_floor) + 1))

    return T.mean(elementwise, axis=-1) + CMC_PENALTY * totals
def get_constraint_updates(self):
    """
    Build the dictionary of post-step constraint updates.

    Constraints are applied in order, each one acting on the result of any
    earlier constraint on the same parameter:

    1. ``wv_norm``: divide ``Wv`` by ``self.norm_wv`` ('unit'), or scale it so
       its norm is at most one ('max_unit').
    2. ``scalar_lambd``: replace ``lambd`` by its mean, broadcast to shape.
    3. ``sparse_gmask``: multiply ``Wg`` by the transposed sparsity mask.
    4. ``clip_max`` / ``clip_min``: element-wise upper / lower bounds for the
       parameters named by the dictionary keys.

    :returns: OrderedDict mapping parameter variable -> constrained expression
    """
    constraint_updates = OrderedDict()

    if self.flags['wv_norm'] == 'unit':
        constraint_updates[self.Wv] = self.Wv / self.norm_wv
    elif self.flags['wv_norm'] == 'max_unit':
        constraint_updates[self.Wv] = self.Wv / self.norm_wv * T.minimum(self.norm_wv, 1.0)

    if self.flags['scalar_lambd']:
        constraint_updates[self.lambd] = T.mean(self.lambd) * T.ones_like(self.lambd)

    ## Enforce sparsity pattern on g if required ##
    if self.sparse_gmask:
        constraint_updates[self.Wg] = self.Wg * self.sparse_gmask.mask.T

    ## clip parameters to maximum values (if applicable)
    for (k, v) in self.clip_max.items():
        assert k in [param.name for param in self.params()]
        param = getattr(self, k)
        # The dictionary is keyed by the variable, not by its name: look up
        # with the variable so an earlier constraint is not thrown away.
        current = constraint_updates.get(param, param)
        constraint_updates[param] = T.minimum(current, v)

    ## clip parameters to minimum values (if applicable)
    for (k, v) in self.clip_min.items():
        assert k in [param.name for param in self.params()]
        param = getattr(self, k)
        current = constraint_updates.get(param, param)
        constraint_updates[param] = T.maximum(current, v)

    return constraint_updates
def compute_hard_windows(self, image_shape, location, scale):
    """
    Compute integer window corners ``(a, b)`` around each patch location.

    The window is centred on ``location``, spans ``patch_shape / scale``,
    is grown by ``self.kernel.k_sigma_radius(self.cutoff, scale)`` on each
    side and then clipped to the image.  With ``self.batched_window`` the
    bounding box over axis 0 is returned instead of per-sample windows.

    :returns: (a, b) cast to int16 after floor / ceil respectively
    """
    half_extent = 0.5 * (T.cast(self.patch_shape, theano.config.floatX) / scale)
    margin = self.kernel.k_sigma_radius(self.cutoff, scale)

    # top-left (front) and bottom-right (back) corners, grown by the margin
    a = location - half_extent - margin
    b = location + half_extent + margin

    # clip to fit inside the image and keep the window non-empty
    a = T.clip(a, 0, image_shape - 1)
    b = T.clip(b, a + 1, image_shape)

    if self.batched_window:
        # One bounding box for all windows: slices get the same length for
        # every sample so scan can be avoided, at the cost of typically
        # selecting more of the input.
        a = a.min(axis=0, keepdims=True)
        b = b.max(axis=0, keepdims=True)

    # make integer
    return T.cast(T.floor(a), 'int16'), T.cast(T.ceil(b), 'int16')
def gaussian_likelihood_diagonal_variance(t, mu, sig, dim):
    """
    Gaussian likelihood with diagonal covariance, reduced along axis 0.

    Parameters
    ----------
    t : TensorVariable
        Target.
    mu : TensorVariable
        Mean (output of a linear layer).
    sig : TensorVariable
        Diagonal variances (output of a softplus layer); clipped to
        ``[1e-40, 1e40]`` before use.
    dim : unused
        Kept for interface compatibility.

    Returns
    -------
    pdf : TensorVariable
        ``exp(-0.5 * sum((t - mu)**2 / sig)) / prod(sqrt(2 * pi * sig))``
        with sums and products taken over axis 0.
    """
    # First clip sig
    sig_clip = T.clip(sig, 1e-40, 1e40)

    # Diagonal covariance: the normalisation is the product of the
    # per-dimension terms sqrt(2 * pi * sig).
    sig_time_2pi = T.sqrt(sig_clip * 2 * math.pi)
    # NOTE(review): this product can underflow towards 0 for many dimensions;
    # the clip only bounds the result, it does not restore precision.
    normalization_coeff = T.clip(T.prod(sig_time_2pi, axis=0), 1e-40, 1e40)

    # Diagonal covariance: the quadratic form is a term-by-term division.
    # The per-dimension terms are summed *inside* the exponential, so that
    # the result is the product of the per-dimension densities.
    diff = t - mu
    exp_term = T.exp(-0.5 * (diff * diff / sig_clip).sum(axis=0))

    pdf = exp_term / normalization_coeff
    return pdf
def get_constraint_updates(self):
    """
    Build the dictionary of post-step constraint updates.

    In order: optional rescaling of ``Wv`` by ``avg_hact_std`` (flag
    ``unit_std``), upper clipping for parameters named in ``clip_max``,
    lower clipping for parameters named in ``clip_min``, and optional
    replacement of ``lambd`` by its mean (flag ``scalar_lambd``).

    :returns: OrderedDict mapping parameter variable -> constrained expression
    """
    updates = OrderedDict()

    ## unit-variance constraint on hidden-unit activations ##
    if self.flags['unit_std']:
        updates[self.Wv] = self.Wv / self.avg_hact_std

    ## upper bounds (if applicable)
    for name, upper in self.clip_max.iteritems():
        assert name in [param.name for param in self.params()]
        target = getattr(self, name)
        updates[target] = T.clip(target, target, upper)

    ## lower bounds (if applicable); starts from the upper-clipped value
    for name, lower in self.clip_min.iteritems():
        assert name in [param.name for param in self.params()]
        target = getattr(self, name)
        previous = updates.get(target, target)
        updates[target] = T.clip(previous, lower, target)

    ## constrain lambd to be a scalar
    if self.flags['scalar_lambd']:
        lambd = updates.get(self.lambd, self.lambd)
        updates[self.lambd] = T.mean(lambd) * T.ones_like(lambd)

    return updates
def build_and_train_model(self,n_hu,n_hl):
    """
    Build the network, compile an Adadelta training function and train.

    The network comes from ``self.define_layers``; the outputs of its first
    two entries are clipped to ``[1e-7, 1 - 1e-7]`` and passed to
    ``l.define_loss``.  Training runs ``self.end_epoch`` full-batch steps on
    ``self.train_inputmatrix`` / ``self.trainphrase_matrix`` and calls
    ``self.save_best`` after every step.

    :param n_hu: forwarded to ``self.define_layers``
    :param n_hl: forwarded to ``self.define_layers``
    """
    print('Building Model')
    input_phrase = T.imatrix('train_inputmatrix')
    labels = T.imatrix('trainphrase_matrix')
    network = self.define_layers(input_phrase, labels, n_hu, n_hl)

    print("Defining loss")
    # Clip the outputs away from 0 and 1 before they enter the loss.
    prediction = [
        T.clip(lasagne.layers.get_output(network[0]), 1.0e-7, 1.0 - 1.0e-7),
        T.clip(lasagne.layers.get_output(network[1]), 1.0e-7, 1.0 - 1.0e-7),
    ]
    # NOTE(review): `l` is not defined in this function -- presumably a
    # module-level object; confirm it is not meant to be `self`.
    loss = l.define_loss(prediction[0], prediction[1])
    self.model = network

    # define params
    params = lasagne.layers.get_all_params(network)
    updates = lasagne.updates.adadelta(loss, params)

    train_fn = theano.function(
        [input_phrase, labels],
        [loss, prediction[0], prediction[1]],
        updates=updates,
        allow_input_downcast=True,
    )
    print("Model and params defined now training")

    for epoch in range(self.end_epoch):
        # Separate name for the numeric result so the symbolic `loss`
        # expression is not shadowed.
        epoch_loss, predicted, _ = train_fn(self.train_inputmatrix, self.trainphrase_matrix)
        print('Training Loss: ' + str(epoch_loss) + ' Train Epoch ' + str(epoch))
        self.save_best(epoch_loss, predicted, network)
def __init__(self, rng, input, filter_shape, image_shape, W=None, bias=False, padding='valid',activation=T.nnet.relu):
    """
    Convolutional layer whose activated output is clipped to [0.001, 0.999].

    :param rng: random state providing ``uniform``; used only when ``W`` is
        not supplied
    :param input: symbolic input passed to ``K.conv2d``
    :param filter_shape: filter shape; index 1 must equal ``image_shape[1]``
    :param image_shape: input shape forwarded to ``K.conv2d``
    :param W: optional existing weight variable to reuse; a fresh shared
        variable is drawn uniformly from ``[-W_bound, W_bound]`` when None
    :param bias: when true, a zero-initialised bias of length
        ``filter_shape[0]`` is added before the activation
    :param padding: forwarded to ``K.conv2d`` as ``border_mode``
    :param activation: callable applied to the convolution output
    """
    assert image_shape[1] == filter_shape[1]
    self.input = input

    # `is None` rather than `== None`: equality on array-like weights is
    # element-wise and must not be used as a truth value.
    if W is None:
        fan_in = numpy.prod(filter_shape[1:])
        fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]))
        # initialize weights with random weights
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        W = theano.shared(
            numpy.asarray(
                rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
                dtype=theano.config.floatX
            ),
            borrow=True
        )
    self.W = W

    conv_out = K.conv2d(
        x=input,
        kernel=self.W,
        filter_shape=filter_shape,
        image_shape=image_shape,
        border_mode=padding
    )

    if bias:
        b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, borrow=True)
        # dimshuffle makes the bias broadcast over every axis except axis 1.
        pre_activation = conv_out + self.b.dimshuffle('x', 0, 'x', 'x')
        self.params = [self.W, self.b]
    else:
        pre_activation = conv_out
        self.params = [self.W]

    self.output = T.clip(activation(pre_activation), 0.001, 0.999)
def kl_divergence(y_true, y_pred):
    """
    Mean of ``y_true * (log(y_true) - log(y_pred))`` over all elements.

    Both inputs are clipped to ``[epsilon, 1 - epsilon]`` first, where
    ``epsilon`` is read from the enclosing module.
    """
    upper = 1.0 - epsilon
    q = T.clip(y_pred, epsilon, upper)
    p = T.clip(y_true, epsilon, upper)
    log_ratio = T.log(p) - T.log(q)
    return T.mean(p * log_ratio)
def redo_theano(self):
    """
    (Re)create the shared state and compile every Theano function used here.

    Sets ``self.h``, ``self.v``, ``self.new_h`` and the compiled functions
    ``init_h_v``, ``coding_obj_grad``, ``try_step``, ``accept_h``, ``get_h``,
    ``code_learning_obj``, ``code_learning_step`` and ``train_predictor``.
    """
    # --- shared state -----------------------------------------------------
    self.h = shared(N.zeros(self.nhid, dtype=floatX), name="h")
    self.v = shared(N.zeros(self.nvis, dtype=floatX), name="v")

    # --- initialise h from the predictor and store v ------------------------
    visible_in = T.vector()
    assert visible_in.type.dtype == floatX
    self.init_h_v = function(
        [visible_in],
        updates={self.h: self.predict(visible_in), self.v: visible_in})

    # --- coding objective and its gradient w.r.t. h -------------------------
    obj = self.coding_obj(self.v, self.h)
    assert len(obj.type.broadcastable) == 0
    obj_grad = T.grad(obj, self.h)
    assert len(obj_grad.type.broadcastable) == 1
    self.coding_obj_grad = function([], [obj, obj_grad])

    # --- multiplicative trial step on h, kept within [1e-10, 1e4] -----------
    self.new_h = shared(N.zeros(self.nhid, dtype=floatX), name="new_h")
    alpha = T.scalar(name="alpha")
    outside_grad = T.vector(name="outside_grad")
    candidate_h = T.clip(self.h * T.exp(-alpha * outside_grad), 1e-10, 1e4)
    candidate_obj = self.coding_obj(self.v, candidate_h)
    self.try_step = function(
        [alpha, outside_grad],
        updates={self.new_h: candidate_h},
        outputs=candidate_obj)
    self.accept_h = function([], updates={self.h: self.new_h})
    self.get_h = function([], self.h)

    # --- dictionary learning on a batch --------------------------------------
    V = T.matrix(name="V")
    H = T.matrix(name="H")
    batch_obj = self.coding_obj_batch(V, H)
    self.code_learning_obj = function([V, H], batch_obj)
    dict_grad = T.grad(batch_obj, self.W)
    self.code_learning_step = function(
        [V, H, alpha],
        updates={self.W: self.W - alpha * dict_grad})

    # --- predictor training (mean squared error against H) ------------------
    pred_obj = T.mean(T.sqr(self.predict(V) - H))
    predictor_params = [self.pred_W, self.pred_b, self.pred_g]
    pred_grads = T.grad(pred_obj, wrt=predictor_params)
    predictor_updates = dict(
        (param, param - alpha * grad)
        for param, grad in zip(predictor_params, pred_grads))
    # keep pred_g inside [0.5, 1000] after the gradient step
    predictor_updates[self.pred_g] = T.clip(
        predictor_updates[self.pred_g],
        N.cast[floatX](0.5),
        N.cast[floatX](1000.0))
    self.train_predictor = function([V, H, alpha], updates=predictor_updates)
def sigmoid_readout(operators, v_in, h_L, external):
    """Sigmoid readout layer.

    The prediction at step t is scored against the input at step t + 1.
    ``cost`` is the mean of ``crossent`` on the clipped prediction;
    ``monitor`` is the mean of ``crossent`` on the clipped, rounded sample.

    :param operators: list of [weight, bias] with shapes
                      (n_hidden, n_visible) and (n_visible, )
    :param v_in: input sequence the predictions are compared to
    :param h_L: shape (timesteps, n_hidden)
    :param external: unused
    :return: (v_sample, cost, monitor, None); v_sample has shape
             (timesteps, n_visible)
    """
    weight, bias = operators[0], operators[1]
    v_pred = sigmoid(T.dot(h_L, weight) + bias)  # broadcastable bias??

    low, high = 1.0e-7, 1.0 - 1.0e-7
    v_pred_c = T.clip(v_pred, low, high)
    v_in_c = T.clip(v_in, low, high)

    # Sample is just rounded to nearest integer:
    v_sample = T.round(v_pred)
    v_sample_c = T.clip(v_sample, eps, 1.0 - eps)

    next_input = v_in_c[1:]

    # Cost: uses the (clipped) probabilities.  TODO: v_sample_c !!!
    cost = crossent(v_pred_c[:-1], next_input).mean()

    # Monitor: same measure on the (clipped) hard sample.
    monitor = crossent(v_sample_c[:-1], next_input).mean()

    return v_sample, cost, monitor, None
def _modify_updates(self, updates):
    """
    Pin biases to zero in ``updates`` when the matching flag is set.

    ``self.zero_hidbias`` controls ``self.hidbias`` and ``self.zero_visbias``
    controls ``self.visbias``.  ``updates`` is modified in place.
    """
    def pin_to_zero(variable):
        # Clipping to [0, 0] yields zeros with the shape of the update.
        updates[variable] = tensor.clip(updates[variable], 0, 0)

    if self.zero_hidbias:
        pin_to_zero(self.hidbias)
    if self.zero_visbias:
        pin_to_zero(self.visbias)
def sigmoid_readout_old(operators, v_in, h_L, g):
    """Readout layer with nonlinearity ``g`` and binary cross-entropy cost.

    The prediction at step t is scored against the input at step t + 1.
    Both ``cost`` and ``monitor`` are summed binary cross-entropies divided
    by ``v_in.shape[0]``; ``monitor`` uses the rounded sample instead of the
    probabilities.

    :param operators: list of [weight, bias] with shapes
                      (n_hidden, n_visible) and (n_visible, )
    :param v_in: input sequence the predictions are compared to
    :param h_L: shape (timesteps, n_hidden)
    :param g: nonlinearity applied to ``T.dot(h_L, weight) + bias``
    :return: (v_sample, cost, monitor, None); v_sample has shape
             (timesteps, n_visible)
    """
    def shifted_xent(prob_c):
        # Binary cross-entropy of step t's output against step t + 1's input.
        target = v_in_c[1:]
        prob = prob_c[:-1]
        return -T.xlogx.xlogy0(target, prob) - T.xlogx.xlogy0(1 - target, 1 - prob)

    weight, bias = operators[0], operators[1]
    v_pred = g(T.dot(h_L, weight) + bias)  # broadcastable bias??

    low, high = 1.0e-7, 1.0 - 1.0e-7
    v_pred_c = T.clip(v_pred, low, high)
    v_in_c = T.clip(v_in, low, high)

    # Cost:
    cost = shifted_xent(v_pred_c).sum() / v_in.shape[0]

    # Sample is just rounded to nearest integer:
    v_sample = T.round(v_pred)
    v_sample_c = T.clip(v_sample, low, high)

    # Monitor (needs to return something... for now):
    monitor = shifted_xent(v_sample_c).sum() / v_in.shape[0]

    return v_sample, cost, monitor, None
def softmax_readout(operators, v_in, h_L, external):
    """Softmax readout layer.

    The prediction at step t is scored against the input at step t + 1.
    ``cost`` and ``monitor`` are both the mean of ``crossent`` on the clipped
    softmax probabilities.  ``v_sample`` is drawn from the unclipped
    probabilities with ``rng.multinomial``.

    :param operators: list of [weight, bias] with shapes
                      (n_hidden, n_visible) and (n_visible, )
    :param v_in: input sequence the predictions are compared to
    :param h_L: shape (timesteps, n_hidden)
    :param external: unused
    :return: (v_sample, cost, monitor, None); v_sample has shape
             (timesteps, n_visible)
    """
    weight = operators[0]
    bias = operators[1]
    v_pred = softmax(T.dot(h_L, weight) + bias)  # broadcastable bias??
    v_pred_c = T.clip(v_pred, 1.0e-7, 1.0 - 1.0e-7)
    v_in_c = T.clip(v_in, 1.0e-7, 1.0 - 1.0e-7)

    # The sample is a multinomial draw from the softmax output (not argmax).
    v_sample = rng.multinomial(pvals=v_pred, dtype=theano.config.floatX)

    # Cost:
    cost = crossent(v_pred_c[:-1], v_in_c[1:])
    cost = cost.mean()

    # Monitor: deliberately computed on the probabilities (v_pred_c), not on
    # the sample.
    monitor = crossent(v_pred_c[:-1], v_in_c[1:])
    monitor = monitor.mean()

    return v_sample, cost, monitor, None
def lcn_std_diff(x,size=9):
    """
    Local contrast normalisation of a single 48x48 image.

    The local mean and local mean-of-squares are computed by 'full'
    convolution with a normalised ``size`` x ``size`` kernel from
    ``gaussian``, cropped back to 48x48.  The image is centred with the local
    mean and divided by the local standard deviation, which is floored at its
    own mean.  The result is shifted so that its minimum is 0.

    :param x: symbolic image with 48 * 48 elements
    :param size: side length of the smoothing kernel
    :returns: normalised image, shape (1, 1, 48, 48)
    """
    # Function borrowed from bengioe_util
    side = 48
    # Integer division: `half` is used as a slice bound below.
    half = size // 2

    p = x.reshape((1,1,side,side))
    g = gaussian(size,1.591/size)
    g/=g.sum()
    g = numpy.float32(g.reshape((1,1,size,size)))
    kernel = TT.constant(g)

    def local_average(img):
        # 'full' convolution enlarges the map by size - 1; crop the centre.
        full = TT.nnet.conv.conv2d(img, kernel, (1,1,side,side),
                                   (1,1,size,size), 'full')
        full = full.reshape((side+size-1,)*2)
        return full[half:side+half, half:side+half]

    mean = local_average(p)
    meansq = local_average(TT.sqr(p))

    # Guard against small negative variances from rounding.
    var = TT.clip(meansq - TT.sqr(mean), 0, 1e30)
    std = TT.sqrt(var)
    # Floor the deviation at its mean so flat regions are not blown up.
    std = TT.clip(std, TT.mean(std), 1e30)

    out = (p - mean) / std
    return out - out.min()
def init_process(model, gaussian, delta, fn_type):
    """
    Build the network and compile either the training or the evaluation
    function.

    :param model: network selector; only ``'jy'`` is supported
    :param gaussian: forwarded to ``JY_cnn``; also selects which layer is
        reported as the pre-gaussian layer
    :param delta: forwarded to ``JY_cnn``
    :param fn_type: ``'train'`` for the training function (with Nesterov
        momentum updates), ``'val'`` or ``'test'`` for the evaluation function
    :returns: (compiled function, network)
    :raises ValueError: for an unsupported ``model`` or ``fn_type``
    """
    print("Building model and compiling functions...")

    # Fail early and explicitly instead of hitting an undefined local after
    # the (expensive) graph construction.
    if fn_type not in ('train', 'val', 'test'):
        raise ValueError("unknown fn_type: %r" % (fn_type,))

    # Prepare Theano variables for inputs and targets
    import theano.tensor as T
    input_var_list = [T.tensor4('inputs{}'.format(i)) for i in range(scales)]
    target_var = T.imatrix('targets')

    # Create network model
    if model == 'jy':
        print('Building JY CNN...')
        network = JY_cnn(input_var_list, gaussian, delta)
        learning_rate = 0.006
    else:
        raise ValueError("unknown model: %r" % (model,))

    print('defining loss function')
    prediction = lasagne.layers.get_output(network)
    # Keep probabilities away from 0 and 1 before the cross-entropy.
    prediction = T.clip(prediction, 1e-7, 1.0 - 1e-7)
    loss = lasagne.objectives.binary_crossentropy(prediction, target_var)
    loss = loss.mean()

    print('defining update')
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=learning_rate, momentum=0.9)

    print('defining testing method')
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_prediction = T.clip(test_prediction, 1e-7, 1.0 - 1e-7)

    # frame prediction
    layer_list = lasagne.layers.get_all_layers(network)
    gauss_layer = layer_list[-3]
    pre_gauss_layer = layer_list[-4] if gaussian else layer_list[-3]
    gauss_pred = lasagne.layers.get_output(gauss_layer, deterministic=True)
    pre_gauss_pred = lasagne.layers.get_output(pre_gauss_layer, deterministic=True)

    test_loss = lasagne.objectives.binary_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()

    test_pred_result = T.argmax(test_prediction, axis=1)
    target_result = T.argmax(target_var, axis=1)
    test_acc = T.mean(T.eq(test_pred_result, target_result),
                      dtype=theano.config.floatX)

    if fn_type == 'train':
        print('compiling training function')
        func = theano.function(
            input_var_list + [target_var],
            [loss, prediction, gauss_pred, pre_gauss_pred],
            updates=updates)
    else:  # 'val' or 'test', validated above
        print('compiling validation and testing function')
        func = theano.function(
            input_var_list + [target_var],
            [test_loss, test_acc, test_pred_result, test_prediction,
             gauss_pred, pre_gauss_pred])

    return func, network
def rmsprop(self, lr, tparams, grads, inp_list, cost, params):
    """
    Compile the two functions of an RMSProp step.

    ``f_grad_shared`` evaluates ``cost`` and stores the gradients and the
    running average of squared gradients in shared variables.  With
    ``params["grad_clip"] > 0`` the gradients are clipped element-wise before
    squaring; the stored gradients themselves are not clipped.
    ``f_update`` takes the learning rate and applies
    ``-lr * grad / (sqrt(running_avg) + smooth_eps)`` to every parameter.

    :returns: (f_grad_shared, f_update, zipped_grads, running_grads2, updir)
    """
    clip = params["grad_clip"]
    decay_rate = tensor.constant(params["decay_rate"], dtype=theano.config.floatX)
    smooth_eps = tensor.constant(params["smooth_eps"], dtype=theano.config.floatX)

    zipped_grads = [
        theano.shared(np.zeros_like(p.get_value()), name="%s_grad" % k)
        for k, p in tparams.iteritems()
    ]
    running_grads2 = [
        theano.shared(np.zeros_like(p.get_value()), name="%s_rgrad2" % k)
        for k, p in tparams.iteritems()
    ]

    def second_moment(rg2, g):
        # Decayed average of the (optionally clipped) squared gradient,
        # kept non-negative.
        g_used = tensor.clip(g, -clip, clip) if clip > 0.0 else g
        return tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (g_used ** 2), 0.0, np.inf)

    zgup = list(zip(zipped_grads, grads))
    rg2up = [(rg2, second_moment(rg2, g)) for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(
        inp_list, cost, updates=zgup + rg2up, name="rmsprop_f_grad_shared")

    updir = [
        theano.shared(p.get_value() * numpy_floatX(0.0), name="%s_updir" % k)
        for k, p in tparams.iteritems()
    ]
    updir_new = [
        (ud, -lr * zg / (tensor.sqrt(rg2) + smooth_eps))
        for ud, zg, rg2 in zip(updir, zipped_grads, running_grads2)
    ]
    param_up = [(p, p + step) for p, (_, step) in zip(tparams.values(), updir_new)]

    f_update = theano.function(
        [lr], [], updates=updir_new + param_up,
        on_unused_input="ignore", name="rmsprop_f_update")

    return f_grad_shared, f_update, zipped_grads, running_grads2, updir
def train(self, X, evalinter=10):
    '''
    Train this gradient-descent NMF on the given matrix X.

    The cost is half the squared reconstruction error of ``W . H`` plus
    half the squared norms of ``W`` and ``H`` weighted by ``self.Wreg`` and
    ``self.Hreg``.  Each step is a gradient step with rate ``self.lr``
    followed by projection onto the non-negative orthant.
    Calls trainloop().

    :param X: matrix to factorise, passed to ``initvars`` and ``trainloop``
    :param evalinter: forwarded to ``trainloop``
    :returns: (W, H, err) with the final factor values and whatever
        ``trainloop`` returns
    '''
    self.initvars(X)
    # define errors and cost
    tErr = (1./2.) * ((self.X - T.dot(self.W, self.H))**2).sum()
    tReg = (1./2.) * ((self.W**2).sum() * self.Wreg + (self.H**2).sum() * self.Hreg)
    tCost = tErr + tReg
    # get gradients
    gW, gH = T.grad(tCost, [self.W, self.H])
    # define updates and function; clipping at 0 keeps both factors
    # non-negative (np.inf: the np.infty alias no longer exists in NumPy 2)
    updW = (self.W, T.clip(self.W - self.lr * gW, 0, np.inf))
    updH = (self.H, T.clip(self.H - self.lr * gH, 0, np.inf))
    trainf = theano.function(
        inputs=[],
        outputs=[tErr],
        updates=[updW, updH]
    )
    # train loop
    err = self.trainloop(X, trainf=trainf, evalinter=evalinter)
    return self.W.get_value(), self.H.get_value(), err
def get_output_for(self, inputs, **kwargs):
    """
    Combine per-slice area estimates into a per-patient volume estimate.

    ``inputs`` unpacks to (mu_area, sigma_area, is_not_padded, slicedists).
    Neighbouring slices along axis 1 are paired; pairs touching a padded
    slice contribute zero.  The mean volume of a pair is
    ``(m1 + m2 + sqrt(m1 * m2)) * h / 3`` and its sigma is
    ``h * (s1 + s2) / 3``.  Means are summed per patient; sigmas are combined
    as the square root of the sum of squares.

    :returns: tensor with two columns, (mu_volume, sigma_volume)
    """
    mu_area, sigma_area, is_not_padded, slicedists = inputs
    eps = 1e-2

    # Rescale input
    mu_area = mu_area / self.rescale_input
    sigma_area = sigma_area / self.rescale_input

    # A pair is valid only if both of its slices are valid
    pair_valid = (is_not_padded[:, :-1] + is_not_padded[:, 1:]) > 1.5

    # Distance between the two slices of each pair
    h = slicedists[:, :-1]

    # Mean volume of each pair; the product is clipped before the sqrt
    mu_lo, mu_hi = mu_area[:, :-1], mu_area[:, 1:]
    geometric = T.sqrt(T.clip(mu_lo * mu_hi, eps, utils.maxfloat))
    mu_volumes = ((mu_lo + mu_hi + geometric) * h / 3.0) * pair_valid

    # Sigma of each pair
    sig_lo, sig_hi = sigma_area[:, :-1], sigma_area[:, 1:]
    sigma_volumes = (h * (sig_lo + sig_hi) / 3.0) * pair_valid

    # Aggregate per patient
    mu_volume_patient = T.sum(mu_volumes, axis=1)
    sigma_volume_patient = T.sqrt(
        T.clip(T.sum(sigma_volumes**2, axis=1), eps, utils.maxfloat))

    # Concat and return
    columns = [
        mu_volume_patient.dimshuffle(0, 'x'),
        sigma_volume_patient.dimshuffle(0, 'x'),
    ]
    return T.concatenate(columns, axis=1)
def unet_crossentropy_loss_sampled(y_true, y_pred):
    """
    Class-balanced sampled loss over flattened predictions.

    Positions with label 1 and label 0 are shuffled with ``UNET.srng`` and
    truncated to the size of the smaller class; the loss is computed on that
    balanced subset and clipped to ``[epsilon, 1 - epsilon]``.  If the
    result is NaN, the mean clipped prediction on the sampled positives is
    returned instead.

    :param y_true: labels; flattened, compared against 1 and 0
    :param y_pred: predictions; clipped to ``[1e-4, 1 - 1e-4]`` and flattened
    :returns: symbolic scalar loss
    """
    epsilon = 1.0e-4
    y_pred_clipped = T.flatten(T.clip(y_pred, epsilon, 1.0-epsilon))
    y_true = T.flatten(y_true)

    # indices of each class
    classPos = 1
    classNeg = 0
    indPos = T.eq(y_true, classPos).nonzero()[0]
    indNeg = T.eq(y_true, classNeg).nonzero()[0]

    # shuffle
    indPos = indPos[UNET.srng.permutation(n=indPos.shape[0])]
    indNeg = indNeg[UNET.srng.permutation(n=indNeg.shape[0])]

    # take equal number of samples depending on which class has less
    n_samples = T.cast(T.min([indPos.shape[0], indNeg.shape[0]]), dtype='int64')
    indPos = indPos[:n_samples]
    indNeg = indNeg[:n_samples]

    # NOTE(review): the negative term uses log(p), not log(1 - p); the
    # log(1 - p) form was a commented-out alternative -- confirm which one
    # is intended.
    loss_vector = -T.mean(T.log(y_pred_clipped[indPos])) - T.mean(T.log(y_pred_clipped[indNeg]))
    loss_vector = T.clip(loss_vector, epsilon, 1.0-epsilon)
    average_loss = T.mean(loss_vector)

    # The NaN test must be part of the graph: a Python `if` on a symbolic
    # variable is decided once at graph-construction time, not per batch.
    fallback = T.mean(y_pred_clipped[indPos])
    return T.switch(T.isnan(average_loss), fallback, average_loss)
def theano_mu_sigma_erf(mu_erf, sigma_erf, eps=1e-7):
    """
    Evaluate ``(erf((x - mu) / (sigma * sqrt(2))) + 1) / 2`` on x = 0..599.

    :param mu_erf: vector of means; becomes a column and is broadcast
        against the x axis
    :param sigma_erf: scalar or vector of sigmas; for 0-d and 1-d input it is
        clipped to ``[eps, 1]``, any other rank is used as given
    :param eps: lower bound for sigma
    """
    x_axis = theano.shared(np.arange(0, 600, dtype='float32')).dimshuffle('x', 0)

    # dimshuffle pattern that turns sigma into something broadcastable
    # against (batch, 600), by rank
    pattern = {0: ('x', 'x'), 1: (0, 'x')}.get(sigma_erf.ndim)
    if pattern is not None:
        sigma_erf = T.clip(sigma_erf.dimshuffle(*pattern), eps, 1)

    centred = x_axis - mu_erf.dimshuffle(0, 'x')
    x = centred / (sigma_erf * np.sqrt(2).astype('float32'))
    return (T.erf(x) + 1)/2
def objective(y_true, y_pred, P, Q, alpha=0., beta=0.15, dbeta=0., gamma=0.01, gamma1=-1., poos=0.23, eps=1e-6):
    '''Composite loss for in-set / out-of-set classification.

    ``y_true`` is flattened and cast to int64.  Labels in ``1..Q`` are
    "in-set"; label 0 and labels above ``Q`` are "out-of-set or unlabeled"
    and are mapped to class 0 for the cross-entropy.

    Terms (each added only if its weight is non-zero):

    * base: categorical cross-entropy averaged over in-set samples;
    * ``alpha``: cross-entropy of ``y_pred`` with itself, averaged over
      out-of-set samples;
    * ``beta``: cross-entropy between the average out-of-set prediction and
      ``P - dbeta``, ignored when there are no out-of-set samples;
    * ``gamma`` / ``gamma1``: binary score on the class-0 probability; the
      sign of ``gamma1`` selects the normalisation, and ``gamma1 > 0`` adds a
      separate in-set term weighted by ``gamma1``.

    :param poos: weight applied to the out-of-set mask in the binary term
    :param eps: smoothing added to predictions and to the sample counts
    :returns: symbolic scalar cost
    '''
    beta = np.float32(beta)
    dbeta = np.float32(dbeta)
    gamma = np.float32(gamma)
    poos = np.float32(poos)
    eps = np.float32(eps)
    one = np.float32(1)

    # scale preds so that the class probas of each sample sum to 1
    y_pred = y_pred + eps
    y_pred = y_pred / y_pred.sum(axis=-1, keepdims=True)

    y_true = T.cast(y_true.flatten(), 'int64')
    y1 = T.and_(T.gt(y_true, 0), T.le(y_true, Q))  # in-set
    y0 = T.or_(T.eq(y_true, 0), T.gt(y_true, Q))  # out-of-set or unlabeled
    y0sum = y0.sum() + eps  # number of oos
    y1sum = y1.sum() + eps  # number of in-set

    # we want to reduce cross entropy of labeled data
    # convert all oos/unlabeled to label=0
    cost0 = T.nnet.categorical_crossentropy(y_pred, T.switch(y_true <= Q, y_true, 0))
    cost0 = T.dot(y1, cost0) / y1sum  # average cost per labeled example

    if alpha:
        cost1 = T.nnet.categorical_crossentropy(y_pred, y_pred)
        cost1 = T.dot(y0, cost1) / y0sum  # average cost per oos example
        cost0 += alpha*cost1

    # we want to increase the average entropy in each batch
    # average over batch
    if beta:
        y_pred_avg0 = T.dot(y0, y_pred) / y0sum
        y_pred_avg0 = T.clip(y_pred_avg0, eps, one - eps)
        y_pred_avg0 /= y_pred_avg0.sum(axis=-1, keepdims=True)
        cost2 = T.nnet.categorical_crossentropy(y_pred_avg0.reshape((1,-1)), P-dbeta)[0]
        cost2 = T.switch(y0sum > 0.5, cost2, 0.)  # ignore cost2 if no samples
        cost0 += beta*cost2

    # binary classifier score
    if gamma:
        y_pred0 = T.clip(y_pred[:,0], eps, one - eps)
        if gamma1 < 0.:
            cost3 = - T.dot(poos*y0,T.log(y_pred0)) - T.dot(one-poos*y0.T,T.log(one-y_pred0))
            cost3 /= y_pred.shape[0]
            cost0 += gamma*cost3
        elif gamma1 > 0.:
            cost3 = - T.dot(poos*y0,T.log(y_pred0)) - T.dot((one-poos)*y0,T.log(one-y_pred0))
            cost3 /= y0sum
            cost31 = - T.dot(y1,T.log(one-y_pred0))
            # The in-set term is averaged over the in-set count; cost3 has
            # already been normalised above and must not be divided again.
            cost31 /= y1sum
            cost0 += gamma*cost3 + gamma1*cost31
        else:  # gamma1 == 0.
            cost3 = - T.dot(poos*y0,T.log(y_pred0)) - T.dot((one-poos)*y0, T.log(one-y_pred0))
            cost3 /= y0sum
            cost0 += gamma*cost3
    return cost0
def build_objective2(model, deterministic=False, epsilon=1e-12):
    """
    Mean binary cross-entropy between "nodule" probability and the targets.

    The nodule probability is one minus column 0 of the output of
    ``model.l_out``, clipped to ``[epsilon, 1 - epsilon]``.  Targets are the
    flattened output of ``model.l_target`` clipped to ``[0, 1]``.

    :param deterministic: forwarded to ``get_output`` for ``model.l_out``
    :param epsilon: clipping margin for the probability
    """
    network_out = nn.layers.get_output(model.l_out, deterministic=deterministic)
    labels = T.clip(T.flatten(nn.layers.get_output(model.l_target)), 0, 1)

    # column 0 holds p(no nodule); its complement is what gets scored
    nodule_prob = np.float32(1.) - network_out[:, 0]
    nodule_prob = T.clip(nodule_prob, epsilon, 1. - epsilon)

    return T.mean(T.nnet.binary_crossentropy(nodule_prob, labels))
def kl_divergence(target, prediction, eps=1e-6):
    '''Kullback-Leibler divergence, summed over axis 0 (dims kept).

    Both inputs are reshaped to their ``(shape[1], shape[2])`` and clipped
    to ``[eps, 1 - eps]`` before ``target * log(target / prediction)`` is
    summed.
    '''
    def as_clipped_matrix(tensor3):
        matrix = T.reshape(tensor3, (tensor3.shape[1], tensor3.shape[2]))
        return T.clip(matrix, eps, 1 - eps)

    q = as_clipped_matrix(prediction)
    p = as_clipped_matrix(target)
    return T.sum(p * T.log(p / q), axis=0, keepdims=True)
def _interpolate(im, x, y, out_height, out_width):
    """
    Bilinearly sample ``im`` at the normalised coordinates ``(x, y)``.

    :param im: input unpacked as [num_batch, height, width, channels]
    :param x: flat horizontal coordinates, scaled here from [-1, 1] to
        [0, width]
    :param y: flat vertical coordinates, scaled here from [-1, 1] to
        [0, height]
    :param out_height: output rows per sample (sizes the batch offsets)
    :param out_width: output columns per sample (sizes the batch offsets)
    :returns: weighted sum of the four neighbouring pixels, one row per
        coordinate
    """
    num_batch, height, width, channels = im.shape
    # *_f are floats
    height_f = T.cast(height, theano.config.floatX)
    width_f = T.cast(width, theano.config.floatX)

    # scale indices from [-1, 1] to [0, width/height].
    x = (x + 1) / 2 * width_f
    y = (y + 1) / 2 * height_f

    # Clip the neighbour coordinates so they are not out of bounds.
    max_x = width_f - 1
    max_y = height_f - 1
    col_lo = T.clip(x, 0, max_x)
    col_hi = T.clip(x + 1, 0, max_x)
    row_lo = T.clip(y, 0, max_y)
    row_hi = T.clip(y + 1, 0, max_y)

    # floatX copies for the interpolation weights ...
    x0_f, x1_f = T.floor(col_lo), T.floor(col_hi)
    y0_f, y1_f = T.floor(row_lo), T.floor(row_hi)
    # ... and int64 copies for indexing.
    x0, x1 = T.cast(col_lo, 'int64'), T.cast(col_hi, 'int64')
    y0, y1 = T.cast(row_lo, 'int64'), T.cast(row_hi, 'int64')

    # The lookup happens in the flattened input [num_batch*height*width,
    # channels], so every index is offset by its sample's start.
    sample_start = T.repeat(
        T.arange(num_batch, dtype='int64') * (width*height),
        out_height*out_width)
    row0 = sample_start + y0 * width
    row1 = sample_start + y1 * width

    # use indices to lookup pixels for all samples
    im_flat = im.reshape((-1, channels))
    Ia = im_flat[row0 + x0]
    Ib = im_flat[row1 + x0]
    Ic = im_flat[row0 + x1]
    Id = im_flat[row1 + x1]

    # calculate interpolated values
    right, left = x1_f - x, x - x0_f
    below, above = y1_f - y, y - y0_f
    wa = (right * below).dimshuffle(0, 'x')
    wb = (right * above).dimshuffle(0, 'x')
    wc = (left * below).dimshuffle(0, 'x')
    wd = (left * above).dimshuffle(0, 'x')
    return T.sum([wa*Ia, wb*Ib, wc*Ic, wd*Id], axis=0)
def _interpolate(im, x, y, out_height, out_width):
    """
    Bilinearly sample ``im`` at the normalised coordinates ``(x, y)``.

    Coordinates are clipped to [-1, 1] and mapped to [0, width - 1] and
    [0, height - 1].  The upper neighbour index is capped at the last
    row / column for indexing only; the weights use the uncapped value.

    :param im: input unpacked as [num_batch, height, width, channels]
    :param x: flat horizontal coordinates
    :param y: flat vertical coordinates
    :param out_height: output rows per sample (sizes the batch offsets)
    :param out_width: output columns per sample (sizes the batch offsets)
    :returns: weighted sum of the four neighbouring pixels, one row per
        coordinate; asserted to be of dtype floatX
    """
    num_batch, height, width, channels = im.shape
    # *_f are floats
    height_f = T.cast(height, theano.config.floatX)
    width_f = T.cast(width, theano.config.floatX)

    # clip to [-1, 1], then scale to [0, width/height - 1]
    x = (T.clip(x, -1, 1) + 1) / 2 * (width_f - 1)
    y = (T.clip(y, -1, 1) + 1) / 2 * (height_f - 1)

    # 2x2 neighbourhood around each coordinate: floatX for interpolation,
    # int64 for indexing.  Only the indexing copies are kept inside the image.
    x0_f, y0_f = T.floor(x), T.floor(y)
    x1_f, y1_f = x0_f + 1, y0_f + 1
    x0 = T.cast(x0_f, 'int64')
    y0 = T.cast(y0_f, 'int64')
    x1 = T.cast(T.minimum(x1_f, width_f - 1), 'int64')
    y1 = T.cast(T.minimum(y1_f, height_f - 1), 'int64')

    # The lookup happens in the flattened input [num_batch*height*width,
    # channels], so every index is offset by its sample's start.
    sample_start = T.repeat(
        T.arange(num_batch, dtype='int64') * (width*height),
        out_height*out_width)
    row0 = sample_start + y0 * width
    row1 = sample_start + y1 * width

    # use indices to lookup pixels for all samples
    im_flat = im.reshape((-1, channels))
    Ia = im_flat[row0 + x0]
    Ib = im_flat[row1 + x0]
    Ic = im_flat[row0 + x1]
    Id = im_flat[row1 + x1]

    # calculate interpolated values
    right, left = x1_f - x, x - x0_f
    below, above = y1_f - y, y - y0_f
    wa = (right * below).dimshuffle(0, 'x')
    wb = (right * above).dimshuffle(0, 'x')
    wc = (left * below).dimshuffle(0, 'x')
    wd = (left * above).dimshuffle(0, 'x')
    output = T.sum([wa*Ia, wb*Ib, wc*Ic, wd*Id], axis=0)

    assert str(output.dtype) == theano.config.floatX, str(output.dtype)
    return output
def _interpolate(im, x, y, out_height, out_width):
    """
    Bilinearly sample ``im`` at the normalised coordinates ``(x, y)``.

    Coordinates are scaled from [-1, 1] to [0, width] / [0, height]; the
    integer neighbour indices are clipped into the image and the weights
    are computed from the clipped indices, in float32.

    :param im: input unpacked as [num_batch, height, width, channels]
    :param x: flat horizontal coordinates
    :param y: flat vertical coordinates
    :param out_height: output rows per sample (sizes the batch offsets)
    :param out_width: output columns per sample (sizes the batch offsets)
    :returns: weighted sum of the four neighbouring pixels, one row per
        coordinate
    """
    num_batch, height, width, channels = im.shape
    # *_f are floats
    height_f = T.cast(height, 'float32')
    width_f = T.cast(width, 'float32')
    zero = T.zeros([], dtype='int64')
    max_y = im.shape[1] - 1
    max_x = im.shape[2] - 1

    # scale indices from [-1, 1] to [0, width/height].
    x = (x + 1.0)*(width_f) / 2.0
    y = (y + 1.0)*(height_f) / 2.0

    # integer neighbours, clipped so they are not out of bounds
    x_floor = T.cast(T.floor(x), 'int64')
    y_floor = T.cast(T.floor(y), 'int64')
    x0 = T.clip(x_floor, zero, max_x)
    x1 = T.clip(x_floor + 1, zero, max_x)
    y0 = T.clip(y_floor, zero, max_y)
    y1 = T.clip(y_floor + 1, zero, max_y)

    # The lookup happens in the flattened input [num_batch*height*width,
    # channels], so every index is offset by its sample's start.
    sample_start = _repeat(
        T.arange(num_batch, dtype='int32') * (width*height),
        out_height*out_width)
    row0 = sample_start + y0 * width
    row1 = sample_start + y1 * width

    # use indices to lookup pixels for all samples
    im_flat = im.reshape((-1, channels))
    Ia = im_flat[row0 + x0]
    Ib = im_flat[row1 + x0]
    Ic = im_flat[row0 + x1]
    Id = im_flat[row1 + x1]

    # calculate interpolated values
    x0_f, x1_f = T.cast(x0, 'float32'), T.cast(x1, 'float32')
    y0_f, y1_f = T.cast(y0, 'float32'), T.cast(y1, 'float32')
    right, left = x1_f - x, x - x0_f
    below, above = y1_f - y, y - y0_f
    wa = (right * below).dimshuffle(0, 'x')
    wb = (right * above).dimshuffle(0, 'x')
    wc = (left * below).dimshuffle(0, 'x')
    wd = (left * above).dimshuffle(0, 'x')
    return T.sum([wa*Ia, wb*Ib, wc*Ic, wd*Id], axis=0)
def _step_state(x_h_, v_h_, angle_, speed_, t_h_, turn_vec_h, x_t_, v_t_,
                t_t_, turn_vec_t, ctrl, exist, time_step):
    """Advance the simulation state by one step.

    The ``*_h*`` arguments describe the controlled ("h") entity and the
    ``*_t*`` arguments the other ("t") entities.  ``ctrl[0]`` is a steering
    increment and ``ctrl[1]`` an acceleration.  ``self`` is taken from the
    enclosing scope.

    :returns: ``(x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t)``
    """
    # Exact dynamics from the external `step` routine.
    # FIX: the eighth argument used to be `turn_vec_h` a second time, which
    # left the `turn_vec_t` parameter unused; the "t" turn vector is passed in
    # the "t" slot now.
    # NOTE(review): `step` is not visible from here -- confirm its signature.
    a_t_e, v_t_e, x_t_e, t_t, t_h = step(x_h_, v_h_, t_h_, turn_vec_h,
                                         x_t_, v_t_, t_t_, turn_vec_t,
                                         exist, time_step)
    t_h = common.disconnected_grad(t_h)
    t_t = common.disconnected_grad(t_t)

    # approximated dynamic of the un-observed parts in the state:
    # zero acceleration, constant velocity
    a_t_a = tt.zeros(shape=(3, 2), dtype=np.float32)
    v_t_a = v_t_
    x_t_a = x_t_ + self.dt * v_t_a

    # difference between the exact and the approximated predictions
    n_v_t = v_t_e - v_t_a
    n_a_t = a_t_e - a_t_a
    n_x_t = x_t_e - x_t_a

    # disconnect the gradient of the noise signals, so gradients only flow
    # through the approximated dynamics
    n_v_t = common.disconnected_grad(n_v_t)
    n_a_t = common.disconnected_grad(n_a_t)
    n_x_t = common.disconnected_grad(n_x_t)

    # add the noise to the approximation (values equal the exact dynamics)
    a_t = a_t_a + n_a_t
    v_t = v_t_a + n_v_t
    x_t = x_t_a + n_x_t

    # update the observed part of the state
    delta_steer = ctrl[0]
    accel = ctrl[1]

    # steering change is limited to +/- pi/4 per step
    delta_steer = tt.clip(delta_steer, -np.pi/4, np.pi/4)
    angle = angle_ + delta_steer

    speed = speed_ + accel * self.dt
    speed = tt.clip(speed, 0, self.v_max)

    v_h_x = speed * tt.sin(angle)
    v_h_y = speed * tt.cos(angle)
    v_h = tt.stack([v_h_x, v_h_y])

    # keep the position inside [-bw, bw]
    x_h = x_h_ + self.dt * v_h
    x_h = tt.clip(x_h, -self.bw, self.bw)

    return x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t
def clipped_gradients(gradients, gradient_clipping):
    """Return a list with every gradient clipped element-wise.

    Each entry of ``gradients`` is limited to
    ``[-gradient_clipping, gradient_clipping]`` with ``T.clip``.
    """
    limited = []
    for grad in gradients:
        limited.append(T.clip(grad, -gradient_clipping, gradient_clipping))
    return limited
def clip(x, min_value, max_value):
    """Clip ``x`` element-wise to ``[min_value, max_value]``.

    If ``max_value`` is smaller than ``min_value`` the upper bound is raised
    to ``min_value``.
    """
    upper = min_value if max_value < min_value else max_value
    return T.clip(x, min_value, upper)
def binary_crossentropy(output, target, from_logits=False):
    """Binary cross-entropy between ``output`` and ``target``.

    When ``from_logits`` is true, ``output`` is first passed through a
    sigmoid.  The probabilities are then clipped to
    ``[_EPSILON, 1 - _EPSILON]`` to avoid numerical instability.
    """
    probabilities = T.nnet.sigmoid(output) if from_logits else output
    probabilities = T.clip(probabilities, _EPSILON, 1.0 - _EPSILON)
    return T.nnet.binary_crossentropy(probabilities, target)
def __init__(self,
             extractor,
             dataset,
             train_batch_size=16,
             extractor_learning_rate=1e-5,
             ranker_learning_rate=1e-4,
             weight_decay=1e-5,
             optimizer=lasagne.updates.rmsprop,
             ranker_nonlinearity=lasagne.nonlinearities.linear,
             debug=False,
             do_log=True):
    """Store the configuration and build the symbolic ranking model.

    :param extractor: feature extractor; must provide ``augmentation``,
        ``set_input_var`` and ``get_output_layer``.
    :param dataset: dataset object; only ``get_name()`` is used here.
    :param train_batch_size: batch size handed to the extractor.
    :param extractor_learning_rate: stored as a float32 shared variable.
    :param ranker_learning_rate: stored as a float32 shared variable.
    :param weight_decay: multiplier of the L2 penalty in the loss.
    :param optimizer: update rule; stored, and its ``__name__`` goes into
        the run name.
    :param ranker_nonlinearity: stored, and its ``__name__`` goes into the
        run name.
    :param debug: stored on the instance.
    :param do_log: whether to create a pastalog ``Log``; overridden to
        ``False`` when the module-level ``force_not_log`` is set.
    """
    self.train_batch_size = train_batch_size
    self.extractor = extractor
    self.dataset = dataset
    self.weight_decay = weight_decay
    self.optimizer = optimizer
    self.ranker_nonlinearity = ranker_nonlinearity
    self.extractor_learning_rate = extractor_learning_rate
    self.ranker_learning_rate = ranker_learning_rate
    self.debug = debug
    self.do_log = do_log
    # `force_not_log` comes from the enclosing module and disables logging
    # regardless of the `do_log` argument.
    if force_not_log:
        self.do_log = False
        logger.warning('Not logging because pastalog is not installed.')
    extractor_name = self.extractor.__class__.__name__
    if extractor.augmentation:
        extractor_name = "%s-aug" % extractor_name
    # Run name encoding every hyper-parameter; also used as the log name.
    self.NAME = "e:%s-d:%s-bs:%d-elr:%f-rlr:%f-opt:%s-rnl:%s-wd:%f-rs:%s" % (
        extractor_name, self.dataset.get_name(), self.train_batch_size,
        extractor_learning_rate, ranker_learning_rate,
        self.optimizer.__name__, self.ranker_nonlinearity.__name__,
        self.weight_decay, str(settings.RANDOM_SEED))
    if self.do_log:
        self.pastalog = Log('http://localhost:8100/', self.NAME)
    # TODO: check if converting these to shared variable actually improves
    # performance.
    self.input_var = T.ftensor4('inputs')
    self.target_var = T.fvector('targets')
    self.extractor.set_input_var(self.input_var,
                                 batch_size=train_batch_size)
    self.extractor_layer = self.extractor.get_output_layer()
    self.extractor_learning_rate_shared_var = theano.shared(
        np.cast['float32'](extractor_learning_rate),
        name='extractor_learning_rate')
    self.ranker_learning_rate_shared_var = theano.shared(
        np.cast['float32'](ranker_learning_rate),
        name='ranker_learning_rate')
    self.extractor_params = lasagne.layers.get_all_params(
        self.extractor_layer, trainable=True)
    self.absolute_rank_estimate, self.ranker_params = self._create_absolute_rank_estimate(
        self.extractor_layer)
    # Regroup the rank estimates two at a time, one pair per row.
    self.reshaped_input = lasagne.layers.ReshapeLayer(
        self.absolute_rank_estimate, (-1, 2))
    # the posterior estimate layer is not trainable: with W = [[1], [-1]]
    # and b = 0 it computes sigmoid(first - second) for every pair
    self.posterior_estimate = lasagne.layers.DenseLayer(
        self.reshaped_input,
        num_units=1,
        W=lasagne.init.np.array([[1], [-1]]),
        b=lasagne.init.Constant(val=0),
        nonlinearity=lasagne.nonlinearities.sigmoid)
    self.posterior_estimate.params[self.posterior_estimate.W].remove(
        'trainable')
    self.posterior_estimate.params[self.posterior_estimate.b].remove(
        'trainable')
    # the clipping is done to prevent the model from diverging as caused by
    # binary XEnt
    # NOTE(review): `self._epsilon` is not assigned in this method --
    # presumably a class attribute; confirm.
    self.predictions = T.clip(
        lasagne.layers.get_output(self.posterior_estimate).ravel(),
        self._epsilon, 1.0 - self._epsilon)
    self.xent_loss = lasagne.objectives.binary_crossentropy(
        self.predictions, self.target_var).mean()
    self.l2_penalty = lasagne.regularization.regularize_network_params(
        self.absolute_rank_estimate, lasagne.regularization.l2)
    # Total loss: mean cross-entropy plus weighted L2 penalty.
    self.loss = self.xent_loss + self.l2_penalty * self.weight_decay
    # Deterministic output used at test time.
    self.test_absolute_rank_estimate = lasagne.layers.get_output(
        self.absolute_rank_estimate, deterministic=True)
    self._create_theano_functions()
def mean_absolute_percentage_error_loss(y_pred, y_actual, **kwargs):
    """Mean absolute percentage error over the last axis, in percent.

    The denominator ``|y_actual|`` is bounded below by the module-level
    ``epsilon``.  A notice is emitted through ``eprint`` on every call.
    ``kwargs`` is accepted and ignored.
    """
    eprint("Use mean absolute percentage error. Ensure no outputs are exactly 0.")
    denominator = T.clip(T.abs_(y_actual), epsilon, np.inf)
    relative_error = T.abs_((y_actual - y_pred) / denominator)
    return 100. * T.mean(relative_error, axis=-1)
def build_objective2(model, deterministic=False, epsilon=1.e-7):
    """Mean binary cross-entropy between ``model.l_out`` and ``model.l_target``.

    Both layer outputs are flattened; predictions are clipped to
    ``[epsilon, 1 - epsilon]`` before the loss is taken.
    """
    raw_output = nn.layers.get_output(model.l_out, deterministic=deterministic)
    predictions = T.flatten(raw_output)
    targets = T.flatten(nn.layers.get_output(model.l_target))
    clipped = T.clip(predictions, epsilon, 1. - epsilon)
    losses = nn.objectives.binary_crossentropy(clipped, targets)
    return T.mean(losses)
def rho(x):
    """Clip ``x`` element-wise to the unit interval ``[0, 1]``."""
    lower, upper = 0, 1
    return tt.clip(x, lower, upper)
def categorical_crossentropy(expected, predicted):
    """
    Mean categorical cross-entropy error.

    ``predicted`` is clipped to ``[eps, 1 - eps]`` first, where ``eps`` is the
    value returned by ``smallest_positive_number()``.
    """
    eps = smallest_positive_number()
    safe_predicted = T.clip(predicted, eps, 1.0 - eps)
    per_sample = T.nnet.categorical_crossentropy(safe_predicted, expected)
    return per_sample.mean()
def test_mlp(initial_learning_rate, learning_rate_decay,
             squared_filter_length_limit, n_epochs, batch_size, dropout,
             results_file_name, layer_sizes, dataset, use_bias):
    """
    Train an MLP on MNIST with momentum SGD and a max-norm weight constraint.

    The dataset is the one from the mlp demo on deeplearning.net.  This
    training function is lifted from there almost exactly.

    :param initial_learning_rate: starting value of the shared learning rate
    :param learning_rate_decay: factor the learning rate is multiplied by
        once per epoch
    :param squared_filter_length_limit: upper bound on the squared norm of
        every row of a 2-D parameter
    :param n_epochs: number of epochs to train for
    :param batch_size: minibatch size
    :param dropout: if true, train on the dropout cost instead of the plain one
    :param results_file_name: path of the file that receives one validation
        error per line
    :param layer_sizes: forwarded to ``MLP``
    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
        http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    :param use_bias: forwarded to ``MLP``
    """
    # Local import keeps this block self-contained.
    from collections import OrderedDict

    datasets = load_mnist(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing.
    # Floor division is explicit because the counts feed xrange().
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    epoch = T.scalar()
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels
    learning_rate = theano.shared(
        np.asarray(initial_learning_rate, dtype=theano.config.floatX))

    rng = np.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x, layer_sizes=layer_sizes,
                     use_bias=use_bias)

    # Build the expression for the cost function.
    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y)

    # Compile theano function for testing.
    # NOTE(review): test_model and n_test_batches are built but never used
    # below; kept so the compiled graph set is unchanged.
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })
    #theano.printing.pydotprint(test_model, outfile="test_file.png",
    #        var_with_name_simple=True)

    # Compile theano function for validation.
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })
    #theano.printing.pydotprint(validate_model, outfile="validate_file.png",
    #        var_with_name_simple=True)

    # Compute gradients of the model wrt parameters.
    # Use the right cost function here to train with or without dropout.
    training_cost = dropout_cost if dropout else cost
    gparams = [T.grad(training_cost, param) for param in classifier.params]

    # ... and allocate memory for momentum'd versions of the gradient
    gparams_mom = [
        theano.shared(
            np.zeros(param.get_value(borrow=True).shape,
                     dtype=theano.config.floatX))
        for param in classifier.params
    ]

    # Compute momentum for the current epoch: linear ramp from 0.5 to 0.99
    # over the first 500 epochs, constant afterwards.
    mom = ifelse(epoch < 500,
                 0.5 * (1. - epoch / 500.) + 0.99 * (epoch / 500.),
                 0.99)

    # An OrderedDict keeps the update order deterministic; Theano warns when
    # it is handed a plain dict with more than one entry.
    updates = OrderedDict()

    # Update the step direction using momentum
    for gparam_mom, gparam in zip(gparams_mom, gparams):
        updates[gparam_mom] = mom * gparam_mom + (1. - mom) * gparam

    # ... and take a step along that direction
    for param, gparam_mom in zip(classifier.params, gparams_mom):
        stepped_param = param - (1. - mom) * learning_rate * gparam_mom

        # This is a silly hack to constrain the norms of the rows of the weight
        # matrices.  This just checks if there are two dimensions to the
        # parameter and constrains it if so... maybe this is a bit silly but it
        # should work for now.
        if param.get_value(borrow=True).ndim == 2:
            squared_norms = T.sum(stepped_param**2, axis=1).reshape(
                (stepped_param.shape[0], 1))
            scale = T.clip(
                T.sqrt(squared_filter_length_limit / squared_norms), 0., 1.)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param

    # Compile theano function for training.  This returns the training cost and
    # updates the model parameters.
    train_model = theano.function(
        inputs=[epoch, index],
        outputs=training_cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    #theano.printing.pydotprint(train_model, outfile="train_file.png",
    #        var_with_name_simple=True)

    # Theano function to decay the learning rate, this is separate from the
    # training function because we only want to do this once each epoch instead
    # of after each minibatch.
    decay_learning_rate = theano.function(
        inputs=[],
        outputs=learning_rate,
        updates={learning_rate: learning_rate * learning_rate_decay})

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    best_validation_errors = np.inf
    # NOTE(review): best_iter and test_score are never updated, so the final
    # report always shows 0 for both.
    best_iter = 0
    test_score = 0.
    epoch_counter = 0
    start_time = time.clock()

    # The context manager closes the results file even if training raises;
    # the handle used to be left open.
    with open(results_file_name, 'wb') as results_file:
        while epoch_counter < n_epochs:
            # Train this epoch
            epoch_counter = epoch_counter + 1
            for minibatch_index in xrange(n_train_batches):
                train_model(epoch_counter, minibatch_index)

            # Compute loss on validation set
            validation_losses = [
                validate_model(i) for i in xrange(n_valid_batches)
            ]
            this_validation_errors = np.sum(validation_losses)

            # Report and save progress.
            print("epoch {}, test error {}, learning_rate={}{}".format(
                epoch_counter, this_validation_errors,
                learning_rate.get_value(borrow=True),
                " **" if this_validation_errors < best_validation_errors
                else ""))

            best_validation_errors = min(best_validation_errors,
                                         this_validation_errors)
            results_file.write("{0}\n".format(this_validation_errors))
            results_file.flush()

            decay_learning_rate()

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_errors * 100., best_iter, test_score * 100.))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     (' ran for %.2fm' % ((end_time - start_time) / 60.)) +
                     '\n')
def binary_crossentropy(expected, predicted):
    """
    Mean binary cross-entropy error.

    ``predicted`` is clipped to ``[eps, 1 - eps]`` first, where ``eps`` is the
    value returned by ``smallest_positive_number()``.
    """
    eps = smallest_positive_number()
    safe_predicted = T.clip(predicted, eps, 1.0 - eps)
    per_element = T.nnet.binary_crossentropy(safe_predicted, expected)
    return per_element.mean()
def build_objective(model, deterministic=False, epsilon=1e-12):
    """Mean binary cross-entropy between ``model.l_out`` and ``model.l_target``.

    The network output is clipped to ``[epsilon, 1 - epsilon]``; the targets
    are flattened.

    NOTE(review): unlike the targets, the network output is not flattened
    here -- confirm the two shapes are meant to line up.
    """
    probabilities = nn.layers.get_output(model.l_out,
                                         deterministic=deterministic)
    flat_targets = T.flatten(nn.layers.get_output(model.l_target))
    probabilities = T.clip(probabilities, epsilon, 1.-epsilon)
    return T.mean(T.nnet.binary_crossentropy(probabilities, flat_targets))
def lrelu(x):
    """Apply ``relu(x, 1/3)`` and clip the result to ``[-3, 3]``."""
    activated = tensor.nnet.relu(x, 1. / 3)
    return tensor.clip(activated, -3.0, 3.0)
def train_rnn():
    """Train the LSTM caption model and pickle its parameters and vocabulary.

    Reads ``./data/image_caption_with_cnn_features.pkl``, builds the
    vocabulary, trains for ``ITERATIONS`` steps (logging a validation loss
    every 250 iterations) and writes ``./data/trained_lstm.pkl``.

    Side effects: rebinds the module-level ``vocab``, ``CNN_FEATURE_SIZE``,
    ``word_to_index``, ``index_to_word``, ``SEQUENCE_LENGTH`` and
    ``MAX_SENTENCE_LENGTH``.
    """
    global vocab, CNN_FEATURE_SIZE, word_to_index, index_to_word, SEQUENCE_LENGTH, MAX_SENTENCE_LENGTH
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Load the preprocessed dataset containing features extracted by GoogLeNet.
    # Pickles are binary data: open in 'rb' and close the handle afterwards.
    # NOTE: pickle.load executes arbitrary code; only use trusted files.
    with open('./data/image_caption_with_cnn_features.pkl', 'rb') as dataset_file:
        dataset = pickle.load(dataset_file)

    # Count words occurring at least 2 times (minus a few excluded tokens)
    # and construct mapping int <-> word
    allwords = Counter()
    for item in dataset:
        for sentence in item['sentences']:
            allwords.update(sentence['tokens'])

    vocab = [k for k, v in allwords.items()
             if (v >= 2 and k not in ['best', 'Best', '!', 'ever'])]
    vocab.insert(0, '#START#')
    vocab.append('#END#')

    word_to_index = {w: i for i, w in enumerate(vocab)}
    index_to_word = {i: w for i, w in enumerate(vocab)}

    logging.info('Size of vocabulary: {0}'.format(len(vocab)))

    SEQUENCE_LENGTH = 9
    MAX_SENTENCE_LENGTH = SEQUENCE_LENGTH - 3  # 1 for image, 1 for start token, 1 for end token
    BATCH_SIZE = 75
    CNN_FEATURE_SIZE = 5
    EMBEDDING_SIZE = 1024
    LR = 0.001
    ITERATIONS = 19000

    # Configuration 23213: 512:0.001:20000
    # Configuration 23214: 256:0.001:20000
    # Configuration 23216: 512:0.0001:20000
    # Configuration 23217: 1024:0.0001:20000
    # Configuration 23227: 512:0.01:50000
    # Configuration 23229: 512:0.001:50000
    # Configuration 23231: 512:0.001:20000 BS=200
    # Configuration 23232: 1024:0.001:20000 BS=200
    # Configuration 23233: 1024:0.0001:20000 BS=200 SEQ = 32
    # Configuration 23234: 512:0.0001:20000 BS=200 SEQ=16
    # Configuration 23235: 512:0.0001:20000 BS=100 SEQ=8
    # Configuration 23242: 1024:0.0001:20000 BS=200 SEQ=13
    # Configuration 23243: 512:0.0001:20000 BS=200 SEQ=13 v >= 5
    # Configuration 23246: 512:0.0001:75000 BS=100 SEQ=15 removing best ! AND v >= 4
    # Config 23318: 512:0.001:25000 BS=200 SEQ=13 removing best ! AND v >= 3
    # Config 23319: 512:0.0001:25000 BS=100 SEQ=11 removing best ! AND v >= 3
    # Config 23322: 512:0.0001:25000 BS=100 SEQ=11 removing best ! AND v >= 2
    # Config 23323: 1024:0.001:25000 BS=100 SEQ=11 removing best ! AND v >= 2

    logging.info('Embeddings: {0} Learning rate: {1} Iter: {2}'.format(
        EMBEDDING_SIZE, LR, ITERATIONS))

    # sentence embedding maps integer sequence with dim (BATCH_SIZE, SEQUENCE_LENGTH - 1) to
    # (BATCH_SIZE, SEQUENCE_LENGTH-1, EMBEDDING_SIZE)
    l_input_sentence = lasagne.layers.InputLayer((BATCH_SIZE, SEQUENCE_LENGTH - 1))
    l_sentence_embedding = lasagne.layers.EmbeddingLayer(l_input_sentence,
                                                         input_size=len(vocab),
                                                         output_size=EMBEDDING_SIZE,
                                                         )

    # cnn embedding changes the dimensionality of the representation from
    # CNN_FEATURE_SIZE to EMBEDDING_SIZE, and reshapes to add the time
    # dimension - final dim (BATCH_SIZE, 1, EMBEDDING_SIZE)
    l_input_cnn = lasagne.layers.InputLayer((BATCH_SIZE, CNN_FEATURE_SIZE))
    l_cnn_embedding = lasagne.layers.DenseLayer(l_input_cnn, num_units=EMBEDDING_SIZE,
                                                nonlinearity=lasagne.nonlinearities.identity)

    l_cnn_embedding = lasagne.layers.ReshapeLayer(l_cnn_embedding, ([0], 1, [1]))

    # the two are concatenated to form the RNN input with dim (BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_SIZE)
    l_rnn_input = lasagne.layers.ConcatLayer([l_cnn_embedding, l_sentence_embedding])

    l_dropout_input = lasagne.layers.DropoutLayer(l_rnn_input, p=0.5)
    l_lstm = lasagne.layers.LSTMLayer(l_dropout_input,
                                      num_units=EMBEDDING_SIZE,
                                      unroll_scan=True,
                                      grad_clipping=5.)
    l_dropout_output = lasagne.layers.DropoutLayer(l_lstm, p=0.5)

    # the RNN output is reshaped to combine the batch and time dimensions
    # dim (BATCH_SIZE * SEQUENCE_LENGTH, EMBEDDING_SIZE)
    l_shp = lasagne.layers.ReshapeLayer(l_dropout_output, (-1, EMBEDDING_SIZE))

    # decoder is a fully connected layer with one output unit for each word in the vocabulary
    l_decoder = lasagne.layers.DenseLayer(l_shp, num_units=len(vocab),
                                          nonlinearity=lasagne.nonlinearities.softmax)

    # finally, the separation between batch and time dimension is restored
    l_out = lasagne.layers.ReshapeLayer(l_decoder, (BATCH_SIZE, SEQUENCE_LENGTH, len(vocab)))

    # Define symbolic variables for the various inputs
    # cnn feature vector
    x_cnn_sym = T.matrix()

    # sentence encoded as sequence of integer word tokens
    x_sentence_sym = T.imatrix()

    # mask defines which elements of the sequence should be predicted
    mask_sym = T.imatrix()

    # ground truth for the RNN output
    y_sentence_sym = T.imatrix()

    output = lasagne.layers.get_output(l_out, {
        l_input_sentence: x_sentence_sym,
        l_input_cnn: x_cnn_sym
    })

    loss = T.mean(calc_cross_ent(output, mask_sym, y_sentence_sym))

    MAX_GRAD_NORM = 15

    all_params = lasagne.layers.get_all_params(l_out, trainable=True)

    # Gradients are clipped element-wise first, then rescaled so their joint
    # norm does not exceed MAX_GRAD_NORM.
    all_grads = T.grad(loss, all_params)
    all_grads = [T.clip(g, -5, 5) for g in all_grads]
    all_grads, norm = lasagne.updates.total_norm_constraint(
        all_grads, MAX_GRAD_NORM, return_norm=True)

    updates = lasagne.updates.adam(all_grads, all_params, learning_rate=LR)

    f_train = theano.function([x_cnn_sym, x_sentence_sym, mask_sym, y_sentence_sym],
                              [loss, norm],
                              updates=updates
                              )

    f_val = theano.function([x_cnn_sym, x_sentence_sym, mask_sym, y_sentence_sym], loss)

    for iteration in range(ITERATIONS):
        x_cnn, x_sentence, y_sentence, mask = prep_batch_for_network(
            get_data_batch(dataset, BATCH_SIZE))
        loss_train, norm = f_train(x_cnn, x_sentence, mask, y_sentence)
        if not iteration % 250:
            logging.info('Iteration {} loss_train: {} norm: {}'.format(
                iteration, loss_train, norm))
            try:
                batch = get_data_batch(dataset, BATCH_SIZE, split='val')
                x_cnn, x_sentence, y_sentence, mask = prep_batch_for_network(batch)
                loss_val = f_val(x_cnn, x_sentence, mask, y_sentence)
                logging.info('Val loss: {}'.format(loss_val))
            except IndexError as err:
                # Validation is best-effort: report the skip instead of
                # dropping it silently, then carry on training.
                logging.warning('Validation skipped at iteration {}: {}'.format(
                    iteration, err))
                continue

    param_values = lasagne.layers.get_all_param_values(l_out)
    d = {'param values': param_values,
         'vocab': vocab,
         'word_to_index': word_to_index,
         'index_to_word': index_to_word,
         }
    # The path has no placeholders, so the former no-op .format() call on it
    # was dropped; the file is closed once the dump completes.
    with open(r'./data/trained_lstm.pkl', 'wb') as model_file:
        pickle.dump(d, model_file, protocol=pickle.HIGHEST_PROTOCOL)
def __init__(self,
             arch=None,
             lbda=1,
             perdatapoint=False,
             srng=RandomStreams(seed=427),
             prior=log_normal,
             opt='adam',
             coupling=4,
             coupling_dim=200,
             pad='same',
             stride=2,
             pool=None,
             uncoupled_init=0,
             convex_combination=0):
    """Build the hyper-network, the primary CNN and all Theano functions.

    :param arch: ``'Riashat'`` selects the two-layer 3x3 convolutional
        configuration defined below.
    :param lbda: used as ``-log(lbda)`` in the prior call.
    :param perdatapoint: if true, one noise sample is drawn per input row.
    :param srng: random stream the hyper-network noise is drawn from.
        NOTE(review): the default instance is created once at definition
        time and therefore shared by every model built with the default.
    :param prior: callable evaluated as ``prior(weights, 0., -log(lbda))``.
    :param opt: ``'adam'``, ``'momentum'`` or ``'sgd'``.
    :param coupling: number of coupling layers (0 disables them).
    :param coupling_dim: hidden size handed to ``CoupledWNDenseLayer``.
    :param pad, stride: overridden for ``arch == 'Riashat'``.
    :param pool: not used in this method other than being stored.
    :param uncoupled_init: forwarded to ``CoupledWNDenseLayer``.
    :param convex_combination: if truthy, a ``ConvexBiasLayer`` is appended
        with this value as ``upweight_primary``.
    """
    if arch == 'Riashat':
        kernel_width = 3
        self.kernel_width = kernel_width
        stride = 1
        self.stride = stride
        pad = 'valid'
        self.pad = pad
        self.weight_shapes = [
            (32, 1, kernel_width, kernel_width),  # -> (None, 16, 14, 14)
            (32, 32, kernel_width, kernel_width)
        ]  # -> (None, 16, 7, 7)
        self.args = [[32, kernel_width, stride, pad, rectify, 'none'],
                     [32, kernel_width, stride, pad, rectify, 'max']]
        self.pool_size = 5
    else:
        self.pool_size = 2

    # NOTE(review): weight_shapes and args are only assigned above for
    # arch == 'Riashat'; any other arch relies on them existing elsewhere
    # (e.g. as class attributes) -- confirm.
    self.n_kernels = np.array(self.weight_shapes)[:, 1].sum()
    self.kernel_shape = self.weight_shapes[0][:1] + self.weight_shapes[0][2:]
    print("kernel_shape %s" % (self.kernel_shape,))
    self.kernel_size = np.prod(self.weight_shapes[0])

    self.num_classes = 10
    if arch == 'Riashat':
        self.num_hids = 256
    else:
        self.num_hids = 128
    self.num_mlp_layers = 1
    # One hyper-network output per unit of the dense and output layers and
    # one per convolutional filter.
    self.num_mlp_params = self.num_classes + \
        self.num_hids * self.num_mlp_layers
    self.num_cnn_params = np.sum(np.array(self.weight_shapes)[:, 0])
    self.num_params = self.num_mlp_params + self.num_cnn_params
    self.coupling = coupling
    self.extra_l2 = 0
    self.convex_combination = convex_combination

    #def __init__(self,
    self.lbda = lbda
    self.perdatapoint = perdatapoint
    self.srng = srng
    self.prior = prior
    # Copies every local (all constructor arguments, plus `self` itself and
    # kernel_width when set) onto the instance; this is what provides
    # self.opt further down.
    self.__dict__.update(locals())

    #def _get_theano_variables(self):
    # FIX: the symbolic inputs are created before `wd1` is derived from
    # them; previously perdatapoint=True read self.input_var before it was
    # assigned.  The dead T.matrix('input_var') assignment, which was
    # overwritten on the next line, has been dropped.
    self.input_var = T.tensor4('input_var')  # <-- for CNN
    self.target_var = T.matrix('target_var')
    self.dataset_size = T.scalar('dataset_size')
    self.learning_rate = T.scalar('learning_rate')

    if perdatapoint:
        self.wd1 = self.input_var.shape[0]
    else:
        self.wd1 = 1

    #def _get_hyper_net(self):
    # initial random noise
    print(self.num_params)
    ep = self.srng.normal(size=(self.wd1, self.num_params), dtype=floatX)
    logdets_layers = []
    h_net = lasagne.layers.InputLayer([None, self.num_params])

    # mean and variation of the initial noise
    layer_temp = LinearFlowLayer(h_net)
    h_net = IndexLayer(layer_temp, 0)
    logdets_layers.append(IndexLayer(layer_temp, 1))

    if self.coupling:
        layer_temp = CoupledWNDenseLayer(h_net, coupling_dim,
                                         uncoupled_init=uncoupled_init)
        h_net = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

        for c in range(self.coupling - 1):
            h_net = PermuteLayer(h_net, self.num_params)

            layer_temp = CoupledWNDenseLayer(h_net, coupling_dim,
                                             uncoupled_init=uncoupled_init)
            h_net = IndexLayer(layer_temp, 0)
            logdets_layers.append(IndexLayer(layer_temp, 1))

    if self.convex_combination:
        layer_temp = ConvexBiasLayer(
            h_net, upweight_primary=self.convex_combination)
        h_net = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

    self.h_net = h_net
    self.weights = lasagne.layers.get_output(h_net, ep)
    self.logdets = sum([get_output(ld, ep) for ld in logdets_layers])

    #def _get_primary_net(self):
    # `t` is the running offset into the hyper-network output.
    t = np.cast['int32'](0)
    if 1:  #self.dataset == 'mnist':
        p_net = lasagne.layers.InputLayer([None, 1, 28, 28])
    print(p_net.output_shape)
    inputs = {p_net: self.input_var}

    #logpw = np.float32(0.)

    for ws, args in zip(self.weight_shapes, self.args):

        num_filters = ws[0]

        # TO-DO: generalize to have multiple samples?
        weight = self.weights[0, t:t + num_filters].dimshuffle(
            0, 'x', 'x', 'x')

        num_filters = args[0]
        filter_size = args[1]
        stride = args[2]
        pad = args[3]
        nonl = args[4]
        p_net = lasagne.layers.Conv2DLayer(p_net, num_filters,
                                           filter_size, stride, pad,
                                           nonlinearity=nonl)
        p_net = stochastic_weight_norm(p_net, weight)

        if args[5] == 'max':
            p_net = lasagne.layers.MaxPool2DLayer(p_net, self.pool_size)
        #print p_net.output_shape
        t += num_filters

    for layer in range(self.num_mlp_layers):
        weight = self.weights[:, t:t + self.num_hids].reshape(
            (self.wd1, self.num_hids))
        p_net = lasagne.layers.DenseLayer(p_net, self.num_hids,
                                          nonlinearity=rectify)
        p_net = stochastic_weight_norm(p_net, weight)
        if self.extra_l2:
            self.l2_penalty = lasagne.regularization.regularize_layer_params_weighted(
                {p_net: 3.5 / 128}, lasagne.regularization.l2)
        t += self.num_hids

    weight = self.weights[:, t:t + self.num_classes].reshape(
        (self.wd1, self.num_classes))

    p_net = lasagne.layers.DenseLayer(p_net, self.num_classes,
                                      nonlinearity=nonlinearities.softmax)
    p_net = stochastic_weight_norm(p_net, weight)

    y = T.clip(get_output(p_net, inputs), 0.001, 0.999)  # stability

    self.p_net = p_net
    self.y = y

    #def _get_params(self):
    params = lasagne.layers.get_all_params([self.h_net, self.p_net])
    # Keep every parameter whose type is not exactly RSSV.
    self.params = list()
    for param in params:
        if type(param) is not RSSV:
            self.params.append(param)

    # `reset` restores every parameter to the value it has right now.
    params0 = lasagne.layers.get_all_param_values([self.h_net, self.p_net])
    params = lasagne.layers.get_all_params([self.h_net, self.p_net])
    updates = {p: p0 for p, p0 in zip(params, params0)}
    self.reset = theano.function([], None, updates=updates)
    self.add_reset('init')

    #def _get_elbo(self):
    logdets = self.logdets
    self.logqw = -logdets
    self.logpw = self.prior(self.weights, 0., -T.log(self.lbda)).sum(1)
    self.kl = (self.logqw - self.logpw).mean()
    self.kl_term = self.kl / T.cast(self.dataset_size, floatX)
    self.logpyx = -cc(self.y, self.target_var).mean()
    self.loss = -self.logpyx + self.kl_term

    # DK - extra monitoring (TODO)
    params = self.params
    ds = self.dataset_size
    self.logpyx_grad = flatten_list(
        T.grad(-self.logpyx, params, disconnected_inputs='warn')).norm(2)
    self.logpw_grad = flatten_list(
        T.grad(-self.logpw.mean() / ds, params,
               disconnected_inputs='warn')).norm(2)
    self.logqw_grad = flatten_list(
        T.grad(self.logqw.mean() / ds, params,
               disconnected_inputs='warn')).norm(2)
    self.monitored = [
        self.logpyx, self.logpw, self.logqw, self.logpyx_grad,
        self.logpw_grad, self.logqw_grad
    ]

    #def _get_grads(self):
    # NOTE(review): self.max_norm and self.clip_grad are not assigned in
    # this method -- presumably class attributes; confirm.
    grads = T.grad(self.loss, self.params)
    mgrads = lasagne.updates.total_norm_constraint(grads,
                                                   max_norm=self.max_norm)
    cgrads = [T.clip(g, -self.clip_grad, self.clip_grad) for g in mgrads]
    if self.opt == 'adam':
        self.updates = lasagne.updates.adam(
            cgrads, self.params, learning_rate=self.learning_rate)
    elif self.opt == 'momentum':
        self.updates = lasagne.updates.nesterov_momentum(
            cgrads, self.params, learning_rate=self.learning_rate)
    elif self.opt == 'sgd':
        self.updates = lasagne.updates.sgd(
            cgrads, self.params, learning_rate=self.learning_rate)

    #def _get_train_func(self):
    train = theano.function([
        self.input_var, self.target_var, self.dataset_size,
        self.learning_rate
    ], self.loss, updates=self.updates)
    self.train_func = train
    # DK - putting this here, because is doesn't get overwritten by subclasses
    self.monitor_func = theano.function([
        self.input_var, self.target_var, self.dataset_size,
        self.learning_rate
    ], self.monitored, on_unused_input='warn')

    #def _get_useful_funcs(self):
    self.predict_proba = theano.function([self.input_var], self.y)
    self.predict = theano.function([self.input_var], self.y.argmax(1))
def kullback_leibler_divergence_loss(y_pred, y_actual, **kwargs):
    """Kullback-Leibler divergence ``sum(p * log(p / q))`` over the last axis.

    ``p`` is ``y_actual`` and ``q`` is ``y_pred``, both clipped to
    ``[epsilon, 1]`` with the module-level ``epsilon``.  ``kwargs`` is
    accepted and ignored.
    """
    p = T.clip(y_actual, epsilon, 1)
    q = T.clip(y_pred, epsilon, 1)
    pointwise = p * T.log(p / q)
    return T.sum(pointwise, axis=-1)
def parameters_updates(self, LR):
    """Build the list of ``(shared_variable, new_value)`` update pairs.

    Every trained parameter gets an Adam step (beta1=0.9, beta2=0.999,
    epsilon=1e-8) scaled by the bias-corrected rate derived from ``LR`` and
    the step counter ``self.n_samples``.  When ``self.binary_training`` is
    ``True`` the steps of ``Wa``/``Wx`` are clipped to ``+/- W0_a``/``W0_x``.
    When ``self.BN`` is ``True`` the running batch-norm statistics are also
    updated as an exponential moving average with factor 0.05.
    """
    beta1 = 0.9
    beta2 = 0.999
    epsilon = 1e-8
    alpha = 0.05

    t = self.n_samples + 1
    a_t = LR * T.sqrt(1 - beta2**t) / (1 - beta1**t)

    updates = []

    def adam_step(m, v, grad):
        # New first/second moment estimates and the resulting step.
        m_t = beta1 * m + (1 - beta1) * grad
        v_t = beta2 * v + (1 - beta2) * grad**2
        step = a_t * m_t / (T.sqrt(v_t) + epsilon)
        return m_t, v_t, step

    def commit(param, m, v, m_t, v_t, step):
        # Record the moment updates first, then the parameter update.
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((param, param - step))

    #updates.append((self.Wba, self.Wba))

    m_t, v_t, step = adam_step(self.m_Wa, self.v_Wa, self.dEdWa)
    if self.binary_training == True:
        step = T.clip(step, -self.W0_a, self.W0_a)
    commit(self.Wa, self.m_Wa, self.v_Wa, m_t, v_t, step)

    m_t, v_t, step = adam_step(self.m_Wx, self.v_Wx, self.dEdWx)
    if self.binary_training == True:
        step = T.clip(step, -self.W0_x, self.W0_x)
    commit(self.Wx, self.m_Wx, self.v_Wx, m_t, v_t, step)

    if self.BN == True:
        bn_params = [
            (self.bn_a_beta, self.m_bn_a_beta, self.v_bn_a_beta,
             self.dEdbn_a_beta),
            (self.bn_a_gamma, self.m_bn_a_gamma, self.v_bn_a_gamma,
             self.dEdbn_a_gamma),
            (self.bn_b_gamma, self.m_bn_b_gamma, self.v_bn_b_gamma,
             self.dEdbn_b_gamma),
            (self.bn_c_beta, self.m_bn_c_beta, self.v_bn_c_beta,
             self.dEdbn_c_beta),
            (self.bn_c_gamma, self.m_bn_c_gamma, self.v_bn_c_gamma,
             self.dEdbn_c_gamma),
        ]
    else:
        bn_params = [
            (self.bn_a_beta, self.m_bn_a_beta, self.v_bn_a_beta,
             self.dEdbn_a_beta),
            (self.bn_c_beta, self.m_bn_c_beta, self.v_bn_c_beta,
             self.dEdbn_c_beta),
        ]
    for param, m, v, grad in bn_params:
        m_t, v_t, step = adam_step(m, v, grad)
        commit(param, m, v, m_t, v_t, step)

    if self.BN == True:
        # very slightly biased variance estimation
        running_stats = [
            (self.bn_a_mean, self.a_mean),
            (self.bn_a_var, self.a_var),
            (self.bn_b_mean, self.b_mean),
            (self.bn_b_var, self.b_var),
            (self.bn_c_mean, self.c_mean),
            (self.bn_c_var, self.c_var),
        ]
        for running, batch_value in running_stats:
            updates.append(
                (running, (1 - alpha) * running + alpha * batch_value))

    m_t, v_t, step = adam_step(self.m_h0, self.v_h0, self.dEdh0)
    commit(self.h0, self.m_h0, self.v_h0, m_t, v_t, step)

    m_t, v_t, step = adam_step(self.m_c0, self.v_c0, self.dEdc0)
    commit(self.c0, self.m_c0, self.v_c0, m_t, v_t, step)

    # Advance the step counter.
    updates.append((self.n_samples, t))

    return updates
def clip(gradient, bound):
    """Clip ``gradient`` element-wise to ``[-bound, bound]``.

    ``bound`` must be strictly positive; this is checked with an assertion.
    """
    assert bound > 0
    lower, upper = -bound, bound
    return T.clip(gradient, lower, upper)
def hard_sigm(self, x):
    """Hard sigmoid: ``(x + 1) / 2`` clipped to ``[0, 1]``."""
    shifted = (x + 1) / 2
    return T.clip(shifted, 0, 1)
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
                           dataset='mnist.pkl.gz', batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model, using clipped and noise-perturbed gradients.

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    :type batch_size: int
    :param batch_size: number of examples in each minibatch

    :rtype: tuple
    :returns: ``(validation_records, training_records, test_score * 100)``
              where the two record lists hold one percentage per epoch and
              the last item is the test error (in percent) of the model with
              the best validation error
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W,b)
    g_W_temp = T.grad(cost=cost, wrt=classifier.W)
    g_b_temp = T.grad(cost=cost, wrt=classifier.b)

    # clip every gradient entry to [-2, 2] ...
    g_W_clip = T.clip(g_W_temp, -2, 2)
    g_b_clip = T.clip(g_b_temp, -2, 2)

    # ... then perturb it with zero-mean Gaussian noise (std 0.28) drawn from
    # a fixed-seed random stream
    srng = RandomStreams(seed=234)
    rv_n_w = srng.normal(g_W_temp.shape, avg=0.0, std=0.28)
    rv_n_b = srng.normal(g_b_temp.shape, avg=0.0, std=0.28)
    g_W = g_W_clip + rv_n_w
    g_b = g_b_clip + rv_n_b

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # this theano function returns training error for each minibatch
    train_model_loss = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    # early-stopping parameters
    patience = 5000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    validation_records = []
    training_records = []
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            # run one SGD step; the returned minibatch cost is not needed here
            train_model(minibatch_index)
            # global iteration number (named so it does not shadow the
            # builtin `iter`)
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)

                # compute zero-one loss on training set
                training_losses = [
                    train_model_loss(i) for i in range(n_train_batches)
                ]
                this_training_loss = numpy.mean(training_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f training error %f %%'
                    % (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100., this_training_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iteration * patience_increase)

                    best_validation_loss = this_validation_loss
                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)

                    print((' epoch %i, minibatch %i/%i, test error of'
                           ' best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

                    # save the best model
                    with open('best_model.pkl', 'wb') as f:
                        pickle.dump(classifier, f)

            if patience <= iteration:
                done_looping = True
                break

        # NOTE(review): recorded once per epoch, using the most recent
        # validation pass; confirm this matches the intended granularity.
        validation_records.append(this_validation_loss * 100)
        training_records.append(this_training_loss * 100)

    end_time = timeit.default_timer()
    print(('Optimization complete with best validation score of %f %%,'
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print('The code run for %d epochs, with %f epochs/sec' %
          (epoch, 1. * epoch / (end_time - start_time)))
    print(
        ('The code for file ' + os.path.split(__file__)[1] +
         ' ran for %.1fs' % ((end_time - start_time))),
        file=sys.stderr)
    return validation_records, training_records, test_score * 100
def hmc_updates(positions, stepsize, avg_acceptance_rate, final_pos, accept,
                target_acceptance_rate, stepsize_inc, stepsize_dec,
                stepsize_min, stepsize_max, avg_acceptance_slowness):
    """Build the updates applied after `n_steps` of HMC sampling.

    Three quantities are updated: the position (replaced by the proposal
    where the move is accepted), the stepsize (nudged towards a target
    acceptance rate and kept within bounds) and the running average of the
    acceptance rate (an exponential moving average).

    Parameters
    ----------
    positions: shared variable, theano matrix
        Shared theano matrix whose rows contain the old position
    stepsize: shared variable, theano scalar
        Shared theano scalar containing current step size
    avg_acceptance_rate: shared variable, theano scalar
        Shared theano scalar containing the current average acceptance rate
    final_pos: theano matrix
        Proposed new positions, same layout as `positions`
    accept: theano variable
        Boolean-type acceptance indicator; its axis 0 is broadcast over the
        remaining dimensions of `final_pos`.
    target_acceptance_rate: float
        The stepsize is modified in order to track this target acceptance
        rate.
    stepsize_inc: float
        Factor the stepsize is multiplied by when the average acceptance
        rate is above the target.
    stepsize_dec: float
        Factor the stepsize is multiplied by otherwise.
    stepsize_min: float
        Lower-bound on `stepsize`.
    stepsize_max: float
        Upper-bound on `stepsize`.
    avg_acceptance_slowness: float
        Weight kept on the previous average;
        (1-avg_acceptance_slowness) is the weight given to the newest
        observation.

    Returns
    -------
    rval1: list of (shared variable, new value) pairs
        Updates for the position, the stepsize and the average acceptance
        rate, in that order.
    """
    # --- position: take the proposal where accepted, otherwise stay put ---
    # `accept` keeps its axis 0; every trailing axis of `final_pos` becomes
    # a broadcastable dimension.
    trailing_axes = ('x', ) * (final_pos.ndim - 1)
    accept_matrix = accept.dimshuffle(0, *trailing_axes)
    new_positions = TT.switch(accept_matrix, final_pos, positions)

    # --- stepsize: grow when accepting more often than the target, shrink
    # otherwise, then keep the result inside [stepsize_min, stepsize_max] ---
    above_target = avg_acceptance_rate > target_acceptance_rate
    unbounded_stepsize = TT.switch(above_target,
                                   stepsize * stepsize_inc,
                                   stepsize * stepsize_dec)
    new_stepsize = TT.clip(unbounded_stepsize, stepsize_min, stepsize_max)

    # --- acceptance rate: exponential moving average ---
    mean_dtype = theano.scalar.upcast(accept.dtype, avg_acceptance_rate.dtype)
    history_term = avg_acceptance_slowness * avg_acceptance_rate
    newest_term = ((1.0 - avg_acceptance_slowness) *
                   accept.mean(dtype=mean_dtype))
    new_acceptance_rate = TT.add(history_term, newest_term)

    update_pairs = [
        (positions, new_positions),
        (stepsize, new_stepsize),
        (avg_acceptance_rate, new_acceptance_rate),
    ]
    return update_pairs
def _interpolate(im, x, y, out_height, out_width, border_mode):
    """Bilinearly sample `im` at the given normalised coordinates.

    `im` is unpacked as [num_batch, height, width, channels]. `x` and `y`
    are coordinates in [-1, 1] that are rescaled to pixel positions; the four
    surrounding pixels are looked up in the flattened image and blended with
    bilinear weights.

    :param im: input images, [num_batch, height, width, channels]
    :param x: flat vector of normalised horizontal coordinates
        (presumably num_batch * out_height * out_width entries; verify
        against caller)
    :param y: flat vector of normalised vertical coordinates, same length
    :param out_height: output height, used to repeat the per-image offset
    :param out_width: output width, used to repeat the per-image offset
    :param border_mode: 'nearest', 'mirror' or 'wrap'; how indices falling
        outside the image are mapped back inside
    :returns: interpolated values, one row of `channels` entries per
        coordinate
    :raises ValueError: for any other `border_mode`
    """
    num_batch, height, width, channels = im.shape

    # float copies of the image size, needed for coordinate arithmetic
    float_type = theano.config.floatX
    height_f = T.cast(height, float_type)
    width_f = T.cast(width, float_type)

    # scale coordinates from [-1, 1] to [0, width/height - 1]
    x = (x + 1) / 2 * (width_f - 1)
    y = (y + 1) / 2 * (height_f - 1)

    # corners of the 2x2 pixel neighbourhood around each coordinate; kept in
    # floatX for the weights, converted to int64 below for indexing
    x0_f, y0_f = T.floor(x), T.floor(y)
    x1_f, y1_f = x0_f + 1, y0_f + 1

    # bring the corner indices back inside the image per the border mode
    if border_mode == 'nearest':
        last_col = width_f - 1
        last_row = height_f - 1
        corners = (T.clip(x0_f, 0, last_col), T.clip(x1_f, 0, last_col),
                   T.clip(y0_f, 0, last_row), T.clip(y1_f, 0, last_row))
    elif border_mode == 'mirror':
        w = 2 * (width_f - 1)
        h = 2 * (height_f - 1)
        corners = (T.minimum(x0_f % w, -x0_f % w),
                   T.minimum(x1_f % w, -x1_f % w),
                   T.minimum(y0_f % h, -y0_f % h),
                   T.minimum(y1_f % h, -y1_f % h))
    elif border_mode == 'wrap':
        corners = (T.mod(x0_f, width_f), T.mod(x1_f, width_f),
                   T.mod(y0_f, height_f), T.mod(y1_f, height_f))
    else:
        raise ValueError("border_mode must be one of "
                         "'nearest', 'mirror', 'wrap'")
    x0, x1, y0, y1 = [T.cast(corner, 'int64') for corner in corners]

    # The lookup happens in the flattened input, i.e.
    # [num_batch*height*width, channels], so every index is offset by the
    # start of its image and of its row.
    row_stride = width
    image_stride = width*height
    base = T.repeat(
        T.arange(num_batch, dtype='int64')*image_stride,
        out_height*out_width)
    top_row = base + y0*row_stride
    bottom_row = base + y1*row_stride

    # gather the four neighbouring pixels for all samples
    im_flat = im.reshape((-1, channels))
    Ia = im_flat[top_row + x0]
    Ib = im_flat[bottom_row + x0]
    Ic = im_flat[top_row + x1]
    Id = im_flat[bottom_row + x1]

    # bilinear weights: each pixel is weighted by the distance to the
    # opposite corner
    right_gap, left_gap = x1_f - x, x - x0_f
    lower_gap, upper_gap = y1_f - y, y - y0_f
    wa = (right_gap * lower_gap).dimshuffle(0, 'x')
    wb = (right_gap * upper_gap).dimshuffle(0, 'x')
    wc = (left_gap * lower_gap).dimshuffle(0, 'x')
    wd = (left_gap * upper_gap).dimshuffle(0, 'x')

    return T.sum([wa*Ia, wb*Ib, wc*Ic, wd*Id], axis=0)
def forward(self, inputtensor):
    """Clip the first input to [-1, 1] and pass it through ``binaryOp``.

    :param inputtensor: sequence whose first element is the tensor to process
    :returns: one-element tuple holding ``binaryOp`` of the clipped input
    """
    bounded = T.clip(inputtensor[0], -1, 1)
    binarized = binaryOp(bounded)
    return (binarized,)
def clipped_v(self, x):
    """Return the absolute value of ``x`` clamped to the range [0, 1].

    :param x: value or tensor expression
    :returns: ``T.clip(T.abs_(x), 0, 1)``
    """
    magnitude = T.abs_(x)
    return T.clip(magnitude, 0, 1)
Prob1 = T.tanh(th*Prob1) #the nonlinear tanh() function accelerates the state transfer delta_W2 = updates[param] - param delta_W2_direction = T.cast(T.sgn(delta_W2),theano.config.floatX) dis2=T.abs_(delta_W2) #the absolute distance k2=delta_W2_direction*T.floor(dis2/L) #the integer part v2=delta_W2-k2*L #the decimal part Prob2= T.abs_(v2/L) #the transfer probability Prob2 = T.tanh(th*Prob2) #the nonlinear tanh() function accelerates the state transfer srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579)) Gate1 = T.cast(srng.binomial(n=1, p=Prob1, size=T.shape(Prob1)), theano.config.floatX) # Gate1 is a binary variable with probability of Prob1 to be 1 Gate2 = T.cast(srng.binomial(n=1, p=Prob2, size=T.shape(Prob2)), theano.config.floatX) # Gate2 is a binary variable with probability of Prob2 to be 1 delta_W1_new=(k1+delta_W1_direction*Gate1)*L #delta_W1_new = k*L where k is an integer updates_param1 = T.clip(parambest + delta_W1_new,-H,H) updates_param1 = weight_tune(updates_param1,-H,H) #fine tuning for guaranteeing each element strictly constrained in the discrete space delta_W2_new=(k2+delta_W2_direction*Gate2)*L #delta_W2_new = k*L where k is an integer updates_param2 = T.clip(param + delta_W2_new,-H,H) updates_param2 = weight_tune(updates_param2,-H,H) #fine tuning for guaranteeing each element strictly constrained in the discrete space # if update_type<100, the weight probabilistically tranfers from parambest to state_rand, which helps to search the global minimum # elst it would probabilistically transfer from param to a state nearest to updates[param] updates[param]= T.switch(T.lt(update_type,100), updates_param1, updates_param2) return updates def train( network, train_fn,val_fn,
def cross_entropy_binary(self, y):
    """Binary cross-entropy between ``self.p_y_given_x`` and ``y``, summed over axis 1.

    The predictions are first clipped to [1e-7, 1 - 1e-7] so they never
    reach exactly 0 or 1 before ``binary_crossentropy`` is applied.

    :param y: targets, passed as second argument to ``binary_crossentropy``
    :returns: the element-wise cross-entropy summed along axis 1
    """
    eps = 1e-7
    output = T.clip(self.p_y_given_x, eps, 1 - eps)
    elementwise = binary_crossentropy(output, y)
    return T.sum(elementwise, axis=1)
#updates = lasagne.updates.adam(grads, params, learning_rate=LEARNING_RATE) ########### all_params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param == True) ip_params = lib.get_params(ip_cost, lambda x: hasattr(x, 'param') and x.param==True\ and 'BigFrameLevel' in x.name) other_params = [p for p in all_params if p not in ip_params] all_params = ip_params + other_params lib.print_params_info(ip_params, path=FOLDER_PREFIX) lib.print_params_info(other_params, path=FOLDER_PREFIX) lib.print_params_info(all_params, path=FOLDER_PREFIX) ip_grads = T.grad(ip_cost, wrt=ip_params, disconnected_inputs='warn') ip_grads = [ T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in ip_grads ] other_grads = T.grad(cost, wrt=other_params, disconnected_inputs='warn') other_grads = [ T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in other_grads ] grads = T.grad(cost, wrt=all_params, disconnected_inputs='warn') grads = [ T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads ] ip_updates = lasagne.updates.adam(ip_grads, ip_params) other_updates = lasagne.updates.adam(other_grads, other_params)
l_output = ReshapeLayer(l_output_dense, (batch_size, seqlen, size + 2)) return l_output, l_ntm if __name__ == '__main__': # Define the input and expected output variable input_var, target_var = T.tensor3s('input', 'target') # The generator to sample examples from generator = RepeatCopyTask(batch_size=1, max_iter=1000000, size=8, min_length=3, \ max_length=5, max_repeats=5, unary=True, end_marker=True) # The model (1-layer Neural Turing Machine) l_output, l_ntm = model(input_var, batch_size=generator.batch_size, size=generator.size, num_units=100, memory_shape=(128, 20)) # The generated output variable and the loss function pred_var = T.clip(lasagne.layers.get_output(l_output), 1e-6, 1. - 1e-6) loss = T.mean(lasagne.objectives.binary_crossentropy(pred_var, target_var)) # Create the update expressions params = lasagne.layers.get_all_params(l_output, trainable=True) updates = lasagne.updates.adam(loss, params, learning_rate=5e-4) # Compile the function for a training step, as well as the prediction function and # a utility function to get the inner details of the NTM train_fn = theano.function([input_var, target_var], loss, updates=updates) ntm_fn = theano.function([input_var], pred_var) ntm_layer_fn = theano.function([input_var], lasagne.layers.get_output(l_ntm, get_details=True)) # Training try: scores, all_scores = [], [] for i, (example_input, example_output) in generator: score = train_fn(example_input, example_output)
def __init__(self,
             learning_rate,
             drop_out,
             Layers,
             N_hidden,
             D_input,
             D_out,
             Task_type='regression',
             L2_lambda=0.0,
             _EPSILON=1e-12,
             fixlayer=(),
             mid_target='0'):
    """Build a dense feed-forward network and compile its Theano functions.

    The network is an input layer, `Layers` blocks of (ReLU dense layer,
    dropout) and one dense output layer. Training uses Adam on the training
    loss plus the accumulated L2 penalty.

    :param learning_rate: initial learning rate; stored in the shared
        variable ``self.lr`` that is handed to Adam
    :param drop_out: value passed to ``lasagne.layers.dropout`` after every
        hidden layer
    :param Layers: number of hidden (dense + dropout) blocks
    :param N_hidden: number of units in every hidden layer
    :param D_input: size of the input vectors
    :param D_out: number of output units
    :param Task_type: ``'regression'`` selects a squared-error loss; any
        other value applies a softmax to the outputs and uses categorical
        cross-entropy
    :param L2_lambda: factor applied to each hidden layer's L2 penalty
    :param _EPSILON: softmax outputs are clipped to
        ``[_EPSILON, 1 - _EPSILON]``
    :param fixlayer: iterable of layer numbers whose two parameters are
        removed from the list given to the optimizer (presumably 1-based
        and in ascending order; verify against caller)
    :param mid_target: ``"mid_target"`` gives the output layer a ReLU
        nonlinearity; any other value keeps it linear
    """
    #------variables------
    #label
    self.hard_target = T.matrix('hard_target')
    #input layer
    self.l_in = lasagne.layers.InputLayer(shape=(None, D_input))
    #last hidden layer; starts as the input and grows as layers are stacked
    self.l_hid = self.l_in
    #l2 regularization, accumulated over the hidden layers
    self.l2_penalty = 0
    self.lr = theano.shared(
        np.array(learning_rate, dtype=theano.config.floatX))

    #stack hidden layers
    for i in range(Layers):
        self.l_hid = lasagne.layers.DenseLayer(
            self.l_hid,
            num_units=N_hidden,
            W=lasagne.init.HeUniform(gain='relu'),
            b=lasagne.init.Constant(0.001),
            nonlinearity=lasagne.nonlinearities.rectify)
        print('Add Dense layer')
        self.l2_penalty += lasagne.regularization.regularize_layer_params(
            self.l_hid, l2) * L2_lambda
        self.l_hid = lasagne.layers.dropout(self.l_hid, drop_out)
        print('Add Dropout layer')

    #out_layer
    if mid_target == "mid_target":
        self.l_out = lasagne.layers.DenseLayer(
            self.l_hid,
            num_units=D_out,
            nonlinearity=lasagne.nonlinearities.rectify)
        print('relu out')
    else:
        self.l_out = lasagne.layers.DenseLayer(
            self.l_hid,
            num_units=D_out,
            nonlinearity=lasagne.nonlinearities.linear)
        print('linear out')

    #select weights not to be updated
    d = 1  # 1 + number of parameter pairs deleted so far
    self.all_params = lasagne.layers.get_all_params(self.l_out)
    self.get_weights = lasagne.layers.get_all_param_values(self.l_out)
    for f in fixlayer:
        # the same index is deleted twice: after the first deletion the
        # second parameter of the pair has moved into that slot
        del self.all_params[(f - d) * 2]
        del self.all_params[(f - d) * 2]
        d += 1

    #------training function------
    #output of net for train / eval
    self.l_out_train = lasagne.layers.get_output(self.l_out,
                                                 deterministic=False)
    self.l_out_eval = lasagne.layers.get_output(self.l_out,
                                                deterministic=True)
    if Task_type != 'regression':
        self.l_out_train = T.exp(self.l_out_train) / T.sum(
            T.exp(self.l_out_train), axis=1, keepdims=True)
        self.l_out_eval = T.exp(self.l_out_eval) / T.sum(
            T.exp(self.l_out_eval), axis=1, keepdims=True)
        print('Add Softmax output layer')
        # keep probabilities away from exactly 0 and 1
        # NOTE(review): clipping is applied to the softmax outputs only;
        # confirm regression outputs were never meant to be clipped.
        self.l_out_train = T.clip(self.l_out_train, _EPSILON,
                                  1.0 - _EPSILON)
        self.l_out_eval = T.clip(self.l_out_eval, _EPSILON, 1.0 - _EPSILON)

    #loss function for train / eval
    if Task_type != 'regression':
        self.loss_train = T.mean(
            lasagne.objectives.categorical_crossentropy(
                self.l_out_train, self.hard_target))
        self.loss_eval = T.mean(
            lasagne.objectives.categorical_crossentropy(
                self.l_out_eval, self.hard_target))
    else:
        self.loss_train = T.mean(
            lasagne.objectives.squared_error(self.l_out_train,
                                             self.hard_target))
        # the evaluation loss must use the deterministic output, like the
        # classification branch above, so that it is not affected by dropout
        self.loss_eval = T.mean(
            lasagne.objectives.squared_error(self.l_out_eval,
                                             self.hard_target))

    self.acc = T.mean(
        lasagne.objectives.categorical_accuracy(self.l_out_eval,
                                                self.hard_target))

    #eval functions
    self.get_acc = theano.function([self.l_in.input_var, self.hard_target],
                                   self.acc)
    self.get_loss = theano.function(
        [self.l_in.input_var, self.hard_target], self.loss_eval)

    self.updates = lasagne.updates.adam(self.loss_train + self.l2_penalty,
                                        self.all_params,
                                        learning_rate=self.lr)
    #train function: performs one update step, returns nothing
    self.train = theano.function([self.l_in.input_var, self.hard_target],
                                 updates=self.updates)
    #train function that also reports the evaluation loss and accuracy
    self.train_loss_acc = theano.function(
        [self.l_in.input_var, self.hard_target],
        [self.loss_eval, self.acc],
        updates=self.updates)
    #output function
    self.get_out = theano.function([self.l_in.input_var], self.l_out_eval)
    self.hid_out = theano.function([self.l_in.input_var],
                                   lasagne.layers.get_output(
                                       self.l_hid, deterministic=True))
def hard_sigmoid(x):
    """Piecewise-linear sigmoid: rescale ``x`` by ``(x + 1) / 2`` and clamp to [0, 1].

    :param x: value or tensor expression to squash
    :returns: ``T.clip((x + 1.) / 2., 0, 1)``
    """
    rescaled = (x + 1.) / 2.
    return T.clip(rescaled, 0, 1)
def sqrt(x):
    """Square root that treats negative inputs as zero.

    :param x: value or tensor expression
    :returns: ``T.sqrt`` of ``x`` after clipping it to ``[0, inf]``
    """
    non_negative = T.clip(x, 0., np.inf)
    return T.sqrt(non_negative)