def get_output_for(self, input, deterministic=False,
                   batch_norm_use_averages=None,
                   batch_norm_update_averages=None, **kwargs):
    # Decide whether to use the stored averages or mini-batch statistics
    if batch_norm_use_averages is None:
        batch_norm_use_averages = deterministic
    use_averages = batch_norm_use_averages

    # Decide whether to update the stored averages
    if batch_norm_update_averages is None:
        batch_norm_update_averages = not deterministic
    update_averages = batch_norm_update_averages

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(range(input.ndim - len(self.axes)))
    pattern = ['x' if input_axis in self.axes
               else next(param_axes)
               for input_axis in range(input.ndim)]
    # and prepare the converse pattern removing those broadcastable axes
    unpattern = [d for d in range(input.ndim) if d not in self.axes]

    # call cuDNN if needed, obtaining normalized outputs and statistics
    if not use_averages or update_averages:
        # cuDNN requires beta/gamma tensors; create them if needed
        shape = tuple(s for (d, s) in enumerate(input.shape)
                      if d not in self.axes)
        gamma = self.gamma or theano.tensor.ones(shape)
        beta = self.beta or theano.tensor.zeros(shape)
        mode = 'per-activation' if self.axes == (0,) else 'spatial'
        (normalized, input_mean,
         input_inv_std) = dnn.dnn_batch_normalization_train(
            input, gamma.dimshuffle(pattern), beta.dimshuffle(pattern),
            mode, self.epsilon)

    # normalize with stored averages, if needed
    if use_averages:
        mean = self.mean.dimshuffle(pattern)
        inv_std = self.inv_std.dimshuffle(pattern)
        gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
        beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
        normalized = (input - mean) * (gamma * inv_std) + beta

    # update stored averages, if needed
    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_inv_std = theano.clone(self.inv_std, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = (
            (1 - self.alpha) * running_mean +
            self.alpha * input_mean.dimshuffle(unpattern))
        running_inv_std.default_update = (
            (1 - self.alpha) * running_inv_std +
            self.alpha * input_inv_std.dimshuffle(unpattern))
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        dummy = 0 * (running_mean + running_inv_std).dimshuffle(pattern)
        normalized = normalized + dummy

    return normalized
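
# --- Usage sketch (assumption: the method above belongs to Lasagne's
# BatchNormDNNLayer and a Lasagne build with GPU/cuDNN support is available so
# lasagne.layers.dnn imports). The batch_norm_* keyword arguments are forwarded
# by lasagne.layers.get_output, so the training and inference graphs differ
# only in the `deterministic` flag:

import lasagne
from lasagne.layers.dnn import BatchNormDNNLayer

l_in = lasagne.layers.InputLayer((None, 16, 32, 32))
l_bn = BatchNormDNNLayer(l_in)
# training: normalize with mini-batch statistics and update the running averages
train_out = lasagne.layers.get_output(l_bn, deterministic=False)
# inference: normalize with the stored running mean / inv_std, no updates
eval_out = lasagne.layers.get_output(l_bn, deterministic=True)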
def __init__(self, layers, momentum=0.9, eps=1e-5, renorm_max_r=1.0,
             renorm_max_d=0.0, renorm_max_it=10, json_param={}):
    super().__init__(layer_index=len(layers))

    self.input = layers[-1].output
    self.input_shape = layers[-1].output_shape

    #get parameters
    self.enabled = json_param.get("enabled", True)
    self.momentum = json_param.get("momentum", momentum)
    self.renorm_max_r = json_param.get("renormMaxR", renorm_max_r)
    self.renorm_max_d = json_param.get("renormMaxD", renorm_max_d)
    self.renorm_max_it = json_param.get("renormMaxIt", renorm_max_it)
    self.eps = json_param.get("eps", eps)

    if self.enabled:
        #initialize param
        param_shape = (self.input_shape[1],)
        self.omega = theano.shared(numpy.asarray(numpy.ones(param_shape), dtype=theano.config.floatX), name="bn omega")
        self.beta = theano.shared(numpy.asarray(numpy.zeros(param_shape), dtype=theano.config.floatX), name="bn beta")
        self.mean = theano.shared(numpy.asarray(numpy.zeros(param_shape), dtype=theano.config.floatX), name="bn mean")
        self.stdinv = theano.shared(numpy.asarray(numpy.ones(param_shape), dtype=theano.config.floatX), name="bn std inv")

        #evaluate
        x_shape = self.input_shape
        x = self.input

        #directly call cuDNN version until added to master
        dim = ['x', 0, 'x', 'x']
        use_cudnn = theano.sandbox.cuda.dnn.dnn_available() and (theano.sandbox.cuda.dnn.version() >= (5000, 5000))
        # use_cudnn = theano.gpuarray.dnn.dnn_available(None) and (theano.gpuarray.dnn.version() >= 5000)
        if use_cudnn:
            from theano.sandbox.cuda.dnn import dnn_batch_normalization_train, dnn_batch_normalization_test
            # from theano.gpuarray.dnn import dnn_batch_normalization_train, dnn_batch_normalization_test
            var = tensor.sqr(1.0 / self.stdinv)
            x_n_train, x_mean, x_stdinv = dnn_batch_normalization_train(x, self.omega.dimshuffle(dim), self.beta.dimshuffle(dim), 'spatial', self.eps)
            x_n_test = dnn_batch_normalization_test(x, self.omega.dimshuffle(dim), self.beta.dimshuffle(dim), self.mean.dimshuffle(dim), var.dimshuffle(dim), 'spatial', self.eps)
            x_std = 1.0 / x_stdinv
        else:
            #WARNING: BROKEN!
            xt = x.dimshuffle((1, 0, 2, 3)).flatten(2)
            x_mean = tensor.sum(xt, axis=1) / (self.input_shape[0]*self.input_shape[2]*self.input_shape[3])
            x_mean = tensor.cast(x_mean, "float32")
            x_std = tensor.sqrt(tensor.mean(x*x, axis=[0, 2, 3]) - x_mean*x_mean + self.eps)
            x_stdinv = 1.0 / x_std
            x_n_test = (x - self.mean.dimshuffle(dim)) * (self.omega * self.stdinv).dimshuffle(dim) + self.beta.dimshuffle(dim)
            x_n_train = (x - x_mean.dimshuffle(dim)) * (self.omega * x_stdinv).dimshuffle(dim) + self.beta.dimshuffle(dim)

        #override old value with renormalized version
        # if (self.renorm_max_r > 1.0) or (self.renorm_max_d > 0.0):
        #     r_alpha = math.log(self.renorm_max_r) / self.renorm_max_it
        #     d_alpha = math.log(self.renorm_max_d + 1) / self.renorm_max_it
        #     r_max = tensor.minimum(self.renorm_max_r, tensor.exp(get_epoch()*r_alpha))
        #     d_max = tensor.minimum(self.renorm_max_d, tensor.exp(get_epoch()*d_alpha) - 1)
        #     x_r = tensor.gradient.zero_grad(tensor.clip(x_std*self.stdinv, 1.0/r_max, r_max))
        #     x_d = tensor.gradient.zero_grad(tensor.clip((x_mean - self.mean) * self.stdinv, -d_max, d_max))
        #     x_n_train = (x - x_mean.dimshuffle(dim)) * (self.omega*x_stdinv*x_r).dimshuffle(dim) + (self.beta + self.omega*x_d).dimshuffle(dim)

        self.local_updates = [(self.mean, self.momentum*self.mean + (1.0 - self.momentum)*x_mean.squeeze()),
                              (self.stdinv, self.momentum*self.stdinv + (1.0 - self.momentum)*x_stdinv.squeeze())]

        self.output_shape = self.input_shape
        self.output = tensor.switch(get_train(), tensor.as_tensor_variable(x_n_train), theano.gradient.disconnected_grad(x_n_test))
    else:
        self.output_shape = self.input_shape
        self.output = self.input

    logging.verbose("Adding", self)
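
# --- Usage sketch (assumption: the surrounding framework, the layer's class
# name and the get_train()/get_epoch() switches are not shown in this snippet;
# the helper below is hypothetical). The constructor exposes its running-
# statistics updates via self.local_updates as (shared_variable, new_expression)
# pairs; they must be merged into the updates passed to theano.function,
# otherwise self.mean / self.stdinv are never refreshed during training:

def collect_bn_updates(layers, optimizer_updates):
    # start from the optimizer's parameter updates
    updates = list(optimizer_updates)
    for layer in layers:
        # only batch-norm layers define local_updates; others contribute nothing
        updates.extend(getattr(layer, "local_updates", []))
    return updates

# train_fn = theano.function([x_var, y_var], loss,
#                            updates=collect_bn_updates(model.layers, grad_updates))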
if __name__ == '__main__':
    shape = (1, 4, 4, 4)
    f_g = theano.shared(numpy.ones(shape=(shape[1],), dtype='float32'))
    f_b = theano.shared(numpy.zeros(shape=(shape[1],), dtype='float32'))
    f_x = theano.tensor.tensor4()

    f0_y, f0_mean, f0_std = dnn_bnrelu_train(f_x, f_g[None, :, None, None], f_b[None, :, None, None], "spatial")
    f0_yg = theano.tensor.grad(f0_y.sum(), f_x)
    f0_yg = theano.printing.Print('R0:')(f0_yg)
    f0 = theano.function([f_x], [f0_y, f0_y.sum(), f0_yg])

    f1_xn, f1_mean, f1_std = dnn_batch_normalization_train(f_x, f_g[None, :, None, None], f_b[None, :, None, None], "spatial")
    f1_y = tensor.maximum(f1_xn, 0.0)
    f1_yg = theano.tensor.grad(f1_y.sum(), f_x)
    f1_yg = theano.printing.Print('R1:')(f1_yg)
    f1 = theano.function([f_x], [f1_y, f1_y.sum(), f1_yg])

    x = numpy.random.uniform(-5.0, 5.0, shape).astype(numpy.float32)
    y0, ys0, yy0 = f0(x)
    y1, ys1, yy1 = f1(x)

    print("X Mean:", x.mean(axis=(0, 2, 3)))
    print("X std:", x.std(axis=(0, 2, 3)))
    print("------X------")
    print(numpy.array(x))
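
    # --- Optional numeric check (assumption: dnn_bnrelu_train is a fused
    # batch-norm + ReLU op whose outputs and input gradients should match the
    # reference path of dnn_batch_normalization_train followed by a ReLU):
    print("forward outputs match:", numpy.allclose(y0, y1, atol=1e-4))
    print("summed outputs match: ", numpy.allclose(ys0, ys1, atol=1e-3))
    print("input gradients match:", numpy.allclose(yy0, yy1, atol=1e-4))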