def forward_pass(self, X, training=True):
    # Initialize running mean and variance if first run
    if self.running_mean is None:
        self.running_mean = R.mean(X, axis=0)
        self.running_var = R.variance(X, axis=0)

    if training and self.trainable:
        mean = R.mean(X, axis=0)
        var = R.variance(X, axis=0)
        self.running_mean = self.momentum * self.running_mean + (
            R.t(1) - self.momentum) * mean
        self.running_var = self.momentum * self.running_var + (
            R.t(1) - self.momentum) * var
    else:
        mean = self.running_mean
        var = self.running_var

    # Statistics saved for backward pass
    self.X_centered = X - mean
    self.stddev_inv = R.div(R.t(1), R.square_root(var + self.eps))

    X_norm = self.X_centered * self.stddev_inv
    output = self.gamma * X_norm + self.beta

    return output
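For reference, the same forward pass written eagerly in NumPy looks like this. It is only an illustrative sketch: the function name and the momentum/eps defaults are assumptions, and the actual layer builds these operations as R graph ops rather than evaluating them with NumPy.

import numpy as np

def batchnorm_forward_numpy(X, gamma, beta, running_mean, running_var,
                            momentum=0.99, eps=0.01, training=True):
    # During training, normalize with batch statistics and update the running
    # averages; at inference time, reuse the running averages instead.
    if training:
        mean = X.mean(axis=0)
        var = X.var(axis=0)
        running_mean = momentum * running_mean + (1 - momentum) * mean
        running_var = momentum * running_var + (1 - momentum) * var
    else:
        mean, var = running_mean, running_var
    X_norm = (X - mean) / np.sqrt(var + eps)
    return gamma * X_norm + beta, running_mean, running_var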
def initialize(self, optimizer):
    # Initialize the weights
    limit = R.div(R.t(1), R.square_root(R.t(int(self.input_shape[0]))))
    limit_value = limit()
    self.W = R.t(np.random.uniform(-limit_value, limit_value,
                                   (int(self.input_shape[0]), self.n_units)))
    self.w0 = R.t(np.zeros((1, self.n_units)))
    # Weight optimizers
    self.W_opt = copy.copy(optimizer)
    self.w0_opt = copy.copy(optimizer)
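The limit computed here is the familiar 1 / sqrt(fan_in) bound, so a Dense layer with input size n draws its weights uniformly from [-1/sqrt(n), 1/sqrt(n)] and starts its bias row at zero. A minimal NumPy sketch of the same scheme (the function name and seed argument are illustrative, not part of the library):

import numpy as np

def init_dense_numpy(n_inputs, n_units, seed=None):
    rng = np.random.default_rng(seed)
    limit = 1 / np.sqrt(n_inputs)                       # 1 / sqrt(fan_in)
    W = rng.uniform(-limit, limit, size=(n_inputs, n_units))
    w0 = np.zeros((1, n_units))                         # bias row
    return W, w0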
def update(self, w, grad_wrt_w):
    # If not initialized
    if self.Eg is None:
        self.Eg = R.t(np.zeros(np.shape(grad_wrt_w())))

    self.Eg = self.rho * self.Eg + (R.t(1) - self.rho) * R.pow(
        grad_wrt_w, R.t(2))

    # Divide the learning rate for a weight by a running average of the
    # magnitudes of recent gradients for that weight
    return w - self.learning_rate * R.div(
        grad_wrt_w, R.square_root(self.Eg + self.eps))
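In plain NumPy the same RMSprop step reads as below; Eg is the exponentially decaying average of squared gradients. This is a sketch only, and the hyperparameter defaults shown are assumptions rather than values taken from the code above.

import numpy as np

def rmsprop_update_numpy(w, grad, Eg, learning_rate=0.01, rho=0.9, eps=1e-8):
    # Accumulate a running average of squared gradients...
    Eg = rho * Eg + (1 - rho) * grad ** 2
    # ...and scale each weight's step by the inverse root of that average.
    w_new = w - learning_rate * grad / np.sqrt(Eg + eps)
    return w_new, Eg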
def initialize(self, optimizer):
    # Initialize the weights
    filter_height, filter_width = self.filter_shape
    channels = self.input_shape[0]
    limit = R.div(R.t(1), R.square_root(R.t(int(np.prod(self.filter_shape)))))
    limit_value = limit()  # limit = 1 / math.sqrt(np.prod(self.filter_shape))
    self.W = R.t(np.random.uniform(-limit_value, limit_value,
                                   size=(self.n_filters, channels, filter_height, filter_width)))
    self.w0 = R.t(np.zeros((self.n_filters, 1)))
    # Weight optimizers
    self.W_opt = copy.copy(optimizer)
    self.w0_opt = copy.copy(optimizer)
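Compared with the Dense layer, the only differences are the fan-in (the product of the filter dimensions) and the weight layout (n_filters, channels, filter_height, filter_width). A quick NumPy check of those shapes, with purely illustrative values:

import numpy as np

n_filters, channels, filter_shape = 16, 3, (3, 3)
limit = 1 / np.sqrt(np.prod(filter_shape))      # fan-in = 3 * 3 = 9
W = np.random.uniform(-limit, limit, size=(n_filters, channels, *filter_shape))
w0 = np.zeros((n_filters, 1))
print(W.shape, w0.shape)                        # (16, 3, 3, 3) (16, 1)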
def update(self, w, grad_wrt_w):
    # If not initialized
    if self.m is None:
        self.m = R.t(np.zeros(np.shape(grad_wrt_w())))
        self.v = R.t(np.zeros(np.shape(grad_wrt_w())))

    self.m = self.b1 * self.m + (R.t(1) - self.b1) * grad_wrt_w
    self.v = self.b2 * self.v + (R.t(1) - self.b2) * R.pow(
        grad_wrt_w, R.t(2))

    m_hat = R.div(self.m, R.t(1) - self.b1)
    v_hat = R.div(self.v, R.t(1) - self.b2)

    self.w_updt = R.div(self.learning_rate * m_hat, R.square_root(v_hat) + self.eps)

    return w - self.w_updt
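An eager NumPy version of this Adam step, for comparison. The hyperparameter defaults are assumptions, and the bias correction deliberately mirrors the code above, which divides by (1 - b1) and (1 - b2) once rather than by (1 - b1**t) and (1 - b2**t) as in the textbook formulation.

import numpy as np

def adam_update_numpy(w, grad, m, v, learning_rate=0.001, b1=0.9, b2=0.999, eps=1e-8):
    # First and second moment estimates of the gradient
    m = b1 * m + (1 - b1) * grad
    v = b2 * v + (1 - b2) * grad ** 2
    # Bias correction as in the code above (no per-step exponent)
    m_hat = m / (1 - b1)
    v_hat = v / (1 - b2)
    w_new = w - learning_rate * m_hat / (np.sqrt(v_hat) + eps)
    return w_new, m, v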