def ApplyActivation(self):
    """Apply this layer's nonlinearity in place on self.state.

    The activation kind comes from self.activation (a
    deepnet_pb2.Hyperparams enum value). self.temp is used as a
    scratch buffer; REPLICATED_SOFTMAX additionally reads self.NN.
    Raises for an unrecognized activation.
    """
    s = self.state
    act = self.activation
    hp = deepnet_pb2.Hyperparams
    if act == hp.LOGISTIC:
        cm.sigmoid(s)
    elif act == hp.TANH:
        cm.tanh(s)
    elif act == hp.RECTIFIED_LINEAR:
        # ReLU: build a 0/1 mask of positive entries, then multiply.
        s.greater_than(0, target=self.temp)
        s.mult(self.temp)
    elif act == hp.RECTIFIED_LINEAR_SMOOTH:
        # Softplus: log(1 + exp(x)).
        cm.log_1_plus_exp(s)
    elif act == hp.LINEAR:
        pass  # identity
    elif act == hp.SOFTMAX:
        # Subtract the per-column max for numerical stability, then
        # exponentiate and normalize each column to sum to 1.
        s.max(axis=0, target=self.temp)
        s.add_row_mult(self.temp, -1)
        cm.exp(s)
        s.sum(axis=0, target=self.temp)
        self.temp.reciprocal()
        s.mult_by_row(self.temp)
    elif act == hp.REPLICATED_SOFTMAX:
        # Like SOFTMAX, but each column is rescaled by self.NN
        # instead of normalized to 1.
        s.max(axis=0, target=self.temp)
        s.add_row_mult(self.temp, -1)
        cm.exp(s)
        s.sum(axis=0, target=self.temp)
        self.NN.divide(self.temp, target=self.temp)
        s.mult_by_row(self.temp)
    else:
        raise Exception('Unknown activation')
def ApplyActivation(self):
    """Transform self.state in place according to self.activation.

    self.temp (and, for REPLICATED_SOFTMAX, self.NN) serve as scratch
    buffers. An unrecognized activation value raises an Exception.
    """
    state = self.state
    act = self.activation
    if act == deepnet_pb2.Hyperparams.LOGISTIC:
        cm.sigmoid(state)
        return
    if act == deepnet_pb2.Hyperparams.TANH:
        cm.tanh(state)
        return
    if act == deepnet_pb2.Hyperparams.RECTIFIED_LINEAR:
        # ReLU: multiply by the indicator of positive entries.
        state.greater_than(0, target=self.temp)
        state.mult(self.temp)
        return
    if act == deepnet_pb2.Hyperparams.RECTIFIED_LINEAR_SMOOTH:
        # Softplus.
        cm.log_1_plus_exp(state)
        return
    if act == deepnet_pb2.Hyperparams.LINEAR:
        return  # identity activation
    if act == deepnet_pb2.Hyperparams.SOFTMAX:
        # Stabilize by subtracting the column max, exponentiate,
        # then divide each column by its sum.
        state.max(axis=0, target=self.temp)
        state.add_row_mult(self.temp, -1)
        cm.exp(state)
        state.sum(axis=0, target=self.temp)
        self.temp.reciprocal()
        state.mult_by_row(self.temp)
        return
    if act == deepnet_pb2.Hyperparams.REPLICATED_SOFTMAX:
        # Softmax variant whose columns are rescaled by self.NN.
        state.max(axis=0, target=self.temp)
        state.add_row_mult(self.temp, -1)
        cm.exp(state)
        state.sum(axis=0, target=self.temp)
        self.NN.divide(self.temp, target=self.temp)
        state.mult_by_row(self.temp)
        return
    raise Exception("Unknown activation")
def grad(X, Y, act, params, grads, aux):
    """Run one forward/backward pass of an MLP and return the summed loss.

    Args:
      X: input minibatch (one example per row).
      Y: targets, same layout as the network output.
      act: output activation — 'logistic', 'softmax', or anything else
        for a linear (identity) output.
      params: (H, bh) — lists of weight matrices and hidden biases.
      grads: (_H, _bh) — gradient buffers, overwritten in place.
      aux: (a, eh, loss) — preallocated activations (len n_layers+1),
        per-layer error terms, and a loss buffer.

    Returns:
      Scalar loss summed over the minibatch.
    """
    H, bh = params
    _H, _bh = grads
    a, eh, loss = aux

    # ---- forward pass ----
    a[0].assign(X)
    n_layers = len(eh)
    for i in range(n_layers):
        # a[i+1] = f(a[i] * H[i] + bh[i])
        a[i].dot(H[i], target=a[i + 1])
        a[i + 1].add_row_vec(bh[i])
        if i < n_layers - 1:
            cm.sigmoid(a[i + 1])
        else:
            # Output-layer nonlinearity.
            if act == 'logistic':
                cm.sigmoid(a[i + 1])
            elif act == 'softmax':
                # cm.softmax normalizes along the other axis; transpose,
                # apply, transpose back, and free the temporary.
                a_t = a[i + 1].transpose()
                cm.softmax(a_t)
                a_t.transpose(target=a[i + 1])
                a_t.free_device_memory()
            else:
                pass  # linear output: identity

    # ---- backward pass ----
    # For all three output types the output error term is
    # (prediction - target).
    a[-1].subtract(Y, target=eh[-1])
    for i in range(n_layers - 1, -1, -1):
        # Gradients for layer i.
        _H[i].assign(0.0)
        _H[i].add_dot(a[i].T, eh[i])
        eh[i].sum(axis=0, target=_bh[i])
        # Propagate the error term to the previous layer:
        # eh[i-1] = sigmoid'(a[i]) .* (eh[i] * H[i]^T)
        if i > 0:
            eh[i].dot(H[i].T, target=eh[i - 1])
            eh[i - 1].apply_logistic_deriv(a[i])

    # ---- loss ----
    if act == 'logistic':
        cm.cross_entropy_bernoulli(Y, a[n_layers], target=loss)
    elif act == 'softmax':
        cm.cross_entropy(Y, a[n_layers], target=loss)
    elif act == 'linear':
        # BUG FIX: the squared error uses the residual (prediction -
        # target), which is still held in eh[-1]; the original squared
        # the raw output activations a[-1], which disagrees with the
        # gradient computed above.
        eh[-1].mult(eh[-1], target=loss)
    return loss.sum()
def grad(X, Y, act, params, grads, aux):
    """Forward/backward pass for a sigmoid MLP; fills grads, returns loss.

    Args:
      X: input minibatch, one example per row.
      Y: target minibatch matching the output layout.
      act: 'logistic', 'softmax', or anything else for a linear output.
      params: (H, bh) weight matrices and hidden biases per layer.
      grads: (_H, _bh) gradient buffers, overwritten here.
      aux: (a, eh, loss) preallocated activation buffers (len(eh)+1 of
        them), error-term buffers, and a loss buffer.

    Returns:
      The loss summed over the minibatch (a Python scalar).
    """
    H, bh = params
    _H, _bh = grads
    a, eh, loss = aux

    # Forward pass: a[i+1] = f(a[i]*H[i] + bh[i]); hidden layers use
    # sigmoid, the last layer uses `act`.
    a[0].assign(X)
    n_layers = len(eh)
    for i in range(n_layers):
        a[i].dot(H[i], target=a[i + 1])
        a[i + 1].add_row_vec(bh[i])
        if i < n_layers - 1:
            cm.sigmoid(a[i + 1])
        elif act == 'logistic':
            cm.sigmoid(a[i + 1])
        elif act == 'softmax':
            # Softmax via a transposed temporary (cm.softmax works on
            # the other axis); release the temporary immediately.
            a_t = a[i + 1].transpose()
            cm.softmax(a_t)
            a_t.transpose(target=a[i + 1])
            a_t.free_device_memory()
        # else: linear output, nothing to apply.

    # Backward pass. The output error term is (prediction - target)
    # for logistic, softmax, and linear outputs alike.
    a[-1].subtract(Y, target=eh[-1])
    for i in range(n_layers - 1, -1, -1):
        _H[i].assign(0.0)
        _H[i].add_dot(a[i].T, eh[i])
        eh[i].sum(axis=0, target=_bh[i])
        if i > 0:
            # eh[i-1] = sigmoid'(a[i]) .* (eh[i] * H[i]^T)
            eh[i].dot(H[i].T, target=eh[i - 1])
            eh[i - 1].apply_logistic_deriv(a[i])

    # Loss.
    if act == 'logistic':
        cm.cross_entropy_bernoulli(Y, a[n_layers], target=loss)
    elif act == 'softmax':
        loss = cm.cross_entropy(Y, a[n_layers], target=loss)
    elif act == 'linear':
        # BUG FIX: square the residual eh[-1] = a[-1] - Y rather than
        # the raw activations a[-1]; otherwise the reported loss does
        # not match the gradient computed in the backward pass.
        eh[-1].mult(eh[-1], target=loss)
    return loss.sum()
def activate(X, params, a):
    """Compute hidden activations a = sigmoid(X*H + bh) in place.

    Args:
      X: input minibatch, one example per row.
      params: (H, O, bh, bo) — only the input-side weights H and hidden
        bias bh are used; O and bo belong to the output layer.
      a: preallocated buffer for the hidden activations; overwritten.

    Returns:
      The buffer `a` (same object, now holding the activations).
    """
    # Dead local removed: the original computed an unused
    # batch_size = X.shape[0].
    H, O, bh, bo = params
    X.dot(H, target=a)
    a.add_row_vec(bh)
    cm.sigmoid(a)
    return a
def grad(X, Y, act_type, rho, params, grads, aux):
    """Gradient step for a one-hidden-layer net with optional sparsity.

    Forward: a = sigmoid(X*H + bh); z = a*O + bo, with sigmoid applied
    to z when act_type == 'logistic'. Backward: zeroes then fills the
    gradient buffers and returns the summed loss.

    Args:
      X, Y: input and target minibatches, one example per row.
      act_type: 'logistic' (Bernoulli cross-entropy loss) or 'linear'
        (squared error).
      rho: sparsity target; when > 0 a sparsity penalty gradient on the
        mean hidden activation is folded into the hidden error term.
      params: (H, O, bh, bo) weights and biases.
      grads: (_H, _O, _bh, _bo) gradient buffers, overwritten here.
      aux: (a, z, eh, eo, loss, s, s_m) preallocated scratch matrices.

    Returns:
      Scalar loss summed over the minibatch.

    Raises:
      ValueError: if act_type is neither 'logistic' nor 'linear'.
    """
    H, O, bh, bo = params
    _H, _O, _bh, _bo = grads
    a, z, eh, eo, loss, s, s_m = aux

    _H.assign(0.0)
    _O.assign(0.0)
    _bh.assign(0.0)
    _bo.assign(0.0)

    ### FORWARD PASS ###
    # a = sigmoid(X*H + bh)
    X.dot(H, target=a)
    a.add_row_vec(bh)
    cm.sigmoid(a)
    # z = a*O + bo (sigmoid only for the logistic output)
    a.dot(O, target=z)
    z.add_row_vec(bo)
    if act_type == 'logistic':
        cm.sigmoid(z)

    ### BACKWARD PASS ###
    # Output error term: eo = z - Y.
    z.subtract(Y, target=eo)
    # Hidden error term: eh = sigmoid'(a) .* (eo*O^T + sparsity term).
    eo.dot(O.T, target=eh)
    if rho > 0:
        # Sparsity penalty gradient on the per-unit mean activation
        # s_mean: adds -(rho/s_mean - (rho-1)/(s_mean-1)) to each row.
        a.sum(axis=0, target=s)
        s.mult(1.0 / a.shape[0])  # normalize by batch size
        s.reciprocal()
        s.mult(rho)  # s = rho / s_mean
        a.sum(axis=0, target=s_m)  # NOTE: recomputes the column sums
        s_m.mult(1.0 / a.shape[0])
        s_m.subtract(1.0)
        s_m.reciprocal()
        s_m.mult(rho - 1)  # s_m = (rho-1) / (s_mean - 1)
        s.subtract(s_m)
        eh.add_row_mult(s, -1.0)
    eh.apply_logistic_deriv(a)

    ### COMPUTE GRADIENTS ###
    _O.add_dot(a.T, eo)
    _H.add_dot(X.T, eh)
    _bo.add_sums(eo, axis=0)
    _bh.add_sums(eh, axis=0)

    ### COMPUTE ERROR ###
    if act_type == 'logistic':
        cm.cross_entropy_bernoulli(Y, z, target=loss)
    elif act_type == 'linear':
        eo.mult(eo, target=loss)  # squared residual
    else:
        # BUG FIX: the original raised using the undefined name
        # `args.act_type` (a NameError); use the parameter instead.
        raise ValueError("Activation function '%s' is unknown" % act_type)
    err = loss.sum()
    return err