def linear_activation_forward(self, A_prev, W, b, activation):
    if activation == "sigmoid":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = self.linear_forward(A_prev, W, b)
        A, activation_cache = activations.sigmoid(Z)
    elif activation == "relu":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = self.linear_forward(A_prev, W, b)
        A, activation_cache = activations.relu(Z)
    elif activation == "softmax":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = self.linear_forward(A_prev, W, b)
        A, activation_cache = activations.softmax(Z)
    elif activation == "euler":
        Z, linear_cache = self.linear_forward(A_prev, W, b)
        A, activation_cache = activations.euler(Z)

    assert A.shape == (W.shape[0], A_prev.shape[1])
    cache = (linear_cache, activation_cache)

    return A, cache
def softmax_grad(x):
    """
    Computes the gradient of the softmax function.

    The softmax takes a vector of inputs and computes the Jacobian of
    partial derivatives of the function with respect to the input:

        J_{i,j} = \partial f_i(x) / \partial x_j

    Args:
        x (np.ndarray): The input vector to the softmax during forward
            propagation, of shape (M,)

    Returns:
        A Jacobian of shape (M, M) which contains the partial derivatives
    """
    f_x = np.squeeze(softmax(x))
    n = len(f_x)
    mask = np.eye(n, dtype=bool)
    jac = np.zeros((n, n))
    # Diagonal entries: J_{i,i} = f_i * (1 - f_i)
    diag_idx, _ = np.where(mask)
    jac[mask] = f_x[diag_idx] * (1 - f_x[diag_idx])
    # Off-diagonal entries: J_{i,k} = -f_i * f_k
    i_idx, k_idx = np.where(~mask)
    jac[~mask] = -f_x[i_idx] * f_x[k_idx]
    return jac
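# Illustrative sanity check for softmax_grad (a sketch, not part of the
# original source; assumes `numpy as np` and the `softmax` helper are in
# scope). Each row of the Jacobian sums to zero because the softmax outputs
# sum to one, so any input perturbation only redistributes probability mass.
def _check_softmax_grad():
    x = np.array([1.0, 2.0, 3.0])
    jac = softmax_grad(x)
    assert jac.shape == (3, 3)
    assert np.allclose(jac.sum(axis=1), 0.0)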
def forward(self, x, dropout_prob=None):
    """Forward propagation over a single mini-batch."""
    net_inputs = []   # pre-activation inputs of each layer
    net_outputs = []  # post-activation outputs of each layer
    net_d = []
    # Append the input layer directly so layer indices line up.
    net_inputs.append(x)
    net_outputs.append(x)
    net_d.append(np.ones(x.shape[1:]))  # no dropout on the input layer
    for i in range(1, self.weight_num):  # one fewer weight matrix than layers
        x = x @ self.params['w' + str(i)].T
        net_inputs.append(x)
        x = tanh(x)
        if dropout_prob:  # drop units during the training phase only
            x, d_temp = dropout(x, dropout_prob)
            net_d.append(d_temp)
        net_outputs.append(x)
    out = x @ self.params['w' + str(self.weight_num)].T
    net_inputs.append(out)
    out = softmax(out)
    net_outputs.append(out)
    return {
        'net_inputs': net_inputs,
        'net_outputs': net_outputs,
        'd': net_d
    }, out
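# A minimal sketch of the `dropout` helper assumed above (hypothetical; the
# original helper is not shown, and whether dropout_prob is the drop or the
# keep probability is an assumption here). Implements inverted dropout: keep
# each unit with probability (1 - dropout_prob) and pre-scale the mask so
# expected activations match at test time. Assumes `numpy as np`.
def dropout(x, dropout_prob):
    keep_prob = 1.0 - dropout_prob
    d = (np.random.rand(*x.shape) < keep_prob) / keep_prob  # pre-scaled mask
    return x * d, d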
def forward_bn(self, x, bn_mode='train'):
    """Forward propagation with batch normalization."""
    net_inputs = []
    net_outputs = []
    caches = []
    net_inputs.append(x)
    net_outputs.append(x)
    caches.append(x)
    # BN is applied to every hidden layer's input, but not to the
    # input or output layers.
    for i in range(1, self.weight_num):
        x = x @ self.params['w' + str(i)].T
        net_inputs.append(x)
        # BN can be seen as a trainable layer inserted between a hidden
        # unit's pre-activation and its activation.
        x, cache = self.batch_norm(x, i, bn_mode)
        caches.append(cache)
        x = tanh(x)
        net_outputs.append(x)
    out = x @ self.params['w' + str(self.weight_num)].T
    net_inputs.append(out)
    out = softmax(out)
    net_outputs.append(out)
    return {
        'net_inputs': net_inputs,
        'net_outputs': net_outputs,
        'cache': caches
    }, out
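# A minimal sketch of the `batch_norm` helper assumed above (hypothetical;
# the original is not shown). In 'train' mode it normalizes with batch
# statistics and updates running averages; in 'test' mode it reuses the
# running averages. The per-layer gamma/beta parameters and running-stat
# containers (self.params, self.running_mu, self.running_var) are assumed
# names. Assumes `numpy as np`.
def batch_norm(self, x, i, bn_mode='train', eps=1e-5, momentum=0.9):
    gamma = self.params['gamma' + str(i)]
    beta = self.params['beta' + str(i)]
    if bn_mode == 'train':
        mu, var = x.mean(axis=0), x.var(axis=0)
        self.running_mu[i] = momentum * self.running_mu[i] + (1 - momentum) * mu
        self.running_var[i] = momentum * self.running_var[i] + (1 - momentum) * var
    else:
        mu, var = self.running_mu[i], self.running_var[i]
    x_hat = (x - mu) / np.sqrt(var + eps)
    return gamma * x_hat + beta, (x_hat, mu, var, gamma, eps)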
def forward_pass(self, verbose=False, debug=False):
    """
    Computes the values of all units (neurons) over ALL sample points
    (vectorized).
    """
    if verbose:
        print("\n _______ Forward Pass _______")

    # Zeroth step: outputs of the input units.
    X = self.X_active
    if verbose:
        print("\t X.shape", X.shape)

    # First step: outputs (H) of the hidden units.
    if verbose:
        print("\t Computing 1st layer . . . ")
    S_h = util.withBias(X) @ self.V.T
    H = activations.relu(S_h)

    # Second step: outputs (O) of the output units.
    if verbose:
        print("\t Computing 2nd layer . . . ")
    S_o = util.withBias(H) @ self.W.T
    O = activations.softmax(S_o, verbose)

    if debug:
        pdb.set_trace()

    self.O = O
    return X, S_h, H, S_o, O
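# A plausible sketch of `util.withBias` as used above (hypothetical; the
# original util module is not shown): append a column of ones so the bias
# terms in V and W can be folded into a single matrix product. Assumes
# `numpy as np`.
def withBias(X):
    return np.hstack([X, np.ones((X.shape[0], 1))])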
def feedforward(self, s_inst, s_trans):
    y_r = act.sigmoid(s_inst, self.V_r)
    g = act.softmax(s_inst, self.W_g, self.g_strength, self.level_bias)
    g = np.transpose(g)
    l_sel = self.select_level(g)

    y_m = np.zeros((self.L, 1, self.M))
    for l in np.arange(self.L):
        if l == l_sel:
            # Selected level: update the leaky accumulator with the new input.
            y_m[l, :, :], self.cumulative_memory[l, :, :] = act.sigmoid_acc_leaky(
                s_trans, self.V_m[l, :, :], self.cumulative_memory[l, :, :],
                self.LEAK[l, 0, 0], g[l, 0])
        else:
            # Unselected levels: only decay the stored memory.
            self.cumulative_memory[l, :, :] *= self.LEAK[l, 0, 0]
            y_m[l, :, :] = act.sigmoidal(self.cumulative_memory[l, :, :])
        print('\t\t\t\t MEMORY_LEVEL ', l, '\t ', y_m[l, :, :])

    # Accumulate the hidden input over all memory levels.
    inp_h = np.zeros((1, self.H))
    for l in np.arange(self.L):
        inp_h += act.linear(y_m[l, :, :], self.W_m[l, :, :])
    y_h = act.sigmoidal(inp_h)

    Q = act.linear(y_r, self.W_r) + act.linear(y_h, self.W_h)

    return y_r, y_m, y_h, g, l_sel, Q
def __call__(self, y_true, y_pred):
    if y_true.ndim == y_pred.ndim:
        if self.ignore_value is not None:
            mask = T.neq(y_true, self.ignore_value)
            logit = masking_softmax(y_pred, mask)
            logit = T.clip(logit, 1e-9, 1 - 1e-9)
            log_prob = y_true * T.switch(T.neq(y_true, self.ignore_value),
                                         T.log(logit), 0)
            batch_size = T.sum(y_true * T.neq(y_true, self.ignore_value))
            return T.cast(-T.sum(log_prob) / batch_size, 'floatX')
        else:
            logit = softmax(y_pred)
            logit = T.clip(logit, 1e-9, 1 - 1e-9)
            return -T.mean(y_true * T.log(logit), axis=-1)
    elif y_true.ndim == y_pred.ndim - 1:
        # Sparse labels: flatten everything to (total_dim, classes) and
        # index out the probability of each true class.
        no_classes = y_pred.shape[-1]
        total_dim = 1
        for d in y_pred.shape[:-1]:
            total_dim *= d
        y_pred = y_pred.reshape((-1, no_classes))
        y_true = y_true.reshape((-1, 1))
        if self.ignore_value is not None:
            mask = T.neq(y_true, self.ignore_value)
            logit = masking_softmax(y_pred, mask)
            prob = logit[T.arange(total_dim).dimshuffle(0, 'x'), y_true]
            prob = T.clip(prob, 1e-9, 1 - 1e-9)
            log_prob = T.switch(T.neq(y_true, self.ignore_value),
                                T.log(prob), 0)
            batch_size = T.sum(T.neq(y_true, self.ignore_value))
        else:
            prob = y_pred[T.arange(total_dim).dimshuffle(0, 'x'), y_true]
            prob = T.clip(prob, 1e-9, 1 - 1e-9)
            log_prob = T.log(prob)
            batch_size = total_dim
        return T.cast(-T.sum(log_prob) / batch_size, 'floatX')
def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer (or input data):
              (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape
         (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text
                  string: "sigmoid", "relu" or "softmax"

    Returns:
    A -- the output of the activation function, also called the
         post-activation value
    cache -- a python tuple containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently
    """
    if activation == "sigmoid":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
    elif activation == "relu":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)
    elif activation == "softmax":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = softmax(Z)

    assert A.shape == (W.shape[0], A_prev.shape[1])
    cache = (linear_cache, activation_cache)

    return A, cache
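# A minimal sketch of the `linear_forward` helper assumed above
# (hypothetical; the original is not shown). Follows the convention in the
# docstring that examples are stacked as columns: it returns the
# pre-activation Z and caches the inputs for the backward pass. Assumes
# `numpy as np`.
def linear_forward(A_prev, W, b):
    Z = np.dot(W, A_prev) + b  # (size of current layer, number of examples)
    return Z, (A_prev, W, b)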
def feed_forward(self, x):
    output = np.dot(self.weights, x) + self.biases
    # Cache the input and pre-activation output for the backward pass.
    self.cache_x = x
    self.cache_output = output
    return softmax(output)
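# A numerically stable sketch of the `softmax` helper these snippets rely on
# (hypothetical; each snippet ships its own version). Subtracting the
# row-wise maximum before exponentiating avoids overflow without changing
# the result. Assumes `numpy as np`.
def softmax(z, axis=-1):
    z = z - np.max(z, axis=axis, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)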
def feedforward(self, input):
    """Uses softmax to ensure that probabilities over all classes sum to
    one. The prediction with the highest probability returns 1, the rest 0."""
    output = softmax(input)
    max_index = np.argmax(output, axis=1)
    one_hot = np.zeros(output.shape)
    # Set the argmax of every row to 1 (indexing row 0 alone would only
    # handle a single-sample batch).
    one_hot[np.arange(output.shape[0]), max_index] = 1
    return one_hot
def forward(self):
    logits = self.logits.get_data()
    labels = self.labels.get_data()
    pred = softmax(logits)
    res = cross_entropy(logits=pred, labels=labels)
    self.out.set_data_(res)
    self.pred.set_data_(pred)
    return self.out
def forward(self, x, a_prev, c_prev):
    concat = np.concatenate([a_prev, x])
    self.gamma_f = sigmoid(np.dot(self.w_f, concat) + self.b_f)  # forget gate
    self.gamma_u = sigmoid(np.dot(self.w_u, concat) + self.b_u)  # update gate
    self.gamma_o = sigmoid(np.dot(self.w_o, concat) + self.b_o)  # output gate
    # Candidate cell state (np.dot, matching the other gates).
    self.c_ = np.tanh(np.dot(self.w_c, concat) + self.b_c)
    self.c = self.gamma_f * c_prev + self.gamma_u * self.c_
    self.a = self.gamma_o * np.tanh(self.c)
    # The original output line was truncated (`softmax(np.dot())`); the
    # standard LSTM output projection is assumed here, with hypothetical
    # parameters w_y and b_y.
    self.y = softmax(np.dot(self.w_y, self.a) + self.b_y)
def activationFunction(self, z):
    if self.activ == Activations.SIGMOID.value:
        return actvtn.sigmoid(z)
    elif self.activ == Activations.SOFTMAX.value:
        return actvtn.softmax(z)
    elif self.activ == Activations.TANH.value:
        return actvtn.tanh(z)
    else:
        return z
def __one_layer_forward_prop(self, a_prev, layer_index, activation):
    z = np.dot(self.w[layer_index], a_prev) + self.b[layer_index]
    if activation == 'relu':
        return relu(z), z
    elif activation == 'sigmoid':
        return sigmoid(z), z
    elif activation == 'softmax':
        return softmax(z), z
    else:
        # Raising a bare string is a TypeError in Python 3; raise a
        # proper exception instead.
        raise ValueError("Invalid activation: {}".format(activation))
def _forward_prop(self, x):
    self._activations[0] = x
    for i in range(1, self.num_layers):
        self._zs[i] = (self.weights[i].dot(self._activations[i - 1])
                       + self.biases[i])
        # Use softmax for the last layer, the configured activation otherwise.
        if i == self.num_layers - 1:
            self._activations[i] = activations.softmax(self._zs[i])
        else:
            self._activations[i] = self.activation_fn(self._zs[i])
def feedforward(self, s_inst, s_trans):
    g_strength = 3
    f_strength = 3
    y_r = act.sigmoid(s_inst, self.V_r)
    g = act.softmax(s_inst, self.W_g, g_strength)
    g = np.transpose(g)
    f = act.hard_sigmoid(s_inst, self.W_f, f_strength)
    f = np.transpose(f)
def forward(x, w1, b1, w2, b2):
    z1 = x.dot(w1.T) + b1.T
    a1 = relu(z1)
    z2 = a1.dot(w2.T) + b2.T
    a2 = softmax(z2)
    return {
        'a1': a1, 'z1': z1,
        'a2': a2, 'z2': z2,
    }
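# Hedged usage sketch for the two-layer `forward` above (not from the
# original source; assumes `numpy as np` plus the `relu` and `softmax`
# helpers are in scope). The dot products imply x is (batch, features) and
# each weight matrix is (out, in); the sizes below are illustrative.
def _demo_forward(batch=4, n_in=10, n_hidden=8, n_out=3):
    rng = np.random.default_rng(0)
    x = rng.standard_normal((batch, n_in))
    w1, b1 = rng.standard_normal((n_hidden, n_in)), np.zeros((n_hidden, 1))
    w2, b2 = rng.standard_normal((n_out, n_hidden)), np.zeros((n_out, 1))
    out = forward(x, w1, b1, w2, b2)
    assert out['a2'].shape == (batch, n_out)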
def _forward_prop(self, x):
    '''Run forward prop.'''
    a = np.array(x).reshape((len(x), 1))
    for count, b, w in zip(range(self.num_layers - 1), self.biases,
                           self.weights):
        # Softmax on the output layer, the configured activation elsewhere.
        if count == self.num_layers - 2:
            a = activations.softmax(np.dot(w, a) + b)
        else:
            a = self.activation(np.dot(w, a) + b)
    return a
def predict_proba(self, X):
    """
    Perform the forward propagation.

    :param X: The batch - np.ndarray
    :return: The class probabilities for each sample - np.ndarray
    """
    X = X.T
    for i in range(len(self.W) - 1):
        X = self.act_func(np.dot(self.W[i], X) + self.B[i])
    X = softmax(np.dot(self.W[-1], X) + self.B[-1])  # softmax on last layer
    return X.T
def one_step_forward(A_prev, W, b, activation):
    Z = np.dot(W, A_prev) + b
    linear_cache = A_prev, W, b
    if activation == 'relu':
        A, activation_cache = relu(Z)  # relu is used for the first L-1 layers
    elif activation == 'softmax':
        A, activation_cache = softmax(Z)  # softmax is used for the last (Lth) layer
    return A, (linear_cache, activation_cache)
def apply_grad(self, X, Y):
    N, _ = X.shape
    gradients = []
    output = self.model.predict(X)
    self.model.output = output
    for i in range(N):  # range, not the Python 2-only xrange
        yi = Y[i:i + 1]
        oi = output[i:i + 1]
        # Per-sample gradient of softmax cross-entropy w.r.t. the logits.
        loss = 1.0 / N * (softmax(oi) - yi)
        gradients.append(loss)
    self.model.layers[-1].apply_grad(X, Y, gradients)
    return self(X, Y)
def predict(network, x):
    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']

    a1 = np.dot(x, W1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    z2 = sigmoid(a2)
    a3 = np.dot(z2, W3) + b3
    y = softmax(a3)

    return y
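# Hedged usage sketch for `predict` (not from the original source; assumes
# `numpy as np` plus the `sigmoid` and `softmax` helpers are in scope). The
# network dict maps parameter names to arrays; the layer sizes below are
# illustrative.
def _init_network(n_in=784, h1=50, h2=100, n_out=10, seed=0):
    rng = np.random.default_rng(seed)
    return {
        'W1': 0.01 * rng.standard_normal((n_in, h1)), 'b1': np.zeros(h1),
        'W2': 0.01 * rng.standard_normal((h1, h2)), 'b2': np.zeros(h2),
        'W3': 0.01 * rng.standard_normal((h2, n_out)), 'b3': np.zeros(n_out),
    }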
def _back_prop(self, x, y):
    """
    Compute gradients of Cost

    Returns:
        * (nabla_b, nabla_w) representing the gradient for the cost
          function C_x. nabla_b and nabla_w are similar to self.biases
          and self.weights.
    """
    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]

    # Feedforward
    activation = np.array(x).reshape((len(x), 1))
    a_ss = [activation]  # store all the activations, layer by layer
    zs = []              # store all the z vectors, layer by layer
    count = 0
    for b, w in zip(self.biases, self.weights):
        z = np.dot(w, activation) + b
        zs.append(z)
        if count == self.num_layers - 2:
            activation = activations.softmax(z)
        else:
            activation = self.activation(z)
        a_ss.append(activation)
        count += 1

    # Backward pass
    delta = self.cost_derivative(a_ss[-1], y) * activations.softmax_prime(zs[-1])
    nabla_b[-1] = delta
    nabla_w[-1] = np.dot(delta, a_ss[-2].transpose())
    for l in range(2, self.num_layers):
        delta = np.dot(self.weights[-l + 1].transpose(),
                       delta) * self.activation_prime(zs[-l])
        nabla_b[-l] = delta
        nabla_w[-l] = np.dot(delta, a_ss[-l - 1].transpose())
    return (nabla_b, nabla_w)
def _build_network(self):
    """Build a 2 hidden layer neural network with ReLU activations and a
    softmax output layer"""
    self.W1 = np.random.random((self.inp_shape, self.hidden_units))
    self.b1 = np.zeros((self.hidden_units, ))
    self.hid1 = lambda x: relu(np.dot(x, self.W1) + self.b1)

    self.W2 = np.random.random((self.hidden_units, self.hidden_units))
    self.b2 = np.zeros((self.hidden_units, ))
    self.hid2 = lambda x: relu(np.dot(x, self.W2) + self.b2)

    self.W3 = np.random.random((self.hidden_units, self.num_classes))
    self.b3 = np.zeros((self.num_classes, ))
    self.hid3 = lambda x: softmax(np.dot(x, self.W3) + self.b3)
def linear_activation_forward(A_prev, W, b, activation):
    Z, Z_cache = linear_forward(A_prev, W, b)
    if activation == 'relu':
        A, A_cache = activations.relu(Z)
    elif activation == 'sigmoid':
        A, A_cache = activations.sigmoid(Z)
    elif activation == 'softmax':
        A, A_cache = activations.softmax(Z)
    cache = (Z_cache, A_cache)
    return A, cache
def feedforward(self, s_inst, s_trans):
    y_r = act.sigmoid(s_inst, self.V_r)
    g = act.softmax(s_inst, self.W_g, self.g_strength)
    g = np.transpose(g)

    y_m = 1e-6 * np.ones((self.L, 1, self.M))
    Q = act.linear(y_r, self.W_r)
    for l in np.arange(self.L):
        y_m[l, :, :], self.memory_content[l, :, :] = act.sigmoid_acc_leaky(
            s_trans, self.V_m[l, :, :], self.memory_content[l, :, :],
            1 - g[l, 0], g[l, 0])
        Q += act.linear(y_m[l, :, :], self.W_m[l, :, :])

    return y_r, y_m, g, Q
def feedforward(self, inputs):
    """Feeds the input through the network layers to the softmax function"""
    # Hidden layer 1
    z1_sum = np.matmul(self.weights['hidden1'].T, inputs) + self.bias['hidden1']
    z1 = activations.sigmoid(z1_sum, 'normal')

    # Hidden layer 2 (disabled)
    # z2 = activations.sigmoid(
    #     np.einsum('ij, j->i', self.weights['hidden2'], z1)
    #     + self.bias['hidden2'], 'normal')

    # Output layer
    out_sum = np.matmul(self.weights['out'].T, z1) + self.bias['out']
    prediction = activations.softmax(out_sum, 'normal')
    return prediction, out_sum, z1, z1_sum
def softmax_grad(x):
    # x is a vector; returns the (n, n) Jacobian matrix of softmax(x).
    f_x = np.squeeze(softmax(x))
    n = len(f_x)
    mask = np.eye(n, dtype=bool)
    jac = np.zeros((n, n))
    # Diagonal: f_i * (1 - f_i); off-diagonal: -f_i * f_k.
    diag_idx, _ = np.where(mask)
    jac[mask] = f_x[diag_idx] * (1 - f_x[diag_idx])
    i_idx, k_idx = np.where(~mask)
    jac[~mask] = -f_x[i_idx] * f_x[k_idx]
    return jac
def output(self, pre_act=False, dropout_active=False):
    X = self.l_in.output(dropout_active=dropout_active)
    is_tensor3_softmax = X.ndim > 2

    shape = X.shape
    if is_tensor3_softmax:
        # Flatten (time, batch) so the softmax can run on a 2D matrix.
        X = X.reshape((shape[0] * shape[1], self.n_in))

    # Prepend a fixed zero logit so the remaining classes are scored
    # relative to it.
    out = activations.softmax(
        T.concatenate([T.zeros((X.shape[0], 1)), T.dot(X, self.w)], axis=1)
        + self.b)

    if is_tensor3_softmax:
        # Restore the (time, batch, class) layout.
        out = out.reshape((shape[0], shape[1], self.size))

    return out
def linear_act_forward(A, W, b, act):
    """
    Implements the linear and activation functions of a single layer
    Also, returns the original values for storage in cache
    """
    if act == "sigmoid":
        Z, lin_cache = linear(A, W, b)
        A, act_cache = sigmoid(Z)
    elif act == "relu":
        Z, lin_cache = linear(A, W, b)
        A, act_cache = relu(Z)
    elif act == "softmax":
        Z, lin_cache = linear(A, W, b)
        A, act_cache = softmax(Z)

    cache = (lin_cache, act_cache)
    return A, cache