def test_softmax(self):
    result = af.softmax(np.array([[1, -1], [3, 4]]))
    expected_result = np.array([[0.119202, 0.006692], [0.880797, 0.993307]])
    difference = result - expected_result
    self.assertTrue(np.linalg.norm(difference) < 1e-4)
    # Each column of the softmax output should sum to 1.
    A = af.softmax(np.random.randn(5, 10))
    self.assertTrue(np.linalg.norm(np.sum(A, axis=0, keepdims=True) - 1) < 1e-4)
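# The test above assumes af.softmax normalizes over axis 0 (each column of
# the output sums to 1). A minimal, numerically stable sketch consistent
# with that behavior (not necessarily the af implementation):
import numpy as np

def softmax(z, axis=0):
    # Subtracting the per-axis max does not change the result because
    # softmax is shift-invariant, but it prevents overflow in exp.
    shifted = z - z.max(axis=axis, keepdims=True)
    exp_z = np.exp(shifted)
    return exp_z / exp_z.sum(axis=axis, keepdims=True)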
def activation_forward(A_prev, W, b, activation_way):
    """
    :param A_prev: activations from the previous layer (or the input data)
    :param W: weight matrix of this layer
    :param b: bias vector of this layer
    :param activation_way: a text string indicating how this layer is
        activated: "sigmoid", "relu", "tanh" or "softmax"
    :return:
        A -- the activation output of this layer
        cache -- a dictionary containing "linear_cache" and "activation_cache"
    """
    cache = dict()
    # The linear step is the same for every activation.
    Z, linear_cache = linear_forward(A_prev, W, b)
    if activation_way == "relu":
        A, activation_cache = relu(Z)
    elif activation_way == "sigmoid":
        A, activation_cache = sigmoid(Z)
    elif activation_way == "tanh":
        A, activation_cache = tanh(Z)
    elif activation_way == "softmax":
        A, activation_cache = softmax(Z)
    else:
        raise ValueError("unknown activation: " + activation_way)
    cache["linear_cache"] = linear_cache
    cache["activation_cache"] = activation_cache
    return A, cache
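# linear_forward above is not shown. A minimal sketch under the common
# column-major convention (A_prev has one example per column and W has
# shape (n_out, n_in)); the real helper may differ:
def linear_forward(A_prev, W, b):
    Z = np.dot(W, A_prev) + b
    cache = (A_prev, W, b)  # kept for the backward pass
    return Z, cache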
def forward(self, x, t_label):
    """
    x: input data (logits)
    t_label: ground-truth labels
    """
    self.t_label = t_label
    self.softmax_out = softmax(x)
    self.loss = _cross_entropy_calc(self.softmax_out, self.t_label)
    return self.loss
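# _cross_entropy_calc above is not shown. A minimal sketch assuming
# softmax_out holds probabilities of shape (batch, classes) and t_label is
# one-hot encoded with the same shape; the real helper may differ:
def _cross_entropy_calc(softmax_out, t_label, eps=1e-12):
    # Clip to avoid log(0), then average the per-example cross-entropy.
    clipped = np.clip(softmax_out, eps, 1.0)
    return -np.sum(t_label * np.log(clipped)) / softmax_out.shape[0]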
def mlp_fpass(data, w_1, b_1, w_2, b_2):
    """
    Runs a forward pass through a two-layer MLP: a ReLU hidden layer
    followed by a softmax output layer.
    """
    z_1 = np.add(np.dot(data, w_1), b_1)
    a_1 = af.relu(z_1)
    z_2 = np.add(np.dot(a_1, w_2), b_2)
    a_2 = af.softmax(z_2)
    return z_1, a_1, z_2, a_2
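# The original docstring of mlp_fpass described weight initialization rather
# than the forward pass, which suggests a companion routine along these
# lines: a hypothetical sketch of Xavier (Glorot) uniform initialization
# with zero biases, matching the shapes mlp_fpass expects.
def mlp_init(n_in, n_hidden, n_out):
    limit_1 = np.sqrt(6.0 / (n_in + n_hidden))
    limit_2 = np.sqrt(6.0 / (n_hidden + n_out))
    w_1 = np.random.uniform(-limit_1, limit_1, (n_in, n_hidden))
    b_1 = np.zeros(n_hidden)
    w_2 = np.random.uniform(-limit_2, limit_2, (n_hidden, n_out))
    b_2 = np.zeros(n_out)
    return w_1, b_1, w_2, b_2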
def __run_network(self, registers: np.ndarray, debug: dict = None) -> tuple:
    def take_params(values, i):
        """Return the next pair of weights and biases after the starting
        index and the new starting index."""
        return values[i], values[i + 1], i + 2

    # Extract the 0th (i.e. P(x = 0)) component from all registers.
    last_hidden_layer = np.array(registers[:, 0][None, ...], dtype=np.float32)

    # Propagate forward through the hidden layers.
    idx = 0
    for i in range(self.context.num_hidden_layers):
        W, b, idx = take_params(self.context.network, idx)
        last_hidden_layer = relu(last_hidden_layer.dot(W) + b)

    # Propagate forward to the gate-coefficient outputs.
    controller_coefficients = []
    for i, gate in enumerate(self.context.gates):
        coeffs = []
        for j in range(gate.arity):
            W, b, idx = take_params(self.context.network, idx)
            coeff = softmax(last_hidden_layer.dot(W) + b)
            coeffs.append(coeff)
        controller_coefficients.append(coeffs)

    # Propagate forward to the new register value coefficients.
    for i in range(self.context.num_regs):
        W, b, idx = take_params(self.context.network, idx)
        coeff = softmax(last_hidden_layer.dot(W) + b)
        controller_coefficients.append(coeff)

    # Propagate forward to generate the willingness to complete.
    W, b, idx = take_params(self.context.network, idx)
    complete = sigmoid(last_hidden_layer.dot(W) + b)

    if debug is not None:
        debug["fi"] = np.around(complete.sum(), 3)

    return controller_coefficients, complete
def output_layer(x, w, b):
    """Output layer of a multilayer perceptron.

    Args:
        x: input
        w: weight
        b: bias

    Returns:
        output of the MLP
    """
    u = np.dot(x, w) + b
    return softmax(u)
def feedforward(self, input):
    input = np.array(input)
    # One slot per layer activation: input layer, hidden layers, output layer.
    self.nodesave = [None] * (len(self.hidden_w) + 2)
    # Input layer.
    self.nodesave[0] = self.actifunc(self.input_w.dot(input) + self.input_b)
    # Hidden layers.
    for i, (w, b) in enumerate(zip(self.hidden_w, self.hidden_b)):
        self.nodesave[i + 1] = self.actifunc(w.dot(self.nodesave[i]) + b)
    # Output layer.
    self.nodesave[-1] = activation_functions.softmax(
        self.output_w.dot(self.nodesave[-2]) + self.output_b)
    return self.nodesave[-1]
def feedforward(self, dropout_rate=[]):
    # Iterate through every layer, skipping the input layer.
    for layer in range(1, self.number_of_layers):
        if layer == self.number_of_layers - 1 and self.act_funcs[-1] == "softmax":
            # The softmax output layer has no weights or biases; subtracting
            # the max is a computational trick to avoid overflow.
            self.z_values[-1] = self.activations[-2] - self.activations[-2].max()
            self.activations[-1] = af.softmax(self.z_values[-1])
        else:
            activate = af.func_dict(0)[self.act_funcs[layer - 1]]
            if self.validating:
                if (not dropout_rate) or (dropout_rate[layer - 1] == 0):
                    scale = 1
                else:
                    # Units were present with probability 1 - p during training.
                    scale = 1 - dropout_rate[layer - 1]
                self.z_values[layer] = np.dot(self.weights[layer - 1] * scale,
                                              self.activations[layer - 1]) + self.biases[layer]
                self.activations[layer] = activate(self.z_values[layer])
            else:
                self.z_values[layer] = np.matmul(self.weights[layer - 1],
                                                 self.activations[layer - 1]) + self.biases[layer]
                self.activations[layer] = activate(self.z_values[layer])
                if dropout_rate and dropout_rate[layer - 1] != 0:
                    self.activations[layer], self.z_values[layer] = reg.dropout(
                        self.activations[layer], self.z_values[layer],
                        dropout_rate[layer - 1], self.parameters[layer])
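# reg.dropout above is not shown and takes extra arguments; as a point of
# reference, a minimal sketch of the classic dropout this scheme implies
# (drop units with probability p while training, and scale the weights by
# 1 - p at validation time, as the branch above does):
def dropout_sketch(activations, z_values, p):
    mask = np.random.rand(*activations.shape) >= p  # keep with probability 1 - p
    return activations * mask, z_values * mask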
def forward(self, incoming_x):
    # Incoming data should be (n_sents, vector_dimension, projected_dimension).
    print('x shape into head is', incoming_x.shape)
    n_sents, vector_dimension, projected_dimension = incoming_x.shape
    assert vector_dimension == self.ni
    Q = self.query_layer.forward(incoming_x)
    K = self.key_layer.forward(incoming_x)
    V = self.value_layer.forward(incoming_x)
    # Scaled dot product; swap only the last two axes of K so the matmul
    # batches over sentences (K.T would reverse all three axes).
    score = (Q @ np.swapaxes(K, -2, -1)) / np.sqrt(self.no)
    # A mask would be applied here, e.g.
    # score = np.where(self.mask[:, :, :T, :T] == 0, -np.inf, score)
    score = activation.softmax(score)
    self.score = self.dropout.forward(score, self.training_now)
    self.output = self.score @ V
    return self.output
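# The masking step above is stubbed out. A hedged sketch of a causal mask
# applied to the scores before the softmax (hypothetical helper; the head
# above stores its mask in self.mask with a different shape):
def apply_causal_mask(score):
    T = score.shape[-1]
    allowed = np.tril(np.ones((T, T), dtype=bool))  # each position attends only to the past
    return np.where(allowed, score, -np.inf)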
def softmax_ce_derivation(y_hat, y):
    # Derivative of CE(softmax(y_hat), y) with respect to y_hat.
    return (activation_functions.softmax(y_hat) - y).T
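# A quick finite-difference check of the identity used above,
# d CE(softmax(z), y) / dz = softmax(z) - y, assuming a 1-D logit vector z
# and a one-hot label vector y:
def check_softmax_ce_grad(z, y, eps=1e-6):
    def ce(logits):
        p = np.exp(logits - logits.max())
        p = p / p.sum()
        return -np.sum(y * np.log(p))
    num_grad = np.zeros_like(z, dtype=float)
    for i in range(z.size):
        dz = np.zeros_like(z, dtype=float)
        dz.flat[i] = eps
        num_grad.flat[i] = (ce(z + dz) - ce(z - dz)) / (2 * eps)
    p = np.exp(z - z.max())
    p = p / p.sum()
    return np.allclose(num_grad, p - y, atol=1e-5)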
b_hidden_2 = np.zeros(num_hidden_units_2)
w_out_hidden = np.random.normal(0, 1 / (num_features**0.5),
                                (num_hidden_units_2, num_output))
b_out_hidden = np.zeros(num_output)

# Training ----------------------------------------------------------------
for e in range(epochs + 1):
    for b in range(data_size // batch_size):
        X, Y = shuffle_and_get_batch_data(X_data, Y_data, batch_size)

        # Forward propagation
        hidden_output_1 = activation(np.dot(X, w_hidden_1) + b_hidden_1)
        hidden_output_2 = activation(np.dot(hidden_output_1, w_hidden_2) + b_hidden_2)
        nn_output = softmax(np.dot(hidden_output_2, w_out_hidden) + b_out_hidden)

        # Backward propagation: step back through the network and calculate
        # the errors and deltas for the weights and biases.
        hidden_out_error = Y - nn_output
        d_w_out_hidden = np.dot(hidden_output_2.T, hidden_out_error)
        hidden_2_error = np.multiply(np.matmul(hidden_out_error, w_out_hidden.T),
                                     activation_d(hidden_output_2))
        d_w_hidden_2 = np.dot(hidden_output_1.T, hidden_2_error)
        hidden_in_error = np.multiply(np.matmul(hidden_2_error, w_hidden_2.T),
                                      activation_d(hidden_output_1))
        d_w_in_hidden = np.dot(X.T, hidden_in_error)
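        # The fragment stops at the deltas. A sketch of the gradient-descent
        # update that would typically follow; lr is an assumed learning rate,
        # and the deltas are added because hidden_out_error = Y - nn_output
        # already points in the negative-gradient direction.
        w_out_hidden += lr * d_w_out_hidden / batch_size
        b_out_hidden += lr * np.sum(hidden_out_error, axis=0) / batch_size
        w_hidden_2 += lr * d_w_hidden_2 / batch_size
        b_hidden_2 += lr * np.sum(hidden_2_error, axis=0) / batch_size
        w_hidden_1 += lr * d_w_in_hidden / batch_size
        b_hidden_1 += lr * np.sum(hidden_in_error, axis=0) / batch_size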
def probabilities_from_activation(self, vmap):
    return activation_functions.softmax(vmap[self])
def loss(self, X, y=None, reg=0.0):
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    N, D = X.shape

    # Get scores and perform the forward pass.
    # Layer 1: (N, D) x (D, H) + (1, H) = (N, H)
    scores = np.dot(X, W1) + b1
    relu_activations = ReLU(scores)

    # Dropout after layer 1 would kill whole neurons (columns):
    # drop1 = np.random.rand(scores.shape[1])
    # relu_activations[:, drop1 <= self.dropout_p] = 0

    # Layer 2
    scores = relu_activations.dot(W2) + b2
    if y is None:
        return scores

    softmax_scores = softmax(scores)
    scores = -np.log(softmax_scores[range(N), y])
    loss = np.mean(scores)
    loss += reg * (np.sum(W1 * W1) + np.sum(W2 * W2) +
                   np.sum(b1 * b1) + np.sum(b2 * b2))

    # Calculate gradients via backprop.
    grads = {}

    # Differentiate the loss with respect to the scores for each class.
    dsoft = softmax_scores.copy()
    dsoft[range(N), y] -= 1
    dsoft /= N

    dW2 = relu_activations.T.dot(dsoft)
    dW2 += 2 * reg * W2
    grads['dW2'] = dW2
    db2 = dsoft * 1
    grads['db2'] = np.sum(db2, axis=0)

    dx2 = np.dot(dsoft, W2.T)
    # Only allow gradients to flow back where the ReLU was active; the ReLU
    # output is never negative, so the test must be strict (> 0).
    relu_ones = (relu_activations > 0) * 1
    drelu = dx2 * relu_ones

    dW1 = X.T.dot(drelu)
    dW1 += 2 * reg * W1
    grads['dW1'] = dW1
    db1 = drelu * 1
    grads['db1'] = np.sum(db1, axis=0)

    dimage = drelu.dot(W1.T)
    grads['dimage'] = dimage

    return loss, grads