def test_softmax(self):
        result = af.softmax(np.array([[1, -1], [3, 4]]))
        expected_result = np.array([[0.119202, 0.006692], [0.880797, 0.993307]])
        difference = result - expected_result
        self.assertTrue(np.linalg.norm(difference) < 1e-4)

        A = af.softmax(np.random.randn(5, 10))
        self.assertTrue(np.linalg.norm(np.sum(A, axis=0, keepdims=True) - 1) < 1e-4)
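The test above assumes af.softmax normalizes over axis 0, so every column sums to 1. A minimal sketch of a softmax with that convention (an assumption about af, not its actual code):

import numpy as np

def softmax(z, axis=0):
    # Numerically stable softmax: shift by the max before exponentiating.
    shifted = z - np.max(z, axis=axis, keepdims=True)
    exp_z = np.exp(shifted)
    return exp_z / np.sum(exp_z, axis=axis, keepdims=True)

print(softmax(np.array([[1.0, -1.0], [3.0, 4.0]])))  # columns sum to 1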
Example #2
def activation_forward(A_prev, W, b, activation_way):
    """

    :param A_prev:
    :param W:
    :param b:
    :param activation_way: -- a text string indicate the way we activate this layer, "sigmoid","relu",...
    :return:
    A -- the activation output of this layer
    cache -- a dictionary contains "linear_cache' and "activation_cache"
    """
    cache = dict()
    if activation_way == "relu":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)
    elif activation_way == "sigmoid":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
    elif activation_way == "tanh":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = tanh(Z)
    elif activation_way == "softmax":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = softmax(Z)

    cache["linear_cache"] = linear_cache
    cache["activation_cache"] = activation_cache

    return A, cache
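activation_forward relies on helpers (linear_forward, relu, sigmoid, ...) that are not shown here. A minimal sketch of how they might look, assuming the column-major convention Z = W·A_prev + b and caches that hold the inputs needed for backprop:

import numpy as np

def linear_forward(A_prev, W, b):
    # Affine step; the cache keeps the inputs needed by the backward pass.
    Z = np.dot(W, A_prev) + b
    return Z, (A_prev, W, b)

def relu(Z):
    # Element-wise ReLU; Z itself serves as the activation cache.
    return np.maximum(0, Z), Z

def sigmoid(Z):
    # Element-wise logistic sigmoid; Z itself serves as the activation cache.
    return 1.0 / (1.0 + np.exp(-Z)), Z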
Example #3
 def forward(self, x, t_label):
     """
     x: input data
     """
     self.t_label = t_label
     self.softmax_out = softmax(x)
     self.loss = _cross_entropy_calc(self.softmax_out, self.t_label)
     
     return self.loss
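_cross_entropy_calc is not shown. A minimal sketch, under the assumption that softmax_out and t_label are (batch, classes) arrays with one-hot labels:

import numpy as np

def _cross_entropy_calc(y, t, eps=1e-7):
    # Mean cross-entropy over the batch; eps avoids log(0).
    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + eps)) / batch_size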
Example #4
def mlp_fpass(data, w_1, b_1, w_2, b_2):
    """
  Initializes the MLP weights using Xavier initialization (uniform)
  and the biases with zero
  """
    z_1 = np.add(np.dot(data, w_1), b_1)
    a_1 = af.relu(z_1)
    z_2 = np.add(np.dot(a_1, w_2), b_2)
    a_2 = af.softmax(z_2)

    return z_1, a_1, z_2, a_2
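The weights fed into mlp_fpass are typically initialized with Xavier (Glorot) uniform initialization and zero biases; a minimal sketch of such an initializer (a hypothetical helper, not part of the source):

import numpy as np

def xavier_init(n_in, n_hidden, n_out):
    # Xavier (Glorot) uniform limits for each weight matrix, zero biases.
    limit_1 = np.sqrt(6.0 / (n_in + n_hidden))
    limit_2 = np.sqrt(6.0 / (n_hidden + n_out))
    w_1 = np.random.uniform(-limit_1, limit_1, size=(n_in, n_hidden))
    b_1 = np.zeros(n_hidden)
    w_2 = np.random.uniform(-limit_2, limit_2, size=(n_hidden, n_out))
    b_2 = np.zeros(n_out)
    return w_1, b_1, w_2, b_2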
Example #5
    def __run_network(self,
                      registers: np.ndarray,
                      debug: dict = None) -> tuple:
        def take_params(values, i):
            """Return the next pair of weights and biases after the
            starting index and the new starting index."""
            return values[i], values[i + 1], i + 2

        # Extract the 0th (i.e. P( x = 0 )) component from all registers.
        last_hidden_layer = np.array(registers[:, 0][None, ...],
                                     dtype=np.float32)

        # Propagate forward to hidden layers.
        idx = 0
        for i in range(self.context.num_hidden_layers):
            W, b, idx = take_params(self.context.network, idx)
            last_hidden_layer = relu(last_hidden_layer.dot(W) + b)

        controller_coefficients = []
        for i, gate in enumerate(self.context.gates):
            coeffs = []
            for j in range(gate.arity):
                W, b, idx = take_params(self.context.network, idx)
                coeff = softmax(last_hidden_layer.dot(W) + b)
                coeffs.append(coeff)
            controller_coefficients.append(coeffs)

        # Forward propagate to new register value coefficients.
        for i in range(self.context.num_regs):
            W, b, idx = take_params(self.context.network, idx)
            coeff = softmax(last_hidden_layer.dot(W) + b)
            controller_coefficients.append(coeff)

        # Forward propagate to generate willingness to complete.
        W, b, idx = take_params(self.context.network, idx)
        complete = sigmoid(last_hidden_layer.dot(W) + b)

        if debug is not None:
            debug.fi = np.around(complete.sum(), 3)

        return controller_coefficients, complete
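__run_network walks a flat parameter list with take_params, so self.context.network is assumed to hold alternating weight matrices and bias vectors. A tiny sketch of that layout with hypothetical shapes:

import numpy as np

# Hypothetical flat layout: weights and biases alternate, so each call to
# take_params returns one (W, b) pair and advances the index by two.
network = [np.ones((3, 4)), np.zeros(4),   # hidden layer
           np.ones((4, 2)), np.zeros(2)]   # e.g. one controller head

def take_params(values, i):
    return values[i], values[i + 1], i + 2

W, b, idx = take_params(network, 0)    # W.shape == (3, 4), idx == 2
W, b, idx = take_params(network, idx)  # W.shape == (4, 2), idx == 4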
Example #6
def output_layer(x, w, b):
    """Output layer of a multilayer perceptron.

    Args:
        x: input
        w: weight
        b: bias

    Returns:
        output of the MLP
    """
    u = np.dot(x, w) + b
    return softmax(u)
Example #7
    def feedforward(self, input):
        input = np.array(input)
        self.nodesave = list(range(len(self.hidden_w) + 2))

        # input
        self.nodesave[0] = self.actifunc(
            self.input_w.dot(input) + self.input_b)

        # hidden
        for i, w, b in zip(range(len(self.hidden_w)), self.hidden_w,
                           self.hidden_b):
            self.nodesave[i + 1] = self.actifunc(w.dot(self.nodesave[i]) + b)

        # output
        self.nodesave[-1] = activation_functions.softmax(
            self.output_w.dot(self.nodesave[-2]) + self.output_b)

        return self.nodesave[-1]
Example #8
 def feedforward(self, dropout_rate=[]):
     for layer in range(1, self.number_of_layers): #iterates through every layer, skipping the input layer.
         if (layer == self.number_of_layers - 1 and self.act_funcs[-1] == "softmax"): #softmax has no weights or biases.
             self.z_values[-1] = self.activations[-2] - self.activations[-2].max() #computational trick to avoid overflow.
             self.activations[-1] = af.softmax(self.z_values[-1])
         else:
             activate = af.func_dict(0)[self.act_funcs[layer-1]]
             if (self.validating == True):
                 if (not dropout_rate) or (dropout_rate[layer-1] == 0):
                     scale = 1
                 else:
                     scale = (1 - dropout_rate[layer-1]) #present with probability 1-p.
                 self.z_values[layer] = np.dot(self.weights[layer-1]*scale, self.activations[layer-1]) + self.biases[layer]
                 self.activations[layer] = activate(self.z_values[layer])
             else:
                 self.z_values[layer] = np.matmul(self.weights[layer-1], self.activations[layer-1]) + self.biases[layer]
                 self.activations[layer] = activate(self.z_values[layer])
                 if (dropout_rate) and (dropout_rate[layer-1] != 0):
                     self.activations[layer], self.z_values[layer] = reg.dropout(self.activations[layer], self.z_values[layer], dropout_rate[layer-1], self.parameters[layer])
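reg.dropout is an external helper that is not shown. A minimal sketch of what such a helper might do, assuming standard (non-inverted) dropout so that scaling the weights by (1 - p) in the validation branch above compensates at evaluation time; the fourth argument only mirrors the call signature and is unused here:

import numpy as np

def dropout(activations, z_values, rate, layer_params=None):
    # Drop each unit with probability `rate`; the caller rescales weights
    # by (1 - rate) at validation time instead of rescaling here.
    mask = (np.random.rand(*activations.shape) > rate).astype(activations.dtype)
    return activations * mask, z_values * mask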
Example #9
    def forward(self, incoming_x):
        # incoming data should be (n_sents, vector_dimension, projected_dimension)
        print('x shape into head is', incoming_x.shape)
        n_sents, vector_dimension, projected_dimension = incoming_x.shape
        assert vector_dimension == self.ni

        Q = self.query_layer.forward(incoming_x)
        K = self.key_layer.forward(incoming_x)
        V = self.value_layer.forward(incoming_x)

        # scaled dot product
        score = (Q @ K.T) / np.sqrt(self.no)

        #MASK would occur here,
        #self.mask[:, :, :T, :T] == 0, -np.inf)

        score = activation.softmax(score)
        self.score = self.dropout.forward(score, self.training_now)
        self.output = self.score @ V

        return self.output
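activation.softmax is applied to the full score matrix above; in scaled dot-product attention the softmax is normally taken row-wise so each query's weights sum to 1. A self-contained sketch of that step (an assumption about activation.softmax, not its actual code):

import numpy as np

def row_softmax(scores):
    # Row-wise softmax: each query's attention weights sum to 1.
    shifted = scores - scores.max(axis=-1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=-1, keepdims=True)

Q = np.random.randn(5, 8)   # (sentence length, head dimension)
K = np.random.randn(5, 8)
V = np.random.randn(5, 8)
attn = row_softmax((Q @ K.T) / np.sqrt(8.0))  # (5, 5), rows sum to 1
out = attn @ V                                # (5, 8)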
Example #10
def softmax_ce_derivation(y_hat, y):
    # derivative of CE(softmax(y_hat), y) with respect to y_hat
    return (activation_functions.softmax(y_hat) - y).T
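The value returned above is the gradient of cross-entropy-with-softmax with respect to the logits y_hat. A quick finite-difference check of that identity, with local softmax and cross-entropy helpers standing in for activation_functions:

import numpy as np

def _softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def _ce(z, target):
    return -np.sum(target * np.log(_softmax(z)))

y_hat = np.random.randn(4)
y = np.eye(4)[1]                   # one-hot target
analytic = _softmax(y_hat) - y     # softmax-CE gradient w.r.t. the logits
eps = 1e-6
numeric = np.zeros(4)
for i in range(4):
    step = np.zeros(4)
    step[i] = eps
    numeric[i] = (_ce(y_hat + step, y) - _ce(y_hat - step, y)) / (2 * eps)
print(np.allclose(analytic, numeric, atol=1e-5))  # expected: True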
Example #11
b_hidden_2 = np.zeros(num_hidden_units_2)

w_out_hidden = np.random.normal(0, 1 / (num_features**0.5),
                                (num_hidden_units_2, num_output))
b_out_hidden = np.zeros(num_output)

# Training ---------------------------------------------------------------------------------------------------------------------------
for e in range(epochs + 1):
    for b in range(data_size // batch_size):
        X, Y = shuffle_and_get_batch_data(X_data, Y_data, batch_size)

        # Forward propagation
        hidden_output_1 = activation(np.dot(X, w_hidden_1) + b_hidden_1)
        hidden_output_2 = activation(
            np.dot(hidden_output_1, w_hidden_2) + b_hidden_2)
        nn_output = softmax(
            np.dot(hidden_output_2, w_out_hidden) + b_out_hidden)

        # Backward propagation
        #   Step back through the network and calculate errors and deltas for the weights and biases
        hidden_out_error = Y - nn_output
        d_w_out_hidden = np.dot(hidden_output_2.T, hidden_out_error)

        hidden_2_error = np.multiply(
            np.matmul(hidden_out_error, w_out_hidden.T),
            activation_d(hidden_output_2))
        d_w_hidden_2 = np.dot(hidden_output_1.T, hidden_2_error)

        hidden_in_error = np.multiply(np.matmul(hidden_2_error, w_hidden_2.T),
                                      activation_d(hidden_output_1))
        d_w_in_hidden = np.dot(X.T, hidden_in_error)
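The snippet of Example #11 ends before the parameter update. Since hidden_out_error uses Y - nn_output, the deltas above already point in the loss-decreasing direction; a hypothetical continuation of the loop body (learning_rate and the division by batch_size are assumptions, not part of the source) would simply add them:

        # Hypothetical continuation (not in the original snippet):
        learning_rate = 0.01  # assumed value
        w_out_hidden += learning_rate * d_w_out_hidden / batch_size
        b_out_hidden += learning_rate * hidden_out_error.sum(axis=0) / batch_size
        w_hidden_2 += learning_rate * d_w_hidden_2 / batch_size
        b_hidden_2 += learning_rate * hidden_2_error.sum(axis=0) / batch_size
        w_hidden_1 += learning_rate * d_w_in_hidden / batch_size
        b_hidden_1 += learning_rate * hidden_in_error.sum(axis=0) / batch_size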
Example #12
 def probabilities_from_activation(self, vmap):
     return activation_functions.softmax(vmap[self])
Example #13
 def probabilities_from_activation(self, vmap):
     return activation_functions.softmax(vmap[self])
Example #14
    def loss(self, X, y=None, reg=0.0):
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        N, D = X.shape
        """
            Get scores and perform forward pass
        """
        scores = None

        # Layer 1
        # (N, D)x(D, H) + (1, H) = (N, H)
        scores = np.dot(X, W1) + b1
        relu_activations = ReLU(scores)

        # Dropout after layer 1
        # Kill whole neurons
        # drop1 = np.random.randn(scores.shape)
        # relu_activations[:, drop1 <= self.dropout_p] = 0

        # Layer 2
        scores = relu_activations.dot(W2) + b2

        if y is None:
            return scores

        softmax_scores = softmax(scores)
        scores = -np.log(softmax_scores[range(N), y])
        loss = np.mean(scores)
        # L2 regularization (biases are left unregularized to match the gradients below)
        loss += reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
        """
            Calculating gradients via backprop
        """

        grads = {}

        # Differentiate loss wrt scores for each class
        dsoft = softmax_scores.copy()
        dsoft[range(N), y] -= 1
        dsoft /= N

        dW2 = relu_activations.T.dot(dsoft)
        dW2 += 2 * reg * W2
        grads['dW2'] = dW2

        db2 = dsoft * 1
        grads['db2'] = np.sum(db2, axis=0)

        dx2 = np.dot(dsoft, W2.T)
        relu_ones = (relu_activations > 0) * 1
        # Only allow gradients to flow back where the ReLU pre-activation was positive
        drelu = dx2 * relu_ones

        dW1 = X.T.dot(drelu)
        dW1 += 2 * reg * W1
        grads['dW1'] = dW1

        db1 = drelu * 1
        grads['db1'] = np.sum(db1, axis=0)

        dimage = drelu.dot(W1.T)
        grads['dimage'] = dimage

        return loss, grads