Example #1
def _check_gradients(X, Y: Tensor2D, parameters: Parameters,
                     gradients: Parameters, lamb: float):
    epsilon = 1e-7
    parameters_ = deepcopy(parameters)
    numerical_gradients = {}
    for param_name, param_values in parameters_.items():
        print("Calculating numeric gradients for {}".format(param_name))
        param_shape = shape(param_values)
        numerical_gradients[param_name] = zeros(*param_shape)
        for i in range(param_shape[0]):
            for j in range(param_shape[1]):
                numerical_gradients[param_name][i][j] = _single_param_numerical_gradient(
                    X, Y, parameters_, lamb, param_name, i, j, epsilon)

    gradients_vector = _params_to_single_vector(gradients)
    numerical_gradients_vector = _params_to_single_vector(numerical_gradients)

    assert shape(gradients_vector) == shape(numerical_gradients_vector)

    delta = l2_norm(minus(numerical_gradients_vector, gradients_vector)) / (
        l2_norm(numerical_gradients_vector) + l2_norm(gradients_vector))

    if delta > epsilon:
        print("Gradient check failed delta={} > {} !!!!!".format(
            delta, epsilon))
    else:
        print("Gradient check passed delta={}".format(delta))
Example #2
def shuffle_truncate_dataset(X: Tensor2D, Y: Tensor2D, truncate: int = None) -> Tuple[Tensor2D, Tensor2D]:
    assert shape(X)[1] == shape(Y)[1], "X and Y should have the same number of columns (training examples)"

    index = list(range(shape(X)[1]))
    shuffle(index)
    if truncate and truncate < len(index):
        index = index[:truncate]

    X_ = [[Xi[j] for j in index] for Xi in X]
    Y_ = [[Yi[j] for j in index] for Yi in Y]
    return X_, Y_
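Examples live in columns here, so shuffling means permuting column indices and applying the same permutation to both X and Y. A small standalone illustration with made-up data:

from random import shuffle

X = [[1, 2, 3, 4],        # feature row 0, one column per example
     [5, 6, 7, 8]]        # feature row 1
Y = [[0, 1, 0, 1]]        # labels, same column order as X

index = list(range(len(X[0])))
shuffle(index)
index = index[:3]         # optional truncation, as with truncate=3

X_ = [[Xi[j] for j in index] for Xi in X]
Y_ = [[Yi[j] for j in index] for Yi in Y]
# X_ and Y_ stay aligned because both were sliced with the same index list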
Example #3
def _calculate_cost(Y_hat, Y: Tensor2D, parameters: Parameters,
                    lamb: float) -> float:
    batch_size = shape(Y)[1]
    Y_loss = multinomial_logistic.loss(Y_hat, Y)
    assert shape(Y_loss) == (1, batch_size)
    # average loss. sum rows and convert to single scalar
    cost = (1. / batch_size) * sum_all(Y_loss)

    # regularization
    if lamb != 0.:
        param_sq_sum = 0.
        for param_key, param_values in parameters.items():
            # only do regularization on W, not b, parameters
            if param_key.startswith('W'):
                param_sq_sum += sum_all(element_sq(param_values))

        cost += (lamb / (2. * batch_size)) * param_sq_sum

    return cost
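The L2 penalty only touches the weight matrices (keys starting with 'W'), scaled by lamb / (2 * batch_size). A tiny hypothetical computation of just that term:

lamb, batch_size = 0.1, 4
parameters = {
    "W1": [[1.0, -2.0], [0.5, 0.0]],   # penalised
    "B1": [[3.0], [3.0]],              # biases are skipped
}
param_sq_sum = sum(v * v
                   for key, rows in parameters.items() if key.startswith("W")
                   for row in rows
                   for v in row)
penalty = (lamb / (2. * batch_size)) * param_sq_sum
print(penalty)  # 0.0125 * 5.25 = 0.065625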
Example #4
def _params_to_single_vector(parameters: Parameters) -> Tensor2D:
    size = 0
    for param_values in parameters.values():
        param_shape = shape(param_values)
        size += param_shape[0] * param_shape[1]

    vector = zeros(size, 1)

    offset = 0
    for param_name in sorted(parameters.keys()):
        param_values = parameters[param_name]
        param_shape = shape(param_values)
        for i in range(param_shape[0]):
            for j in range(param_shape[1]):
                index = offset + (j * param_shape[0]) + i
                vector[index][0] = param_values[i][j]
        offset += param_shape[0] * param_shape[1]

    return vector
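Parameters are concatenated in sorted key order and each matrix is flattened column-major (index = offset + j * rows + i). The hypothetical re-statement below reproduces that ordering with ordinary lists:

def flatten_column_major(params):
    # sorted keys, then each matrix column by column, as a single column vector
    out = []
    for name in sorted(params):
        M = params[name]
        rows, cols = len(M), len(M[0])
        for j in range(cols):
            for i in range(rows):
                out.append([M[i][j]])
    return out

params = {"W1": [[1.0, 3.0], [2.0, 4.0]], "B1": [[7.0], [8.0]]}
print(flatten_column_major(params))
# [[7.0], [8.0], [1.0], [2.0], [3.0], [4.0]]  -> B1 first, then W1 one column at a time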
Example #5
def split_into_batches(A: Tensor2D, batch_size: int) -> List[Tensor2D]:
    A_shape = shape(A)
    num_batches = floor(A_shape[1] / batch_size)
    overflow = A_shape[1] - (num_batches * batch_size)
    batches = []

    def _one_batch(start, end):
        return [Ai[start:end] for Ai in A]

    for b in range(num_batches):
        batches.append(_one_batch(b * batch_size, (b + 1) * batch_size))

    if overflow != 0:
        batches.append(_one_batch(num_batches * batch_size, A_shape[1]))

    total_cols_size = 0
    for batch in batches:
        batch_shape = shape(batch)
        assert batch_shape[0] == A_shape[0]
        assert batch_shape[1] == batch_size or batch_shape[1] == overflow
        total_cols_size += batch_shape[1]
    assert total_cols_size == A_shape[1]

    return batches
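Batching slices columns, so every batch keeps all rows and the last batch picks up whatever columns are left over. A compact standalone version of the same slicing (hypothetical data):

A = [[1, 2, 3, 4, 5],
     [6, 7, 8, 9, 10]]
batch_size = 2
cols = len(A[0])
batches = [[row[s:s + batch_size] for row in A] for s in range(0, cols, batch_size)]
# three batches with 2, 2 and 1 columns; the final one holds the overflow
for batch in batches:
    print(batch)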
Example #6
def softmax(Z: Tensor2D, stable=True) -> Tensor2D:
    Z_shape = shape(Z)

    if stable:
        # stable softmax via https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/
        # shift every entry by the global maximum so no exponent can overflow
        Z_max = max(max(Zi) for Zi in Z)
        Z_minus_max = minus(Z, [[Z_max]])
        Z_exp = element_exp(Z_minus_max)
    else:
        Z_exp = element_exp(Z)

    Z_exp_col_sum = zeros(1, Z_shape[1])

    for i in range(Z_shape[0]):
        for j in range(Z_shape[1]):
            Z_exp_col_sum[0][j] += Z_exp[i][j]

    Z_softmax = zeros(*Z_shape)
    for i in range(Z_shape[0]):
        for j in range(Z_shape[1]):
            Z_softmax[i][j] = Z_exp[i][j] / Z_exp_col_sum[0][j]

    return Z_softmax
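Subtracting a single scalar from every entry leaves the column-wise softmax unchanged (the factor exp(-Z_max) cancels in numerator and denominator) while keeping the exponentials from overflowing. A common variant, shown below as a standalone sketch, shifts each column by its own maximum instead:

import math

def softmax_cols(Z):
    # column-wise softmax on a plain 2D list, shifting each column by its own max
    rows, cols = len(Z), len(Z[0])
    out = [[0.0] * cols for _ in range(rows)]
    for j in range(cols):
        col_max = max(Z[i][j] for i in range(rows))
        exps = [math.exp(Z[i][j] - col_max) for i in range(rows)]
        total = sum(exps)
        for i in range(rows):
            out[i][j] = exps[i] / total
    return out

Z = [[1000.0, 1.0],
     [1001.0, 2.0]]
S = softmax_cols(Z)
print(S[0][0] + S[1][0], S[0][1] + S[1][1])  # each column sums to 1.0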
Example #7
def _train_one_epoch(X_train_batches: List[Tensor2D],
                     Y_train_batches: List[Tensor2D], parameters: Parameters,
                     learning_rate: float, lamb: float) -> Parameters:
    total_batches = len(X_train_batches)
    trained_examples = 0
    for batch_index in range(len(X_train_batches)):
        batch_start_time = time.time()

        X_train_batch = X_train_batches[batch_index]
        Y_train_batch = Y_train_batches[batch_index]

        loss, parameters, train_accuracy = _train_one_mini_batch(
            X_train_batch, Y_train_batch, learning_rate, parameters, lamb)

        batch_duration = time.time() - batch_start_time

        trained_examples += shape(X_train_batch)[1]

        print(
            " batch: {}/{}  training loss: {:0.2f}  train accuracy: {:0.2f}%  duration: {:0.2f}s"
            .format(batch_index + 1, total_batches, loss,
                    train_accuracy * 100., batch_duration))

    return parameters
Example #8
def test_shape(matrix, expected_shape):
    assert shape(matrix) == expected_shape
Example #9
def _calculate_accuracy(X, Y: Tensor2D, parameters: Parameters) -> float:
    Y_shape = shape(Y)
    Y_hat, _ = _forward_propagation(X, parameters)
    num_examples = Y_shape[1]
    num_correct = sum_all(element_equals(argmax(Y_hat), argmax(Y)))
    return num_correct / num_examples
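Accuracy is the fraction of columns where the predicted class (argmax of Y_hat per column) matches the one-hot target. A self-contained sketch of that comparison (the argmax/element_equals helpers are assumed to behave like this):

def argmax_cols(A):
    # row index of the largest entry in each column, returned as a 1 x cols row
    rows, cols = len(A), len(A[0])
    return [[max(range(rows), key=lambda i: A[i][j]) for j in range(cols)]]

Y_hat = [[0.9, 0.2],
         [0.1, 0.8]]
Y     = [[1.0, 0.0],
         [0.0, 1.0]]
pred, truth = argmax_cols(Y_hat)[0], argmax_cols(Y)[0]
accuracy = sum(p == t for p, t in zip(pred, truth)) / len(truth)
print(accuracy)  # 1.0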
Example #10
def _backward_propagation(X, Y: Tensor2D, parameters: Parameters, lamb: float,
                          cache: Parameters) -> Parameters:
    X_shape = shape(X)

    batch_size = X_shape[1]

    W1 = parameters["W1"]
    B1 = parameters["B1"]
    W2 = parameters["W2"]
    B2 = parameters["B2"]
    W3 = parameters["W3"]
    B3 = parameters["B3"]

    A0 = X
    Z1 = cache["Z1"]
    A1 = cache["A1"]
    Z2 = cache["Z2"]
    A2 = cache["A2"]
    Z3 = cache["Z3"]
    A3 = cache["A3"]
    Y_hat = A3

    # Layer 3 (output) derivatives
    dZ3 = minus(Y_hat, Y)
    assert shape(dZ3) == shape(Z3)
    dW3 = element_multiply([[1. / batch_size]],
                           matrix_multiply(dZ3, transpose(A2)))
    if lamb != 0.:
        dW3 = add(dW3, _regularization_gradient(lamb, batch_size, W3))
    assert shape(dW3) == shape(W3)
    dB3 = element_multiply([[1. / batch_size]], sum_rows(dZ3))
    assert shape(dB3) == shape(B3)

    # Layer 2 (hidden) derivatives
    dZ2 = element_multiply(matrix_multiply(transpose(W3), dZ3),
                           relu.relu_derivative(Z2))
    assert shape(dZ2) == shape(Z2)
    dW2 = element_multiply([[1. / batch_size]],
                           matrix_multiply(dZ2, transpose(A1)))
    if lamb != 0.:
        dW2 = add(dW2, _regularization_gradient(lamb, batch_size, W2))
    assert shape(dW2) == shape(W2)
    dB2 = element_multiply([[1. / batch_size]], sum_rows(dZ2))
    assert shape(dB2) == shape(B2)

    # Layer 1 (hidden) derivatives
    dZ1 = element_multiply(matrix_multiply(transpose(W2), dZ2),
                           relu.relu_derivative(Z1))
    assert shape(dZ1) == shape(Z1)
    dW1 = element_multiply([[1. / batch_size]],
                           matrix_multiply(dZ1, transpose(A0)))
    if lamb != 0.:
        dW1 = add(dW1, _regularization_gradient(lamb, batch_size, W1))
    assert shape(dW1) == shape(W1)
    dB1 = element_multiply([[1. / batch_size]], sum_rows(dZ1))
    assert shape(dB1) == shape(B1)

    # return gradients for weights and bias for each layer
    gradients = {
        "dW1": dW1,
        "dB1": dB1,
        "dW2": dW2,
        "dB2": dB2,
        "dW3": dW3,
        "dB3": dB3,
    }

    return gradients
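The starting point dZ3 = Y_hat - Y is the combined derivative of softmax followed by cross-entropy loss at the output layer (which _calculate_cost's use of multinomial_logistic.loss suggests). A quick standalone finite-difference check of that identity for a single example:

import math

def softmax_vec(z):
    m = max(z)
    e = [math.exp(v - m) for v in z]
    s = sum(e)
    return [v / s for v in e]

def cross_entropy(z, y):
    # loss for one example with one-hot target y
    p = softmax_vec(z)
    return -sum(yi * math.log(pi) for yi, pi in zip(y, p))

z, y, eps = [0.3, -1.2, 2.0], [0.0, 0.0, 1.0], 1e-6
analytic = [p - yi for p, yi in zip(softmax_vec(z), y)]      # "Y_hat minus Y"
numeric = []
for k in range(len(z)):
    zp, zm = list(z), list(z)
    zp[k] += eps
    zm[k] -= eps
    numeric.append((cross_entropy(zp, y) - cross_entropy(zm, y)) / (2 * eps))
print(analytic)
print(numeric)   # agrees with the analytic gradient to roughly 6 decimal places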