Example no. 1
def least_squares_fit(xs: List[Vector],
                      ys: List[float],
                      learning_rate: float = 0.001,
                      num_steps: int = 1000,
                      batch_size: int = 1) -> Vector:
    """
    Find the beta that minimizes the sum of squared errors
    assuming the model y = dot(x, beta)
    """

    # Start with a random guess
    guess = [random.random() for _ in xs[0]]

    for _ in tqdm.trange(num_steps, desc="least squares fit"):
        for start in range(0, len(xs), batch_size):
            batch_xs = xs[start:start + batch_size]
            batch_ys = ys[start:start + batch_size]

            gradient = vector_mean([
                sqerror_gradient(x, y, guess)
                for x, y in zip(batch_xs, batch_ys)
            ])
            guess = gradient_step(guess, gradient, -learning_rate)

    return guess
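
This fit leans on helpers (sqerror_gradient, vector_mean, gradient_step) that are imported from elsewhere in the scratch codebase and not shown here. A minimal, self-contained sketch of what they plausibly look like, assuming Vector is just an alias for List[float] and that the first component of each x is a constant 1 for the intercept (the exact scratch implementations may differ):

from typing import List

Vector = List[float]

def dot(v: Vector, w: Vector) -> float:
    """Sum of componentwise products."""
    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def sqerror_gradient(x: Vector, y: float, beta: Vector) -> Vector:
    """Gradient of the squared error (dot(x, beta) - y) ** 2 with respect to beta."""
    err = dot(x, beta) - y
    return [2 * err * x_i for x_i in x]

def vector_mean(vectors: List[Vector]) -> Vector:
    """Componentwise mean of a list of same-length vectors."""
    n = len(vectors)
    return [sum(v[i] for v in vectors) / n for i in range(len(vectors[0]))]

def gradient_step(v: Vector, gradient: Vector, step_size: float) -> Vector:
    """Move step_size in the gradient direction from v."""
    return [v_i + step_size * g_i for v_i, g_i in zip(v, gradient)]

Passing -learning_rate to gradient_step is what makes the loop above gradient descent rather than ascent.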
Example no. 2
def train(network: List[List[Vector]],
          xs: List[Vector],
          ys: List[Vector],
          epochs: int,
          learning_rate: float) -> List[List[Vector]]:
    for epoch in tqdm.trange(epochs, desc="Neural network for xor"):
        for x, y in zip(xs, ys):
            gradients = sqerror_gradients(network, x, y)
            # Take a gradient step for each neuron in each layer
            network = [[gradient_step(neuron, grad, -learning_rate)
                        for neuron, grad in zip(layer, layer_grad)]
                       for layer, layer_grad in zip(network, gradients)]
    return network
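
This trainer presupposes feed_forward and sqerror_gradients from the neural-network material. A hedged sketch of both, assuming sigmoid neurons whose last weight is the bias and a network with exactly one hidden layer and one output layer (the shape the XOR examples below actually build); the real module may be organized differently:

import math
from typing import List

Vector = List[float]

def dot(v: Vector, w: Vector) -> float:
    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def sigmoid(t: float) -> float:
    return 1 / (1 + math.exp(-t))

def neuron_output(weights: Vector, inputs: Vector) -> float:
    # weights includes the bias term, inputs includes a trailing 1
    return sigmoid(dot(weights, inputs))

def feed_forward(network: List[List[Vector]], input_vector: Vector) -> List[Vector]:
    """Feed the input through the network and return the outputs of every layer."""
    outputs: List[Vector] = []
    for layer in network:
        input_with_bias = input_vector + [1]
        output = [neuron_output(neuron, input_with_bias) for neuron in layer]
        outputs.append(output)
        input_vector = output   # this layer's output is the next layer's input
    return outputs

def sqerror_gradients(network: List[List[Vector]],
                      input_vector: Vector,
                      target_vector: Vector) -> List[List[Vector]]:
    """Gradient of the squared-error loss with respect to every weight,
    assuming exactly one hidden layer and one output layer."""
    hidden_outputs, outputs = feed_forward(network, input_vector)

    # deltas at the output neurons (sigmoid derivative is out * (1 - out))
    output_deltas = [output * (1 - output) * (output - target)
                     for output, target in zip(outputs, target_vector)]
    output_grads = [[output_deltas[i] * hidden_output
                     for hidden_output in hidden_outputs + [1]]
                    for i, _ in enumerate(network[-1])]

    # deltas at the hidden neurons, backpropagated through the output weights
    hidden_deltas = [hidden_output * (1 - hidden_output) *
                     dot(output_deltas, [n[i] for n in network[-1]])
                     for i, hidden_output in enumerate(hidden_outputs)]
    hidden_grads = [[hidden_deltas[i] * inp for inp in input_vector + [1]]
                    for i, _ in enumerate(network[0])]

    return [hidden_grads, output_grads]

With gradient_step as sketched earlier, train(network, xs, ys, 20000, 1.0) reproduces the XOR fit shown in the later examples.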
def first_principal_component(data: List[Vector],
                              n: int = 100,
                              step_size: float = 0.1) -> Vector:
    # Start with an arbitrary guess (all 1s)
    guess = [1.0 for _ in data[0]]

    with tqdm.trange(n) as t:
        for _ in t:
            dv = directional_variance(data, guess)
            gradient = directional_variance_gradient(data, guess)
            guess = gradient_step(guess, gradient, step_size)
            t.set_description(f"dv: {dv:.3f}")

    return direction(guess)
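
first_principal_component maximizes the variance of the (assumed already de-meaned) data along a direction, so the helpers it calls might look roughly like this. These bodies are a sketch consistent with how they are used above, not necessarily the exact scratch implementations:

import math
from typing import List

Vector = List[float]

def dot(v: Vector, w: Vector) -> float:
    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def magnitude(v: Vector) -> float:
    return math.sqrt(dot(v, v))

def direction(w: Vector) -> Vector:
    """Rescale w to unit length."""
    mag = magnitude(w)
    return [w_i / mag for w_i in w]

def directional_variance(data: List[Vector], w: Vector) -> float:
    """Variance of the de-meaned data in the direction of w."""
    w_dir = direction(w)
    return sum(dot(v, w_dir) ** 2 for v in data)

def directional_variance_gradient(data: List[Vector], w: Vector) -> Vector:
    """Gradient of directional_variance with respect to w."""
    w_dir = direction(w)
    return [sum(2 * dot(v, w_dir) * v[i] for v in data)
            for i in range(len(w))]

Note the positive step_size in the loop above: it is gradient ascent on directional_variance, whereas the regression examples pass -learning_rate to descend.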
Example no. 4
def first_principal_component(data: List[Vector],
                              n: int = 100,
                              step_size: float = 0.1) -> Vector:
    # Start with an arbitrary guess (all 1s)
    guess = [1.0 for _ in data[0]]

    with tqdm.trange(n) as t:
        for _ in t:
            dv = directional_variance(data, guess)
            gradient = directional_variance_gradient(data, guess)
            guess = gradient_step(guess, gradient, step_size)
            t.set_description(f"dv: {dv:.3f}")

    return direction(guess)
def least_squares_fit(xs: List[Vector],
                      ys: List[float],
                      learning_rate: float = 0.001,
                      num_steps: int = 1000,
                      batch_size: int = 1) -> Vector:
    guess = [random.random() for _ in xs[0]]

    for _ in tqdm.trange(num_steps, desc="least squares fit"):
        for start in range(0, len(xs), batch_size):
            batch_xs = xs[start:start + batch_size]
            batch_ys = ys[start:start + batch_size]

            gradient = vector_mean([
                squared_gradient(x, y, guess)
                for x, y in zip(batch_xs, batch_ys)
            ])
            guess = gradient_step(guess, gradient, -learning_rate)

    return guess
Example no. 6
def least_squares_fit_ridge(xs: List[Vector],
                            ys: List[float],
                            alpha: float,
                            learning_rate: float,
                            num_steps: int,
                            batch_size: int = 1) -> Vector:
    guess = [random.random() for _ in xs[0]]

    for i in range(num_steps):
        for start in range(0, len(xs), batch_size):
            batch_xs = xs[start:start + batch_size]
            batch_ys = ys[start:start + batch_size]

            gradient = vector_mean([
                sqerror_ridge_gradient(x, y, guess, alpha)
                for x, y in zip(batch_xs, batch_ys)
            ])
            guess = gradient_step(guess, gradient, -learning_rate)

    return guess
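
least_squares_fit_ridge differs from least_squares_fit only in the gradient it averages. A sketch of sqerror_ridge_gradient, reusing the Vector alias and sqerror_gradient sketched after Example no. 1 and assuming the intercept term beta[0] is left unpenalized (a common convention, though the actual helper may differ):

def add(v: Vector, w: Vector) -> Vector:
    """Componentwise sum of two vectors."""
    return [v_i + w_i for v_i, w_i in zip(v, w)]

def ridge_penalty_gradient(beta: Vector, alpha: float) -> Vector:
    """Gradient of alpha * sum(beta_j ** 2 for j >= 1); the intercept is not penalized."""
    return [0.0] + [2 * alpha * beta_j for beta_j in beta[1:]]

def sqerror_ridge_gradient(x: Vector, y: float, beta: Vector, alpha: float) -> Vector:
    """Gradient of squared error plus the ridge penalty for a single example."""
    return add(sqerror_gradient(x, y, beta), ridge_penalty_gradient(beta, alpha))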
def main():
    import random
    import tqdm
    from scratch.gradient_descent import gradient_step

    num_epochs = 10000
    random.seed(0)

    guess = [random.random(),
             random.random()]  # choose random value to start

    learning_rate = 0.00001

    with tqdm.trange(num_epochs) as t:
        for _ in t:
            alpha, beta = guess

            # Partial derivative of loss with respect to alpha
            grad_a = sum(
                2 * error(alpha, beta, x_i, y_i)
                for x_i, y_i in zip(num_friends_good, daily_minutes_good))

            # Partial derivative of loss with respect to beta
            grad_b = sum(
                2 * error(alpha, beta, x_i, y_i) * x_i
                for x_i, y_i in zip(num_friends_good, daily_minutes_good))

            # Compute loss to stick in the tqdm description
            loss = sum_of_sqerrors(alpha, beta, num_friends_good,
                                   daily_minutes_good)
            t.set_description(f"loss: {loss:.3f}")

            # Finally, update the guess
            guess = gradient_step(guess, [grad_a, grad_b], -learning_rate)

    # We should get pretty much the same results:
    alpha, beta = guess
    assert 22.9 < alpha < 23.0
    assert 0.9 < beta < 0.905
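
These simple-linear-regression mains assume error and sum_of_sqerrors with the (alpha, beta, x_i, y_i) signature, plus the num_friends_good / daily_minutes_good dataset defined elsewhere in the chapter. A minimal sketch of the two helpers, consistent with how they are called above:

from typing import List

def predict(alpha: float, beta: float, x_i: float) -> float:
    """Prediction of the simple linear model y = beta * x_i + alpha."""
    return beta * x_i + alpha

def error(alpha: float, beta: float, x_i: float, y_i: float) -> float:
    """Signed error of predicting beta * x_i + alpha when the actual value is y_i."""
    return predict(alpha, beta, x_i) - y_i

def sum_of_sqerrors(alpha: float, beta: float, x: List[float], y: List[float]) -> float:
    """Total squared error over the dataset; this is the loss being minimized."""
    return sum(error(alpha, beta, x_i, y_i) ** 2
               for x_i, y_i in zip(x, y))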
def main():
    import random
    import tqdm
    from scratch.gradient_descent import gradient_step

    num_epochs = 10000
    random.seed(0)

    guess = [random.random(), random.random()]  # choose random value to start

    learning_rate = 0.00001

    with tqdm.trange(num_epochs) as t:
        for _ in t:
            alpha, beta = guess

            # Partial derivative of loss with respect to alpha
            grad_a = sum(
                2 * error(alpha, beta, x_i, y_i)
                for x_i, y_i in zip(num_friends_good, daily_minutes_good))

            # Partial derivative of loss with respect to beta
            grad_b = sum(
                2 * error(alpha, beta, x_i, y_i) * x_i
                for x_i, y_i in zip(num_friends_good, daily_minutes_good))

            # Compute loss to stick in the tqdm description
            loss = sum_of_sqerrors(alpha, beta, num_friends_good,
                                   daily_minutes_good)
            t.set_description(f"loss: {loss:.3f}")

            # Finally, update the guess
            guess = gradient_step(guess, [grad_a, grad_b], -learning_rate)

    # We should get pretty much the same results:
    alpha, beta = guess
    assert 22.9 < alpha < 23.0
    assert 0.9 < beta < 0.905
def main():
    import random
    import tqdm
    from scratch.gradient_descent import gradient_step
    
    num_epochs = 10000
    random.seed(0)
    
    guess = [random.random(), random.random()]  # choose random value to start
    
    learning_rate = 0.00001
    
    with tqdm.trange(num_epochs) as t:
        for _ in t:
            alpha, beta = guess
    
            # Partial derivative of loss with respect to alpha
            grad_a = sum(2 * error(alpha, beta, x_i, y_i)
                         for x_i, y_i in zip(num_friends_good,
                                             daily_minutes_good))
    
            # Partial derivative of loss with respect to beta
            grad_b = sum(2 * error(alpha, beta, x_i, y_i) * x_i
                         for x_i, y_i in zip(num_friends_good,
                                             daily_minutes_good))
    
            # Compute loss to stick in the tqdm description
            loss = sum_of_sqerrors(alpha, beta,
                                   num_friends_good, daily_minutes_good)
            t.set_description(f"loss: {loss:.3f}")
    
            # Finally, update the guess
            guess = gradient_step(guess, [grad_a, grad_b], -learning_rate)
    
    # We should get pretty much the same results:
    alpha, beta = guess
    assert 22.9 < alpha < 23.0
    assert 0.9 < beta < 0.905
Example no. 10
def least_squares_fit(xs: List[Vector],
                      ys: List[float],
                      learning_rate: float = 0.001,
                      num_steps: int = 1000,
                      batch_size: int = 1) -> Vector:
    """
    Find the beta that minimizes the sum of squared errors,
    assuming the model y = dot(x, beta).
    """
    # Start with a random guess
    guess = [random.random() for _ in xs[0]]

    for _ in tqdm.trange(num_steps, desc="least squares fit"):
        for start in range(0, len(xs), batch_size):
            batch_xs = xs[start:start + batch_size]
            batch_ys = ys[start:start + batch_size]

            gradient = vector_mean([
                sqerror_gradient(x, y, guess)
                for x, y in zip(batch_xs, batch_ys)
            ])
            guess = gradient_step(guess, gradient, -learning_rate)

    return guess
Example no. 11
def gradient_descent(x: List[float], y: List[float]) -> List[float]:
    num_epochs = 10000
    random.seed(0)

    guess = [random.random(), random.random()] #choose random value to start

    learning_rate = 0.00001

    with tqdm.trange(num_epochs) as t:
        for _ in t:
            alpha, beta = guess

            # Partial derivative of loss with respect to alpha
            grad_a = sum(2 * error(alpha, beta, x_i, y_i)
                         for x_i, y_i in zip(x, y))

            # Partial derivative of loss with respect to beta
            grad_b = sum(2 * error(alpha, beta, x_i, y_i) * x_i
                         for x_i, y_i in zip(x, y))

            # The loss is the sum of squared errors for the current (alpha, beta);
            # minimizing it gives the best-fitting alpha and beta
            loss = sum_of_sqerrors(alpha, beta, x, y)
            t.set_description(f"loss: {loss:.3f}")

            # Finally, update the guess
            guess = gradient_step(guess, [grad_a, grad_b], -learning_rate)

    return guess

#We expect a user with n friends to spend 22.95 + n * 0.903 minutes on the site each day
#alpha, beta = gradient_descent(num_friends_good, daily_minutes_good)
#print (alpha, beta)
#assert 22.9 < alpha < 23.0
#assert 0.9 < beta < 0.905
Example no. 12
def main():
    import random
    random.seed(0)
    
    # training data
    xs = [[0., 0], [0., 1], [1., 0], [1., 1]]
    ys = [[0.], [1.], [1.], [0.]]
    
    # start with random weights
    network = [ # hidden layer: 2 inputs -> 2 outputs
                [[random.random() for _ in range(2 + 1)],   # 1st hidden neuron
                 [random.random() for _ in range(2 + 1)]],  # 2nd hidden neuron
                # output layer: 2 inputs -> 1 output
                [[random.random() for _ in range(2 + 1)]]   # 1st output neuron
              ]
    
    from scratch.gradient_descent import gradient_step
    import tqdm
    
    learning_rate = 1.0
    
    for epoch in tqdm.trange(20000, desc="neural net for xor"):
        for x, y in zip(xs, ys):
            gradients = sqerror_gradients(network, x, y)
    
            # Take a gradient step for each neuron in each layer
            network = [[gradient_step(neuron, grad, -learning_rate)
                        for neuron, grad in zip(layer, layer_grad)]
                       for layer, layer_grad in zip(network, gradients)]
    
    # check that it really learned XOR
    assert feed_forward(network, [0, 0])[-1][0] < 0.01
    assert feed_forward(network, [0, 1])[-1][0] > 0.99
    assert feed_forward(network, [1, 0])[-1][0] > 0.99
    assert feed_forward(network, [1, 1])[-1][0] < 0.01
    
    xs = [binary_encode(n) for n in range(101, 1024)]
    ys = [fizz_buzz_encode(n) for n in range(101, 1024)]
    
    NUM_HIDDEN = 25
    
    network = [
        # hidden layer: 10 inputs -> NUM_HIDDEN outputs
        [[random.random() for _ in range(10 + 1)] for _ in range(NUM_HIDDEN)],
    
        # output layer: NUM_HIDDEN inputs -> 4 outputs
        [[random.random() for _ in range(NUM_HIDDEN + 1)] for _ in range(4)]
    ]
    
    from scratch.linear_algebra import squared_distance
    
    learning_rate = 1.0
    
    with tqdm.trange(500) as t:
        for epoch in t:
            epoch_loss = 0.0
    
            for x, y in zip(xs, ys):
                predicted = feed_forward(network, x)[-1]
                epoch_loss += squared_distance(predicted, y)
                gradients = sqerror_gradients(network, x, y)
    
                # Take a gradient step for each neuron in each layer
                network = [[gradient_step(neuron, grad, -learning_rate)
                            for neuron, grad in zip(layer, layer_grad)]
                        for layer, layer_grad in zip(network, gradients)]
    
            t.set_description(f"fizz buzz (loss: {epoch_loss:.2f})")
    
    num_correct = 0
    
    for n in range(1, 101):
        x = binary_encode(n)
        predicted = argmax(feed_forward(network, x)[-1])
        actual = argmax(fizz_buzz_encode(n))
        labels = [str(n), "fizz", "buzz", "fizzbuzz"]
        print(n, labels[predicted], labels[actual])
    
        if predicted == actual:
            num_correct += 1
    
    print(num_correct, "/", 100)
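
The fizz-buzz half of this example additionally needs binary_encode, fizz_buzz_encode, and argmax. Their behavior is pinned down by how they are used (10-bit inputs, 4-way one-hot targets, index of the largest output), so a plausible sketch is:

from typing import List

def binary_encode(x: int) -> List[float]:
    """Encode x as its 10 low-order bits, least significant first."""
    binary: List[float] = []
    for _ in range(10):
        binary.append(x % 2)
        x = x // 2
    return binary

def fizz_buzz_encode(x: int) -> List[float]:
    """One-hot target: [number, fizz, buzz, fizzbuzz]."""
    if x % 15 == 0:
        return [0, 0, 0, 1]
    elif x % 5 == 0:
        return [0, 0, 1, 0]
    elif x % 3 == 0:
        return [0, 1, 0, 0]
    else:
        return [1, 0, 0, 0]

def argmax(xs: list) -> int:
    """Index of the largest value."""
    return max(range(len(xs)), key=lambda i: xs[i])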
Example no. 13
def main():
    from matplotlib import pyplot as plt
    plt.close()
    plt.clf()
    plt.gca().clear()

    from matplotlib import pyplot as plt
    from scratch.working_with_data import rescale
    from scratch.multiple_regression import least_squares_fit, predict
    from scratch.gradient_descent import gradient_step

    learning_rate = 0.001
    rescaled_xs = rescale(xs)
    beta = least_squares_fit(rescaled_xs, ys, learning_rate, 1000, 1)
    # [0.26, 0.43, -0.43]
    predictions = [predict(x_i, beta) for x_i in rescaled_xs]

    plt.scatter(predictions, ys)
    plt.xlabel("predicted")
    plt.ylabel("actual")
    # plt.show()

    plt.savefig('im/linear_regression_for_probabilities.png')
    plt.close()

    from scratch.machine_learning import train_test_split
    import random
    import tqdm

    random.seed(0)
    x_train, x_test, y_train, y_test = train_test_split(rescaled_xs, ys, 0.33)

    learning_rate = 0.01

    # pick a random starting point
    beta = [random.random() for _ in range(3)]

    with tqdm.trange(5000) as t:
        for epoch in t:
            gradient = negative_log_gradient(x_train, y_train, beta)
            beta = gradient_step(beta, gradient, -learning_rate)
            loss = negative_log_likelihood(x_train, y_train, beta)
            t.set_description(f"loss: {loss:.3f} beta: {beta}")

    from scratch.working_with_data import scale

    means, stdevs = scale(xs)
    beta_unscaled = [(beta[0] - beta[1] * means[1] / stdevs[1] -
                      beta[2] * means[2] / stdevs[2]), beta[1] / stdevs[1],
                     beta[2] / stdevs[2]]
    # [8.9, 1.6, -0.000288]

    assert (negative_log_likelihood(xs, ys,
                                    beta_unscaled) == negative_log_likelihood(
                                        rescaled_xs, ys, beta))

    true_positives = false_positives = true_negatives = false_negatives = 0

    for x_i, y_i in zip(x_test, y_test):
        prediction = logistic(dot(beta, x_i))

        if y_i == 1 and prediction >= 0.5:  # TP: paid and we predict paid
            true_positives += 1
        elif y_i == 1:  # FN: paid and we predict unpaid
            false_negatives += 1
        elif prediction >= 0.5:  # FP: unpaid and we predict paid
            false_positives += 1
        else:  # TN: unpaid and we predict unpaid
            true_negatives += 1

    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    print(precision, recall)

    assert precision == 0.75
    assert recall == 0.8

    plt.clf()
    plt.gca().clear()

    predictions = [logistic(dot(beta, x_i)) for x_i in x_test]
    plt.scatter(predictions, y_test, marker='+')
    plt.xlabel("predicted probability")
    plt.ylabel("actual outcome")
    plt.title("Logistic Regression Predicted vs. Actual")
    # plt.show()

    plt.savefig('im/logistic_regression_predicted_vs_actual.png')
    plt.gca().clear()
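
Besides the scratch imports shown (rescale, train_test_split, scale, gradient_step) and the module-level xs/ys dataset assumed to exist, this example calls logistic, negative_log_likelihood, negative_log_gradient, and dot, which are not reproduced here. A self-contained sketch consistent with how they are used, though the actual module may factor them differently:

import math
from typing import List

Vector = List[float]

def dot(v: Vector, w: Vector) -> float:
    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def logistic(x: float) -> float:
    return 1.0 / (1 + math.exp(-x))

def negative_log_likelihood(xs: List[Vector], ys: List[float], beta: Vector) -> float:
    """Negative log likelihood of the whole dataset under the logistic model."""
    total = 0.0
    for x, y in zip(xs, ys):
        p = logistic(dot(x, beta))
        total += -math.log(p) if y == 1 else -math.log(1 - p)
    return total

def negative_log_gradient(xs: List[Vector], ys: List[float], beta: Vector) -> Vector:
    """Gradient of the negative log likelihood with respect to beta, summed over the data."""
    grad = [0.0] * len(beta)
    for x, y in zip(xs, ys):
        err = logistic(dot(x, beta)) - y   # d(per-point loss)/d(dot(x, beta))
        for j in range(len(beta)):
            grad[j] += err * x[j]
    return grad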
def gradient_step(v: Vector, gradient: Vector, step_size: float) -> Vector:
    """Moves `step_size` in the `gradient` direction from `v`."""
    step = scalar_multiply(step_size, gradient)
    return add(v, step)


from scratch.gradient_descent import gradient_step
import random, tqdm

num_epochs = 10000
random.seed(0)
guess = [random.random(), random.random()]  # choose random value to start
learning_rate = 0.00001
with tqdm.trange(num_epochs) as t:
    for _ in t:
        alpha, beta = guess
        # Partial derivative of loss with respect to alpha
        grad_a = sum(2 * error(alpha, beta, x_i, y_i)
                     for x_i, y_i in zip(num_friends_good, daily_minutes_good))
        # Partial derivative of loss with respect to beta
        grad_b = sum(2 * error(alpha, beta, x_i, y_i) * x_i
                     for x_i, y_i in zip(num_friends_good, daily_minutes_good))
        # Compute loss to stick in the tqdm description
        loss = sum_of_sqerrors(alpha, beta, num_friends_good,
                               daily_minutes_good)
        t.set_description(f"loss: {loss:.3f}")
        # Finally, update the guess
        guess = gradient_step(guess, [grad_a, grad_b], -learning_rate)
# We should get pretty much the same results:
alpha, beta = guess
print(alpha)
print(beta)
def main():
    
    from matplotlib import pyplot as plt
    plt.close()
    plt.clf()
    plt.gca().clear()
    
    from matplotlib import pyplot as plt
    from scratch.working_with_data import rescale
    from scratch.multiple_regression import least_squares_fit, predict
    from scratch.gradient_descent import gradient_step
    
    learning_rate = 0.001
    rescaled_xs = rescale(xs)
    beta = least_squares_fit(rescaled_xs, ys, learning_rate, 1000, 1)
    # [0.26, 0.43, -0.43]
    predictions = [predict(x_i, beta) for x_i in rescaled_xs]
    
    plt.scatter(predictions, ys)
    plt.xlabel("predicted")
    plt.ylabel("actual")
    # plt.show()
    
    
    plt.savefig('im/linear_regression_for_probabilities.png')
    plt.close()
    
    from scratch.machine_learning import train_test_split
    import random
    import tqdm
    
    random.seed(0)
    x_train, x_test, y_train, y_test = train_test_split(rescaled_xs, ys, 0.33)
    
    learning_rate = 0.01
    
    # pick a random starting point
    beta = [random.random() for _ in range(3)]
    
    with tqdm.trange(5000) as t:
        for epoch in t:
            gradient = negative_log_gradient(x_train, y_train, beta)
            beta = gradient_step(beta, gradient, -learning_rate)
            loss = negative_log_likelihood(x_train, y_train, beta)
            t.set_description(f"loss: {loss:.3f} beta: {beta}")
    
    from scratch.working_with_data import scale
    
    means, stdevs = scale(xs)
    beta_unscaled = [(beta[0]
                      - beta[1] * means[1] / stdevs[1]
                      - beta[2] * means[2] / stdevs[2]),
                     beta[1] / stdevs[1],
                     beta[2] / stdevs[2]]
    # [8.9, 1.6, -0.000288]
    
    
    
    assert (negative_log_likelihood(xs, ys, beta_unscaled) ==
            negative_log_likelihood(rescaled_xs, ys, beta))
    
    true_positives = false_positives = true_negatives = false_negatives = 0
    
    for x_i, y_i in zip(x_test, y_test):
        prediction = logistic(dot(beta, x_i))
    
        if y_i == 1 and prediction >= 0.5:  # TP: paid and we predict paid
            true_positives += 1
        elif y_i == 1:                      # FN: paid and we predict unpaid
            false_negatives += 1
        elif prediction >= 0.5:             # FP: unpaid and we predict paid
            false_positives += 1
        else:                               # TN: unpaid and we predict unpaid
            true_negatives += 1
    
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    
    
    
    print(precision, recall)
    
    assert precision == 0.75
    assert recall == 0.8
    
    
    
    plt.clf()
    plt.gca().clear()
    
    predictions = [logistic(dot(beta, x_i)) for x_i in x_test]
    plt.scatter(predictions, y_test, marker='+')
    plt.xlabel("predicted probability")
    plt.ylabel("actual outcome")
    plt.title("Logistic Regression Predicted vs. Actual")
    # plt.show()
    
    
    
    plt.savefig('im/logistic_regression_predicted_vs_actual.png')
    plt.gca().clear()
Example no. 16
def main():

    from matplotlib import pyplot as plt
    plt.close()
    plt.clf()
    plt.gca().clear()

    from matplotlib import pyplot as plt
    from scratch.working_with_data import rescale
    from scratch.multiple_regression import least_squares_fit, predict
    from scratch.gradient_descent import gradient_step

    learning_rate = 0.001
    rescaled_xs = rescale(xs)
    beta = least_squares_fit(rescaled_xs, ys, learning_rate, 1000, 1)
    # [0.26, 0.43, -0.43]
    predictions = [predict(x_i, beta) for x_i in rescaled_xs]

    plt.scatter(predictions, ys)
    plt.xlabel("wartosc przewidywana")
    plt.ylabel("wartosc rzeczywista")
    # plt.show()

    plt.savefig('im/linear_regression_for_probabilities.png')
    plt.close()

    from scratch.machine_learning import train_test_split
    import random
    import tqdm

    random.seed(0)
    x_train, x_test, y_train, y_test = train_test_split(rescaled_xs, ys, 0.33)

    learning_rate = 0.01

    # Pick a random starting point.
    beta = [random.random() for _ in range(3)]

    with tqdm.trange(5000) as t:
        for epoch in t:
            gradient = negative_log_gradient(x_train, y_train, beta)
            beta = gradient_step(beta, gradient, -learning_rate)
            loss = negative_log_likelihood(x_train, y_train, beta)
            t.set_description(f"loss: {loss:.3f} beta: {beta}")

    from scratch.working_with_data import scale

    means, stdevs = scale(xs)
    beta_unscaled = [(beta[0] - beta[1] * means[1] / stdevs[1] -
                      beta[2] * means[2] / stdevs[2]), beta[1] / stdevs[1],
                     beta[2] / stdevs[2]]
    # [8.9, 1.6, -0.000288]

    assert (negative_log_likelihood(xs, ys,
                                    beta_unscaled) == negative_log_likelihood(
                                        rescaled_xs, ys, beta))

    true_positives = false_positives = true_negatives = false_negatives = 0

    for x_i, y_i in zip(x_test, y_test):
        prediction = logistic(dot(beta, x_i))

        if y_i == 1 and prediction >= 0.5:  # true positive: the user paid and we predicted it correctly
            true_positives += 1
        elif y_i == 1:                      # false negative: the user paid but we predicted unpaid
            false_negatives += 1
        elif prediction >= 0.5:             # false positive: the user did not pay but we predicted paid
            false_positives += 1
        else:                               # true negative: the user did not pay and we predicted unpaid
            true_negatives += 1

    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    print(precision, recall)

    assert precision == 0.75
    assert recall == 0.8

    plt.clf()
    plt.gca().clear()

    predictions = [logistic(dot(beta, x_i)) for x_i in x_test]
    plt.scatter(predictions, y_test, marker='+')
    plt.xlabel("przewidywane prawdopodobienstwo")
    plt.ylabel("wynik rzeczywisty")
    plt.title("Porownanie wartosci rzeczywistych i przewidywanych")
    plt.show()

    plt.savefig('im/logistic_regression_predicted_vs_actual.png')
    plt.gca().clear()
def main():
    import random
    random.seed(0)
    
    # training data
    xs = [[0., 0], [0., 1], [1., 0], [1., 1]]
    ys = [[0.], [1.], [1.], [0.]]
    
    # start with random weights
    network = [ # hidden layer: 2 inputs -> 2 outputs
                [[random.random() for _ in range(2 + 1)],   # 1st hidden neuron
                 [random.random() for _ in range(2 + 1)]],  # 2nd hidden neuron
                # output layer: 2 inputs -> 1 output
                [[random.random() for _ in range(2 + 1)]]   # 1st output neuron
              ]
    
    from scratch.gradient_descent import gradient_step
    import tqdm
    
    learning_rate = 1.0
    
    for epoch in tqdm.trange(20000, desc="neural net for xor"):
        for x, y in zip(xs, ys):
            gradients = sqerror_gradients(network, x, y)
    
            # Take a gradient step for each neuron in each layer
            network = [[gradient_step(neuron, grad, -learning_rate)
                        for neuron, grad in zip(layer, layer_grad)]
                       for layer, layer_grad in zip(network, gradients)]
    
    # check that it learned XOR
    assert feed_forward(network, [0, 0])[-1][0] < 0.01
    assert feed_forward(network, [0, 1])[-1][0] > 0.99
    assert feed_forward(network, [1, 0])[-1][0] > 0.99
    assert feed_forward(network, [1, 1])[-1][0] < 0.01
    
    xs = [binary_encode(n) for n in range(101, 1024)]
    ys = [fizz_buzz_encode(n) for n in range(101, 1024)]
    
    NUM_HIDDEN = 25
    
    network = [
        # hidden layer: 10 inputs -> NUM_HIDDEN outputs
        [[random.random() for _ in range(10 + 1)] for _ in range(NUM_HIDDEN)],
    
        # output_layer: NUM_HIDDEN inputs -> 4 outputs
        [[random.random() for _ in range(NUM_HIDDEN + 1)] for _ in range(4)]
    ]
    
    from scratch.linear_algebra import squared_distance
    
    learning_rate = 1.0
    
    with tqdm.trange(500) as t:
        for epoch in t:
            epoch_loss = 0.0
    
            for x, y in zip(xs, ys):
                predicted = feed_forward(network, x)[-1]
                epoch_loss += squared_distance(predicted, y)
                gradients = sqerror_gradients(network, x, y)
    
                # Take a gradient step for each neuron in each layer
                network = [[gradient_step(neuron, grad, -learning_rate)
                            for neuron, grad in zip(layer, layer_grad)]
                        for layer, layer_grad in zip(network, gradients)]
    
            t.set_description(f"fizz buzz (loss: {epoch_loss:.2f})")
    
    num_correct = 0
    
    for n in range(1, 101):
        x = binary_encode(n)
        predicted = argmax(feed_forward(network, x)[-1])
        actual = argmax(fizz_buzz_encode(n))
        labels = [str(n), "fizz", "buzz", "fizzbuzz"]
        print(n, labels[predicted], labels[actual])
    
        if predicted == actual:
            num_correct += 1
    
    print(num_correct, "/", 100)
def main():
    import random
    random.seed(0)
    
    # training data
    xs = [[0., 0], [0., 1], [1., 0], [1., 1]]
    ys = [[0.], [1.], [1.], [0.]]
    
    # start with random weights
    network = [ # hidden layer: 2 inputs -> 2 outputs
                [[random.random() for _ in range(2 + 1)],   # 1st hidden neuron
                 [random.random() for _ in range(2 + 1)]],  # 2nd hidden neuron
                # output layer: 2 inputs -> 1 output
                [[random.random() for _ in range(2 + 1)]]   # 1st output neuron
              ]
    
    from scratch.gradient_descent import gradient_step
    import tqdm
    
    learning_rate = 1.0
    
    for epoch in tqdm.trange(20000, desc="neural net for xor"):
        for x, y in zip(xs, ys):
            gradients = sqerror_gradients(network, x, y)
    
            # Take a gradient step for each neuron in each layer
            network = [[gradient_step(neuron, grad, -learning_rate)
                        for neuron, grad in zip(layer, layer_grad)]
                       for layer, layer_grad in zip(network, gradients)]
    
    # check that it learned XOR
    assert feed_forward(network, [0, 0])[-1][0] < 0.01
    assert feed_forward(network, [0, 1])[-1][0] > 0.99
    assert feed_forward(network, [1, 0])[-1][0] > 0.99
    assert feed_forward(network, [1, 1])[-1][0] < 0.01
    
    xs = [binary_encode(n) for n in range(101, 1024)]
    ys = [fizz_buzz_encode(n) for n in range(101, 1024)]
    
    NUM_HIDDEN = 25
    
    network = [
        # hidden layer: 10 inputs -> NUM_HIDDEN outputs
        [[random.random() for _ in range(10 + 1)] for _ in range(NUM_HIDDEN)],
    
        # output_layer: NUM_HIDDEN inputs -> 4 outputs
        [[random.random() for _ in range(NUM_HIDDEN + 1)] for _ in range(4)]
    ]
    
    from scratch.linear_algebra import squared_distance
    
    learning_rate = 1.0
    
    with tqdm.trange(500) as t:
        for epoch in t:
            epoch_loss = 0.0
    
            for x, y in zip(xs, ys):
                predicted = feed_forward(network, x)[-1]
                epoch_loss += squared_distance(predicted, y)
                gradients = sqerror_gradients(network, x, y)
    
                # Take a gradient step for each neuron in each layer
                network = [[gradient_step(neuron, grad, -learning_rate)
                            for neuron, grad in zip(layer, layer_grad)]
                        for layer, layer_grad in zip(network, gradients)]
    
            t.set_description(f"fizz buzz (loss: {epoch_loss:.2f})")
    
    num_correct = 0
    
    for n in range(1, 101):
        x = binary_encode(n)
        predicted = argmax(feed_forward(network, x)[-1])
        actual = argmax(fizz_buzz_encode(n))
        labels = [str(n), "fizz", "buzz", "fizzbuzz"]
        print(n, labels[predicted], labels[actual])
    
        if predicted == actual:
            num_correct += 1
    
    print(num_correct, "/", 100)