import random

import Ch4   # vector helpers used below: dot, distance, scalar_multiply, sum_of_squares, vector_subtract


def minimize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01):
    """Use stochastic gradient descent to find the theta that minimizes the summed target_fn."""

    data = list(zip(x, y))   # materialize, since we iterate over the pairs many times
    theta = theta_0
    alpha = alpha_0
    min_theta, min_value = None, float("inf")
    iterations_with_no_improvement = 0

    # If we ever go to 100 iterations with no improvement, stop
    while iterations_with_no_improvement < 100:
        value = sum(target_fn(x_i, y_i, theta) for x_i, y_i in data)

        if value < min_value:
            # if we've found a new minimum, remember it
            # and go back to the original step size
            min_theta, min_value = theta, value
            iterations_with_no_improvement = 0
            alpha = alpha_0
        else:
            # otherwise we're not improving, so try shrinking the step size
            iterations_with_no_improvement += 1
            alpha *= .9

        # and take a gradient step for each of the data points
        for x_i, y_i in in_random_order(data):
            gradient_i = gradient_fn(x_i, y_i, theta)
            theta = Ch4.vector_subtract(theta, Ch4.scalar_multiply(alpha, gradient_i))

    return min_theta
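
# minimize_stochastic above relies on an in_random_order helper that yields the data
# points in a shuffled order on each pass. A minimal sketch of such a helper (an
# assumption here, not necessarily the original definition):
def in_random_order(data):
    """Generator that returns the elements of data in random order."""
    indexes = [i for i, _ in enumerate(data)]   # make a list of indexes
    random.shuffle(indexes)                     # shuffle them
    for i in indexes:                           # yield the data in that order
        yield data[i]
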
def variance(x):
    """Assumes x has at least two elements"""
    n = len(x)
    deviations = de_mean(x)   # de_mean (defined elsewhere) subtracts the mean from each element
    return Ch4.sum_of_squares(deviations) / (n-1)
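
# Worked example (toy numbers, for illustration only): for x = [1, 2, 3, 4] the
# deviations are [-1.5, -0.5, 0.5, 1.5], the sum of squares is 5.0,
# and variance(x) = 5.0 / 3 ≈ 1.667.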

def step(v, direction, step_size):
    """Move step_size in the given direction from v."""
    return [v_i + step_size * direction_i
            for v_i, direction_i in zip(v, direction)]


def sum_of_squares_gradient(v):
    """Gradient of sum of squares: the i-th partial derivative of sum(v_i ** 2) is 2 * v_i."""
    return [2 * v_i for v_i in v]

if __name__ == "__main__":
    # pick a random starting point
    v = [random.randint(-10, 10) for _ in range(3)]

    tolerance = .00000001

    # repeatedly step against the gradient until v barely moves
    while True:
        gradient = sum_of_squares_gradient(v)
        next_v = step(v, gradient, -0.01)
        if Ch4.distance(next_v, v) < tolerance:
            break
        v = next_v
    print("Approx solutions: ", v)

# But what should the step size be?


# This is a safe apply function, just in case we try a bad step size
def safe(f):
    """Return a new function that is the same as f, except that
    it outputs infinity whenever f produces an error."""
    def safe_f(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except Exception:
            return float('inf')     # a bad step just looks infinitely bad
    return safe_f
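
# A minimal sketch (not part of the original script) of how safe might be used:
# wrap a function that can raise, and a bad input yields infinity instead of an error.
def _safe_example():
    import math
    safe_log = safe(math.log)       # math.log raises ValueError on non-positive inputs
    return safe_log(-1.0)           # returns float('inf') instead of raising
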
def covariance(x, y):
    """Assumes x and y have the same number of elements, at least two."""
    n = len(x)
    return Ch4.dot(de_mean(x), de_mean(y)) / (n - 1)
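
# Worked example (toy numbers, for illustration only): with x = [1, 2, 3] and y = [2, 4, 6],
# the de-meaned lists are [-1, 0, 1] and [-2, 0, 2], the dot product is 4,
# and covariance(x, y) = 4 / 2 = 2.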