예제 #1
0
def adam(grad, init_params, subopt=None, callback=None, break_cond=None,
         num_iters=100, step_size=0.001, b1=0.9, b2=0.999, eps=10**-8):
    """Minimize with Adam (http://arxiv.org/pdf/1412.6980.pdf).

    Args:
        grad: Gradient function. It is flattened with ``flatten_func`` and
            the flattened version is called as ``flattened_grad(x, i)``
            where ``i`` is the iteration index.
        init_params: Initial parameters (flattened by ``flatten_func``).
        subopt: Optional hook ``subopt(x, g, i)``; its return value replaces
            the flat parameter vector after every Adam update.
        callback: Optional ``callback(params, i, gradient)`` invoked every
            iteration before the update, with unflattened arguments.
        break_cond: Optional predicate ``break_cond(x, i, g)``; a truthy
            result ends the loop early.
        num_iters: Number of iterations.
        step_size: A scalar, or a per-iteration schedule whose length must
            equal ``num_iters``.
        b1: Decay rate of the first-moment average.
        b2: Decay rate of the second-moment average.
        eps: Constant added to the denominator of the update.

    Returns:
        The final parameters, unflattened.
    """
    flat_grad, unflatten, params = flatten_func(grad, init_params)

    # A scalar step size becomes a constant per-iteration schedule.
    if np.isscalar(step_size):
        step_size = np.ones(num_iters) * step_size
    assert len(step_size) == num_iters, "step schedule needs to match num iter"

    first_moment = np.zeros(len(params))
    second_moment = np.zeros(len(params))
    for it in range(num_iters):
        g = flat_grad(params, it)
        if callback:
            callback(unflatten(params), it, unflatten(g))

        # Exponential moving averages of the gradient and its square.
        first_moment = (1 - b1) * g + b1 * first_moment
        second_moment = (1 - b2) * (g**2) + b2 * second_moment

        # Bias correction compensates for the zero initialisation.
        step_no = it + 1
        m_corrected = first_moment / (1 - b1**step_no)
        v_corrected = second_moment / (1 - b2**step_no)
        params = params - step_size[it] * m_corrected / (np.sqrt(v_corrected) + eps)

        # Optional secondary optimisation applied to the fresh iterate.
        if subopt is not None:
            params = subopt(params, g, it)

        if break_cond is not None and break_cond(params, it, g):
            break

    return unflatten(params)
예제 #2
0
def gradient_descent_beta(g, w, alpha, max_its, beta, version):
    """Gradient descent with a momentum term and optional normalization.

    Args:
        g: Cost function; flattened with ``flatten_func``.
        w: Initial weights (flattened alongside ``g``).
        alpha: Steplength used for every step.
        max_its: Number of descent steps.
        beta: Weight given to the previous momentum direction.
        version: When equal to ``'normalized'`` the gradient is divided by
            its norm before being used.

    Returns:
        List of unflattened weights: the initial point followed by the
        weights after each step.
    """
    flat_cost, unflatten, w = flatten_func(g, w)
    grad_fn = compute_grad(flat_cost)

    # history starts with the initial point
    history = [unflatten(w)]

    # accumulated descent direction (momentum)
    momentum = np.zeros((np.shape(w)))

    for _ in range(max_its):
        direction = grad_fn(w)
        direction.shape = np.shape(w)

        if version == 'normalized':
            length = np.linalg.norm(direction)
            if length == 0:
                # nudge the norm off zero so the division below is defined
                length += 10**-6 * np.sign(2 * np.random.rand(1) - 1)
            direction /= length

        # blend the new direction into the momentum, then step along it
        momentum = beta * momentum + direction
        w = w - alpha * momentum

        history.append(unflatten(w))

    return history
예제 #3
0
def sgd(grad, init_params, subopt=None, callback=None,
        break_cond=None, num_iters=200, step_size=0.1, mass=0.9):
    """Stochastic gradient descent with momentum.

    Args:
        grad: Gradient function; its flattened form is called as
            ``flattened_grad(x, i)`` with the iteration index ``i``.
        init_params: Initial parameters (flattened by ``flatten_func``).
        subopt: Optional hook ``subopt(x, g, i)``; its return value replaces
            the flat parameter vector after every update.
        callback: Optional ``callback(params, i, gradient)`` invoked every
            iteration before the update, with unflattened arguments.
        break_cond: Optional predicate ``break_cond(x, i, g)``; a truthy
            result ends the loop early.
        num_iters: Number of iterations.
        step_size: A scalar, or a per-iteration schedule whose length must
            equal ``num_iters``.
        mass: Weight kept from the previous velocity.

    Returns:
        The final parameters, unflattened.
    """
    flat_grad, unflatten, params = flatten_func(grad, init_params)

    # A scalar step size becomes a constant per-iteration schedule.
    if np.isscalar(step_size):
        step_size = np.ones(num_iters) * step_size
    assert len(step_size) == num_iters, "step schedule needs to match num iter"

    vel = np.zeros(len(params))
    for it in range(num_iters):
        g = flat_grad(params, it)
        if callback:
            callback(unflatten(params), it, unflatten(g))

        # velocity is a running blend of past velocity and the negative gradient
        vel = mass * vel - (1.0 - mass) * g
        params = params + step_size[it] * vel

        if subopt is not None:
            params = subopt(params, g, it)

        if break_cond is not None and break_cond(params, it, g):
            break

    return unflatten(params)
예제 #4
0
def gradient_descent(g, w, alpha, max_its, beta):
    """Gradient descent with a momentum term.

    Args:
        g: Cost function; flattened with ``flatten_func``.
        w: Initial weights (flattened alongside ``g``).
        alpha: Steplength used for every step.
        max_its: Number of descent steps.
        beta: Weight given to the previous momentum direction.

    Returns:
        List of unflattened weights: the initial point followed by the
        weights after each step.
    """
    flat_cost, unflatten, w = flatten_func(g, w)
    grad_fn = compute_grad(flat_cost)

    # history starts with the initial point
    history = [unflatten(w)]

    # accumulated descent direction (momentum)
    momentum = np.zeros(np.shape(w))

    for _ in range(max_its):
        direction = grad_fn(w)
        direction.shape = np.shape(w)

        # blend the new gradient into the momentum, then step along it
        momentum = beta * momentum + direction
        w = w - alpha * momentum

        history.append(unflatten(w))

    return history
예제 #5
0
def gradient_descent(g, alpha, max_its, w, num_pts, batch_size, **kwargs):
    """Mini-batch gradient descent.

    Args:
        g: Cost function; its flattened form is evaluated as
            ``g_flat(w, batch_inds)``.
        alpha: Steplength used for every step.
        max_its: Number of passes (epochs) over the data.
        w: Initial weights (flattened alongside ``g``).
        num_pts: Total number of data points being indexed.
        batch_size: Number of indices per mini-batch; the last batch may be
            shorter.
        **kwargs: Accepted but not read by this function.

    Returns:
        List of unflattened weights: the initial point followed by the
        weights at the end of each epoch.
    """
    flat_cost, unflatten, w = flatten_func(g, w)
    cost_and_grad = value_and_grad(flat_cost)

    history = [unflatten(w)]

    # number of mini-batches needed to cover all points once
    num_batches = int(np.ceil(np.divide(num_pts, batch_size)))

    for _ in range(max_its):
        for batch_no in range(num_batches):
            # contiguous index range of this batch, clipped at num_pts
            first = batch_no * batch_size
            last = min((batch_no + 1) * batch_size, num_pts)
            batch_inds = np.arange(first, last)

            _, grad_eval = cost_and_grad(w, batch_inds)
            grad_eval.shape = np.shape(w)

            # plain gradient step on this batch
            w = w - alpha * grad_eval

        # one history entry per epoch
        history.append(unflatten(w))

    return history
    def newtons_method(self, g, w, **kwargs):
        """Run a regularized Newton's method and return the weight history.

        Stores ``g``, the gradient function, the Hessian function and the
        regularization constant on ``self`` (``self.g``, ``self.grad``,
        ``self.hess``, ``self.epsilon``).

        Args:
            g: Cost function; flattened with ``flatten_func``.
            w: Initial weights (flattened alongside ``g``).
            **kwargs: Optional ``max_its`` (default 20), ``epsilon``
                (default ``10**(-5)``) and ``verbose`` (default False).

        Returns:
            List of unflattened weights, starting with the initial point.
            The loop returns early, without recording the offending step,
            if after the first three iterations the cost increases.
        """
        self.g = g

        # work with a flattened cost so the loop deals with plain vectors
        flat_cost, unflatten, w = flatten_func(self.g, w)
        self.grad = compute_grad(flat_cost)
        self.hess = compute_hess(flat_cost)

        # optional arguments
        max_its = kwargs.get('max_its', 20)
        self.epsilon = kwargs.get('epsilon', 10**(-5))
        verbose = kwargs.get('verbose', False)

        history = [unflatten(w)]

        if verbose == True:
            print('starting optimization...')

        prev_cost = flat_cost(w)
        for step in range(max_its):
            grad_val = self.grad(w)
            hess_val = self.hess(w)
            hess_val.shape = (np.size(w), np.size(w))

            # Newton step using the pseudo-inverse of the regularized Hessian
            regularized = hess_val + self.epsilon * np.eye(np.size(w))
            w = w - np.dot(np.linalg.pinv(regularized), grad_val)

            # bail out when the cost goes up after the first few steps
            new_cost = flat_cost(w)
            if step > 2 and new_cost > prev_cost:
                print('singular system reached')
                time.sleep(1.5)
                clear_output()
                return history
            prev_cost = new_cost

            history.append(unflatten(w))

        if verbose == True:
            print('...optimization complete!')
            time.sleep(1.5)
            clear_output()

        return history
    def gradient_descent(self, g, alpha_choice, max_its, w, v):
        """Gradient descent recording the weight and cost histories.

        Args:
            g: Cost function; its flattened form is evaluated as
                ``g_flat(w, v)``.
            alpha_choice: Either the string ``'diminishing'`` (steplength
                ``1/k`` at iteration ``k``) or a fixed steplength value.
            max_its: Number of descent steps.
            w: Initial weights (flattened alongside ``g``).
            v: Extra argument passed through unchanged to the cost.

        Returns:
            ``(w_hist, train_hist)``: the unflattened weights and the cost
            at those weights, for the initial point and after each step.
        """
        flat_cost, unflatten, w = flatten_func(g, w)
        cost_and_grad = value_and_grad(flat_cost)

        w_hist = [unflatten(w)]
        train_hist = [flat_cost(w, v)]

        for k in range(1, max_its + 1):
            # progress indicator overwritten in place on the same line
            print('iteration: ', k, end = "\r")

            if alpha_choice == 'diminishing':
                alpha = 1/float(k)
            else:
                alpha = alpha_choice

            _, grad_eval = cost_and_grad(w, v)
            grad_eval.shape = np.shape(w)

            w = w - alpha*grad_eval

            w_hist.append(unflatten(w))
            train_hist.append(flat_cost(w, v))

        return w_hist, train_hist
예제 #8
0
파일: recnn.py 프로젝트: YohannFaure/recnn
def adam(grad,
         init_params,
         callback=None,
         num_iters=100,
         step_size=0.001,
         b1=0.9,
         b2=0.999,
         eps=10**-8):
    """Minimize with Adam using a fixed step size.

    Args:
        grad: Gradient function; its flattened form is called as
            ``flattened_grad(x, i)`` with the iteration index ``i``.
        init_params: Initial parameters (flattened by ``flatten_func``).
        callback: Optional ``callback(params, i, gradient)`` invoked every
            iteration before the update, with unflattened arguments.
        num_iters: Number of iterations.
        step_size: Step size applied to every update.
        b1: Decay rate of the first-moment average.
        b2: Decay rate of the second-moment average.
        eps: Constant added to the denominator of the update.

    Returns:
        The final parameters, unflattened.
    """
    flat_grad, unflatten, params = flatten_func(grad, init_params)

    first_moment = np.zeros(len(params))
    second_moment = np.zeros(len(params))

    for it in range(num_iters):
        g = flat_grad(params, it)

        if callback:
            callback(unflatten(params), it, unflatten(g))

        # Exponential moving averages of the gradient and its square.
        first_moment = (1 - b1) * g + b1 * first_moment
        second_moment = (1 - b2) * (g**2) + b2 * second_moment

        # Bias correction compensates for the zero initialisation.
        step_no = it + 1
        m_corrected = first_moment / (1 - b1**step_no)
        v_corrected = second_moment / (1 - b2**step_no)

        params = params - step_size * m_corrected / (np.sqrt(v_corrected) + eps)

    return unflatten(params)
예제 #9
0
def newtons_method(g, max_its, w, num_pts, batch_size, **kwargs):
    """Mini-batch Newton's method recording weight and cost histories.

    Args:
        g: Cost function; its flattened form is evaluated as
            ``g_flat(w, inds)``.
        max_its: Number of passes (epochs) over the data.
        w: Initial weights (flattened alongside ``g``).
        num_pts: Total number of data points being indexed.
        batch_size: Number of indices per mini-batch; the last batch may be
            shorter.
        **kwargs: Optional ``epsilon`` (default ``10**(-7)``), the multiple
            of the identity added to the Hessian.

    Returns:
        ``(w_hist, cost_hist)``: unflattened weights and the cost over all
        ``num_pts`` indices, for the initial point and after each epoch.
    """
    # flatten the cost in case it takes matrices of weights
    g_flat, unflatten, w = flatten_func(g, w)

    cost_and_grad = value_and_grad(g_flat)
    hess_fn = hessian(g_flat)

    # regularization constant added to the Hessian diagonal
    epsilon = kwargs.get('epsilon', 10**(-7))

    w_hist = [unflatten(w)]
    cost_hist = [g_flat(w, np.arange(num_pts))]

    # number of mini-batches needed to cover all points once
    num_batches = int(np.ceil(np.divide(num_pts, batch_size)))

    for _ in range(max_its):
        for batch_no in range(num_batches):
            # contiguous index range of this batch, clipped at num_pts
            first = batch_no * batch_size
            last = min((batch_no + 1) * batch_size, num_pts)
            batch_inds = np.arange(first, last)

            _, grad_eval = cost_and_grad(w, batch_inds)
            hess_eval = hess_fn(w, batch_inds)

            # force the Hessian into a square 2-D matrix for numpy.linalg
            side = int((np.size(hess_eval))**(0.5))
            hess_eval.shape = (side, side)

            # Solve A w_new = A w - grad in the least-squares sense, i.e. a
            # Newton step without forming an explicit inverse.
            A = hess_eval + (epsilon) * np.eye(np.size(w))
            rhs = np.dot(A, w) - grad_eval
            w = np.linalg.lstsq(A, rhs)[0]

        # one history entry per epoch
        w_hist.append(unflatten(w))
        cost_hist.append(g_flat(w, np.arange(num_pts)))

    return w_hist, cost_hist
def gradient_descent(g, w, x_train, x_val, alpha, max_its, batch_size,
                     **kwargs):
    """Mini-batch gradient descent tracking training and validation cost.

    Args:
        g: Cost function; its flattened form is evaluated as
            ``g_flat(w, x, inds)``.
        w: Initial weights (flattened alongside ``g``).
        x_train: Training data; ``x_train.shape[1]`` is taken as the number
            of training points.
        x_val: Validation data; ``x_val.shape[1]`` is taken as the number
            of validation points.
        alpha: Steplength used for every step.
        max_its: Number of passes (epochs) over the training data.
        batch_size: Number of indices per mini-batch; the last batch may be
            shorter.
        **kwargs: Optional ``verbose`` (default True) to print a summary
            line per epoch.

    Returns:
        ``(w_hist, train_hist, val_hist)``: unflattened weights, full
        training cost and full validation cost, for the initial point and
        after each epoch.
    """
    verbose = kwargs.get('verbose', True)

    flat_cost, unflatten, w = flatten_func(g, w)
    cost_and_grad = value_and_grad(flat_cost)

    num_train = x_train.shape[1]
    num_val = x_val.shape[1]

    # histories start with the initial point
    w_hist = [unflatten(w)]
    train_hist = [flat_cost(w, x_train, np.arange(num_train))]
    val_hist = [flat_cost(w, x_val, np.arange(num_val))]

    # number of mini-batches needed to cover the training set once
    num_batches = int(np.ceil(np.divide(num_train, batch_size)))

    for k in range(max_its):
        start = timer()
        for batch_no in range(num_batches):
            # contiguous index range of this batch, clipped at num_train
            first = batch_no * batch_size
            last = min((batch_no + 1) * batch_size, num_train)
            batch_inds = np.arange(first, last)

            _, grad_eval = cost_and_grad(w, x_train, batch_inds)
            grad_eval.shape = np.shape(w)

            # plain gradient step on this batch
            w = w - alpha * grad_eval
        end = timer()

        # costs over the full training / validation sets at the new weights
        train_cost = flat_cost(w, x_train, np.arange(num_train))
        val_cost = flat_cost(w, x_val, np.arange(num_val))

        w_hist.append(unflatten(w))
        train_hist.append(train_cost)
        val_hist.append(val_cost)

        if verbose == True:
            elapsed = str(np.round(end - start, 1))
            train_txt = str(np.round(train_hist[-1][0], 4))
            val_txt = str(np.round(val_hist[-1][0], 4))
            print('step ' + str(k + 1) + ' done in ' + elapsed +
                  ' secs, train cost = ' + train_txt +
                  ', val cost = ' + val_txt)

    if verbose == True:
        print('finished all ' + str(max_its) + ' steps')
    return w_hist, train_hist, val_hist
예제 #11
0
def newtons_method(g, max_its, w, num_pts, batch_size, **kwargs):
    """Mini-batch Newton's method returning the weight history.

    Args:
        g: Cost function; its flattened form is evaluated as
            ``flat_g(w, batch_inds)``.
        max_its: Number of passes (epochs) over the data.
        w: Initial weights (flattened alongside ``g``).
        num_pts: Total number of data points being indexed.
        batch_size: Number of indices per mini-batch; the last batch may be
            shorter.
        **kwargs: Optional ``epsilon`` (default ``10**(-7)``), the multiple
            of the identity added to the Hessian.

    Returns:
        List of unflattened weights: the initial point followed by the
        weights at the end of each epoch (``max_its + 1`` entries). The
        final weights are recorded once, by the last epoch.
    """
    # flatten the cost in case it takes matrices of weights
    flat_g, unflatten, w = flatten_func(g, w)

    # value_and_grad returns the cost value together with the gradient
    gradient = value_and_grad(flat_g)
    hess = hessian(flat_g)

    # regularization constant added to the Hessian diagonal
    epsilon = kwargs.get('epsilon', 10**(-7))

    # history starts with the initial point
    w_hist = [unflatten(w)]

    # number of mini-batches needed to cover all points once
    num_batches = int(np.ceil(np.divide(num_pts, batch_size)))

    for _ in range(max_its):
        for batch_no in range(num_batches):
            # contiguous index range of this batch, clipped at num_pts
            batch_inds = np.arange(batch_no * batch_size,
                                   min((batch_no + 1) * batch_size, num_pts))

            _, grad_eval = gradient(w, batch_inds)
            hess_eval = hess(w, batch_inds)

            # force the Hessian into a square 2-D matrix for numpy.linalg
            side = int((np.size(hess_eval))**(0.5))
            hess_eval.shape = (side, side)

            # Solve A w_new = A w - grad in the least-squares sense, i.e. a
            # Newton step without forming an explicit inverse. The right-hand
            # side gets its own name so it no longer shadows the batch index.
            A = hess_eval + epsilon * np.eye(np.size(w))
            rhs = np.dot(A, w) - grad_eval
            w = np.linalg.lstsq(A, rhs)[0]

        # one history entry per epoch; this already captures the final weights
        w_hist.append(unflatten(w))

    return w_hist
예제 #12
0
def gradient_descent(g, alpha_choice, max_its, w, version, beta):
    """Gradient descent with momentum and optional gradient normalization.

    Each iteration records the current weights together with the cost
    evaluated at those weights, then takes exactly one momentum step.

    Args:
        g: Cost function; flattened with ``flatten_func``.
        alpha_choice: Either the string ``'diminishing'`` (steplength
            ``1/k`` at iteration ``k``, counted from 1) or a fixed
            steplength value.
        max_its: Number of descent steps.
        w: Initial weights (flattened alongside ``g``).
        version: When equal to ``'normalized'`` the gradient is divided by
            its norm before being used.
        beta: Weight given to the previous momentum direction.

    Returns:
        ``(weight_history, cost_history)``: unflattened weights and the cost
        at those weights, for the point before each step plus the final
        point (``max_its + 1`` entries each).
    """
    # flatten the cost to deal more easily with layered parameters; the
    # returned 'w' is flattened as well
    g_flat, unflatten, w = flatten_func(g, w)

    # value_and_grad returns the cost value together with the gradient
    gradient = value_and_grad(g_flat)

    weight_history = []  # container for weight history
    cost_history = []  # container for corresponding cost function history

    # accumulated descent direction (momentum)
    z = np.zeros(np.shape(w))

    for k in range(1, max_its + 1):
        # check if diminishing steplength rule used
        if alpha_choice == 'diminishing':
            alpha = 1 / float(k)
        else:
            alpha = alpha_choice

        # cost_eval is the cost at the current w, so record both together
        # before moving; this keeps the two histories aligned
        cost_eval, grad_eval = gradient(w)
        weight_history.append(unflatten(w))
        cost_history.append(cost_eval)

        if version == 'normalized':
            grad_norm = np.linalg.norm(grad_eval)
            if grad_norm == 0:
                # zero gradient: move in a random sign direction instead
                grad_eval = 10**-6 * np.sign(2 * np.random.rand(len(w)) - 1)
                grad_norm = np.linalg.norm(grad_eval)
            grad_eval /= grad_norm

        # single descent step along the momentum direction (the gradient is
        # already folded into z, so no additional plain gradient step)
        z = beta * z + grad_eval
        w = w - alpha * z

    # collect final weights and their cost; the cost comes from g itself
    # because no gradient is evaluated at the final point
    weight_history.append(unflatten(w))
    cost_history.append(g_flat(w))
    return weight_history, cost_history
def gradient_descent(g, w, a_train, s_train, alpha, max_its, verbose):
    """Full-batch gradient descent for system identification training.

    Args:
        g: Function to minimize; its flattened form is evaluated as
            ``g_flat(w, a_train, s_train)``.
        w: Initial weights (flattened alongside ``g``).
        a_train: Training action sequence, passed through to ``g``.
        s_train: Training state sequence, passed through to ``g``.
        alpha: Steplength / learning rate used for every step.
        max_its: Number of iterations to perform.
        verbose: Print an update after each step if ``verbose == True``.

    Returns:
        ``(w_hist, train_hist)``: unflattened weights and the training cost
        at those weights, for the initial point and after each step.
    """
    # flatten the input function, create gradient based on flat function
    g_flat, unflatten, w = flatten_func(g, w)
    grad = value_and_grad(g_flat)

    # histories start with the initial point
    w_hist = [unflatten(w)]
    train_hist = [g_flat(w, a_train, s_train)]

    for k in range(1, max_its + 1):
        # time a single descent step
        start = timer()

        _, grad_eval = grad(w, a_train, s_train)
        grad_eval.shape = np.shape(w)

        # plain gradient step
        w = w - alpha * grad_eval

        end = timer()

        # training cost at the updated weights
        train_cost = g_flat(w, a_train, s_train)

        w_hist.append(unflatten(w))
        train_hist.append(train_cost)

        if verbose == True:
            # k already counts from 1, so it is the step number as-is
            print('step ' + str(k) + ' done in ' +
                  str(np.round(end - start, 1)) + ' secs, train cost = ' +
                  str(np.round(train_hist[-1], 4)[0]))

    if verbose == True:
        print('finished all ' + str(max_its) + ' steps')
    return w_hist, train_hist
예제 #14
0
def rmsprop(grad, init_params, callback=None, num_iters=100,
            step_size=0.1, gamma=0.9, eps=10**-8):
    """Root mean squared prop: see the Adagrad paper for details.

    Args:
        grad: Gradient function; its flattened form is called as
            ``flattened_grad(x, i)`` with the iteration index ``i``.
        init_params: Initial parameters (flattened by ``flatten_func``).
        callback: Optional ``callback(params, i, gradient)`` invoked every
            iteration before the update, with unflattened arguments.
        num_iters: Number of iterations.
        step_size: Step size applied to every update.
        gamma: Weight kept from the previous squared-gradient average.
        eps: Constant added to the denominator of the update.

    Returns:
        The final parameters, unflattened.
    """
    flat_grad, unflatten, params = flatten_func(grad, init_params)

    # running average of squared gradients, initialised to ones
    mean_sq = np.ones(len(params))
    for it in range(num_iters):
        g = flat_grad(params, it)
        if callback:
            callback(unflatten(params), it, unflatten(g))

        mean_sq = mean_sq * gamma + g**2 * (1 - gamma)
        params = params - step_size * g/(np.sqrt(mean_sq) + eps)

    return unflatten(params)
    def RMSprop(self, g, w, x_train, y_train, lam, alpha, max_its, batch_size,
                **kwargs):
        """Mini-batch RMSprop recording weight and training-cost histories.

        Args:
            g: Cost function; its flattened form is evaluated as
                ``g_flat(w, x_train, y_train, lam, inds)``.
            w: Initial weights (flattened alongside ``g``).
            x_train: Training inputs, passed through to ``g``.
            y_train: Training outputs; ``y_train.size`` is taken as the
                number of training points.
            lam: Extra argument passed through unchanged to ``g``.
            alpha: Steplength used for every step.
            max_its: Number of passes (epochs) over the training data.
            batch_size: Number of indices per mini-batch; the last batch
                may be shorter.
            **kwargs: Optional ``gamma`` (default 0.9) and ``eps``
                (default ``10**-8``).

        Returns:
            ``(w_hist, train_hist)``: unflattened weights and the full
            training cost, for the initial point and after each epoch.
        """
        # rmsprop hyper-parameters
        gamma = kwargs.get('gamma', 0.9)
        eps = kwargs.get('eps', 10**-8)

        flat_cost, unflatten, w = flatten_func(g, w)
        cost_and_grad = value_and_grad(flat_cost)

        # running average of squared gradients, initialised to ones
        mean_sq = np.ones(np.size(w))

        num_train = y_train.size

        # histories start with the initial point
        w_hist = [unflatten(w)]
        train_hist = [flat_cost(w, x_train, y_train, lam, np.arange(num_train))]

        # number of mini-batches needed to cover the training set once
        num_batches = int(np.ceil(np.divide(num_train, batch_size)))

        for _ in range(max_its):
            for batch_no in range(num_batches):
                # contiguous index range of this batch, clipped at num_train
                first = batch_no * batch_size
                last = min((batch_no + 1) * batch_size, num_train)
                batch_inds = np.arange(first, last)

                _, grad_eval = cost_and_grad(w, x_train, y_train, lam,
                                             batch_inds)
                grad_eval.shape = np.shape(w)

                # update the exponential average of squared gradients
                mean_sq = gamma * mean_sq + (1 - gamma) * grad_eval**2

                # gradient step scaled by the root of that average
                w = w - alpha * grad_eval / (mean_sq**(0.5) + eps)

            # full training cost at the end of the epoch
            epoch_cost = flat_cost(w, x_train, y_train, lam,
                                   np.arange(num_train))

            w_hist.append(unflatten(w))
            train_hist.append(epoch_cost)

        return w_hist, train_hist
예제 #16
0
def gradient_descent(g, alpha, max_its, w, num_pts, batch_size, **kwargs):
    """Mini-batch gradient descent with optional sign normalization.

    Args:
        g: Cost function; its flattened form is evaluated as
            ``g_flat(w, batch_inds)``.
        alpha: Steplength used for every step.
        max_its: Number of passes (epochs) over the data.
        w: Initial weights (flattened alongside ``g``).
        num_pts: Total number of data points being indexed.
        batch_size: Number of indices per mini-batch; the last batch may be
            shorter.
        **kwargs: Optional ``normalize`` (default False); when it equals
            True each gradient is replaced by its element-wise sign.
            ``beta`` is also accepted but has no effect on the result,
            because no momentum step is applied.

    Returns:
        List of unflattened weights: the initial point followed by the
        weights at the end of each epoch.
    """
    normalize = kwargs.get('normalize', False)

    flat_cost, unflatten, w = flatten_func(g, w)
    cost_and_grad = value_and_grad(flat_cost)

    # history starts with the initial point
    history = [unflatten(w)]

    # number of mini-batches needed to cover all points once
    num_batches = int(np.ceil(np.divide(num_pts, batch_size)))

    for _ in range(max_its):
        for batch_no in range(num_batches):
            # contiguous index range of this batch, clipped at num_pts
            first = batch_no * batch_size
            last = min((batch_no + 1) * batch_size, num_pts)
            batch_inds = np.arange(first, last)

            _, grad_eval = cost_and_grad(w, batch_inds)
            grad_eval.shape = np.shape(w)

            # keep only the direction of each component when requested
            if normalize == True:
                grad_eval = np.sign(grad_eval)

            # plain gradient step on this batch
            w = w - alpha * grad_eval

        # one history entry per epoch
        history.append(unflatten(w))

    return history
예제 #17
0
def newtons_method(g, max_its, w, **kwargs):
    """Regularized Newton's method recording weight and cost histories.

    Args:
        g: Cost function; flattened with ``flatten_func``.
        max_its: Number of Newton steps.
        w: Initial weights (flattened alongside ``g``).
        **kwargs: Optional ``epsilon`` (default ``10**(-7)``), the multiple
            of the identity added to the Hessian.

    Returns:
        ``(weight_history, cost_history)``: unflattened weights and the
        cost at those weights, for the point before each step plus the
        final point (``max_its + 1`` entries each).
    """
    # flatten the cost in case it takes matrices of weights
    flat_cost, unflatten, w = flatten_func(g, w)

    # value_and_grad returns the cost value together with the gradient
    cost_and_grad = value_and_grad(flat_cost)
    hess_fn = hessian(flat_cost)

    # regularization constant added to the Hessian diagonal
    epsilon = kwargs.get('epsilon', 10**(-7))

    weight_history = []
    cost_history = []
    for _ in range(max_its):
        # record the current point and its cost before stepping
        cost_eval, grad_eval = cost_and_grad(w)
        weight_history.append(unflatten(w))
        cost_history.append(cost_eval)

        hess_eval = hess_fn(w)

        # force the Hessian into a square 2-D matrix for numpy.linalg
        side = int((np.size(hess_eval))**(0.5))
        hess_eval.shape = (side, side)

        # Solve A w_new = A w - grad in the least-squares sense, i.e. a
        # Newton step without forming an explicit inverse.
        A = hess_eval + epsilon * np.eye(np.size(w))
        rhs = np.dot(A, w) - grad_eval
        w = np.linalg.lstsq(A, rhs)[0]

    # final point: the cost comes from the flattened cost itself because no
    # gradient is evaluated there
    weight_history.append(unflatten(w))
    cost_history.append(flat_cost(w))

    return weight_history, cost_history
예제 #18
0
def minibatch_gradient_descent(g, alpha_choice, max_its, w, batch_size,
                               num_pts):
    """Mini-batch gradient descent recording weight and cost histories.

    Args:
        g: Cost function; its flattened form is evaluated as
            ``g_flat(w, inds)``.
        alpha_choice: Either the string ``'diminishing'`` (steplength
            ``1/(k+1)`` during zero-based epoch ``k``, i.e. 1, 1/2, 1/3, ...)
            or a fixed steplength value.
        max_its: Number of passes (epochs) over the data.
        w: Initial weights (flattened alongside ``g``).
        batch_size: Number of indices per mini-batch; the last batch may be
            shorter.
        num_pts: Total number of data points being indexed.

    Returns:
        ``(weight_history, cost_history)``: unflattened weights and the
        cost over all ``num_pts`` indices, for the initial point and after
        each epoch.
    """
    # flatten the input function, create gradient based on flat function
    g_flat, unflatten, w = flatten_func(g, w)

    # value_and_grad returns the cost value together with the gradient
    gradient = value_and_grad(g_flat)

    # histories start with the initial point
    weight_history = [unflatten(w)]
    cost_history = [g_flat(w, np.arange(num_pts))]

    # number of mini-batches needed to cover all points once
    num_batches = int(np.ceil(np.divide(num_pts, batch_size)))

    for k in range(max_its):
        # check if diminishing steplength rule used
        if alpha_choice == 'diminishing':
            # k is zero-based here; k + 1 avoids dividing by zero on the
            # first epoch
            alpha = 1 / float(k + 1)
        else:
            alpha = alpha_choice

        for batch_no in range(num_batches):
            # contiguous index range of this batch, clipped at num_pts
            batch_inds = np.arange(batch_no * batch_size,
                                   min((batch_no + 1) * batch_size, num_pts))

            _, grad_eval = gradient(w, batch_inds)
            grad_eval.shape = np.shape(w)

            # plain gradient step on this batch
            w = w - alpha * grad_eval

        # one history entry per epoch
        weight_history.append(unflatten(w))
        cost_history.append(g_flat(w, np.arange(num_pts)))

    return weight_history, cost_history
예제 #19
0
def gradient_descent(g, w_unflat, alpha_choice, max_its, version, **kwargs):
    """Gradient descent that differentiates the unflattened cost.

    The gradient is taken with respect to the unflattened weights and then
    flattened, while the update itself is applied to the flat weight vector.

    Args:
        g: Cost function of the unflattened weights.
        w_unflat: Initial (unflattened) weights.
        alpha_choice: Either the string ``'diminishing'`` (steplength
            ``1/(k+1)`` at zero-based iteration ``k``, i.e. 1, 1/2, 1/3, ...)
            or a fixed steplength value.
        max_its: Number of descent steps.
        version: When equal to ``'normalized'`` the gradient is divided by
            its norm before being used.
        **kwargs: Optional ``verbose`` (default False) to print progress
            every fifth iteration and on completion.

    Returns:
        List of unflattened weights: the initial point followed by the
        weights after each step.
    """
    verbose = kwargs.get('verbose', False)

    # only the flat weights and the unflatten function are needed here; the
    # gradient is built from the original, unflattened cost
    g_flat, unflatten, w = flatten_func(g, w_unflat)
    grad = compute_grad(g)

    # history starts with the initial point
    w_hist = [w_unflat]

    for k in range(max_its):
        if verbose == True:
            if np.mod(k, 5) == 0:
                print('started iteration ' + str(k) + ' of ' + str(max_its))

        # check if diminishing steplength rule used
        if alpha_choice == 'diminishing':
            # k is zero-based here; k + 1 avoids dividing by zero on the
            # first iteration
            alpha = 1 / float(k + 1)
        else:
            alpha = alpha_choice

        # gradient w.r.t. the unflattened weights, flattened to a vector
        grad_eval = grad(w_unflat)
        grad_eval, _ = flatten(grad_eval)

        ### normalized or unnormalized descent step? ###
        if version == 'normalized':
            grad_norm = np.linalg.norm(grad_eval)
            if grad_norm == 0:
                # nudge the norm off zero so the division below is defined
                grad_norm += 10**-6 * np.sign(2 * np.random.rand(1) - 1)
            grad_eval /= grad_norm

        # take descent step
        w = w - alpha * grad_eval

        # record weight update
        w_unflat = unflatten(w)
        w_hist.append(w_unflat)

    if verbose == True:
        print('finished all ' + str(max_its) + ' iterations')

    return w_hist
예제 #20
0
파일: JazNet.py 프로젝트: albertqu/EMPJ
            def myadam(grad,
                       init_params,
                       callback=None,
                       num_iters=100,
                       step_sizes=0.001,
                       b1=0.9,
                       b2=0.999,
                       eps=10**-8,
                       gnorm_max=np.inf,
                       last_m=None,
                       last_v=None,
                       last_i=0,
                       lossfun=None,
                       printstuff=0):
                """Adam (http://arxiv.org/pdf/1412.6980.pdf) with gradient
                clipping and resumable state.

                Args:
                    grad: Gradient function; its flattened form is called as
                        ``flattened_grad(x, i)`` with the iteration index.
                    init_params: Initial parameters (flattened by
                        ``flatten_func``).
                    callback: Optional callable invoked every iteration as
                        ``callback(params, i, gradient, lossfun=lossfun)``
                        with unflattened params and (clipped) gradient.
                    num_iters: Number of iterations.
                    step_sizes: Any scalar accepted by ``np.isscalar``
                        (Python or NumPy number), or a per-iteration
                        schedule whose length must equal ``num_iters``.
                    b1: Decay rate of the first-moment average.
                    b2: Decay rate of the second-moment average.
                    eps: Constant added to the denominator of the update.
                    gnorm_max: Gradients whose norm exceeds this value are
                        rescaled to have exactly this norm.
                    last_m: First-moment state to resume from (zeros if None).
                    last_v: Second-moment state to resume from (zeros if
                        None).
                    last_i: Iteration offset added to the bias-correction
                        exponent and to the returned counter.
                    lossfun: Value forwarded to ``callback``; defaults to a
                        fresh empty list per call.
                    printstuff: If truthy, print gradient norms and step
                        sizes every iteration.

                Returns:
                    ``(params, (m, v, i))``: the unflattened final
                    parameters and the optimizer state, where ``i`` is
                    ``last_i`` plus the index of the last iteration run
                    (``last_i`` itself when ``num_iters`` is 0).
                """
                flattened_grad, unflatten, x = flatten_func(grad, init_params)

                # avoid sharing one mutable default list between calls
                if lossfun is None:
                    lossfun = []

                # np.isscalar also accepts NumPy scalars such as np.float64,
                # which an exact type() comparison would reject
                if np.isscalar(step_sizes):
                    step_sizes = step_sizes * np.ones(num_iters)
                else:
                    assert len(step_sizes) == num_iters

                m = np.zeros(len(x)) if last_m is None else last_m
                v = np.zeros(len(x)) if last_v is None else last_v

                # stays at last_i when the loop body never runs
                final_i = last_i
                for i in range(num_iters):
                    g = flattened_grad(x, i)

                    # clip the gradient to a maximum norm
                    gnorm = np.linalg.norm(g)
                    if gnorm > gnorm_max:
                        if printstuff:
                            print("    Gradient norm was: %0.4f" % gnorm)
                        g = g * gnorm_max / gnorm
                    gnorm = np.linalg.norm(g)
                    if printstuff:
                        print("    Gradient norm: %0.4f" % gnorm)
                        print("    Step size: %0.4f" % step_sizes[i])
                    if callback:
                        callback(unflatten(x),
                                 i,
                                 unflatten(g),
                                 lossfun=lossfun)
                    m = (1 - b1) * g + b1 * m  # First  moment estimate.
                    v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
                    mhat = m / (1 - b1**(i + last_i + 1))  # Bias correction.
                    vhat = v / (1 - b2**(i + last_i + 1))
                    x = x - step_sizes[i] * mhat / (np.sqrt(vhat) + eps)
                    final_i = i + last_i
                return unflatten(x), (m, v, final_i)
예제 #21
0
def normalized_gradient_descent(g, alpha, max_its, w):
    """Normalized gradient descent that also tracks the best point seen.

    Prints the flattened initial weights once and the iteration index after
    every step.

    Args:
        g: Cost function; flattened with ``flatten_func``.
        alpha: Steplength used for every step.
        max_its: Number of descent steps.
        w: Initial weights (flattened alongside ``g``).

    Returns:
        ``(weight_history, cost_history)``: unflattened weights and their
        cost after each step, followed by one final entry holding the
        lowest-cost weights encountered and their cost.
    """
    # flatten the cost to deal more easily with layered parameters; the
    # returned 'w' is flattened as well
    flat_cost, unflatten, w = flatten_func(g, w)
    print(w)

    # value_and_grad returns the cost value together with the gradient
    cost_and_grad = value_and_grad(flat_cost)

    # best point so far, starting from the initial weights
    best_w = w
    best_eval, _ = cost_and_grad(w)

    weight_history, cost_history = [], []

    for step in range(max_its):
        _, direction = cost_and_grad(w)
        length = np.linalg.norm(direction)

        if length == 0:
            # zero gradient: move in a random sign direction instead
            direction = 10**-6 * np.sign(2 * np.random.rand(len(w)) - 1)
            length = np.linalg.norm(direction)

        # unit-length descent direction
        direction /= length
        w = w - alpha * direction

        # remember the weights giving the lowest evaluation so far
        candidate_eval, _ = cost_and_grad(w)
        if candidate_eval < best_eval:
            best_eval = candidate_eval
            best_w = w

        print(step)

        weight_history.append(unflatten(w))
        cost_history.append(flat_cost(w))

    # last entry is the best point, not the last iterate
    weight_history.append(unflatten(best_w))
    cost_history.append(flat_cost(best_w))
    return weight_history, cost_history
예제 #22
0
def gradient_descent(g, alpha, max_its, w, num_pts, train_portion, **kwargs):
    """Gradient descent that redraws a random train/test split every step.

    :param g: Cost function called as ``g(w, train_inds)``.
    :param alpha: Fixed step length.
    :param max_its: Number of descent steps.
    :param w: Initial weights (possibly nested); flattened internally.
    :param num_pts: Total number of data points to split.
    :param train_portion: Fraction of ``num_pts`` used for training.
    :param kwargs: Accepted but not used.
    :return: ``(weight_hist, train_ind_hist, test_ind_hist)``, each with
        ``max_its + 1`` entries (the initial state plus one per step).
    """
    cost_flat, unflatten, w = flatten_func(g, w)
    cost_and_grad = value_and_grad(cost_flat)

    def draw_split():
        # shuffle all indices, then cut at the rounded training size
        n_train = int(np.round(train_portion * num_pts))
        shuffled = np.random.permutation(num_pts)
        return shuffled[:n_train], shuffled[n_train:]

    # initial weights and initial split open the three histories
    train_inds, test_inds = draw_split()
    weight_hist = [unflatten(w)]
    train_ind_hist = [train_inds]
    test_ind_hist = [test_inds]

    for _ in range(max_its):
        # gradient of the cost restricted to the current training indices
        _, grad_eval = cost_and_grad(w, train_inds)
        grad_eval.shape = np.shape(w)

        # plain (unnormalized) descent step
        w = w - alpha * grad_eval
        weight_hist.append(unflatten(w))

        # fresh split for the next step
        train_inds, test_inds = draw_split()
        train_ind_hist.append(train_inds)
        test_ind_hist.append(test_inds)

    return weight_hist, train_ind_hist, test_ind_hist
예제 #23
0
def adam(grad,
         init_params,
         callback=None,
         num_iters=100,
         step_size=0.001,
         b1=0.9,
         b2=0.999,
         eps=10**-8,
         m=None,
         v=None,
         offset=None):
    """Adam as described in http://arxiv.org/pdf/1412.6980.pdf.
    It's basically RMSprop with momentum and some correction terms.

    The moment estimates and the iteration counter can be passed in and are
    returned, so an optimization can be continued across several calls.

    :param grad: The gradient function, called as ``grad(params, iteration)``.
    :param init_params: The initial parameters.
    :param callback: Optional function called each iteration as
        ``callback(params, iteration, gradient)`` before the update.
    :param num_iters: The number of iterations to run for.
    :param step_size: The step size.
    :param b1: Exponential decay rate of first moment.
    :param b2: Exponential decay rate of second moment.
    :param eps: Small term added for stability.
    :param m: The current first moment (zeros if ``None``).
    :param v: The current second moment (zeros if ``None``).
    :param offset: What iteration number to start with (0 if ``None``).
    :return: ``(params, m, v, last_iter)`` where ``last_iter`` is the index
        of the last iteration performed, or ``offset - 1`` when
        ``num_iters`` is 0.
    """
    flattened_grad, unflatten, x = flatten_func(grad, init_params)

    if m is None:
        m = np.zeros(len(x))
    if v is None:
        v = np.zeros(len(x))
    if offset is None:
        offset = 0

    # Defined up front so the return below is valid when the loop never runs.
    cur_iter = offset - 1
    for i in range(num_iters):
        cur_iter = i + offset
        g = flattened_grad(x, cur_iter)
        if callback:
            callback(unflatten(x), cur_iter, unflatten(g))
        m = (1 - b1) * g + b1 * m  # First  moment estimate.
        v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
        mhat = m / (1 - b1**(cur_iter + 1))  # Bias correction.
        vhat = v / (1 - b2**(cur_iter + 1))
        # Rebind instead of updating in place: the flattened vector may be a
        # view of the caller's init_params, and an in-place float update
        # would also fail for integer-typed parameters.
        x = x - step_size * mhat / (np.sqrt(vhat) + eps)
    return unflatten(x), m, v, cur_iter
예제 #24
0
def newtons_method(g, w, x, y, max_its, **kwargs):
    """Regularized Newton's method over the full training set.

    At each step the update solves, in the least-squares sense,
    ``A w_new = A w - grad`` with ``A = hessian + epsilon * I``.

    :param g: Cost function called as ``g(w, x, y, indices)``.
    :param w: Initial weights (possibly nested); flattened internally.
    :param x: Input data, passed through to ``g``.
    :param y: Output data; ``y.size`` is used as the number of points.
    :param max_its: Number of Newton steps.
    :param kwargs: Optional ``epsilon`` (default ``10**-7``), the multiple
        of the identity added to the Hessian.
    :return: ``(w_hist, train_hist)``, each with ``max_its + 1`` entries
        (the initial state plus one per step).
    """
    # flatten input function, in case it takes in matrices of weights
    g_flat, unflatten, w = flatten_func(g, w)

    # first- and second-order derivative functions of the flattened cost
    cost_and_grad = value_and_grad(g_flat)
    hessian_fn = hessian(g_flat)

    # regularization added to the Hessian diagonal
    epsilon = kwargs.get('epsilon', 10**(-7))

    # histories start with the initial weights and their cost
    num_train = y.size
    w_hist = [unflatten(w)]
    train_hist = [g_flat(w, x, y, np.arange(num_train))]

    for _ in range(max_its):
        _, grad_eval = cost_and_grad(w, x, y, np.arange(num_train))
        hess_eval = hessian_fn(w, x, y, np.arange(num_train))

        # force the Hessian into a square 2-D matrix for numpy linalg
        side = int((np.size(hess_eval))**(0.5))
        hess_eval.shape = (side, side)

        # solve the regularized second order system for the new weights
        A = hess_eval + epsilon * np.eye(np.size(w))
        w = np.linalg.lstsq(A, np.dot(A, w) - grad_eval)[0]

        # cost over the full training set at the updated weights
        train_cost = g_flat(w, x, y, np.arange(num_train))

        w_hist.append(unflatten(w))
        train_hist.append(train_cost)

    return w_hist, train_hist
예제 #25
0
def newtons_method(g, epsilon, max_its, w, num_pts, batch_size, **kwargs):
    """Mini-batch Newton's method with a regularized Hessian.

    Each epoch sweeps the data in consecutive batches of ``batch_size``
    indices and takes one Newton step per batch.  The run stops early once
    the norm of the weights exceeds 100.

    :param g: Cost function called as ``g(w, batch_inds)``.
    :param epsilon: Multiple of the identity added to the Hessian.
    :param max_its: Maximum number of epochs.
    :param w: Initial weights (possibly nested); flattened internally.
    :param num_pts: Total number of data points.
    :param batch_size: Number of points per mini-batch.
    :param kwargs: Accepted but not used.
    :return: List of weights: the initial weights plus one entry per
        completed epoch.
    """
    g_flat, unflatten, w = flatten_func(g, w)
    cost_and_grad = value_and_grad(g_flat)
    hessian_fn = hessian(g_flat)

    w_hist = [unflatten(w)]

    # number of mini-batches needed to cover the entire dataset
    num_batches = int(np.ceil(np.divide(num_pts, batch_size)))

    for _ in range(max_its):
        for batch in range(num_batches):
            # indices of the current mini-batch (the last one may be short)
            batch_inds = np.arange(batch * batch_size,
                                   min((batch + 1) * batch_size, num_pts))

            _, grad_eval = cost_and_grad(w, batch_inds)
            grad_eval.shape = np.shape(w)

            # square, regularized Hessian for numpy linalg
            hess_eval = hessian_fn(w, batch_inds)
            side = int((np.size(hess_eval))**(0.5))
            hess_eval.shape = (side, side)
            hess_eval += epsilon * np.eye(np.size(w))

            # least-squares solve of  H w_new = H w - grad
            rhs = np.dot(hess_eval, w) - grad_eval
            w = np.linalg.lstsq(hess_eval, rhs)[0]

        # one history entry per epoch
        w_hist.append(unflatten(w))

        # stop once the weights have grown past the norm threshold
        if np.linalg.norm(w) > 100:
            break

    return w_hist
예제 #26
0
    def gradient_descent(self,g,w,alpha,max_its,beta,version,**kwargs):
        """Gradient descent with momentum, optionally normalized.

        :param g: Cost function of the weights.
        :param w: Initial weights (possibly nested); flattened internally.
        :param alpha: Step length.
        :param max_its: Number of descent steps.
        :param beta: Momentum weight applied to the previous direction.
        :param version: ``'normalized'`` to rescale each gradient to unit
            length; any other value uses the raw gradient.
        :param kwargs: Optional ``verbose`` flag enabling progress messages.
        :return: List of weights: the initial weights plus one per step.
        """
        verbose = kwargs.get('verbose', False)

        # work on a flat weight vector and the gradient of the flat cost
        g_flat, unflatten, w = flatten_func(g, w)
        gradient = compute_grad(g_flat)

        # history opens with the starting weights
        w_hist = [unflatten(w)]

        # accumulated momentum direction
        z = np.zeros((np.shape(w)))

        if verbose == True:
            print ('starting optimization...')

        use_normalized = version == 'normalized'
        for _ in range(max_its):
            grad_eval = gradient(w)
            grad_eval.shape = np.shape(w)

            if use_normalized:
                grad_norm = np.linalg.norm(grad_eval)
                if grad_norm == 0:
                    # avoid dividing by zero: swap in a tiny signed value
                    jitter = 10**-6*np.sign(2*np.random.rand(1) - 1)
                    grad_norm = grad_norm + jitter
                grad_eval /= grad_norm

            # blend the new gradient into the running direction, then step
            z = beta*z + grad_eval
            w = w - alpha*z

            w_hist.append(unflatten(w))

        if verbose == True:
            print ('...optimization complete!')
            time.sleep(1.5)
            clear_output()

        return w_hist
    def gradient_descent(self, g, w, x_train, y_train, lam, alpha_choice,
                         max_its, batch_size):
        """Mini-batch gradient descent with a fixed or diminishing step.

        :param g: Cost function called as
            ``g(w, x_train, y_train, lam, indices)``.
        :param w: Initial weights (possibly nested); flattened internally.
        :param x_train: Training inputs, passed through to ``g``.
        :param y_train: Training outputs; ``y_train.shape[1]`` is used as
            the number of training points.
        :param lam: Passed through to ``g`` unchanged.
        :param alpha_choice: Either the string ``'diminishing'`` (step
            length ``1 / (k + 1)`` at epoch ``k``, counting from 0) or a
            fixed step length.
        :param max_its: Number of epochs.
        :param batch_size: Number of points per mini-batch.
        :return: ``(w_hist, train_hist)``, each with ``max_its + 1`` entries
            (the initial state plus one per epoch).
        """
        # flatten the input function, create gradient based on flat function
        g_flat, unflatten, w = flatten_func(g, w)
        grad = value_and_grad(g_flat)

        # record history
        num_train = y_train.shape[1]
        w_hist = [unflatten(w)]
        train_hist = [g_flat(w, x_train, y_train, lam, np.arange(num_train))]

        # how many mini-batches equal the entire dataset?
        num_batches = int(np.ceil(np.divide(num_train, batch_size)))

        for k in range(max_its):
            # k starts at 0, so the diminishing rule uses k + 1 to avoid a
            # division by zero on the first epoch
            if alpha_choice == 'diminishing':
                alpha = 1 / float(k + 1)
            else:
                alpha = alpha_choice

            for b in range(num_batches):
                # collect indices of current mini-batch
                batch_inds = np.arange(b * batch_size,
                                       min((b + 1) * batch_size, num_train))

                # gradient of the cost on the current mini-batch
                _, grad_eval = grad(w, x_train, y_train, lam, batch_inds)
                grad_eval.shape = np.shape(w)

                # take descent step
                w = w - alpha * grad_eval

            # cost over the full training set after this epoch
            train_cost = g_flat(w, x_train, y_train, lam, np.arange(num_train))

            # record weight update and training cost
            w_hist.append(unflatten(w))
            train_hist.append(train_cost)
        return w_hist, train_hist
예제 #28
0
파일: recnn.py 프로젝트: YohannFaure/recnn
def sgd(grad,
        init_params,
        callback=None,
        num_iters=200,
        step_size=0.1,
        mass=0.9):
    """Gradient descent with a momentum ("velocity") term.

    :param grad: Gradient function, called as ``grad(params, iteration)``.
    :param init_params: Initial parameters (possibly nested).
    :param callback: Optional function called each iteration as
        ``callback(params, iteration, gradient)`` before the update.
    :param num_iters: Number of iterations to run.
    :param step_size: Multiplier applied to the velocity at each update.
    :param mass: Weight of the previous velocity; the new gradient is
        weighted by ``1 - mass``.
    :return: The final parameters, in the structure of ``init_params``.
    """
    flat_grad, unflatten, params = flatten_func(grad, init_params)

    # running, exponentially weighted descent direction
    velocity = np.zeros(len(params))

    for step in range(num_iters):
        gradient = flat_grad(params, step)

        if callback:
            callback(unflatten(params), step, unflatten(gradient))

        # decay the old velocity and mix in the negative gradient
        velocity = mass * velocity - (1.0 - mass) * gradient
        params = params + step_size * velocity

    return unflatten(params)
def newtons_method(g,w,x,y,beta,max_its):
    """Newton's method using a pseudo-inverse of the regularized Hessian.

    The run stops early (after the first few iterations) as soon as a step
    increases the cost; the step that increased the cost is not recorded.

    :param g: Cost function called as ``g(w, x, y, beta)``.
    :param w: Initial weights (possibly nested); flattened internally.
    :param x: Input data, passed through to ``g``.
    :param y: Output data, passed through to ``g``.
    :param beta: Passed through to ``g`` unchanged.
    :param max_its: Maximum number of Newton steps.
    :return: ``(w_hist, g_hist)``: the recorded weights and their cost
        values, starting with the initial point.  The same pair is returned
        on early exit and on normal completion.
    """
    # flatten gradient for simpler-written descent loop
    flat_g, unflatten, w = flatten_func(g, w)

    grad = compute_grad(flat_g)
    hess = compute_hess(flat_g)

    # histories start with the initial weights and their cost
    w_hist = [unflatten(w)]
    geval_old = flat_g(w,x,y,beta)
    g_hist = [geval_old]

    # main loop
    epsilon = 10**(-7)
    for k in range(max_its):
        # compute gradient and hessian
        grad_val = grad(w,x,y,beta)
        hess_val = hess(w,x,y,beta)
        hess_val.shape = (np.size(w),np.size(w))

        # Newton step through the pseudo-inverse of the regularized Hessian
        w = w - np.dot(np.linalg.pinv(hess_val + epsilon*np.eye(np.size(w))),grad_val)

        # eject from process if the cost went up (only checked for k > 2)
        geval_new = flat_g(w,x,y,beta)
        if k > 2 and geval_new > geval_old:
            print ('singular system reached')
            time.sleep(1.5)
            clear_output()
            # return the same (weights, costs) pair as the normal path so
            # callers can always unpack two values
            return w_hist,g_hist
        geval_old = geval_new

        # record current weights
        w_hist.append(unflatten(w))
        g_hist.append(geval_new)

    return w_hist,g_hist
예제 #30
0
    def gradient_descent(self, g, w_unflat, alpha, max_its, version, **kwargs):
        """Gradient descent that differentiates the unflattened cost.

        The gradient is taken with respect to the original (unflattened)
        weights and flattened afterwards; the update itself is applied to
        the flat weight vector.

        :param g: Cost function of the unflattened weights.
        :param w_unflat: Initial weights in their original structure.
        :param alpha: Step length.
        :param max_its: Number of descent steps.
        :param version: ``'normalized'`` to rescale each gradient to unit
            length; any other value uses the raw gradient.
        :param kwargs: Optional ``verbose`` flag enabling the completion
            message.
        :return: List of unflattened weights: the initial weights plus one
            entry per step.
        """
        verbose = kwargs.get('verbose', False)

        # only the unflatten function and the flat weights are used below
        _, unflatten, w = flatten_func(g, w_unflat)
        grad = compute_grad(g)

        # history opens with the weights exactly as passed in
        w_hist = [w_unflat]

        for step in range(max_its):
            # gradient in the original structure, then flattened
            grad_eval, _ = flatten(grad(w_unflat))

            if version == 'normalized':
                grad_norm = np.linalg.norm(grad_eval)
                if grad_norm == 0:
                    # avoid dividing by zero: swap in a tiny signed value
                    jitter = 10**-6 * np.sign(2 * np.random.rand(1) - 1)
                    grad_norm = grad_norm + jitter
                grad_eval /= grad_norm

            # descent step on the flat vector
            w = w - alpha * grad_eval

            # restore the structure for the next gradient and the history
            w_unflat = unflatten(w)
            w_hist.append(w_unflat)

        if verbose == True:
            print('...optimization complete!')
            time.sleep(1.5)
            clear_output()

        return w_hist