Esempio n. 1
0
def lspi(maxiter, epsilon, samples, basis, discount, initial_policy):
    """
    Run the LSPI (Least-Squares Policy Iteration) loop.

    Starting from ``initial_policy``, each pass builds a new ``Policy`` from
    the current one, replaces its weights with the first element returned by
    ``lstdq(samples, previous_policy, new_policy)``, and measures the L2
    distance between the new and the previous weights. The loop ends when
    that distance is no larger than ``epsilon`` or after ``maxiter`` passes,
    whichever comes first. Progress and the final weights are printed to
    standard output.

    Parameters
    ----------
    maxiter : int
        Maximum number of passes. With ``maxiter <= 0`` no pass is run and
        ``initial_policy`` is returned unchanged.
    epsilon : float
        Convergence threshold on the L2 norm of the weight change.
    samples :
        Passed through to ``lstdq`` untouched.
    basis, discount :
        Not used by this function; kept so existing callers keep working.
    initial_policy :
        Starting policy. Must expose a ``weights`` attribute and be accepted
        by ``Policy(policy=...)``.

    Returns
    -------
    tuple
        ``(policy, all_policies)`` where ``policy`` is the last policy
        produced and ``all_policies`` lists every policy in order, beginning
        with ``initial_policy``.
    """

    # Number of completed passes. Counting from 0 (instead of using a
    # zero-based index that starts at -1) keeps the loop bounded by exactly
    # ``maxiter`` passes and makes the final report show the real count.
    iteration = 0
    distance = float('inf')
    policy = initial_policy
    all_policies = [initial_policy]

    while iteration < maxiter and distance > epsilon:
        iteration += 1
        print('============================')
        print('LSPI iteration: %i' % iteration)

        # ``policy`` is always the most recently appended entry of
        # ``all_policies``, so keep a direct reference rather than indexing.
        previous = policy

        # NOTE(review): Policy(policy=...) is presumably a copy constructor;
        # confirm against the Policy class.
        policy = Policy(policy=previous)
        policy.weights = lstdq(samples, previous, policy)[0]

        # Convergence is judged on the L2 norm of the weight change.
        distance = LA.norm(policy.weights - previous.weights)

        all_policies.append(policy)

    print('================================')
    if distance > epsilon:
        print('LSPI finished in %i iterations WITHOUT convergence to a fixed point' % iteration)
    else:
        print('LSPI converged in %i iterations' % iteration)
    # print('') rather than print(): under Python 2 the latter prints "()".
    print('')
    print('weights')
    print(policy.weights)
    print('')

    return policy, all_policies