def lspi(maxiter, epsilon, samples, basis, discount, initial_policy):
    """Run the LSPI policy-iteration loop and return the resulting policies.

    Each pass builds a new ``Policy`` from the current one, sets its weights
    to the first element returned by ``lstdq(samples, previous, new)``, and
    measures how far the weights moved from the previous policy.  The loop
    stops once that distance is no greater than ``epsilon`` or the iteration
    index reaches ``maxiter``.  Progress and the final weights are printed
    to stdout.

    Args:
        maxiter: Upper bound on the (zero-based) iteration index.
        epsilon: Convergence threshold on the weight-change norm.
        samples: Passed through unchanged to ``lstdq``.
        basis: Not used by this function; kept for interface compatibility.
        discount: Not used by this function; kept for interface
            compatibility.
        initial_policy: Starting policy.  Must expose ``.weights`` and be
            accepted by ``Policy(policy=...)``.

    Returns:
        A tuple ``(policy, all_policies)``: the last policy produced (or
        ``initial_policy`` if the loop body never ran) and the list of every
        policy in order, beginning with ``initial_policy``.
    """
    iteration = -1
    distance = float('inf')
    policy = initial_policy
    all_policies = [initial_policy]

    # NOTE(review): because the index starts at -1 and is tested before being
    # incremented, the body runs for indices 0..maxiter, i.e. up to
    # maxiter + 1 passes, and the summary below reports the last index rather
    # than a pass count.  Left as-is to preserve existing results -- confirm
    # whether this is intended.
    while iteration < maxiter and distance > epsilon:
        iteration += 1
        print('============================')
        print('LSPI iteration: %i' % iteration)

        # all_policies[iteration] is the policy appended on the previous pass
        # (or initial_policy on the first pass).
        previous = all_policies[iteration]
        policy = Policy(policy=policy)
        policy.weights = lstdq(samples, previous, policy)[0]

        # Default-order norm of the weight change (Euclidean for a vector;
        # presumably LA is numpy.linalg -- verify against the file imports).
        distance = LA.norm(policy.weights - previous.weights)
        all_policies.append(policy)

    print('================================')
    if distance > epsilon:
        print('LSPI finished in %i iterations WITHOUT convergence to a fixed point' % iteration)
    else:
        print('LSPI converged in %i iterations' % iteration)
    # print('') emits a single blank line on both Python 2 and Python 3.
    print('')
    print('weights')
    print(policy.weights)
    print('')

    return policy, all_policies