Example #1
 def __call__(self, parameters: np.ndarray, numEpisodes: int):
     # print("Evaluating Cartpole")
     G = []
     # self.policy.parameters = policy
     policy = SoftmaxWithLFA(4, 2, self.k)
     policy.parameters = parameters
     # threadPool = Pool(numEpisodes)
     # G = threadPool.map(self.runEpisode, [policy for i in range(numEpisodes)])
     # print("G", G)
     # self.curTrialReturns.extend(G)
     env = Cartpole()
     for ep in range(numEpisodes):
         # print("Episode ", ep)
         env.reset()
         Gi = 0
         while not env.isEnd:
             state = env.state
             action = policy.samplAction(state)
             next_state, reward, _ = env.step(action)
             Gi += reward
         G.append(Gi)
         self.lock.acquire()
         self.curTrialReturns.append(Gi)
         # print("Number of returns collected int trial", len(self.curTrialReturns))
         self.lock.release()
     # threadPool.close()
     # threadPool.join()
     print("Mean Return ", np.mean(G))
     return np.mean(G)
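The policy classes these snippets call (SoftmaxWithLFA, SoftmaxThetaPhi, LinearApproximation) are not included in the listing. Below is a minimal sketch of what such a softmax-over-linear-features policy could look like; the class name SoftmaxLFASketch, the Fourier-basis features, and the assumption that the state is already normalised to [0, 1] are guesses, only the samplAction/parameters/phiS interface is taken from the examples.

import numpy as np

class SoftmaxLFASketch:
    def __init__(self, numStates: int, numActions: int, k: int = 2):
        self._numStates = numStates          # dimensionality of the raw state vector
        self._numActions = numActions
        self._k = k                          # Fourier-basis order (assumed)
        # one weight vector per action over (k + 1)^numStates cosine features
        self._coeffs = np.array(list(np.ndindex(*([k + 1] * numStates))))
        self._numFeatures = self._coeffs.shape[0]
        self.parameters = np.zeros(numActions * self._numFeatures)

    def phiS(self, state: np.ndarray) -> np.ndarray:
        # Fourier cosine features; assumes state components are normalised to [0, 1]
        return np.cos(np.pi * self._coeffs.dot(state))

    def samplAction(self, state: np.ndarray) -> int:
        phi = self.phiS(state)
        theta = self.parameters.reshape(self._numActions, self._numFeatures)
        prefs = theta.dot(phi)
        prefs -= prefs.max()                 # shift for numerical stability
        probs = np.exp(prefs) / np.exp(prefs).sum()
        return int(np.random.choice(self._numActions, p=probs))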
Example #2
    def __init__(self, numStates: int, numActions: int, k: int):
        # self.G = []
        self._numStates = numStates
        self._numActions = numActions
        self._k = k

        self.environment = Cartpole()
        self.policy = SoftmaxThetaPhi(numStates, numActions, k)
Example #3
 def runEpisode(self, policy):
     env = Cartpole()
     Gi = 0
     while not env.isEnd:
         state = env.state
         action = policy.samplAction(state)
         next_state, reward, _ = env.step(action)
         Gi += reward
     return Gi
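Example #1 carries a commented-out multiprocessing pool for running these episodes in parallel. A standalone sketch of that pattern is below; runOneEpisode and evaluateParallel are hypothetical names, and it assumes Cartpole is importable and the policy object is picklable.

import numpy as np
from multiprocessing import Pool

def runOneEpisode(policy):
    # module-level so multiprocessing can pickle it
    env = Cartpole()
    G = 0
    while not env.isEnd:
        action = policy.samplAction(env.state)
        _, reward, _ = env.step(action)
        G += reward
    return G

def evaluateParallel(policy, numEpisodes: int) -> float:
    # on platforms that spawn workers, call this from under `if __name__ == "__main__":`
    with Pool() as pool:
        G = pool.map(runOneEpisode, [policy] * numEpisodes)
    return float(np.mean(G))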
Example #4
class GenEpCartpole:
    def __init__(self, numStates: int, numActions: int, k: int):
        # self.G = []
        self._numStates = numStates
        self._numActions = numActions
        self._k = k

        self.environment = Cartpole()
        self.policy = SoftmaxThetaPhi(numStates, numActions, k)
#        self.policy.k = k

#    @property
#    def batchReturn(self)->str:
#        return self._G

    def __call__(self, theta: np.ndarray, numEpisodes: int):

        self.policy.parameters = theta

        D = []

        for episode in range(numEpisodes):

            self.environment.reset()
            G_episode = 0

            counter = 0
            H = {}
            S = []
            A = []
            R = []
            while not self.environment.isEnd:
                state = self.environment.state
                action = self.policy.samplAction(state)
                _, reward, _ = self.environment.step(action)

                G_episode += reward

                phi_s = self.policy.phiS(state)
                S.append(phi_s)
                A.append(action)
                R.append(reward)

                counter += 1

            H['S'] = np.array(S)
            H['A'] = np.array(A)
            H['R'] = np.array(R)

            D.append(H)

        return D

    def reset(self):
        self.environment = Cartpole()
        self.policy = SoftmaxThetaPhi(self._numStates, self._numActions)
        self.policy.k = self._k
Example #5
def run_cartpole_episode(p, basis):
    environment = Cartpole()
    policy = LinearApproximation(state_dim=4, num_actions=2, basis=basis)
    policy.parameters = p
    is_end = False
    discounted_return = 0
    t = 0
    while not is_end:
        action = policy.samplAction(environment.state)
        new_state, reward, is_end = environment.step(action)
        discounted_return += (environment.gamma**t) * reward
        t += 1
    environment.reset()
    return discounted_return
Example #6
def runHistory(getAction, numeps=10000):
    histories = []

    cartPole = Cartpole()
    for ep in range(numeps):
        history = []
        cartPole.reset()
        history.append(cartPole.state)
        while not cartPole.isEnd:
            # record (a, r, s') using the values step() returns
            action = getAction(cartPole.state)
            s, r, e = cartPole.step(action)
            history.append(action)
            history.append(r)
            history.append(s)
        histories.append(history)

    return histories
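A quick usage sketch for runHistory with a uniformly random behaviour policy; randomAction is a hypothetical helper, and the two discrete actions (0/1) are assumed from the other examples.

import numpy as np

def randomAction(state):
    # cart-pole exposes two discrete actions in these examples
    return np.random.randint(2)

histories = runHistory(randomAction, numeps=10)
print(len(histories), "histories; the first holds", len(histories[0]), "state/action/reward entries")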
Example #7
def runEnvironment_carpole(policy, numeps=10000):
    returns = np.zeros(numeps)

    env = Cartpole()
    for ep in range(numeps):
        env.reset()
        step = 0
        g = 0
        while not env.isEnd:
            action = policy.samplAction(env.state)
            s, r, e = env.step(action)
            g += (env.gamma**step) * r
            step += 1
            if step > 200:
                g = -50
                break
        returns[ep] = g
    return returns
Example #8
class EvaluateCartpole:
    def __init__(self):
        self.environment = Cartpole()
        self.policy = SoftmaxThetaPhi(4, 2)
        self._G = []

    @property
    def batchReturn(self) -> list:
        return self._G

    def __call__(self, theta: np.ndarray, numEpisodes: int):

        self.policy.parameters = theta

        for episode in range(numEpisodes):

            self.environment.reset()
            G_episode = 0

            counter = 0

            while not self.environment.isEnd:

                state = self.environment.state
                action = self.policy.samplAction(state)
                _, reward, _ = self.environment.step(action)

                G_episode += reward

                counter += 1

            self._G.append(G_episode)

        return np.mean(self._G)

    def reset(self):
        self.environment = Cartpole()
        self.policy = SoftmaxThetaPhi(4, 2)
        self._G = []
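A short usage sketch for EvaluateCartpole; numEpisodes and the parameter dimensionality (2 * 4**4, matching the Fourier order used in the CEM/GA examples further down) are assumptions for illustration.

import numpy as np

evaluate = EvaluateCartpole()
theta = np.random.randn(2 * 4 ** 4)      # one weight per action-feature pair (assumed size)
mean_return = evaluate(theta, numEpisodes=5)
print("mean return over all episodes so far:", mean_return)
print("per-episode returns:", evaluate.batchReturn)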
Example #9
def problem6():
    """
    Repeat the previous question, but using the GA (as described earlier in 
    this homework) on the cart-pole domain. Report the same quantities and how
    the policy was parameterized. 
    """
    
    #TODO
    
    populationSize = 20 #20
    numElite = 5 #5
    numEpisodes = 5 #5
    numTrials = 50 #50
    numIterations = 20 #20
    Kp = 10 #10
    alpha = 2.5 #2.5
    k = 2 #2

    returns = np.zeros((numTrials, numEpisodes * numIterations * populationSize))
    
    for trial in range(numTrials):
        

        np.random.seed(np.random.randint(10000))
    
        cartpole = Cartpole()

        tabular_softmax = TabularSoftmaxContinuous(k, 2)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])
        
        count = 0


        def evaluateFunction(theta, numEpisodes):
            nonlocal count

            expected_reward = 0

            numTimeSteps = 1000
            tabular_softmax.parameters = theta

            for episode in range(numEpisodes):
                state = cartpole.state
                G = 0
                discount = 1
                for t in range(numTimeSteps):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = cartpole.step(action)
                    G += discount * reward
                    discount *= cartpole.gamma
                    if end:
                        break
                    state = nextstate
                expected_reward += G
                returns[trial][count] = G
                cartpole.reset()
                count += 1

            return expected_reward / numEpisodes


        
        def initPopulation(populationSize : int) -> np.ndarray:
            return np.random.randn(populationSize, tabular_softmax.parameters.shape[0])


        agent = GA(populationSize, evaluateFunction, initPopulation, numElite, numEpisodes, Kp, alpha)

        

        for iteration in range(numIterations):
        
            print("Trial: %d" % (trial, ))
            print("Iteration: %d" % (iteration, ))
            p = agent.train()
            print(returns[trial][iteration * numEpisodes * populationSize : count])
            print(iteration * numEpisodes * populationSize)
            print(count)
#             l = [[0 for i in range(5)] for j in range(5)] 
#             for i in range(25):
#                 s = tabular_softmax.getActionProbabilities(i)
#                 print(s)
#                 r = np.argmax(s)
#                 if(r == 0):
#                     l[i//5][i % 5] = '↑'
#                 elif(r == 1):
#                     l[i//5][i % 5] = '↓'
#                 elif(r == 2):
#                     l[i//5][i % 5] = '←'
#                 elif(r == 3):
#                     l[i//5][i % 5] = '→'

#             for i in range(5):
#                 print(l[i])
        print(p)
            
    plot(returns, 'Cartpole domain Genetic Algorithm (standard deviation error bars) - 50 trials', 1000)
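The plot(...) helper called in problems 4-6 is not part of the listing. A minimal sketch of what it might do, assuming returns has shape (numTrials, numEpisodes) and treating the third argument as an error-bar thinning interval (its real meaning is not shown):

import numpy as np
import matplotlib.pyplot as plt

def plot(returns, title, window):
    mean = returns.mean(axis=0)            # average return per episode index across trials
    std = returns.std(axis=0)              # standard deviation error bars
    episodes = np.arange(mean.shape[0])
    plt.errorbar(episodes, mean, std, errorevery=window, marker='.', ecolor='aqua')
    plt.title(title)
    plt.xlabel("Episode")
    plt.ylabel("Return")
    plt.grid(True)
    plt.show()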
Example #10
def problem5():
    """
    Repeat the previous question, but using first-choice hill-climbing (as 
    described in class) on the cart-pole domain. Report the same quantities 
    and how the policy was parameterized. 
    
    """
    #TODO
    
    
    sigma = 1.0

    numEpisodes = 150 #100
    numTrials = 50 #50
    numIterations = 75 #50
    k = 2 #2

    returns = np.zeros((numTrials, numEpisodes * numIterations))
    
    for trial in range(numTrials):
        

        np.random.seed(np.random.randint(10000))
    
        cartpole = Cartpole()

        tabular_softmax = TabularSoftmaxContinuous(k, 2)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])
        
        count = -1


        def evaluateFunction(theta, numEpisodes):
            nonlocal count

            expected_reward = 0

            numTimeSteps = 1000
            tabular_softmax.parameters = theta

            for episode in range(numEpisodes):
                state = cartpole.state
                G = 0
                discount = 1
                for t in range(numTimeSteps):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = cartpole.step(action)
                    G += discount * reward
                    discount *= cartpole.gamma
                    if end:
                        break
                    state = nextstate
                expected_reward += G
                if(count != -1):
                    returns[trial][count] = G
                    count += 1
                cartpole.reset()

            return expected_reward / numEpisodes


        agent = FCHC(theta, sigma, evaluateFunction, numEpisodes)

        count = 0

        for iteration in range(numIterations):
        
            print("Trial: %d" % (trial, ))
            print("Iteration: %d" % (iteration, ))
            p = agent.train()
            print(returns[trial][iteration * numEpisodes : count])
            print(np.mean(returns[trial][iteration * numEpisodes : count]))
#             l = [[0 for i in range(5)] for j in range(5)] 
#             for i in range(25):
#                 s = tabular_softmax.getActionProbabilities(i)
#                 print(s)
#                 r = np.argmax(s)
#                 if(r == 0):
#                     l[i//5][i % 5] = '↑'
#                 elif(r == 1):
#                     l[i//5][i % 5] = '↓'
#                 elif(r == 2):
#                     l[i//5][i % 5] = '←'
#                 elif(r == 3):
#                     l[i//5][i % 5] = '→'

#             for i in range(5):
#                 print(l[i])
        print(p)
            
    plot(returns, 'Cartpole domain First Choice Hill Climbing (standard deviation error bars) - 50 trials', 1000)
Example #11
def problem4():
    """
    Repeat the previous question, but using the cross-entropy method on the 
    cart-pole domain. Notice that the state is not discrete, and so you cannot 
    directly apply a tabular softmax policy. It is up to you to create a 
    representation for the policy for this problem. Consider using the softmax 
    action selection using linear function approximation as described in the notes. 
    Report the same quantities, as well as how you parameterized the policy. 
    
    """

    #TODO
    
    popSize = 10 #10
    numElite = 5 #5
    epsilon = 4.0 #4.0
    sigma = 1.0 #1.0
    numEpisodes = 20 #20
    numTrials = 5 #5
    numIterations = 40 #40
    k = 2 #2

    returns = np.zeros((numTrials, numEpisodes * numIterations * popSize))
    
    for trial in range(numTrials):
        

        np.random.seed(np.random.randint(10000))
    
        cartpole = Cartpole()

        tabular_softmax = TabularSoftmaxContinuous(k, 2)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])
        
        count = 0


        def evaluateFunction(theta, numEpisodes):
            nonlocal count

            expected_reward = 0

            numTimeSteps = 1000
            tabular_softmax.parameters = theta

            for episode in range(numEpisodes):
                state = cartpole.state
                G = 0
                discount = 1
                for t in range(numTimeSteps):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = cartpole.step(action)
                    G += discount * reward
                    discount *= cartpole.gamma
                    if end:
                        break
                    state = nextstate
                expected_reward += G
                returns[trial][count] = G
                cartpole.reset()
                count += 1

            return expected_reward / numEpisodes


        agent = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluateFunction, epsilon)

        

        for iteration in range(numIterations):
        
            print("Trial: %d" % (trial, ))
            print("Iteration: %d" % (iteration, ))
            p = agent.train()
            print(returns[trial][iteration * numEpisodes * popSize : count])
#             l = [[0 for i in range(5)] for j in range(5)] 
#             for i in range(25):
#                 s = tabular_softmax.getActionProbabilities(i)
#                 print(s)
#                 r = np.argmax(s)
#                 if(r == 0):
#                     l[i//5][i % 5] = '↑'
#                 elif(r == 1):
#                     l[i//5][i % 5] = '↓'
#                 elif(r == 2):
#                     l[i//5][i % 5] = '←'
#                 elif(r == 3):
#                     l[i//5][i % 5] = '→'

#             for i in range(5):
#                 print(l[i])
        print(p)
            
    plot(returns, 'Cartpole domain Cross Entropy Method (standard deviation error bars) - 5 trials', 1000)
Example #12
 def reset(self):
     self.environment = Cartpole()
     self.policy = SoftmaxThetaPhi(self._numStates, self._numActions)
     self.policy.k = self._k
Example #13
 def __init__(self):
     self.environment = Cartpole()
     self.policy = SoftmaxThetaPhi(4, 2)
     self._G = []
Example #14
 def reset(self):
     self.environment = Cartpole()
     self.policy = SoftmaxThetaPhi(4, 2)
     self._G = []
Example #15
def problem6():
    """
    Repeat the previous question, but using the GA (as described earlier in 
    this homework) on the cart-pole domain. Report the same quantities and how
    the policy was parameterized. 
    """

    print("ga-cartpole-softmax_theta_phi")

    #    fourier_param = 2
    fourier_param = 4

    def initPopFn(pop_size):
        # initial population: one all-zero parameter vector per population member
        theta_arr = np.zeros((pop_size, 2 * fourier_param**4))
        return theta_arr

    state = np.array([0, 0, 0, 0])

    env = Cartpole()
    env.nextState(state, 0)

    #    theta = np.zeros(2*fourier_param**4)
    #    sigma = 1
    popSize = 10
    numElite = 3
    numEpisodes = 5
    evaluate = EvaluateCartpole()
    #    epsilon = 0.005

    ga = GA(popSize, evaluate, initPopFn, numElite, numEpisodes)

    #    numTrials = 50
    numTrials = 10
    numIterations = 100
    #    numIterations = 250
    #    numIterations = 20
    total_episodes = numIterations * numEpisodes * popSize  # 20*50*10

    results = np.zeros((numTrials, total_episodes))

    for trial in range(numTrials):
        ga.reset()
        for i in range(numIterations):
            #DEBUG
            if (i % 5 == 0):
                print("cart ga: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            ga.train()

            batch_start = (i * numEpisodes) * popSize
            batch_end = ((i + 1) * numEpisodes) * popSize

            results[trial,
                    batch_start:batch_end] = np.array(evaluate.batchReturn)

    average_results = np.average(np.array(results), axis=0)
    std_results = np.std(np.array(results), axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)

    plt.errorbar(np.array([i for i in range(maximumEpisodes)]),
                 average_results,
                 std_results,
                 marker='.',
                 ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0,
             max_avg,
             "max: " + str(round(max_avg, 2)),
             fontsize=15,
             backgroundcolor='w')

    plt_name = "ga_cartpole"

    now = datetime.now()
    param_string = "_numTrials_"+str(numTrials)+"_numIter_" \
        + str(numIterations) + "_popSize_" +str(popSize)
    dt_string = now.strftime("_t_%H_%M")

    plt_name += param_string
    plt_name += dt_string
    print("plt_name=", plt_name)
    plt_path = "images/" + plt_name + ".png"

    #    plot_min = -100
    #    plot_max = 10
    #    plt.ylim(average_results.min(), average_results.max())
    #    plt.ylim(plot_min, plot_max)

    plt.savefig(plt_path, dpi=200)
    plt.show()

    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)

    #TODO
    pass
Example #16
def problem4():
    """
    Repeat the previous question, but using the cross-entropy method on the 
    cart-pole domain. Notice that the state is not discrete, and so you cannot 
    directly apply a tabular softmax policy. It is up to you to create a 
    representation for the policy for this problem. Consider using the softmax 
    action selection using linear function approximation as described in the notes. 
    Report the same quantities, as well as how you parameterized the policy. 
    
    """

    #TODO

    print("cem-cartpole-softmax_theta_phi")

    state = np.array([0, 0, 0, 0])

    env = Cartpole()
    env.nextState(state, 0)

    fourier_param = 4

    theta = np.zeros(2 * fourier_param**4)
    sigma = 1
    popSize = 10
    numElite = 3
    numEpisodes = 5
    #    numEpisodes = 20
    evaluate = EvaluateCartpole()
    epsilon = 0.005

    cem = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluate, epsilon)

    #    numTrials = 50
    numTrials = 10
    #    numIterations = 250
    numIterations = 100

    #    total_episodes = 20,000
    total_episodes = numIterations * numEpisodes * popSize  # 20*50*10

    results = np.zeros((numTrials, total_episodes))

    for trial in range(numTrials):
        cem.reset()
        for i in range(numIterations):
            #DEBUG
            if (i % 5 == 0):
                print("cart cem: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            cem.train()

            batch_start = (i * numEpisodes) * popSize
            batch_end = ((i + 1) * numEpisodes) * popSize

            results[trial,
                    batch_start:batch_end] = np.array(evaluate.batchReturn)

    average_results = np.average(np.array(results), axis=0)
    std_results = np.std(np.array(results), axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)

    plt.errorbar(np.array([i for i in range(maximumEpisodes)]),
                 average_results,
                 std_results,
                 marker='.',
                 ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0,
             max_avg,
             "max: " + str(round(max_avg, 2)),
             fontsize=15,
             backgroundcolor='w')

    plt_name = "cem_cartpole"

    now = datetime.now()
    param_string = "_numTrials_"+str(numTrials)+"_numIter_" \
        + str(numIterations) + "_popSize_" +str(popSize)
    dt_string = now.strftime("_t_%H_%M")

    plt_name += param_string
    plt_name += dt_string
    print("plt_name=", plt_name)
    plt_path = "images/" + plt_name + ".png"

    #    plot_min = -100
    #    plot_max = 10
    #    plt.ylim(average_results.min(), average_results.max())
    #    plt.ylim(plot_min, plot_max)

    plt.savefig(plt_path, dpi=200)
    plt.show()

    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)