def problem4():
    """
    Repeat the previous question, but using the cross-entropy method on the
    cart-pole domain. Notice that the state is not discrete, and so you cannot
    directly apply a tabular softmax policy. It is up to you to create a
    representation for the policy for this problem. Consider using the softmax
    action selection using linear function approximation as described in the
    notes. Report the same quantities, as well as how you parameterized the
    policy.

    This driver trains a CEM agent for a fixed number of trials and
    iterations, then asks the evaluation object to plot the learning curve.
    The policy parameter vector has ``num_actions * (order + 1) ** dim``
    entries, all initialised to zero.
    """
    print("Problem 4")

    # Policy parameterisation.
    # NOTE(review): dim is presumably the number of cart-pole state variables
    # and order the basis order handed to the evaluator -- confirm against
    # CartPoleEvaluation.
    dim = 4
    num_actions = 2
    order = 3

    # Experiment budget.
    num_trials = 50
    num_iters = 30
    num_eps = 10

    # CEM hyperparameters.
    pop_size = 10
    num_elite = 5
    eps = 1.25
    sigma = 0.1

    evaluator = CartPoleEvaluation(k=order)
    theta_size = num_actions * (order + 1) ** dim
    agent = CEM(
        np.zeros(theta_size),
        sigma,
        pop_size,
        num_elite,
        num_eps,
        evaluator,
        eps,
    )

    for trial in range(num_trials):
        print("Trial ", trial)
        for it in range(num_iters):
            print("Iteration ", it)
            agent.train()
        # Close out this trial in the evaluator and restart the agent so the
        # next trial begins from the initial search distribution.
        evaluator.endTrial()
        agent.reset()

    # The file name carries the index of the last trial that was run.
    plot_file = 'learningCurve_cartpole_CEM_{}.png'.format(trial)
    evaluator.plot(plot_file, "Learning Curve - Cartpole with CEM Agent")
def problem1():
    """
    Apply the CEM algorithm to the More-Watery 687-Gridworld. Use a tabular
    softmax policy. Search the space of hyperparameters for hyperparameters
    that work well. Report how you searched the hyperparameters, what
    hyperparameters you found worked best, and present a learning curve plot
    using these hyperparameters, as described in class. This plot may be over
    any number of episodes, but should show convergence to a nearly optimal
    policy. The plot should average over at least 500 trials and should
    include standard error or standard deviation error bars. Say which error
    bar variant you used.

    This driver trains a CEM agent over a tabular parameter vector with one
    entry per (state, action) pair, all initialised to zero, and then asks the
    evaluation object to plot the learning curve.
    """
    print("Problem 1")

    # Tabular policy size.
    num_states = 25
    num_actions = 4

    # Experiment budget.
    # NOTE: the assignment text asks for at least 500 trials; this run uses 50.
    num_trials = 50
    num_iters = 150
    num_eps = 25

    # CEM hyperparameters.
    pop_size = 20
    num_elite = 10
    eps = 1.5
    sigma = 0.25

    evaluator = GridworldEvaluation()
    initial_theta = np.zeros(num_states * num_actions)
    agent = CEM(
        initial_theta,
        sigma,
        pop_size,
        num_elite,
        num_eps,
        evaluator,
        eps,
    )

    for trial in range(num_trials):
        print("Trial ", trial)
        for it in range(num_iters):
            print("Iteration ", it)
            agent.train()
        # Close out this trial in the evaluator and restart the agent so the
        # next trial begins from the initial search distribution.
        evaluator.endTrial()
        agent.reset()

    # The file name carries the index of the last trial that was run.
    plot_file = 'learningCurve_gridworld_CEM_{}.png'.format(trial)
    evaluator.plot(plot_file, "Learning Curve - Gridworld with CEM Agent")
def problem4():
    """
    Repeat the previous question, but using the cross-entropy method on the
    cart-pole domain. Notice that the state is not discrete, and so you cannot
    directly apply a tabular softmax policy. It is up to you to create a
    representation for the policy for this problem. Consider using the softmax
    action selection using linear function approximation as described in the
    notes. Report the same quantities, as well as how you parameterized the
    policy.

    Implementation notes:
      * The parameter vector has ``2 * fourier_param ** 4`` entries, all
        initialised to zero.
      * Every CEM iteration contributes ``numEpisodes * popSize`` returns to
        the learning curve; the curve is averaged over trials and drawn with
        standard-deviation (``np.std``) error bars.
      * Outputs: a PNG under ``images/`` and three ``.npy`` arrays (raw,
        mean, std) under ``data/``. Both folders are created if missing.
    """
    import os  # local import: only needed to create the output folders

    print("cem-cartpole-softmax_theta_phi")

    # Make sure the output folders exist *before* the long training run, so a
    # missing directory cannot throw away the results at the very end.
    os.makedirs("images", exist_ok=True)
    os.makedirs("data", exist_ok=True)

    # NOTE(review): the result of this call is discarded; it looks like a
    # leftover smoke test of the environment -- confirm before removing.
    state = np.array([0, 0, 0, 0])
    env = Cartpole()
    env.nextState(state, 0)

    # Policy parameterisation and CEM hyperparameters.
    fourier_param = 4
    theta = np.zeros(2 * fourier_param**4)
    sigma = 1
    popSize = 10
    numElite = 3
    numEpisodes = 5
    evaluate = EvaluateCartpole()
    epsilon = 0.005
    cem = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluate, epsilon)

    numTrials = 10
    numIterations = 100
    # 100 iterations * 5 episodes * 10 candidates = 5000 episodes per trial.
    total_episodes = numIterations * numEpisodes * popSize
    batch_size = numEpisodes * popSize

    results = np.zeros((numTrials, total_episodes))
    for trial in range(numTrials):
        cem.reset()
        for i in range(numIterations):
            if i % 5 == 0:
                print("cart cem: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            cem.train()
            # Assumes evaluate.batchReturn holds exactly the batch_size
            # returns produced by the cem.train() call above -- TODO confirm.
            batch_start = i * batch_size
            batch_end = batch_start + batch_size
            results[trial, batch_start:batch_end] = np.array(
                evaluate.batchReturn)

    # Per-episode mean and standard deviation across trials.
    average_results = np.average(results, axis=0)
    std_results = np.std(results, axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)

    # Output name encodes the run configuration plus the wall-clock HH_MM.
    plt_name = "cem_cartpole"
    plt_name += ("_numTrials_" + str(numTrials)
                 + "_numIter_" + str(numIterations)
                 + "_popSize_" + str(popSize))
    plt_name += datetime.now().strftime("_t_%H_%M")
    print("plt_name=", plt_name)

    # Persist the raw data first: plt.show() blocks, and a plotting failure
    # should not cost the training results.
    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)

    plt.errorbar(np.arange(maximumEpisodes), average_results, std_results,
                 marker='.', ecolor='aqua')
    plt.grid(True)
    # Horizontal guide at the best average return, with a text label.
    plt.axhline(max_avg)
    plt.text(0, max_avg, "max: " + str(round(max_avg, 2)),
             fontsize=15, backgroundcolor='w')

    plt_path = "images/" + plt_name + ".png"
    plt.savefig(plt_path, dpi=200)
    plt.show()
def problem1():
    """
    Apply the CEM algorithm to the More-Watery 687-Gridworld. Use a tabular
    softmax policy. Search the space of hyperparameters for hyperparameters
    that work well. Report how you searched the hyperparameters, what
    hyperparameters you found worked best, and present a learning curve plot
    using these hyperparameters, as described in class. This plot may be over
    any number of episodes, but should show convergence to a nearly optimal
    policy. The plot should average over at least 500 trials and should
    include standard error or standard deviation error bars. Say which error
    bar variant you used.

    Implementation notes:
      * The parameter vector has 100 entries, all initialised to zero
        (presumably 25 states x 4 actions -- confirm against Evaluate).
      * Every CEM iteration contributes ``numEpisodes * popSize`` returns to
        the learning curve; the curve is averaged over trials and drawn with
        standard-deviation (``np.std``) error bars.
      * Outputs: a PNG under ``images/`` and three ``.npy`` arrays (raw,
        mean, std) under ``data/``. Both folders are created if missing.
    """
    import os  # local import: only needed to create the output folders

    print("cem-gridworld-tabular_softmax")

    # Make sure the output folders exist *before* the long training run, so a
    # missing directory cannot throw away the results at the very end.
    os.makedirs("images", exist_ok=True)
    os.makedirs("data", exist_ok=True)

    # Policy parameters and CEM hyperparameters.
    theta = np.zeros(100)
    sigma = 1
    popSize = 10
    numElite = 3
    numEpisodes = 10
    evaluate = Evaluate()
    epsilon = 5
    cem = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluate, epsilon)

    # NOTE: the assignment text asks for at least 500 trials; this run uses 50.
    numTrials = 50
    numIterations = 250
    # 250 iterations * 10 episodes * 10 candidates = 25000 episodes per trial.
    total_episodes = numIterations * numEpisodes * popSize
    batch_size = numEpisodes * popSize

    results = np.zeros((numTrials, total_episodes))
    for trial in range(numTrials):
        cem.reset()
        for i in range(numIterations):
            if i % 5 == 0:
                print("cem: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            cem.train()
            # Assumes evaluate.batchReturn holds exactly the batch_size
            # returns produced by the cem.train() call above -- TODO confirm.
            batch_start = i * batch_size
            batch_end = batch_start + batch_size
            results[trial, batch_start:batch_end] = np.array(
                evaluate.batchReturn)

    # Per-episode mean and standard deviation across trials.
    average_results = np.average(results, axis=0)
    std_results = np.std(results, axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)

    # Output name encodes the run configuration plus the wall-clock HH_MM.
    plt_name = "cem_gridworld"
    plt_name += ("_numTrials_" + str(numTrials)
                 + "_numIter_" + str(numIterations)
                 + "_popSize_" + str(popSize))
    plt_name += datetime.now().strftime("_t_%H_%M")
    print("plt_name=", plt_name)

    # Persist the raw data first: plt.show() blocks, and a plotting failure
    # should not cost the training results.
    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)

    # fmt='.' draws point markers with no connecting line. The original passed
    # both fmt='o' and marker='.', which makes matplotlib warn about a
    # redundantly defined marker and then use '.', so the picture is the same.
    plt.errorbar(np.arange(maximumEpisodes), average_results, std_results,
                 fmt='.', ecolor='aqua')
    plt.grid(True)
    # Horizontal guide at the best average return, with a text label.
    plt.axhline(max_avg)
    plt.text(0, max_avg, "max: " + str(round(max_avg, 2)),
             fontsize=15, backgroundcolor='w')

    plt_path = "images/" + plt_name + ".png"
    plt.savefig(plt_path, dpi=200)
    plt.show()