def problem4(config, iterations: int = 25):
    """
    Run the cross-entropy method (CEM) on the cart-pole domain.

    Cart-pole states are not discrete, so instead of a tabular softmax policy
    this uses a ``LinearApproximation`` policy built with ``state_dim=4`` and
    ``num_actions=2``.

    Args:
        config: Indexable hyperparameters, read by position as
            ``[basis, sigma, popSize, numElite, numEpisodes, epsilon]``.
        iterations: Number of times ``agent.train()`` is called.

    Returns:
        A NumPy array with the return of every episode run by the evaluation
        function, in the order the episodes were executed.
    """
    episode_log = []

    def evaluate(p, episodes):
        # Run the candidate parameters `p` for `episodes` episodes, record
        # every individual return, and report the batch average.
        batch = [run_cartpole_episode(p, config[0]) for _ in range(episodes)]
        episode_log.extend(batch)
        return np.mean(batch)

    agent_policy = LinearApproximation(
        state_dim=4,
        num_actions=2,
        basis=config[0],
    )
    agent = CEM(
        agent_policy.parameters,
        sigma=config[1],
        popSize=config[2],
        numElite=config[3],
        numEpisodes=config[4],
        evaluationFunction=evaluate,
        epsilon=config[5],
    )

    for _ in range(iterations):
        # Keep the policy in sync with whatever the agent returns.
        agent_policy.parameters = agent.train()

    return np.array(episode_log)
def problem1(para: dict, trails: int = 50):
    """
    Apply CEM with a tabular softmax policy to the More-Watery 687-Gridworld.

    Args:
        para: Hyperparameter mapping; the keys ``'sigma'``, ``'popSize'``,
            ``'numElite'``, ``'numEpisodes'`` and ``'epsilon'`` are read.
        trails: Number of times ``agent.train()`` is called.

    Returns:
        A list with one entry per call of the evaluation function: the mean
        return of the evaluated parameter vector, in call order.
    """
    sigma, pop_size, num_elite, num_episodes, epsilon = (
        para[key]
        for key in ('sigma', 'popSize', 'numElite', 'numEpisodes', 'epsilon')
    )
    mean_return_log = []

    print(
        'sigma:{}\tpopSize:{}\tnumElite:{}\tnumEpisodes:{}\tepsilon:{}'.format(
            sigma, pop_size, num_elite, num_episodes, epsilon))

    def evaluate(theta, numEpisodes):
        # Score `theta` with a fresh 25-state / 4-action policy and log the
        # resulting mean return.
        candidate = TabularSoftmax(25, 4)
        candidate.parameters = theta
        average = np.mean(runEnvironment_gridworld(candidate, numEpisodes))
        mean_return_log.append(average)
        return average

    policy = TabularSoftmax(25, 4)
    agent = CEM(
        theta=policy.parameters,
        sigma=sigma,
        popSize=pop_size,
        numElite=num_elite,
        numEpisodes=num_episodes,
        evaluationFunction=evaluate,
        epsilon=epsilon,
    )

    for step in range(trails):
        policy.parameters = agent.train()
        print('Episode {} finished'.format(step))

    return mean_return_log
def problem4(para: dict, trails: int = 50):
    """
    Apply CEM to the cart-pole domain using a ``LinearSoftmax(4, 2, 2)`` policy.

    The cart-pole state is not discrete, so a linear softmax policy is used in
    place of the tabular one.

    Args:
        para: Hyperparameter mapping; the keys ``'sigma'``, ``'popSize'``,
            ``'numElite'``, ``'numEpisodes'`` and ``'epsilon'`` are read.
        trails: Number of times ``agent.train()`` is called.

    Returns:
        A list with one entry per call of the evaluation function: the mean
        return of the evaluated parameter vector, in call order.
    """
    sigma, pop_size, num_elite, num_episodes, epsilon = (
        para[key]
        for key in ('sigma', 'popSize', 'numElite', 'numEpisodes', 'epsilon')
    )
    mean_return_log = []

    print(
        'sigma:{}\tpopSize:{}\tnumElite:{}\tnumEpisodes:{}\tepsilon:{}'.format(
            sigma, pop_size, num_elite, num_episodes, epsilon))

    def evaluate(theta, numEpisodes):
        # Score `theta` with a fresh policy object and log the mean return.
        candidate = LinearSoftmax(4, 2, 2)
        candidate.parameters = theta
        average = np.mean(runEnvironment_carpole(candidate, numEpisodes))
        mean_return_log.append(average)
        return average

    policy = LinearSoftmax(4, 2, 2)
    agent = CEM(
        theta=policy.parameters,
        sigma=sigma,
        popSize=pop_size,
        numElite=num_elite,
        numEpisodes=num_episodes,
        evaluationFunction=evaluate,
        epsilon=epsilon,
    )

    for step in range(trails):
        policy.parameters = agent.train()
        print('Episode {} finished'.format(step))

    return mean_return_log
def problem1(config, iterations: int = 200):
    """
    Run CEM with a tabular softmax policy on the More-Watery 687-Gridworld.

    Args:
        config: Indexable hyperparameters, read by position as
            ``[sigma, popSize, numElite, numEpisodes, epsilon]``.
        iterations: Number of times ``agent.train()`` is called.

    Returns:
        A NumPy array with the return of every episode run by the evaluation
        function, in the order the episodes were executed.
    """
    episode_log = []

    def evaluate(p, episodes):
        # Run the candidate parameters `p` for `episodes` episodes, record
        # every individual return, and report the batch average.
        batch = [run_gridworld_episode(p) for _ in range(episodes)]
        episode_log.extend(batch)
        return np.mean(batch)

    agent_policy = TabularSoftmax(25, 4)
    agent = CEM(
        agent_policy.parameters,
        sigma=config[0],
        popSize=config[1],
        numElite=config[2],
        numEpisodes=config[3],
        evaluationFunction=evaluate,
        epsilon=config[4],
    )

    for _ in range(iterations):
        # Keep the policy in sync with whatever the agent returns.
        agent_policy.parameters = agent.train()

    return np.array(episode_log)
def problem4():
    """
    Run the CEM agent on the cart-pole domain and plot a learning curve.

    The parameter vector has ``numActions * (k + 1) ** m`` entries with
    ``m = 4``, ``numActions = 2`` and ``k = 3`` (512 zeros initially). The
    agent is trained for 30 iterations in each of 50 trials; after every trial
    the evaluator's ``endTrial()`` and the agent's ``reset()`` are called.

    Returns:
        None. Progress is printed and the evaluator is asked to plot.
    """
    print("Problem 4")

    # Policy / feature dimensions.
    state_dim, action_count, order = 4, 2, 3
    # Experiment schedule.
    trial_total, iters_per_trial = 50, 30
    # CEM hyperparameters.
    episodes_per_eval = 10
    population, elite_count = 10, 5
    epsilon, sigma = 1.25, 0.1

    evaluator = CartPoleEvaluation(k=order)
    theta_size = action_count * np.power(order + 1, state_dim)
    agent = CEM(
        np.zeros(theta_size),
        sigma,
        population,
        elite_count,
        episodes_per_eval,
        evaluator,
        epsilon,
    )

    for trial in range(trial_total):
        print("Trial ", trial)
        for step in range(iters_per_trial):
            print("Iteration ", step)
            agent.train()
        evaluator.endTrial()
        agent.reset()

    # NOTE(review): plotted once after all trials, so the file name carries
    # the index of the last trial — confirm this placement is intended.
    evaluator.plot(
        'learningCurve_cartpole_CEM_{}.png'.format(trial),
        "Learning Curve - Cartpole with CEM Agent",
    )
def problem1():
    """
    Run the CEM agent on the More-Watery 687-Gridworld and plot a learning curve.

    The parameter vector has ``numStates * numActions = 25 * 4`` entries, all
    zero initially. The agent is trained for 150 iterations in each of 50
    trials; after every trial the evaluator's ``endTrial()`` and the agent's
    ``reset()`` are called.

    Returns:
        None. Progress is printed and the evaluator is asked to plot.
    """
    print("Problem 1")

    # Tabular policy dimensions.
    state_count, action_count = 25, 4
    # Experiment schedule.
    trial_total, iters_per_trial = 50, 150
    # CEM hyperparameters.
    episodes_per_eval = 25
    population, elite_count = 20, 10
    epsilon, sigma = 1.5, 0.25

    evaluator = GridworldEvaluation()
    agent = CEM(
        np.zeros(state_count * action_count),
        sigma,
        population,
        elite_count,
        episodes_per_eval,
        evaluator,
        epsilon,
    )

    for trial in range(trial_total):
        print("Trial ", trial)
        for step in range(iters_per_trial):
            print("Iteration ", step)
            agent.train()
        evaluator.endTrial()
        agent.reset()

    # NOTE(review): plotted once after all trials, so the file name carries
    # the index of the last trial — confirm this placement is intended.
    evaluator.plot(
        'learningCurve_gridworld_CEM_{}.png'.format(trial),
        "Learning Curve - Gridworld with CEM Agent",
    )
def problem4(): """ Repeat the previous question, but using the cross-entropy method on the cart-pole domain. Notice that the state is not discrete, and so you cannot directly apply a tabular softmax policy. It is up to you to create a representation for the policy for this problem. Consider using the softmax action selection using linear function approximation as described in the notes. Report the same quantities, as well as how you parameterized the policy. """ #TODO popSize = 10 #10 numElite = 5 #5 epsilon = 4.0 #4.0 sigma = 1.0 #1.0 numEpisodes = 20 #20 numTrials = 5 #5 numIterations = 40 #40 k = 2 #2 returns = np.zeros((numTrials, numEpisodes * numIterations * popSize)) for trial in range(numTrials): np.random.seed(np.random.randint(10000)) cartpole = Cartpole() tabular_softmax = TabularSoftmaxContinuous(k, 2) theta = np.random.randn(tabular_softmax.parameters.shape[0]) count = 0 def evaluateFunction(theta, numEpisodes): nonlocal count expected_reward = 0 numTimeSteps = 1000 tabular_softmax.parameters = theta for episode in range(numEpisodes): state = cartpole.state G = 0 discount = 1 for t in range(numTimeSteps): action = tabular_softmax.samplAction(state); nextstate, reward, end = cartpole.step(action) G += (discount) * reward discount *= cartpole.gamma if end == True: break state = nextstate expected_reward += G returns[trial][count] = G cartpole.reset() count += 1 return expected_reward / numEpisodes agent = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluateFunction, epsilon) for iteration in range(numIterations): print("Trial: %d" % (trial, )) print("Iteration: %d" % (iteration, )) p = agent.train() print(returns[trial][iteration * numEpisodes * popSize : count]) # l = [[0 for i in range(5)] for j in range(5)] # for i in range(25): # s = tabular_softmax.getActionProbabilities(i) # print(s) # r = np.argmax(s) # if(r == 0): # l[i//5][i % 5] = '↑' # elif(r == 1): # l[i//5][i % 5] = '↓' # elif(r == 2): # l[i//5][i % 5] = '←' # elif(r == 3): # 
l[i//5][i % 5] = '→' # for i in range(5): # print(l[i]) print(p) plot(returns, 'Cartpole domain Cross Entropy Method (standard deviation error bars) - 5 trials', 1000)
discount *= gridworld.gamma if end == True: break elif t == 200: G = -50 break state = nextstate expected_reward += G returns[trial][count] = G gridworld.reset() count += 1 return expected_reward / numEpisodes agent = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluateFunction, epsilon) for iteration in range(numIterations): print("Trial: %d" % (trial, )) print("Iteration: %d" % (iteration, )) p = agent.train() l = [[0 for i in range(5)] for j in range(5)] for i in range(25): k = tabular_softmax.getActionProbabilities(i) print(k) r = np.argmax(k) if(r == 0): l[i//5][i % 5] = '↑'
def problem4():
    """
    Run CEM on the cart-pole domain, then plot and save the learning curve.

    A zero parameter vector of length ``2 * fourier_param ** 4`` (512) is
    optimised by a ``CEM`` agent whose evaluation callable is an
    ``EvaluateCartpole`` instance. For each of ``numTrials`` trials the agent
    is reset and trained for ``numIterations`` iterations; after every
    ``cem.train()`` the contents of ``evaluate.batchReturn`` are copied into
    the matching ``numEpisodes * popSize`` wide slice of ``results``.

    Outputs:
        * ``images/<name>.png`` - mean return per episode with standard
          deviation error bars across trials.
        * ``data/results_<name>.npy``, ``data/average_results_<name>.npy`` and
          ``data/std_results_<name>.npy`` - raw, mean and std arrays.

    ``<name>`` encodes the trial count, iteration count, population size and
    the current hour/minute. The ``images`` and ``data`` directories are
    created if they do not exist, so a long run cannot fail at the saving
    step because of a missing folder.

    Returns:
        None.
    """
    import os

    print("cem-cartpole-softmax_theta_phi")

    # NOTE(review): the result of this call is discarded; it is kept in case
    # the environment call has side effects — confirm whether it is needed.
    state = np.array([0, 0, 0, 0])
    env = Cartpole()
    env.nextState(state, 0)

    # Policy parameterisation and CEM hyperparameters.
    fourier_param = 4
    theta = np.zeros(2 * fourier_param**4)
    sigma = 1
    popSize = 10
    numElite = 3
    numEpisodes = 5
    evaluate = EvaluateCartpole()
    epsilon = 0.005
    cem = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluate, epsilon)

    # Experiment schedule.
    numTrials = 10
    numIterations = 100
    episodes_per_iteration = numEpisodes * popSize
    total_episodes = numIterations * episodes_per_iteration

    # One row per trial, one column per evaluated episode.
    results = np.zeros((numTrials, total_episodes))
    for trial in range(numTrials):
        cem.reset()
        for i in range(numIterations):
            if i % 5 == 0:
                print("cart cem: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            cem.train()
            # Presumably batchReturn holds the returns of the episodes run by
            # the train() call just above — verify against EvaluateCartpole.
            batch_start = i * episodes_per_iteration
            batch_end = batch_start + episodes_per_iteration
            results[trial, batch_start:batch_end] = np.array(evaluate.batchReturn)

    # Aggregate across trials (axis 0).
    average_results = np.average(results, axis=0)
    std_results = np.std(results, axis=0)
    max_avg = np.max(average_results)

    plt.errorbar(np.arange(average_results.shape[0]), average_results,
                 std_results, marker='.', ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0, max_avg, "max: " + str(round(max_avg, 2)), fontsize=15,
             backgroundcolor='w')

    plt_name = "cem_cartpole"
    plt_name += f"_numTrials_{numTrials}_numIter_{numIterations}_popSize_{popSize}"
    plt_name += datetime.now().strftime("_t_%H_%M")
    print("plt_name=", plt_name)

    # savefig / np.save raise FileNotFoundError when the target folder is
    # missing; create the folders up front.
    os.makedirs("images", exist_ok=True)
    os.makedirs("data", exist_ok=True)

    plt_path = "images/" + plt_name + ".png"
    plt.savefig(plt_path, dpi=200)
    plt.show()

    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)
def problem1():
    """
    Run CEM on the More-Watery 687-Gridworld, then plot and save the curve.

    A zero parameter vector of length 100 is optimised by a ``CEM`` agent
    whose evaluation callable is an ``Evaluate`` instance. For each of
    ``numTrials`` trials the agent is reset and trained for ``numIterations``
    iterations; after every ``cem.train()`` the contents of
    ``evaluate.batchReturn`` are copied into the matching
    ``numEpisodes * popSize`` wide slice of ``results``.

    Outputs:
        * ``images/<name>.png`` - mean return per episode with standard
          deviation error bars across trials.
        * ``data/results_<name>.npy``, ``data/average_results_<name>.npy`` and
          ``data/std_results_<name>.npy`` - raw, mean and std arrays.

    ``<name>`` encodes the trial count, iteration count, population size and
    the current hour/minute. The ``images`` and ``data`` directories are
    created if they do not exist, so a long run cannot fail at the saving
    step because of a missing folder.

    Returns:
        None.
    """
    import os

    print("cem-gridworld-tabular_softmax")

    # Policy parameters and CEM hyperparameters.
    theta = np.zeros(100)
    sigma = 1
    popSize = 10
    numElite = 3
    numEpisodes = 10
    evaluate = Evaluate()
    epsilon = 5
    cem = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluate, epsilon)

    # Experiment schedule.
    numTrials = 50
    numIterations = 250
    episodes_per_iteration = numEpisodes * popSize
    total_episodes = numIterations * episodes_per_iteration

    # One row per trial, one column per evaluated episode.
    results = np.zeros((numTrials, total_episodes))
    for trial in range(numTrials):
        cem.reset()
        for i in range(numIterations):
            if i % 5 == 0:
                print("cem: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            cem.train()
            # Presumably batchReturn holds the returns of the episodes run by
            # the train() call just above — verify against Evaluate.
            batch_start = i * episodes_per_iteration
            batch_end = batch_start + episodes_per_iteration
            results[trial, batch_start:batch_end] = np.array(evaluate.batchReturn)

    # Aggregate across trials (axis 0).
    average_results = np.average(results, axis=0)
    std_results = np.std(results, axis=0)
    max_avg = np.max(average_results)

    # fmt='.' draws point markers without a connecting line. The original
    # passed fmt='o' together with marker='.', which defines the marker twice
    # (matplotlib warns and the keyword wins), so '.' is the effective marker.
    plt.errorbar(np.arange(average_results.shape[0]), average_results,
                 std_results, fmt='.', ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0, max_avg, "max: " + str(round(max_avg, 2)), fontsize=15,
             backgroundcolor='w')

    plt_name = "cem_gridworld"
    plt_name += f"_numTrials_{numTrials}_numIter_{numIterations}_popSize_{popSize}"
    plt_name += datetime.now().strftime("_t_%H_%M")
    print("plt_name=", plt_name)

    # savefig / np.save raise FileNotFoundError when the target folder is
    # missing; create the folders up front.
    os.makedirs("images", exist_ok=True)
    os.makedirs("data", exist_ok=True)

    plt_path = "images/" + plt_name + ".png"
    plt.savefig(plt_path, dpi=200)
    plt.show()

    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)