def better_run_experiment(num_runs, num_episodes):
    # Same as the previous experiment function, but with alternate parameters
    all_steps = []
    agent = ag.ExpectedSarsaAgent
    env = enviro.MountainEnvironment
    total_reward_per_run = []

    for run in range(num_runs):
        start = time.time()
        if run % 5 == 0:
            print("RUN: {}".format(run))

        initial_weights = np.random.uniform(-0.001, 0)
        agent_info = {"num_tilings": 32, "num_tiles": 4, "iht_size": 4096,
                      "epsilon": 0.1, "gamma": 1, "alpha": 0.7 / 32,
                      "initial_weights": initial_weights, "num_actions": 3}
        env_info = {"min_position": -1.2, "max_position": 0.5,
                    "min_velocity": -0.07, "max_velocity": 0.07,
                    "gravity": 0.0025, "action_discount": 0.001}

        rl_glue = RLGlue(env, agent)
        rl_glue.rl_init(agent_info, env_info)
        steps_per_episode = []

        for episode in range(num_episodes):
            # 15000 is the step limit for each episode
            rl_glue.rl_episode(15000)
            steps_per_episode.append(rl_glue.num_steps)

        total_reward = np.sum(steps_per_episode) * -1
        all_steps.append(np.array(steps_per_episode))
        print("Run time: {}".format(time.time() - start))
        total_reward_per_run.append(total_reward)

    data = np.mean(total_reward_per_run)
    data_std_err = np.std(total_reward_per_run, axis=0) / np.sqrt(num_runs)

    plt.title("Expected Sarsa MountainCar (Alternate Parameters)",
              fontdict={'fontsize': 16, 'fontweight': 25}, pad=15.0)
    plt.xlabel("Episode", labelpad=5.0)
    plt.ylabel("Steps per Episode (averaged over " + str(num_runs) + " runs)",
               labelpad=10.0)
    plt.plot(np.mean(np.array(all_steps), axis=0))
    plt.show()

    np.save("ExpectedSarsa_test", np.array(all_steps))
    print("mean: ", data)
    print("standard error: ", data_std_err)

def testPolicy(policy):
    agent = testAgent(policy)
    env = Environment()
    rlglue = RLGlue(env, agent)
    del env, agent
    rlglue.rl_init()

    # rewards[step] accumulates the reward received at that step, summed across runs
    rewards = [0 for i in range(1000)]

    for run in range(1):
        rlglue.rl_init()
        # rlglue.rl_env_message('renderON')
        rlglue.rl_start()
        terminal = False
        for step in range(1000):
            if not terminal:
                r, s, a, terminal = rlglue.rl_step()
                rewards[step] += r

    # average rewards over runs (only 1 run here)
    rewards = [i / 1 for i in rewards]
    return rewards

def main(agent_info, agent_class, steps, filename):
    env_class = floating_horsetrack_environment.Environment
    rl_glue = RLGlue(env_class, agent_class)

    max_steps = steps
    step = 0
    episode_end = []
    cum_reward = 0

    agent_info.update({"actions": env_class.actions})
    rl_glue.rl_init(agent_info)

    while step < max_steps:
        rl_glue.rl_start()
        is_terminal = False

        while not is_terminal and step < max_steps:
            reward, state, action, is_terminal = rl_glue.rl_step()
            cum_reward += reward
            step += 1

        if is_terminal:
            episode_end.append(step)

        rl_glue.rl_cleanup()

    save_results(episode_end, len(episode_end), "data/{}".format(filename))

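# `save_results` is not defined in this file. The sketch below is a hypothetical
# stand-in (an assumption, not the project's actual helper): it assumes the
# (data, data_size, filename) signature used above and simply writes one value
# per line to a plain-text file.
def save_results(data, data_size, filename):
    with open(filename, "w") as f:
        for i in range(data_size):
            f.write("{}\n".format(data[i]))
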
def run_experiment(environment, agent, environment_parameters, agent_parameters,
                   experiment_parameters):
    rl_glue = RLGlue(environment, agent)

    # save sum of reward at the end of each episode
    agent_sum_reward = np.zeros((experiment_parameters["num_runs"],
                                 experiment_parameters["num_episodes"]))

    env_info = {}
    agent_info = agent_parameters

    # one agent setting
    for run in range(1, experiment_parameters["num_runs"] + 1):
        agent_info["seed"] = run
        agent_info["network_config"]["seed"] = run
        env_info["seed"] = run

        rl_glue.rl_init(agent_info, env_info)

        for episode in tqdm(range(1, experiment_parameters["num_episodes"] + 1)):
            # run episode
            rl_glue.rl_episode(experiment_parameters["timeout"])

            episode_reward = rl_glue.rl_agent_message("get_sum_reward")
            agent_sum_reward[run - 1, episode - 1] = episode_reward

    save_name = "{}".format(rl_glue.agent.name)
    if not os.path.exists('results'):
        os.makedirs('results')
    np.save("results/sum_reward_{}".format(save_name), agent_sum_reward)
    shutil.make_archive('results', 'zip', 'results')

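# Hypothetical call to the run_experiment above. Only the keys that the function
# actually reads are shown; the concrete classes and values are illustrative
# assumptions, not taken from this file.
# experiment_parameters = {"num_runs": 1, "num_episodes": 300, "timeout": 1000}
# agent_parameters = {"network_config": {}}   # must contain a "network_config" dict
# environment_parameters = {}                 # accepted but not read by run_experiment
# run_experiment(Environment, Agent, environment_parameters,
#                agent_parameters, experiment_parameters)
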
def main():
    num_eps = 200000

    agent = Agent()
    env = Environment()
    rlglue = RLGlue(env, agent)
    del agent, env

    solves = 0
    rlglue.rl_init()
    rewards = []

    for ep in range(num_eps):
        rlglue.rl_start()
        # rlglue.rl_env_message('renderON')
        terminal = False
        reward = 0

        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
            if ep > 1000:
                rlglue.rl_env_message('renderON')
                print(state)
                time.sleep(0.1)

        rewards.append(reward)

        if ep >= 99:
            if np.average(rewards[ep - 99:ep + 1]) > 0.78:
                print('solved at episode %d' % (ep + 1))
                break

def question_4():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 200
    num_runs = 50
    max_eps_steps = 1000000

    steps = np.zeros([num_runs, num_episodes])
    rewards = []

    for r in range(num_runs):
        print("run number : ", r + 1)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            reward = rlglue.total_reward()
            rewards.append(reward)

    mean = sum(rewards) / len(rewards)
    stder = statistics.stdev(rewards) / math.sqrt(len(rewards))
    print("mean:", mean)
    print("standard error:", stder)
    np.save('bonus_steps', steps)
    np.save("mean", mean)
    np.save("stder", stder)

def main(agent_info, agent_class, env_info, env_class, steps, param_info):
    # env_class = horsetrack_environment.Environment
    rl_glue = RLGlue(env_class, agent_class)

    max_steps = steps
    max_episodes = 5
    step = 0
    episodes = 0
    episode_end = np.ones(max_episodes) * max_steps
    cum_reward = 0
    # max_steps = 20000

    agent_info.update({"actions": env_class.actions})
    rl_glue.rl_init(agent_info, env_info)

    while step < max_steps and episodes < max_episodes:
        rl_glue.rl_start()
        is_terminal = False

        while not is_terminal and step < max_steps:
            reward, state, action, is_terminal = rl_glue.rl_step()
            cum_reward += reward
            step += 1

        if is_terminal:
            episode_end[episodes] = step
            episodes += 1

        rl_glue.rl_cleanup()

    save_results(episode_end, "{}".format(param_info))

def run_experiment(env_info, agent_info, num_episodes=5000, experiment_name=None,
                   plot_freq=100, true_values_file=None, value_error_threshold=1e-8):
    env = CliffWalkEnvironment
    agent = TDAgent
    rl_glue = RLGlue(env, agent)

    rl_glue.rl_init(agent_info, env_info)
    manager = Manager(env_info, agent_info, true_values_file=true_values_file,
                      experiment_name=experiment_name)

    for episode in range(1, num_episodes + 1):
        rl_glue.rl_episode(0)  # no step limit
        if episode % plot_freq == 0:
            values = rl_glue.agent.agent_message("get_values")
            manager.visualize(values, episode)

    values = rl_glue.agent.agent_message("get_values")
    if true_values_file is not None:
        # Grading: the Manager checks that the values computed by the TD agent match
        # the true values (within some small allowance) across the states. In addition,
        # it also checks whether the root mean squared value error is close to 0.
        manager.run_tests(values, value_error_threshold)

    return values

def main():
    num_eps = 5000
    num_runs = 10

    random.seed(0)
    np.random.seed(0)

    agent = Agent()
    env = Environment()
    rlglue = RLGlue(env, agent)
    del agent, env

    for run in range(num_runs):
        rlglue.rl_init()
        performances = []

        for ep in range(num_eps):
            rlglue.rl_start()
            # rlglue.rl_env_message('renderON')
            terminal = False
            while not terminal:
                reward, state, action, terminal = rlglue.rl_step()

            # Find the first policy that performs at 100%
            performance = testPolicy(rlglue.rl_agent_message('policy')) * 100
            performances.append(performance)
            if performance >= 100:
                # print(rlglue.rl_agent_message('policy'))
                print('Episode: %d' % (ep + 1))
                break

        plt.plot(performances)

    plt.savefig('test.png')

def run_experiment(env_info, agent_info, num_episodes=5000, experiment_name=None,
                   plot_freq=100, true_values_file=None, value_error_threshold=1e-8):
    env = CliffWalkEnvironment
    agent = TDAgent
    rl_glue = RLGlue(env, agent)

    rl_glue.rl_init(agent_info, env_info)
    manager = Manager(env_info, agent_info, true_values_file=true_values_file,
                      experiment_name=experiment_name)

    for episode in range(1, num_episodes + 1):
        rl_glue.rl_episode(0)  # no step limit
        if episode % plot_freq == 0:
            values = rl_glue.agent.agent_message("get_values")
            manager.visualize(values, episode)

    values = rl_glue.agent.agent_message("get_values")
    return values

def question_3():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 1000000

    for _ in range(num_runs):
        rlglue.rl_init()
        for i in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            print(i)

    fout = open('value', 'w')
    steps = 50
    w, iht = rlglue.rl_agent_message("ValueFunction")
    Q = np.zeros([steps, steps])

    for i in range(steps):
        for j in range(steps):
            values = []
            for a in range(3):
                value = 0
                for index in tiles(iht, 8,
                                   [8 * (-1.2 + (i * 1.7 / steps)) / 1.7,
                                    8 * (-0.07 + (j * 0.14 / steps)) / 0.14],
                                   [a]):
                    value -= w[index]
                values.append(value)
            height = max(values)
            fout.write(repr(height) + ' ')
            Q[j][i] = height
        fout.write('\n')
    fout.close()
    np.save("value", Q)

def main():
    env_class = horsetrack_environment.Environment
    agent_class = random_agent.Agent
    rl_glue = RLGlue(env_class, agent_class)

    num_episodes = 1000
    max_steps = 100000

    print("\tPrinting one dot for every run: {}".format(num_episodes), end=' ')
    print("total runs to complete.")

    total_steps = [0 for _ in range(max_steps)]

    for i in range(num_episodes):
        rl_glue.rl_init(agent_info={"actions": env_class.actions})
        rl_glue.rl_start()

        is_terminal = False
        while rl_glue.num_steps < max_steps and not is_terminal:
            reward, state, action, is_terminal = rl_glue.rl_step()
            # optimal_action[num_steps] += 1 if "action is optimal" else 0

        total_steps[i] = rl_glue.num_steps
        rl_glue.rl_cleanup()
        print(".", end='')
        sys.stdout.flush()

    # prop_optimal = [num_optimal / num_episodes for num_optimal in optimal_action]
    save_results(total_steps, len(total_steps), "RL_EXP_OUT.dat")
    print("\nDone")

def question_1():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    np.random.seed(0)

    num_episodes = 200
    see_eps = [157]
    num_runs = 1
    max_eps_steps = 100000

    # test with various step sizes (alphas) for the agent
    stepSizes = np.linspace(0.01, 1, 100)
    # best step size found so far (comment out to test many)
    stepSizes = [0.559184]

    # separate run for each step size
    for step in stepSizes:
        # initialize the agent and RL-Glue with the chosen step size
        rlglue.rl_init()
        rlglue.rl_agent_message('step:' + str(step))

        # keep track of the total reward for each episode
        total_rewards = []
        for ep in range(num_episodes):
            # render only selected episodes
            if ep in see_eps:
                rlglue.rl_env_message('rOFF')
            if ep + 1 in see_eps:
                rlglue.rl_env_message('rON')
                print("Episode %d" % (ep + 1))

            # initialize for the episode
            rlglue.rl_start()
            terminal = False
            total_reward = 0

            # run the episode and accumulate the total reward
            while not terminal:
                reward, state, action, terminal = rlglue.rl_step()
                total_reward += reward
            total_rewards.append(total_reward)

            # calculate the average reward of the last 100 episodes
            if ep >= 99:
                total = np.sum(total_rewards[ep - 99:ep + 1])
                avg = total / 100
                # check whether the results indicate the problem is solved
                if avg > -110:
                    print("Solved at episode %d, avg reward: %f" % (ep + 1, avg))
                    break

    # close the environment
    environment.close()

def main(data_output_location="new_data"):
    env_class = horsetrack_environment.Environment
    agent_class = random_agent.Agent

    agent_name = agent_class.__module__[agent_class.__module__.find(".") + 1:]
    environment_name = env_class.__module__[env_class.__module__.find(".") + 1:]

    rl_glue = RLGlue(env_class, agent_class)

    # num_episodes = 2000
    # max_steps = 1000
    max_total_steps = 100_000

    for epsilon in [0.0, 0.1]:
        for alpha in [2, 1, 0.5, 0.25, 0.125, 0.0625]:
            print("Running Agent: {} on Environment: {}.".format(agent_name,
                                                                 environment_name))

            agent_init_info = {
                "actions": [-1, 1],
                "world_size": 100,
                "epsilon": epsilon,
                "alpha": alpha
            }

            termination_times = []
            rl_glue.rl_init(agent_init_info=agent_init_info)
            step_counter = 0

            while step_counter < max_total_steps:
                rl_glue.rl_start()
                is_terminal = False

                while step_counter < max_total_steps and not is_terminal:
                    reward, state, action, is_terminal = rl_glue.rl_step()
                    step_counter += 1

                rl_glue.rl_cleanup()
                # print(".", end='')
                sys.stdout.flush()

                if is_terminal:
                    termination_times.append(step_counter)

            epoch_datetime = int((datetime.datetime.now()
                                  - datetime.datetime.utcfromtimestamp(0)).total_seconds())
            save_results(termination_times, len(termination_times),
                         "{}/{}_{}__{}__epsilon{}__alpha{}.dat".format(
                             data_output_location, epoch_datetime, agent_name,
                             environment_name, epsilon, alpha))

    print("\nDone")

def run_experiment():
    # specify hyper-parameters
    num_runs = 1
    max_episodes = 1000000
    max_steps_per_episode = 100
    num_states = 181
    num_actions = 2
    alpha = 0.01
    eps = 0.1
    Q1 = 0

    results = np.zeros(max_episodes)
    results_run = 0

    agent = RandomAgent(num_states, num_actions, alpha, eps, Q1)
    environment = BlackJack()
    rlglue = RLGlue(environment, agent)

    print("\nPrinting one dot for every run: {0} total runs to complete".format(num_runs))

    for run in range(num_runs):
        np.random.seed(run)
        results_run = 0.0
        rlglue.rl_init()

        for e in range(1, max_episodes + 1):
            rlglue.rl_start()
            for s in range(max_steps_per_episode):
                r, _, _, terminal = rlglue.rl_step()
                results_run += r
                results[e - 1] += r
                if terminal:
                    break

            if e % 10000 == 0:
                print("\nEpisode {}: average return up to this episode is {}, and the policy is"
                      .format(e, results_run / e))
                print(rlglue.rl_agent_message("printPolicy"))

        print(".")

    print("Average return over experiment: {}".format((results / num_runs).mean()))

    # save the final policy to file -- change the file name as necessary
    with open("policy.txt", 'w') as f:
        f.write(rlglue.rl_agent_message("printPolicy"))

    # save all the experiment data for analysis -- change the file name as necessary
    save_results(results / num_runs, max_episodes, "RL_EXP_OUT.dat")

def question_1():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 200
    num_runs = 50
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])

    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            # print("Episode number: " + str(e))
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            # print("Number of steps: " + str(steps))
            # print(steps[r, e])

    np.save('steps', steps)
    plotGraph()

    del agent, environment, rlglue

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])

    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            print("Episode number: " + str(e))
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            # print("Number of steps: " + str(steps))
            # print(steps[r, e])

    # np.save('steps', steps)
    # plotGraph()
    rlglue.rl_agent_message("plot3DGraph")

def testPolicy(policy):
    env = Environment()
    agent = testAgent(policy)
    rlglue = RLGlue(env, agent)
    rlglue.rl_init()
    # rlglue.rl_env_message('renderON')

    performance = 0
    for ep in range(100):
        rlglue.rl_start()
        terminal = False
        reward = None
        while not terminal:
            reward, state, action, terminal = rlglue.rl_step()
        if reward > 0:
            performance += 1

    return performance / 100

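# `testAgent` is not defined in this file. Below is a minimal hypothetical sketch
# (an assumption about its behavior, not the project's actual class): an RL-Glue
# style agent that does no learning and simply follows the fixed policy it is
# given, where `policy` is assumed to map a state to an action. The exact
# agent-interface signatures depend on the RLGlue variant used in this project.
class testAgent:
    def __init__(self, policy):
        self.policy = policy

    def agent_init(self):
        pass

    def agent_start(self, state):
        return self.policy[state]

    def agent_step(self, reward, state):
        return self.policy[state]

    def agent_end(self, reward):
        pass

    def agent_message(self, message):
        pass
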
def question_3():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    max_eps_steps = 100000
    num_episodes = 1000
    num_runs = 1
    numActions = 3

    rlglue.rl_init()
    for e in range(num_episodes):
        rlglue.rl_episode(max_eps_steps)

    weights = rlglue.rl_agent_message("3D plot of the cast-to-go")

    fout = open('value', 'w')
    steps = 50
    z = np.zeros((50, 50))
    for i in range(steps):
        for j in range(steps):
            values = []
            for a in range(numActions):
                tile = [8 * (-1.2 + (i * 1.7 / steps)) / 1.7,
                        8 * (-0.07 + (j * 0.14 / steps)) / 0.14]
                inds = agent.get_index(tile, a)
                values.append(np.sum([weights[idx] for idx in inds]))
            height = max(values)
            z[j][i] = -height
            fout.write(repr(-height) + ' ')
        fout.write('\n')
    fout.close()

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    x = np.arange(-1.2, 0.5, 1.7 / 50)
    y = np.arange(-0.07, 0.07, 0.14 / 50)
    x, y = np.meshgrid(x, y)
    ax.set_xticks([-1.2, 0.5])
    ax.set_yticks([0.07, -0.07])
    ax.set_ylabel('Velocity')
    ax.set_xlabel('Position')
    ax.set_zlabel('Cost-To-Go')
    ax.plot_surface(x, y, z)
    plt.savefig('cost-to-go.png')
    plt.show()
    np.save('steps', steps)

def tiling(real_value):
    environment = RandomWalkEnvironment()
    agent = Agent2()
    rl = RLGlue(environment, agent)

    num_runs = 30
    error = np.zeros(5000)

    for run in range(num_runs):
        rl.rl_init()
        for episode in range(2000):
            rl.rl_episode(10000)
            estimate = rl.rl_agent_message("ValueFunction")
            # accumulate the root-mean-squared value error for this episode
            error[episode] += np.sqrt(np.mean(np.power(real_value - estimate, 2)))
        rl.rl_cleanup()

    # average the error over runs and write one value per line
    file2 = open("tiling_output.txt", "w")
    for i in range(2000):
        file2.write(format(error[i] / num_runs) + "\n")
    file2.close()

def question_1(num_episodes):
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    max_eps_steps = 100000
    steps = np.zeros(num_episodes)

    rlglue.rl_init()
    for e in tqdm(range(num_episodes)):
        rlglue.rl_episode(max_eps_steps)
        steps[e] = rlglue.num_ep_steps()
        # print(steps[e])

    return steps

def question_1():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 200
    num_runs = 5
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])

    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            # print(steps[r, e])

    np.save('steps', steps)

def testPolicy(policy):
    agent = testAgent(policy)
    env = Environment()
    rlglue = RLGlue(env, agent)
    del env, agent
    rlglue.rl_init()

    for run in range(1):
        rlglue.rl_init()
        rlglue.rl_env_message('renderON')
        rlglue.rl_start()

        total_reward = 0
        terminal = False
        while not terminal:
            r, s, a, terminal = rlglue.rl_step()
            total_reward += r

    return total_reward

def run_experiment(env_info, agent_info, num_episodes=5000,
                   value_error_threshold=1e-8, plot_freq=10):
    env = GridEnvironment
    agent = TDAgent
    rl_glue = RLGlue(env, agent)

    rl_glue.rl_init(agent_info, env_info)
    steps = []

    for episode in range(1, num_episodes + 1):
        rl_glue.rl_episode(0)  # no step limit
        steps.append(rl_glue.agent.agent_message("get_steps"))
        if episode % plot_freq == 0:
            values = rl_glue.agent.agent_message("get_values")
            print(rl_glue.environment.env_message("get_grid_state"))

    rl_glue.rl_cleanup()
    values = rl_glue.agent.agent_message("get_values")
    return [values, steps]

def question_2():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    max_eps_steps = 100000
    num_episodes = 1000

    rlglue.rl_init()
    for _ in tqdm(range(num_episodes)):
        rlglue.rl_episode(max_eps_steps)

    q3_plot = rlglue.rl_agent_message("plot")

    fig = plt.figure()
    # fig.gca(projection='3d') is no longer supported in recent matplotlib
    ax = fig.add_subplot(111, projection='3d')
    X, Y = np.meshgrid(q3_plot[0], q3_plot[1])
    surf = ax.plot_surface(X, Y, q3_plot[2])
    ax.set_xlim(q3_plot[0][0], q3_plot[0][-1])
    ax.set_ylim(q3_plot[1][0], q3_plot[1][-1])
    plt.show()

def experiment1():
    agent = RandomAgent()
    environment = Environment1D()
    rlg = RLGlue(environment, agent)

    max_steps = 1000   # max number of steps in an episode
    num_runs = 2000    # number of repetitions of the experiment

    optimal_action = np.zeros(max_steps)

    for k in range(num_runs):
        # initialize RL-Glue
        rlg.rl_init()  # env_init + agent_init
        rlg.rl_start()
        for i in range(max_steps):
            # step
            action = rlg.rl_step()[2]
            if action == environment.env_message():
                optimal_action[i] += 1

    ratio_optimal_action = optimal_action / num_runs
    return ratio_optimal_action

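# Hypothetical usage of experiment1 above: plot the per-step proportion of optimal
# actions. This helper is an assumption (not part of the original experiments);
# it assumes matplotlib.pyplot is imported as plt, as elsewhere in this file.
def plot_experiment1():
    ratio_optimal_action = experiment1()
    plt.plot(ratio_optimal_action)
    plt.xlabel("Step")
    plt.ylabel("Proportion of optimal actions")
    plt.show()
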
def question_3():
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])

    # only 1 run
    for r in range(num_runs):
        print("1000 episode run : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()

    # get the value function: [X, Y, Z] represents position, velocity, state-value
    Return = rlglue.rl_agent_message(1)
    return Return

def question_1():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000
    num_action = 3

    for r in range(num_runs):
        print("run number : ", r)
        # np.random.seed(r)
        rlglue.rl_init()
        for _ in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)

    weight = rlglue.rl_agent_message('get weight')

    # algorithm from the assignment
    # fout = open('value', 'w')
    steps = 50
    neg_q_hat = np.zeros((steps, steps))
    for i in range(steps):
        for j in range(steps):
            values = []
            position = -1.2 + (i * 1.7 / steps)
            velocity = -0.07 + (j * 0.14 / steps)
            for a in range(num_action):
                tile_idx = agent.plot_get_feature(position, velocity, a)
                q_hat = np.sum(weight[tile_idx])
                values.append(q_hat)
            height = np.max(values)
            neg_q_hat[j][i] = -height
            # fout.write(repr(-height) + ' ')
        # fout.write('\n')
    # fout.close()

    np.save('neg_q_hat', neg_q_hat)

def main():
    # alternative environment:
    # env = drifter_distractor_env.Environment
    env = switched_drifter_distractor_env.Environment
    agents = [random_agent.Agent, weight_change_agent.Agent]

    agent_types = ["absolute_error", "squared_error", "weight_change"]

    for agent_type in agent_types:
        agent = agents[1]
        agent_info = {"num_actions": 4,
                      "action_selection": "softmax",
                      "agent_type": agent_type}
        env_info = {}

        num_runs = 1
        num_steps = 100000

        actions = [0 for _ in range(4)]
        errors = []

        for run in range(num_runs):
            rl_glue = RLGlue(env, agent)
            rl_glue.rl_init(agent_info, env_info)
            rl_glue.rl_start()

            for step in range(num_steps):
                reward, state, action, is_terminal = rl_glue.rl_step()
                actions[action] += 1

        # np.save("data/squared_error", rl_glue.agent.track_actions)
        np.save("data/{}".format(agent_type), rl_glue.agent.track_actions)

        # print(rl_glue.environment.arm_1)
        # print(rl_glue.environment.arm_2)
        # print(rl_glue.environment.arm_3)
        # print(rl_glue.environment.arm_4)
        print(actions)

def part3():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    num_episodes = 200
    num_runs = 1
    max_eps_steps = 100000

    steps = np.zeros([num_runs, num_episodes])

    for r in range(num_runs):
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            steps[r, e] = rlglue.num_ep_steps()
            # print(steps[r, e])

    np.save('steps', steps)

    # write the negated action values (cost-to-go) over a 50x50 position/velocity grid
    fout = open('value', 'w')
    steps = 50
    num_of_actions = 3
    heights = np.zeros((steps, steps))
    for i in range(steps):
        for j in range(steps):
            q = []
            pos = -1.2 + (i * 1.7 / steps)
            vel = -0.07 + (j * 0.14 / steps)
            tile = (pos, vel)
            for a in range(num_of_actions):
                inds = agent.F(tile, a)
                q.append(np.sum(agent.weights[inds]))
            height = max(q)
            heights[j][i] = height
            fout.write(repr(-height) + ' ')
        fout.write('\n')
    fout.close()

    np.save('heights', heights)
    np.save('steps', steps)

def question_3():
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 100000

    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)

    for r in range(num_runs):
        start = time.time()
        print("run number : ", r)
        rlglue.rl_init()
        for e in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
        end = time.time()
        print(str(end - start) + " seconds elapsed")

    action_vals, pos, vel = rlglue.rl_agent_message("return info")
    action_vals = np.multiply(action_vals, -1)

    fig = plt.figure()
    # fig.gca(projection='3d') is no longer supported in recent matplotlib
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(pos, vel, action_vals)
    plt.show()