def run_agents_on_mdp(agents, mdp, instances=5, episodes=100, steps=200, clear_old_results=True):
    '''
    Args:
        agents (list of Agents): See agents/AgentClass.py (and friends).
        mdp (MDP): See mdp/MDPClass.py for the abstract class. Specific MDPs in tasks/*.
        instances (int) [opt]: Number of times to run each agent (for confidence intervals).
        episodes (int) [opt]: Number of episodes for each learning instance.
        steps (int) [opt]: Number of steps per episode.
        clear_old_results (bool) [opt]: If true, removes all results files in the relevant results dir.

    Summary:
        Runs each agent on the given mdp according to the given parameters.
        Stores results in results/<agent_name>.csv and automatically
        generates a plot and opens it.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            clear_old_results=clear_old_results)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    times = _main_experiment_loop(agents, mdp, instances, episodes, steps, experiment)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(times[agent]) + " seconds."
    print "-------------\n"

    if not isinstance(mdp, GymMDP):
        experiment.make_plots(open_plot=True)
def play_markov_game(agent_ls, markov_game_mdp, instances=10, episodes=100, steps=30, verbose=False, open_plot=True):
    '''
    Args:
        agent_ls (list of Agents): See agents/AgentClass.py (and friends).
        markov_game_mdp (MarkovGameMDP): See mdp/markov_games/MarkovGameMDPClass.py.
        instances (int): Number of times to run each agent (for confidence intervals).
        episodes (int): Number of episodes for each learning instance.
        steps (int): Number of steps per episode.
        verbose (bool)
        open_plot (bool): If true opens plot.
    '''
    # Put into dict.
    agent_dict = {}
    for a in agent_ls:
        agent_dict[a.name] = a

    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agent_dict,
                            mdp=markov_game_mdp,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_markov_game=True)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    start = time.clock()

    # For each instance of the agent.
    for instance in range(1, instances + 1):
        print("\tInstance " + str(instance) + " of " + str(int(instances)) + ".")
        reward_dict = defaultdict(str)
        action_dict = {}

        for episode in range(1, episodes + 1):
            if verbose:
                sys.stdout.write("\tEpisode %s of %s" % (episode, episodes))
                sys.stdout.write("\b" * len("\tEpisode %s of %s" % (episode, episodes)))
                sys.stdout.flush()

            # Compute initial state/reward.
            state = markov_game_mdp.get_init_state()

            for step in range(steps):

                # Compute each agent's policy.
                for a in agent_dict.values():
                    agent_reward = reward_dict[a.name]
                    agent_action = a.act(state, agent_reward)
                    action_dict[a.name] = agent_action

                # Terminal check.
                if state.is_terminal():
                    experiment.add_experience(agent_dict, state, action_dict, defaultdict(int), state)
                    continue

                # Execute in MDP.
                reward_dict, next_state = markov_game_mdp.execute_agent_action(action_dict)

                # Record the experience.
                experiment.add_experience(agent_dict, state, action_dict, reward_dict, next_state)

                # Update pointer.
                state = next_state

            # A final update.
            for a in agent_dict.values():
                agent_reward = reward_dict[a.name]
                agent_action = a.act(state, agent_reward)
                action_dict[a.name] = agent_action

                # Process that learning instance's info at end of learning.
                experiment.end_of_episode(a.name)

            # Reset the MDP, tell the agent the episode is over.
            markov_game_mdp.reset()

    # A final update.
    for a in agent_dict.values():
        # Reset the agent and track experiment info.
        experiment.end_of_instance(a.name)
        a.reset()

    # Time stuff.
    print("Experiment took " + str(round(time.clock() - start, 2)) + " seconds.")

    experiment.make_plots(open_plot=open_plot)
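# Usage sketch (assumption, not from the source above): a minimal two-player call to
# play_markov_game. RockPaperScissorsMDP and the agent constructors are assumed to be
# available under simple_rl.tasks / simple_rl.agents; any two Agent instances with
# distinct names work, since actions and rewards are keyed by agent name.
def _example_play_markov_game():
    from simple_rl.agents import QLearningAgent, FixedPolicyAgent
    from simple_rl.tasks import RockPaperScissorsMDP

    markov_game = RockPaperScissorsMDP()
    ql_agent = QLearningAgent(actions=markov_game.get_actions(), name="q-learning")
    fixed_agent = FixedPolicyAgent(policy=lambda s: "rock", name="always-rock")  # "rock" assumed to be a valid action

    play_markov_game([ql_agent, fixed_agent], markov_game, instances=10, episodes=100, steps=30)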
def run_agents_on_mdp(agents,
                      mdp,
                      instances=5,
                      episodes=100,
                      steps=200,
                      clear_old_results=True,
                      rew_step_count=1,
                      track_disc_reward=False,
                      open_plot=True,
                      verbose=False,
                      reset_at_terminal=False,
                      cumulative_plot=True,
                      dir_for_plot="results",
                      experiment_name_prefix="",
                      track_success=False,
                      success_reward=None):
    '''
    Args:
        agents (list of Agents): See agents/AgentClass.py (and friends).
        mdp (MDP): See mdp/MDPClass.py for the abstract class. Specific MDPs in tasks/*.
        instances (int): Number of times to run each agent (for confidence intervals).
        episodes (int): Number of episodes for each learning instance.
        steps (int): Number of steps per episode.
        clear_old_results (bool): If true, removes all results files in the relevant results dir.
        rew_step_count (int): Number of steps before recording reward.
        track_disc_reward (bool): If true, track (and plot) discounted reward.
        open_plot (bool): If true opens the plot at the end.
        verbose (bool): If true, prints status bars per episode/instance.
        reset_at_terminal (bool): If true sends the agent to the start state after terminal.
        cumulative_plot (bool): If true makes a cumulative plot, otherwise plots avg. reward per timestep.
        dir_for_plot (str): Path to the directory used to store results and plots.
        experiment_name_prefix (str): Adds this to the beginning of the usual experiment name.
        track_success (bool): If true, tracks whether each run is successful and generates an additional success plot at the end.
        success_reward (int): If set, determines the success criteria.

    Summary:
        Runs each agent on the given mdp according to the given parameters.
        Stores results in results/<agent_name>.csv and automatically
        generates a plot and opens it.
    '''
    if track_success and success_reward is None:
        raise ValueError("(simple_rl): run_agents_on_mdp must set param @success_reward when @track_success=True.")

    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            clear_old_results=clear_old_results,
                            track_disc_reward=track_disc_reward,
                            count_r_per_n_timestep=rew_step_count,
                            cumulative_plot=cumulative_plot,
                            dir_for_plot=dir_for_plot,
                            experiment_name_prefix=experiment_name_prefix,
                            track_success=track_success,
                            success_reward=success_reward)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    time_dict = defaultdict(float)

    # Learn.
    for agent in agents:
        print(str(agent) + " is learning.")
        start = time.clock()

        # For each instance.
        for instance in range(1, instances + 1):
            print(" Instance " + str(instance) + " of " + str(instances) + ".")
            sys.stdout.flush()
            run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose,
                                    track_disc_reward, reset_at_terminal=reset_at_terminal)

            # Reset the agent.
            agent.reset()
            mdp.end_of_instance()

        # Track how much time this agent took.
        end = time.clock()
        time_dict[agent] = round(end - start, 3)
        print()

    # Time stuff.
    print("\n--- TIMES ---")
    for agent in time_dict.keys():
        print(str(agent) + " agent took " + str(round(time_dict[agent], 2)) + " seconds.")
    print("-------------\n")

    experiment.make_plots(open_plot=open_plot)
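# Usage sketch (not part of the library code above): a typical call to run_agents_on_mdp.
# The imports assume the standard simple_rl package layout (simple_rl.agents, simple_rl.tasks);
# adjust them to the local codebase if the layout differs.
def _example_run_agents_on_mdp():
    from simple_rl.agents import QLearningAgent, RandomAgent
    from simple_rl.tasks import GridWorldMDP

    # A small grid world and two baseline agents.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)])
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run 5 instances of 100 episodes x 150 steps each; results land in results/<agent_name>.csv.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=5, episodes=100, steps=150)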
def run_agents_lifelong(agents,
                        mdp_distr,
                        samples=5,
                        episodes=1,
                        steps=100,
                        clear_old_results=True,
                        open_plot=True,
                        verbose=False,
                        track_disc_reward=False,
                        reset_at_terminal=False,
                        resample_at_terminal=False,
                        cumulative_plot=True,
                        dir_for_plot="results"):
    '''
    Args:
        agents (list)
        mdp_distr (MDPDistribution)
        samples (int)
        episodes (int)
        steps (int)
        clear_old_results (bool)
        open_plot (bool)
        verbose (bool)
        track_disc_reward (bool): If true records and plots discounted reward, discounted over episodes.
            So, if each episode is 100 steps, then episode 2 will start discounting as though it's step 101.
        reset_at_terminal (bool)
        resample_at_terminal (bool)
        cumulative_plot (bool)
        dir_for_plot (str)

    Summary:
        Runs each agent on the MDP distribution according to the given parameters.
        If @mdp_distr has a non-zero horizon, then gamma is set to 1 and @steps is ignored.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"samples": samples, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp_distr,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_lifelong=True,
                            clear_old_results=clear_old_results,
                            track_disc_reward=track_disc_reward,
                            cumulative_plot=cumulative_plot,
                            dir_for_plot=dir_for_plot)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    start = time.clock()

    times = defaultdict(float)

    # Learn.
    for agent in agents:
        print(str(agent) + " is learning.")
        start = time.clock()

        # --- SAMPLE NEW MDP ---
        for new_task in range(samples):
            print(" Sample " + str(new_task + 1) + " of " + str(samples) + ".")

            # Sample the MDP.
            mdp = mdp_distr.sample()

            # Run the agent.
            hit_terminal, total_steps_taken, _ = run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment,
                                                                         verbose, track_disc_reward,
                                                                         reset_at_terminal, resample_at_terminal)

            # If we resample at terminal, keep grabbing MDPs until we're done.
            while resample_at_terminal and hit_terminal and total_steps_taken < steps:
                mdp = mdp_distr.sample()
                hit_terminal, steps_taken, _ = run_single_agent_on_mdp(agent, mdp, episodes,
                                                                       steps - total_steps_taken, experiment,
                                                                       verbose, track_disc_reward,
                                                                       reset_at_terminal, resample_at_terminal)
                total_steps_taken += steps_taken

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print("\n--- TIMES ---")
    for agent in times.keys():
        print(str(agent) + " agent took " + str(round(times[agent], 2)) + " seconds.")
    print("-------------\n")

    experiment.make_plots(open_plot=open_plot)
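# Usage sketch (assumption): calling run_agents_lifelong with a small MDPDistribution.
# The MDPDistribution constructor taking an {mdp: probability} dict, and the
# GridWorldMDP / QLearningAgent imports, are assumptions about the surrounding
# simple_rl layout rather than facts established by the code above.
def _example_run_agents_lifelong():
    from simple_rl.agents import QLearningAgent
    from simple_rl.mdp import MDPDistribution
    from simple_rl.tasks import GridWorldMDP

    # Two grid worlds that differ only in goal location, sampled uniformly.
    mdp_a = GridWorldMDP(width=4, height=3, goal_locs=[(4, 3)])
    mdp_b = GridWorldMDP(width=4, height=3, goal_locs=[(1, 3)])
    mdp_distr = MDPDistribution({mdp_a: 0.5, mdp_b: 0.5})

    ql_agent = QLearningAgent(actions=mdp_a.get_actions())
    run_agents_lifelong([ql_agent], mdp_distr, samples=5, episodes=1, steps=100)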
def run_agents_seq(agents,
                   mdp_list,
                   samples=5,
                   episodes=1,
                   steps=100,
                   clear_old_results=True,
                   open_plot=True,
                   verbose=False,
                   track_disc_reward=False,
                   reset_at_terminal=False,
                   resample_at_terminal=False,
                   cumulative_plot=True,
                   dir_for_plot="results"):
    '''
    Args:
        agents (list)
        mdp_list: Sequence of MDPs, indexed via get_mdp(i).
        samples (int)
        episodes (int)
        steps (int)
        clear_old_results (bool)
        open_plot (bool)
        verbose (bool)
        track_disc_reward (bool): If true records and plots discounted reward, discounted over episodes.
            So, if each episode is 100 steps, then episode 2 will start discounting as though it's step 101.
        reset_at_terminal (bool)
        resample_at_terminal (bool)
        cumulative_plot (bool)
        dir_for_plot (str)

    Summary:
        Runs each agent on the given sequence of MDPs according to the given parameters.
        If the MDPs have a non-zero horizon, then gamma is set to 1 and @steps is ignored.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"samples": samples, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp_list,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_lifelong=True,
                            clear_old_results=clear_old_results,
                            track_disc_reward=track_disc_reward,
                            cumulative_plot=cumulative_plot,
                            dir_for_plot=dir_for_plot)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    start = time.clock()

    times = defaultdict(float)

    # Learn.
    for agent in agents:
        print(str(agent) + " is learning.")
        start = time.clock()

        # --- SAMPLE NEW MDP ---
        for new_task in range(samples):
            print(" Sample " + str(new_task + 1) + " of " + str(samples) + ".")

            # Grab the next MDP in the sequence.
            mdp = mdp_list.get_mdp(new_task)
            print("goal")
            print(mdp.goal_locs)
            # print(mdp.lava_locs)

            # Run the agent.
            hit_terminal, total_steps_taken, _ = run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment,
                                                                         verbose, track_disc_reward,
                                                                         reset_at_terminal, resample_at_terminal)

            # (Commented-out RMSE logging of the learned vs. true parameter tensor.)
            # if type(agent) == TabularRMaxAgent or type(agent) == PatternLearningAgent:
            #     with open('./rmse.txt', 'a') as f:
            #         f.write(agent.name + str(np.linalg.norm(mdp.get_par_tensor() - agent.get_par_tensor())) + "\n")
            # mdp.get_patterns()

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print("\n--- TIMES ---")
    for agent in times.keys():
        print(str(agent) + " agent took " + str(round(times[agent], 2)) + " seconds.")
    print("-------------\n")

    experiment.make_plots(open_plot=open_plot)
def run_agents_on_mdp(agents,
                      mdp,
                      instances=5,
                      episodes=100,
                      steps=200,
                      clear_old_results=True,
                      rew_step_count=1,
                      is_rec_disc_reward=False,
                      open_plot=True,
                      verbose=False,
                      reset_at_terminal=False):
    '''
    Args:
        agents (list of Agents): See agents/AgentClass.py (and friends).
        mdp (MDP): See mdp/MDPClass.py for the abstract class. Specific MDPs in tasks/*.
        instances (int) [opt]: Number of times to run each agent (for confidence intervals).
        episodes (int) [opt]: Number of episodes for each learning instance.
        steps (int) [opt]: Number of steps per episode.
        clear_old_results (bool) [opt]: If true, removes all results files in the relevant results dir.
        rew_step_count (int): Number of steps before recording reward.
        is_rec_disc_reward (bool): If true, track (and plot) discounted reward.
        open_plot (bool): If true opens the plot at the end.
        verbose (bool): If true, prints status bars per episode/instance.
        reset_at_terminal (bool): If true sends the agent to the start state after terminal.

    Summary:
        Runs each agent on the given mdp according to the given parameters.
        Stores results in results/<agent_name>.csv and automatically
        generates a plot and opens it.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            clear_old_results=clear_old_results,
                            is_rec_disc_reward=is_rec_disc_reward,
                            count_r_per_n_timestep=rew_step_count)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    time_dict = defaultdict(float)

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # For each instance.
        for instance in xrange(1, instances + 1):
            sys.stdout.flush()
            print " Instance " + str(instance) + " of " + str(instances) + "."
            run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose,
                                    is_rec_disc_reward, reset_at_terminal=reset_at_terminal)

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        time_dict[agent] = round(end - start, 3)
        print

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in time_dict.keys():
        print str(agent) + " agent took " + str(round(time_dict[agent], 2)) + " seconds."
    print "-------------\n"

    # if not isinstance(mdp, GymMDP):
    experiment.make_plots(open_plot=open_plot)
def run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=5,
                          episodes=1,
                          steps=100,
                          clear_old_results=True,
                          open_plot=True,
                          verbose=False,
                          is_rec_disc_reward=False,
                          reset_at_terminal=False):
    '''
    Args:
        agents (list)
        mdp_distr (MDPDistribution)
        task_samples (int)
        episodes (int)
        steps (int)
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"task_samples": task_samples, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp_distr,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_multi_task=True,
                            clear_old_results=clear_old_results,
                            is_rec_disc_reward=is_rec_disc_reward)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    start = time.clock()

    times = defaultdict(float)

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # --- SAMPLE NEW MDP ---
        for new_task in xrange(task_samples):
            print " Sample " + str(new_task + 1) + " of " + str(task_samples) + "."

            # Sample the MDP.
            mdp = mdp_distr.sample()

            # Run the agent.
            run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose,
                                    is_rec_disc_reward, reset_at_terminal)

            # Reset the agent.
            agent.reset()
            if "rmax" in agent.name:
                agent._reset_reward()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(round(times[agent], 2)) + " seconds."
    print "-------------\n"

    experiment.make_plots(open_plot=open_plot)
def run_agents_lifelong(agents,
                        mdp_distribution,
                        name_identifier=None,
                        n_instances=1,
                        n_tasks=5,
                        n_episodes=1,
                        n_steps=100,
                        parallel_run=False,
                        n_processes=None,
                        clear_old_results=True,
                        track_disc_reward=False,
                        reset_at_terminal=False,
                        cumulative_plot=True,
                        dir_for_plot='results',
                        verbose=False,
                        do_run=True,
                        do_plot=False,
                        confidence=.9,
                        open_plot=False,
                        plot_title=True,
                        plot_legend=True,
                        episodes_moving_average=False,
                        episodes_ma_width=10,
                        tasks_moving_average=False,
                        tasks_ma_width=10,
                        latex_rendering=False):
    """
    Runs each agent on the MDP distribution according to the given parameters.
    If @mdp_distribution has a non-zero horizon, then gamma is set to 1 and @n_steps is ignored.

    :param agents: (list)
    :param mdp_distribution: (MDPDistribution)
    :param name_identifier: (str)
    :param n_instances: (int)
    :param n_tasks: (int)
    :param n_episodes: (int)
    :param n_steps: (int)
    :param parallel_run: (bool)
    :param n_processes: (int)
    :param clear_old_results: (bool)
    :param track_disc_reward: (bool) If true records and plots discounted reward, discounted over episodes.
        So, if each episode is 100 steps, then episode 2 will start discounting as though it's step 101.
    :param reset_at_terminal: (bool)
    :param cumulative_plot: (bool)
    :param dir_for_plot: (str)
    :param verbose: (bool)
    :param do_run: (bool)
    :param do_plot: (bool)
    :param confidence: (float)
    :param open_plot: (bool)
    :param plot_title: (bool)
    :param plot_legend: (bool)
    :param episodes_moving_average: (bool)
    :param episodes_ma_width: (int)
    :param tasks_moving_average: (bool)
    :param tasks_ma_width: (int)
    :param latex_rendering: (bool)
    :return: None
    """
    exp_params = {"samples": n_tasks, "episodes": n_episodes, "steps": n_steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp_distribution,
                            name_identifier=name_identifier,
                            params=exp_params,
                            is_episodic=n_episodes > 1,
                            is_lifelong=True,
                            clear_old_results=clear_old_results,
                            track_disc_reward=track_disc_reward,
                            cumulative_plot=cumulative_plot,
                            dir_for_plot=dir_for_plot)
    path = experiment.exp_directory
    save_script(path)

    print("Running experiment:\n" + str(experiment))

    # Sample tasks.
    tasks = []
    for _ in range(n_tasks):
        tasks.append(mdp_distribution.sample())

    n_agents = len(agents)

    # Run.
    if do_run:
        if parallel_run:
            n_processes = multiprocessing.cpu_count() if n_processes is None else n_processes
            print('Using', n_processes, 'processes.')
            pool = multiprocessing.Pool(processes=n_processes)

            # Asynchronous execution.
            jobs = []
            for i in range(n_agents):
                lifelong_save(init=True, path=path, agent=agents[i])
                for j in range(n_instances):
                    job = apply_async(pool, run_agent_lifelong,
                                      (agents[i], experiment, j, n_tasks, n_episodes, n_steps, tasks,
                                       track_disc_reward, reset_at_terminal, path, verbose))
                    jobs.append(job)

            for job in jobs:
                job.get()
        else:
            for i in range(n_agents):
                lifelong_save(init=True, path=path, agent=agents[i])
                for j in range(n_instances):
                    run_agent_lifelong(agents[i], experiment, j, n_tasks, n_episodes, n_steps, tasks,
                                       track_disc_reward, reset_at_terminal, path, verbose)

    # Plot.
    if do_plot:
        lifelong_plot(agents, path, n_tasks, n_episodes, confidence, open_plot, plot_title, plot_legend,
                      episodes_moving_average=episodes_moving_average,
                      episodes_ma_width=episodes_ma_width,
                      tasks_moving_average=tasks_moving_average,
                      tasks_ma_width=tasks_ma_width,
                      latex_rendering=latex_rendering)
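# Usage sketch (assumption): the parallel path of the lifelong variant above. Each
# (agent, instance) pair is dispatched to run_agent_lifelong through a multiprocessing.Pool,
# so agents and MDPs must be picklable. All argument values below are illustrative.
def _example_run_agents_lifelong_parallel(agents, mdp_distribution):
    run_agents_lifelong(agents,
                        mdp_distribution,
                        name_identifier="parallel-demo",  # hypothetical run name
                        n_instances=4,
                        n_tasks=10,
                        n_episodes=1,
                        n_steps=100,
                        parallel_run=True,
                        n_processes=None,  # None -> multiprocessing.cpu_count()
                        do_run=True,
                        do_plot=True,
                        open_plot=False)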
def run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=5,
                          episodes=1,
                          steps=100,
                          clear_old_results=True,
                          open_plot=True,
                          verbose=False,
                          is_rec_disc_reward=False,
                          reset_at_terminal=False,
                          resample_at_terminal=False):
    '''
    Args:
        agents (list)
        mdp_distr (MDPDistribution)
        task_samples (int)
        episodes (int)
        steps (int)
        clear_old_results (bool)
        open_plot (bool)
        verbose (bool)
        is_rec_disc_reward (bool): If true records and plots discounted reward.
        reset_at_terminal (bool)
        resample_at_terminal (bool)

    Summary:
        Runs each agent on the MDP distribution according to the given parameters.
        If @mdp_distr has a non-zero horizon, then gamma is set to 1 and @steps is ignored.
    '''
    # Set number of steps if the horizon is given.
    if mdp_distr.get_horizon() > 0:
        mdp_distr.set_gamma(1.0)
        steps = mdp_distr.get_horizon()

    # Experiment (for reproducibility, plotting).
    exp_params = {"task_samples": task_samples, "episodes": episodes, "steps": steps, "gamma": mdp_distr.get_gamma()}
    experiment = Experiment(agents=agents,
                            mdp=mdp_distr,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_multi_task=True,
                            clear_old_results=clear_old_results,
                            is_rec_disc_reward=is_rec_disc_reward)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    start = time.clock()

    times = defaultdict(float)

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # --- SAMPLE NEW MDP ---
        for new_task in xrange(task_samples):
            print " Sample " + str(new_task + 1) + " of " + str(task_samples) + "."

            # Sample the MDP.
            mdp = mdp_distr.sample()

            # Run the agent.
            hit_terminal, total_steps_taken = run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment,
                                                                      verbose, is_rec_disc_reward,
                                                                      reset_at_terminal, resample_at_terminal)

            # If we resample at terminal, keep grabbing MDPs until we're done.
            while resample_at_terminal and hit_terminal and total_steps_taken < steps:
                mdp = mdp_distr.sample()
                hit_terminal, steps_taken = run_single_agent_on_mdp(agent, mdp, episodes, steps - total_steps_taken,
                                                                    experiment, verbose, is_rec_disc_reward,
                                                                    reset_at_terminal, resample_at_terminal)
                total_steps_taken += steps_taken

            # Reset the agent.
            agent.reset()
            if "rmax" in agent.name:
                agent._reset_reward()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(round(times[agent], 2)) + " seconds."
    print "-------------\n"

    experiment.make_plots(open_plot=open_plot)
def run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=5,
                          episodes=1,
                          steps=100,
                          clear_old_results=True,
                          open_plot=True,
                          verbose=False,
                          is_rec_disc_reward=False,
                          reset_at_terminal=False,
                          include_optimal=False):
    '''
    Args:
        agents (list)
        mdp_distr (MDPDistribution)
        task_samples (int)
        episodes (int)
        steps (int)
        include_optimal (bool): If true, adds a fixed-policy agent that follows the optimal policy
            (computed via value iteration) on each sampled MDP.

    Summary:
        Runs each agent on the MDP distribution according to the given parameters.
        If @mdp_distr has a non-zero horizon, then gamma is set to 1 and @steps is ignored.
    '''
    # Set number of steps if the horizon is given.
    if mdp_distr.get_horizon() > 0:
        mdp_distr.set_gamma(1.0)
        steps = mdp_distr.get_horizon()

    # Experiment (for reproducibility, plotting).
    exp_params = {"task_samples": task_samples, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp_distr,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_multi_task=True,
                            clear_old_results=clear_old_results,
                            is_rec_disc_reward=is_rec_disc_reward)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    start = time.clock()

    times = defaultdict(float)

    if include_optimal:
        fixed_policy_agent = FixedPolicyAgent(policy=lambda s: "", name="optimal")
        agents += [fixed_policy_agent]

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # --- SAMPLE NEW MDP ---
        for new_task in xrange(task_samples):
            print " Sample " + str(new_task + 1) + " of " + str(task_samples) + "."

            # Sample the MDP.
            mdp = mdp_distr.sample()

            if include_optimal and agent.name == "optimal":
                vi = ValueIteration(mdp)
                vi.run_vi()
                agent.set_policy(vi.policy)

            # Run the agent.
            run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose,
                                    is_rec_disc_reward, reset_at_terminal)

            # Reset the agent.
            agent.reset()
            if "rmax" in agent.name:
                agent._reset_reward()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(round(times[agent], 2)) + " seconds."
    print "-------------\n"

    experiment.make_plots(open_plot=open_plot)
def play_markov_game(agent_ls, markov_game_mdp, instances=10, episodes=100, steps=30, verbose=False, open_plot=True):
    '''
    Args:
        agent_ls (list of Agents): See agents/AgentClass.py (and friends).
        markov_game_mdp (MarkovGameMDP): See mdp/markov_games/MarkovGameMDPClass.py.
        instances (int): Number of times to run each agent (for confidence intervals).
        episodes (int): Number of episodes for each learning instance.
        steps (int): Number of steps per episode.
        verbose (bool)
        open_plot (bool): If true opens plot.
    '''
    # Put into dict.
    agent_dict = {}
    for a in agent_ls:
        agent_dict[a.name] = a

    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agent_dict,
                            mdp=markov_game_mdp,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_markov_game=True)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    start = time.clock()

    # For each instance of the agent.
    for instance in range(1, instances + 1):
        print("\tInstance " + str(instance) + " of " + str(int(instances)) + ".")
        reward_dict = defaultdict(str)
        action_dict = {}

        for episode in range(1, episodes + 1):
            if verbose:
                sys.stdout.write("\tEpisode %s of %s" % (episode, episodes))
                sys.stdout.write("\b" * len("\tEpisode %s of %s" % (episode, episodes)))
                sys.stdout.flush()

            # Compute initial state/reward.
            state = markov_game_mdp.get_init_state()

            for step in range(steps):

                # Compute each agent's policy.
                for a in agent_dict.values():
                    agent_reward = reward_dict[a.name]
                    agent_action = a.act(state, agent_reward)
                    action_dict[a.name] = agent_action

                # Terminal check.
                if state.is_terminal():
                    experiment.add_experience(agent_dict, state, action_dict, defaultdict(int), state)
                    continue

                # Execute in MDP.
                reward_dict, next_state = markov_game_mdp.execute_agent_action(action_dict)

                # Record the experience.
                experiment.add_experience(agent_dict, state, action_dict, reward_dict, next_state)

                # Update pointer.
                state = next_state

            # A final update.
            for a in agent_dict.values():
                agent_reward = reward_dict[a.name]
                agent_action = a.act(state, agent_reward)
                action_dict[a.name] = agent_action

                # Process that learning instance's info at end of learning.
                experiment.end_of_episode(a.name)

            # Reset the MDP, tell the agent the episode is over.
            markov_game_mdp.reset()

    # A final update.
    for a in agent_dict.values():
        # Reset the agent and track experiment info.
        experiment.end_of_instance(a.name)
        a.reset()

    # Time stuff.
    print("Experiment took " + str(round(time.clock() - start, 2)) + " seconds.")

    experiment.make_plots(open_plot=open_plot)
def run_agents_on_mdp(agents,
                      mdp,
                      instances=5,
                      episodes=100,
                      steps=200,
                      clear_old_results=True,
                      rew_step_count=1,
                      track_disc_reward=False,
                      open_plot=True,
                      verbose=False,
                      reset_at_terminal=False,
                      cumulative_plot=True,
                      dir_for_plot="results",
                      experiment_name_prefix="",
                      track_success=False,
                      success_reward=None):
    '''
    Args:
        agents (list of Agents): See agents/AgentClass.py (and friends).
        mdp (MDP): See mdp/MDPClass.py for the abstract class. Specific MDPs in tasks/*.
        instances (int): Number of times to run each agent (for confidence intervals).
        episodes (int): Number of episodes for each learning instance.
        steps (int): Number of steps per episode.
        clear_old_results (bool): If true, removes all results files in the relevant results dir.
        rew_step_count (int): Number of steps before recording reward.
        track_disc_reward (bool): If true, track (and plot) discounted reward.
        open_plot (bool): If true opens the plot at the end.
        verbose (bool): If true, prints status bars per episode/instance.
        reset_at_terminal (bool): If true sends the agent to the start state after terminal.
        cumulative_plot (bool): If true makes a cumulative plot, otherwise plots avg. reward per timestep.
        dir_for_plot (str): Path to the directory used to store results and plots.
        experiment_name_prefix (str): Adds this to the beginning of the usual experiment name.
        track_success (bool): If true, tracks whether each run is successful and generates an additional success plot at the end.
        success_reward (int): If set, determines the success criteria.

    Summary:
        Runs each agent on the given mdp according to the given parameters.
        Stores results in results/<agent_name>.csv and automatically
        generates a plot and opens it.
    '''
    if track_success and success_reward is None:
        raise ValueError("(simple_rl): run_agents_on_mdp must set param @success_reward when @track_success=True.")

    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            clear_old_results=clear_old_results,
                            track_disc_reward=track_disc_reward,
                            count_r_per_n_timestep=rew_step_count,
                            cumulative_plot=cumulative_plot,
                            dir_for_plot=dir_for_plot,
                            experiment_name_prefix=experiment_name_prefix,
                            track_success=track_success,
                            success_reward=success_reward)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    time_dict = defaultdict(float)

    # Learn.
    for agent in agents:
        print(str(agent) + " is learning.")
        start = time.clock()

        # For each instance.
        for instance in range(1, instances + 1):
            print(" Instance " + str(instance) + " of " + str(instances) + ".")
            sys.stdout.flush()
            run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose,
                                    track_disc_reward, reset_at_terminal=reset_at_terminal)

            if "fixed" in agent.name:
                break

            # Reset the agent.
            agent.reset()
            mdp.end_of_instance()

        # Track how much time this agent took.
        end = time.clock()
        time_dict[agent] = round(end - start, 3)
        print()

    # Time stuff.
    print("\n--- TIMES ---")
    for agent in time_dict.keys():
        print(str(agent) + " agent took " + str(round(time_dict[agent], 2)) + " seconds.")
    print("-------------\n")

    experiment.make_plots(open_plot=open_plot)
def run_agents_lifelong(agents,
                        mdp_distr,
                        samples=5,
                        episodes=1,
                        steps=100,
                        clear_old_results=True,
                        open_plot=True,
                        verbose=False,
                        track_disc_reward=False,
                        reset_at_terminal=False,
                        resample_at_terminal=False,
                        cumulative_plot=True,
                        dir_for_plot="results"):
    '''
    Args:
        agents (list)
        mdp_distr (MDPDistribution)
        samples (int)
        episodes (int)
        steps (int)
        clear_old_results (bool)
        open_plot (bool)
        verbose (bool)
        track_disc_reward (bool): If true records and plots discounted reward, discounted over episodes.
            So, if each episode is 100 steps, then episode 2 will start discounting as though it's step 101.
        reset_at_terminal (bool)
        resample_at_terminal (bool)
        cumulative_plot (bool)
        dir_for_plot (str)

    Summary:
        Runs each agent on the MDP distribution according to the given parameters.
        If @mdp_distr has a non-zero horizon, then gamma is set to 1 and @steps is ignored.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"samples": samples, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp_distr,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_lifelong=True,
                            clear_old_results=clear_old_results,
                            track_disc_reward=track_disc_reward,
                            cumulative_plot=cumulative_plot,
                            dir_for_plot=dir_for_plot)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    start = time.clock()

    times = defaultdict(float)

    # Learn.
    for agent in agents:
        print(str(agent) + " is learning.")
        start = time.clock()

        # --- SAMPLE NEW MDP ---
        for new_task in range(samples):
            print(" Sample " + str(new_task + 1) + " of " + str(samples) + ".")

            # Sample the MDP.
            mdp = mdp_distr.sample()

            # Run the agent.
            hit_terminal, total_steps_taken, _ = run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment,
                                                                         verbose, track_disc_reward,
                                                                         reset_at_terminal, resample_at_terminal)

            # If we resample at terminal, keep grabbing MDPs until we're done.
            while resample_at_terminal and hit_terminal and total_steps_taken < steps:
                mdp = mdp_distr.sample()
                hit_terminal, steps_taken, _ = run_single_agent_on_mdp(agent, mdp, episodes,
                                                                       steps - total_steps_taken, experiment,
                                                                       verbose, track_disc_reward,
                                                                       reset_at_terminal, resample_at_terminal)
                total_steps_taken += steps_taken

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print("\n--- TIMES ---")
    for agent in times.keys():
        print(str(agent) + " agent took " + str(round(times[agent], 2)) + " seconds.")
    print("-------------\n")

    experiment.make_plots(open_plot=open_plot)
def run_agents_on_mdp(agents, mdp, num_instances=20, num_episodes=2000, num_steps=50):
    '''
    Args:
        agents (list of Agents): See agents/AgentClass.py (and friends).
        mdp (MDP): See mdp/MDPClass.py for the abstract class. Specific MDPs in tasks/*.
        num_instances (int) [opt]: Number of times to run each agent (for confidence intervals).
        num_episodes (int) [opt]: Number of episodes for each learning instance.
        num_steps (int) [opt]: Number of steps per episode.

    Summary:
        Runs each agent on the given mdp according to the given parameters.
        Stores results in results/<agent_name>.csv and automatically
        generates a plot and opens it.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"num_instances": num_instances, "num_episodes": num_episodes, "num_steps": num_steps}
    experiment = Experiment(agents=agents, mdp=mdp, params=exp_params)

    # Record how long each agent spends learning.
    times = defaultdict(float)

    print "Running experiment: \n" + str(experiment)

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # For each instance of the agent.
        for instance in xrange(1, num_instances + 1):
            print "\tInstance " + str(instance) + " of " + str(num_instances) + "."

            # For each episode.
            for episode in xrange(1, num_episodes + 1):

                # Compute initial state/reward.
                state = mdp.get_init_state()
                print "init:", state
                reward = 0

                for step in xrange(num_steps):

                    # Compute the agent's policy.
                    action = agent.act(state, reward)

                    # Execute the action in the MDP.
                    reward, next_state = mdp.execute_agent_action(action)

                    # Record the experience.
                    experiment.add_experience(agent, state, action, reward, next_state)

                    # Check if terminal state.
                    if next_state.is_terminal():
                        break

                    # Update pointer.
                    state = next_state

                # Process experiment info at end of episode.
                experiment.end_of_episode(agent)

                # Reset the MDP, tell the agent the episode is over.
                mdp.reset()
                agent.end_of_episode()

            # Process that learning instance's info at end of learning.
            experiment.end_of_instance(agent)

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(times[agent]) + " seconds."
    print "-------------\n"

    experiment.make_plots()
def run_agents_multi_task(agents, mdp_distr, instances, num_switches, steps, clear_old_results=True):
    '''
    Args:
        agents (list)
        mdp_distr (dict): Maps sampling probabilities to MDPs.
        instances (int): Number of times to run each agent (for confidence intervals).
        num_switches (int): Number of MDPs sampled (task switches) per instance.
        steps (int): Number of steps per task.
        clear_old_results (bool): If true, removes all results files in the relevant results dir.

    Summary:
        Runs each agent on a sequence of MDPs sampled from @mdp_distr.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp_distr,
                            params=exp_params,
                            is_multi_task=True,
                            clear_old_results=clear_old_results)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    start = time.clock()

    times = defaultdict(float)

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # For each instance of the agent.
        for instance in xrange(1, instances + 1):
            print "\tInstance " + str(instance) + " of " + str(instances) + "."

            # --- SAMPLE NEW MDP ---
            for new_task in xrange(num_switches):
                mdp_id = np.random.multinomial(1, mdp_distr.keys()).tolist().index(1)
                mdp = mdp_distr[mdp_distr.keys()[mdp_id]]

                # Compute initial state/reward.
                state = mdp.get_init_state()
                reward = 0
                episode_start_time = time.clock()

                prog_bar_len = _make_step_progress_bar()
                # print prog_bar_len, steps

                for step in xrange(steps):
                    # print "\t Step " + str(step)
                    if int(prog_bar_len * float(step) / steps) > int(prog_bar_len * float(step - 1) / steps):
                        _increment_bar()

                    # Compute the agent's policy.
                    action = agent.act(state, reward)

                    # Terminal check.
                    if state.is_terminal():
                        # Self loop if in a terminal state.
                        experiment.add_experience(agent, state, action, 0, state)
                        continue

                    # Execute in MDP.
                    reward, next_state = mdp.execute_agent_action(action)

                    # Record the experience.
                    experiment.add_experience(agent, state, action, reward, next_state)

                    # Update pointer.
                    state = next_state

                if "-sa" in agent.name:
                    agent.new_task()
                elif "rmax" in agent.name:
                    agent._reset_reward()

                _increment_bar()
                sys.stdout.write("\n")

                # A final update.
                action = agent.act(state, reward)

                # Process experiment info at end of episode.
                experiment.end_of_episode(agent)

                # Reset the MDP, tell the agent the episode is over.
                mdp.reset()
                agent.end_of_episode()

            # Process that learning instance's info at end of learning.
            experiment.end_of_instance(agent)

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(times[agent]) + " seconds."
    print "-------------\n"

    experiment.make_plots(open_plot=True)
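# Usage sketch (assumption): this older run_agents_multi_task expects @mdp_distr as a plain
# dict mapping sampling probabilities to MDPs (the keys are fed to np.random.multinomial
# above). The GridWorldMDP import and constructor arguments are illustrative only.
def _example_run_agents_multi_task(agents):
    from simple_rl.tasks import GridWorldMDP

    mdp_distr = {0.5: GridWorldMDP(width=4, height=3, goal_locs=[(4, 3)]),
                 0.5: GridWorldMDP(width=4, height=3, goal_locs=[(1, 3)])}

    run_agents_multi_task(agents, mdp_distr, instances=5, num_switches=3, steps=100)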
def play_markov_game(agent_dict, markov_game_mdp, instances=10, episodes=100, steps=30):
    '''
    Args:
        agent_dict (dict of Agents): See agents/AgentClass.py (and friends).
        markov_game_mdp (MarkovGameMDP): See mdp/markov_games/MarkovGameMDPClass.py.
        instances (int) [opt]: Number of times to run each agent (for confidence intervals).
        episodes (int) [opt]: Number of episodes for each learning instance.
        steps (int) [opt]: Number of steps per episode.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances}  # , "episodes":episodes, "steps":steps}
    experiment = Experiment(agents=agent_dict,
                            mdp=markov_game_mdp,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_markov_game=True)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    start = time.clock()

    # For each instance of the agent.
    for instance in xrange(1, instances + 1):
        print "\tInstance " + str(instance) + " of " + str(instances) + "."
        reward_dict = defaultdict(str)
        action_dict = {}

        for episode in xrange(1, episodes + 1):
            print "\t\tEpisode " + str(episode) + " of " + str(episodes) + "."

            # Compute initial state/reward.
            state = markov_game_mdp.get_init_state()

            for step in xrange(steps):

                # Compute each agent's policy.
                for a in agent_dict.values():
                    agent_reward = reward_dict[a.name]
                    agent_action = a.act(state, agent_reward)
                    action_dict[a.name] = agent_action

                # Terminal check.
                if state.is_terminal():
                    experiment.add_experience(agent_dict, state, action_dict, defaultdict(int), state)
                    continue

                # Execute in MDP.
                reward_dict, next_state = markov_game_mdp.execute_agent_action(action_dict)

                # Record the experience.
                experiment.add_experience(agent_dict, state, action_dict, reward_dict, next_state)

                # Update pointer.
                state = next_state

            # A final update.
            for a in agent_dict.values():
                agent_reward = reward_dict[a.name]
                agent_action = a.act(state, agent_reward)
                action_dict[a.name] = agent_action

                # Process that learning instance's info at end of learning.
                experiment.end_of_episode(a.name)

            # Reset the MDP, tell the agent the episode is over.
            markov_game_mdp.reset()

    # A final update.
    for a in agent_dict.values():

        # Reset the agent and track experiment info.
        experiment.end_of_instance(a.name)
        a.reset()

    # Time stuff.
    print "Experiment took " + str(time.clock() - start) + " seconds."

    experiment.make_plots(cumulative=True)
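# Usage sketch (assumption): unlike the list-based versions above, this older play_markov_game
# takes a dict keyed by agent name. Any two Agent instances with distinct names work.
def _example_play_markov_game_dict(markov_game_mdp, agent_a, agent_b):
    agent_dict = {agent_a.name: agent_a, agent_b.name: agent_b}
    play_markov_game(agent_dict, markov_game_mdp, instances=10, episodes=100, steps=30)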