# The Experiment class used below is assumed to be importable from this repo's
# experiments module; its exact import path is not shown in this file.
import sys
import time
from collections import defaultdict

import numpy as np


def play_markov_game(agent_ls, markov_game_mdp, instances=10, episodes=100, steps=30, verbose=False, open_plot=True):
    '''
    Args:
        agent_ls (list of Agents): See agents/AgentClass.py (and friends).
        markov_game_mdp (MarkovGameMDP): See mdp/markov_games/MarkovGameMDPClass.py.
        instances (int): Number of times to run each agent (for confidence intervals).
        episodes (int): Number of episodes for each learning instance.
        steps (int): Number of steps per episode.
        verbose (bool): If true, prints episode progress.
        open_plot (bool): If true, opens the plot.
    '''
    # Put the agents into a dict keyed by name.
    agent_dict = {}
    for a in agent_ls:
        agent_dict[a.name] = a

    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agent_dict,
                            mdp=markov_game_mdp,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_markov_game=True)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    start = time.clock()

    # For each instance of the agent.
    for instance in range(1, instances + 1):
        print("\tInstance " + str(instance) + " of " + str(instances) + ".")
        reward_dict = defaultdict(str)
        action_dict = {}

        for episode in range(1, episodes + 1):
            if verbose:
                sys.stdout.write("\tEpisode %s of %s" % (episode, episodes))
                sys.stdout.write("\b" * len("\tEpisode %s of %s" % (episode, episodes)))
                sys.stdout.flush()

            # Compute initial state/reward.
            state = markov_game_mdp.get_init_state()

            for step in range(steps):

                # Compute each agent's policy.
                for a in agent_dict.values():
                    agent_reward = reward_dict[a.name]
                    agent_action = a.act(state, agent_reward)
                    action_dict[a.name] = agent_action

                # Terminal check: self loop (with zero reward) in a terminal state.
                if state.is_terminal():
                    experiment.add_experience(agent_dict, state, action_dict, defaultdict(int), state)
                    continue

                # Execute in MDP.
                reward_dict, next_state = markov_game_mdp.execute_agent_action(action_dict)

                # Record the experience.
                experiment.add_experience(agent_dict, state, action_dict, reward_dict, next_state)

                # Update pointer.
                state = next_state

            # A final update for each agent, then process end-of-episode info.
            for a in agent_dict.values():
                agent_reward = reward_dict[a.name]
                agent_action = a.act(state, agent_reward)
                action_dict[a.name] = agent_action
                experiment.end_of_episode(a.name)

            # Reset the MDP, tell the agent the episode is over.
            markov_game_mdp.reset()

        # End of instance: reset each agent and record experiment info.
        for a in agent_dict.values():
            experiment.end_of_instance(a.name)
            a.reset()

    # Time stuff.
    print("Experiment took " + str(round(time.clock() - start, 2)) + " seconds.")

    experiment.make_plots(open_plot=open_plot)
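# Usage sketch for play_markov_game. The class names below (RockPaperScissorsMDP,
# QLearningAgent, RandomAgent) and the get_actions() accessor are assumptions
# standing in for whatever agents/ and mdp/markov_games/ actually provide; only
# the play_markov_game signature comes from this file.
def _example_play_markov_game():
    markov_game = RockPaperScissorsMDP()  # assumed two-player markov game MDP
    ql_agent = QLearningAgent(actions=markov_game.get_actions(), name="q-learning")  # assumed agent class
    rand_agent = RandomAgent(actions=markov_game.get_actions(), name="random")  # assumed agent class
    play_markov_game([ql_agent, rand_agent], markov_game, instances=5, episodes=50, steps=30, open_plot=False)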
def run_agents_multi_task(agents, mdp_distr, instances, num_switches, steps, clear_old_results=True):
    '''
    Args:
        agents (list of Agents): See agents/AgentClass.py (and friends).
        mdp_distr (dict): Maps a sampling probability to an MDP; a new MDP is drawn
            from this distribution at each task switch.
        instances (int): Number of times to run each agent (for confidence intervals).
        num_switches (int): Number of task switches (MDP samples) per instance.
        steps (int): Number of steps per sampled task.
        clear_old_results (bool): If true, clears old results files.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp_distr,
                            params=exp_params,
                            is_multi_task=True,
                            clear_old_results=clear_old_results)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    start = time.clock()
    times = defaultdict(float)

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # For each instance of the agent.
        for instance in xrange(1, instances + 1):
            print "\tInstance " + str(instance) + " of " + str(instances) + "."

            # --- SAMPLE NEW MDP ---
            for new_task in xrange(num_switches):
                mdp_id = np.random.multinomial(1, mdp_distr.keys()).tolist().index(1)
                mdp = mdp_distr[mdp_distr.keys()[mdp_id]]

                # Compute initial state/reward.
                state = mdp.get_init_state()
                reward = 0
                episode_start_time = time.clock()
                prog_bar_len = _make_step_progress_bar()

                for step in xrange(steps):
                    if int(prog_bar_len * float(step) / steps) > int(prog_bar_len * float(step - 1) / steps):
                        _increment_bar()

                    # Compute the agent's policy.
                    action = agent.act(state, reward)

                    # Terminal check.
                    if state.is_terminal():
                        # Self loop if in a terminal state.
                        experiment.add_experience(agent, state, action, 0, state)
                        continue

                    # Execute in MDP.
                    reward, next_state = mdp.execute_agent_action(action)

                    # Record the experience.
                    experiment.add_experience(agent, state, action, reward, next_state)

                    # Update pointer.
                    state = next_state

                # Let certain agents know a new task is about to be sampled.
                if "-sa" in agent.name:
                    agent.new_task()
                elif "rmax" in agent.name:
                    agent._reset_reward()

                _increment_bar()
                sys.stdout.write("\n")

                # A final update.
                action = agent.act(state, reward)

                # Process experiment info at end of episode.
                experiment.end_of_episode(agent)

                # Reset the MDP, tell the agent the episode is over.
                mdp.reset()
                agent.end_of_episode()

            # Process that learning instance's info at end of learning.
            experiment.end_of_instance(agent)

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(times[agent]) + " seconds."
    print "-------------\n"

    experiment.make_plots(open_plot=True)
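# Sketch of the mdp_distr format run_agents_multi_task appears to expect, inferred
# from the np.random.multinomial call above: a dict mapping a sampling probability
# to an MDP. GridWorldMDP and QLearningAgent are assumed class names used only for
# illustration.
def _example_run_agents_multi_task():
    mdp_distr = {
        0.7: GridWorldMDP(width=4, height=3),    # assumed task class; sampled ~70% of the time
        0.3: GridWorldMDP(width=10, height=10),  # sampled ~30% of the time
    }
    some_mdp = list(mdp_distr.values())[0]
    ql_agent = QLearningAgent(actions=some_mdp.get_actions(), name="q-learning")  # assumed agent class
    run_agents_multi_task([ql_agent], mdp_distr, instances=5, num_switches=3, steps=500)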
def run_agents_on_mdp(agents, mdp, num_instances=20, num_episodes=2000, num_steps=50):
    '''
    Args:
        agents (list of Agents): See agents/AgentClass.py (and friends).
        mdp (MDP): See mdp/MDPClass.py for the abstract class. Specific MDPs in tasks/*.
        num_instances (int) [opt]: Number of times to run each agent (for confidence intervals).
        num_episodes (int) [opt]: Number of episodes for each learning instance.
        num_steps (int) [opt]: Number of steps per episode.

    Summary:
        Runs each agent on the given mdp according to the given parameters.
        Stores results in results/<agent_name>.csv and automatically
        generates a plot and opens it.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"num_instances": num_instances, "num_episodes": num_episodes, "num_steps": num_steps}
    experiment = Experiment(agents=agents, mdp=mdp, params=exp_params)

    # Record how long each agent spends learning.
    times = defaultdict(float)
    print "Running experiment: \n" + str(experiment)

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # For each instance of the agent.
        for instance in xrange(1, num_instances + 1):
            print "\tInstance " + str(instance) + " of " + str(num_instances) + "."

            # For each episode.
            for episode in xrange(1, num_episodes + 1):

                # Compute initial state/reward.
                state = mdp.get_init_state()
                reward = 0

                for step in xrange(num_steps):
                    # Compute the agent's policy.
                    action = agent.act(state, reward)

                    # Execute the action in the MDP.
                    reward, next_state = mdp.execute_agent_action(action)

                    # Record the experience.
                    experiment.add_experience(agent, state, action, reward, next_state)

                    # Check if terminal state.
                    if next_state.is_terminal():
                        break

                    # Update pointer.
                    state = next_state

                # Process experiment info at end of episode.
                experiment.end_of_episode(agent)

                # Reset the MDP, tell the agent the episode is over.
                mdp.reset()
                agent.end_of_episode()

            # Process that learning instance's info at end of learning.
            experiment.end_of_instance(agent)

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(times[agent]) + " seconds."
    print "-------------\n"

    experiment.make_plots()
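# Usage sketch for run_agents_on_mdp: two learning agents on a single task.
# GridWorldMDP, QLearningAgent, RandomAgent, and get_actions() are assumed names
# standing in for whatever tasks/ and agents/ actually provide.
def _example_run_agents_on_mdp():
    mdp = GridWorldMDP(width=4, height=3)  # assumed task class
    ql_agent = QLearningAgent(actions=mdp.get_actions(), name="q-learning")  # assumed agent class
    rand_agent = RandomAgent(actions=mdp.get_actions(), name="random")  # assumed agent class
    run_agents_on_mdp([ql_agent, rand_agent], mdp, num_instances=5, num_episodes=100, num_steps=50)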
def play_markov_game(agent_dict, markov_game_mdp, instances=10, episodes=100, steps=30):
    '''
    Args:
        agent_dict (dict of Agents): See agents/AgentClass.py (and friends).
        markov_game_mdp (MarkovGameMDP): See mdp/markov_games/MarkovGameMDPClass.py.
        instances (int) [opt]: Number of times to run each agent (for confidence intervals).
        episodes (int) [opt]: Number of episodes for each learning instance.
        steps (int) [opt]: Number of steps per episode.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances}  # , "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agent_dict,
                            mdp=markov_game_mdp,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_markov_game=True)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    start = time.clock()

    # For each instance of the agent.
    for instance in xrange(1, instances + 1):
        print "\tInstance " + str(instance) + " of " + str(instances) + "."
        reward_dict = defaultdict(str)
        action_dict = {}

        for episode in xrange(1, episodes + 1):
            print "\t\tEpisode " + str(episode) + " of " + str(episodes) + "."

            # Compute initial state/reward.
            state = markov_game_mdp.get_init_state()

            for step in xrange(steps):

                # Compute each agent's policy.
                for a in agent_dict.values():
                    agent_reward = reward_dict[a.name]
                    agent_action = a.act(state, agent_reward)
                    action_dict[a.name] = agent_action

                # Terminal check: self loop (with zero reward) in a terminal state.
                if state.is_terminal():
                    experiment.add_experience(agent_dict, state, action_dict, defaultdict(int), state)
                    continue

                # Execute in MDP.
                reward_dict, next_state = markov_game_mdp.execute_agent_action(action_dict)

                # Record the experience.
                experiment.add_experience(agent_dict, state, action_dict, reward_dict, next_state)

                # Update pointer.
                state = next_state

            # A final update for each agent, then process end-of-episode info.
            for a in agent_dict.values():
                agent_reward = reward_dict[a.name]
                agent_action = a.act(state, agent_reward)
                action_dict[a.name] = agent_action
                experiment.end_of_episode(a.name)

            # Reset the MDP, tell the agent the episode is over.
            markov_game_mdp.reset()

        # End of instance: reset each agent and record experiment info.
        for a in agent_dict.values():
            experiment.end_of_instance(a.name)
            a.reset()

    # Time stuff.
    print "Experiment took " + str(time.clock() - start) + " seconds."

    experiment.make_plots(cumulative=True)
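# This older variant expects the caller to build the name-keyed agent dict itself
# (the list-based version earlier in this file does that internally). A minimal
# sketch, reusing the assumed class names from the earlier examples:
def _example_play_markov_game_with_dict():
    markov_game = RockPaperScissorsMDP()  # assumed two-player markov game MDP
    ql_agent = QLearningAgent(actions=markov_game.get_actions(), name="q-learning")  # assumed agent class
    rand_agent = RandomAgent(actions=markov_game.get_actions(), name="random")  # assumed agent class
    agent_dict = {ql_agent.name: ql_agent, rand_agent.name: rand_agent}
    play_markov_game(agent_dict, markov_game, instances=5, episodes=50, steps=30)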