Example 1
def run_agents_on_mdp(agents,
                      mdp,
                      instances=5,
                      episodes=100,
                      steps=200,
                      clear_old_results=True):
    '''
    Args:
        agents (list of Agents): See agents/AgentClass.py (and friends).
        mdp (MDP): See mdp/MDPClass.py for the abstract class. Specific MDPs in tasks/*.
        instances (int) [opt]: Number of times to run each agent (for confidence intervals).
        episodes (int) [opt]: Number of episodes for each learning instance.
        steps (int) [opt]: Number of steps per episode.
        clear_old_results (bool) [opt]: If true, removes all results files in the relevant results dir.

    Summary:
        Runs each agent on the given mdp according to the given parameters.
        Stores results in results/<agent_name>.csv and automatically
        generates a plot and opens it.
    '''

    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            clear_old_results=clear_old_results)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)

    times = _main_experiment_loop(
        agents,
        mdp,
        instances,
        episodes,
        steps,
        experiment,
    )

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(times[agent]) + " seconds."
    print "-------------\n"

    if not isinstance(mdp, GymMDP):
        experiment.make_plots(open_plot=True)
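A minimal usage sketch for this entry point (a hedged example, assuming the standard simple_rl layout; QLearningAgent, RandomAgent, and GridWorldMDP are the library's stock agents/tasks):

from simple_rl.agents import QLearningAgent, RandomAgent
from simple_rl.tasks import GridWorldMDP
from simple_rl.run_experiments import run_agents_on_mdp

# A small grid world and two learners to compare.
mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)])
agents = [QLearningAgent(actions=mdp.get_actions()),
          RandomAgent(actions=mdp.get_actions())]

# Writes results/<agent_name>.csv per agent and opens the reward plot.
run_agents_on_mdp(agents, mdp, instances=5, episodes=100, steps=200)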
Example 2
def play_markov_game(agent_ls,
                     markov_game_mdp,
                     instances=10,
                     episodes=100,
                     steps=30,
                     verbose=False,
                     open_plot=True):
    '''
    Args:
        agent_ls (list of Agents): See agents/AgentClass.py (and friends).
        markov_game_mdp (MarkovGameMDP): See mdp/markov_games/MarkovGameMDPClass.py.
        instances (int): Number of times to run each agent (for confidence intervals).
        episodes (int): Number of episodes for each learning instance.
        steps (int): Number of steps per episode.
        verbose (bool)
        open_plot (bool): If true opens plot.
    '''

    # Put into dict.
    agent_dict = {}
    for a in agent_ls:
        agent_dict[a.name] = a

    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agent_dict,
                            mdp=markov_game_mdp,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_markov_game=True)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    start = time.clock()

    # For each instance of the agent.
    for instance in range(1, instances + 1):
        print("\tInstance " + str(instance) + " of " + str(int(instances)) +
              ".")

        reward_dict = defaultdict(str)
        action_dict = {}

        for episode in range(1, episodes + 1):
            if verbose:
                sys.stdout.write("\tEpisode %s of %s" % (episode, episodes))
                sys.stdout.write("\b" * len("\tEpisode %s of %s" %
                                            (episode, episodes)))
                sys.stdout.flush()

            # Compute initial state/reward.
            state = markov_game_mdp.get_init_state()

            for step in range(steps):

                # Compute each agent's policy.
                for a in agent_dict.values():
                    agent_reward = reward_dict[a.name]
                    agent_action = a.act(state, agent_reward)
                    action_dict[a.name] = agent_action

                # Terminal check.
                if state.is_terminal():
                    experiment.add_experience(agent_dict, state, action_dict,
                                              defaultdict(int), state)
                    continue

                # Execute in MDP.
                reward_dict, next_state = markov_game_mdp.execute_agent_action(
                    action_dict)

                # Record the experience.
                experiment.add_experience(agent_dict, state, action_dict,
                                          reward_dict, next_state)

                # Update pointer.
                state = next_state

            # A final update.
            for a in agent_dict.values():
                agent_reward = reward_dict[a.name]
                agent_action = a.act(state, agent_reward)
                action_dict[a.name] = agent_action

                # Process that learning instance's info at end of learning.
                experiment.end_of_episode(a.name)

            # Reset the MDP, tell the agent the episode is over.
            markov_game_mdp.reset()

        # A final update.
        for a in agent_dict.values():
            # Reset the agent and track experiment info.
            experiment.end_of_instance(a.name)
            a.reset()

    # Time stuff.
    print("Experiment took " + str(round(time.clock() - start, 2)) +
          " seconds.")

    experiment.make_plots(open_plot=open_plot)
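A hedged usage sketch for this function (RockPaperScissorsMDP and the agent classes are assumed to be the stock simple_rl implementations; adjust to the tasks available in your version):

import random
from simple_rl.agents import QLearningAgent, FixedPolicyAgent
from simple_rl.tasks import RockPaperScissorsMDP
from simple_rl.run_experiments import play_markov_game

markov_game = RockPaperScissorsMDP()
ql_agent = QLearningAgent(actions=markov_game.get_actions())

# An opponent that always plays one randomly chosen action.
fixed_action = random.choice(markov_game.get_actions())
fixed_agent = FixedPolicyAgent(policy=lambda s: fixed_action)

play_markov_game([ql_agent, fixed_agent], markov_game, instances=10, episodes=100, steps=30)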
Example 3
def run_agents_on_mdp(agents,
                      mdp,
                      instances=5,
                      episodes=100,
                      steps=200,
                      clear_old_results=True,
                      rew_step_count=1,
                      track_disc_reward=False,
                      open_plot=True,
                      verbose=False,
                      reset_at_terminal=False,
                      cumulative_plot=True,
                      dir_for_plot="results",
                      experiment_name_prefix="",
                      track_success=False,
                      success_reward=None):
    '''
    Args:
        agents (list of Agents): See agents/AgentClass.py (and friends).
        mdp (MDP): See mdp/MDPClass.py for the abstract class. Specific MDPs in tasks/*.
        instances (int): Number of times to run each agent (for confidence intervals).
        episodes (int): Number of episodes for each learning instance.
        steps (int): Number of steps per episode.
        clear_old_results (bool): If true, removes all results files in the relevant results dir.
        rew_step_count (int): Number of steps before recording reward.
        track_disc_reward (bool): If true, track (and plot) discounted reward.
        open_plot (bool): If true opens the plot at the end.
        verbose (bool): If true, prints status bars per episode/instance.
        reset_at_terminal (bool): If true sends the agent to the start state after terminal.
        cumulative_plot (bool): If true makes a cumulative plot, otherwise plots avg. reward per timestep.
        dir_for_plot (str): Path to the directory in which to store results and plots.
        experiment_name_prefix (str): Adds this to the beginning of the usual experiment name.
        track_success (bool): If true, tracks whether each run is successful and generates an additional success plot at the end.
        success_reward (int): If set, determines the success criteria.

    Summary:
        Runs each agent on the given mdp according to the given parameters.
        Stores results in results/<agent_name>.csv and automatically
        generates a plot and opens it.
    '''
    if track_success and success_reward is None:
        raise ValueError(
            "(simple_rl): run_agents_on_mdp must set param @success_reward when @track_success=True."
        )

    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            clear_old_results=clear_old_results,
                            track_disc_reward=track_disc_reward,
                            count_r_per_n_timestep=rew_step_count,
                            cumulative_plot=cumulative_plot,
                            dir_for_plot=dir_for_plot,
                            experiment_name_prefix=experiment_name_prefix,
                            track_success=track_success,
                            success_reward=success_reward)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    time_dict = defaultdict(float)

    # Learn.
    for agent in agents:
        print(str(agent) + " is learning.")

        start = time.clock()

        # For each instance.
        for instance in range(1, instances + 1):
            print("  Instance " + str(instance) + " of " + str(instances) +
                  ".")
            sys.stdout.flush()
            run_single_agent_on_mdp(agent,
                                    mdp,
                                    episodes,
                                    steps,
                                    experiment,
                                    verbose,
                                    track_disc_reward,
                                    reset_at_terminal=reset_at_terminal)

            # Reset the agent.
            agent.reset()
            mdp.end_of_instance()

        # Track how much time this agent took.
        end = time.clock()
        time_dict[agent] = round(end - start, 3)
        print()

    # Time stuff.
    print("\n--- TIMES ---")
    for agent in time_dict.keys():
        print(
            str(agent) + " agent took " + str(round(time_dict[agent], 2)) +
            " seconds.")
    print("-------------\n")

    experiment.make_plots(open_plot=open_plot)
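Because of the check at the top of the function, @success_reward must accompany @track_success; a hedged call sketch (reusing agents and mdp built as in the sketch after Example 1, with an illustrative success threshold):

run_agents_on_mdp(agents,
                  mdp,
                  instances=10,
                  episodes=50,
                  steps=100,
                  track_success=True,     # also produces the success plot
                  success_reward=1,       # illustrative threshold; required when track_success=True
                  cumulative_plot=False,  # plot avg. reward per timestep instead of cumulative
                  dir_for_plot="results")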
Example 4
def run_agents_lifelong(agents,
                        mdp_distr,
                        samples=5,
                        episodes=1,
                        steps=100,
                        clear_old_results=True,
                        open_plot=True,
                        verbose=False,
                        track_disc_reward=False,
                        reset_at_terminal=False,
                        resample_at_terminal=False,
                        cumulative_plot=True,
                        dir_for_plot="results"):
    '''
    Args:
        agents (list)
        mdp_distr (MDPDistribution)
        samples (int)
        episodes (int)
        steps (int)
        clear_old_results (bool)
        open_plot (bool)
        verbose (bool)
        track_disc_reward (bool): If true records and plots discounted reward, discounted over episodes. So, if
            each episode is 100 steps, then episode 2 will start discounting as though it's step 101.
        reset_at_terminal (bool)
        resample_at_terminal (bool)
        cumulative_plot (bool)
        dir_for_plot (str)

    Summary:
        Runs each agent on the MDP distribution according to the given parameters.
        If @mdp_distr has a non-zero horizon, then gamma is set to 1 and @steps is ignored.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"samples": samples, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp_distr,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_lifelong=True,
                            clear_old_results=clear_old_results,
                            track_disc_reward=track_disc_reward,
                            cumulative_plot=cumulative_plot,
                            dir_for_plot=dir_for_plot)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    start = time.clock()

    times = defaultdict(float)

    # Learn.
    for agent in agents:
        print(str(agent) + " is learning.")
        start = time.clock()

        # --- SAMPLE NEW MDP ---
        for new_task in range(samples):
            print("  Sample " + str(new_task + 1) + " of " + str(samples) +
                  ".")

            # Sample the MDP.
            mdp = mdp_distr.sample()

            # Run the agent.
            hit_terminal, total_steps_taken, _ = run_single_agent_on_mdp(
                agent, mdp, episodes, steps, experiment, verbose,
                track_disc_reward, reset_at_terminal, resample_at_terminal)

            # If we resample at terminal, keep grabbing MDPs until we're done.
            while resample_at_terminal and hit_terminal and total_steps_taken < steps:
                mdp = mdp_distr.sample()
                hit_terminal, steps_taken, _ = run_single_agent_on_mdp(
                    agent, mdp, episodes, steps - total_steps_taken,
                    experiment, verbose, track_disc_reward, reset_at_terminal,
                    resample_at_terminal)
                total_steps_taken += steps_taken

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print("\n--- TIMES ---")
    for agent in times.keys():
        print(
            str(agent) + " agent took " + str(round(times[agent], 2)) +
            " seconds.")
    print("-------------\n")

    experiment.make_plots(open_plot=open_plot)
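A hedged usage sketch (assuming simple_rl's MDPDistribution, which maps MDPs to their sampling probabilities; all task parameters are illustrative):

from simple_rl.agents import QLearningAgent
from simple_rl.mdp import MDPDistribution
from simple_rl.tasks import GridWorldMDP
from simple_rl.run_experiments import run_agents_lifelong

# Two grid worlds that differ only in goal location, sampled uniformly.
mdp1 = GridWorldMDP(width=4, height=3, goal_locs=[(4, 3)])
mdp2 = GridWorldMDP(width=4, height=3, goal_locs=[(4, 1)])
mdp_distr = MDPDistribution({mdp1: 0.5, mdp2: 0.5})

agents = [QLearningAgent(actions=mdp1.get_actions())]
run_agents_lifelong(agents, mdp_distr, samples=10, episodes=1, steps=100)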
Example 5
def run_agents_seq(agents,
                   mdp_list,
                   samples=5,
                   episodes=1,
                   steps=100,
                   clear_old_results=True,
                   open_plot=True,
                   verbose=False,
                   track_disc_reward=False,
                   reset_at_terminal=False,
                   resample_at_terminal=False,
                   cumulative_plot=True,
                   dir_for_plot="results"):
    '''
    Args:
        agents (list)
        mdp_list: Object exposing get_mdp(index) for the sequence of MDPs to run.
        samples (int)
        episodes (int)
        steps (int)
        clear_old_results (bool)
        open_plot (bool)
        verbose (bool)
        track_disc_reward (bool): If true records and plots discounted reward, discounted over episodes. So, if
            each episode is 100 steps, then episode 2 will start discounting as though it's step 101.
        reset_at_terminal (bool)
        resample_at_terminal (bool)
        cumulative_plot (bool)
        dir_for_plot (str)

    Summary:
        Runs each agent on the given sequence of MDPs according to the given parameters.
        The MDP for sample index i is obtained via @mdp_list.get_mdp(i).
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"samples": samples, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp_list,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_lifelong=True,
                            clear_old_results=clear_old_results,
                            track_disc_reward=track_disc_reward,
                            cumulative_plot=cumulative_plot,
                            dir_for_plot=dir_for_plot)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    start = time.clock()

    times = defaultdict(float)

    # Learn.
    for agent in agents:
        print(str(agent) + " is learning.")
        start = time.clock()

        # --- SAMPLE NEW MDP ---
        for new_task in range(samples):
            print("  Sample " + str(new_task + 1) + " of " + str(samples) +
                  ".")

            # Sample the MDP.
            mdp = mdp_list.get_mdp(new_task)
            print("goal")
            print(mdp.goal_locs)
            #            print(mdp.lava_locs)

            # Run the agent.
            hit_terminal, total_steps_taken, _ = run_single_agent_on_mdp(
                agent, mdp, episodes, steps, experiment, verbose,
                track_disc_reward, reset_at_terminal, resample_at_terminal)

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print("\n--- TIMES ---")
    for agent in times.keys():
        print(
            str(agent) + " agent took " + str(round(times[agent], 2)) +
            " seconds.")
    print("-------------\n")

    experiment.make_plots(open_plot=open_plot)
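run_agents_seq only requires that @mdp_list expose get_mdp(index); the wrapper below is a hypothetical, minimal stand-in (not part of the library, and depending on the Experiment class it may need to mimic more of the MDPDistribution interface):

from simple_rl.agents import QLearningAgent
from simple_rl.tasks import GridWorldMDP

class SimpleMDPSequence(object):
    '''Hypothetical container exposing the get_mdp(index) call used above.'''
    def __init__(self, mdps):
        self.mdps = mdps

    def get_mdp(self, index):
        return self.mdps[index]

    def __str__(self):
        return "mdp_sequence_" + str(len(self.mdps))

# Three grid worlds whose goal moves along the top row.
mdps = [GridWorldMDP(width=4, height=3, goal_locs=[(x, 3)]) for x in (2, 3, 4)]
agents = [QLearningAgent(actions=mdps[0].get_actions())]
run_agents_seq(agents, SimpleMDPSequence(mdps), samples=len(mdps), episodes=1, steps=100)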
Example 6
def run_agents_on_mdp(agents,
                      mdp,
                      instances=5,
                      episodes=100,
                      steps=200,
                      clear_old_results=True,
                      rew_step_count=1,
                      is_rec_disc_reward=False,
                      open_plot=True,
                      verbose=False,
                      reset_at_terminal=False):
    '''
    Args:
        agents (list of Agents): See agents/AgentClass.py (and friends).
        mdp (MDP): See mdp/MDPClass.py for the abstract class. Specific MDPs in tasks/*.
        instances (int) [opt]: Number of times to run each agent (for confidence intervals).
        episodes (int) [opt]: Number of episodes for each learning instance.
        steps (int) [opt]: Number of steps per episode.
        clear_old_results (bool) [opt]: If true, removes all results files in the relevant results dir.
        rew_step_count (int): Number of steps before recording reward.
        is_rec_disc_reward (bool): If true, track (and plot) discounted reward.
        open_plot (bool): If true opens the plot at the end.
        verbose (bool): If true, prints status bars per episode/instance.

    Summary:
        Runs each agent on the given mdp according to the given parameters.
        Stores results in results/<agent_name>.csv and automatically
        generates a plot and opens it.
    '''

    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            clear_old_results=clear_old_results,
                            is_rec_disc_reward=is_rec_disc_reward,
                            count_r_per_n_timestep=rew_step_count)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    time_dict = defaultdict(float)

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."

        start = time.clock()

        # For each instance.
        for instance in xrange(1, instances + 1):
            sys.stdout.flush()
            print "  Instance " + str(instance) + " of " + str(instances) + "."
            run_single_agent_on_mdp(agent,
                                    mdp,
                                    episodes,
                                    steps,
                                    experiment,
                                    verbose,
                                    is_rec_disc_reward,
                                    reset_at_terminal=reset_at_terminal)

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        time_dict[agent] = round(end - start, 3)
        print

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in time_dict.keys():
        print str(agent) + " agent took " + str(round(time_dict[agent],
                                                      2)) + " seconds."
    print "-------------\n"

    # if not isinstance(mdp, GymMDP):
    experiment.make_plots(open_plot=open_plot)
Example 7
def run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=5,
                          episodes=1,
                          steps=100,
                          clear_old_results=True,
                          open_plot=True,
                          verbose=False,
                          is_rec_disc_reward=False,
                          reset_at_terminal=False):
    '''
    Args:
        agents (list of Agents)
        mdp_distr (MDPDistribution)
        task_samples (int): Number of MDPs to sample from @mdp_distr.
        episodes (int): Number of episodes per sampled task.
        steps (int): Number of steps per episode.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {
        "task_samples": task_samples,
        "episodes": episodes,
        "steps": steps
    }
    experiment = Experiment(agents=agents,
                            mdp=mdp_distr,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_multi_task=True,
                            clear_old_results=clear_old_results,
                            is_rec_disc_reward=is_rec_disc_reward)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    start = time.clock()

    times = defaultdict(float)

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # --- SAMPLE NEW MDP ---
        for new_task in xrange(task_samples):
            print "  Sample " + str(new_task +
                                    1) + " of " + str(task_samples) + "."

            # Sample the MDP.
            mdp = mdp_distr.sample()

            # Run the agent.
            run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment,
                                    verbose, is_rec_disc_reward,
                                    reset_at_terminal)

            # Reset the agent.
            agent.reset()

            if "rmax" in agent.name:
                agent._reset_reward()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(round(times[agent],
                                                      2)) + " seconds."
    print "-------------\n"

    experiment.make_plots(open_plot=open_plot)
Example 8
def run_agents_lifelong(agents,
                        mdp_distribution,
                        name_identifier=None,
                        n_instances=1,
                        n_tasks=5,
                        n_episodes=1,
                        n_steps=100,
                        parallel_run=False,
                        n_processes=None,
                        clear_old_results=True,
                        track_disc_reward=False,
                        reset_at_terminal=False,
                        cumulative_plot=True,
                        dir_for_plot='results',
                        verbose=False,
                        do_run=True,
                        do_plot=False,
                        confidence=.9,
                        open_plot=False,
                        plot_title=True,
                        plot_legend=True,
                        episodes_moving_average=False,
                        episodes_ma_width=10,
                        tasks_moving_average=False,
                        tasks_ma_width=10,
                        latex_rendering=False):
    """
    Runs each agent on the MDP distribution according to the given parameters.
    If @mdp_distribution has a non-zero horizon, then gamma is set to 1 and @steps is ignored.

    :param agents: (list)
    :param mdp_distribution: (MDPDistribution)
    :param name_identifier: (str)
    :param n_instances: (int)
    :param n_tasks: (int)
    :param n_episodes: (int)
    :param n_steps: (int)
    :param parallel_run: (bool)
    :param n_processes: (int)
    :param clear_old_results: (bool)
    :param track_disc_reward: (bool) If true records and plots discounted reward, discounted over episodes.
    So, if each episode is 100 steps, then episode 2 will start discounting as though it's step 101.
    :param reset_at_terminal: (bool)
    :param cumulative_plot: (bool)
    :param dir_for_plot: (str)
    :param verbose: (bool)
    :param do_run: (bool)
    :param do_plot: (bool)
    :param confidence: (float)
    :param open_plot: (bool)
    :param plot_title: (bool)
    :param plot_legend: (bool)
    :param episodes_moving_average: (bool)
    :param episodes_ma_width: (int)
    :param tasks_moving_average: (bool)
    :param tasks_ma_width: (int)
    :param latex_rendering: (bool)
    :return:
    """
    exp_params = {"samples": n_tasks, "episodes": n_episodes, "steps": n_steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp_distribution,
                            name_identifier=name_identifier,
                            params=exp_params,
                            is_episodic=n_episodes > 1,
                            is_lifelong=True,
                            clear_old_results=clear_old_results,
                            track_disc_reward=track_disc_reward,
                            cumulative_plot=cumulative_plot,
                            dir_for_plot=dir_for_plot)
    path = experiment.exp_directory
    save_script(path)

    print("Running experiment:\n" + str(experiment))

    # Sample tasks
    tasks = []
    for _ in range(n_tasks):
        tasks.append(mdp_distribution.sample())
    n_agents = len(agents)

    # Run
    if do_run:
        if parallel_run:
            n_processes = multiprocessing.cpu_count(
            ) if n_processes is None else n_processes
            print('Using', n_processes, 'processes.')
            pool = multiprocessing.Pool(processes=n_processes)

            # Asynchronous execution
            jobs = []
            for i in range(n_agents):
                lifelong_save(init=True, path=path, agent=agents[i])
                for j in range(n_instances):
                    job = apply_async(
                        pool, run_agent_lifelong,
                        (agents[i], experiment, j, n_tasks, n_episodes,
                         n_steps, tasks, track_disc_reward, reset_at_terminal,
                         path, verbose))
                    jobs.append(job)

            for job in jobs:
                job.get()
        else:
            for i in range(n_agents):
                lifelong_save(init=True, path=path, agent=agents[i])
                for j in range(n_instances):
                    run_agent_lifelong(agents[i], experiment, j, n_tasks,
                                       n_episodes, n_steps, tasks,
                                       track_disc_reward, reset_at_terminal,
                                       path, verbose)

    # Plot
    if do_plot:
        lifelong_plot(agents,
                      path,
                      n_tasks,
                      n_episodes,
                      confidence,
                      open_plot,
                      plot_title,
                      plot_legend,
                      episodes_moving_average=episodes_moving_average,
                      episodes_ma_width=episodes_ma_width,
                      tasks_moving_average=tasks_moving_average,
                      tasks_ma_width=tasks_ma_width,
                      latex_rendering=latex_rendering)
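The parallel branch relies on an apply_async helper (and the lifelong_save / run_agent_lifelong pair) that are not shown here. One common way to implement such a helper, so the pool can execute callables that the pickle module cannot serialise, is the dill-based wrapper below; this is only a sketch of one possibility, not necessarily what this codebase uses:

import dill  # serialises closures/bound methods that pickle rejects

def _run_dill_encoded(payload):
    # Decode the (function, args) pair and run it inside the worker process.
    fun, args = dill.loads(payload)
    return fun(*args)

def apply_async(pool, fun, args):
    # Submit a dill-encoded call; returns the same AsyncResult as pool.apply_async.
    payload = dill.dumps((fun, args))
    return pool.apply_async(_run_dill_encoded, (payload,))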
Example 9
def run_agents_multi_task(agents,
                            mdp_distr,
                            task_samples=5,
                            episodes=1,
                            steps=100,
                            clear_old_results=True,
                            open_plot=True,
                            verbose=False,
                            is_rec_disc_reward=False,
                            reset_at_terminal=False,
                            resample_at_terminal=False):
    '''
    Args:
        agents (list)
        mdp_distr (MDPDistribution)
        task_samples (int)
        episodes (int)
        steps (int)
        clear_old_results (bool)
        open_plot (bool)
        verbose (bool)
        is_rec_disc_reward (bool): If true records and plots discounted reward
        reset_at_terminal (bool)
        resample_at_terminal (bool)

    Summary:
        Runs each agent on the MDP distribution according to the given parameters.
        If @mdp_distr has a non-zero horizon, then gamma is set to 1 and @steps is ignored.
    '''

    # Set number of steps if the horizon is given.
    if mdp_distr.get_horizon() > 0:
        mdp_distr.set_gamma(1.0)
        steps = mdp_distr.get_horizon()

    # Experiment (for reproducibility, plotting).
    exp_params = {"task_samples":task_samples, "episodes":episodes, "steps":steps, "gamma":mdp_distr.get_gamma()}
    experiment = Experiment(agents=agents,
                mdp=mdp_distr,
                params=exp_params,
                is_episodic=episodes > 1,
                is_multi_task=True,
                clear_old_results=clear_old_results,
                is_rec_disc_reward=is_rec_disc_reward)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    start = time.clock()

    times = defaultdict(float)

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # --- SAMPLE NEW MDP ---
        for new_task in xrange(task_samples):
            print "  Sample " + str(new_task + 1) + " of " + str(task_samples) + "."

            # Sample the MDP.
            mdp = mdp_distr.sample()

            # Run the agent.
            hit_terminal, total_steps_taken = run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose, is_rec_disc_reward, reset_at_terminal, resample_at_terminal)

            # If we resample at terminal, keep grabbing MDPs until we're done.
            while resample_at_terminal and hit_terminal and total_steps_taken < steps:
                mdp = mdp_distr.sample()
                hit_terminal, steps_taken = run_single_agent_on_mdp(agent, mdp, episodes, steps - total_steps_taken, experiment, verbose, is_rec_disc_reward, reset_at_terminal, resample_at_terminal)
                total_steps_taken += steps_taken

            # Reset the agent.
            agent.reset()

            if "rmax" in agent.name:
                agent._reset_reward()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(round(times[agent], 2)) + " seconds."
    print "-------------\n"

    experiment.make_plots(open_plot=open_plot)
Example 10
def run_agents_multi_task(agents,
                            mdp_distr,
                            task_samples=5,
                            episodes=1,
                            steps=100,
                            clear_old_results=True,
                            open_plot=True,
                            verbose=False,
                            is_rec_disc_reward=False,
                            reset_at_terminal=False,
                            include_optimal=False):
    '''
    Args:
        agents (list)
        mdp_distr (MDPDistribution)
        task_samples (int)
        episodes (int)
        steps (int)

    Summary:
        Runs each agent on the MDP distribution according to the given parameters.
        If @mdp_distr has a non-zero horizon, then gamma is set to 1 and @steps is ignored.
    '''

    # Set number of steps if the horizon is given.
    if mdp_distr.get_horizon() > 0:
        mdp_distr.set_gamma(1.0)
        steps = mdp_distr.get_horizon()

    # Experiment (for reproducibility, plotting).
    exp_params = {"task_samples":task_samples, "episodes":episodes, "steps":steps}
    experiment = Experiment(agents=agents,
                mdp=mdp_distr,
                params=exp_params,
                is_episodic=episodes > 1,
                is_multi_task=True,
                clear_old_results=clear_old_results,
                is_rec_disc_reward=is_rec_disc_reward)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    start = time.clock()

    times = defaultdict(float)

    if include_optimal:
        fixed_policy_agent = FixedPolicyAgent(policy=lambda s: "", name="optimal")
        agents += [fixed_policy_agent]

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # --- SAMPLE NEW MDP ---
        for new_task in xrange(task_samples):
            print "  Sample " + str(new_task + 1) + " of " + str(task_samples) + "."

            # Sample the MDP.
            mdp = mdp_distr.sample()

            if include_optimal and agent.name == "optimal":
                vi = ValueIteration(mdp)
                vi.run_vi()
                agent.set_policy(vi.policy)

            # Run the agent.
            run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose, is_rec_disc_reward, reset_at_terminal)

            # Reset the agent.
            agent.reset()

            if "rmax" in agent.name:
                agent._reset_reward()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(round(times[agent], 2)) + " seconds."
    print "-------------\n"

    experiment.make_plots(open_plot=open_plot)
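A hedged call sketch for the @include_optimal baseline: the function itself appends a FixedPolicyAgent named "optimal" and re-plans with ValueIteration on every sampled task, so the caller only passes the flag (imports and the mdp_distr object are assumed to follow the usual simple_rl layout):

from simple_rl.agents import QLearningAgent

agents = [QLearningAgent(actions=mdp_distr.get_actions())]
run_agents_multi_task(agents,
                      mdp_distr,
                      task_samples=10,
                      episodes=1,
                      steps=50,
                      include_optimal=True)  # adds the ValueIteration-backed "optimal" agent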
Example 11
def play_markov_game(agent_ls, markov_game_mdp, instances=10, episodes=100, steps=30, verbose=False, open_plot=True):
    '''
    Args:
        agent_ls (list of Agents): See agents/AgentClass.py (and friends).
        markov_game_mdp (MarkovGameMDP): See mdp/markov_games/MarkovGameMDPClass.py.
        instances (int): Number of times to run each agent (for confidence intervals).
        episodes (int): Number of episodes for each learning instance.
        steps (int): Number of steps per episode.
        verbose (bool)
        open_plot (bool): If true opens plot.
    '''

    # Put into dict.
    agent_dict = {}
    for a in agent_ls:
        agent_dict[a.name] = a

    # Experiment (for reproducibility, plotting).
    exp_params = {"instances":instances, "episodes":episodes, "steps":steps}
    experiment = Experiment(agents=agent_dict, mdp=markov_game_mdp, params=exp_params, is_episodic=episodes > 1, is_markov_game=True)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    start = time.clock()

    # For each instance of the agent.
    for instance in range(1, instances + 1):
        print("\tInstance " + str(instance) + " of " + str(int(instances)) + ".")

        reward_dict = defaultdict(str)
        action_dict = {}

        for episode in range(1, episodes + 1):
            if verbose:
                sys.stdout.write("\tEpisode %s of %s" % (episode, episodes))
                sys.stdout.write("\b" * len("\tEpisode %s of %s" % (episode, episodes)))
                sys.stdout.flush()

            # Compute initial state/reward.
            state = markov_game_mdp.get_init_state()

            for step in range(steps):

                # Compute each agent's policy.
                for a in agent_dict.values():
                    agent_reward = reward_dict[a.name]
                    agent_action = a.act(state, agent_reward)
                    action_dict[a.name] = agent_action

                # Terminal check.
                if state.is_terminal():
                    experiment.add_experience(agent_dict, state, action_dict, defaultdict(int), state)
                    continue

                # Execute in MDP.
                reward_dict, next_state = markov_game_mdp.execute_agent_action(action_dict)

                # Record the experience.
                experiment.add_experience(agent_dict, state, action_dict, reward_dict, next_state)

                # Update pointer.
                state = next_state

            # A final update.
            for a in agent_dict.values():
                agent_reward = reward_dict[a.name]
                agent_action = a.act(state, agent_reward)
                action_dict[a.name] = agent_action

                # Process that learning instance's info at end of learning.
                experiment.end_of_episode(a.name)

            # Reset the MDP, tell the agent the episode is over.
            markov_game_mdp.reset()

        # A final update.
        for a in agent_dict.values():
            # Reset the agent and track experiment info.
            experiment.end_of_instance(a.name)
            a.reset()

    # Time stuff.
    print("Experiment took " + str(round(time.clock() - start, 2)) + " seconds.")

    experiment.make_plots(open_plot=open_plot)
Example 12
def run_agents_on_mdp(agents,
                        mdp,
                        instances=5,
                        episodes=100,
                        steps=200,
                        clear_old_results=True,
                        rew_step_count=1,
                        track_disc_reward=False,
                        open_plot=True,
                        verbose=False,
                        reset_at_terminal=False,
                        cumulative_plot=True,
                        dir_for_plot="results",
                        experiment_name_prefix="",
                        track_success=False,
                        success_reward=None):
    '''
    Args:
        agents (list of Agents): See agents/AgentClass.py (and friends).
        mdp (MDP): See mdp/MDPClass.py for the abstract class. Specific MDPs in tasks/*.
        instances (int): Number of times to run each agent (for confidence intervals).
        episodes (int): Number of episodes for each learning instance.
        steps (int): Number of steps per episode.
        clear_old_results (bool): If true, removes all results files in the relevant results dir.
        rew_step_count (int): Number of steps before recording reward.
        track_disc_reward (bool): If true, track (and plot) discounted reward.
        open_plot (bool): If true opens the plot at the end.
        verbose (bool): If true, prints status bars per episode/instance.
        reset_at_terminal (bool): If true sends the agent to the start state after terminal.
        cumulative_plot (bool): If true makes a cumulative plot, otherwise plots avg. reward per timestep.
        dir_for_plot (str): Path to the directory in which to store results and plots.
        experiment_name_prefix (str): Adds this to the beginning of the usual experiment name.
        track_success (bool): If true, tracks whether each run is successful and generates an additional success plot at the end.
        success_reward (int): If set, determines the success criteria.

    Summary:
        Runs each agent on the given mdp according to the given parameters.
        Stores results in results/<agent_name>.csv and automatically
        generates a plot and opens it.
    '''
    if track_success and success_reward is None:
        raise ValueError("(simple_rl): run_agents_on_mdp must set param @success_reward when @track_success=True.")

    # Experiment (for reproducibility, plotting).
    exp_params = {"instances":instances, "episodes":episodes, "steps":steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp,
                            params=exp_params,
                            is_episodic= episodes > 1,
                            clear_old_results=clear_old_results,
                            track_disc_reward=track_disc_reward,
                            count_r_per_n_timestep=rew_step_count,
                            cumulative_plot=cumulative_plot,
                            dir_for_plot=dir_for_plot,
                            experiment_name_prefix=experiment_name_prefix,
                            track_success=track_success,
                            success_reward=success_reward)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    time_dict = defaultdict(float)

    # Learn.
    for agent in agents:
        print(str(agent) + " is learning.")

        start = time.clock()

        # For each instance.
        for instance in range(1, instances + 1):
            print("  Instance " + str(instance) + " of " + str(instances) + ".")
            sys.stdout.flush()
            run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose, track_disc_reward, reset_at_terminal=reset_at_terminal)
            if "fixed" in agent.name:
                break
            # Reset the agent.
            agent.reset()
            mdp.end_of_instance()

        # Track how much time this agent took.
        end = time.clock()
        time_dict[agent] = round(end - start, 3)
        print()

    # Time stuff.
    print("\n--- TIMES ---")
    for agent in time_dict.keys():
        print(str(agent) + " agent took " + str(round(time_dict[agent], 2)) + " seconds.")
    print("-------------\n")

    experiment.make_plots(open_plot=open_plot)
Example 13
def run_agents_lifelong(agents,
                            mdp_distr,
                            samples=5,
                            episodes=1,
                            steps=100,
                            clear_old_results=True,
                            open_plot=True,
                            verbose=False,
                            track_disc_reward=False,
                            reset_at_terminal=False,
                            resample_at_terminal=False,
                            cumulative_plot=True,
                            dir_for_plot="results"):
    '''
    Args:
        agents (list)
        mdp_distr (MDPDistribution)
        samples (int)
        episodes (int)
        steps (int)
        clear_old_results (bool)
        open_plot (bool)
        verbose (bool)
        track_disc_reward (bool): If true records and plots discounted reward, discounted over episodes. So, if
            each episode is 100 steps, then episode 2 will start discounting as though it's step 101.
        reset_at_terminal (bool)
        resample_at_terminal (bool)
        cumulative_plot (bool)
        dir_for_plot (str)

    Summary:
        Runs each agent on the MDP distribution according to the given parameters.
        If @mdp_distr has a non-zero horizon, then gamma is set to 1 and @steps is ignored.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"samples":samples, "episodes":episodes, "steps":steps}
    experiment = Experiment(agents=agents,
                    mdp=mdp_distr,
                    params=exp_params,
                    is_episodic=episodes > 1,
                    is_lifelong=True,
                    clear_old_results=clear_old_results,
                    track_disc_reward=track_disc_reward,
                    cumulative_plot=cumulative_plot,
                    dir_for_plot=dir_for_plot)

    # Record how long each agent spends learning.
    print("Running experiment: \n" + str(experiment))
    start = time.clock()

    times = defaultdict(float)

    # Learn.
    for agent in agents:
        print(str(agent) + " is learning.")
        start = time.clock()

        # --- SAMPLE NEW MDP ---
        for new_task in range(samples):
            print("  Sample " + str(new_task + 1) + " of " + str(samples) + ".")

            # Sample the MDP.
            mdp = mdp_distr.sample()

            # Run the agent.
            hit_terminal, total_steps_taken, _ = run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose, track_disc_reward, reset_at_terminal, resample_at_terminal)

            # If we resample at terminal, keep grabbing MDPs until we're done.
            while resample_at_terminal and hit_terminal and total_steps_taken < steps:
                mdp = mdp_distr.sample()
                hit_terminal, steps_taken, _ = run_single_agent_on_mdp(agent, mdp, episodes, steps - total_steps_taken, experiment, verbose, track_disc_reward, reset_at_terminal, resample_at_terminal)
                total_steps_taken += steps_taken

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)


    # Time stuff.
    print("\n--- TIMES ---")
    for agent in times.keys():
        print(str(agent) + " agent took " + str(round(times[agent], 2)) + " seconds.")
    print("-------------\n")

    experiment.make_plots(open_plot=open_plot)
Example 14
def run_agents_on_mdp(agents,
                      mdp,
                      num_instances=20,
                      num_episodes=2000,
                      num_steps=50):
    '''
    Args:
        agents (list of Agents): See agents/AgentClass.py (and friends).
        mdp (MDP): See mdp/MDPClass.py for the abstract class. Specific MDPs in tasks/*.
        num_instances (int) [opt]: Number of times to run each agent (for confidence intervals).
        num_episodes (int) [opt]: Number of episodes for each learning instance.
        num_steps (int) [opt]: Number of steps per episode.

    Summary:
        Runs each agent on the given mdp according to the given parameters.
        Stores results in results/<agent_name>.csv and automatically
        generates a plot and opens it.
    '''

    # Experiment (for reproducibility, plotting).
    exp_params = {
        "num_instances": num_instances,
        "num_episodes": num_episodes,
        "num_steps": num_steps
    }
    experiment = Experiment(agents=agents, mdp=mdp, params=exp_params)

    # Record how long each agent spends learning.
    times = defaultdict(float)
    print "Running experiment: \n" + str(experiment)

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # For each instance of the agent.
        for instance in xrange(1, num_instances + 1):
            print "\tInstance " + str(instance) + " of " + str(
                num_instances) + "."

            # For each episode.
            for episode in xrange(1, num_episodes + 1):

                # Compute initial state/reward.
                state = mdp.get_init_state()
                print "init:", state
                reward = 0

                for step in xrange(num_steps):
                    # Compute the agent's policy.
                    action = agent.act(state, reward)

                    # Execute the action in the MDP.
                    reward, next_state = mdp.execute_agent_action(action)

                    # Record the experience.
                    experiment.add_experience(agent, state, action, reward,
                                              next_state)

                    # Check if terminal state.
                    if next_state.is_terminal():
                        break

                    # Update pointer.
                    state = next_state

                # Process experiment info at end of episode.
                experiment.end_of_episode(agent)

                # Reset the MDP, tell the agent the episode is over.
                mdp.reset()
                agent.end_of_episode()

            # Process that learning instance's info at end of learning.
            experiment.end_of_instance(agent)

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(times[agent]) + " seconds."
    print "-------------\n"

    experiment.make_plots()
Example 15
def run_agents_multi_task(agents,
                          mdp_distr,
                          instances,
                          num_switches,
                          steps,
                          clear_old_results=True):
    '''
    Args:
        agents (list of Agents)
        mdp_distr (dict): Maps sampling probabilities to MDPs.
        instances (int): Number of times to run each agent.
        num_switches (int): Number of task switches (MDP samples) per instance.
        steps (int): Number of steps per task.
        clear_old_results (bool)

    Summary:
        Runs each agent across a sequence of MDPs sampled from @mdp_distr.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "steps": steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp_distr,
                            params=exp_params,
                            is_multi_task=True,
                            clear_old_results=clear_old_results)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    start = time.clock()

    times = defaultdict(float)

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # For each instance of the agent.
        for instance in xrange(1, instances + 1):
            print "\tInstance " + str(instance) + " of " + str(instances) + "."

            # --- SAMPLE NEW MDP ---
            for new_task in xrange(num_switches):
                mdp_id = np.random.multinomial(
                    1, mdp_distr.keys()).tolist().index(1)
                mdp = mdp_distr[mdp_distr.keys()[mdp_id]]

                # Compute initial state/reward.
                state = mdp.get_init_state()

                reward = 0
                episode_start_time = time.clock()

                prog_bar_len = _make_step_progress_bar()
                # print prog_bar_len, steps
                for step in xrange(steps):
                    # print "\t  Step " + str(step)
                    if int(prog_bar_len * float(step) / steps) > int(
                            prog_bar_len * float(step - 1) / steps):
                        _increment_bar()

                    # Compute the agent's policy.
                    action = agent.act(state, reward)

                    # Terminal check.
                    if state.is_terminal():
                        # Self loop if in a terminal state.
                        experiment.add_experience(agent, state, action, 0,
                                                  state)
                        continue

                    # Execute in MDP.
                    reward, next_state = mdp.execute_agent_action(action)

                    # Record the experience.
                    experiment.add_experience(agent, state, action, reward,
                                              next_state)

                    # Update pointer.
                    state = next_state

                if "-sa" in agent.name:
                    agent.new_task()
                elif "rmax" in agent.name:
                    agent._reset_reward()

                _increment_bar()
                sys.stdout.write("\n")

                # A final update.
                action = agent.act(state, reward)

                # Process experiment info at end of episode.
                experiment.end_of_episode(agent)

                # Reset the MDP, tell the agent the episode is over.
                mdp.reset()
                agent.end_of_episode()

            # Process that learning instance's info at end of learning.
            experiment.end_of_instance(agent)

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(times[agent]) + " seconds."
    print "-------------\n"

    experiment.make_plots(open_plot=True)
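The task-sampling line above draws a single index from a categorical distribution over mdp_distr.keys(); the snippet below shows that mechanism in isolation (probabilities are illustrative):

import numpy as np

probs = [0.2, 0.5, 0.3]                 # e.g. the sampling probabilities in mdp_distr.keys()
draw = np.random.multinomial(1, probs)  # one trial, e.g. array([0, 1, 0])
mdp_id = draw.tolist().index(1)         # position of the single 1 = sampled task index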
Example 16
def play_markov_game(agent_dict,
                     markov_game_mdp,
                     instances=10,
                     episodes=100,
                     steps=30):
    '''
    Args:
        agent_dict (dict of Agents): See agents/AgentClass.py (and friends).
        markov_game_mdp (MarkovGameMDP): See mdp/markov_games/MarkovGameMDPClass.py.
        instances (int) [opt]: Number of times to run each agent (for confidence intervals).
        episodes (int) [opt]: Number of episodes for each learning instance.
        steps (int) [opt]: Number of steps per episode.
    '''

    # Experiment (for reproducibility, plotting).
    exp_params = {
        "instances": instances
    }  #, "episodes":episodes, "steps":steps}
    experiment = Experiment(agents=agent_dict,
                            mdp=markov_game_mdp,
                            params=exp_params,
                            is_episodic=episodes > 1,
                            is_markov_game=True)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    start = time.clock()

    # For each instance of the agent.
    for instance in xrange(1, instances + 1):
        print "\tInstance " + str(instance) + " of " + str(instances) + "."

        reward_dict = defaultdict(str)
        action_dict = {}

        for episode in xrange(1, episodes + 1):
            print "\t\tEpisode " + str(episode) + " of " + str(episodes) + "."
            # Compute initial state/reward.
            state = markov_game_mdp.get_init_state()

            for step in xrange(steps):

                # Compute each agent's policy.
                for a in agent_dict.values():
                    agent_reward = reward_dict[a.name]
                    agent_action = a.act(state, agent_reward)
                    action_dict[a.name] = agent_action

                # Terminal check.
                if state.is_terminal():
                    experiment.add_experience(agent_dict, state, action_dict,
                                              defaultdict(int), state)
                    continue

                # Execute in MDP.
                reward_dict, next_state = markov_game_mdp.execute_agent_action(
                    action_dict)

                # Record the experience.
                experiment.add_experience(agent_dict, state, action_dict,
                                          reward_dict, next_state)

                # Update pointer.
                state = next_state

            # A final update.
            for a in agent_dict.values():
                agent_reward = reward_dict[a.name]
                agent_action = a.act(state, agent_reward)
                action_dict[a.name] = agent_action

                # Process that learning instance's info at end of learning.
                experiment.end_of_episode(a.name)

            # Reset the MDP, tell the agent the episode is over.
            markov_game_mdp.reset()

        # A final update.
        for a in agent_dict.values():
            # Reset the agent and track experiment info.
            experiment.end_of_instance(a.name)
            a.reset()

    # Time stuff.
    print "Experiment took " + str(time.clock() - start) + " seconds."

    experiment.make_plots(cumulative=True)
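Unlike the later list-based variants, this older signature takes a dict of agents keyed by name; a hedged usage sketch (imports and the RockPaperScissorsMDP task as in the sketch after Example 2, agent names illustrative):

markov_game = RockPaperScissorsMDP()
agent_a = QLearningAgent(actions=markov_game.get_actions(), name="q-learner-a")
agent_b = QLearningAgent(actions=markov_game.get_actions(), name="q-learner-b")
agent_dict = {agent_a.name: agent_a, agent_b.name: agent_b}

play_markov_game(agent_dict, markov_game, instances=10, episodes=100, steps=30)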