def _make_dqn_option_policy(mdp, subgoal, n_trajs=100, n_steps=100):
    '''
    LEARN-DQN-AGENT-ON-MDP: learn a linear Q agent on an intrinsic-reward MDP
    built around the given subgoal, and return the learned policy.
    '''

    # TODO: mdp + subgoal
    # TODO: How much should we train each policy?
    #       Near optimal for now?

    env_name = mdp.env_name
    in_mdp = IntrinsicMDP(subgoal, env_name)

    # Build a subgoal reward function
    # TODO: implement a reward function based on the eigenvector
    # def intrinsic_r(state):
    #     if state in subgoal:
    #         return 1.0
    #     else:
    #         return 0.0

    # print('type(subgoal)=', subgoal)

    num_feats = in_mdp.get_num_state_feats()
    dqn_agent = LinearQAgent(in_mdp.get_actions(), num_feats)

    run_single_agent_on_mdp(dqn_agent, in_mdp, episodes=n_trajs, steps=n_steps)

    return dqn_agent.policy
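
# The TODO above asks for an eigenvector-based subgoal reward. The sketch below is a
# hypothetical, self-contained completion of the commented-out intrinsic_r (it assumes
# `subgoal` is a collection of states and, optionally, an `eigenvector` dict mapping
# state -> eigenvector value); it is not part of the original example.
def make_intrinsic_reward(subgoal, eigenvector=None):
    def intrinsic_r(state, next_state=None):
        if eigenvector is not None and next_state is not None:
            # Eigenoption-style reward: change in eigenvector value along the transition.
            return eigenvector[next_state] - eigenvector[state]
        # Sparse fallback: reward 1.0 for reaching the subgoal region.
        return 1.0 if state in subgoal else 0.0
    return intrinsic_r
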
Example 2
def plot_parameters(pars, md):
    cur_cell_rewards = [
        pars["white"][0], pars["yellow"][0], pars["red"][0], pars["green"][0],
        pars["purple"][0], -500  # reward for black cells is fixed
    ]
    # cur_cell_rewards = pars
    print(cur_cell_rewards)
    md.mdp = NavigationWorldMDP(width=md.side,
                                height=md.side,
                                nav_cell_types=md.nav_cell_types,
                                nav_cell_rewards=cur_cell_rewards,
                                nav_cell_p_or_locs=md.nav_cell_p_or_locs,
                                goal_cell_types=md.goal_cell_types,
                                goal_cell_rewards=md.goal_rew,
                                goal_cell_locs=md.goal_cell_loc,
                                init_loc=md.start_loc,
                                rand_init=False,
                                gamma=0.95,
                                slip_prob=0,
                                step_cost=0)

    md.agent = QLearningAgent(md.mdp.get_actions(), epsilon=md.eps)
    run_single_agent_on_mdp(md.agent,
                            md.mdp,
                            episodes=md.episodes,
                            steps=md.steps)
    md.agent.epsilon = 0
    md.mdp.slip_prob = 0
    _, steps_taken, reward, states = md.run_experiment(md.agent, md.mdp)
    # print('Best result observation:')
    print([md.count_turns(states), steps_taken, reward])
    # print('Observed data result:')
    # print(md.observed_data)
    md.mdp.visualize_grid(trajectories=[states], plot=False)
    return [md.count_turns(states), steps_taken, reward]
def main():

    # Setup MDP, Agents.
    mdp = FourRoomMDP(11, 11, goal_locs=[(11, 11)], gamma=0.9, step_cost=0.0)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.4)
    viz = parse_args()

    # Choose viz type.
    viz = "learning"

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
    elif viz == "interactive":
        mdp.visualize_interaction()
Example 4
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(5, 5, goal_locs=[(5, 5)], gamma=0.99, step_cost=0.01)
    # mdp = make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.5) 
    rm_agent = RMaxAgent(mdp.get_actions())
    viz = parse_args()
    viz = "learning"

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
Example 5
def main():
    
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)], lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.1)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.2) 
    viz = parse_args()

    # Choose viz type.
    viz = "value"

    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem and then visualize the agent's resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show agent's interaction with the environment.
        mdp.visualize_learning(ql_agent, delay=0.005, num_ep=500, num_steps=200)
    elif viz == "interactive":
        # Press <1>, <2>, <3>, and so on to execute action 1, action 2, etc.
        mdp.visualize_interaction()
def main(open_plot=True):
    # Taxi initial state attributes.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
    walls = []
    mdp = TaxiOOMDP(width=4,
                    height=4,
                    agent=agent,
                    walls=walls,
                    passengers=passengers)

    # Agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    viz = False
    if viz:
        # Visualize Taxi.
        run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
        mdp.visualize_agent(ql_agent)
    else:
        # Run experiment and make plot.
        run_agents_on_mdp([ql_agent, rand_agent],
                          mdp,
                          instances=10,
                          episodes=1,
                          steps=500,
                          reset_at_terminal=True,
                          open_plot=open_plot)
Example 7
    def func(self, *params, n_obs=100, batch_size=1, random_state=None):
        """Generate a sequence of samples from the Open AI env.

        Parameters
        ----------
        params : array of envs
        random_state : RandomState, optional

        """

        # fix locations instead of probabilities! fixed map multiple init_locs!
        rewards = []
        params = np.array(params).reshape(self.param_dim, -1)
        batches = params.shape[1]

        for i in range(batches):
            cur_cell_rewards = [x for x in params[:, i]]
            # reward for black cells is fixed
            cur_cell_rewards.append(-500)

            if self.prev_cell_rewards != cur_cell_rewards:
                self.mdp = NavigationWorldMDP(
                    width=self.side,
                    height=self.side,
                    nav_cell_types=self.nav_cell_types,
                    nav_cell_rewards=cur_cell_rewards,
                    nav_cell_p_or_locs=self.nav_cell_p_or_locs,
                    goal_cell_types=self.goal_cell_types,
                    goal_cell_rewards=self.goal_rew,
                    goal_cell_locs=self.goal_cell_loc,
                    init_loc=self.start_loc,
                    rand_init=False,
                    slip_prob=0)

                self.agent = QLearningAgent(self.mdp.get_actions(),
                                            epsilon=self.eps)
                run_single_agent_on_mdp(self.agent,
                                        self.mdp,
                                        episodes=self.episodes,
                                        steps=self.steps)

            self.agent.epsilon = 0
            self.mdp.slip_prob = self.slip

            # print('Parameters:')
            # print(cur_cell_rewards)
            for j in range(1):
                finished, steps_taken, reward, states = self.run_experiment(
                    self.agent, self.mdp)
                turns = self.count_turns(states)
                ep_reward = [turns, steps_taken, reward]
                # print('Corresponding reward:')
                # print([turns, steps_taken, reward])
                # self.mdp.visualize_grid(trajectories=[states], traj_colors_auto=False)

            rewards.append(ep_reward)

            self.prev_cell_rewards = cur_cell_rewards
        return rewards
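
# Standalone sketch of the parameter reshaping convention used in func above.
# param_dim = 5 is an assumed value (one tunable reward per colored cell type);
# the black-cell reward is appended as a fixed -500, as in the code above.
import numpy as np

param_dim = 5
flat_params = np.arange(10.0)                 # two flattened parameter vectors
batch = flat_params.reshape(param_dim, -1)    # shape (5, 2): one column per batch element
for i in range(batch.shape[1]):
    cur_cell_rewards = [x for x in batch[:, i]] + [-500.0]
    print(cur_cell_rewards)
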
    def __init__(self):
        self.base_human_model = PuddleMDP()
        self.base_agent = LinearQAgent(
            actions=self.base_human_model.get_actions(), num_features=2)
        run_single_agent_on_mdp(self.base_agent,
                                self.base_human_model,
                                episodes=10000,
                                steps=4)
        # TODO Add other settings

        self.current_agent = self.base_agent
        self.current_mdp = self.base_human_model
Example 9
    def __init__(self):
        self.base_human_model = PuddleMDP()
        self.base_agent = ModQLearningAgent(
            actions=self.base_human_model.get_actions(),
            epsilon=0.5,
            anneal=True)
        run_single_agent_on_mdp(self.base_agent,
                                self.base_human_model,
                                episodes=10000,
                                steps=60,
                                verbose=True)
        print("Q func", self.base_agent.q_func)
        self.test_run = False

        if self.test_run:
            self.novice_model_1 = self.base_human_model
            self.novice_model_2 = self.base_human_model
            self.fully_actulized_model = self.base_human_model

            self.novice_agent_1 = self.base_agent
            self.novice_agent_2 = self.base_agent
            self.fully_actulized_agent = self.base_agent
        else:
            self.novice_model_1 = PuddleMDP2()
            self.novice_agent_1 = ModQLearningAgent(
                actions=self.novice_model_1.get_actions(),
                epsilon=0.5,
                anneal=True)
            run_single_agent_on_mdp(self.novice_agent_1,
                                    self.novice_model_1,
                                    episodes=10000,
                                    steps=60,
                                    verbose=True)

            self.novice_model_2 = PuddleMDP3()
            self.novice_agent_2 = ModQLearningAgent(
                actions=self.novice_model_2.get_actions(),
                epsilon=0.5,
                anneal=True)
            run_single_agent_on_mdp(self.novice_agent_2,
                                    self.novice_model_2,
                                    episodes=10000,
                                    steps=60,
                                    verbose=True)

            self.fully_actulized_model = PuddleMDP4()
            self.fully_actulized_agent = ModQLearningAgent(
                actions=self.fully_actulized_model.get_actions(),
                epsilon=0.5,
                anneal=True)
            run_single_agent_on_mdp(self.fully_actulized_agent,
                                    self.fully_actulized_model,
                                    episodes=10000,
                                    steps=60,
                                    verbose=True)

        # TODO Add other settings

        self.current_agent = self.base_agent
        self.current_mdp = self.base_human_model
Example 10
def main(open_plot=True):
    # Taxi initial state attributes.
    agent = {"x":1, "y":1, "has_passenger":0}
    passengers = [{"x":3, "y":2, "dest_x":2, "dest_y":3, "in_taxi":0}]
    walls = []
    mdp = TaxiOOMDP(width=4, height=4, agent=agent, walls=walls, passengers=passengers)

    # Agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions()) 
    rand_agent = RandomAgent(actions=mdp.get_actions())

    viz = False
    if viz:
        # Visualize Taxi.
        run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
        mdp.visualize_agent(ql_agent)
    else:
        # Run experiment and make plot.
        run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=10, episodes=1, steps=500, reset_at_terminal=True, open_plot=open_plot)
Example 11
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(9, 9, goal_locs=[(9, 9)], gamma=0.95)
    ql_agent = QLearnerAgent(mdp.get_actions())

    viz = parse_args()

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        vi = ValueIteration(mdp)
        vi.run_vi()
        policy = vi.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print "\n", str(ql_agent), "interacting with", str(mdp)
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
Example 12
def branching_factor_experiment(min_options=0,
                                max_options=20,
                                increment=2,
                                instances=5,
                                epsilon=0.05):
    '''
    Args:
        min_options (int)
        max_options (int)
        increment (int)

    Summary:
        Runs an experiment contrasting learning performance for different # options.
    '''
    # Define MDP.
    grid_size = 7
    mdp = FourRoomMDP(width=grid_size,
                      height=grid_size,
                      goal_locs=[(grid_size, grid_size)])

    # Make State Abstraction.
    states, _ = ah.compute_reachable_state_space(mdp, sample_rate=50)
    state_abstr = core.compute_phi_given_m(mdp,
                                           four_rooms_predicate_9x9,
                                           level=1,
                                           states=states)

    x_axis = range(min_options, max_options + 1, increment)
    y_axis = defaultdict(list)  #[] #[0] * len(x_axis)
    conf_intervals = defaultdict(list)
    num_options_performance = defaultdict(lambda: defaultdict(list))

    # Choose dependent variable (either #steps per episode or #episodes).
    d_var_range = [(20, 5), (40, 250), (400, 2500)]

    for steps, episodes in d_var_range:
        print "steps, episodes", steps, episodes

        # Evaluate.
        for i, instance in enumerate(range(instances)):
            print "\tInstance", instance + 1, "of", str(instances) + "."

            # Make initial Options.
            for num_options in x_axis:

                options, _ = make_near_optimal_phi_relative_options(
                    mdp,
                    state_abstr,
                    'eps-greedy',
                    num_rand_opts=num_options - 1,
                    eps=epsilon)
                action_abstr = ActionAbstraction(
                    options=options, prim_actions=mdp.get_actions())

                # Make agent.
                AgentClass = RMaxAgent  # DoubleQAgent, QLearningAgent, SarsaAgent
                sa_aa_agent = AbstractionWrapper(
                    AgentClass,
                    agent_params={"actions": mdp.get_actions()},
                    state_abstr=state_abstr,
                    action_abstr=action_abstr,
                    name_ext="-$\\phi,O$")

                _, _, value_per_episode = run_single_agent_on_mdp(
                    sa_aa_agent, mdp, episodes=episodes, steps=steps)
                mdp.reset()

                num_options_performance[(steps, episodes)][num_options].append(
                    value_per_episode[-1])

    ############
    # Other types

    # Just state abstraction.
    steps, episodes = d_var_range[-1][0], d_var_range[-1][1]
    sa_agent = AbstractionWrapper(AgentClass,
                                  agent_params={"actions": mdp.get_actions()},
                                  state_abstr=state_abstr,
                                  action_abstr=None,
                                  name_ext="-$\\phi$")
    _, _, value_per_episode = run_single_agent_on_mdp(sa_agent,
                                                      mdp,
                                                      episodes=episodes,
                                                      steps=steps)
    num_options_performance[(d_var_range[-1][0],
                             d_var_range[-1][1])]["phi"].append(
                                 value_per_episode[-1])
    y_axis["phi"] = [value_per_episode[-1]]

    # Run random options.
    options = make_fixed_random_options(mdp, state_abstr)
    action_abstr = ActionAbstraction(options=options,
                                     prim_actions=mdp.get_actions())
    AgentClass = QLearningAgent
    rand_opt_agent = AbstractionWrapper(
        AgentClass,
        agent_params={"actions": mdp.get_actions()},
        state_abstr=state_abstr,
        action_abstr=action_abstr,
        name_ext="-$\\phi,O_{\\text{random}}$")
    _, _, value_per_episode = run_single_agent_on_mdp(rand_opt_agent,
                                                      mdp,
                                                      episodes=episodes,
                                                      steps=steps)
    num_options_performance[(d_var_range[-1][0],
                             d_var_range[-1][1])]["random"].append(
                                 value_per_episode[-1])
    y_axis["random"] = [value_per_episode[-1]]

    # Make optimal agent.
    value_iter = ValueIteration(mdp)
    value_iter.run_vi()
    optimal_agent = FixedPolicyAgent(value_iter.policy)
    _, _, value_per_episode = run_single_agent_on_mdp(optimal_agent,
                                                      mdp,
                                                      episodes=episodes,
                                                      steps=steps)
    y_axis["optimal"] = [value_per_episode[-1]]
    total_steps = d_var_range[0][0] * d_var_range[0][1]

    # Confidence intervals.
    for dependent_var in d_var_range:
        for num_options in x_axis:
            # Compute mean and standard error.
            avg_for_n = float(
                sum(num_options_performance[dependent_var]
                    [num_options])) / instances
            std_deviation = np.std(
                num_options_performance[dependent_var][num_options])
            std_error = 1.96 * (std_deviation / math.sqrt(
                len(num_options_performance[dependent_var][num_options])))
            y_axis[dependent_var].append(avg_for_n)
            conf_intervals[dependent_var].append(std_error)

    plt.xlabel("$|O_\\phi|$")
    plt.xlim([1, len(x_axis)])
    plt.ylabel("$V^{\\hat{\\pi}_{O_\\phi}}(s_0)$")
    plt.tight_layout()  # Keeps the spacing nice.

    # Add just state abstraction.
    ep_val_del_q_phi = y_axis["phi"]
    label = "$O_{\\phi}$"  #" N=1e" + str(str(total_steps).count("0")) + "$"
    plt.plot(x_axis, [ep_val_del_q_phi] * len(x_axis),
             marker="+",
             linestyle="--",
             linewidth=1.0,
             color=PLOT_COLORS[-1],
             label=label)

    # Add random options.
    ep_val_del_q = y_axis["random"]
    label = "$O_{random}$"  #" N=1e" + str(str(total_steps).count("0")) + "$"
    plt.plot(x_axis, [ep_val_del_q] * len(x_axis),
             marker="x",
             linestyle="--",
             linewidth=1.0,
             color=PLOT_COLORS[0])  #, label=label)

    # Add optimal.
    ep_val_optimal = y_axis["optimal"]
    plt.plot(x_axis, [ep_val_optimal] * len(x_axis),
             linestyle="-",
             linewidth=1.0,
             color=PLOT_COLORS[1])  #, label="$\\pi^*$")

    for i, dependent_var in enumerate(d_var_range):
        total_steps = dependent_var[0] * dependent_var[1]
        label = "$O_{\\phi,Q_\\varepsilon^*}, N=1e" + str(
            str(total_steps).count("0")) + "$"
        plt.plot(x_axis,
                 y_axis[dependent_var],
                 marker="x",
                 color=PLOT_COLORS[i + 2],
                 linewidth=1.5,
                 label=label)

        # Confidence intervals.
        top = np.add(y_axis[dependent_var], conf_intervals[dependent_var])
        bot = np.subtract(y_axis[dependent_var], conf_intervals[dependent_var])
        plt.fill_between(x_axis,
                         top,
                         bot,
                         alpha=0.25,
                         color=PLOT_COLORS[i + 2])

    plt.legend()
    plt.savefig("branching_factor_results.pdf", format="pdf")
    plt.cla()
    plt.close()
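
# The confidence intervals above are 1.96 * standard error under a normal
# approximation. A minimal standalone version of that computation (the sample
# values here are made up for illustration):
import math
import numpy as np

samples = [0.62, 0.58, 0.71, 0.66, 0.60]      # hypothetical per-instance returns
mean = sum(samples) / float(len(samples))
std_error = 1.96 * (np.std(samples) / math.sqrt(len(samples)))
print(mean, "+/-", std_error)
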
Example 13
from simple_rl.agents import QLearnerAgent, RandomAgent
from simple_rl.tasks import TaxiOOMDP, BlockDudeOOMDP
from simple_rl.run_experiments import run_agents_on_mdp, run_single_agent_on_mdp

# Taxi initial state attributes.
agent = {"x": 1, "y": 1, "has_passenger": 0}
passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
walls = []
mdp = TaxiOOMDP(width=4,
                height=4,
                agent=agent,
                walls=walls,
                passengers=passengers)

ql_agent = QLearnerAgent(actions=mdp.get_actions())
rand_agent = RandomAgent(actions=mdp.get_actions())

viz = False
if viz:
    # Visualize Taxi.
    run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
    mdp.visualize_agent(ql_agent)
else:
    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=10,
                      episodes=100,
                      steps=150,
                      reset_at_terminal=True)
Example 14
def prior_use_experiment(run_experiment=True, open_plot=True, verbose=True):
    """
    Prior use experiment:
    Record the ratio of prior use during the model's distance computation in the simple setting of interacting
    sequentially with two different environments.
    :param run_experiment: (bool) set to False for plot only
    :param open_plot: (bool) set to False to disable plot (only saving)
    :param verbose: (bool)
    :return: None
    """
    w = 4
    h = 4
    walls = [(2, 2), (3, 2), (4, 2), (2, 4)]
    env1 = HeatMap(width=w,
                   height=h,
                   init_loc=(1, 1),
                   goal_locs=[(w, h)],
                   is_goal_terminal=False,
                   walls=walls,
                   slip_prob=0.1,
                   goal_reward=1.0,
                   reward_span=1.0)
    env2 = HeatMap(width=w,
                   height=h,
                   init_loc=(1, 1),
                   goal_locs=[(w - 1, h)],
                   is_goal_terminal=False,
                   walls=walls,
                   slip_prob=0.05,
                   goal_reward=0.6,
                   reward_span=1.5)

    # Compute needed number of samples for L-R-MAX to achieve epsilon optimal behavior with probability (1 - delta)
    epsilon = .1
    delta = .05
    m_r = np.log(2. / delta) / (2. * epsilon**2)
    m_t = 2. * (np.log(2**(float(w * h) - float(len(walls))) - 2.) -
                np.log(delta)) / (epsilon**2)
    m = int(max(m_r, m_t))

    names = []

    for p in PRIORS:
        results = []
        name = 'default'
        for i in range(N_INSTANCES):
            agent = LRMaxExp(actions=env1.get_actions(),
                             gamma=GAMMA,
                             count_threshold=m,
                             epsilon=epsilon,
                             prior=p)
            name = agent.name

            if run_experiment:
                if verbose:
                    print('Running instance', i + 1, 'of', N_INSTANCES,
                          'for agent', name)

                run_single_agent_on_mdp(agent,
                                        env1,
                                        episodes=N_EPISODES,
                                        steps=N_STEPS,
                                        experiment=None,
                                        verbose=False,
                                        track_disc_reward=False,
                                        reset_at_terminal=False,
                                        resample_at_terminal=False)
                agent.reset()
                run_single_agent_on_mdp(agent,
                                        env2,
                                        episodes=N_EPISODES,
                                        steps=N_STEPS,
                                        experiment=None,
                                        verbose=False,
                                        track_disc_reward=False,
                                        reset_at_terminal=False,
                                        resample_at_terminal=False)

                results.append(agent.get_results())

        names.append(name)

        # Save results
        if run_experiment:
            utils.save_result(results, ROOT_PATH, name)

    # Plot
    utils.plot_computation_number_results(ROOT_PATH, names, open_plot)
    utils.plot_time_step_results(ROOT_PATH, names, open_plot)
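
# Quick standalone check of the count threshold m used above, with the same
# formulas and the grid parameters from prior_use_experiment (w = h = 4, four
# walls, epsilon = 0.1, delta = 0.05):
import numpy as np

w, h, n_walls = 4, 4, 4
epsilon, delta = 0.1, 0.05
m_r = np.log(2. / delta) / (2. * epsilon ** 2)
m_t = 2. * (np.log(2 ** (float(w * h) - float(n_walls)) - 2.) - np.log(delta)) / (epsilon ** 2)
print(int(max(m_r, m_t)))   # count threshold m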