Example 1
    def __init__(self,
                 actions,
                 num_features,
                 rand_init=True,
                 name="ql-linear",
                 alpha=0.2,
                 gamma=0.99,
                 epsilon=0.2,
                 explore="uniform",
                 rbf=False,
                 anneal=True):
        name = name + "-rbf" if rbf else name
        QLearnerAgent.__init__(self,
                               actions=list(actions),
                               name=name,
                               alpha=alpha,
                               gamma=gamma,
                               epsilon=epsilon,
                               explore=explore,
                               anneal=anneal)
        self.num_features = num_features
        # Initialize the weight vector: one block of num_features weights per action.
        if rand_init:
            self.weights = np.random.random(self.num_features *
                                            len(self.actions))
        else:
            self.weights = np.zeros(self.num_features * len(self.actions))

        self.rbf = rbf
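The flat weight vector above holds one block of num_features weights per action, so a linear Q-value is the dot product of a state's feature vector with that action's block. A minimal sketch of the lookup, assuming a precomputed feature array; the q_value name and feats argument are illustrative, not the library's API:

    def q_value(self, feats, action):
        # Slice out this action's block of the flat weight vector.
        start = self.actions.index(action) * self.num_features
        return np.dot(self.weights[start:start + self.num_features], feats)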
Example 2
def main():
    # Setup MDP, Agents.
    size = 5
    agent = {
        "x": 1,
        "y": 1,
        "dx": 1,
        "dy": 0,
        "dest_x": size,
        "dest_y": size,
        "has_block": 0
    }
    blocks = [{"x": size, "y": 1}]
    # Lava fills the middle row; // keeps the y-coordinate an int under Python 3.
    lavas = [{"x": x + 1, "y": (size + 1) // 2} for x in range(size)]

    mdp = TrenchOOMDP(size, size, agent, blocks, lavas)
    ql_agent = QLearnerAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    # run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=30, episodes=250, steps=250)

    vi = ValueIteration(mdp, delta=0.0001, max_iterations=5000)
    iters, val = vi.run_vi()
    print " done."
    states = vi.get_states()
    num_states = len(states)
    print(num_states, states)
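Since value iteration has converged at this point, vi.policy can be wrapped in a FixedPolicyAgent and compared against the learners, mirroring the commented-out experiment above. A minimal sketch that could be appended inside main(), with imports assumed to match the other examples:

    from simple_rl.agents import FixedPolicyAgent

    vi_agent = FixedPolicyAgent(vi.policy, name="vi-optimal")
    run_agents_on_mdp([vi_agent, ql_agent, rand_agent],
                      mdp,
                      instances=30,
                      episodes=250,
                      steps=250)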
Example 3
def main():
    # Command line args.
    task, rom = parse_args()

    # Setup the MDP.
    mdp = choose_mdp(task, rom)
    actions = mdp.get_actions()
    gamma = mdp.get_gamma()

    # Setup agents.
    from simple_rl.agents import RandomAgent, RMaxAgent, QLearnerAgent, LinearQLearnerAgent

    random_agent = RandomAgent(actions)
    rmax_agent = RMaxAgent(actions, gamma=gamma, horizon=4, s_a_threshold=2)
    qlearner_agent = QLearnerAgent(actions, gamma=gamma, explore="uniform")
    lqlearner_agent = LinearQLearnerAgent(actions,
                                          gamma=gamma,
                                          explore="uniform")
    agents = [qlearner_agent, random_agent]  # rmax_agent / lqlearner_agent can be swapped in.

    # Run Agents.
    if isinstance(mdp, MarkovGameMDP):
        # Markov Game.
        agents = {
            qlearner_agent.name: qlearner_agent,
            random_agent.name: random_agent
        }
        play_markov_game(agents, mdp, instances=100, episodes=1, steps=500)
    else:
        # Regular experiment.
        run_agents_on_mdp(agents, mdp, instances=50, episodes=1, steps=2000)
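Several of these mains call a parse_args helper that is not shown in the snippets. A minimal argparse-based stand-in under that assumption (flag names and defaults are hypothetical):

import argparse

def parse_args():
    # Hypothetical stand-in for the parser these snippets assume.
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", type=str, default="grid",
                        help="Task name passed to choose_mdp.")
    parser.add_argument("--rom", type=str, default="",
                        help="ROM name, for Atari-style tasks.")
    args = parser.parse_args()
    return args.task, args.rom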
Example 4
def main(open_plot=True):
    # Taxi initial state attributes.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
    walls = []
    mdp = TaxiOOMDP(width=4,
                    height=4,
                    agent=agent,
                    walls=walls,
                    passengers=passengers)

    # Agents.
    ql_agent = QLearnerAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    viz = False  # Set to True to visualize the Taxi agent instead of plotting.
    if viz:
        # Visualize Taxi.
        run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
        mdp.visualize_agent(ql_agent)
    else:
        # Run experiment and make plot.
        run_agents_on_mdp([ql_agent, rand_agent],
                          mdp,
                          instances=50,
                          episodes=1,
                          steps=2000,
                          reset_at_terminal=True,
                          open_plot=open_plot)
Example 5
def main(open_plot=True):
    # Make MDP distribution, agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearnerAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Run experiment and make plot.
    run_agents_multi_task([ql_agent, rand_agent],
                          mdp_distr,
                          task_samples=50,
                          episodes=1,
                          steps=1500,
                          reset_at_terminal=True,
                          open_plot=open_plot)
Example 6
def main(open_plot=True):
    # Setup MDP, Agents.
    markov_game = GatheringMDP()
    ql_agent = QLearnerAgent(actions=markov_game.get_actions())
    fixed_action = random.choice(markov_game.get_actions())
    fixed_agent = FixedPolicyAgent(policy=lambda s: fixed_action)

    # Run experiment and make plot.
    play_markov_game([ql_agent, fixed_agent],
                     markov_game,
                     instances=15,
                     episodes=1,
                     steps=40,
                     open_plot=open_plot)
Example 7
def main(open_plot=True):
    state_colors = defaultdict(lambda: defaultdict(lambda: "white"))
    state_colors[3][2] = "red"

    # Setup MDP, Agents.
    mdp = ColoredGridWorldMDP(state_colors)
    ql_agent = QLearnerAgent(actions=mdp.get_actions()) 
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=15,
                      episodes=500,
                      steps=40,
                      open_plot=open_plot)
Example 8
def main():
    # Make MDP distribution, agents.
    mdp_distr = make_mdp_distr(mdp_class="grid")
    ql_agent = QLearnerAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Run experiment and make plot.
    run_agents_multi_task([ql_agent, rand_agent],
                          mdp_distr,
                          task_samples=30,
                          episodes=100,
                          steps=50,
                          reset_at_terminal=True,
                          include_optimal=True)
Example 9
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=10,
                       height=10,
                       init_loc=(1, 1),
                       goal_locs=[(10, 10)])
    ql_agent = QLearnerAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=5,
                      episodes=100,
                      steps=150,
                      open_plot=open_plot)
Example 10
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearnerAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Make goal-based option agent.
    goal_based_options = aa_helpers.make_goal_based_options(mdp_distr)
    goal_based_aa = ActionAbstraction(prim_actions=mdp_distr.get_actions(),
                                      options=goal_based_options)
    option_agent = AbstractionWrapper(QLearnerAgent,
                                      actions=mdp_distr.get_actions(),
                                      action_abstr=goal_based_aa)

    # Run experiment and make plot.
    run_agents_multi_task([ql_agent, rand_agent, option_agent],
                          mdp_distr,
                          task_samples=10,
                          episodes=100,
                          steps=150,
                          open_plot=open_plot)
Example 11
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(9, 9, goal_locs=[(9, 9)], gamma=0.95)
    ql_agent = QLearnerAgent(mdp.get_actions())

    viz = parse_args()

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        vi = ValueIteration(mdp)
        vi.run_vi()
        policy = vi.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print "\n", str(ql_agent), "interacting with", str(mdp)
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
Example 12
def main():
    # Command line args.
    task, rom = parse_args()

    # Setup the MDP.
    mdp = choose_mdp(task, rom)
    actions = mdp.get_actions()
    gamma = mdp.get_gamma()

    # Setup agents.
    random_agent = RandomAgent(actions)
    rmax_agent = RMaxAgent(actions, gamma=gamma)
    qlearner_agent = QLearnerAgent(actions, gamma=gamma)
    lin_approx_agent = LinearApproxQLearnerAgent(actions, gamma=gamma)
    grad_boost_agent = GradientBoostingAgent(actions,
                                             gamma=gamma,
                                             explore="softmax")

    # Choose agents.
    agents = [lin_approx_agent, random_agent]

    # Run experiments.
    run_agents_on_mdp(agents, mdp)
Example 13
def main(eps=0.1, open_plot=True):

    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Setup multitask setting.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print "Making and solving avg MDP...",
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy,
                                            name="transferFixed")
    rand_agent = RandomAgent(actions, name="$\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = avg_mdp_vi.get_q_function()

    if alg == "q":
        pure_ql_agent = QLearnerAgent(actions, epsilon=eps, name="Q-0")
        qmax = 1.0 * (1 - 0.99)
        # qmax = 1.0
        pure_ql_agent_opt = QLearnerAgent(actions,
                                          epsilon=eps,
                                          default_q=qmax,
                                          name="Q-vmax")
        transfer_ql_agent_optq = QLearnerAgent(actions,
                                               epsilon=eps,
                                               name="Q-trans-max")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)
        transfer_ql_agent_avgq = QLearnerAgent(actions,
                                               epsilon=eps,
                                               name="Q-trans-avg")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)

        agents = [
            pure_ql_agent, pure_ql_agent_opt, transfer_ql_agent_optq,
            transfer_ql_agent_avgq
        ]
    elif alg == "rmax":
        pure_rmax_agent = RMaxAgent(actions, name="RMAX-vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(actions,
                                                      name="RMAX-updating_max")
        trans_rmax_agent = RMaxAgent(actions, name="RMAX-trans_max")
        trans_rmax_agent.set_init_q_function(opt_q_func)
        agents = [pure_rmax_agent, updating_trans_rmax_agent, trans_rmax_agent]
    elif alg == "delayed-q":
        pure_delayed_ql_agent = DelayedQLearnerAgent(actions,
                                                     opt_q_func,
                                                     name="DelayedQ-vmax")
        pure_delayed_ql_agent.set_vmax()
        updating_delayed_ql_agent = UpdatingDelayedQLearnerAgent(
            actions, name="DelayedQ-updating_max")
        trans_delayed_ql_agent = DelayedQLearnerAgent(
            actions, opt_q_func, name="DelayedQ-trans-max")
        agents = [
            pure_delayed_ql_agent, updating_delayed_ql_agent,
            trans_delayed_ql_agent
        ]
    else:
        print("Unknown agent type:", alg)
        print("(choose from: q, rmax, delayed-q)")
        assert False

    # Run task.
    # TODO: Function for Learning on each MDP
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=100,
                          reset_at_terminal=is_goal_terminal,
                          is_rec_disc_reward=False,
                          cumulative_plot=True,
                          open_plot=open_plot)
Example 14
#!/usr/bin/env python

# Python imports.
import random

# Other imports.
import srl_example_setup
from simple_rl.agents import QLearnerAgent, FixedPolicyAgent
from simple_rl.tasks import RockPaperScissorsMDP
from simple_rl.run_experiments import play_markov_game

# Setup MDP, Agents.
markov_game = RockPaperScissorsMDP()
ql_agent = QLearnerAgent(actions=markov_game.get_actions())
fixed_action = random.choice(markov_game.get_actions())
fixed_agent = FixedPolicyAgent(policy=lambda s: fixed_action)

# Run experiment and make plot.
play_markov_game([ql_agent, fixed_agent],
                 markov_game,
                 instances=15,
                 episodes=1,
                 steps=40)
Example 15
    def reset(self):
        self.weights = np.zeros(self.num_features * len(self.actions))
        QLearnerAgent.reset(self)
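Note that this reset zeroes the weights even when the agent was built with rand_init=True (Example 1). A sketch of a variant that mirrors the constructor's initialization, assuming the constructor also stored self.rand_init (which Example 1 as shown does not):

    def reset(self):
        # Re-randomize if the agent was configured for random initialization.
        if getattr(self, "rand_init", False):
            self.weights = np.random.random(self.num_features * len(self.actions))
        else:
            self.weights = np.zeros(self.num_features * len(self.actions))
        QLearnerAgent.reset(self)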
Example 16
def _setup_agents(solar_mdp):
    '''
    Args:
        solar_mdp (SolarOOMDP)

    Returns:
        (list): of Agents
    '''
    # Get relevant MDP params.
    actions = solar_mdp.get_actions()
    gamma = solar_mdp.get_gamma()
    panel_step = solar_mdp.get_panel_step()

    # Setup fixed agent.
    static_agent = FixedPolicyAgent(tb.static_policy, name="fixed-panel")
    optimal_agent = FixedPolicyAgent(tb.optimal_policy, name="optimal")

    # Grena single-axis and dual-axis trackers computed from time/location.
    grena_tracker = SolarTracker(tb.grena_tracker,
                                 panel_step=panel_step,
                                 dual_axis=solar_mdp.dual_axis)
    grena_tracker_agent = FixedPolicyAgent(grena_tracker.get_policy(),
                                           name="grena-tracker")

    # Setup RL agents
    alpha, epsilon = 0.1, 0.05
    rand_init = True
    num_features = solar_mdp.get_num_state_feats()
    lin_ucb_agent = LinUCBAgent(solar_mdp.get_bandit_actions(),
                                name="lin-ucb",
                                rand_init=rand_init,
                                alpha=2.0)
    sarsa_agent_g0 = LinearSarsaAgent(actions,
                                      num_features=num_features,
                                      name="sarsa-lin-g0",
                                      rand_init=rand_init,
                                      alpha=alpha,
                                      epsilon=epsilon,
                                      gamma=0,
                                      rbf=False,
                                      anneal=True)
    sarsa_agent = LinearSarsaAgent(actions,
                                   num_features=num_features,
                                   name="sarsa-lin",
                                   rand_init=rand_init,
                                   alpha=alpha,
                                   epsilon=epsilon,
                                   gamma=gamma,
                                   rbf=False,
                                   anneal=True)
    ql_agent = QLearnerAgent(actions,
                             alpha=alpha,
                             epsilon=epsilon,
                             gamma=gamma)
    random_agent = RandomAgent(actions)

    # Regular experiments. Alternative lists (adding grena_tracker_agent,
    # static_agent, and/or optimal_agent) are left commented out.
    agents = [lin_ucb_agent, sarsa_agent, sarsa_agent_g0]
    # agents = [lin_ucb_agent, grena_tracker_agent]
    # agents = [grena_tracker_agent, static_agent]

    return agents
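For context, _setup_agents would presumably be called from a driver that runs the returned agents on the SolarOOMDP, in the style of the other examples. A hedged sketch; the no-argument SolarOOMDP() construction and the experiment parameters are illustrative, not the library's actual API:

def run_solar_experiment():
    solar_mdp = SolarOOMDP()  # illustrative; the real constructor takes domain parameters
    agents = _setup_agents(solar_mdp)
    run_agents_on_mdp(agents, solar_mdp, instances=5, episodes=1, steps=1000)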
Example 17
def main():

    # Setup environment.
    mdp_class, agent_type, samples = parse_args()
    is_goal_terminal = False
    mdp_distr = make_mdp_distr(mdp_class=mdp_class,
                               is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute priors.

    # Stochastic mixture.
    mdp_distr_copy = copy.deepcopy(mdp_distr)
    opt_stoch_policy = ape.compute_optimal_stoch_policy(mdp_distr_copy)

    # Avg MDP
    avg_mdp = ape.compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    # Make agents.

    # Q Learning
    ql_agent = QLearnerAgent(actions)
    shaped_ql_agent_prior = ShapedQAgent(shaping_policy=opt_stoch_policy,
                                         actions=actions,
                                         name="Prior-QLearning")
    shaped_ql_agent_avgmdp = ShapedQAgent(shaping_policy=avg_mdp_vi.policy,
                                          actions=actions,
                                          name="AvgMDP-QLearning")

    # RMax
    rmax_agent = RMaxAgent(actions)
    shaped_rmax_agent_prior = ShapedRMaxAgent(
        shaping_policy=opt_stoch_policy,
        state_space=avg_mdp_vi.get_states(),
        actions=actions,
        name="Prior-RMax")
    shaped_rmax_agent_avgmdp = ShapedRMaxAgent(
        shaping_policy=avg_mdp_vi.policy,
        state_space=avg_mdp_vi.get_states(),
        actions=actions,
        name="AvgMDP-RMax")
    prune_rmax_agent = PruneRMaxAgent(mdp_distr=mdp_distr)

    if agent_type == "rmax":
        agents = [
            rmax_agent, shaped_rmax_agent_prior, shaped_rmax_agent_avgmdp,
            prune_rmax_agent
        ]
    else:
        agents = [ql_agent, shaped_ql_agent_prior, shaped_ql_agent_avgmdp]

    # Run task.
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=200,
                          is_rec_disc_reward=False,
                          verbose=True)
Example 18
import srl_example_setup
from simple_rl.agents import QLearnerAgent, RandomAgent
from simple_rl.tasks import TaxiOOMDP, BlockDudeOOMDP
from simple_rl.run_experiments import run_agents_on_mdp, run_single_agent_on_mdp

# Taxi initial state attributes.
agent = {"x": 1, "y": 1, "has_passenger": 0}
passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
walls = []
mdp = TaxiOOMDP(width=4,
                height=4,
                agent=agent,
                walls=walls,
                passengers=passengers)

ql_agent = QLearnerAgent(actions=mdp.get_actions())
rand_agent = RandomAgent(actions=mdp.get_actions())

viz = False  # Set to True to visualize the Taxi agent instead of plotting.
if viz:
    # Visualize Taxi.
    run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
    mdp.visualize_agent(ql_agent)
else:
    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=10,
                      episodes=100,
                      steps=150,
                      reset_at_terminal=True)