    def __init__(self,
                 actions,
                 rand_init=True,
                 name="Linear-Q",
                 alpha=0.001,
                 gamma=0.99,
                 epsilon=0.2,
                 explore="uniform",
                 feature=None,
                 anneal=True,
                 sarsa=False):
        # name = name + "-rbf" if rbf else name
        QLearningAgent.__init__(self,
                                actions=list(actions),
                                name=name,
                                alpha=alpha,
                                gamma=gamma,
                                epsilon=epsilon,
                                explore=explore,
                                anneal=anneal)

        self.sarsa = sarsa

        self.feature = feature  # function (state, action) -> features (numpy vector)

        self.num_features = self.feature.num_features()
        self.rand_init = rand_init

        # Initialize the weight vector: one block of num_features weights per action.
        if rand_init:
            self.weights = np.random.random(self.num_features *
                                            len(self.actions))
        else:
            self.weights = np.zeros(self.num_features * len(self.actions))

        self.max_weight = 0.0
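
    # Sketch (added, not part of the original class): how a flattened weight vector
    # laid out as one block of num_features weights per action is typically turned
    # into a Q-value. The helper name below is hypothetical.
    def _q_value_sketch(self, state, action):
        feats = self.feature(state, action)  # numpy vector of length num_features
        a_idx = self.actions.index(action)
        block = self.weights[a_idx * self.num_features:(a_idx + 1) * self.num_features]
        return np.dot(block, feats)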
Example n. 2
    def __init__(self,
                 actions,
                 num_features,
                 rand_init=True,
                 name="Linear-Q",
                 alpha=0.2,
                 gamma=0.99,
                 epsilon=0.2,
                 explore="uniform",
                 rbf=False,
                 anneal=True):
        name = name + "-rbf" if rbf else name
        QLearningAgent.__init__(self,
                                actions=list(actions),
                                name=name,
                                alpha=alpha,
                                gamma=gamma,
                                epsilon=epsilon,
                                explore=explore,
                                anneal=anneal)
        self.num_features = num_features
        self.rand_init = rand_init

        # Add a basis feature.
        if rand_init:
            self.weights = np.random.random(self.num_features *
                                            len(self.actions))
        else:
            self.weights = np.zeros(self.num_features * len(self.actions))

        self.rbf = rbf
Example n. 3
def main():

    # Setup MDP.

    actual_args = {
        "width": 10,
        "height": 10,
        "init_loc": (1, 1),
        "goal_locs": [(10, 10)],
        "lava_locs": [(1, 10), (3, 10), (5, 10), (7, 10), (9, 10)],
        "gamma": 0.9,
        "walls": [
            (2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9),
            (4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9),
            (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9),
            (8, 2), (8, 3), (8, 4), (8, 5), (8, 6), (8, 7), (8, 8), (8, 9)
        ],
        "slip_prob": 0.01,
        "lava_cost": 1.0,
        "step_cost": 0.1
    }
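    # Layout note (added): walls fill columns x = 2, 4, 6, 8 for y = 2..9, the lava
    # cells sit at y = 10 in the odd columns, and the goal at (10, 10) is reached by
    # crossing the bottom row and climbing the open right-hand side of the grid.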

    mdp = GridWorldMDP(**actual_args)

    # Initialize a custom Q-function for the Q-learning agent. Seeding the Q-table
    # this way acts like potential-based reward shaping and should help the agent
    # learn more quickly.
    custom_q = defaultdict(lambda: defaultdict(lambda: 0))
    custom_q[GridWorldState(5, 1)]['right'] = 1.0
    custom_q[GridWorldState(2, 1)]['right'] = 1.0

    # Make a normal q-learning agent and another initialized with the custom_q above.
    # Finally, make a random agent to compare against.
    ql_agent = QLearningAgent(actions=mdp.get_actions(),
                              epsilon=0.2,
                              alpha=0.4)
    ql_agent_pot = QLearningAgent(actions=mdp.get_actions(),
                                  epsilon=0.2,
                                  alpha=0.4,
                                  custom_q_init=custom_q,
                                  name="PotQ")
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, ql_agent_pot, rand_agent],
                      mdp,
                      instances=2,
                      episodes=60,
                      steps=200,
                      open_plot=True,
                      verbose=True)
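
# Sketch (added): the custom_q trick above generalizes. Initializing Q(s, a) = phi(s)
# for a potential function phi is known to behave like potential-based reward shaping
# (Wiewiora, 2003). The helper below is hypothetical; it assumes the states and
# actions can be enumerated up front and that defaultdict is imported as above.
def q_init_from_potential(phi, states, actions):
    custom_q = defaultdict(lambda: defaultdict(lambda: 0))
    for s in states:
        for a in actions:
            custom_q[s][a] = phi(s)
    return custom_q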
Example n. 4
    def __init__(self, actions, num_features, rand_init=True, name="Linear-Q",
                 alpha=0.2, gamma=0.99, epsilon=0.2, explore="uniform", anneal=True):
        QLearningAgent.__init__(self, actions=list(actions), name=name, alpha=alpha,
                                gamma=gamma, epsilon=epsilon, explore=explore,
                                anneal=anneal)
        self.num_features = num_features
        self.rand_init = rand_init

        # Add a basis feature.
        if rand_init:
            self.weights = np.random.random(self.num_features * len(self.actions))
        else:
            self.weights = np.zeros(self.num_features * len(self.actions))
Example n. 5
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(5, 5, goal_locs=[(5, 5)], gamma=0.99, step_cost=0.01)
    # mdp = make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.5) 
    rm_agent = RMaxAgent(mdp.get_actions())
    viz = parse_args()

    # Choose viz type (overrides the parsed argument).
    viz = "learning"

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
Example n. 6
def main():
    # Setup MDP, Agents.
    size = 5
    agent = {
        "x": 1,
        "y": 1,
        "dx": 1,
        "dy": 0,
        "dest_x": size,
        "dest_y": size,
        "has_block": 0
    }
    blocks = [{"x": size, "y": 1}]
    lavas = [{
        "x": x,
        "y": y
    } for x, y in map(lambda z: (z + 1, (size + 1) // 2), range(size))]  # integer division so y stays an int
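    # Note (added): for size = 5 this places lava across the middle row, at
    # (1, 3), (2, 3), (3, 3), (4, 3), and (5, 3), forming the "trench".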

    mdp = TrenchOOMDP(size, size, agent, blocks, lavas)
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=30,
                      episodes=250,
                      steps=250)
Example n. 7
def main(open_plot=True):

    # Setup MDP.
    mdp = GridWorldMDP(width=8,
                       height=3,
                       init_loc=(1, 1),
                       goal_locs=[(8, 3)],
                       lava_locs=[(4, 2)],
                       gamma=0.95,
                       walls=[(2, 2)],
                       slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=20,
                      episodes=300,
                      steps=20,
                      open_plot=open_plot,
                      track_success=True,
                      success_reward=1)
def main(open_plot=True):
    # Taxi initial state attributes.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
    walls = []
    mdp = TaxiOOMDP(width=4,
                    height=4,
                    agent=agent,
                    walls=walls,
                    passengers=passengers)

    # Agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    viz = False
    if viz:
        # Visualize Taxi.
        run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
        mdp.visualize_agent(ql_agent)
    else:
        # Run experiment and make plot.
        run_agents_on_mdp([ql_agent, rand_agent],
                          mdp,
                          instances=10,
                          episodes=1,
                          steps=500,
                          reset_at_terminal=True,
                          open_plot=open_plot)
Example n. 9
    def __init__(self, balancer_node):
        super().__init__(balancer_node)

        self.data = {}
        self.agent = QLearningAgent(
            ['NONE', 'UP', 'DOWN'],
            epsilon=0.3,
            anneal=True,
            gamma=0.3,
            alpha=0.2,
            # explore='softmax'
        )
        self._set_q()

        self.max_node_num = len(self.balancer_node.get_nodes())
        self._impossible = False
def main():

    # Setup MDP, Agents.
    mdp = FourRoomMDP(11, 11, goal_locs=[(11, 11)], gamma=0.9, step_cost=0.0)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.4)
    viz = parse_args()

    # Choose viz type.
    viz = "learning"

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
    elif viz == "interactive":
        mdp.visualize_interaction()
Example n. 11
def main():
    
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)], lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.1)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.2) 
    viz = parse_args()

    # Choose viz type.
    viz = "value"

    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem and then visualize the agent's resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show agent's interaction with the environment.
        mdp.visualize_learning(ql_agent, delay=0.005, num_ep=500, num_steps=200)
    elif viz == "interactive":
        # Press <1>, <2>, <3>, and so on to execute action 1, action 2, etc.
        mdp.visualize_interaction()
Example n. 12
def main():
    # Command line args.
    task, rom = parse_args()

    # Setup the MDP.
    mdp = choose_mdp(task, rom)
    actions = mdp.get_actions()
    gamma = mdp.get_gamma()

    # Setup agents.
    from simple_rl.agents import RandomAgent, QLearningAgent

    random_agent = RandomAgent(actions)
    qlearner_agent = QLearningAgent(actions, gamma=gamma, explore="uniform")
    agents = [qlearner_agent, random_agent]

    # Run Agents.
    if isinstance(mdp, MarkovGameMDP):
        # Markov Game.
        agents = {
            qlearner_agent.name: qlearner_agent,
            random_agent.name: random_agent
        }
        play_markov_game(agents, mdp, instances=100, episodes=1, steps=500)
    else:
        # Regular experiment.
        run_agents_on_mdp(agents, mdp, instances=50, episodes=1, steps=2000)
Example n. 13
def main(open_plot=True):
    # Setup MDP.
    mdp = GridWorldMDP(width=4,
                       height=3,
                       init_loc=(1, 1),
                       goal_locs=[(4, 3)],
                       lava_locs=[(4, 2)],
                       gamma=0.95,
                       walls=[(2, 2)],
                       slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    tabular_agent = CherryQAgent(mdp,
                                 model=lambda *x: ActionValueFunction(*x, init=1.0),
                                 name='Tabular',
                                 lr=0.7)
    linear_agent = CherryQAgent(mdp,
                                model=lambda *x: nn.Linear(*x),
                                name='Linear',
                                lr=0.1)
    mlp_agent = CherryQAgent(mdp,
                             model=lambda *x: MLP(*x),
                             name='MLP',
                             lr=0.07)

    # Run experiment and make plot.
    agents = [rand_agent, ql_agent, tabular_agent, linear_agent, mlp_agent]
    run_agents_on_mdp(agents,
                      mdp,
                      instances=10,
                      episodes=50,
                      steps=50,
                      open_plot=open_plot)
Example n. 14
def plot_parameters(pars, md):
    cur_cell_rewards = [
        pars["white"][0], pars["yellow"][0], pars["red"][0], pars["green"][0],
        pars["purple"][0], -500
    ]
    # cur_cell_rewards = pars
    print(cur_cell_rewards)
    md.mdp = NavigationWorldMDP(width=md.side,
                                height=md.side,
                                nav_cell_types=md.nav_cell_types,
                                nav_cell_rewards=cur_cell_rewards,
                                nav_cell_p_or_locs=md.nav_cell_p_or_locs,
                                goal_cell_types=md.goal_cell_types,
                                goal_cell_rewards=md.goal_rew,
                                goal_cell_locs=md.goal_cell_loc,
                                init_loc=md.start_loc,
                                rand_init=False,
                                gamma=0.95,
                                slip_prob=0,
                                step_cost=0)

    md.agent = QLearningAgent(md.mdp.get_actions(), epsilon=md.eps)
    run_single_agent_on_mdp(md.agent,
                            md.mdp,
                            episodes=md.episodes,
                            steps=md.steps)
    md.agent.epsilon = 0
    md.mdp.slip_prob = 0
    _, steps_taken, reward, states = md.run_experiment(md.agent, md.mdp)
    # print('Best result observation:')
    print([md.count_turns(states), steps_taken, reward])
    # print('Observed data result:')
    # print(md.observed_data)
    md.mdp.visualize_grid(trajectories=[states], plot=False)
    return [md.count_turns(states), steps_taken, reward]
Example n. 15
    def func(self, *params, n_obs=100, batch_size=1, random_state=None):
        """Generate a sequence of samples from the Open AI env.

        Parameters
        ----------
        params : arrays of navigation cell-reward parameters (one array per parameter)
        random_state : RandomState, optional

        """

        # fix locations instead of probabilities! fixed map multiple init_locs!
        rewards = []
        params = np.array(params).reshape(self.param_dim, -1)
        batches = params.shape[1]
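        # Note (added): each positional argument in `params` is one reward parameter;
        # after the reshape above, column i of `params` holds the full reward vector
        # for batch element i.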

        for i in range(batches):
            cur_cell_rewards = [x for x in params[:, i]]
            # reward for black cells is fixed
            cur_cell_rewards.append(-500)

            if self.prev_cell_rewards != cur_cell_rewards:
                self.mdp = NavigationWorldMDP(
                    width=self.side,
                    height=self.side,
                    nav_cell_types=self.nav_cell_types,
                    nav_cell_rewards=cur_cell_rewards,
                    nav_cell_p_or_locs=self.nav_cell_p_or_locs,
                    goal_cell_types=self.goal_cell_types,
                    goal_cell_rewards=self.goal_rew,
                    goal_cell_locs=self.goal_cell_loc,
                    init_loc=self.start_loc,
                    rand_init=False,
                    slip_prob=0)

                self.agent = QLearningAgent(self.mdp.get_actions(),
                                            epsilon=self.eps)
                run_single_agent_on_mdp(self.agent,
                                        self.mdp,
                                        episodes=self.episodes,
                                        steps=self.steps)

            self.agent.epsilon = 0
            self.mdp.slip_prob = self.slip

            # print('Parameters:')
            # print(cur_cell_rewards)
            for j in range(1):
                finished, steps_taken, reward, states = self.run_experiment(
                    self.agent, self.mdp)
                turns = self.count_turns(states)
                ep_reward = [turns, steps_taken, reward]
                # print('Corresponding reward:')
                # print([turns, steps_taken, reward])
                # self.mdp.visualize_grid(trajectories=[states], traj_colors_auto=False)

            rewards.append(ep_reward)

            self.prev_cell_rewards = cur_cell_rewards
        return rewards
Example n. 16
def main(open_plot=True):
    # Setup MDP, Agents.
    markov_game = RockPaperScissorsMDP()
    ql_agent = QLearningAgent(actions=markov_game.get_actions())
    fixed_action = random.choice(markov_game.get_actions())
    fixed_agent = FixedPolicyAgent(policy=lambda s: fixed_action)

    # Run experiment and make plot.
    play_markov_game([ql_agent, fixed_agent], markov_game, instances=15, episodes=1, steps=40, open_plot=open_plot) 
def main(open_plot=True):
    # Make MDP distribution, agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")

    ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Run experiment and make plot.
    run_agents_lifelong([ql_agent, rand_agent], mdp_distr, samples=10, episodes=50, steps=100, reset_at_terminal=True, open_plot=open_plot)
Example n. 18
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    abstr_identity_agent = AbstractionWrapper(QLearningAgent, agent_params={"epsilon":0.9}, actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent, abstr_identity_agent], mdp, instances=5, episodes=100, steps=150, open_plot=open_plot)
Example n. 19
def main(open_plot=True):
    state_colors = defaultdict(lambda: defaultdict(lambda: "white"))
    state_colors[3][2] = "red"

    # Setup MDP, Agents.
    mdp = ColoredGridWorldMDP(state_colors)
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=15, episodes=500, steps=40, open_plot=open_plot)
Example n. 20
def test_utility(args, mdp):
    # The number of options to the performance
    # TODO: Compare the utility of point options vs. subgoal options?
    now_ts = str(datetime.now().timestamp())
    origMatrix, intToS = GetAdjacencyMatrix(mdp)
    known_region = list(intToS.values())  # Known region is a set of MDPStates.

    n_ops_list = [2, 4, 8, 16, 32]

    agents = []
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    agents.append(ql_agent)

    method = 'fiedler'

    for n_ops in n_ops_list:
        _, foptions, _, fvectors = GetOption(mdp,
                                             n_ops,
                                             matrix=origMatrix,
                                             intToS=intToS,
                                             option_type=args.optiontype,
                                             method=method)
        print('#options=', n_ops)
        print(foptions)

        if args.optiontype == 'subgoal':
            known_region = list(
                intToS.values())  # Known region is a set of MDPStates.
            eigenoption_agent = build_subgoal_option_agent(
                mdp,
                foptions,
                known_region,
                vectors=fvectors,
                name='-' + method + '-' + args.optiontype + '-' + str(n_ops))
        else:
            eigenoption_agent = build_point_option_agent(
                mdp,
                foptions,
                agent=QLearningAgent,
                policy='vi',
                name='-' + method + '-' + args.optiontype + '-' + str(n_ops))

        agents.append(eigenoption_agent)

    run_agents_on_mdp(agents,
                      mdp,
                      instances=args.ninstances,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      open_plot=True,
                      track_disc_reward=True,
                      cumulative_plot=True,
                      dir_for_plot="results/")
def main(open_plot=True):

    # Setup MDP.

    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc,
                       args.l_loc, args.gamma, args.Walls, args.slip)

    if args.visualize:
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values(
            (lambda state: value_iter.policy(state)),
            (lambda state: value_iter.value_func[state]))

    else:
        custom_q = parse_custom_q_table(args.custom_q, args.default_q)

        agents = []
        for agent in args.agents:
            if agent == 'q_learning':
                agents.append(QLearningAgent(actions=mdp.get_actions()))
            elif agent == 'potential_q':
                agents.append(
                    QLearningAgent(actions=mdp.get_actions(),
                                   custom_q_init=custom_q,
                                   name="Potential_Q"))
            elif agent == 'random':
                agents.append(RandomAgent(actions=mdp.get_actions()))
            elif agent == 'rmax':
                agents.append(RMaxAgent(mdp.get_actions()))

        # Run experiment and make plot.
        run_agents_on_mdp(agents,
                          mdp,
                          instances=1,
                          episodes=100,
                          steps=100,
                          open_plot=open_plot,
                          verbose=True)
Example n. 22
def main():
    import OptimalBeliefAgentClass

    # Setup multitask setting.
    # R ~ D : Puddle, Rock Sample
    # G ~ D : octo, four_room
    # T ~ D : grid

    mdp_class, is_goal_terminal, samples = parse_args()

    mdp_distr = make_mdp_distr(mdp_class=mdp_class,
                               is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print "Making and solving avg MDP...",
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()
    print "done."  #, iters, value
    sys.stdout.flush()

    # Agents.
    print "Making agents...",
    sys.stdout.flush()
    mdp_distr_copy = copy.deepcopy(mdp_distr)
    opt_stoch_policy = compute_optimal_stoch_policy(mdp_distr_copy)
    opt_stoch_policy_agent = FixedPolicyAgent(opt_stoch_policy,
                                              name="$\pi_{prior}$")
    opt_belief_agent = OptimalBeliefAgentClass.OptimalBeliefAgent(
        mdp_distr, actions)
    vi_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="$\pi_{avg}$")
    rand_agent = RandomAgent(actions, name="$\pi^u$")
    ql_agent = QLearningAgent(actions)
    print "done."

    agents = [vi_agent, opt_stoch_policy_agent, rand_agent, opt_belief_agent]

    # Run task.
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=100,
                          reset_at_terminal=False,
                          track_disc_reward=False,
                          cumulative_plot=True)
Example n. 23
def main():
    # create mdp using own definition
    mdp = tfeMDP()

    # Three different agents to compare how each do against each other
    rand_agent = RandomAgent(actions=mdp.get_actions())
    rmax_agent = RMaxAgent(actions=mdp.get_actions())
    agent = QLearningAgent(actions=mdp.get_actions())

    # Run everything and generate the plots and statistics showing how each
    # agent performed.
    run_agents_on_mdp([agent, rmax_agent, rand_agent], mdp,
                      instances=200, episodes=100, steps=1000)
Example n. 24
def get_combo_experiment_agents(environment):
    '''
    Args:
        environment (simple_rl.MDPDistribution)

    Returns:
        (list)
    '''
    actions = environment.get_actions()
    gamma = environment.get_gamma()

    sa, aa = get_directed_option_sa_pair(
        environment,
        indic_func=ind_funcs._q_disc_approx_indicator,
        max_options=100)
    sa_qds_test = get_sa(environment,
                         indic_func=ind_funcs._q_disc_approx_indicator,
                         epsilon=0.05)
    sa_qs_test = get_sa(environment,
                        indic_func=ind_funcs._q_eps_approx_indicator,
                        epsilon=0.1)

    # QLearner.
    ql_agent = QLearningAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    rmax_agent = RMaxAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)

    # Combos.
    ql_sa_qds_agent = AbstractionWrapper(QLearningAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=sa_qds_test,
                                         name_ext="$\phi_{Q_d^*}$")
    ql_sa_qs_agent = AbstractionWrapper(QLearningAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=sa_qs_test,
                                        name_ext="$\phi_{Q_\epsilon^*}$")

    # sa_agent = AbstractionWrapper(QLearningAgent, actions, str(environment), state_abstr=sa, name_ext="sa")
    aa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"actions": actions},
                                  action_abstr=aa,
                                  name_ext="aa")
    sa_aa_agent = AbstractionWrapper(QLearningAgent,
                                     agent_params={"actions": actions},
                                     state_abstr=sa,
                                     action_abstr=aa,
                                     name_ext="$\phi_{Q_d^*}+aa$")

    agents = [ql_agent, ql_sa_qds_agent, ql_sa_qs_agent, aa_agent, sa_aa_agent]

    return agents
def main():

    # Make MDP Distribution.
    mdp_distr = make_mdp_distr(mdp_class="four_room",
                               grid_dim=11,
                               slip_prob=0.05,
                               gamma=0.99)

    # Make SA.
    multitask_sa_beta_1 = make_multitask_sa_info_sa(mdp_distr,
                                                    beta=1.0,
                                                    is_deterministic_ib=True)
    multitask_sa_beta_10 = make_multitask_sa_info_sa(mdp_distr,
                                                     beta=10.0,
                                                     is_deterministic_ib=True)
    multitask_sa_beta_100 = make_multitask_sa_info_sa(mdp_distr,
                                                      beta=100.0,
                                                      is_deterministic_ib=True)
    multitask_sa_beta_1000 = make_multitask_sa_info_sa(
        mdp_distr, beta=1000.0, is_deterministic_ib=True)

    # Make agent.
    ql_agent = QLearningAgent(mdp_distr.get_actions())
    abstr_ql_b1 = AbstractionWrapper(
        QLearningAgent,
        state_abstr=multitask_sa_beta_1,
        agent_params={"actions": mdp_distr.get_actions()},
        name_ext="-$\\phi_{\\beta = 1}$")
    abstr_ql_b10 = AbstractionWrapper(
        QLearningAgent,
        state_abstr=multitask_sa_beta_10,
        agent_params={"actions": mdp_distr.get_actions()},
        name_ext="-$\\phi_{\\beta = 10}$")
    abstr_ql_b100 = AbstractionWrapper(
        QLearningAgent,
        state_abstr=multitask_sa_beta_100,
        agent_params={"actions": mdp_distr.get_actions()},
        name_ext="-$\\phi_{\\beta = 100}$")
    abstr_ql_b1000 = AbstractionWrapper(
        QLearningAgent,
        state_abstr=multitask_sa_beta_1000,
        agent_params={"actions": mdp_distr.get_actions()},
        name_ext="-$\\phi_{\\beta = 1000}$")
    run_agents_lifelong(
        [abstr_ql_b1, abstr_ql_b10, abstr_ql_b100, abstr_ql_b1000, ql_agent],
        mdp_distr,
        steps=200,
        samples=50,
        episodes=200)
Example n. 26
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = BanditMDP()

    lin_agent = LinUCBAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, lin_agent, rand_agent],
                      mdp,
                      instances=10,
                      episodes=1,
                      steps=500,
                      open_plot=open_plot)
Example n. 27
def learn_w_abstr(mdp, demo_policy, beta_list=[20], is_deterministic_ib=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy (lambda : simple_rl.State --> str)
        beta_list (list)
        is_deterministic_ib (bool)

    Summary:
        Computes a state abstraction for the given beta and compares Q-Learning with and without the abstraction.
    '''
    # Run info_sa.
    dict_of_phi_pmfs = {}
    for beta in beta_list:
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy, iters=300, beta=beta, convergence_threshold=0.0001, is_deterministic_ib=is_deterministic_ib)

        # Translate abstractions.
        prob_s_phi = ProbStateAbstraction(phi_pmf)
        crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
        #ground state to abstract state
        dict_of_phi_pmfs[beta] = crisp_s_phi
        print("crisp_s_phi:" )
        for single_state in crisp_s_phi.get_abs_states():
            print(str(type(single_state)))
            print("ground_for_above:" + str(crisp_s_phi.get_ground_states_in_abs_state(single_state)))
        print("ground states:")
        for ground_states in crisp_s_phi.get_ground_states():
            print(str(type(ground_states)))
        print(len(crisp_s_phi.get_ground_states()))
        print(len(crisp_s_phi.get_abs_states()))

    # Make agents.
    demo_agent = FixedPolicyAgent(demo_policy, name="$\\pi_d$")
    ql_agent = QLearningAgent(mdp.get_actions())
    agent_dict = {}
    for beta in beta_list:
        beta_phi = dict_of_phi_pmfs[beta]
        ql_abstr_agent = AbstractionWrapper(QLearningAgent, state_abstr=dict_of_phi_pmfs[beta], agent_params={"actions":mdp.get_actions(), "anneal":True})
        agent_dict[beta] = ql_abstr_agent

    # Learn.
    run_agents_on_mdp(agent_dict.values(), mdp, episodes=100, steps=10, instances=5)

    # Print num abstract states.
    for beta in dict_of_phi_pmfs.keys():
        print "beta |S_phi|:", beta, dict_of_phi_pmfs[beta].get_num_ground_states()
    print
Example n. 28
def _setup_agents(solar_mdp):
    '''
    Args:
        solar_mdp (SolarOOMDP)

    Returns:
        (list): of Agents
    '''
    # Get relevant MDP params.
    actions = solar_mdp.get_actions()
    gamma = solar_mdp.get_gamma()
    panel_step = solar_mdp.get_panel_step()

    # Setup fixed agent.
    static_agent = FixedPolicyAgent(tb.static_policy, name="fixed-panel")
    optimal_agent = FixedPolicyAgent(tb.optimal_policy, name="optimal")

    # Grena single axis and double axis trackers from time/loc.
    grena_tracker = SolarTracker(tb.grena_tracker,
                                 panel_step=panel_step,
                                 dual_axis=solar_mdp.dual_axis,
                                 actions=solar_mdp.get_bandit_actions())
    grena_tracker_agent = FixedPolicyAgent(grena_tracker.get_policy(),
                                           name="grena-tracker")

    # Setup RL agents
    alpha, epsilon = 0.1, 0.05
    rand_init = True
    num_features = solar_mdp.get_num_state_feats()
    lin_ucb_agent = LinUCBAgent(solar_mdp.get_bandit_actions(),
                                context_size=num_features,
                                name="lin-ucb",
                                rand_init=rand_init,
                                alpha=2.0)
    # sarsa_agent_g0 = LinearSarsaAgent(actions, num_features=num_features, name="sarsa-lin-g0", rand_init=rand_init, alpha=alpha, epsilon=epsilon, gamma=0, rbf=False, anneal=True)
    # sarsa_agent = LinearSarsaAgent(actions, num_features=num_features, name="sarsa-lin", rand_init=rand_init, alpha=alpha, epsilon=epsilon, gamma=gamma, rbf=False, anneal=True)
    ql_agent = QLearningAgent(actions,
                              alpha=alpha,
                              epsilon=epsilon,
                              gamma=gamma)
    random_agent = RandomAgent(actions)

    # Regular experiments.
    # agents = [lin_ucb_agent, sarsa_agent, sarsa_agent_g0, grena_tracker_agent, static_agent]
    agents = [grena_tracker_agent, static_agent]

    return agents
Example n. 29
def main():

    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc,
                       args.l_loc, args.gamma, args.Walls, args.slip)

    ql_agent = QLearningAgent(mdp.get_actions(),
                              epsilon=args.epsilon,
                              alpha=args.alpha,
                              explore=args.explore,
                              anneal=args.anneal)
    viz = args.mode

    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values(
            (lambda state: value_iter.policy(state)),
            (lambda state: value_iter.value_func[state]))
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem and then visualize the agent's resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        rand_agent = RandomAgent(actions=mdp.get_actions())
        run_agents_on_mdp([rand_agent, ql_agent],
                          mdp,
                          open_plot=True,
                          episodes=60,
                          steps=200,
                          instances=5,
                          success_reward=1)
        # mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show agent's interaction with the environment.
        mdp.visualize_learning(ql_agent,
                               delay=0.005,
                               num_ep=500,
                               num_steps=200)
Example n. 30
def main():

    # ========================
    # === Make Environment ===
    # ========================
    mdp_class = "hrooms"
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = environment.get_actions()

    # ==========================
    # === Make SA, AA Stacks ===
    # ==========================
    # sa_stack, aa_stack = aa_stack_h.make_random_sa_diropt_aa_stack(environment, max_num_levels=3)
    sa_stack, aa_stack = hierarchy_helpers.make_hierarchy(environment, num_levels=3)

    # Debug.
    print("\n" + ("=" * 30))
    print("== Done making abstraction. ==")
    print("=" * 30 + "\n")
    sa_stack.print_state_space_sizes()
    print("Num Action Abstractions:", len(aa_stack.get_aa_list()))

    # ===================
    # === Make Agents ===
    # ===================
    baseline_agent = QLearningAgent(actions)
    rmax_agent = RMaxAgent(actions)
    rand_agent = RandomAgent(actions)
    l0_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=0, name_ext="-$l_0$")
    l1_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$l_1$")
    # l2_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=2, name_ext="-$l_2$")
    dynamic_hierarch_agent = DynamicHierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")
    # dynamic_rmax_hierarch_agent = DynamicHierarchyAgent(RMaxAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")

    print("\n" + ("=" * 26))
    print("== Running experiments. ==")
    print("=" * 26 + "\n")

    # ======================
    # === Run Experiment ===
    # ======================
    agents = [l1_hierarch_agent, dynamic_hierarch_agent, baseline_agent]
    run_agents_multi_task(agents, environment, task_samples=10, steps=1500, episodes=1, reset_at_terminal=True)
Example n. 31
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=4,
                       height=3,
                       init_loc=(1, 1),
                       goal_locs=[(4, 3)],
                       gamma=0.95,
                       walls=[(2, 2)])

    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=10,
                      episodes=1,
                      steps=20,
                      open_plot=open_plot)
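
# Usage sketch (added): these example scripts are typically run directly as
# standalone programs, e.g. with an entry point like the following.
if __name__ == "__main__":
    main(open_plot=True)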
Example n. 32
    def reset(self):
        self.weights = np.zeros(self.num_features * len(self.actions))
        QLearningAgent.reset(self)
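
    # Sketch (added, not part of the original agent): the kind of TD update these
    # weights are typically trained with in a linear Q-learner,
    #     w <- w + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)) * phi(s, a),
    # where phi(s, a) is a full-length feature vector that is zero outside action
    # a's block of the flattened weight vector. Helper names are hypothetical.
    def _td_update_sketch(self, phi_sa, q_sa, max_q_next, reward):
        td_error = reward + self.gamma * max_q_next - q_sa
        self.weights = self.weights + self.alpha * td_error * phi_sa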