Example #1
def main():
    height = 2  # vertical space
    task = DroneTask("red", "None")
    room1 = DroneRoom("room1", [(x, y, z) for x in range(0, 4)
                                for y in range(0, 1)
                                for z in range(height)], "red")
    room2 = DroneRoom("room2", [(x, y, z) for x in range(0, 2)
                                for y in range(2, 3) for z in range(height)],
                      color="green")
    room3 = DroneRoom("room3", [(x, y, z) for x in range(3, 4)
                                for y in range(2, 3) for z in range(height)],
                      color="blue")
    block1 = DroneBlock("block1", 0, 2, 0, color="red")
    block2 = DroneBlock("block2", 2, 0, -1, color="green")
    block3 = DroneBlock("block3", 3, 2, 0, color="blue")
    rooms = [room1, room2, room3]
    blocks = [block1, block2, block3]
    doors = [DroneDoor(1, 1, height), DroneDoor(3, 1, height)]
    mdp = DroneMDP((0, 0, 0), task, rooms=rooms, blocks=blocks, doors=doors)

    # print("Start Q learning")
    # ql_agent = QLearningAgent(actions=mdp.get_actions())
    # # run_agents_on_mdp([ql_agent], mdp, instances=2, episodes=2500, steps=100, reset_at_terminal=True, verbose=True)
    # run_single_agent_on_mdp(ql_agent, mdp, episodes=2000, steps=200)
    print("Start Value Iteration")
    vi = ValueIteration(mdp)
    vi.run_vi()
    action_seq, state_seq = vi.plan(mdp.init_state)
    policy = defaultdict()
    for i in range(len(action_seq)):
        policy[state_seq[i]] = action_seq[i]
    print("Start AirSim")
    # mdp.visualize_agent(ql_agent)
    mdp.visualize_policy(policy)
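
A note on the policy construction above: vi.plan() returns parallel state and action sequences, so the loop that fills the policy dictionary can be written more compactly. A minimal sketch in plain Python, assuming action_seq and state_seq come from vi.plan(mdp.init_state) as above:

    # Pair each visited state with its planned action; equivalent to the loop above.
    policy = dict(zip(state_seq, action_seq))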
Example #2
def run_no_speech(task_block, task_room, photo_pos, drone_pos, pub,
                  drone_path):
    """
    Assume the block is on the floor of each cell
    Get initial pos of drone from caller
    """
    height = 2  # vertical space
    task = DroneTask(task_block, task_room)
    room1 = DroneRoom("room1", [(x, y, z) for x in range(4) for y in range(1)
                                for z in range(height)], "red")
    room2 = DroneRoom("room2", [(x, y, z) for x in range(0, 2)
                                for y in range(2, 4) for z in range(height)],
                      color="green")
    room3 = DroneRoom("room3", [(x, y, z) for x in range(3, 4)
                                for y in range(2, 4) for z in range(height)],
                      color="blue")
    block1 = DroneBlock("block1",
                        photo_pos[0],
                        photo_pos[1],
                        photo_pos[2] - 1,
                        color="photo")
    rooms = [room1, room2, room3]
    blocks = [block1]
    doors = [DroneDoor(1, 1, height), DroneDoor(3, 1, height)]
    mdp = DroneMDP(drone_pos, task, rooms=rooms, blocks=blocks, doors=doors)

    print("Start Value Iteration")
    vi = ValueIteration(mdp)
    vi.run_vi()
    action_seq, state_seq = vi.plan(mdp.init_state)
    policy = defaultdict()
    for i in range(len(action_seq)):
        policy[state_seq[i]] = action_seq[i]
    print("Start Flying")
    mdp.send_path(policy, pub, drone_path)
Example #3
def plan_with_vi(gamma=0.99):
    '''
    Args:
        gamma (float): discount factor

    Running value iteration on the problem to test the correctness of the policy returned by BSS
    '''
    mdp = GridWorldMDP(gamma=gamma, goal_locs=[(4, 3)], slip_prob=0.0)
    value_iter = ValueIteration(mdp, sample_rate=5)
    value_iter.run_vi()

    action_seq, state_seq = value_iter.plan(mdp.get_init_state())

    print "[ValueIteration] Plan for {}".format(mdp)
    for i in range(len(action_seq)):
        print 'pi({}) --> {}'.format(state_seq[i], action_seq[i])
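
For reference, a minimal self-contained sketch of the same pattern in Python 3, assuming the simple_rl package layout (adjust the imports to your installation):

    # Sketch: solve a small grid world with value iteration and print the resulting plan.
    from simple_rl.tasks import GridWorldMDP
    from simple_rl.planning import ValueIteration

    mdp = GridWorldMDP(width=5, height=3, init_loc=(1, 1), goal_locs=[(4, 3)],
                       gamma=0.99, slip_prob=0.0)
    vi = ValueIteration(mdp, sample_rate=5)
    vi.run_vi()
    action_seq, state_seq = vi.plan(mdp.get_init_state())
    for s, a in zip(state_seq, action_seq):
        print("pi({}) --> {}".format(s, a))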
Example #4
def main():

    # ========================
    # === Make Environment ===
    # ========================
    mdp_class = "four_room"
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=10)
    actions = environment.get_actions()

    # ==========================
    # === Make SA, AA Stacks ===
    # ==========================
    # sa_stack, aa_stack = aa_stack_h.make_random_sa_diropt_aa_stack(environment, max_num_levels=3)
    sa_stack, aa_stack = hierarchy_helpers.make_hierarchy(environment,
                                                          num_levels=3)

    mdp = environment.sample()
    HVI = HierarchicalValueIteration(mdp, sa_stack, aa_stack)
    VI = ValueIteration(mdp)

    h_iters, h_val = HVI.run_vi()
    iters, val = VI.run_vi()

    print "H:", h_iters, h_val
    print "V:", iters, val
Example #5
def compute_sub_opt_func_for_mdp_distr(mdp_distr):
    '''
    Args:
        mdp_distr (dict)

    Returns:
        (list): Contains the suboptimality function for each MDP in mdp_distr.
            subopt: V^*(s) - Q^*(s,a)
    '''
    actions = mdp_distr.get_actions()
    sub_opt_funcs = []

    i = 0
    for mdp in mdp_distr.get_mdps():
        print "\t mdp", i + 1, "of", mdp_distr.get_num_mdps()
        vi = ValueIteration(mdp, delta=0.001, max_iterations=1000)
        iters, value = vi.run_vi()

        new_sub_opt_func = defaultdict(float)
        for s in vi.get_states():
            max_q = float("-inf")
            for a in actions:
                next_q = vi.get_q_value(s, a)
                if next_q > max_q:
                    max_q = next_q

            for a in actions:
                new_sub_opt_func[(s, a)] = max_q - vi.get_q_value(s, a)

        sub_opt_funcs.append(new_sub_opt_func)
        i += 1

    return sub_opt_funcs
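
Hedged usage sketch (mdp_distr, some_state, and some_action are illustrative placeholders): each returned dictionary maps (state, action) pairs to the suboptimality gap, which is zero exactly when the action is optimal in that MDP.

    sub_opt_funcs = compute_sub_opt_func_for_mdp_distr(mdp_distr)
    gap = sub_opt_funcs[0][(some_state, some_action)]  # V*(s) - Q*(s, a) in the first MDP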
Example #6
    def get_policy(self, mdp, verbose=False):
        '''
        Args:
            mdp (MDP): MDP (same level as the current Policy Generator)
        Returns:
            policy (defaultdict): optimal policy in mdp
        '''
        vi = ValueIteration(mdp, sample_rate=1)
        vi.run_vi()

        policy = defaultdict()
        action_seq, state_seq = vi.plan(mdp.init_state)

        if verbose: print('Plan for {}:'.format(mdp))
        for i in range(len(action_seq)):
            if verbose:
                print("\tpi[{}] -> {}".format(state_seq[i], action_seq[i]))
            policy[state_seq[i]] = action_seq[i]
        return policy
Example #7
def compute_avg_mdp(mdp_distr, sample_rate=5):
    '''
    Args:
        mdp_distr (defaultdict)

    Returns:
        (MDP)
    '''

    # Get normal components.
    init_state = mdp_distr.get_init_state()
    actions = mdp_distr.get_actions()
    gamma = mdp_distr.get_gamma()
    T = mdp_distr.get_all_mdps()[0].get_transition_func()

    # Compute avg reward.
    avg_rew = defaultdict(lambda: defaultdict(float))
    avg_trans_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(
        float)))  # Stores T_i(s,a,s') * Pr(M_i)
    for mdp in mdp_distr.get_mdps():
        prob_of_mdp = mdp_distr.get_prob_of_mdp(mdp)

        # Get a vi instance to compute state space.
        vi = ValueIteration(mdp,
                            delta=0.0001,
                            max_iterations=2000,
                            sample_rate=sample_rate)
        iters, value = vi.run_vi()
        states = vi.get_states()

        for s in states:
            for a in actions:
                r = mdp.reward_func(s, a)

                avg_rew[s][a] += prob_of_mdp * r

                for repeat in range(sample_rate):
                    s_prime = mdp.transition_func(s, a)
                    avg_trans_counts[s][a][s_prime] += prob_of_mdp

    avg_trans_probs = defaultdict(
        lambda: defaultdict(lambda: defaultdict(float)))
    for s in avg_trans_counts.keys():
        for a in actions:
            for s_prime in avg_trans_counts[s][a].keys():
                avg_trans_probs[s][a][s_prime] = avg_trans_counts[s][a][
                    s_prime] / sum(avg_trans_counts[s][a].values())

    def avg_rew_func(s, a):
        return avg_rew[s][a]

    avg_trans_func = T
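    # Note: avg_trans_probs (computed above) is not used here; T is the transition
    # function of the first MDP in the distribution, so the averaged MDP reuses it.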
    avg_mdp = MDP(actions, avg_trans_func, avg_rew_func, init_state, gamma)

    return avg_mdp
Example #8
def get_distance(mdp, epsilon=0.05):

    vi = ValueIteration(mdp)
    vi.run_vi()
    vstar = vi.value_func  # dictionary of state -> float

    states = vi.get_states()  # list of state

    distance = defaultdict(lambda: defaultdict(float))

    v_df = ValueIterationDist(mdp, vstar)
    v_df.run_vi()
    d_to_s = v_df.distance
    for t in states:
        for s in states:
            distance[t][s] = max(d_to_s[t] - 1, 0)

    for s in states:  # s: state
        vis = ValueIterationDist(mdp, vstar)
        vis.add_fixed_val(s, vstar[s])
        vis.run_vi()
        d_to_s = vis.distance
        for t in states:
            distance[t][s] = min(d_to_s[t], distance[t][s])

    sToInd = OrderedDict()
    indToS = OrderedDict()
    for i, s in enumerate(states):
        sToInd[s] = i
        indToS[i] = s

    d = np.zeros((len(states), len(states)), dtype=int)
    # print "type(d)=", type(d)
    # print "d.shape=", d.shape
    for s in states:
        for t in states:
            # print 's, t=', index[s], index[t]
            d[sToInd[s]][sToInd[t]] = distance[s][t]

    return sToInd, indToS, d
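
Hedged usage sketch (s1 and s2 are illustrative states): the returned index maps translate states into row/column indices of the distance matrix.

    sToInd, indToS, d = get_distance(mdp)
    print(d[sToInd[s1]][sToInd[s2]])  # distance entry from s1 to s2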
Example #9
def main():
    import OptimalBeliefAgentClass

    # Setup multitask setting.
    # R ~ D : Puddle, Rock Sample
    # G ~ D : octo, four_room
    # T ~ D : grid

    mdp_class, is_goal_terminal, samples = parse_args()

    mdp_distr = make_mdp_distr(mdp_class=mdp_class,
                               is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print "Making and solving avg MDP...",
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()
    print "done."  #, iters, value
    sys.stdout.flush()

    # Agents.
    print "Making agents...",
    sys.stdout.flush()
    mdp_distr_copy = copy.deepcopy(mdp_distr)
    opt_stoch_policy = compute_optimal_stoch_policy(mdp_distr_copy)
    opt_stoch_policy_agent = FixedPolicyAgent(opt_stoch_policy,
                                              name="$\pi_{prior}$")
    opt_belief_agent = OptimalBeliefAgentClass.OptimalBeliefAgent(
        mdp_distr, actions)
    vi_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="$\pi_{avg}$")
    rand_agent = RandomAgent(actions, name="$\pi^u$")
    ql_agent = QLearningAgent(actions)
    print "done."

    agents = [vi_agent, opt_stoch_policy_agent, rand_agent, opt_belief_agent]

    # Run task.
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=100,
                          reset_at_terminal=False,
                          track_disc_reward=False,
                          cumulative_plot=True)
Example #10
    def planFromAtoB(self, Maps, nearestVertex, kStepConfig):

        # if not self.computedMDP:
        #     self.wallLocations = []
        #     for x in range(len(self.Maps.occupancyMap)):
        #         for y in range(len(self.Maps.occupancyMap[x])):
        #             if self.Maps.occupancyMap[x][y] == Env.WALL:
        #                 self.wallLocations.append(Loc.Location(x,y))
        #     self.computedMDP = True

        mdp = GridWorldMDP(width=len(Maps.occupancyMap),
                           height=len(Maps.occupancyMap[0]),
                           init_loc=(nearestVertex.x, nearestVertex.y),
                           goal_locs=[(kStepConfig.x, kStepConfig.y)],
                           gamma=0.95)
        vi = ValueIteration(mdp)
        vi.run_vi()
        action_seq, state_seq = vi.plan()

        # Reject the plan if it passes through a wall.
        for s in state_seq:
            if Maps.occupancyMap[s[0], s[1]] == env.WALL:
                return False
        return True
Example #11
def _make_mini_mdp_option_policy(mini_mdp):
    '''
    Args:
        mini_mdp (MDP)

    Returns:
        Policy
    '''
    # Solve the MDP defined by the terminal abstract state.
    mini_mdp_vi = ValueIteration(mini_mdp, delta=0.001, max_iterations=1000, sample_rate=10)
    iters, val = mini_mdp_vi.run_vi()

    o_policy_dict = make_dict_from_lambda(mini_mdp_vi.policy, mini_mdp_vi.get_states())
    o_policy = PolicyFromDict(o_policy_dict)

    return o_policy.get_action
Example #12
def _make_mini_mdp_option_policy(mini_mdp):
    '''
    Args:
        mini_mdp (MDP)

    Returns:
        Policy
    '''
    # Solve the MDP defined by the terminal abstract state.
    mini_mdp_vi = ValueIteration(mini_mdp,
                                 delta=0.005,
                                 max_iterations=1000,
                                 sample_rate=30)
    iters, val = mini_mdp_vi.run_vi()

    o_policy_dict = make_dict_from_lambda(mini_mdp_vi.policy,
                                          mini_mdp_vi.get_states())
    o_policy = PolicyFromDict(o_policy_dict)

    return o_policy.get_action, mini_mdp_vi
Example #13
def compute_optimistic_q_function(mdp_distr, sample_rate=5):
    '''
    Instead of transferring an average Q-value, we transfer the highest Q-value across the MDPs
    so that the transferred value does not underestimate the true Q-value.
    '''
    opt_q_func = defaultdict(lambda: defaultdict(lambda: float("-inf")))
    for mdp in mdp_distr.get_mdps():
        # prob_of_mdp = mdp_distr.get_prob_of_mdp(mdp)

        # Get a vi instance to compute state space.
        vi = ValueIteration(mdp,
                            delta=0.0001,
                            max_iterations=1000,
                            sample_rate=sample_rate)
        iters, value = vi.run_vi()
        q_func = vi.get_q_function()
        # print "value =", value
        for s in q_func:
            for a in q_func[s]:
                opt_q_func[s][a] = max(opt_q_func[s][a], q_func[s][a])
    return opt_q_func
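
As the later examples do (see Examples #16 and #18), the returned optimistic Q-function can seed a learning agent. A short sketch, assuming a Q-learning agent that supports set_init_q_function as used there:

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    agent = QLearningAgent(actions, epsilon=0.1, name="Q-trans-max")
    agent.set_init_q_function(opt_q_func)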
Example #14
def compute_optimal_stoch_policy(mdp_distr):
    '''
    Args:
        mdp_distr (defaultdict)

    Returns:
        (lambda)
    '''

    # Key: state
    # Val: dict
    # Key: action
    # Val: probability
    policy_dict = defaultdict(lambda: defaultdict(float))

    # Compute optimal policy for each MDP.
    for mdp in mdp_distr.get_all_mdps():
        # Solve the MDP and get the optimal policy.
        vi = ValueIteration(mdp, delta=0.001, max_iterations=1000)
        iters, value = vi.run_vi()
        vi_policy = vi.policy
        states = vi.get_states()

        # Compute the probability each action is optimal in each state.
        prob_of_mdp = mdp_distr.get_prob_of_mdp(mdp)
        for s in states:
            a_star = vi_policy(s)
            policy_dict[s][a_star] += prob_of_mdp

    # Create the lambda.
    def policy_from_dict(state):
        action_probs = list(policy_dict[state].values())
        action_id = np.random.multinomial(1, action_probs).tolist().index(1)
        action = list(policy_dict[state].keys())[action_id]

        return action

    return policy_from_dict
Example #15
    def update_init_q_function(self, mdp):
        if self.task_number == 0:
            self.default_q_func = copy.deepcopy(self.default_q_func)
        elif self.task_number < self.num_sample_tasks:
            new_q_func = self.q_func
            for x in new_q_func:
                for y in new_q_func[x]:
                    self.default_q_func[x][y] = max(new_q_func[x][y],
                                                    self.default_q_func[x][y])
        elif self.task_number == self.num_sample_tasks:
            vi = ValueIteration(mdp,
                                delta=0.1,
                                max_iterations=2,
                                sample_rate=1)
            _, _ = vi.run_vi()
            new_q_func = vi.get_q_function()  # VI to enumerate all states.
            for s in new_q_func:
                for a in new_q_func[s]:
                    # If (s, a) was never visited, initialize it to Vmax.
                    if self.default_q_func[s][a] < 0:
                        self.default_q_func[s][a] = self.default_q
            print(self.name, "Initial Q func from", self.task_number, "tasks")
            self.print_dict(self.default_q_func)
Example #16
def main(eps=0.1, open_plot=True):

    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Setup multitask setting.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print "Making and solving avg MDP...",
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    ### Yuu

    transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy,
                                            name="transferFixed")
    rand_agent = RandomAgent(actions, name="$\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = avg_mdp_vi.get_q_function()

    if alg == "q":
        pure_ql_agent = QLearnerAgent(actions, epsilon=eps, name="Q-0")
        qmax = 1.0 * (1 - 0.99)
        # qmax = 1.0
        pure_ql_agent_opt = QLearnerAgent(actions,
                                          epsilon=eps,
                                          default_q=qmax,
                                          name="Q-vmax")
        transfer_ql_agent_optq = QLearnerAgent(actions,
                                               epsilon=eps,
                                               name="Q-trans-max")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)
        transfer_ql_agent_avgq = QLearnerAgent(actions,
                                               epsilon=eps,
                                               name="Q-trans-avg")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)

        agents = [
            pure_ql_agent, pure_ql_agent_opt, transfer_ql_agent_optq,
            transfer_ql_agent_avgq
        ]
    elif alg == "rmax":
        pure_rmax_agent = RMaxAgent(actions, name="RMAX-vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(actions,
                                                      name="RMAX-updating_max")
        trans_rmax_agent = RMaxAgent(actions, name="RMAX-trans_max")
        trans_rmax_agent.set_init_q_function(opt_q_func)
        agents = [pure_rmax_agent, updating_trans_rmax_agent, trans_rmax_agent]
    elif alg == "delayed-q":
        pure_delayed_ql_agent = DelayedQLearnerAgent(actions,
                                                     opt_q_func,
                                                     name="DelayedQ-vmax")
        pure_delayed_ql_agent.set_vmax()
        updating_delayed_ql_agent = UpdatingDelayedQLearnerAgent(
            actions, name="DelayedQ-updating_max")
        trans_delayed_ql_agent = DelayedQLearnerAgent(
            actions, opt_q_func, name="DelayedQ-trans-max")
        agents = [
            pure_delayed_ql_agent, updating_delayed_ql_agent,
            trans_delayed_ql_agent
        ]
    else:
        print "Unknown type of agents:", alg
        print "(q, rmax, delayed-q)"
        assert (False)

    # Run task.
    # TODO: Function for Learning on each MDP
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=100,
                          reset_at_terminal=is_goal_terminal,
                          is_rec_disc_reward=False,
                          cumulative_plot=True,
                          open_plot=open_plot)
Example #17
    def update_init_q_function(self, mdp):
        '''
        If sample_with_q is True, run Q-learning for sample tasks.
        If qstar_transfer is True, run value iteration for sample tasks to get Q*.
        Else, run delayed Q-learning for sample tasks
        '''
        if self.sample_with_q:
            if self.task_number == 0:
                self.init_q_func = copy.deepcopy(self.q_agent.q_func)
            elif self.task_number < self.num_sample_tasks:
                new_q_func = self.q_agent.q_func
                for x in new_q_func:
                    for y in new_q_func[x]:
                        self.init_q_func[x][y] = max(new_q_func[x][y],
                                                     self.init_q_func[x][y])
        elif self.qstar_transfer:
            if self.task_number == 0:
                self.init_q_func = defaultdict(
                    lambda: defaultdict(lambda: float("-inf")))
            # else:
            elif self.task_number < self.num_sample_tasks:
                vi = ValueIteration(mdp,
                                    delta=0.0001,
                                    max_iterations=2000,
                                    sample_rate=5)
                _, _ = vi.run_vi()
                new_q_func = vi.get_q_function()
                for x in new_q_func:
                    for y in new_q_func[x]:
                        self.init_q_func[x][y] = max(new_q_func[x][y],
                                                     self.init_q_func[x][y])
        else:
            if self.task_number == 0:
                self.init_q_func = defaultdict(
                    lambda: defaultdict(lambda: float("-inf")))
            elif self.task_number < self.num_sample_tasks:
                new_q_func = self.q_func
                for x in new_q_func:
                    assert len(self.init_q_func[x]) <= len(new_q_func[x])
                    for y in new_q_func[x]:
                        self.init_q_func[x][y] = max(new_q_func[x][y],
                                                     self.init_q_func[x][y])
                        assert (self.init_q_func[x][y] <= self.default_q)

                ### Uncomment the code below to check whether the learned Q-values have converged close enough to the optimal values.
                # Compare q_func learned vs. the true Q value.
                # vi = ValueIteration(mdp, delta=0.001, max_iterations=2000, sample_rate=5)
                # _, _ = vi.run_vi()
                # qstar_func = vi.get_q_function()  # VI to enumerate all states
                # print "Q-function learned by delayed-Q"
                # self.print_dict(new_q_func)
                # print "Optimal Q-function"
                # self.print_dict(qstar_func)

        if self.task_number == self.num_sample_tasks:
            vi = ValueIteration(mdp,
                                delta=0.1,
                                max_iterations=2,
                                sample_rate=1)
            _, _ = vi.run_vi()
            new_q_func = vi.get_q_function()  # VI to enumerate all states
            for s in new_q_func:
                for a in new_q_func[s]:
                    # If (s, a) was never visited, initialize it to Vmax.
                    if self.init_q_func[s][a] < 0:
                        self.init_q_func[s][a] = self.default_q
            print(self.name, "Initial Q func from", self.task_number, "tasks")
            self.print_dict(self.init_q_func)
Example #18
def main(open_plot=True):
    episodes = 100
    steps = 100
    gamma = 0.95

    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Setup multitask setting.
    mdp_distr = make_mdp_distr(mdp_class=mdp_class,
                               is_goal_terminal=is_goal_terminal,
                               gamma=gamma)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print("Making and solving avg MDP...", end='')
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    ### Yuu

    # transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="transferFixed")
    rand_agent = RandomAgent(actions, name="$\\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = get_q_func(avg_mdp_vi)

    best_v = -100  # Initial lower bound; after the loop, holds the maximum Q-value across MDPs (used as Vmax).
    for x in opt_q_func:
        for y in opt_q_func[x]:
            best_v = max(best_v, opt_q_func[x][y])
    print("Vmax =", best_v)
    vmax = best_v

    vmax_func = defaultdict(lambda: defaultdict(lambda: vmax))

    if alg == "q":
        eps = 0.1
        lrate = 0.1
        pure_ql_agent = QLearningAgent(actions,
                                       gamma=gamma,
                                       alpha=lrate,
                                       epsilon=eps,
                                       name="Q-0")
        pure_ql_agent_opt = QLearningAgent(actions,
                                           gamma=gamma,
                                           alpha=lrate,
                                           epsilon=eps,
                                           default_q=vmax,
                                           name="Q-Vmax")
        ql_agent_upd_maxq = UpdatingQLearnerAgent(actions,
                                                  alpha=lrate,
                                                  epsilon=eps,
                                                  gamma=gamma,
                                                  default_q=vmax,
                                                  name="Q-MaxQInit")

        transfer_ql_agent_optq = QLearningAgent(actions,
                                                gamma=gamma,
                                                alpha=lrate,
                                                epsilon=eps,
                                                name="Q-UO")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)

        transfer_ql_agent_avgq = QLearningAgent(actions,
                                                gamma=gamma,
                                                alpha=lrate,
                                                epsilon=eps,
                                                name="Q-AverageQInit")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)

        agents = [
            transfer_ql_agent_optq, ql_agent_upd_maxq, transfer_ql_agent_avgq,
            pure_ql_agent_opt, pure_ql_agent
        ]
    elif alg == "rmax":
        """
        Note that RMax is a model-based algorithm and is very slow compared to model-free algorithms such as Q-learning and delayed Q-learning.
        """
        known_threshold = 10
        min_experience = 5
        pure_rmax_agent = RMaxAgent(actions,
                                    gamma=gamma,
                                    horizon=known_threshold,
                                    s_a_threshold=min_experience,
                                    name="RMAX-Vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(
            actions,
            gamma=gamma,
            horizon=known_threshold,
            s_a_threshold=min_experience,
            name="RMAX-MaxQInit")
        trans_rmax_agent = RMaxAgent(actions,
                                     gamma=gamma,
                                     horizon=known_threshold,
                                     s_a_threshold=min_experience,
                                     name="RMAX-UO")
        trans_rmax_agent.set_init_q_function(opt_q_func)
        agents = [
            trans_rmax_agent, updating_trans_rmax_agent, pure_rmax_agent,
            rand_agent
        ]
    elif alg == "delayed-q":
        torelance = 0.1
        min_experience = 5
        pure_delayed_ql_agent = DelayedQAgent(actions,
                                              gamma=gamma,
                                              m=min_experience,
                                              epsilon1=torelance,
                                              name="DelayedQ-Vmax")
        pure_delayed_ql_agent.set_q_function(vmax_func)
        updating_delayed_ql_agent = UpdatingDelayedQLearningAgent(
            actions,
            default_q=vmax,
            gamma=gamma,
            m=min_experience,
            epsilon1=torelance,
            name="DelayedQ-MaxQInit")
        updating_delayed_ql_agent.set_q_function(vmax_func)
        trans_delayed_ql_agent = DelayedQAgent(actions,
                                               gamma=gamma,
                                               m=min_experience,
                                               epsilon1=torelance,
                                               name="DelayedQ-UO")
        trans_delayed_ql_agent.set_q_function(opt_q_func)

        agents = [
            pure_delayed_ql_agent, updating_delayed_ql_agent,
            trans_delayed_ql_agent, rand_agent
        ]
        # agents = [updating_delayed_ql_agent, trans_delayed_ql_agent, rand_agent]
    elif alg == "sample-effect":
        """
        This compares MaxQInit with different numbers of MDP samples used to compute the initial Q-function. Note that the performance on the sampled MDPs is ignored for this experiment. It reproduces Figure 4 of "Policy and Value Transfer in Lifelong Reinforcement Learning".
        """
        torelance = 0.1
        min_experience = 5
        pure_delayed_ql_agent = DelayedQAgent(actions,
                                              opt_q_func,
                                              m=min_experience,
                                              epsilon1=torelance,
                                              name="DelayedQ-Vmax")
        pure_delayed_ql_agent.set_vmax()
        dql_60samples = UpdatingDelayedQLearningAgent(
            actions,
            default_q=vmax,
            gamma=gamma,
            m=min_experience,
            epsilon1=torelance,
            num_sample_tasks=60,
            name="$DelayedQ-MaxQInit60$")
        dql_40samples = UpdatingDelayedQLearningAgent(
            actions,
            default_q=vmax,
            gamma=gamma,
            m=min_experience,
            epsilon1=torelance,
            num_sample_tasks=40,
            name="$DelayedQ-MaxQInit40$")
        dql_20samples = UpdatingDelayedQLearningAgent(
            actions,
            default_q=vmax,
            gamma=gamma,
            m=min_experience,
            epsilon1=torelance,
            num_sample_tasks=20,
            name="$DelayedQ-MaxQInit20$")

        # Sample MDPs. Note that the performance of the sampled MDP is ignored and not included in the average in the final plot.
        run_agents_lifelong([dql_20samples],
                            mdp_distr,
                            samples=int(samples * 1 / 5.0),
                            episodes=episodes,
                            steps=steps,
                            reset_at_terminal=is_goal_terminal,
                            track_disc_reward=False,
                            cumulative_plot=True,
                            open_plot=open_plot)
        # mdp_distr.reset_tasks()
        run_agents_lifelong([dql_40samples],
                            mdp_distr,
                            samples=int(samples * 2 / 5.0),
                            episodes=episodes,
                            steps=steps,
                            reset_at_terminal=is_goal_terminal,
                            track_disc_reward=False,
                            cumulative_plot=True,
                            open_plot=open_plot)
        # mdp_distr.reset_tasks()
        run_agents_lifelong([dql_60samples],
                            mdp_distr,
                            samples=int(samples * 3 / 5.0),
                            episodes=episodes,
                            steps=steps,
                            reset_at_terminal=is_goal_terminal,
                            track_disc_reward=False,
                            cumulative_plot=True,
                            open_plot=open_plot)
        # mdp_distr.reset_tasks()
        # agents = [pure_delayed_ql_agent]
        agents = [
            dql_60samples, dql_40samples, dql_20samples, pure_delayed_ql_agent
        ]
    else:
        msg = "Unknown type of agent:" + alg + ". Use -agent_type (q, rmax, delayed-q)"
        assert False, msg

    # Run task.
    run_agents_lifelong(agents,
                        mdp_distr,
                        samples=samples,
                        episodes=episodes,
                        steps=steps,
                        reset_at_terminal=is_goal_terminal,
                        track_disc_reward=False,
                        cumulative_plot=True,
                        open_plot=open_plot)
Example #19
class PUDDLER:
    def __init__(self):
        self.base_human_model = PuddleMDP(step_cost=1.0)
        self.base_agent = ValueIteration(self.base_human_model,
                                         max_iterations=5000,
                                         sample_rate=1)
        self.sample_agent = ModQLearningAgent(
            actions=self.base_human_model.get_actions(),
            epsilon=0.5,
            anneal=True)
        #run_single_agent_on_mdp(self.base_agent, self.base_human_model, episodes=10000, steps=60, verbose=True)
        self.base_agent.run_vi()

        #print ("Q func", self.base_agent.q_func)
        self.test_run = False

        if self.test_run:
            self.novice_model_1 = self.base_human_model
            self.novice_model_2 = self.base_human_model
            self.fully_actulized_model = self.base_human_model

            self.novice_agent_1 = self.base_agent
            self.novice_agent_2 = self.base_agent
            self.fully_actulized_agent = self.base_agent
        else:

            self.novice_model_1 = PuddleMDP2(step_cost=1.0)
            self.novice_agent_1 = ValueIteration(self.novice_model_1)
            self.novice_agent_1.run_vi()

            self.novice_model_2 = PuddleMDP3(step_cost=1.0)
            self.novice_agent_2 = ValueIteration(self.novice_model_2)
            self.novice_agent_2.run_vi()

            self.fully_actulized_model = PuddleMDP4(step_cost=1.0)
            self.fully_actulized_agent = ValueIteration(
                self.fully_actulized_model)
            self.fully_actulized_agent.run_vi()
            #self.fully_actulized_agent = ModQLearningAgent(actions=self.fully_actulized_model.get_actions(), epsilon=0.5, anneal=True)
            #run_single_agent_on_mdp(self.fully_actulized_agent, self.fully_actulized_model, episodes=10000, steps=60, verbose=True)

        # TODO Add other settings

        self.current_agent = self.base_agent
        self.current_mdp = self.base_human_model

    def get_init_info(self):
        data_points = []
        return data_points

    def get_human_reinf_from_prev_step(self,
                                       state,
                                       action,
                                       explanation_features=[0, 0]):
        delta = 0.1
        print(explanation_features)
        if explanation_features[1] == 1 and explanation_features[0] == 1:
            self.current_mdp = self.fully_actulized_model
            self.current_agent = self.fully_actulized_agent
        elif explanation_features[0] == 1:
            self.current_mdp = self.novice_model_1
            self.current_agent = self.novice_agent_1
        elif explanation_features[1] == 1:
            self.current_mdp = self.novice_model_2
            self.current_agent = self.novice_agent_2
        else:
            self.current_mdp = self.base_human_model
            self.current_agent = self.base_agent

        curr_best_q_val = self.current_agent.get_value(state)
        curr_q_val = self.current_agent.get_q_value(state, action)
        #        return curr_q_val - curr_best_q_val
        return min((float(curr_best_q_val - curr_q_val) + delta) /
                   (float(curr_best_q_val) + delta), 1)

    def get_possible_actions(self):
        return self.base_human_model.get_actions()

    def get_best_action(self, state, explanation_features=[0, 0]):
        if explanation_features[1] == 1 and explanation_features[0] == 1:
            self.current_mdp = self.fully_actulized_model
            self.current_agent = self.fully_actulized_agent
        elif explanation_features[0] == 1:
            self.current_mdp = self.novice_model_1
            self.current_agent = self.novice_agent_1
        elif explanation_features[1] == 1:
            self.current_mdp = self.novice_model_2
            self.current_agent = self.novice_agent_2
        else:
            self.current_mdp = self.base_human_model
            self.current_agent = self.base_agent

        return self.current_agent._get_max_q_action(state)

    def get_initial_state(self):
        # TODO Randomize
        return self.base_human_model.get_init_state()

    def get_initial_state_features(self):
        return self.base_human_model.get_init_state().features()

    def get_next_state(self, state, act, explanation_features=[0]):
        if explanation_features[0] >= 0.5:
            self.current_mdp = self.fully_actulized_model
            self.current_agent = self.fully_actulized_agent
        else:
            self.current_mdp = self.base_human_model
            self.current_agent = self.base_agent

        self.current_mdp.set_state(state)
        reward, new_state = self.current_mdp.execute_agent_action(act)
        return new_state

    def set_state(self, x, y):
        state = GridWorldState(x, y)
        self.base_human_model.set_state(state)
        return state

    def visualize_agent(self, state):
        self.base_human_model.set_state(state)
        self.base_human_model.visualize_state(self.sample_agent)
Example #20
def make_singletask_sa(mdp, indic_func, state_class, epsilon=0.0, aa_single_act=False, prob_of_mdp=1.0, track_act_opt_pr=False):
    '''
    Args:
        mdp (MDP)
        indic_func (S x S --> {0,1})
        state_class (Class)
        epsilon (float)

    Returns:
        (StateAbstraction)
    '''

    print("\tRunning VI...",)
    sys.stdout.flush()
    # Run VI
    if isinstance(mdp, MDPDistribution):
        mdp = mdp.sample()

    vi = ValueIteration(mdp)
    iters, val = vi.run_vi()
    print(" done.")

    print("\tMaking state abstraction...",)
    sys.stdout.flush()
    sa = StateAbstraction(phi={}, state_class=state_class, track_act_opt_pr=track_act_opt_pr)
    clusters = defaultdict(list)
    num_states = len(vi.get_states())

    actions = mdp.get_actions()
    # Find state pairs that satisfy the condition.
    for i, state_x in enumerate(vi.get_states()):
        sys.stdout.flush()
        clusters[state_x] = [state_x]

        for state_y in vi.get_states()[i:]:
            if not (state_x == state_y) and indic_func(state_x, state_y, vi, actions, epsilon=epsilon):
                clusters[state_x].append(state_y)
                clusters[state_y].append(state_x)

    print("making clusters...",)
    sys.stdout.flush()
    
    # Build SA.
    for i, state in enumerate(clusters.keys()):
        new_cluster = clusters[state]
        sa.make_cluster(new_cluster)

        # Destroy old so we don't double up.
        for s in clusters[state]:
            if s in clusters.keys():
                clusters.pop(s)
    
    if aa_single_act:
        # Put all optimal actions in a set associated with the ground state.
        for ground_s in sa.get_ground_states():
            a_star_set = set(vi.get_max_q_actions(ground_s))
            sa.set_actions_state_opt_dict(ground_s, a_star_set, prob_of_mdp)

    print(" done.")
    print("\tGround States:", num_states)
    print("\tAbstract:", sa.get_num_abstr_states())
    print()

    return sa
Example #21
def make_singletask_sa(mdp,
                       indic_func,
                       state_class,
                       epsilon=0.0,
                       aa_single_act=False,
                       prob_of_mdp=1.0):
    '''
    Args:
        mdp (MDP)
        indic_func (S x S --> {0,1})
        state_class (Class)
        epsilon (float)

    Returns:
        (StateAbstraction)
    '''

    print "\tRunning VI...",
    sys.stdout.flush()
    # Run VI
    if isinstance(mdp, MDPDistribution):
        mdp = mdp.sample()

    vi = ValueIteration(mdp)
    iters, val = vi.run_vi()
    print " done."

    print "\tMaking state abstraction...",
    sys.stdout.flush()
    sa = StateAbstraction(phi={}, state_class=state_class)
    clusters = defaultdict(set)
    num_states = len(vi.get_states())
    actions = mdp.get_actions()

    # Find state pairs that satisfy the condition.
    for i, state_x in enumerate(vi.get_states()):
        sys.stdout.flush()
        clusters[state_x].add(state_x)

        for state_y in vi.get_states()[i:]:
            if not (state_x == state_y) and indic_func(
                    state_x, state_y, vi, actions, epsilon=epsilon):
                clusters[state_x].add(state_y)
                clusters[state_y].add(state_x)

    print "making clusters...",
    sys.stdout.flush()

    # Build SA.
    for i, state in enumerate(clusters.keys()):
        new_cluster = clusters[state]
        sa.make_cluster(new_cluster)

        # Destroy old so we don't double up.
        for s in clusters[state]:
            if s in clusters.keys():
                clusters.pop(s)

    print " done."
    print "\tGround States:", num_states
    print "\tAbstract:", sa.get_num_abstr_states()
    print

    return sa
Example #22
def make_singletask_sa(mdp,
                       indic_func,
                       state_class,
                       epsilon=0.0,
                       aa_single_act=False,
                       prob_of_mdp=1.0,
                       track_act_opt_pr=False):
    '''
    Args:
        mdp (MDP)
        indic_func (S x S --> {0,1})
        state_class (Class)
        epsilon (float)

    Returns:
        (StateAbstraction)
    '''

    print("\tRunning VI...", )
    sys.stdout.flush()
    # Run VI
    if isinstance(mdp, MDPDistribution):
        mdp = mdp.sample()

    vi = ValueIteration(mdp)
    iters, val = vi.run_vi()
    print(" done.")

    print("\tMaking state abstraction...", )
    sys.stdout.flush()
    sa = StateAbstraction(phi={},
                          state_class=state_class,
                          track_act_opt_pr=track_act_opt_pr)
    clusters = defaultdict(list)
    num_states = len(vi.get_states())

    actions = mdp.get_actions()
    # Find state pairs that satisfy the condition.
    for i, state_x in enumerate(vi.get_states()):
        sys.stdout.flush()
        clusters[state_x] = [state_x]

        for state_y in vi.get_states()[i:]:
            if not (state_x == state_y) and indic_func(
                    state_x, state_y, vi, actions, epsilon=epsilon):
                clusters[state_x].append(state_y)
                clusters[state_y].append(state_x)

    print("making clusters...", )
    sys.stdout.flush()

    # Build SA.
    for i, state in enumerate(clusters.keys()):
        new_cluster = clusters[state]
        sa.make_cluster(new_cluster)

        # Destroy old so we don't double up.
        for s in clusters[state]:
            if s in clusters.keys():
                clusters.pop(s)

    if aa_single_act:
        # Put all optimal actions in a set associated with the ground state.
        for ground_s in sa.get_ground_states():
            a_star_set = set(vi.get_max_q_actions(ground_s))
            sa.set_actions_state_opt_dict(ground_s, a_star_set, prob_of_mdp)

    print(" done.")
    print("\tGround States:", num_states)
    print("\tAbstract:", sa.get_num_abstr_states())
    print()

    return sa
Example #23
def main():

    # Setup environment.
    mdp_class, agent_type, samples = parse_args()
    is_goal_terminal = False
    mdp_distr = make_mdp_distr(mdp_class=mdp_class,
                               is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute priors.

    # Stochastic mixture.
    mdp_distr_copy = copy.deepcopy(mdp_distr)
    opt_stoch_policy = ape.compute_optimal_stoch_policy(mdp_distr_copy)

    # Avg MDP
    avg_mdp = ape.compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    # Make agents.

    # Q Learning
    ql_agent = QLearnerAgent(actions)
    shaped_ql_agent_prior = ShapedQAgent(shaping_policy=opt_stoch_policy,
                                         actions=actions,
                                         name="Prior-QLearning")
    shaped_ql_agent_avgmdp = ShapedQAgent(shaping_policy=avg_mdp_vi.policy,
                                          actions=actions,
                                          name="AvgMDP-QLearning")

    # RMax
    rmax_agent = RMaxAgent(actions)
    shaped_rmax_agent_prior = ShapedRMaxAgent(
        shaping_policy=opt_stoch_policy,
        state_space=avg_mdp_vi.get_states(),
        actions=actions,
        name="Prior-RMax")
    shaped_rmax_agent_avgmdp = ShapedRMaxAgent(
        shaping_policy=avg_mdp_vi.policy,
        state_space=avg_mdp_vi.get_states(),
        actions=actions,
        name="AvgMDP-RMax")
    prune_rmax_agent = PruneRMaxAgent(mdp_distr=mdp_distr)

    if agent_type == "rmax":
        agents = [
            rmax_agent, shaped_rmax_agent_prior, shaped_rmax_agent_avgmdp,
            prune_rmax_agent
        ]
    else:
        agents = [ql_agent, shaped_ql_agent_prior, shaped_ql_agent_avgmdp]

    # Run task.
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=200,
                          is_rec_disc_reward=False,
                          verbose=True)