Code example #1
def make_abstr_mdp_distr_multi_level(mdp_distr,
                                     state_abstr,
                                     action_abstr,
                                     step_cost=0.1):
    '''
    Args:
        mdp_distr (MDPDistribution)
        state_abstr (StateAbstraction)
        action_abstr (ActionAbstraction)
        step_cost (float)

    Returns:
        (MDPDistribution)
    '''

    # Loop through old mdps and abstract.
    mdp_distr_dict = {}
    for mdp in mdp_distr.get_all_mdps():
        abstr_mdp = make_abstr_mdp_multi_level(mdp,
                                               state_abstr,
                                               action_abstr,
                                               step_cost=step_cost)
        prob_of_abstr_mdp = mdp_distr.get_prob_of_mdp(mdp)
        mdp_distr_dict[abstr_mdp] = prob_of_abstr_mdp

    return MDPDistribution(mdp_distr_dict)
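A minimal usage sketch of the MDPDistribution container this function consumes and returns (the imports are simple_rl's; the toy MDPs and probabilities are placeholders, and the abstraction arguments are assumed to come from the surrounding project):

from simple_rl.mdp import MDPDistribution
from simple_rl.tasks import GridWorldMDP

# Build a toy two-MDP distribution by hand; probabilities must sum to 1.
mdp_a = GridWorldMDP(width=4, height=4, goal_locs=[(4, 4)])
mdp_b = GridWorldMDP(width=4, height=4, goal_locs=[(1, 4)])
toy_distr = MDPDistribution({mdp_a: 0.5, mdp_b: 0.5})

print(toy_distr.get_prob_of_mdp(mdp_a))  # 0.5
sampled_mdp = toy_distr.sample()         # draw one MDP at random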
Code example #2
def make_mdp_distr(mdp_class="grid", num_mdps=15, gamma=0.99):
    '''
    Args:
        mdp_class (str): one of {"grid", "four_room", "chain", "random"}
        num_mdps (int)
        gamma (float)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    mdp_prob = 1.0 / num_mdps
    height, width = 10, 10

    # Make @num_mdps MDPs.
    for i in range(num_mdps):
        next_goals = rnd.sample([(1, 7), (7, 1), (7, 7), (6, 6), (6, 1),
                                 (1, 6)], 2)
        new_mdp = {
            "grid":
            GridWorldMDP(width=width,
                         height=height,
                         init_loc=(1, 1),
                         goal_locs=rnd.sample(
                             list(zip(range(1, width + 1), [height] * width)), 1),
                         is_goal_terminal=True,
                         gamma=gamma),
            "four_room":
            FourRoomMDP(width=8, height=8, goal_locs=next_goals, gamma=gamma),
            "chain":
            ChainMDP(num_states=10,
                     reset_val=rnd.choice([0, 0.01, 0.05, 0.1]),
                     gamma=gamma),
            "random":
            RandomMDP(num_states=40,
                      num_rand_trans=rnd.randint(1, 10),
                      gamma=gamma)
        }[mdp_class]

        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict)
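A hypothetical driver for the function above (a sketch; it assumes this snippet's own imports, e.g. the random module bound as rnd and the simple_rl task classes, are in scope):

four_room_distr = make_mdp_distr(mdp_class="four_room", num_mdps=5, gamma=0.95)
mdp = four_room_distr.sample()
print(mdp, four_room_distr.get_prob_of_mdp(mdp))  # each MDP has probability 1/5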
Code example #3
def make_mdp_distr(mdp_class, is_goal_terminal, mdp_size=11, horizon=0, gamma=0.99):
    '''
    Args:
        mdp_class (str): one of {"chain", "four_room", "corridor", "combo_lock", "spread", "tight"}
        is_goal_terminal (bool)
        mdp_size (int)
        horizon (int)
        gamma (float)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}

    height, width = mdp_size, mdp_size

    # Corridor.
    corr_width = 20
    corr_goal_magnitude = 1 #random.randint(1, 5)
    corr_goal_cols = [i for i in range(1, corr_goal_magnitude + 1)] + [j for j in range(corr_width-corr_goal_magnitude + 1, corr_width + 1)]
    corr_goal_locs  = list(itertools.product(corr_goal_cols, [1]))

    # Grid World
    tl_grid_world_rows, tl_grid_world_cols = [i for i in range(width - 4, width)], [j for j in range(height - 4, height)]
    tl_grid_goal_locs = list(itertools.product(tl_grid_world_rows, tl_grid_world_cols))
    tr_grid_world_rows, tr_grid_world_cols = [i for i in range(1, 4)], [j for j in range(height - 4, height)]
    tr_grid_goal_locs = list(itertools.product(tr_grid_world_rows, tr_grid_world_cols))
    grid_goal_locs = tl_grid_goal_locs + tr_grid_goal_locs

    # Four room.
    four_room_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2), (width - 2, 1)]

    # SPREAD vs. TIGHT
    spread_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2), (width - 2, 1), (2,2)]
    tight_goal_locs = [(width, height), (width-1, height), (width, height-1), (width, height - 2), (width - 2, height), (width - 1, height-1), (width-2,height-2)]

    changing_entities = {"four_room":four_room_goal_locs,
                    "grid":grid_goal_locs,
                    "corridor":corr_goal_locs,
                    "spread":spread_goal_locs,
                    "tight":tight_goal_locs,
                    "chain":[0.0, 0.01, 0.1, 0.5, 1.0],
                    "combo_lock":[[3,1,2],[3,2,1],[2,3,1],[3,3,1]],
                    "walls":make_wall_permutations(mdp_size),
                    "lava":make_lava_permutations(mdp_size)
                    }

    # MDP Probability.
    num_mdps = 10 if mdp_class not in changing_entities.keys() else len(changing_entities[mdp_class])
    if mdp_class == "octo":
        num_mdps = 12
    mdp_prob = 1.0 / num_mdps

    for i in range(num_mdps):

        new_mdp = {"chain":ChainMDP(reset_val=changing_entities["chain"][i%len(changing_entities["chain"])]),
                   # "lava":GridWorldMDP(width=width, height=height, rand_init=False, step_cost=-0.001, lava_cost=0.0, lava_locs=changing_entities["lava"][i%len(changing_entities["lava"])], goal_locs=[(mdp_size-3, mdp_size-3)], is_goal_terminal=is_goal_terminal, name="lava_world", slip_prob=0.1),
                    "four_room":FourRoomMDP(width=width, height=height, goal_locs=[changing_entities["four_room"][i % len(changing_entities["four_room"])]], is_goal_terminal=is_goal_terminal),
                   # "octo":make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False, goal_num=i),
                    "corridor":GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[changing_entities["corridor"][i % len(changing_entities["corridor"])]], is_goal_terminal=is_goal_terminal, name="corridor"),
                    "combo_lock":ComboLockMDP(combo=changing_entities["combo_lock"][i%len(changing_entities["combo_lock"])]),
                    "spread":GridWorldMDP(width=width, height=height, rand_init=False, goal_locs=[changing_entities["spread"][i % len(changing_entities["spread"])]], is_goal_terminal=is_goal_terminal, name="spread_grid"),
                    "tight":GridWorldMDP(width=width, height=height, rand_init=False, goal_locs=[changing_entities["tight"][i % len(changing_entities["tight"])]], is_goal_terminal=is_goal_terminal, name="tight_grid"),
                    }[mdp_class]

        new_mdp.set_gamma(gamma)
        
        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict, horizon=horizon)
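One design note on the {...}[mdp_class] dispatch used above: the dict literal constructs every MDP on each loop iteration and then keeps only one. A lazy variant (a sketch with placeholder arguments, not the project's code) defers construction behind callables:

makers = {
    "chain": lambda: ChainMDP(reset_val=0.01),
    "combo_lock": lambda: ComboLockMDP(combo=[3, 1, 2]),
}
new_mdp = makers["chain"]()  # only the selected MDP is actually built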
Code example #4
File: make_mdp.py  Project: RoyalGuan/simple_rl
def make_mdp_distr(mdp_class="grid", grid_dim=7, horizon=0):
    '''
    Args:
        mdp_class (str): one of {"hall", "corridor", "grid", "four_room", "pblocks_grid", "chain", "random", "taxi"}
        horizon (int)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    height, width = grid_dim, grid_dim

    # Define goal locations.

    # Corridor.
    corr_width = 20
    corr_goal_magnitude = random.randint(1, 5)
    corr_goal_cols = [i for i in range(1, corr_goal_magnitude)] + [
        j for j in range(corr_width - corr_goal_magnitude, corr_width + 1)
    ]
    corr_goal_locs = list(itertools.product(corr_goal_cols, [1]))

    # Grid World
    grid_world_rows, grid_world_cols = [i for i in range(width - 4, width)], [
        j for j in range(height - 4, height)
    ]
    grid_goal_locs = list(itertools.product(grid_world_rows, grid_world_cols))

    # Hallway.
    hall_goal_locs = [(i, width) for i in range(1, height + 1)]

    # Four room.
    four_room_goal_locs = [(2, 2), (width, height), (width, 1), (1, height)]

    # Taxi.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    walls = []

    goal_loc_dict = {
        "four_room": four_room_goal_locs,
        "hall": hall_goal_locs,
        "grid": grid_goal_locs,
        "corridor": corr_goal_locs
    }

    # MDP Probability.
    num_mdps = 10 if mdp_class not in goal_loc_dict.keys() else len(
        goal_loc_dict[mdp_class])
    mdp_prob = 1.0 / num_mdps

    for i in range(num_mdps):

        new_mdp = {"hall":GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=[goal_loc_dict["hall"][i % len(goal_loc_dict["hall"])]]),
                    "corridor":GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[goal_loc_dict["corridor"][i % len(goal_loc_dict["corridor"])]], is_goal_terminal=True),
                    "grid":GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=[goal_loc_dict["grid"][i % len(goal_loc_dict["grid"])]], is_goal_terminal=True),
                    "four_room":FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]]),
                    # THESE GOALS ARE SPECIFIED IMPLICITLY:
                    "pblocks_grid":make_grid_world_from_file("pblocks_grid.txt", randomize=True),
                    "chain":ChainMDP(num_states=10, reset_val=random.choice([0, 0.01, 0.05, 0.1, 0.2, 0.5])),
                    "random":RandomMDP(num_states=40, num_rand_trans=random.randint(1,10)),
                    "taxi":TaxiOOMDP(4, 4, slip_prob=0.0, agent=agent, walls=walls, \
                                    passengers=[{"x":2, "y":2, "dest_x":random.randint(1,4), "dest_y":random.randint(1,4), "in_taxi":0}])}[mdp_class]

        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict, horizon=horizon)
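A hypothetical call into this version (a sketch): since "taxi" has no entry in goal_loc_dict, ten MDPs are built, each with randomized passenger destinations.

taxi_distr = make_mdp_distr(mdp_class="taxi", grid_dim=4)
print(len(taxi_distr.get_all_mdps()))  # up to 10 taxi MDPs, each assigned probability 1/10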
Code example #5

def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()
    num_test_mdps = 6  # 6 is max.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy

    if params['multitask']:
        # Make distribution. Divide by the number of gravities actually used
        # (only four are listed) so the probabilities sum to 1.
        gravities = [5.0, 6.0, 8.0, 12.0][:num_test_mdps]
        mdp_dist_dict = {
            CartPoleMDP(gravity=gravity): 1.0 / len(gravities)
            for gravity in gravities
        }
        test_mdp = MDPDistribution(mdp_dist_dict)
    else:
        test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions,
                                num_features=num_features,
                                alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={
                                      "alpha": params['rl_learning_rate'],
                                      "epsilon": 0.2,
                                      "actions": test_mdp.get_actions()
                                  },
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================

    if params['multitask']:
        run_agents_lifelong([sa_agent, linear_agent],
                            test_mdp,
                            samples=params['num_instances'],
                            episodes=params['episodes'],
                            steps=params['steps'],
                            verbose=False)
    else:
        # demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)
        run_agents_on_mdp([sa_agent, linear_agent],
                          test_mdp,
                          instances=params['num_instances'],
                          episodes=params['episodes'],
                          steps=params['steps'],
                          verbose=False)
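As with the other drivers in this collection, a script entry point would presumably just invoke main():

if __name__ == "__main__":
    main()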
Code example #6
def make_mdp_distr(mdp_class="grid",
                   grid_dim=9,
                   horizon=0,
                   step_cost=0,
                   gamma=0.99):
    '''
    Args:
        mdp_class (str): one of {"hrooms", "octo", "hall", "corridor", "grid", "four_room", "pblocks_grid", "chain", "random", "taxi"}
        horizon (int)
        step_cost (float)
        gamma (float)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    height, width = grid_dim, grid_dim

    # Define goal locations.

    # Corridor.
    corr_width = 20
    corr_goal_magnitude = 1  #random.randint(1, 5)
    corr_goal_cols = [i for i in range(1, corr_goal_magnitude + 1)] + [
        j for j in range(corr_width - corr_goal_magnitude + 1, corr_width + 1)
    ]
    corr_goal_locs = list(itertools.product(corr_goal_cols, [1]))

    # Grid World
    tl_grid_world_rows, tl_grid_world_cols = [
        i for i in range(width - 4, width)
    ], [j for j in range(height - 4, height)]
    tl_grid_goal_locs = list(
        itertools.product(tl_grid_world_rows, tl_grid_world_cols))
    tr_grid_world_rows, tr_grid_world_cols = [i for i in range(1, 4)], [
        j for j in range(height - 4, height)
    ]
    tr_grid_goal_locs = list(
        itertools.product(tr_grid_world_rows, tr_grid_world_cols))
    grid_goal_locs = tl_grid_goal_locs + tr_grid_goal_locs

    # Hallway.
    hall_goal_locs = [(i, height) for i in range(1, 30)]

    # Four room.
    four_room_goal_locs = [(width, height), (width, 1), (1, height),
                           (1, height - 2),
                           (width - 2, height - 2)]  #, (width - 2, 1)]

    # Taxi.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    walls = []

    goal_loc_dict = {
        "four_room": four_room_goal_locs,
        "hall": hall_goal_locs,
        "grid": grid_goal_locs,
        "corridor": corr_goal_locs,
    }

    # MDP Probability.
    num_mdps = 10 if mdp_class not in goal_loc_dict.keys() else len(
        goal_loc_dict[mdp_class])
    if mdp_class == "octo":
        num_mdps = 12
    mdp_prob = 1.0 / num_mdps

    for i in range(num_mdps):

        new_mdp = {"hrooms":make_grid_world_from_file("hierarch_rooms.txt", num_goals=7, randomize=False),
                    "octo":make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False, goal_num=i),
                    "hall":GridWorldMDP(width=30, height=height, rand_init=False, goal_locs=goal_loc_dict["hall"], name="hallway", is_goal_terminal=True),
                    "corridor":GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[goal_loc_dict["corridor"][i % len(goal_loc_dict["corridor"])]], is_goal_terminal=True, name="corridor"),
                    "grid":GridWorldMDP(width=width, height=height, rand_init=True, goal_locs=[goal_loc_dict["grid"][i % len(goal_loc_dict["grid"])]], is_goal_terminal=True),
                    "four_room":FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]], is_goal_terminal=True),
                    # THESE GOALS ARE SPECIFIED IMPLICITLY:
                    "pblocks_grid":make_grid_world_from_file("pblocks_grid.txt", randomize=True, slip_prob=0.1),
                    "chain":ChainMDP(num_states=10, reset_val=random.choice([0, 0.01, 0.05, 0.1, 0.2, 0.5])),
                    "random":RandomMDP(num_states=40, num_rand_trans=random.randint(1,10)),
                    "taxi":TaxiOOMDP(3, 4, slip_prob=0.0, agent=agent, walls=walls, \
                                    passengers=[{"x":2, "y":1, "dest_x":random.choice([2,3]), "dest_y":random.choice([2,3]), "in_taxi":0},
                                                {"x":1, "y":2, "dest_x":random.choice([1,2]), "dest_y":random.choice([1,4]), "in_taxi":0}])}[mdp_class]

        new_mdp.set_step_cost(step_cost)
        new_mdp.set_gamma(gamma)

        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict, horizon=horizon)
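A hypothetical driver for this version (a sketch): a finite-horizon chain distribution with a small per-step cost.

chain_distr = make_mdp_distr(mdp_class="chain", horizon=50, step_cost=0.01, gamma=0.95)
for mdp in chain_distr.get_all_mdps():
    print(mdp, chain_distr.get_prob_of_mdp(mdp))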
Code example #7
def make_mdp_distr(mdp_class="grid", grid_dim=9, horizon=0, step_cost=0, gamma=0.99):
    '''
    Args:
        mdp_class (str): one of {"hrooms", "octo", "upworld", "corridor", "grid", "four_room", "color", "tight_four_room"}
        horizon (int)
        step_cost (float)
        gamma (float)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    height, width = grid_dim, grid_dim

    # Define goal locations.
        
    # Corridor.
    corr_width = 20
    corr_goal_magnitude = 1 #random.randint(1, 5)
    corr_goal_cols = [i for i in range(1, corr_goal_magnitude + 1)] + [j for j in range(corr_width-corr_goal_magnitude + 1, corr_width + 1)]
    corr_goal_locs  = list(itertools.product(corr_goal_cols, [1]))

    # Grid World
    tl_grid_world_rows, tl_grid_world_cols = [i for i in range(width - 4, width)], [j for j in range(height - 4, height)]
    tl_grid_goal_locs = list(itertools.product(tl_grid_world_rows, tl_grid_world_cols))
    tr_grid_world_rows, tr_grid_world_cols = [i for i in range(1, 4)], [j for j in range(height - 4, height)]
    tr_grid_goal_locs = list(itertools.product(tr_grid_world_rows, tr_grid_world_cols))
    grid_goal_locs = tl_grid_goal_locs + tr_grid_goal_locs

    # Upworld (hallway along the top row).
    upworld_goal_locs = [(i, height) for i in range(1, 30)]

    # Four room.
    four_room_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2), (width - 2, 1)]

    print(four_room_goal_locs)
                            
    tight_four_room_goal_locs = [(width, height), (width, height-1), (width-1, height), (width, height - 2), (width - 2, height), (width-1, height-1)]

    # Taxi.
    agent = {"x":1, "y":1, "has_passenger":0}
    walls = []

    goal_loc_dict = {"four_room":four_room_goal_locs,
                    "color":four_room_goal_locs,
                    "upworld":upworld_goal_locs,
                    "grid":grid_goal_locs,
                    "corridor":corr_goal_locs,
                    "tight_four_room":tight_four_room_goal_locs,
                    }

    # MDP Probability.
    num_mdps = 10 if mdp_class not in goal_loc_dict.keys() else len(goal_loc_dict[mdp_class])
    if mdp_class == "octo":
        num_mdps = 12
    mdp_prob = 1.0 / num_mdps

    for i in range(num_mdps):

        new_mdp = {"hrooms":make_grid_world_from_file("hierarch_rooms.txt", num_goals=7, randomize=False),
                    "octo":make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False, goal_num=i),
                    "upworld":GridWorldMDP(width=30, height=height, rand_init=False, goal_locs=goal_loc_dict["upworld"], name="upworld", is_goal_terminal=True),
                    "corridor":GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[goal_loc_dict["corridor"][i % len(goal_loc_dict["corridor"])]], is_goal_terminal=True, name="corridor"),
                    "grid":GridWorldMDP(width=width, height=height, rand_init=True, goal_locs=[goal_loc_dict["grid"][i % len(goal_loc_dict["grid"])]], is_goal_terminal=True),
                    "four_room":FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]], is_goal_terminal=True),
                    "color":ColorMDP(width=width, height=height, num_colors=4, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]], is_goal_terminal=True),
                    "tight_four_room":FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["tight_four_room"][i % len(goal_loc_dict["tight_four_room"])]], is_goal_terminal=True, name="tight_four_room")}[mdp_class]

        new_mdp.set_step_cost(step_cost)
        new_mdp.set_gamma(gamma)
        
        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict, horizon=horizon)
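These distributions are typically consumed by simple_rl's experiment runners, e.g. run_agents_lifelong as in the CartPole driver above (a sketch; the agent and run lengths are placeholders):

from simple_rl.agents import QLearningAgent
from simple_rl.run_experiments import run_agents_lifelong

mdp_distr = make_mdp_distr(mdp_class="four_room", grid_dim=9, gamma=0.99)
ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
run_agents_lifelong([ql_agent], mdp_distr, samples=5, episodes=50, steps=100)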
Code example #8
def make_env_distribution(env_class='grid-world',
                          env_name=None,
                          n_env=10,
                          gamma=.9,
                          version=1,
                          w=5,
                          h=5,
                          stochastic=False,
                          horizon=0,
                          verbose=True):
    """
    Create a distribution over environments.
    This function is specialized to the included environments.
    :param env_class: (str) name of the environment class
    :param env_name: (str) name of the environment for save path
    :param n_env: (int) number of environments in the distribution
    :param gamma: (float) discount factor
    :param version: (int) in case a version indicator is needed
    :param w: (int) width for grid-world
    :param h: (int) height for grid-world
    :param horizon: (int)
    :param verbose: (bool) print info if True
    :param stochastic: (bool) some environments may be stochastic
    :return: (MDPDistribution)
    """
    if verbose:
        print('Creating environments of class', env_class)

    sampling_probability = 1. / float(n_env)
    env_dist_dict = {}

    if env_class == 'octo-grid':
        return MDPDistribution(octo_grid_collection(gamma, env_name),
                               horizon=horizon)
    elif env_class == 'deterministic-tight':
        return MDPDistribution(tight_collection(gamma, env_name),
                               horizon=horizon)
    elif env_class == 'deterministic-super-tight':
        return MDPDistribution(super_tight_collection(gamma, env_name),
                               horizon=horizon)
    elif env_class == 'deterministic-super-tight-big':
        return MDPDistribution(super_tight_collection_big(gamma, env_name),
                               horizon=horizon)
    elif env_class == 'stochastic-super-tight':
        return MDPDistribution(super_tight_collection(gamma,
                                                      env_name,
                                                      sto=True),
                               horizon=horizon)
    elif env_class == 'stochastic-super-tight-big':
        return MDPDistribution(super_tight_collection_big(gamma,
                                                          env_name,
                                                          sto=True),
                               horizon=horizon)
    elif env_class == 'deterministic-tight-big':
        return MDPDistribution(tight_collection_big(gamma, env_name),
                               horizon=horizon)
    elif env_class == 'deterministic-tight-small':
        return MDPDistribution(tight_collection_small(gamma, env_name),
                               horizon=horizon)
    elif env_class == 'deterministic-tight-super-big':
        return MDPDistribution(tight_collection_super_big(gamma, env_name),
                               horizon=horizon)
    elif env_class == 'stochastic-tight':
        return MDPDistribution(tight_collection(gamma, env_name, sto=True),
                               horizon=horizon)
    elif env_class == 'stochastic-tight-big':
        return MDPDistribution(tight_collection_big(gamma, env_name, sto=True),
                               horizon=horizon)
    elif env_class == 'stochastic-tight-small':
        return MDPDistribution(tight_collection_small(gamma,
                                                      env_name,
                                                      sto=True),
                               horizon=horizon)
    elif env_class == 'deterministic-spread':
        return MDPDistribution(deterministic_spread_collection(
            gamma, env_name),
                               horizon=horizon)
    elif env_class == 'four-room':
        return MDPDistribution(four_room_collection(gamma, env_name, size=7),
                               horizon=horizon)
    elif env_class == 'four-room-big':
        return MDPDistribution(four_room_collection(gamma, env_name, size=11),
                               horizon=horizon)

    for _ in range(n_env):
        if env_class == 'grid-world':
            new_env = sample_grid_world(gamma, env_name, w, h, verbose)
        elif env_class == 'corridor':
            new_env = sample_corridor(gamma, env_name, w, verbose)
        elif env_class == 'heat-map':
            new_env = sample_heat_map(gamma, env_name, w, h, verbose)
        elif env_class == 'maze-multi-walls':
            new_env = sample_maze_multi(gamma, env_name, verbose)
        elif env_class == 'maze-mono-goal':
            new_env = sample_maze_mono(gamma, env_name, verbose)
        elif env_class == 'tight':
            new_env = sample_tight(gamma, env_name, version, w, h, stochastic,
                                   verbose)
        elif env_class == 'test':
            new_env = sample_test_environment(gamma)
        else:
            raise ValueError('Environment class not implemented.')
        env_dist_dict[new_env] = sampling_probability

    return MDPDistribution(env_dist_dict, horizon=horizon)
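A hypothetical call (a sketch; the env_name value is a placeholder for the save path): a distribution over twenty sampled 5x5 grid-worlds.

env_distr = make_env_distribution(env_class='grid-world', env_name='toy-grids',
                                  n_env=20, gamma=0.9, w=5, h=5)
env = env_distr.sample()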