# NOTE: these snippets assume simple_rl-style imports, e.g.:
#   import itertools, random
#   import random as rnd
#   from simple_rl.mdp import MDPDistribution
#   from simple_rl.tasks import GridWorldMDP, FourRoomMDP, ChainMDP, RandomMDP, TaxiOOMDP

def make_abstr_mdp_distr_multi_level(mdp_distr, state_abstr, action_abstr, step_cost=0.1):
    '''
    Args:
        mdp_distr (MDPDistribution)
        state_abstr (StateAbstraction)
        action_abstr (ActionAbstraction)
        step_cost (float)

    Returns:
        (MDPDistribution)
    '''
    # Loop through old mdps and abstract.
    mdp_distr_dict = {}
    for mdp in mdp_distr.get_all_mdps():
        abstr_mdp = make_abstr_mdp_multi_level(mdp, state_abstr, action_abstr, step_cost=step_cost)
        prob_of_abstr_mdp = mdp_distr.get_prob_of_mdp(mdp)
        mdp_distr_dict[abstr_mdp] = prob_of_abstr_mdp

    return MDPDistribution(mdp_distr_dict)
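# Usage sketch (hedged): `sa` and `aa` stand in for pre-built StateAbstraction and
# ActionAbstraction instances; they are illustrative and not defined in this file.
def _demo_abstract_distr(mdp_distr, sa, aa):
    abstr_distr = make_abstr_mdp_distr_multi_level(mdp_distr, sa, aa, step_cost=0.05)
    # Abstracting each ground MDP should preserve the total probability mass.
    total = sum(abstr_distr.get_prob_of_mdp(m) for m in abstr_distr.get_all_mdps())
    assert abs(total - 1.0) < 1e-6
    return abstr_distr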
def make_mdp_distr(mdp_class="grid", num_mdps=15, gamma=0.99): ''' Args: mdp_class (str): one of {"grid", "random"} num_mdps (int) Returns: (MDPDistribution) ''' mdp_dist_dict = {} mdp_prob = 1.0 / num_mdps height, width = 10, 10 # Make @num_mdps MDPs. for i in xrange(num_mdps): next_goals = rnd.sample([(1, 7), (7, 1), (7, 7), (6, 6), (6, 1), (1, 6)], 2) new_mdp = { "grid": GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=rnd.sample( zip(range(1, width + 1), [height] * width), 1), is_goal_terminal=True, gamma=gamma), "four_room": FourRoomMDP(width=8, height=8, goal_locs=next_goals, gamma=gamma), "chain": ChainMDP(num_states=10, reset_val=rnd.choice([0, 0.01, 0.05, 0.1]), gamma=gamma), "random": RandomMDP(num_states=40, num_rand_trans=rnd.randint(1, 10), gamma=gamma) }[mdp_class] mdp_dist_dict[new_mdp] = mdp_prob return MDPDistribution(mdp_dist_dict)
def make_mdp_distr(mdp_class, is_goal_terminal, mdp_size=11, horizon=0, gamma=0.99):
    '''
    Args:
        mdp_class (str): one of {"chain", "four_room", "corridor", "combo_lock", "spread", "tight"}
        is_goal_terminal (bool)
        mdp_size (int)
        horizon (int)
        gamma (float)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    height, width = mdp_size, mdp_size

    # Corridor.
    corr_width = 20
    corr_goal_magnitude = 1  # random.randint(1, 5)
    corr_goal_cols = [i for i in range(1, corr_goal_magnitude + 1)] + [j for j in range(corr_width - corr_goal_magnitude + 1, corr_width + 1)]
    corr_goal_locs = list(itertools.product(corr_goal_cols, [1]))

    # Grid World.
    tl_grid_world_rows, tl_grid_world_cols = [i for i in range(width - 4, width)], [j for j in range(height - 4, height)]
    tl_grid_goal_locs = list(itertools.product(tl_grid_world_rows, tl_grid_world_cols))
    tr_grid_world_rows, tr_grid_world_cols = [i for i in range(1, 4)], [j for j in range(height - 4, height)]
    tr_grid_goal_locs = list(itertools.product(tr_grid_world_rows, tr_grid_world_cols))
    grid_goal_locs = tl_grid_goal_locs + tr_grid_goal_locs

    # Four room.
    four_room_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2), (width - 2, 1)]

    # SPREAD vs. TIGHT.
    spread_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2), (width - 2, 1), (2, 2)]
    tight_goal_locs = [(width, height), (width - 1, height), (width, height - 1), (width, height - 2), (width - 2, height), (width - 1, height - 1), (width - 2, height - 2)]

    changing_entities = {"four_room": four_room_goal_locs,
                         "grid": grid_goal_locs,
                         "corridor": corr_goal_locs,
                         "spread": spread_goal_locs,
                         "tight": tight_goal_locs,
                         "chain": [0.0, 0.01, 0.1, 0.5, 1.0],
                         "combo_lock": [[3, 1, 2], [3, 2, 1], [2, 3, 1], [3, 3, 1]],
                         "walls": make_wall_permutations(mdp_size),
                         "lava": make_lava_permutations(mdp_size)}

    # MDP Probability.
    num_mdps = 10 if mdp_class not in changing_entities.keys() else len(changing_entities[mdp_class])
    if mdp_class == "octo":
        num_mdps = 12
    mdp_prob = 1.0 / num_mdps

    for i in range(num_mdps):
        new_mdp = {"chain": ChainMDP(reset_val=changing_entities["chain"][i % len(changing_entities["chain"])]),
                   # "lava": GridWorldMDP(width=width, height=height, rand_init=False, step_cost=-0.001, lava_cost=0.0, lava_locs=changing_entities["lava"][i % len(changing_entities["lava"])], goal_locs=[(mdp_size - 3, mdp_size - 3)], is_goal_terminal=is_goal_terminal, name="lava_world", slip_prob=0.1),
                   "four_room": FourRoomMDP(width=width, height=height, goal_locs=[changing_entities["four_room"][i % len(changing_entities["four_room"])]], is_goal_terminal=is_goal_terminal),
                   # "octo": make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False, goal_num=i),
                   "corridor": GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[changing_entities["corridor"][i % len(changing_entities["corridor"])]], is_goal_terminal=is_goal_terminal, name="corridor"),
                   "combo_lock": ComboLockMDP(combo=changing_entities["combo_lock"][i % len(changing_entities["combo_lock"])]),
                   "spread": GridWorldMDP(width=width, height=height, rand_init=False, goal_locs=[changing_entities["spread"][i % len(changing_entities["spread"])]], is_goal_terminal=is_goal_terminal, name="spread_grid"),
                   "tight": GridWorldMDP(width=width, height=height, rand_init=False, goal_locs=[changing_entities["tight"][i % len(changing_entities["tight"])]], is_goal_terminal=is_goal_terminal, name="tight_grid")}[mdp_class]
        new_mdp.set_gamma(gamma)
        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict, horizon=horizon)
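# Sanity-check sketch: by construction above, the support size equals the number
# of changing entities for the chosen class (six goal locations for "four_room").
def _demo_support_size():
    distr = make_mdp_distr(mdp_class="four_room", is_goal_terminal=True, mdp_size=11)
    print("Support size:", len(distr.get_all_mdps()))  # expected: 6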
def make_mdp_distr(mdp_class="grid", grid_dim=7, horizon=0): ''' Args: mdp_class (str): one of {"grid", "random"} horizon (int) Returns: (MDPDistribution) ''' mdp_dist_dict = {} height, width = grid_dim, grid_dim # Define goal locations. # Corridor. corr_width = 20 corr_goal_magnitude = random.randint(1, 5) corr_goal_cols = [i for i in xrange(1, corr_goal_magnitude)] + [ j for j in xrange(corr_width - corr_goal_magnitude, corr_width + 1) ] corr_goal_locs = list(itertools.product(corr_goal_cols, [1])) # Grid World grid_world_rows, grid_world_cols = [i for i in xrange(width - 4, width)], [ j for j in xrange(height - 4, height) ] grid_goal_locs = list(itertools.product(grid_world_rows, grid_world_cols)) # Hallway. hall_goal_locs = [(i, width) for i in range(1, height + 1)] # Four room. four_room_goal_locs = [(2, 2), (width, height), (width, 1), (1, height)] # Taxi. agent = {"x": 1, "y": 1, "has_passenger": 0} walls = [] goal_loc_dict = { "four_room": four_room_goal_locs, "hall": hall_goal_locs, "grid": grid_goal_locs, "corridor": corr_goal_locs } # MDP Probability. num_mdps = 10 if mdp_class not in goal_loc_dict.keys() else len( goal_loc_dict[mdp_class]) mdp_prob = 1.0 / num_mdps for i in range(num_mdps): new_mdp = {"hall":GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=[goal_loc_dict["hall"][i % len(goal_loc_dict["hall"])]]), "corridor":GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[goal_loc_dict["corridor"][i % len(goal_loc_dict["corridor"])]], is_goal_terminal=True), "grid":GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=[goal_loc_dict["grid"][i % len(goal_loc_dict["grid"])]], is_goal_terminal=True), "four_room":FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]]), # THESE GOALS ARE SPECIFIED IMPLICITLY: "pblocks_grid":make_grid_world_from_file("pblocks_grid.txt", randomize=True), "chain":ChainMDP(num_states=10, reset_val=random.choice([0, 0.01, 0.05, 0.1, 0.2, 0.5])), "random":RandomMDP(num_states=40, num_rand_trans=random.randint(1,10)), "taxi":TaxiOOMDP(4, 4, slip_prob=0.0, agent=agent, walls=walls, \ passengers=[{"x":2, "y":2, "dest_x":random.randint(1,4), "dest_y":random.randint(1,4), "in_taxi":0}])}[mdp_class] mdp_dist_dict[new_mdp] = mdp_prob return MDPDistribution(mdp_dist_dict, horizon=horizon)
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()
    num_test_mdps = 6  # 6 is max.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy

    if params['multitask']:
        # Make distribution. Only four gravity settings are listed, so the weight is
        # computed from the truncated list to keep the probabilities normalized.
        gravities = [5.0, 6.0, 8.0, 12.0][:num_test_mdps]
        mdp_dist_dict = {CartPoleMDP(gravity=gravity): 1.0 / len(gravities) for gravity in gravities}
        test_mdp = MDPDistribution(mdp_dist_dict)
    else:
        test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features, alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"alpha": params['rl_learning_rate'],
                                                "epsilon": 0.2,
                                                "actions": test_mdp.get_actions()},
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    if params['multitask']:
        run_agents_lifelong([sa_agent, linear_agent], test_mdp, samples=params['num_instances'], episodes=params['episodes'], steps=params['steps'], verbose=False)
    else:
        # demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)
        run_agents_on_mdp([sa_agent, linear_agent], test_mdp, instances=params['num_instances'], episodes=params['episodes'], steps=params['steps'], verbose=False)
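# Standard entry point so the experiment runs when this script is executed directly.
if __name__ == "__main__":
    main()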
def make_mdp_distr(mdp_class="grid", grid_dim=9, horizon=0, step_cost=0, gamma=0.99): ''' Args: mdp_class (str): one of {"grid", "random"} horizon (int) step_cost (float) gamma (float) Returns: (MDPDistribution) ''' mdp_dist_dict = {} height, width = grid_dim, grid_dim # Define goal locations. # Corridor. corr_width = 20 corr_goal_magnitude = 1 #random.randint(1, 5) corr_goal_cols = [i for i in range(1, corr_goal_magnitude + 1)] + [ j for j in range(corr_width - corr_goal_magnitude + 1, corr_width + 1) ] corr_goal_locs = list(itertools.product(corr_goal_cols, [1])) # Grid World tl_grid_world_rows, tl_grid_world_cols = [ i for i in range(width - 4, width) ], [j for j in range(height - 4, height)] tl_grid_goal_locs = list( itertools.product(tl_grid_world_rows, tl_grid_world_cols)) tr_grid_world_rows, tr_grid_world_cols = [i for i in range(1, 4)], [ j for j in range(height - 4, height) ] tr_grid_goal_locs = list( itertools.product(tr_grid_world_rows, tr_grid_world_cols)) grid_goal_locs = tl_grid_goal_locs + tr_grid_goal_locs # Hallway. hall_goal_locs = [(i, height) for i in range(1, 30)] # Four room. four_room_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2)] #, (width - 2, 1)] # Taxi. agent = {"x": 1, "y": 1, "has_passenger": 0} walls = [] goal_loc_dict = { "four_room": four_room_goal_locs, "hall": hall_goal_locs, "grid": grid_goal_locs, "corridor": corr_goal_locs, } # MDP Probability. num_mdps = 10 if mdp_class not in goal_loc_dict.keys() else len( goal_loc_dict[mdp_class]) if mdp_class == "octo": num_mdps = 12 mdp_prob = 1.0 / num_mdps for i in range(num_mdps): new_mdp = {"hrooms":make_grid_world_from_file("hierarch_rooms.txt", num_goals=7, randomize=False), "octo":make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False, goal_num=i), "hall":GridWorldMDP(width=30, height=height, rand_init=False, goal_locs=goal_loc_dict["hall"], name="hallway", is_goal_terminal=True), "corridor":GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[goal_loc_dict["corridor"][i % len(goal_loc_dict["corridor"])]], is_goal_terminal=True, name="corridor"), "grid":GridWorldMDP(width=width, height=height, rand_init=True, goal_locs=[goal_loc_dict["grid"][i % len(goal_loc_dict["grid"])]], is_goal_terminal=True), "four_room":FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]], is_goal_terminal=True), # THESE GOALS ARE SPECIFIED IMPLICITLY: "pblocks_grid":make_grid_world_from_file("pblocks_grid.txt", randomize=True, slip_prob=0.1), "chain":ChainMDP(num_states=10, reset_val=random.choice([0, 0.01, 0.05, 0.1, 0.2, 0.5])), "random":RandomMDP(num_states=40, num_rand_trans=random.randint(1,10)), "taxi":TaxiOOMDP(3, 4, slip_prob=0.0, agent=agent, walls=walls, \ passengers=[{"x":2, "y":1, "dest_x":random.choice([2,3]), "dest_y":random.choice([2,3]), "in_taxi":0}, {"x":1, "y":2, "dest_x":random.choice([1,2]), "dest_y":random.choice([1,4]), "in_taxi":0}])}[mdp_class] new_mdp.set_step_cost(step_cost) new_mdp.set_gamma(gamma) mdp_dist_dict[new_mdp] = mdp_prob return MDPDistribution(mdp_dist_dict, horizon=horizon)
def make_mdp_distr(mdp_class="grid", grid_dim=9, horizon=0, step_cost=0, gamma=0.99): ''' Args: mdp_class (str): one of {"grid", "random"} horizon (int) step_cost (float) gamma (float) Returns: (MDPDistribution) ''' mdp_dist_dict = {} height, width = grid_dim, grid_dim # Define goal locations. # Corridor. corr_width = 20 corr_goal_magnitude = 1 #random.randint(1, 5) corr_goal_cols = [i for i in xrange(1, corr_goal_magnitude + 1)] + [j for j in xrange(corr_width-corr_goal_magnitude + 1, corr_width + 1)] corr_goal_locs = list(itertools.product(corr_goal_cols, [1])) # Grid World tl_grid_world_rows, tl_grid_world_cols = [i for i in xrange(width - 4, width)], [j for j in xrange(height - 4, height)] tl_grid_goal_locs = list(itertools.product(tl_grid_world_rows, tl_grid_world_cols)) tr_grid_world_rows, tr_grid_world_cols = [i for i in xrange(1, 4)], [j for j in xrange(height - 4, height)] tr_grid_goal_locs = list(itertools.product(tr_grid_world_rows, tr_grid_world_cols)) grid_goal_locs = tl_grid_goal_locs + tr_grid_goal_locs # Hallway. upworld_goal_locs = [(i, height) for i in xrange(1, 30)] # Four room. four_room_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2), (width - 2, 1)] print four_room_goal_locs tight_four_room_goal_locs = [(width, height), (width, height-1), (width-1, height), (width, height - 2), (width - 2, height), (width-1, height-1)] # Taxi. agent = {"x":1, "y":1, "has_passenger":0} walls = [] goal_loc_dict = {"four_room":four_room_goal_locs, "color":four_room_goal_locs, "upworld":upworld_goal_locs, "grid":grid_goal_locs, "corridor":corr_goal_locs, "tight_four_room":tight_four_room_goal_locs, } # MDP Probability. num_mdps = 10 if mdp_class not in goal_loc_dict.keys() else len(goal_loc_dict[mdp_class]) if mdp_class == "octo": num_mdps = 12 mdp_prob = 1.0 / num_mdps for i in xrange(num_mdps): new_mdp = {"hrooms":make_grid_world_from_file("hierarch_rooms.txt", num_goals=7, randomize=False), "octo":make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False, goal_num=i), "upworld":GridWorldMDP(width=30, height=height, rand_init=False, goal_locs=goal_loc_dict["upworld"], name="upworld", is_goal_terminal=True), "corridor":GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[goal_loc_dict["corridor"][i % len(goal_loc_dict["corridor"])]], is_goal_terminal=True, name="corridor"), "grid":GridWorldMDP(width=width, height=height, rand_init=True, goal_locs=[goal_loc_dict["grid"][i % len(goal_loc_dict["grid"])]], is_goal_terminal=True), "four_room":FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]], is_goal_terminal=True), "color":ColorMDP(width=width, height=height, num_colors=4, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]], is_goal_terminal=True), "tight_four_room":FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["tight_four_room"][i % len(goal_loc_dict["tight_four_room"])]], is_goal_terminal=True, name="tight_four_room")}[mdp_class] new_mdp.set_step_cost(step_cost) new_mdp.set_gamma(gamma) mdp_dist_dict[new_mdp] = mdp_prob return MDPDistribution(mdp_dist_dict, horizon=horizon)
def make_env_distribution(env_class='grid-world', env_name=None, n_env=10, gamma=.9, version=1, w=5, h=5,
                          stochastic=False, horizon=0, verbose=True):
    """
    Create a distribution over environments.
    This function is specialized to the included environments.
    :param env_class: (str) name of the environment class
    :param env_name: (str) name of the environment for save path
    :param n_env: (int) number of environments in the distribution
    :param gamma: (float) discount factor
    :param version: (int) in case a version indicator is needed
    :param w: (int) width for grid-world
    :param h: (int) height for grid-world
    :param stochastic: (bool) some environments may be stochastic
    :param horizon: (int)
    :param verbose: (bool) print info if True
    :return: (MDPDistribution)
    """
    if verbose:
        print('Creating environments of class', env_class)
    sampling_probability = 1. / float(n_env)
    env_dist_dict = {}

    # Pre-defined collections: the support is fixed, so return it directly.
    if env_class == 'octo-grid':
        return MDPDistribution(octo_grid_collection(gamma, env_name), horizon=horizon)
    elif env_class == 'deterministic-tight':
        return MDPDistribution(tight_collection(gamma, env_name), horizon=horizon)
    elif env_class == 'deterministic-super-tight':
        return MDPDistribution(super_tight_collection(gamma, env_name), horizon=horizon)
    elif env_class == 'deterministic-super-tight-big':
        return MDPDistribution(super_tight_collection_big(gamma, env_name), horizon=horizon)
    elif env_class == 'stochastic-super-tight':
        return MDPDistribution(super_tight_collection(gamma, env_name, sto=True), horizon=horizon)
    elif env_class == 'stochastic-super-tight-big':
        return MDPDistribution(super_tight_collection_big(gamma, env_name, sto=True), horizon=horizon)
    elif env_class == 'deterministic-tight-big':
        return MDPDistribution(tight_collection_big(gamma, env_name), horizon=horizon)
    elif env_class == 'deterministic-tight-small':
        return MDPDistribution(tight_collection_small(gamma, env_name), horizon=horizon)
    elif env_class == 'deterministic-tight-super-big':
        return MDPDistribution(tight_collection_super_big(gamma, env_name), horizon=horizon)
    elif env_class == 'stochastic-tight':
        return MDPDistribution(tight_collection(gamma, env_name, sto=True), horizon=horizon)
    elif env_class == 'stochastic-tight-big':
        return MDPDistribution(tight_collection_big(gamma, env_name, sto=True), horizon=horizon)
    elif env_class == 'stochastic-tight-small':
        return MDPDistribution(tight_collection_small(gamma, env_name, sto=True), horizon=horizon)
    elif env_class == 'deterministic-spread':
        return MDPDistribution(deterministic_spread_collection(gamma, env_name), horizon=horizon)
    elif env_class == 'four-room':
        return MDPDistribution(four_room_collection(gamma, env_name, size=7), horizon=horizon)
    elif env_class == 'four-room-big':
        return MDPDistribution(four_room_collection(gamma, env_name, size=11), horizon=horizon)

    # Sampled collections: draw n_env environments, each with equal probability.
    for _ in range(n_env):
        if env_class == 'grid-world':
            new_env = sample_grid_world(gamma, env_name, w, h, verbose)
        elif env_class == 'corridor':
            new_env = sample_corridor(gamma, env_name, w, verbose)
        elif env_class == 'heat-map':
            new_env = sample_heat_map(gamma, env_name, w, h, verbose)
        elif env_class == 'maze-multi-walls':
            new_env = sample_maze_multi(gamma, env_name, verbose)
        elif env_class == 'maze-mono-goal':
            new_env = sample_maze_mono(gamma, env_name, verbose)
        elif env_class == 'tight':
            new_env = sample_tight(gamma, env_name, version, w, h, stochastic, verbose)
        elif env_class == 'test':
            new_env = sample_test_environment(gamma)
        else:
            raise ValueError('Environment class not implemented.')
        env_dist_dict[new_env] = sampling_probability

    return MDPDistribution(env_dist_dict, horizon=horizon)
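# Usage sketch (parameters illustrative): build a small grid-world distribution
# and inspect its support via simple_rl's MDPDistribution accessors.
def _demo_env_distribution():
    distr = make_env_distribution(env_class='grid-world', env_name='demo', n_env=5, gamma=0.9, w=5, h=5)
    print('Support size:', len(distr.get_all_mdps()))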