def __init__(self, env, max_layer):
    self.env = env
    self.modifications = []
    self.counter = 0
    self.nodes = []

    # Train a baseline agent on the original environment so that only
    # cells it actually visits become candidate modifications.
    agent = w_QAgent(env)
    agent.qlearn(3000, render=False)
    cell_dict = cell_frequency(agent)
    for element in cell_dict:
        if element[1] != 0:
            # Each modification is a (type, row, col) triple:
            # type 0 adds a jump cell, type 1 adds a special cell.
            self.modifications.append((0, element[0][0], element[0][1]))
            self.modifications.append((1, element[0][0], element[0][1]))
    self.modifications.sort()

    self.num_nodes = 0
    self.root = None
    self.max_layer = max_layer
    self.threshold = 10.75

    # Storing best reward and corresponding environment
    self.max_reward = float("-inf")
    self.opt_env = None
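# A minimal usage sketch of the tree, mirroring how batch_greedy drives it
# below (the iteration count here is illustrative, not prescriptive):
#   tree = Tree(env, max_layer=3)
#   tree.initialize()
#   tree.ucb_search(iterations=200)
#   vector, best_val = tree.best_observed_choice()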
def potency(mutual_ls, agent, modified, num_episodes, index):
    # Tests the potency of the connected q-learning paradigm.
    # Parameters
    # ==============================================
    # mutual_ls: shared list collecting one score per worker process
    # agent: pre-trained agent in some environment
    # modified: new environment
    # num_episodes: number of episodes trained in connected q-learning paradigm
    # index: slot of mutual_ls this call writes its result to
    # ==============================================
    series = agent.env.resettable_states()
    conn_agent = connected_qlearn(agent, modified, num_episodes)
    l_vals = []
    for state in series:
        res = conn_agent.eval(fixed=state, show=False)[1]
        l_vals.append(res)

    # Baseline: an agent trained from scratch on the modified environment.
    new_agent = w_QAgent(modified)
    new_agent.qlearn(3500, show=False, render=False)
    n_vals = []
    for state in series:
        res = new_agent.eval(fixed=state, show=False)[1]
        n_vals.append(res)

    # Potency score: absolute gap between the two agents' total returns.
    l_vals = np.array(l_vals)
    n_vals = np.array(n_vals)
    a = abs(np.sum(l_vals) - np.sum(n_vals))
    mutual_ls[index] = a
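# Hedged usage sketch: potency() is shaped like a multiprocessing worker, so a
# plausible driver (an assumption, not code from this repo) fans it out over
# several modified environments and collects the scores. `_potency_demo` is a
# hypothetical helper; mp and Manager are the same imports used elsewhere here.
def _potency_demo(agent, modified_envs, num_episodes=1500):
    manager = Manager()
    mutual_ls = manager.list([0] * len(modified_envs))  # one slot per worker
    workers = []
    for idx, modified in enumerate(modified_envs):
        p = mp.Process(target=potency,
                       args=(mutual_ls, agent, modified, num_episodes, idx))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()
    return list(mutual_ls)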
def default_policy(self, node_index):
    start = node_index
    simulate_env = copy.deepcopy(self.nodes[start].env)

    # Modifications already applied = extra jump/special cells relative to
    # the original environment.
    num_modifications_applied = (len(simulate_env.jump_cells)
                                 + len(simulate_env.special)
                                 - len(self.env.special)
                                 - len(self.env.jump_cells))
    mods_left = self.max_layer - num_modifications_applied

    # Choose from unused modifications, from the start node. tree.nodes[start]
    # is a leaf, so no modification has been used at start yet. Restricting to
    # indices after the leaf's own keeps each sampled combination unique.
    ls = list(range(self.nodes[start].modification + 1, len(self.modifications)))
    try:
        a = random.sample(ls, k=mods_left)
    except ValueError:
        # Not enough unused modifications left to sample from.
        print(ls)
        print(num_modifications_applied)
        raise
    a = sorted(a)

    for element in a:
        mod = self.modifications[element]
        if mod[0] == 0:
            simulate_env.jump_cells.append((mod[1], mod[2]))
        elif mod[0] == 1:
            simulate_env.special.append((mod[1], mod[2]))

    # Training
    agent = w_QAgent(simulate_env)
    agent.qlearn(3000, show=False)
    reward = utility(agent)

    if reward > self.threshold:
        print(colored(a, "red"))
        print(colored(reward, "red"))
        # Promising rollout: materialize its modifications as tree nodes.
        for element in a:
            start = self.add_node(element, start).index
        # Update tree's max reward if possible
        if reward > self.max_reward:
            self.max_reward = reward
            self.opt_env = simulate_env
        return [self.scale(reward), start]
    return self.scale(reward)
def connected_qlearn(agent, new_env, num_episodes):
    # Parameters
    # ==============================================
    # agent: pre-trained agent in some environment
    # new_env: new environment
    # ==============================================
    # We use the pre-trained agent's q-values as the starting point in the new
    # environment. The intuition is that the q-values need only slight changes,
    # so recomputing them from scratch would be computationally wasteful.
    linked_agent = w_QAgent(new_env)
    linked_agent.q = copy.deepcopy(agent.q)  # link the q-values together
    linked_agent.epsilon = 0.75
    linked_agent.qlearn(num_episodes, render=False)
    return linked_agent
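# Hedged sketch comparing warm-started and from-scratch training on a slightly
# modified environment. `_connected_qlearn_demo` is a hypothetical helper; the
# modification triple and episode counts are illustrative, and `make_env` and
# `utility` are this module's own functions.
def _connected_qlearn_demo(agent):
    modified = make_env(agent.env, [(0, 2, 3)])  # hypothetical jump cell
    warm = connected_qlearn(agent, modified, num_episodes=1000)
    cold = w_QAgent(modified)
    cold.qlearn(3500, render=False)
    return utility(warm), utility(cold)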
def best_observed_choice(self):
    # Collect the modifications present in the best environment but not in
    # the original, as (type, row, col) triples.
    vector = []
    for jump in self.opt_env.jump_cells:
        if jump not in self.env.jump_cells:
            tup = (0, jump[0], jump[1])
            vector.append(tup)
    for cell in self.opt_env.special:
        if cell not in self.env.special:
            tup = (1, cell[0], cell[1])
            vector.append(tup)

    # Training to prevent errors arising from connected training
    agent = w_QAgent(self.opt_env)
    agent.qlearn(3500)
    rews = utility(agent)
    x = max(rews, self.max_reward)
    return (vector, x)
def greedy(self):
    walk = []
    start = 0
    while self.nodes[start].layer < self.max_layer:
        if len(self.nodes[start].visited_children) != 0:
            start = self.best_child(start, 0, 0, expanded=False)
            mod_index = self.nodes[start].modification
            walk.append(self.modifications[mod_index])
        else:
            # No expanded child to descend into; stop the walk early
            # (without this break the loop would never terminate here).
            break
    if len(walk) < self.max_layer:
        print("MCTS insufficient to get {} modifications!".format(self.max_layer))
        return (walk, None)
    else:
        modified = make_env(self.env, walk)
        agent = w_QAgent(modified)
        agent.qlearn(3000, render=False)
        rews = utility(agent)
        return (walk, rews)
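# Note: greedy() walks the best child at each layer of the tree, while
# best_observed_choice() returns the single best environment seen during any
# rollout; the two can disagree. A hedged fallback pattern:
#   walk, rews = tree.greedy()
#   if rews is None:  # tree was not expanded down to max_layer
#       walk, rews = tree.best_observed_choice()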
def default_policy(self, node_index):
    # Variant of default_policy that also handles rollouts from the root
    # (node_index == 0) and returns the raw, unscaled reward without the
    # threshold-based node expansion used in the version above.
    start = node_index
    simulate_env = copy.deepcopy(self.nodes[start].env)
    num_modifications_applied = (len(simulate_env.jump_cells)
                                 + len(simulate_env.special)
                                 - len(self.env.special)
                                 - len(self.env.jump_cells))
    mods_left = self.max_layer - num_modifications_applied

    # Choose from unused modifications, from the start node. tree.nodes[start]
    # is a leaf, so no modification has been used at start yet.
    if node_index != 0:
        ls = list(range(self.nodes[start].modification + 1, len(self.modifications)))
    else:
        ls = list(range(len(self.modifications)))
    a = sorted(random.sample(ls, k=mods_left))

    for element in a:
        mod = self.modifications[element]
        if mod[0] == 0:
            simulate_env.jump_cells.append((mod[1], mod[2]))
        elif mod[0] == 1:
            simulate_env.special.append((mod[1], mod[2]))

    # Training
    agent = w_QAgent(simulate_env)
    agent.qlearn(3000, show=False)
    reward = utility(agent)

    if reward > self.max_reward:
        self.max_reward = reward
        self.opt_env = copy.deepcopy(simulate_env)
    return reward
else:
    chosen_vectors = ls[0:(rounds * num_processes)]

for rnd in range(rounds):
    print(colored("Data addition round {} begins!".format(rnd), "red"))
    processes = []  # reset per round so only this round's workers are joined
    for i in range(num_processes):
        if i + rnd * num_processes >= len(chosen_vectors):
            break
        v = chosen_vectors[i + rnd * num_processes]
        modified = make_env(env, v)
        categories.append(v)
        agent = w_QAgent(modified)
        p = mp.Process(target=qlearn_as_func,
                       args=(agent, modified, i, agents, i + rnd * num_processes))
        p.start()
        processes.append(p)
    for process in processes:
        process.join()
    for process in processes:
        process.terminate()

for i in range(len(agents)):
    if agents[i] != 0:
        ut = utility(agents[i])
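# `qlearn_as_func` is defined elsewhere in the repo; based solely on how it is
# invoked above, a plausible sketch (an assumption, not the actual definition)
# trains the worker's agent and publishes it through the shared Manager list:
def _qlearn_as_func_sketch(agent, modified, proc_index, agents, keep_index):
    agent.qlearn(3000, render=False)  # train inside the worker process
    agents[keep_index] = agent        # overwrite the 0 placeholder in the shared list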
            ref_env.jump_cells.append((element[1], element[2]))
        else:
            ref_env.special.append((element[1], element[2]))
    return ref_env


def cell_frequency(agent):
    # Count how often each grid cell is visited across evaluation runs from
    # every resettable start state; returns ((row, col), count) pairs sorted
    # by descending count.
    dict_return = {}
    for row in range(agent.env.width):
        for col in range(agent.env.length):
            dict_return[(row, col)] = 0
    ls = agent.env.resettable_states()
    for i in range(len(ls)):
        states = agent.eval(show=False, fixed=ls[i])[2]
        for state in states:
            dict_return[(state[0], state[1])] += 1
    dict_return = sorted(dict_return.items(), key=lambda x: -x[1])
    return dict_return


if __name__ == "__main__":
    agent = w_QAgent(env)
    agent.qlearn(3000, render=False)
    cell_dict = cell_frequency(agent)
    count = 0
    for elem in cell_dict:
        if elem[1] == 0:
            print(elem[0])  # cells the trained agent never visits
rounds = 300
mp.set_start_method("spawn")
num_processes = 10
processes = []
manager = Manager()
agents = manager.list()
for i in range(rounds * num_processes):
    agents.append(0)  # keeper: 0 marks an empty slot until a worker fills it

categories = []
num_mods = 4
map_to_numpy = np.asarray(map, dtype="c")
env = WindyGridworld()  # reference environment

orig_agent = w_QAgent(env)
orig_agent.qlearn(3000, render=False)
cell_dict = cell_frequency(orig_agent)

modifications = []
for element in cell_dict:
    if element[1] != 0:
        modifications.append((0, element[0][0], element[0][1]))
        modifications.append((1, element[0][0], element[0][1]))
modifications.sort()

ls = None
if num_mods == 1:
    ls = [[elem] for elem in modifications]
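# For num_mods > 1 the candidate list is presumably built from combinations of
# the single modifications; a hedged sketch of that elided branch (itertools is
# standard library, and this is an assumption about code not shown here):
#   from itertools import combinations
#   ls = [list(c) for c in combinations(modifications, num_mods)]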
ls[i] = np.reshape(ls[i], (num_mods * 3))
ls = np.array(ls)
vector = model.predict(ls)

# Keep track of max and corresponding environment
s = vector.shape
a = np.reshape(vector, (s[0] * s[1]))
index = np.argmax(a)

# Vector at index of highest prediction
corr_vec = ls[index]
coor_vec = list(np.reshape(corr_vec, (len(corr_vec) // 3, 3)))
res_env = make_env(env, coor_vec)
agent = w_QAgent(res_env)
agent.qlearn(3000, render=False)
opt_val = utility(agent)

# Re-format found vector into (type, row, col) tuples
x = copy.deepcopy(coor_vec)
for i in range(len(x)):
    x[i] = tuple(x[i])

r_dir = os.path.abspath(os.pardir)
data_dir = os.path.join(r_dir, "data-wgr")
file_dir = os.path.join(data_dir, "sl_nh_result_{}.txt".format(num_mods))
with open(file_dir, "w") as file:
    file.write("Modifications: ")
    file.write(str(x))
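# `model` above comes from earlier in this script; a minimal sketch of a
# surrogate that fits the usage here (flattened (type, row, col) vectors in,
# one predicted utility out). The architecture is an assumption:
#   from tensorflow import keras
#   model = keras.Sequential([
#       keras.layers.Dense(64, activation="relu", input_shape=(num_mods * 3,)),
#       keras.layers.Dense(64, activation="relu"),
#       keras.layers.Dense(1),  # predicted utility of the modification vector
#   ])
#   model.compile(optimizer="adam", loss="mse")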
def batch_greedy(env, num_mods, num_mods_per_run, ls_num_iters):
    # Parameters
    # =========================================================
    # env: original environment
    # num_mods: total number of modifications
    # num_mods_per_run: number of modifications considered in combination per run
    # ls_num_iters: list of the number of iterations to run for each number of modifications
    # =========================================================
    # Example: [50, 200] means that if num_mods == 1, run 50 iterations, and
    # if num_mods == 2, run 200 iterations.
    ref = copy.deepcopy(env)
    mods_ret = []  # answer of this algorithm

    # Initialize an MCTS tree
    tree = Tree(env, max_layer=num_mods)
    tree.initialize()

    # Keep a running count of modifications still to place
    count = num_mods
    # Keep a running list of candidate modifications
    ls_mods = copy.deepcopy(tree.modifications)
    # Initialize baseline
    baseline = 10.12

    assert count >= num_mods_per_run
    assert len(ls_num_iters) == num_mods_per_run

    while count > 0:
        print(colored(ls_mods, "red"))
        n = num_mods_per_run if count >= num_mods_per_run else count

        tree = Tree(ref, max_layer=n)
        tree.initialize()
        tree.threshold = baseline
        tree.modifications = copy.deepcopy(ls_mods)

        # Find the number of iterations for this batch size
        num_iter = ls_num_iters[n - 1]

        # Perform an MCTS search
        tree.ucb_search(iterations=num_iter)
        a = tree.best_observed_choice()
        for elem in a[0]:
            mods_ret.append(elem)

        # Transform the environment
        for elem in a[0]:
            if elem[0] == 0:  # jump cell
                ref.jump_cells.append((elem[1], elem[2]))
            elif elem[0] == 1:  # special cell
                ref.special.append((elem[1], elem[2]))
            ls_mods.remove(elem)

        count -= n
        # Raise the acceptance baseline as modifications accumulate
        baseline += 0.4 * n

    # Find utility of the fully modified environment
    agent = w_QAgent(ref)
    agent.qlearn(3000)
    rews = utility(agent)
    return (mods_ret, rews)
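# Hedged usage sketch matching the docstring above: four modifications total,
# searched two at a time, with per-size iteration budgets (values illustrative):
#   mods, rews = batch_greedy(env, num_mods=4, num_mods_per_run=2,
#                             ls_num_iters=[50, 200])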