Example #1
    def __init__(self, env, max_layer):
        self.env = env
        self.modifications = []
        self.counter = 0
        self.nodes = []
        
        # List of all possible modifications: tree.modifications
        for wall in env.walls:
            self.modifications.append((0, wall[0], wall[1]))
        for row in range(env.width):
            for col in range(env.length):
                self.modifications.append((1, row, col))
        
        self.num_nodes = 0
        self.root = None
        self.threshold = 8

        # Train original agent
        self.agent = QAgent(self.env)
        self.agent.qlearn(500, show=False, render=False)
        self.env.reset()

        self.max_layer = max_layer

        # Storing tree's max reward with corresponding environment
        self.max_reward = float("-inf")
        self.opt_env = None
Example #2
def potency(mutual_ls, agent, modified, num_episodes, index):
    # This function tests the potency of the connected q-learning paradigm.
    # Parameters
    # ==============================================
    # mutual_ls: shared list (e.g. a multiprocessing Manager list) that collects results
    # agent: pre-trained agent in some environment
    # modified: new (modified) environment
    # num_episodes: number of episodes trained in the connected q-learning paradigm
    # index: position in mutual_ls where this call's result is stored
    # ==============================================

    series = agent.env.resettable_states()

    conn_agent = connected_qlearn(agent, modified, num_episodes)
    l_vals = []

    for state in series:
        res = conn_agent.eval(fixed=state, show=False)[1]
        l_vals.append(res)

    new_agent = QAgent(modified)
    new_agent.qlearn(600, show=False, render=False)
    n_vals = []

    for state in series:
        res = new_agent.eval(fixed=state, show=False)[1]
        n_vals.append(res)

    l_vals = np.array(l_vals)
    n_vals = np.array(n_vals)

    a = abs(np.sum(l_vals) - np.sum(n_vals))
    
    mutual_ls[index] = a
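
A minimal usage sketch (not part of the original code): potency can also be called directly in a single process, with a plain Python list standing in for the shared Manager list. It assumes env, a modified environment named modified, and QAgent from this codebase are in scope.

if __name__ == "__main__":
    # Pre-train an agent on the original environment
    base_agent = QAgent(env)
    base_agent.qlearn(650, show=False, render=False)

    results = [0]  # a plain list suffices when no multiprocessing is involved
    potency(results, base_agent, modified, 400, 0)
    print("Potency gap:", results[0])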
Example #3
    def __init__(self, env, max_layer):
        self.env = env
        self.modifications = []
        self.counter = 0
        self.nodes = []

        agent = QAgent(env)
        agent.qlearn(600, show=False)
        wall_dict = wall_interference(agent)
        cell_dict = cell_frequency(agent)

        for element in wall_dict:
            self.modifications.append((0, element[0][0], element[0][1]))

        for element in cell_dict[0:14]:
            self.modifications.append((1, element[0][0], element[0][1]))

        self.num_nodes = 0
        self.root = None
        self.max_layer = max_layer
        self.threshold = 8

        # Storing best reward and corresponding environment
        self.max_reward = float("-inf")
        self.opt_env = None
Example #4
    def default_policy(self, node_index):
        start = node_index
        simulate_env = copy.deepcopy(self.nodes[start].env)
        num_modifications_applied = len(self.env.walls) - len(
            simulate_env.walls) + len(simulate_env.special) - len(
                self.env.special)
        mods_left = self.max_layer - num_modifications_applied

        # Choose from unused modifications, from start node
        # We know that tree.nodes[start] is a leaf, so there are no used modifications at start yet.
        ls = []
        for i in range(self.nodes[start].modification + 1,
                       len(self.modifications)):
            ls.append(i)

        try:
            a = random.sample(ls, k=mods_left)

        except ValueError:
            # Not enough unused modifications left to sample mods_left of them
            print(ls)
            print(num_modifications_applied)
            raise

        a = sorted(a)
        for element in a:
            mod = self.modifications[element]
            if mod[0] == 0:
                simulate_env = simulate_env.transition([(mod[1], mod[2])])
            elif mod[0] == 1:
                simulate_env.special.append((mod[1], mod[2]))

        # Training
        agent = QAgent(simulate_env)
        agent.qlearn(600, render=False)
        reward = utility(agent)

        if reward > self.threshold + 0.5:
            print(colored(a, "red"))
            print(colored(reward, "red"))
            for element in a:
                start = self.add_node(element, start).index

            # Update tree's max reward if possible
            if reward > self.max_reward:
                self.max_reward = reward
                self.opt_env = simulate_env

            return [self.scale(reward), start]

        return self.scale(reward)
Example #5
    def best_observed_choice(self):
        vector = []
        for wall in self.env.walls:
            if wall not in self.opt_env.walls:
                tup = (0, wall[0], wall[1])
                vector.append(tup)

        for cell in self.opt_env.special:
            if cell not in self.env.special:
                tup = (1, cell[0], cell[1])
                vector.append(tup)

        # Train a fresh agent to avoid errors carried over from connected training
        agent = QAgent(self.opt_env)
        agent.qlearn(600)
        rews = utility(agent)

        return (vector, rews)
Example #6
    def greedy(self):
        walk = []
        start = 0
        while self.nodes[start].layer < self.max_layer:
            if len(self.nodes[start].visited_children) != 0:
                start = self.best_child(start, 0, 0, expanded=False)
                mod_index = self.nodes[start].modification
                walk.append(self.modifications[mod_index])
            else:
                # No expanded child to follow; stop so the shortfall is reported below
                break
        
        if len(walk) < self.max_layer:
            print("MCTS insufficient to get {} modifications!".format(self.max_layer))
            return (walk, None)

        else:
            modified = make_env(self.env, walk)
            agent = QAgent(modified)
            agent.qlearn(600, render=False)
            rews = utility(agent)
            return (walk, rews)
Example #7
def path_based(env, num_mods):
    # Train agent in original environment
    agent = QAgent(env)
    agent.qlearn(600, render=False)
    cell_dict = cell_frequency(agent)
    wall_dict = wall_interference(agent)

    opt_seq = None
    opt_val = float("-inf")

    for k in range(num_mods):
        seq = []
        ref = copy.deepcopy(env)
        # Pick k modifications on walls
        walls_to_remove = [wall_dict[i][0] for i in range(k)]
        for wall in walls_to_remove:
            ref = ref.transition([wall])
            seq.append((0, wall[0], wall[1]))

        num_special_cells = num_mods - k
        cells_to_assign = [cell_dict[i][0] for i in range(num_special_cells)]

        for cell in cells_to_assign:
            ref.special.append(cell)
            seq.append((1, cell[0], cell[1]))

        agent_k = QAgent(ref)
        print(colored("Iteration {} begins!".format(k), "red"))
        print(ref.walls, ref.special)
        agent_k.qlearn(600, render=False)

        rews = utility(agent_k)
        if rews > opt_val:
            opt_val = rews
            opt_seq = seq

    return (opt_seq, opt_val)
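
A brief usage sketch (illustrative only; assumes env and the helpers above are in scope). path_based returns the best modification sequence found and its utility.

best_seq, best_val = path_based(env, num_mods=3)
# best_seq is a list of (type, row, col) tuples: type 0 = removed wall, type 1 = special cell
print(best_seq, best_val)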
Example #8
def connected_qlearn(agent, new_env, num_episodes):
    # Parameters
    # ==============================================
    # agent: pre-trained agent in some environment
    # new_env: new environment
    # num_episodes: number of training episodes in the new environment
    # ==============================================

    # We reuse the pre-trained agent's q-values to train it in the new environment.
    # The intuition is that the q-values only need slight changes, so recomputing them from scratch would be computationally wasteful.

    linked_agent = QAgent(new_env)
    linked_agent.q = copy.deepcopy(agent.q)  # linking the q-values together

    linked_agent.epsilon = 0.75
    linked_agent.qlearn(num_episodes, render=False)

    return linked_agent
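
A short warm-start sketch (not from the original source), assuming env, QAgent, utility, and a two-argument make_env(env, modifications) as used in Tree.greedy are in scope; the modification coordinates below are hypothetical.

base_agent = QAgent(env)
base_agent.qlearn(600, render=False)

modified_env = make_env(env, [(0, 3, 4)])  # hypothetical: remove the wall at (3, 4)
warm_agent = connected_qlearn(base_agent, modified_env, num_episodes=400)
print("Utility after warm-started training:", utility(warm_agent))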
Example #9
def check(input_data, index, output_data):
    modified = make_env(env, input_data, index)
    agent = QAgent(modified)
    agent.qlearn(600, show=False)
    rews = utility(agent)
    return (rews == output_data[index])
Example #10
    
    mutual_ls[index] = a


if __name__ == "__main__":
    processes = []
    mp.set_start_method("spawn")
    num_processes = 10
    manager = Manager()
    mutual = manager.list()

    for _ in range(N * num_processes):
        mutual.append(0)
    
    # Create default agent
    agent = QAgent(env)
    agent.qlearn(650)


    for iter in range(N):
        for i in range(num_processes):
            p = mp.Process(target=potency, args=(mutual, agent, modified, 400, i + iter * num_processes))
            p.start()
            processes.append(p)

        for process in processes:
            process.join()

        for process in processes:
            process.terminate()
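
A variant sketch of the loop above (an assumption, not the original code): using a fresh process list each round avoids re-joining handles left over from earlier rounds.

    for rnd in range(N):
        round_procs = []
        for i in range(num_processes):
            p = mp.Process(target=potency,
                           args=(mutual, agent, modified, 400, i + rnd * num_processes))
            p.start()
            round_procs.append(p)

        for p in round_procs:
            p.join()  # joining is enough here; terminate() is only needed to stop a still-running process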
Example #11
class Tree():
    def __init__(self, env, max_layer):
        self.env = env
        self.modifications = []
        self.counter = 0
        self.nodes = []
        
        # List of all possible modifications: tree.modifications
        for wall in env.walls:
            self.modifications.append((0, wall[0], wall[1]))
        for row in range(env.width):
            for col in range(env.length):
                self.modifications.append((1, row, col))
        
        self.num_nodes = 0
        self.root = None
        self.threshold = 8

        # Train original agent
        self.agent = QAgent(self.env)
        self.agent.qlearn(500, show=False, render=False)
        self.env.reset()

        self.max_layer = max_layer

        # Storing tree's max reward with corresponding environment
        self.max_reward = float("-inf")
        self.opt_env = None


    def scale(self, x):
        # Scale the utility of the agent for backpropagation
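        # e.g. with threshold 8: a utility of 9.3 backs up as 1.3, while anything at or below 8 backs up as 0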
        return max(0, x - self.threshold)


    def initialize(self):
        # Initialize the tree with the root
        # Assert that the tree is null (no root beforehand)
        assert self.root is None

        # Create root
        root = Node(None, self.num_nodes, None, self.env)

        # Add root to the list of nodes of the tree and increase the number of nodes 
        self.nodes.append(root)
        self.num_nodes += 1

        # Assign the root to the root field of the tree
        self.root = root

        # Update the environment configuration of the root
        self.root.update_walls_special(self, parent_bool=False)

        # Update the layer of the root (the tree's version)
        self.root.update_layer(self)


    def add_node(self, mod_index, parent_index):
        # This function adds a node with a modification index and a parent index into the MCTS tree.
        assert parent_index < self.num_nodes
        assert mod_index in self.nodes[parent_index].get_unused_modifications(self)

        # Create a node from the modification index (in tree.modifications) and the parent index on the tree
        node = Node(mod_index, self.num_nodes, parent_index, self.env)

        # Append the node to the list of nodes of the tree and increase the number of nodes
        self.nodes.append(node)
        self.num_nodes += 1

        # Changing the boolean leaf status of the parent_index to False. 
        self.nodes[parent_index].leaf = False

        # Update the layer of the newly added node
        self.nodes[node.index].update_layer(self)

        # Update the environment configuration of the newly added node
        self.nodes[node.index].update_walls_special(self, parent_bool=True)

        # Mark the newly added node as a visited child, with respect to its parent node
        self.nodes[parent_index].visited_children.append(node.index)

        # Return the node as output
        return self.nodes[node.index]


    def expand(self, node_index):  # return index of an expanded node
        # Get the list of unused modifications on the node (given by the tree and the node index)
        ls = self.nodes[node_index].get_unused_modifications(self)

        # Assert that the node still has unvisited children
        assert len(ls) > 0

        # Choose a random modification index from this list
        mod_index = random.choice(ls)

        # Add the node of this modification index into the tree
        node = self.add_node(mod_index, node_index)

        # Return the index of this newly created node as output
        return node.index


    def best_child(self, node_index, const, const_2=1, expanded=True):
        # Find the best child of a node based on the UCB heuristic
        # If node is not fully expanded, use the heuristic only on visited children
        assert self.num_nodes > node_index
        ls = self.nodes[node_index].get_unused_modifications(self)

        # If expanded boolean is True, the length of unused modifications list must be 0
        if expanded:
            assert len(ls) == 0

        # Find the best child with largest UCB value
        opt = float("-inf")
        child = None
        for c in self.nodes[node_index].visited_children:
            # Calculate the term for child c
            
            scaled_reward = self.nodes[c].sum_reward / self.nodes[c].count
            exploration_term = const * math.sqrt(2 * math.log(self.nodes[node_index].count) / self.nodes[c].count)
            extra = 0
            if len(self.nodes[c].simulation_history) != 0:
                extra = const_2 * math.sqrt(np.var(self.nodes[c].simulation_history) + 1 / self.nodes[c].count)

            result = scaled_reward + exploration_term + extra  # Schadd SP-MCTS added term
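            # The extra term follows Schadd et al.'s single-player MCTS: children whose simulation
            # rewards vary a lot, or that have few visits, receive a larger exploration bonus.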

            # Compare to running maximum
            if result > opt:
                opt = result
                child = c

        chosen_mod = self.modifications[self.nodes[child].modification]
        print(colored("Chosen child's modification: {}".format(chosen_mod), "red"))
        
        return child


    def default_policy(self, node_index):
        start = node_index
        simulate_env = copy.deepcopy(self.nodes[start].env)
        num_modifications_applied = len(self.env.walls) - len(simulate_env.walls) + len(simulate_env.special) - len(self.env.special)
        mods_left = self.max_layer - num_modifications_applied
        
        # Choose from unused modifications, from start node
        # We know that tree.nodes[start] is a leaf, so there are no used modifications at start yet.
        ls = []
        for i in range(self.nodes[start].modification + 1, len(self.modifications)):
            ls.append(i)

        # Following a completely random policy
        a = random.sample(ls, k=mods_left)
        a = sorted(a)

        for element in a:
            mod = self.modifications[element]
            if mod[0] == 0:
                simulate_env = simulate_env.transition([(mod[1], mod[2])])
            elif mod[0] == 1:
                simulate_env.special.append((mod[1], mod[2]))
        
        # Training
        agent = connected_qlearn(self.agent, simulate_env, 400)
        reward = utility(agent)

        if reward > self.threshold + 0.5:
            print(colored(a, "red"))
            print(colored(reward, "red"))

            for element in a:
                start = self.add_node(element, start).index

            if reward > self.max_reward:
                self.max_reward = reward
                self.opt_env = simulate_env
            
            return [self.scale(reward), start]

        return self.scale(reward)

    
    def tree_policy(self, node_index, c1, c2):
        iter_index = node_index
        while not self.nodes[iter_index].terminal(self):
            if not self.nodes[iter_index].fully_expanded(self):
                return self.expand(iter_index)
            
            else:
                iter_index = self.best_child(iter_index, c1, c2)
        
        return iter_index


    def backup(self, node_index, reward):
        iter_index = node_index
        while iter_index is not None:
            self.nodes[iter_index].sum_reward += reward
            self.nodes[iter_index].simulation_history.append(reward)
            self.nodes[iter_index].count += 1
            iter_index = self.nodes[iter_index].parent
    

    def ucb_search(self, iterations):
        root_index = self.nodes[0].index
        c1 = 1
        c2 = 1

        for i in range(iterations):
            print(colored("Iteration {} begins!".format(i), "red"))
            
            # Return the index of the node newly expanded and play out to the terminal (or node at last layer)

            leaf_index = self.tree_policy(root_index, c1, c2)
            a = self.default_policy(leaf_index)

            if isinstance(a, list):
                leaf_index = a[1]
                reward = a[0]
            
            else:
                reward = a

            self.backup(leaf_index, reward)
            print(colored("Number of nodes so far: {}".format(len(self.nodes)), "green"))
            print(colored("Maximum reward seen so far: {}".format(self.max_reward), "green"))
            print("Iteration {} ends!".format(i))
            print()


    def greedy(self):
        walk = []
        start = 0
        while self.nodes[start].layer < self.max_layer:
            if len(self.nodes[start].visited_children) != 0:
                start = self.best_child(start, 0, 0, expanded=False)
                mod_index = self.nodes[start].modification
                walk.append(self.modifications[mod_index])
            else:
                # No expanded child to follow; stop so the shortfall is reported below
                break
        
        if len(walk) < self.max_layer:
            print("MCTS insufficient to get {} modifications!".format(self.max_layer))
            return (walk, None)

        else:
            modified = make_env(self.env, walk)
            agent = QAgent(modified)
            agent.qlearn(600, render=False)
            rews = utility(agent)
            return (walk, rews)

    
    def best_observed_choice(self):
        vector = []
        for wall in self.env.walls:
            if wall not in self.opt_env.walls:
                tup = (0, wall[0], wall[1])
                vector.append(tup)

        for cell in self.opt_env.special:
            if cell not in self.env.special:
                tup = (1, cell[0], cell[1])
                vector.append(tup)

        # Train a fresh agent to avoid errors carried over from connected training
        agent = QAgent(self.opt_env)
        agent.qlearn(600)
        rews = utility(agent)

        x = max(rews, self.max_reward)

        return (vector, x)


    def info(self, node_index):
        dict_return = {}
        for key in vars(self.nodes[node_index]):
            if key != "simulation_history":
                if key != "env":
                    dict_return[key] = vars(self.nodes[node_index])[key]
                else:
                    dict_return["walls"] = self.nodes[node_index].env.walls
                    dict_return["special_cells"] = self.nodes[node_index].env.special
        
        return dict_return
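
A driver sketch (not part of the original code) showing how the class above is typically used, assuming env is an already-constructed environment from this codebase; the iteration count is arbitrary.

if __name__ == "__main__":
    tree = Tree(env, max_layer=3)      # search for 3 modifications
    tree.initialize()                  # create the root node
    tree.ucb_search(iterations=200)

    # Best environment actually observed during the simulations
    # (assumes at least one playout exceeded tree.threshold, so opt_env is set)
    mods, reward = tree.best_observed_choice()
    print("Modifications:", mods)
    print("Utility:", reward)

    # Alternatively, follow the best-UCB path greedily from the root
    walk, walk_reward = tree.greedy()
    print("Greedy walk:", walk, walk_reward)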
Example #12
    rounds = 5
    mp.set_start_method("spawn")
    num_processes = 10
    processes = []
    manager = Manager()
    agents = manager.list()
    for i in range(rounds * num_processes):
        agents.append(0)  # keeper

    categories = []
    num_mods = 1

    map_to_numpy = np.asarray(map, dtype="c")
    env = TaxiEnv(map_to_numpy)  # reference environment

    orig_agent = QAgent(env)
    orig_agent.qlearn(600, show=False)
    cell_dict = cell_frequency(orig_agent)
    wall_dict = wall_interference(orig_agent)
    modifications = []

    for element in wall_dict:
        modifications.append((0, element[0]))
    for element in cell_dict[0:14]:
        row, col = element[0]
        modifications.append((1, (row, col)))

    for iter in range(rounds):
        print(colored("Data addition round {} begins!".format(iter), "red"))
        for i in range(num_processes):
            results = simulate_env(env, num_mods)
Example #13
def greedy(env, num_mods):
    # This function returns the sequence of modifications based on the wall and cell heuristics
    # Parameters
    # ===============================================================
    # env: the original environment
    # num_mods: the number of modifications
    # ===============================================================

    greedy_seq = []
    ref = copy.deepcopy(env)
    agent = None

    for i in range(num_mods):
        # For each iteration, find out the wall that most interferes and the cell that is crossed the most. Try out all options.
        if i == 0:
            agent = QAgent(ref)
            agent.qlearn(600, render=False)

        else:
            agent = connected_qlearn(agent, ref, 300)

        # Take out the lists from the heuristics.
        wall_dict = wall_interference(agent)
        cell_dict = cell_frequency(agent)

        # Take out the max values, and the options to try out.
        wall_nums = [elem[1] for elem in wall_dict]
        max_wall = max(wall_nums)

        cell_nums = [elem[1] for elem in cell_dict]
        max_cell = max(cell_nums)

        wall_options = [elem[0] for elem in wall_dict if elem[1] == max_wall]
        cell_options = [elem[0] for elem in cell_dict if elem[1] == max_cell]

        # Test out all the options, get optimal modification
        opt_value = float("-inf")
        opt_choice = None
        category = -1

        for wall in wall_options:
            print(colored("Testing environment", "red"))
            e = ref.transition([wall])
            new_agent = connected_qlearn(agent, e, 300)

            # Get utility
            val = utility(new_agent)
            if val > opt_value:
                opt_value = val
                opt_choice = wall
                category = 0

        for cell in cell_options:
            print(colored("Testing environment", "red"))
            e = copy.deepcopy(ref)
            e.special.append(cell)
            new_agent = connected_qlearn(agent, e, 300)

            # Get utility
            val = utility(new_agent)
            if val > opt_value:
                opt_value = val
                opt_choice = cell
                category = 1

        assert (category != -1)

        # Store found modification and change the reference environment
        if category == 0:
            mod = (0, opt_choice[0], opt_choice[1])
            greedy_seq.append(mod)
            ref = ref.transition([opt_choice])

        elif category == 1:
            mod = (1, opt_choice[0], opt_choice[1])
            greedy_seq.append(mod)
            ref.special.append(opt_choice)

    # Evaluate utility
    total_agent = QAgent(ref)
    total_agent.qlearn(600, render=False)
    result = utility(total_agent)
    # print(colored(result, "red"))

    return greedy_seq, result
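

A usage sketch for the heuristic-only search above (illustrative; assumes env is in scope):

seq, value = greedy(env, num_mods=2)
print("Greedy modifications:", seq)  # list of (type, row, col) tuples
print("Resulting utility:", value)
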
class Tree():
    def __init__(self, env, max_layer):
        self.env = env
        self.modifications = []
        self.counter = 0
        self.nodes = []

        agent = QAgent(env)
        agent.qlearn(600, show=False)
        wall_dict = wall_interference(agent)
        cell_dict = cell_frequency(agent)

        for element in wall_dict:
            self.modifications.append((0, element[0][0], element[0][1]))

        for element in cell_dict[0:14]:
            self.modifications.append((1, element[0][0], element[0][1]))

        self.num_nodes = 0
        self.root = None
        self.max_layer = max_layer
        self.threshold = 8

        self.agent = QAgent(self.env)
        self.agent.qlearn(500, show=False, render=False)
        self.env.reset()

        # Storing best reward score and corresponding environment seen so far
        self.max_reward = float("-inf")
        self.opt_env = None

    def scale(self, x):
        return max(0, x - self.threshold)

    def initialize(self):
        assert self.root is None
        root = Node(None, self.num_nodes, None, self.env)
        self.nodes.append(root)
        self.num_nodes += 1
        self.root = root
        self.root.update_walls_special(self, parent_bool=False)
        self.root.update_layer(self)

    def add_node(self, mod_index, parent_index):
        assert parent_index < self.num_nodes
        assert mod_index in self.nodes[parent_index].get_unused_modifications(
            self)

        node = Node(mod_index, self.num_nodes, parent_index, self.env)
        self.nodes.append(node)
        self.num_nodes += 1
        self.nodes[parent_index].leaf = False
        self.nodes[node.index].update_layer(self)
        self.nodes[node.index].update_walls_special(self, parent_bool=True)
        self.nodes[parent_index].visited_children.append(node.index)
        return self.nodes[node.index]

    def expand(self, node_index):  # return index of an expanded node
        assert self.num_nodes > node_index
        ls = self.nodes[node_index].get_unused_modifications(self)
        assert len(ls) > 0  # still have an unvisited child
        mod_index = random.choice(ls)
        node = self.add_node(mod_index, node_index)
        return node.index

    def best_child(self,
                   node_index,
                   const,
                   const_2=1,
                   expanded=True
                   ):  # return index of best child according to ucb heuristic
        assert self.num_nodes > node_index
        ls = self.nodes[node_index].get_unused_modifications(self)
        if expanded:
            assert len(ls) == 0

        opt = float("-inf")
        child = None

        for c in self.nodes[node_index].visited_children:
            scaled_reward = self.nodes[c].sum_reward / self.nodes[c].count
            exploration_term = const * math.sqrt(2 * math.log(
                self.nodes[node_index].count) / self.nodes[c].count)
            extra = 0
            if len(self.nodes[c].simulation_history) != 0:
                extra = const_2 * math.sqrt(
                    np.var(self.nodes[c].simulation_history) +
                    1 / self.nodes[c].count)

            result = scaled_reward + exploration_term + extra  # Schadd SP-MCTS added term
            if result > opt:
                opt = result
                child = c

        chosen_mod = self.modifications[self.nodes[child].modification]
        print(
            colored("Chosen child's modification: {}".format(chosen_mod),
                    "red"))

        return child

    def default_policy(self, node_index):
        start = node_index
        simulate_env = copy.deepcopy(self.nodes[start].env)
        num_modifications_applied = len(self.env.walls) - len(
            simulate_env.walls) + len(simulate_env.special) - len(
                self.env.special)
        mods_left = self.max_layer - num_modifications_applied

        # Choose from unused modifications, from start node
        # We know that tree.nodes[start] is a leaf, so there are no used modifications at start yet.
        ls = []
        for i in range(self.nodes[start].modification + 1,
                       len(self.modifications)):
            ls.append(i)

        a = random.sample(ls, k=mods_left)
        a = sorted(a)
        for element in a:
            mod = self.modifications[element]
            if mod[0] == 0:
                simulate_env = simulate_env.transition([(mod[1], mod[2])])
            elif mod[0] == 1:
                simulate_env.special.append((mod[1], mod[2]))

        # Training
        agent = connected_qlearn(self.agent, simulate_env, 425)
        reward = utility(agent)

        if reward > self.threshold + 0.5:
            print(colored(a, "red"))
            print(colored(reward, "red"))
            for element in a:
                start = self.add_node(element, start).index

            # Update tree's max reward environment if possible
            if reward > self.max_reward:
                self.max_reward = reward
                self.opt_env = copy.deepcopy(simulate_env)

            return [self.scale(reward), start]

        return self.scale(reward)

    def tree_policy(self, node_index, c1, c2):
        iter_index = node_index
        while not self.nodes[iter_index].terminal(self):
            if not self.nodes[iter_index].fully_expanded(self):
                return self.expand(iter_index)

            else:
                iter_index = self.best_child(iter_index, c1, c2)

        return iter_index

    def backup(self, node_index, reward):
        iter_index = node_index
        while iter_index is not None:
            self.nodes[iter_index].sum_reward += reward
            self.nodes[iter_index].simulation_history.append(reward)
            self.nodes[iter_index].count += 1
            iter_index = self.nodes[iter_index].parent

    def ucb_search(self, iterations):
        root_index = self.nodes[0].index
        c1 = 1
        c2 = 1

        for i in range(iterations):
            print(colored("Iteration {} begins!".format(i), "red"))
            leaf_index = self.tree_policy(root_index, c1, c2)
            a = self.default_policy(leaf_index)
            if isinstance(a, list):
                leaf_index = a[1]
                reward = a[0]

            else:
                reward = a

            self.backup(leaf_index, reward)
            print(
                colored("Number of nodes so far: {}".format(len(self.nodes)),
                        "green"))
            print(
                colored(
                    "Maximum reward seen so far: {}".format(self.max_reward),
                    "green"))
            print("Iteration {} ends!".format(i))
            print()

    def greedy(self):
        walk = []
        start = 0
        while self.nodes[start].layer < self.max_layer:
            if len(self.nodes[start].visited_children) != 0:
                start = self.best_child(start, 0, 0, expanded=False)
                mod_index = self.nodes[start].modification
                walk.append(self.modifications[mod_index])
            else:
                # No expanded child to follow; stop so the shortfall is reported below
                break

        if len(walk) < self.max_layer:
            print("MCTS insufficient to get {} modifications".format(
                self.max_layer))
            return (walk, None)

        else:
            modified = make_env(self.env, walk)
            agent = QAgent(modified)
            agent.qlearn(600, render=False)
            rews = utility(agent)
            return (walk, rews)

    def best_observed_choice(self):
        vector = []
        for wall in self.env.walls:
            if wall not in self.opt_env.walls:
                tup = (0, wall[0], wall[1])
                vector.append(tup)

        for cell in self.opt_env.special:
            if cell not in self.env.special:
                tup = (1, cell[0], cell[1])
                vector.append(tup)

        # Train a fresh agent to avoid errors carried over from connected training
        agent = QAgent(self.opt_env)
        agent.qlearn(600)
        rews = utility(agent)

        return (vector, rews)

    def info(self, node_index):
        dict_return = {}
        for key in vars(self.nodes[node_index]):
            if key != "simulation_history":
                if key != "env":
                    dict_return[key] = vars(self.nodes[node_index])[key]
                else:
                    dict_return["walls"] = self.nodes[node_index].env.walls
                    dict_return["special_cells"] = self.nodes[
                        node_index].env.special

        return dict_return
data = []

if __name__ == "__main__":
    rounds = 20
    mp.set_start_method("spawn")
    num_processes = 10
    processes = []
    manager = Manager()
    agents = manager.list()
    for i in range(rounds * num_processes):
        agents.append(0)  # keeper

    categories = []
    num_mods = 2

    orig_agent = QAgent(env)
    orig_agent.qlearn(600, render=False)

    for iter in range(rounds):
        print(colored("Data addition round {} begins!".format(iter), "red"))
        for i in range(num_processes):
            results = simulate_env(env, num_mods)
            modified = results[0]
            categories.append(results[1])
            # agent = QAgent(modified)

            p = mp.Process(target=connected_qlearn_as_func, args=(orig_agent, modified, i, agents, i + iter * num_processes))
            p.start()
            processes.append(p)

        for process in processes:
Example #16
                exist = True
                break
        
        if not exist:
            h.insert(seq, val)
    
    if i % 100 == 0:
        print(i)


opt_seq = None
opt_val = -1
for element in range(len(h.array)):
    seq = h.mod_seq(element)
    modified = make_env(env, seq)
    agent = QAgent(modified)
    agent.qlearn(600, render=False)
    rews = utility(agent)
    print(colored(rews, "red"))
    if rews > opt_val:
        opt_val = rews
        opt_seq = seq

r_dir = os.path.abspath(os.pardir)
data_dir = os.path.join(r_dir, "data")
file_dir = os.path.join(data_dir, "sl_result_{}.txt".format(num_mods))
with open(file_dir, "w") as file:
    file.write("Modifications: ")
    file.write(str(opt_seq))
    file.write("\n")
    file.write("Utility: ")
Example #17
    rounds = 100
    mp.set_start_method("spawn")
    num_processes = 10
    processes = []
    manager = Manager()
    agents = manager.list()
    for i in range(rounds * num_processes):
        agents.append(0)  # keeper

    categories = []
    num_mods = 3

    map_to_numpy = np.asarray(map, dtype="c")
    env = TaxiEnv(map_to_numpy)  # reference environment

    orig_agent = QAgent(env)
    orig_agent.qlearn(600, show=False)
    cell_dict = cell_frequency(orig_agent)
    wall_dict = wall_interference(orig_agent)
    modifications = []

    for element in wall_dict:
        modifications.append((0, element[0]))
    for element in cell_dict[0:14]:
        row, col = element[0]
        modifications.append((1, (row, col)))

    for iter in range(rounds):
        print(colored("Data addition round {} begins!".format(iter), "red"))
        for i in range(num_processes):
            results = simulate_env(env, num_mods)
Example #18
    processes = []
    manager = Manager()
    agents = manager.list()
    for i in range(rounds * num_processes):
        agents.append(0)  # keeper

    categories = []
    num_mods = 4

    for iter in range(rounds):
        print(colored("Data addition round {} begins!".format(iter), "red"))
        for i in range(num_processes):
            results = simulate_env(env, num_mods)
            modified = results[0]
            categories.append(results[1])
            agent = QAgent(modified)
            p = mp.Process(target=qlearn_as_func, args=(agent, modified, i, agents, i + iter * num_processes))
            p.start()
            processes.append(p)

        for process in processes:
            process.join()

        for process in processes:
            process.terminate()

    
    for i in range(len(agents)):
        ut = utility(agents[i])
        data.append((categories[i], ut))
Example #19
            index = random.randint(0, N - k)
        else:
            index = random.randint(index + 1, N - k + i)

        res.append(list[index])

    return res


if num_mods == 6:
    num_trials = int(2e+6)

else:
    num_trials = int(1e+6)

orig_agent = QAgent(env)
orig_agent.qlearn(600)
cell_dict = cell_frequency(orig_agent)
wall_dict = wall_interference(orig_agent)
modifications = []

for element in wall_dict:
    modifications.append((0, element[0][0], element[0][1]))
for element in cell_dict[0:14]:
    row, col = element[0]
    modifications.append((1, row, col))

# Initialize and build heap
sz = min(12 * num_mods, len(x_test))
h = Heap(model, x_test[0:sz], sz)
h.build_heap()
Example #20
def batch_greedy(env, num_mods, num_mods_per_run, ls_num_iters):
    # Parameters
    # =========================================================
    # env: original environment
    # num_mods: total number of modifications
    # num_mods_per_run: number of modifications optimized jointly in each MCTS run
    # ls_num_iters: list of MCTS iteration counts, indexed by the number of modifications in a run
    # =========================================================

    # Example: [50, 200] means a run optimizing 1 modification uses 50 iterations, and a run optimizing 2 uses 200.

    ref = copy.deepcopy(env)
    mods_ret = []  # answer of this algorithm

    # Build an initial tree (its modification list seeds the search below)
    tree = Tree(env, max_layer=num_mods)
    tree.initialize()

    # Keep a running count of modifications still to be chosen
    count = num_mods

    # Keep a running list of modifications
    ls_mods = copy.deepcopy(tree.modifications)

    # Initialize baseline
    baseline = 8 + 0.25 * num_mods_per_run

    assert (count >= num_mods_per_run)
    assert (len(ls_num_iters) == num_mods_per_run)

    while count > 0:
        print(colored(ls_mods, "red"))
        n = 0
        if count >= num_mods_per_run:
            n = num_mods_per_run

        else:
            n = count

        tree = Tree(ref, max_layer=n)
        tree.initialize()
        tree.threshold = baseline

        tree.modifications = copy.deepcopy(ls_mods)

        # Find out number of iterations
        num_iter = ls_num_iters[n - 1]

        # Perform an MCTS search
        tree.ucb_search(iterations=num_iter)

        a = tree.best_observed_choice()
        for elem in a[0]:
            mods_ret.append(elem)

        # Transform the environment
        for elem in a[0]:
            if elem[0] == 0:  # wall
                ref = ref.transition([(elem[1], elem[2])])

            elif elem[0] == 1:  # cell
                ref.special.append((elem[1], elem[2]))

            ls_mods.remove(elem)

        count -= n
        print(colored(mods_ret, "red"))

        # Increase baseline
        baseline += 0.5 * n

    # Find utility
    agent = QAgent(ref)
    agent.qlearn(600)
    rews = utility(agent)

    return (mods_ret, rews)
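
A closing usage sketch (assumptions: env is in scope and the iteration counts are arbitrary): choose 4 modifications, optimizing 2 at a time.

if __name__ == "__main__":
    mods, value = batch_greedy(env, num_mods=4, num_mods_per_run=2,
                               ls_num_iters=[50, 200])
    print("Chosen modifications:", mods)
    print("Final utility:", value)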