Example #1
    def rollout(self, lookahead, env, n: OR_Node):
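        # Rollout loop: expand the current node, sample a random unsolved child,
        # and stop when a terminal is reached, the state is pruned as not novel,
        # the node is solved, or the simulator-call budget is exhausted.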
        while not n.SOLVED and lookahead.sim_calls - lookahead.init_sim_calls < lookahead.sim_budget:
            lookahead.rollout_depth += 1
            lookahead.expand(env, n)

            # Pick random unsolved child of n
            t0 = time.perf_counter()
            n = lookahead.pick_random_unsolved_child(env, n)
            tf = time.perf_counter()
            lookahead.rollout_runtime_pick_random_unsolved += tf - t0
            if n.terminal:
                n.visited = True
                lookahead.num_visited += 1
                lookahead.solve_and_propagate_labels(n)
                if lookahead.worst_terminal_accumulated_reward is None or lookahead.worst_terminal_accumulated_reward > n.accumulated_reward:
                    lookahead.worst_terminal_accumulated_reward = n.accumulated_reward
                break
            t0 = time.perf_counter()
            is_novel = lookahead.root.feature_table.is_novel(n.state[0])
            tf = time.perf_counter()
            lookahead.rollout_runtime_is_novel += tf - t0
            if is_novel:
                n.visited = True
                lookahead.num_visited += 1
                lookahead.root.feature_table.update_feature_table(n.state[0])
            elif not n.visited:
                # pruned as the state is not novel
                n.randomV = lookahead.cost_to_go_est(env, n)
                lookahead.solve_and_propagate_labels(n)
                break
        if not n.SOLVED and lookahead._pruned_state_strategy == "heuristic":
            # If the rollout did not finish due to the computational budget, apply the heuristic value
            n.randomV = lookahead.cost_to_go_est(env, n)
Example #2
 def expand(self, env, n: OR_Node):
     """
         Construct AND nodes for each of the actions applicable
         in state s(n)
     """
     if len(n.children) == 0:
         if self.tabulate_state_visits:
             tupOfState = tuple(n.state[0].tolist())
             try:
                 n.num_visits = self.stateCount[tupOfState]
             except KeyError:
                 n.num_visits = 0
         else:
             n.num_visits = 0
         #wizluk.logger.debug("expanded out a node")
         for a in range(env.action_space.n):
             and_node = AND_Node(a, n)
             if self.tabulate_state_visits:
                 try:
                     and_node.num_visits = self.stateCountAction[a][
                         tupOfState]
                 except KeyError:
                     and_node.num_visits = 0
             else:
                 and_node.num_visits = 0
             and_node.Q = float('-inf')
             and_node.visited = False
Example #3
    def make_root_node(self, s, forget=True):
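        # Reuse the node if this state is already registered in the AND/OR graph;
        # otherwise register it and add it as a root.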
        n = OR_Node(s, 0)
        if forget:
            self._exp_graph = AND_OR_Graph()

        try:
            n = self._exp_graph.locate(n)
            self.root = n
            #wizluk.logger.debug("Root node already considered")
        except KeyError:
            n.visited = False
            self._exp_graph.register(n)
            self._exp_graph.add_root(n)
            self.root = n
            #wizluk.logger.debug("New root node ")
        self.current = self.root
Example #4
    def make_root_node(self, s, forget=True):
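        # Reset reward/visit statistics and the SOLVED flag for the root, free
        # memory held by the previous root, and rebuild its feature table.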
        n = OR_Node(s, 0)
        n.accumulated_reward = 0
        n.num_visits = 0
        if forget:
            self._exp_graph = AND_OR_Graph()
            #gc.collect()

        if self.root is not None:
            del self.root.feature_table

        try:
            n = self._exp_graph.locate(n)
            n.SOLVED = False
            self.free_mem(self.root, n)
            #gc.collect()
            self.root = n
            self.strategy.initialize_feature_table(self, n)
            wizluk.logger.debug("Root node already considered")
        except KeyError:
            n.SOLVED = False
            n.visited = False
            if self.root is not None:
                self.free_mem(self.root, n)
                #gc.collect()
            self._exp_graph.register(n)
            self._exp_graph.add_root(n)
            self.root = n
            self.strategy.initialize_feature_table(self, n)
            wizluk.logger.debug("New root node ")
        self.current = self.root
Example #5
    def make_root_node(self, s, forget=True):
        n = OR_Node(s, 1)
        if forget:
            self._exp_graph = AND_OR_Graph()

        try:
            n = self._exp_graph.locate(n)
            self.free_mem(self.root, n)
            self.root = n
        except KeyError:
            n.visited = False
            if self.root is not None:
                self.free_mem(self.root, n)
            self._exp_graph.register(n)
            self._exp_graph.add_root(n)
            self.root = n
        self.root._d = 0
        self.current = self.root
Example #6
 def check_OR_solved(self, n: OR_Node):
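     # An OR node is solved once all of its AND children are solved; if so,
     # propagate the label to its parents.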
     is_solved = True
     for _, a_node in n.children.items():
         if not a_node.SOLVED:
             is_solved = False
             break
     n.SOLVED = is_solved
     if n.SOLVED:
         self.num_solved += 1
         for p in n.parents:
             self.check_AND_solved(p)
Example #7
    def update_table(self, n: OR_Node):
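        # Depth-first traversal of the subgraph below n: record every OR node's
        # (state, depth) pair in the feature table and clear SOLVED flags.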
        stack = deque()
        stack.append(n)
        while len(stack) > 0:
            n = stack.pop()  # top of the stack
            if isinstance(n, OR_Node):
                for act, child in n.children.items():
                    stack.append(child)
                self.root.feature_table.update_feature_table((n.state[0], n.d))

                n.SOLVED = False
                continue
            elif isinstance(n, AND_Node):
                for succ, r in n.children:
                    stack.append(succ)
                n.SOLVED = False
                continue
            else:
                assert False
Example #8
    def pick_action(self, env, n: OR_Node, action, history):
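        # Step the simulator with the chosen action, create or look up the
        # successor OR node, and record both nodes in the history.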
        history.append(n.children[action])
        next_state, reward, terminal, _ = env.step(action)
        self.sim_calls += 1
        next_state = np.reshape(next_state,
                                [1, np.prod(env.observation_space.shape)])
        succ = OR_Node(next_state, n.d + 1, terminal)
        if n.children[action].update(reward, succ):
            if self.tabulate_state_visits:
                tupOfState = tuple(succ.state[0].tolist())
                try:
                    succ.num_visits = self.stateCount[tupOfState]
                except KeyError:
                    succ.num_visits = 0
            else:
                succ.num_visits = 0  # if we get a new successor
            succ.r = reward
            self._exp_graph.update(n, action, reward, succ)
            node = succ
            node.v = 0
            node.num_rollouts = 0
        else:
            for child in n.children[action].children:
                succ1, reward1 = child
                if reward1 == reward and succ1 == succ:
                    #wizluk.logger.debug("the state exists")
                    node = succ1
        history.append(node)
        n.children[action].visited = True

        self.max_depth = max(self.max_depth, node.d)
        return node, history
Example #9
    def rollout(self, lookahead, env, n: OR_Node):
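        # Rollout with depth-aware novelty: a state is treated as novel only if
        # it is reached at a smaller depth than previously recorded; otherwise
        # it is pruned and labelled solved.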
        while not n.SOLVED and lookahead.sim_calls - lookahead.init_sim_calls < lookahead.sim_budget:
            lookahead.rollout_depth += 1
            lookahead.expand(env, n)
            # Pick random unsolved child of n
            n = lookahead.pick_random_unsolved_child(env, n)
            if n.terminal:
                n.visited = True
                lookahead.num_visited += 1
                lookahead.solve_and_propagate_labels(n)
                if lookahead.worst_terminal_accumulated_reward is None or lookahead.worst_terminal_accumulated_reward > n.accumulated_reward:
                    lookahead.worst_terminal_accumulated_reward = n.accumulated_reward
                break
            f, v, rank, old_depth = lookahead.root.feature_table.get_novel_feature(
                (n.state[0], n.d))
            if n.d < old_depth:
                n.visited = True
                lookahead.num_visited += 1
                lookahead.root.feature_table.update_feature_table(
                    (n.state[0], n.d))
            elif not n.visited and n.d >= old_depth:
                n.visited = True
                lookahead.num_visited += 1
                # pruned as the state is not novel at this depth
                n.randomV = lookahead.cost_to_go_est(env, n)

                lookahead.solve_and_propagate_labels(n)
                break
            elif n.visited and old_depth < n.d:
                # pruned as the state is not novel at this depth
                n.randomV = lookahead.cost_to_go_est(env, n)
                n._children = {}
                lookahead.solve_and_propagate_labels(n)
                break

        if not n.SOLVED and lookahead._pruned_state_strategy == "heuristic":
            # If the rollout did not finish due to the computational budget, apply the heuristic value
            n.randomV = lookahead.cost_to_go_est(env, n)
Example #10
    def bestChild(self, env, n: OR_Node, history):
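        # Pick the child that maximises a UCB1-style score, then obtain its
        # successor either by restoring a cached Atari state or by stepping
        # the simulator.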
        assert (len(n.children) > 0)
        # UCB1 selection: unvisited children first, otherwise Q plus an exploration bonus
        L = [
            float('inf') if n.children[k].num_visits == 0
            else n.children[k].Q + self._C *
            np.sqrt(2 * np.log(n.num_visits) / n.children[k].num_visits)
            for k in n.children.keys()
        ]
        selected = list(n.children.keys())[np.argmax(L)]

        history.append(n.children[selected])
        if self._atari == "True" and len(n.children[selected].children
                                         ) != 0 and self._caching != "None":
            elapsed_steps = env._elapsed_steps
            envuw = env.unwrapped
            for node, reward in n.children[selected].children:
                if hasattr(
                        node,
                        'restoreStateFrom') and node.restoreState is not None:
                    break
            wasRestored = False
            if hasattr(node,
                       'restoreStateFrom') and node.restoreState is not None:
                if node.restoreStateFrom != self.get_action_call and self._caching != "Full":
                    # The cached state is not from this get_action call, so with partial caching do not restore it
                    node.restoreState = None
                else:
                    env.unwrapped.restore_full_state(node.restoreState)
                    env._elapsed_steps = elapsed_steps + 1
                    succ = node
                    wasRestored = True

            if not wasRestored:
                assert (False)
        else:
            next_state, reward, terminal, _ = env.step(selected)
            self.sim_calls += 1
            if self._atari != "True":
                next_state = np.reshape(
                    next_state, [1, np.prod(env.observation_space.shape)])
            elif self._caching != "None":  # If atari and caching is on
                assert (False)
            succ = OR_Node(next_state, n.d + 1, terminal)
            if n.children[selected].update(reward, succ):
                if self.tabulate_state_visits:
                    tupOfState = tuple(succ.state[0].tolist())
                    try:
                        succ.num_visits = self.stateCount[tupOfState]
                    except KeyError:
                        succ.num_visits = 0
                else:
                    succ.num_visits = 0  # if we get a new successor
                succ.r = reward
            else:
                foundChild = False
                for child in n.children[selected].children:
                    succ1, reward1 = child
                    if reward1 == reward and succ1 == succ:
                        succ = succ1
                        foundChild = True
                assert (foundChild)

        history.append(succ)
        n.children[selected].visited = True

        self.max_depth = max(self.max_depth, succ.d)
        return succ, history
Example #11
    def pick_random_unvisited_child(self, env, n: OR_Node, history):
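        # Pick an unvisited action uniformly at random, then reuse a cached
        # Atari state if available or step the simulator to get the successor.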
        candidates = [
            k for k in n.children.keys() if not n.children[k].visited
        ]
        selected = np.random.choice(candidates)
        history.append(n.children[selected])
        if self._atari == "True" and len(n.children[selected].children
                                         ) != 0 and self._caching != "None":
            elapsed_steps = env._elapsed_steps
            wasRestored = False
            for node, reward in n.children[selected].children:
                if hasattr(
                        node,
                        'restoreStateFrom') and node.restoreState is not None:
                    break
            if hasattr(node,
                       'restoreStateFrom') and node.restoreState is not None:
                if node.restoreStateFrom != self.get_action_call and self._caching != "Full":
                    # The cached state is not from this get_action call, so with partial caching do not restore it
                    node.restoreState = None
                else:
                    env.unwrapped.restore_full_state(node.restoreState)
                    env._elapsed_steps = elapsed_steps + 1
                    succ = node
                    wasRestored = True

            if not wasRestored:
                next_state, reward, terminal, _ = env.step(selected)
                self.sim_calls += 1
                if np.array_equal(
                        next_state, node._state
                ) and reward == node.r and terminal == node.terminal:
                    succ = node
                else:
                    succ = copy.deepcopy(node)
                    succ.r = reward
                    succ._state = copy.deepcopy(next_state)
                    succ.terminal = terminal
                    n.children[selected].children.add((succ, reward))
                succ.restoreState = env.unwrapped.clone_full_state()
                succ.restoreStateFrom = self.get_action_call
        else:
            next_state, reward, terminal, _ = env.step(selected)
            self.sim_calls += 1
            if self._atari != "True":
                next_state = np.reshape(
                    next_state, [1, np.prod(env.observation_space.shape)])
            succ = OR_Node(next_state, n.d + 1, terminal)
            if n.children[selected].update(reward, succ):
                if self.tabulate_state_visits:
                    tupOfState = tuple(succ.state[0].tolist())
                    try:
                        succ.num_visits = self.stateCount[tupOfState]
                    except KeyError:
                        succ.num_visits = 0
                else:
                    succ.num_visits = 0  # if we get a new successor
                succ.r = reward
            else:
                foundChild = False
                for child in n.children[selected].children:
                    succ1, reward1 = child
                    if reward1 == reward and succ1 == succ:
                        succ = succ1
                        foundChild = True
                assert (foundChild)
            if self._atari == "True" and self._caching != "None":
                succ.restoreState = env.unwrapped.clone_full_state()
                succ.restoreStateFrom = self.get_action_call
        history.append(succ)

        assert self.isInChildrenOnce(n.children[selected], succ)
        n.children[selected].visited = True

        self.max_depth = max(self.max_depth, succ.d)
        return succ, history
Example #12
 def solve_and_propagate_labels(self, n: OR_Node):
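     # Mark the node as solved and propagate the label to its AND parents.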
     n.SOLVED = True
     self.num_solved += 1
     for p in n.parents:
         self.check_AND_solved(p)
Example #13
    def pick_random_unsolved_child(self, env, n: OR_Node):
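        # Sample an unsolved action, obtain the successor from a cached Atari
        # state or by stepping the simulator, and update visit counts and the
        # best accumulated reward recorded for the successor.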
        selected = self.sample_child(n)
        assert (not n.children[selected].SOLVED)
        if self._atari == "True" and len(n.children[selected].children
                                         ) != 0 and self._caching != "None":
            elapsed_steps = env._elapsed_steps
            envuw = env.unwrapped
            for node, reward in n.children[selected].children:
                break
            if node.restoreState is not None:
                env.unwrapped.restore_full_state(node.restoreState)
                env._elapsed_steps = elapsed_steps + 1
            else:
                t0 = time.perf_counter()
                next_state, sreward, terminal, _ = env.step(selected)
                tf = time.perf_counter()
                self.rollout_runtime_sim += tf - t0
                reward = sreward
                self.sim_calls += 1
                node.restoreState = env.unwrapped.clone_full_state()
                node.terminal = terminal
        else:
            t0 = time.perf_counter()
            next_state, reward, terminal, _ = env.step(selected)
            tf = time.perf_counter()
            self.rollout_runtime_sim += tf - t0
            self.sim_calls += 1

            if self._representation is not None:
                t0 = time.perf_counter()

                parentLength = int(n.state.size)

                screen = env.unwrapped.ale.getScreen()
                next_state_flat = self._representation.getActiveFeatures(
                    screen, n.state[0], parentLength)

                tf = time.perf_counter()

                next_state_flat = np.reshape(next_state_flat,
                                             [1, len(next_state_flat)])

                self.rollout_runtime_sim += tf - t0
            elif self._grayScale == "True":
                next_state_flat = np.reshape(
                    self.convertScreenToGrayCompressed(next_state),
                    [1, self._grayScaleSizeX * self._grayScaleSizeY])
            else:
                next_state_flat = np.reshape(
                    next_state, [1, np.prod(env.observation_space.shape)])

            succ = OR_Node(next_state_flat, n.d + 1, terminal)
            succ.add_parent(n.children[selected])
            if self._atari == "True" and self._caching != "None":
                succ.restoreState = env.unwrapped.clone_full_state()
            succ.SOLVED = False
            succ.visited = False

            if succ.d == self._horizon:
                succ.terminal = True

            if n.children[selected].update(reward, succ):
                # if we get a new successor, we unsolve the node
                n.children[selected].SOLVED = False
                node = succ
            else:
                for child in n.children[selected].children:
                    succ1, reward1 = child
                    if reward1 == reward and succ1 == succ:
                        node = succ1

        try:
            node.accumulated_reward = max(node.accumulated_reward,
                                          n.accumulated_reward + reward)
        except AttributeError:
            node.accumulated_reward = n.accumulated_reward + reward
            node.num_visits = 0

        self.max_depth = max(self.max_depth, node.d)
        n.children[selected].num_visits += 1
        node.num_visits += 1
        return node