Example #1
    def simulation_process(self, child_node: Node) -> None:
        # From the child node onwards, sample actions randomly for n time steps
        # or until a terminal state is reached, collecting rewards along the way
        current_state = child_node.state

        for t in range(self.episode_duration):
            # Sample action randomly
            action = np.random.choice(self.action_space)
            # Simulate next state
            next_state_values = self.prediction_model(s_t=current_state,
                                                      a_t=action)
            next_state_idx = self.env.get_state_space_idx(
                observation=next_state_values)
            next_state = State(x=next_state_values[0],
                               y=next_state_values[1],
                               x_pos=next_state_idx[0],
                               y_pos=next_state_idx[1])

            # Add reward
            self.reward_collection += self.reward_model.get_reward(
                s=next_state, a=action)

            # check if terminal state
            if self.env.is_terminal(state=next_state):
                break
            else:
                current_state = next_state
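
For orientation, here is a minimal sketch of how a rollout like this typically slots into one MCTS iteration. Only expansion_process and simulation_process appear in these examples; selection_process, backpropagation_process, and the run_iteration wrapper below are assumptions made for illustration, not part of the source.

    def run_iteration(self, root_node: Node) -> None:
        # Hypothetical single MCTS iteration (selection and backpropagation helpers are assumed)
        self.reward_collection = 0.0
        leaf_node = self.selection_process(root_node)       # descend via the tree policy (assumed)
        child_node = self.expansion_process(leaf_node)      # Example #2: expand one unexplored action
        self.simulation_process(child_node)                 # Example #1: random rollout, accumulates reward
        self.backpropagation_process(child_node,            # propagate collected reward up the tree (assumed)
                                     self.reward_collection)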
Example #2
    def expansion_process(self, leaf_node: Node) -> Node:
        # if not all actions were explored choose randomly from those
        available_actions = [
            self.action_space[i] for i in range(len(self.action_space))
            if leaf_node.action_visits[i] == 0
        ]
        action = np.random.choice(available_actions)

        # Simulate next state
        next_state_values = self.prediction_model(s_t=leaf_node.state,
                                                  a_t=action)
        next_state_idx = self.env.get_state_space_idx(
            observation=next_state_values)
        next_state = State(x=next_state_values[0],
                           y=next_state_values[1],
                           x_pos=next_state_idx[0],
                           y_pos=next_state_idx[1])

        self.reward_collection += self.reward_model.get_reward(s=next_state,
                                                               a=action)

        # Add node to tree
        child_node = Node(state=next_state, action_space=self.action_space)
        leaf_node.add_successor(node=child_node, action=action)

        # Update visit counts (total_visits == np.inf marks a node that has not been visited yet)
        if leaf_node.total_visits == np.inf:
            leaf_node.total_visits = 1
        else:
            leaf_node.total_visits += 1

        leaf_node.action_visits[action] += 1
        self.action_trajectory.put(action)

        return child_node
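
The total_visits == np.inf check suggests that unvisited nodes are initialised with an infinite visit count so that a UCB-style tree policy always tries them first. A minimal UCB1 scoring sketch under that assumption; this helper and the exploration constant c are not part of the source.

    def ucb1_score(self, mean_reward: float, action_visits: int,
                   parent_visits: int, c: float = 1.414) -> float:
        # Unvisited actions score infinitely high and are therefore explored first
        if action_visits == 0:
            return np.inf
        return mean_reward + c * np.sqrt(np.log(parent_visits) / action_visits)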
Example #3
    def gen_node_from_observation(self, observation: List[float]) -> Node:
        """
        Generates a node object for the environment observation
        :param observation: raw [x, y] observation from the environment
        :return: Node wrapping the corresponding discretised State
        """
        x_idx, y_idx = self.env.get_state_space_idx(observation=observation)
        state = State(x=observation[0],
                      y=observation[1],
                      x_pos=x_idx,
                      y_pos=y_idx)
        return Node(state=state, action_space=self.action_space)
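
A possible call site, assuming agent is an instance of the class these methods belong to and that the environment exposes a gym-style reset() returning the raw observation (both assumptions, not shown in the source):

    observation = agent.env.reset()
    root_node = agent.gen_node_from_observation(observation=observation)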
Example #4
    def tree_traversal(self, ref_node: Node, ref_state: State, search_tree: queue.LifoQueue) \
            -> Tuple[Union[Node, None], queue.LifoQueue]:
        # Push all successors of the reference node onto the search stack and return
        # the first one whose discretised state matches the reference state
        for action_id in self.action_space:
            nodes_list = ref_node.successor_nodes[action_id]
            if len(nodes_list) == 0:
                continue

            for node in nodes_list:
                search_tree.put(node)
                if node.state.get_state_idx() == ref_state.get_state_idx():
                    return node, search_tree

        # No matching successor found
        return None, search_tree
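
A hypothetical caller that drives the traversal with the LIFO stack until a node matching the observed state is found; the surrounding loop and the names agent, root_node, and target_state are assumptions for illustration.

    search_tree: queue.LifoQueue = queue.LifoQueue()
    search_tree.put(root_node)
    matched_node = None
    while matched_node is None and not search_tree.empty():
        ref_node = search_tree.get()
        matched_node, search_tree = agent.tree_traversal(ref_node=ref_node,
                                                         ref_state=target_state,
                                                         search_tree=search_tree)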
Example #5
    def gen_state_from_observation(self, observation: List[float]) -> State:
        """
        Generates a state object for the environment observation
        :param observation: raw [x, y] observation from the environment
        :return: State object; grid indices are only set for discrete state spaces
        """
        if self.state_space_type == STATE_SPACE_TYPE.DISCRETE:
            x_idx, y_idx = self.env.get_state_space_idx(
                observation=observation)
        else:
            x_idx, y_idx = None, None
        return State(x=observation[0],
                     y=observation[1],
                     x_pos=x_idx,
                     y_pos=y_idx)
Example #6
    def get_future_reward(self, state: State, action: int) -> float:
        """
        Computes the future reward obtained by taking action a in state s,
        assuming deterministic system dynamics.
        :param state: current state s
        :param action: action a to be taken in state s
        :return: Reward of action a taken in state s
        """
        s_t1_obs = self.transition_model.state_transition(state, action)
        x_t1_idx, y_t1_idx = self.env.get_state_space_idx(observation=s_t1_obs)
        new_state = State(x=s_t1_obs[0],
                          y=s_t1_obs[1],
                          x_pos=x_t1_idx,
                          y_pos=y_t1_idx)
        reward = self.value_space[new_state.x_idx][new_state.y_idx]
        return reward
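
One way a lookup like this is typically consumed is a greedy one-step policy over the action space; this usage, along with the names agent and current_state, is an assumption, not shown in the source.

    best_action = max(agent.action_space,
                      key=lambda a: agent.get_future_reward(state=current_state, action=a))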
Example #7
    def simulate_n_steps(self):
        for i in range(self.n_simulations + 1):
            # Sample from experience
            sample_state_idx = np.random.choice(range(len(self.state_action_observations.keys())))
            sample_state = list(self.state_action_observations.keys())[sample_state_idx]
            sample_action = np.random.choice(self.state_action_observations[sample_state])

            # Simulate one step and get reward
            state_t0 = self.env.get_state_space_value(x_idx=sample_state[0], y_idx=sample_state[1])

            s_t1 = self.prediction_model(s_t=state_t0, a_t=sample_action)
            x_t1_idx, y_t1_idx = self.env.get_state_space_idx(observation=s_t1)
            state_t1 = State(x=s_t1[0], y=s_t1[1], x_pos=x_t1_idx, y_pos=y_t1_idx)

            reward_t1 = self.reward_model.get_reward(s=state_t1, a=sample_action)

            # Make Q-Learning Update
            self.q_learning_update(state_t0=state_t0, action_t0=Action(action=sample_action),
                                   reward=reward_t1, state_t1=state_t1)
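
The q_learning_update call is not shown in these examples. Below is a standard tabular Q-learning update it could plausibly perform; q_table, alpha, gamma, the index lookups, and the Action.action attribute are all assumptions made for this sketch.

    def q_learning_update(self, state_t0, action_t0: Action, reward: float, state_t1: State) -> None:
        # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        s_t0_idx = self.env.get_state_space_idx(observation=state_t0)  # discretise the sampled state (assumed)
        s_t1_idx = state_t1.get_state_idx()                            # State exposes its grid index (see Example #4)
        a = action_t0.action                                           # assumed attribute of Action
        td_target = reward + self.gamma * np.max(self.q_table[s_t1_idx])
        td_error = td_target - self.q_table[s_t0_idx][a]
        self.q_table[s_t0_idx][a] += self.alpha * td_error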