def _expand_graph_node(self, node):
    """Expand a graph node by simulating every action with the model.

    This is a generator. It first delegates to
    ``self._model.predict_steps`` (via ``yield from``) to obtain the
    outcome of each action taken from ``node.state``, and then yields a
    single ``Request`` of type ``RequestType.AGENT_PREDICTION`` carrying
    the stacked successor observations. The answer sent back into the
    generator is used as the batch of successor values.

    Nothing is done when the node already has outgoing edges.

    Args:
        node: Graph node to expand. Its ``bonus``, ``rewards`` and
            ``edges`` attributes are filled in.
    """
    assert bool(node.rewards) == bool(node.edges)
    if node.edges:
        # The node has been expanded before.
        return

    # Successors are ordered like the actions: 0, 1, ..., num_actions.
    observations, rewards, dones, infos, states = (
        yield from self._model.predict_steps(
            node.state,
            list(space_utils.element_iter(self._model.action_space)),
        )
    )

    solved = [info.get('solved', False) for info in infos]
    node.bonus = [info.get('bonus', 0.) for info in infos]
    self._bonuses.append(max(node.bonus))

    value_batch = yield Request(
        RequestType.AGENT_PREDICTION, np.array(observations)
    )

    actions = space_utils.element_iter(self._action_space)
    for position, action in enumerate(actions):
        node.rewards[action] = rewards[position]

        successor = self._state2node.get(states[position], None)
        if successor is not None:
            # A node for this state is already known - reuse it.
            node.edges[action] = successor
            continue

        if dones[position]:
            successor_value = self._value_traits.zero
        else:
            # Each batch entry must hold exactly one element.
            (successor_value,) = value_batch[position]
        node.edges[action] = self._initialize_graph_node(
            successor_value,
            states[position],
            dones[position],
            solved=solved[position],
        )
def _expand_leaf(self, leaf):
    """Expand a tree leaf and return its value.

    This is a generator. It yields the successor observations as one
    numpy array and expects the corresponding batch of values to be sent
    back.

    Args:
        leaf: Tree node to expand, or ``None`` for a dead end.

    Returns:
        ``self._value_traits.dead_end`` for a ``None`` leaf,
        ``self._value_traits.zero`` for a terminal leaf, otherwise
        ``leaf.value_acc.get()`` after the children have been attached.
    """
    if leaf is None:
        # Dead end.
        return self._value_traits.dead_end
    if leaf.terminal:
        # Terminal state.
        return self._value_traits.zero

    # Children are ordered like the actions: 0, 1, ..., num_actions.
    (
        child_obs, child_rewards, child_dones, child_solved, child_states
    ) = self._children_of_state(leaf.state)

    value_batch = yield np.array(child_obs)

    actions = space_utils.element_iter(self._action_space)
    for position, action in enumerate(actions):
        leaf.rewards[action] = child_rewards[position]

        graph_node = self._state2node.get(child_states[position], None)
        if graph_node is None:
            # Unseen state: finished children get the zero value, the
            # others take the value sent back for them.
            if child_dones[position]:
                initial_value = self._value_traits.zero
            else:
                initial_value = value_batch[position]
            graph_node = self._initialize_graph_node(
                initial_value,
                child_states[position],
                child_dones[position],
                solved=child_solved[position],
            )
        leaf.children[action] = TreeNode(graph_node)

    return leaf.value_acc.get()
def _expand_graph_node(self, node):
    """Expand a graph node, treating a reward of 1 as reaching the goal.

    This is a generator. It delegates to ``self._model.predict_steps``
    (via ``yield from``) for the outcome of every action from
    ``node.state`` and then yields one ``Request`` of type
    ``RequestType.AGENT_PREDICTION`` with the stacked successor
    observations; the answer sent back is used as the successor values.

    Nodes that already have edges, or are solved or terminal, are left
    untouched. After the edges are created, ``self._update_from_node`` is
    called on the node.

    Args:
        node: Graph node to expand. Its ``bonus``, ``rewards`` and
            ``edges`` attributes are filled in and the largest bonus is
            added to ``node.value_acc``.
    """
    assert bool(node.rewards) == bool(node.edges)

    already_expanded = len(node.edges) > 0
    if already_expanded or node.solved or node.terminal:
        return

    # Successors are ordered like the actions: 0, 1, ..., num_actions.
    observations, rewards, dones, infos, states = (
        yield from self._model.predict_steps(
            node.state,
            list(space_utils.element_iter(self._model.action_space)),
        )
    )

    assert all(reward in (0, 1) for reward in rewards), (
        'We assume that env is deterministic, and there are goal states '
        'obtaining which gives you reward=1 and ends episode. All other '
        'actions should give reward=0'
    )
    # A successor counts as solved exactly when its transition paid 1.
    solved = [reward == 1 for reward in rewards]

    node.bonus = [
        self._filter_bonus(info.get('bonus', 0.), reward, done)
        for info, reward, done in zip(infos, rewards, dones)
    ]
    best_bonus = max(node.bonus)
    node.value_acc.add_bonus(best_bonus)
    self._bonuses.append(best_bonus)

    value_batch = yield Request(
        RequestType.AGENT_PREDICTION, np.array(observations)
    )

    actions = space_utils.element_iter(self._action_space)
    for position, action in enumerate(actions):
        node.rewards[action] = rewards[position]

        successor = self._state2node.get(states[position], None)
        if successor is None:
            if dones[position]:
                successor_value = self._value_traits.zero
            else:
                # Each batch entry must hold exactly one element.
                (successor_value,) = value_batch[position]
            successor = self._initialize_graph_node(
                successor_value,
                states[position],
                dones[position],
                solved=solved[position],
            )
        node.edges[action] = successor

    self._update_from_node(node)
def qualities(self, observation, model):
    """Compute one-step action qualities from predicted child values.

    This is a generator. It delegates to ``model.predict_steps`` for the
    outcome of every action, then yields the predicted observations and
    expects the network output to be sent back. When
    ``self._use_policy`` is set, the answer is a pair whose first element
    holds the values; otherwise the answer is the values themselves.

    Args:
        observation: Not used by this implementation.
        model: Model providing ``action_space`` and ``predict_steps``.

    Returns:
        list: ``reward + discount * value * (1 - done)`` for each action.
    """
    all_actions = list(space_utils.element_iter(model.action_space))
    observations, rewards, dones = yield from model.predict_steps(
        all_actions, include_state=False
    )

    with_policy = bool(self._use_policy)
    response = yield observations
    if with_policy:
        # The network answers with a (values, <second output>) pair.
        values, _ = response
    else:
        values = response

    flat_values = np.reshape(values, -1)
    # Zero out the bootstrapped value wherever the episode has ended.
    return list(rewards + self._discount * flat_values * (1 - dones))
def _children_of_state(self, parent_state):
    """Simulate every action from `parent_state` and collect the outcomes.

    The model is temporarily moved to ``parent_state``; each action is
    stepped once and the model is rewound to ``parent_state`` afterwards.
    The state the model was in before the call is restored on exit, also
    when stepping raises, so a failure does not leave the model in some
    intermediate state.

    Args:
        parent_state: Model state to expand from.

    Returns:
        A ``zip`` iterator over five tuples - observations, rewards,
        dones, solved flags and cloned child states - each ordered like
        the actions of ``self._model.action_space``.
    """
    model = self._model
    previous_state = model.clone_state()

    def step_and_rewind(action):
        (observation, reward, done, info) = model.step(action)
        state = model.clone_state()
        solved = info.get('solved', False)
        model.restore_state(parent_state)
        return (observation, reward, done, solved, state)

    try:
        model.restore_state(parent_state)
        children = [
            step_and_rewind(action)
            for action in space_utils.element_iter(model.action_space)
        ]
    finally:
        # Always hand the model back in the state the caller left it in.
        model.restore_state(previous_state)

    return zip(*children)
def predict_step(self, observation, action, repeat_fix=True):
    """Predicts the outcome of taking `action` in `observation`.

    Args:
        observation (np.ndarray): Array of shape
            (height, width, n_channels) of one-hot encoded observation
            (along axis=-1).
        action (int): Action performed by the agent. With `repeat_fix`
            it is also used to index the per-action predictions.
        repeat_fix (bool): Indicates if the 'hack' fix should be used.
            Requests passed to RequestHandler need tensors of the same
            shape for each request, so with the fix enabled predictions
            are always requested for every action of the state and the
            entry for `action` is picked afterwards.

    Yields:
        request (Request): Model prediction request with one-hot encoded
            input state and action; handled by RequestHandler.

    Returns:
        With `repeat_fix` enabled, a 3-tuple
        (pred_obs, reward, done) - no info is returned on this path.
        Otherwise a 4-tuple (pred_obs, reward, done, info), where:
        pred_obs (np.ndarray): Array of shape
            (height, width, n_channels) of one-hot encoded state
            (along axis=-1).
        reward (float): Reward received by the agent.
        done (bool): Indicates if episode terminated.
        info (dict): Environment additional info.
    """
    if not repeat_fix:
        # Single prediction: add a batch axis, predict, then strip it.
        obs_batch = np.expand_dims(observation, axis=0)
        action_batch = np.expand_dims(action, axis=0)
        pred_obs, reward, done, info = yield from self._batch_predict_steps(
            obs_batch, action_batch
        )
        return (
            tf.squeeze(pred_obs, axis=0).numpy(),
            reward.item(),
            done.item(),
            info.item(),
        )

    # Predict for all actions at once and select the requested one.
    every_action = list(space_utils.element_iter(self.action_space))
    next_obs, rewards, dones, _, _ = yield from self.predict_steps(
        self.obs2state(observation), every_action
    )
    return next_obs[action], rewards[action], dones[action]
def qualities(self, observation, model):
    """Estimate action qualities with rollouts of the rollout agent.

    This is a generator. For every initial action it steps the model
    (``model.step`` is delegated to with ``yield from``), resets
    ``self._rollout_agent`` on the resulting observation and lets it act
    until the episode ends or ``self._time_limit`` steps have been made.
    The model is rewound to its initial state after each rollout.

    Args:
        observation: Not read; the name is only reused for the
            observations produced during the rollouts.
        model: Model providing ``action_space``, ``clone_state``,
            ``restore_state`` and ``step``.

    Returns:
        list: For each initial action, the initial reward plus the
        discounted return of the rollout that followed it.
    """
    start_state = model.clone_state()
    estimates = []

    for first_action in space_utils.element_iter(model.action_space):
        current_obs, first_reward, done = yield from model.step(first_action)
        yield from self._rollout_agent.reset(model, current_obs)

        rollout_return = 0
        discount_factor = 1
        steps_taken = 0
        while not done and steps_taken < self._time_limit:
            action, _ = yield from self._rollout_agent.act(current_obs)
            current_obs, reward, done = yield from model.step(action)
            rollout_return += discount_factor * reward
            discount_factor *= self._discount
            steps_taken += 1

        estimates.append(first_reward + self._discount * rollout_return)
        # Rewind so the next initial action starts from the same state.
        model.restore_state(start_state)

    return estimates
def __call__(self, observation, model):
    """Estimate action qualities with agent rollouts, plus a uniform prior.

    This is a generator. For every initial action it steps the model,
    resets ``self._agent`` on the resulting observation and lets it act
    until the episode ends or ``self._time_limit`` steps have been made.
    The model is rewound to its initial state after each rollout.

    Args:
        observation: Not read; the name is only reused for the
            observations produced during the rollouts.
        model: Model providing ``action_space``, ``clone_state``,
            ``restore_state`` and ``step``.

    Returns:
        list: One ``(quality, prior)`` pair per initial action, where the
        quality is the initial reward plus the discounted rollout return
        and the prior comes from ``_uniform_prior``.
    """
    start_state = model.clone_state()
    estimates = []

    for first_action in space_utils.element_iter(model.action_space):
        current_obs, first_reward, done, _ = model.step(first_action)
        yield from self._agent.reset(model, current_obs)

        rollout_return = 0
        discount_factor = 1
        steps_taken = 0
        while not done and steps_taken < self._time_limit:
            action, _ = yield from self._agent.act(current_obs)
            current_obs, reward, done, _ = model.step(action)
            rollout_return += discount_factor * reward
            discount_factor *= self._discount
            steps_taken += 1

        estimates.append(first_reward + self._discount * rollout_return)
        # Rewind so the next initial action starts from the same state.
        model.restore_state(start_state)

    priors = _uniform_prior(len(estimates))
    return list(zip(estimates, priors))
def _expand_leaf(self, leaf):
    """Expand a tree leaf and return its value.

    This is a generator. It yields the successor observations as one
    float32 numpy array and expects the corresponding batch of values to
    be sent back.

    Args:
        leaf: Tree node to expand, or ``None`` for a dead end.

    Returns:
        ``self._value_traits.dead_end`` for a ``None`` leaf,
        ``self._value_traits.zero`` for a terminal leaf, otherwise
        ``leaf.value_acc.get()`` after the children have been attached.
    """
    if leaf is None:
        # Dead end.
        return self._value_traits.dead_end
    if leaf.terminal:
        # Terminal state.
        return self._value_traits.zero

    # Children are ordered like the actions: 0, 1, ..., num_actions.
    (
        child_obs, child_rewards, child_dones, child_solved, child_states
    ) = self._children_of_state(leaf.state)

    value_batch = yield np.array(child_obs).astype(np.float32)

    actions = space_utils.element_iter(self._action_space)
    for position, action in enumerate(actions):
        leaf.rewards[action] = child_rewards[position]

        graph_node = self._state2node.get(child_states[position], None)
        if graph_node is None:
            # Unseen state: finished children get the zero value, the
            # others take the value sent back for them.
            if child_dones[position]:
                initial_value = self._value_traits.zero
            else:
                initial_value = value_batch[position]
            graph_node = self._initialize_graph_node(
                initial_value,
                child_states[position],
                child_dones[position],
                solved=child_solved[position],
            )
        leaf.children[action] = TreeNode(graph_node)

    return leaf.value_acc.get()
def __call__(self, observation, model):
    """Compute one-step action qualities from child values, plus a prior.

    This is a generator. Every action is stepped once in the model (the
    model is rewound to its initial state right after each step), the
    stacked child observations are yielded, and the values sent back are
    combined with the rewards.

    Args:
        observation: Ignored.
        model: Model providing ``action_space``, ``clone_state``,
            ``restore_state`` and ``step``.

    Returns:
        list: One ``(quality, prior)`` pair per action, where the quality
        is ``reward + discount * value * (1 - done)`` and the prior comes
        from ``_uniform_prior``.
    """
    del observation

    start_state = model.clone_state()

    transitions = []
    for action in space_utils.element_iter(model.action_space):
        (child_obs, reward, done, _) = model.step(action)
        # Rewind so every action is tried from the same state.
        model.restore_state(start_state)
        transitions.append((child_obs, reward, done))
    observations, rewards, dones = data.nested_stack(transitions)

    # Ask the network for the values of the children.
    response = yield observations
    # (batch_size, 1) -> (batch_size,)
    flat_values = np.reshape(response, -1)

    # Finished children contribute their reward only.
    child_qualities = list(
        rewards + self._discount * flat_values * (1 - dones)
    )
    priors = _uniform_prior(len(child_qualities))
    return list(zip(child_qualities, priors))
def _children_of_state(self, parent_state):
    """Simulate every action from `parent_state` and collect the outcomes.

    The model is temporarily moved to ``parent_state``; each action is
    stepped once and the model is rewound to ``parent_state`` afterwards.
    The state the model was in before the call is restored on exit, also
    when stepping raises, so a failure does not leave the model in some
    intermediate state.

    The model's observations are indexed by the keys ``'observation'``
    and ``'desired_goal'``; both parts are concatenated along the last
    axis into a single array.

    Args:
        parent_state: Model state to expand from.

    Returns:
        A ``zip`` iterator over five tuples - concatenated observations,
        rewards, dones, solved flags and cloned child states - each
        ordered like the actions of ``self._model.action_space``.
    """
    model = self._model
    previous_state = model.clone_state()

    def step_and_rewind(action):
        (full_observation, reward, done, info) = model.step(action)
        observation = np.concatenate(
            [
                full_observation['observation'],
                full_observation['desired_goal'],
            ],
            axis=-1,
        )
        state = model.clone_state()
        solved = info.get('solved', False)
        model.restore_state(parent_state)
        return (observation, reward, done, solved, state)

    try:
        model.restore_state(parent_state)
        children = [
            step_and_rewind(action)
            for action in space_utils.element_iter(model.action_space)
        ]
    finally:
        # Always hand the model back in the state the caller left it in.
        model.restore_state(previous_state)

    return zip(*children)