Code Example #1
File: bonus.py Project: peternara/google-research-NN
    def observe(self, experience):
        """Updates internal state counts based on observing this experience.

        Args:
            experience (Experience)
        """
        next_state = AS.AbstractState(experience.next_state)
        self._state_counts[next_state] += 1
Code Example #2
 def _step(self, action):
     state, env_reward, done, info = self.env.step(action)
     info["env reward"] = env_reward
     abstract_state = AS.AbstractState(state)
     new_reward = sum(
         rule(self._prev_abstract_state, abstract_state)
         for rule in self._reward_rules)
     self._prev_abstract_state = abstract_state
     return state, new_reward, done, info
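
Note: the reward rules summed in Example #2 (and the done rules in Example #9) are plain callables over a pair of consecutive abstract states. The sketch below is a hypothetical rule, not taken from the project; it only assumes the call signature rule(prev_abstract_state, abstract_state) used above and the room_number attribute that AS.AbstractState exposes in Example #6.

def room_change_bonus(prev_abstract_state, abstract_state, bonus=1.0):
    # Hypothetical reward rule: pay a one-off bonus whenever the agent's
    # abstract state lands in a different room. Any float-returning callable
    # with this signature would fit the sum(...) in Example #2.
    if prev_abstract_state is None:
        return 0.0
    changed = abstract_state.room_number != prev_abstract_state.room_number
    return bonus if changed else 0.0
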
Code Example #3
 def start_abstract_state(self):
     start_state = np.zeros(128).astype(np.uint8)
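     # RAM bytes (cf. Example #19): 42=x, 43=y, 3=room number,
     # 65=inventory mask, 66=inventory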
     indices = np.array([42, 43, 3, 65, 66])
     values = [149, 155, 8, 2, 0]
     start_state[indices] = values
     start_state = State(start_state, None)
     start_state.set_object_changes(3)
     start_state = AS.AbstractState(start_state)
     return start_state
Code Example #4
 def goal_abstract_state(self):
     goal_state = np.zeros(128).astype(np.uint8)
     indices = np.array([42, 43, 3, 65, 66])
     values = [149, 235, 8, 2, 0]
     goal_state[indices] = values
     goal_state = State(goal_state, None)
     goal_state.set_object_changes(3)
     goal_state = AS.AbstractState(goal_state)
     return goal_state
Code Example #5
    def _update(self, path, episode, graph_updates, edge_trajectories):
        """Updates internal state based on trying to follow path.

        Args:
            path (list[DirectedEdge]): planned path
            episode (list[Experience]): actually executed experiences
            graph_updates (list[GraphUpdate]): updates to make on the graph
            edge_trajectories (dict): map from edges to the trajectories for
              those edges; each entry pairs an Experience with the number of
              worker steps the worker had been active at the beginning of the
              experience and the cumulative reward (Experience, int, float)
        """
        # Add all experienced states to the graph
        for experience in episode:
            state = AS.AbstractState(experience.state)
            next_state = AS.AbstractState(experience.next_state)
            if state == next_state:
                # Abstract state must change on positive reward
                # TODO: Fix this: violated by PrivateEye
                #assert experience.reward <= 0, (state, reward, next_state)
                pass

            if not experience.done:
                edge = self._graph.get_edge(state, next_state)
                if edge is not None:
                    edge.update_reward(experience.reward,
                                       force=self._new_reward)

        # Make graph updates
        for graph_update in graph_updates:
            graph_update.update(self._graph)

        # Add experiences to the worker
        for edge in edge_trajectories:
            # Hindsight-like
            #for edge_to_update in edge.start.neighbors:
            #    if edge_to_update.training() and not edge_to_update.dead:
            trajectory = edge_trajectories[edge]
            for i, (experience, worker_steps,
                    cum_reward) in enumerate(trajectory):
                self._worker.add_experience(edge, experience, worker_steps,
                                            cum_reward, trajectory.success)

        # Add path back to the queue
        self._path_prioritizer.add_path(path)
Code Example #6
  def visualize(self, state):
    """Returns a PIL Image visualizing this Justification."""

    def plot_state(canvas, state, value):
      canvas[int(state.pixel_y) - 3:int(state.pixel_y) + 3,
             int(state.pixel_x) - 3:int(state.pixel_x) + 3] = value

    def match(state1, state2):
      """Returns True if both states have the same room number and

            inventory
      """
      return state1.room_number == state2.room_number and \
              np.array_equal(state1.match_attributes, state2.match_attributes)

    canvas = state.unmodified_pixels
    x_lines, y_lines = AS.AbstractState.bucket_lines()
    x_lines = [int(x) for x in x_lines]
    y_lines = [int(x) for x in y_lines]

    canvas[:, x_lines] = np.array([0., 0., 255.])
    canvas[y_lines, :] = np.array([0., 0., 255.])

    abstract_state = AS.AbstractState(state)

    # Only draw the nodes that match on inventory and room number
    feasible_set = self._graph.feasible_set
    for feasible_node in feasible_set:
      if not match(feasible_node.abstract_state, abstract_state):
        continue

      color = np.array([36., 255., 36.])
      plot_state(canvas, feasible_node.abstract_state, color)

    for goal_edge in self._path:
      goal = goal_edge.end.abstract_state
      if match(goal, abstract_state):
        plot_state(canvas, goal, np.array([255., 109., 182.]))

    # Plot current position
    plot_state(canvas, abstract_state, np.array([255., 255., 109.]))

    image = Image.fromarray(canvas, "RGB")
    width, height = image.size
    image = image.resize((width * 2, height * 2))
    draw = ImageDraw.Draw(image)
    draw.text((0, 0), self._text, (255, 255, 255))

    font = ImageFont.truetype(data.workspace.arial, 8)
    for node in self._graph.nodes:
      if match(node.abstract_state, abstract_state):
        draw.text((node.abstract_state.pixel_x * 2 - 4,
                   node.abstract_state.pixel_y * 2 - 4),
                  str(node.uid), (255, 255, 255),
                  font=font)
    return image
Code Example #7
 def start_abstract_state(self):
     start_state = np.zeros(128).astype(np.uint8)
     start_state[42] = 3
     start_state[43] = 235
     start_state[3] = 7
     start_state[65] = 0
     start_state[66] = 1
     start_state = State(start_state, None)
     start_state = AS.AbstractState(start_state)
     return start_state
Code Example #8
 def goal_abstract_state(self):
     goal_state = np.zeros(128).astype(np.uint8)
     goal_state[42] = 60
     goal_state[43] = 148
     goal_state[3] = 1
     goal_state[65] = 0
     goal_state[66] = 15
     goal_state = State(goal_state, None)
     goal_state = AS.AbstractState(goal_state)
     return goal_state
Code Example #9
 def _step(self, action):
     state, reward, done, info = self.env.step(action)
     info["done"] = done
     abstract_state = AS.AbstractState(state)
     new_done = sum(
         rule(self._prev_abstract_state, abstract_state)
         for rule in self._done_rules)
     done = new_done or done
     self._prev_abstract_state = abstract_state
     return state, reward, done, info
Code Example #10
    def _make_goal(self, state):
        raise NotImplementedError("Deprecated! set_goal was updated")
        goal = np.zeros(AS.AbstractState.DIM + 2)
        goal[:AS.AbstractState.DIM] = \
            self.goal_abstract_state.numpy - AS.AbstractState(state).unbucketed
        goal[AS.AbstractState.DIM] = float(self._steps) / self.max_steps
        if AS.AbstractState(state) == self.goal_abstract_state:
            goal[AS.AbstractState.DIM + 1] = 1.
        difference = \
            self.goal_abstract_state.numpy - self.start_abstract_state.numpy
        normalization = np.linalg.norm(difference)
        goal[0] /= (normalization * 3)
        goal[1] /= (normalization * 3)

        state_copy = copy.copy(state)
        state_copy.set_goal(goal)
        reward = 0.
        if self.goal_abstract_state == AS.AbstractState(state):
            reward = 1.
        return state_copy, reward
Code Example #11
    def _skill_state(self, state, goal_edge, step, cum_reward):
        """Adds the goal and step to the state.

        Args:
            state (State)
            goal_edge (DirectedEdge): goal_edge.end is goal
            step (int): number of steps the skill has been active
            cum_reward (float): cumulative worker reward accrued while
              traversing the current edge

        Returns:
            State
        """
        goal_abstract_state = goal_edge.end.abstract_state
        abstract_state_diff = \
            goal_abstract_state.numpy - AS.AbstractState(state).unbucketed
        worker_step_frac = float(step) / self.max_steps(goal_edge)
        on_goal = AS.AbstractState(state) == goal_edge.end.abstract_state
        goal = Goal(abstract_state_diff, worker_step_frac, on_goal, cum_reward)

        # Shallow copy OK. Copy references to the np.arrays. Getters don't
        # expose the underlying arrays directly
        state_copy = copy.copy(state)
        state_copy.set_goal(goal)
        return state_copy
Code Example #12
    def act(self, state, test=False):
        """Returns action for the current state.

        Args:
            state (State)
            test (bool): if True, no teleporting is used

        Returns:
            action (Action)
            justification (Justification)
        """
        if len(self._plan) == 0:
            node = self._graph.get_node(AS.AbstractState(state))
            if (node is not None and node.active()
                    and not self._explorer.active()):
                # This is happening in a separate process, so this shared
                # graph doesn't get updated
                self._graph_updates.append(Visit(node))
                node.visit()
                self._explorer.activate(node)

            if self._explorer.active():
                action, s = self._explorer.act(state)
                return action, Justification([], self._graph, s)
            elif test:  # No resetting on test episodes!
                action = DefaultAction(random.randint(0,
                                                      self._num_actions - 1))
                justification = Justification([], self._graph, "test random")
                return action, justification
            else:
                return EndEpisode(), Justification([], self._graph, "reset")
        elif (self._enable_teleporting and not test and not self._teleported
              and len(self._plan) > 0
              and self._plan[-1].start.teleport is not None):
            self._teleported = True
            self._plan = self._plan[-1:]
            s = "teleport to: {}".format(self._plan[-1].start.uid)
            justification = Justification(self._plan, self._graph, s)
            return self._plan[-1].start.teleport, justification

        next_edge = self._plan[0]
        self._allow_setting_teleport = \
                self._allow_setting_teleport and not next_edge.training()
        action = DefaultAction(
            self._worker.act(state, next_edge, len(self._worker_rewards),
                             sum(self._worker_rewards)))
        s = "{} -> {} step={} [{:.2f}], train={}, [{:.2f}]".format(
            next_edge.start.uid, next_edge.end.uid, len(self._worker_rewards),
            sum(self._worker_rewards), next_edge.train_count,
            next_edge.success_rate)
        justification = Justification(copy.copy(self._plan), self._graph, s)
        return action, justification
Code Example #13
File: bonus.py Project: peternara/google-research-NN
    def __call__(self, experience):
        """Returns another experience with the bonus added to the reward.

        Args:
            experience (Experience)

        Returns:
            Experience
        """
        next_state = AS.AbstractState(experience.next_state)
        next_state_count = self._state_counts[next_state]
        assert next_state_count > 0
        reward_bonus = self._beta / np.sqrt(next_state_count)
        return Experience(experience.state, experience.action,
                          experience.reward + reward_bonus,
                          experience.next_state, experience.done)
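
Taken together, Example #1 (observe) and Example #13 (__call__) implement a count-based exploration bonus of beta / sqrt(N(s')), where N counts visits to the abstract version of the next state. Below is a minimal self-contained sketch of that pattern; the class name, constructor, and Counter-backed storage are assumptions, while AS.AbstractState and Experience are the project classes used in the snippets above and are assumed to be importable.

import collections

import numpy as np


class CountBasedBonus(object):
    """Sketch of the observe/__call__ pairing from Examples #1 and #13."""

    def __init__(self, beta):
        self._beta = beta
        self._state_counts = collections.Counter()  # assumed storage

    def observe(self, experience):
        # Bucket the raw next state into its abstract state and count the visit.
        self._state_counts[AS.AbstractState(experience.next_state)] += 1

    def __call__(self, experience):
        next_state = AS.AbstractState(experience.next_state)
        count = self._state_counts[next_state]
        assert count > 0, "observe() must see a state before it is rewarded"
        bonus = self._beta / np.sqrt(count)
        return Experience(experience.state, experience.action,
                          experience.reward + bonus,
                          experience.next_state, experience.done)
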
Code Example #14
 def crop(state):
     abstract_state = AS.AbstractState(state)
     y = int(abstract_state.pixel_y * 84. / 210.)
     x = int(abstract_state.pixel_x * 84. / 160.)
     cropped = state.pixel_state[:, y - 10:y + 10,
                                 max(0, x - 30):x + 30]
     padding = (0, 0)
     if x - 30 < 0:
         padding = (30 - x, 0)
     elif x + 30 > 160:
         padding = (0, 190 - x)
     cropped = torch.FloatTensor(cropped)
     cropped = try_gpu(
         torch.nn.functional.pad(cropped, padding, mode="reflect"))
     return cropped
Code Example #15
    def reward(self, next_state, edge, env_reward, done):
        """Defines the worker's intrinsic reward for reaching next_state

        while trying to traverse the edge.

        Args: next_state (State) edge (DirectedEdge)
            env_reward (float): environment extrinsic reward
            done (bool): True if overall episode ended

        Returns:
            float
        """
        if AS.AbstractState(next_state) == edge.end.abstract_state and \
                not done and env_reward >= 0:
            return 1.
        else:
            return 0.
Code Example #16
 def crop(state):
     abstract_state = AS.AbstractState(state)
     y = int(abstract_state.pixel_y * 84. / 210.)
     cropped = state.pixel_state[:, y - 10:y + 10, :]
     return cropped
Code Example #17
  def __init__(self, num_actions, worker, abstract_graph_config, start_state,
               runner_config, num_parallel, domain, success_weight):
    """Use from_config to construct Master.

        Args:
            num_actions (int): number of possible actions at each state
            worker (Worker): worker to use
            abstract_graph_config (Config): config for AbstractGraph
            start_state (State): state returned by env.reset(). NOTE: assumed
              that there is only a single start state
            runner_config (Config): config for EpisodeRunner
            num_parallel (int): number of workers to run in parallel
            domain (str): environment domain e.g. MontezumaRevengeNoFrameskip-v4
            success_weight (float): how much to weight successes in priority
    """
    super(Master, self).__init__()
    self._num_actions = num_actions
    self._worker = worker
    self._room_dir = data.room_dir(domain)
    self._success_weight = success_weight

    self._path_prioritizer = MultiPriorityQueue(self._priority_fns())
    self._path_prioritizer.add_path([])

    def eval_to_train(edge):
      if not edge.dead:
        worker.mark_failed_evaluation(edge)

    # All paths are added to the queue via new reliable edges or via new
    # edges as an optimization
    def eval_to_reliable(edge):
      worker.mark_reliable(edge)
      feasible_set = self._graph.feasible_set
      for neighbor_edge in edge.end.neighbors:
        if neighbor_edge.end not in feasible_set:
          path = edge.end.path_to_start() + [neighbor_edge]
          self._path_prioritizer.add_path(path)

    def new_edge_callback(new_edge):
      feasible_set = self._graph.feasible_set
      if new_edge.start in feasible_set:
        if new_edge.end not in feasible_set:
          path = new_edge.start.path_to_start() + [new_edge]
          self._path_prioritizer.add_path(path)
      else:
        # Update priority of paths who've gained a new edge
        for parent_edge in new_edge.start.parents:
          if parent_edge.start in feasible_set:
            path = parent_edge.start.path_to_start() + [parent_edge]
            self._path_prioritizer.add_path(path)

      max_edge_degree = abstract_graph_config.max_edge_degree

      # Add higher-distance neighbors
      parent_edges = [
          parent_edge for parent_edge in new_edge.start.parents
          if parent_edge.degree == 1
      ]
      bfs_queue = deque(parent_edges)
      visited = set([parent_edge.start for parent_edge in parent_edges] +
                    [new_edge.end, new_edge.start])
      while len(bfs_queue) > 0:
        edge = bfs_queue.popleft()
        if edge.degree >= max_edge_degree:
          break

        for parent_edge in edge.start.parents:
          parent = parent_edge.start
          # Only traverse degree 1 edges to preserve BFS property
          if parent_edge.degree == 1 and parent not in visited:
            visited.add(parent)
            combined_degree = edge.degree + parent_edge.degree
            combined_reward = edge.reward + parent_edge.reward
            combined_life_lost = edge.life_lost or parent_edge.life_lost
            if not parent.contains_neighbor(edge.end):
              combined_edge = self._graph.get_edge(parent.abstract_state,
                                                   edge.end.abstract_state,
                                                   combined_degree,
                                                   combined_reward,
                                                   combined_life_lost)
              bfs_queue.append(combined_edge)

      # Forwards
      bfs_queue = deque([new_edge])
      visited = set([new_edge.end, new_edge.start])
      while len(bfs_queue) > 0:
        edge = bfs_queue.popleft()
        if edge.degree >= max_edge_degree:
          break

        for neighbor_edge in edge.end.neighbors:
          neighbor = neighbor_edge.end
          # Only traverse degree 1 edges to preserve BFS property
          if neighbor_edge.degree == 1 and neighbor not in visited:
            visited.add(neighbor)
            combined_degree = edge.degree + neighbor_edge.degree
            combined_reward = edge.reward + neighbor_edge.reward
            combined_life_lost = edge.life_lost or neighbor_edge.life_lost
            if not neighbor.contains_parent(edge.start):
              combined_edge = self._graph.get_edge(edge.start.abstract_state,
                                                   neighbor.abstract_state,
                                                   combined_degree,
                                                   combined_reward,
                                                   combined_life_lost)
              bfs_queue.append(combined_edge)

      ##########################
      # END HACK
      ##########################

    edge_callbacks = {
        (DirectedEdge.EVALUATING, DirectedEdge.TRAINING): eval_to_train,
        (DirectedEdge.EVALUATING, DirectedEdge.RELIABLE): eval_to_reliable,
    }
    self._graph = AbstractGraph.from_config(abstract_graph_config,
                                            AS.AbstractState(start_state),
                                            edge_callbacks, new_edge_callback,
                                            domain)

    self._start_node = self._graph.get_node(AS.AbstractState(start_state))

    self._runner_config = runner_config
    # runners[i] is None when previous episode[i] terminated
    self._runners = [None for _ in range(num_parallel)]
Code Example #18
 def start_abstract_state(self):
     ram = np.zeros(128)
     ram[[97, 105, 1, 113]] = [39, 30, 18, 31]
     state = State(ram, None)
     return AS.AbstractState(state)
Code Example #19
  def __init__(self, start_state, edge_window_size, traverse_threshold,
               min_visit_count, max_edge_degree, edge_callbacks,
               new_edge_callback):
    super(OracleGraph,
          self).__init__(start_state, edge_window_size, traverse_threshold,
                         min_visit_count, max_edge_degree, edge_callbacks,
                         new_edge_callback)

    # Oracle nodes and edges for the first room
    # AbstractStates in order
    ROOM_NUMBER = 1
    path = [
        (70, 220),  # down ladder (x, y)
        (70, 210),  # down ladder
        (70, 200),  # down ladder
        (70, 190),  # bottom of ladder
        (80, 190),  # go right
        (90, 190),  # jump across rope
        (100, 190),  # on rope
        (110, 190),  # jump across rope
        (120, 190),  # on ledge
        (130, 180),  # down ladder
        (130, 170),  # down ladder
        (130, 160),  # down ladder
        (130, 150),  # bottom of ladder
        (120, 150),  # left
        (110, 150),  # left
        (100, 150),  # left
        (90, 150),  # left
        (80, 150),  # left
        (70, 150),  # left
        (60, 150),  # left
        (50, 150),  # left
        (40, 150),  # left
        (30, 150),  # left
        (20, 150),  # bottom of ladder
        (20, 160),  # up ladder
        (20, 170),  # up ladder
        (20, 180),  # up ladder
        (10, 180),  # left
        (10, 190),  # jump
        (10, 200),  # jump
    ]

    key_state = (10, 210, ROOM_NUMBER, 2, 14)

    # revisit the states in order except getting key
    backward_path = list(reversed(path)) + [
        (80, 240),  # right
        (90, 240),  # jump gap
        (100, 240),  # right gap
        (110, 240),  # right
        (120, 240),  # right
    ]

    open_door = (130, 240, ROOM_NUMBER, 0, 10)

    # Add (room #, inv mask, inv)
    for i, s in enumerate(path):
      path[i] = path[i] + (ROOM_NUMBER, 0, 15)

    for i, s in enumerate(backward_path):
      backward_path[i] = backward_path[i] + (ROOM_NUMBER, 2, 14)

    path = path + [key_state] + backward_path + [open_door]

    prev_node = self._start_node
    for i, state in enumerate(path):
      ram = np.zeros(128).astype(np.uint8)
      ram[np.array((42, 43, 3, 65, 66))] = state
      ram[42] += 10
      abstract_state = AS.AbstractState(State(ram_state=ram))
      curr_node = self._nodes[abstract_state] = AbstractNode(
          abstract_state, self._min_visit_count, self._uid_count)
      self._uid_count += 1
      edge = DirectedEdge(
          prev_node,
          curr_node,
          self._edge_window_size,
          self._edge_window_size,
          self._traverse_threshold,
          self._callbacks,
          degree=1)
      prev_node.add_neighbor(edge)
      curr_node.add_parent(edge)
      prev_node = curr_node
Code Example #20
 def _step(self, action):
     next_state, reward, done, info = super(BeamWrapper, self)._step(action)
     if AS.AbstractState(next_state).room_number != 7:
         done = True
     return next_state, reward, done, info
Code Example #21
 def goal_abstract_state(self):
     ram = np.zeros(128)
     ram[[97, 105, 1, 113]] = [117, 30, 18, 31]
     goal_state = State(ram, None)
     return AS.AbstractState(goal_state)
Code Example #22
 def _reset(self):
     state = self.env.reset()
     self._prev_abstract_state = AS.AbstractState(state)
     return state
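
As a final illustration of how these pieces fit together: the start/goal abstract states from Examples #3/#4, #7/#8, and #18/#21 can be turned into a done rule for the wrapper in Example #9, since AS.AbstractState supports equality comparison (Example #10). The helper below is hypothetical; only the rule signature and the equality check come from the snippets above.

def make_goal_done_rule(goal_abstract_state):
    # Hypothetical helper: end the episode once the agent's abstract state
    # equals the goal abstract state. The returned callable matches the
    # rule(prev_abstract_state, abstract_state) signature consumed by the
    # done rules in Example #9.
    def rule(prev_abstract_state, abstract_state):
        return abstract_state == goal_abstract_state
    return rule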