Example 1
    def play_game(self, game):

        # Follow the visit-count temperature schedule only when no fixed
        # per-actor temperature has been configured.
        if self.config.fixed_temperatures is None:
            self.temperature = self.config.visit_softmax_temperature(
                self.training_step)

        while not game.terminal:
            root = Node(0)

            current_observation = np.float32(game.get_observation(-1))
            if self.config.norm_obs:
                current_observation = (current_observation -
                                       self.obs_min) / self.obs_range
            current_observation = torch.from_numpy(current_observation).to(
                self.device)

            # Initial inference: encode the observation into a hidden state and
            # get the network's value estimate and policy priors for the root.
            initial_inference = self.network.initial_inference(
                current_observation.unsqueeze(0))

            # Expand the root over the legal actions and mix Dirichlet noise
            # into the root priors for exploration.
            legal_actions = game.environment.legal_actions()
            root.expand(initial_inference, game.to_play, legal_actions)
            root.add_exploration_noise(self.config.root_dirichlet_alpha,
                                       self.config.root_exploration_fraction)

            self.mcts.run(root, self.network)

            # Record how far the searched root value deviates from the raw
            # network value estimate.
            error = root.value() - initial_inference.value.item()
            game.history.errors.append(error)

            action = self.config.select_action(root, self.temperature)

            game.apply(action)
            game.store_search_statistics(root)

            self.experiences_collected += 1

            if self.experiences_collected % self.config.weight_sync_frequency == 0:
                self.sync_weights()

            # Periodically flush a slice of the game history to the replay buffer.
            # Unless the previous slice ended an episode, keep an overlap of
            # unroll + TD steps before the cut so targets near the boundary can
            # still be computed; the overlap is marked to be ignored when sampling.
            save_history = (
                game.history_idx -
                game.previous_collect_to) == self.config.max_history_length
            if save_history or game.done or game.terminal:
                overlap = self.config.num_unroll_steps + self.config.td_steps
                if not game.history.dones[game.previous_collect_to - 1]:
                    collect_from = max(0, game.previous_collect_to - overlap)
                else:
                    collect_from = game.previous_collect_to
                history = game.get_history_sequence(collect_from)
                ignore = overlap if not game.done else None
                self.replay_buffer.save_history.remote(history,
                                                       ignore=ignore,
                                                       terminal=game.terminal)

            # Truncate episodes that hit the configured step limit.
            if game.step >= self.config.max_steps:
                self.environment.was_real_done = True
                break

        if self.config.two_players:
            self.stats_to_log[game.info["result"]] += 1
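Example 1 and Example 3 both rely on a small Node class (Node(0), root.expand(...), root.add_exploration_noise(...), root.value()) that is not shown above. The snippet below is a minimal sketch of such a node, modeled on the published MuZero pseudocode; the field names, the policy_logits indexing, and the body of expand are assumptions, not the actual class used by these examples.

import numpy as np

class Node:
    def __init__(self, prior):
        self.prior = prior          # prior probability from the policy head
        self.visit_count = 0
        self.value_sum = 0.0
        self.to_play = -1
        self.reward = 0.0
        self.hidden_state = None
        self.children = {}          # maps action -> child Node

    def expanded(self):
        return len(self.children) > 0

    def value(self):
        # Mean of the values backed up through this node.
        return self.value_sum / self.visit_count if self.visit_count else 0.0

    def expand(self, network_output, to_play, legal_actions):
        # Store the hidden state and create one child per legal action,
        # weighted by the re-normalized policy prior.
        self.to_play = to_play
        self.hidden_state = network_output.hidden_state
        priors = {a: np.exp(network_output.policy_logits[0][a].item())
                  for a in legal_actions}
        total = sum(priors.values())
        for action, p in priors.items():
            self.children[action] = Node(p / total)

    def add_exploration_noise(self, dirichlet_alpha, exploration_fraction):
        # Mix Dirichlet noise into the root priors to encourage exploration.
        actions = list(self.children.keys())
        noise = np.random.dirichlet([dirichlet_alpha] * len(actions))
        frac = exploration_fraction
        for a, n in zip(actions, noise):
            self.children[a].prior = self.children[a].prior * (1 - frac) + n * frac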
Example 2
def play_game(config: MuZeroConfig, network: Network) -> Game:
    game = Game.from_config(config)

    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        last_observation = game.make_image(-1)
        root.expand(game.to_play(), game.legal_actions(),
                    network.initial_inference(last_observation).numpy())
        root.add_exploration_noise(config)

        # logging.debug('Running MCTS on step {}.'.format(len(game.history)))
        # We then run a Monte Carlo Tree Search using only action sequences and the
        # model learned by the network.
        run_mcts(config, root, game.action_history(), network)
        action = root.select_action(config, len(game.history), network)
        game.apply(action)
        game.store_search_statistics(root)

    logging.info('Finished episode at step {} | cumulative reward: {}'.format(
        len(game.obs_history), sum(game.rewards)))

    return game
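In the reference pseudocode, play_game is driven by a self-play loop along these lines; SharedStorage and ReplayBuffer are the pseudocode's own names, and any concrete implementation may wire this up differently.

def run_selfplay(config: MuZeroConfig, storage: SharedStorage,
                 replay_buffer: ReplayBuffer):
    # Each self-play actor repeatedly fetches the latest network weights,
    # plays one full game, and stores it in the replay buffer.
    while True:
        network = storage.latest_network()
        game = play_game(config, network)
        replay_buffer.save_game(game)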
Example 3
  def play_game(self, environment):
    assert self.network is not None, ".load_network() needs to be called before playing."

    game = self.config.new_game(environment)

    if self.config.save_mcts:
      path_to_mcts_folder = os.path.split(os.path.normpath(self.config.saves_dir))[0]
      path_to_mcts_folder = os.path.join(path_to_mcts_folder, 'mcts')
      os.makedirs(path_to_mcts_folder, exist_ok=True)

    if self.config.save_gif_as:
      path_to_gif_folder = os.path.split(os.path.normpath(self.config.saves_dir))[0]
      path_to_gif_folder = os.path.join(path_to_gif_folder, 'gifs')
      os.makedirs(path_to_gif_folder, exist_ok=True)

    frames = []
    game.pred_values = []
    game.pred_rewards = []
    game.search_depths = []
    while not game.terminal:
      root = Node(0)

      current_observation = np.float32(game.get_observation(-1))
      if self.config.norm_obs:
        current_observation = (current_observation - self.obs_min) / self.obs_range
      current_observation = torch.from_numpy(current_observation).to(self.device)

      initial_inference = self.network.initial_inference(current_observation.unsqueeze(0))
      
      legal_actions = game.environment.legal_actions()
      root.expand(initial_inference, game.to_play, legal_actions)

      if self.config.use_exploration_noise:
        root.add_exploration_noise(self.config.root_dirichlet_alpha, self.config.root_exploration_fraction)

      actions_to_apply, corresponding_rewards = [], []
      if self.config.only_prior:
        # Baseline: act greedily on the network's prior policy, with no search.
        _, action = max([(child.prior, action) for action, child in root.children.items()])
        reward = self.network.recurrent_inference(root.hidden_state, [action]).reward.item()
        actions_to_apply.append(action)
        corresponding_rewards.append(reward)
        root.children[action].visit_count += 1
        game.search_depths.append([0])

      elif self.config.only_value:
        # Baseline: one-step lookahead; pick the action maximizing the predicted
        # reward plus the discounted value of the next state (value negated for
        # the opponent's turn in two-player games).
        q_values = []
        max_q_val = -np.inf
        for action in root.children.keys():
          output = self.network.recurrent_inference(root.hidden_state, [action])
          if self.config.two_players:
            q_val = (output.reward - self.config.discount * output.value).item()
          else:
            q_val = (output.reward + self.config.discount * output.value).item()
          if q_val > max_q_val:
            max_q_val = q_val
            chosen_action = action
            reward = output.reward.item()
          root.children[action].visit_count += 1

        actions_to_apply.append(chosen_action)
        corresponding_rewards.append(reward)
        game.search_depths.append([1])

      else:
        # Full MCTS: run the search and record the depth of every simulation path.
        search_paths = self.mcts.run(root, self.network)
        search_depths = [len(search_path) for search_path in search_paths]
        game.search_depths.append(search_depths)

        if self.config.save_mcts and game.step >= self.config.save_mcts_after_step:
          path_to_file = os.path.join(path_to_mcts_folder, str(game.step) + '.png')
          write_mcts_as_png(search_paths, path_to_file=path_to_file)

        # Walk down the search tree, selecting up to config.apply_mcts_actions
        # consecutive actions from the same search.
        node = root
        actions_applied = 0
        while node.expanded():
          action = self.config.select_action(node, temperature=self.config.temperature)
          reward = node.children[action].reward
          node = node.children[action]

          actions_to_apply.append(action)
          corresponding_rewards.append(reward)
          actions_applied += 1

          if actions_applied == self.config.apply_mcts_actions:
            break

      game.pred_values.append(initial_inference.value.item())
      game.store_search_statistics(root)
      
      for action, reward in zip(actions_to_apply, corresponding_rewards):
        game.pred_rewards.append(reward)
        if self.config.two_players:
          if game.to_play == self.config.random_opp:
            action = np.random.choice(legal_actions)
          elif game.to_play == self.config.human_opp:
            print("waiting for your input: {}".format(legal_actions))
            action = int(input())
            while action not in legal_actions:
              print("invalid action, choose again!")
              action = int(input())
          to_play = game.to_play

        game.apply(action)

        if self.config.verbose:
          prior_policy = [round(child.prior, 2) for child in root.children.values()]
          sum_visits = sum(child.visit_count for child in root.children.values())
          mcts_policy = [round(child.visit_count/sum_visits, 2) for child in root.children.values()]
          print("\nstep {}".format(game.step))
          print("   legal actions: {}".format(list(legal_actions)))
          print("   prior policy:  {}".format(prior_policy))
          print("   mcts policy:   {}".format(mcts_policy))
          print("   prior value:    {}".format(round(game.pred_values[-1], 2)))
          print("   mcts value:    {}".format(round(root.value(), 2)))

        if self.config.render:
          try:
            frame = game.environment.unwrapped._get_image()
            self.viewer.imshow(frame)
          except Exception:
            frame = game.environment.render(mode='rgb_array')
          frames.append(frame)

          if self.config.sleep:
            time.sleep(self.config.sleep)

        if game.terminal or game.step >= self.config.max_steps:
          environment.was_real_done = True
          game.terminal = True
          if self.config.two_players:
            if to_play in [self.config.random_opp, self.config.human_opp]:
              game.history.rewards[-1] *= -1
          break

    msg = "\033[92m[Game done]\033[0m --> "
    msg += "length: {:.1f}, return: {:.1f}, pred return: {:.1f}, pred value: {:.1f}, mcts value: {:.1f}"
    print(msg.format(game.step, np.sum(game.history.rewards), np.sum(game.pred_rewards),
                     np.mean(game.pred_values), np.mean(game.history.root_values)))

    if self.config.save_gif_as and frames:
      filename = self.config.save_gif_as + '.gif'
      self.save_frames_as_gif(frames, path_to_gif_folder, filename)

    return game
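The action-selection helpers used above (self.config.select_action(root, self.temperature) in Example 1 and self.config.select_action(node, temperature=...) in Example 3) are not shown. A common implementation samples from the root's visit counts raised to 1/temperature, roughly as in this sketch; the function name and signature here are assumptions, not the configs' actual API.

import numpy as np

def select_action(node, temperature):
    # temperature == 0 -> greedy argmax over visit counts;
    # otherwise sample proportionally to visit_count ** (1 / temperature).
    actions = list(node.children.keys())
    visit_counts = np.array([node.children[a].visit_count for a in actions],
                            dtype=np.float64)
    if temperature == 0:
        return actions[int(np.argmax(visit_counts))]
    distribution = visit_counts ** (1.0 / temperature)
    distribution /= distribution.sum()
    return actions[np.random.choice(len(actions), p=distribution)]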