def play_game(self, game):
    if self.config.fixed_temperatures is not None:
        self.temperature = self.config.visit_softmax_temperature(self.training_step)

    while not game.terminal:
        # Expand the root from the current (optionally normalized) observation.
        root = Node(0)
        current_observation = np.float32(game.get_observation(-1))
        if self.config.norm_obs:
            current_observation = (current_observation - self.obs_min) / self.obs_range
        current_observation = torch.from_numpy(current_observation).to(self.device)

        initial_inference = self.network.initial_inference(current_observation.unsqueeze(0))
        legal_actions = game.environment.legal_actions()
        root.expand(initial_inference, game.to_play, legal_actions)
        root.add_exploration_noise(self.config.root_dirichlet_alpha,
                                   self.config.root_exploration_fraction)

        # Run MCTS and track the gap between the searched and predicted root value.
        self.mcts.run(root, self.network)
        error = root.value() - initial_inference.value.item()
        game.history.errors.append(error)

        action = self.config.select_action(root, self.temperature)
        game.apply(action)
        game.store_search_statistics(root)

        self.experiences_collected += 1
        if self.experiences_collected % self.config.weight_sync_frequency == 0:
            self.sync_weights()

        # Periodically ship a slice of history (with unroll/TD overlap) to the replay buffer.
        save_history = (game.history_idx - game.previous_collect_to) == self.config.max_history_length
        if save_history or game.done or game.terminal:
            overlap = self.config.num_unroll_steps + self.config.td_steps
            if not game.history.dones[game.previous_collect_to - 1]:
                collect_from = max(0, game.previous_collect_to - overlap)
            else:
                collect_from = game.previous_collect_to
            history = game.get_history_sequence(collect_from)
            ignore = overlap if not game.done else None
            self.replay_buffer.save_history.remote(history, ignore=ignore, terminal=game.terminal)

        if game.step >= self.config.max_steps:
            self.environment.was_real_done = True
            break

    if self.config.two_players:
        self.stats_to_log[game.info["result"]] += 1
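# Usage sketch (hypothetical, not from this codebase): a self-play actor could
# drive the method above with a loop like the one below. `new_game()` and
# `training_steps` are assumed names; only `play_game`, `self.environment`,
# and `self.training_step` appear in the original code.
def run_actor_loop(self):
    while self.training_step < self.config.training_steps:
        game = self.config.new_game(self.environment)  # fresh episode
        self.play_game(game)                           # streams history to the replay buffer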
def play_game(config: MuZeroConfig, network: Network) -> Game:
    game = Game.from_config(config)

    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        last_observation = game.make_image(-1)
        root.expand(game.to_play(), game.legal_actions(),
                    network.initial_inference(last_observation).numpy())
        root.add_exploration_noise(config)
        # logging.debug('Running MCTS on step {}.'.format(len(game.history)))

        # We then run a Monte Carlo Tree Search using only action sequences and the
        # model learned by the network.
        run_mcts(config, root, game.action_history(), network)
        action = root.select_action(config, len(game.history), network)
        game.apply(action)
        game.store_search_statistics(root)

    logging.info('Finished episode at step {} | cumulative reward: {}'
                 .format(len(game.obs_history), sum(game.rewards)))
    return game
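# Usage sketch (not part of the excerpt above): in the MuZero pseudocode,
# play_game is driven by a self-play loop that repeatedly fetches the latest
# network and stores finished games in a replay buffer. SharedStorage.latest_network()
# and ReplayBuffer.save_game() are assumed to match that pseudocode; adapt the
# names if this codebase differs.
def run_selfplay(config: MuZeroConfig, storage: SharedStorage,
                 replay_buffer: ReplayBuffer):
    while True:
        network = storage.latest_network()   # most recent training checkpoint
        game = play_game(config, network)    # generate one full episode
        replay_buffer.save_game(game)        # hand the episode to the learner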
def play_game(self, environment):
    assert self.network is not None, ".load_network() needs to be called before playing."
    game = self.config.new_game(environment)

    if self.config.save_mcts:
        path_to_mcts_folder = os.path.split(os.path.normpath(self.config.saves_dir))[0]
        path_to_mcts_folder = os.path.join(path_to_mcts_folder, 'mcts')
        os.makedirs(path_to_mcts_folder, exist_ok=True)

    if self.config.save_gif_as:
        path_to_gif_folder = os.path.split(os.path.normpath(self.config.saves_dir))[0]
        path_to_gif_folder = os.path.join(path_to_gif_folder, 'gifs')
        os.makedirs(path_to_gif_folder, exist_ok=True)
    frames = []

    game.pred_values = []
    game.pred_rewards = []
    game.search_depths = []

    while not game.terminal:
        # Expand the root from the current (optionally normalized) observation.
        root = Node(0)
        current_observation = np.float32(game.get_observation(-1))
        if self.config.norm_obs:
            current_observation = (current_observation - self.obs_min) / self.obs_range
        current_observation = torch.from_numpy(current_observation).to(self.device)

        initial_inference = self.network.initial_inference(current_observation.unsqueeze(0))
        legal_actions = game.environment.legal_actions()
        root.expand(initial_inference, game.to_play, legal_actions)
        if self.config.use_exploration_noise:
            root.add_exploration_noise(self.config.root_dirichlet_alpha,
                                       self.config.root_exploration_fraction)

        actions_to_apply, corresponding_rewards = [], []
        if self.config.only_prior:
            # Act greedily with respect to the prior policy only.
            _, action = max([(child.prior, action) for action, child in root.children.items()])
            reward = self.network.recurrent_inference(root.hidden_state, [action]).reward.item()
            actions_to_apply.append(action)
            corresponding_rewards.append(reward)
            root.children[action].visit_count += 1
            game.search_depths.append([0])
        elif self.config.only_value:
            # Act greedily with respect to one-step lookahead values only.
            q_values = []
            max_q_val = -np.inf
            for action in root.children.keys():
                output = self.network.recurrent_inference(root.hidden_state, [action])
                if self.config.two_players:
                    q_val = (output.reward - self.config.discount * output.value).item()
                else:
                    q_val = (output.reward + self.config.discount * output.value).item()
                if q_val > max_q_val:
                    max_q_val = q_val
                    chosen_action = action
                    reward = output.reward.item()
                root.children[action].visit_count += 1
            actions_to_apply.append(chosen_action)
            corresponding_rewards.append(reward)
            game.search_depths.append([1])
        else:
            # Full MCTS: run the search, then walk down the tree to pick actions.
            search_paths = self.mcts.run(root, self.network)

            search_depths = [len(search_path) for search_path in search_paths]
            game.search_depths.append(search_depths)

            if self.config.save_mcts and game.step >= self.config.save_mcts_after_step:
                path_to_file = os.path.join(path_to_mcts_folder, str(game.step) + '.png')
                write_mcts_as_png(search_paths, path_to_file=path_to_file)

            node = root
            actions_applied = 0
            while node.expanded():
                action = self.config.select_action(node, temperature=self.config.temperature)
                reward = node.children[action].reward
                node = node.children[action]
                actions_to_apply.append(action)
                corresponding_rewards.append(reward)
                actions_applied += 1
                if actions_applied == self.config.apply_mcts_actions:
                    break

        game.pred_values.append(initial_inference.value.item())
        game.store_search_statistics(root)

        for action, reward in zip(actions_to_apply, corresponding_rewards):
            game.pred_rewards.append(reward)

            if self.config.two_players:
                # Optionally let a random or human opponent override the action.
                if game.to_play == self.config.random_opp:
                    action = np.random.choice(legal_actions)
                elif game.to_play == self.config.human_opp:
                    print("waiting for your input: {}".format(legal_actions))
                    action = int(input())
                    while action not in legal_actions:
                        print("invalid action, choose again!")
                        action = int(input())
                to_play = game.to_play

            game.apply(action)

            if self.config.verbose:
                prior_policy = [round(child.prior, 2) for child in root.children.values()]
                sum_visits = sum(child.visit_count for child in root.children.values())
                mcts_policy = [round(child.visit_count / sum_visits, 2) for child in root.children.values()]
                print("\nstep {}".format(game.step))
                print("  legal actions: {}".format(list(legal_actions)))
                print("  prior policy: {}".format(prior_policy))
                print("  mcts policy: {}".format(mcts_policy))
                print("  prior value: {}".format(round(game.pred_values[-1], 2)))
                print("  mcts value: {}".format(round(root.value(), 2)))

            if self.config.render:
                try:
                    frame = game.environment.unwrapped._get_image()
                    self.viewer.imshow(frame)
                except:
                    frame = game.environment.render(mode='rgb_array')
                frames.append(frame)
                if self.config.sleep:
                    time.sleep(self.config.sleep)

            if game.terminal or game.step >= self.config.max_steps:
                environment.was_real_done = True
                game.terminal = True
                if self.config.two_players:
                    if to_play in [self.config.random_opp, self.config.human_opp]:
                        game.history.rewards[-1] *= -1
                break

    msg = "\033[92m[Game done]\033[0m --> "
    msg += "length: {:.1f}, return: {:.1f}, pred return: {:.1f}, pred value: {:.1f}, mcts value: {:.1f}"
    print(msg.format(game.step, np.sum(game.history.rewards), np.sum(game.pred_rewards),
                     np.mean(game.pred_values), np.mean(game.history.root_values)))

    if self.config.save_gif_as and frames:
        filename = self.config.save_gif_as + '.gif'
        self.save_frames_as_gif(frames, path_to_gif_folder, filename)

    return game
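# Usage sketch (hypothetical): evaluating a loaded network over several episodes
# with the method above and reporting the mean return. `num_episodes` is an
# assumed parameter; `game.history.rewards` and `np` come from the code above.
def evaluate(self, environment, num_episodes=10):
    returns = []
    for _ in range(num_episodes):
        game = self.play_game(environment)
        returns.append(float(np.sum(game.history.rewards)))
    return float(np.mean(returns))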