def run_evaluation(self):
    current_network = ChessNetwork(self.net_name)
    try:
        current_network.load(version='current')
    except ValueError:
        logger.fatal('Cannot evaluate a model without at least '
                     'a "current" version.')
        raise AssertionError('No current version of network.')

    nextgen_network = ChessNetwork(self.net_name)
    try:
        nextgen_network.load(version='nextgen', ckpt=self.ckpt)
    except ValueError:
        logger.warning('No nextgen version of this model - testing '
                       'play against blank slate version.')

    mcts_params = dotdict(n_sims=100,
                          c_base=4.0,
                          c_init=1.0,
                          eps=0.155,
                          resign_threshold=-0.85,
                          temperature=1,
                          use_noise=True)
    c_mcts = MCTS(current_network, params=mcts_params)
    ng_mcts = MCTS(nextgen_network, params=mcts_params)

    env = ChessEnvironment()

    agent_params = dotdict(temp_threshold=0, max_hmoves=50, n_book_moves=5)
    c_version = ChessAgent(c_mcts, env, params=agent_params)
    ng_version = ChessAgent(ng_mcts, env, params=agent_params)

    self.play_game(c_version, ng_version, env)
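# The evaluation routine above relies on a dotdict helper so that parameters can be
# read as attributes (mcts_params.n_sims, agent_params.max_hmoves, ...). Below is a
# minimal sketch of such a helper, assuming it is nothing more than a dict with
# attribute access; the project's actual implementation may differ.
class dotdict(dict):
    """dict subclass exposing keys as attributes (sketch)."""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


# Usage example mirroring the parameters built in run_evaluation above.
_params = dotdict(n_sims=100, c_base=4.0, c_init=1.0)
assert _params.n_sims == 100 and _params.c_base == 4.0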
def perform_validation_step(self, task_index):
    """
    Perform validation steps for the task from index task_index.

    Args:
        task_index: task index

    Returns:
        (validation rewards, traces lengths, indices of failed programs)
    """
    validation_rewards = []
    traces_lengths = []
    for _ in range(self.num_validation_episodes):
        # Start new episode
        mcts = MCTS(self.policy, self.env, task_index, **self.mcts_test_params)

        # Sample an execution trace with mcts using policy as a prior
        trace = mcts.sample_execution_trace()
        task_reward, trace_length, progs_failed_indices = trace[7], len(trace[3]), trace[10]

        validation_rewards.append(task_reward)
        traces_lengths.append(trace_length)

    # Note: progs_failed_indices reflects the last sampled episode only.
    return validation_rewards, traces_lengths, progs_failed_indices
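# A hypothetical caller could reduce the lists returned by perform_validation_step to
# per-task scalars. The helper below is a sketch, not part of the original code; the
# "reward > 0 means solved" convention is assumed from how rewards are normalized in
# the evaluation fragments further down.
import numpy as np

def summarize_validation(validation_rewards, traces_lengths):
    """Sketch: aggregate per-episode validation results into summary statistics."""
    rewards = np.asarray(validation_rewards, dtype=np.float32)
    lengths = np.asarray(traces_lengths, dtype=np.float32)
    return {
        'mean_reward': float(rewards.mean()),
        'success_rate': float((rewards > 0).mean()),
        'mean_trace_length': float(lengths.mean()),
    }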
def play_episode(self):
    # reset env
    obs = self.env.reset()
    env_state = self.env.get_state()
    done = False
    t = 0
    total_reward = 0.0

    mcts = MCTS(self.config)
    root_node = Node(state=env_state,
                     done=False,
                     obs=obs,
                     reward=0,
                     action=None,
                     parent=RootParentNode(env=self.env_creator()),
                     mcts=mcts,
                     depth=0)

    compute_action_times = []
    while not done:
        t += 1
        # compute action choice
        t0 = time.time()
        tree_policy, action, _, root_node = mcts.compute_action(root_node)
        root_node.parent = RootParentNode(env=self.env_creator())
        compute_action_times.append(time.time() - t0)

        # take action
        obs, reward, done, info = self.env.step(action)
        total_reward += reward

    avg_time = np.mean(compute_action_times)
    return t, total_reward, avg_time
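# A minimal sketch of how the episode runner above might be driven for evaluation.
# The `runner` object and the episode count are assumptions; only play_episode()'s
# return signature (steps, total reward, average planning time) is taken from above.
import numpy as np

def evaluate_runner(runner, n_episodes=10):
    """Sketch: play several episodes and average their statistics."""
    lengths, returns, plan_times = [], [], []
    for _ in range(n_episodes):
        t, total_reward, avg_time = runner.play_episode()
        lengths.append(t)
        returns.append(total_reward)
        plan_times.append(avg_time)
    return np.mean(lengths), np.mean(returns), np.mean(plan_times)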
def play_iteration(self, task_index, verbose=False):
    """
    Play one training iteration, i.e. select a task, play episodes, store experience in buffer
    and sample batches to perform gradient descent on policy weights.
    """
    # Get new task to attempt
    task_name = self.env.get_program_from_index(task_index)
    if self.verbose:
        print('Attempt task {} (length {}) for {} episodes'.format(
            task_name, self.env.length, self.num_episodes_per_task))

    # Start training on the task
    for episode in range(self.num_episodes_per_task):
        if self.verbose:
            print('=> Episode: %d' % episode)

        # Start new episode
        mcts = MCTS(self.policy, self.env, task_index, **self.mcts_train_params)

        # Sample an execution trace with mcts using policy as a prior
        res = mcts.sample_execution_trace()
        observations, prog_indices, previous_actions_indices, policy_labels, lstm_states, _, _, \
            task_reward, clean_sub_execution, rewards, programs_failed_indices, \
            programs_failed_initstates = res

        if self.verbose:
            print("Task reward:")
            print(task_reward)
            print("Rewards:")
            print(rewards)

        # Record the trace and store it in the buffer only if all sub-programs executed cleanly
        if clean_sub_execution:
            # Generate trace
            trace = list(zip(observations, prog_indices, lstm_states, policy_labels, rewards))
            # Append trace to buffer
            self.buffer.append_trace(trace)
        else:
            if self.verbose:
                print("Trace has not been stored in buffer.")
            # Decrease statistics of programs that failed
            # for idx in programs_failed_indices:
            #     self.curriculum_scheduler.update_statistics(idx, torch.FloatTensor([0.0]))

        # Train policy on batch
        if self.buffer.get_memory_length() > self.batch_size:
            for _ in range(self.num_updates_per_episode):
                batch = self.buffer.sample_batch(self.batch_size)
                if batch is not None:
                    self.policy.train_on_batch(batch)

        if verbose:
            print("Done episode {}/{}".format(episode + 1, self.num_episodes_per_task))
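# play_iteration above only touches the buffer through append_trace, get_memory_length
# and sample_batch. The class below is a sketch of that interface under the assumption
# of a flat, size-capped transition store; the real buffer may keep whole traces or use
# prioritized sampling instead.
import random

class ReplayBufferSketch:
    def __init__(self, max_length=2000):
        self.memory = []
        self.max_length = max_length

    def append_trace(self, trace):
        # trace: list of (observation, program_index, lstm_state, policy_label, reward)
        # tuples, exactly as zipped together in play_iteration.
        self.memory.extend(trace)
        self.memory = self.memory[-self.max_length:]

    def get_memory_length(self):
        return len(self.memory)

    def sample_batch(self, batch_size):
        # Return None when there is not enough experience yet, matching the
        # "if batch is not None" guard used above.
        if len(self.memory) < batch_size:
            return None
        return random.sample(self.memory, batch_size)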
def make_target(self, state_index: int, num_unroll_steps: int, td_steps: int, model=None, config=None):
    # The value target is the discounted root value of the search tree N steps into the future,
    # plus the discounted sum of all rewards until then.
    target_values, target_rewards, target_policies = [], [], []
    for current_index in range(state_index, state_index + num_unroll_steps + 1):
        bootstrap_index = current_index + td_steps
        if bootstrap_index < len(self.root_values):
            if model is None:
                value = self.root_values[bootstrap_index] * self.discount ** td_steps
            else:
                # Reference : Appendix H => Reanalyze
                # Note : a target network based on recent parameters is used to provide a fresher,
                # stable n-step bootstrapped target for the value function
                obs = self.obs(bootstrap_index)
                obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
                network_output = model.initial_inference(obs)
                value = network_output.value.data.cpu().item() * self.discount ** td_steps
        else:
            value = 0

        for i, reward in enumerate(self.rewards[current_index:bootstrap_index]):
            value += reward * self.discount ** i

        if current_index > 0 and current_index <= len(self.rewards):
            last_reward = self.rewards[current_index - 1]
        else:
            last_reward = 0

        if current_index < len(self.root_values):
            target_values.append(value)
            target_rewards.append(last_reward)

            # Reference : Appendix H => Reanalyze
            # Note : MuZero Reanalyze revisits its past time-steps and re-executes its search using
            # the latest model parameters, potentially resulting in a better quality policy than the
            # original search. This fresh policy is used as the policy target for 80% of updates
            # during MuZero training.
            if model is not None and np.random.random() <= config.revisit_policy_search_rate:
                from core.mcts import MCTS, Node

                root = Node(0)
                obs = self.obs(current_index)
                obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
                network_output = model.initial_inference(obs)
                root.expand(self.to_play(), self.legal_actions(), network_output)
                MCTS(config).run(root, self.action_history(current_index), model)
                self.store_search_stats(root, current_index)

            target_policies.append(self.child_visits[current_index])
        else:
            # States past the end of games are treated as absorbing states.
            target_values.append(0)
            target_rewards.append(last_reward)
            # Note: Target policy is set to 0 so that no policy loss is calculated for them
            target_policies.append([0 for _ in range(len(self.child_visits[0]))])

    return target_values, target_rewards, target_policies
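# The bootstrap logic in make_target is the standard n-step return: discounted rewards
# up to the bootstrap index plus the discounted root value there (or 0 past the end of
# the game). Below is a self-contained sketch of just that computation, with plain lists
# standing in for the game history attributes used above.
def n_step_value_target(rewards, root_values, index, td_steps, discount):
    """Sketch: target = sum_i discount**i * r_(index+i) + discount**td_steps * v_(index+td_steps)."""
    bootstrap_index = index + td_steps
    if bootstrap_index < len(root_values):
        value = root_values[bootstrap_index] * discount ** td_steps
    else:
        value = 0.0
    for i, reward in enumerate(rewards[index:bootstrap_index]):
        value += reward * discount ** i
    return value


# Example: with discount 0.99 and td_steps 2, the target at index 0 is
# r0 + 0.99 * r1 + 0.99**2 * v2.
assert abs(n_step_value_target([1.0, 0.5], [0.9, 0.8, 0.7], 0, 2, 0.99)
           - (1.0 + 0.99 * 0.5 + 0.99 ** 2 * 0.7)) < 1e-9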
def run_selfplay(self):
    '''
    Executes a self-play task. Establishes an Agent in an Environment
    and lets the Agent play a full game of chess against itself. The
    resulting training examples are returned.
    '''
    network = ChessNetwork(name=self.net_name)
    try:
        network.load(version=self.version)
    except ValueError:
        pass

    env = ChessEnvironment()
    search_tree = MCTS(network)
    agent = ChessAgent(search_tree, env)

    exs = agent.play(game_name=f'{self.net_name}_game{self.iteration+1}', save=False)
    return exs
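# A hypothetical driver pooling the examples produced by run_selfplay. Only the fact
# that run_selfplay() returns a list of training examples is taken from above; the
# worker collection and iteration count are assumptions, and parallel execution is
# deliberately left out of this sketch.
def collect_selfplay_examples(workers, n_iterations=1):
    examples = []
    for _ in range(n_iterations):
        for worker in workers:
            # Each worker plays one full self-play game and returns its examples.
            examples.extend(worker.run_selfplay())
    return examples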
mcts_test_params = {
    'number_of_simulations': conf.number_of_simulations_for_validation,
    'max_depth_dict': max_depth_dict,
    'temperature': conf.temperature,
    'c_puct': conf.c_puct,
    'exploit': True,
    'level_closeness_coeff': conf.level_closeness_coeff,
    'gamma': conf.gamma
}

for _ in range(40):
    env = ListEnv(length=len, encoding_dim=conf.encoding_dim)
    bubblesort_index = env.programs_library['BUBBLESORT']['index']

    # Test with mcts
    mcts = MCTS(policy, env, bubblesort_index, **mcts_test_params)
    res = mcts.sample_execution_trace()
    mcts_reward = res[7]
    mcts_rewards.append(mcts_reward)
    if mcts_reward > 0:
        mcts_rewards_normalized.append(1.0)
    else:
        mcts_rewards_normalized.append(0.0)

    # Test with network alone
    network_only = NetworkOnly(policy, env, max_depth_dict)
    netonly_reward, _ = network_only.play(bubblesort_index)
    network_only_rewards.append(netonly_reward)

mcts_rewards_normalized_mean = np.mean(np.array(mcts_rewards_normalized))
env = QuickSortListEnv(length=len_, encoding_dim=conf.encoding_dim, expose_stack=expose_stack,
                       without_partition_update=without_p_upd, sample_from_errors_prob=samp_err_poss,
                       reduced_set=reduced_op_set, recursive_version=recursive_quicksort,
                       expose_pointers_value=do_not_expose_pointer_values)

try:
    operation_index = env.programs_library[args.operation]['index']
except KeyError:
    print("The model analyzed does not have the operation ", args.operation)
    exit(0)

# Test with mcts
mcts = MCTS(policy, env, operation_index, **mcts_test_params)
res = mcts.sample_execution_trace()
mcts_reward = res[7]
mcts_rewards.append(mcts_reward)
if mcts_reward > 0:
    mcts_rewards_normalized.append(1.0)
else:
    mcts_rewards_normalized.append(0.0)

# Test with network alone
network_only = NetworkOnly(policy, env, max_depth_dict)
netonly_reward, _ = network_only.play(operation_index)
network_only_rewards.append(netonly_reward)

mcts_rewards_normalized_mean = np.mean(np.array(mcts_rewards_normalized))
mcts_rewards_mean = np.mean(np.array(mcts_rewards))
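# A small reporting sketch for the reward lists accumulated in the two evaluation
# fragments above, comparing MCTS-guided search against the network acting alone.
# The helper name is an assumption; it only reads the lists built above.
import numpy as np

def report_rewards(mcts_rewards, mcts_rewards_normalized, network_only_rewards):
    print('MCTS mean reward:         %.3f' % np.mean(mcts_rewards))
    print('MCTS success rate:        %.3f' % np.mean(mcts_rewards_normalized))
    print('Network-only mean reward: %.3f' % np.mean(network_only_rewards))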