def perform_validation_step(self, task_index):
    """
    Perform validation episodes for the task at index task_index.

    Args:
        task_index: task index

    Returns:
        (validation rewards, trace lengths, failed program indices from the last episode)
    """
    validation_rewards = []
    traces_lengths = []
    for _ in range(self.num_validation_episodes):
        # Start new episode
        mcts = MCTS(self.policy, self.env, task_index, **self.mcts_test_params)

        # Sample an execution trace with mcts using policy as a prior
        trace = mcts.sample_execution_trace()
        task_reward, trace_length, progs_failed_indices = trace[7], len(trace[3]), trace[10]

        validation_rewards.append(task_reward)
        traces_lengths.append(trace_length)

    # Note: only the failed program indices of the last validation episode are returned
    return validation_rewards, traces_lengths, progs_failed_indices
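# Hedged sketch (not in the original code): a small convenience wrapper showing how the
# three values returned by perform_validation_step could be summarised. The method name
# and the success criterion (reward > 0, as in the validation script further below) are
# assumptions for illustration only.
def summarize_validation(self, task_index):
    rewards, lengths, failed_progs = self.perform_validation_step(task_index)
    n = max(len(rewards), 1)
    return {
        'mean_reward': sum(rewards) / n,
        'success_rate': sum(1.0 for r in rewards if r > 0) / n,
        'mean_trace_length': sum(lengths) / max(len(lengths), 1),
        'failed_programs_last_episode': failed_progs,
    }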
def play_iteration(self, task_index, verbose=False):
    """
    Play one training iteration, i.e. select a task, play episodes, store experience in buffer
    and sample batches to perform gradient descent on policy weights.
    """

    # Get new task to attempt
    task_name = self.env.get_program_from_index(task_index)
    if self.verbose:
        print('Attempt task {} (length {}) for {} episodes'.format(
            task_name, self.env.length, self.num_episodes_per_task))

    # Start training on the task
    for episode in range(self.num_episodes_per_task):
        if self.verbose:
            print('=> Episode: %d' % episode)

        # Start new episode
        mcts = MCTS(self.policy, self.env, task_index, **self.mcts_train_params)

        # Sample an execution trace with mcts using policy as a prior
        res = mcts.sample_execution_trace()
        observations, prog_indices, previous_actions_indices, policy_labels, lstm_states, _, _, \
            task_reward, clean_sub_execution, rewards, programs_failed_indices, \
            programs_failed_initstates = res

        if self.verbose:
            print("Task reward:")
            print(task_reward)
            print("Rewards:")
            print(rewards)

        # Record the trace and store it in the buffer only if no sub-program execution failed
        if clean_sub_execution:
            # Generate trace
            trace = list(zip(observations, prog_indices, lstm_states, policy_labels, rewards))
            # Append trace to buffer
            self.buffer.append_trace(trace)
        else:
            if self.verbose:
                print("Trace has not been stored in buffer.")
            # Decrease statistics of programs that failed
            # for idx in programs_failed_indices:
            #     self.curriculum_scheduler.update_statistics(idx, torch.FloatTensor([0.0]))

        # Train policy on batches sampled from the replay buffer
        if self.buffer.get_memory_length() > self.batch_size:
            for _ in range(self.num_updates_per_episode):
                batch = self.buffer.sample_batch(self.batch_size)
                if batch is not None:
                    self.policy.train_on_batch(batch)

        if verbose:
            print("Done episode {}/{}".format(episode + 1, self.num_episodes_per_task))
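# Hedged sketch (not in the original repository): a minimal outer loop tying
# play_iteration and perform_validation_step together. The `trainer` and
# `curriculum_scheduler` objects, and the scheduler methods used below, are
# assumptions for illustration only.
def run_curriculum(trainer, curriculum_scheduler, num_iterations):
    for _ in range(num_iterations):
        # Let the scheduler choose which task to train on next
        task_index = curriculum_scheduler.get_next_task_index()
        # Train on that task for num_episodes_per_task episodes
        trainer.play_iteration(task_index)
        # Validate on the same task and feed the mean reward back to the scheduler,
        # which can then adjust how often each task is attempted
        validation_rewards, _, _ = trainer.perform_validation_step(task_index)
        mean_reward = sum(validation_rewards) / max(len(validation_rewards), 1)
        curriculum_scheduler.update_statistics(task_index, mean_reward)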
mcts_test_params = {
    'max_depth_dict': max_depth_dict,
    'temperature': conf.temperature,
    'c_puct': conf.c_puct,
    'exploit': True,
    'level_closeness_coeff': conf.level_closeness_coeff,
    'gamma': conf.gamma
}

mcts_rewards = []
mcts_rewards_normalized = []
network_only_rewards = []

for _ in range(40):
    # 'len' holds the list length used for this round of testing
    env = ListEnv(length=len, encoding_dim=conf.encoding_dim)
    bubblesort_index = env.programs_library['BUBBLESORT']['index']

    # Test with mcts
    mcts = MCTS(policy, env, bubblesort_index, **mcts_test_params)
    res = mcts.sample_execution_trace()
    mcts_reward = res[7]
    mcts_rewards.append(mcts_reward)
    if mcts_reward > 0:
        mcts_rewards_normalized.append(1.0)
    else:
        mcts_rewards_normalized.append(0.0)

    # Test with network alone
    network_only = NetworkOnly(policy, env, max_depth_dict)
    netonly_reward, _ = network_only.play(bubblesort_index)
    network_only_rewards.append(netonly_reward)

mcts_rewards_normalized_mean = np.mean(np.array(mcts_rewards_normalized))
mcts_rewards_mean = np.mean(np.array(mcts_rewards))
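# Hedged continuation sketch (not shown in the excerpt above): the rewards obtained by the
# network alone would presumably be averaged in the same way as the MCTS rewards; the
# variable name below is an assumption for illustration.
network_only_rewards_mean = np.mean(np.array(network_only_rewards))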