Example #1
    def perform_validation_step(self, task_index):
        """
        Perform validation episodes for the task at index task_index.

        Args:
            task_index: index of the task to validate

        Returns:
            (validation rewards, trace lengths, indices of failed sub-programs)

        """
        validation_rewards = []
        traces_lengths = []
        # Indices of sub-programs that failed, accumulated over all validation episodes
        progs_failed_indices = []
        for _ in range(self.num_validation_episodes):
            # Start new episode
            mcts = MCTS(self.policy, self.env, task_index,
                        **self.mcts_test_params)

            # Sample an execution trace with mcts using policy as a prior
            trace = mcts.sample_execution_trace()
            task_reward = trace[7]
            trace_length = len(trace[3])
            progs_failed_indices += trace[10]

            validation_rewards.append(task_reward)
            traces_lengths.append(trace_length)
        return validation_rewards, traces_lengths, progs_failed_indices
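A minimal usage sketch for the validation step above; the `trainer` object, the `validation_task_indices` list and the reporting format are illustrative assumptions, not part of the original code.

import numpy as np

# Hypothetical driver loop: run the validation episodes for a few tasks and report averages.
for task_index in validation_task_indices:
    rewards, lengths, failed_indices = trainer.perform_validation_step(task_index)
    mean_reward = np.mean(rewards) if rewards else 0.0
    mean_length = np.mean(lengths) if lengths else 0.0
    print('task {}: mean reward {:.2f}, mean trace length {:.1f}, {} failed sub-program(s)'.format(
        task_index, mean_reward, mean_length, len(failed_indices)))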
Example #2
    def play_iteration(self, task_index, verbose=False):
        """
        Play one training iteration on the given task: play episodes, store the resulting
        experience in the replay buffer and sample batches to perform gradient descent on
        the policy weights.

        Args:
            task_index: index of the task to attempt
            verbose: if True, print a progress message after each episode

        """

        # Get new task to attempt
        task_name = self.env.get_program_from_index(task_index)
        if self.verbose:
            print('Attempt task {} (length {}) for {} episodes'.format(
                task_name, self.env.length, self.num_episodes_per_task))

        # Start training on the task
        for episode in range(self.num_episodes_per_task):
            if self.verbose:
                print('=> Episode: %d' % (episode))

            # Start new episode
            mcts = MCTS(self.policy, self.env, task_index,
                        **self.mcts_train_params)

            # Sample an execution trace with mcts using policy as a prior
            res = mcts.sample_execution_trace()
            (observations, prog_indices, previous_actions_indices, policy_labels, lstm_states, _, _,
             task_reward, clean_sub_execution, rewards, programs_failed_indices,
             programs_failed_initstates) = res

            if self.verbose:
                print("Task_reward:")
                print(task_reward)
                print("Rewards:")
                print(rewards)

            # record trace and store it in buffer only if no problem in sub-programs execution
            if clean_sub_execution:
                # Generates trace
                trace = list(
                    zip(observations, prog_indices, lstm_states, policy_labels,
                        rewards))
                # Append trace to buffer
                self.buffer.append_trace(trace)
            else:
                if self.verbose:
                    print("Trace has not been stored in buffer.")

                # Decrease statistics of programs that failed
                # for idx in programs_failed_indices:
                #     self.curriculum_scheduler.update_statistics(idx, torch.FloatTensor([0.0]))

            # Train policy on batch
            if self.buffer.get_memory_length() > self.batch_size:
                for _ in range(self.num_updates_per_episode):
                    batch = self.buffer.sample_batch(self.batch_size)
                    if batch is not None:
                        self.policy.train_on_batch(batch)
            if verbose:
                print("Done episode {}/{}".format(episode + 1,
                                                  self.num_episodes_per_task))
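A sketch of an outer loop that could drive play_iteration; `trainer`, `num_iterations` and `validation_every` are hypothetical names, and the simple round-robin task selection stands in for whatever curriculum logic the real trainer uses.

# Hypothetical outer training loop (assumes program indices are contiguous from 0).
num_programs = len(trainer.env.programs_library)
for iteration in range(num_iterations):
    task_index = iteration % num_programs
    trainer.play_iteration(task_index, verbose=True)

    # Periodically re-evaluate the same task with exploitation-mode MCTS
    if (iteration + 1) % validation_every == 0:
        rewards, lengths, _ = trainer.perform_validation_step(task_index)
        mean_reward = sum(rewards) / max(len(rewards), 1)
        print('iteration {}: mean validation reward {:.2f}'.format(iteration + 1, mean_reward))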
Example #3
            'max_depth_dict': max_depth_dict,
            'temperature': conf.temperature,
            'c_puct': conf.c_puct,
            'exploit': True,
            'level_closeness_coeff': conf.level_closeness_coeff,
            'gamma': conf.gamma
        }

        for _ in range(40):

            env = ListEnv(length=len, encoding_dim=conf.encoding_dim)
            bubblesort_index = env.programs_library['BUBBLESORT']['index']

            # Test with mcts
            mcts = MCTS(policy, env, bubblesort_index, **mcts_test_params)
            res = mcts.sample_execution_trace()
            mcts_reward = res[7]
            mcts_rewards.append(mcts_reward)
            if mcts_reward > 0:
                mcts_rewards_normalized.append(1.0)
            else:
                mcts_rewards_normalized.append(0.0)

            # Test with network alone
            network_only = NetworkOnly(policy, env, max_depth_dict)
            netonly_reward, _ = network_only.play(bubblesort_index)
            network_only_rewards.append(netonly_reward)

        mcts_rewards_normalized_mean = np.mean(
            np.array(mcts_rewards_normalized))
        mcts_rewards_mean = np.mean(np.array(mcts_rewards))
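A possible continuation of the evaluation above, shown only as a sketch: the network-only rewards are aggregated the same way as the MCTS rewards and both results are reported.

        # Sketch of a summary step (not part of the original script).
        network_only_rewards_mean = np.mean(np.array(network_only_rewards))
        print('MCTS mean reward: {:.2f} (success rate: {:.2f})'.format(
            mcts_rewards_mean, mcts_rewards_normalized_mean))
        print('Network-only mean reward: {:.2f}'.format(network_only_rewards_mean))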