Example #1
# Imports assumed by this example (the listing omits them); UnityEnv and
# AgentPPO are project-specific classes provided elsewhere in the repository.
import datetime
import logging
import os
import random
import sys
from collections import deque
from pprint import pprint

import numpy as np
import torch


class Trainer:
    def __init__(self, params):

        seed = params['general_params']['seed']
        self.__set_seed(seed=seed)

        env_params = params['env_params']
        env_params['seed'] = seed
        self.env = UnityEnv(params=env_params)

        agent_params = params['agent_params']
        agent_params['state_size'] = self.env.observation_space.shape[0]
        agent_params['action_size'] = self.env.action_space_size
        self.agent = AgentPPO(params=agent_params)

        trainer_params = params['trainer_params']
        self.learning_rate_decay = trainer_params['learning_rate_decay']
        self.results_path = trainer_params['results_path']
        self.model_path = trainer_params['model_path']
        self.t_max = trainer_params['t_max']

        # data gathering variables
        self.avg_rewards = []
        self.scores = []
        self.score = 0

        print("PPO agent.")
        print("Configuration:")
        pprint(params)
        logging.info("Configuration: {}".format(params))

    def train(self, num_of_episodes):

        logging.info("Training:")
        reward_window = deque(maxlen=100)
        # reward_matrix = np.zeros((num_of_episodes, 300))

        # episodes are counted from 1; run exactly num_of_episodes of them
        for episode_i in range(1, num_of_episodes + 1):

            state = self.env.reset()
            total_reward = 0
            total_loss = 0

            counter = 0
            total_action_mean = 0
            total_action_std = 0

            for t in range(self.t_max):
                action, log_probs, mean, std = self.agent.choose_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done,
                                log_probs)
                state = next_state

                # DEBUG
                # logging.info("epsiode: {}, reward: {}, counter: {}, action: {}".
                #              format(episode_i, reward, counter, action))

                total_loss += self.agent.agent_loss
                total_reward += np.array(reward)

                counter += 1
                # incremental running averages of the policy's action mean and
                # std over the time steps of this episode
                total_action_mean = total_action_mean * (
                    counter - 1) / counter + np.mean(mean) / counter
                total_action_std = total_action_std * (
                    counter - 1) / counter + np.mean(std) / counter

            reward_window.append(total_reward)
            self.avg_rewards.append(np.mean(total_reward))
            print(
                '\rEpisode {}\tCurrent Score: {:.2f}\tAverage Score: {:.2f}\tMean: {:.2f} \tStd {:.2f} '
                '\t\tTotal loss: {:.2f}\tLearning rate (actor): {:.4f}\tLearning rate (critic): {:.4f}'
                .format(episode_i, np.mean(total_reward),
                        np.mean(reward_window), total_action_mean,
                        total_action_std, total_loss,
                        self.agent.learning_rate_policy,
                        self.agent.learning_rate_value_fn),
                end="")

            # logging.info('Episode {}\tCurrent Score (average over 20 robots): {:.2f}\tAverage Score (over episodes): {:.2f} '
            #              '\t\tTotal loss: {:.2f}\tLearning rate (actor): {:.4f}\tLearning rate (critic): {:.4f}'.
            #              format(episode_i, np.mean(total_reward), np.mean(reward_window),
            #                     total_loss, self.agent.learning_rate_policy, self.agent.learning_rate_value_fn))

            self.agent.learning_rate_policy *= self.learning_rate_decay
            self.agent.learning_rate_value_fn *= self.learning_rate_decay
            self.agent.set_learning_rate(self.agent.learning_rate_policy,
                                         self.agent.learning_rate_value_fn)

            if episode_i % 100 == 0:

                avg_reward = np.mean(np.array(reward_window))
                print("\rEpisode: {}\tAverage total reward: {:.2f}".format(
                    episode_i, avg_reward))

                if avg_reward >= 30.0:
                    print(
                        '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                        .format(episode_i - 100, avg_reward))
                    if not os.path.exists(self.model_path):
                        os.makedirs(self.model_path)
                    # use a single timestamp so the actor and critic
                    # checkpoints share the same file name suffix
                    timestamp = datetime.datetime.today().strftime(
                        '%Y-%m-%d_%H-%M')
                    torch.save(
                        self.agent.get_actor().state_dict(),
                        self.model_path +
                        'checkpoint_actor_{}.pth'.format(timestamp))
                    torch.save(
                        self.agent.get_critic().state_dict(),
                        self.model_path +
                        'checkpoint_critic_{}.pth'.format(timestamp))

        t = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')
        # reward_matrix.dump(self.results_path + 'reward_matrix_new_{}.dat'.format(t))
        np.array(self.avg_rewards).dump(self.results_path +
                                        'average_rewards_new_{}.dat'.format(t))

    def test(self,
             checkpoint_actor_filename,
             checkpoint_critic_filename,
             time_span=10):
        checkpoint_actor_path = self.model_path + checkpoint_actor_filename
        checkpoint_critic_path = self.model_path + checkpoint_critic_filename
        self.agent.get_actor().load_state_dict(
            torch.load(checkpoint_actor_path))
        self.agent.get_critic().load_state_dict(
            torch.load(checkpoint_critic_path))
        for t in range(time_span):
            state = self.env.reset(train_mode=False)
            self.score = 0
            #done = False

            while True:
                action = self.agent.choose_action(state, 'test')
                sys.stdout.flush()
                self.env.render()
                state, reward, done, _ = self.env.step(action)
                self.score += np.array(reward)
                if any(done):
                    break

            print('\nFinal score:', self.score)

        self.env.close()

    @staticmethod
    def __set_seed(seed):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)
        np.random.seed(seed)
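
This Trainer is driven by a single nested params dictionary. A minimal usage sketch, assuming only the keys read in __init__ (all values and paths below are illustrative placeholders, not taken from the original):

# Usage sketch for Example #1; every value here is a placeholder.
params = {
    'general_params': {'seed': 0},
    # forwarded to UnityEnv; Trainer inserts 'seed' before construction
    'env_params': {},
    # 'state_size' and 'action_size' are filled in from the environment
    'agent_params': {},
    'trainer_params': {
        'learning_rate_decay': 0.995,
        # both paths are concatenated with file names, so they need a
        # trailing separator
        'results_path': './results/',
        'model_path': './models/',
        't_max': 1000,
    },
}

trainer = Trainer(params=params)
trainer.train(num_of_episodes=300)
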
Example #2
# Imports assumed by this example (the listing omits them); UnityEnv and
# Agent are project-specific classes provided elsewhere in the repository.
import datetime
import logging
import random
import sys
from collections import deque
from pprint import pprint

import numpy as np
import torch


class Trainer:
    def __init__(self, params):

        seed = params['general_params']['seed']
        self.__set_seed(seed=seed)

        env_params = params['env_params']
        env_params['seed'] = seed
        self.env = UnityEnv(params=env_params)

        agent_params = params['agent_params']
        agent_params['state_size'] = self.env.observation_space.shape[0]
        agent_params['action_size'] = self.env.action_space.n
        self.agent = Agent(params=agent_params)

        trainer_params = params['trainer_params']
        self.learning_rate_decay = trainer_params['learning_rate_decay']
        self.max_eps = trainer_params['max_eps']
        self.final_eps = trainer_params['final_eps']
        self.eps_decay = trainer_params['eps_decay']
        self.b_decay = trainer_params['b_decay']
        self.results_path = trainer_params['results_path']
        self.model_path = trainer_params['model_path']

        # data gathering variables
        self.avg_rewards = []
        self.scores = []
        self.score = 0

        print("Configuration:")
        pprint(params)
        logging.info("Configuration: {}".format(params))

    def train(self, num_of_episodes):

        reward_window = deque(maxlen=100)

        # choose the per-episode decay factor so that epsilon falls from
        # max_eps to final_eps over the first 20% of the episodes
        self.eps_decay = (self.final_eps /
                          self.max_eps)**(1 / (0.2 * num_of_episodes))
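        # e.g. with max_eps = 1.0, final_eps = 0.01 and num_of_episodes = 1000
        # (illustrative values, not from the original), eps_decay =
        # 0.01 ** (1 / 200) ~= 0.9772, so epsilon reaches final_eps after
        # roughly 200 episodes and keeps shrinking afterwards.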

        # one row per episode (row 0 stays unused because episodes are
        # counted from 1), up to 300 recorded steps per episode
        reward_matrix = np.zeros((num_of_episodes + 1, 300))

        for episode_i in range(1, num_of_episodes + 1):

            state = self.env.reset()
            done = False
            total_reward = 0
            total_loss = 0

            #self.agent.eps = self.max_eps/(episode_i + 1)
            self.agent.eps *= self.eps_decay

            #self.agent.b = 1 - np.exp(-self.b_decay * episode_i)

            counter = 0
            while not done:
                action = self.agent.choose_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done)
                state = next_state

                # DEBUG
                # logging.info("epsiode: {}, reward: {}, counter: {}, action: {}, actions: {}, fc1 weight data: {}".
                #              format(episode_i, reward, counter, action, actions,
                #                     self.agent.get_qlocal().fc1.weight.data))

                total_loss += self.agent.agent_loss
                total_reward += reward
                reward_matrix[episode_i, counter] = reward
                counter += 1

            reward_window.append(total_reward)

            print(
                '\rEpisode {}\tCurrent Score: {:.2f}\tAverage Score: {:.2f} '
                '\t\tTotal loss: {:.2f}\tEpsilon: {:.2f}\tBeta: {:.2f}\tLearning rate: {:.4f}'
                .format(episode_i, total_reward, np.mean(reward_window),
                        total_loss, self.agent.eps, self.agent.b,
                        self.agent.learning_rate),
                end="")

            logging.info(
                'Episode {}\tCurrent Score: {:.2f}\tAverage Score: {:.2f} '
                '\t\tTotal loss: {:.2f}\tEpsilon: {:.2f}\tBeta: {:.2f}\tLearning rate: {:.4f}'
                .format(episode_i, total_reward, np.mean(reward_window),
                        total_loss, self.agent.eps, self.agent.b,
                        self.agent.learning_rate))

            self.agent.learning_rate *= self.learning_rate_decay
            self.agent.set_learning_rate(self.agent.learning_rate)

            if episode_i % 100 == 0:

                avg_reward = np.mean(np.array(reward_window))
                print("\rEpisode: {}\tAverage total reward: {:.2f}".format(
                    episode_i, avg_reward))
                self.avg_rewards.append(avg_reward)

                if avg_reward >= 13.0:
                    print(
                        '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                        .format(episode_i - 100, avg_reward))
                    torch.save(
                        self.agent.get_qlocal().state_dict(), self.model_path +
                        'checkpoint_{}.pth'.format(datetime.datetime.today().
                                                   strftime('%Y-%m-%d_%H-%M')))

        t = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')
        reward_matrix.dump(self.results_path +
                           'reward_matrix_new_{}.dat'.format(t))
        np.array(self.avg_rewards).dump(self.results_path +
                                        'average_rewards_new_{}.dat'.format(t))

    def test(self, checkpoint_filename, time_span=10):

        checkpoint_path = self.model_path + checkpoint_filename
        self.agent.get_qlocal().load_state_dict(torch.load(checkpoint_path))
        for t in range(time_span):
            state = self.env.reset(train_mode=False)
            self.score = 0
            done = False

            while not done:
                action = self.agent.choose_action(state, 'test')
                sys.stdout.flush()
                self.env.render()
                state, reward, done, _ = self.env.step(action)
                self.score += reward

            print('\nFinal score:', self.score)

        self.env.close()

    @staticmethod
    def __set_seed(seed):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)
        np.random.seed(seed)
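
As in the first example, this Trainer takes one nested params dictionary. A minimal usage sketch with the keys read in __init__ (all values and paths are illustrative placeholders, not taken from the original):

# Usage sketch for Example #2; every value here is a placeholder.
params = {
    'general_params': {'seed': 0},
    'env_params': {},    # forwarded to UnityEnv; Trainer inserts 'seed'
    'agent_params': {},  # 'state_size' and 'action_size' are filled in
    'trainer_params': {
        'learning_rate_decay': 0.995,
        'max_eps': 1.0,
        'final_eps': 0.01,
        'eps_decay': 0.99,   # recomputed in train() from the formula above
        'b_decay': 0.001,
        'results_path': './results/',
        'model_path': './models/',
    },
}

trainer = Trainer(params=params)
trainer.train(num_of_episodes=1000)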