Code example #1
    def train(self):
        from algos.ppo4multienvs import PPO, ReplayBuffer
        from nets.network import ActorCritic_Norm as ActorCritic
        cl_args = self.cl_args

        log_save_name = cl_args.algo_id + '_' + cl_args.env_id + '_buffer_{}_batch_{}_hidden_{}_lr_{}_maxsteps_{}'.format(
            cl_args.buffer_size, cl_args.batch_size, cl_args.hidden_size,
            cl_args.learning_rate, cl_args.max_steps_per_episodes)
        log_save_path = os.path.join("./runs", log_save_name)
        if os.path.exists(log_save_path):
            shutil.rmtree(log_save_path)
        utli.writer = SummaryWriter(log_save_path)

        model_dir = utli.Save_model_dir(cl_args.algo_id, cl_args.env_id)

        # Create the vectorized training environments and a single environment for evaluation.
        num_envs = 8

        def make_env():
            def _thunk():
                env = gym.make(cl_args.env_id)
                return env

            return _thunk

        envs = [make_env() for _ in range(num_envs)]
        envs = SubprocVecEnv(envs)

        env = gym.make(cl_args.env_id)
        env.seed(0)
        buffer_size = cl_args.buffer_size
        batch_size = cl_args.batch_size

        # Total number of timesteps to train for.
        num_steps = cl_args.num_steps

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        # Configure the global LearningRate singleton (used for learning-rate decay).
        lr_schedule = LearningRate.get_instance()
        lr_schedule.lr = 1e-3
        lr_schedule.decay_factor = 0.5

        # Learning rate actually passed to the PPO optimizer.
        lr = cl_args.learning_rate

        evaluate_every = cl_args.evaluate_every

        # The buffer
        replay_buffer = ReplayBuffer(num_total_sizes=buffer_size,
                                     obs_dims=state_dim,
                                     act_dims=action_dim,
                                     batch_size=batch_size)

        # network
        model = ActorCritic(state_dim=state_dim,
                            action_dim=action_dim,
                            hidden_size=128).to(device)

        # policy
        policy = PPO(model=model,
                     replay_buf=replay_buffer,
                     lr=lr,
                     device=device)

        time_step = 0
        # Evaluation results collected during training
        evaluations = []

        # begin optimization
        cur_state = envs.reset()
        reward_window = deque(maxlen=50)

        while time_step < num_steps:
            replay_buffer.clear()
            train_r = 0
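            # Collect a fresh batch of on-policy rollouts from the vectorized environments.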
            for _ in range(buffer_size // batch_size):
                state = torch.FloatTensor(cur_state).unsqueeze(0)
                dist, value = model(state.to(device))
                action = dist.sample()
                log_prob = dist.log_prob(action)

                action = action.cpu().detach().numpy()[0]
                log_prob = log_prob.cpu().detach().numpy()[0]
                value = value.cpu().detach().numpy()[0]

                next_state, reward, done, _ = envs.step(action)

                train_r += reward.sum()

                reward = np.expand_dims(reward, axis=1)
                done = np.expand_dims(done, axis=1)
                replay_buffer.add(cur_obs=cur_state,
                                  cur_action=action,
                                  reward=reward,
                                  done=done,
                                  old_log_prob=log_prob,
                                  value=value)
                cur_state = next_state

                time_step += 1
                if time_step % evaluate_every == 0:
                    evaluation, mean_reward, mean_step = self.evaluate_policy(
                        env=env,
                        model=model,
                        time_step=time_step,
                        evaluation_trajectories=6)
                    evaluations.append(evaluation)
                    reward_window.append(mean_reward)
                    print(np.mean(reward_window))

                    utli.recordEvaluateResults(
                        results=(mean_reward, mean_step,
                                 np.mean(reward_window)),
                        time_step=time_step)

            # compute returns
            returns = policy.compute_gae(next_state=next_state)

            returns = replay_buffer.cat(returns)

            # train the PPO policy
            value_losses, ppo_losses, entropys, losses = policy.train(
                returns=returns)

            utli.recordTrainResults(results=(train_r,
                                             np.mean(np.array(value_losses)),
                                             np.mean(np.array(ppo_losses)),
                                             np.mean(np.array(entropys)),
                                             np.mean(np.array(losses))),
                                    time_step=time_step)

        # last evaluation
        last_evaluation, mean_reward, mean_step = self.evaluate_policy(
            env=env,
            model=model,
            time_step=time_step,
            evaluation_trajectories=6)
        evaluations.append(last_evaluation)
        reward_window.append(mean_reward)
        print(np.mean(reward_window))

        utli.recordEvaluateResults(results=(mean_reward, mean_step,
                                            np.mean(reward_window)),
                                   time_step=time_step)

        # store results
        utli.store_results(evaluations, (time_step + 1), cl_args)
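
The call to policy.compute_gae above lives in algos.ppo4multienvs and is not reproduced in this listing. As a rough, self-contained sketch (not the repository's implementation) of what a GAE(λ) return computation over the buffered rewards, values and done flags typically looks like, with gamma, lam and compute_gae_sketch being illustrative assumptions:

    import numpy as np

    def compute_gae_sketch(rewards, values, dones, next_value, gamma=0.99, lam=0.95):
        """rewards, values, dones: arrays of shape (T, num_envs); next_value: (num_envs,)."""
        T = len(rewards)
        advantages = np.zeros_like(rewards)
        gae = np.zeros_like(next_value)
        for t in reversed(range(T)):
            mask = 1.0 - dones[t]                      # stop bootstrapping at episode ends
            next_v = next_value if t == T - 1 else values[t + 1]
            delta = rewards[t] + gamma * next_v * mask - values[t]
            gae = delta + gamma * lam * mask * gae     # recursive GAE accumulator
            advantages[t] = gae
        return advantages + values                     # returns = advantages + value baseline

These returns then serve as regression targets for the critic and, after subtracting the value predictions again, as advantages for the clipped policy objective.
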
Code example #2
    def train(self):
        from algos.ppo4Categorical import PPO, ReplayBuffer
        from nets.network import ActorCritic_Cate_zm01 as ActorCritic
        cl_args = self.cl_args

        log_save_name = cl_args.algo_id + '_' + cl_args.env_id + '_buffer_{}_batch_{}_hidden_{}_lr_{}_maxsteps_{}'.format(
            cl_args.buffer_size, cl_args.batch_size, cl_args.hidden_size,
            cl_args.learning_rate, cl_args.max_steps_per_episodes)
        log_save_path = os.path.join("./runs", log_save_name)
        if os.path.exists(log_save_path):
            shutil.rmtree(log_save_path)
        utli.writer = SummaryWriter(log_save_path)

        model_dir = utli.Save_model_dir(cl_args.algo_id, cl_args.env_id)

        # Create the training and evaluation environments.
        env = gym.make(cl_args.env_id)
        env_evaluate = gym.make(cl_args.env_id)

        env = env.unwrapped
        env_evaluate = env_evaluate.unwrapped

        # env.seed(0)
        buffer_size = cl_args.buffer_size
        batch_size = cl_args.batch_size

        # Total number of timesteps to train for.
        num_steps = cl_args.num_steps

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        # max_action = float(env.action_space.high[0])

        # Configure the global LearningRate singleton (used for learning-rate decay).
        lr_schedule = LearningRate.get_instance()
        lr_schedule.lr = 1e-3
        lr_schedule.decay_factor = 0.5

        # Learning rate actually passed to the PPO optimizer.
        lr = cl_args.learning_rate

        ppo_epoch = cl_args.ppo_epoch

        evaluate_every = cl_args.evaluate_every

        max_steps_per_episodes = cl_args.max_steps_per_episodes

        stop_condition = cl_args.stop_condition

        use_device = cl_args.use_device

        # The buffer
        replay_buffer = ReplayBuffer(num_total_sizes=buffer_size,
                                     obs_dims=state_dim,
                                     act_dims=1,
                                     batch_size=batch_size)

        # network
        if use_device:
            model = ActorCritic(state_dim=state_dim,
                                action_dim=action_dim,
                                hidden_size=cl_args.hidden_size).to(device)
        else:
            model = ActorCritic(state_dim=state_dim,
                                action_dim=action_dim,
                                hidden_size=cl_args.hidden_size)

        # policy
        policy = PPO(model=model,
                     replay_buf=replay_buffer,
                     lr=lr,
                     device=device,
                     use_device=use_device,
                     ppo_epoch=ppo_epoch,
                     weight_epsilon=0.0)

        time_step = 0
        # Evaluation results collected during training
        evaluations = []

        # begin optimization
        # cur_state = env.reset()
        reward_window4Train = deque(maxlen=100)
        reward_window4Evaluate = deque(maxlen=100)
        episode_t = 0
        count = 0
        S_time = time.time()
        while time_step < num_steps:
            episode_t += 1
            cur_state = env.reset()
            path_length, path_rewards = 0, 0.

            while True:
                path_length += 1
                time_step += 1

                state = torch.FloatTensor(cur_state).unsqueeze(0)
                # state = torch.FloatTensor(cur_state[None])
                if use_device:
                    with torch.no_grad():
                        action, old_log_prob, value = model.select_action(
                            state.to(device))
                else:
                    with torch.no_grad():
                        action, old_log_prob, value = model.select_action(
                            state)

                next_state, reward, done, _ = env.step(action)
                # reward = np.expand_dims(reward, axis=1)
                # done = np.expand_dims(done, axis=1)
                replay_buffer.add(cur_obs=cur_state,
                                  cur_action=action,
                                  reward=reward,
                                  done=done,
                                  old_log_prob=old_log_prob.cpu(),
                                  value=value)
                cur_state = next_state

                # Once the buffer holds enough transitions, run a PPO update on it.
                if replay_buffer.enough_data:
                    next_state = torch.FloatTensor(next_state).unsqueeze(0)
                    if use_device:
                        with torch.no_grad():
                            _, _, next_value = model.select_action(
                                next_state.to(device))
                    else:
                        with torch.no_grad():
                            _, _, next_value = model.select_action(next_state)

                    # compute returns
                    # returns = policy.compute_gae(next_state=next_state)
                    returns = replay_buffer.compute_gae(next_value=next_value)

                    # train the PPO policy
                    value_losses, ppo_losses, entropys, losses = policy.train(
                        returns=returns)

                    utli.recordLossResults(
                        results=(np.mean(np.array(value_losses)),
                                 np.mean(np.array(ppo_losses)),
                                 np.mean(np.array(entropys)),
                                 np.mean(np.array(losses))),
                        time_step=time_step)

                    replay_buffer.clear()
                path_rewards += reward
                if done or max_steps_per_episodes == path_length:
                    break

                if time_step % evaluate_every == 0:
                    evaluation, mean_reward, mean_step = self.evaluate_policy(
                        env=env_evaluate,
                        model=model,
                        time_step=time_step,
                        use_device=use_device,
                        max_step=max_steps_per_episodes,
                        evaluation_trajectories=6)
                    evaluations.append(evaluation)
                    reward_window4Evaluate.append(mean_reward)

                    utli.recordEvaluateResults(
                        results=(mean_reward, mean_step,
                                 np.mean(reward_window4Evaluate)),
                        time_step=time_step)

            reward_window4Train.append(path_rewards)
            utli.recordTrainResults(results=(path_rewards, path_length,
                                             np.mean(reward_window4Train)),
                                    time_step=time_step)
            print("Episode: %d, Time steps: %d, Path length: %d, Reward: %f"
                  % (episode_t, time_step, path_length, path_rewards))

            count = utli.Save_trained_model(
                count=count,
                num=cl_args.num_model,
                model=model,
                model_dir=model_dir,
                stop_condition=stop_condition,
                reward_window4Train=reward_window4Train,
                reward_window4Evaluate=reward_window4Evaluate)

        # last evaluation
        evaluation, mean_reward, mean_step = self.evaluate_policy(
            env=env_evaluate,
            model=model,
            time_step=time_step,
            use_device=use_device,
            max_step=max_steps_per_episodes,
            evaluation_trajectories=6)
        evaluations.append(evaluation)
        reward_window4Evaluate.append(mean_reward)

        utli.recordEvaluateResults(results=(mean_reward, mean_step,
                                            np.mean(reward_window4Evaluate)),
                                   time_step=time_step)
        E_time = time.time()
        # store results
        utli.store_results(evaluations, (time_step + 1),
                           cl_args,
                           S_time=S_time,
                           E_time=E_time)
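
In both listings, policy.train performs the actual PPO update; its internals (from algos.ppo4multienvs / algos.ppo4Categorical) are not shown here. As a minimal illustration of the quantities it returns (value loss, clipped policy loss, entropy, total loss), under assumed tensor shapes and hypothetical clip_eps / vf_coef / ent_coef coefficients rather than the repository's actual settings:

    import torch

    def ppo_loss_sketch(new_log_prob, old_log_prob, advantage, value, returns,
                        entropy, clip_eps=0.2, vf_coef=0.5, ent_coef=0.01):
        # Probability ratio between the current policy and the data-collecting policy.
        ratio = torch.exp(new_log_prob - old_log_prob)
        # Clipped surrogate objective (maximized, hence the leading minus sign).
        surr1 = ratio * advantage
        surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage
        ppo_loss = -torch.min(surr1, surr2).mean()
        # Value-function regression towards the GAE returns.
        value_loss = (returns - value).pow(2).mean()
        # Entropy bonus encourages exploration; subtracted because we minimize.
        loss = ppo_loss + vf_coef * value_loss - ent_coef * entropy.mean()
        return value_loss, ppo_loss, entropy.mean(), loss

Minimizing such a loss with an optimizer over several epochs of minibatches drawn from the buffer is the standard PPO recipe that these train methods appear to follow.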