Example #1
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecNormalize


# SaveOnBestTrainingRewardCallback, linear_schedule, total_timesteps and
# total_train_runs come from elsewhere in the original project
# (a sketch of linear_schedule follows this example).
def train(env, log_dir):
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)

    env = VecNormalize(env,
                       training=True,
                       norm_obs=True,
                       norm_reward=True,
                       gamma=0.9997,
                       clip_obs=10.,
                       clip_reward=10.,
                       epsilon=0.1)

    drive = PPO("MlpPolicy",
                env,
                ent_coef=0.01,
                vf_coef=1,
                batch_size=32,
                learning_rate=linear_schedule(0.001),
                clip_range=linear_schedule(0.1),
                n_steps=1000,
                n_epochs=20,
                tensorboard_log=log_dir + "/drive_tensorboard_log",
                verbose=1)

    drive.learn(total_timesteps=total_timesteps, callback=callback)

    # Continue training for additional runs; reset_num_timesteps=False keeps
    # the timestep counter (and the TensorBoard curves) continuous.
    for i in range(total_train_runs):
        drive.learn(total_timesteps=total_timesteps,
                    callback=callback,
                    reset_num_timesteps=False)

    drive.save("conduziadrive")
    # Release the environment only once all training runs are finished.
    env.close()
Example #2
            if args.clip_vloss:
                # PPO-style value clipping: keep the value update within
                # clip_coef of the old value predictions.
                v_loss_unclipped = (new_values - b_returns[minibatch_ind])**2
                v_clipped = b_values[minibatch_ind] + torch.clamp(
                    new_values - b_values[minibatch_ind], -args.clip_coef,
                    args.clip_coef)
                v_loss_clipped = (v_clipped - b_returns[minibatch_ind])**2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()
            else:
                v_loss = 0.5 * (
                    (new_values - b_returns[minibatch_ind])**2).mean()

            loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
            optimizer.step()

    # TRY NOT TO MODIFY: record rewards for plotting purposes
    writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]['lr'],
                      global_step)
    writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
    writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
    writer.add_scalar("losses/entropy", entropy.mean().item(), global_step)
    writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
    print("SPS:", int(global_step / (time.time() - start_time)))
    writer.add_scalar("charts/SPS",
                      int(global_step / (time.time() - start_time)),
                      global_step)

envs.close()
writer.close()
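
The value-loss branch in Example #2 implements PPO-style value clipping. As a self-contained reference, the same computation can be written as a small helper; the sketch below assumes plain PyTorch tensors, and the function name clipped_value_loss is illustrative, not part of the original script.

import torch


def clipped_value_loss(new_values, old_values, returns, clip_coef):
    # Sketch of the computation in the excerpt above; not the original code.
    # Unclipped squared error between new value predictions and returns.
    v_loss_unclipped = (new_values - returns) ** 2
    # Keep the value update within clip_coef of the old predictions.
    v_clipped = old_values + torch.clamp(new_values - old_values,
                                         -clip_coef, clip_coef)
    v_loss_clipped = (v_clipped - returns) ** 2
    # Element-wise maximum (pessimistic bound), halved as in the excerpt.
    return 0.5 * torch.max(v_loss_unclipped, v_loss_clipped).mean()
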
class MultiModuleExp:
    """
    The Multi-RNN experiment.

    A whole experiment contains: (1) environments, (2) policies, (3) training,
    (4) testing. Its results should be comparable with those of other
    experiments.
    """
    def __init__(
        self,
        args,
        env_id="HopperBulletEnv-v0",
        features_extractor_class=MultiExtractor,
        features_extractor_kwargs=None,
    ) -> None:
        """ Init with parameters that control the training process. """
        print("Starting MultiModuleExp")
        if features_extractor_kwargs is None:
            # Avoid a mutable default argument; the dict is mutated below.
            features_extractor_kwargs = {}
        self.args = args
        self.env_id = env_id
        self.use_cuda = torch.cuda.is_available() and args.cuda
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        # Make Environments
        print("Making train environments...")
        venv = DummyVecEnv([
            make_env(env_id=env_id, rank=i, seed=args.seed, render=args.render)
            for i in range(args.num_envs)
        ])
        self.eval_env = DummyVecEnv(
            [make_env(env_id=env_id, rank=99, seed=args.seed, render=False)])
        if args.vec_normalize:
            venv = VecNormalize(venv)
            self.eval_env = VecNormalize(self.eval_env, norm_reward=False)

        features_extractor_kwargs["num_envs"] = args.num_envs
        policy_kwargs = {
            "features_extractor_class": features_extractor_class,
            "features_extractor_kwargs": features_extractor_kwargs,
            # Note: net_arch must be specified explicitly, because SB3 does not
            # apply its default network architecture when a custom
            # features_extractor is used.
            # pi: actor (policy network); vf: critic (value network)
            "net_arch": [dict(pi=[64, 64], vf=[64, 64])],
        }

        self.model = CustomizedPPO(
            CustomizedPolicy,
            venv,
            n_steps=args.rollout_n_steps,
            tensorboard_log="tb",
            policy_kwargs=policy_kwargs,
            device=self.device,
            verbose=1,
            rnn_move_window_step=args.rnn_move_window_step,
            rnn_sequence_length=args.rnn_sequence_length,
            use_sde=args.sde,
            n_epochs=args.n_epochs)

    def train(self) -> None:
        """ Start training """
        print(f"train using {self.model.device.type}")

        callback = [
            DebugCallback("Customized"),
            AdjustCameraCallback(),
            WandbCallback(self.args),
            CustomizedEvalCallback(
                self.eval_env,
                best_model_save_path=None,
                log_path=None,
                eval_freq=self.args.eval_freq,
                n_eval_episodes=3,
                verbose=0,
            )
        ]
        self.model.learn(self.args.total_timesteps, callback=callback)

    def test(self, model_filename, vnorm_filename):
        # load() returns a new model instance rather than loading in place.
        self.model = self.model.load(model_filename, device=self.device)
        self.eval_env = VecNormalize.load(vnorm_filename, self.eval_env)
        # Freeze the normalization statistics during evaluation.
        self.eval_env.training = False
        self.eval_env.render()
        obs = self.eval_env.reset()
        with self.model.policy.features_extractor.start_testing():
            for i in range(1000):
                # predict() returns (action, state); keep only the action and
                # feed the new observation back in on the next step.
                action, _ = self.model.predict(obs, deterministic=True)
                obs, _, _, _ = self.eval_env.step(action)

        self.eval_env.close()
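
A hypothetical driver for MultiModuleExp is sketched below. The field names on args are taken from the attributes this class reads; the values are placeholders, and additional fields (for example, whatever WandbCallback expects) may be required in practice.

from types import SimpleNamespace

if __name__ == "__main__":
    # Placeholder hyperparameters; only fields referenced above are included.
    args = SimpleNamespace(
        cuda=True, seed=0, render=False, num_envs=4, vec_normalize=True,
        rollout_n_steps=2048, rnn_move_window_step=1, rnn_sequence_length=16,
        sde=False, n_epochs=10, eval_freq=10_000, total_timesteps=1_000_000,
    )
    exp = MultiModuleExp(args, env_id="HopperBulletEnv-v0")
    exp.train()
    # exp.test("model.zip", "vecnormalize.pkl")  # paths are placeholders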