def main():
    """Train a PPO policy on the quadruped environment, or replay a saved one.

    The mode is read from the module-level ``TEST_OR_TRAIN`` constant.

    * ``"train"``: builds ``NUM_CPUS`` sub-process environments wrapped in
      ``VecNormalize``, trains PPO, then saves the policy together with the
      normalization statistics under ``data/policies`` next to this file.
    * ``"test"``: builds a single environment, loads a hard-coded checkpoint
      and runs it in an endless render loop (stop it with Ctrl-C).

    Raises:
        ValueError: if ``TEST_OR_TRAIN`` is neither ``"train"`` nor ``"test"``.
    """
    test_or_train = TEST_OR_TRAIN
    # Explicit raise instead of ``assert``: asserts are stripped under ``-O``,
    # which would let an invalid mode silently fall into the replay branch.
    if test_or_train not in ("train", "test"):
        raise ValueError(
            "TEST_OR_TRAIN must be 'train' or 'test', got %r" % (test_or_train,))

    env_params = {
        'time_step': TIME_STEP,
        'robot_class': QuadrupedRobot,
        'on_rack': False,
        'enable_self_collision': True,
        'motor_control_mode': MotorControlMode.HYBRID_COMPUTED_POS_TROT,
        'train_or_test': test_or_train,
    }

    policy_save_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data/policies')
    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)

    # Separate actor (pi) and critic (vf) networks, two hidden layers each.
    policy_kwargs = {"net_arch": [{"pi": [512, 256], "vf": [512, 256]}]}

    if test_or_train == "train":
        # Create the output directory up front so a permission/path problem
        # shows up before the (very long) training run, not after it.
        os.makedirs(policy_save_dir, exist_ok=True)

        env = VecNormalize(
            make_vec_env(env_change_input,
                         n_envs=NUM_CPUS,
                         seed=0,
                         env_kwargs=env_params,
                         vec_env_cls=SubprocVecEnv))
        try:
            model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=1)
            model.learn(total_timesteps=100000000)
            model.save(policy_save_path)
            # The policy was trained on normalized observations; without the
            # running statistics the checkpoint cannot be evaluated on the
            # same input distribution, so persist them next to the policy.
            env.save(policy_save_path + '_vecnormalize.pkl')
        finally:
            # Always shut down the worker processes, even if training fails.
            env.close()
    else:
        env = env_change_input(**env_params)
        model_load_path = os.path.join(policy_save_dir,
                                       'ppo_3_17-03-2021_15-39-42')
        model = PPO.load(model_load_path)
        # NOTE(review): training wraps the env in VecNormalize, but this replay
        # feeds raw observations to the policy. No statistics file is known to
        # exist for this hard-coded checkpoint, so the behavior is left as-is;
        # newer checkpoints should be replayed through
        # VecNormalize.load(<path>_vecnormalize.pkl, venv) -- confirm.
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
class MultiModuleExp:
    """
    A whole experiment.
    It should contain: (1) environments, (2) policies, (3) training, (4) testing.
    The results should be able to compare with other experiments.

    The Multi-RNN experiment.
    """

    def __init__(
        self,
        args,
        env_id="HopperBulletEnv-v0",
        features_extractor_class=MultiExtractor,
        features_extractor_kwargs=None,
    ) -> None:
        """Init with parameters to control the training process.

        Args:
            args: parsed run configuration. Attributes read by this class:
                ``cuda``, ``seed``, ``render``, ``num_envs``,
                ``vec_normalize``, ``rollout_n_steps``,
                ``rnn_move_window_step``, ``rnn_sequence_length``, ``sde``,
                ``n_epochs``, ``eval_freq`` and ``total_timesteps``.
            env_id: id of the environment to build.
            features_extractor_class: features extractor class handed to the
                policy.
            features_extractor_kwargs: extra keyword arguments for the
                features extractor. ``None`` means no extra arguments. The
                mapping is copied, so the caller's object is never modified.
        """
        print("Starting MultiModuleExp")
        self.args = args
        self.env_id = env_id
        self.use_cuda = torch.cuda.is_available() and args.cuda
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        # Make Environments
        print("Making train environments...")
        venv = DummyVecEnv([
            make_env(env_id=env_id, rank=i, seed=args.seed, render=args.render)
            for i in range(args.num_envs)
        ])
        # Single, never-rendered evaluation environment with its own rank.
        self.eval_env = DummyVecEnv(
            [make_env(env_id=env_id, rank=99, seed=args.seed, render=False)])
        if args.vec_normalize:
            venv = VecNormalize(venv)
            self.eval_env = VecNormalize(self.eval_env, norm_reward=False)

        # Work on a private copy: a shared ``{}`` default (or the caller's
        # dict) must not accumulate the ``num_envs`` entry across instances.
        features_extractor_kwargs = dict(features_extractor_kwargs or {})
        features_extractor_kwargs["num_envs"] = args.num_envs

        policy_kwargs = {
            "features_extractor_class": features_extractor_class,
            "features_extractor_kwargs": features_extractor_kwargs,
            # Note: net_arch must be specified, because sb3 won't set the
            # default network architecture if we change the features_extractor.
            # pi: Actor (policy-function); vf: Critic (value-function)
            "net_arch": [dict(pi=[64, 64], vf=[64, 64])],
        }

        self.model = CustomizedPPO(
            CustomizedPolicy,
            venv,
            n_steps=args.rollout_n_steps,
            tensorboard_log="tb",
            policy_kwargs=policy_kwargs,
            device=self.device,
            verbose=1,
            rnn_move_window_step=args.rnn_move_window_step,
            rnn_sequence_length=args.rnn_sequence_length,
            use_sde=args.sde,
            n_epochs=args.n_epochs)

    def train(self) -> None:
        """Start training for ``args.total_timesteps`` steps with callbacks."""
        print(f"train using {self.model.device.type}")

        callback = [
            DebugCallback("Customized"),
            AdjustCameraCallback(),
            WandbCallback(self.args),
            CustomizedEvalCallback(
                self.eval_env,
                best_model_save_path=None,
                log_path=None,
                eval_freq=self.args.eval_freq,
                n_eval_episodes=3,
                verbose=0,
            )
        ]

        self.model.learn(self.args.total_timesteps, callback=callback)

    def test(self, model_filename, vnorm_filename):
        """Run a saved policy on the evaluation environment for 1000 steps.

        Args:
            model_filename: path of the saved model to load.
            vnorm_filename: path of the saved ``VecNormalize`` statistics.
        """
        # In stable-baselines3 ``load`` is a classmethod that RETURNS the
        # loaded model; the result must be kept or the stored weights are
        # never used. The ``None`` guard keeps an in-place custom override of
        # ``load`` (if CustomizedPPO has one) working as before.
        loaded = self.model.load(model_filename)
        if loaded is not None:
            self.model = loaded

        # When ``args.vec_normalize`` is set, ``eval_env`` is already a
        # VecNormalize. Unwrap it first so the loaded statistics replace the
        # old wrapper instead of normalizing observations twice.
        base_env = self.eval_env
        if isinstance(base_env, VecNormalize):
            base_env = base_env.venv
        self.eval_env = VecNormalize.load(vnorm_filename, base_env)
        # Freeze the statistics and report raw rewards while testing.
        self.eval_env.training = False
        self.eval_env.norm_reward = False

        try:
            # NOTE(review): render() is deliberately kept before reset(), as
            # in the original; presumably the Bullet envs need it -- confirm.
            self.eval_env.render()
            obs = self.eval_env.reset()
            with self.model.policy.features_extractor.start_testing():
                for _step in range(1000):
                    # predict() returns (action, state); only the action may
                    # be passed on to the environment.
                    action, _state = self.model.predict(obs, deterministic=True)
                    # Feed the new observation back to the policy; the
                    # vectorized env resets finished episodes by itself.
                    obs, _rewards, _dones, _infos = self.eval_env.step(action)
        finally:
            self.eval_env.close()