def learn(env_name, save_file, total_timesteps):
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO(CnnPolicy, env, verbose=1)
    model.learn(total_timesteps=total_timesteps)
    model.save(save_file)
    del model
    env.close()
def test(seed, model_filename, vec_filename, train, test, body_info=0, render=False):
    print("Testing:")
    print(f" Seed {seed}, model {model_filename} vec {vec_filename}")
    print(f" Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = utils.make_env(render=render, robot_body=test, body_info=body_info)
    eval_env = DummyVecEnv([eval_env])
    eval_env = VecNormalize.load(vec_filename, eval_env)
    eval_env.norm_reward = False
    eval_env.seed(seed)
    model = PPO.load(model_filename)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:
            # the last observation will be after reset, so skip the last
            distance_x = eval_env.envs[0].robot.body_xyz[0]
            total_reward += reward[0]
        if render:
            time.sleep(0.01)
    eval_env.close()
    print(f"train {train}, test {test}, body_info {body_info}, step {step}, "
          f"total_reward {total_reward}, distance_x {distance_x}")
    return total_reward, distance_x
def main(config, agent):
    with open(config) as fp:
        json_data = json.load(fp)
    video_path = os.path.join("./videos", agent, "pong")
    config = GameConfig.deserialize(json_data)
    config.agents_config[args.agent]["save_path"] += "best_model.zip"
    # config.agents_config[args.agent]["save_path"] = "my_models/pong/pong_ppo/best_model.zip"
    print(config.agents_config[args.agent]["save_path"])

    # env = retro.make(config.game_name)
    env = gym.make("PongNoFrameskip-v4")
    agent = AgentLoader.get_agent(args.agent, config.agents_config, env, load=True)
    env.close()

    env = gym.make("PongNoFrameskip-v4")
    env = DummyVecEnv([lambda: env])
    # env = retro.make(config.game_name, record=video_path)
    env = VecVideoRecorder(
        env,
        video_path,
        record_video_trigger=lambda x: x == 0,
    )
    obs = env.reset()
    done = False
    while not done:
        actions, _ = agent.agent.predict(obs)
        obs, rew, done, info = env.step(actions)
    env.close()
def main_vs_5(config: str):
    with open(config) as fp:
        json_data = json.load(fp)
    config = GameConfig.deserialize(json_data)
    config.agents_config["A2C"]["save_path"] += "_vs_5"
    config.agents_config["A2C"]["tensorboard"] += "_vs_5"
    env = DummyVecEnv(
        [lambda: retro.make(config.game_name, state=config.train_states[0])])
    agent = AgentLoader.get_agent("A2C", config.agents_config, env)
    env.close()

    start_time = time.time()
    for st in tqdm(config.train_states, desc='Main Loop'):
        print(st)
        env = DummyVecEnv([
            lambda: retro.make(config.game_name, state=st, scenario='scenario')
        ])
        agent.agent.set_env(env)
        agent.agent.learn(total_timesteps=10000)
        agent.save()
        env.close()
    end_time = time.time() - start_time
    print(f'\n The Training Took {end_time} seconds')
def record_video(env_id, model, video_length=500, prefix='', video_folder='videos'):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    eval_env = DummyVecEnv(
        [make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
    # eval_env = gym.make(env_id)
    eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)

    # Start the video at step=0 and record 500 steps
    eval_env = VecVideoRecorder(eval_env,
                                video_folder='tmp',
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)

    obs = eval_env.reset()
    for i in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)
    # Close the video recorder
    eval_env.close()
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Separate environment used for rendering
    render_env = factory.make_env()

    callback = CallbackList([])

    # Wrap the environments in a parallel-processing-friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs, norm_obs=True, clip_obs=np.inf,
                            norm_reward=False, clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)

    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack(
            [envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy, envs, n_steps=args.n_steps, verbose=1,
                      policy_kwargs=policy_kwargs, device=args.device,
                      target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
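# Aside (not part of the original script): a minimal sketch, assuming stable_baselines3 with the
# classic gym step API used above, of what the warm-up loop and `envs.training = False` rely on.
# While `training` is True, VecNormalize updates its running observation statistics on every step;
# setting it to False freezes them. The environment id and step counts here are arbitrary.
import gym
import numpy as np
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

venv = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]),
                    norm_obs=True, norm_reward=False)
venv.reset()
for _ in range(20):
    venv.step(np.array([venv.action_space.sample()]))  # statistics update while training is True
mean_before = venv.obs_rms.mean.copy()

venv.training = False  # freeze the running averages
for _ in range(20):
    venv.step(np.array([venv.action_space.sample()]))
assert np.allclose(mean_before, venv.obs_rms.mean)  # statistics no longer change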
def record_video_example():
    # Record a video.
    env_id = "CartPole-v1"
    video_folder = "logs/videos/"
    video_length = 100

    env = DummyVecEnv([lambda: gym.make(env_id)])
    obs = env.reset()

    # Record the video starting at the first step.
    env = VecVideoRecorder(env, video_folder,
                           record_video_trigger=lambda x: x == 0,
                           video_length=video_length,
                           name_prefix=f"random-agent-{env_id}")

    env.reset()
    for _ in range(video_length + 1):
        action = [env.action_space.sample()]
        obs, _, _, _ = env.step(action)
    # Save the video.
    env.close()
def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    # Start the video at step=0 and record 500 steps
    eval_env = VecVideoRecorder(eval_env,
                                video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)

    obs = eval_env.reset()
    for _ in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)
    # Close the video recorder
    eval_env.close()
def eval_100_trials(args):
    with open(args.config) as fp:
        json_data = json.load(fp)
    config = GameConfig.deserialize(json_data)
    config.agents_config[args.agent]["save_path"] += "_vs_time_pt.zip"
    env = DummyVecEnv(
        [lambda: retro.make(config.game_name, state=config.eval_state[1])])
    agent = AgentLoader.get_agent(args.agent, config.agents_config, env, load=True)

    rew_list = []
    trials = 100
    for i in tqdm(range(trials)):
        obs = env.reset()
        done = False
        reward = 0
        while not done:
            actions, _ = agent.agent.predict(obs)
            obs, rew, done, info = env.step(actions)
            reward += rew
        rew_list.append(reward)
    env.close()

    count = sum(i > 0 for i in rew_list)
    print("win percentage = {}%".format(count / trials * 100))
def eval_time(args):
    with open(args.config) as fp:
        json_data = json.load(fp)
    video_path = os.path.join("./videos", args.agent)
    config = GameConfig.deserialize(json_data)
    config.agents_config[args.agent]["save_path"] += "_vs_time_pt_check.zip"
    env = DummyVecEnv(
        [lambda: retro.make(config.game_name, state=config.eval_state[1])])
    agent = AgentLoader.get_agent(args.agent, config.agents_config, env, load=True)
    env.close()

    env = DummyVecEnv([
        lambda: retro.make(
            config.game_name, state=config.eval_state[1], record=video_path)
    ])
    obs = env.reset()
    done = False
    while not done:
        actions, _ = agent.agent.predict(obs)
        obs, rew, done, info = env.step(actions)
        # env.render()
    env.close()
def run_experiment(args):
    # Again, could have used the SB3 tools here, but...
    vecEnv = []
    for i in range(args.n_envs):
        # Bit of trickery here to avoid every lambda referencing the same "i"
        vecEnv.append((lambda idx: lambda: create_env(args, idx))(i))
    vecEnv = DummyVecEnv(vecEnv)

    constraint = AVAILABLE_CONSTRAINTS[args.constraint]
    agent = None
    if constraint == "ClipPPO":
        # Create a vanilla PPO
        agent = PPO("MlpPolicy", vecEnv, verbose=2, device="cpu",
                    n_steps=args.n_steps, clip_range=args.clip_range,
                    learning_rate=args.learning_rate, gamma=args.gamma,
                    ent_coef=args.ent_coef, gae_lambda=1.0,
                    n_epochs=args.n_epochs)
    else:
        constraint = constraint(args)
        agent = SmallStepPPO("MlpPolicy", vecEnv, verbose=2, device="cpu",
                             n_steps=args.n_steps, step_constraint=constraint,
                             learning_rate=args.learning_rate,
                             step_constraint_max_updates=args.max_updates,
                             gamma=args.gamma, ent_coef=args.ent_coef,
                             gae_lambda=1.0)

    output_log_file = None
    if args.output_log:
        output_log_file = open(args.output_log, "w")
        logger.Logger.CURRENT = logger.Logger(
            folder=None,
            output_formats=[logger.HumanOutputFormat(output_log_file)])

    agent.learn(total_timesteps=args.total_timesteps)

    if args.output is not None:
        agent.save(os.path.join(args.output, AGENT_FILE))

    vecEnv.close()
    if output_log_file:
        output_log_file.close()
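# Aside (not part of the original script): a minimal, self-contained illustration of the closure
# pitfall that the (lambda idx: lambda: create_env(args, idx))(i) pattern above works around.
# A plain lambda captures the loop variable by reference, so every thunk would build its env with
# the final value of "i"; calling an outer lambda (or using a default argument) binds the current
# value for each iteration.
naive = [lambda: i for i in range(3)]
bound = [(lambda idx: lambda: idx)(i) for i in range(3)]
assert [f() for f in naive] == [2, 2, 2]  # every closure sees the last "i"
assert [f() for f in bound] == [0, 1, 2]  # each closure keeps its own copy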
def play(env_name, load_file, total_timesteps):
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO.load(load_file, verbose=1)
    obs = env.reset()
    for i in range(total_timesteps):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render()  # dummy
        if done:
            print(info[0]['episode'])
    del model
    env.close()
def test(test_n, seed, model_filename, vec_filename, train, test,
         test_as_class=0, render=False, save_file="default.yml"):
    print("Testing:")
    total_rewards = []
    distance_xs = []
    for i in range(test_n):
        print(f" Seed {seed+i}, model {model_filename} vec {vec_filename}")
        print(f" Train on {train}, test on {test}, w/ bodyinfo {test_as_class}")
        eval_env = utils.make_env(render=render, wrapper=None, robot_body=test,
                                  body_info=test_as_class)
        eval_env = DummyVecEnv([eval_env])
        eval_env = VecNormalize.load(vec_filename, eval_env)
        eval_env.norm_reward = False
        eval_env.seed(seed + i)
        model = PPO.load(model_filename)

        obs = eval_env.reset()
        if render:
            eval_env.env_method("set_view")
        distance_x = 0
        # print(obs)
        total_reward = 0
        for step in range(1000):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = eval_env.step(action)
            if done:
                break
            else:
                # the last observation will be after reset, so skip the last
                distance_x = eval_env.envs[0].robot.body_xyz[0]
                total_reward += reward[0]
            if render:
                time.sleep(0.01)
        eval_env.close()
        print(f"train {train}, test {test}, test_as_class {test_as_class}, step {step}, "
              f"total_reward {total_reward}, distance_x {distance_x}")
        total_rewards.append(total_reward)
        distance_xs.append(distance_x)

    # cast to plain floats so yaml does not serialize numpy float64 values
    total_rewards = [float(x) for x in total_rewards]
    distance_xs = [float(x) for x in distance_xs]
    data = {
        "title": "test",
        "train": train,
        "test": test,
        "total_reward": total_rewards,
        "distance_x": distance_xs,
    }
    with open(f"{save_file}", "w") as f:
        yaml.dump(data, f)
def record_video(env_name, train_env, model, videoLength=500, prefix='', videoPath='videos/'):
    print('record_video function')
    # Wrap the env in a VecVideoRecorder
    local_eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
    local_eval_env = VecNormalize(local_eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
    sync_envs_normalization(train_env, local_eval_env)
    local_eval_env = VecVideoRecorder(local_eval_env,
                                      video_folder=videoPath,
                                      record_video_trigger=lambda step: step == 0,
                                      video_length=videoLength,
                                      name_prefix=prefix)

    obs = local_eval_env.reset()
    for _ in range(videoLength):
        action, _ = model.predict(obs)
        obs, _, _, _ = local_eval_env.step(action)
    # Close the video recorder
    local_eval_env.close()
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Separate environment used for rendering
    render_env = factory.make_env()

    callback = CallbackList([])

    # Wrap the environments in a parallel-processing-friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs)
    else:
        envs = VecNormalize.load(args.stats_path, envs)

    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs)
        learner.learn(total_timesteps=10000000, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.policy_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy, envs, n_steps=args.n_steps, verbose=1,
                      policy_kwargs=policy_kwargs)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
def main_vs_time(config: str):
    with open(config) as fp:
        json_data = json.load(fp)
    config = GameConfig.deserialize(json_data)
    config.agents_config["A2C"]["save_path"] += "_vs_time_pt"
    config.agents_config["A2C"]["tensorboard"] += "_vs_time"
    env = DummyVecEnv(
        [lambda: (retro.make(config.game_name, state=config.eval_state[0]))])
    agent = AgentLoader.get_agent("A2C", config.agents_config, env)

    start_time = time.time()
    with ProgressBarManager_new(40000) as callback:
        agent.agent.learn(total_timesteps=40000, callback=callback)
    agent.save()
    env.close()
    end_time = time.time() - start_time
    print(f'\n The Training Took {end_time} seconds')
def main_vs_time(config: str):
    with open(config) as fp:
        json_data = json.load(fp)
    config = GameConfig.deserialize(json_data)
    config.agents_config["PPO"]["save_path"] += "_vs_time_pt_check"
    config.agents_config["PPO"]["tensorboard"] += "_vs_time_check"
    env = DummyVecEnv(
        [lambda: (retro.make(config.game_name, state=config.eval_state[0]))])
    agent = AgentLoader.get_agent("PPO", config.agents_config, env)
    env.close()

    env = DummyVecEnv(
        [lambda: (retro.make(config.game_name, state=config.eval_state[0]))])
    agent.agent.set_env(env)
    with ProgressBarManager_new(1000) as callback:
        agent.agent.learn(1000, callback=callback)
    agent.save()
    env.close()
def test(seed, model, train, test, normalize_kwargs, body_info=0, render=False):
    print("Testing:")
    print(f" Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = DummyVecEnv([
        utils.make_env(rank=0,
                       seed=utils.seed + 1,
                       render=False,
                       robot_body=test,
                       body_info=0)
    ])
    eval_env = VecNormalize(eval_env, norm_reward=False, **normalize_kwargs)
    eval_env.seed(seed)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:
            # the last observation will be after reset, so skip the last
            distance_x = eval_env.envs[0].robot.body_xyz[0]
            total_reward += reward[0]
        if render:
            time.sleep(0.01)
    eval_env.close()
    print(
        f"train {train}, test {test}, body_info {body_info}, step {step}, "
        f"total_reward {total_reward}, distance_x {distance_x}"
    )
    return total_reward, distance_x
def test_current_exp(args):
    if args.save_img:
        all_folders = glob.glob(os.path.join(img_path, "*"))
        all_folders = [os.path.basename(x) for x in all_folders]
        all_folders = [int(x) if x.isnumeric() else -1 for x in all_folders] + [0]
        current_folder = max(all_folders) + 1
        current_folder = os.path.join(img_path, str(current_folder))
        os.makedirs(current_folder, exist_ok=True)
        print(f"Writing into {current_folder}")
        input("Press Enter...")

    env = DummyVecEnv([make_env(env_id=args.env_id, rank=0, seed=0, render=True)])
    env = VecNormalize.load(args.vnorm_filename, env)
    model = CustomizedPPO.load(args.model_filename, env=env)
    callback = AdjustCameraCallback()

    obs = env.reset()
    callback.reset_lights(env.envs[0].env._p)  # once the window is opened, change the lighting
    if args.save_img:
        time.sleep(1)  # please use this time to maximize the window, so that the image recorded will be full size

    with model.policy.features_extractor.start_testing():
        while True:
            for i in range(1000):
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action)
                callback.camera_simpy_follow_robot(target_env=env.envs[0])
                if args.save_img:
                    callback.write_a_image(current_folder=current_folder, step=i, target_env=env.envs[0])
                    if obs.shape[1] > 100:  # With Vision I guess
                        image = np.rollaxis(obs[:, -3 * 8 * 8:].reshape([3, 8, 8]), 0, start=3) * 255.0
                        print(image.shape)
                        # image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                        cv2.imwrite(f"{current_folder}/vision_{i:05}.png", image)
                if done:
                    break
                time.sleep(0.01)
            break
        time.sleep(0.1)
    env.close()
def collect_rollouts(env, experiment_path):
    """
    Collect rollouts for the given experiment.

    Based on the code here:
    https://github.com/HumanCompatibleAI/imitation/blob/master/src/imitation/scripts/expert_demos.py
    """
    rollout_file = os.path.join(experiment_path, ROLLOUTS_FILE)
    if os.path.isfile(rollout_file):
        return
    sample_until = rollout.make_sample_until(MAX_ROLLOUT_TIMESTEPS, MAX_ROLLOUT_EPISODES)
    venv = DummyVecEnv([lambda: create_env(env) for i in range(NUM_ENVS)])
    agent_path = os.path.join(experiment_path, AGENT_FILE)
    agent = PPO.load(agent_path)
    rollout.rollout_and_save(rollout_file, agent, venv, sample_until)
    venv.close()
                          dtype=np.float32)
    distance_x = 0
    total_reward = 0
    step = 0
    for step in tqdm(range(args.test_steps)):
        g_obs_data[step, :] = obs[0]
        # for i in obs[0]:
        #     print(f"{i:.02f}", end=", ")
        # print("")
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_venv.step(action)
        if args.render:
            # eval_venv.envs[0].camera_adjust()
            time.sleep(0.015)
        if done:
            # it should not matter if the env reset. I guess...
            break
            # pass
        else:
            # the last observation will be after reset, so skip the last
            distance_x = eval_venv.envs[0].robot.body_xyz[0]
            total_reward += reward[0]
    eval_venv.close()
    print(f"model filename: {args.model_filename}")
    print(f"test on {test_body}")
    print(
        f"Results: last step {step}, total_reward {total_reward}, distance_x {distance_x}"
    )
    print("\n" * 4)
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                optimizer.step()

            if args.kle_stop:
                if approx_kl > args.target_kl:
                    break
            if args.kle_rollback:
                if (b_logprobs[minibatch_ind] -
                        agent.get_action(b_obs[minibatch_ind],
                                         b_actions.long()[minibatch_ind])[1]).mean() > args.target_kl:
                    agent.load_state_dict(target_agent.state_dict())
                    break

        # TRY NOT TO MODIFY: record rewards for plotting purposes
        writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]['lr'], global_step)
        writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
        writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
        writer.add_scalar("losses/entropy", entropy.mean().item(), global_step)
        writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
        if args.kle_stop or args.kle_rollback:
            writer.add_scalar("debug/pg_stop_iter", i_epoch_pi, global_step)
        print("SPS:", int(global_step / (time.time() - start_time)))

    envs.close()
    writer.close()
            lambda game_path=args.game_path,
                   b=args.base_port + (i * 2),
                   c="gamelog-" + str(i) + ".txt",
                   d=args.level_path,
                   e=args.image_based,
                   f=args.env_p:
                TankEnv(game_path,
                        game_port=b,
                        game_log_path=c,
                        level_path=d,
                        image_based=e,
                        p=f)
        )
    env_stack = DummyVecEnv(envs)
else:
    env_stack = TankEnv(args.game_path,
                        game_port=args.base_port,
                        game_log_path="gamelog.txt",
                        level_path=args.level_path,
                        image_based=args.image_based,
                        p=args.env_p)

try:
    population = []
    for i in range(args.start):
        agent_name, agent = gen_agent(env_stack,
                                      args.num_envs,
                                      args.model_dir,
                                      args.noun_file_path,
                                      args.adj_file_path,
                                      batch_size=args.batch_size,
                                      image_based=args.image_based,
                                      image_pretrain=args.image_pretrain,
                                      env_p=args.env_p)
        population.append(agent_name)
        if args.nem:
            population.append(gen_nemesis(agent_name, agent, env_stack, args.num_envs, args.model_dir,
                                          image_based=args.image_based, image_pretrain=args.image_pretrain,
                                          env_p=args.env_p))
        if args.surv:
            population.append(gen_survivor(agent_name, agent, env_stack, args.num_envs, args.model_dir,
                                           image_based=args.image_based, image_pretrain=args.image_pretrain,
                                           env_p=args.env_p))
    if args.start:
        with open(args.model_dir + "/population.txt", 'w') as pop_file:
            for p in population:
                pop_file.write(p + '\n')
finally:
    env_stack.close()
print("PBT Preamble complete", flush=True)
def test_framestack_vecenv():
    """Test that the framestack environment stacks on the desired axis"""
    image_space_shape = [12, 8, 3]
    zero_acts = np.zeros([N_ENVS] + image_space_shape)

    transposed_image_space_shape = image_space_shape[::-1]
    transposed_zero_acts = np.zeros([N_ENVS] + transposed_image_space_shape)

    def make_image_env():
        return CustomGymEnv(
            gym.spaces.Box(
                low=np.zeros(image_space_shape),
                high=np.ones(image_space_shape) * 255,
                dtype=np.uint8,
            ))

    def make_transposed_image_env():
        return CustomGymEnv(
            gym.spaces.Box(
                low=np.zeros(transposed_image_space_shape),
                high=np.ones(transposed_image_space_shape) * 255,
                dtype=np.uint8,
            ))

    def make_non_image_env():
        return CustomGymEnv(
            gym.spaces.Box(low=np.zeros((2, )), high=np.ones((2, ))))

    vec_env = DummyVecEnv([make_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2)
    obs, _, _, _ = vec_env.step(zero_acts)
    vec_env.close()
    # Should be stacked on the last dimension
    assert obs.shape[-1] == (image_space_shape[-1] * 2)

    # Try automatic stacking on the first dimension now
    vec_env = DummyVecEnv([make_transposed_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2)
    obs, _, _, _ = vec_env.step(transposed_zero_acts)
    vec_env.close()
    # Should be stacked on the first dimension (note the transposing in make_transposed_image_env)
    assert obs.shape[1] == (image_space_shape[-1] * 2)

    # Try forcing the dimensions
    vec_env = DummyVecEnv([make_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2, channels_order="last")
    obs, _, _, _ = vec_env.step(zero_acts)
    vec_env.close()
    # Should be stacked on the last dimension
    assert obs.shape[-1] == (image_space_shape[-1] * 2)

    vec_env = DummyVecEnv([make_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2, channels_order="first")
    obs, _, _, _ = vec_env.step(zero_acts)
    vec_env.close()
    # Should be stacked on the first dimension
    assert obs.shape[1] == (image_space_shape[0] * 2)

    # Test an invalid channels_order
    vec_env = DummyVecEnv([make_image_env for _ in range(N_ENVS)])
    with pytest.raises(AssertionError):
        vec_env = VecFrameStack(vec_env, n_stack=2, channels_order="not_valid")

    # Test that it works with non-image envs when no channels_order is given
    vec_env = DummyVecEnv([make_non_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2)
def main():
    # nn = torch.nn.Sequential(torch.nn.Linear(8, 64), torch.nn.Tanh(),
    #                          torch.nn.Linear(64, 2))
    os.makedirs(_log_dir, exist_ok=True)
    DoTraining = True
    StartFresh = True
    num_cpu = 8

    if DoTraining:
        # This doesn't work, but it might have something to do with how the environment is written
        # num_cpu = 1
        # env = make_vec_env(env_id, n_envs=num_cpu, monitor_dir=_log_dir)  # make_vec_env contains Monitor

        # Create the callback: check every 1000 steps
        # callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=_log_dir)

        if StartFresh:
            env = SubprocVecEnv([
                make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)
            ])
            env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
            env.reset()
            policy_kwargs = {
                'net_arch': [128, 128, 128],
            }
            model = PPO('MlpPolicy',
                        env,
                        policy_kwargs=policy_kwargs,
                        verbose=2,
                        tensorboard_log=tb_log)
        else:
            env = SubprocVecEnv([
                make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)
            ])
            env = VecNormalize.load(_stats_path, env)
            env.reset()
            model = PPO.load(
                'log\monitor_simpledriving_vecNormalized_128x3_2\PPO_4243456.mdl',
                tensorboard_log=tb_log)
            model.set_env(env)

        eval_env = gym.make(env_id)
        # print('!!!!Checking Environment!!!!')
        # print(check_env(eval_env))
        mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
        print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')

        for _ in range(50):
            model.learn(total_timesteps=100000, tb_log_name=env_id, reset_num_timesteps=False)  # , callback=callback
            mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
            print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
            model.save(_log_dir + 'PPO_{}'.format(model.num_timesteps) + '.mdl')
            env.save(_log_dir + 'vec_normalize_{}'.format(model.num_timesteps) + '.pkl')

    if not DoTraining:
        # eval_env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        # eval_env = VecVideoRecorder(eval_env, video_folder='videos/',
        #                             record_video_trigger=lambda step: step == 0, video_length=500,
        #                             name_prefix='test')
        # eval_env.training = False
        # eval_env.norm_reward = False
        # eval_env.reset()
        eval_env = DummyVecEnv(
            [make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
        # eval_env = gym.make(env_id)
        eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        model = PPO.load(
            'log\monitor_simpledriving_vecNormalized_128x3\PPO_5734400.mdl',
            tensorboard_log=tb_log)
        model.set_env(eval_env)

        # record_video(env_id, model, video_length=500, prefix='ppo_' + env_id)
        # Start the video at step=0 and record 500 steps
        # eval_env = VecVideoRecorder(eval_env, video_folder='tmp',
        #                             record_video_trigger=lambda step: step == 0, video_length=500,
        #                             name_prefix='')

        obs = eval_env.reset()
        # for i in range(500):
        #     action, _ = model.predict(obs)
        #     obs, _, _, _ = eval_env.step(action)
        # eval_env.close()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, _, done, _ = eval_env.step(action)
            # eval_env.render()
            if done.any():
                # obs = eval_env.reset()
                # time.sleep(1/30)
                eval_env.close()
                break
def run_experiment(args):
    n_envs = None
    n_timesteps = None
    policy = "MlpPolicy"
    hyperparams = {}
    # Super-pretty manual hardcoding of the parameters right here,
    # but we are only going to run one type.
    if args.env == "BipedalWalkerHardcore-v3":
        # Adapted from
        # https://github.com/araffin/rl-baselines-zoo/blob/master/hyperparams/ppo2.yml
        n_envs = 16
        n_timesteps = int(10e7)
        policy = "MlpPolicy"
        hyperparams = {
            "n_steps": 2048,
            "gae_lambda": 0.95,
            "gamma": 0.99,
            "n_epochs": 10,
            "ent_coef": 0.001,
            "clip_range": 0.2,
            "clip_range_vf": 0.2,
            "learning_rate": 2.5e-4,
            "batch_size": (2048 * 16) // 32
        }
    else:
        # LunarLander-v2
        n_envs = 16
        n_timesteps = int(1e6)
        policy = "MlpPolicy"
        hyperparams = {
            "n_steps": 1024,
            "gae_lambda": 0.98,
            "gamma": 0.999,
            "n_epochs": 4,
            "ent_coef": 0.01,
            "clip_range": 0.2,
            "clip_range_vf": 0.2,
            "batch_size": (1024 * 16) // 32
        }

    vecEnv = []
    for i in range(n_envs):
        # Bit of trickery here to avoid every lambda referencing the same "i"
        vecEnv.append((lambda idx: lambda: create_env(args, idx))(i))
    vecEnv = DummyVecEnv(vecEnv)

    agent_class = AVAILABLE_ALGORITHMS[args.agent]
    agent = agent_class(policy, vecEnv, verbose=1, device="cpu", **hyperparams)

    # Prepare callback
    checkpoint_dir = os.path.join(args.output, CHECKPOINT_DIR)
    os.makedirs(checkpoint_dir)
    # Note that save_freq is counted in the number of agent step-calls,
    # not env step-calls.
    save_freq = n_timesteps // (args.num_snapshots * n_envs)
    checkpoint_callback = CheckpointCallback(save_freq, checkpoint_dir)

    agent.learn(total_timesteps=n_timesteps, callback=checkpoint_callback)

    agent.save(os.path.join(args.output, AGENT_FILE))
    vecEnv.close()
else:
    body = args.body_id
print(body)

env = DummyVecEnv([utils.make_env(rank=i, seed=utils.seed, render=args.render, robot_body=body, body_info=0)
                   for i in range(train_num_envs)])
save_filename = f"model-ant-single-{body}"
env = VecNormalize(env, **normalize_kwargs)

keys_remove = ["normalize", "n_envs", "n_timesteps", "policy"]
for key in keys_remove:
    del hyperparams[key]

eval_env = DummyVecEnv([utils.make_env(rank=0, seed=utils.seed + 1, render=False, robot_body=2, body_info=0)])
eval_env = VecNormalize(eval_env, norm_reward=False, **normalize_kwargs)
eval_callback = EvalCallback(
    eval_env=eval_env,
    n_eval_episodes=3,
    eval_freq=1e4,  # will be implicitly multiplied by 16 (train_num_envs)
    deterministic=True,
)
# eval_callback = None

model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=f"{folder}/tb/{save_filename}", seed=utils.seed, **hyperparams)
model.learn(total_timesteps=total_timesteps, callback=eval_callback)
model.save(f"{folder}/{save_filename}")
# Important: save the running average; for testing the agent we need that normalization
model.get_vec_normalize_env().save(f"{folder}/{save_filename}-vecnormalize.pkl")
env.close()
            verbose=1,
            tensorboard_log=str(common.output_data_folder / "tensorboard" / saved_model_filename),
            seed=common.seed,
            **hyperparams)

if len(args.initialize_weights_from) > 0:
    try:
        load_model = PPO.load(args.initialize_weights_from)
        load_weights = load_model.policy.state_dict()
        model.policy.load_state_dict(load_weights)
        print(f"Weights loaded from {args.initialize_weights_from}")
    except Exception:
        print("Initialize weights error.")
        raise

try:
    model.learn(total_timesteps=args.train_steps, callback=all_callbacks)
except KeyboardInterrupt:
    pass

model.save(str(common.output_data_folder / "models" / saved_model_filename))
if args.vec_normalize:
    # Important: save the running average; for testing the agent we need that normalization
    model.get_vec_normalize_env().save(
        str(common.output_data_folder / "models" / f"{saved_model_filename}.vnorm.pkl"))
venv.close()
def test(seed, model_filename, vec_filename, train, test, test_as_class=0, render=False, save_file="default.yml"):
    global g_step, g_obs_data
    print("Testing:")
    total_rewards = []
    distance_xs = []
    if True:
        os.makedirs(f"{folder}/obs_data_videos", exist_ok=True)
        g_step = 0
        print(f" Seed {seed}, model {model_filename} vec {vec_filename}")
        print(f" Train on {train}, test on {test}, w/ bodyinfo {test_as_class}")
        if test_as_class >= 0:
            bodyinfo = test_as_class
        else:
            if args.with_bodyinfo:
                bodyinfo = test // 100
            else:
                bodyinfo = 0
        # default_wrapper = wrapper.BodyinfoWrapper
        # if args.disable_wrapper:
        #     default_wrapper = None
        default_wrapper = wrapper.WalkerWrapper
        eval_env = utils.make_env(template=utils.template(test), render=render, robot_body=test,
                                  wrapper=default_wrapper, body_info=bodyinfo)
        eval_env = DummyVecEnv([eval_env])
        if args.vec_normalize:
            eval_env = VecNormalize.load(vec_filename, eval_env)
            eval_env.norm_reward = False
        eval_env.seed(seed)
        model = PPO.load(model_filename)

        obs = eval_env.reset()
        g_obs_data = np.zeros(shape=[args.test_steps, obs.shape[1]], dtype=np.float32)
        if render:
            # eval_env.env_method("set_view")
            import common.linux
            common.linux.fullscreen()
            print("\n\nWait for a while, so I have the time to press Ctrl+F11 to enter FullScreen Mode.\n\n")
            time.sleep(2)  # Wait for a while, so I have the time to press Ctrl+F11 to enter FullScreen Mode.
        distance_x = 0
        # print(obs)
        total_reward = 0
        for step in tqdm(range(args.test_steps)):
            g_obs_data[step, :] = obs[0]
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = eval_env.step(action)
            if render:
                eval_env.envs[0].camera_adjust()
                if args.disable_saving_image:
                    time.sleep(0.01)
                else:
                    (width, height, rgbPixels, _, _) = eval_env.envs[0].pybullet.getCameraImage(
                        1920, 1080, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL)
                    image = rgbPixels[:, :, :3]
                    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                    cv2.imwrite(f"{folder}/obs_data_videos/getCameraImage_b{test}_s{seed}_{step:05}.png", image)
            if done:
                # it should not matter if the env reset. I guess...
                # break
                pass
            else:
                # the last observation will be after reset, so skip the last
                distance_x = eval_env.envs[0].robot.body_xyz[0]
                total_reward += reward[0]
            # if render:
            #     time.sleep(0.01)
        eval_env.close()
        print(f"train {train}, test {test}, test_as_class {test_as_class}, step {step}, "
              f"total_reward {total_reward}, distance_x {distance_x}")

        if args.save_obs_data:
            base_obs_data = None
            sorted_data = g_obs_data.copy()
            if test != 0 or seed != 0:
                # if sorted_arg_obs_data exists, use the existing one,
                # because we want to compare the patterns of two experiments
                sorted_arg_obs_data = np.load(f"{folder}/sorted_arg_obs_data.npy")
                base_obs_data = np.load(f"{folder}/base_obs_data.npy")
            else:
                sorted_arg_obs_data = np.argsort(np.mean(sorted_data, axis=0))
                np.save(f"{folder}/sorted_arg_obs_data.npy", sorted_arg_obs_data)
                base_obs_data = g_obs_data.copy()
                np.save(f"{folder}/base_obs_data.npy", base_obs_data)
            # sorted_data = sorted_data[:, sorted_arg_obs_data]
            # base_obs_data = base_obs_data[:, sorted_arg_obs_data]
            for step in tqdm(range(args.test_steps)):
                plt.close()
                plt.figure(figsize=[10, 4])
                if test != 0 or seed != 0:
                    x = sorted_data[step]
                    plt.bar(np.arange(len(x)), x, color=[0.1, 0.3, 0.7, 0.5])
                x = base_obs_data[step]
                plt.bar(np.arange(len(x)), x, color=[0.6, 0.6, 0.6, 0.5])
                plt.ylim(-2, 2)
                plt.savefig(f"{folder}/obs_data_videos/barchart_b{test}_s{seed}_{step:05}.png")
                plt.close()

        total_rewards.append(total_reward)
        distance_xs.append(distance_x)

    # cast to plain floats so yaml does not serialize numpy float64 values
    total_rewards = [float(x) for x in total_rewards]
    distance_xs = [float(x) for x in distance_xs]
    data = {
        "title": "test",
        "train": train,
        "test": test,
        "total_reward": total_rewards,
        "distance_x": distance_xs,
    }
    with open(f"{save_file}", "w") as f:
        yaml.dump(data, f)