def load_new_opp(self, idx, opp_fp, opp_elo):
    if idx < len(self.opponents):
        self.opponents[idx] = (PPO.load(opp_fp), opp_elo, opp_fp)
        self.curr_opp = idx
    else:
        self.opponents.append((PPO.load(opp_fp), opp_elo, opp_fp))
        self.curr_opp = len(self.opponents) - 1
def main():
    tensorboard_log = "./log"
    env = Pinokio5()
    # Optional: PPO requires a vectorized environment to run;
    # the env is wrapped automatically when passed to the constructor.
    # env = DummyVecEnv([lambda: env])
    if os.path.exists(save_file):
        model = PPO.load(save_file, env=DummyVecEnv([lambda: env]),
                         tensorboard_log=tensorboard_log)
    else:
        model = PPO(MlpPolicy, env, verbose=1, tensorboard_log=tensorboard_log)
    try:
        while True:
            # model.learn(total_timesteps=10000)
            model.learn(total_timesteps=8000000, tb_log_name=tb_log_name)
            model.save(save_file)
            obs = env.reset()
            for i in range(100):
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
                env.render()
                if done:
                    print("resetting because " + str(done))
                    env.reset()
    except KeyboardInterrupt:
        print("Saving before exiting...")
        model.save(save_file)
        print("k bye")
def run_model_stablebaseline3(flow_params, num_cpus=1, rollout_size=5, num_steps=5):
    from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
    from stable_baselines3 import PPO
    from stable_baselines3.ppo import MlpPolicy
    import torch.nn as nn

    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])
    train_model = PPO(MlpPolicy, env=env, verbose=1, n_epochs=rollout_size,
                      tensorboard_log="./PPO_tensorboard/",
                      device="cuda")  # device selection: set to "cpu" if no GPU is available
    train_model.learn(total_timesteps=num_steps * rollout_size)
    # return train_model
def train(run_name: str, config: Dict[str, Any]):
    cfg_t = config['train']
    cfg_p = config['preprocess']
    run_dir = get_run_dir(run_name)
    os.makedirs(run_dir, exist_ok=False)  # create the run directory before writing into it
    dump_config(run_dir, config)

    def _make_env(n_envs: int, is_eval: bool):
        return make_env(seed=0, n_envs=n_envs, run_dir=run_dir,
                        frame_skip=cfg_p['frame_skip'],
                        frame_stack=cfg_p['frame_stack'],
                        is_eval=is_eval)

    train_env = _make_env(cfg_t['n_envs'], False)
    eval_env = _make_env(1, False)
    model = PPO('CnnPolicy', train_env,
                n_steps=cfg_t['n_steps'],
                n_epochs=cfg_t['n_epochs'],
                batch_size=cfg_t['batch_size'],
                learning_rate=cfg_t['lr'],
                tensorboard_log=os.path.join(run_dir, 'tb'))
    model.learn(cfg_t['total_steps'],
                eval_env=eval_env,
                eval_freq=cfg_t['eval_freq'] // cfg_t['n_envs'],
                n_eval_episodes=cfg_t['n_eval_eps'],
                eval_log_path=run_dir)
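# The `make_env` helper used above is defined elsewhere in this project. Below is
# a minimal sketch of what such a helper could look like, assuming an image-based
# environment with frame skipping and frame stacking. `ENV_ID` and the specific
# wrappers are assumptions for illustration, not the project's actual code.
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.atari_wrappers import MaxAndSkipEnv

ENV_ID = "PongNoFrameskip-v4"  # hypothetical environment id


def make_env(seed, n_envs, run_dir, frame_skip, frame_stack, is_eval):
    # Build n_envs monitored environments, apply frame skipping per env,
    # then stack the last `frame_stack` frames at the vectorized level.
    venv = make_vec_env(
        ENV_ID,
        n_envs=n_envs,
        seed=seed,
        monitor_dir=None if is_eval else run_dir,
        wrapper_class=lambda e: MaxAndSkipEnv(e, skip=frame_skip),
    )
    return VecFrameStack(venv, n_stack=frame_stack)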
def train_advil(env, n=0):
    venv = gym.make(env)
    for i in range(n):
        mean_rewards = []
        std_rewards = []
        for num_trajs in range(0, 26, 5):
            if num_trajs == 0:
                expert_data = make_sa_dataloader(env, normalize=True)
                pi = advil_training(expert_data, venv, iters=0)
            else:
                expert_data = make_sa_dataloader(env, max_trajs=num_trajs,
                                                 normalize=True, batch_size=1024)
                pi = advil_training(expert_data, venv)

            def get_policy(*args, **kwargs):
                return pi

            model = PPO(get_policy, env, verbose=1)
            mean_reward, std_reward = evaluate_policy(model, model.get_env(),
                                                      n_eval_episodes=10)
            mean_rewards.append(mean_reward)
            std_rewards.append(std_reward)
            print("{0} Trajs: {1}".format(num_trajs, mean_reward))
        np.savez(os.path.join("learners", env, "advil_rewards_{0}".format(i)),
                 means=mean_rewards, stds=std_rewards)
def init_adv(adv_env_id, disable_adv=False, env_kwargs=None):
    bridge = Bridge()
    default_env_kwargs = {
        'renders' if 'CartPole' in adv_env_id else 'render': render
    }
    if env_kwargs is None:
        env_kwargs = {}
    env_kwargs.update(default_env_kwargs)
    env = make_vec_env(adv_env_id, env_kwargs=env_kwargs, seed=seed)
    env = VecNormalize(env)
    prot_agent = PPO('MlpPolicy', env, verbose=verbose, seed=seed, n_steps=ts,
                     bridge=bridge, is_protagonist=True)
    if disable_adv:
        bridge.link_agents(prot_agent, None)
    else:
        adv_agent = PPO('MlpPolicy', env, verbose=verbose, seed=seed, n_steps=ts,
                        bridge=bridge, is_protagonist=False)
        bridge.link_agents(prot_agent, adv_agent)
    return prot_agent, env
def save_new_model(name, env, num_envs, model_dir, batch_size=None, n_steps=None,
                   n_epochs=None, clip_range=None, gamma=None, gae_lambda=None,
                   vf_coef=None, ent_coef=None, learning_rate=None,
                   image_based=False, image_pretrain=None, verbose=0, w=.1):
    if not batch_size:
        batch_size = choose_hyperp("batch_size", 10, w=w)
    if not n_steps:
        n_steps = max(batch_size, choose_hyperp("n_steps", 10, w=w)) // num_envs
    if not n_epochs:
        n_epochs = choose_hyperp("n_epochs", 2, w=w)
    if not clip_range:
        clip_range = choose_hyperp("clip_range", 1, w=w)
    if not gamma:
        gamma = choose_hyperp("gamma", 2, w=w)
    if not gae_lambda:
        gae_lambda = choose_hyperp("gae_lambda", 1, w=w)
    if not vf_coef:
        vf_coef = choose_hyperp("vf_coef", 0, w=w)
    if not ent_coef:
        ent_coef = choose_hyperp("ent_coef", 0, w=w)
    if not learning_rate:
        learning_rate = choose_hyperp("learning_rate", 5, w=w)

    feature_extractor = "MlpPolicy"
    if image_based:
        feature_extractor = "CnnPolicy"

    model = PPO(feature_extractor, env, batch_size=batch_size, n_steps=n_steps,
                n_epochs=n_epochs, clip_range=clip_range, gamma=gamma,
                gae_lambda=gae_lambda, vf_coef=vf_coef, ent_coef=ent_coef,
                learning_rate=learning_rate, verbose=verbose)
    if image_based and image_pretrain:
        model.policy.features_extractor.cnn.load_state_dict(
            T.load(image_pretrain + "_cnn.pth"))
        model.policy.features_extractor.linear.load_state_dict(
            T.load(image_pretrain + "_linear.pth"))
    model.save(model_dir + name + '/' + name + "_0")
    return model
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Separate environment used only for rendering
    render_env = factory.make_env()

    callback = CallbackList([])

    # Wrap the environment in a parallel-processing-friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs, norm_obs=True, clip_obs=np.inf,
                            norm_reward=False, clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)

    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack([envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy, envs, n_steps=args.n_steps, verbose=1,
                      policy_kwargs=policy_kwargs, device=args.device,
                      target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
def train():
    train_images, test_images = load_data("dataset")
    env = Monitor(
        PuzzleEnv(images=train_images,
                  img_size=IMG_SIZE,
                  channel_num=CHANNEL_NUM,
                  puzzle_size=(3, 3),
                  max_step_num=100,
                  puzzle_type="switch",
                  dist_type="manhattan",
                  penalty_for_step=-0.2,
                  reward_for_completiton=20,
                  positive_reward_coefficient=1.0,
                  obs_conf=OBS_CONF))
    policy_kwargs = dict(
        features_extractor_class=CustomCNN,
        features_extractor_kwargs=dict(features_dim=128),
    )
    model = PPO('CnnPolicy', env, policy_kwargs=policy_kwargs, verbose=1,
                learning_rate=0.0005, seed=42)
    model.learn(total_timesteps=1000000)
    test(model, test_images)
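# `CustomCNN` is defined elsewhere in this project. A minimal sketch of such a
# features extractor is shown below, adapted from the stable-baselines3
# custom-policy documentation; the layer sizes are assumptions, not the
# project's actual architecture.
import torch as th
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor


class CustomCNN(BaseFeaturesExtractor):
    def __init__(self, observation_space, features_dim: int = 128):
        super().__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Flatten(),
        )
        # Infer the flattened size with one forward pass on a dummy observation
        with th.no_grad():
            n_flatten = self.cnn(
                th.as_tensor(observation_space.sample()[None]).float()
            ).shape[1]
        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        return self.linear(self.cnn(observations))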
def main(args):
    envs = make_vec_env(args.env_name, n_envs=args.num_envs,
                        vec_env_cls=SubprocVecEnv)
    viz_env = None
    if args.viz:
        nm_core, nm_vrsn = args.env_name.split('-')
        nm_core += 'Viz' if args.viz else 'Dbg' if args.debug else ''
        viz_env = make_vec_env(nm_core + '-' + nm_vrsn, n_envs=1)
    rl_learner = PPO('MlpPolicy', envs, verbose=1, seed=args.seed, device='cpu')
    for epoch in range(args.num_epochs):
        rl_learner.learn(args.steps_per_epoch)
        if args.viz:
            obs = viz_env.reset()
            done = False
            while not done:
                act, _ = rl_learner.predict(obs)
                if len(act.shape) > len(viz_env.action_space.shape):
                    act = act[0:1]  # just one viz env
                obs, rwd, done, _ = viz_env.step(act)
                time.sleep(0.01)  # to make motions visible
def main():
    num_cpu = 1
    load_version = ''
    save_version = '1b_v0'
    load_dir = '../models'
    save_dir = '../models'
    timesteps_per_checkpoint = int(1e6)
    num_checkpoints = int(1e1)  # more checkpoints -> longer training -> stronger agent
    try:
        os.mkdir(save_dir)
    except OSError:
        pass

    alg_env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    print('created alg env')

    train_policy = 'MlpPolicy'
    load_path = '{}/alg_v{}.zip'.format(load_dir, load_version)
    if os.path.exists(load_path):
        alg = PPO(train_policy, alg_env, verbose=0)
        alg.set_parameters(load_path, exact_match=True)
        # alg = PPO.load(load_path, env=alg_env)
        print('loaded alg checkpoint ' + load_path)
    else:
        alg = PPO(train_policy, alg_env, verbose=0)
        print('created alg model')

    save_path = '{}/alg_v{}.zip'.format(save_dir, save_version)
    for _ in range(num_checkpoints):
        alg.learn(total_timesteps=timesteps_per_checkpoint)
        alg.save(save_path)
        print('saved alg checkpoint ' + save_path)
def main():
    base_args, base_parser = get_logger2_args()
    args = get_args(base_parser)
    args.device = init_gpus_and_randomness(args.seed, args.gpu)
    logger = Logger2('/tmp/tmp', use_tensorboardX=True)
    logger.log_tb_object(args, 'args')
    envs = make_vec_env(args.env_name, n_envs=args.num_envs,
                        vec_env_cls=SubprocVecEnv)
    viz_env = None
    if args.visualize:
        nm_core, nm_vrsn = args.env_name.split('-')
        nm_core += 'Viz' if args.visualize else 'Dbg' if args.debug else ''
        viz_env = make_vec_env(nm_core + '-' + nm_vrsn, n_envs=1)
    rl_learner = PPO('MlpPolicy', envs, verbose=1, seed=args.seed, device='cpu')
    for epoch in range(args.num_epochs):
        rl_learner.learn(args.steps_per_epoch)
        if args.visualize:
            obs = viz_env.reset()
            done = False
            while not done:
                act, _ = rl_learner.predict(obs)
                if len(act.shape) > len(viz_env.action_space.shape):
                    act = act[0:1]  # just one viz env
                obs, rwd, done, _ = viz_env.step(act)
                time.sleep(0.01)  # to make motions visible
def __call__(self):
    policy_kwargs = dict(activation_fn=th.nn.ReLU)
    model = PPO('CnnPolicy', self.env, learning_rate=1e-3,
                policy_kwargs=policy_kwargs).learn(self.total_time_steps)
    model.save('PPO_' + self.game_name)
    del model  # the model has been trained and saved, so it is no longer needed
def test_vec_with_ppo():
    """
    Test the `VecExtractDictObs` with PPO
    """
    env = DictObsVecEnv()
    env = VecExtractDictObs(env, "rgb")
    monitor_env = VecMonitor(env)
    model = PPO("MlpPolicy", monitor_env, verbose=1, n_steps=64, device="cpu")
    model.learn(total_timesteps=250)
def main():
    # env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])
    model = PPO('MlpPolicy', env, verbose=1, n_steps=500, batch_size=10000)
    model.learn(total_timesteps=2500000000)
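# `make_env` is defined elsewhere in this project. A minimal sketch of such a
# helper is shown below, assuming each subprocess should work on its own copy of
# the shared environment instance with a per-worker seed; the deep-copy and
# seeding details are assumptions, not the project's actual code.
import copy

from stable_baselines3.common.utils import set_random_seed


def make_env(base_env, rank, seed=0):
    def _init():
        env = copy.deepcopy(base_env)  # each subprocess gets its own copy
        env.seed(seed + rank)
        return env
    set_random_seed(seed)
    return _init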
class Agent(object):
    def __init__(self, env, model=None):
        if model:
            self.model = model
        else:
            self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(":", "-")
            os.makedirs(self.log_dir, exist_ok=True)
            monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
            vec_env = DummyVecEnv([lambda: monitor_env])
            policy_kwargs = dict(
                features_extractor_class=CustomCNN,
                features_extractor_kwargs=dict(features_dim=256),
                net_arch=[dict(pi=[64, 64], vf=[64, 64])])
            self.model = PPO(CustomCnnPolicy, vec_env, policy_kwargs=policy_kwargs,
                             verbose=1, learning_rate=0.001)

    def function(self, obs, conf):
        import random
        col, _ = self.model.predict(
            np.array(obs['board']).reshape(6, 7, 1))  # TODO: Connect-4 specific so far
        is_valid = (obs['board'][int(col)] == 0)
        if is_valid:
            return int(col)
        else:
            return random.choice([
                col for col in range(conf.columns) if obs['board'][int(col)] == 0
            ])

    def train(self, timesteps):
        self.model.learn(total_timesteps=timesteps)

    def save(self, name: str):
        self.model.save(name)

    def load(self, name: str, env, replace_parameters=None):
        self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(":", "-")
        os.makedirs(self.log_dir, exist_ok=True)
        monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
        vec_env = DummyVecEnv([lambda: monitor_env])
        self.model = PPO.load(name, env=vec_env, custom_objects=replace_parameters)

    def plot(self):
        # Plot cumulative reward
        with open(os.path.join(self.log_dir, "monitor.csv"), 'rt') as fh:
            firstline = fh.readline()
            assert firstline[0] == '#'
            df = pd.read_csv(fh, index_col=None)['r']
        df.rolling(window=1000).mean().plot()
        plt.show()
def test_ppo_warnings():
    """Test that PPO warns and errors correctly on problematic rollout buffer sizes"""
    # Only 1 step: advantage normalization will return NaN
    with pytest.raises(AssertionError):
        PPO("MlpPolicy", "Pendulum-v0", n_steps=1)
    # Truncated mini-batch
    with pytest.warns(UserWarning):
        PPO("MlpPolicy", "Pendulum-v0", n_steps=6, batch_size=8)
def test_ppo():
    env = gym.make("fishing-v1")
    check_env(env)
    # A decent policy needs roughly 200000 timesteps (about a 12-minute test),
    # so this test only runs a short sanity check
    model = PPO("MlpPolicy", env, verbose=0)
    model.learn(total_timesteps=200)
    # Simulate a run with the trained model, visualize result
    df = env.simulate(model)
    env.plot(df, "PPO-test.png")
    # Evaluate model
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
def test_vec_monitor_ppo(recwarn):
    """
    Test the `VecMonitor` with PPO
    """
    env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    env.seed(0)
    monitor_env = VecMonitor(env)
    model = PPO("MlpPolicy", monitor_env, verbose=1, n_steps=64, device="cpu")
    model.learn(total_timesteps=250)
    # No warnings because we are using `VecMonitor`
    evaluate_policy(model, monitor_env)
    assert len(recwarn) == 0
def main():
    # Create the callback: check every 1000 steps
    log_dir = 'log'
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    num_cpu = 16
    model_stats_path = os.path.join(log_dir, "sac_" + env_name)
    env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
    tb_log = 'tb_log'
    videoName = '5M_timesteps_sac'
    tb_log_name = videoName

    if StartFresh:
        # env = make_vec_env(env_name, n_envs=4)
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        policy_kwargs = {
            'net_arch': [128, 64, 32],
        }
        model = PPO('MlpPolicy', env,
                    learning_rate=0.001,
                    n_steps=500,
                    # batch_size=0,
                    # n_epochs=1,
                    gamma=0.9,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize.load(env_stats_path, env)
        env.reset()
        model = PPO.load(model_stats_path, tensorboard_log=tb_log)
        model.set_env(env)

    if DoTraining:
        eval_env = make_vec_env(env_name, n_envs=1)
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
        model.learn(total_timesteps=25000000, tb_log_name=tb_log_name,
                    reset_num_timesteps=False)  # optionally: callback=callback
        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        # mean_reward, std_reward = evaluate_policy(model, eval_env)
        # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
        record_video(env_name, model, video_length=2000,
                     prefix='ppo_' + env_name + videoName)
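# `SaveOnBestTrainingRewardCallback`, used in this snippet and the next one, is
# defined elsewhere; below is a minimal sketch adapted from the stable-baselines3
# callback documentation. It periodically reads the Monitor logs and saves the
# model whenever the recent mean reward improves. Details such as the 100-episode
# window are assumptions for illustration.
import os

import numpy as np
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy


class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq: int, log_dir: str, verbose: int = 1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, "best_model")
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        os.makedirs(self.log_dir, exist_ok=True)

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            # Retrieve episode rewards logged by Monitor/VecMonitor
            x, y = ts2xy(load_results(self.log_dir), "timesteps")
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True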
def train(env, log_dir):
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    env = VecNormalize(env, training=True, norm_obs=True, norm_reward=True,
                       gamma=0.9997, clip_obs=10., clip_reward=10., epsilon=0.1)
    drive = PPO("MlpPolicy", env,
                ent_coef=0.01,
                vf_coef=1,
                batch_size=32,
                learning_rate=linear_schedule(0.001),
                clip_range=linear_schedule(0.1),
                n_steps=1000,
                n_epochs=20,
                tensorboard_log=log_dir + "/drive_tensorboard_log",
                verbose=1)
    drive.learn(total_timesteps=total_timesteps, callback=callback)
    for i in range(total_train_runs):
        env.close()
        drive.learn(total_timesteps=total_timesteps, callback=callback,
                    reset_num_timesteps=False)
    drive.save("conduziadrive")
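# `linear_schedule` above is assumed to be the usual helper from the
# stable-baselines3 documentation: it returns a function of the remaining
# training progress (1.0 at the start, 0.0 at the end) that decays the value
# linearly. Sketch below.
from typing import Callable


def linear_schedule(initial_value: float) -> Callable[[float], float]:
    def func(progress_remaining: float) -> float:
        # progress_remaining goes from 1 (beginning) to 0 (end of training)
        return progress_remaining * initial_value
    return func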
def run(config: Dict[str, Any], logdir: pathlib.PosixPath):
    env = make_env(config)
    if config["mode"] == "evaluate":
        print("Start evaluation.")
        model = PPO.load(logdir / "model.zip")
    elif config["mode"] == "train" and args.logdir:
        print("Start training from existing model.")
        model = PPO.load(logdir / "model.zip")
        model.set_env(env)
        model.learn(total_timesteps=config["train_steps"])
    else:
        print("Start training.")
        model = PPO(
            "CnnPolicy",
            env,
            verbose=1,
            tensorboard_log=logdir / "tensorboard",
            use_sde=True,
        )
        model.learn(total_timesteps=config["train_steps"])

    mean_reward, std_reward = evaluate_policy(
        model, env, n_eval_episodes=config["eval_eps"], deterministic=True)
    print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

    if config["mode"] == "train":
        model.save(logdir / "model")
    env.close()
def main():
    env = gym.make(ENV_NAME)
    model = PPO('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=100000)
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            obs = env.reset()
    env.close()
def trained_agent(episodes=256, continuous=True, load=None, save_name="test",
                  ent_coef=0.00001, total_timesteps=25000, learning_rate=lr()):
    env = gym.make("bilboquet-v0", continuous=continuous, amplitude=10)
    env.reset((300, 300))
    if load is None:
        model = PPO('MlpPolicy', env, verbose=1, ent_coef=ent_coef,
                    learning_rate=learning_rate,
                    tensorboard_log="./ppo_bilboquet_tensorboard/")
        model.learn(total_timesteps=total_timesteps, tb_log_name=save_name)
        model.save(save_name + '.zip')
        print('DONE')
        obs = env.reset()
    else:
        model = PPO.load(load)
        obs = env.reset()
    for i in range(episodes):
        action, _states = model.predict(obs, deterministic=True)
        # print(action)
        obs, reward, done, info = env.step(action)
        # print(reward)
        env.render()
        if done:
            obs = env.reset()
def main():
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"]
    gym_config = SimulationParameters(time_step=TIME_STEP)
    robot_class = QuadrupedRobot
    robot_params = MiniCheetahParams(
        on_rack=False,
        enable_self_collision=True,
        motor_control_mode=MotorControlMode.HYBRID_COMPUTED_POS_TROT)
    task = TestTask(train_or_test=TEST_OR_TRAIN)
    env = LocomotionGymEnv(gym_config, robot_class, robot_params, task)

    policy_save_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'data/policies')
    if not os.path.exists(policy_save_dir):
        os.makedirs(policy_save_dir)
    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)

    if TEST_OR_TRAIN == "train":
        model = PPO('MlpPolicy', env, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
    else:
        model = PPO.load(POLICY_SAVE_PATH)
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
def main(): tensorboard_log = "./log" env = Pinokio3() # Optional: PPO2 requires a vectorized environment to run # the env is now wrapped automatically when passing it to the constructor # env = DummyVecEnv([lambda: env]) if os.path.exists( save_file ): model = PPO.load( save_file, env=DummyVecEnv([lambda:env]),tensorboard_log=tensorboard_log ) else: policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=net_arch) model = PPO(MlpPolicy, DummyVecEnv([lambda:env]), verbose=1,tensorboard_log=tensorboard_log) #https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html checkpoint_callback = CheckpointCallback(save_freq=10000, save_path='./checkpoints/', name_prefix='pinokio3') while True: model.learn(total_timesteps=15000000, callback=checkpoint_callback, tb_log_name=tb_log_name ) model.save( save_file ) print( "saved" ) obs = env.reset() for i in range(20): action, _states = model.predict(obs) obs, reward, done, info = env.step(action) print( "action {} -> reward {}".format( env.decode_action(action), reward ) ) env.render() if done: print( "resetting because " + str(done) ) env.reset()
def main():
    env = Pinokio2()
    # Optional: PPO requires a vectorized environment to run;
    # the env is wrapped automatically when passed to the constructor.
    # env = DummyVecEnv([lambda: env])
    if os.path.exists(save_file):
        model = PPO.load(save_file, env=DummyVecEnv([lambda: env]))
    else:
        model = PPO(MlpPolicy, env, verbose=1)
    while True:
        # model.learn(total_timesteps=10000)
        model.learn(total_timesteps=100000)
        model.save(save_file)
        obs = env.reset()
        for i in range(10):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                print("resetting because " + str(done))
                env.reset()
def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)
    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False,
                       training=True)

    model = PPO('MlpPolicy', env, verbose=1, n_steps=int(4096 / n_cpu),
                wandb_use=False)
    model.learn(total_timesteps=40000000)

    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)
    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)

    # Enjoy trained agent
    obs = np.copy(env.reset())
    epi_reward = 0
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards
        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
def load_model(model_path, policy_class, policy_kwargs, env, hp, partners, testing,
               try_load=True):
    load_successful = False
    if try_load:
        try:
            model = PPO.load(model_path)  # , policy_kwargs=policy_kwargs)
            load_successful = True
            print("Model loaded successfully")
        except Exception as e:
            print("Could not load model", e)
    if not load_successful:
        print("Create new model")
        n_steps, batch_size, n_epochs = hp['n_steps'], hp['batch_size'], hp['n_epochs']
        model = PPO(policy_class, env, policy_kwargs=policy_kwargs, n_steps=n_steps,
                    batch_size=batch_size, n_epochs=n_epochs, verbose=0,
                    ent_coef=0.00, marginal_reg_coef=hp['mreg'])
        for name, param in model.policy.named_parameters():
            if param.requires_grad:
                print(name, param.data.size())

    vec_env = DummyVecEnv([lambda: env])
    model.set_env(vec_env)
    model.policy.set_partners(partners)
    if testing:
        model.policy.num_partners = 1  # only test 1 partner
        model.marginal_reg_coef = 0
        model.n_epochs = hp['n_epochs_testing']
        model.n_steps = hp['n_steps_testing']
        model._init_rollout_buffer()
    return model
def main():
    # env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])

    # Create log dir
    log_dir = './ppo_data'
    os.makedirs(log_dir, exist_ok=True)
    env = VecMonitor(env, log_dir)

    callback = custom_call_back.CustomCallback(check_freq=1000, log_dir=log_dir)
    model = PPO('MlpPolicy', env, verbose=1, n_steps=500, batch_size=10000)
    model.learn(total_timesteps=2500000000, callback=callback)