def load(self, path, env):
    if self.trpo():
        return TRPO.load(path, env=env)
    elif self.ppo():
        return PPO2.load(path, env=env)
    else:
        return SAC.load(path, env=env)
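# Hedged sketch (an assumption, not from the source): the trpo()/ppo() predicates used
# above and the str(self) lookup used below suggest that `load` belongs to an enum-like
# algorithm selector. One possible definition, with a hypothetical class name:
from enum import Enum


class RLAlgorithm(Enum):
    TRPO = "trpo"
    PPO = "ppo"
    SAC = "sac"

    def trpo(self):
        return self is RLAlgorithm.TRPO

    def ppo(self):
        return self is RLAlgorithm.PPO

    def __str__(self):
        # matches the parameters[str(self)] lookup in create_learner below
        return self.value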
def create_learner(self, env, parameters):
    if (self.trpo() or self.ppo()) and not issubclass(type(env), VecEnv):
        env = DummyVecEnv([lambda: env])

    if self.trpo():
        model = TRPO(MlpPolicy, env, **parameters["common"], **parameters[str(self)])
        interface = TRPOInterface(model, env.observation_space.shape[0])
    elif self.ppo():
        model = PPO2(MlpPolicy, env, **parameters["common"], **parameters[str(self)])
        interface = PPOInterface(model, env.observation_space.shape[0])
    else:
        model = SAC(SACMlpPolicy, env, **parameters["common"], **parameters[str(self)])
        interface = SACInterface(model, env.observation_space.shape[0])

    if "pretrain_data_path" in parameters:
        data_path = parameters["pretrain_data_path"]
        model.pretrain(ExpertDataset(expert_path=data_path, verbose=0), n_epochs=25)

    return model, interface
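# Hedged sketch of the `parameters` layout create_learner expects: a "common" block
# shared by all algorithms, one block keyed by str(self) per algorithm, and an
# optional "pretrain_data_path" for the ExpertDataset warm start. The hyperparameter
# names and values below are illustrative assumptions, not taken from the source.
parameters = {
    "common": {"gamma": 0.99, "verbose": 1},
    "trpo": {"max_kl": 0.01},
    "ppo": {"n_steps": 2048, "learning_rate": 3e-4},
    "sac": {"buffer_size": 100000},
    # "pretrain_data_path": "expert_trajectories.npz",  # enables the pretrain step
}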
model.learn(total_timesteps=10000, tb_log_name="tb/AVEC-PPO")
# model.learn(total_timesteps=1000000, tb_log_name="tb/PPO")

######################## TRPO ###########################
log_dir = "./logs/%s/AVEC-TRPO_%s" % (env_id, seed)
# log_dir = "./logs/%s/TRPO_%s" % (env_id, seed)
os.makedirs(log_dir, exist_ok=True)
env = make_vec_env(env_id, 1, seed, monitor_dir=log_dir)
model = TRPO('MlpPolicy', env, verbose=1, avec_coef=1., vf_coef=0., tensorboard_log=log_dir)
model.learn(total_timesteps=10000, tb_log_name="tb/AVEC-TRPO")
# model.learn(total_timesteps=1000000, tb_log_name="tb/TRPO")

######################### SAC #############################
log_dir = "./logs/%s/AVEC-SAC_%s" % (env_id, seed)
# log_dir = "./logs/%s/SAC_%s" % (env_id, seed)
os.makedirs(log_dir, exist_ok=True)
env = make_vec_env(env_id, 1, seed, monitor_dir=log_dir)
model = SAC('CustomSACPolicy', env, verbose=1, avec_coef=1., value_coef=0., tensorboard_log=log_dir)
model.learn(total_timesteps=10000, tb_log_name="tb/AVEC-SAC")
# model.learn(total_timesteps=1000000, tb_log_name="tb/SAC")
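# Hedged follow-up sketch: reading back the Monitor logs written to monitor_dir above
# (here `log_dir` still points at the AVEC-SAC run). load_results/ts2xy come from
# stable-baselines' results_plotter utilities; the 100-episode window is arbitrary.
import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

x, y = ts2xy(load_results(log_dir), "timesteps")  # episode returns vs. timesteps
if len(y) > 0:
    print("Mean return over the last 100 episodes: %.1f" % np.mean(y[-100:]))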
import gym
import gym_handOfJustice
import cv2
import tensorflow as tf
from stable_baselines.sac import SAC
from stable_baselines.sac.policies import LnCnnPolicy
import os

# Read the training frames as an image stream (cv2 treats the %06d pattern as a sequence)
strea = cv2.VideoCapture(os.getcwd() + "\\dataset\\%06d.png")
if not strea.isOpened():
    raise Exception("Problem opening the image stream from the dataset")
env = gym.make("handOfJustice-v0", cap=strea, epsilon=300)
# tf.test.is_gpu_available()

model = SAC(LnCnnPolicy, env, verbose=1,
            tensorboard_log=os.getcwd() + "\\logs\\", full_tensorboard_log=True)
# SAC.load is a classmethod that returns a new model, so its result must be rebound;
# resume from the previous checkpoint if one exists.
if os.path.exists("handicap_justice.zip"):
    model = SAC.load("handicap_justice", env=env,
                     tensorboard_log=os.getcwd() + "\\logs\\")
model.learn(total_timesteps=100000, log_interval=10)
model.save("handicap_justice")

import time
time.sleep(3)
print("\n" + ("=" * 20) + "\nTraining complete\n" + ("=" * 20) + "\n\n")

# Now the first image is going to be taken
obs = env.reset()
done = False
i = 45000
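# Hedged continuation sketch: the obs/done/i variables above suggest a rollout loop
# that writes frames to disk. The render mode, output path, and filename pattern are
# illustrative assumptions; the custom env may expose frames differently.
while not done:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    frame = env.render(mode="rgb_array")
    if frame is not None:
        cv2.imwrite(os.getcwd() + "\\output\\%06d.png" % i, frame)
    i += 1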
def run(env_id, seed, layer_norm, evaluation, agent, delay_step, gamma=0.99, **kwargs):
    # Create envs.
    env = create_env(env_id, delay_step, str(0))
    print(env.observation_space, env.action_space)
    if evaluation:
        eval_env = create_env(env_id, delay_step, "eval_env")
    else:
        eval_env = None

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    start_time = time.time()

    policy = 'MlpPolicy'
    td3_variants = {
        "TD3": TD3,
        "TD3SIL": TD3SIL,
        "TD3NSTEP": TD3NSTEP,
        "TD3REDQ": TD3REDQ,
        "TD3DoubleTwin": TD3DoubleTwin,
    }
    if agent in td3_variants:
        model_func = td3_variants[agent]
        model = model_func(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=128,
                           tau=0.005, policy_delay=2, learning_starts=25000,
                           action_noise=create_action_noise(env, "normal_0.1"),
                           buffer_size=100000, verbose=2, n_cpu_tf_sess=10,
                           policy_kwargs={"layers": [400, 300]})
    elif agent == "DDPG":
        model = DDPG(policy=policy, env=env, eval_env=eval_env, gamma=gamma, nb_eval_steps=5,
                     batch_size=100, nb_train_steps=100, nb_rollout_steps=100, learning_starts=10000,
                     actor_lr=1e-3, critic_lr=1e-3, critic_l2_reg=0, tau=0.005,
                     normalize_observations=False,
                     action_noise=create_action_noise(env, "normal_0.1"),
                     buffer_size=int(1e6), verbose=2, n_cpu_tf_sess=10,
                     policy_kwargs={"layers": [400, 300]})
    elif agent == "SAC":
        model = SAC(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=256,
                    action_noise=create_action_noise(env, "normal_0.1"),
                    buffer_size=int(1e6), verbose=2, n_cpu_tf_sess=10, learning_starts=10000,
                    policy_kwargs={"layers": [256, 256]})
    elif agent == "GEM":
        policy = 'TD3LnMlpPolicy'
        model = TD3MemGEM(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=128,
                          tau=0.005, policy_delay=2, learning_starts=25000,
                          action_noise=create_action_noise(env, "normal_0.1"),
                          buffer_size=100000, verbose=2, n_cpu_tf_sess=10,
                          alpha=0.5, beta=-1, iterative_q=-1, num_q=4, gradient_steps=200,
                          max_step=kwargs['max_steps'], reward_scale=1., nb_eval_steps=10,
                          policy_kwargs={"layers": [400, 300]})
    elif agent == "BP":
        policy = 'TD3LnMlpPolicy'
        model = TD3MemBackProp(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=128,
                               tau=0.005, policy_delay=2, learning_starts=25000,
                               action_noise=create_action_noise(env, "normal_0.1"),
                               buffer_size=100000, verbose=2, n_cpu_tf_sess=10,
                               alpha=0.5, beta=-1, gradient_steps=200,
                               max_step=kwargs['max_steps'], reward_scale=1., nb_eval_steps=10,
                               policy_kwargs={"layers": [400, 300]})
    else:
        raise NotImplementedError

    print("model building finished")
    model.learn(total_timesteps=kwargs['num_timesteps'])

    env.close()
    if eval_env is not None:
        eval_env.close()
    logger.info('total runtime: {}s'.format(time.time() - start_time))
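# Hedged invocation sketch: run() is presumably driven by a CLI wrapper; the values
# below are illustrative only. `num_timesteps` and `max_steps` are consumed via
# **kwargs inside run().
if __name__ == "__main__":
    run(env_id="HalfCheetah-v2", seed=0, layer_norm=True, evaluation=True,
        agent="TD3", delay_step=0, gamma=0.99,
        num_timesteps=int(1e6), max_steps=1000)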
                    self.model.save(self.save_path)
        return True


# Create log dir
log_dir = "/home/prem/Downloads/Hyper_4"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
env = gym.make('BipedalWalker-v3')
# Logs will be saved in log_dir/monitor.csv
env = Monitor(env, log_dir)

# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

# Create the SAC agent (plain MlpPolicy; no parameter noise is used here)
model = SAC(MlpPolicy, env, verbose=1,
            # n_timesteps=int(float(1e6)),
            learning_rate=3e-4,
            # buffer_size=1000000,
            batch_size=64,
            ent_coef=0.005,
            train_freq=1,
            gradient_steps=1,
            learning_starts=1000)

# Train the agent
model.learn(total_timesteps=1000000, callback=callback)
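# Hedged sketch of the callback class whose tail appears at the top of this snippet:
# adapted from the standard stable-baselines "save best model" callback example, so
# everything except the final two lines is an assumption about the omitted code.
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy


class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, "best_model")
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # Mean return over the last 100 episodes recorded by the Monitor wrapper
            x, y = ts2xy(load_results(self.log_dir), "timesteps")
            if len(y) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True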