def test_monitor_load_results(tmp_path):
    """
    test load_results on log files produced by the monitor wrapper
    """
    tmp_path = str(tmp_path)
    env1 = gym.make("CartPole-v1")
    env1.seed(0)
    monitor_file1 = os.path.join(tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env1 = Monitor(env1, monitor_file1)
    monitor_files = get_monitor_files(tmp_path)
    assert len(monitor_files) == 1
    assert monitor_file1 in monitor_files

    monitor_env1.reset()
    episode_count1 = 0
    for _ in range(1000):
        _, _, done, _ = monitor_env1.step(monitor_env1.action_space.sample())
        if done:
            episode_count1 += 1
            monitor_env1.reset()

    results_size1 = len(load_results(os.path.join(tmp_path)).index)
    assert results_size1 == episode_count1

    env2 = gym.make("CartPole-v1")
    env2.seed(0)
    monitor_file2 = os.path.join(tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env2 = Monitor(env2, monitor_file2)
    monitor_files = get_monitor_files(tmp_path)
    assert len(monitor_files) == 2
    assert monitor_file1 in monitor_files
    assert monitor_file2 in monitor_files

    monitor_env2.reset()
    episode_count2 = 0
    for _ in range(1000):
        _, _, done, _ = monitor_env2.step(monitor_env2.action_space.sample())
        if done:
            episode_count2 += 1
            monitor_env2.reset()

    results_size2 = len(load_results(os.path.join(tmp_path)).index)
    assert results_size2 == (results_size1 + episode_count2)

    os.remove(monitor_file1)
    os.remove(monitor_file2)
def test_monitor():
    """
    test the monitor wrapper
    """
    env = gym.make("CartPole-v1")
    env.seed(0)
    mon_file = "/tmp/stable_baselines-test-{}.monitor.csv".format(uuid.uuid4())
    menv = Monitor(env, mon_file)
    menv.reset()
    for _ in range(1000):
        _, _, done, _ = menv.step(0)
        if done:
            menv.reset()

    file_handler = open(mon_file, 'rt')

    firstline = file_handler.readline()
    assert firstline.startswith('#')
    metadata = json.loads(firstline[1:])
    assert metadata['env_id'] == "CartPole-v1"
    assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata"

    last_logline = pandas.read_csv(file_handler, index_col=None)
    assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
    file_handler.close()
    os.remove(mon_file)
def hardcode(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)

    print("Running episodes with hardcoded policy.")
    inc = 0
    done = False
    while inc < timesteps:
        obs = env.reset()
        while True:
            action = policy(obs)
            obs, _, done, _ = env.step(action)
            inc += 1
            if done:
                break
    env.close()
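hardcode() above calls a policy(obs) helper that is not defined in this fragment. A hypothetical stand-in for a CartPole-style environment is sketched below; the observation layout (pole angle at index 2) is an assumption and should be replaced by the real controller.

# Hypothetical hardcoded policy for hardcode() above.
# Assumption: CartPole-like observation where obs[2] is the pole angle.
def policy(obs):
    # Push the cart toward the side the pole is leaning to.
    return 0 if obs[2] < 0 else 1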
def main():
    """
    Runs the test
    """
    # Unused argparse setup for run_mujoco.py:
    # :return: (ArgumentParser) parser {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False}
    # parser = arg_parser()
    # parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
    # parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    # parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    # parser.add_argument('--play', default=False, action='store_true')
    # return parser

    env_id = 'UR5Gripper-v0'
    model_path = '/tmp/gym/trpo_mpi/'
    # args = mujoco_arg_parser().parse_args()
    # train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
    # train(env_id=env_id, num_timesteps=int(1e7), seed=0, model_path=model_path)

    env = gym.make(env_id)
    env = Monitor(env, model_path, allow_early_resets=True)
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=model_path)
    # Continue training from the previously saved model; pass env so learn() can run
    model = TRPO.load(model_path + "trpo.pkl", env=env)
    model.learn(total_timesteps=int(1e5), callback=callback)
    model.save(model_path + "trpo.pkl")
    # tf_util.save_state(model_path)

    # Enjoy trained agent
    obs = env.reset()
    for i in range(100):
        obs = env.reset()
        env.render()
        for i in range(200):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
def random_agent(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)

    print("Running episodes with random policy.")
    # initialize timestep counter
    inc = 0
    while inc < timesteps:
        obs = env.reset()
        while True:
            # choose a random action from action_space
            action = env.action_space.sample()
            obs, _, done, _ = env.step(action)
            inc += 1
            if done:
                break
    env.close()
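A minimal sketch of how random_agent() might be invoked and its Monitor log read back afterwards; the environment id, directory, and step budget are placeholders, and load_results is the same helper exercised in the tests above.

# Hypothetical usage sketch: run the random policy, then inspect the monitor log.
from stable_baselines.bench.monitor import load_results

if __name__ == "__main__":
    log_dir = "/tmp/random_agent_logs/"   # placeholder directory
    random_agent("CartPole-v1", log_dir, timesteps=5000)
    results = load_results(log_dir)       # DataFrame with 'r', 'l', 't' columns
    print("episodes:", len(results.index), "mean reward:", results["r"].mean())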
# Create log dir for callback model saving
os.makedirs("./temp_models/", exist_ok=True)
env = Monitor(env, "./temp_models/", allow_early_resets=True)

##### TRAIN #####
if args.train:
    check_overwrite(args.model)
    model = SAC(MlpPolicy, env, verbose=1, tensorboard_log="./tensorboard_log/")
    model.learn(total_timesteps=int(args.step), log_interval=10, tb_log_name="log", callback=callback.callback)
    model.save(MODELS_FOLDER_PATH)

#### TEST #####
if not args.train:
    model = SAC.load(MODELS_FOLDER_PATH)
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(scale_range(action, -1, 1, 0, 1))
        env.render()
        if done:
            obs = env.reset()
env = gym.make('UR5Gripper-v0')
# Create the vectorized environment
# env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
env = Monitor(env, log_dir, allow_early_resets=True)
# env = SubprocVecEnv([make_mujoco_env(env_id, i) for i in range(num_cpu)])
# env = SubprocVecEnv([lambda: env])
env = DummyVecEnv([lambda: env])
# env = SubprocVecEnv([lambda: gym.make('UR5Gripper-v0') for i in range(num_cpu)])

# Add some param noise for exploration
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
# model = DDPG(MlpPolicy, env, param_noise=param_noise, verbose=1, tensorboard_log=log_dir)
# model = PPO2(MlpPolicy, env, verbose=1)
# model = SAC(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)

# Random Agent, before training
mean_reward_before_train = evaluate(model, num_steps=1000)

# Train the agent
model.learn(total_timesteps=int(1e7), callback=callback)
mean_reward_after_train = evaluate(model, num_steps=1000)

obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
def main():
    global save_path, log_dir, model, best_mean_reward
    mk_dir(args.checkpoint_dir + args.policy)
    save_path = args.checkpoint_dir + args.policy + "/" + args.policy
    log_dir = args.summary_dir + args.policy
    mk_dir(log_dir)

    env = gym.make("SegmentationEnv-v0",
                   objs_dir=args.objs_dir,
                   max_scenes=args.max_scenes,
                   sample_size=args.sample_size,
                   diff_punishment=args.diff_punishment,
                   max_steps_per_scene=args.max_steps_per_scene,
                   scene_mode=args.scene_mode,
                   point_mode=args.point_mode,
                   voxel_size=args.voxel_size,
                   voxel_mode=args.voxel_mode,
                   single_scenes=args.single_scenes,
                   early_diff=args.early_diff,
                   wall_weight=args.wall_weight)

    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
    env = VecCheckNan(env, raise_exception=True)

    net_module = importlib.import_module(args.policy)
    model = PPO2(net_module.Policy, env,
                 verbose=args.verbose,
                 tensorboard_log=log_dir,
                 learning_rate=args.learning_rate,
                 ent_coef=args.ent_coef,
                 cliprange=args.cliprange,
                 cliprange_vf=args.cliprange_vf,
                 lam=args.lam,
                 gamma=args.gamma,
                 seed=args.seed,
                 n_cpu_tf_sess=args.n_cpu_tf_sess,
                 noptepochs=args.noptepochs,
                 nminibatches=args.nminibatches,
                 n_steps=args.n_steps,
                 max_grad_norm=args.max_grad_norm)

    if os.path.isfile("expert_trajectories.npz") and args.pretrain == 1:
        print("------------start pretrain------------")
        # dataset = ExpertDataset(expert_path="expert_trajectories.npz", special_shape=True, traj_limitation=100, batch_size=16)
        dataset = ExpertDataset(expert_path="expert_trajectories.npz", special_shape=True,
                                train_fraction=args.train_fraction, batch_size=args.pretrain_batch_size)
        # model.pretrain(dataset, learning_rate=0.001, n_epochs=1000)
        model = model.pretrain(dataset, val_interval=1, learning_rate=args.pretrain_learning_rate,
                               n_epochs=args.pretrain_n_epochs)
        print("pretrain finished -- save model")
        model.save(save_path)

        returns = []
        print("Calculate mean reward")
        n_episodes = 10
        for i in range(n_episodes):
            total_reward = 0
            obs = env.reset()
            while True:
                action, _states = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action)
                total_reward += reward
                if done:
                    returns.append(total_reward)
                    break
        returns = np.array(returns)
        best_mean_reward = np.mean(returns)
        print("Best mean reward: {:.2f}".format(best_mean_reward))

    model.learn(total_timesteps=args.total_timesteps, callback=callback)

    env.close()
def step(self, action):
    if self.render_gui and self.rank == self.render_rank:
        self.render()
    return Monitor.step(self, action)
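The step() override above reads like part of a Monitor subclass that renders only on one designated worker. A minimal sketch of such a wrapper follows; only the attribute names render_gui, rank, and render_rank come from the fragment, while the constructor signature and defaults are assumptions.

# Sketch of a Monitor subclass that renders only on one designated worker.
from stable_baselines.bench import Monitor


class RenderMonitor(Monitor):
    def __init__(self, env, filename, rank=0, render_rank=0, render_gui=False, **kwargs):
        super(RenderMonitor, self).__init__(env, filename, **kwargs)
        self.rank = rank                # index of this worker among vectorized envs
        self.render_rank = render_rank  # only this worker opens a window
        self.render_gui = render_gui    # master switch for rendering

    def step(self, action):
        if self.render_gui and self.rank == self.render_rank:
            self.render()
        return Monitor.step(self, action)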
import gym
import os
from stable_baselines.bench import Monitor
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

# Create the log folder
log_dir = './logs/'
os.makedirs(log_dir, exist_ok=True)

# Create the environment
env = gym.make('CartPole-v1')
env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

# Create the model
model = PPO2(MlpPolicy, env, verbose=1)

# Train the model
model.learn(total_timesteps=10000)

# Test the model
state = env.reset()
for i in range(200):
    env.render()
    action, _ = model.predict(state)
    state, rewards, done, info = env.step(action)
    if done:
        break
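Because the Monitor wrapper above writes per-episode statistics into log_dir, the training curve can be read back or plotted once the script finishes. A hedged follow-up sketch, using the load_results and results_plotter helpers that appear elsewhere in this collection:

# Optional follow-up (assumption: appended after the training script above):
# read the episode log written by Monitor and plot reward versus timesteps.
from stable_baselines.bench.monitor import load_results
from stable_baselines import results_plotter
import matplotlib.pyplot as plt

df = load_results(log_dir)  # columns: 'r' (reward), 'l' (length), 't' (wall time)
print("episodes logged:", len(df.index), "mean reward:", df['r'].mean())

results_plotter.plot_results([log_dir], 10000, results_plotter.X_TIMESTEPS, "PPO2 CartPole")
plt.show()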
import gym
import AI_test
from stable_baselines.common.env_checker import check_env
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.bench import Monitor
from stable_baselines.common.vec_env import DummyVecEnv

env = gym.make('AI_test:Ai-v0')
env = Monitor(env, filename=None, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

model = ACKTR('MlpPolicy', env, verbose=1).learn(10000)

# Test the trained agent
obs = env.reset()
n_steps = 100
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    env.render()
    if done:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break
def step(self, action):
    if self.render_me:
        self.render()
    ret = Monitor.step(self, action)
    return ret
def main():
    # Create the log directory
    log_dir = "tmp/"
    os.makedirs(log_dir, exist_ok=True)

    # Create the environments
    envRoll = gym.make('gym_foo:DroneRoll-v0')
    envRoll = Monitor(envRoll, log_dir)
    modelRoll = PPO2(MlpPolicy, envRoll, gamma=0.99, n_steps=2048, ent_coef=0.0, learning_rate=3e-4,
                     lam=0.95, nminibatches=32, noptepochs=10, cliprange=0.2, verbose=1)

    envPitch = gym.make('gym_foo:DronePitch-v0')
    envPitch = Monitor(envPitch, log_dir)
    modelPitch = PPO2(MlpPolicy, envPitch, gamma=0.99, n_steps=2048, ent_coef=0.0, learning_rate=3e-4,
                      lam=0.95, nminibatches=32, noptepochs=10, cliprange=0.2, verbose=1)

    envYaw = gym.make('gym_foo:DroneYaw-v0')
    envYaw = Monitor(envYaw, log_dir)
    modelYaw = PPO2(MlpPolicy, envYaw, gamma=0.99, n_steps=2048, ent_coef=0.0, learning_rate=3e-4,
                    lam=0.95, nminibatches=32, noptepochs=10, cliprange=0.2, verbose=1)

    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

    # Training
    time_steps = 2e6
    modelRoll.learn(total_timesteps=int(time_steps), callback=callback)
    results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "PPO Roll")
    plt.show()

    modelPitch.learn(total_timesteps=int(time_steps), callback=callback)
    results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "PPO Pitch")
    plt.show()

    modelYaw.learn(total_timesteps=int(time_steps), callback=callback)
    results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "PPO Yaw")
    plt.show()

    # Save the models
    modelRoll.save("Drone_Roll_PPO_001")
    modelPitch.save("Drone_Pitch_PPO_001")
    modelYaw.save("Drone_Yaw_PPO_001")

    # Load a model
    # model = PPO2.load("Drone_Roll_PPO_0.01")

    # Test: generate the time response
    T = [0]
    # Test loop
    t = 0
    # obs = env.reset()
    obsRoll = envRoll.reset()
    obsPitch = envPitch.reset()
    obsYaw = envYaw.reset()
    Roll = [envRoll.state[0]]
    Pitch = [envPitch.state[0]]
    Yaw = [envYaw.state[0]]

    # Simulation loop
    while t < 10:  # up to 10 seconds
        actionRoll, _states = modelRoll.predict(obsRoll)
        # Retrieve new state, reward, and whether the state is terminal
        obsRoll, reward, done, info = envRoll.step(actionRoll)
        Roll.append((180 / np.pi) * envRoll.state[0])

        actionPitch, _states = modelPitch.predict(obsPitch)
        # Retrieve new state, reward, and whether the state is terminal
        obsPitch, reward, done, info = envPitch.step(actionPitch)
        Pitch.append((180 / np.pi) * envPitch.state[0])

        actionYaw, _states = modelYaw.predict(obsYaw)
        # Retrieve new state, reward, and whether the state is terminal
        obsYaw, reward, done, info = envYaw.step(actionYaw)
        Yaw.append((180 / np.pi) * envYaw.state[0])

        t += 0.01
        T.append(t)

    # Plots
    plt.figure(1)
    plt.plot(T, Roll)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Roll')
    plt.xlabel('Time (seconds)')
    plt.title('Roll Response')
    plt.grid()
    plt.show()

    plt.figure(2)
    plt.plot(T, Pitch)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Pitch')
    plt.xlabel('Time (seconds)')
    plt.title('Pitch Response')
    plt.grid()
    plt.show()

    plt.figure(3)
    plt.plot(T, Yaw)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Yaw')
    plt.xlabel('Time (seconds)')
    plt.title('Yaw Response')
    plt.grid()
    plt.show()