def main(): tensorboard_log = "./log" env = Pinokio5() # Optional: PPO2 requires a vectorized environment to run # the env is now wrapped automatically when passing it to the constructor # env = DummyVecEnv([lambda: env]) if os.path.exists(save_file): model = PPO.load(save_file, env=DummyVecEnv([lambda: env]), tensorboard_log=tensorboard_log) else: model = PPO(MlpPolicy, env, verbose=1, tensorboard_log=tensorboard_log) try: while True: #model.learn(total_timesteps=10000) model.learn(total_timesteps=8000000, tb_log_name=tb_log_name) model.save(save_file) obs = env.reset() for i in range(100): action, _states = model.predict(obs) obs, reward, done, info = env.step(action) env.render() if done: print("resetting because " + str(done)) env.reset() except KeyboardInterrupt: print("Saving before exiting...") model.save(save_file) print("k bye")
def main(): tensorboard_log = "./log" env = Pinokio3() # Optional: PPO2 requires a vectorized environment to run # the env is now wrapped automatically when passing it to the constructor # env = DummyVecEnv([lambda: env]) if os.path.exists( save_file ): model = PPO.load( save_file, env=DummyVecEnv([lambda:env]),tensorboard_log=tensorboard_log ) else: policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=net_arch) model = PPO(MlpPolicy, DummyVecEnv([lambda:env]), verbose=1,tensorboard_log=tensorboard_log) #https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html checkpoint_callback = CheckpointCallback(save_freq=10000, save_path='./checkpoints/', name_prefix='pinokio3') while True: model.learn(total_timesteps=15000000, callback=checkpoint_callback, tb_log_name=tb_log_name ) model.save( save_file ) print( "saved" ) obs = env.reset() for i in range(20): action, _states = model.predict(obs) obs, reward, done, info = env.step(action) print( "action {} -> reward {}".format( env.decode_action(action), reward ) ) env.render() if done: print( "resetting because " + str(done) ) env.reset()
def trained_agent(episodes=256, continuous=True, load=None, save_name="test",
                  ent_coef=0.00001, total_timesteps=25000, learning_rate=lr()):
    env = gym.make("bilboquet-v0", continuous=continuous, amplitude=10)
    env.reset((300, 300))
    if load is None:
        model = PPO('MlpPolicy', env, verbose=1, ent_coef=ent_coef,
                    learning_rate=learning_rate,
                    tensorboard_log="./ppo_bilboquet_tensorboard/")
        model.learn(total_timesteps=total_timesteps, tb_log_name=save_name)
        model.save(save_name + '.zip')
        print('DONE')
        obs = env.reset()
    else:
        model = PPO.load(load)
        obs = env.reset()
    for i in range(episodes):
        action, _states = model.predict(obs, deterministic=True)
        # print(action)
        obs, reward, done, info = env.step(action)
        # print(reward)
        env.render()
        if done:
            obs = env.reset()
def main(args):
    envs = make_vec_env(args.env_name, n_envs=args.num_envs,
                        vec_env_cls=SubprocVecEnv)
    viz_env = None
    if args.viz:
        nm_core, nm_vrsn = args.env_name.split('-')
        nm_core += 'Viz' if args.viz else 'Dbg' if args.debug else ''
        viz_env = make_vec_env(nm_core + '-' + nm_vrsn, n_envs=1)
    rl_learner = PPO('MlpPolicy', envs, verbose=1, seed=args.seed, device='cpu')
    for epoch in range(args.num_epochs):
        rl_learner.learn(args.steps_per_epoch)
        if args.viz:
            obs = viz_env.reset()
            done = False
            while not done:
                act, _ = rl_learner.predict(obs)
                if len(act.shape) > len(viz_env.action_space.shape):
                    act = act[0:1]  # just one viz env
                obs, rwd, done, _ = viz_env.step(act)
                time.sleep(0.01)  # to make motions visible
def main():
    base_args, base_parser = get_logger2_args()
    args = get_args(base_parser)
    args.device = init_gpus_and_randomness(args.seed, args.gpu)
    logger = Logger2('/tmp/tmp', use_tensorboardX=True)
    logger.log_tb_object(args, 'args')
    envs = make_vec_env(args.env_name, n_envs=args.num_envs,
                        vec_env_cls=SubprocVecEnv)
    viz_env = None
    if args.visualize:
        nm_core, nm_vrsn = args.env_name.split('-')
        nm_core += 'Viz' if args.visualize else 'Dbg' if args.debug else ''
        viz_env = make_vec_env(nm_core + '-' + nm_vrsn, n_envs=1)
    rl_learner = PPO('MlpPolicy', envs, verbose=1, seed=args.seed, device='cpu')
    for epoch in range(args.num_epochs):
        rl_learner.learn(args.steps_per_epoch)
        if args.visualize:
            obs = viz_env.reset()
            done = False
            while not done:
                act, _ = rl_learner.predict(obs)
                if len(act.shape) > len(viz_env.action_space.shape):
                    act = act[0:1]  # just one viz env
                obs, rwd, done, _ = viz_env.step(act)
                time.sleep(0.01)  # to make motions visible
def main():
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"]
    gym_config = SimulationParameters(time_step=TIME_STEP)
    robot_class = QuadrupedRobot
    robot_params = MiniCheetahParams(
        on_rack=False,
        enable_self_collision=True,
        motor_control_mode=MotorControlMode.HYBRID_COMPUTED_POS_TROT)
    task = TestTask(train_or_test=TEST_OR_TRAIN)
    env = LocomotionGymEnv(gym_config, robot_class, robot_params, task)
    policy_save_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'data/policies')
    if not os.path.exists(policy_save_dir):
        os.makedirs(policy_save_dir)
    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)
    if test_or_train == "train":
        model = PPO('MlpPolicy', env, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
    else:
        model = PPO.load(POLICY_SAVE_PATH)
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
def main():
    env = Pinokio2()
    # Optional: PPO requires a vectorized environment to run;
    # the env is wrapped automatically when passed to the constructor.
    # env = DummyVecEnv([lambda: env])
    if os.path.exists(save_file):
        model = PPO.load(save_file, env=DummyVecEnv([lambda: env]))
    else:
        model = PPO(MlpPolicy, env, verbose=1)
    while True:
        # model.learn(total_timesteps=10000)
        model.learn(total_timesteps=100000)
        model.save(save_file)
        obs = env.reset()
        for i in range(10):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                print("resetting because " + str(done))
                obs = env.reset()
def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)
    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False,
                       training=True)

    # Note: `wandb_use` is not a standard stable-baselines3 PPO argument;
    # this script targets a customized PPO fork.
    model = PPO('MlpPolicy', env, verbose=1, n_steps=int(4096 / n_cpu),
                wandb_use=False)
    model.learn(total_timesteps=40000000)
    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    # Dump policy weights and normalization statistics as plain text
    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)
    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False
    model = PPO.load(file_name, env=env, wandb_use=False)

    # Enjoy trained agent
    obs = np.copy(env.reset())
    epi_reward = 0
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards
        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
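# The text dumps above are typically consumed outside Python (e.g. by a C++
# controller). A minimal sketch of reading them back and normalizing raw
# observations the way VecNormalize does; the epsilon and clip default mirror
# stable-baselines3, and the file paths are simply the ones written above:
def load_obs_normalizer(result_dir="./result", clip_obs=2.0, eps=1e-8):
    obs_mean = np.loadtxt(result_dir + "/obs_mean.txt")
    obs_var = np.loadtxt(result_dir + "/obs_variance.txt")

    def normalize(obs):
        # VecNormalize-style scaling: (obs - mean) / sqrt(var + eps), clipped
        return np.clip((obs - obs_mean) / np.sqrt(obs_var + eps),
                       -clip_obs, clip_obs)

    return normalize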
class Agent(object):
    def __init__(self, env, model=None):
        if model:
            self.model = model
        else:
            self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(":", "-")
            os.makedirs(self.log_dir, exist_ok=True)
            monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
            vec_env = DummyVecEnv([lambda: monitor_env])
            policy_kwargs = dict(
                features_extractor_class=CustomCNN,
                features_extractor_kwargs=dict(features_dim=256),
                net_arch=[dict(pi=[64, 64], vf=[64, 64])])
            self.model = PPO(CustomCnnPolicy, vec_env,
                             policy_kwargs=policy_kwargs, verbose=1,
                             learning_rate=0.001)

    def function(self, obs, conf):
        import random
        col, _ = self.model.predict(
            np.array(obs['board']).reshape(6, 7, 1))  # TODO: Connect-4 specific so far
        is_valid = (obs['board'][int(col)] == 0)
        if is_valid:
            return int(col)
        else:
            return random.choice([
                col for col in range(conf.columns)
                if obs['board'][int(col)] == 0
            ])

    def train(self, timesteps):
        self.model.learn(total_timesteps=timesteps)

    def save(self, name: str):
        self.model.save(name)

    def load(self, name: str, env, replace_parameters=None):
        self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(":", "-")
        os.makedirs(self.log_dir, exist_ok=True)
        monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
        vec_env = DummyVecEnv([lambda: monitor_env])
        self.model = PPO.load(name, env=vec_env, custom_objects=replace_parameters)

    def plot(self):
        # Plot cumulative reward
        with open(os.path.join(self.log_dir, "monitor.csv"), 'rt') as fh:
            firstline = fh.readline()
            assert firstline[0] == '#'
            df = pd.read_csv(fh, index_col=None)['r']
        df.rolling(window=1000).mean().plot()
        plt.show()
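# A minimal usage sketch for the Agent class above. `ConnectFourGym` is a
# hypothetical Connect-4 gym wrapper (not defined in this file), and
# CustomCNN / CustomCnnPolicy must already exist as assumed by __init__:
#
#     env = ConnectFourGym()
#     agent = Agent(env)
#     agent.train(timesteps=100_000)   # wraps model.learn()
#     agent.save("ppo_connect4")
#     agent.plot()                     # rolling mean of episode reward from monitor.csv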
def main():
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"]
    env_params = {
        'time_step': TIME_STEP,
        'robot_class': QuadrupedRobot,
        'on_rack': False,
        'enable_self_collision': True,
        'motor_control_mode': MotorControlMode.HYBRID_COMPUTED_POS_TROT,
        'train_or_test': test_or_train
    }
    policy_save_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'data/policies')
    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)
    policy_kwargs = {"net_arch": [{"pi": [512, 256], "vf": [512, 256]}]}
    if test_or_train == "train":
        env = make_vec_env(env_change_input, n_envs=NUM_CPUS, seed=0,
                           env_kwargs=env_params, vec_env_cls=SubprocVecEnv)
        env = VecNormalize(env)
        if not os.path.exists(policy_save_dir):
            os.makedirs(policy_save_dir)
        model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
    else:
        # Equivalent to spelling out each keyword argument explicitly
        env = env_change_input(**env_params)
        model_load_path = os.path.join(policy_save_dir, 'ppo_3_17-03-2021_15-39-42')
        model = PPO.load(model_load_path)
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
def main():
    env = gym.make(ENV_NAME)
    model = PPO('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=100000)
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            obs = env.reset()
    env.close()
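# Related: stable-baselines3 ships an evaluation helper that avoids the manual
# rollout loop above. A minimal sketch, assuming ENV_NAME is a registered gym
# id such as "CartPole-v1":
def evaluate_example():
    from stable_baselines3.common.evaluation import evaluate_policy

    env = gym.make(ENV_NAME)
    model = PPO('MlpPolicy', env, verbose=0)
    model.learn(total_timesteps=10000)
    # Returns mean and std of the episodic reward over n_eval_episodes
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10,
                                              deterministic=True)
    print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")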
def main(fitts_W, fitts_D, ocular_std, swapping_std, run, timesteps, logs_folder):
    # Create log dir
    lc_dir = f'./{logs_folder}/w{fitts_W}d{fitts_D}ocular{ocular_std}swapping{swapping_std}/'
    log_dir = f'{lc_dir}/run{run}/'
    os.makedirs(log_dir, exist_ok=True)

    # Instantiate the env
    env = Gaze(fitts_W=fitts_W, fitts_D=fitts_D, ocular_std=ocular_std,
               swapping_std=swapping_std)
    env = Monitor(env, log_dir)

    # Train the agent
    model = PPO('MlpPolicy', env, verbose=0, clip_range=0.15)
    model.learn(total_timesteps=int(timesteps))

    # Save the model
    model.save(f'{log_dir}savedmodel/model_ppo')

    # Plot learning curve
    plot_results2(log_dir)
    plt.savefig(f'{lc_dir}learning_curve{run}.png')
    plt.close('all')

    ###########################################################################
    # Record behaviour of the trained policy
    ###########################################################################
    # Test the trained agent and save the per-episode step counts
    n_eps = 5000
    number_of_saccades = np.ndarray(shape=(n_eps, 1), dtype=np.float32)
    eps = 0
    while eps < n_eps:
        done = False
        step = 0
        obs = env.reset()
        while not done:
            step += 1
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            if done:
                number_of_saccades[eps] = step
                eps += 1
                break
    np.savetxt(f'{log_dir}num_saccades.csv', number_of_saccades, delimiter=',')
def DRL_prediction(model: PPO, environment: StockTradingEnv) -> object:
    """Make a prediction with a trained model over the test environment."""
    test_env, test_obs = environment.get_sb_env()
    account_memory = []
    actions_memory = []
    test_env.reset()
    for i in range(len(environment.df.index.unique())):
        action, _ = model.predict(test_obs, deterministic=True)
        test_obs, rewards, dones, info = test_env.step(action)
        # Collect the memories on the second-to-last trading day
        if i == (len(environment.df.index.unique()) - 2):
            account_memory = test_env.env_method(method_name="save_asset_memory")
            actions_memory = test_env.env_method(method_name="save_action_memory")
        if dones[0]:
            print("hit end!")
            break
    return account_memory[0], actions_memory[0]
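# A hedged usage sketch for DRL_prediction. The model path, `trade_df`, and
# `env_kwargs` are placeholders, and StockTradingEnv must expose get_sb_env()
# as assumed by the function above:
#
#     model = PPO.load("trained_models/ppo_stock")          # hypothetical path
#     trade_env = StockTradingEnv(df=trade_df, **env_kwargs)
#     account_memory, actions_memory = DRL_prediction(model, trade_env)
#     account_memory.to_csv("results/account_value.csv")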
def train(is_learn=False, log=False):
    if is_learn:
        model = PPO("MlpPolicy", env, verbose=0)
        model.learn(total_timesteps=20000)
        model.save("ppo_stock")
    else:
        model = PPO.load("ppo_stock")
        obs = env.reset()
        for i in range(2000):
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            if log:
                with open('./log/ppo.txt', 'a') as f:
                    f.write(f'{env.env_method("log")[0]}\n')
            env.render()
def just_bob():
    for i in [100000, 500000, 1000000, 5000000]:
        start = time.time()
        bob = PPO("CnnPolicy", VectorizedClass(GetBobEnvClass(25), 6),
                  verbose=0).learn(i)
        end = time.time()
        print(f"For {i} we took {end - start} and got "
              f"{evaluate(bob, 25, episodes=100)}")
    exit()
    # Unreachable: exit() above ends the run before this demo loop
    done = False
    env = GetBobEnvClass(25)()
    obs = env.reset()
    while not done:
        action = bob.predict(obs)
        obs, rew, done, _ = env.step(action[0])
        env.render()
def main():
    # Initialize the environment
    env = WebotsStickEnv()
    check_env(env)

    # Train
    model = PPO('MlpPolicy', env, n_steps=2048, verbose=1)
    model.learn(total_timesteps=100_000)  # total_timesteps must be an int

    # Replay
    print('Training is finished, press `Y` for replay...')
    env.wait_keyboard()
    obs = env.reset()
    for t in range(100000):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        print(obs)
        if done:
            obs = env.reset()
class AgentDemoWrapper(gym.Wrapper):
    def __init__(self, env, agent_path=None, tempdir_path=None):
        self.alg = PPO('MlpPolicy', env, verbose=0)
        if agent_path is not None:
            self.alg.set_parameters(agent_path, exact_match=True)
        if tempdir_path is None:
            tempdir_path = 'temp'
        os.makedirs(tempdir_path, exist_ok=True)
        self.save_dir = tempdir_path
        self.max_attempt = 1000
        super(AgentDemoWrapper, self).__init__(env)

    def reset(self):
        return self.env.reset()

    def step(self, action):
        return self.env.step(action)

    def generate_episode_gif(self, init_map):
        images = []
        done = False
        obs = self.env.manual_reset(init_map)
        images.append(room_to_rgb(obs))
        while not done:
            action, _ = self.alg.predict(obs, deterministic=True)
            obs, _, done, _ = self.env.step(action)
            images.append(room_to_rgb(obs))
        im_name = '{}/agent_episode.gif'.format(self.save_dir)
        imageio.mimsave(im_name, images, 'GIF', fps=2)
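# A usage sketch for AgentDemoWrapper. All names here are placeholders
# (SokobanEnv, the checkpoint path, init_map); the wrapped env must provide
# manual_reset() as assumed by generate_episode_gif():
#
#     env = SokobanEnv()
#     demo = AgentDemoWrapper(env, agent_path='checkpoints/agent.zip')
#     demo.generate_episode_gif(init_map)   # writes temp/agent_episode.gif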
def stock_trade(stock_file):
    day_profits = []
    df = pd.read_csv(stock_file)
    df = df.sort_values('date')

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: StockTradingEnv(df)])

    model = PPO('MlpPolicy', env, verbose=0, tensorboard_log='./log')
    model.learn(total_timesteps=int(1e6))

    df_test = pd.read_csv(stock_file.replace('train', 'test'))
    env = DummyVecEnv([lambda: StockTradingEnv(df_test)])
    obs = env.reset()
    for i in range(len(df_test) - 1):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        profit = env.render()
        day_profits.append(profit)
        if done:
            break
    return day_profits
def multiprocessing_example():
    # Multiprocessing: unleashing the power of vectorized environments
    def make_env(env_id, rank, seed=0):
        """
        Utility function for a multiprocessed env.

        :param env_id: (str) the environment ID
        :param rank: (int) index of the subprocess
        :param seed: (int) the initial seed for the RNG
        """
        def _init():
            env = gym.make(env_id)
            env.seed(seed + rank)
            return env

        set_random_seed(seed)
        return _init

    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    # Stable Baselines provides the make_vec_env() helper which does exactly
    # the previous steps for you. You can choose between 'DummyVecEnv'
    # (usually faster) and 'SubprocVecEnv'.
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0, vec_env_cls=SubprocVecEnv)

    model = PPO("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=25_000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
def charlie():
    for i in [100000 // 6, 500000 // 6, 1000000 // 6, 5000000 // 6]:
        start = time.time()
        bob = PPO("CnnPolicy", VectorizedClass(GetBobEnvClass(10), 6),
                  verbose=0, n_steps=200)
        charli = PPO("MlpPolicy", CharlieEnv(bob, t=200, maxsize=10),
                     verbose=0, n_steps=1000).learn(i)
        end = time.time()
        print(f"For {i} we took {end - start} and got "
              f"{evaluate(bob, 10, episodes=100)}")
    exit()
    # Unreachable: exit() above ends the run before this demo loop
    done = False
    env = GetBobEnvClass(25)()
    obs = env.reset()
    while not done:
        action = bob.predict(obs)
        obs, rew, done, _ = env.step(action[0])
        env.render()
n_cpu = 6
batch_size = 64
env = make_vec_env("highway-fast-v0", n_envs=n_cpu, vec_env_cls=SubprocVecEnv)
model = PPO("MlpPolicy",
            env,
            policy_kwargs=dict(net_arch=[dict(pi=[256, 256], vf=[256, 256])]),
            n_steps=batch_size * 12 // n_cpu,
            batch_size=batch_size,
            n_epochs=10,
            learning_rate=5e-4,
            gamma=0.8,
            verbose=2,
            tensorboard_log="highway_ppo/")
# Train the agent
model.learn(total_timesteps=int(2e4))
# Save the agent
model.save("highway_ppo/model")

model = PPO.load("highway_ppo/model")
env = gym.make("highway-fast-v0")
for _ in range(5):
    obs = env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
done = False
reward = 0
evasions = 0
evasion_history = {}

# Train the agent
agent = PPO("MlpPolicy", env, verbose=1)
agent.learn(total_timesteps=2500)

# Test the agent
for i in range(episode_count):
    ob = env.reset()
    sha256 = env.env.sha256
    while True:
        action, _states = agent.predict(ob)
        ob, rewards, done, ep_history = env.step(action)
        if done and rewards >= 10.0:
            evasions += 1
            evasion_history[sha256] = ep_history
            break
        elif done:
            break

# Output metrics/evaluation stuff
evasion_rate = (evasions / episode_count) * 100
mean_action_count = np.mean(env.get_episode_lengths())
print(f"{evasion_rate}% samples evaded model.")
print(f"Average of {mean_action_count} moves to evade model.")
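# The next snippet picks up inside the step() method of a GridWorld env. A
# hedged reconstruction of the missing head is given here so the fragment
# reads as a complete program; the two-action encoding and the Box observation
# space are assumptions, not taken from the source:
import gym
import numpy as np
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env


class GridWorld(gym.Env):
    """1-D corridor: start at cell 0, reach cell grid_size - 1."""

    def __init__(self, grid_size):
        super().__init__()
        self._grid_size = grid_size
        self._state = 0
        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(low=0, high=grid_size - 1,
                                            shape=(1,), dtype=np.float32)

    def step(self, action):
        if action == 0:    # move left (assumed encoding)
            self._state -= 1
        elif action == 1:  # move right (assumed encoding)
            self._state += 1
        else: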
            raise ValueError(f'Unrecognized action {action}')
        self._state = np.clip(self._state, 0, self._grid_size - 1)
        done = bool(self._state == self._grid_size - 1)
        reward = 1 if done else 0
        return np.array([self._state]).astype(np.float32), reward, done, {}

    def reset(self):
        self._state = 0
        return np.array([self._state]).astype(np.float32)

    def render(self, mode='human'):
        pass


if __name__ == '__main__':
    check_env(GridWorld(10))
    env = make_vec_env(lambda: GridWorld(10), n_envs=1)
    model = PPO('MlpPolicy', env, verbose=1).learn(5000)
    state = env.reset()
    for _ in range(20):
        action, _ = model.predict(state, deterministic=True)
        # action = 0
        next_state, reward, done, info = env.step(action)
        print(f'{state} -> {action} -> {next_state}: {reward}')
        state = next_state
        if done:
            break
model = PPO.load('../model/ppo', env=env)

result = {}
mean_reward = []
scores = []
episodes = 1000
with open("../result/PPO.txt", "w") as txtfile:
    for episode in range(1, episodes + 1):
        print(f"episode: {episode}")
        state = env.reset()
        done = False
        temp_result = {}
        score = 0
        while not done:
            action, _states = model.predict(state)
            n_state, reward, done, info = env.step(action)
            state = n_state  # advance the observation for the next predict()
            score += reward
        mean_reward.append(score)
        scores.append(info[0]['score'])
        temp = str(episode) + "," + str(score[0]) + "," + str(info[0]['score']) + "\n"
        txtfile.write(temp)

mean = sum(mean_reward) / len(mean_reward)
mean_score = sum(scores) / len(scores)
print(f"The mean reward is {mean}")
print(f"The mean score reward is {mean_score}")
print(f"The max score is {max(scores)}")
        # (fragment: begins inside a try block, partway through a room-generation call)
                num_steps=num_gen_steps, num_boxes=num_boxes,
                second_player=False)
            _, state, _ = fix_room
        except:
            success = False

        for i in range(len(version_li)):
            version = version_li[i]
            load_path = '{}/agent_v{}.zip'.format(load_dir, version)
            agent.set_parameters(load_path, exact_match=True)
            # agent = agent_li[i]
            done = False
            obs = np.expand_dims(soko_env.env_method('manual_reset', state)[0],
                                 axis=0)
            while not done:
                action, _ = agent.predict(obs, deterministic=True)
                obs, _, done, info = soko_env.step(action)
            # solved
            if info[0]["all_boxes_on_target"]:
                num_solved_li[i] += 1
                if unique_solver_idx == -1:
                    unique_solver_idx = i
                else:
                    unique_solver_idx = -1
        if unique_solver_idx != -1:
            num_unique_solved_li[unique_solver_idx] += 1

    for i in range(len(version_li)):
        print('{} solved {}, uniquely solved {}'.format(
            version_li[i], num_solved_li[i], num_unique_solved_li[i]))
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(mean_reward)
print(std_reward)

render_env = base_env.copy().parallel_env()
render_env = ss.color_reduction_v0(render_env, mode='B')
render_env = ss.resize_v0(render_env, x_size=84, y_size=84)
render_env = ss.frame_stack_v1(render_env, 3)

obs_list = []
i = 0
render_env.reset()

while True:
    for agent in render_env.agent_iter():
        observation, _, done, _ = render_env.last()
        action = model.predict(observation, deterministic=True)[0] if not done else None
        render_env.step(action)
        i += 1
        # Grab one frame per full round of agents
        if i % (len(render_env.possible_agents)) == 0:
            obs_list.append(np.transpose(render_env.render(mode='rgb_array'),
                                         axes=(1, 0, 2)))
    render_env.close()
    break

print('Writing gif')
write_gif(obs_list, 'kaz.gif', fps=15)
import gym

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed


def make_env(env_id, rank, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init


if __name__ == '__main__':
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    model = PPO('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=1)
print(mean_reward)
print(std_reward)

# Maximum number of steps before reset, +1 because I'm scared of OBOE
print("Starting rendering")
num_steps = (max_time // delta_time) + 1

obs = env.reset()
if os.path.exists("temp"):
    shutil.rmtree("temp")
os.mkdir("temp")
# img = disp.grab()
# img.save(f"temp/img0.jpg")
img = env.render()
for t in trange(num_steps):
    actions, _ = model.predict(obs, state=None, deterministic=False)
    obs, reward, done, info = env.step(actions)
    img = env.render()
    img.save(f"temp/img{t}.jpg")

subprocess.run(["ffmpeg", "-y", "-framerate", "5", "-i", "temp/img%d.jpg",
                "output.mp4"])
print("All done, cleaning up")
shutil.rmtree("temp")
env.close()
import gym

from stable_baselines3 import PPO

env = gym.make("CartPole-v1")
model = PPO("MlpPolicy", env, verbose=1)

# Drop into the debugger to inspect the model before training starts
import ipdb
ipdb.set_trace()

model.learn(total_timesteps=10000)

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()

env.close()
class ALGDemoWrapper(gym.Wrapper):
    def __init__(self, env, alg_path=None, alg_version=0, tempdir_path=None):
        self.alg = PPO('MlpPolicy', env, verbose=0)
        if alg_path is not None:
            load_path = alg_path + str(alg_version)
            self.alg.set_parameters(load_path, exact_match=True)
        if tempdir_path is None:
            tempdir_path = 'temp'
        os.makedirs(tempdir_path, exist_ok=True)
        self.save_dir = tempdir_path
        self.max_attempt = 1000
        self.version = alg_version
        super(ALGDemoWrapper, self).__init__(env)

    def reset(self):
        return self.env.reset()

    def step(self, action):
        return self.env.step(action)

    def generate_level(self):
        while True:
            done = False
            obs = self.env.reset()
            while not done:
                action, _ = self.alg.predict(obs, deterministic=True)
                obs, _, done, info = self.env.step(action)
            if info['fail_type'] == -1:
                return obs

    def generate_episode_gif(self):
        attempt = 0
        while True:
            images = []
            done = False
            obs = self.env.reset()
            images.append(room_to_rgb(obs))
            while not done:
                action, _ = self.alg.predict(obs, deterministic=True)
                obs, _, done, info = self.env.step(action)
                images.append(room_to_rgb(obs))
            if info['train_result'] == 0:
                im_name = '{}/alg_episode_v{}.gif'.format(self.save_dir,
                                                          self.version)
                imageio.mimsave(im_name, images, 'GIF', fps=2)
                return True, obs
            attempt += 1
            if attempt >= self.max_attempt:
                print('Time out. Wasn\'t able to generate good map.')
                return False, None