def test(self, model_epoch: int = 0, should_render: bool = True):
    train_provider, test_provider = self.data_provider.split_data_train_test(
        self.train_split_percentage)
    del train_provider  # only the held-out test split is needed here

    test_env = SubprocVecEnv(
        [make_env(test_provider, i) for i in range(self.n_envs)])

    model_path = path.join('data', 'agents',
                           f'{self.study_name}__{model_epoch}.pkl')
    model = self.Model.load(model_path, env=test_env)

    self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

    state = None
    obs, done, rewards = test_env.reset(), [False], []
    while not all(done):
        action, state = model.predict(obs, state=state)
        obs, reward, done, _ = test_env.step(action)
        rewards.append(reward)

        if should_render and self.n_envs == 1:
            test_env.render(mode='human')

    self.logger.info(
        f'Finished testing model ({self.study_name}__{model_epoch}): '
        f'${np.sum(rewards):.2f}')
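# The test() method above relies on a make_env factory that is not shown in
# this snippet. Below is a minimal sketch of what it might look like; the
# `TradingEnv` class and its constructor signature are assumptions for
# illustration, not part of the original code.
def make_env(data_provider, rank: int):
    """Return a thunk that builds one environment instance for SubprocVecEnv."""
    def _init():
        env = TradingEnv(data_provider)  # hypothetical gym.Env subclass
        env.seed(rank)
        return env
    return _init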
def main():
    # env = SubprocVecEnv([(lambda i=i: SwocGym(i + 1, GameServicePath, i,
    #                                           fieldWidth=10, fieldHeight=10,
    #                                           saveEpisode=True))
    #                      for i in range(1)])
    env = SubprocVecEnv([
        (lambda i=i: MazeGym(mazeWidth=10, mazeHeight=10, nrWallsToRemove=10))
        for i in range(1)
    ])
    try:
        model = PPO2("MlpPolicy", env, verbose=1,
                     tensorboard_log='/home/ralph/swoc2019/log')
        if SaveFile.exists():
            print('loading...')
            model.load_parameters(SaveFile)
        else:
            print('Warning: no save file loaded')

        print('evaluating...', end='')
        obs = env.reset()
        totalRewards = None
        for i in range(100):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            totalRewards = totalRewards + rewards if totalRewards is not None else rewards
            env.render()
            sleep(0.2)
        print(f'mean reward: {np.mean(totalRewards)}')
    except KeyboardInterrupt:
        print('closing...')
    finally:
        env.close()
        print('closed')
def attention_render(model_name, env_name, num_cpu, log_dir):
    if not os.path.exists(log_dir):
        # raising a bare string is a TypeError in Python 3; raise an exception
        raise FileNotFoundError(f'log_dir does not exist: {log_dir}')
    env_id = env_name + 'NoFrameskip-v4'
    env = SubprocVecEnv([make_env(env_id, i, log_dir) for i in range(num_cpu)])
    # env = Monitor(env, log_dir, allow_early_resets=True)

    if model_name == 'A2C_Attention':
        model = A2C(AttentionPolicy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention2':
        model = A2C(Attention2Policy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C':
        model = A2C(LstmPolicy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    else:
        raise ValueError(f'Unknown model_name: {model_name}')
    model = model.load(log_dir + model_name + '_' + env_name, env=env)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        # get_attention() is provided by the custom attention policies above;
        # it is not part of the stable-baselines A2C API
        attentions = model.get_attention(obs, _states, done)[0]
        attentions_img = []
        for attention in attentions:
            attention = np.array(attention)
            # reshape the flat attention weights onto the 10x-downsampled grid,
            # then upsample back to the observation resolution
            attention = np.reshape(attention, [
                env.observation_space.shape[0] // 10,
                env.observation_space.shape[1] // 10, 1
            ])
            attention = np.repeat(attention, 10, axis=0)
            attention = np.repeat(attention, 10, axis=1)
            attentions_img.append(attention * 255)
        attentions = tile_images(attentions_img)
        cv2.imshow('attention', attentions)
        cv2.waitKey(1)
        env.render()
    return model
def test():
    # Parallel environments
    n_cpu = 4
    env = SubprocVecEnv([lambda: RSEnv() for i in range(n_cpu)])
    model = A2C(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=600000, log_interval=10)
    model.save("sba2c")

    env = TestRSEnv()
    obs = env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
    env.close()
def main():
    agent_data = pd.read_csv('../output_EURUSD_M1_/agentData.csv')
    agent_data = agent_data.drop(agent_data.columns[0], axis=1)
    agent_data = agent_data.astype('float32')

    env = SubprocVecEnv([lambda: ForexTradingEnv(agent_data)] * 10)
    # env = DummyVecEnv([lambda: ForexTradingEnv(agent_data)])
    # model = DQN(CustomDQNPolicy, env, gamma=0.95, verbose=1,
    #             tensorboard_log="./tensorboard", entcoeff=0.005,
    #             adam_epsilon=1e-6)

    from TenorboardCallbacks import TensorboardCallback
    checkpoint_callback = CheckpointCallback(save_freq=1000000,
                                             save_path='./models/',
                                             name_prefix='ppo2')
    for curr in [1]:
        model = PPO2(PPO2Policy_Basic, env, verbose=1,
                     tensorboard_log="./tensorboard", vf_coef=1e-7,
                     ent_coef=1e-4, n_steps=512, gamma=0.99)
        # model = PPO2.load("5_days_model/ppo2_999000000_steps.zip",
        #                   policy=PPO2Policy_Basic, env=env, verbose=1,
        #                   tensorboard_log="./tensorboard")
        model.learn(total_timesteps=10000000000, log_interval=10000000,
                    callback=CallbackList([TensorboardCallback(env),
                                           checkpoint_callback]))
        model.save(model_fileName)

    obs = env.reset()
    for i in range(2000000):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
        # `done` is an array with one flag per environment, so test all of them
        if done.all():
            break
def run_baseline_ppo2(env_name, n_cpu=4, train=True):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import SubprocVecEnv
    from stable_baselines import PPO2

    if train:
        # multiprocess environment
        env = SubprocVecEnv([lambda: gym.make(env_name) for i in range(n_cpu)])
        model = PPO2(MlpPolicy, env, verbose=1)
        model.learn(total_timesteps=100000)
        model.save("checkpoints/ppo2_" + env_name)
    else:
        from stable_baselines.common.vec_env import DummyVecEnv
        env = DummyVecEnv([lambda: gym.make(env_name)])
        model = PPO2.load("checkpoints/ppo2_" + env_name)
        obs = env.reset()
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
            print("state: ", obs, " reward: ", rewards,
                  " done: ", dones, "info: ", info)
class BitmexTradingStrategySBL(TradingStrategy):
    """A trading strategy capable of self tuning, training, and evaluating
    with stable-baselines.

    Arguments:
        environment: An instance of a trading environment for the agent to trade within.
        model: The RL model to create the agent with. Defaults to DQN.
        policy: The RL policy to train the agent's model with. Defaults to 'MlpPolicy'.
        model_kwargs: Any additional keyword arguments to adjust the model.
        kwargs: Optional keyword arguments to adjust the strategy.
    """

    def __init__(self,
                 environment: BitmexEnvironment,
                 model: BaseRLModel = DQN,
                 policy: Union[str, BasePolicy] = 'MlpPolicy',
                 model_kwargs: any = {},
                 policy_kwargs: any = {},
                 n_env: int = 1,
                 **kwargs):
        self._model = model
        self._model_kwargs = model_kwargs
        self._policy_kwargs = policy_kwargs
        self._n_env = n_env
        self.environment = environment
        self._agent = self._model(policy,
                                  self._environment,
                                  **self._model_kwargs,
                                  policy_kwargs=self._policy_kwargs)

    @property
    def environment(self) -> 'BitmexEnvironment':
        """A `BitmexEnvironment` instance for the agent to trade within."""
        return self._environment

    @environment.setter
    def environment(self, environment: 'BitmexEnvironment'):
        envs = [lambda: environment for _ in range(self._n_env)]
        if self._n_env == 1:
            self._environment = DummyVecEnv(envs)
        else:
            self._environment = SubprocVecEnv(envs)

    def restore_agent(self, path: str, custom_objects: any = {}):
        """Deserialize the strategy's learning agent from a file.

        Arguments:
            path: The `str` path of the file the agent specification is stored in.
        """
        self._custom_objects = custom_objects
        # model kwargs must be unpacked; passing `kwargs=...` would hand the
        # model a single parameter literally named "kwargs"
        self._agent = self._model.load(path,
                                       env=self._environment,
                                       custom_objects=self._custom_objects,
                                       **self._model_kwargs)

    def save_agent(self, path: str):
        """Serialize the learning agent to a file for restoring later.

        Arguments:
            path: The `str` path of the file to store the agent specification in.
        """
        self._agent.save(path)

    def tune(self,
             steps: int = None,
             episodes: int = None,
             callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        raise NotImplementedError

    def _train_callback(self, _locals, _globals):
        # performance = self._environment.performance
        #
        # if self._episode_callback and self._environment.done():
        #     self._episode_callback(performance)
        return True

    def train(self,
              steps: int = None,
              episodes: int = None,
              render_mode: str = None,
              episode_callback: Callable[[pd.DataFrame], bool] = None) -> bool:
        if steps is None:
            raise ValueError(
                'You must set the number of `steps` to train the strategy.')

        self._agent.learn(steps, callback=self._train_callback)

        return True

    def test(self,
             steps: int = None,
             episodes=None,
             render_mode: str = None,
             episode_callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        if steps is None and episodes is None:
            raise ValueError(
                'You must set the number of `steps` or `episodes` to test the strategy.')

        steps_completed, episodes_completed, average_reward = 0, 0, 0
        obs, state, dones = self._environment.reset(), None, [False]
        performance = {}

        while (steps is not None and (steps == 0 or steps_completed < steps)) or \
                (episodes is not None and episodes_completed < episodes):
            actions, state = self._agent.predict(obs, state=state, mask=dones)
            obs, rewards, dones, info = self._environment.step(actions)

            steps_completed += 1
            # incremental running mean of the first environment's reward
            average_reward += (rewards[0] - average_reward) / steps_completed

            exchange_performance = info[0].get('exchange').performance
            performance = exchange_performance if len(exchange_performance) > 0 else performance

            if render_mode is not None:
                self._environment.render(mode=render_mode)

            if dones[0]:
                if episode_callback is not None and not episode_callback(performance):
                    break

                episodes_completed += 1
                obs = self._environment.reset()

        print("Finished running strategy.")
        print("Total episodes: {} ({} timesteps).".format(episodes_completed, steps_completed))
        print("Average reward: {}.".format(average_reward))

        return performance
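# A minimal usage sketch for the strategy class above. How the
# BitmexEnvironment is constructed is not shown in the snippet, so
# `make_bitmex_environment()` below is a placeholder, not a real helper.
environment = make_bitmex_environment()  # hypothetical factory

strategy = BitmexTradingStrategySBL(environment=environment,
                                    model=DQN,
                                    policy='MlpPolicy',
                                    n_env=1)
strategy.train(steps=10000)
performance = strategy.test(steps=1000)
strategy.save_agent('./agents/bitmex_dqn')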
policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

if __name__ == '__main__':
    env = SubprocVecEnv([lambda: BaseEnv() for i in range(4)])
    env = VecFrameStack(env, 3)

    model = PPO2(get_policy(policy), env, verbose=0, nminibatches=1,
                 tensorboard_log=tensorboard_folder)
    model.learn(total_timesteps=100000000, tb_log_name='PPO2' + model_tag)
    model.save(model_folder + "PPO2" + model_tag)
    del model

    model = PPO2.load(model_folder + "PPO2" + model_tag)

    done = [False]
    states = None
    obs = env.reset()
    # `done` is an array with one flag per environment; testing it directly
    # with `while not done:` raises a truth-value error for multiple envs
    while not any(done):
        action, states = model.predict(obs, states)
        obs, _, done, info = env.step(action)
        env.render()
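# get_policy() is not defined in the snippet above. A plausible minimal
# sketch is given below; the tag-to-policy mapping and the MlpPolicy default
# are assumptions for illustration, not the project's actual implementation.
from stable_baselines.common.policies import MlpPolicy, CnnPolicy, MlpLstmPolicy

def get_policy(name: str):
    """Map a command-line tag to a stable-baselines policy class."""
    policies = {
        'cnn': CnnPolicy,
        'lstm': MlpLstmPolicy,  # recurrent policies need nminibatches=1, as above
    }
    return policies.get(name, MlpPolicy)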
def make_env(env_id, rank, seed=0):
    """
    Utility function for a multiprocessed env.

    :param env_id: (str) the environment ID
    :param rank: (int) index of the subprocess
    :param seed: (int) the initial seed for the RNG
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init

if __name__ == '__main__':
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    model = ACKTR(MlpPolicy, env, verbose=1,
                  tensorboard_log="./a2c_cartpole_tensorboard/")
    model.learn(total_timesteps=250000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render(mode='rgb_array')
def main(args):
    log_dir = args.log_path if (args.log_path is not None) else \
        "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    configure_logger(log_dir)
    set_global_seeds(args.seed)

    n_cpu = get_num_workers(args.env) if not args.play else 1
    env_kwargs = get_env_kwargs(args.env, args.random_ratio, args.sequential,
                                args.reward_type, args.n_object, args.curriculum)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir,
                                flatten_dict=True, kwargs=env_kwargs)

    env = SubprocVecEnv([make_thunk(i) for i in range(n_cpu)])

    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    if "use_cu" in eval_env_kwargs:
        eval_env_kwargs['use_cu'] = False
    eval_env = make_env(env_id=args.env, rank=0, flatten_dict=True,
                        kwargs=eval_env_kwargs)
    print(eval_env)

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)
        train_kwargs = get_train_kwargs("ppo", args, parsed_action_noise=None,
                                        eval_env=eval_env)

        # policy = 'MlpPolicy'
        from utils.attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("ppo", args)
        print(policy_kwargs)

        model = PPO2(args.policy, env, verbose=1, nminibatches=32, lam=0.95,
                     noptepochs=10, ent_coef=0.01, learning_rate=3e-4,
                     cliprange=0.2, policy_kwargs=policy_kwargs, **train_kwargs)
        print(model.get_parameter_list())

        def callback(_locals, _globals):
            num_update = _locals["update"]
            if 'FetchStack' in args.env:
                mean_eval_reward = stack_eval_model(eval_env, _locals["self"])
            else:
                mean_eval_reward = eval_model(eval_env, _locals["self"])
            log_eval(num_update, mean_eval_reward)
            if num_update % 10 == 0:
                model_path = os.path.join(log_dir, 'model_' + str(num_update // 10))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        model.learn(total_timesteps=int(args.num_timesteps), callback=callback,
                    seed=args.seed, log_interval=1)
        model.save(os.path.join(log_dir, 'final'))
    else:
        assert args.load_path is not None
        model = PPO2.load(args.load_path)

        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        obs = env.reset()
        goal_dim = env.get_attr('goal')[0].shape[0]
        # resample the initial state until it matches the task being played
        if 'FetchStack' in args.env:
            while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                    env.get_attr('task_mode')[0] != 1:
                obs = env.reset()
        elif 'FetchPush' in args.env:
            while not (1.25 < obs[0][6] < 1.33 and obs[0][7] < 0.61
                       and 0.7 < obs[0][4] < 0.8):
                obs = env.reset()
            env.env_method('set_goal', np.array([1.2, 0.75, 0.425, 1, 0]))
            obs = env.env_method('get_obs')
            obs[0] = np.concatenate([obs[0][key] for key in
                                     ['observation', 'achieved_goal', 'desired_goal']])
        else:
            while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                obs = env.reset()
        print('achieved_goal', obs[0][-2 * goal_dim:-goal_dim],
              'goal', obs[0][-goal_dim:])

        episode_reward = 0.0
        num_episode = 0
        frame_idx = 0
        images = []
        if 'max_episode_steps' not in env_kwargs.keys():
            env_kwargs['max_episode_steps'] = 100

        for i in range(env_kwargs['max_episode_steps'] * 10):
            img = env.render(mode='rgb_array')
            ax.cla()
            ax.imshow(img)
            if env.get_attr('goal')[0].shape[0] <= 3:
                ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx))
            else:
                ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx)
                             + ', goal idx ' + str(np.argmax(env.get_attr('goal')[0][3:])))
            if 'FetchStack' in args.env:
                tasks = ['pick and place', 'stack']
                ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx)
                             + ', task: ' + tasks[np.argmax(obs[0][-2 * goal_dim - 2:-2 * goal_dim])])
            images.append(img)

            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            frame_idx += 1

            if not args.export_video:
                plt.pause(0.1)
            else:
                plt.imsave(os.path.join(os.path.dirname(args.load_path),
                                        'tempimg%d.png' % i), img)

            if done:
                print('episode_reward', episode_reward)
                if 'FetchStack' in args.env:
                    while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                            env.get_attr('task_mode')[0] != 1:
                        obs = env.reset()
                else:
                    while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                        obs = env.reset()
                print('goal', obs[0][-goal_dim:])
                episode_reward = 0.0
                frame_idx = 0
                num_episode += 1
                if num_episode >= 10:
                    break

        if args.export_video:
            os.system('ffmpeg -r 5 -start_number 0 -i '
                      + os.path.dirname(args.load_path)
                      + '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p '
                      + os.path.join(os.path.dirname(args.load_path), args.env + '.mp4'))
            # clean up the temporary frames written for the video export
            for i in range(env_kwargs['max_episode_steps'] * 10):
                try:
                    os.remove(os.path.join(os.path.dirname(args.load_path),
                                           'tempimg' + str(i) + '.png'))
                except OSError:
                    pass
if __name__ == '__main__':
    env_id = "TaxiDummy-v01"
    DATA_PATH = os.path.join(os.environ['ALLDATA_PATH'], "macaoFiles",
                             "taxi_env_dummy")
    if os.path.isdir(DATA_PATH):
        shutil.rmtree(DATA_PATH)
    os.makedirs(DATA_PATH)

    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    model = A2C(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=100000)

    obs = env.reset()
    images = []
    img = env.render(mode="rgb_array")
    images.append(img)
    for _ in range(70):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        images.append(env.render(mode="rgb_array"))

    imageio.mimwrite(os.path.join(DATA_PATH, 'taxi_dummy_a2c.gif'),
                     [np.array(img) for img in images],
                     format="GIF-PIL", fps=5)
if __name__ == "__main__": # Parallel environments if parallel: env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)]) else: env = DummyVecEnv([make_env(env_id, i) for i in range(num_cpu)]) model = PPO2(CnnPolicy, env, verbose=1, tensorboard_log=log_dir) model.learn(total_timesteps=max_steps) model.save(log_dir + "ppo_minigrid") del model # remove to demonstrate saving and loading model = PPO2.load(log_dir + "ppo_minigrid") env = make_env(env_id, 0)() mean_reward, std_reward = evaluate_policy(model, env) print("Mean Reward: {}, std_dev: {}".format(mean_reward, std_reward)) demo = input("Watch model? (q to quit)") if demo != "q": for _ in range(1000): obs = env.reset() t = 0 while t < 200: action, _states = model.predict(obs) obs, rewards, done, info = env.step(action) print("Action: {}\tTimestep: {}".format(action, t)) env.render(mode='human') t += 1 if done: break
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import A2C

# SubprocVecEnv forks worker processes, so on platforms that use the `spawn`
# start method this setup belongs under an `if __name__ == '__main__':` guard
n_cpu = 4  # run 4 environments in parallel processes
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = A2C(MlpPolicy, env, verbose=1)  # A2C with the MlpPolicy network
model.learn(total_timesteps=25000)      # train

obs = env.reset()
while True:
    action, _states = model.predict(obs)          # predict the next action
    obs, rewards, dones, info = env.step(action)  # advance the game one step
    env.render()                                  # display
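# As the comment in the CartPole/ACKTR snippet above notes, recent
# stable-baselines releases ship a make_vec_env() helper that replaces the
# hand-rolled list of env thunks. A minimal sketch of the same A2C setup
# using it, assuming a stable-baselines version that provides the helper:
import gym
from stable_baselines.common import make_vec_env
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import A2C

# vec_env_cls selects SubprocVecEnv instead of the DummyVecEnv default
env = make_vec_env('CartPole-v1', n_envs=4, seed=0, vec_env_cls=SubprocVecEnv)

model = A2C(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)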