def generate(parameter_distribution, num_episodes, env_update_fn, filepath=None, n_cpu=6):
    env_name = 'CartPole-v1'
    model_dir = os.path.join(os.getcwd(), 'models')
    model_path = os.path.join(model_dir, 'ppo2_' + env_name + '.pkl')
    os.makedirs(model_dir, exist_ok=True)
    if filepath:
        os.makedirs(os.path.dirname(filepath), exist_ok=True)

    def make_env(env_name):
        env = gym.make(env_name)
        return env

    env = SubprocVecEnv([lambda: make_env(env_name) for i in range(n_cpu)])
    try:
        model = PPO2.load(model_path)
    except Exception:
        trainer = CartPoleTrainer(env)
        model = trainer.train(model_path)

    obs = env.reset()
    # Collect rollouts in a single (non-vectorized) environment.
    env = make_env(env_name)
    states, actions, next_states, parameters, steps = [], [], [], [], []
    for ep in range(num_episodes):
        obs = env.reset()
        params = parameter_distribution()
        env_update_fn(env.unwrapped, params)
        done = False
        step = 0
        while not done:
            action, _states = model.predict(obs)
            states.append(obs)
            actions.append([action])
            obs, reward, done, info = env.step(action)
            next_states.append(obs)
            parameters.append(params)
            steps.append(step)
            step += 1

    data = {
        'states': np.array(states),
        'actions': np.array(actions),
        'next_states': np.array(next_states),
        'parameters': np.array(parameters),
        'steps': np.array(steps)
    }
    if filepath:
        print('filepath: ', filepath)
        with open(filepath, 'wb') as f:
            np.save(f, data)
    return data
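# Usage sketch (assumption, not part of the original source): `generate` expects a
# callable that samples simulator parameters and a callable that writes them into the
# unwrapped env. For CartPole-v1, `length` and `masspole` are attributes of gym's
# CartPoleEnv; the filepath and sampling ranges below are illustrative placeholders.
def sample_params():
    # Sample a random pole length and pole mass.
    return np.random.uniform([0.25, 0.05], [1.0, 0.2])

def apply_params(unwrapped_env, params):
    unwrapped_env.length = float(params[0])
    unwrapped_env.masspole = float(params[1])

if __name__ == '__main__':
    data = generate(sample_params, num_episodes=50, env_update_fn=apply_params,
                    filepath='data/cartpole_rollouts.npy', n_cpu=4)
    print({k: v.shape for k, v in data.items()})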
def main(mode="train"):
    n_cpu = 2
    env = SubprocVecEnv(
        [lambda: gym.make('balancebot-continuum-v0') for i in range(n_cpu)])

    if mode == "train":
        model = PPO2(policy=MlpPolicy,
                     env=env,
                     learning_rate=1e-3,
                     verbose=0,
                     full_tensorboard_log=False,
                     tensorboard_log="./ppo2_balancebot_tensorboard")
        model.learn(total_timesteps=100000, callback=callback)
        print("Saving model to ppo2_balance_continuum.pkl")
        model.save("ppo2_balance_continuum.pkl")
        del model  # remove to demonstrate saving and loading

    if mode == "test":
        model = PPO2.load("ppo2_balance_continuum.pkl")
        obs = env.reset()
        done = [False, False]
        # env.set_done(5000)
        while not all(done):
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            # env.render()
            print(obs)
def play(env_name, seed, load_file, total_timesteps, n_cpu):
    np.set_printoptions(precision=5)

    def padding_obss(obss, dummy_obss):
        dummy_obss[0, :, :, :] = obss
        return dummy_obss

    # The trained LSTM model cannot change its number of envs,
    # so the single-env observation is padded with dummy data.
    dummy_obss = np.zeros((n_cpu, 64, 64, 4))

    env = SubprocVecEnv([make_env(env_name, 0, seed)])
    model = PPO2.load(load_file, verbose=1)

    obss = env.reset()
    obss = padding_obss(obss, dummy_obss)

    rewards_buf = []
    steps_buf = []
    # TODO: single
    for i in range(total_timesteps):
        actions, _states = model.predict(obss)
        actions = actions[0:1]
        obss, rewards, dones, infos = env.step(actions)
        obss = padding_obss(obss, dummy_obss)
        # env.render()  # dummy
        if dones[0]:
            rewards_buf.append(infos[0]['episode']['r'])
            steps_buf.append(infos[0]['episode']['l'])
            line = np.array([np.mean(rewards_buf), np.std(rewards_buf),
                             np.mean(steps_buf), np.std(steps_buf)])
            print(len(rewards_buf), line)
            obss = env.reset()
            obss = padding_obss(obss, dummy_obss)
    env.close()
def main():
    # env = SubprocVecEnv([(lambda i=i: SwocGym(i+1, GameServicePath, i, fieldWidth=10, fieldHeight=10, saveEpisode=True)) for i in range(1)])
    env = SubprocVecEnv([
        (lambda i=i: MazeGym(mazeWidth=10, mazeHeight=10, nrWallsToRemove=10))
        for i in range(1)
    ])
    try:
        model = PPO2("MlpPolicy", env, verbose=1,
                     tensorboard_log='/home/ralph/swoc2019/log')
        if SaveFile.exists():
            print('loading...')
            model.load_parameters(SaveFile)
        else:
            print('Warning: No save file loaded')

        print('evaluating...', end='')
        obs = env.reset()
        totalRewards = None
        for i in range(100):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            totalRewards = totalRewards + rewards if totalRewards is not None else rewards
            env.render()
            sleep(0.2)
        print(f'mean reward: {np.mean(totalRewards)}')
    except KeyboardInterrupt:
        print('closing...')
    finally:
        env.close()
        print('closed')
def play(env_name, seed, load_file, total_timesteps, n_cpu):
    np.set_printoptions(precision=5)

    def padding_obss(obss, dummy_obss):
        dummy_obss[0, :, :, :] = obss
        return dummy_obss

    # In GUI mode the number of envs is reduced to 1 to limit GUI windows,
    # but the trained LSTM model cannot change its number of envs,
    # so the observation is reshaped by padding with dummy data.
    isGUI = env_name.find('GUI') != -1
    dummy_obss = np.zeros((n_cpu, 64, 64, 4)) if isGUI else None

    env = SubprocVecEnv([make_env(env_name, i, seed)
                         for i in range(1 if isGUI else n_cpu)])
    model = PPO2.load(load_file, verbose=1)

    obss = env.reset()
    obss = padding_obss(obss, dummy_obss) if isGUI else obss

    rewards_buf = []
    steps_buf = []
    # TODO: single
    for i in range(total_timesteps):
        actions, _states = model.predict(obss)
        actions = actions[0:1] if isGUI else actions
        obss, rewards, dones, infos = env.step(actions)
        obss = padding_obss(obss, dummy_obss) if isGUI else obss
        # env.render()  # dummy
        if dones.any():
            rewards_buf.extend([info['episode']['r'] for info in infos if 'episode' in info])
            steps_buf.extend([info['episode']['l'] for info in infos if 'episode' in info])
            line = np.array([np.mean(rewards_buf), np.std(rewards_buf),
                             np.mean(steps_buf), np.std(steps_buf)])
            print(len(rewards_buf), line)
    env.close()
def test(self, model_epoch: int = 0, should_render: bool = True):
    train_provider, test_provider = self.data_provider.split_data_train_test(
        self.train_split_percentage)
    del train_provider

    test_env = SubprocVecEnv([make_env(test_provider, i) for i in range(self.n_envs)])

    model_path = path.join('data', 'agents', f'{self.study_name}__{model_epoch}.pkl')
    model = self.Model.load(model_path, env=test_env)

    self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

    state = None
    obs, done, rewards = test_env.reset(), [False], []
    while not all(done):
        action, state = model.predict(obs, state=state)
        obs, reward, done, _ = test_env.step(action)
        rewards.append(reward)
        if should_render and self.n_envs == 1:
            test_env.render(mode='human')

    self.logger.info(
        f'Finished testing model ({self.study_name}__{model_epoch}): ${"{:.2f}".format(np.sum(rewards))}')
def test(model_name, env_name, num_cpu, log_dir):
    env = SubprocVecEnv([
        make_football_env(env_name, i, log_dir, useMonitor=False)
        for i in range(num_cpu)
    ])
    # env = Monitor(env, log_dir, allow_early_resets=True)
    model = get_model(model_name, env, log_dir)
    model = model.load(log_dir + model_name + '_' + env_name, env=env)

    obs = env.reset()

    from matplotlib import pyplot as plt
    show_num = 1
    while True:
        action, _states = model.predict(obs)
        # obs, rewards, done, info = env.step([int(input('action:'))]*num_cpu)
        obs, rewards, done, info = env.step(action)
        img = obs[show_num, :, :, :]
        fig = plt.figure(0)
        plt.clf()
        plt.imshow(img / 255)
        fig.canvas.draw()
        # env.render()
        plt.pause(0.000001)
def test(test_data, model_location):
    # Use a separate environment to test the model
    env_test = SubprocVecEnv(
        [lambda: ExchangeEnv.ExchangeEnv(test_data, 10000, 0)])
    model = PPO2.load(model_location)

    obs = env_test.reset()
    done = False
    price_history = []
    portfolio_value = []
    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, _ = env_test.step(action)
        # Append the current time step's highest bid
        price_history.append(obs[0][0][0])
        # Append the current portfolio value
        portfolio_value.append(rewards[0])

    with open("price_history.txt", "w") as f:
        writer = csv.writer(f)
        writer.writerow(price_history)
    with open("portfolio_value.txt", "w") as f:
        writer = csv.writer(f)
        writer.writerow(portfolio_value)
def optimize_params(self, trial, n_prune_evals_per_trial: int = 2, n_tests_per_eval: int = 1):
    train_provider, test_provider = self.data_provider.split_data_train_test(
        self.train_split_percentage)
    train_provider, validation_provider = train_provider.split_data_train_test(
        self.train_split_percentage)
    del test_provider

    train_env = SubprocVecEnv(
        [make_env(train_provider, i) for i in range(1)])
    validation_env = SubprocVecEnv(
        [make_env(validation_provider, i) for i in range(1)])

    model_params = self.optimize_agent_params(trial)
    model = self.Model(self.Policy,
                       train_env,
                       verbose=self.model_verbose,
                       nminibatches=1,
                       tensorboard_log=self.tensorboard_path,
                       **model_params)

    last_reward = -np.finfo(np.float16).max
    n_steps_per_eval = int(
        len(train_provider.data_frame) / n_prune_evals_per_trial)

    for eval_idx in range(n_prune_evals_per_trial):
        try:
            model.learn(n_steps_per_eval)
        except AssertionError:
            raise

        rewards = []
        n_episodes, reward_sum = 0, 0.0

        state = None
        obs = validation_env.reset()
        while n_episodes < n_tests_per_eval:
            action, state = model.predict(obs, state=state)
            obs, reward, done, _ = validation_env.step(action)
            reward_sum += reward

            if all(done):
                rewards.append(reward_sum)
                reward_sum = 0.0
                n_episodes += 1
                obs = validation_env.reset()

        last_reward = np.mean(rewards)
        trial.report(-1 * last_reward, eval_idx)

        if trial.should_prune(eval_idx):
            raise optuna.structs.TrialPruned()

    return -1 * last_reward
def attention_render(model_name, env_name, num_cpu, log_dir):
    if not os.path.exists(log_dir):
        raise ValueError('log_dir does not exist')
    env_id = env_name + 'NoFrameskip-v4'
    env = SubprocVecEnv([make_env(env_id, i, log_dir) for i in range(num_cpu)])
    # env = Monitor(env, log_dir, allow_early_resets=True)

    if model_name == 'A2C_Attention':
        model = A2C(AttentionPolicy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention2':
        model = A2C(Attention2Policy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C':
        model = A2C(LstmPolicy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    else:
        model = None
    model = model.load(log_dir + model_name + '_' + env_name, env=env)

    obs = env.reset()
    # print(env.observation_space)
    # cv2.imshow('test', RGB2BGR(obs[0]))
    # cv2.waitKey(0)
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        attentions = model.get_attention(obs, _states, done)[0]
        attentions_img = []
        # print('attention', np.array(attention).shape)
        for i, attention in enumerate(attentions):
            attention = np.array(attention)
            attention = np.reshape(attention, [
                env.observation_space.shape[0] // 10,
                env.observation_space.shape[1] // 10, 1
            ])
            attention = np.repeat(attention, [10] * attention.shape[0], axis=0)
            attention = np.repeat(attention, [10] * attention.shape[1], axis=1)
            attention = attention * 255
            attentions_img.append(attention)
            # print(np.sum(attention))
        attentions = tile_images(attentions_img)
        cv2.imshow('attention', attentions)
        cv2.waitKey(1)
        # break
        env.render()
    return model
def run():
    torch.multiprocessing.freeze_support()
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    model = ACKTR(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
class SimulatorModel(object):
    def __init__(self, _make_env_func, parallel_agents):
        """
        This class instantiates a dynamics model based on the pybullet
        simulator (i.e. it simulates exactly the result of the actions);
        it can be used for reward tuning, verifying tasks, etc.

        :param _make_env_func: (func) a function that, when called, returns a
                               gym environment.
        :param parallel_agents: (int) number of parallel agents to simulate
                                to evaluate the actions.
        """
        self.parallel_agents = parallel_agents
        self.envs = SubprocVecEnv(
            [_make_env_func() for i in range(self.parallel_agents)])
        return

    def evaluate_trajectories(self, action_sequences):
        """
        Evaluates the action sequences and returns the corresponding reward
        for each sequence.

        :param action_sequences: (nd.array) actions to be evaluated
                                 (number of sequences, horizon length)
        :return: (nd.array) sum of rewards for each action sequence.
        """
        horizon_length = action_sequences.shape[1]
        num_of_particles = action_sequences.shape[0]
        rewards = np.zeros([num_of_particles])
        assert ((float(num_of_particles) / self.parallel_agents).is_integer())
        for j in range(0, num_of_particles, self.parallel_agents):
            self.envs.reset()
            total_reward = np.zeros([self.parallel_agents])
            for k in range(horizon_length):
                actions = action_sequences[j:j + self.parallel_agents, k]
                task_observations, current_reward, done, info = \
                    self.envs.step(actions)
                total_reward += current_reward
            rewards[j:j + self.parallel_agents] = total_reward
        return rewards

    def end_sim(self):
        """
        Closes the environments that were used for simulation.

        :return:
        """
        self.envs.close()
        return
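# Usage sketch (assumption, not part of the original source): evaluating a batch of
# random action sequences with SimulatorModel. `make_my_env` and 'Pendulum-v0' are
# placeholders; because the constructor calls `_make_env_func()` and forwards the
# result to SubprocVecEnv, the factory is wrapped so that calling it yields an env
# constructor. The batch size must be a multiple of `parallel_agents`, matching the
# assert in evaluate_trajectories.
def make_my_env():
    return gym.make('Pendulum-v0')  # placeholder environment

sim_model = SimulatorModel(lambda: make_my_env, parallel_agents=4)
# 8 candidate sequences, horizon of 10 steps, 1-D actions
action_sequences = np.random.uniform(-2.0, 2.0, size=(8, 10, 1))
sequence_rewards = sim_model.evaluate_trajectories(action_sequences)
best_sequence = action_sequences[np.argmax(sequence_rewards)]
sim_model.end_sim()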
def main():
    num_envs = 20
    num_players_per_env = 2
    envs = [makeEnv for i in range(num_envs)]
    actionSpace = makeEnv().action_space
    env = SubprocVecEnv(envs)
    env.reset()

    gameFinished = [False] * num_envs
    while not all(gameFinished):
        inputs = []
        for game in range(num_envs):
            if not gameFinished[game]:
                inputs.append([actionSpace.sample()
                               for player in range(num_players_per_env)])
            else:
                inputs.append([0] * num_players_per_env)
        _, _, done, info = env.step(inputs)
        gameFinished = [gameElem or doneElem
                        for gameElem, doneElem in list(zip(gameFinished, done))]
        time.sleep(.0001)
def test():
    # Parallel environments
    n_cpu = 4
    env = SubprocVecEnv([lambda: RSEnv() for i in range(n_cpu)])
    model = A2C(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=600000, log_interval=10)
    model.save("sba2c")

    env = TestRSEnv()
    obs = env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
    env.close()
def objective(trial):
    # Hyper-parameters to adjust
    policy = trial.suggest_categorical(
        'policy', ['MlpPolicy', 'MlpLnPolicy', 'MlpLstmPolicy', 'MlpLnLstmPolicy'])
    gamma = trial.suggest_uniform('gamma', 0.10, 1.0)
    ent_coef = trial.suggest_uniform('ent_coef', 0.01, 0.10)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
    vf_coef = trial.suggest_uniform('vf_coef', 0.10, 1.0)
    lam = trial.suggest_uniform('lam', 0.01, 0.95)

    if policy == 'MlpPolicy':
        policy = MlpPolicy
    elif policy == 'MlpLnPolicy':
        policy = MlpLnPolicy
    elif policy == 'MlpLstmPolicy':
        policy = MlpLstmPolicy
    elif policy == 'MlpLnLstmPolicy':
        policy = MlpLnLstmPolicy

    # Train with those hyper-parameters
    n_cpu = 4
    env = SubprocVecEnv([lambda: gimbal(5, 500) for i in range(n_cpu)])
    model = PPO2(policy=policy,
                 env=env,
                 gamma=gamma,
                 n_steps=100,
                 ent_coef=ent_coef,
                 learning_rate=learning_rate,
                 vf_coef=vf_coef,
                 max_grad_norm=0.5,
                 lam=lam,
                 nminibatches=4,
                 noptepochs=4,
                 cliprange=0.2,
                 verbose=0,
                 tensorboard_log=None,
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False)
    model.learn(total_timesteps=250000, callback=None, seed=None,
                log_interval=1, tb_log_name='PPO2', reset_num_timesteps=True)

    # Calculate worth
    env = gimbal(5, 500)
    MAX_episodes = 25
    reward_avg = 0
    for episodes in range(MAX_episodes):
        obs = env.reset()
        r = 0
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            r += rewards
            # env.render()
            if dones:
                reward_avg += r
                break
    return -(reward_avg / MAX_episodes)
def optimize_agent(trial):
    """ Train the model and optimise.
        Optuna minimises the objective, so the mean reward is negated here.
    """
    model_params = optimize_TD3(trial)
    env = SubprocVecEnv([
        lambda: NormalizeActionWrapper(LearningRocket(visualize=False))
        for i in range(n_cpu)
    ])
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    model = TD3(MlpPolicy, env,
                action_noise=action_noise,
                policy_kwargs=dict(layers=[400, 300]))
    model.learn(50000)

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    step = 0
    while n_episodes < 4:
        step += 1
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(-1 * last_reward, step)

    return -1 * last_reward
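# Driver sketch (assumption, not from the original source): how the `optimize_agent`
# objective above might be plugged into an Optuna study. The study name and trial
# count are placeholders; Optuna minimises the returned value, and the objective
# returns the negated mean episode reward, so lower is better here.
import optuna

if __name__ == '__main__':
    study = optuna.create_study(study_name='td3_rocket', direction='minimize')
    study.optimize(optimize_agent, n_trials=20)
    print('Best value:', study.best_value)
    print('Best params:', study.best_params)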
def main():
    agent_data = pd.read_csv('../output_EURUSD_M1_/agentData.csv')
    agent_data = agent_data.drop(agent_data.columns[0], axis=1)
    agent_data = agent_data.astype('float32')

    env = SubprocVecEnv([lambda: ForexTradingEnv(agent_data)] * 10)
    # env = DummyVecEnv([lambda: ForexTradingEnv(agent_data)])
    # model = DQN(CustomDQNPolicy, env, gamma=0.95, verbose=1, tensorboard_log="./tensorboard", entcoeff=0.005, adam_epsilon=1e-6)

    import tensorflow as tf
    from TenorboardCallbacks import TensorboardCallback

    checkpoint_callback = CheckpointCallback(save_freq=1000000,
                                             save_path='./models/',
                                             name_prefix='ppo2')
    for curr in [1]:
        model = PPO2(PPO2Policy_Basic, env, verbose=1,
                     tensorboard_log="./tensorboard",
                     vf_coef=1e-7, ent_coef=1e-4, n_steps=512, gamma=0.99)
        # model = PPO2.load("5_days_model/ppo2_999000000_steps.zip", policy=PPO2Policy_Basic, env=env, verbose=1, tensorboard_log="./tensorboard")
        model.learn(total_timesteps=10000000000, log_interval=10000000,
                    callback=CallbackList([TensorboardCallback(env),
                                           checkpoint_callback]))
        model.save(model_fileName)

    obs = env.reset()
    for i in range(2000000):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        if i % 1 == 0:
            env.render()
        if done.any():  # `done` is an array in a vectorized env
            break
class Agent:
    def __init__(self):
        self.env = None
        self.model = None

    def create_env(self, game, envs, render=False, sleep=0.):
        env = gym.make(game)
        # env = FrameStack(env, 4)
        env = CustomGym(env, render=render, sleep=sleep)
        self.env = SubprocVecEnv([lambda: env for i in range(envs)])

    def create_model(self):
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config):
            self.model = PPO2(policy=MlpPolicy,
                              env=self.env,
                              n_steps=8192,
                              nminibatches=8,
                              lam=0.95,
                              gamma=0.99,
                              noptepochs=4,
                              ent_coef=0.001,
                              learning_rate=lambda _: 2e-5,
                              cliprange=lambda _: 0.2,
                              verbose=1,
                              tensorboard_log="gym_logs")

    def train(self, timesteps, loops, name="agent"):
        for i in range(loops):
            self.model.learn(timesteps)
            self.model.save(name + str(i))

    def evaluate(self, timesteps, agent_name):
        self.model = PPO2.load(agent_name)
        obs = self.env.reset()
        for i in range(timesteps):
            action, _states = self.model.predict(obs)
            obs, rewards, dones, info = self.env.step(action)
def run_baseline_ppo2(env_name, n_cpu=4, train=True):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import SubprocVecEnv
    from stable_baselines import PPO2

    if train:
        # multiprocess environment
        env = SubprocVecEnv([lambda: gym.make(env_name) for i in range(n_cpu)])
        model = PPO2(MlpPolicy, env, verbose=1)
        model.learn(total_timesteps=100000)
        model.save("checkpoints/ppo2_" + env_name)
    else:
        from stable_baselines.common.vec_env import DummyVecEnv
        env = DummyVecEnv([lambda: gym.make(env_name)])
        model = PPO2.load("checkpoints/ppo2_" + env_name)
        obs = env.reset()
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
            print("state: ", obs, " reward: ", rewards,
                  " done: ", dones, "info: ", info)
class BitmexTradingStrategySBL(TradingStrategy):
    """A trading strategy capable of self-tuning, training, and evaluating with stable-baselines.

    Arguments:
        environment: An instance of a trading environment for the agent to trade within.
        model: The RL model to create the agent with. Defaults to DQN.
        policy: The RL policy to train the agent's model with. Defaults to 'MlpPolicy'.
        model_kwargs: Any additional keyword arguments to adjust the model.
        kwargs: Optional keyword arguments to adjust the strategy.
    """

    def __init__(self,
                 environment: BitmexEnvironment,
                 model: BaseRLModel = DQN,
                 policy: Union[str, BasePolicy] = 'MlpPolicy',
                 model_kwargs: any = {},
                 policy_kwargs: any = {},
                 n_env: int = 1,
                 **kwargs):
        self._model = model
        self._model_kwargs = model_kwargs
        self._policy_kwargs = policy_kwargs
        self._n_env = n_env

        self.environment = environment
        self._agent = self._model(policy, self._environment,
                                  **self._model_kwargs,
                                  policy_kwargs=self._policy_kwargs)

    @property
    def environment(self) -> 'BitmexEnvironment':
        """A `BitmexEnvironment` instance for the agent to trade within."""
        return self._environment

    @environment.setter
    def environment(self, environment: 'BitmexEnvironment'):
        envs = [lambda: environment for _ in range(self._n_env)]

        if self._n_env == 1:
            self._environment = DummyVecEnv(envs)
        else:
            self._environment = SubprocVecEnv(envs)

    def restore_agent(self, path: str, custom_objects: any = {}):
        """Deserialize the strategy's learning agent from a file.

        Arguments:
            path: The `str` path of the file the agent specification is stored in.
        """
        self._custom_objects = custom_objects
        self._agent = self._model.load(path,
                                       env=self._environment,
                                       custom_objects=self._custom_objects,
                                       kwargs=self._model_kwargs)

    def save_agent(self, path: str):
        """Serialize the learning agent to a file for restoring later.

        Arguments:
            path: The `str` path of the file to store the agent specification in.
        """
        self._agent.save(path)

    def tune(self, steps: int = None, episodes: int = None,
             callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        raise NotImplementedError

    def _train_callback(self, _locals, _globals):
        # performance = self._environment.performance
        #
        # if self._episode_callback and self._environment.done():
        #     self._episode_callback(performance)
        return True

    def train(self,
              steps: int = None,
              episodes: int = None,
              render_mode: str = None,
              episode_callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        if steps is None:
            raise ValueError(
                'You must set the number of `steps` to train the strategy.')

        self._agent.learn(steps, callback=self._train_callback)

        return True

    def test(self,
             steps: int = None,
             episodes=None,
             render_mode: str = None,
             episode_callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        if steps is None and episodes is None:
            raise ValueError(
                'You must set the number of `steps` or `episodes` to test the strategy.')

        steps_completed, episodes_completed, average_reward = 0, 0, 0
        obs, state, dones = self._environment.reset(), None, [False]
        performance = {}

        while (steps is not None and (steps == 0 or steps_completed < steps)) or (
                episodes is not None and episodes_completed < episodes):
            actions, state = self._agent.predict(obs, state=state, mask=dones)
            # actions, state = self._agent.predict(obs)
            obs, rewards, dones, info = self._environment.step(actions)

            steps_completed += 1
            average_reward -= average_reward / steps_completed
            average_reward += rewards[0] / (steps_completed + 1)

            exchange_performance = info[0].get('exchange').performance
            performance = exchange_performance if len(
                exchange_performance) > 0 else performance

            if render_mode is not None:
                self._environment.render(mode=render_mode)

            if dones[0]:
                if episode_callback is not None and not episode_callback(performance):
                    break

                episodes_completed += 1
                obs = self._environment.reset()

        print("Finished running strategy.")
        print("Total episodes: {} ({} timesteps).".format(
            episodes_completed, steps_completed))
        print("Average reward: {}.".format(average_reward))

        return performance
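# Usage sketch (assumption, not part of the original source): training and then
# testing the strategy above. The `BitmexEnvironment()` construction, the PPO2
# model choice, and the save path are placeholders; only the train() / test() /
# save_agent() calls mirror the class as defined.
from stable_baselines import PPO2

environment = BitmexEnvironment()  # hypothetical default construction
strategy = BitmexTradingStrategySBL(environment, model=PPO2, n_env=4)

strategy.train(steps=100000)
strategy.save_agent('agents/bitmex_ppo2.pkl')

performance = strategy.test(episodes=5)
print(performance)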
def run(alg, alg_kwargs, task, task_kwargs, wrappers_kwargs, expl_params,
        rollout, num_trials, folder, n_thrds, n_lstm, rerun=False,
        test_kwargs={}, num_retrains=10, seed=0, train_mode=None,
        sl_kwargs=None):
    train_mode = train_mode or 'RL'
    env = test_env(task, kwargs=task_kwargs, num_steps=1000)
    num_timesteps = int(1000 * num_trials / (env.num_tr))
    files = glob.glob(folder + '/*model*')
    vars_ = {
        'alg': alg, 'alg_kwargs': alg_kwargs, 'task': task,
        'task_kwargs': task_kwargs, 'wrappers_kwargs': wrappers_kwargs,
        'expl_params': expl_params, 'rollout': rollout, 'folder': folder,
        'num_trials': num_trials, 'n_thrds': n_thrds, 'n_lstm': n_lstm
    }
    np.savez(folder + '/params.npz', **vars_)

    if len(files) == 0 or rerun:
        if train_mode == 'RL':
            if alg == "A2C":
                from stable_baselines import A2C as algo
            elif alg == "ACER":
                from stable_baselines import ACER as algo
            elif alg == "ACKTR":
                from stable_baselines import ACKTR as algo
            elif alg == "PPO2":
                from stable_baselines import PPO2 as algo
            env = SubprocVecEnv([
                make_env(env_id=task, rank=i, seed=seed,
                         wrapps=wrappers_kwargs, **task_kwargs)
                for i in range(n_thrds)
            ])
            model = algo(LstmPolicy, env, verbose=0, n_steps=rollout,
                         n_cpu_tf_sess=n_thrds, tensorboard_log=None,
                         policy_kwargs={"feature_extraction": "mlp",
                                        "n_lstm": n_lstm},
                         **alg_kwargs)
            # this assumes 1 trial ~ 10 steps
            sv_freq = 5 * wrappers_kwargs['MonitorExtended-v0']['sv_per']
            chckpnt_cllbck = CheckpointCallback(save_freq=sv_freq,
                                                save_path=folder,
                                                name_prefix='model')
            model.learn(total_timesteps=num_timesteps, callback=chckpnt_cllbck)
            model.save(f"{folder}/model_{num_timesteps}_steps.zip")
            plotting.plot_rew_across_training(folder=folder)
        elif train_mode == 'SL':
            stps_ep = sl_kwargs['steps_per_epoch']
            wraps_sl = deepc(wrappers_kwargs)
            del wraps_sl['PassAction-v0']
            del wraps_sl['PassReward-v0']
            del wraps_sl['MonitorExtended-v0']
            env = make_env(env_id=task, rank=0, seed=seed, wrapps=wraps_sl,
                           **task_kwargs)()
            dataset = ngym.Dataset(env, batch_size=sl_kwargs['btch_s'],
                                   seq_len=rollout, batch_first=True)
            obs_size = env.observation_space.shape[0]
            act_size = env.action_space.n
            model = define_model(seq_len=rollout, num_h=n_lstm,
                                 obs_size=obs_size, act_size=act_size,
                                 batch_size=sl_kwargs['btch_s'],
                                 stateful=sl_kwargs['stateful'],
                                 loss=sl_kwargs['loss'])
            # Train network
            data_generator = (dataset() for i in range(stps_ep))
            model.fit(data_generator, verbose=1, steps_per_epoch=stps_ep)
            model.save(f"{folder}/model_{stps_ep}_steps")

    if len(test_kwargs) != 0:
        for key in test_kwargs.keys():
            sv_folder = folder + key
            test_kwargs[key]['seed'] = seed
            if train_mode == 'RL':
                if '_all' not in key:
                    ga.get_activity(folder, alg, sv_folder, **test_kwargs[key])
                else:
                    files = glob.glob(folder + '/model_*_steps.zip')
                    for f in files:
                        model_name = os.path.basename(f)
                        sv_f = folder + key + '_' + model_name[:-4]
                        ga.get_activity(folder, alg, sv_folder=sv_f,
                                        model_name=model_name,
                                        **test_kwargs[key])
            elif train_mode == 'SL':
                stps_ep = sl_kwargs['steps_per_epoch']
                wraps_sl = deepc(wrappers_kwargs)
                wraps_sl.update(test_kwargs[key]['wrappers'])
                del wraps_sl['PassAction-v0']
                del wraps_sl['PassReward-v0']
                env = make_env(env_id=task, rank=0, seed=seed, wrapps=wraps_sl,
                               **task_kwargs)()
                obs_size = env.observation_space.shape[0]
                act_size = env.action_space.n
                model_test = define_model(seq_len=1, batch_size=1,
                                          obs_size=obs_size, act_size=act_size,
                                          stateful=sl_kwargs['stateful'],
                                          num_h=n_lstm, loss=sl_kwargs['loss'])
                ld_f = (folder + 'model_' + str(stps_ep) + '_steps').replace('//', '/')
                model_test.load_weights(ld_f)
                env.reset()
                for ind_stp in range(sl_kwargs['test_steps']):
                    obs = env.ob_now
                    obs = obs[np.newaxis]
                    obs = obs[np.newaxis]
                    action = model_test.predict(obs)
                    action = np.argmax(action, axis=-1)[0]
                    _, _, _, _ = env.step(action)
env = SubprocVecEnv([make_env(x) for x in range(num_envs)],
                    start_method='forkserver')
# env.get_valid_actions = lambda: np.array([e.get_valid_actions() for e in env.envs])
env.get_valid_actions = lambda: np.array(
    env.env_method('get_valid_actions'))

model = algo.MaskedPPO(CustomLSTMPolicy, env, verbose=1, n_steps=20,
                       nminibatches=batch_size,
                       tensorboard_log="../out/meta_opt/")
model.learn(total_timesteps=100000, log_interval=10)
model.save('meta_optimizer')

obs = env.reset()
state = None
total_rewards = 0
done = [False for _ in range(env.num_envs)]
for i in range(1000):
    action, _states = model.predict(obs, state=state, mask=done)
    obs, rewards, done, info = env.step(action)
    total_rewards += rewards
    # if done:
    #     break
print(total_rewards)
    target_vel=params["target_vel"],
    use_contacts=params["use_contacts"])

print("Testing")
policy_name = "QWZ"  # LX3: joints + contacts + yaw
policy_path = 'agents/SBL_{}'.format(policy_name)
model = A2C.load(policy_path)
print("Loading policy from: {}".format(policy_path))

obs = env.reset()
for _ in range(100):
    cum_rew = 0
    t1 = time.time()
    env_idx = 0
    for i in range(800):
        actions, _states = model.predict(np.tile(obs, (4, 1)),
                                         deterministic=True)
        obs, reward, done, info = env.step(actions[env_idx], render=True)
        cum_rew += reward
        # env.render(mode='human')
        # time.sleep(0.05)
        if done:
            t2 = time.time()
            print("Time taken for episode: {}".format(t2 - t1))
            obs = env.reset()
            print(cum_rew)
            break
env.close()
policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

if __name__ == '__main__':
    env = SubprocVecEnv([lambda: BaseEnv() for i in range(4)])
    env = VecFrameStack(env, 3)

    model = PPO2(get_policy(policy), env, verbose=0, nminibatches=1,
                 tensorboard_log=tensorboard_folder)
    model.learn(total_timesteps=100000000, tb_log_name='PPO2' + model_tag)
    model.save(model_folder + "PPO2" + model_tag)
    del model

    model = PPO2.load(model_folder + "PPO2" + model_tag)
    done = False
    states = None
    obs = env.reset()
    while not done:
        action, states = model.predict(obs, states)
        obs, _, done, info = env.step(action)
        env.render()
class PPO2_SB():
    def __init__(self):
        self.love = 'Ramona'
        self.env_fns = []
        self.env_names = []
        self.environs = [
            'SpringYardZone.Act3', 'SpringYardZone.Act2', 'GreenHillZone.Act3',
            'GreenHillZone.Act1', 'StarLightZone.Act2', 'StarLightZone.Act1',
            'MarbleZone.Act2', 'MarbleZone.Act1', 'MarbleZone.Act3',
            'ScrapBrainZone.Act2', 'LabyrinthZone.Act2', 'LabyrinthZone.Act1',
            'LabyrinthZone.Act3', 'SpringYardZone.Act1', 'GreenHillZone.Act2',
            'StarLightZone.Act3', 'ScrapBrainZone.Act1'
        ]
        self.environsv2 = ['1Player.Axel.Level1']
        self.generate_expert_traj = generate_expert_traj

    def create_envs(self, game_name, state_name, num_env):
        for i in range(num_env):
            self.env_fns.append(
                partial(make_env, game=game_name, state=state_name))
            self.env_names.append(game_name + '-' + state_name)
        self.env = SubprocVecEnv(self.env_fns)

    def train(self, game, state, num_e=1, n_timesteps=25000000, save='default2'):
        self.create_envs(game_name=game, state_name=state, num_env=num_e)
        # self.model = PPO2.load("default2", SubprocVecEnv(self.env_fns), policy=CnnPolicy, tensorboard_log="./sonic/")
        # self.model = PPO2(CnnPolicy, SubprocVecEnv(self.env_fns), learning_rate=1e-5, verbose=1, tensorboard_log="./sonic/")
        self.model = PPO2(policy=CnnPolicy,
                          env=SubprocVecEnv(self.env_fns),
                          n_steps=8192,
                          nminibatches=8,
                          lam=0.95,
                          gamma=0.99,
                          noptepochs=4,
                          ent_coef=0.001,
                          learning_rate=lambda _: 2e-5,
                          cliprange=lambda _: 0.2,
                          verbose=1,
                          tensorboard_log="./sonic/")
        self.model.learn(n_timesteps)
        self.model.save(save)
        self.model.learn(n_timesteps)
        self.model.save(save + '2')
        self.model.learn(n_timesteps)
        self.model.save(save + '3')
        self.model.learn(n_timesteps)
        self.model.save(save + '4')

    def evaluate(self, game, state, num_e=1, num_steps=14400):
        """
        Evaluate a RL agent
        :param model: (BaseRLModel object) the RL Agent
        :param num_steps: (int) number of timesteps to evaluate it
        :return: (float) Mean reward
        """
        self.create_envs(game_name=game, state_name=state, num_env=num_e)
        self.model = PPO2.load("default2",
                               SubprocVecEnv(self.env_fns),
                               policy=CnnPolicy,
                               tensorboard_log="./sonic/")
        episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
        obs = self.env.reset()
        for i in range(num_steps):
            # _states are only useful when using LSTM policies
            actions, _states = self.model.predict(obs)
            # here, actions, rewards and dones are arrays
            # because we are using a vectorized env
            obs, rewards, dones, info = self.env.step(actions)

            # Stats
            for i in range(self.env.num_envs):
                episode_rewards[i][-1] += rewards[i]
                if dones[i]:
                    episode_rewards[i].append(0.0)

        mean_rewards = [0.0 for _ in range(self.env.num_envs)]
        n_episodes = 0
        for i in range(self.env.num_envs):
            mean_rewards[i] = np.mean(episode_rewards[i])
            n_episodes += len(episode_rewards[i])

        # Compute mean reward
        mean_reward = np.mean(mean_rewards)
        print("Mean reward:", mean_reward, "Num episodes:", n_episodes)

        return mean_reward

    def pre_train(self):
        # Using only one expert trajectory
        # you can specify `traj_limitation=-1` for using the whole dataset
        dataset = ExpertDataset(expert_path='expert_cartpole.npz',
                                traj_limitation=1, batch_size=128)
        model = PPO2('MlpPolicy', 'CartPole-v1', verbose=1)
        # Pretrain the PPO2 model
        model.pretrain(dataset, n_epochs=1000)

        # As an option, you can train the RL agent
        # model.learn(int(1e5))

        # Test the pre-trained model
        env = model.get_env()
        obs = env.reset()

        reward_sum = 0.0
        for _ in range(1000):
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            reward_sum += reward
            env.render()
            if done:
                print(reward_sum)
                reward_sum = 0.0
                obs = env.reset()
        env.close()

    def gen_pre_train(self, game, state, num_e=1, save='default2', episodes=10):
        self.create_envs(game_name=game, state_name=state, num_env=num_e)
        env = SubprocVecEnv(self.env_fns)
        self.expert_agent = "moose"
        self.generate_expert_traj(self.expert_agent, save, env,
                                  n_episodes=episodes)
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init

if __name__ == '__main__':
    env_id = "HumanoidPyBulletEnv-v0"
    num_cpu = 1
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    model = PPO2.load("HumanoidPyBulletEnv-v0_PPO2_2020_11_3016_29_44")

    ob = env.reset()
    reward = 0
    while True:
        action, _states = model.predict(ob)
        ob, r, done, info = env.step(action)
        reward += r
        time.sleep(0.01)
        if done:
            ob = env.reset()
            print('r is {}'.format(r))
            print('Episode reward is {}'.format(reward))
            reward = 0
class PPO2_SB():
    def __init__(self):
        self.love = 'Ramona'
        self.env_fns = []
        self.env_names = []

    def make_env(self, env_id, rank, seed=0):
        """
        Utility function for a multiprocessed env.

        :param env_id: (str) the environment ID
        :param num_env: (int) the number of environments you wish to have in subprocesses
        :param seed: (int) the initial seed for RNG
        :param rank: (int) index of the subprocess
        """
        def _init():
            env = Template_Gym()
            env.seed(seed + rank)
            return env
        set_global_seeds(seed)
        return _init

    def train(self, num_e=1, n_timesteps=1000000, save='saves/agent4'):
        env_id = "default"
        num_e = 1  # Number of processes to use
        # Create the vectorized environment
        # env = DummyVecEnv([lambda: env])
        self.env = SubprocVecEnv(
            [self.make_env(env_id, i) for i in range(num_e)])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        # self.model = PPO2(policy=CnnPolicy,
        #                   env=SubprocVecEnv(self.env_fns),
        #                   n_steps=8192,
        #                   nminibatches=8,
        #                   lam=0.95,
        #                   gamma=0.99,
        #                   noptepochs=4,
        #                   ent_coef=0.001,
        #                   learning_rate=lambda _: 2e-5,
        #                   cliprange=lambda _: 0.2,
        #                   verbose=1,
        #                   tensorboard_log="./breakorbust")
        self.model = PPO2(CustomPolicy, env=self.env, verbose=0,
                          learning_rate=1e-5, tensorboard_log=save)

        for i in range(10):
            self.model.learn(n_timesteps)
            self.model.save(save)

    def evaluate(self, num_env=1, num_steps=14400):
        """
        Evaluate a RL agent
        :param model: (BaseRLModel object) the RL Agent
        :param num_steps: (int) number of timesteps to evaluate it
        :return: (float) Mean reward
        """
        env_id = "default"
        num_e = 1
        self.env = SubprocVecEnv(
            [self.make_env(env_id, i) for i in range(num_e)])
        self.model = PPO2.load('saves/agent.pkl', self.env,
                               policy=CustomPolicy,
                               tensorboard_log="./ppocnn/")
        episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
        obs = self.env.reset()
        for i in range(num_steps):
            # _states are only useful when using LSTM policies
            actions, _states = self.model.predict(obs)
            # here, actions, rewards and dones are arrays
            # because we are using a vectorized env
            obs, rewards, dones, info = self.env.step(actions)

            # Stats
            for i in range(self.env.num_envs):
                episode_rewards[i][-1] += rewards[i]
                if dones[i]:
                    episode_rewards[i].append(0.0)

        mean_rewards = [0.0 for _ in range(self.env.num_envs)]
        n_episodes = 0
        for i in range(self.env.num_envs):
            mean_rewards[i] = np.mean(episode_rewards[i])
            n_episodes += len(episode_rewards[i])

        # Compute mean reward
        mean_reward = np.mean(mean_rewards)
        print("Mean reward:", mean_reward, "Num episodes:", n_episodes)

        return mean_reward
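# Usage sketch (assumption, not part of the original source): driving the PPO2_SB
# wrapper above end to end. 'saves/agent4' matches the default save path in train();
# evaluate() loads 'saves/agent.pkl', so the copy/rename step noted below is a
# hypothetical bridging step, not something the class performs itself.
if __name__ == '__main__':
    agent = PPO2_SB()
    agent.train(num_e=1, n_timesteps=1000000, save='saves/agent4')
    # evaluate() expects the checkpoint at 'saves/agent.pkl'
    # (e.g. copy or rename the saved 'saves/agent4' checkpoint there first).
    mean_reward = agent.evaluate(num_env=1, num_steps=14400)
    print('Mean reward over evaluation:', mean_reward)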
env = SubprocVecEnv([make_env(env_id, log_dir, i + worker_id)
                     for i in range(num_env)])

model = ACKTR(MlpPolicy, env, verbose=1, ent_coef=0.)
model.learn(total_timesteps=30000)
model.save(log_dir + "model")

# evaluate agent
episodes = 100
ep_r = []
ep_l = []
for e in range(episodes):
    obs = env.reset()
    total_r = 0.
    total_l = 0.
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, infos = env.step(action)
        total_l += 1.
        total_r += rewards[0]
        if dones[0]:
            break
    ep_r.append(total_r)
    ep_l.append(total_l)
print("episode mean reward: {:0.3f} mean length: {:0.3f}".format(
    np.mean(ep_r), np.mean(ep_l)))
with open('{}_eval.pkl'.format(log_dir), 'wb') as f:
    pickle.dump(ep_r, f)
    pickle.dump(ep_l, f)
env.close()
import gym
import os

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import PPO2

from env import OsmoEnv

if __name__ == '__main__':
    env = SubprocVecEnv([lambda: OsmoEnv() for i in range(os.cpu_count())])

    # model = PPO2(MlpPolicy, env, verbose=1, learning_rate=1e-4)
    # model.learn(total_timesteps=25000)
    # model.save('PPO2_baselines')

    model = PPO2.load('PPO2_baselines')
    model.set_env(env)
    model.learning_rate = 1e-5
    model.learn(total_timesteps=30000)
    model.save('PPO2_baselines')

    env = OsmoEnv()
    for i in range(10):
        observation = env.reset()
        done = False
        while not done:
            action, _ = model.predict(observation)
            observation, _, done, info = env.step(action)
        else:
            print(info)
import tensorflow as tf
import gym
import random
import time

from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnLnLstmPolicy, CnnPolicy, CnnLstmPolicy
from stable_baselines import A2C

from register import create_env, create_network, create_scheduler, create_optimizer, create_agent

import matplotlib.pyplot as pp

if __name__ == '__main__':
    env = SubprocVecEnv([lambda: create_env(1)])  # for i in range(1)])
    obs = env.reset()
    done = False
    while not done:
        obs, _, done, _ = env.step([10])
        env.render()
    env.close()