import numpy as np

from stable_baselines import PPO2
from stable_baselines.common import set_global_seeds
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize
from stable_baselines.gail import ExpertDataset, generate_expert_traj

# Template_Gym and the CustomPolicy* classes are project-local; import them from
# wherever they are defined in this repo.


class PPO2_SB():
    def __init__(self):
        self.love = 'Ramona'
        self.env_fns = []
        self.env_names = []

    def make_env(self, env_id, rank, seed=0):
        """
        Utility function for a multiprocessed env.

        :param env_id: (str) the environment ID
        :param rank: (int) index of the subprocess
        :param seed: (int) the initial seed for RNG
        """
        def _init():
            env = Template_Gym()
            env.seed(seed + rank)
            return env
        set_global_seeds(seed)
        return _init

    def train(self, num_e=1, n_timesteps=1000000, save_fraction=0.1, save='saves/aud5'):
        env_id = "default"
        num_e = 1  # Number of processes to use
        # Create the vectorized environment
        #env = DummyVecEnv([lambda: env])  #Ramona
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        self.model = PPO2(CustomPolicy_4, self.env, verbose=0, learning_rate=1e-4,
                          nminibatches=1, tensorboard_log="./day1")
        #self.model = PPO2.load("default9", self.env, policy=CustomPolicy, tensorboard_log="./test/")
        # Split training into 1 / save_fraction loops and checkpoint after each one
        n_timesteps = int(n_timesteps * save_fraction)
        training_loop = int(1 / save_fraction)
        log_dir = "saves"
        for i in range(training_loop):
            self.model.learn(n_timesteps)
            self.model.save(save + str(i))
            self.env.save_running_average(log_dir)
        self.env.save_running_average(log_dir)

    def evaluate(self, num_env=1, num_steps=21900, load="saves/aud5", runs=10):
        """
        Evaluate a RL agent.

        :param num_steps: (int) number of timesteps to evaluate each checkpoint for
        :return: (float) mean reward of the last checkpoint evaluated
        """
        env_id = 'default'
        num_e = 1
        log_dir = "saves"
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_env)])
        #self.model = PPO2(CustomPolicy, self.env, verbose=1, learning_rate=1e-5, tensorboard_log="./default")
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        self.env.load_running_average(log_dir)
        for run in range(runs):
            self.model = PPO2.load(load + str(run), self.env, policy=CustomPolicy_4,
                                   tensorboard_log="./default/")
            self.env.load_running_average(log_dir)
            episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
            #self.total_pips = []
            obs = self.env.reset()
            state = None
            # When using VecEnv, done is a vector
            done = [False for _ in range(self.env.num_envs)]
            for _ in range(num_steps):
                # _states are only useful when using LSTM policies
                action, state = self.model.predict(obs, state=state, mask=done, deterministic=True)
                # action, rewards and dones are arrays because we are using a vectorized env
                obs, rewards, dones, _ = self.env.step(action)
                #self.total_pips.append(self.env.player.placement)
                # Stats
                for i in range(self.env.num_envs):
                    episode_rewards[i][-1] += rewards[i]
                    if dones[i]:
                        episode_rewards[i].append(0.0)
            mean_rewards = [0.0 for _ in range(self.env.num_envs)]
            n_episodes = 0
            for i in range(self.env.num_envs):
                mean_rewards[i] = np.mean(episode_rewards[i])
                n_episodes += len(episode_rewards[i])
            # Compute mean reward
            mean_reward = np.mean(mean_rewards)
            print("Mean reward:", mean_reward, "Num episodes:", n_episodes)
        return mean_reward

    def pre_train(self, num_e=1, load="saves/m19"):
        env_id = 'default'
        num_e = 1
        log_dir = "saves"
        # Using only one expert trajectory;
        # specify `traj_limitation=-1` to use the whole dataset
        dataset = ExpertDataset(expert_path='default2.npz', traj_limitation=1, batch_size=128)
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #model = GAIL("MlpPolicy", env=env, expert_dataset=dataset, verbose=1)
        self.env.load_running_average(log_dir)
        self.model = PPO2(CustomPolicy, self.env, verbose=1, nminibatches=1,
                          learning_rate=1e-5, tensorboard_log="./m1ln4")
        #self.model = PPO2.load("saves/m19", self.env, policy=CustomPolicy, tensorboard_log="./default/")
        self.env.load_running_average(log_dir)
        # Pretrain the PPO2 model (behaviour cloning on the expert dataset)
        self.model.pretrain(dataset, n_epochs=10000)
        # As an option, you can train the RL agent afterwards
        #self.model.learn(int(100000000))
        # Test the pre-trained model
        self.env = self.model.get_env()
        self.env.load_running_average(log_dir)
        obs = self.env.reset()
        reward_sum = 0.0
        for _ in range(1000000):
            action, _ = self.model.predict(obs)
            obs, reward, done, _ = self.env.step(action)
            reward_sum += reward
            #self.env.render()
            if done:
                print(reward_sum)
                reward_sum = 0.0
                obs = self.env.reset()
        self.env.close()

    def gen_pre_train(self, num_e=1, save='default2', episodes=1000):
        #self.create_envs(game_name=game, state_name=state, num_env=num_e)
        #self.env = SubprocVecEnv(self.env_fns)
        env_id = 'default'
        num_e = 1
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #model = GAIL("MlpPolicy", env=env, expert_dataset=dataset, verbose=1)
        self.env.load_running_average("saves")
        self.model = PPO2.load("saves/m19", self.env, policy=CustomPolicy, tensorboard_log="./default/")
        self.env.load_running_average("saves")
        # Record expert trajectories from the loaded model (still disabled here)
        #self.expert_agent = generate_expert_traj(self.model, save, self.env, n_episodes=episodes)
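# Minimal usage sketch for the class above (hypothetical run; assumes Template_Gym and
# CustomPolicy_4 are importable). train() writes 1 / save_fraction checkpoints with the
# given prefix, and evaluate() replays each of them with the same prefix.
if __name__ == '__main__':
    agent = PPO2_SB()
    # 10 checkpoints of 100k timesteps each: saves/aud50 ... saves/aud59
    agent.train(num_e=1, n_timesteps=1000000, save_fraction=0.1, save='saves/aud5')
    # Replay each checkpoint deterministically and print its mean reward
    agent.evaluate(num_env=1, num_steps=21900, load='saves/aud5', runs=10)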
# Later, config-driven revision of PPO2_SB: it expects the same imports as above plus
# datetime, time, and the project's `pc` module holding the per-pair/timeframe config
# objects (pc.configeurcad4h, pc.configgbpchf4h, ...).
class PPO2_SB():
    def __init__(self):
        self.love = 'Ramona'
        self.env_fns = []
        self.env_names = []

    def make_env(self, env_id, rank, seed=0, eval=False, config=pc.configeurcad4h):
        """
        Utility function for a multiprocessed env.

        :param env_id: (str) the environment ID
        :param rank: (int) index of the subprocess
        :param seed: (int) the initial seed for RNG
        :param eval: (bool) whether the environment should run on the evaluation split
        :param config: the pair/timeframe configuration object passed to Template_Gym
        """
        def _init():
            self.config = config
            self.eval = eval
            env = Template_Gym(config=self.config, eval=self.eval)
            env.seed(seed + rank)
            return env
        set_global_seeds(seed)
        return _init

    def train(self, num_e=1, n_timesteps=1000000, save_fraction=0.0125,
              save='saves/audbuyh4120', config=pc.configgbpchf4h):
        env_id = "default"
        num_e = 1  # Number of processes to use
        # Create the vectorized environment
        #env = DummyVecEnv([lambda: env])  #Ramona
        self.config = config
        self.env = SubprocVecEnv([self.make_env(env_id, i, eval=False, config=self.config)
                                  for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #self.model = PPO2(CustomPolicy_4, self.env, verbose=0, nminibatches=1, tensorboard_log="./gbp_chf_4h_r", **self.config.params)
        #self.model = PPO2(CustomPolicy_5, self.env, verbose=0, nminibatches=1, tensorboard_log="./aud_chf", learning_rate=1e-5)  #**self.config.params
        #self.model = PPO2.load('saves/playerdetails39', self.env, policy=CustomPolicy, tensorboard_log="./playerdetailsex")
        self.model = PPO2.load(self.config.path + str(79) + '.pkl', self.env,
                               policy=CustomPolicy_5, tensorboard_log="./default/")
        #self.model = PPO2.load("default9", self.env, policy=CustomPolicy, tensorboard_log="./test/")
        # Split training into 1 / save_fraction loops and checkpoint after each one
        n_timesteps = int(n_timesteps * save_fraction)
        training_loop = int(1 / save_fraction)
        log_dir = "saves"
        #self.env.load_running_average(log_dir)
        for i in range(training_loop):
            self.model.learn(n_timesteps)
            self.model.save(self.config.path + '8' + str(i))
            self.env.save_running_average(log_dir)
        self.env.save_running_average(log_dir)

    def evaluate(self, num_env=1, num_steps=1461, load='saves/audbuyh1', runs=80,
                 config=pc.configgbpchf4h):
        """
        Evaluate a RL agent.

        :param num_steps: (int) number of timesteps to evaluate each checkpoint for
        :return: (float) mean reward of the last checkpoint evaluated
        """
        env_id = config.year + config.pair
        num_e = 1
        self.config = config
        log_dir = self.config.log
        #log_dir = self.config.norm
        #self.env = SubprocVecEnv([self.make_env(env_id, i, eval=True) for i in range(num_env)])
        self.env = SubprocVecEnv([self.make_env(env_id, i, eval=True, config=self.config)
                                  for i in range(num_env)])
        #self.model = PPO2(CustomPolicy, self.env, verbose=1, learning_rate=1e-5, tensorboard_log="./default")
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        try:
            self.env.load_running_average(log_dir)
        except Exception:
            print('cannot load running average')
        for run in range(runs):
            #self.model = PPO2(CustomPolicy, self.env, verbose=0, learning_rate=1e-5, tensorboard_log="./moose14")
            #self.model = PPO2.load(self.config.path, self.env, policy=CustomPolicy_2, tensorboard_log="./default/")
            self.model = PPO2.load(self.config.path + '8' + str(run) + '.pkl', self.env,
                                   policy=CustomPolicy_5, tensorboard_log="./default/")
            #self.env.load_running_average(log_dir)
            episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
            #self.total_pips = []
            obs = self.env.reset()
            state = None
            # When using VecEnv, done is a vector
            done = [False for _ in range(self.env.num_envs)]
            for _ in range(num_steps):
                # _states are only useful when using LSTM policies
                action, state = self.model.predict(obs, state=state, mask=done, deterministic=True)
                # action, rewards and dones are arrays because we are using a vectorized env
                obs, rewards, dones, _ = self.env.step(action)
                #self.total_pips.append(self.env.player.placement)
                # Stats
                for i in range(self.env.num_envs):
                    episode_rewards[i][-1] += rewards[i]
                    if dones[i]:
                        episode_rewards[i].append(0.0)
            #self.env.save_running_average(log_dir)
            mean_rewards = [0.0 for _ in range(self.env.num_envs)]
            n_episodes = 0
            for i in range(self.env.num_envs):
                mean_rewards[i] = np.mean(episode_rewards[i])
                n_episodes += len(episode_rewards[i])
            # Compute mean reward
            mean_reward = np.mean(mean_rewards)
            print("Mean reward:", mean_reward, "Num episodes:", n_episodes)
        #self.env.save(log_dir)
        return mean_reward

    def live(self, num_env=1, num_steps=1461, load='saves/gbp_usd_buy', runs=1,
             config=pc.configgbpcad4h):
        """
        Run the best checkpoint against the live (broker-facing) environment.

        :param num_steps: (int) number of live steps to take
        :return: (float) mean reward of the last run
        """
        self.config = config
        env_id = self.config.pair
        num_e = 1
        log_dir = self.config.log
        self.config.live = True
        self.config.load = False
        self.env = SubprocVecEnv([self.make_env(env_id, i, eval=True, config=self.config)
                                  for i in range(num_env)])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        try:
            self.env.load_running_average(self.config.log)
        except Exception:
            print('cannot load running average')
        self.env.num_envs = 1
        for _ in range(runs):
            self.model = PPO2.load(self.config.path + str(self.config.best) + '.pkl', self.env,
                                   policy=CustomPolicy_5, tensorboard_log="./default/")
            episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
            print(datetime.datetime.now())
            print(time.ctime())
            print('Market time check')
            obs = self.env.reset()
            state = None
            # When using VecEnv, done is a vector
            done = [False for _ in range(self.env.num_envs)]
            for _ in range(num_steps):
                # _states are only useful when using LSTM policies
                print("live step")
                action, state = self.model.predict(obs, state=state, mask=done, deterministic=True)
                # action, rewards and dones are arrays because we are using a vectorized env
                obs, rewards, dones, _ = self.env.step(action)
                #self.total_pips.append(self.env.player.placement)
                # Stats
                for i in range(self.env.num_envs):
                    episode_rewards[i][-1] += rewards[i]
                    if dones[i]:
                        episode_rewards[i].append(0.0)
                print(datetime.datetime.now())
                print(time.ctime())
            mean_rewards = [0.0 for _ in range(self.env.num_envs)]
            n_episodes = 0
            for i in range(self.env.num_envs):
                mean_rewards[i] = np.mean(episode_rewards[i])
                n_episodes += len(episode_rewards[i])
            # Compute mean reward
            mean_reward = np.mean(mean_rewards)
            print("Mean reward_gbp_buy:", mean_reward, "Num episodes:", n_episodes)
        return mean_reward

    # The two functions below are not working at the moment
    def pre_train(self, num_e=1, load="saves/m19"):
        env_id = 'default'
        num_e = 1
        log_dir = "saves"
        # Using only one expert trajectory;
        # specify `traj_limitation=-1` to use the whole dataset
        dataset = ExpertDataset(expert_path='default2.npz', traj_limitation=1, batch_size=128)
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #env = make_env()
        #model = GAIL("MlpPolicy", env=env, expert_dataset=dataset, verbose=1)
        self.env.save_running_average("saves" + self.config.pair)
        self.model = PPO2(CustomPolicy, self.env, verbose=1, nminibatches=1,
                          learning_rate=1e-5, tensorboard_log="./m1ln4")
        #self.model = PPO2.load("saves/m19", self.env, policy=CustomPolicy, tensorboard_log="./default/")
        self.env.save_running_average("saves" + self.config.pair)
        # Pretrain the PPO2 model (behaviour cloning on the expert dataset)
        self.model.pretrain(dataset, n_epochs=10000)
        # As an option, you can train the RL agent afterwards
        #self.model.learn(int(100000000))
        # Test the pre-trained model
        self.env = self.model.get_env()
        self.env.save_running_average("saves" + self.config.pair)
        obs = self.env.reset()
        reward_sum = 0.0
        for _ in range(1000000):
            action, _ = self.model.predict(obs)
            obs, reward, done, _ = self.env.step(action)
            reward_sum += reward
            #self.env.render()
            if done:
                print(reward_sum)
                reward_sum = 0.0
                obs = self.env.reset()
        self.env.close()

    def gen_pre_train(self, num_e=1, save='default2', episodes=1000):
        #self.create_envs(game_name=game, state_name=state, num_env=num_e)
        #self.env = SubprocVecEnv(self.env_fns)
        env_id = 'default'
        num_e = 1
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #env = make_env()
        #model = GAIL("MlpPolicy", env=env, expert_dataset=dataset, verbose=1)
        self.env.load_running_average("saves")
        self.model = PPO2.load("saves/m19", self.env, policy=CustomPolicy, tensorboard_log="./default/")
        self.env.load_running_average("saves")
        #env = make_env()
        #self.expert_agent = generate_expert_traj(self.model, save, self.env, n_episodes=episodes)

    def gen_pre_train_2(self, game, state, num_e=1, save='default2', episodes=10):
        self.create_envs(game_name=game, state_name=state, num_env=num_e)
        env = SubprocVecEnv(self.env_fns)
        self.expert_agent = "moose"
        self.generate_expert_traj(self.expert_agent, save, env, n_episodes=episodes)
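# Hypothetical usage sketch for the config-driven class above. `pc.configgbpchf4h` is one
# of the project's pair/timeframe config objects; with the defaults shown, train() resumes
# from `config.path + '79.pkl'` and writes checkpoints `config.path + '80'` through
# `config.path + '879'`, which evaluate() then replays on the eval split.
if __name__ == '__main__':
    agent = PPO2_SB()
    agent.train(num_e=1, n_timesteps=1000000, save_fraction=0.0125, config=pc.configgbpchf4h)
    agent.evaluate(num_env=1, num_steps=1461, runs=80, config=pc.configgbpchf4h)
    # live() loads the checkpoint indexed by config.best and steps the broker-facing env
    # agent.live(num_env=1, num_steps=1461, runs=1, config=pc.configgbpcad4h)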
# Roll out a trained agent in the RLDock environment, dumping each rendered pose to a PDB
# file and writing a PyMOL script ('run.pml') that loads the trajectory. `model` is assumed
# to be a trained stable-baselines model and `env` the RLDock gym environment.
obs = env.reset()
states = []
fp_path = '/Users/austin/PycharmProjects/RLDock/'
with open('run.pml', 'w') as fp:
    # Frame 0: the initial pose
    i = 0
    with open('pdbs_traj/test' + str(i) + '.pdb', 'w') as f:
        cur_m = env.render()
        f.write(cur_m.toPDB())
    fp.write("load " + fp_path + 'pdbs_traj/test' + str(i) + '.pdb ')
    fp.write(", ligand, " + str(i + 1) + "\n")

    for i in range(1, 100):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        print(action, rewards, done)

        atom = env.render()
        header = atom.dump_header()
        states.append(atom.dump_coords())
        cur_m = atom
        with open('pdbs_traj/test' + str(i) + '.pdb', 'w') as f:
            f.write(cur_m.toPDB())
        fp.write("load " + fp_path + 'pdbs_traj/test' + str(i) + '.pdb ')
        fp.write(", ligand, " + str(i + 1) + "\n")

        if done:
            obs = env.reset()
env.close()
# Training utility. Assumed imports (stable-baselines 2.x paths); get_paths, get_env,
# get_alg, get_policy and create_training_callback are project-local helpers.
import os
import numpy as np

from stable_baselines import logger
from stable_baselines.bench import Monitor
from stable_baselines.common import set_global_seeds
from stable_baselines.common.misc_util import mpi_rank_or_zero
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize


def train(params, model=None, env=None):
    print("Training Parameters: ", params)

    data_dir, tb_path = get_paths(params)
    os.makedirs(data_dir, exist_ok=True)
    # Save parameters immediately
    params.save(data_dir)

    rank = mpi_rank_or_zero()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create the environment if not given
    if env is None:
        def make_env(i):
            env = get_env(params)
            print("ENV IN UTIL", env)
            # TODO: make the monitor work for multiple agents.
            env = Monitor(env, data_dir + '/' + str(i), allow_early_resets=params['early_reset'])
            return env
        # if 'PPO' in params['alg']:
        #     env = DummyVecEnv([(lambda n: lambda: make_env(n))(i) for i in range(params['num_proc'])])
        # else:
        #     env = make_env(0)
        env = make_env(0)
        if params['normalize']:
            env = VecNormalize(env)

    # Set the seeds
    if params['seed']:
        seed = params['seed'] + 100000 * rank
        set_global_seeds(seed)
        params['alg_args']['seed'] = seed

    if 'noise' in params and params['noise']:
        from stable_baselines.ddpg import OrnsteinUhlenbeckActionNoise
        n_actions = env.action_space.shape[-1]
        params['alg_args']['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions), sigma=float(params['noise']) * np.ones(n_actions))

    print("ENV", env, env.action_space)

    if model is None:
        alg = get_alg(params)
        policy = get_policy(params)
        model = alg(policy, env, verbose=1, tensorboard_log=tb_path,
                    policy_kwargs=params['policy_args'], **params['alg_args'])
    else:
        model.set_env(env)

    print("\n===============================\n")
    print("TENSORBOARD PATH:", tb_path)
    print("\n===============================\n")

    model.learn(total_timesteps=params['timesteps'], log_interval=params['log_interval'],
                callback=create_training_callback(data_dir, params, env,
                                                  freq=params['eval_freq'],
                                                  checkpoint_freq=params['checkpoint_freq']))

    print("Saving model to", data_dir)
    model.save(data_dir + '/final_model')
    if params['normalize']:
        env.save(data_dir + '/environment.pkl')
    env.close()
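# Hypothetical invocation sketch: `params` is assumed to be the project's dict-like
# parameter object (item access plus a .save(dir) method, as used in train() above).
# The keys and values below only mirror those referenced in train(); the real names
# come from the project's config system, not from stable-baselines.
class Params(dict):
    def save(self, directory):
        # Placeholder: the real object presumably serialises itself into `directory`.
        pass

params = Params(alg='PPO2', seed=7, normalize=True, early_reset=True,
                timesteps=100000, log_interval=10, eval_freq=10000,
                checkpoint_freq=50000, noise=None,
                policy_args={}, alg_args={})
# train(params)  # requires the project-local get_paths/get_env/get_alg/get_policy helpers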