def main():
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    if not USE_LOADED_MODEL:
        model = ACKTR('MlpPolicy', env, verbose=1)

        # Multiprocessed RL Training
        start_time = time.time()
        model.learn(total_timesteps=n_timesteps, log_interval=10)
        total_time_multi = time.time() - start_time
        model.save("cartpole_v1_acktr")

    loaded_model = ACKTR.load("cartpole_v1_acktr")
    loaded_model.set_env(env)

    # Single Process RL Training
    single_process_model = ACKTR('MlpPolicy', env_id, verbose=1)
    start_time = time.time()
    single_process_model.learn(n_timesteps)
    total_time_single = time.time() - start_time

    print("Single-process: {0}s, Multi-process: {1}s".format(
        total_time_single, total_time_multi))

    # Create a separate, clean environment for evaluation
    eval_env = gym.make(env_id)
    mean_reward, std_reward = evaluate_policy(loaded_model, eval_env, n_eval_episodes=10)
    print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')
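# make_env(), num_cpu, n_timesteps and USE_LOADED_MODEL are used above but not
# defined in this snippet. A minimal sketch following the standard
# stable-baselines multiprocessing helper; the constant values and env_id are
# assumptions, not taken from the original.
import gym
from stable_baselines.common import set_global_seeds

num_cpu = 4              # assumed number of parallel processes
n_timesteps = 25000      # assumed training budget
USE_LOADED_MODEL = False
env_id = 'CartPole-v1'   # assumed from the saved file name "cartpole_v1_acktr"

def make_env(env_id, rank, seed=0):
    """Return a thunk that builds one seeded copy of the environment."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init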
if __name__ == "__main__": env_id = "/home/jim/projects/unity_ray/basic_env_linux/basic_env_linux" #env = UnityEnv(env_id, worker_id=2, use_visual=False) # Create log dir time_int = int(time.time()) log_dir = "stable_results/basic_env_{}/".format(time_int) os.makedirs(log_dir, exist_ok=True) #env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized environment to run num_env = 2 worker_id = 9 env = SubprocVecEnv([make_env(env_id, log_dir, i+worker_id) for i in range(num_env)]) model = ACKTR(MlpPolicy, env, verbose=1, ent_coef=0.) model.learn(total_timesteps=30000) model.save(log_dir+"model") #evaluate agent episodes = 100 ep_r = [] ep_l = [] for e in range(episodes): obs = env.reset() total_r = 0. total_l = 0. while True: action, _states = model.predict(obs) obs, rewards, dones, infos = env.step(action) total_l += 1. total_r += rewards[0] if dones[0]:
def get_reward(r):
    if r == 'magni_reward':
        return magni_reward
    elif r == 'cameron_reward':
        return cameron_reward
    elif r == 'risk_event':
        return risk_event
    elif r == 'reward_target':
        return reward_target
    elif r == 'default':
        return risk_diff
    else:
        assert False, ("Reward must be valid ('magni_reward', 'cameron_reward', "
                       "'risk_event', 'reward_target', 'default')")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-g", "--age_group", default="adult")
    parser.add_argument("-r", "--reward", default="default")
    args = parser.parse_args()

    group = get_group(args.age_group)
    reward_fun = get_reward(args.reward)

    env = DummyVecEnv([
        make_env(group + '#0{}'.format(str(i).zfill(2)), i, reward_fun)
        for i in range(1, 11)
    ])

    model = ACKTR(MlpLstmPolicy, env, verbose=1)
    model.learn(total_timesteps=256000)
    model.save("ACKTR_MlpLSTM_" + group + "_def_reward")
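# make_env(patient_name, rank, reward_fun) is referenced above but defined
# elsewhere. A sketch of one plausible implementation, combining the gym
# registration pattern used by the other simglucose snippets in this file;
# this is an assumption, not the original helper.
import gym
from gym.envs.registration import register
from stable_baselines.common import set_global_seeds

def make_env(patient_name, rank, reward_fun, seed=0):
    def _init():
        env_id = 'simglucose-{}-v0'.format(patient_name)
        register(id=env_id,
                 entry_point='simglucose.envs:T1DSimEnv',
                 kwargs={'patient_name': patient_name,
                         'reward_fun': reward_fun})
        env = gym.make(env_id)
        env.seed(seed + rank)
        print(env_id)
        return env
    set_global_seeds(seed)
    return _init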
    fBG = 3.5506 * (np.log(bg)**.8353 - 3.7932)
    risk = 10 * (fBG)**2
    return -1 * risk


def cameron_reward(bg_hist, **kwargs):
    bg = bg_hist[-1]
    a = .2370      # 1/(mg/dL)
    b = -36.21
    c = 6.0e-5     # 1/(mg/dL)**3
    d = 177        # mg/dL
    if bg < d:
        risk = a*bg + b + (c*(d - bg)**3)
    else:
        risk = a*bg + b
    return -1 * risk


person_options = (['child#0{}'.format(str(i).zfill(2)) for i in range(1, 11)]
                  + ['adolescent#0{}'.format(str(i).zfill(2)) for i in range(1, 11)]
                  + ['adult#0{}'.format(str(i).zfill(2)) for i in range(1, 11)])

for i, p in enumerate(person_options):
    patient_id = p.split('#')[0] + str(i + 1)
    # Create a simulation environment
    print(p)
    patient = T1DPatient.withName(p)
    register(id='simglucose-' + p + '-v0',
             entry_point='simglucose.envs:T1DSimEnv',
             kwargs={'patient_name': p, 'reward_fun': reward_target})
    env = gym.make('simglucose-' + p + '-v0')
    model = ACKTR(MlpLstmPolicy, env, verbose=1)
    model.learn(total_timesteps=250000)
    model.save('mlplstm_trained-' + p + '-reward_target')
    print('Model Trained and Saved for: ' + p)
# env = CustomEnv(3, 6, "tcp://*:5556")

# Stable Baselines provides you with the make_vec_env() helper,
# which does exactly the previous steps for you:
# env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

# Create log dir
log_dir = "Logs/Custom_env/"
os.makedirs(log_dir, exist_ok=True)

# Create the callback: check every 500 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=500, log_dir=log_dir)
# env = Monitor(env, log_dir)

model = ACKTR(MlpPolicy, env, verbose=2)
# model.load("DQN_agent")
model.learn(total_timesteps=20000, callback=callback)
model.save("temp_agent")

a = input("Training completed")

obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    probs = model.action_probability(obs)
    obs, rewards, dones, info = env.step(action)
    print("Observation:", obs, "Reward:", rewards, "Action probabilities:", probs)

results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS, "Lane Manager")
plt.show()
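# SaveOnBestTrainingRewardCallback is used above (and in other snippets in this
# file) but not defined. A minimal sketch following the callback example from
# the stable-baselines documentation; the save_name argument used elsewhere in
# this file is an extra assumption.
import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq, log_dir, save_name='best_model', verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, save_name)
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # Mean reward over the last 100 episodes recorded by Monitor
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True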
env = VecFrameStack(env, 3)

model = ACKTR(get_policy(policy), env,
              n_steps=100,
              verbose=0,
              gae_lambda=0.95,
              vf_fisher_coef=0.5,
              tensorboard_log=tensorboard_folder,
              kfac_update=10,
              n_cpu_tf_sess=2,
              async_eigen_decomp=False)
model.learn(total_timesteps=100000000, tb_log_name='ACKTR_PPO2' + model_tag)
model.save(model_folder + "ACKTR_PPO2" + model_tag)

del model
model = ACKTR.load(model_folder + "ACKTR_PPO2" + model_tag)

done = False
states = None
action_masks = []
obs = env.reset()
while not done:
    action, states = model.predict(obs, states, action_mask=action_masks)
    obs, _, done, infos = env.step(action)
    env.render()

    action_masks.clear()
    for info in infos:
        env_action_mask = info.get('action_mask')
        gamma=config['gamma'],
        policy_kwargs=config['policy_kwargs'],
        verbose=1,
        tensorboard_log=save_path)
elif config['algorithm'] == 'PPO2':
    env = make_vec_env(lambda: env, n_envs=1)
    model = PPO2(config['policy_network'],
                 env,
                 learning_rate=config['learning_rate'],
                 gamma=config['gamma'],
                 policy_kwargs=config['policy_kwargs'],
                 verbose=1,
                 tensorboard_log=save_path)
elif config['algorithm'] == 'DQN':
    model = DQN(config['policy_network'],
                env,
                learning_rate=config['learning_rate'],
                buffer_size=config['buffer_size'],
                target_network_update_freq=64,
                gamma=config['gamma'],
                # policy_kwargs=config['policy_kwargs'],
                verbose=1,
                tensorboard_log=save_path)

model.learn(config['total_steps'], callback=callback)
model.save(os.path.join(save_path, 'model'))
env.close()
        print(env_id)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == "__main__":
    env = DummyVecEnv([
        make_env('adolescent#0{}'.format(str(i).zfill(2)), i)
        for i in range(1, 11)
    ])

    # model = SAC(LnMlpPolicy, env, verbose=1)
    model = ACKTR(MlpLstmPolicy, env, verbose=1)
    model.learn(total_timesteps=256000)
    model.save("ACKTR_MlpLSTM_adolescent_def_reward")

    # for i, p in enumerate(child_options):
    #     patient_id = p.split('#')[0] + str(i + 1)
    #     register(
    #         id='simglucose-' + patient_id + '-v0',
    #         entry_point='simglucose.envs:T1DSimEnv',
    #         kwargs={'patient_name': p}
    #     )
    #     env = gym.make('simglucose-' + patient_id + '-v0')
    #     model = SAC(LnMlpPolicy, env, verbose=1)
    #     print(p, patient_id)
    #     model.learn(total_timesteps=250000)
    #     print("Finished training for " + patient_id)
# Log dir
log_dir = "./tmp10/"
os.makedirs(log_dir, exist_ok=True)

callback = SaveOnBestTrainingRewardCallback(check_freq=10000, log_dir=log_dir, save_name="acktr")

env = Manipulator2D()
# multiprocess environment
# env = make_vec_env('CartPole-v1', n_envs=4)
env = Monitor(env, log_dir)

# Custom MLP policy of two layers of size 32 each with tanh activation function
# policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[32, 32])

# Create the agent
# model = PPO2(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs)
# model = PPO2(MlpPolicy, env, verbose=1)

# Train the agent
model = ACKTR(MlpPolicy, env, verbose=1, ent_coef=0.0)
# 3e5
# policy: 'MlpPolicy'
# ent_coef: 0.0
model.learn(total_timesteps=20000000, callback=callback)

# Save the agent
model.save("acktr-man1")
# del model
# # the policy_kwargs are automatically loaded
# model = PPO2.load("ppo2-cartpole")
class ACKTR_Agent:
    def __init__(self, params: Params):
        self.params: Params = params
        policy_name = self.params.agent_config['policy']
        self.policy = eval(policy_name)

    def create_model(self, n_envs=1):
        """ Create env and agent model """
        env_cls = SprEnv
        self.env = make_vec_env(env_cls,
                                n_envs=n_envs,
                                env_kwargs={"params": self.params},
                                seed=self.params.seed)
        self.model = ACKTR(
            self.policy,
            self.env,
            gamma=self.params.agent_config['gamma'],
            n_steps=self.params.agent_config['n_steps'],
            ent_coef=self.params.agent_config['ent_coef'],
            vf_coef=self.params.agent_config['vf_coef'],
            vf_fisher_coef=self.params.agent_config['vf_fisher_coef'],
            max_grad_norm=self.params.agent_config['max_grad_norm'],
            learning_rate=self.params.agent_config['learning_rate'],
            gae_lambda=self.params.agent_config['gae_lambda'],
            lr_schedule=self.params.agent_config['lr_schedule'],
            kfac_clip=self.params.agent_config['kfac_clip'],
            kfac_update=self.params.agent_config['kfac_update'],
            async_eigen_decomp=self.params.agent_config['async_eigen_decomp'],
            verbose=self.params.agent_config['verbose'],
            tensorboard_log="./tb/acktr/",
            seed=self.params.seed,
            policy_kwargs={"params": self.params})

    def train(self):
        with ProgressBarManager(self.params.training_duration) as callback:
            self.model.learn(total_timesteps=self.params.training_duration,
                             tb_log_name=self.params.tb_log_name,
                             callback=callback)

    def test(self):
        self.params.test_mode = True
        obs = self.env.reset()
        self.setup_writer()
        episode = 1
        step = 0
        episode_reward = [0.0]
        done = False
        # Test for 1 episode
        while not done:
            action, _states = self.model.predict(obs)
            obs, reward, dones, info = self.env.step(action)
            episode_reward[episode - 1] += reward[0]
            if info[0]['sim_time'] >= self.params.testing_duration:
                done = True
                self.write_reward(episode, episode_reward[episode - 1])
                episode += 1
            sys.stdout.write(
                "\rTesting: " +
                f"Current Simulator Time: {info[0]['sim_time']}. "
                f"Testing duration: {self.params.testing_duration}")
            sys.stdout.flush()
            step += 1
        print("")

    def save_model(self):
        """ Save the model to a zip archive """
        self.model.save(self.params.model_path)

    def load_model(self, path=None):
        """ Load the model from a zip archive """
        if path is not None:
            self.model = ACKTR.load(path)
        else:
            self.model = ACKTR.load(self.params.model_path)
        # Copy the model to the new directory
        self.model.save(self.params.model_path)

    def setup_writer(self):
        episode_reward_filename = f"{self.params.result_dir}/episode_reward.csv"
        episode_reward_header = ['episode', 'reward']
        self.episode_reward_stream = open(episode_reward_filename, 'a+', newline='')
        self.episode_reward_writer = csv.writer(self.episode_reward_stream)
        self.episode_reward_writer.writerow(episode_reward_header)

    def write_reward(self, episode, reward):
        self.episode_reward_writer.writerow([episode, reward])
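# ProgressBarManager is used in ACKTR_Agent.train() but not defined here. A
# minimal sketch based on the tqdm progress-bar callback from the
# stable-baselines callback tutorial; assumed, not the original implementation.
from tqdm.auto import tqdm
from stable_baselines.common.callbacks import BaseCallback

class ProgressBarCallback(BaseCallback):
    """Update a tqdm progress bar with the current number of timesteps."""
    def __init__(self, pbar):
        super(ProgressBarCallback, self).__init__()
        self._pbar = pbar

    def _on_step(self):
        self._pbar.n = self.num_timesteps
        self._pbar.update(0)
        return True

class ProgressBarManager:
    """Context manager that yields a callback and closes the bar afterwards."""
    def __init__(self, total_timesteps):
        self.pbar = None
        self.total_timesteps = total_timesteps

    def __enter__(self):
        self.pbar = tqdm(total=self.total_timesteps)
        return ProgressBarCallback(self.pbar)

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pbar.n = self.total_timesteps
        self.pbar.update(0)
        self.pbar.close()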
#     index = np.argmin(best_mean_reward)
#     if mean_reward > best_mean_reward[index]:
#         best_mean_reward[index] = mean_reward
#         print('best_mean_reward', best_mean_reward)
#         _locals['self'].save(log_dir + 'best_model_{}.pkl'.format(str(mean_reward)))
#     n_steps += 1
#     return False

# log_dir = 'LiveStream_1229/ACKTRCust3_deletem8_zhongwang_diff_delay/'
log_dir = 'ACKTRtest/'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

tstart = time.time()
num_cpu = 2
env = SubprocVecEnv([make_env(i, log_dir) for i in range(num_cpu)])

model = ACKTR(
    env=env,
    policy=LstmCust3Policy,
    verbose=1,
)
model.learn(total_timesteps=int(5e6), callback=callback)
model.save(log_dir + "last_model")
print('Time taken: {:.2f}'.format(time.time() - tstart))
def train(environment, algorithm, timesteps):
    from envs import cpa, mountain_car
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.bench import Monitor
    from stable_baselines import PPO2, ACKTR, DQN, A2C

    now = datetime.now()
    current_time = now.strftime("%Y-%m-%d-%H-%M-%S")

    training_info_dir = "training_info" + os.path.sep
    current_training_info = "{}-{}-{}".format(current_time, algorithm, environment)
    current_training_info_dir = training_info_dir + current_training_info + os.path.sep
    model_file_path = current_training_info_dir + "model"
    log_file_path = current_training_info_dir + "monitor.csv"
    tensorboard_dir = training_info_dir + TENSORBOARD_DIR_NAME + os.path.sep

    dirs_to_create = [model_file_path, tensorboard_dir]
    for directory in dirs_to_create:
        create_dir(directory)

    env = None
    if environment == 'cpa_sparse':
        env = cpa.CPAEnvSparse()
    elif environment == 'cpa_dense':
        env = cpa.CPAEnvDense()
    elif environment == 'mc_sparse':
        env = mountain_car.MountainCarSparseEnv()
    elif environment == 'mc_dense':
        env = mountain_car.MountainCarDenseEnv()
    else:
        raise Exception("Environment '{}' is unknown.".format(environment))

    # Optional: PPO2 requires a vectorized environment to run;
    # the env is wrapped automatically when passed to the constructor.
    env = Monitor(env, filename=log_file_path, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = None
    if algorithm == 'acktr':
        model = ACKTR('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    elif algorithm == 'ppo':
        model = PPO2('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    elif algorithm == 'a2c':
        model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    elif algorithm == 'dqn':
        model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    else:
        raise Exception("Algorithm '{}' is unknown.".format(algorithm))

    # Train the agent
    model.learn(total_timesteps=timesteps, tb_log_name=current_training_info)
    model.save(model_file_path)
    print("Finished training model: {}. Saved training info in: {}".format(
        model, current_training_info_dir))
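# create_dir() and TENSORBOARD_DIR_NAME are referenced above but defined
# elsewhere in the original module; minimal assumed definitions:
import os

TENSORBOARD_DIR_NAME = "tensorboard"  # assumed directory name

def create_dir(path):
    """Create the directory (and any parents) if it does not already exist."""
    os.makedirs(path, exist_ok=True)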
import gym
gym.logger.set_level(40)
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from env import GoLeftEnv
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.vec_env import VecVideoRecorder, DummyVecEnv
from stable_baselines.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

env = GoLeftEnv(grid_size=10)
env = make_vec_env(lambda: env, n_envs=1)

# Stop training once the mean evaluation reward reaches 0.9
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=0.9, verbose=1)
eval_callback = EvalCallback(env, callback_on_new_best=callback_on_best, verbose=1)

model = ACKTR('MlpPolicy', env, verbose=1)
# The timestep budget is effectively unlimited; the callback ends training early.
model.learn(int(1e10), callback=eval_callback)

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

model.save('models/best')
env.close()
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv(10, 10)])

model = ACKTR(get_policy(policy), env, verbose=0, tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='ACKTR_A2C' + model_tag)
model.save(model_folder + "ACKTR_A2C" + model_tag)

del model
model = ACKTR.load(model_folder + "ACKTR_A2C" + model_tag)

done = False
states = None
obs = env.reset()
while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
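# get_policy() is used above (and in the VecFrameStack snippet earlier in this
# file) but not defined. A sketch of one plausible mapping from the
# command-line tag to a stable-baselines policy class; the names and fallback
# are assumptions, not the original function.
from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, CnnPolicy

def get_policy(name):
    policies = {
        '': MlpPolicy,       # default when no argument is given
        'mlp': MlpPolicy,
        'lstm': MlpLstmPolicy,
        'cnn': CnnPolicy,
    }
    return policies.get(name, MlpPolicy)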
    model.learn(total_timesteps=1000000, reset_num_timesteps=False, callback=callback)
    model.save(log_dir + 'model_PPO_' + str(id + 1))

if args.algo == "acktr":
    id = balboa.utils.tensorboard_latest_directory_number(log_dir, 'ACKTR_')
    print('Using acktr')
    if args.load_id is None:
        # tensorboard_log=log_dir
        model = ACKTR("MlpPolicy", env,
                      policy_kwargs=policy_kwargs,
                      ent_coef=0.0,
                      verbose=1)
        # verbose=1, n_steps=48, learning_rate=0.1, lr_schedule='constant',
    else:
        print("Loading model: " + str(args.load_id))
        model = ACKTR.load(log_dir + 'ACKTR_' + str(args.load_id) + ".zip", env=env)
        model.tensorboard_log = log_dir
        # model.learning_rate = stable_baselines.common.schedules.LinearSchedule(1.0, 0.06, initial_p=0.06).value
        # model.cliprange = stable_baselines.common.schedules.LinearSchedule(1.0, 0.2, initial_p=0).value

    model.learn(total_timesteps=3000000, reset_num_timesteps=False, callback=callback)

    print("Saving to: " + log_dir + 'model_ACKTR_' + str(id + 1))
    model.save(log_dir + 'model_ACKTR_' + str(id + 1))
import gym

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import ACKTR

# multiprocess environment
n_cpu = 4
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = ACKTR(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("acktr_cartpole")

del model  # remove to demonstrate saving and loading

model = ACKTR.load("acktr_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
        env.seed(seed)
        print(env_id)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == "__main__":
    env = DummyVecEnv([
        make_env('adult#0{}'.format(str(i).zfill(2)), i)
        for i in range(1, 11)
    ])

    # model = SAC(LnMlpPolicy, env, verbose=1)
    model = ACKTR(MlpLstmPolicy, env, verbose=1)
    model.learn(total_timesteps=256000)
    model.save("ACKTR_MlpLSTM_adult_def_reward")

    # for i, p in enumerate(child_options):
    #     patient_id = p.split('#')[0] + str(i + 1)
    #     register(
    #         id='simglucose-' + patient_id + '-v0',
    #         entry_point='simglucose.envs:T1DSimEnv',
    #         kwargs={'patient_name': p}
    #     )
    #     env = gym.make('simglucose-' + patient_id + '-v0')
    #     model = SAC(LnMlpPolicy, env, verbose=1)
    #     print(p, patient_id)
    #     model.learn(total_timesteps=250000)
    #     print("Finished training for " + patient_id)