def build(self): """ This function builds the ACER agent based on the selected mode and runs it according to: 1- Initializes the env 2- If mode train is selected, train the model from scratch, learn, and save. 3- If mode continue is selected, provide a path for pretrained model, load the model, learn, and save. 4- If mode test is selected, provide a path fror pretrained model, load the model and test. """ # Create the vectorized environment if self.inp.acer_dict['ncores'][0] > 1: self.env = SubprocVecEnv([ self.make_env(self.inp.gen_dict['env'][0], i) for i in range(self.inp.acer_dict['ncores'][0]) ], daemon=self.inp.gen_dict['daemon'][0]) else: self.env = gym.make(self.inp.gen_dict['env'][0], casename=self.inp.acer_dict['casename'][0], log_dir=self.log_dir, exepath=self.inp.gen_dict['exepath'][0], env_data=self.inp.gen_dict['env_data'][0], env_seed=1) #tensorboard activation (if used) #to view tensorboard type #tensorboard --logdir=./log_dir/{self.casename}_tensorlog if self.inp.acer_dict['tensorboard'][0]: tensorboard_log = self.log_dir + '{}_tensorlog'.format( self.inp.acer_dict['casename'][0]) else: tensorboard_log = None if self.mode == 'train': # Train from scratch, initialize the model and then learn, and save the last model. # Callbacks are used if provided model = ACER( MlpPolicy, self.env, n_steps=self.inp.acer_dict['n_steps'][0], gamma=self.inp.acer_dict['gamma'][0], learning_rate=self.inp.acer_dict['learning_rate'][0], q_coef=self.inp.acer_dict['q_coef'][0], max_grad_norm=self.inp.acer_dict['max_grad_norm'][0], ent_coef=self.inp.acer_dict['ent_coef'][0], alpha=self.inp.acer_dict['alpha'][0], lr_schedule=self.inp.acer_dict['lr_schedule'][0], rprop_alpha=self.inp.acer_dict['rprop_alpha'][0], rprop_epsilon=self.inp.acer_dict['rprop_epsilon'][0], buffer_size=self.inp.acer_dict['buffer_size'][0], replay_ratio=self.inp.acer_dict['replay_ratio'][0], replay_start=self.inp.acer_dict['replay_start'][0], correction_term=self.inp.acer_dict['correction_term'][0], trust_region=self.inp.acer_dict['trust_region'][0], delta=self.inp.acer_dict['delta'][0], verbose=1, tensorboard_log=tensorboard_log, seed=2, n_cpu_tf_sess=1) model.learn(total_timesteps=self.inp.acer_dict['time_steps'][0], callback=self.callback) model.save(self.log_dir + self.inp.acer_dict['casename'][0] + '_model_last.pkl') if self.mode == 'continue': # load, contine learning, and save last model model = ACER.load( self.inp.acer_dict['model_load_path'][0], env=self.env, n_steps=self.inp.acer_dict['n_steps'][0], gamma=self.inp.acer_dict['gamma'][0], learning_rate=self.inp.acer_dict['learning_rate'][0], q_coef=self.inp.acer_dict['q_coef'][0], max_grad_norm=self.inp.acer_dict['max_grad_norm'][0], ent_coef=self.inp.acer_dict['ent_coef'][0], alpha=self.inp.acer_dict['alpha'][0], lr_schedule=self.inp.acer_dict['lr_schedule'][0], rprop_alpha=self.inp.acer_dict['rprop_alpha'][0], rprop_epsilon=self.inp.acer_dict['rprop_epsilon'][0], buffer_size=self.inp.acer_dict['buffer_size'][0], replay_ratio=self.inp.acer_dict['replay_ratio'][0], replay_start=self.inp.acer_dict['replay_start'][0], correction_term=self.inp.acer_dict['correction_term'][0], trust_region=self.inp.acer_dict['trust_region'][0], delta=self.inp.acer_dict['delta'][0], verbose=1, tensorboard_log=tensorboard_log, seed=2, n_cpu_tf_sess=1) model.learn(total_timesteps=self.inp.acer_dict['time_steps'][0], callback=self.callback) model.save(self.log_dir + self.inp.acer_dict['casename'][0] + '_lastmodel.pkl') if self.mode == 'test': # load and test the agent. 
Env is recreated since test mode only works in single core print( 'debug: acer is running in test mode, single core is used to test the policy' ) env = gym.make(self.inp.gen_dict['env'][0], casename=self.inp.acer_dict['casename'][0], log_dir=self.log_dir, exepath=self.inp.gen_dict['exepath'][0], env_data=self.inp.gen_dict['env_data'][0], env_seed=1) model = ACER.load(self.inp.acer_dict['model_load_path'][0]) evaluate_policy( model, env, log_dir=self.log_dir + 'acer', n_eval_episodes=self.inp.acer_dict["n_eval_episodes"][0], render=self.inp.acer_dict["render"][0])
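# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the multicore branch in
# build() hands `self.make_env(...)` thunks to SubprocVecEnv. The snippet below
# shows the standard stable-baselines factory pattern under the assumption of a
# plain gym environment; the helper name `make_env_sketch` and the rank-based
# seeding are hypothetical, not the repo's confirmed implementation.
#
#   import gym
#   from stable_baselines.common.vec_env import SubprocVecEnv
#
#   def make_env_sketch(env_id, rank, base_seed=0):
#       """Return a thunk that creates and seeds one worker copy of env_id."""
#       def _init():
#           env = gym.make(env_id)
#           env.seed(base_seed + rank)  # give each worker a distinct seed
#           return env
#       return _init
#
#   vec_env = SubprocVecEnv([make_env_sketch('CartPole-v1', i) for i in range(4)])
# ----------------------------------------------------------------------------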
def build(self): """ This function builds the PPO agent based on the selected mode and runs it according to: 1- Initializes the env 2- If mode train is selected, train the model from scratch, learn, and save. 3- If mode continue is selected, provide a path for pretrained model, load the model, learn, and save. 4- If mode test is selected, provide a path fror pretrained model, load the model and test. """ # Create the vectorized environment if self.inp.ppo_dict['ncores'][0] > 1: self.env = SubprocVecEnv([ self.make_env(self.inp.gen_dict['env'][0], i) for i in range(self.inp.ppo_dict['ncores'][0]) ], daemon=self.inp.gen_dict['daemon'][0]) else: self.env = gym.make(self.inp.gen_dict['env'][0], casename=self.inp.ppo_dict['casename'][0], log_dir=self.log_dir, exepath=self.inp.gen_dict['exepath'][0], env_data=self.inp.gen_dict['env_data'][0], env_seed=1) #tensorboard activation (if used) #to view tensorboard type #tensorboard --logdir=./log_dir/{self.casename}_tensorlog if self.inp.ppo_dict['tensorboard'][0]: tensorboard_log = self.log_dir + '{}_tensorlog'.format( self.inp.ppo_dict['casename'][0]) else: tensorboard_log = None if self.mode == 'train': # Train from scratch, initialize the model and then learn, and save the last model. # Callbacks are used if provided model = PPO2(MlpPolicy, self.env, n_steps=self.inp.ppo_dict['n_steps'][0], gamma=self.inp.ppo_dict['gamma'][0], learning_rate=self.inp.ppo_dict['learning_rate'][0], ent_coef=self.inp.ppo_dict['ent_coef'][0], vf_coef=self.inp.ppo_dict['vf_coef'][0], max_grad_norm=self.inp.ppo_dict['max_grad_norm'][0], lam=self.inp.ppo_dict['lam'][0], nminibatches=self.inp.ppo_dict['nminibatches'][0], noptepochs=self.inp.ppo_dict['noptepochs'][0], cliprange=self.inp.ppo_dict['cliprange'][0], verbose=1, seed=3) model.learn(total_timesteps=self.inp.ppo_dict['time_steps'][0], callback=self.callback) model.save(self.log_dir + self.inp.ppo_dict['casename'][0] + '_lastmodel.pkl') if self.mode == 'continue': # load, contine learning, and save last model model = PPO2.load( self.inp.ppo_dict['model_load_path'][0], env=self.env, n_steps=self.inp.ppo_dict['n_steps'][0], gamma=self.inp.ppo_dict['gamma'][0], learning_rate=self.inp.ppo_dict['learning_rate'][0], ent_coef=self.inp.ppo_dict['ent_coef'][0], vf_coef=self.inp.ppo_dict['vf_coef'][0], max_grad_norm=self.inp.ppo_dict['max_grad_norm'][0], lam=self.inp.ppo_dict['lam'][0], nminibatches=self.inp.ppo_dict['nminibatches'][0], noptepochs=self.inp.ppo_dict['noptepochs'][0], cliprange=self.inp.ppo_dict['cliprange'][0], verbose=1, seed=3) model.learn(total_timesteps=self.inp.ppo_dict['time_steps'][0], callback=self.callback) model.save(self.log_dir + self.inp.ppo_dict['casename'][0] + '_lastmodel.pkl') if self.mode == 'test': # load and test the agent. Env is recreated since test mode only works in single core print( 'debug: ppo is running in test mode, single core is used to test the policy' ) env = gym.make(self.inp.gen_dict['env'][0], log_dir=self.log_dir, casename=self.inp.ppo_dict['casename'][0], exepath=self.inp.gen_dict['exepath'][0], env_data=self.inp.gen_dict['env_data'][0], env_seed=1) model = PPO2.load(self.inp.ppo_dict['model_load_path'][0]) evaluate_policy( model, env, log_dir=self.log_dir + 'ppo', n_eval_episodes=self.inp.ppo_dict["n_eval_episodes"][0], render=self.inp.ppo_dict["render"][0])
def build(self): """ This function builds the DQN agent (ONLY 1 Env/Core is supported) based on the selected mode and runs it according to: 1- Initializes the env 2- If mode train is selected, train the model from scratch, learn, and save. 3- If mode continue is selected, provide a path for pretrained model, load the model, learn, and save. 4- If mode test is selected, provide a path fror pretrained model, load the model and test. """ #tensorboard activation (if used) #to view tensorboard type #tensorboard --logdir=./log_dir/{self.casename}_tensorlog if self.inp.dqn_dict['tensorboard'][0]: tensorboard_log = self.log_dir + '{}_tensorlog'.format( self.inp.dqn_dict['casename'][0]) else: tensorboard_log = None if self.mode == 'train': # Train from scratch, initialize the model and then learn, and save the last model. # Callbacks are used if provided model = DQN( MlpPolicy, self.env, gamma=self.inp.dqn_dict['gamma'][0], learning_rate=self.inp.dqn_dict['learning_rate'][0], buffer_size=self.inp.dqn_dict['buffer_size'][0], exploration_fraction=self.inp.dqn_dict['exploration_fraction'] [0], eps_final=self.inp.dqn_dict['eps_final'][0], learning_starts=self.inp.dqn_dict['learning_starts'][0], batch_size=self.inp.dqn_dict['batch_size'][0], target_network_update_freq=self.inp. dqn_dict['target_network_update_freq'][0], eps_init=self.inp.dqn_dict['eps_init'][0], train_freq=self.inp.dqn_dict['train_freq'][0], prioritized_replay=self.inp.dqn_dict['prioritized_replay'][0], verbose=2, seed=1) model.learn(total_timesteps=self.inp.dqn_dict['time_steps'][0], callback=self.callback) model.save(self.log_dir + self.inp.dqn_dict['casename'][0] + '_lastmodel.pkl') if self.mode == 'continue': # load, contine learning, and save last model model = DQN.load( self.inp.dqn_dict['model_load_path'][0], env=self.env, gamma=self.inp.dqn_dict['gamma'][0], learning_rate=self.inp.dqn_dict['learning_rate'][0], buffer_size=self.inp.dqn_dict['buffer_size'][0], exploration_fraction=self.inp.dqn_dict['exploration_fraction'] [0], eps_final=self.inp.dqn_dict['eps_final'][0], learning_starts=self.inp.dqn_dict['learning_starts'][0], batch_size=self.inp.dqn_dict['batch_size'][0], target_network_update_freq=self.inp. dqn_dict['target_network_update_freq'][0], eps_init=self.inp.dqn_dict['eps_init'][0], train_freq=self.inp.dqn_dict['train_freq'][0], prioritized_replay=self.inp.dqn_dict['prioritized_replay'][0], verbose=2, seed=1) model.learn(total_timesteps=self.inp.dqn_dict['time_steps'][0], callback=self.callback) model.save(self.log_dir + self.inp.dqn_dict['casename'][0] + '_lastmodel.pkl') if self.mode == 'test': # load and test the agent. Env is recreated since test mode only works in single core print( 'debug: dqn is running in test mode, single core is used to test the policy' ) env = gym.make(self.inp.gen_dict['env'][0], casename=self.inp.dqn_dict['casename'][0], log_dir=self.log_dir, exepath=self.inp.gen_dict['exepath'][0], env_data=self.inp.gen_dict['env_data'][0], env_seed=1) model = DQN.load(self.inp.dqn_dict['model_load_path'][0]) evaluate_policy( model, env, log_dir=self.log_dir + 'dqn', n_eval_episodes=self.inp.dqn_dict["n_eval_episodes"][0], render=self.inp.dqn_dict["render"][0])