def run():
    torch.multiprocessing.freeze_support()
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    model = ACKTR(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
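# `make_env` is not defined in this snippet. A minimal sketch following the
# standard stable-baselines multiprocessing recipe; the `seed` default is an
# assumption:
import gym
from stable_baselines.common import set_global_seeds

def make_env(env_id, rank, seed=0):
    """Return a thunk that creates and seeds one environment instance."""
    def _init():
        env = gym.make(env_id)
        # Offset by rank so each worker gets a different seed.
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init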
def test_action_mask_run_acktr(vec_env, policy, env_class):
    env = vec_env([env_class])

    model = ACKTR(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
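# The `action_mask` keyword to `predict` comes from an action-masking fork of
# stable-baselines rather than the upstream API. A minimal sketch of an
# environment that publishes its mask through `info`, matching what the test
# above consumes; the dynamics and masking rule are placeholders:
import gym
import numpy as np
from gym import spaces

class MaskedDiscreteEnv(gym.Env):
    """Toy env reporting valid actions via info['action_mask']."""

    def __init__(self, n_actions=4):
        self.action_space = spaces.Discrete(n_actions)
        self.observation_space = spaces.Box(-1.0, 1.0, shape=(4,), dtype=np.float32)
        self.steps = 0

    def reset(self):
        self.steps = 0
        return np.zeros(4, dtype=np.float32)

    def step(self, action):
        self.steps += 1
        obs = self.observation_space.sample()
        done = self.steps >= 100
        # Placeholder rule: disallow repeating the action just taken.
        mask = np.ones(self.action_space.n, dtype=bool)
        mask[int(action)] = False
        return obs, 0.0, done, {'action_mask': mask}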
def optimize_agent(trial):
    """
    Train the model and optimise. Optuna maximises the negative log
    likelihood, so we need to negate the reward here.
    """
    model_params = optimize_acktr(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = ACKTR("MlpPolicy", env, nprocs=1, verbose=0, **model_params)
    print("DOING LEARNING acktr")
    original_env.force_progression = False
    model.learn(int(2e4), seed=seed)
    print("DONE LEARNING acktr")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)

    return last_reward
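# `optimize_acktr` is referenced above but not shown. A minimal sketch of the
# Optuna sampling helper; the parameter names match stable-baselines' ACKTR
# constructor, while the search ranges are assumptions:
def optimize_acktr(trial):
    """Sample ACKTR hyperparameters for a single Optuna trial."""
    return {
        'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
        'n_steps': trial.suggest_int('n_steps', 16, 128),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.0),
        'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
        'vf_coef': trial.suggest_uniform('vf_coef', 0.1, 0.5),
    }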
def stable_baseline_test(env_origin):
    env = make_vec_env(lambda: env_origin, n_envs=1)
    model = ACKTR('CnnPolicy', env_origin, verbose=1)
    model.learn(total_timesteps=2000000)

    print("Stable_baseline evaluation starts.....\n")
    # NOTE: evaluate_policy needs a vec_env
    reward_mean, reward_std = evaluate_policy(model, env, n_eval_episodes=20, deterministic=False)
    print("mean reward:" + str(reward_mean) + '\n')
    print("reward std:" + str(reward_std) + '\n')

    print("custom evaluation begins\n")
    env = env_origin
    obs = env.reset()
    reward_list_total = []
    epilen_list = []
    reward_list = []
    last_end = 0
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        reward_list.append(rewards)
        if dones:
            obs = env.reset()
            epilen_list.append(i - last_end)
            last_end = i
            reward_list_total.append(np.sum(reward_list))
            reward_list = []
            if i > 900:
                break
    print("mean reward:{}\n".format(np.mean(reward_list_total)))
    print("mean epilen:{}\n".format(np.mean(epilen_list)))
import gym

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import ACKTR

# multiprocess environment
n_cpu = 4
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = ACKTR(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("acktr_cartpole")

del model  # remove to demonstrate saving and loading

model = ACKTR.load("acktr_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv(10, 10)])
model = ACKTR(get_policy(policy), env, verbose=0, tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='ACKTR_A2C' + model_tag)
model.save(model_folder + "ACKTR_A2C" + model_tag)
del model
model = ACKTR.load(model_folder + "ACKTR_A2C" + model_tag)

done = False
states = None
obs = env.reset()
while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
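# `get_policy` is used above but not defined in these snippets. A minimal
# sketch, assuming it maps the CLI argument to a stable-baselines policy
# class; the accepted names here are assumptions:
from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy

def get_policy(name):
    """Resolve a policy name from the command line to a policy class."""
    policies = {
        'mlp': MlpPolicy,
        'lstm': MlpLstmPolicy,
        'lnlstm': MlpLnLstmPolicy,
    }
    # Fall back to a plain MLP when no (or an unknown) name is given.
    return policies.get(name.lower(), MlpPolicy)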
policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

if __name__ == '__main__':
    env = SubprocVecEnv([lambda: ActionMaskEnv() for i in range(4)])
    env = VecFrameStack(env, 3)
    model = ACKTR(get_policy(policy), env, n_steps=100, verbose=0, vf_fisher_coef=0.5,
                  tensorboard_log=tensorboard_folder, kfac_update=10, n_cpu_tf_sess=2,
                  async_eigen_decomp=False)
    model.learn(total_timesteps=100000000, tb_log_name='ACKTR_A2C' + model_tag)
    model.save(model_folder + "ACKTR_A2C" + model_tag)
    del model
    model = ACKTR.load(model_folder + "ACKTR_A2C" + model_tag)

    done = False
    states = None
    action_masks = []
    obs = env.reset()
    while not done:
        action, states = model.predict(obs, states, action_mask=action_masks)
        obs, _, done, infos = env.step(action)
        env.render()

        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)
# # images.append(img)
# action, _ = model.predict(obs)
# obs, r, done, _ = model.env.step(action)
# # print(type(done[0]))
# # model.env.render(mode='human')
# if done[0]:
#     print(done[0])
#     model.env.render(mode='human')
#     img = model.env.render(mode='rgb_array')
#     cv2.imshow('image', img)
#     cv2.waitKey(0)
#     cv2.destroyAllWindows()
# if i % 20 == 0:
#     model.env.render(mode='human')
# imageio.mimsave('uav_learning.gif', [img for i, img in enumerate(images) if i % 5 == 0], fps=60)

time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
trajectory_dir = './logs/Experiment_ACKTR_{}/trajectories/'.format(time)
os.makedirs(trajectory_dir, exist_ok=True)

print('evaluating runs')
obs = model.env.reset()  # initialise obs before the evaluation loop
for i in range(100):
    episode_done = [False]
    while not episode_done[0]:
        action, _ = model.predict(obs)
        obs, r, episode_done, _ = model.env.step(action)
        fig = model.env.render(mode='human')
        if episode_done[0]:
            print(i)
            plt.savefig('{}run_{}_r{}.png'.format(trajectory_dir, i, r[0]))
# env = CustomEnv(3, 6, "tcp://*:5556")

# Stable Baselines provides you with make_vec_env() helper
# which does exactly the previous steps for you:
# env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

# Create log dir
log_dir = "Logs/Custom_env/"
os.makedirs(log_dir, exist_ok=True)

# Create the callback: check every 500 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=500, log_dir=log_dir)
# env = Monitor(env, log_dir)

model = ACKTR(MlpPolicy, env, verbose=2)
# model.load("DQN_agent")
model.learn(total_timesteps=20000, callback=callback)
model.save("temp_agent")
a = input("Training completed")

obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    probs = model.action_probability(obs)
    obs, rewards, dones, info = env.step(action)
    print("Observation:", obs, rewards, probs)

results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS, "Lane Manager")
plt.show()
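# `SaveOnBestTrainingRewardCallback` is not defined in these snippets. A
# minimal sketch following the callback example from the stable-baselines
# docs; it assumes the env is wrapped in a `Monitor` writing to `log_dir`:
import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Every `check_freq` steps, save the model if mean reward improved."""

    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                # Mean training reward over the last 100 episodes
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True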
    set_global_seeds(seed)
    return _init

if __name__ == '__main__':
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    # env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    # env = gym.make(env_id)
    env = CustomEnv(3, 6, "tcp://*:5556")

    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    # Create log dir
    log_dir = "Logs/env_id/"
    os.makedirs(log_dir, exist_ok=True)

    # Create the callback: check every 1000 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    # env = Monitor(env, log_dir)

    model = ACKTR(MlpPolicy, env, verbose=2)
    # ACKTR.load is a classmethod returning a new model, so the result must
    # be assigned (calling model.load(...) would discard the loaded model).
    model = ACKTR.load("RL_agent")
    while True:
        user_in = input("Enter States: ").split(',')
        obs = [int(i) for i in user_in]
        print(model.action_probability(obs))
        action = model.predict(obs, deterministic=True)
        print(action)
class ACKTR_Agent:
    def __init__(self, params: Params):
        self.params: Params = params
        policy_name = self.params.agent_config['policy']
        self.policy = eval(policy_name)

    def create_model(self, n_envs=1):
        """ Create env and agent model """
        env_cls = SprEnv
        self.env = make_vec_env(env_cls,
                                n_envs=n_envs,
                                env_kwargs={"params": self.params},
                                seed=self.params.seed)
        self.model = ACKTR(
            self.policy,
            self.env,
            gamma=self.params.agent_config['gamma'],
            n_steps=self.params.agent_config['n_steps'],
            ent_coef=self.params.agent_config['ent_coef'],
            vf_coef=self.params.agent_config['vf_coef'],
            vf_fisher_coef=self.params.agent_config['vf_fisher_coef'],
            max_grad_norm=self.params.agent_config['max_grad_norm'],
            learning_rate=self.params.agent_config['learning_rate'],
            gae_lambda=self.params.agent_config['gae_lambda'],
            lr_schedule=self.params.agent_config['lr_schedule'],
            kfac_clip=self.params.agent_config['kfac_clip'],
            kfac_update=self.params.agent_config['kfac_update'],
            async_eigen_decomp=self.params.agent_config['async_eigen_decomp'],
            verbose=self.params.agent_config['verbose'],
            tensorboard_log="./tb/acktr/",
            seed=self.params.seed,
            policy_kwargs={"params": self.params})

    def train(self):
        with ProgressBarManager(self.params.training_duration) as callback:
            self.model.learn(total_timesteps=self.params.training_duration,
                             tb_log_name=self.params.tb_log_name,
                             callback=callback)

    def test(self):
        self.params.test_mode = True
        obs = self.env.reset()
        self.setup_writer()
        episode = 1
        step = 0
        episode_reward = [0.0]
        done = False
        # Test for 1 episode
        while not done:
            action, _states = self.model.predict(obs)
            obs, reward, dones, info = self.env.step(action)
            episode_reward[episode - 1] += reward[0]
            if info[0]['sim_time'] >= self.params.testing_duration:
                done = True
                self.write_reward(episode, episode_reward[episode - 1])
                episode += 1
            sys.stdout.write(
                "\rTesting:" +
                f"Current Simulator Time: {info[0]['sim_time']}. Testing duration: {self.params.testing_duration}")
            sys.stdout.flush()
            step += 1
        print("")

    def save_model(self):
        """ Save the model to a zip archive """
        self.model.save(self.params.model_path)

    def load_model(self, path=None):
        """ Load the model from a zip archive """
        if path is not None:
            self.model = ACKTR.load(path)
        else:
            self.model = ACKTR.load(self.params.model_path)
        # Copy the model to the new directory
        self.model.save(self.params.model_path)

    def setup_writer(self):
        episode_reward_filename = f"{self.params.result_dir}/episode_reward.csv"
        episode_reward_header = ['episode', 'reward']
        self.episode_reward_stream = open(episode_reward_filename, 'a+', newline='')
        self.episode_reward_writer = csv.writer(self.episode_reward_stream)
        self.episode_reward_writer.writerow(episode_reward_header)

    def write_reward(self, episode, reward):
        self.episode_reward_writer.writerow([episode, reward])
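# A hypothetical usage sketch of ACKTR_Agent; `Params` construction and its
# agent_config contents are assumed to be provided elsewhere:
params = Params()            # placeholder: the real Params likely takes config arguments
agent = ACKTR_Agent(params)
agent.create_model(n_envs=1)
agent.train()                # trains for params.training_duration timesteps
agent.save_model()
agent.test()                 # runs one test episode and writes episode_reward.csv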