def fed_and_eval(base_index, w):
    base_env = make_vec_env(f"selected-bipedal-{subenv_dict[base_index]}-v0",
                            n_envs=1, seed=seed)
    base_agent = ACKTR.load(f"./base_agent/{subenv_dict[base_index]}/model.zip")

    # Collect the parameters of every client policy
    sub_model_parameters = []
    for subenv in subenv_dict.values():
        client_policy = ACKTR.load(
            f"./base{base_index}_client_model/{subenv}/policy.zip")
        sub_model_parameters.append(client_policy.get_parameters())

    # Note: this is an alias, not a copy; base_agent is modified in place below
    aligned_agent = base_agent
    base_parameter_dict = aligned_agent.get_parameters()
    model_align(w, base_parameter_dict, sub_model_parameters, alpha=alpha)
    aligned_agent.load_parameters(base_parameter_dict)

    avg_reward, reward_std = evaluate_policy(aligned_agent, base_env,
                                             n_eval_episodes=100)
    print(f"base {base_index}, weight {w} done")
    return avg_reward, reward_std
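# `model_align` is not defined in this snippet. A minimal sketch of what it
# might do, assuming `w` holds one weight per client and `alpha` blends the
# weighted client average into the base parameters in place (all of this is
# an assumption, not the original implementation):
import numpy as np

def model_align(w, base_parameter_dict, sub_model_parameters, alpha=0.5):
    for key in base_parameter_dict:
        # Weighted average of the clients' copies of this parameter
        client_avg = np.average([p[key] for p in sub_model_parameters],
                                axis=0, weights=w)
        base_parameter_dict[key] = ((1 - alpha) * base_parameter_dict[key]
                                    + alpha * client_avg)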
def load_model(self, path=None):
    """Load the model from a zip archive."""
    if path is not None:
        self.model = ACKTR.load(path)
    else:
        self.model = ACKTR.load(self.params.model_path)
    # Copy the model to the new directory
    self.model.save(self.params.model_path)
def main():
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    if not USE_LOADED_MODEL:
        model = ACKTR('MlpPolicy', env, verbose=1)

        # Multiprocessed RL training
        start_time = time.time()
        model.learn(total_timesteps=n_timesteps, log_interval=10)
        total_time_multi = time.time() - start_time
        model.save("cartpole_v1_acktr")

    loaded_model = ACKTR.load("cartpole_v1_acktr")
    loaded_model.set_env(env)

    # Single-process RL training
    single_process_model = ACKTR('MlpPolicy', env_id, verbose=1)
    start_time = time.time()
    single_process_model.learn(n_timesteps)
    total_time_single = time.time() - start_time

    # Note: total_time_multi is only defined when USE_LOADED_MODEL is False
    print("Single-process: {0}s, Multi-process: {1}s".format(
        total_time_single, total_time_multi))

    # Create a separate, clean environment for evaluation
    eval_env = gym.make(env_id)
    mean_reward, std_reward = evaluate_policy(loaded_model, eval_env,
                                              n_eval_episodes=10)
    print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')
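# The `make_env` helper used above is not shown in this snippet. The standard
# stable-baselines pattern returns a thunk per worker, since SubprocVecEnv
# expects one environment constructor per process (a sketch following the
# documented convention; not necessarily this project's exact code):
import gym
from stable_baselines.common import set_global_seeds

def make_env(env_id, rank, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)  # distinct seed per worker
        return env
    set_global_seeds(seed)
    return _init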
def get_intrinsic_reward(base_index):
    intrinsic_rewards = [[] for _ in range(len(subenv_dict))]

    # Base environment and agent
    base_name = subenv_dict[base_index]
    base_env = make_vec_env(f"selected-bipedal-{base_name}-v0",
                            n_envs=1, seed=seed)
    base_agent = ACKTR.load(f"./base_agent/{base_name}/model.zip")

    # One RND model per client environment
    rnd_dict = {}
    for client_env in subenv_dict.values():
        rnd = RandomNetworkDistillation(input_size=24)
        rnd.load(f"./base{base_index}_client_model/{client_env}/rnd")
        rnd_dict[client_env] = rnd

    obs = base_env.reset()
    for _ in range(num_test):
        for i, client_env in subenv_dict.items():
            intrinsic_rewards[i].append(
                rnd_dict[client_env].get_intrinsic_reward(obs))
        action = base_agent.predict(obs)
        obs, reward, done, info = base_env.step(action[0])
        if done:
            obs = base_env.reset()
    return intrinsic_rewards
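# `RandomNetworkDistillation` is also not defined here. The idea behind RND:
# the intrinsic reward is the prediction error between a frozen, randomly
# initialized "target" network and a "predictor" trained to imitate it, so
# rarely visited states score high. A deliberately tiny linear sketch of that
# mechanism (the real class is presumably a neural network; everything below
# is an assumption):
import numpy as np

class RandomNetworkDistillationSketch:
    def __init__(self, input_size, out_size=32, lr=1e-3, seed=0):
        rng = np.random.RandomState(seed)
        self.w_target = rng.randn(input_size, out_size) * 0.1  # frozen target
        self.w_pred = np.zeros((input_size, out_size))          # trained predictor
        self.lr = lr

    def get_intrinsic_reward(self, obs):
        obs = np.asarray(obs, dtype=np.float64).reshape(1, -1)
        err = obs @ self.w_pred - obs @ self.w_target
        # One gradient step keeps frequently visited states cheap over time
        self.w_pred -= self.lr * obs.T @ err
        return float((err ** 2).mean())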
def NewPotential(current_window, algorithm='PPO'):
    # Determine the pretrained agent
    if algorithm == 'A2C':
        model = A2C.load("pretrained_A2C")
    elif algorithm == 'PPO':
        model = PPO2.load("pretrained_PPO")
    elif algorithm == 'ACKTR':
        model = ACKTR.load("pretrained_ACKTR")
    elif algorithm == 'ACER':
        model = ACER.load("pretrained_ACER")
    else:
        raise ValueError("%s is not a valid algorithm." % algorithm)

    if len(current_window) != model.observation_space.shape[0]:
        raise ValueError("Window length %s does not match the model's window size."
                         % len(current_window))

    action, _states = model.predict(current_window, deterministic=False)
    voltages = np.linspace(0, 1, num=model.action_space.n)
    if 0 <= action <= model.action_space.n - 1:
        voltage = voltages[action]
    else:
        raise ValueError(
            "Received invalid action={} which is not part of the action space"
            .format(action))
    return voltage
def run_sonobuoy_training(exp_name, exp_path, basicdate,
                          model_type='PPO2', n_eval_episodes=10,
                          training_intervals=100, max_steps=10000,
                          reward_margin=10, log_to_tb=False,
                          pelican_agent_filepath=False):
    # Set up logging
    if log_to_tb:
        writer = SummaryWriter(exp_path)
        tb_log_name = 'sonobuoy_training'
    else:
        writer = None
        tb_log_name = None

    env = gym.make('plark-env-v0',
                   panther_agent_filepath='/data/agents/models/PPO2_20200429_073132_panther/')

    if pelican_agent_filepath:
        logger.info('Loading agent from file: ' + pelican_agent_filepath)
        if model_type.lower() == 'dqn':
            model = DQN.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'ppo2':
            model = PPO2.load(pelican_agent_filepath)
            model.set_env(DummyVecEnv([lambda: env]))
        elif model_type.lower() == 'a2c':
            model = A2C.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'acktr':
            model = ACKTR.load(pelican_agent_filepath)
            model.set_env(env)
    else:
        # Instantiate the model from scratch
        model = PPO2('CnnPolicy', env)

    # Start training
    train_agent(exp_path, model, env, training_intervals, max_steps, model_type,
                basicdate, writer, tb_log_name, reward_margin)

    # Evaluate
    mean_reward, n_steps = evaluate_policy(model, env,
                                           n_eval_episodes=n_eval_episodes,
                                           deterministic=False, render=False,
                                           callback=None, reward_threshold=None,
                                           return_episode_rewards=False)
    logger.info('Evaluation finished')
    logger.info('Mean Reward is ' + str(mean_reward))
    logger.info('Number of steps is ' + str(n_steps))
def run_illegal_move_training(exp_name, exp_path, basicdate,
                              model_type='PPO2', n_eval_episodes=10,
                              training_intervals=100, max_steps=10000,
                              reward_margin=10, log_to_tb=False,
                              pelican_agent_filepath=False):
    # Set up logging
    if log_to_tb:
        writer = SummaryWriter(exp_path)
        tb_log_name = 'Illegal_move_prevention_training'
    else:
        writer = None
        tb_log_name = None

    if pelican_agent_filepath:
        logger.info('Loading agent from file: ' + pelican_agent_filepath)
        # env = plark_env_illegal_move.PlarkEnvIllegalMove(
        #     config_file_path='/Components/plark-game/plark_game/game_config/10x10/balanced.json')
        env = gym.make('plark-env-illegal-move-v0')
        if model_type.lower() == 'dqn':
            model = DQN.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'ppo2':
            model = PPO2.load(pelican_agent_filepath)
            model.set_env(DummyVecEnv([lambda: env]))
        elif model_type.lower() == 'a2c':
            model = A2C.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'acktr':
            model = ACKTR.load(pelican_agent_filepath)
            model.set_env(env)
    else:
        # Instantiate the env and model
        env = gym.make('plark-env-illegal-move-v0')
        model = PPO2('CnnPolicy', env)

    # Start training
    train_agent(exp_path, model, env, training_intervals, max_steps, model_type,
                basicdate, writer, tb_log_name, reward_margin)

    # Evaluate
    mean_reward, n_steps = evaluate_policy(model, env,
                                           n_eval_episodes=n_eval_episodes,
                                           deterministic=False, render=False,
                                           callback=None, reward_threshold=None,
                                           return_episode_rewards=False)
    logger.info('Evaluation finished')
    logger.info('Mean Reward is ' + str(mean_reward))
    logger.info('Number of steps is ' + str(n_steps))
def loadAgent(self, filepath, algorithm_type):
    try:
        if algorithm_type.lower() == 'dqn':
            self.model = DQN.load(filepath)
        elif algorithm_type.lower() == 'ppo2':
            self.model = PPO2.load(filepath)
        elif algorithm_type.lower() == 'a2c':
            self.model = A2C.load(filepath)
        elif algorithm_type.lower() == 'acktr':
            self.model = ACKTR.load(filepath)
    except Exception:  # a bare except would also swallow KeyboardInterrupt
        raise ValueError('Error loading pelican agent. File : "' + filepath
                         + '" does not exist')
def loadAgent(self, filepath, algorithm_type):
    try:
        if algorithm_type.lower() == "dqn":
            self.model = DQN.load(filepath)
        elif algorithm_type.lower() == "ppo2":
            self.model = PPO2.load(filepath)
        elif algorithm_type.lower() == "ppo":
            self.model = PPO.load(filepath)
        elif algorithm_type.lower() == "a2c":
            self.model = A2C.load(filepath)
        elif algorithm_type.lower() == "acktr":
            self.model = ACKTR.load(filepath)
    except Exception:  # a bare except would also swallow KeyboardInterrupt
        raise ValueError('Error loading panther agent. File : "' + filepath
                         + '" does not exist')
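# Both loaders above fold every failure into "file does not exist", which can
# mask an unsupported algorithm_type or a corrupt archive. A stricter variant
# (a sketch, not the project's code) separates those cases:
import os

def load_agent_strict(filepath, algorithm_type):
    loaders = {'dqn': DQN.load, 'ppo2': PPO2.load,
               'a2c': A2C.load, 'acktr': ACKTR.load}
    if not os.path.isfile(filepath):
        raise FileNotFoundError('No agent file at "{}"'.format(filepath))
    try:
        loader = loaders[algorithm_type.lower()]
    except KeyError:
        raise ValueError('Unsupported algorithm type: ' + algorithm_type)
    return loader(filepath)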
def save_client(base_index, subenv_id):
    base_agent = ACKTR.load(f"./base_agent/{subenv_dict[base_index]}/model.zip")
    subenv = subenv_dict[subenv_id]
    env = make_vec_env(f"selected-bipedal-{subenv}-v0", n_envs=n_envs, seed=seed)

    # Fine-tune the base agent on the client's sub-environment.
    # (set_env(env) is the supported way to swap environments; assigning
    # learner.env directly bypasses the model's consistency checks.)
    learner = base_agent
    learner.env = env
    learner.verbose = 0

    callback = SaveRNDDatasetCallback(base_index=base_index)
    learner.learn(
        total_timesteps=client_timesteps,
        callback=callback,
    )

    dir_name = f"base{base_index}_client_model/{subenv}"
    Path(dir_name).mkdir(parents=True, exist_ok=True)
    learner.save(f"{dir_name}/policy.zip")
    print(f"base {base_index} sub-env {subenv} done")
def eval_base_agent(agent_index):
    mean_result = []
    std_result = []
    agent = ACKTR.load(f"./base_agent/{subenv_dict[agent_index]}/model.zip")
    for env_index in range(4):
        env = gym.make(f"selected-bipedal-{subenv_dict[env_index]}-v0")
        env.seed(seed)  # seed() is a method; env.seed = seed would shadow it
        mean, std = evaluate_policy(agent, env, n_eval_episodes=100)
        mean_result.append(mean)
        std_result.append(std)

    Path("log").mkdir(parents=True, exist_ok=True)
    with open(f"log/agent{agent_index}_simple_agent_test.csv", "w",
              newline="") as file:
        writer = csv.writer(file)
        writer.writerow(mean_result)
        writer.writerow(std_result)

    print(f">>> Agent {agent_index}:")
    print(mean_result)
    print(std_result)
def train_acktr(seed):
    """Test ACKTR on the uav_env (cartesian, discrete).

    Signature for reference:
    ACKTR(policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01,
          vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25,
          max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
          tensorboard_log=None, _init_setup_model=True,
          async_eigen_decomp=False)
    """
    algo = 'ACKTR'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    model = ACKTR(policy=MlpPolicy, env=env, gamma=0.99, nprocs=1, n_steps=20,
                  ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0,
                  learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001,
                  lr_schedule='linear', verbose=0,
                  tensorboard_log="./logs/{}/tensorboard/{}/".format(
                      EXPERIMENT_NATURE, algo),
                  _init_setup_model=True)  # async_eigen_decomp=False

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = ACKTR.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)

    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo),
                exist_ok=True)
    os.rename('/tmp/gym/monitor.csv',
              "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))
    env.close()
    del model, env
    gc.collect()
    return evaluation
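# The `callback` passed to model.learn above is not shown in this snippet.
# The classic stable-baselines 2.x pattern behind the best_mean_reward /
# n_steps globals is a legacy two-argument callback like this sketch (the
# 1000-update check interval is an assumption):
import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

def callback(_locals, _globals):
    global best_mean_reward, n_steps
    if (n_steps + 1) % 1000 == 0:
        # Read the Monitor log and compute the running mean reward
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Matches the 'best_model.pkl' reloaded after training
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True  # returning False would stop training early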
#!/usr/bin/env python
import gym
gym.logger.set_level(40)

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from env import GoLeftEnv
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy

env = GoLeftEnv(grid_size=10)
env = make_vec_env(lambda: env, n_envs=1)

model = ACKTR.load("models/acktr_goleft", env=env)

obs = env.reset()
n_steps = 20
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    if done:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break
    train(model, env, out_dir)
else:
    # results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "rl")
    path = '{}/best_model.zip'.format(args.eval)
    env = CarEnv(args.eval, cam_idx_list=(0, 3, 4))
    env.next_weather()
    # env = Monitor(env, args.eval)
    # print(env.num_envs)
    if args.model == 'trpo':
        model = TRPO.load(path)
    elif args.model == 'acer':
        model = ACER.load(path)
    elif args.model == 'ppo':
        model = PPO2.load(path)
    elif args.model == 'acktr':
        model = ACKTR.load(path)
    elif args.model == 'ddpg':
        model = DDPG.load(path)
    elif args.model == 'a2c':
        model = A2C.load(path)
    elif args.model == 'sac':
        model = SAC.load(path)

    # mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5, return_episode_rewards=True)
    # eps_rewards, eps_len = evaluate_policy(model, env, n_eval_episodes=5, return_episode_rewards=True)
    # print(eps_rewards)
    # print(eps_len)
    # print(np.mean(eps_rewards))
    # print("Mean reward = {}", "Std reward = {}".format(np.mean(eps), std_reward))
    rs = evaluate(model, env)
    with open("{}/result.txt".format(args.eval), 'w') as f:
        for item in rs:
envTmp = gym.make('Battleships-v0', config=config)
# Wrap environment into a vector environment
env = DummyVecEnv([lambda: envTmp])

# Choose whether to display the board
print("Display board: Yes (1), No (0)")
choiceRender = bool(int(input()))

# Choose Model
randomAgent = True
print("Choose Agent: Random (1), ACKTR (2), DQN (3)")
choice = int(input())
if choice == 2:
    # Load ACKTR Model
    model = ACKTR.load("./ACKTR_Models/ACKTR_5x5_3_2_2_Dynamic.zip",
                       verbose=0, env=env)
    # Disable Random Agent
    randomAgent = False
elif choice == 3:
    # Load DQN Model
    model = DQN.load("./DQN_Models/DQN_5x5_3_2_2_Dynamic.zip",
                     verbose=0, env=env)
    # Disable Random Agent
    randomAgent = False

# Init results array
results = []

# Iteration: number of games played
for iteration in range(10):
    score = 0
    print('Iteration', iteration)
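    # The loop body is truncated in this snippet; one plausible continuation
    # (a sketch — the random agent's use of env.action_space.sample() is an
    # assumption, not the original code):
    obs = env.reset()
    done = False
    while not done:
        if randomAgent:
            action = [env.action_space.sample()]  # one action per vec-env slot
        else:
            action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        if choiceRender:
            env.render()
    results.append(score)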
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any of {'0', '1', '2'}

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

import tensorflow as tf
tf.get_logger().setLevel('INFO')
tf.autograph.set_verbosity(0)

import logging
tf.get_logger().setLevel(logging.ERROR)

import env_yaw
import gym
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy

env = gym.make("Yaw-v0")
env = make_vec_env(lambda: env, n_envs=1)

model = ACKTR.load("models/acktr_yaw", env=env)

obs = env.reset()
n_steps = 20
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    if done:
        print("Goal reached!", "reward=", reward)
        break
    pass

def close(self):
    pass


env = AItest()
env = Monitor(env, filename=None, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

# Train the agent and save
# model = ACKTR('MlpPolicy', env, verbose=1).learn(50000)
# print("learning done")
# model.save('Macke_AI')
# print("save done")

# Run with saved agent
model = ACKTR.load('Macke_AI')

## Test the trained agent
# obs = env.reset()
# n_steps = 100
# for step in range(n_steps):
#     action, _ = model.predict(obs, deterministic=True)
#     # print("Step {}".format(step + 1))
#     # print("Action: ", action)
#     obs, reward, done, info = env.step(action)
#     # print('obs=', obs, 'reward=', reward, 'done=', done)
#     env.render()
#     if done:
#         # Note that the VecEnv resets automatically
#         # when a done signal is encountered
#         print("Goal reached!", "reward=", reward)
        param_list=config['param_list']), config['wrapper_args'])
else:
    env = gym.make('jackal_navigation-v0',
                   gui=gui,
                   VLP16=config['VLP16'],
                   world_name=config['world_name'],
                   init_position=config['init_position'],
                   goal_position=config['goal_position'],
                   max_step=config['max_step'],
                   time_step=config['time_step'],
                   param_delta=config['param_delta'],
                   param_init=config['param_init'],
                   param_list=config['param_list'])

if config['algorithm'] == 'ACKTR':
    model = ACKTR.load(model_path)
elif config['algorithm'] == 'PPO2':
    model = PPO2.load(model_path)
elif config['algorithm'] == 'DQN':
    model = DQN.load(model_path)

range_dict = {
    'max_vel_x': [0.1, 2],
    'max_vel_theta': [0.314, 3.14],
    'vx_samples': [1, 12],
    'vtheta_samples': [1, 40],
    'path_distance_bias': [0.1, 1.5],
    'goal_distance_bias': [0.1, 2],
}

rs = []
        # New best model, you could save the agent here
        if mean_reward > best_mean_reward:
            best_mean_reward = mean_reward
            # Example for saving best model
            print("Saving new best model")
            _locals['self'].save(model_file)
    n_steps += 1
    # Returning False will stop training early
    return True


env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

if os.path.isfile(model_file):
    model = ACKTR.load(model_file, env=env)
else:
    model = ACKTR(
        MlpLnLstmPolicy,
        env,
        tensorboard_log=f"./test{base_test_file}/",
        verbose=0,
    )  # add tensorboard_log="./test/" and run
       # tensorboard --logdir /Users/constantin/Documents/bn/rl/test/PPO2_1

model.learn(total_timesteps=10**5, callback=callback)

# def evaluate(model, num_steps=1000):
#     obs = env.reset()
#     for i in range(num_steps):
#         # _states are only useful when using LSTM policies
#         action, _states = model.predict(obs)
#
    # shutil.rmtree(checkpoint_name)
    tf.saved_model.simple_save(
        model.sess,
        checkpoint_name,
        inputs={"obs": model.act_model.obs_ph},
        outputs={"action": model.act_model._deterministic_action})


if __name__ == '__main__':
    if os.path.isdir(file):
        shutil.rmtree(file)

    if args.algo == 'ppo':
        model = PPO2.load(file)
    elif args.algo == 'acktr':
        model = ACKTR.load(file)

    # generate_checkpoint_from_model(model, file)
    # converter = tf.lite.TFLiteConverter.from_saved_model(file)
    # tflite_model = converter.convert()
    # open(file + "/converted_model.tflite", "wb").write(tflite_model)

    # Multiprocess environment
    n_cpu = 1
    env = SubprocVecEnv(
        [lambda: gym.make('PendulumA-v0', renders=True) for i in range(n_cpu)])

    obs = env.reset()
    # When using VecEnv, done is a vector
    done = [False for _ in range(env.num_envs)]
    while True:
model = ACKTR(get_policy(policy), env,
              n_steps=100,
              verbose=0,
              gae_lambda=0.95,
              vf_fisher_coef=0.5,
              tensorboard_log=tensorboard_folder,
              kfac_update=10,
              n_cpu_tf_sess=2,
              async_eigen_decomp=False)
model.learn(total_timesteps=100000000, tb_log_name='ACKTR_PPO2' + model_tag)
model.save(model_folder + "ACKTR_PPO2" + model_tag)

del model
model = ACKTR.load(model_folder + "ACKTR_PPO2" + model_tag)

done = False
states = None
action_masks = []
obs = env.reset()
while not done:
    action, states = model.predict(obs, states, action_mask=action_masks)
    obs, _, done, infos = env.step(action)
    env.render()

    action_masks.clear()
    for info in infos:
        env_action_mask = info.get('action_mask')
        action_masks.append(env_action_mask)
    # model.cliprange = stable_baselines.common.schedules.LinearSchedule(1.0, 0.2, initial_p=0).value
    model.learn(total_timesteps=1000000, reset_num_timesteps=False,
                callback=callback)
    model.save(log_dir + 'model_PPO_' + str(id + 1))

if args.algo == "acktr":
    id = balboa.utils.tensorboard_latest_directory_number(log_dir, 'ACKTR_')
    print('Using acktr')
    if args.load_id is None:
        model = ACKTR("MlpPolicy", env, policy_kwargs=policy_kwargs,
                      ent_coef=0.0, verbose=1)  # tensorboard_log=log_dir
        # verbose=1, n_steps=48, learning_rate=0.1, lr_schedule='constant',
    else:
        print("Loading model: " + str(args.load_id))
        model = ACKTR.load(log_dir + 'ACKTR_' + str(args.load_id) + ".zip",
                           env=env)
        model.tensorboard_log = log_dir
        # model.learning_rate = stable_baselines.common.schedules.LinearSchedule(1.0, 0.06, initial_p=0.06).value
        # model.cliprange = stable_baselines.common.schedules.LinearSchedule(1.0, 0.2, initial_p=0).value
    model.learn(total_timesteps=3000000, reset_num_timesteps=False,
                callback=callback)
    print("Saving to: " + log_dir + 'ACKTR_' + str(id + 1))
    model.save(log_dir + 'model_ACKTR_' + str(id + 1))
global best_mean_reward, n_steps
best_mean_reward, n_steps = -np.inf, 0

# Signature for reference:
# ACKTR(policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01,
#       vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25,
#       max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
#       tensorboard_log=None, _init_setup_model=True,
#       async_eigen_decomp=False, policy_kwargs=None,
#       full_tensorboard_log=False)

# model = ACKTR(policy=MlpPolicy, env=env, gamma=0.99, nprocs=1, n_steps=20,
#               ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0,
#               learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001,
#               lr_schedule='linear', verbose=0, tensorboard_log=None,
#               _init_setup_model=True)

model = ACKTR.load(
    '/home/daniel/Desktop/Experiment_CurriculumLearning_2019-04-12_13-06-45/curriculum/curriculum/curriculum_seed_0/models/model_3000000_steps.pkl'
)
model.set_env(env)
model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
            log_interval=100)

images = []
obs = model.env.reset()
# img = model.env.render(mode='rgb_array')
model.env.render(mode='human')
# for i in range(30000):
#     # images.append(img)
from stable_baselines.common.policies import MlpPolicy
# from stable_baselines.common.policies import LnMlpPolicy
# from stable_baselines import PPO2
from stable_baselines import ACKTR
import os

from callback import SaveOnBestTrainingRewardCallback
from stable_baselines.bench import Monitor
from stable_baselines.common import make_vec_env

# Create the Gym environment
env = Manipulator2D()

load_model_path = "tmp9/acktr_16110000.zip"
# load_model_path = "ppo2-mani7.zip"

# Load the weights etc. from the saved training file
model = ACKTR.load(load_model_path)

# Reset the simulation environment
obs = env.reset()
points = 0
total_time = 0

while total_time <= 120:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    if dones:
        total_time += env.t
        # env.buffer = env.buffer_csv
env6 = DummyVecEnv([lambda: env6])
state[6] = env6.reset()
env7 = gym.make(env_id[7])
env7 = DummyVecEnv([lambda: env7])
state[7] = env7.reset()
env8 = gym.make(env_id[8])
env8 = DummyVecEnv([lambda: env8])
state[8] = env8.reset()
env9 = gym.make(env_id[9])
env9 = DummyVecEnv([lambda: env9])
state[9] = env9.reset()
print('Environment Created')

model_name = 'ACKTR_MlpLSTM_' + group + '_' + args.reward
MODEL_PATH = 'Saved_models'
tr_model = ACKTR.load(MODEL_PATH + '/' + model_name)

t = 480  # number of time steps to evaluate; 480 steps is 1 day
all_state = np.zeros((10, t))
print('Simulation Started ... ...')
for i in range(t):
    aa, _ = tr_model.predict(state)
    # print(aa)
    action = Action(basal=aa[0] / 6000, bolus=0)
    state[0], reward, done, _ = env0.step(action)
    action = Action(basal=aa[1] / 6000, bolus=0)
    state[1], reward, done, _ = env1.step(action)
    action = Action(basal=aa[2] / 6000, bolus=0)
    state[2], reward, done, _ = env2.step(action)
    action = Action(basal=aa[3] / 6000, bolus=0)
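# The ten nearly identical env blocks and step calls above could be collapsed
# into loops; an equivalent sketch (assuming env0..env9 are collected into a
# list, which the original does not show):
#     envs, state = [], [None] * 10
#     for k in range(10):
#         base = gym.make(env_id[k])
#         envs.append(DummyVecEnv([lambda e=base: e]))  # bind e now, not at call time
#         state[k] = envs[k].reset()
#     ...
#     for k in range(10):
#         action = Action(basal=aa[k] / 6000, bolus=0)
#         state[k], reward, done, _ = envs[k].step(action)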
from helperFun import grav_options
import sys
import pprint

if __name__ == "__main__":
    try:
        if len(sys.argv) < 3:
            raise StringException(
                "Usage: training_logs/<path to agent model> <grav-option>")
        # env = myPandaFreeSpaceTraj(has_renderer=True)
        run_name = sys.argv[1]
        grav_option = sys.argv[2]
        if grav_option == "ee_PD_cont":
            env = myPandaIKWrapper3D(has_renderer=True)
        else:
            env = myPandaFreeSpace1Goal(has_renderer=True,
                                        grav_option=grav_options[grav_option])

        model = ACKTR.load("training_logs/" + run_name)
        # mean_reward, n_steps = evaluate_policy(model, env, 10)
        # print("avg reward:{}\nnumber of steps:{}".format(mean_reward, n_steps))

        ## Play Agent
        done = False
        obs = env.reset()
        cum_reward = 0
        action_band = 10
        count = 0
        pp = pprint.PrettyPrinter()
        while True:
            if done:
                print("Reward:", cum_reward)
                cum_reward = 0
import gym

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import ACKTR

# Multiprocess environment
n_cpu = 4
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = ACKTR(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("acktr_cartpole")

del model  # remove to demonstrate saving and loading

model = ACKTR.load("acktr_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
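# The loop above only renders; to score the reloaded model numerically, the
# evaluate_policy helper used elsewhere in these snippets applies here as
# well (a sketch; the episode count is arbitrary):
from stable_baselines.common.evaluation import evaluate_policy

eval_env = gym.make('CartPole-v1')  # single env gives clean episode boundaries
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print("Mean reward: {:.1f} +/- {:.1f}".format(mean_reward, std_reward))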
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv(10, 10)])

model = ACKTR(get_policy(policy), env, verbose=0,
              tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='ACKTR_A2C' + model_tag)
model.save(model_folder + "ACKTR_A2C" + model_tag)

del model
model = ACKTR.load(model_folder + "ACKTR_A2C" + model_tag)

done = False
states = None
obs = env.reset()
while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
    set_global_seeds(seed)
    return _init


if __name__ == '__main__':
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    # env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    # env = gym.make(env_id)
    env = CustomEnv(3, 6, "tcp://*:5556")

    # Stable Baselines provides the make_vec_env() helper,
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    # Create log dir
    log_dir = "Logs/env_id/"
    os.makedirs(log_dir, exist_ok=True)

    # Create the callback: check every 1000 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)
    # env = Monitor(env, log_dir)

    # ACKTR.load is a classmethod that returns a new model; calling
    # model.load(...) on an instance would discard the loaded weights.
    model = ACKTR.load("RL_agent", env=env)

    while True:
        user_in = input("Enter States: ").split(',')
        obs = [int(i) for i in user_in]
        print(model.action_probability(obs))
        action = model.predict(obs, deterministic=True)
        print(action)
# Callback saves the currently best model
eval_callback = EvalCallback(env4, callback_on_new_best=callback_on_best,
                             verbose=1,
                             best_model_save_path='./ACKTR_Models/best/')
checkpoint_callback = CheckpointCallback(save_freq=10000,
                                         save_path='./model_checkpoints/')

# Uncomment to train a fresh model; otherwise an already trained model is
# trained further. A fresh model should be trained with the binary reward
# (see config) first, to reduce multiple shots onto the same field.
# model = ACKTR(MlpPolicy, env, verbose=2, tensorboard_log="./logs/progress_tensorboard/", n_cpu_tf_sess=4)

# Load current best model
model = ACKTR.load("./ACKTR_Models/best/best_model.zip", verbose=2, env=env,
                   tensorboard_log="./logs/progress_tensorboard/")

# Train model
model.learn(1000000, callback=[checkpoint_callback, eval_callback])

# Delete current model and load the best model
del model
model = ACKTR.load("./ACKTR_Models/best/best_model.zip", verbose=2, env=env,
                   tensorboard_log="./logs/progress_tensorboard/")

# Test trained model
results = []
for iteration in range(100):
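# `callback_on_best` is referenced above but not defined in this snippet; the
# usual pairing with EvalCallback in stable-baselines 2.10 is a reward
# threshold stopper (the threshold value here is an assumption):
from stable_baselines.common.callbacks import StopTrainingOnRewardThreshold

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=50, verbose=1)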