def test_gail(expert_env):
    env_id, expert_path = expert_env
    env = gym.make(env_id)
    dataset = ExpertDataset(expert_path=expert_path, traj_limitation=10, sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy', env, adversary_entcoeff=0.0, lam=0.92, max_kl=0.001,
                 expert_dataset=dataset, hidden_size_adversary=64, verbose=0)

    model.learn(1000)
    model.save("GAIL-{}".format(env_id))
    model = model.load("GAIL-{}".format(env_id), env=env)
    model.learn(1000)

    obs = env.reset()
    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, _, done, _ = env.step(action)
        if done:
            obs = env.reset()
    del dataset, model
def train(env, implemented_combos, model_logdir, arg_dict, pretrained_model=None):
    model_name = arg_dict["algo"] + '_' + str(arg_dict["steps"])
    conf_pth = os.path.join(model_logdir, "train.json")
    model_path = os.path.join(model_logdir, "best_model.zip")
    arg_dict["model_path"] = model_path
    with open(conf_pth, "w") as f:
        json.dump(arg_dict, f, indent=4)

    model_args = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][1]
    model_kwargs = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][2]
    if pretrained_model:
        if not os.path.isabs(pretrained_model):
            pretrained_model = pkg_resources.resource_filename("myGym", pretrained_model)
        env = model_args[1]
        vec_env = DummyVecEnv([lambda: env])
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0].load(pretrained_model, vec_env)
    else:
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0](*model_args, **model_kwargs)

    if arg_dict["algo"] == "gail":
        # Multi processing: (using MPI)
        if arg_dict["train_framework"] == 'tensorflow':
            # Generate expert trajectories (train expert)
            generate_expert_traj(model, model_name, n_timesteps=3000, n_episodes=100)
            # Load the expert dataset and train GAIL on the same environment
            dataset = ExpertDataset(expert_path=model_name + '.npz', traj_limitation=10, verbose=1)
            model = GAIL_T('MlpPolicy', env, dataset, verbose=1)
            # Note: in practice, you need to train for 1M steps to have a working policy

    start_time = time.time()
    callbacks_list = []
    if pretrained_model:
        model_logdir = os.path.dirname(pretrained_model)
    auto_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1024, logdir=model_logdir, env=env,
                                                          engine=arg_dict["engine"],
                                                          multiprocessing=arg_dict["multiprocessing"])
    callbacks_list.append(auto_save_callback)

    if arg_dict["eval_freq"]:
        eval_env = configure_env(arg_dict, model_logdir, for_train=False)
        eval_callback = CustomEvalCallback(eval_env, log_path=model_logdir,
                                           eval_freq=arg_dict["eval_freq"],
                                           n_eval_episodes=arg_dict["eval_episodes"],
                                           record=arg_dict["record"],
                                           camera_id=arg_dict["camera"])
        callbacks_list.append(eval_callback)
    # callbacks_list.append(PlottingCallback(model_logdir))

    with ProgressBarManager(total_timesteps=arg_dict["steps"]) as progress_callback:
        callbacks_list.append(progress_callback)
        model.learn(total_timesteps=arg_dict["steps"], callback=callbacks_list)
    model.save(os.path.join(model_logdir, model_name))
    print("Training time: {:.2f} s".format(time.time() - start_time))

    # info_keywords in the Monitor class above is necessary for pybullet save_results;
    # when using info_keywords with mujoco we get an error
    if arg_dict["engine"] == "pybullet":
        save_results(arg_dict, model_name, env, model_logdir)
    return model
def test_gail_callback(tmp_path):
    dataset = ExpertDataset(expert_path=EXPERT_PATH_PENDULUM, traj_limitation=10,
                            sequential_preprocessing=True, verbose=0)
    model = GAIL("MlpPolicy", "Pendulum-v0", dataset)
    checkpoint_callback = CheckpointCallback(save_freq=500, save_path=str(tmp_path / 'logs/gail/'),
                                             name_prefix='gail')
    model.learn(total_timesteps=1000, callback=checkpoint_callback)
    shutil.rmtree(str(tmp_path / 'logs/gail/'))
    del dataset, model
def test_gail(tmp_path, expert_env):
    env_id, expert_path, load_from_memory = expert_env
    env = gym.make(env_id)

    traj_data = None
    if load_from_memory:
        traj_data = np.load(expert_path)
        expert_path = None
    dataset = ExpertDataset(traj_data=traj_data, expert_path=expert_path,
                            traj_limitation=10, sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy', env, adversary_entcoeff=0.0, lam=0.92, max_kl=0.001,
                 expert_dataset=dataset, hidden_size_adversary=64, verbose=0)

    model.learn(300)
    model.save(str(tmp_path / "GAIL-{}".format(env_id)))
    model = model.load(str(tmp_path / "GAIL-{}".format(env_id)), env=env)
    model.learn(300)

    evaluate_policy(model, env, n_eval_episodes=5)
    del dataset, model
def bc_from_dataset_and_params(dataset, bc_params, model_save_dir, num_epochs, lr, adam_eps):
    # Setup env
    gym_env = init_gym_env(bc_params)

    # Train and save model
    create_dir_if_not_exists(BC_SAVE_DIR + model_save_dir)

    model = GAIL("MlpPolicy", gym_env, dataset, verbose=1)
    model.pretrain(dataset, n_epochs=num_epochs, learning_rate=lr, adam_epsilon=adam_eps,
                   save_dir=BC_SAVE_DIR + model_save_dir)

    save_bc_model(model_save_dir, model, bc_params)
    return model
def train():
    # Load Model
    env = gym.make('roundabout-v0')
    model = DQN(MlpPolicy, env, verbose=1)
    generate_expert_traj(model, 'expert_roundabout', n_timesteps=1000, n_episodes=10)

    # Data Augmentation
    expert_data = dict(np.load('expert_roundabout.npz'))
    print("my keys are: " + str(expert_data.keys()))
    obs = expert_data['obs']
    # flatten each observation to a 1D feature vector, keeping one row per transition
    expert_data['obs'] = obs.reshape(obs.shape[0], -1)
    print("my keys are: " + str(expert_data.keys()))
    # re-save with keyword expansion so the array names ('obs', 'actions', ...) are preserved
    np.savez('expert_roundabout.npz', **expert_data)

    dataset = ExpertDataset(expert_path='expert_roundabout.npz', traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env, dataset, verbose=1)
    model.learn(total_timesteps=1000)
    model.save("gail_roundabout")
    env.close()
    del env
def load_bc_model_from_path(model_name):
    # NOTE: The lowest loss and highest accuracy models
    # were also saved, can be found in the same dir with
    # special suffixes.
    bc_metadata = load_pickle(BC_SAVE_DIR + model_name + "/bc_metadata")
    bc_params = bc_metadata["bc_params"]
    model = GAIL.load(BC_SAVE_DIR + model_name + "/model")
    return model, bc_params
def train(params):
    # create model
    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("model_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("model_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy, env, verbose=1, tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"), gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"), lam=params.get("gae_lambda"))
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("expert_exists") is False:
        print("Training expert trajectories")
        # Train expert controller (if needed) and record expert trajectories.
        generate_expert_traj(model, expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(expert_path='{0}.npz'.format(expert_name),
                            traj_limitation=-1,
                            randomize=True,  # if the dataset should be shuffled
                            verbose=1)

    # check the stable-baselines docs for default GAIL hyperparameters
    model = GAIL('MlpPolicy', env, dataset, verbose=1, tensorboard_log=log_dir)

    if params.get("pre_train") is True:
        print("Pretraining Dataset with Behavioural Cloning")
        model.pretrain(dataset, n_epochs=10000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save("BC" + exp_name)
    env.close()
    del env
def stable_gail(
    venv,
    expert=None,
    expert_venv=None,
    state_only=False,
    total_timesteps=10000,
    gen_batch_size=200,
    disc_batch_size=100,
    policy_lr=1e-3,
    callback=None,
    **kwargs,
):
    dataset = get_expert_dataset(expert, expert_venv, total_timesteps)

    policy = GAIL("MlpPolicy", venv, dataset)
    policy.learn(total_timesteps=total_timesteps)

    results = {}
    results["policy"] = policy
    return results
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    env = gym.make(args.env)
    train_log_dir = os.path.join(args.train_log_dir,
                                 args.env + '_' + args.expert + '_' + args.policy_type)
    if args.expert == 'PPO':
        expert_model = PPO1(args.policy_type, env, verbose=1, tensorboard_log=train_log_dir)
    else:
        raise NotImplementedError
    expert_model.learn(total_timesteps=args.expert_training_step)
    generate_expert_traj(expert_model, os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=1000, n_episodes=args.expert_episodes)

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir, 'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type, env, dataset, verbose=1, tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, env, num_steps=10000)
    gail_model.save(train_log_dir)
    env.close()
def run_gail():
    parser = argparse.ArgumentParser()
    parser.add_argument('expert', type=str, default=None, help='Expert path (*.npz)')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--note', type=str, default='test')
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--num-steps', type=int, default=1000000)
    parser.add_argument('--policy', type=str, default='CnnPolicy',
                        choices=['CnnPolicy', 'CnnLstmPolicy', 'CnnLnLstmPolicy',
                                 'MlpPolicy', 'MlpLstmPolicy', 'MlpLnLstmPolicy'],
                        help='Policy architecture')
    args = parser.parse_args()

    logger.configure(os.path.join('logs', args.env, args.note))
    logger.info(args)

    if 'NoFrameskip' in args.env:
        env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
    else:
        import gym
        env = gym.make(args.env)

    dataset = ExpertDataset(expert_path=args.expert, batch_size=128,
                            train_fraction=0.99, verbose=1)
    model = GAIL(args.policy, env, dataset, timesteps_per_batch=1280, verbose=1)
    model.learn(len(dataset.train_loader) * 1280)
def optimize_agent(trial):
    """Train the model and optimise.
    Optuna maximises the negative log likelihood, so we need to negate the reward here.
    """
    model_params = optimize_GAIL(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)

    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = GAIL("MlpPolicy", env, verbose=0, **model_params)

    print("starting GAIL learning")
    original_env.force_progression = False
    model.learn(int(2e4 * 5), seed=seed)
    print("done GAIL learning")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)
    return last_reward
def load_model(path: str, algorithm: str):
    from stable_baselines import PPO2, DQN, A2C, ACER, GAIL, TRPO
    if algorithm == 'PPO2':
        return PPO2.load(path)
    if algorithm == 'DQN':
        return DQN.load(path)
    if algorithm == 'A2C':
        return A2C.load(path)
    if algorithm == 'ACER':
        return ACER.load(path)
    if algorithm == 'GAIL':
        return GAIL.load(path)
    if algorithm == 'TRPO':
        return TRPO.load(path)
    return None
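# Hypothetical usage of load_model() above; the checkpoint path and the error handling
# are illustrative assumptions, not part of the original snippet.
model = load_model("./models/gail_pendulum.zip", "GAIL")
if model is None:
    raise ValueError("Unsupported algorithm name")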
def train_GAIL(env_train, model_name, timesteps=1000):
    """GAIL Model"""
    # from stable_baselines.gail import ExpertDataset, generate_expert_traj
    start = time.time()

    # generate expert trajectories
    model = SAC('MlpPolicy', env_train, verbose=1)
    generate_expert_traj(model, 'expert_model_gail', n_timesteps=100, n_episodes=10)

    # Load dataset
    dataset = ExpertDataset(expert_path='expert_model_gail.npz', traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env_train, dataset, verbose=1)

    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (GAIL): ', (end - start) / 60, ' minutes')
    return model
def train_gail_withppo2():
    env = gimbal(5, 500)
    env = DummyVecEnv([lambda: env])
    model = PPO2.load("./models/baseline_ppo2_t1")
    generate_expert_traj(model, './models/baseline_expert_t1', env, n_timesteps=0, n_episodes=100)
    dataset = ExpertDataset(expert_path='./models/baseline_expert_t1.npz', traj_limitation=-1, verbose=1)
    model = GAIL("MlpPolicy", env, dataset, verbose=1)
    model.learn(total_timesteps=500000)
    model.save("./models/baseline_gail_ppo2_t1")
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    # train the expert model multiple times and keep the best one
    best_reward = -np.inf
    train_env = make_vec_env(args.env, n_envs=args.n_env)
    eval_env = gym.make(args.env)
    for i in range(args.times_expert):
        train_env.reset()
        train_log_dir = os.path.join(args.train_log_dir, args.env + '_' + args.expert)
        if args.expert == 'PPO':
            expert_model = PPO2(args.policy_type, env=train_env, n_steps=args.n_steps,
                                nminibatches=args.nminibatches, noptepochs=args.noptepochs,
                                ent_coef=args.ent_coef, lam=args.lam, gamma=args.gamma,
                                cliprange=args.cliprange, learning_rate=args.learning_rate,
                                verbose=1, tensorboard_log=train_log_dir)
        else:
            raise NotImplementedError
        expert_model.learn(total_timesteps=args.expert_training_step)
        mean_reward = evaluate(expert_model, eval_env, num_steps=10000)
        if mean_reward > best_reward:
            best_reward = mean_reward
            expert_model.save(os.path.join(args.train_log_dir, args.env + '_expert'))
        del expert_model

    # record expert trajectories with the best saved model
    train_env.reset()
    expert_model = PPO2.load(os.path.join(args.train_log_dir, args.env + '_expert'), env=train_env)
    generate_expert_traj(expert_model, os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=-1, n_episodes=args.expert_episodes)
    train_env.close()

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir, 'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type, args.env, dataset, verbose=1, tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, eval_env, num_steps=10000)
    gail_model.save(os.path.join(args.train_log_dir, args.env + '_GAIL'))
    eval_env.close()
def eval_with_standard_baselines(n_games, model_name, display=False):
    """Method to evaluate agent performance with stable-baselines infrastructure,
    just to make sure everything is compatible and integrating correctly."""
    bc_metadata = load_pickle(BC_SAVE_DIR + model_name + "/bc_metadata")
    bc_params = bc_metadata["bc_params"]
    model = GAIL.load(BC_SAVE_DIR + model_name + "/model")

    gym_env = init_gym_env(bc_params)

    tot_rew = 0
    for i in tqdm.trange(n_games):
        obs, _ = gym_env.reset()
        done = False
        while not done:
            ob0, ob1 = obs
            a0 = stable_baselines_predict_fn(model, ob0)
            a1 = stable_baselines_predict_fn(model, ob1)
            joint_action = (a0, a1)
            (obs, _), rewards, done, info = gym_env.step(joint_action)
            tot_rew += rewards

    print("avg reward", tot_rew / n_games)
    return tot_rew / n_games
def trian_agent_with_gail(load):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines import GAIL

    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs, layers=[128, 128])

    if not load:
        ExpData = ExpertDataset("./lqr_export.npz")
        model = GAIL(CustomPolicy, env, ExpData, verbose=1)
        model.learn(total_timesteps=1000000)
        model.save(ROOT + "/trained_models/TDRL/f16/gail/128_128")
    else:
        # with model.graph.as_default():
        #     for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi/'):
        #         print(i)
        model = GAIL.load(ROOT + "/trained_models/TDRL/f16/gail/128_128", env=env)
        with model.graph.as_default():
            print(tf.all_variables())
    return model
# Generate expert trajectories (train expert)
env = PrticleEnv(alpha=1, beta=10, win_thre=1, max_timestep=256, for_circle_traj=True)
model = PPO1.load("model/part_circle_exp2_epoch05_sib.zip")
model.set_env(env)
generate_expert_traj(model, 'expert_part_circle_exp2_epoch05_sib', n_episodes=10)

# Load the expert dataset
dataset = ExpertDataset(expert_path='expert_part_circle_exp2_epoch05_sib.npz',
                        traj_limitation=10, verbose=1)

model = GAIL('MlpPolicy',
             DummyVecEnv([lambda: PrticleEnv(alpha=1, beta=10, win_thre=1,
                                             max_timestep=256, for_circle_traj=True)]),
             dataset, verbose=1, n_cpu_tf_sess=None)

# Note: in practice, you need to train for 1M steps to have a working policy
model.learn(total_timesteps=int(1e4))
model.save("_gail_sanity_test_exp1")

del model
# %%
def build_model(algo, env_name, log_dir, expert_dataset=None):
    """
    Initialize model according to algorithm, architecture and hyperparameters
    :param algo: (str) Name of rl algorithm - 'sac', 'ppo2' etc.
    :param env_name: (str)
    :param log_dir: (str)
    :param expert_dataset: (ExpertDataset)
    :return: model: stable_baselines model
    """
    model = None
    if algo == 'sac':
        policy_kwargs = dict(layers=[64, 64, 64], layer_norm=False)
        model = SAC('MlpPolicy', env_name, gamma=0.99, learning_rate=1e-4, buffer_size=500000,
                    learning_starts=5000, train_freq=500, batch_size=64, policy_kwargs=policy_kwargs,
                    tau=0.01, ent_coef='auto_0.1', target_update_interval=1,
                    gradient_steps=1, target_entropy='auto', action_noise=None,
                    random_exploration=0.0, verbose=2, tensorboard_log=log_dir,
                    _init_setup_model=True, full_tensorboard_log=True,
                    seed=None, n_cpu_tf_sess=None)
    elif algo == 'ppo1':
        model = PPO1('MlpPolicy', env_name, gamma=0.99, timesteps_per_actorbatch=256,
                     clip_param=0.2, entcoeff=0.01, optim_epochs=4, optim_stepsize=1e-3,
                     optim_batchsize=64, lam=0.95, adam_epsilon=1e-5, schedule='linear',
                     verbose=0, tensorboard_log=None, _init_setup_model=True,
                     policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1)
    elif algo == 'trpo':
        model = TRPO('MlpPolicy', env_name, timesteps_per_batch=4096,
                     tensorboard_log=log_dir, verbose=1)
    elif algo == 'gail':
        assert expert_dataset is not None
        model = GAIL('MlpPolicy', env_name, expert_dataset, tensorboard_log=log_dir, verbose=1)
    assert model is not None
    return model
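# A minimal sketch (the env id, expert file, log dir and step count are assumptions) of how
# build_model() above might be driven for the 'gail' branch: the ExpertDataset has to be
# created first so that the assert on expert_dataset holds.
from stable_baselines.gail import ExpertDataset

dataset = ExpertDataset(expert_path='expert_pendulum.npz', traj_limitation=10, verbose=1)
model = build_model('gail', 'Pendulum-v0', './logs/', expert_dataset=dataset)
model.learn(total_timesteps=100000)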
                    type=int, help='Number of games to test.')
parser.add_argument('-s', '--save', default=True, type=bool)
args = parser.parse_args()

sys.path.append('/Users/cusgadmin/Documents/UCB/Academics/SSastry/Multi_agent_competition')
os.chdir('/Users/cusgadmin/Documents/UCB/Academics/SSastry/Multi_agent_competition/')

print(colored('Testing learnt policy from model file {} for {} games!'.format(args.model, args.num_test), 'red'))
start_time = time.time()
model = GAIL.load(args.model)
env = gym.make('gym_pursuitevasion_small:pursuitevasion_small-v0')
g = 1
obs = env.reset(ep=g)
e_win_games = int(0)
env.render(mode='human', highlight=True, ep=g)
if args.save:
    metadata = dict(title='Game')
    writer = FFMpegWriter(fps=5, metadata=metadata)
    writer.setup(env.window.fig, "test_game.mp4", 300)
    writer.grab_frame()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, e_win = env.step(action)
    env.render(mode='human', highlight=True, ep=g)
    if args.save:
import gym

from stable_baselines import GAIL, SAC
from stable_baselines.gail import ExpertDataset, generate_expert_traj

# Generate expert trajectories (train expert)
model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1)
generate_expert_traj(model, 'expert_pendulum', n_timesteps=100, n_episodes=10)

# Load the expert dataset
dataset = ExpertDataset(expert_path='expert_pendulum.npz', traj_limitation=10, verbose=1)

model = GAIL('MlpPolicy', 'Pendulum-v0', dataset, verbose=1)
# Note: in practice, you need to train for 1M steps to have a working policy
model.learn(total_timesteps=100000)
model.save("gail_pendulum")

del model  # remove to demonstrate saving and loading

model = GAIL.load("gail_pendulum")

env = gym.make('Pendulum-v0')
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
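# A minimal sketch of a scripted evaluation of the "gail_pendulum" model saved above,
# using stable_baselines.common.evaluation.evaluate_policy instead of the endless render
# loop; the number of evaluation episodes is an arbitrary choice.
import gym
from stable_baselines import GAIL
from stable_baselines.common.evaluation import evaluate_policy

model = GAIL.load("gail_pendulum")
eval_env = gym.make('Pendulum-v0')
# evaluate_policy returns the mean and standard deviation of the episode reward
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print("mean reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))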
# env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
env = gym.make('gym_docking:docking-v1')

# Stable Baselines provides you with make_vec_env() helper
# which does exactly the previous steps for you:
# env = make_vec_env(env, n_envs=num_cpu, seed=0)

checkpoint_callback = CheckpointCallback(save_freq=int(5e4), save_path='./logs/',
                                         name_prefix='rl_model_621_gail_10M')

dataset = ExpertDataset(expert_path='./expert_PID/expert_PID_new.npz',
                        traj_limitation=-1, batch_size=10)

model = GAIL(policy='MlpPolicy', env=env, verbose=1,
             tensorboard_log="./gail_docking_tensorboard/",
             policy_kwargs=dict(net_arch=[dict(pi=[128, 128], vf=[128, 128])],
                                act_fun=tf.nn.relu),
             expert_dataset=dataset)

# load trained model
# model = PPO2.load("./ppo2_docking_621_random_pre.zip", env=env,
#                   tensorboard_log="./ppo2_docking_tensorboard/")

model.learn(total_timesteps=int(10e6), callback=checkpoint_callback)
model.save("gail_docking_621_10M")
def train(env_id, algo, num_timesteps, seed, sgd_steps, t_pi, t_c, lam, log, expert_path,
          pretrain, pretrain_epochs, mdpo_update_steps, num_trajectories, expert_model,
          exploration_bonus, bonus_coef, random_action_len, is_action_features, dir_name,
          neural, lipschitz, args):
    """
    Train TRPO model for the mujoco environment, for testing purposes
    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        # from mpi4py import MPI
        # rank = MPI.COMM_WORLD.Get_rank()
        rank = 0
        env_name = env_id[:-3].lower()
        log_dir = './experiments/' + env_name + '/' + str(algo).lower() + '/' \
                  + 'tpi' + str(t_pi) + '_tc' + str(t_c) + '_lam' + str(lam)
        log_dir += '_' + dir_name + '/'
        log_name = str(algo) + '_updateSteps' + str(mdpo_update_steps)
        # log_name += '_randLen' + str(random_action_len)
        if exploration_bonus:
            log_name += '_exploration' + str(bonus_coef)
        if pretrain:
            log_name += '_pretrain' + str(pretrain_epochs)
        if not is_action_features:
            log_name += "_states_only"
        log_name += '_s' + str(seed)

        log_path = log_dir + log_name
        expert_path = './experts/' + expert_path

        num_timesteps = int(num_timesteps)
        args = args.__dict__

        dir_path = os.getcwd() + log_dir[1:]
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        with open(os.getcwd() + log_dir[1:] + 'args.txt', 'w') as file:
            file.write("Experiment Arguments:")
            for key, val in args.items():
                print(key, ": ", val, file=file)

        if log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        # workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        # env = make_mujoco_env(env_id, workerseed)
        def make_env():
            # env_out = gym.make(env_id, reset_noise_scale=1.0)
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
            env_out.seed(seed)
            env_out = wrap_mujoco(env_out, random_action_len=random_action_len)
            return env_out

        # vectorized environment used by the algorithms below
        env = DummyVecEnv([make_env])
        # env = VecNormalize(env)

        train = (algo == 'Train')
        eval = (algo == 'Evaluate')

        if train:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)

            if num_timesteps > 0:
                model = SAC('MlpPolicy', env_id, verbose=1, buffer_size=1000000, batch_size=256,
                            ent_coef='auto', train_freq=1, tau=0.01, gradient_steps=1,
                            learning_starts=10000)
            else:
                model = SAC.load(expert_model, env)
            generate_expert_traj(model, expert_path, n_timesteps=num_timesteps,
                                 n_episodes=num_trajectories)
            if num_timesteps > 0:
                model.save('sac_' + env_name + '_' + str(num_timesteps))
        elif eval:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)
            model = SAC.load(expert_model, env)
            generate_expert_traj(model, expert_path, n_timesteps=num_timesteps,
                                 n_episodes=10, evaluate=True)
        else:
            expert_path = expert_path + '.npz'
            dataset = ExpertDataset(expert_path=expert_path, traj_limitation=10, verbose=1)

            if algo == 'MDAL':
                model = MDAL_MDPO_OFF('MlpPolicy', env, dataset, verbose=1,
                                      tensorboard_log="./experiments/" + env_name + "/mdal/",
                                      seed=seed, buffer_size=1000000, ent_coef=0.0,
                                      learning_starts=10000, batch_size=256, tau=0.01,
                                      gamma=0.99, gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps, lam=0.0,
                                      train_freq=1, d_step=10, tsallis_q=1,
                                      reparameterize=True, t_pi=t_pi, t_c=t_c,
                                      exploration_bonus=exploration_bonus, bonus_coef=bonus_coef,
                                      is_action_features=is_action_features, neural=neural,
                                      lipschitz=lipschitz)
            elif algo == 'MDAL_ON_POLICY':
                model = MDAL_MDPO_ON('MlpPolicy', env, dataset, verbose=1, timesteps_per_batch=2048,
                                     tensorboard_log="./experiments/" + env_name + "/mdal_mdpo_on/",
                                     seed=seed, max_kl=0.01, cg_iters=10, cg_damping=0.1,
                                     entcoeff=0.0, adversary_entcoeff=0.001, gamma=0.99, lam=0.95,
                                     vf_iters=5, vf_stepsize=1e-3, sgd_steps=sgd_steps, klcoeff=1.0,
                                     method="multistep-SGD", tsallis_q=1.0, t_pi=t_pi, t_c=t_c,
                                     exploration_bonus=exploration_bonus, bonus_coef=bonus_coef,
                                     is_action_features=is_action_features, neural=neural)
            elif algo == 'MDAL_TRPO':
                model = MDAL_TRPO('MlpPolicy', env, dataset, verbose=1,
                                  tensorboard_log="./experiments/" + env_name + "/mdal_trpo/",
                                  seed=seed, gamma=0.99, g_step=3, d_step=5, sgd_steps=1,
                                  d_stepsize=9e-5, entcoeff=0.0, adversary_entcoeff=0.001,
                                  max_kl=t_pi, t_pi=t_pi, t_c=t_c,
                                  exploration_bonus=exploration_bonus, bonus_coef=bonus_coef,
                                  is_action_features=is_action_features, neural=neural,
                                  lam=0.98, timesteps_per_batch=2000, lipschitz=lipschitz)
            elif algo == 'GAIL':
                from mpi4py import MPI
                from stable_baselines import GAIL
                model = GAIL('MlpPolicy', env, dataset, verbose=1,
                             tensorboard_log="./experiments/" + env_name + "/gail/", seed=seed,
                             entcoeff=0.0, adversary_entcoeff=0.001, lipschitz=lipschitz)
            elif algo == 'GAIL_MDPO_OFF':
                # from mpi4py import MPI
                from stable_baselines import GAIL_MDPO_OFF
                model = GAIL_MDPO_OFF('MlpPolicy', env, dataset, verbose=1,
                                      tensorboard_log="./experiments/" + env_name + "/gail_mdpo_off/",
                                      seed=seed, ent_coef=0.0, adversary_entcoeff=0.001,
                                      buffer_size=1000000, learning_starts=10000, batch_size=256,
                                      tau=0.01, gamma=0.99, gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps, lam=0.0, train_freq=1,
                                      tsallis_q=1, reparameterize=True, t_pi=t_pi, t_c=t_c,
                                      exploration_bonus=exploration_bonus, bonus_coef=bonus_coef,
                                      is_action_features=is_action_features, lipschitz=lipschitz)
            else:
                raise ValueError("Not a valid algorithm.")

            if pretrain:
                model.pretrain(dataset, n_epochs=pretrain_epochs)

            model.learn(total_timesteps=num_timesteps, tb_log_name=log_name)

        env.close()
def build_model(algo, policy, env_name, log_dir, expert_dataset=None):
    """
    Initialize model according to algorithm, architecture and hyperparameters
    :param algo: (str) Name of rl algorithm - 'sac', 'ppo2' etc.
    :param env_name: (str)
    :param log_dir: (str)
    :param expert_dataset: (ExpertDataset)
    :return: model: stable_baselines model
    """
    from stable_baselines.common.vec_env import DummyVecEnv
    model = None
    if algo == 'sac':
        # policy_kwargs = dict(layers=[64, 64, 64], layer_norm=False)
        # model = SAC(policy, env_name, gamma=0.99, learning_rate=1e-4, buffer_size=500000,
        #             learning_starts=5000, train_freq=500, batch_size=64, policy_kwargs=policy_kwargs,
        #             tau=0.01, ent_coef='auto_0.1', target_update_interval=1,
        #             gradient_steps=1, target_entropy='auto', action_noise=None,
        #             random_exploration=0.0, verbose=2, tensorboard_log=log_dir,
        #             _init_setup_model=True, full_tensorboard_log=True,
        #             seed=None, n_cpu_tf_sess=None)

        # SAC - start learning from scratch
        # policy_kwargs = dict(act_fun=tf.nn.relu, net_arch=[32, 32, 32])
        policy_kwargs = dict(layers=[32, 32, 32], layer_norm=False)
        env = DummyVecEnv([lambda: gym.make(env_name)])
        # model = A2C(CnnMlpPolicy, env, verbose=1, gamma=0.99, learning_rate=1e-4,
        #             tensorboard_log=log_dir, _init_setup_model=True, full_tensorboard_log=True,
        #             seed=None, n_cpu_tf_sess=None)
        model = SAC(CustomSacCnnMlpPolicy, env=env, gamma=0.99, learning_rate=1e-4,
                    buffer_size=50000, learning_starts=1000, train_freq=100, batch_size=1,
                    tau=0.01, ent_coef='auto', target_update_interval=1, gradient_steps=1,
                    target_entropy='auto', action_noise=None, random_exploration=0.0,
                    verbose=1, tensorboard_log=log_dir, _init_setup_model=True,
                    full_tensorboard_log=True, seed=None, n_cpu_tf_sess=None)
    elif algo == 'ppo1':
        model = PPO1('MlpPolicy', env_name, gamma=0.99, timesteps_per_actorbatch=256,
                     clip_param=0.2, entcoeff=0.01, optim_epochs=4, optim_stepsize=1e-3,
                     optim_batchsize=64, lam=0.95, adam_epsilon=1e-5, schedule='linear',
                     verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None,
                     full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1)
    elif algo == 'trpo':
        model = TRPO('MlpPolicy', env_name, timesteps_per_batch=4096,
                     tensorboard_log=log_dir, verbose=1)
    elif algo == 'gail':
        assert expert_dataset is not None
        model = GAIL('MlpPolicy', env_name, expert_dataset, tensorboard_log=log_dir, verbose=1)
    assert model is not None
    return model
    'host': '172.21.217.140',
    'nget': 150
}
env = gym.make(**env_dict)

# Generate expert trajectories (train expert)
model = SAC('MlpPolicy', env, verbose=1)
generate_expert_traj(model, 'expert_prescan', n_timesteps=100, n_episodes=10)

# Load the expert dataset
dataset = ExpertDataset(expert_path='expert_prescan.npz', traj_limitation=10, verbose=1)

model = GAIL("MlpPolicy", env, dataset, verbose=1)
# Note: in practice, you need to train for 1M steps to have a working policy
try:
    model.learn(total_timesteps=1000)
except:
    pass
model.save(save_load)
'''
del model  # remove to demonstrate saving and loading

model = GAIL.load(save_load)

env = gym.make(**env_dict)
obs = env.reset()
while True:
    action, _states = model.predict(obs)
sys.path.append('/Users/cusgadmin/Documents/UCB/Academics/SSastry/Multi_agent_competition')
os.chdir('/Users/cusgadmin/Documents/UCB/Academics/SSastry/Multi_agent_competition/')

if args.train:
    now = datetime.datetime.now()
    print(colored('Loading expert data from {}!'.format(args.exp_file), 'red'))
    exp_data = np.load(args.exp_file)
    print(colored('Expert evader has won {} games!'.format(len(exp_data['episode_returns'])), 'red'))
    dataset = ExpertDataset(expert_path=args.exp_file, verbose=1)
    start_time = time.time()
    model = GAIL('MlpPolicy', 'gym_pursuitevasion_small:pursuitevasion_small-v0', dataset, verbose=1)
    print(colored('Training a behaviour cloning agent for {} iterations!'.format(int(args.total_iters)), 'red'))
    model.pretrain(dataset=dataset, n_epochs=int(args.total_iters))
    model.save('games{}_iters{}_{}_bc_pursuitevasion_small'.format(len(exp_data['episode_returns']),
                                                                   int(args.total_iters),
                                                                   str(now.strftime('%Y%m%d'))))
    end_time = time.time()
    print(colored('Training time: {:.2f}s = {:.2f}min = {:.4f}hrs'.format(end_time - start_time,
                                                                          (end_time - start_time) / 60,
                                                                          (end_time - start_time) / 3600), 'red'))
    print(colored('Trained BC policy', 'red'))
else:  # test
    print(colored('Trained on expert data from {}!'.format(args.exp_file), 'red'))
    # exp_data = np.load(args.exp_file)
    print(colored('Testing learnt policy from model file {} for {} games!'.format(args.model, int(args.num_test)), 'red'))
                     n_episodes=1000)
env.close()

print("Ending expert training, training with GAIL")

# Load the expert dataset
worker_id += 1
env = UnityEnv(env_name, worker_id=worker_id, use_visual=False)  # , no_graphics=True
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
dataset = ExpertDataset(expert_path='expert_basic_env.npz', traj_limitation=10, verbose=1)
model = GAIL("MlpPolicy", env, dataset, verbose=1)
model.learn(total_timesteps=30000)
model.save(log_dir + "model")

print("evaluating agent")
# evaluate agent
episodes = 100
ep_r = []
ep_l = []
for e in range(episodes):
    obs = env.reset()
    total_r = 0.
    total_l = 0.
    while total_l < 200:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        total_l += 1.