log_path = "{}/{}/".format(args.log_folder, args.algo) if exp_name: assert (not ('_' in exp_name)), 'experiment name should not include _' save_path = os.path.join( log_path, "{}_{}_{}".format(env_id, exp_name, get_latest_run_id(log_path, env_id, exp_name) + 1)) else: save_path = os.path.join( log_path, "{}_{}".format(env_id, get_latest_run_id(log_path, env_id) + 1)) if args.log_outputs: # Log the outputs logger.configure(folder=save_path, format_strs=['log']) params_path = "{}/{}".format(save_path, env_id) os.makedirs(params_path, exist_ok=True) tensorboard_log = None if args.no_tensorboard else save_path monitor_log = None if args.no_monitor else save_path is_atari = 'NoFrameskip' in env_id print("=" * 10, env_id, "=" * 10) # Load hyperparameters from yaml file with open('hyperparams/{}.yml'.format(args.algo), 'r') as f: hyperparams_dict = yaml.load(f) if is_atari: hyperparams = hyperparams_dict['atari']
def main(args): rank = MPI.COMM_WORLD.Get_rank() model_dir = os.path.join(config.MODELDIR, args.env_name) if rank == 0: try: os.makedirs(model_dir) except: pass if args.reset: reset_files(model_dir) logger.configure(config.LOGDIR) else: logger.configure(format_strs=[]) if args.debug: logger.set_level(config.DEBUG) else: time.sleep(5) logger.set_level(config.INFO) workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) logger.info('\nSetting up the selfplay training environment opponents...') base_env = get_environment(args.env_name) env = selfplay_wrapper(base_env)(opponent_type=args.opponent_type, verbose=args.verbose) env.seed(workerseed) CustomPolicy = get_network_arch(args.env_name) params = { 'gamma': args.gamma, 'timesteps_per_actorbatch': args.timesteps_per_actorbatch, 'clip_param': args.clip_param, 'entcoeff': args.entcoeff, 'optim_epochs': args.optim_epochs, 'optim_stepsize': args.optim_stepsize, 'optim_batchsize': args.optim_batchsize, 'lam': args.lam, 'adam_epsilon': args.adam_epsilon, 'schedule': 'linear', 'verbose': 1, 'tensorboard_log': config.LOGDIR } time.sleep( 5 ) # allow time for the base model to be saved out when the environment is created if args.reset or not os.path.exists( os.path.join(model_dir, 'best_model.zip')): logger.info('\nLoading the base PPO agent to train...') model = PPO1.load(os.path.join(model_dir, 'base.zip'), env, **params) else: logger.info( '\nLoading the best_model.zip PPO agent to continue training...') model = PPO1.load(os.path.join(model_dir, 'best_model.zip'), env, **params) #Callbacks logger.info( '\nSetting up the selfplay evaluation environment opponents...') callback_args = { 'eval_env': selfplay_wrapper(base_env)(opponent_type=args.opponent_type, verbose=args.verbose), 'best_model_save_path': config.TMPMODELDIR, 'log_path': config.LOGDIR, 'eval_freq': args.eval_freq, 'n_eval_episodes': args.n_eval_episodes, 'deterministic': False, 'render': True, 'verbose': 0 } if args.rules: logger.info( '\nSetting up the evaluation environment against the rules-based agent...' ) # Evaluate against a 'rules' agent as well eval_actual_callback = EvalCallback( eval_env=selfplay_wrapper(base_env)(opponent_type='rules', verbose=args.verbose), eval_freq=1, n_eval_episodes=args.n_eval_episodes, deterministic=args.best, render=True, verbose=0) callback_args['callback_on_new_best'] = eval_actual_callback # Evaluate the agent against previous versions eval_callback = SelfPlayCallback(args.opponent_type, args.threshold, args.env_name, **callback_args) logger.info('\nSetup complete - commencing learning...\n') model.learn(total_timesteps=int(1e9), callback=[eval_callback], reset_num_timesteps=False, tb_log_name="tb") env.close() del env
"Cornwall", "Plymouth", "Torbay", "East Devon", "Exeter", "Mid Devon", "North Devon", "South Hams", "Teignbridge", "Torridge", "West Devon" ] env = make("SEIRmulti-v0") districts_group_ids = [ env.unwrapped.district_idx(name) for name in districts_group ] env = NormalizedObservationWrapper(env) env = NormalizedRewardWrapper(env) env = MultiAgentSelectObservation(env, districts_group_ids) env = MultiAgentSelectAction(env, districts_group_ids, 1) env = MultiAgentSelectReward(env, districts_group_ids) logger.configure(folder=args.monitor_path, format_strs=["csv"]) env = DummyVecEnv([lambda: env]) print(f"tensorboard --logdir {args.monitor_path}") layers = [args.n_hidden_units] * args.n_hidden_layers model = PPO2(MlpPolicy, env, verbose=0, tensorboard_log=args.monitor_path, ent_coef=args.entropy_coef, learning_rate=args.learning_rate, noptepochs=args.n_epochs, nminibatches=args.n_minibatches,
def run_experiment_with_trained(augment_num_timesteps, linear_co_threshold, augment_seed, augment_run_num, network_size, policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed, eval_run_num, learning_rate, additional_note, result_dir, keys_to_include, metric_param, linear_top_vars_list=None, linear_correlation_neuron_list=None, visualize=False, lagrangian_inds_to_include=None, neurons_inds_to_include=None, use_lagrangian=True): trained_model = None if not use_lagrangian: with tf.variable_scope("trained_model"): common_arg_parser = get_common_parser() trained_args, cma_unknown_args = common_arg_parser.parse_known_args() trained_args.env = policy_env trained_args.seed = policy_seed trained_args.num_timesteps = policy_num_timesteps trained_args.run_num = policy_run_num trained_this_run_dir = get_dir_path_for_this_run(trained_args) trained_traj_params_dir_name = get_full_params_dir(trained_this_run_dir) trained_save_dir = get_save_dir(trained_this_run_dir) trained_final_file = get_full_param_traj_file_path(trained_traj_params_dir_name, "pi_final") trained_final_params = pd.read_csv(trained_final_file, header=None).values[0] trained_model = PPO2.load(f"{trained_save_dir}/ppo2", seed=augment_seed) trained_model.set_pi_from_flat(trained_final_params) args = AttributeDict() args.normalize = True args.num_timesteps = augment_num_timesteps args.run_num = augment_run_num args.alg = "ppo2" args.seed = augment_seed logger.log(f"#######TRAIN: {args}") # non_linear_global_dict timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S') experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \ f"_top_num_to_include{linear_co_threshold.start}_{linear_co_threshold.stop}" \ f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \ f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \ f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}" if policy_env == "DartWalker2d-v1": entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input' elif policy_env == "DartHopper-v1": entry_point = 'gym.envs.dart:DartHopperEnv_aug_input' elif policy_env == "DartHalfCheetah-v1": entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input' elif policy_env == "DartSnake7Link-v1": entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input' else: raise NotImplemented() this_run_dir = get_experiment_path_for_this_run(entry_point, args.num_timesteps, args.run_num, args.seed, learning_rate=learning_rate, top_num_to_include=linear_co_threshold, result_dir=result_dir, network_size=network_size) full_param_traj_dir_path = get_full_params_dir(this_run_dir) log_dir = get_log_dir(this_run_dir) save_dir = get_save_dir(this_run_dir) create_dir_remove(this_run_dir) create_dir_remove(full_param_traj_dir_path) create_dir_remove(save_dir) create_dir_remove(log_dir) logger.configure(log_dir) linear_top_vars_list_wanted_to_print = [] if (use_lagrangian and lagrangian_inds_to_include is None) or (not use_lagrangian and neurons_inds_to_include is None): # note this is only linear if linear_top_vars_list is None or linear_correlation_neuron_list is None: linear_top_vars_list, linear_correlation_neuron_list = read_linear_top_var(policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed, eval_run_num, additional_note, metric_param=metric_param) lagrangian_inds_to_include, neurons_inds_to_include, linear_top_vars_list_wanted_to_print = \ 
get_wanted_lagrangians_and_neurons(keys_to_include, linear_top_vars_list, linear_correlation_neuron_list, linear_co_threshold) with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp: json.dump(lagrangian_inds_to_include, fp) with open(f"{log_dir}/linear_top_vars_list_wanted_to_print.json", 'w') as fp: json.dump(linear_top_vars_list_wanted_to_print, fp) with open(f"{log_dir}/neurons_inds_to_include.json", 'w') as fp: json.dump(neurons_inds_to_include, fp) args.env = f'{experiment_label}_{entry_point}-v1' if not use_lagrangian: register( id=args.env, entry_point=entry_point, max_episode_steps=1000, kwargs={"lagrangian_inds_to_include": None, "trained_model": trained_model, "neurons_inds_to_include": neurons_inds_to_include} ) else: register( id=args.env, entry_point=entry_point, max_episode_steps=1000, kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include, "trained_model": None, "neurons_inds_to_include": None} ) def make_env(): env_out = gym.make(args.env) env_out.env.visualize = visualize env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) return env_out env = DummyVecEnv([make_env]) walker_env = env.envs[0].env.env walker_env.disableViewer = not visualize if args.normalize: env = VecNormalize(env) policy = MlpPolicy set_global_seeds(args.seed) walker_env.seed(args.seed) num_dof = walker_env.robot_skeleton.ndofs show_M_matrix(num_dof, lagrangian_inds_to_include, linear_co_threshold, log_dir) # extra run info I added for my purposes run_info = {"run_num": args.run_num, "env_id": args.env, "full_param_traj_dir_path": full_param_traj_dir_path} layers = [network_size, network_size] policy_kwargs = {"net_arch" : [dict(vf=layers, pi=layers)]} model = PPO2(policy=policy, env=env, n_steps=4096, nminibatches=64, lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0, learning_rate=learning_rate, cliprange=0.2, optimizer='adam', policy_kwargs=policy_kwargs, seed=args.seed) model.tell_run_info(run_info) model.learn(total_timesteps=args.num_timesteps, seed=args.seed) model.save(f"{save_dir}/ppo2") if args.normalize: env.save_running_average(save_dir) return log_dir
from stable_baselines.common.base_class import _UnvecWrapper

from utils import make_env, ALGOS, linear_schedule, get_latest_run_id, get_wrapper_class
from utils.hyperparams_opt import hyperparam_optimization
from utils.callbacks import SaveVecNormalizeCallback
from utils.noise import LinearNormalActionNoise
from utils.utils import StoreDict
from stable_baselines.logger import configure
from utils.callbacks import GoalToleranceCallback

# Export the following environment variables for csv files
# export OPENAI_LOG_FORMAT='stdout,log,csv'
# export OPENAI_LOGDIR="log_dir"
configure()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default="CartPole-v1",
                        help='environment ID')
    parser.add_argument('-tb', '--tensorboard-log', help='Tensorboard log dir',
                        default='', type=str)
    parser.add_argument('-i', '--trained-agent',
                        help='Path to a pretrained agent to continue training',
                        default='', type=str)
    parser.add_argument('--algo', help='RL Algorithm', default='ppo2', type=str,
                        required=False, choices=list(ALGOS.keys()))
    parser.add_argument('-n', '--n-timesteps',
                        help='Overwrite the number of timesteps',
                        default=-1, type=int)
    parser.add_argument('--log-interval',
                        help='Override log interval (default: -1, no change)',
                        default=-1, type=int)
    parser.add_argument('--eval-freq',
                        help='Evaluate the agent every n steps (if negative, no evaluation)',
                        default=10000, type=int)
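# Hedged sketch, not part of the original script: the in-process equivalent of the
# shell exports in the comments above. When called with no arguments,
# stable_baselines.logger.configure() reads OPENAI_LOG_FORMAT and OPENAI_LOGDIR;
# the "log_dir" value here is a placeholder.
import os
from stable_baselines.logger import configure

os.environ["OPENAI_LOG_FORMAT"] = "stdout,log,csv"
os.environ["OPENAI_LOGDIR"] = os.path.abspath("log_dir")
configure()
# Equivalently, the settings can be passed explicitly:
# configure(folder="log_dir", format_strs=["stdout", "log", "csv"])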
        else:
            action, value, neglogp = self.sess.run(
                [self.action, self.value_flat, self.neglogp],
                {self.obs_ph: obs})
        return action, value, self.initial_state, neglogp

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy_proba, {self.obs_ph: obs})

    def value(self, obs, state=None, mask=None):
        return self.sess.run(self.value_flat, {self.obs_ph: obs})


if __name__ == '__main__':
    saver = U.ConfigurationSaver(log_dir='./logs')
    logger.configure(folder=saver.data_dir)

    env = gym.make('gym_docking:docking-v3')
    # VecNormalize expects a vectorized env, so wrap the single gym env first
    # (DummyVecEnv from stable_baselines.common.vec_env)
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=255)

    checkpoint_callback = CheckpointCallback(
        save_freq=int(5e4),
        save_path='./logs/',
        name_prefix='rl_model_621_shaping_video_10M')

    model = PPO2(
        policy=MlpPolicy,
        env=env,
        verbose=1,
        tensorboard_log="./ppo2_docking_tensorboard/",
        lam=0.95,
def configure_logger(log_path, **kwargs):
    if log_path is not None:
        logger.configure(log_path)
    else:
        logger.configure(**kwargs)
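# Hypothetical calls to the helper above: either hand it an explicit folder, or fall
# back to keyword arguments understood by stable_baselines.logger.configure.
configure_logger("./logs/run_1")
configure_logger(None, format_strs=["stdout", "csv"])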
def do_ppo(args, start_pi_theta, parent_this_run_dir, full_space_save_dir): """ Runs the test """ logger.log(f"#######CMA and then PPO TRAIN: {args}") this_conti_ppo_run_dir = get_ppo_part(parent_this_run_dir) log_dir = get_log_dir(this_conti_ppo_run_dir) conti_ppo_save_dir = get_save_dir(this_conti_ppo_run_dir) logger.configure(log_dir) full_param_traj_dir_path = get_full_params_dir(this_conti_ppo_run_dir) if os.path.exists(full_param_traj_dir_path): import shutil shutil.rmtree(full_param_traj_dir_path) os.makedirs(full_param_traj_dir_path) if os.path.exists(conti_ppo_save_dir): import shutil shutil.rmtree(conti_ppo_save_dir) os.makedirs(conti_ppo_save_dir) def make_env(): env_out = gym.make(args.env) env_out.env.disableViewer = True env_out.env.visualize = False env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) return env_out env = DummyVecEnv([make_env]) if args.normalize: env = VecNormalize(env) model = PPO2.load(f"{full_space_save_dir}/ppo2") # load after V model.set_pi_from_flat( start_pi_theta ) # don't set Vf's searched from CMA, those weren't really tested. if args.normalize: env.load_running_average(full_space_save_dir) model.set_env(env) run_info = { "run_num": args.run_num, "env_id": args.env, "full_param_traj_dir_path": full_param_traj_dir_path } # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99, # noptepochs=10, # ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer) model.tell_run_info(run_info) episode_returns = model.learn(total_timesteps=args.ppo_num_timesteps) model.save(f"{conti_ppo_save_dir}/ppo2") env.save_running_average(conti_ppo_save_dir) return episode_returns, full_param_traj_dir_path
    # Make the training environment
    env = make_train_env(datapaths)

    # Make the testing environments
    eval_envs = {}
    for d in test_datasets:
        path = os.path.join('./data', d + '.csv')
        output_path = os.path.join(anomaly_curve_log, d + '.csv')
        csv_file, csv_writer = generate_csv_writer(output_path)
        eval_envs[d] = {
            'env': make_eval_env(datapath=path, budget=args.budget),
            'csv_writer': csv_writer,
            'csv_file': csv_file,
            'mean_reward': 0,
        }

    # Train the model
    model = PPO2('MlpPolicy', env, verbose=1)
    model.set_eval(eval_envs, args.eval_log_interval)
    model.learn(total_timesteps=args.num_timesteps,
                log_interval=args.rl_log_interval)
    model.save(os.path.join(args.log, 'model'))


if __name__ == "__main__":
    parser = argsparser()
    args = parser.parse_args()
    logger.configure(args.log)
    train(args)
from stable_baselines.ppo2 import PPO2
from stable_baselines import TD3
from stable_baselines.td3.policies import MlpPolicy
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines import logger
from stable_baselines.common.callbacks import EvalCallback
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.bench import Monitor

NUM_TIMESTEPS = int(2e7)
SEED = 721
EVAL_FREQ = 1e6
EVAL_EPISODES = 100
LOGDIR = "tank_td3"  # moved to zoo afterwards.

logger.configure(folder=LOGDIR)

env = gym.make("TankGym-v0")
env.seed(SEED)
env.policy = tankgym.BaselineRand()

# The noise objects for TD3
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.1 * np.ones(n_actions))

model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1)

# eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR,
#                              eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)

model.learn(total_timesteps=NUM_TIMESTEPS, log_interval=10)
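# Hedged sketch (not in the original script): how the commented-out EvalCallback above
# could be wired in instead of the plain learn() call, plus saving the final weights.
# "final_model" is a placeholder name, and eval_freq is cast to int for the callback.
eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR,
                             eval_freq=int(EVAL_FREQ), n_eval_episodes=EVAL_EPISODES)
model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback, log_interval=10)
model.save(LOGDIR + "/final_model")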
def main(): parser = arg_parser() add_env_params(parser) parser.add_argument('--num-timesteps', type=int, default=int(1e12)) parser.add_argument('--num_env', type=int, default=32) parser.add_argument('--use_news', type=int, default=0) parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--gamma_ext', type=float, default=0.99) parser.add_argument('--lam', type=float, default=0.95) parser.add_argument('--update_ob_stats_every_step', type=int, default=0) parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0) parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1) parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.) parser.add_argument('--tag', type=str, default='') parser.add_argument('--policy', type=str, default='rnn', choices=['cnn', 'rnn']) parser.add_argument('--int_coeff', type=float, default=1.) parser.add_argument('--ext_coeff', type=float, default=2.) parser.add_argument('--dynamics_bonus', type=int, default=0) parser.add_argument( '--save_dir', type=str, default='/home/hxu/PriorRL/random-network-distillation/ckpts/') parser.add_argument( '--load_dir', type=str, default='/home/hxu/PriorRL/random-network-distillation/ckpts/') parser.add_argument('--test', type=int, default=0) parser.add_argument('--save_image', type=int, default=0) parser.add_argument('--exp_name', type=str, default='tmp') parser.add_argument('--logdir', type=str, default='./logs/') parser.add_argument('--clip_rewards', type=int, default=1) parser.add_argument('--e_greedy', type=int, default=0) parser.add_argument('--action_space', type=str, default='RIGHT_ONLY') parser.add_argument('--load_mtype', type=str, default='latest') args = parser.parse_args() logdir = os.path.join( args.logdir, args.exp_name + '_' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")) logger.configure(folder=logdir, format_strs=['stdout', 'log', 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else []) if MPI.COMM_WORLD.Get_rank() == 0: with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f: f.write(args.tag) # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code')) mpi_util.setup_mpi_gpus() seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank() set_global_seeds(seed) hps = dict(frame_stack=4, nminibatches=4, nepochs=4, lr=0.0001, max_grad_norm=0.0, use_news=args.use_news, gamma=args.gamma, gamma_ext=args.gamma_ext, max_episode_steps=args.max_episode_steps, lam=args.lam, update_ob_stats_every_step=args.update_ob_stats_every_step, update_ob_stats_independently_per_gpu=args. update_ob_stats_independently_per_gpu, update_ob_stats_from_random_agent=args. update_ob_stats_from_random_agent, proportion_of_exp_used_for_predictor_update=args. proportion_of_exp_used_for_predictor_update, policy=args.policy, int_coeff=args.int_coeff, ext_coeff=args.ext_coeff, dynamics_bonus=args.dynamics_bonus) tf_util.make_session(make_default=True) train(env_id=args.env, num_env=args.num_env, seed=seed, num_timesteps=args.num_timesteps, hps=hps, load_dir=args.load_dir, save_dir=args.save_dir, test=args.test, exp_name=args.exp_name, clip_rewards=args.clip_rewards, save_image=args.save_image, action_space=args.action_space, e_greedy=args.e_greedy, load_mtype=args.load_mtype)
def train( _run, _seed: int, env_name: str, rollout_path: str, n_expert_demos: Optional[int], log_dir: str, *, n_epochs: int, n_gen_steps_per_epoch: int, n_disc_steps_per_epoch: int, init_trainer_kwargs: dict, n_episodes_eval: int, plot_interval: int, n_plot_episodes: int, show_plots: bool, init_tensorboard: bool, checkpoint_interval: int = 5, ) -> dict: """Train an adversarial-network-based imitation learning algorithm. Plots (turn on using `plot_interval > 0`): - Plot discriminator loss during discriminator training steps in blue and discriminator loss during generator training steps in red. - Plot the performance of the generator policy versus the performance of a random policy. Also plot the performance of an expert policy if that is provided in the arguments. Checkpoints: - DiscrimNets are saved to f"{log_dir}/checkpoints/{step}/discrim/", where step is either the training epoch or "final". - Generator policies are saved to f"{log_dir}/checkpoints/{step}/gen_policy/". Args: _seed: Random seed. env_name: The environment to train in. rollout_path: Path to pickle containing list of Trajectories. Used as expert demonstrations. n_expert_demos: The number of expert trajectories to actually use after loading them from `rollout_path`. If None, then use all available trajectories. If `n_expert_demos` is an `int`, then use exactly `n_expert_demos` trajectories, erroring if there aren't enough trajectories. If there are surplus trajectories, then use the first `n_expert_demos` trajectories and drop the rest. log_dir: Directory to save models and other logging to. n_epochs: The number of epochs to train. Each epoch consists of `n_disc_steps_per_epoch` discriminator steps followed by `n_gen_steps_per_epoch` generator steps. n_gen_steps_per_epoch: The number of generator update steps during every training epoch. n_disc_steps_per_epoch: The number of discriminator update steps during every training epoch. init_trainer_kwargs: Keyword arguments passed to `init_trainer`, used to initialize the trainer. n_episodes_eval: The number of episodes to average over when calculating the average episode reward of the imitation policy for return. plot_interval: The number of epochs between each plot. (If nonpositive, then plots are disabled). n_plot_episodes: The number of episodes averaged over when calculating the average episode reward of a policy for the performance plots. show_plots: Figures are always saved to `f"{log_dir}/plots/*.png"`. If `show_plots` is True, then also show plots as they are created. init_tensorboard: If True, then write tensorboard logs to `{log_dir}/sb_tb`. checkpoint_interval: Save the discriminator and generator models every `checkpoint_interval` epochs and after training is complete. If <=0, then only save weights after training is complete. Returns: A dictionary with two keys. "imit_stats" gives the return value of `rollout_stats()` on rollouts test-reward-wrapped environment, using the final policy (remember that the ground-truth reward can be recovered from the "monitor_return" key). "expert_stats" gives the return value of `rollout_stats()` on the expert demonstrations loaded from `rollout_path`. """ tf.logging.info("Logging to %s", log_dir) os.makedirs(log_dir, exist_ok=True) sacred_util.build_sacred_symlink(log_dir, _run) # Calculate stats for expert rollouts. Used for plot and return value. 
with open(rollout_path, "rb") as f: expert_trajs = pickle.load(f) if n_expert_demos is not None: assert len(expert_trajs) >= n_expert_demos expert_trajs = expert_trajs[:n_expert_demos] expert_stats = util.rollout.rollout_stats(expert_trajs) with util.make_session(): sb_logger.configure(folder=osp.join(log_dir, 'generator'), format_strs=['tensorboard', 'stdout']) if init_tensorboard: sb_tensorboard_dir = osp.join(log_dir, "sb_tb") kwargs = init_trainer_kwargs kwargs["init_rl_kwargs"] = kwargs.get("init_rl_kwargs", {}) kwargs["init_rl_kwargs"]["tensorboard_log"] = sb_tensorboard_dir trainer = init_trainer(env_name, expert_trajs, seed=_seed, log_dir=log_dir, **init_trainer_kwargs) if plot_interval > 0: visualizer = _TrainVisualizer( trainer=trainer, show_plots=show_plots, n_episodes_per_reward_data=n_plot_episodes, log_dir=log_dir, expert_mean_ep_reward=expert_stats["return_mean"]) else: visualizer = None # Main training loop. for epoch in tqdm.tqdm(range(1, n_epochs + 1), desc="epoch"): trainer.train_disc(n_disc_steps_per_epoch) if visualizer: visualizer.add_data_disc_loss(False) trainer.train_gen(n_gen_steps_per_epoch) if visualizer: visualizer.add_data_disc_loss(True) if visualizer and epoch % plot_interval == 0: visualizer.plot_disc_loss() visualizer.add_data_ep_reward(trainer.venv, "Ground Truth Reward") visualizer.add_data_ep_reward(trainer.venv_train, "Train Reward") visualizer.add_data_ep_reward(trainer.venv_test, "Test Reward") visualizer.plot_ep_reward() if checkpoint_interval > 0 and epoch % checkpoint_interval == 0: save(trainer, os.path.join(log_dir, "checkpoints", f"{epoch:05d}")) # Save final artifacts. save(trainer, os.path.join(log_dir, "checkpoints", "final")) # Final evaluation of imitation policy. results = {} sample_until_eval = util.rollout.min_episodes(n_episodes_eval) trajs = util.rollout.generate_trajectories( trainer.gen_policy, trainer.venv_test, sample_until=sample_until_eval) results["imit_stats"] = util.rollout.rollout_stats(trajs) results["expert_stats"] = expert_stats return results
def main(args): logger.configure(config.LOGDIR) if args.debug: logger.set_level(config.DEBUG) else: logger.set_level(config.INFO) #make environment env = get_environment(args.env_name)(verbose = args.verbose, manual = args.manual) env.seed(args.seed) total_rewards = {} if args.recommend: ppo_model = load_model(env, 'best_model.zip') ppo_agent = Agent('best_model', ppo_model) else: ppo_agent = None agents = [] #load the agents if len(args.agents) != env.n_players: raise Exception(f'{len(args.agents)} players specified but this is a {env.n_players} player game!') for i, agent in enumerate(args.agents): if agent == 'human': agent_obj = Agent('human') elif agent== 'greedy': agent_obj = Agent('greedy') elif agent == 'rules': agent_obj = Agent('rules') elif agent == 'base': base_model = load_model(env, 'base.zip') agent_obj = Agent('base', base_model) else: ppo_model = load_model(env, f'{agent}.zip') agent_obj = Agent(agent, ppo_model) agents.append(agent_obj) total_rewards[agent_obj.id] = 0 if args.env_name == "blobwar": human_blobwar = Human() #play games logger.info(f'\nPlaying {args.games} games...') for game in range(args.games): players = agents[:] if args.randomise_players: random.shuffle(players) obs = env.reset() done = False for i, p in enumerate(players): logger.debug(f'Player {i+1} = {p.name}') while not done: current_player = players[env.current_player_num] env.render() logger.debug(f'Current player name: {current_player.name}') if args.recommend and current_player.name in ['human', 'rules']: # show recommendation from last loaded model logger.debug(f'\nRecommendation by {ppo_agent.name}:') action = ppo_agent.choose_action(env, choose_best_action = True, mask_invalid_actions = True) if current_player.name == 'human': if args.env_name == "blobwar": move= human_blobwar.compute_next_move(env.core) action=env.encode_action(move) else: action = input('\nPlease choose an action: ') try: action = int(action) except: # for MulitDiscrete action input as list TODO action = eval(action) elif current_player.name == 'rules': logger.debug(f'\n{current_player.name} model choices') action = current_player.choose_action(env, choose_best_action = False, mask_invalid_actions = True) else: logger.debug(f'\n{current_player.name} model choices') action = current_player.choose_action(env, choose_best_action = args.best, mask_invalid_actions = True) obs, reward, done, _ = env.step(action) for r, player in zip(reward, players): total_rewards[player.id] += r player.points += r if args.cont: input('Press any key to continue') env.render() logger.info(f"Played {game + 1} games: {total_rewards}") if args.write_results: write_results(players, game, args.games, env.turns_taken) for p in players: p.points = 0 env.close()
def main(): args = parser().parse_args() cfg = YAML().load( open(os.environ["FLIGHTMARE_PATH"] + "/flightlib/configs/vec_env.yaml", 'r')) if not args.train: cfg["env"]["num_envs"] = 1 cfg["env"]["num_threads"] = 1 if args.render: cfg["env"]["render"] = "yes" else: cfg["env"]["render"] = "no" env = wrapper.FlightEnvVec( QuadrotorEnv_v1(dump(cfg, Dumper=RoundTripDumper), False)) # set random seed configure_random_seed(args.seed, env=env) # if args.train: # save the configuration and other files rsg_root = os.path.dirname(os.path.abspath(__file__)) log_dir = rsg_root + '/saved' saver = U.ConfigurationSaver(log_dir=log_dir) model = PPO2( tensorboard_log=saver.data_dir, policy=MlpPolicy, # check activation function policy_kwargs=dict(net_arch=[dict(pi=[128, 128], vf=[128, 128])], act_fun=tf.nn.relu), env=env, lam=0.95, gamma=0.99, # lower 0.9 ~ 0.99 # n_steps=math.floor(cfg['env']['max_time'] / cfg['env']['ctl_dt']), n_steps=250, ent_coef=0.00, learning_rate=3e-4, vf_coef=0.5, max_grad_norm=0.5, nminibatches=1, noptepochs=10, cliprange=0.2, verbose=1, ) # tensorboard # Make sure that your chrome browser is already on. # TensorboardLauncher(saver.data_dir + '/PPO2_1') # PPO run # Originally the total timestep is 5 x 10^8 # 10 zeros for nupdates to be 4000 # 1000000000 is 2000 iterations and so # 2000000000 is 4000 iterations. logger.configure(folder=saver.data_dir) model.learn(total_timesteps=int(25000000), log_dir=saver.data_dir, logger=logger) model.save(saver.data_dir) # # Testing mode with a trained weight else: model = PPO2.load(args.weight) test_model(env, model, render=args.render)
def create_test_env(env_id, n_envs=1, is_atari=False, stats_path=None,
                    norm_reward=False, seed=0, log_dir='', should_render=True):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averaged
    :param norm_reward: (bool) Whether to normalize rewards or not when using Vecnormalize
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    # Create the environment and wrap it if necessary
    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        env = SubprocVecEnv(
            [make_env(env_id, i, seed, log_dir) for i in range(n_envs)])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        spec = gym.envs.registry.env_specs[env_id]
        class_ = load(spec._entry_point)

        # HACK: force SubprocVecEnv for Bullet env that does not
        # have a render argument
        use_subproc = 'renders' not in inspect.getfullargspec(
            class_.__init__).args

        # Create the env, with the original kwargs, and the new ones overriding them if needed
        def _init():
            # TODO: fix for pybullet locomotion envs
            env = class_(**{**spec._kwargs}, renders=should_render)
            env.seed(0)
            if log_dir is not None:
                env = Monitor(env, os.path.join(log_dir, "0"),
                              allow_early_resets=True)
            return env

        if use_subproc:
            env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir)])
        else:
            env = DummyVecEnv([_init])
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir)])

    # Load saved stats for normalizing input and rewards
    if stats_path is not None:
        print("Loading running average")
        env = VecNormalize(env, training=False, norm_reward=norm_reward)
        env.load_running_average(stats_path)

    return env
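# Hypothetical usage of create_test_env above; the env id and log directory are
# placeholders, and stats_path is left as None because no saved VecNormalize
# statistics are assumed to exist.
test_env = create_test_env('CartPole-v1', n_envs=1, stats_path=None,
                           norm_reward=False, seed=0, log_dir='./eval_logs',
                           should_render=False)
obs = test_env.reset()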
def main(args): logger.configure(config.LOGDIR) if args.debug: logger.set_level(config.DEBUG) else: logger.set_level(config.INFO) # make environment env = get_environment(args.env_name)(verbose=args.verbose, manual=args.manual) env.seed(args.seed) total_rewards = {} first_time = True if args.recommend: ppo_model = load_model(env, 'best_model.zip') ppo_agent = Agent('best_model', ppo_model) else: ppo_agent = None agents = [] # load the agents if len(args.agents) != env.n_players: raise Exception( f'{len(args.agents)} players specified but this is a {env.n_players} player game!' ) for i, agent in enumerate(args.agents): if agent == 'human': agent_obj = Agent('human') elif agent == 'rules': agent_obj = Agent('rules') elif agent == 'json': # Start mq server context = zmq.Context() socket = context.socket(zmq.REP) socket.bind("tcp://*:5555") logger.debug("zaq server start at 5555") agent_obj = Agent('json') elif agent == 'base': base_model = load_model(env, 'base.zip') agent_obj = Agent('base', base_model) else: ppo_model = load_model(env, f'{agent}.zip') agent_obj = Agent(agent, ppo_model) agents.append(agent_obj) total_rewards[agent_obj.id] = 0 # play games logger.info(f'\nPlaying {args.games} games...') for game in range(args.games): players = agents[:] if args.randomise_players: random.shuffle(players) obs = env.reset() done = False for i, p in enumerate(players): logger.debug(f'Player {i+1} = {p.name}') while not done: current_player = players[env.current_player_num] env.render() logger.debug(f'\nCurrent player name: {current_player.name}') if args.recommend and current_player.name in [ 'human', 'rules', 'json' ]: # show recommendation from last loaded model logger.debug(f'\nRecommendation by {ppo_agent.name}:') action = ppo_agent.choose_action(env, choose_best_action=True, mask_invalid_actions=True) if current_player.name == 'human': action = input('\nPlease choose an action: ') try: # for int actions action = int(action) except: # for MulitDiscrete action input as list TODO action = eval(action) if current_player.name == 'json': if (not first_time): game_state = { "legal_action": [i for i, o in enumerate(env.legal_actions) if o != 0], "tableCard": env.tableCard.id } socket.send_json(game_state) action = socket.recv_json() first_time = False logger.debug(f'\nReceived {action}') # action = input('\n JSON!!! Please choose an action: ') try: # for int actions action = int(action) except: # for MulitDiscrete action input as list TODO action = eval(action) elif current_player.name == 'rules': logger.debug(f'\n{current_player.name} model choices') action = current_player.choose_action( env, choose_best_action=False, mask_invalid_actions=True) else: logger.debug(f'\n{current_player.name} model choices') action = current_player.choose_action( env, choose_best_action=args.best, mask_invalid_actions=True) obs, reward, done, _ = env.step(action) for r, player in zip(reward, players): total_rewards[player.id] += r player.points += r if args.cont: input('Press any key to continue') env.render() logger.info(f"Played {game + 1} games: {total_rewards}") if args.write_results: write_results(players, game, args.games, env.turns_taken) for p in players: p.points = 0 env.close()
def rollouts_and_policy( _seed: int, env_name: str, total_timesteps: int, *, log_dir: str = None, num_vec: int = 8, parallel: bool = False, max_episode_steps: Optional[int] = None, normalize: bool = True, make_blank_policy_kwargs: dict = {}, reward_type: Optional[str] = None, reward_path: Optional[str] = None, rollout_save_interval: int = 0, rollout_save_final: bool = False, rollout_save_n_timesteps: Optional[int] = None, rollout_save_n_episodes: Optional[int] = None, policy_save_interval: int = -1, policy_save_final: bool = True, ) -> None: """Trains an expert policy from scratch and saves the rollouts and policy. At applicable training steps `step` (where step is either an integer or "final"): - Policies are saved to `{log_dir}/policies/{step}.pkl`. - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`. Args: env_name: The gym.Env name. Loaded as VecEnv. total_timesteps: Number of training timesteps in `model.learn()`. log_dir: The root directory to save metrics and checkpoints to. num_vec: Number of environments in VecEnv. parallel: If True, then use DummyVecEnv. Otherwise use SubprocVecEnv. max_episode_steps: If not None, then environments are wrapped by TimeLimit so that they have at most `max_episode_steps` steps per episode. normalize: If True, then rescale observations and reward. make_blank_policy_kwargs: Kwargs for `make_blank_policy`. reward_type: If provided, then load the serialized reward of this type, wrapping the environment in this reward. This is useful to test whether a reward model transfers. For more information, see `imitation.rewards.serialize.load_reward`. reward_path: A specifier, such as a path to a file on disk, used by reward_type to load the reward model. For more information, see `imitation.rewards.serialize.load_reward`. rollout_save_interval: The number of training updates in between intermediate rollout saves. If the argument is nonpositive, then don't save intermediate updates. rollout_save_final: If True, then save rollouts right after training is finished. rollout_save_n_timesteps: The minimum number of timesteps saved in every file. Could be more than `rollout_save_n_timesteps` because trajectories are saved by episode rather than by transition. Must set exactly one of `rollout_save_n_timesteps` and `rollout_save_n_episodes`. rollout_save_n_episodes: The number of episodes saved in every file. Must set exactly one of `rollout_save_n_timesteps` and `rollout_save_n_episodes`. policy_save_interval: The number of training updates between saves. Has the same semantics are `rollout_save_interval`. policy_save_final: If True, then save the policy right after training is finished. 
""" _validate_traj_generate_params(rollout_save_n_timesteps, rollout_save_n_episodes) with util.make_session(): tf.logging.set_verbosity(tf.logging.INFO) sb_logger.configure(folder=osp.join(log_dir, 'rl'), format_strs=['tensorboard', 'stdout']) rollout_dir = osp.join(log_dir, "rollouts") policy_dir = osp.join(log_dir, "policies") os.makedirs(rollout_dir, exist_ok=True) os.makedirs(policy_dir, exist_ok=True) venv = util.make_vec_env(env_name, num_vec, seed=_seed, parallel=parallel, log_dir=log_dir, max_episode_steps=max_episode_steps) log_callbacks = [] with contextlib.ExitStack() as stack: if reward_type is not None: reward_fn_ctx = load_reward(reward_type, reward_path, venv) reward_fn = stack.enter_context(reward_fn_ctx) venv = RewardVecEnvWrapper(venv, reward_fn) log_callbacks.append(venv.log_callback) tf.logging.info( f"Wrapped env in reward {reward_type} from {reward_path}.") vec_normalize = None if normalize: venv = vec_normalize = VecNormalize(venv) policy = util.init_rl(venv, verbose=1, **make_blank_policy_kwargs) # Make callback to save intermediate artifacts during training. step = 0 def callback(locals_: dict, _) -> bool: nonlocal step step += 1 policy = locals_['self'] # TODO(adam): make logging frequency configurable for callback in log_callbacks: callback(sb_logger) if rollout_save_interval > 0 and step % rollout_save_interval == 0: util.rollout.save(rollout_dir, policy, venv, step, n_timesteps=rollout_save_n_timesteps, n_episodes=rollout_save_n_episodes) if policy_save_interval > 0 and step % policy_save_interval == 0: output_dir = os.path.join(policy_dir, f'{step:05d}') serialize.save_stable_model(output_dir, policy, vec_normalize) return True # Continue training. policy.learn(total_timesteps, callback=callback) # Save final artifacts after training is complete. if rollout_save_final: util.rollout.save(rollout_dir, policy, venv, "final", n_timesteps=rollout_save_n_timesteps, n_episodes=rollout_save_n_episodes) if policy_save_final: output_dir = os.path.join(policy_dir, "final") serialize.save_stable_model(output_dir, policy, vec_normalize)
def main(): global model, best_model_path, last_model_path, sim_joy mission = 'PushStonesEnv' # Change according to algorithm env = gym.make(mission + '-v0').unwrapped # Create log and model dir dir = 'stable_bl/' + mission # dir = 'stable_bl/PushMultipleStones' os.makedirs(dir + '/model_dir/sac', exist_ok=True) jobs = ['train', 'record', 'record-w/hm', 'BC_agent', 'play'] job = jobs[0] name = 'PickUp_40_episodes' pretrain = False fillBuffer = False if job == 'train': # create new folder try: tests = os.listdir(dir + '/model_dir/sac') indexes = [] for item in tests: indexes.append(int(item.split('_')[1])) if not bool(indexes): k = 0 else: k = max(indexes) + 1 except FileNotFoundError: os.makedirs(dir + '/log_dir/sac') k = 0 model_dir = os.getcwd() + '/' + dir + '/model_dir/sac/test_{}'.format( str(k)) best_model_path = model_dir last_model_path = model_dir log_dir = dir + '/log_dir/sac/test_{}'.format(str(k)) logger.configure(folder=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard']) num_timesteps = int(1e6) policy_kwargs = dict(layers=[64, 64, 64]) # SAC - start learning from scratch model = SAC(CnnPolicy, env, gamma=0.99, learning_rate=1e-4, buffer_size=50000, learning_starts=1000, train_freq=1, batch_size=64, tau=0.01, ent_coef='auto', target_update_interval=1, gradient_steps=1, target_entropy='auto', action_noise=None, random_exploration=0.0, verbose=2, tensorboard_log=log_dir, _init_setup_model=True, full_tensorboard_log=True, seed=None, n_cpu_tf_sess=None) # Load best model and continue learning # models = os.listdir(dir + '/model_dir/sac') # models_rew = (model for model in models if 'rew' in model) # ind, reward = [], [] # for model in models_rew: # ind.append(model.split('_')[1]) # reward.append(model.split('_')[3]) # best_reward = max(reward) # best_model_ind = reward.index(best_reward) # k = ind[best_model_ind] # model = SAC.load(dir + '/model_dir/sac/test_' + k + '_rew_' + best_reward, env=env, # custom_objects=dict(learning_starts=0)) # Load last saved model and continue learning # models = os.listdir(dir + '/model_dir/sac') # models_time = (model for model in models if 'rew' not in model) # ind, hour, min = [], [], [] # for model in models_time: # ind.append(model.split('_')[1]) # hour.append(model.split('_')[3]) # min.append(model.split('_')[4]) # date = models_time[0].split('_')[2] # latest_hour = max(hour) # latest_hour_ind = [i for i, n in enumerate(hour) if n == latest_hour] # latest_min = max(min[latest_hour_ind]) # latest_min_ind = min(latest_min) # k = ind[latest_min_ind] # model = SAC.load(dir + '/model_dir/sac/test_' + k + '_' + date + '_' + latest_hour[0] + '_' + latest_min + 'zip', # env=env, custom_objects=dict(learning_starts=0)) # model = SAC.load(dir + '/model_dir/sac/test_0_11_16_2.zip', # env=env, tensorboard_log=log_dir, # custom_objects=dict(learning_starts=0)) #, learning_rate=2e-4, # # train_freq=8, gradient_steps=4, target_update_interval=4)) # # # batch_size=32)) # pretrain if pretrain: # load dataset only once # expert_dataset(name) dataset = ExpertDataset(expert_path=(os.getcwd() + '/' + name + '_dataset.npz'), traj_limitation=-1) model.pretrain(dataset, n_epochs=2000) # fill replay buffer with Benny's recordings if fillBuffer: traj = expert_dataset(name) for i in range(len(traj['obs']) - 1): if traj['episode_starts'][i + 1]: done = True else: done = False obs = traj['obs'][i] action = traj['actions'][i] reward = traj['rewards'][i] next_obs = traj['obs'][i + 1] model.replay_buffer.add(obs, action, reward, next_obs, float(done)) # Test the 
pre-trained model # env = model.get_env() # obs = env.reset() # # reward_sum = 0.0 # for _ in range(1000): # action, _ = model.predict(obs) # obs, reward, done, _ = env.step(action) # reward_sum += reward # if done: # print(reward_sum) # reward_sum = 0.0 # obs = env.reset() # # env.close() # learn model.learn(total_timesteps=num_timesteps, callback=save_fn) # PPO1 # model = PPO1(Common_MlpPolicy, env, gamma=0.99, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, # optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, lam=0.95, adam_epsilon=1e-5, # schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True, # policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1) # TRPO # model = TRPO(MlpPolicy, env, timesteps_per_batch=4096, tensorboard_log=log_dir, verbose=1) # model.learn(total_timesteps=500000) # model.save(log_dir) elif job == 'record': mission = 'PushStonesHeatMapEnv' env = gym.make(mission + '-v0').unwrapped obs = [] actions = [] rewards = [] dones = [] episode_rewards = [] num_episodes = 30 listener = keyboard.Listener(on_press=on_press) listener.start() for episode in range(num_episodes): ob = env.reset() done = False print('Episode number ', episode + 1) episode_reward = 0 while not done: act = "recording" # act = sim_joy # act = [0,1,0.5] new_ob, reward, done, info = env.step(act) # print(info['action']) # print(ob) if recorder_on: obs.append(ob) actions.append(info['action']) rewards.append(reward) dones.append(done) episode_reward = episode_reward + reward ob = new_ob episode_rewards.append(episode_reward) if info['reset reason'] == 'out of boarders' or info[ 'reset reason'] == 'limit time steps': episode -= 1 else: print('saving data') data_saver(obs, actions, rewards, dones, episode_rewards) elif job == 'play': # env = gym.make('PickUpEnv-v0') model = SAC.load(dir + '/model_dir/sac/test_0_11_16_2.zip', env=env, custom_objects=dict(learning_starts=0)) ### ADD NUM for _ in range(2): obs = env.reset() done = False while not done: action, _states = model.predict(obs) obs, reward, done, info = env.step(action)
if policy == "dnn": if algo == "dqn": from stable_baselines.deepq.policies import MlpPolicy else: from stable_baselines.common.policies import MlpPolicy policyFn = MlpPolicy elif policy == "bnn": if algo == "dqn": from dqn_model import BnnPolicy else: from model import BnnPolicy policyFn = BnnPolicy log_dir = f"{algo}-{policy}-{tag}" logger.configure(folder=log_dir) env = gym.make("SlimeVolley-v0") env.atari_mode = True env.survival_bonus = True env.__init__() env.seed(seed) eval_env = gym.make("SlimeVolley-v0") eval_env.atari_mode = True eval_env.__init__() eval_env.seed(seed) eval_callback = EvalCallback(eval_env, best_model_save_path=log_dir, log_path=log_dir, eval_freq=eval_freq,
def train( self, seed: int, communication_queue: Queue = None, current_iteration: int = -1, search_suffix: str = "1", env_variables: EnvVariables = None, random_search: bool = False, ): self._set_global_seed(seed=seed) env_kwargs_to_set = env_variables if env_variables else self.env_kwargs self.logger.debug("env_variables: {}".format(env_kwargs_to_set.get_params_string())) reward_threshold = get_reward_threshold(env_name=self.env_name) best_model_save_path, tensorboard_log_dir = self._preprocess_storage_dirs() if current_iteration != -1 and not self.continue_learning: best_model_save_path = best_model_save_path + "_" + str(current_iteration) self.logger.debug("best_model_save_path: {}".format(best_model_save_path)) if communication_queue or search_suffix != "1": continue_learning_suffix = self.continue_learning_suffix + "_" + search_suffix else: continue_learning_suffix = self.continue_learning_suffix os.environ["OPENAI_LOG_FORMAT"] = "log,csv" if self.continue_learning: os.environ["OPENAI_LOGDIR"] = best_model_save_path + "_" + continue_learning_suffix else: os.environ["OPENAI_LOGDIR"] = best_model_save_path configure() if self.algo_hyperparams: self.logger.debug("Overriding file specified hyperparams with {}".format(eval(self.algo_hyperparams))) hyperparams = eval(self.algo_hyperparams) else: hyperparams = load_hyperparams(algo_name=self.algo_name, env_name=self.env_name, model_suffix=self.model_suffix) (normalize_kwargs, n_envs, n_timesteps, log_every, hyperparams,) = self._preprocess_hyperparams( _hyperparams=hyperparams ) if n_envs > 1 and self.algo_name == "ppo2": # On most env, SubprocVecEnv does not help and is quite memory hungry env = DummyVecEnv( [ make_env_parallel( sb_version=self.sb_version, seed=seed, rank=i, env_name=self.env_name, continue_learning=self.continue_learning, log_dir=best_model_save_path, env_kwargs=env_kwargs_to_set, algo_name=self.algo_name, continue_learning_suffix=continue_learning_suffix, ) for i in range(n_envs) ] ) if len(normalize_kwargs) > 0: env = normalize_env( env=env, vectorize=False, orig_log_dir=best_model_save_path, continue_learning=self.continue_learning, sb_version=self.sb_version, normalize_kwargs=normalize_kwargs, ) else: env = make_custom_env( seed=seed, sb_version=self.sb_version, env_kwargs=env_kwargs_to_set, normalize_kwargs=normalize_kwargs, continue_learning=self.continue_learning, log_dir=best_model_save_path, env_name=self.env_name, algo_name=self.algo_name, continue_learning_suffix=continue_learning_suffix, ) if self.n_eval_episodes > DEFAULT_N_EVAL_EPISODES: analysis_callback = self.build_callback( algo_name=self.algo_name, continue_learning=self.continue_learning, call_every=log_every, eval_callback=self.eval_callback, _reward_threshold=reward_threshold, eval_episodes=self.n_eval_episodes, _eval_env=make_custom_env( seed=seed, continue_learning=self.continue_learning, sb_version=self.sb_version, env_kwargs=env_kwargs_to_set, env_name=self.env_name, log_dir=best_model_save_path, algo_name=self.algo_name, normalize_kwargs=normalize_kwargs, evaluate=True, evaluate_during_learning=True, continue_learning_suffix=continue_learning_suffix, ), original_env=make_custom_env( seed=seed, continue_learning=self.continue_learning, sb_version=self.sb_version, env_kwargs=self.env_kwargs, env_name=self.env_name, log_dir=best_model_save_path, algo_name=self.algo_name, normalize_kwargs=normalize_kwargs, evaluate=True, evaluate_during_learning=True, ), env_name=self.env_name, _best_model_save_path=best_model_save_path, num_envs=n_envs, 
total_timesteps=n_timesteps, continue_learning_suffix=continue_learning_suffix, communication_queue=communication_queue, env_eval_callback=self.env_eval_callback, save_replay_buffer=self.save_replay_buffer, save_model=self.save_model, random_search=random_search, ) else: analysis_callback = self.build_callback( algo_name=self.algo_name, continue_learning=self.continue_learning, call_every=log_every, eval_callback=self.eval_callback, _reward_threshold=reward_threshold, eval_episodes=self.n_eval_episodes, env_name=self.env_name, _best_model_save_path=best_model_save_path, num_envs=n_envs, continue_learning_suffix=continue_learning_suffix, save_replay_buffer=self.save_replay_buffer, save_model=self.save_model, random_search=random_search, ) if self.continue_learning: model = self.create_model( seed=seed, algo_name=self.algo_name, env=env, tensorboard_log_dir=tensorboard_log_dir, hyperparams=hyperparams, best_model_save_path=best_model_save_path, n_timesteps=n_timesteps, continue_learning=True, env_name=self.env_name, model_to_load=self.model_to_load, save_replay_buffer=self.save_replay_buffer, ) else: model = self.create_model( seed=seed, algo_name=self.algo_name, env=env, tensorboard_log_dir=tensorboard_log_dir, hyperparams=hyperparams, env_name=self.env_name, n_timesteps=n_timesteps, model_to_load=self.model_to_load, save_replay_buffer=self.save_replay_buffer, ) try: callback_list = [analysis_callback] # if len(normalize_kwargs) > 0 and not self.continue_learning: # callback_list = [self._build_vec_normalize_callback(save_path=best_model_save_path, # log_every=log_every), analysis_callback] if self.show_progress_bar: with ProgressBarManager(total_timesteps=n_timesteps, sb_version=self.sb_version) as progress_callback: callback_list.append(progress_callback) if self.continue_learning and self.log_to_tensorboard: model.learn( total_timesteps=n_timesteps, callback=callback_list, tb_log_name=self.tb_log_name + "_" + continue_learning_suffix, ) else: model.learn( total_timesteps=n_timesteps, callback=callback_list, tb_log_name=self.tb_log_name, ) else: if self.continue_learning and self.log_to_tensorboard: model.learn( total_timesteps=n_timesteps, callback=callback_list, tb_log_name=self.tb_log_name + "_" + continue_learning_suffix, ) else: self.logger.debug("Model learn start...") model.learn( total_timesteps=n_timesteps, callback=callback_list, tb_log_name=self.tb_log_name, ) self.logger.debug("Model learn end") except KeyboardInterrupt: pass finally: if len(normalize_kwargs) > 0 and not self.continue_learning: # Important: save the running average, for testing the agent we need that normalization model.get_vec_normalize_env().save(os.path.join(best_model_save_path, "vecnormalize.pkl")) # Release resources env.close()
def create_test_env(env_id, n_envs=1, is_atari=False, stats_path=None, seed=0, log_dir='', should_render=True, hyperparams=None): """ Create environment for testing a trained agent :param env_id: (str) :param n_envs: (int) number of processes :param is_atari: (bool) :param stats_path: (str) path to folder containing saved running averaged :param seed: (int) Seed for random number generator :param log_dir: (str) Where to log rewards :param should_render: (bool) For Pybullet env, display the GUI :param env_wrapper: (type) A subclass of gym.Wrapper to wrap the original env with :param hyperparams: (dict) Additional hyperparams (ex: n_stack) :return: (gym.Env) """ # HACK to save logs if log_dir is not None: os.environ["OPENAI_LOG_FORMAT"] = 'csv' os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir) os.makedirs(log_dir, exist_ok=True) logger.configure() # Create the environment and wrap it if necessary env_wrapper = get_wrapper_class(hyperparams) if 'env_wrapper' in hyperparams.keys(): del hyperparams['env_wrapper'] if is_atari: print("Using Atari wrapper") env = make_atari_env(env_id, num_env=n_envs, seed=seed) # Frame-stacking with 4 frames env = VecFrameStack(env, n_stack=4) elif n_envs > 1: # start_method = 'spawn' for thread safe env = SubprocVecEnv([ make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper) for i in range(n_envs) ]) # Pybullet envs does not follow gym.render() interface elif "Bullet" in env_id: spec = gym.envs.registry.env_specs[env_id] try: class_ = load(spec.entry_point) except AttributeError: # Backward compatibility with gym class_ = load(spec._entry_point) # HACK: force SubprocVecEnv for Bullet env that does not # have a render argument render_name = None use_subproc = 'renders' not in inspect.getfullargspec( class_.__init__).args if not use_subproc: render_name = 'renders' # Dev branch of pybullet # use_subproc = use_subproc and 'render' not in inspect.getfullargspec(class_.__init__).args # if not use_subproc and render_name is None: # render_name = 'render' # Create the env, with the original kwargs, and the new ones overriding them if needed def _init(): # TODO: fix for pybullet locomotion envs env = class_(**{**spec._kwargs}, **{render_name: should_render}) env.seed(0) if log_dir is not None: env = Monitor(env, os.path.join(log_dir, "0"), allow_early_resets=True) return env if use_subproc: env = SubprocVecEnv([ make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper) ]) else: env = DummyVecEnv([_init]) else: env = DummyVecEnv( [make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)]) # Load saved stats for normalizing input and rewards # And optionally stack frames if stats_path is not None: if hyperparams['normalize']: print("Loading running average") print("with params: {}".format(hyperparams['normalize_kwargs'])) env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs']) env.load_running_average(stats_path) n_stack = hyperparams.get('frame_stack', 0) if n_stack > 0: print("Stacking {} frames".format(n_stack)) env = VecFrameStack(env, n_stack) return env
os.makedirs(top_log_dir, exist_ok=True)

test_num = 1
for hyperparams in hyperparams_list:
    hyperparam_log_dir = os.path.join(top_log_dir, hyper_file_name(hyperparams))
    os.makedirs(hyperparam_log_dir, exist_ok=True)
    print("Beginning test", test_num, "of", len(hyperparams_list))
    begin_perm_time = datetime.now()

    for i in range(5, 10):
        run_dir = os.path.join(hyperparam_log_dir, "run_" + str(i) + "_monitor_dir")
        hyperparamfilename = os.path.join(run_dir, "hyperparams.txt")
        if os.path.exists(hyperparamfilename):
            continue
        os.makedirs(run_dir, exist_ok=True)
        checkpoint_dir = os.path.join(run_dir, "model_checkpoints")
        os.makedirs(checkpoint_dir, exist_ok=True)

        logger.configure(run_dir)
        env = create_env(n_envs=n_envs, env_name=env_name, log_dir=run_dir)

        model = RLAgent('MlpPolicy', env, verbose=0,
                        **hyperparams).learn(total_timesteps=timesteps,
                                             callback=callback)
        # os.path.join avoids gluing the filename directly onto the directory name
        model.save(os.path.join(run_dir, "final_agent.pkl"))
        del model
        del env
        gc.collect()

        hyperparamfile = open(hyperparamfilename, 'w')
        hyperparamfile.write(str(hyperparams))
        hyperparamfile.write("\nn_envs = {}\n".format(n_envs))
        hyperparamfile.write("RLAgent = {}\n".format(RLAgent))
        hyperparamfile.write("Env = {}\n".format(args.env))
        hyperparamfile.close()

    print("time remaining:",
          (datetime.now() - begin_perm_time) * (len(hyperparams_list) - test_num))
    test_num += 1
lr = args.lr
kwargs = kwargs_map[args.agent]
kwargs['learning_rate'] = lr
# kwargs['max_grad_norm'] = args.max_grad_norm
# kwargs['kfac_clip'] = args.kfac_clip
# kwargs['vf_coef'] = args.vf_coef
# kwargs['ent_coef'] = args.ent_coef
# kwargs['n_steps'] = args.n_steps

start_time = datetime.now()

log_dir = os.path.join("training_logs", run_name)
checkpoint_dir = os.path.join(log_dir, "model_checkpoints")
os.makedirs(log_dir, exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)

logger.configure(log_dir)
env = create_env(n_envs=args.n_envs, env_name=args.env, log_dir=log_dir)
RLAgent = AGENTS_MAP[args.agent]

hyperparamfilename = os.path.join(log_dir, "hyperparams.txt")
hyperparamfile = open(hyperparamfilename, 'w')
hyperparamfile.write(str(kwargs))
hyperparamfile.write("\nn_envs = {}\n".format(args.n_envs))
hyperparamfile.write("RLAgent = {}\n".format(RLAgent))
hyperparamfile.write("Env = {}\n".format(args.env))
hyperparamfile.close()

model = RLAgent('MlpPolicy', env, verbose=1,
                **kwargs).learn(total_timesteps=total_timesteps,
                                callback=callback)
# os.path.join avoids gluing the filename directly onto the directory name
model.save(os.path.join(log_dir, "final_agent.pkl"))
# env.save("trained_agents/env_" + run_name)

print(kwargs)
def do_ppos(ppos_args, result, intermediate_data_dir, origin_param):
    import shutil

    ppos_args.alg = "ppo_subspace"
    logger.log(f"#######TRAIN: {ppos_args}")

    this_run_dir = get_dir_path_for_this_run(ppos_args)
    if os.path.exists(this_run_dir):
        shutil.rmtree(this_run_dir)
    os.makedirs(this_run_dir)

    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)

    if os.path.exists(full_param_traj_dir_path):
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(save_dir):
        shutil.rmtree(save_dir)
    os.makedirs(save_dir)

    run_info = {"full_param_traj_dir_path": full_param_traj_dir_path}

    logger.configure(log_dir)

    tic = time.time()

    def make_env():
        env_out = gym.make(ppos_args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if ppos_args.normalize:
        env = VecNormalize(env)

    set_global_seeds(ppos_args.seed)
    policy = MlpMultPolicy

    model = PPO2(policy=policy, env=env, n_steps=ppos_args.n_steps,
                 nminibatches=ppos_args.nminibatches, lam=0.95, gamma=0.99,
                 noptepochs=10, ent_coef=0.0, learning_rate=3e-4, cliprange=0.2,
                 policy_kwargs={"num_comp": len(result["first_n_pcs"])},
                 pcs=result["first_n_pcs"], origin_theta=origin_param)
    model.tell_run_info(run_info)

    eprews, optimization_path = model.learn(total_timesteps=ppos_args.ppos_num_timesteps,
                                            give_optimization_path=True)

    toc = time.time()
    logger.log(f"####################################PPOS took {toc - tic} seconds")

    moving_ave_rewards = get_moving_aves(eprews, 100)

    return eprews, moving_ave_rewards, optimization_path
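A hedged sketch of how do_ppos might be driven; every value below is a placeholder, and in the original project the principal components and origin parameters would come from a PCA over saved parameter trajectories rather than random arrays.

from argparse import Namespace
import numpy as np

ppos_args = Namespace(env='Hopper-v2', seed=0, normalize=True,      # hypothetical settings
                      n_steps=2048, nminibatches=32, ppos_num_timesteps=100000)
result = {'first_n_pcs': np.random.randn(2, 1000)}                  # placeholder principal components
origin_param = np.zeros(1000)                                       # placeholder origin parameter vector
eprews, moving_ave_rewards, optimization_path = do_ppos(
    ppos_args, result, intermediate_data_dir='intermediate_data', origin_param=origin_param)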
def create_test_env(env_id, n_envs=1, is_atari=False,
                    stats_path=None, seed=0,
                    log_dir='', should_render=True, hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack, env_wrapper)
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    if hyperparams is None:
        hyperparams = {}

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method = 'spawn' for thread safety
        env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper)
                             for i in range(n_envs)])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
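This second version adds the pickle-based VecNormalize format while keeping the legacy path. The sketch below isolates that loading logic; venv and stats_path are assumed to already exist, and the function name is introduced here for illustration only.

import os

from stable_baselines.common.vec_env import VecNormalize


def load_normalize_stats(venv, stats_path):
    # Prefer the newer pickle-based statistics if present
    pkl_path = os.path.join(stats_path, 'vecnormalize.pkl')
    if os.path.exists(pkl_path):
        venv = VecNormalize.load(pkl_path, venv)
        venv.training = False      # evaluation only
        venv.norm_reward = False   # report raw rewards
    else:
        # Legacy .npy running averages
        venv = VecNormalize(venv, training=False)
        venv.load_running_average(stats_path)
    return venv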
def train(env_id, algo, num_timesteps, seed, sgd_steps, t_pi, t_c, lam, log,
          expert_path, pretrain, pretrain_epochs, mdpo_update_steps, num_trajectories,
          expert_model, exploration_bonus, bonus_coef, random_action_len,
          is_action_features, dir_name, neural, lipschitz, args):
    """
    Train an expert (SAC) or an imitation-learning model on a MuJoCo environment.

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        # from mpi4py import MPI
        # rank = MPI.COMM_WORLD.Get_rank()
        rank = 0
        env_name = env_id[:-3].lower()
        log_dir = ('./experiments/' + env_name + '/' + str(algo).lower() + '/'
                   + 'tpi' + str(t_pi) + '_tc' + str(t_c) + '_lam' + str(lam))
        log_dir += '_' + dir_name + '/'

        log_name = str(algo) + '_updateSteps' + str(mdpo_update_steps)
        # log_name += '_randLen' + str(random_action_len)
        if exploration_bonus:
            log_name += '_exploration' + str(bonus_coef)
        if pretrain:
            log_name += '_pretrain' + str(pretrain_epochs)
        if not is_action_features:
            log_name += "_states_only"
        log_name += '_s' + str(seed)

        log_path = log_dir + log_name
        expert_path = './experts/' + expert_path
        num_timesteps = int(num_timesteps)
        args = args.__dict__

        dir_path = os.getcwd() + log_dir[1:]
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        with open(os.getcwd() + log_dir[1:] + 'args.txt', 'w') as file:
            file.write("Experiment Arguments:")
            for key, val in args.items():
                print(key, ": ", val, file=file)

        if log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        # workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        # env = make_mujoco_env(env_id, workerseed)
        def make_env():
            # env_out = gym.make(env_id, reset_noise_scale=1.0)
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
            env_out.seed(seed)
            env_out = wrap_mujoco(env_out, random_action_len=random_action_len)
            return env_out

        # The vectorized env is needed by every branch below
        env = DummyVecEnv([make_env])
        # env = VecNormalize(env)

        train_expert = (algo == 'Train')
        evaluate_expert = (algo == 'Evaluate')

        if train_expert:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)

            if num_timesteps > 0:
                model = SAC('MlpPolicy', env_id, verbose=1, buffer_size=1000000,
                            batch_size=256, ent_coef='auto', train_freq=1, tau=0.01,
                            gradient_steps=1, learning_starts=10000)
            else:
                model = SAC.load(expert_model, env)

            generate_expert_traj(model, expert_path, n_timesteps=num_timesteps,
                                 n_episodes=num_trajectories)
            if num_timesteps > 0:
                model.save('sac_' + env_name + '_' + str(num_timesteps))
        elif evaluate_expert:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)
            model = SAC.load(expert_model, env)
            generate_expert_traj(model, expert_path, n_timesteps=num_timesteps,
                                 n_episodes=10, evaluate=True)
        else:
            expert_path = expert_path + '.npz'
            dataset = ExpertDataset(expert_path=expert_path, traj_limitation=10, verbose=1)

            if algo == 'MDAL':
                model = MDAL_MDPO_OFF('MlpPolicy', env, dataset, verbose=1,
                                      tensorboard_log="./experiments/" + env_name + "/mdal/",
                                      seed=seed, buffer_size=1000000, ent_coef=0.0,
                                      learning_starts=10000, batch_size=256, tau=0.01,
                                      gamma=0.99, gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps, lam=0.0,
                                      train_freq=1, d_step=10, tsallis_q=1,
                                      reparameterize=True, t_pi=t_pi, t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      neural=neural, lipschitz=lipschitz)
            elif algo == 'MDAL_ON_POLICY':
                model = MDAL_MDPO_ON('MlpPolicy', env, dataset, verbose=1,
                                     timesteps_per_batch=2048,
                                     tensorboard_log="./experiments/" + env_name + "/mdal_mdpo_on/",
                                     seed=seed, max_kl=0.01, cg_iters=10, cg_damping=0.1,
                                     entcoeff=0.0, adversary_entcoeff=0.001, gamma=0.99,
                                     lam=0.95, vf_iters=5, vf_stepsize=1e-3,
                                     sgd_steps=sgd_steps, klcoeff=1.0,
                                     method="multistep-SGD", tsallis_q=1.0,
                                     t_pi=t_pi, t_c=t_c,
                                     exploration_bonus=exploration_bonus,
                                     bonus_coef=bonus_coef,
                                     is_action_features=is_action_features,
                                     neural=neural)
            elif algo == 'MDAL_TRPO':
                model = MDAL_TRPO('MlpPolicy', env, dataset, verbose=1,
                                  tensorboard_log="./experiments/" + env_name + "/mdal_trpo/",
                                  seed=seed, gamma=0.99, g_step=3, d_step=5,
                                  sgd_steps=1, d_stepsize=9e-5, entcoeff=0.0,
                                  adversary_entcoeff=0.001, max_kl=t_pi, t_pi=t_pi, t_c=t_c,
                                  exploration_bonus=exploration_bonus, bonus_coef=bonus_coef,
                                  is_action_features=is_action_features, neural=neural,
                                  lam=0.98, timesteps_per_batch=2000, lipschitz=lipschitz)
            elif algo == 'GAIL':
                from mpi4py import MPI
                from stable_baselines import GAIL
                model = GAIL('MlpPolicy', env, dataset, verbose=1,
                             tensorboard_log="./experiments/" + env_name + "/gail/",
                             seed=seed, entcoeff=0.0, adversary_entcoeff=0.001,
                             lipschitz=lipschitz)
            elif algo == 'GAIL_MDPO_OFF':
                # from mpi4py import MPI
                from stable_baselines import GAIL_MDPO_OFF
                model = GAIL_MDPO_OFF('MlpPolicy', env, dataset, verbose=1,
                                      tensorboard_log="./experiments/" + env_name + "/gail_mdpo_off/",
                                      seed=seed, ent_coef=0.0, adversary_entcoeff=0.001,
                                      buffer_size=1000000, learning_starts=10000,
                                      batch_size=256, tau=0.01, gamma=0.99,
                                      gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps, lam=0.0,
                                      train_freq=1, tsallis_q=1, reparameterize=True,
                                      t_pi=t_pi, t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      lipschitz=lipschitz)
            else:
                raise ValueError("Not a valid algorithm.")

            if pretrain:
                model.pretrain(dataset, n_epochs=pretrain_epochs)

            model.learn(total_timesteps=num_timesteps, tb_log_name=log_name)

        env.close()
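A hedged example of calling the train function above; all hyperparameter values and names are placeholders, and cli_args stands in for the argparse namespace the real script would pass.

from argparse import Namespace

cli_args = Namespace(env_id='Hopper-v2', algo='MDAL', seed=0)  # placeholder namespace, dumped to args.txt
train(env_id='Hopper-v2', algo='MDAL', num_timesteps=1e6, seed=0,
      sgd_steps=1, t_pi=0.1, t_c=0.01, lam=0.0, log=True,
      expert_path='hopper_expert', pretrain=False, pretrain_epochs=0,
      mdpo_update_steps=10, num_trajectories=10, expert_model='sac_hopper_1000000',
      exploration_bonus=False, bonus_coef=0.0, random_action_len=0,
      is_action_features=True, dir_name='test', neural=False, lipschitz=1.0,
      args=cli_args)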
import sys
sys.path.append('/home/frcvision1/Final/My_Environments/Carla-0.9.4')
sys.path.append('/home/frcvision1/Final/learning-to-drive-in-a-day-carla-0.9')

import os

import numpy as np

from stable_baselines import logger
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ppo2.ppo2 import PPO2

from ppo_with_vae import PPOWithVAE
from vae.controller import VAEController

vae = VAEController()
PATH_MODEL_VAE = "vae.json"
logger.configure(folder='/tmp/ppo_carla2/')
PATH_MODEL_PPO2 = "carla_ppo2_with_vae_500_2mil"


def make_carla_env():
    """
    Import the package for the Carla env. This package calls the __init__ that
    registers the environment. Done this way to stay consistent with gym.
    """
    sys.path.append('/home/frcvision1/Final/My_Environments/Carla_new')
    from env3 import CarlaEnv
    env = CarlaEnv()
    env = DummyVecEnv([lambda: env])
    return env


env = make_carla_env()
def launch(env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval,
           clip_return, override_params=None, save_policies=True):
    """
    launch training with mpi

    :param env: (str) environment ID
    :param logdir: (str) the log directory
    :param n_epochs: (int) the number of training epochs
    :param num_cpu: (int) the number of CPUs to run on
    :param seed: (int) the initial random seed
    :param replay_strategy: (str) the type of replay strategy ('future' or 'none')
    :param policy_save_interval: (int) the interval with which policy pickles are saved.
        If set to 0, only the best and latest policy will be pickled.
    :param clip_return: (float) clip returns to be in [-clip_return, clip_return]
    :param override_params: (dict) override any parameter for training
    :param save_policies: (bool) whether or not to save the policies
    """
    if override_params is None:
        override_params = {}

    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        tf_util.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(folder=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as file_handler:
        json.dump(params, file_handler)
    params = config.prepare_params(params)
    config.log_params(params, logger_input=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/stable_baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        # 'use_demo_states': True,
        'compute_q': False,
        'time_horizon': params['time_horizon'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        # 'use_demo_states': False,
        'compute_q': True,
        'time_horizon': params['time_horizon'],
    }

    for name in ['time_horizon', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(policy=policy, rollout_worker=rollout_worker, evaluator=evaluator,
          n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies)
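A hedged, single-worker invocation of the launch function above; the environment ID, log directory, and epoch count are placeholders.

launch(env='FetchReach-v1', logdir='logs/her_fetchreach', n_epochs=50, num_cpu=1,
       seed=0, replay_strategy='future', policy_save_interval=5, clip_return=50.)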
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--critic-l2-reg', type=float, default=1e-2)
    parser.add_argument('--batch-size', type=int, default=64)  # per MPI worker
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    boolean_flag(parser, 'enable-popart', default=False)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--reward-scale', type=float, default=1.)
    parser.add_argument('--clip-norm', type=float, default=None)
    parser.add_argument('--nb-train-steps', type=int, default=50)  # per epoch cycle and MPI worker
    parser.add_argument('--nb-eval-steps', type=int, default=100)  # per epoch cycle and MPI worker
    parser.add_argument('--nb-rollout-steps', type=int, default=100)  # per epoch cycle and MPI worker
    # choices are adaptive-param_xx, ou_xx, normal_xx, none
    parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2')
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    boolean_flag(parser, 'evaluation', default=False)
    args = parser.parse_args()
    dict_args = vars(args)
    return dict_args


if __name__ == '__main__':
    args = parse_args()
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.configure()
    # Run actual script.
    run(**args)
def main():
    """
    Runs the test
    """
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path',
                        default="/cvgl2/u/surajn/workspace/saved_models/sawyerlift_ppo2/model")
    parser.add_argument('--images', default=False)
    args = parser.parse_args()

    logger.configure()

    if not args.play:
        model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
                           model_path=args.model_path, images=args.images)

    if args.play:
        def make_env():
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_camera_obs=False,  # do not use pixel observations
                    has_offscreen_renderer=False,  # not needed since not using pixel obs
                    has_renderer=True,  # make sure we can render to the screen
                    reward_shaping=True,  # use dense rewards
                    control_freq=10,  # control should happen fast enough so that simulation looks smooth
                ))
            env_out.reward_range = None
            env_out.metadata = None
            env_out.spec = None
            env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
            return env_out

        # env = make_env()
        env = DummyVecEnv([make_env])
        env = VecNormalize(env)

        policy = MlpPolicy
        # model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0,
        #              optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99,
        #              lam=0.95, schedule='linear', verbose=1)
        model = TRPO(MlpPolicy, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                     cg_damping=0.1, entcoeff=0.0, gamma=0.99, lam=0.98, vf_iters=5,
                     vf_stepsize=1e-3)
        model.load(args.model_path)

        logger.log("Running trained model")
        obs = np.zeros((env.num_envs,) + env.observation_space.shape)
        obs[:] = env.reset()
        while True:
            env.render()
            actions = model.step(obs)[0]
            obs[:] = env.step(actions)[0]