# NOTE(review): Axes3D is imported for its side effect of registering the '3d'
# projection with matplotlib; the name itself is not used in this chunk.
from mpl_toolkits.mplot3d import Axes3D
from toolsm import tools  # (duplicate `from toolsm import tools` removed)
import numpy as np
import baselines.common.tf_util as U
# path_root = '/media/d/e/et/baselines'
from baselines.common.distributions import DiagGaussianPd
import os
import pandas as pd

# Number of decimal places used when tabulating actions.
TabularActionPrecision = 5

import baselines
from toolsm import logger

# Root directory for the KL2Clip experiment results.
path_root = logger.get_logger_dir('baselines/KL2Clip', baselines, 'results/KL2Clip')
# print(path_root)
# exit()

# Batch-normalization hyper-parameters (same values as the TF official ResNet model).
_BATCH_NORM_DECAY = 0.997
_BATCH_NORM_EPSILON = 1e-5


def batch_norm_relu(inputs, is_training):
    """Perform a batch normalization followed by a ReLU.

    Args:
        inputs: input tensor to normalize.
        is_training: Python bool or bool tensor, forwarded to
            ``tf.layers.batch_normalization(training=...)`` so batch statistics
            are used in training and moving averages at inference.

    Returns:
        The batch-normalized, ReLU-activated tensor.
    """
    # tensorflow is imported locally: this file only imports tf inside
    # functions, and the original body referenced an undefined global `tf`.
    import tensorflow as tf
    # We set fused=True for a significant performance boost. See
    # https://www.tensorflow.org/performance/performance_guide#common_fused_ops
    inputs = tf.layers.batch_normalization(
        inputs=inputs, momentum=_BATCH_NORM_DECAY, epsilon=_BATCH_NORM_EPSILON,
        center=True, scale=True, training=is_training, fused=True)
    # Bug fix: the original stopped after the normalization and implicitly
    # returned None — the ReLU promised by the name/docstring was missing.
    inputs = tf.nn.relu(inputs)
    return inputs
def main() -> None:
    """Parse CLI args, resolve built-in defaults, prepare log/model dirs and
    vectorized environments, then launch PPO training via ``ppo2.learn``.

    NOTE(review): this function is shadowed by the second ``def main()``
    defined later in this file; as written, this version is dead code.
    NOTE(review): ``arg_parser_common``, ``MUJOCO``, ``ATARI``,
    ``alg2cliptype``, ``ClipType``, ``tools_logger``, ``osp`` and ``bench``
    are not defined in this chunk — presumably imported/defined elsewhere in
    the file; verify before reuse.
    """
    parser, args_default = arg_parser_common()
    args = parser.parse_args()
    import json
    from dotmap import DotMap
    from copy import copy, deepcopy
    # Args excluded from the auto-generated run-directory name.
    keys_exclude = [ 'coef_predict_task', 'is_multiprocess', 'n_envs', 'eval_interval', 'n_steps', 'n_minibatches', 'play', 'n_eval_epsiodes', 'force_write', 'kl2clip_sharelogstd','policy_variance_state_dependent', 'kl2clip_clip_clipratio', 'kl2clip_decay', 'lr', 'num_timesteps', 'gradient_rectify', 'rectify_scale','kl2clip_clipcontroltype', 'reward_scale', 'coef_predict_task','explore_additive_rate','explore_additive_threshold','explore_timesteps', 'debug_halfcheetah', 'name_project', 'n_opt_epochs', 'coef_entropy', 'log_interval', 'save_interval', 'save_debug', 'isatari', 'env_full', 'envtype']
    keys_exclude.extend(['logstd','lam','hidden_sizes','num_layers','num_sharing_layers','ac_fn','lam_decay','policy_type'])
    # TODO: These args should not be used as name of dir only if they are specified.
    # TODO: Split args into group_keys and run_keys.

    # -------------------- prepare args
    # args.env keeps the bare name (e.g. 'Hopper'); args.env_full carries the
    # versioned gym id (e.g. 'Hopper-v2').
    args.env_full = args.env
    args.env = args.env_full.split('-v')[0]
    if not args.isatari:
        args.envtype = MUJOCO
        if '-v' not in args.env_full:
            args.env_full = f'{args.env}-v2'  # default MuJoCo version
    else:
        keys_exclude.append('logstd')  # logstd is irrelevant for discrete-action Atari
        args.envtype = ATARI
        # if 'NoFrameskip' not in args.env:
        #     args.env = f''
        if '-v' not in args.env_full:
            args.env_full = f'{args.env}-v4'  # default Atari version
    tools.warn_(f'Run with setting for {args.envtype} task!!!!!')
    # Exactly one of --alg / --cliptype must be given.
    assert bool(args.alg) != bool(args.cliptype), 'Either alg or cliptype should be specified'
    if args.alg:
        # For release: the alg name determines the cliptype.
        args.cliptype = alg2cliptype[args.alg]
        keys_exclude.append('cliptype')
        if len(args.keys_group) ==0:
            args.keys_group = ['alg']
        if args.name_group is None:
            args.name_group = ''
    else:
        # For debug: cliptype/clipargs are given explicitly.
        keys_exclude.append('alg')
        if len(args.keys_group) ==0:
            args.keys_group = ['cliptype','clipargs']
        if args.name_group is None:
            args.name_group = 'tmp'

    # ------ Set the values of args
    def update_dict(dictmain, dictnew):
        """Recursively merge ``dictnew`` into ``dictmain`` (in place).

        Keys of ``dictnew`` starting with '__' are conditional overrides:
        '__<key>' maps a *value* of ``dictmain[<key>]`` to a sub-dict that is
        merged only when that value is current.
        """
        for key_arg in dictnew:
            if key_arg.startswith('__'):
                # The value is customized for a specific setting of another key.
                key_interest = key_arg[2:]  # e.g., '__cliptype' -> 'cliptype'
                # Search value from dictmain, e.g., kl_klrollback_constant_withratio.
                value_interest = dictmain[key_interest]
                if value_interest in dictnew[ key_arg ].keys():
                    dictmain = update_dict( dictmain, dictnew[ key_arg ][value_interest])
            else:
                if isinstance(dictnew[key_arg], dict) and key_arg in dictmain.keys():
                    dictmain[key_arg].update( dictnew[key_arg] )
                else:
                    dictmain[key_arg] = copy( dictnew[key_arg])
        return dictmain

    def reform_specific_dict(d):
        """Split ``d`` into plain and '__'-prefixed entries, then merge the
        conditional ('__') part back via ``update_dict``."""
        dictmain = dict( (k,v) for k,v in d.items() if not k.startswith('__') )
        dictspecific = dict( (k,v) for k,v in d.items() if k.startswith('__') )
        return update_dict( dictmain, dictspecific )

    # If the value of the following args is None, it is set from the built-in
    # defaults below. First drop all None-valued args so defaults can fill in.
    keys_del = []
    args = vars(args)
    keys = list(args.keys())
    for key in keys:
        if args[key] is None:
            del args[key]  # delete so the default value can take its place
            keys_del.append( key )
    if len(keys_del) > 0:
        print( 'The following args are not provided value by the args. They will used built-in values.\n', ', '.join(keys_del) )
    # args__ = update_dict( copy(args_default), args ) # We need to update the basic args, e.g., env, cliptype
    # args__ = reform_specific_dict( args__)
    # The following operations may seem strange. Maybe a clearer version will come in the future.
    args__ = update_dict( deepcopy(args), args_default )  # generate the default values from args_default
    args = update_dict( args__, args )  # the explicitly provided values have the highest priority
    for key in keys_del:  # make sure that keys_del are within args.keys()
        assert key in args.keys(), key
    # print( json.dumps(args, indent=True) )
    # exit()
    # TODO prepare_dir: change .finish_indicator to finish_indicator, which is clearer.

    # --- prepare dir
    import baselines
    # NOTE(review): argument order here ('baselines', 'results', baselines)
    # differs from the other get_logger_dir calls in this file, which pass
    # (name, module, results) — confirm this is intended.
    root_dir = tools_logger.get_logger_dir( 'baselines', 'results', baselines )
    args = tools_logger.prepare_dirs( args, key_first='env', keys_exclude=keys_exclude, dirs_type=['log' ], root_dir=root_dir )

    # --- prepare args for use
    args.cliptype = ClipType[ args.cliptype ]
    args.zip_dirs = ['model','monitor']
    for d in args.zip_dirs:
        args[f'{d}_dir'] = osp.join(args.log_dir, d)
        os.mkdir( args[f'{d}_dir'] )
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2_AdaClip import ppo2
    # from baselines.ppo2_AdaClip import ppo2_kl2clip_conservative as ppo2
    import baselines.ppo2_AdaClip.policies as plcs
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # silence TF info/warning logs
    config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=ncpu, inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    # Enter a default TF session for the rest of the process.
    tf.Session(config=config).__enter__()
    set_global_seeds(args.seed)
    policy = getattr(plcs, args.policy_type)

    # ------ prepare env
    # args.eval_model = args.n_eval_epsiodes > 0
    if args.envtype == MUJOCO:
        def make_mujoco_env(rank=0):
            """Return a thunk creating a seeded, Monitor-wrapped gym env."""
            def _thunk():
                env = gym.make(args.env_full)
                env.seed(args.seed + rank)
                env = bench.Monitor(env, os.path.join(args.log_dir, 'monitor', str(rank)), allow_early_resets=True)
                return env
            return _thunk
        if args.n_envs == 1:
            env = DummyVecEnv([make_mujoco_env()])
        else:
            from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
            env = SubprocVecEnv([make_mujoco_env(i) for i in range(args.n_envs)])
        env = VecNormalize(env, reward_scale=args.reward_scale)
        env_test = None
        if args.n_eval_epsiodes > 0:
            if args.n_eval_epsiodes == 1:
                env_test = DummyVecEnv([make_mujoco_env()])
            else:
                from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
                env_test = SubprocVecEnv([make_mujoco_env(i) for i in range(args.n_eval_epsiodes)])
            # The eval env does not need to normalize the return, nor update
            # the normalization statistics.
            env_test = VecNormalize(env_test, ret=False, update=False)
    else:
        from baselines.common.vec_env.vec_frame_stack import VecFrameStack
        from baselines.common.cmd_util import make_atari_env
        env = VecFrameStack(make_atari_env(args.env_full, num_env=args.n_envs, seed=args.seed), 4)
        env_test = None
        # TODO : debug VecFrame
        if args.n_eval_epsiodes > 0:
            env_test = VecFrameStack(make_atari_env(args.env_full, num_env=args.n_eval_epsiodes, seed=args.seed), 4)
            # env_test.reset()
            # env_test.render()

    # ----------- learn
    if args.envtype == MUJOCO:
        lr = args.lr  # constant learning rate for MuJoCo
        # cliprange = args.clipargs.cliprange
    elif args.envtype == ATARI:
        lr = lambda f: f * args.lr  # linearly annealed learning rate for Atari
        # cliprange = lambda f: f*args.clipargs.cliprange if args.clipargs.cliprange is not None else None
    # print('action_space',env.action_space)
    ppo2.learn(policy=policy, env=env, env_eval=env_test, n_steps=args.n_steps, nminibatches=args.n_minibatches,
               lam=args.lam, gamma=0.99, n_opt_epochs=args.n_opt_epochs, log_interval=args.log_interval,
               ent_coef=args.coef_entropy, lr=lr, total_timesteps=args.num_timesteps, cliptype=args.cliptype,
               save_interval=args.save_interval, args=args)
    # Mark the run directory as complete.
    tools_logger.finish_dir( args.log_dir )
def main() -> None:
    """Parse CLI args, fill per-env-type defaults, prepare directories and
    vectorized environments, then launch PPO training via ``ppo2.learn``.

    NOTE(review): this is the second ``def main()`` in the file; it shadows
    the earlier definition, which appears to be an older revision.
    NOTE(review): ``arg_parser_common``, ``MUJOCO``, ``ATARI``, ``ClipType``,
    ``tools_logger``, ``osp`` and ``bench`` are not defined in this chunk —
    presumably imported/defined elsewhere in the file; verify.
    """
    parser, clipargs_default_all, args_default_all = arg_parser_common()
    args = parser.parse_args()
    import json
    from dotmap import DotMap
    # Args excluded from the auto-generated run-directory name.
    keys_exclude = [ 'coef_predict_task', 'is_multiprocess', 'n_envs', 'eval_interval', 'n_steps', 'n_minibatches', 'play', 'n_eval_epsiodes', 'force_write', 'kl2clip_sharelogstd', 'policy_variance_state_dependent', 'kl2clip_clip_clipratio', 'kl2clip_decay', 'lr', 'num_timesteps', 'gradient_rectify', 'rectify_scale', 'kl2clip_clipcontroltype', 'reward_scale', 'coef_predict_task', 'explore_additive_rate', 'explore_additive_threshold', 'explore_timesteps', 'debug_halfcheetah', 'name_project', 'env_pure', 'n_opt_epochs', 'coef_entropy', 'log_interval', 'save_interval', 'save_debug', 'is_atari' ]  # 'is_atari'

    # -------------------- prepare args
    # env_pure is the bare env name without the gym version suffix.
    args.env_pure = args.env.split('-v')[0]
    # env_mujocos = 'InvertedPendulum,InvertedDoublePendulum,HalfCheetah,Hopper,Walker2d,Ant,Reacher,Swimmer,Humanoid'
    # env_mujocos = tools.str2list(env_mujocos)
    if not args.is_atari:
        env_type = MUJOCO
        if '-v' not in args.env:
            args.env = f'{args.env}-v2'  # default MuJoCo version
    else:
        env_type = ATARI
        if '-v' not in args.env:
            args.env = f'{args.env}-v4'  # default Atari version
    tools.warn_(f'Run with setting for {env_type} task!!!!!')

    # --- set value of clipargs
    # Start from the per-env-type defaults for this cliptype, then overlay
    # whatever the user passed explicitly.
    clipargs_default = clipargs_default_all[env_type]
    clipargs = clipargs_default[args.cliptype].copy()
    clipargs.update(args.clipargs)
    args.clipargs = clipargs

    # --- prepare other args
    # If the value of the following args is None, it is set from the built-in
    # defaults; per-env overrides win over the '_default' entry.
    args_default = args_default_all[env_type]
    args = DotMap(vars(args))
    print( "The followng arg value is None, thus they are setted by built-in value:" )
    for argname in args_default.keys():
        if args[argname] is None:
            if args.env_pure in args_default[argname].keys():
                args[argname] = args_default[argname][args.env_pure]
            else:
                args[argname] = args_default[argname]['_default']
            print(f"{argname}={args[argname]}")
    # print( json.dumps( args.toDict(), indent='\t') )
    # exit()
    # TODO prepare_dir: change .finish_indicator to finish_indicator, which is clearer.

    # --- prepare dir
    import baselines
    root_dir = tools_logger.get_logger_dir('baselines', baselines, 'results')
    args = tools_logger.prepare_dirs(args, key_first='env', keys_exclude=keys_exclude, dirs_type=['log'], root_dir=root_dir)

    # --- prepare args for use
    args.cliptype = ClipType[args.cliptype]
    args.zip_dirs = ['model', 'monitor']
    for d in args.zip_dirs:
        args[f'{d}_dir'] = osp.join(args.log_dir, d)
        os.mkdir(args[f'{d}_dir'])
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2_AdaClip import ppo2
    # from baselines.ppo2_AdaClip import ppo2_kl2clip_conservative as ppo2
    import baselines.ppo2_AdaClip.policies as plcs
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # silence TF info/warning logs
    config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=ncpu, inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    # Enter a default TF session for the rest of the process.
    tf.Session(config=config).__enter__()
    set_global_seeds(args.seed)
    policy = getattr(plcs, args.policy_type)

    # ------ prepare env
    # args.eval_model = args.n_eval_epsiodes > 0
    if env_type == MUJOCO:
        def make_mujoco_env(rank=0):
            """Return a thunk creating a seeded, Monitor-wrapped gym env."""
            def _thunk():
                env = gym.make(args.env)
                env.seed(args.seed + rank)
                env = bench.Monitor(env, os.path.join(args.log_dir, 'monitor', str(rank)), allow_early_resets=True)
                return env
            return _thunk
        if args.n_envs == 1:
            env = DummyVecEnv([make_mujoco_env()])
        else:
            from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
            env = SubprocVecEnv( [make_mujoco_env(i) for i in range(args.n_envs)])
        env = VecNormalize(env, reward_scale=args.reward_scale)
        env_test = None
        if args.n_eval_epsiodes > 0:
            if args.n_eval_epsiodes == 1:
                env_test = DummyVecEnv([make_mujoco_env()])
            else:
                from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
                env_test = SubprocVecEnv(
                    [make_mujoco_env(i) for i in range(args.n_eval_epsiodes)])
            # The eval env does not need to normalize the return, nor update
            # the normalization statistics.
            env_test = VecNormalize( env_test, ret=False, update=False)
    else:
        from baselines.common.vec_env.vec_frame_stack import VecFrameStack
        from baselines.common.cmd_util import make_atari_env
        env = VecFrameStack( make_atari_env(args.env, num_env=args.n_envs, seed=args.seed), 4)
        env_test = None
        # TODO : debug VecFrame
        if args.n_eval_epsiodes > 0:
            env_test = VecFrameStack( make_atari_env(args.env, num_env=args.n_eval_epsiodes, seed=args.seed), 4)
            # env_test.reset()
            # env_test.render()

    # ----------- learn
    if env_type == MUJOCO:
        lr = args.lr  # constant learning rate for MuJoCo
        # cliprange = args.clipargs.cliprange
    elif env_type == ATARI:
        lr = lambda f: f * args.lr  # linearly annealed learning rate for Atari
        # cliprange = lambda f: f*args.clipargs.cliprange if args.clipargs.cliprange is not None else None
    args.env_type = env_type
    ppo2.learn(policy=policy, env=env, env_eval=env_test, n_steps=args.n_steps, nminibatches=args.n_minibatches, lam=args.lam, gamma=0.99, n_opt_epochs=args.n_opt_epochs, log_interval=args.log_interval, ent_coef=args.coef_entropy, lr=lr, total_timesteps=args.num_timesteps, cliptype=args.cliptype, save_interval=args.save_interval, args=args)
    # Mark the run directory as complete.
    tools_logger.finish_dir(args.log_dir)