def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule)
    env.close()
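# A minimal sketch of how the A2C train() above might be invoked. atari_arg_parser
# comes from baselines.common.cmd_util; the extra flags, defaults, and num_env=16
# are illustrative assumptions, not part of the original snippet.
def main():
    from baselines import logger
    from baselines.common.cmd_util import atari_arg_parser

    parser = atari_arg_parser()
    parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
    parser.add_argument('--lrschedule', choices=['constant', 'linear'], default='constant')
    args = parser.parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
          policy=args.policy, lrschedule=args.lrschedule, num_env=16)

if __name__ == '__main__':
    main()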
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
    env = make_atari_env(env_id, num_cpu, seed)
    if policy == 'cnn':
        policy_fn = AcerCnnPolicy
    elif policy == 'lstm':
        policy_fn = AcerLstmPolicy
    else:
        print("Policy {} not implemented".format(policy))
        return
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule)
    env.close()
def train(env_id, num_timesteps, seed, policy):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    tf.Session(config=config).__enter__()

    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {'cnn': CnnPolicy, 'lstm': LstmPolicy,
              'lnlstm': LnLstmPolicy, 'mlp': MlpPolicy}[policy]
    ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
               lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
               ent_coef=.01,
               lr=lambda f: f * 2.5e-4,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1))
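# Note (an assumption based on the standard ppo2 implementation): the lr and
# cliprange callables above receive the fraction of training remaining, so both
# anneal linearly toward zero; e.g. halfway through training f == 0.5, giving
# lr(f) == 1.25e-4 and cliprange(f) == 0.05.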
def train(env_id, num_timesteps, seed, num_cpu):
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu)
    env.close()
from arguments import get_args
from ppo_agent import ppo_agent
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from models import CNN_Net
from baselines import logger
import os

if __name__ == '__main__':
    args = get_args()
    if not os.path.exists('logs/'):
        os.mkdir('logs/')
    log_path = 'logs/' + args.env_name + '/'
    if not os.path.exists(log_path):
        os.mkdir(log_path)
    # write log information
    logger.configure(dir=log_path)
    envs = VecFrameStack(make_atari_env(args.env_name, args.num_workers, args.seed), 4)
    network = CNN_Net(envs.action_space.n)
    ppo_trainer = ppo_agent(envs, args, network, 'atari')
    ppo_trainer.learn()
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from arguments import achieve_arguments
from a2c_agent import a2c_agent
from baselines import logger

if __name__ == '__main__':
    args = achieve_arguments()
    logger.configure(dir=args.log_dir)
    # create environments
    envs = VecFrameStack(make_atari_env(args.env_name, args.num_processes, args.seed), 4)
    trainer = a2c_agent(envs, args)
    trainer.learn()
    envs.close()
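# A hypothetical sketch of what arguments.achieve_arguments() might provide,
# inferred only from the fields the script above reads (env_name, num_processes,
# seed, log_dir); the flag names and defaults are illustrative assumptions, and
# a real a2c_agent likely needs additional options.
import argparse

def achieve_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env-name', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--num-processes', type=int, default=16)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--log-dir', default='logs/')
    return parser.parse_args()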
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    if env_type == 'mujoco':
        # todo: copy paste from akhil: create session instead of getting session
        get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))
        # always using a dummy environment should allow running saved models without any further changes!
        # env = DummyVecEnv([lambda: make_mujoco_env(env_id, seed, args.reward_scale)])
        if args.num_env:
            # bind i as a default argument so each worker gets its own seed
            env = SubprocVecEnv([
                lambda i=i: make_mujoco_env(env_id,
                                            seed + i if seed is not None else None,
                                            args.reward_scale)
                for i in range(args.num_env)
            ])
        else:
            env = DummyVecEnv([lambda: make_mujoco_env(env_id, seed, args.reward_scale)])
        # uncommented on Akhil's advice, as it is no longer necessary because I'm normalizing the data in my environment!
        env = VecNormalize(env)
    elif env_type == 'atari':
        if alg == 'acer':
            env = make_atari_env(env_id, nenv, seed)
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env, frame_stack=True, scale=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_atari_env(env_id, nenv, seed), frame_stack_size)
    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or 'Level1-1'
        env = retro_wrappers.make_retro(game=args.env, state=gamestate,
                                        max_episode_steps=10000,
                                        use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)
    elif env_type == 'classic_control':
        def make_env():
            e = gym.make(env_id)
            e = bench.Monitor(e, logger.get_dir(), allow_early_resets=True)
            e.seed(seed)
            return e
        env = DummyVecEnv([make_env])
    else:
        raise ValueError('Unknown env_type {}'.format(env_type))

    return env
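# A rough usage sketch for the vectorized env returned by build_env(). The
# _smoke_test helper and the random-action rollout are illustrative assumptions,
# not part of the original script; `args` stands in for the parsed command-line
# namespace used elsewhere.
def _smoke_test(args):
    env = build_env(args)
    obs = env.reset()  # for the stacked Atari branch, obs has shape (num_env, 84, 84, 4)
    for _ in range(10):
        actions = [env.action_space.sample() for _ in range(env.num_envs)]
        obs, rewards, dones, infos = env.step(actions)
    env.close()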
def main():
    parser, clipargs_default_all, args_default_all = arg_parser_common()
    args = parser.parse_args()
    import json
    from dotmap import DotMap
    keys_exclude = [
        'coef_predict_task', 'is_multiprocess', 'n_envs', 'eval_interval',
        'n_steps', 'n_minibatches', 'play', 'n_eval_epsiodes', 'force_write',
        'kl2clip_sharelogstd', 'policy_variance_state_dependent',
        'kl2clip_clip_clipratio', 'kl2clip_decay', 'lr', 'num_timesteps',
        'gradient_rectify', 'rectify_scale', 'kl2clip_clipcontroltype',
        'reward_scale', 'coef_predict_task', 'explore_additive_rate',
        'explore_additive_threshold', 'explore_timesteps', 'debug_halfcheetah',
        'name_project', 'env_pure', 'n_opt_epochs', 'coef_entropy',
        'log_interval', 'save_interval', 'save_debug', 'is_atari'
    ]  # 'is_atari'

    # -------------------- prepare args
    args.env_pure = args.env.split('-v')[0]
    # env_mujocos = 'InvertedPendulum,InvertedDoublePendulum,HalfCheetah,Hopper,Walker2d,Ant,Reacher,Swimmer,Humanoid'
    # env_mujocos = tools.str2list(env_mujocos)
    if not args.is_atari:
        env_type = MUJOCO
        if '-v' not in args.env:
            args.env = f'{args.env}-v2'
    else:
        env_type = ATARI
        if '-v' not in args.env:
            args.env = f'{args.env}-v4'
    tools.warn_(f'Run with setting for {env_type} task!!!!!')

    # --- set value of clipargs
    clipargs_default = clipargs_default_all[env_type]
    clipargs = clipargs_default[args.cliptype].copy()
    clipargs.update(args.clipargs)
    args.clipargs = clipargs

    # --- prepare other args
    # If any of the following args is None, it is filled in from the built-in defaults below.
    args_default = args_default_all[env_type]
    args = DotMap(vars(args))
    print("The following args are None; setting them from built-in defaults:")
    for argname in args_default.keys():
        if args[argname] is None:
            if args.env_pure in args_default[argname].keys():
                args[argname] = args_default[argname][args.env_pure]
            else:
                args[argname] = args_default[argname]['_default']
            print(f"{argname}={args[argname]}")
    # print(json.dumps(args.toDict(), indent='\t'))
    # exit()
    # TODO prepare_dir: rename .finish_indicator to something clearer.
    # --- prepare dir
    import baselines
    root_dir = tools_logger.get_logger_dir('baselines', baselines, 'results')
    args = tools_logger.prepare_dirs(args, key_first='env', keys_exclude=keys_exclude,
                                     dirs_type=['log'], root_dir=root_dir)

    # --- prepare args for use
    args.cliptype = ClipType[args.cliptype]
    args.zip_dirs = ['model', 'monitor']
    for d in args.zip_dirs:
        args[f'{d}_dir'] = osp.join(args.log_dir, d)
        os.mkdir(args[f'{d}_dir'])

    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2_AdaClip import ppo2
    # from baselines.ppo2_AdaClip import ppo2_kl2clip_conservative as ppo2
    import baselines.ppo2_AdaClip.policies as plcs
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    ncpu = 1
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()
    set_global_seeds(args.seed)
    policy = getattr(plcs, args.policy_type)

    # ------ prepare env
    # args.eval_model = args.n_eval_epsiodes > 0
    if env_type == MUJOCO:
        def make_mujoco_env(rank=0):
            def _thunk():
                env = gym.make(args.env)
                env.seed(args.seed + rank)
                env = bench.Monitor(env,
                                    os.path.join(args.log_dir, 'monitor', str(rank)),
                                    allow_early_resets=True)
                return env
            return _thunk

        if args.n_envs == 1:
            env = DummyVecEnv([make_mujoco_env()])
        else:
            from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
            env = SubprocVecEnv([make_mujoco_env(i) for i in range(args.n_envs)])
        env = VecNormalize(env, reward_scale=args.reward_scale)

        env_test = None
        if args.n_eval_epsiodes > 0:
            if args.n_eval_epsiodes == 1:
                env_test = DummyVecEnv([make_mujoco_env()])
            else:
                from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
                env_test = SubprocVecEnv(
                    [make_mujoco_env(i) for i in range(args.n_eval_epsiodes)])
            env_test = VecNormalize(env_test, ret=False, update=False)  # it doesn't need to normalize return
    else:
        from baselines.common.vec_env.vec_frame_stack import VecFrameStack
        from baselines.common.cmd_util import make_atari_env
        env = VecFrameStack(make_atari_env(args.env, num_env=args.n_envs, seed=args.seed), 4)
        env_test = None
        # TODO: debug VecFrame
        if args.n_eval_epsiodes > 0:
            env_test = VecFrameStack(
                make_atari_env(args.env, num_env=args.n_eval_epsiodes, seed=args.seed), 4)
        # env_test.reset()
        # env_test.render()

    # ----------- learn
    if env_type == MUJOCO:
        lr = args.lr
        # cliprange = args.clipargs.cliprange
    elif env_type == ATARI:
        lr = lambda f: f * args.lr
        # cliprange = lambda f: f*args.clipargs.cliprange if args.clipargs.cliprange is not None else None
    args.env_type = env_type
    ppo2.learn(policy=policy, env=env, env_eval=env_test,
               n_steps=args.n_steps, nminibatches=args.n_minibatches,
               lam=args.lam, gamma=0.99, n_opt_epochs=args.n_opt_epochs,
               log_interval=args.log_interval, ent_coef=args.coef_entropy,
               lr=lr, total_timesteps=args.num_timesteps,
               cliptype=args.cliptype, save_interval=args.save_interval,
               args=args)
    tools_logger.finish_dir(args.log_dir)
def build_env(args, selector=None):
    global store
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    print(env_type, env_id, nenv, args.num_env)
    if env_type == 'mujoco':
        get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))
        if args.num_env:
            # bind i as a default argument so each worker gets its own seed
            env = SubprocVecEnv([
                lambda i=i: make_mujoco_env(env_id,
                                            seed + i if seed is not None else None,
                                            args.reward_scale)
                for i in range(args.num_env)
            ])
        else:
            env = DummyVecEnv([lambda: make_mujoco_env(env_id, seed, args.reward_scale)])
        env = VecNormalize(env)
    elif env_type == 'atari':
        if alg == 'acer':
            env = make_atari_env(env_id, nenv, seed)  # , wrapper_kwargs={'clip_rewards': False})
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env, frame_stack=True, scale=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        elif "Zelda" in env_id:
            sys.path.append("/home/jupyter/Notebooks/Chang/HardRLWithYoutube/nnrunner/a2c_gvgai")
            import nnrunner.a2c_gvgai.env as gvgai_env
            frame_stack_size = 4
            print("run zelda")
            env = VecFrameStack(
                gvgai_env.make_gvgai_env(env_id, nenv, seed,
                                         level_selector=selector,
                                         experiment="PE", dataset="zelda"),
                frame_stack_size)
            # env.reset()
            # store = env
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_atari_env(env_id, nenv, seed), frame_stack_size)
    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or 'Level1-1'
        env = retro_wrappers.make_retro(game=args.env, state=gamestate,
                                        max_episode_steps=10000,
                                        use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)
    elif env_type == 'classic_control':
        def make_env():
            e = gym.make(env_id)
            e = bench.Monitor(e, logger.get_dir(), allow_early_resets=True)
            e.seed(seed)
            return e
        env = DummyVecEnv([make_env])
    else:
        raise ValueError('Unknown env_type {}'.format(env_type))

    # env.reset()
    print("build env")
    # store.reset()
    return env