import numpy as np

from run_ple_utils import make_ple_envs, make_ple_env

RENDER = False  # set True to render the environment while stepping


def main():
    seed = 15

    # ---- Specify the version of CFB ----
    game = 'ContFlappyBird'
    ns = 'gfNS'                        # 'gfNS', 'gsNS', 'rand_feat'
    nrandfeat = ('-nrf' + str(2))      # '', 0, 2, 3, 4
    noiselevel = ('-nl' + str(0.001))  # '', 0.0001 - 0.05 (see env/__init__.py)
    experiment_phase = '-test'         # '-test', '-train'

    # Naming convention is <game>-<non-stationarity>-nl<noise_level>-nrf<nrandfeat>-<phase>-v0
    env_name = (game + '-' + ns + noiselevel + nrandfeat + experiment_phase + '-v0')

    # ---- Generate CFB with a single instance ----
    env = make_ple_env(env_name, seed=seed)

    # Run env:
    env.seed(seed=seed)
    env.reset()
    for i in range(100):
        # sample a random discrete action in [0, n)
        state, reward, done, info = env.step(action=np.random.randint(env.action_space.n))
        if RENDER:
            env.render()

    # ---- Generate CFB with N parallel instances ----
    N = 3
    env = make_ple_envs(env_name, num_env=N, seed=seed)

    # Run env:
    env.seed(seed=seed)
    env.reset()
    for i in range(100):
        state, reward, done, info = env.step(action=np.random.randint(env.action_space.n))
        if RENDER:
            env[0].render()  # render only the first of the N parallel instances
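# To make the naming convention above concrete, here is a small helper that
# composes an env ID. NOTE: cfb_env_name is a hypothetical sketch for
# illustration only, not part of the repo; which ID combinations are actually
# registered is determined by env/__init__.py.
def cfb_env_name(game='ContFlappyBird', ns='gfNS', noiselevel=0.001, nrandfeat=2, phase='test'):
    """Compose <game>-<non-stationarity>-nl<noise_level>-nrf<nrandfeat>-<phase>-v0."""
    return '%s-%s-nl%s-nrf%s-%s-v0' % (game, ns, noiselevel, nrandfeat, phase)


assert cfb_env_name() == 'ContFlappyBird-gfNS-nl0.001-nrf2-test-v0'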
    logger.info('Total number of finished episodes during training: sum(%s) = %s'
                % (runner.ep_idx, sum(runner.ep_idx)))
    logger.info('Total number of parameter updates during training: %s' % i_train)
    logger.info('*******************************************************\n')
    return breaked


from run_ple_utils import make_ple_envs, make_ple_env
from models import MLPPolicy, LSTMPolicy, GRUPolicy

if __name__ == '__main__':
    seed = 1
    env = make_ple_envs('ContFlappyBird-hNS-nrf0-train-v0', num_env=1, seed=seed)
    test_env = make_ple_env('ContFlappyBird-v3', seed=seed)

    logger = logging.getLogger()
    ch = logging.StreamHandler()  # Handler which writes to stderr (in red)
    ch.setLevel(logging.INFO)
    ch.setFormatter(logging.Formatter('%(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(ch)
    logger.setLevel(logging.INFO)

    BATCH_SIZE = 64  # SMAC config 1
    ACTIV_FCN = 'mixed'
    DISCOUNT = 0.94
    ENT_COEFF = 0.000036
def run_ppo_smac(**kwargs):
    params = ppo_params_parser(**kwargs)

    # logger = logging.getLogger(__name__)
    # logger.propagate = False  # no duplicate logging outputs
    # fh = logging.FileHandler(os.path.join(params["logdir"], 'run.log'))
    # fh.setLevel(logging.INFO)
    # fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    # logger.addHandler(fh)

    seed = params["seed"]
    ple_env = make_ple_envs(params["env"], num_env=params["nenvs"], seed=seed)
    test_env = make_ple_env(params["test_env"], seed=3000)

    if params["architecture"] == 'ff':
        policy_fn = LargerMLPPolicy
    elif params["architecture"] == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif params["architecture"] == 'gru':
        policy_fn = GRUPolicy
    else:
        raise NotImplementedError('Policy option %s is not implemented yet.' % params["architecture"])

    with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
        for k, v in params.items():
            f.write(k + ': ' + str(v) + '\n')
    print(params)

    early_stopped = learn(policy_fn,
                          env=ple_env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=params["total_timesteps"],
                          log_interval=params["log_interval"],
                          test_interval=params["test_interval"],
                          show_interval=params["show_interval"],
                          logdir=params["logdir"],
                          lr=params["lr"],
                          # lrschedule=params["lrschedule"],
                          max_grad_norm=params["max_grad_norm"],
                          units_per_hlayer=(params["units_shared_layer1"],
                                            params["units_shared_layer2"],
                                            params["units_policy_layer"]),
                          activ_fcn=params["activ_fcn"],
                          gamma=params["gamma"],
                          vf_coef=params["vf_coeff"],
                          ent_coef=params["ent_coeff"],
                          nsteps=params["nsteps"],
                          lam=params["lam"],
                          nminibatches=params["nminibatches"],
                          noptepochs=params["noptepochs"],
                          cliprange=params["cliprange"],
                          early_stop=params["early_stop"],
                          keep_model=params["keep_model"])
    ple_env.close()

    if not early_stopped:
        avg_perf, var_perf, max_return = eval_model(render=False, nepisodes=10, test_steps=3000, **params)
        with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
            f.write('\n')
            f.write('Results: \n')
            f.write('average performance: ' + str(avg_perf) + '\n')
            f.write('performance variance: ' + str(var_perf) + '\n')
            f.write('maximum return: ' + str(max_return) + '\n')
        return avg_perf, var_perf, max_return
    else:
        return -3000, 3000, -3000
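# Sketch of an invocation of run_ppo_smac, e.g. from a SMAC objective function.
# The keyword names mirror the params[...] keys read above; ppo_params_parser is
# assumed to supply defaults for all hyperparameters not passed explicitly, and
# './smac_logs' is a placeholder output directory that is assumed to exist.
avg_perf, var_perf, max_return = run_ppo_smac(env='ContFlappyBird-hNS-nrf0-train-v0',
                                              test_env='ContFlappyBird-v3',
                                              nenvs=4,
                                              seed=1,
                                              logdir='./smac_logs',
                                              architecture='ff',
                                              lr=5e-4,
                                              gamma=0.94)
print('avg: %s, var: %s, max: %s' % (avg_perf, var_perf, max_return))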
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='stop badly performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=4)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='elu',
                        help='Activation functions of network layers')
    parser.add_argument('--lr', help='Learning rate', type=float, default=5e-4)
    parser.add_argument('--nsteps', type=int, default=32,
                        help='number of samples based on which gradient is updated')
    parser.add_argument('--nminibatches', help='Number of minibatches per sampled data batch.', type=int, default=1)
    parser.add_argument('--noptepochs',
                        help='Number of optimization epochs with sample data, i.e. how often samples are reused.',
                        type=int, default=1)
    parser.add_argument('--lam', help='Lambda parameter for GAE', type=float, default=0.95)
    parser.add_argument('--cliprange', help='Defines the maximum policy change allowed before clipping.',
                        type=float, default=0.2)
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=1e-7)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared', type=int, default=64)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared', type=int, default=64)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head', type=int, default=64)
    parser.add_argument('--restore_model', help='whether a pretrained model shall be restored', type=bool, default=False)
    args = parser.parse_args()

    seed = args.seed
    env = make_ple_envs(args.env, num_env=args.nenvs, seed=seed * 10)
    # env = make_ple_envs('ContFlappyBird-hNS-nrf2-train-v0', num_env=args.nenvs, seed=seed - 1)
    test_env = make_ple_env(args.test_env, seed=3000)

    if args.architecture == 'ff':
        policy_fn = LargerMLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        raise NotImplementedError('Policy option %s is not implemented yet.' % args.architecture)

    # Store the hyperparameter settings.
    # logdir = os.path.join(args.logdir, str(datetime.datetime.today()))
    # os.makedirs(logdir)
    ppo_output_dir = os.path.join(args.logdir, ('ppo_output' + str(args.seed)))
    if not os.path.isdir(ppo_output_dir):
        os.makedirs(ppo_output_dir)
    with open(os.path.join(ppo_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(ppo_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    early_stopped = learn(policy_fn,
                          env=env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=args.total_timesteps,
                          log_interval=args.log_interval,
                          test_interval=args.test_interval,
                          show_interval=args.show_interval,
                          logdir=ppo_output_dir,
                          lr=args.lr,
                          # lrschedule=args.lrschedule,
                          max_grad_norm=args.max_grad_norm,
                          units_per_hlayer=(args.units_shared_layer1,
                                            args.units_shared_layer2,
                                            args.units_policy_layer),
                          activ_fcn=args.activ_fcn,
                          gamma=args.gamma,
                          vf_coef=args.vf_coeff,
                          ent_coef=args.ent_coeff,
                          nsteps=args.nsteps,
                          lam=args.lam,
                          nminibatches=args.nminibatches,
                          noptepochs=args.noptepochs,
                          cliprange=args.cliprange,
                          early_stop=args.early_stop,
                          keep_model=args.keep_model,
                          restore_model=args.restore_model)
    env.close()
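# Caveat on the boolean flags above: argparse's type=bool does not parse textual
# booleans; bool('False') is True, so e.g. '--early_stop False' still enables
# early stopping. A common workaround is an explicit converter such as the
# sketch below (str2bool is illustrative, not part of the repo):
import argparse


def str2bool(v):
    """Parse common textual booleans for argparse flags."""
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got %r' % v)

# Usage: parser.add_argument('--early_stop', type=str2bool, default=False, ...)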
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='stop badly performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=1)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='relu6',
                        help='Activation functions of network layers')
    parser.add_argument('--lr', help='Learning rate', type=float, default=5e-4)
    parser.add_argument('--batch_size', type=int, default=50,
                        help='number of samples based on which gradient is updated')
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=1e-7)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared', type=int, default=64)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared', type=int, default=64)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head', type=int, default=64)
    args = parser.parse_args()

    seed = args.seed
    env = make_ple_envs(args.env, num_env=args.nenvs, seed=seed - 1)
    test_env = make_ple_env(args.test_env, seed=100 + (seed - 1))

    if args.architecture == 'ff':
        policy_fn = MLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        raise NotImplementedError('Policy option %s is not implemented yet.' % args.architecture)

    a2c_output_dir = os.path.join(args.logdir, ('a2c_output' + str(args.seed)))
    if not os.path.isdir(a2c_output_dir):
        os.makedirs(a2c_output_dir)
    with open(os.path.join(a2c_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(a2c_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    early_stopped = learn(policy_fn,
                          env=env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=args.total_timesteps,
                          log_interval=args.log_interval,
                          test_interval=args.test_interval,
                          show_interval=args.show_interval,
                          logdir=a2c_output_dir,
                          lr=args.lr,
                          # lrschedule=args.lrschedule,
                          max_grad_norm=args.max_grad_norm,
                          units_per_hlayer=(args.units_shared_layer1,
                                            args.units_shared_layer2,
                                            args.units_policy_layer),
                          activ_fcn=args.activ_fcn,
                          gamma=args.gamma,
                          vf_coef=args.vf_coeff,
                          ent_coef=args.ent_coeff,
                          batch_size=args.batch_size,
                          early_stop=args.early_stop,
                          keep_model=args.keep_model)
    env.close()
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='stop badly performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=1)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='mixed',
                        help='Activation functions of network layers')
    parser.add_argument('--lr', help='Learning rate', type=float, default=0.001)
    parser.add_argument('--nsteps', type=int, default=32,
                        help='number of samples based on which gradient is updated')
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=7e-5)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared', type=int, default=28)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared', type=int, default=59)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head', type=int, default=21)

    # PPO args
    parser.add_argument('--nminibatches', help='Number of minibatches per sampled data batch.', type=int, default=2)
    parser.add_argument('--noptepochs',
                        help='Number of optimization epochs with sample data, i.e. how often samples are reused.',
                        type=int, default=4)
    parser.add_argument('--lam', help='Lambda parameter for GAE', type=float, default=0.95)
    parser.add_argument('--cliprange', help='Defines the maximum policy change allowed before clipping.',
                        type=float, default=0.2)

    # MAML args
    parser.add_argument('--K', help='length of each rollout (= trajectory)', type=int,
                        default=20)  # Test how well it works with other measures.
    parser.add_argument('--train_batchsz', help='number of rollouts per adaptation/training update (= fast update)',
                        type=int, default=1)
    parser.add_argument('--kshot',
                        help='number of adaptation/training updates (= fast updates) per task between two meta updates',
                        type=int, default=1000)
    parser.add_argument('--test_batchsz', help='number of rollouts with the updated model on which test_loss is computed',
                        type=int, default=1)
    parser.add_argument('--meta_batchsz', help='number of sampled tasks per meta update',
                        type=int, default=4)  # in parallel or sequentially
    parser.add_argument('--test_stage', help='whether or not the meta learner is in test stage', type=bool, default=False)
    parser.add_argument('--base_agent', help='type of base learning agent, i.e. A2C or PPO agent', type=str, default='ppo')
    args = parser.parse_args()
    print(args)

    ple_env = make_ple_envs(args.env, args.nenvs, seed=args.seed - 1)
    ple_test_env = make_ple_env(args.test_env, seed=100 + (args.seed - 1))

    if args.architecture == 'ff':
        policy_fn = LargerMLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        raise NotImplementedError('Policy option %s is not implemented yet.' % args.architecture)

    output_dir = os.path.join(args.logdir, ('a2c_output' + str(args.seed)))
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    with open(os.path.join(output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    # if not args.test_stage:
    #     # construct training model
    #     pass

    args.env = ple_env
    args.test_env = ple_test_env
    args.logdir = output_dir
    args.units_per_hlayer = (args.units_shared_layer1,
                             args.units_shared_layer2,
                             args.units_policy_layer)
    args.policy = policy_fn
    args.total_timesteps = 200000
    meta_learn(**args.__dict__)
    ple_env.close()
from run_ple_utils import make_ple_envs, make_ple_env
from models import MLPPolicy, LSTMPolicy, GRUPolicy

LOGDIR = './logs'  # directory where logs and the learned models are stored (placeholder path)


def main():
    seed = 42

    # ---- Specify the version of CFB ----
    game = 'ContFlappyBird'
    ns = ''                            # '', 'gfNS', 'gsNS', 'rand_feat'
    nrandfeat = ('-nrf' + str(2))      # '', 0, 2, 3, 4
    noiselevel = ('-nl' + str(0.001))  # '', 0.0001 - 0.05 (see env/__init__.py)
    experiment_phase = '-train'        # '-test', '-train'

    # Naming convention is <game>-<non-stationarity>-nl<noise_level>-nrf<nrandfeat>-<phase>-v0
    ns_tag = ('-' + ns) if ns else ''  # prepend the separator only when a non-stationarity is set
    env_name = (game + ns_tag + noiselevel + nrandfeat + experiment_phase + '-v0')
    test_env_name = (game + ns_tag + noiselevel + nrandfeat + '-test' + '-v0')

    # ---- Generate CFB with N parallel instances and with a single instance ----
    ple_env = make_ple_envs(env_name, num_env=2, seed=seed)  # N parallel instances
    test_env = make_ple_env(test_env_name, seed=seed + 42)   # single instance

    # ---- Import the RL method you want to use ----
    from A2C.a2c import learn
    # from PPO.ppo import learn
    # from DQN.dqn import q_learning

    # ---- Specify the model (FF, LSTM, GRU) ----
    model_architecture = 'ff'  # 'lstm', 'gru'

    if model_architecture == 'ff':
        policy_fn = MLPPolicy
    elif model_architecture == 'lstm':
        policy_fn = LSTMPolicy
    elif model_architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        raise NotImplementedError('Policy option %s is not implemented yet.' % model_architecture)

    # ---- Learn an optimal policy. The agent's model ('final_model...') is stored in LOGDIR. ----
    early_stopped = learn(policy_fn,
                          env=ple_env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=int(2e4),  # total number of env steps
                          log_interval=0,      # network parameters are stored in the tensorboard summary every <log_interval>-th model update; 0 -> no logging
                          test_interval=0,     # model is evaluated after every <test_interval>-th model update; 0 -> no evaluation while learning
                          show_interval=0,     # env is rendered every <show_interval>-th episode; 0 -> no rendering
                          logdir=LOGDIR,       # directory where logs and the learned models are stored
                          lr=5e-4,             # learning rate
                          max_grad_norm=0.01,  # maximum gradient norm up to which the gradient is not clipped
                          units_per_hlayer=(64, 64, 64),  # number of units per network layer
                          activ_fcn='relu6',   # activation function used in the network: 'relu6', 'elu', 'mixed'
                          gamma=0.95,          # discount factor for discounting the reward
                          vf_coef=0.2,         # weight of the value function loss in the total loss
                          ent_coef=1e-7,       # weight of the policy entropy in the total loss
                          batch_size=64,       # number of samples based on which the gradient is updated
                          early_stop=False,    # whether to stop badly performing runs earlier
                          keep_model=0)        # how many of the best models are kept during training; 0 -> only the final model
    ple_env.close()
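# If you swap in the PPO learner mentioned in the commented import above, the
# batching arguments change: batch_size is replaced by nsteps / nminibatches /
# noptepochs, and lam and cliprange are added. main_ppo below is a hypothetical
# variant of main() for illustration; the argument names follow the PPO
# learn(...) calls shown earlier in this section, and the values are
# illustrative only.
def main_ppo():
    from PPO.ppo import learn

    seed = 42
    ple_env = make_ple_envs('ContFlappyBird-nl0.001-nrf2-train-v0', num_env=2, seed=seed)
    test_env = make_ple_env('ContFlappyBird-nl0.001-nrf2-test-v0', seed=seed + 42)

    early_stopped = learn(MLPPolicy,
                          env=ple_env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=int(2e4),
                          log_interval=0,
                          test_interval=0,
                          show_interval=0,
                          logdir=LOGDIR,
                          lr=5e-4,
                          max_grad_norm=0.01,
                          units_per_hlayer=(64, 64, 64),
                          activ_fcn='relu6',
                          gamma=0.95,
                          vf_coef=0.2,
                          ent_coef=1e-7,
                          nsteps=32,        # samples per env between updates (replaces batch_size)
                          lam=0.95,         # GAE lambda
                          nminibatches=2,   # minibatches per sampled data batch
                          noptepochs=4,     # how often each sample batch is reused
                          cliprange=0.2,    # PPO clipping range
                          early_stop=False,
                          keep_model=0)
    ple_env.close()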