def run_smac(**kwargs):
    params = dqn_params_parser(**kwargs)
    seed = params["seed"]
    ple_env = make_ple_env(params["env"], seed=seed)
    test_env = make_ple_env(params["test_env"], seed=seed)

    # Store the hyperparameter setting of this run.
    with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
        f.write('PARAMS\n')
        for k, v in params.items():
            f.write(k + ': ' + str(v) + '\n')

    q_learning(ple_env,
               test_env=test_env,
               seed=seed,
               total_timesteps=params["total_timesteps"],
               gamma=params["gamma"],
               epsilon=params["epsilon"],
               epsilon_decay=params["epsilon_decay"],
               tau=params["tau"],
               lr=params["lr"],
               lrschedule=params["lrschedule"],
               buffer_size=params["buffer_size"],
               nbatch=params["nbatch"],
               trace_length=params["trace_length"],
               max_grad_norm=params["max_grad_norm"],
               units_per_hlayer=(params["units_layer1"],
                                 params["units_layer2"],
                                 params["units_layer3"]),
               update_interval=params["update_interval"],
               log_interval=params["log_interval"],
               test_interval=params["test_interval"],
               show_interval=params["show_interval"],
               logdir=params["logdir"],
               keep_model=params["keep_model"])
    ple_env.close()

    avg_perf, var_perf, max_return = eval_model(render=False, nepisodes=10, **params)

    with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
        f.write('\n')
        f.write('Results: \n')
        f.write('average performance: ' + str(avg_perf) + '\n')
        f.write('performance variance: ' + str(var_perf) + '\n')
        f.write('maximum return: ' + str(max_return) + '\n')

    return avg_perf, var_perf, max_return
def main():
    seed = 15

    # ---- Specify the version of CFB ----
    game = 'ContFlappyBird'
    ns = 'gfNS'                          # 'gfNS', 'gsNS', 'rand_feat'
    nrandfeat = ('-nrf' + str(2))        # '', 0, 2, 3, 4
    noiselevel = ('-nl' + str(0.001))    # '', 0.0001 - 0.05 (see env/__init__.py)
    experiment_phase = '-test'           # '-test', '-train'

    # Naming convention is <game>-<non-stationarity>-nl<noise_level>-nrf<nrandfeat>-<phase>-v0
    env_name = (game + '-' + ns + noiselevel + nrandfeat + experiment_phase + '-v0')

    # ---- Generate CFB with a single instance ----
    env = make_ple_env(env_name, seed=seed)

    # Run env:
    env.seed(seed=seed)
    env.reset()
    for i in range(100):
        state, reward, done, info = env.step(action=np.random.randint(len(env.action_space) + 1))
        if RENDER:
            env.render()

    # ---- Generate CFB with N parallel instances ----
    N = 3
    env = make_ple_envs(env_name, num_env=N, seed=seed)

    # Run env:
    env.seed(seed=seed)
    env.reset()
    for i in range(100):
        state, reward, done, info = env.step(action=np.random.randint(len(env.action_space) + 1))
        if RENDER:
            env[0].render()
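
# Optional helper (hypothetical, not part of the repo) that makes the naming
# convention above explicit; with the defaults shown it reproduces the
# env_name built in main(): 'ContFlappyBird-gfNS-nl0.001-nrf2-test-v0'.
def build_env_name(game='ContFlappyBird', ns='gfNS', noiselevel=0.001,
                   nrandfeat=2, phase='test'):
    return '%s-%s-nl%s-nrf%s-%s-v0' % (game, ns, noiselevel, nrandfeat, phase)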
                    (runner.ep_idx, sum(runner.ep_idx)))
        logger.info('Total number of parameter updates during training: %s' % i_train)
        logger.info('*******************************************************\n')
    return breaked


from run_ple_utils import make_ple_envs, make_ple_env
from models import MLPPolicy, LSTMPolicy, GRUPolicy

if __name__ == '__main__':
    seed = 1
    env = make_ple_envs('ContFlappyBird-hNS-nrf0-train-v0', num_env=1, seed=seed)
    test_env = make_ple_env('ContFlappyBird-v3', seed=seed)

    logger = logging.getLogger()
    ch = logging.StreamHandler()  # Handler which writes to stderr (in red)
    ch.setLevel(logging.INFO)
    ch.setFormatter(logging.Formatter('%(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(ch)
    logger.setLevel(logging.INFO)

    BATCH_SIZE = 64

    # SMAC config 1
    ACTIV_FCN = 'mixed'
    DISCOUNT = 0.94
    ENT_COEFF = 0.000036
    VF_COEFF = 0.36
    LR = 0.0032
def eval_model(render, nepisodes, test_steps, save_traj=False, result_file='test_results.csv', **params):
    logger = logging.getLogger(__name__)
    logger.info('Evaluating learning algorithm...\n')
    logger.info(params["eval_model"])

    logger.debug('\nMake Environment with seed %s' % params["seed"])
    ple_env = make_ple_env(params["test_env"], seed=params["seed"])  # TODO always use the same random seed here!
    tf.reset_default_graph()
    set_global_seeds(params["seed"])
    model_idx = []

    if save_traj:
        result_path = os.path.join(params["logdir"], result_file)
    else:
        result_path = None

    recurrent = (params["architecture"] == 'lstm' or params["architecture"] == 'gru')

    if params["eval_model"] == 'final':
        avg_performances = []
        var_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*final_model-*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('final_model')
            f_name = f[idx:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, pred_vf_op = \
                    restore_a2c_model(sess, logdir=params["logdir"], f_name=f_name, isrnn=recurrent)
                model_performance = run_episodes(sess, ple_env, nepisodes, test_steps, render,
                                                 OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT,
                                                 pred_ac_op, result_path, params["seed"])

            # Add model performance metrics. Append, so several final models aggregate correctly.
            avg_performances.append(np.mean(model_performance))
            var_performances.append(np.var(model_performance))
            maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

    elif params["eval_model"] == 'inter':
        # Use all stored maximum performance models and the final model.
        avg_performances = []
        var_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*inter*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, pred_vf_op = \
                    restore_a2c_model(sess, logdir=params["logdir"], f_name=f_name, isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = \
                    run_episodes(sess, ple_env, nepisodes, test_steps, render, OBS, PI, PI_LOGITS,
                                 RNN_S_IN, RNN_S_OUT, pred_ac_op, result_path, params["seed"])

            # Add model performance metrics
            avg_performances.append(np.mean(model_performance))
            var_performances.append(np.var(model_performance))
            maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

    elif params["eval_model"] == 'analysis':
        # Use all stored models.
        avg_performances = []
        std_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, pred_vf_op = \
                    restore_a2c_model(sess, logdir=params["logdir"], f_name=f_name, isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes, test_steps, render,
                                                 OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT,
                                                 pred_ac_op, result_path, params["seed"])

            # Add model performance metrics
            avg_performances.append(np.mean(model_performance))
            std_performances.append(np.std(model_performance))
            maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()
        return model_idx, avg_performances, std_performances
    logger.info(params["logdir"])
    logger.info('Results of the evaluation of the learning algorithm:')
    logger.info('Restored models: %s' % model_idx)
    logger.info('Average performance per model: %s' % avg_performances)
    logger.info('Performance variance per model: %s' % var_performances)
    logger.info('Maximum episode return per model: %s\n' % maximal_returns)
    ple_env.close()

    if avg_performances:
        return np.mean(avg_performances), np.mean(var_performances), np.mean(maximal_returns)
    else:
        return -3000, 3000, -3000
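
# Example usage (hypothetical paths and values; assumes trained models exist
# in logdir): aggregate the intermediately stored models of one run over
# 10 test episodes each. The keyword arguments mirror what the SMAC runners
# pass in via **params.
#
#   avg_perf, var_perf, max_return = eval_model(
#       render=False, nepisodes=10, test_steps=3000,
#       eval_model='inter', seed=42, test_env='ContFlappyBird-v3',
#       logdir='/tmp/a2c_output42', architecture='ff')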
def eval_model(render, nepisodes, test_steps, save_traj=False, result_file='test_results.csv', **params):
    logger = logging.getLogger(__name__)
    logger.info('Evaluating learning algorithm...\n')
    logger.info(params["eval_model"])

    logger.debug('\nMake Environment with seed %s' % params["seed"])
    # TODO use a different seed for every run!
    # TODO make non-clipped env, even if the agent is trained on a clipped env
    ple_env = make_ple_env(params["test_env"], seed=params["seed"])
    tf.reset_default_graph()
    set_global_seeds(params["seed"])
    model_idx = []

    if save_traj:
        result_path = os.path.join(params["logdir"], result_file)
    else:
        result_path = None

    recurrent = (params["architecture"] == 'lstm' or params["architecture"] == 'gru')

    if params["eval_model"] == 'final':
        avg_performances = []
        var_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*final_model-*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('final_model')
            f_name = f[idx:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model(
                    sess, logdir=params["logdir"], f_name=f_name, isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes, test_steps, render,
                                                 OBS, RNN_S_IN, RNN_S_OUT, PRED_Q,
                                                 result_path, params["seed"])

            # Add model performance metrics
            avg_performances.append(np.mean(model_performance))
            var_performances.append(np.var(model_performance))
            maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

    elif params["eval_model"] == 'inter':
        # Use all stored maximum performance models and the final model.
        avg_performances = []
        var_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*inter*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model(
                    sess, logdir=params["logdir"], f_name=f_name, isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes, test_steps, render,
                                                 OBS, RNN_S_IN, RNN_S_OUT, PRED_Q,
                                                 result_path, params["seed"])

            # Add model performance metrics
            avg_performances.append(np.mean(model_performance))
            var_performances.append(np.var(model_performance))
            maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

    elif params["eval_model"] == 'analysis':
        # Use all stored models.
        avg_performances = []
        std_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model(
                    sess, logdir=params["logdir"], f_name=f_name, isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes, test_steps, render,
                                                 OBS, RNN_S_IN, RNN_S_OUT, PRED_Q,
                                                 result_path, params["seed"])

            # Add model performance metrics
            avg_performances.append(np.mean(model_performance))
            std_performances.append(np.std(model_performance))
            maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()
        return model_idx, avg_performances, std_performances

    logger.info(params["logdir"])
    logger.info('Results of the evaluation of the learning algorithm:')
    logger.info('Restored models: %s' % model_idx)
    logger.info('Average performance per model: %s' % avg_performances)
    logger.info('Performance variance per model: %s' % var_performances)
    logger.info('Maximum episode return per model: %s' % maximal_returns)
    ple_env.close()

    if avg_performances:
        return np.mean(avg_performances), np.mean(var_performances), np.mean(maximal_returns)
    else:
        return -3000, 3000, -3000
def run_ppo_smac(**kwargs):
    params = ppo_params_parser(**kwargs)

    seed = params["seed"]
    ple_env = make_ple_envs(params["env"], num_env=params["nenvs"], seed=seed)
    test_env = make_ple_env(params["test_env"], seed=3000)

    if params["architecture"] == 'ff':
        policy_fn = LargerMLPPolicy
    elif params["architecture"] == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif params["architecture"] == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % params["architecture"])

    with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
        for k, v in params.items():
            f.write(k + ': ' + str(v) + '\n')

    early_stopped = learn(policy_fn,
                          env=ple_env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=params["total_timesteps"],
                          log_interval=params["log_interval"],
                          test_interval=params["test_interval"],
                          show_interval=params["show_interval"],
                          logdir=params["logdir"],
                          lr=params["lr"],
                          # lrschedule=params["lrschedule"],
                          max_grad_norm=params["max_grad_norm"],
                          units_per_hlayer=(params["units_shared_layer1"],
                                            params["units_shared_layer2"],
                                            params["units_policy_layer"]),
                          activ_fcn=params["activ_fcn"],
                          gamma=params["gamma"],
                          vf_coef=params["vf_coeff"],
                          ent_coef=params["ent_coeff"],
                          nsteps=params["nsteps"],
                          lam=params["lam"],
                          nminibatches=params["nminibatches"],
                          noptepochs=params["noptepochs"],
                          cliprange=params["cliprange"],
                          early_stop=params["early_stop"],
                          keep_model=params["keep_model"])
    ple_env.close()

    if not early_stopped:
        avg_perf, var_perf, max_return = eval_model(render=False, nepisodes=10, test_steps=3000, **params)

        with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
            f.write('\n')
            f.write('Results: \n')
            f.write('average performance: ' + str(avg_perf) + '\n')
            f.write('performance variance: ' + str(var_perf) + '\n')
            f.write('maximum return: ' + str(max_return) + '\n')
        return avg_perf, var_perf, max_return
    else:
        return -3000, 3000, -3000
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='Stop bad performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=4)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='elu',
                        help='Activation functions of network layers')
    parser.add_argument('--lr', help='Learning rate', type=float, default=5e-4)
    parser.add_argument('--nsteps', type=int, default=32,
                        help='Number of samples based on which gradient is updated')
    parser.add_argument('--nminibatches', help='Number of minibatches per sampled data batch.', type=int, default=1)
    parser.add_argument('--noptepochs', help='Number of optimization epochs with sample data, i.e. how often samples are reused.', type=int, default=1)
    parser.add_argument('--lam', help='Lambda parameter for GAE', type=float, default=0.95)
    parser.add_argument('--cliprange', help='Defines the maximum policy change allowed before clipping.', type=float, default=0.2)
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=1e-7)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared', type=int, default=64)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared', type=int, default=64)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head', type=int, default=64)
    parser.add_argument('--restore_model', help='Whether a pretrained model shall be restored', type=bool, default=False)
    args = parser.parse_args()

    seed = args.seed
    env = make_ple_envs(args.env, num_env=args.nenvs, seed=seed * 10)
    # env = make_ple_envs('ContFlappyBird-hNS-nrf2-train-v0', num_env=args.nenvs, seed=seed - 1)
    test_env = make_ple_env(args.test_env, seed=3000)

    if args.architecture == 'ff':
        policy_fn = LargerMLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.'
              % args.architecture)

    # Store hyperparameter settings.
    ppo_output_dir = os.path.join(args.logdir, ('ppo_output' + str(args.seed)))
    if not os.path.isdir(ppo_output_dir):
        os.makedirs(ppo_output_dir)

    with open(os.path.join(ppo_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(ppo_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    early_stopped = learn(policy_fn,
                          env=env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=args.total_timesteps,
                          log_interval=args.log_interval,
                          test_interval=args.test_interval,
                          show_interval=args.show_interval,
                          logdir=ppo_output_dir,
                          lr=args.lr,
                          # lrschedule=args.lrschedule,
                          max_grad_norm=args.max_grad_norm,
                          units_per_hlayer=(args.units_shared_layer1,
                                            args.units_shared_layer2,
                                            args.units_policy_layer),
                          activ_fcn=args.activ_fcn,
                          gamma=args.gamma,
                          vf_coef=args.vf_coeff,
                          ent_coef=args.ent_coeff,
                          nsteps=args.nsteps,
                          lam=args.lam,
                          nminibatches=args.nminibatches,
                          noptepochs=args.noptepochs,
                          cliprange=args.cliprange,
                          early_stop=args.early_stop,
                          keep_model=args.keep_model,
                          restore_model=args.restore_model)
    env.close()
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='Stop bad performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=1)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='relu6',
                        help='Activation functions of network layers')
    parser.add_argument('--lr', help='Learning rate', type=float, default=5e-4)
    parser.add_argument('--batch_size', type=int, default=50,
                        help='Number of samples based on which gradient is updated')
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=1e-7)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared', type=int, default=64)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared', type=int, default=64)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head', type=int, default=64)
    args = parser.parse_args()

    seed = args.seed
    env = make_ple_envs(args.env, num_env=args.nenvs, seed=seed - 1)
    test_env = make_ple_env(args.test_env, seed=100 + (seed - 1))

    if args.architecture == 'ff':
        policy_fn = MLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % args.architecture)

    a2c_output_dir = os.path.join(args.logdir, ('a2c_output' + str(args.seed)))
    if not os.path.isdir(a2c_output_dir):
        os.makedirs(a2c_output_dir)

    # Store hyperparameter settings.
    with open(os.path.join(a2c_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(a2c_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    early_stopped = learn(policy_fn,
                          env=env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=args.total_timesteps,
                          log_interval=args.log_interval,
                          test_interval=args.test_interval,
                          show_interval=args.show_interval,
                          logdir=a2c_output_dir,
                          lr=args.lr,
                          # lrschedule=args.lrschedule,
                          max_grad_norm=args.max_grad_norm,
                          units_per_hlayer=(args.units_shared_layer1,
                                            args.units_shared_layer2,
                                            args.units_policy_layer),
                          activ_fcn=args.activ_fcn,
                          gamma=args.gamma,
                          vf_coef=args.vf_coeff,
                          ent_coef=args.ent_coeff,
                          batch_size=args.batch_size,
                          early_stop=args.early_stop,
                          keep_model=args.keep_model)
    env.close()
def main_event_dependent():
    parser = argparse.ArgumentParser()
    parser.add_argument('--test_env', help='Test environment ID', default='ContFlappyBird-v3')
    parser.add_argument('--total_timesteps', help='Total number of env steps', type=int, default=int(2e5))
    parser.add_argument('--seed', help='RNG seed', type=int, default=1)
    parser.add_argument('--logdir', default='/home/mara/Desktop/logs/ED_CONTROL',
                        help='Directory where logs are stored')
    parser.add_argument('--show_interval', type=int, default=1,
                        help='Env is rendered every n-th episode. 0 = no rendering')
    parser.add_argument('--eval_model', choices=['all', 'inter', 'final'], default='inter',
                        help='Eval all stored models, only the final model or only the intermediately '
                             'stored models (while testing the best algorithm configs)')
    args = parser.parse_args()

    np.random.seed(args.seed)
    random.seed(args.seed)

    for s in range(100, 120):
        test_env = make_ple_env(args.test_env, seed=s)
        state = test_env.reset()
        total_return = 0
        rew_traj = []
        t = 0
        while t < args.total_timesteps:
            t += 1
            if args.show_interval > 0:
                test_env.render()
                time.sleep(0.01)

            # Event-dependent rule: FLAP whenever state[0] exceeds the midpoint
            # of state[2] and state[3] (presumably player position vs. gap centre).
            if state[0] > 0.5 * (state[2] + state[3]):
                action = 0  # FLAP
            else:
                action = 1
            state, reward, dones, _ = test_env.step(action)

            total_return += reward
            rew_traj.append(reward)
        test_env.close()
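
# The flap rule above, isolated as a reusable sketch. Assumption (inferred,
# not confirmed by the env docs): state[0] is the player's vertical position
# and state[2]/state[3] delimit the next pipe gap, so the controller flaps
# whenever the player is past the gap midpoint.
def event_dependent_action(state):
    gap_center = 0.5 * (state[2] + state[3])
    return 0 if state[0] > gap_center else 1  # 0 = FLAP, 1 = do nothing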
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='Stop bad performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=1)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='mixed',
                        help='Activation functions of network layers')
    parser.add_argument('--lr', help='Learning rate', type=float, default=0.001)
    parser.add_argument('--nsteps', type=int, default=32,
                        help='Number of samples based on which gradient is updated')
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=7e-5)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared', type=int, default=28)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared', type=int, default=59)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head', type=int, default=21)

    # PPO args
    parser.add_argument('--nminibatches', help='Number of minibatches per sampled data batch.', type=int, default=2)
    parser.add_argument('--noptepochs', help='Number of optimization epochs with sample data, i.e. how often samples are reused.', type=int, default=4)
    parser.add_argument('--lam', help='Lambda parameter for GAE', type=float, default=0.95)
    parser.add_argument('--cliprange', help='Defines the maximum policy change allowed before clipping.', type=float, default=0.2)

    # MAML args
    parser.add_argument('--K', help='Length of each rollout (= trajectory)', type=int, default=20)  # Test how well it works with other measures.
    parser.add_argument('--train_batchsz', help='Number of rollouts per adaptation/training update (= fast update)', type=int, default=1)
    parser.add_argument('--kshot', help='Number of adaptation/training updates (= fast updates) per task between two meta updates', type=int, default=1000)
    parser.add_argument('--test_batchsz', help='Number of rollouts with the updated model on which test_loss is computed', type=int, default=1)
    parser.add_argument('--meta_batchsz', help='Number of sampled tasks per meta update', type=int, default=4)  # in parallel or sequentially
    parser.add_argument('--test_stage', help='Whether or not the meta learner is in the test stage', type=bool, default=False)
    parser.add_argument('--base_agent', help='Type of base learning agent, i.e. A2C or PPO agent', type=str, default='ppo')
    args = parser.parse_args()
    print(args)

    ple_env = make_ple_envs(args.env, args.nenvs, seed=args.seed - 1)
    ple_test_env = make_ple_env(args.test_env, seed=100 + (args.seed - 1))

    if args.architecture == 'ff':
        policy_fn = LargerMLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.'
              % args.architecture)

    output_dir = os.path.join(args.logdir, ('a2c_output' + str(args.seed)))
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    with open(os.path.join(output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    args.env = ple_env
    args.test_env = ple_test_env
    args.logdir = output_dir
    args.units_per_hlayer = (args.units_shared_layer1,
                             args.units_shared_layer2,
                             args.units_policy_layer)
    args.policy = policy_fn
    args.total_timesteps = 200000

    meta_learn(**args.__dict__)
    ple_env.close()
DISCOUNT = 0.90
EPSILON = 0.5
EPS_DECAY = 0.995
LR = 5e-4
MAX_REPLAY_BUF_SIZE = 1000
BATCH_SIZE = 4  # Number of episodes from which traces are sampled
MAX_GRAD_NORM = 0.5
NUM_TRAIN_UPDATES = int(2e6)
TARGET = None

SAVE_INTERVAL = 500
LOG_INTERVAL = 30
DATE = str(datetime.datetime.now().strftime("%Y_%m_%d_%H%M%S"))
LOGDIR = os.path.join('/home/mara/Desktop/logs/DQN', DATE)

seed = 2
env = make_ple_env('FlappyBird-v1', seed=seed)
test_env = make_ple_env('FlappyBird-v1', seed=seed)

q_learning(env,
           test_env=test_env,
           seed=seed,
           total_timesteps=NUM_TRAIN_UPDATES,
           gamma=DISCOUNT,
           epsilon=EPSILON,
           epsilon_decay=EPS_DECAY,
           tau=0.90,
           lr=LR,
           buffer_size=MAX_REPLAY_BUF_SIZE,
           nbatch=BATCH_SIZE,
           trace_length=8,
import os, glob
import csv
import logging
import tensorflow as tf
import numpy as np
import time

from utils import set_global_seeds, normalize_obs, get_collection_rnn_state
from run_ple_utils import make_ple_env

SEED = 100
LOGDIR = '/home/mara/Videos'
F_NAME = 'final_model-2000000'

ple_env = make_ple_env('ContFlappyBird-v3', seed=SEED)
tf.reset_default_graph()
set_global_seeds(SEED)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    # g = tf.get_default_graph()  # Shouldn't be set here again, as a new RNG is used without previous seeding.

    # Restore the model.
    loader = tf.train.import_meta_graph(glob.glob(os.path.join(LOGDIR, (F_NAME + '.meta')))[0])
    # Now the variables exist, but their values are not initialized yet.
    loader.restore(sess, os.path.join(LOGDIR, F_NAME))  # Restore the values of the variables.

    # Load operations from collections.
    obs_in = tf.get_collection('inputs')
    probs_out = tf.get_collection('pi')
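    # Continuation sketch (not in the original script): a short greedy rollout
    # with the restored policy. This assumes the 'inputs' and 'pi' collections
    # each hold exactly one tensor and that a feed-forward model is restored;
    # recurrent models would additionally need their RNN state
    # (see get_collection_rnn_state).
    obs = ple_env.reset()
    for _ in range(500):
        pi = sess.run(probs_out[0], feed_dict={obs_in[0]: [obs]})
        obs, reward, done, _ = ple_env.step(np.argmax(pi))
        ple_env.render()
        if done:
            obs = ple_env.reset()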
def eval_model(render, nepisodes, **params):
    logger = logging.getLogger(__name__)
    logger.info('Evaluating learning algorithm...\n')
    logger.info(params["eval_model"])

    logger.debug('\nMake Environment with seed %s' % params["seed"])
    # TODO make non-clipped env, even if the agent is trained on a clipped env
    ple_env = make_ple_env(params["env"], seed=params["seed"])
    tf.reset_default_graph()
    set_global_seeds(params["seed"])
    model_idx = []

    if params["eval_model"] == 'final':
        avg_performances = []
        var_performances = []
        maximal_returns = []
        # glob.glob returns a list of paths; iterate over the matches instead
        # of calling find() on the list itself.
        for f in glob.glob(os.path.join(params["logdir"], 'final_model-*.meta')):
            idx = f.find('final_model')
            f_name = f[idx:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_drqn_model(
                    sess, logdir=params["logdir"], f_name=f_name)
                model_performance = run_episodes(sess, ple_env, nepisodes, 1000, render,
                                                 params["epsilon"], OBS, RNN_S_IN, RNN_S_OUT, PRED_Q)

            # Add model performance metrics
            avg_performances.append(np.mean(model_performance))
            var_performances.append(np.var(model_performance))
            maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

    elif params["eval_model"] == 'all':
        # Use all stored maximum performance models and the final model.
        avg_performances = []
        var_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*inter*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_drqn_model(
                    sess, logdir=params["logdir"], f_name=f_name)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes, 1000, render,
                                                 params["epsilon"], OBS, RNN_S_IN, RNN_S_OUT, PRED_Q)

            # Add model performance metrics
            avg_performances.append(np.mean(model_performance))
            var_performances.append(np.var(model_performance))
            maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

    elif params["eval_model"] == "config":
        # Use all stored maximum performance models and the final model.
        avg_performances = []
        var_performances = []
        maximal_returns = []

        # Setup log csv file
        fieldnames = ['model']
        for i in range(nepisodes):
            fieldnames.append(('eps' + str(i)))
        path = os.path.join(params["logdir"], 'results.csv')
        with open(path, "w") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(fieldnames)

        # Run evaluation episodes
        models = glob.glob(os.path.join(params["logdir"], '*config_model*.meta'))
        models.sort()
        for f in models:
            logger.info('Restore model: %s' % f)
            idx = f.find('config_model')
            f_name = f[idx:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_drqn_model(
                    sess, logdir=params["logdir"], f_name=f_name)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes, 2000, render,  # TODO 1000
                                                 params["epsilon"], OBS, RNN_S_IN, RNN_S_OUT, PRED_Q)

            # Add model performance metrics
            avg_performances.append(np.mean(model_performance))
            var_performances.append(np.var(model_performance))
            maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

            # Save episode information in the csv file for further analysis.
            # Each row contains nepisodes episodes using the current model "f_name".
with open(path, "a") as csvfile: # TODO add real returns writer = csv.writer(csvfile) model_performance = [str(p) for p in model_performance] model_performance.insert(0, f_name) writer.writerow(model_performance) logger.info(params["logdir"]) logger.info('Results of the evaluation of the learning algorithm:') logger.info('Restored models: %s' % model_idx) logger.info('Average performance per model: %s' % avg_performances) logger.info('Performance variance per model: %s' % var_performances) logger.info('Maximum episode return per model: %s' % maximal_returns) ple_env.close() if len(avg_performances) > 0: return np.mean(avg_performances), np.mean(var_performances), np.mean( maximal_returns) else: return -5, 0, -5
def run_dqn_smac(**kwargs):
    params = dqn_params_parser(**kwargs)

    seed = params["seed"]
    ple_env = make_ple_env(params["env"], seed=seed)
    test_env = make_ple_env(params["test_env"], seed=3000)

    if params["architecture"] == 'ff':
        q_network = FF_DQN
        params["trace_length"] = 1
    elif params["architecture"] == 'lstm':
        q_network = LSTM_DQN
    elif params["architecture"] == 'gru':
        q_network = GRU_DQN
    else:
        print('Policy option %s is not implemented yet.' % params["architecture"])

    with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
        for k, v in params.items():
            f.write(k + ': ' + str(v) + '\n')

    # If the experience replay buffer is smaller than batch_size * trace_length, not enough
    # observations are fed to the network to compute the update step and the code throws an error.
    if params["buffer_size"] < (params["batch_size"] * params["trace_length"]):
        return -3000, 3000, -3000

    early_stopped, _ = q_learning(q_network=q_network,
                                  env=ple_env,
                                  test_env=test_env,
                                  seed=seed,
                                  total_timesteps=params["total_timesteps"],
                                  log_interval=params["log_interval"],
                                  test_interval=params["test_interval"],
                                  show_interval=params["show_interval"],
                                  logdir=params["logdir"],
                                  lr=params["lr"],
                                  max_grad_norm=params["max_grad_norm"],
                                  units_per_hlayer=(params["units_layer1"],
                                                    params["units_layer2"],
                                                    params["units_layer3"]),
                                  activ_fcn=params["activ_fcn"],
                                  gamma=params["gamma"],
                                  epsilon=params["epsilon"],
                                  epsilon_decay=params["epsilon_decay"],
                                  buffer_size=params["buffer_size"],
                                  batch_size=params["batch_size"],
                                  trace_length=params["trace_length"],
                                  tau=params["tau"],
                                  update_interval=params["update_interval"],  # update_interval=params["trace_length"]
                                  early_stop=params["early_stop"],
                                  keep_model=params["keep_model"])
    ple_env.close()

    if not early_stopped:
        avg_perf, var_perf, max_return = eval_model(render=False, nepisodes=10, test_steps=3000, **params)

        with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
            f.write('\n')
            f.write('Results: \n')
            f.write('average performance: ' + str(avg_perf) + '\n')
            f.write('performance variance: ' + str(var_perf) + '\n')
            f.write('maximum return: ' + str(max_return) + '\n')
        return avg_perf, var_perf, max_return
    else:
        return -3000, 3000, -3000
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='Stop bad performing runs earlier', type=bool, default=False)
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--epsilon', help='Epsilon for epsilon-greedy policy', type=float, default=0.5)
    parser.add_argument('--epsilon_decay', help='Epsilon decay rate', type=float, default=0.995)
    parser.add_argument('--tau', help='Update rate of target network', type=float, default=0.99)
    parser.add_argument('--lr', help='Learning rate', type=float, default=5e-4)
    parser.add_argument('--buffer_size', help='Replay buffer size', type=int, default=500)
    parser.add_argument('--batch_size', help='Batch size. Number of samples drawn from buffer, which are used to update the model.',
                        type=int, default=50)
    parser.add_argument('--trace_length', help='Length of the traces obtained from the batched episodes',
                        type=int, default=1)
    parser.add_argument('--units_layer1', help='Units in first hidden layer', type=int, default=64)
    parser.add_argument('--units_layer2', help='Units in second hidden layer', type=int, default=64)
    parser.add_argument('--units_layer3', help='Units in third hidden layer', type=int, default=64)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='relu6',
                        help='Activation functions of network layers')
    parser.add_argument('--update_interval', type=int, default=30,
                        help='Frequency with which the network model is updated based on minibatch data.')
    args = parser.parse_args()
    assert args.buffer_size > (args.batch_size * args.trace_length), \
        'The buffer size needs to be larger than batch_size * trace_length!'

    seed = args.seed
    env = make_ple_env(args.env, seed=seed - 1)
    # env = make_ple_env('ContFlappyBird-hNS-nrf2-test-v0', seed=seed-1)
    test_env = make_ple_env(args.test_env, seed=100 + (seed - 1))

    if args.architecture == 'ff':
        q_network = FF_DQN
        args.trace_length = 1
    elif args.architecture == 'lstm':
        q_network = LSTM_DQN
    elif args.architecture == 'gru':
        q_network = GRU_DQN

    dqn_output_dir = os.path.join(args.logdir, ('dqn_output' + str(args.seed)))
    if not os.path.isdir(dqn_output_dir):
        os.makedirs(dqn_output_dir)

    # Store hyperparameter settings.
    with open(os.path.join(dqn_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()  # Setting up the root logger is necessary to use a FileHandler.
    fh = logging.FileHandler(os.path.join(dqn_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    # If the experience replay buffer is smaller than batch_size * trace_length, not enough
    # observations are fed to the network to compute the update step and the code throws an error.
    if args.buffer_size < (args.batch_size * args.trace_length):
        logger.info('Experience replay buffer is too small. '
                    'Should be bigger than batch_size * trace_length = %i * %i'
                    % (args.batch_size, args.trace_length))
        # return -3000, 3000, -3000

    early_stopped, _ = q_learning(q_network=q_network,
                                  env=env,
                                  test_env=test_env,
                                  seed=seed,
                                  total_timesteps=args.total_timesteps,
                                  log_interval=args.log_interval,
                                  test_interval=args.test_interval,
                                  show_interval=args.show_interval,
                                  logdir=dqn_output_dir,
                                  lr=args.lr,
                                  max_grad_norm=args.max_grad_norm,
                                  units_per_hlayer=(args.units_layer1,
                                                    args.units_layer2,
                                                    args.units_layer3),
                                  activ_fcn=args.activ_fcn,
                                  gamma=args.gamma,
                                  epsilon=args.epsilon,
                                  epsilon_decay=args.epsilon_decay,
                                  buffer_size=args.buffer_size,
                                  batch_size=args.batch_size,
                                  trace_length=args.trace_length,
                                  tau=args.tau,
                                  update_interval=args.update_interval,
                                  early_stop=args.early_stop,
                                  keep_model=args.keep_model)
    env.close()
    args.logdir = dqn_output_dir
def main():
    parser = arg_parser()
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--epsilon', help='Epsilon for epsilon-greedy policy', type=float, default=0.5)
    parser.add_argument('--epsilon_decay', help='Epsilon decay rate', type=float, default=0.995)
    parser.add_argument('--tau', help='Update rate of target network', type=float, default=0.99)
    parser.add_argument('--lr', help='Learning rate', type=float, default=5e-4)
    parser.add_argument('--lrschedule', help='Learning rate decay schedule',
                        choices=['constant', 'linear', 'double_linear_con'], default='constant')
    parser.add_argument('--nbatch', help='Batch size. Number of samples drawn from buffer, which are used to update the model.',
                        type=int, default=3)
    parser.add_argument('--buffer_size', help='Replay buffer size', type=int, default=10)
    parser.add_argument('--trace_length', help='Length of the traces obtained from the batched episodes',
                        type=int, default=8)
    parser.add_argument('--max_grad_norm', help='Maximum gradient norm up to which gradient is not clipped',
                        type=float, default=0.01)
    parser.add_argument('--units_layer1', help='Units in first hidden layer', type=int, default=64)
    parser.add_argument('--units_layer2', help='Units in second hidden layer', type=int, default=64)
    parser.add_argument('--units_layer3', help='Units in third hidden layer', type=int, default=64)
    parser.add_argument('--update_interval', type=int, default=5,
                        help='Frequency with which the network model is updated based on minibatch data.')
    # log_interval, show_interval and logdir are assumed to come from arg_parser().
    args = parser.parse_args()

    seed = args.seed
    env = make_ple_env(args.env, seed=seed)
    test_env = make_ple_env(args.env, seed=seed)

    dqn_output_dir = os.path.join(args.logdir, ('dqn_rnn_output' + str(args.seed)))
    if not os.path.isdir(dqn_output_dir):
        os.makedirs(dqn_output_dir)

    # Store hyperparameter settings.
    with open(os.path.join(dqn_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()  # Setting up the root logger is necessary to use a FileHandler.
    logger.propagate = False
    fh = logging.FileHandler(os.path.join(dqn_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)

    q_learning(env,
               test_env=test_env,
               seed=seed,
               total_timesteps=args.total_timesteps,
               gamma=args.gamma,
               epsilon=args.epsilon,
               epsilon_decay=args.epsilon_decay,
               tau=args.tau,
               lr=args.lr,
               lrschedule=args.lrschedule,
               buffer_size=args.buffer_size,
               nbatch=args.nbatch,
               trace_length=args.trace_length,
               max_grad_norm=args.max_grad_norm,
               units_per_hlayer=(args.units_layer1, args.units_layer2, args.units_layer3),
               update_interval=args.update_interval,
               log_interval=args.log_interval,
               test_interval=args.test_interval,
               show_interval=args.show_interval,
               logdir=dqn_output_dir,
               keep_model=args.keep_model)
    env.close()

    args.logdir = dqn_output_dir
    avg_perf, var_perf, max_return = eval_model(render=False, nepisodes=15, **args.__dict__)

    with open(os.path.join(args.logdir, 'hyperparams.txt'), 'a') as f:
        f.write('\n')
        f.write('Results: \n')
        f.write('average performance: ' + str(avg_perf) + '\n')
        f.write('performance variance: ' + str(var_perf) + '\n')
        f.write('maximum return: ' + str(max_return) + '\n')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--test_env', help='Test environment ID', default='ContFlappyBird-v3')
    parser.add_argument('--total_timesteps', help='Total number of env steps', type=int, default=int(2e4))
    parser.add_argument('--seed', help='RNG seed', type=int, default=1)
    parser.add_argument('--logdir', default='/home/mara/Desktop/logs/RND',
                        help='Directory where logs are stored')
    parser.add_argument('--show_interval', type=int, default=1,
                        help='Env is rendered every n-th episode. 0 = no rendering')
    parser.add_argument('--eval_model', choices=['all', 'inter', 'final'], default='inter',
                        help='Eval all stored models, only the final model or only the intermediately '
                             'stored models (while testing the best algorithm configs)')
    args = parser.parse_args()

    np.random.seed(args.seed)
    random.seed(args.seed)

    for p_flap in [0.1]:  # originally: zip(range(1, 4), [0.1, 0.3, 0.5])
        for s in [100]:  # range(100, 120)
            test_env = make_ple_env(args.test_env, seed=s)
            test_env.reset()
            total_return = 0
            rew_traj = []
            t = 0
            while t < args.total_timesteps:
                t += 1
                if args.show_interval > 0:
                    test_env.render()
                    time.sleep(0.01)

                # Random policy: FLAP (action 0) with probability p_flap.
                obs, reward, dones, _ = test_env.step(np.random.choice([0, 1], p=[p_flap, 1 - p_flap]))
                total_return += reward
                rew_traj.append(reward)
            test_env.close()
def main():
    seed = 42

    # ---- Specify the version of CFB ----
    game = 'ContFlappyBird'
    ns = ''                              # '', 'gfNS', 'gsNS', 'rand_feat'
    nrandfeat = ('-nrf' + str(2))        # '', 0, 2, 3, 4
    noiselevel = ('-nl' + str(0.001))    # '', 0.0001 - 0.05 (see env/__init__.py)
    experiment_phase = '-train'          # '-test', '-train'

    # Naming convention is <game>-<non-stationarity>-nl<noise_level>-nrf<nrandfeat>-<phase>-v0
    env_name = (game + ns + noiselevel + nrandfeat + experiment_phase + '-v0')
    test_env_name = (game + ns + noiselevel + nrandfeat + '-test' + '-v0')

    # ---- Generate CFB with N parallel instances and with a single instance ----
    ple_env = make_ple_envs(env_name, num_env=2, seed=seed)   # N parallel instances
    test_env = make_ple_env(test_env_name, seed=seed + 42)    # single instance

    # ---- Import the RL method you want to use ----
    from A2C.a2c import learn
    # from PPO.ppo import learn
    # from DQN.dqn import q_learning

    # ---- Specify the model (FF, LSTM, GRU) ----
    model_architecture = 'ff'  # 'lstm', 'gru'
    if model_architecture == 'ff':
        policy_fn = MLPPolicy
    elif model_architecture == 'lstm':
        policy_fn = LSTMPolicy
    elif model_architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % model_architecture)

    # ---- Learn an optimal policy. The agent's model ('final_model...') is stored in LOGDIR. ----
    early_stopped = learn(policy_fn,
                          env=ple_env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=int(2e4),       # Total number of env steps
                          log_interval=0,                 # Network parameters are stored in the tensorboard summary every <log_interval> model update steps. 0 = no logging
                          test_interval=0,                # Model is evaluated after <test_interval> model updates. 0 = do not evaluate while learning
                          show_interval=0,                # Env is rendered every n-th episode. 0 = no rendering
                          logdir=LOGDIR,                  # Directory where logs and the learned models are stored
                          lr=5e-4,                        # Learning rate
                          max_grad_norm=0.01,             # Maximum gradient norm up to which gradient is not clipped
                          units_per_hlayer=(64, 64, 64),  # Number of units per network layer
                          activ_fcn='relu6',              # Activation function used in the network: 'relu6', 'elu', 'mixed'
                          gamma=0.95,                     # Discount factor for discounting the reward
                          vf_coef=0.2,                    # Weight of the value function loss in the total loss
                          ent_coef=1e-7,                  # Weight of the policy entropy in the total loss
                          batch_size=64,                  # Number of samples based on which gradient is updated
                          early_stop=False,               # Whether or not to stop bad performing runs earlier
                          keep_model=0)                   # How many best models shall be kept during training. 0 = only the final model
    ple_env.close()
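
# After training, the stored 'final_model' can be evaluated with the
# eval_model helper of the corresponding algorithm module (sketch; the
# argument names follow the A2C variant above and may differ per algorithm):
#
#   avg_perf, var_perf, max_return = eval_model(
#       render=False, nepisodes=10, test_steps=3000,
#       eval_model='final', seed=seed, test_env=test_env_name,
#       logdir=LOGDIR, architecture=model_architecture)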