Example #1
def run_smac(**kwargs):
    params = dqn_params_parser(**kwargs)
    seed = params["seed"]
    ple_env = make_ple_env(params["env"], seed=seed)
    test_env = make_ple_env(params["test_env"], seed=seed)

    # with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
    #     f.write('KWARGS\n')
    #     for k, v in kwargs.items():
    #         f.write(k + ': ' + str(v) + '\n')

    with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
        f.write('PARAMS\n')
        for k, v in params.items():
            f.write(k + ': ' + str(v) + '\n')

    # print(params)

    q_learning(ple_env,
               test_env=test_env,
               seed=seed,
               total_timesteps=params["total_timesteps"],
               gamma=params["gamma"],
               epsilon=params["epsilon"],
               epsilon_decay=params["epsilon_decay"],
               tau=params["tau"],
               lr=params["lr"],
               lrschedule=params["lrschedule"],
               buffer_size=params["buffer_size"],
               nbatch=params["nbatch"],
               trace_length=params["trace_length"],
               max_grad_norm=params["max_grad_norm"],
               units_per_hlayer=(params["units_layer1"],
                                 params["units_layer2"],
                                 params["units_layer3"]),
               update_interval=params["update_interval"],
               log_interval=params["log_interval"],
               test_interval=params["test_interval"],
               show_interval=params["show_interval"],
               logdir=params["logdir"],
               keep_model=params["keep_model"])

    ple_env.close()
    avg_perf, var_perf, max_return = eval_model(render=False,
                                                nepisodes=10,
                                                **params)

    with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
        f.write('\n')
        f.write('Results: \n')
        f.write('average performance: ' + str(avg_perf) + '\n')
        f.write('performance variance: ' + str(var_perf) + '\n')
        f.write('maximum return: ' + str(max_return) + '\n')

    return avg_perf, var_perf, max_return
Example #2
def main():
    seed = 15

    # ---- Specify the version of CFB ----
    game = 'ContFlappyBird'
    ns = 'gfNS'                         # 'gfNS', 'gsNS', 'rand_feat'
    nrandfeat = ('-nrf' + str(2))       # '', 0,2,3,4
    noiselevel = ('-nl' + str(0.001))   # '', 0.0001 - 0.05 (see env/__init__.py)
    experiment_phase = '-test'          # '-test', '-train'

    # Naming convention is <game>-<non-stationarity>-nl<noise_level>-nrf<nrandfeat>-<phase>-v0
    env_name = (game + '-' + ns + noiselevel + nrandfeat + experiment_phase + '-v0')

    # ---- Generate CFB with single instance ----
    env = make_ple_env(env_name, seed=seed)
    # Run env:
    env.seed(seed=seed)
    env.reset()
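    # Random actions below: len(env.action_space) implies this PLE wrapper exposes a
    # list-like action space; a plain gym Discrete space would instead use
    # env.action_space.n or env.action_space.sample().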
    for i in range(100):
        state, reward, done, info = env.step(action=np.random.randint(len(env.action_space)+1))
        if RENDER:
            env.render()

    # ---- Generate CFB with N parallel instances. ----
    N = 3
    env = make_ple_envs(env_name, num_env=N, seed=seed)
    # Run env:
    env.seed(seed=seed)
    env.reset()
    for i in range(100):
        state, reward, done, info = env.step(action=np.random.randint(len(env.action_space)+1))
        if RENDER:
            env[0].render()
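
The naming convention noted in the comment above can be wrapped in a small helper. A minimal sketch (the helper name make_cfb_env_id is hypothetical and not part of the codebase; it only reproduces the string format used in main()):

def make_cfb_env_id(game='ContFlappyBird', ns='gfNS', noise_level=0.001,
                    nrandfeat=2, phase='test'):
    # Builds <game>-<non-stationarity>-nl<noise_level>-nrf<nrandfeat>-<phase>-v0
    parts = [game]
    if ns:
        parts.append(ns)
    parts.append('nl' + str(noise_level))
    parts.append('nrf' + str(nrandfeat))
    parts.append(phase.strip('-'))
    parts.append('v0')
    return '-'.join(parts)

print(make_cfb_env_id())  # -> 'ContFlappyBird-gfNS-nl0.001-nrf2-test-v0'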
Example #3
        (runner.ep_idx, sum(runner.ep_idx)))
    logger.info('Total number of parameter updates during training: %s' %
                i_train)
    logger.info('*******************************************************\n')

    return breaked


from run_ple_utils import make_ple_envs, make_ple_env
from models import MLPPolicy, LSTMPolicy, GRUPolicy
if __name__ == '__main__':
    seed = 1
    env = make_ple_envs('ContFlappyBird-hNS-nrf0-train-v0',
                        num_env=1,
                        seed=seed)
    test_env = make_ple_env('ContFlappyBird-v3', seed=seed)
    logger = logging.getLogger()
    ch = logging.StreamHandler()  # Handler which writes to stderr (in red)
    ch.setLevel(logging.INFO)
    ch.setFormatter(logging.Formatter('%(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(ch)
    logger.setLevel(logging.INFO)

    BATCH_SIZE = 64

    # SMAC config 1
    ACTIV_FCN = 'mixed'
    DISCOUNT = 0.94
    ENT_COEFF = 0.000036
    VF_COEFF = 0.36
    LR = 0.0032
Example #4
def eval_model(render, nepisodes, test_steps, save_traj=False, result_file='test_results.csv', **params):
    logger = logging.getLogger(__name__)
    logger.info('Evaluating learning algorithm...\n')
    logger.info(params["eval_model"])

    logger.debug('\nMake Environment with seed %s' % params["seed"])
    ple_env = make_ple_env(params["test_env"], seed=params["seed"])  # TODO alwys use the same random seed here!

    tf.reset_default_graph()
    set_global_seeds(params["seed"])
    model_idx = []

    if save_traj:
        result_path = os.path.join(params["logdir"], result_file)
    else:
        result_path = None

    recurrent = (params["architecture"] == 'lstm' or params["architecture"] == 'gru')
    if params["eval_model"] == 'final':
        avg_performances = []
        var_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*final_model-*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('final_model')
            f_name = f[idx:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, pred_vf_op = restore_a2c_model(sess, logdir=params["logdir"], f_name=f_name, isrnn=recurrent)
                model_performance = run_episodes(sess, ple_env, nepisodes, test_steps, render,
                                                 OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, result_path, params["seed"])

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                var_performances.append(np.var(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

    elif params["eval_model"] == 'inter':
        # Use all stored maximum performance models and the final model.
        avg_performances = []
        var_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*inter*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx-5:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, pred_vf_op = \
                    restore_a2c_model(sess, logdir=params["logdir"], f_name=f_name, isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = \
                    run_episodes(sess, ple_env, nepisodes, test_steps, render,
                                 OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, result_path, params["seed"])

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                var_performances.append(np.var(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()
    elif params["eval_model"] == 'analysis':
        # Use all stored maximum performance models and the final model.
        avg_performances = []
        std_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            print(f_name)
            with tf.Session() as sess:
                OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, pred_vf_op = \
                    restore_a2c_model(sess, logdir=params["logdir"], f_name=f_name, isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes, test_steps, render,
                                                 OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, result_path, params["seed"])

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                std_performances.append(np.std(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()
        return model_idx, avg_performances, std_performances
    # elif params["eval_model"] == "config":
    #     # Use all stored maximum performance models and the final model.
    #     avg_performances = []
    #     var_performances = []
    #     maximal_returns = []
    #     fieldnames = ['model']
    #     for i in range(nepisodes):
    #         fieldnames.append(('eps' + str(i)))
    #     path = os.path.join(params["logdir"], 'results.csv')
    #     with open(path, "w") as csvfile:
    #         writer = csv.writer(csvfile)
    #         writer.writerow(fieldnames)
    #     models = glob.glob(os.path.join(params["logdir"], '*config_model*.meta'))
    #     models.sort()
    #     for f in models:
    #         logger.info('Restore model: %s' % f)
    #         idx = f.find('config_model')
    #         f_name = f[idx:-5]
    #         model_idx.append(f_name)
    #         with tf.Session() as sess:
    #             OBS, PI, PI_LOGITS, pred_ac_op, pred_vf_op = restore_model(sess, logdir=params["logdir"], f_name=f_name)
    #             logger.info('Run %s evaluation episodes' % nepisodes)
    #             model_performance = \
    #                 run_episodes(sess, ple_env, nepisodes, 2000, render, OBS, PI, PI_LOGITS, pred_ac_op)
    #
    #             # Add model performance metrics
    #             avg_performances.append(np.mean(model_performance))
    #             var_performances.append(np.var(model_performance))
    #             maximal_returns.append(np.max(model_performance))
    #         tf.reset_default_graph()
    #
    #         # Save episode information in csv file for further analysis each row contains nepisodes episodes using model f_name.
    #         with open(path, "a") as csvfile:  # TODO add real returns
    #             writer = csv.writer(csvfile)
    #             model_performance = [str(p) for p in model_performance]
    #             model_performance.insert(0, f_name)
    #             writer.writerow(model_performance)

    logger.info(params["logdir"])
    logger.info('Results of the evaluation of the learning algorithm:')
    logger.info('Restored models: %s' % model_idx)
    logger.info('Average performance per model: %s' % avg_performances)
    logger.info('Performance variance per model: %s' % var_performances)
    logger.info('Maximum episode return per model: %s\n' % maximal_returns)
    ple_env.close()

    if not avg_performances == []:
        return np.mean(avg_performances), np.mean(var_performances), np.mean(maximal_returns)
    else:
        return -3000, 3000, -3000
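
A hedged usage sketch of the function above (the logdir path and hyperparameter values are placeholders; the keyword names mirror the params keys the function reads). In 'analysis' mode it returns per-model statistics instead of the aggregated triple:

model_idx, avg_performances, std_performances = eval_model(
    render=False,
    nepisodes=10,
    test_steps=3000,
    seed=1,
    test_env='ContFlappyBird-v3',
    logdir='/tmp/a2c_output1',
    architecture='ff',
    eval_model='analysis')
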
def eval_model(render,
               nepisodes,
               test_steps,
               save_traj=False,
               result_file='test_results.csv',
               **params):
    logger = logging.getLogger(__name__)
    logger.info('Evaluating learning algorithm...\n')
    logger.info(params["eval_model"])

    logger.debug('\nMake Environment with seed %s' % params["seed"])
    # TODO use different seed for every run!#, allow_early_resets=True)
    # TODO make non-clipped env, even if agent is trained on clipped env
    ple_env = make_ple_env(params["test_env"], seed=params["seed"])

    tf.reset_default_graph()
    set_global_seeds(params["seed"])
    model_idx = []

    if save_traj:
        result_path = os.path.join(params["logdir"], result_file)
    else:
        result_path = None

    recurrent = (params["architecture"] == 'lstm'
                 or params["architecture"] == 'gru')
    if params["eval_model"] == 'final':
        avg_performances = []
        var_performances = []
        maximal_returns = []
        for f in glob.glob(
                os.path.join(params["logdir"], '*final_model-*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('final_model')
            f_name = f[idx:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model(
                    sess,
                    logdir=params["logdir"],
                    f_name=f_name,
                    isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes,
                                                 test_steps, render, OBS,
                                                 RNN_S_IN, RNN_S_OUT, PRED_Q,
                                                 result_path, params["seed"])
                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                var_performances.append(np.var(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

    elif params["eval_model"] == 'inter':
        # Use all stored maximum performance models and the final model.
        # print('Eval now!')
        avg_performances = []
        var_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*inter*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model(
                    sess,
                    logdir=params["logdir"],
                    f_name=f_name,
                    isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes,
                                                 test_steps, render, OBS,
                                                 RNN_S_IN, RNN_S_OUT, PRED_Q,
                                                 result_path, params["seed"])

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                var_performances.append(np.var(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()
    elif params["eval_model"] == 'analysis':
        # Use all stored maximum performance models and the final model.
        avg_performances = []
        std_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            # print(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model(
                    sess,
                    logdir=params["logdir"],
                    f_name=f_name,
                    isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes,
                                                 test_steps, render, OBS,
                                                 RNN_S_IN, RNN_S_OUT, PRED_Q,
                                                 result_path, params["seed"])

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                std_performances.append(np.std(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()
        return model_idx, avg_performances, std_performances

    logger.info(params["logdir"])
    logger.info('Results of the evaluation of the learning algorithm:')
    logger.info('Restored models: %s' % model_idx)
    logger.info('Average performance per model: %s' % avg_performances)
    logger.info('Performance variance per model: %s' % var_performances)
    logger.info('Maximum episode return per model: %s' % maximal_returns)
    ple_env.close()

    if not avg_performances == []:
        return np.mean(avg_performances), np.mean(var_performances), np.mean(
            maximal_returns)
    else:
        return -3000, 3000, -3000
def run_ppo_smac(**kwargs):
    params = ppo_params_parser(**kwargs)

    # logger = logging.getLogger(__name__)
    # logger.propagate = False  # no duplicate logging outputs
    # fh = logging.FileHandler(os.path.join(params["logdir"], 'run.log'))
    # fh.setLevel(logging.INFO)
    # fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    # logger.addHandler(fh)

    seed = params["seed"]
    ple_env = make_ple_envs(params["env"], num_env=params["nenvs"], seed=seed)
    test_env = make_ple_env(params["test_env"], seed=3000)

    if params["architecture"] == 'ff':
        policy_fn = LargerMLPPolicy
    elif params["architecture"] == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif params["architecture"] == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % params["policy"])

    with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
        for k, v in params.items():
            f.write(k + ': ' + str(v) + '\n')
    print(params)

    early_stopped = learn(policy_fn,
                          env=ple_env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=params["total_timesteps"],
                          log_interval=params["log_interval"],
                          test_interval=params["test_interval"],
                          show_interval=params["show_interval"],
                          logdir=params["logdir"],
                          lr=params["lr"],
                          # lrschedule=params["lrschedule"],
                          max_grad_norm=params["max_grad_norm"],
                          units_per_hlayer=(params["units_shared_layer1"],
                                            params["units_shared_layer2"],
                                            params["units_policy_layer"]),
                          activ_fcn=params["activ_fcn"],
                          gamma=params["gamma"],
                          vf_coef=params["vf_coeff"],
                          ent_coef=params["ent_coeff"],
                          nsteps=params["nsteps"],
                          lam=params["lam"],
                          nminibatches=params["nminibatches"],
                          noptepochs=params["noptepochs"],
                          cliprange=params["cliprange"],
                          early_stop=params["early_stop"],
                          keep_model=params["keep_model"])
    ple_env.close()

    if not early_stopped:
        avg_perf, var_perf, max_return = eval_model(render=False, nepisodes=10, test_steps=3000, **params)

        with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
            f.write('\n')
            f.write('Results: \n')
            f.write('average performance: ' + str(avg_perf) + '\n')
            f.write('performance variance: ' + str(var_perf) + '\n')
            f.write('maximum return: ' + str(max_return) + '\n')

        return avg_perf, var_perf, max_return
    else:
        return -3000, 3000, -3000
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='stop badly performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=4)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='elu',
                        help='Activation functions of network layers', )
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument('--nsteps', type=int, default=32, help='number of samples based on which gradient is updated')
    parser.add_argument('--nminibatches', help='Number of minibatches per sampled data batch.', type=int, default=1)
    parser.add_argument('--noptepochs', help='Number of optimization epochs with sample data, i.e. how often samples are reused.', type=int, default=1)

    parser.add_argument('--lam', help='Lambda parameter for GAE', type=float, default=0.95)
    parser.add_argument('--cliprange', help='Defines the maximum policy change allowed, before clipping.', type=float, default=0.2)
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=1e-7)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared', type=int, default=64)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared', type=int, default=64)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head', type=int, default=64)

    parser.add_argument('--restore_model', help='whether a pretrained model shall be restored', type=bool, default=False)
    args = parser.parse_args()

    seed = args.seed
    env = make_ple_envs(args.env, num_env=args.nenvs, seed=seed*10)
    # env = make_ple_envs('ContFlappyBird-hNS-nrf2-train-v0', num_env=args.nenvs, seed=seed - 1)
    test_env = make_ple_env(args.test_env, seed=3000)

    if args.architecture == 'ff':
        policy_fn = LargerMLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % args.policy)

    # store hyperparams setting
    # logdir = os.path.join(args.logdir, str(datetime.datetime.today()))
    # os.makedirs(logdir)

    ppo_output_dir = os.path.join(args.logdir, ('ppo_output'+str(args.seed)))
    if not os.path.isdir(ppo_output_dir):
        os.makedirs(ppo_output_dir)

    with open(os.path.join(ppo_output_dir, 'hyperparams.txt'), 'a') as f:
        for k,v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(ppo_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    early_stopped = learn(policy_fn,
                          env=env,
                          test_env=test_env,
                          seed=seed,
                          total_timesteps=args.total_timesteps,
                          log_interval=args.log_interval,
                          test_interval=args.test_interval,
                          show_interval=args.show_interval,
                          logdir=ppo_output_dir,
                          lr=args.lr,
                          # lrschedule=args.lrschedule,
                          max_grad_norm=args.max_grad_norm,
                          units_per_hlayer=(args.units_shared_layer1,
                                            args.units_shared_layer2,
                                            args.units_policy_layer),
                          activ_fcn=args.activ_fcn,
                          gamma=args.gamma,
                          vf_coef=args.vf_coeff,
                          ent_coef=args.ent_coeff,
                          nsteps=args.nsteps,
                          lam=args.lam,
                          nminibatches=args.nminibatches,
                          noptepochs=args.noptepochs,
                          cliprange=args.cliprange,
                          early_stop=args.early_stop,
                          keep_model=args.keep_model,
                          restore_model=args.restore_model)
    env.close()
Example #8
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop',
                        help='stop badly performing runs earlier',
                        type=bool,
                        default=False)
    parser.add_argument('--nenvs',
                        help='Number of parallel simulation environments',
                        type=int,
                        default=1)
    parser.add_argument(
        '--activ_fcn',
        choices=['relu6', 'elu', 'mixed'],
        type=str,
        default='relu6',
        help='Activation functions of network layers',
    )
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument(
        '--batch_size',
        type=int,
        default=50,
        help='number of samples based on which gradient is updated',
    )
    parser.add_argument('--gamma',
                        help='Discount factor for discounting the reward',
                        type=float,
                        default=0.90)
    parser.add_argument('--vf_coeff',
                        help='Weight of value function loss in total loss',
                        type=float,
                        default=0.2)
    parser.add_argument('--ent_coeff',
                        help='Weight of entropy in total loss',
                        type=float,
                        default=1e-7)
    parser.add_argument('--units_shared_layer1',
                        help='Units in first hidden layer which is shared',
                        type=int,
                        default=64)
    parser.add_argument('--units_shared_layer2',
                        help='Units in second hidden layer which is shared',
                        type=int,
                        default=64)
    parser.add_argument('--units_policy_layer',
                        help='Units in hidden layer in policy head',
                        type=int,
                        default=64)
    args = parser.parse_args()

    seed = args.seed
    env = make_ple_envs(args.env, num_env=args.nenvs, seed=seed - 1)
    test_env = make_ple_env(args.test_env, seed=100 + (seed - 1))

    if args.architecture == 'ff':
        policy_fn = MLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % args.policy)

    a2c_output_dir = os.path.join(args.logdir, ('a2c_output' + str(args.seed)))
    if not os.path.isdir(a2c_output_dir):
        os.makedirs(a2c_output_dir)

    with open(os.path.join(a2c_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(a2c_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    early_stopped = learn(
        policy_fn,
        env=env,
        test_env=test_env,
        seed=seed,
        total_timesteps=args.total_timesteps,
        log_interval=args.log_interval,
        test_interval=args.test_interval,
        show_interval=args.show_interval,
        logdir=a2c_output_dir,
        lr=args.lr,
        # lrschedule=args.lrschedule,
        max_grad_norm=args.max_grad_norm,
        units_per_hlayer=(args.units_shared_layer1, args.units_shared_layer2,
                          args.units_policy_layer),
        activ_fcn=args.activ_fcn,
        gamma=args.gamma,
        vf_coef=args.vf_coeff,
        ent_coef=args.ent_coeff,
        batch_size=args.batch_size,
        early_stop=args.early_stop,
        keep_model=args.keep_model)
    env.close()
def main_event_dependent():
    parser = argparse.ArgumentParser()
    parser.add_argument('--test_env',
                        help='test environment ID',
                        default='ContFlappyBird-v3')
    parser.add_argument('--total_timesteps',
                        help='Total number of env steps',
                        type=int,
                        default=int(2e5))
    parser.add_argument('--seed', help='RNG seed', type=int, default=1)
    parser.add_argument('--logdir',
                        default='/home/mara/Desktop/logs/ED_CONTROL',
                        help='directory where logs are stored')
    parser.add_argument(
        '--show_interval',
        type=int,
        default=1,
        help='Env is rendered every n-th episode. 0 = no rendering')
    parser.add_argument(
        '--eval_model',
        choices=['all', 'inter', 'final'],
        default='inter',
        help=
        'Eval all stored models, only the final model or only the intermediately stored models (while testing the best algorithm configs)'
    )

    args = parser.parse_args()

    np.random.seed(args.seed)
    random.seed(args.seed)

    # Init test_results.csv
    # rnd_output_dir = args.logdir
    #
    # logger = logging.getLogger()
    # fh = logging.FileHandler(os.path.join(rnd_output_dir, 'algo.log'))
    # fh.setLevel(logging.INFO)
    # fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    # logger.addHandler(fh)
    # logger.setLevel(logging.INFO)
    # logger.propagate = False
    #
    # result_path = os.path.join(rnd_output_dir, 'test_results.csv')

    for s in range(100, 120):
        # logger.info('make env with seed %s' % s)
        test_env = make_ple_env(args.test_env, seed=s)

        state = test_env.reset()
        #print(state)
        # logger.info('reset')
        total_return = 0
        rew_traj = []

        t = 0
        while t < args.total_timesteps:
            t += 1
            if t % 20 == 0:
                a = 1
            if args.show_interval > 0:
                test_env.render()
                time.sleep(0.01)
                # logger.info('render')
            # logger.info('step')

            if state[0] > 0.5 * (state[2] + state[3]):
                action = 0  # FLAP
            else:
                action = 1
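            # The heuristic above is a hand-coded baseline (assuming state[0] is the
            # bird's vertical position and state[2], state[3] bound the upcoming gap):
            # flap whenever the bird sits above the gap's midpoint, otherwise do nothing.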
            state, reward, dones, _ = test_env.step(action)
            #print(state)
            # logger.info('stepped')
            # reward_window.append(reward)
            total_return += reward
            rew_traj.append(reward)
        test_env.close()
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop', help='stop badly performing runs earlier', type=bool, default=False)
    parser.add_argument('--nenvs', help='Number of parallel simulation environments', type=int, default=1)
    parser.add_argument('--activ_fcn', choices=['relu6', 'elu', 'mixed'], type=str, default='mixed',
                        help='Activation functions of network layers', )
    parser.add_argument('--lr', help='Learning Rate', type=float, default=0.001)
    parser.add_argument('--nsteps', type=int, default=32, help='number of samples based on which gradient is updated')
    parser.add_argument('--gamma', help='Discount factor for discounting the reward', type=float, default=0.90)
    parser.add_argument('--vf_coeff', help='Weight of value function loss in total loss', type=float, default=0.2)
    parser.add_argument('--ent_coeff', help='Weight of entropy in total loss', type=float, default=7e-5)
    parser.add_argument('--units_shared_layer1', help='Units in first hidden layer which is shared', type=int,
                        default=28)
    parser.add_argument('--units_shared_layer2', help='Units in second hidden layer which is shared', type=int,
                        default=59)
    parser.add_argument('--units_policy_layer', help='Units in hidden layer in policy head', type=int, default=21)

    # PPO args
    parser.add_argument('--nminibatches', help='Number of minibatches per sampled data batch.', type=int, default=2)
    parser.add_argument('--noptepochs',
                        help='Number of optimization epochs with sample data, i.e. how often samples are reused.',
                        type=int, default=4)

    parser.add_argument('--lam', help='Lambda parameter for GAE', type=float, default=0.95)
    parser.add_argument('--cliprange', help='Defines the maximum policy change allowed, before clipping.', type=float,
                        default=0.2)

    # MAML args
    parser.add_argument('--K', help='length of each rollout (=trajectory)', type=int, default=20) # Test how well it works with other measures.
    parser.add_argument('--train_batchsz', help='number of rollouts per adaptation/training update (=fast update)', type=int, default=1)
    parser.add_argument('--kshot', help='number of adaptation/training updates (=fast updates) per task between two meta updates', type=int, default=1000)
    parser.add_argument('--test_batchsz', help='number of rollouts with updated model on which test_loss is computed',
                        type=int, default=1)
    parser.add_argument('--meta_batchsz', help='number of sampled tasks per meta update', type=int, default=4)  # in parallel or sequentially
    parser.add_argument('--test_stage', help='whether or not meta learner is in test_stage', type=bool, default=False)

    parser.add_argument('--base_agent', help='type of base learning agent, i.e. A2C or PPO agent', type=str, default='ppo')
    args = parser.parse_args()
    print(args)

    ple_env = make_ple_envs(args.env, args.nenvs, seed=args.seed-1)
    ple_test_env = make_ple_env(args.test_env, seed=100 + (args.seed-1))

    if args.architecture == 'ff':
        policy_fn = LargerMLPPolicy
    elif args.architecture == 'lstm':
        policy_fn = LargerLSTMPolicy
    elif args.architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % args.policy)

    output_dir = os.path.join(args.logdir, ('a2c_output'+str(args.seed)))
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    with open(os.path.join(output_dir, 'hyperparams.txt'), 'a') as f:
        for k,v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()
    fh = logging.FileHandler(os.path.join(output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    # if not args.test_stage:  # construct training model
    #     pass
    args.env = ple_env
    args.test_env = ple_test_env
    args.logdir = output_dir
    args.units_per_hlayer=(args.units_shared_layer1,
                           args.units_shared_layer2,
                           args.units_policy_layer)
    args.policy = policy_fn

    args.total_timesteps = 200000

    meta_learn(**args.__dict__)
    ple_env.close()
Example #11
    DISCOUNT = 0.90
    EPSILON = 0.5
    EPS_DECAY = 0.995
    LR = 5e-4
    MAX_REPLAY_BUF_SIZE = 1000
    BATCH_SIZE = 4  # number of episodes from which traces are sampled
    MAX_GRAD_NORM = 0.5
    NUM_TRAIN_UPDATES = int(2e6)
    TARGET = None
    SAVE_INTERVAL = 500
    LOG_INTERVAL = 30
    DATE = str(datetime.datetime.now().strftime("%Y_%m_%d_%H%M%S"))
    LOGDIR = os.path.join('/home/mara/Desktop/logs/DQN', DATE)

    seed = 2
    env = make_ple_env('FlappyBird-v1', seed=seed)
    test_env = make_ple_env('FlappyBird-v1', seed=seed)

    q_learning(
        env,
        test_env=test_env,
        seed=seed,
        total_timesteps=NUM_TRAIN_UPDATES,
        gamma=DISCOUNT,
        epsilon=EPSILON,
        epsilon_decay=EPS_DECAY,
        tau=0.90,
        lr=LR,
        buffer_size=MAX_REPLAY_BUF_SIZE,
        nbatch=BATCH_SIZE,
        trace_length=8,
Example #12
import os, glob
import csv
import logging

import tensorflow as tf
import numpy as np
import time
from utils import set_global_seeds, normalize_obs, get_collection_rnn_state
from run_ple_utils import make_ple_env

SEED = 100
LOGDIR = '/home/mara/Videos'
F_NAME = 'final_model-2000000'

ple_env = make_ple_env('ContFlappyBird-v3', seed=SEED)
tf.reset_default_graph()
set_global_seeds(SEED)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    # g = tf.get_default_graph()  # Shouldn't be set here again, as a new RNG is used without previous seeding.

    # restore the model
    loader = tf.train.import_meta_graph(glob.glob(os.path.join(LOGDIR, (F_NAME + '.meta')))[0])

    # now variables exist, but the values are not initialized yet.
    loader.restore(sess, os.path.join(LOGDIR, F_NAME))  # restore values of the variables.

    # Load operations from collections
    obs_in = tf.get_collection('inputs')
    probs_out = tf.get_collection('pi')
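
    # Hedged continuation sketch (not part of the original snippet): assuming a
    # feed-forward policy whose 'inputs' collection holds the observation placeholder
    # and whose 'pi' collection holds the action probabilities; observations may
    # additionally need normalize_obs, and recurrent models would also need the RNN
    # state from get_collection_rnn_state.
    obs = ple_env.reset()
    for _ in range(100):
        pi = sess.run(probs_out[0], feed_dict={obs_in[0]: [obs]})
        action = np.argmax(pi[0])  # greedy action; sampling from pi is also possible
        obs, reward, done, _ = ple_env.step(action)
        if done:
            obs = ple_env.reset()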
Example #13
def eval_model(render, nepisodes, **params):
    logger = logging.getLogger(__name__)
    logger.info('Evaluating learning algorithm...\n')
    logger.info(params["eval_model"])

    logger.debug('\nMake Environment with seed %s' % params["seed"])
    # TODO make non-clipped env, even if agent is trained on clipped env
    ple_env = make_ple_env(params["env"],
                           seed=params["seed"])  # , allow_early_resets=True)

    tf.reset_default_graph()
    set_global_seeds(params["seed"])
    model_idx = []

    if params["eval_model"] == 'final':
        f = glob.glob(os.path.join(params["logdir"], 'final_model-*.meta'))[0]
        idx = f.find('final_model')
        f_name = f[idx:-5]
        model_idx.append(f_name)
        with tf.Session() as sess:
            OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_drqn_model(
                sess, logdir=params["logdir"], f_name=f_name)
            model_performance = run_episodes(sess, ple_env, nepisodes, 1000,
                                             render, params["epsilon"], OBS,
                                             RNN_S_IN, RNN_S_OUT, PRED_Q)

            # Add model performance metrics
            avg_performances = [np.mean(model_performance)]
            var_performances = [np.var(model_performance)]
            maximal_returns = [np.max(model_performance)]

        tf.reset_default_graph()

    elif params["eval_model"] == 'all':
        # Use all stored maximum performance models and the final model.
        avg_performances = []
        var_performances = []
        maximal_returns = []
        iii = 0
        for f in glob.glob(os.path.join(params["logdir"], '*inter*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_drqn_model(
                    sess, logdir=params["logdir"], f_name=f_name)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes,
                                                 1000, render,
                                                 params["epsilon"], OBS,
                                                 RNN_S_IN, RNN_S_OUT, PRED_Q)

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                var_performances.append(np.var(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()
    elif params["eval_model"] == "config":
        # Use all stored maximum performance models and the final model.
        avg_performances = []
        var_performances = []
        maximal_returns = []

        # Setup log csv file
        fieldnames = ['model']
        for i in range(nepisodes):
            fieldnames.append(('eps' + str(i)))
        path = os.path.join(params["logdir"], 'results.csv')
        with open(path, "w") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(fieldnames)

        # Run evaluation episodes
        models = glob.glob(
            os.path.join(params["logdir"], '*config_model*.meta'))
        models.sort()
        for f in models:
            logger.info('Restore model: %s' % f)
            idx = f.find('config_model')
            f_name = f[idx:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_drqn_model(
                    sess, logdir=params["logdir"], f_name=f_name)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes,
                                                 2000, render,
                                                 params["epsilon"], OBS,
                                                 RNN_S_IN, RNN_S_OUT,
                                                 PRED_Q)  # TODO 1000

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                var_performances.append(np.var(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

            # Save episode information in csv file for further analysis.
            # Each row contains nepisodes episodes using the current model "f_name".
            with open(path, "a") as csvfile:  # TODO add real returns
                writer = csv.writer(csvfile)
                model_performance = [str(p) for p in model_performance]
                model_performance.insert(0, f_name)
                writer.writerow(model_performance)

    logger.info(params["logdir"])
    logger.info('Results of the evaluation of the learning algorithm:')
    logger.info('Restored models: %s' % model_idx)
    logger.info('Average performance per model: %s' % avg_performances)
    logger.info('Performance variance per model: %s' % var_performances)
    logger.info('Maximum episode return per model: %s' % maximal_returns)
    ple_env.close()

    if len(avg_performances) > 0:
        return np.mean(avg_performances), np.mean(var_performances), np.mean(
            maximal_returns)
    else:
        return -5, 0, -5
Example #14
def run_dqn_smac(**kwargs):
    params = dqn_params_parser(**kwargs)

    seed = params["seed"]
    ple_env = make_ple_env(params["env"], seed=seed)
    test_env = make_ple_env(params["test_env"], seed=3000)

    if params["architecture"] == 'ff':
        q_network = FF_DQN
        params["trace_length"] = 1
    elif params["architecture"] == 'lstm':
        q_network = LSTM_DQN
    elif params["architecture"] == 'gru':
        q_network = GRU_DQN
    else:
        print('Policy option %s is not implemented yet.' % params["policy"])

    with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
        for k, v in params.items():
            f.write(k + ': ' + str(v) + '\n')

    # If buffer size of the experience replay buffer is smaller than the batch_size * trace length, not enough
    # observations are fed to the network to compute the update step and the code throws an error.
    if params["buffer_size"] < (params["batch_size"] * params["trace_length"]):
        return -3000, 3000, -3000

    early_stopped, _ = q_learning(q_network=q_network,
                                  env=ple_env,
                                  test_env=test_env,
                                  seed=seed,
                                  total_timesteps=params["total_timesteps"],
                                  log_interval=params["log_interval"],
                                  test_interval=params["test_interval"],
                                  show_interval=params["show_interval"],
                                  logdir=params["logdir"],
                                  lr=params["lr"],
                                  max_grad_norm=params["max_grad_norm"],
                                  units_per_hlayer=(params["units_layer1"],
                                                    params["units_layer2"],
                                                    params["units_layer3"]),
                                  activ_fcn=params["activ_fcn"],
                                  gamma=params["gamma"],
                                  epsilon=params["epsilon"],
                                  epsilon_decay=params["epsilon_decay"],
                                  buffer_size=params["buffer_size"],
                                  batch_size=params["batch_size"],
                                  trace_length=params["trace_length"],
                                  tau=params["tau"],
                                  update_interval=params["update_interval"],
                                  early_stop=params["early_stop"],
                                  keep_model=params["keep_model"])
    # update_interval=params["trace_length"])
    ple_env.close()

    if not early_stopped:
        avg_perf, var_perf, max_return = eval_model(render=False,
                                                    nepisodes=10,
                                                    test_steps=3000,
                                                    **params)

        with open(os.path.join(params["logdir"], 'hyperparams.txt'), 'a') as f:
            f.write('\n')
            f.write('Results: \n')
            f.write('average performance: ' + str(avg_perf) + '\n')
            f.write('performance variance: ' + str(var_perf) + '\n')
            f.write('maximum return: ' + str(max_return) + '\n')

        return avg_perf, var_perf, max_return
    else:
        return -3000, 3000, -3000
Example #15
def main():
    parser = arg_parser()
    parser.add_argument('--early_stop',
                        help='stop badly performing runs earlier',
                        type=bool,
                        default=False)
    parser.add_argument('--gamma',
                        help='Discount factor for discounting the reward',
                        type=float,
                        default=0.90)
    parser.add_argument('--epsilon',
                        help='Epsilon for epsilon-greedy policy',
                        type=float,
                        default=0.5)
    parser.add_argument('--epsilon_decay',
                        help='Epsilon decay rate',
                        type=float,
                        default=0.995)
    parser.add_argument('--tau',
                        help='Update rate of target network',
                        type=float,
                        default=0.99)
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument('--buffer_size',
                        help='Replay buffer size',
                        type=int,
                        default=500)
    parser.add_argument(
        '--batch_size',
        help=
        'Batch size. Number of samples drawn from buffer, which are used to update the model.',
        type=int,
        default=50)
    parser.add_argument(
        '--trace_length',
        help='Length of the traces obtained from the batched episodes',
        type=int,
        default=1)
    parser.add_argument('--units_layer1',
                        help='Units in first hidden layer',
                        type=int,
                        default=64)
    parser.add_argument('--units_layer2',
                        help='Units in second hidden layer',
                        type=int,
                        default=64)
    parser.add_argument('--units_layer3',
                        help='Units in third hidden layer',
                        type=int,
                        default=64)
    parser.add_argument(
        '--activ_fcn',
        choices=['relu6', 'elu', 'mixed'],
        type=str,
        default='relu6',
        help='Activation functions of network layers',
    )
    parser.add_argument(
        '--update_interval',
        type=int,
        default=30,
        help=
        'Frequency with which the network model is updated based on minibatch data.'
    )
    args = parser.parse_args()

    assert (args.buffer_size > (args.batch_size * args.trace_length)
            ), 'Buffer size needs to be larger than batch_size * trace_length!'

    seed = args.seed
    env = make_ple_env(args.env, seed=seed - 1)
    # env = make_ple_env('ContFlappyBird-hNS-nrf2-test-v0', seed=seed-1)
    test_env = make_ple_env(args.test_env, seed=100 + (seed - 1))

    if args.architecture == 'ff':
        q_network = FF_DQN
        args.trace_length = 1
    elif args.architecture == 'lstm':
        q_network = LSTM_DQN
    elif args.architecture == 'gru':
        q_network = GRU_DQN

    # logdir = os.path.join(args.logdir, str(datetime.datetime.today()))
    # os.makedirs(logdir)

    dqn_output_dir = os.path.join(args.logdir, ('dqn_output' + str(args.seed)))
    if not os.path.isdir(dqn_output_dir):
        os.makedirs(dqn_output_dir)

    # store hyperparams setting
    with open(os.path.join(dqn_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()  # setting up the root logger is necessary to use a FileHandler
    fh = logging.FileHandler(os.path.join(dqn_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    # If buffer size of the experience replay buffer is smaller than the batch_size * trace length, not enough
    # observations are fed to the network to compute the update step and the code throws an error.
    if args.buffer_size < (args.batch_size * args.trace_length):
        logger.info(
            'Experience replay buffer is too small. Should be bigger than batch_size * trace_length = %i * %i'
            % (args.batch_size, args.trace_length))
        # return -3000, 3000, -3000

    early_stopped, _ = q_learning(q_network=q_network,
                                  env=env,
                                  test_env=test_env,
                                  seed=seed,
                                  total_timesteps=args.total_timesteps,
                                  log_interval=args.log_interval,
                                  test_interval=args.test_interval,
                                  show_interval=args.show_interval,
                                  logdir=dqn_output_dir,
                                  lr=args.lr,
                                  max_grad_norm=args.max_grad_norm,
                                  units_per_hlayer=(args.units_layer1,
                                                    args.units_layer2,
                                                    args.units_layer3),
                                  activ_fcn=args.activ_fcn,
                                  gamma=args.gamma,
                                  epsilon=args.epsilon,
                                  epsilon_decay=args.epsilon_decay,
                                  buffer_size=args.buffer_size,
                                  batch_size=args.batch_size,
                                  trace_length=args.trace_length,
                                  tau=args.tau,
                                  update_interval=args.update_interval,
                                  early_stop=args.early_stop,
                                  keep_model=args.keep_model)
    env.close()

    args.logdir = dqn_output_dir
Example #16
def main():
    parser = arg_parser()
    # parser = arg_parser()
    parser.add_argument('--gamma',
                        help='Discount factor for discounting the reward',
                        type=float,
                        default=0.90)
    parser.add_argument('--epsilon',
                        help='Epsilon for epsilon-greedy policy',
                        type=float,
                        default=0.5)
    parser.add_argument('--epsilon_decay',
                        help='Epsilon decay rate',
                        type=float,
                        default=0.995)
    parser.add_argument('--tau',
                        help='Update rate of target network',
                        type=float,
                        default=0.99)
    parser.add_argument('--lr', help='Learning Rate', type=float, default=5e-4)
    parser.add_argument('--lrschedule',
                        help='Learning Rate Decay Schedule',
                        choices=['constant', 'linear', 'double_linear_con'],
                        default='constant')
    parser.add_argument(
        '--nbatch',
        help=
        'Batch size. Number of samples drawn from buffer, which are used to update the model.',
        type=int,
        default=3)
    parser.add_argument('--buffer_size',
                        help='Replay buffer size',
                        type=int,
                        default=10)
    parser.add_argument(
        '--trace_length',
        help='Length of the traces obtained from the batched episodes',
        type=int,
        default=8)
    parser.add_argument(
        '--max_grad_norm',
        help='Maximum gradient norm up to which gradient is not clipped',
        type=float,
        default=0.01)
    parser.add_argument('--units_layer1',
                        help='Units in first hidden layer',
                        type=int,
                        default=64)
    parser.add_argument('--units_layer2',
                        help='Units in second hidden layer',
                        type=int,
                        default=64)
    parser.add_argument('--units_layer3',
                        help='Units in third hidden layer',
                        type=int,
                        default=64)
    parser.add_argument(
        '--update_interval',
        type=int,
        default=5,
        help=
        'Frequency with which the network model is updated based on minibatch data.'
    )
    # parser.add_argument('--log_interval', help='parameter values stored in tensorboard summary every <log_interval> model update step. 0 --> no logging ', type=int, default=30)
    # parser.add_argument('--show_interval', help='Env is rendered every n-th episode. 0 = no rendering', type=int, default=30)
    # parser.add_argument('--logdir', help='directory where logs are stored', default='/home/mara/Desktop/logs/A2C_OAI_NENVS')  # '/mnt/logs/A2C')
    args = parser.parse_args()

    seed = args.seed
    env = make_ple_env(args.env, seed=seed)
    test_env = make_ple_env(args.env, seed=seed)

    # logdir = os.path.join(args.logdir, str(datetime.datetime.today()))
    # os.makedirs(logdir)

    dqn_output_dir = os.path.join(args.logdir,
                                  ('dqn_rnn_output' + str(args.seed)))
    if not os.path.isdir(dqn_output_dir):  # TODO check what this does
        os.makedirs(dqn_output_dir)

    # store hyperparams setting
    with open(os.path.join(dqn_output_dir, 'hyperparams.txt'), 'a') as f:
        for k, v in vars(args).items():
            f.write(k + ': ' + str(v) + '\n')

    logger = logging.getLogger()  # TODO setting up the root logger is necessary to use a FileHandler
    logger.propagate = False
    fh = logging.FileHandler(os.path.join(dqn_output_dir, 'algo.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)

    q_learning(env,
               test_env=test_env,
               seed=seed,
               total_timesteps=args.total_timesteps,
               gamma=args.gamma,
               epsilon=args.epsilon,
               epsilon_decay=args.epsilon_decay,
               tau=args.tau,
               lr=args.lr,
               lrschedule=args.lrschedule,
               buffer_size=args.buffer_size,
               nbatch=args.nbatch,
               trace_length=args.trace_length,
               max_grad_norm=args.max_grad_norm,
               units_per_hlayer=(args.units_layer1, args.units_layer2,
                                 args.units_layer3),
               update_interval=args.update_interval,
               log_interval=args.log_interval,
               test_interval=args.test_interval,
               show_interval=args.show_interval,
               logdir=dqn_output_dir,
               keep_model=args.keep_model)
    env.close()

    args.logdir = dqn_output_dir
    avg_perf, var_perf, max_return = eval_model(render=False,
                                                nepisodes=15,
                                                **args.__dict__)

    with open(os.path.join(args.logdir, 'hyperparams.txt'), 'a') as f:
        f.write('\n')
        f.write('Results: \n')
        f.write('average performance: ' + str(avg_perf) + '\n')
        f.write('performance variance: ' + str(var_perf) + '\n')
        f.write('maximum return: ' + str(max_return) + '\n')
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--test_env',
                        help='test environment ID',
                        default='ContFlappyBird-v3')
    parser.add_argument('--total_timesteps',
                        help='Total number of env steps',
                        type=int,
                        default=int(2e4))
    parser.add_argument('--seed', help='RNG seed', type=int, default=1)
    parser.add_argument('--logdir',
                        default='/home/mara/Desktop/logs/RND',
                        help='directory where logs are stored')
    parser.add_argument(
        '--show_interval',
        type=int,
        default=1,
        help='Env is rendered every n-th episode. 0 = no rendering')
    parser.add_argument(
        '--eval_model',
        choices=['all', 'inter', 'final'],
        default='inter',
        help=
        'Eval all stored models, only the final model or only the intermediately stored models (while testing the best algorithm configs)'
    )

    args = parser.parse_args()

    np.random.seed(args.seed)
    random.seed(args.seed)

    # Init test_results.csv
    # for i, p_flap in zip(range(1, 4), [0.1, 0.3, 0.5]):

    # rnd_output_dir = os.path.join(args.logdir, ('rnd_output' + str(i)))
    # if not os.path.isdir(rnd_output_dir):
    #     os.makedirs(rnd_output_dir)
    #
    # logger = logging.getLogger()
    # fh = logging.FileHandler(os.path.join(rnd_output_dir, 'algo.log'))
    # fh.setLevel(logging.INFO)
    # fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s:%(name)s: %(message)s'))
    # logger.addHandler(fh)
    # logger.setLevel(logging.INFO)
    # logger.propagate = False
    #
    # result_path = os.path.join(rnd_output_dir, 'test_results.csv')

    for p_flap in [0.1]:
        for s in [100]:  # range(100, 120):
            # logger.info('make env with seed %s' % s)
            test_env = make_ple_env(args.test_env, seed=s)

            test_env.reset()
            total_return = 0
            rew_traj = []

            t = 0
            while t < args.total_timesteps:
                t += 1
                if args.show_interval > 0:
                    test_env.render()
                    time.sleep(0.01)

                obs, reward, dones, _ = test_env.step(
                    np.random.choice([0, 1], p=[p_flap, 1 - p_flap]))
                total_return += reward
                rew_traj.append(reward)
            test_env.close()
Example #18
def main():
    seed = 42

    # ---- Specify the version of CFB ----
    game = 'ContFlappyBird'
    ns = ''  # '', 'gfNS', 'gsNS', 'rand_feat'
    nrandfeat = ('-nrf' + str(2))  # '', 0,2,3,4
    noiselevel = ('-nl' + str(0.001))  # '', 0.0001 - 0.05 (see env/__init__.py)
    experiment_phase = '-train'  # '-test', '-train'

    # Naming convention is <game>-<non-stationarity>-nl<noise_level>-nrf<nrandfeat>-<phase>-v0
    env_name = (game + ns + noiselevel + nrandfeat + experiment_phase + '-v0')
    test_env_name = (game + ns + noiselevel + nrandfeat + '-test' + '-v0')

    # ---- Generate CFB with N parallel instances and with single instance ----
    ple_env = make_ple_envs(env_name, num_env=2,
                            seed=seed)  # N parallel instances
    test_env = make_ple_env(test_env_name, seed=seed + 42)  # single instance

    # ---- Import the RL method you want to use ----
    from A2C.a2c import learn
    # from PPO.ppo import learn
    # from DQN.dqn import q_learning

    # ---- Specify the model (FF, LSTM, GRU) ----
    model_architecture = 'ff'  # 'lstm', 'gru'

    if model_architecture == 'ff':
        policy_fn = MLPPolicy
    elif model_architecture == 'lstm':
        policy_fn = LSTMPolicy
    elif model_architecture == 'gru':
        policy_fn = GRUPolicy
    else:
        print('Policy option %s is not implemented yet.' % model_architecture)

    # ---- Learn an optimal policy. The agent's model ('final_model...') is stored in LOGDIR.
    early_stopped = learn(
        policy_fn,
        env=ple_env,
        test_env=test_env,
        seed=seed,
        total_timesteps=int(2e4),       # Total number of env steps
        log_interval=0,                 # Network parameter values are stored in tensorboard summary every <log_interval> model update step. 0 --> no logging
        test_interval=0,                # Model is evaluated after <test_interval> model updates. 0 = do not evaluate while learning.
        show_interval=0,                # Env is rendered every n-th episode. 0 = no rendering
        logdir=LOGDIR,                  # Directory where logs and the learned models are stored
        lr=5e-4,                        # Learning rate
        max_grad_norm=0.01,             # Maximum gradient norm up to which the gradient is not clipped
        units_per_hlayer=(64, 64, 64),  # Number of units per network layer
        activ_fcn='relu6',              # Type of activation function used in the network: 'relu6', 'elu', 'mixed'
        gamma=0.95,                     # Discount factor for discounting the reward
        vf_coef=0.2,                    # Weight on the value function loss in the loss function
        ent_coef=1e-7,                  # Weight on the policy entropy in the loss function
        batch_size=64,                  # Number of samples based on which the gradient is updated
        early_stop=False,               # Whether or not to stop badly performing runs earlier
        keep_model=0)                   # How many best models shall be kept during training. 0 -> only the final model
    ple_env.close()
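
    # Hedged follow-up sketch (not part of the original example): the stored
    # 'final_model...' checkpoint could be scored with the A2C eval_model variant
    # shown earlier, assuming it is importable here; the keyword names mirror the
    # params keys that function reads.
    avg_perf, var_perf, max_return = eval_model(render=False,
                                                nepisodes=10,
                                                test_steps=3000,
                                                seed=seed,
                                                test_env=test_env_name,
                                                logdir=LOGDIR,
                                                architecture=model_architecture,
                                                eval_model='final')
    print(avg_perf, var_perf, max_return)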