def learn(policy,
          env,
          test_env,
          seed,
          total_timesteps,
          log_interval,
          test_interval,
          show_interval,
          logdir,
          lr,
          max_grad_norm,
          units_per_hlayer,
          activ_fcn,
          gamma=0.99,
          vf_coef=0.5,
          ent_coef=0.01,
          batch_size=5,
          early_stop=False,
          keep_model=2,
          save_model=True,
          restore_model=False,
          save_traj=False):
    logger = logging.getLogger(__name__)
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space

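    # Build the A2C actor-critic model; the policy/value network, loss and optimizer live inside Model.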
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  lr=lr,
                  max_grad_norm=max_grad_norm,
                  activ_fcn=activ_fcn,
                  units_per_hlayer=units_per_hlayer,
                  log_interval=log_interval,
                  logdir=logdir,
                  nenvs=nenvs,
                  batch_size=batch_size,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  keep_model=keep_model
                  # total_timesteps=total_timesteps,
                  )

    sum_write = model.get_summary_writer()
    result_path = os.path.join(logdir, 'train_results.csv')
    if save_traj:
        rew_traj = []
        rew_results_path = os.path.join(
            logdir, ('lr' + str(lr) + '_tracking_results.csv'))
    else:
        rew_results_path = None

    i_sample, i_train = 0, 0
    return_threshold = -0.05
    horizon = 100
    avg_rm = deque(maxlen=30)

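    # The Runner interacts with the vectorized environment for batch_size steps per env and
    # returns the processed rollout (observations, discounted rewards, actions, value estimates).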
    runner = Runner(env,
                    model,
                    nsteps=batch_size,
                    gamma=gamma,
                    horizon=horizon,
                    show_interval=show_interval,
                    summary_writer=sum_write)

    if restore_model:
        for el in os.listdir(logdir):
            if 'final' in el and '.meta' in el:
                # Load the pre-trained model and set the network parameters
                model.load(os.path.join(logdir, el[:-5]))
                # Reset global step parameter.
                model.sess.run(model.global_step.assign(0))

    logger.info('Start Training')
    breaked = False
    nbatch = nenvs * batch_size
    tstart = time.time()
    # max_avg_ep_return = -5  # moving average of 20*nenv training episodes
    max_returns = deque([50],
                        maxlen=7)  # returns of the 7 best training episodes
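    # Main training loop: each update collects nenvs * batch_size environment steps and performs one gradient update.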
    for update in range(1, total_timesteps // nbatch + 1):
        # obs, states, rewards, masks, actions, values, avg_ep_return = runner.run()
        # policy_loss, value_loss, policy_entropy, ap = model.train(obs, states, rewards, masks, actions, values)
        obs, states, rewards, actions, values, reward_window, raw_rewards = runner.run(
        )
        if rew_results_path is not None:
            rew_traj.append(raw_rewards)
        # print('\nMEAN:%s\n' % np.mean(obs, axis=0))
        # Debug plotting of the rescaled observation channels:
        # for col, scale in enumerate([512, 7, 512, 512, 512, 512, 512, 512]):
        #     plt.figure()
        #     plt.plot([o * scale for o in obs[:, col]])
        # plt.show()
        policy_loss, value_loss, policy_entropy, ap = model.train(
            obs, states, rewards, actions, values)
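        # Periodically evaluate the current policy on the test environment and log the episode returns to train_results.csv.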
        if test_interval > 0 and i_train > 0 and (update % test_interval == 0):
            ep_return = model.test_run(
                test_env, n_eps=10, n_pipes=2000
            )  # TODO test whether results.csv is saved properly
            with open(result_path, "a") as csvfile:
                writer = csv.writer(csvfile)
                # ep_return = [str(p) for p in ep_return]
                # ep_return.insert(0, ('step_%s' % i_sample))
                ep_return[0:0] = [i_sample, i_train]
                writer.writerow(ep_return)

        # Log the performance during training at every update step.
        # Save the current model if the average reward of the last
        # 100 time steps is above the return threshold
        if ('ContFlappyBird' in env.env_id):
            saved = False
            for i, rw in enumerate(reward_window):
                rm = sum(rw) / horizon
                if sum_write is not None:
                    s_summary = tf.Summary()
                    s_summary.value.add(
                        tag='envs/environment%s/isample_return' % i,
                        simple_value=rm)
                    sum_write.add_summary(s_summary, i_sample)

                    t_summary = tf.Summary()
                    t_summary.value.add(
                        tag='envs/environment%s/itrain_return' % i,
                        simple_value=rm)
                    sum_write.add_summary(t_summary, i_train)
                    sum_write.flush()
                # logger.info(rm)
                if save_model and not saved and rm > return_threshold:
                    return_threshold = rm
                    logger.info('Save model at max rolling mean %s' %
                                return_threshold)
                    model.save('inter_model')
                    saved = True
                avg_rm.append(rm)

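        # Heuristic early stopping: abort the run if the rolling-mean return is still poor after ~500k samples.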
        if early_stop:
            if (i_sample > 500000) and (
                    i_sample <= 500000 + nbatch
            ):  # TODO how to determine early-stopping criteria non-heuristically, but automatically? - BOHB algorithm?
                if (sum(avg_rm) / 30) <= -0.88:
                    print('breaked')
                    breaked = True
                    break
        i_sample += nbatch
        i_train += 1

        # nseconds = time.time()-tstart
        # fps = int((update*nbatch)/nseconds)
        # if update % log_interval == 0 or update == 1:
        #     ev = explained_variance(values, rewards)
        #     logger.record_tabular("nupdates", update)
        #     logger.record_tabular("total_timesteps", update*nbatch)
        #     logger.record_tabular("fps", fps)
        #     logger.record_tabular("policy_entropy", float(policy_entropy))
        #     logger.record_tabular("value_loss", float(value_loss))
        #     logger.record_tabular("explained_variance", float(ev))
        #     logger.dump_tabular()

    if save_model:
        model.save('final_model')
        logger.info('Finished Training. Saving Final model.')

    if rew_results_path is not None:
        with open(rew_results_path, "a") as csvfile:
            writer = csv.writer(csvfile)
            traj = np.asanyarray(rew_traj).reshape(-1).tolist()
            traj[0:0] = [np.mean(traj)]  # i_train, i_sample
            writer.writerow(traj)

    logger.info('*******************************************************')
    logger.info('Total number of interactions with the environment: %s' %
                i_sample)
    logger.info(
        'Total number of finished episodes during training: sum(%s) = %s' %
        (runner.ep_idx, sum(runner.ep_idx)))
    logger.info('Total number of parameter updates during training: %s' %
                i_train)
    logger.info('*******************************************************\n')

    return breaked
Example 2
def eval_model(render, nepisodes, test_steps, save_traj=False, result_file='test_results.csv', **params):
    logger = logging.getLogger(__name__)
    logger.info('Evaluating learning algorithm...\n')
    logger.info(params["eval_model"])

    logger.debug('\nMake Environment with seed %s' % params["seed"])
    ple_env = make_ple_env(params["test_env"], seed=params["seed"])  # TODO always use the same random seed here!

    tf.reset_default_graph()
    set_global_seeds(params["seed"])
    model_idx = []

    if save_traj:
        result_path = os.path.join(params["logdir"], result_file)
    else:
        result_path = None

    recurrent = (params["architecture"] == 'lstm' or params["architecture"] == 'gru')
    if params["eval_model"] == 'final':
        avg_performances = []
        var_performances = []
        maximal_returns = []
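        # Restore every stored final-model checkpoint and evaluate it for nepisodes episodes.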
        for f in glob.glob(os.path.join(params["logdir"], '*final_model-*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('final_model')
            f_name = f[idx:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, pred_vf_op = restore_a2c_model(sess, logdir=params["logdir"], f_name=f_name, isrnn=recurrent)
                model_performance = run_episodes(sess, ple_env, nepisodes, test_steps, render,
                                                 OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, result_path, params["seed"])

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                var_performances.append(np.var(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

    elif params["eval_model"] == 'inter':
        # Use all stored maximum performance models and the final model.
        avg_performances = []
        var_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*inter*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx-5:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, pred_vf_op = \
                    restore_a2c_model(sess, logdir=params["logdir"], f_name=f_name, isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = \
                    run_episodes(sess, ple_env, nepisodes, test_steps, render,
                                 OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, result_path, params["seed"])

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                var_performances.append(np.var(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()
    elif params["eval_model"] == 'analysis':
        # Use all stored maximum performance models and the final model.
        avg_performances = []
        std_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            print(f_name)
            with tf.Session() as sess:
                OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, pred_vf_op = \
                    restore_a2c_model(sess, logdir=params["logdir"], f_name=f_name, isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes, test_steps, render,
                                                 OBS, PI, PI_LOGITS, RNN_S_IN, RNN_S_OUT, pred_ac_op, result_path, params["seed"])

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                std_performances.append(np.std(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()
        return model_idx, avg_performances, std_performances
    # elif params["eval_model"] == "config":
    #     # Use all stored maximum performance models and the final model.
    #     avg_performances = []
    #     var_performances = []
    #     maximal_returns = []
    #     fieldnames = ['model']
    #     for i in range(nepisodes):
    #         fieldnames.append(('eps' + str(i)))
    #     path = os.path.join(params["logdir"], 'results.csv')
    #     with open(path, "w") as csvfile:
    #         writer = csv.writer(csvfile)
    #         writer.writerow(fieldnames)
    #     models = glob.glob(os.path.join(params["logdir"], '*config_model*.meta'))
    #     models.sort()
    #     for f in models:
    #         logger.info('Restore model: %s' % f)
    #         idx = f.find('config_model')
    #         f_name = f[idx:-5]
    #         model_idx.append(f_name)
    #         with tf.Session() as sess:
    #             OBS, PI, PI_LOGITS, pred_ac_op, pred_vf_op = restore_model(sess, logdir=params["logdir"], f_name=f_name)
    #             logger.info('Run %s evaluation episodes' % nepisodes)
    #             model_performance = \
    #                 run_episodes(sess, ple_env, nepisodes, 2000, render, OBS, PI, PI_LOGITS, pred_ac_op)
    #
    #             # Add model performance metrics
    #             avg_performances.append(np.mean(model_performance))
    #             var_performances.append(np.var(model_performance))
    #             maximal_returns.append(np.max(model_performance))
    #         tf.reset_default_graph()
    #
    #         # Save episode information in csv file for further analysis each row contains nepisodes episodes using model f_name.
    #         with open(path, "a") as csvfile:  # TODO add real returns
    #             writer = csv.writer(csvfile)
    #             model_performance = [str(p) for p in model_performance]
    #             model_performance.insert(0, f_name)
    #             writer.writerow(model_performance)

    logger.info(params["logdir"])
    logger.info('Results of the evaluation of the learning algorithm:')
    logger.info('Restored models: %s' % model_idx)
    logger.info('Average performance per model: %s' % avg_performances)
    logger.info('Performance variance per model: %s' % var_performances)
    logger.info('Maximum episode return per model: %s\n' % maximal_returns)
    ple_env.close()

    if not avg_performances == []:
        return np.mean(avg_performances), np.mean(var_performances), np.mean(maximal_returns)
    else:
        return -3000, 3000, -3000
Example 3
def learn(policy, env, test_env, seed, total_timesteps,
          log_interval, test_interval, show_interval, logdir,
          lr, max_grad_norm, units_per_hlayer, activ_fcn,
          gamma=0.99, vf_coef=0.5, ent_coef=0.01, nsteps=5,
          lam=0.95, nminibatches=4, noptepochs=4, cliprange=0.2,
          early_stop=False, keep_model=2,
          save_model=True, restore_model=False, save_traj=False):

    # if isinstance(lr, float): lr = constfn(lr)
    # else: assert callable(lr)
    # if isinstance(cliprange, float): cliprange = constfn(cliprange)
    # else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    logger = logging.getLogger(__name__)
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches  # number of samples per minibatch in one optimization epoch

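    # Factory that builds the PPO model (policy/value network, clipped surrogate objective and optimizer).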
    make_model = lambda : Model(policy=policy,
                                ob_space=ob_space,
                                ac_space=ac_space,
                                nenvs=nenvs,
                                nbatch_train=nbatch_train,
                                nsteps=nsteps,
                                ent_coef=ent_coef,
                                vf_coef=vf_coef,
                                max_grad_norm=max_grad_norm,
                                activ_fcn=activ_fcn,
                                units_per_hlayer=units_per_hlayer,
                                log_interval=log_interval,
                                logdir=logdir,
                                keep_model=keep_model,
                                lr=lr,
                                cliprange=cliprange)
    # if save_interval and logger.get_dir():
    #     import cloudpickle
    #     with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
    #         fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    sum_write = model.get_summary_writer()
    result_path = os.path.join(logdir, 'train_results.csv')
    if save_traj:
        rew_traj = []
        rew_results_path = os.path.join(logdir, ('lr'+str(lr)+'_tracking_results.csv'))
    else:
        rew_results_path = None

    i_sample, i_train = 0, 0
    return_threshold = -2.
    horizon = 100
    avg_rm = deque(maxlen=30)

    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, horizon=horizon, show_interval=show_interval, summary_writer=sum_write)

    if restore_model:
        for el in os.listdir(logdir):
            if 'final' in el and '.meta' in el:
                # Load the pre-trained model and set the network parameters
                model.load(os.path.join(logdir, el[:-5]))
                # Reset global step parameter.
                model.sess.run(model.global_step.assign(0))

    logger.info('Start Training')
    breaked = False

    # epinfobuf = deque(maxlen=100)
    # tfirststart = time.time()

    nupdates = total_timesteps//nbatch
    for update in range(1, nupdates+1):
        assert nbatch % nminibatches == 0  # nbatch should be a multiple of nminibatches
        obs, returns, masks, actions, values, neglogpacs, states, reward_window, rewards = \
            runner.run()  #pylint: disable=E0632  # returns are estimates of the discounted reward

        if rew_results_path is not None:
            rew_traj.append(rewards)


        nbatch_train = nbatch // nminibatches  # number of samples per minibatch
        tstart = time.time()
        # frac = 1.0 - (update - 1.0) / nupdates  # converges to 0
        # lrnow = lr(frac)  #
        # cliprangenow = cliprange(frac)  # cliprange converges to 0

        # Update step
        mblossvals = []
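        # PPO update: run noptepochs optimization passes over the collected batch, split into shuffled minibatches.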
        if states is None: # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)  #
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    # mblossvals.append(model.train(lrnow, cliprangenow, *slices))
                    mblossvals.append(model.train(*slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
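            # Keep whole per-env trajectories together in each minibatch so the RNN states remain consistent.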
            envsperbatch = nenvs // nminibatches  # each minibatch contains the full trajectories of several envs
            envinds = np.arange(nenvs, dtype=np.int32)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = np.array(envinds[start:end])  # TODO int() does not work here. ensure that indices are integers beforehand
                    # print(mbenvinds)
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    if nenvs == 1:
                        mbstates = states[:]
                    else:
                        if isinstance(states, tuple):  # LSTM state (LSTMStateTuple is a tuple subclass)
                            mbstates = [el[mbenvinds] for el in states]
                        else:  # GRU state
                            mbstates = states[mbenvinds]
                    # mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))
                    mblossvals.append(model.train(*slices, mbstates))

        if test_interval > 0 and i_train > 0 and (update % test_interval == 0):
            ep_return = model.test_run(test_env, n_eps=10, n_pipes=2000)  # TODO test whether results.csv is saved properly
            with open(result_path, "a") as csvfile:
                writer = csv.writer(csvfile)
                # ep_return = [str(p) for p in ep_return]
                # ep_return.insert(0, ('step_%s' % i_sample))
                ep_return[0:0] = [i_sample, i_train]
                writer.writerow(ep_return)

        if ('ContFlappyBird' in env.env_id):
            saved = False
            for i, rw in enumerate(reward_window):
                rm = sum(rw) / horizon
                if sum_write is not None:
                    s_summary = tf.Summary()
                    s_summary.value.add(tag='envs/environment%s/isample_return' % i,
                                        simple_value=rm)
                    sum_write.add_summary(s_summary, i_sample)

                    t_summary = tf.Summary()
                    t_summary.value.add(tag='envs/environment%s/itrain_return' % i,
                                        simple_value=rm)
                    sum_write.add_summary(t_summary, i_train)
                    sum_write.flush()
                # logger.info(rm)
                if save_model and not saved and rm > return_threshold:
                    return_threshold = rm
                    logger.info('Save model at max rolling mean %s' % return_threshold)
                    model.save('inter_model')
                    saved = True
                avg_rm.append(rm)

        if early_stop:
            if (i_sample > 500000) and (i_sample <= 500000 + nbatch):  # TODO how to determine early-stopping criteria non-heuristically, but automatically? - BOHB algorithm?
                if (sum(avg_rm)/30) <= -0.88:
                    print('breaked')
                    breaked = True
                    break
        i_sample += nbatch
        i_train += 1

        # lossvals = np.mean(mblossvals, axis=0)
        # tnow = time.time()
        # fps = int(nbatch / (tnow - tstart))
        # if update % log_interval == 0 or update == 1:
        #     ev = explained_variance(values, returns)
        #     logger.logkv("serial_timesteps", update*nsteps)
        #     logger.logkv("nupdates", update)
        #     logger.logkv("total_timesteps", update*nbatch)
        #     logger.logkv("fps", fps)
        #     logger.logkv("explained_variance", float(ev))
        #     logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
        #     logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
        #     logger.logkv('time_elapsed', tnow - tfirststart)
        #     for (lossval, lossname) in zip(lossvals, model.loss_names):
        #         logger.logkv(lossname, lossval)
        #     logger.dumpkvs()
        # if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
        #     checkdir = osp.join(logger.get_dir(), 'checkpoints')
        #     os.makedirs(checkdir, exist_ok=True)
        #     savepath = osp.join(checkdir, '%.5i'%update)
        #     print('Saving to', savepath)
        #     model.save(savepath)

    if save_model:
        model.save('final_model')
        logger.info('Finished Training. Saving Final model.')

    if rew_results_path is not None:
        with open(rew_results_path, "a") as csvfile:
            writer = csv.writer(csvfile)
            traj = np.asanyarray(rew_traj).reshape(-1).tolist()
            traj[0:0] = [np.mean(traj)]  # i_train, i_sample
            writer.writerow(traj)

    logger.info('*******************************************************')
    logger.info('Total number of interactions with the environment: %s' % i_sample)
    logger.info('Total number of finished episodes during training: sum(%s) = %s' % (runner.ep_idx, sum(runner.ep_idx)))
    logger.info('Total number of parameter updates during training: %s' % i_train)
    logger.info('*******************************************************\n')

    return breaked
Example 4
def meta_learn(base_agent, policy, env, test_env, seed, total_timesteps,
               log_interval, test_interval, show_interval, logdir, keep_model,
               lr, max_grad_norm, units_per_hlayer, activ_fcn,
               gamma=0.99, lam=0.95, vf_coeff=0.5, ent_coeff=0.01,
               K=20, train_batchsz=1, kshot=2, test_batchsz=1, meta_batchsz=1,
               test_stage=False, **kwargs):
    # nbatch_train=4, cliprange=0.2  # PPO variables

    logger = logging.getLogger(__name__)
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    nd, = env.observation_space.shape
    ob_space = env.observation_space
    ac_space = env.action_space

    # Init model args
    model_args = dict()
    for k, v in [['policy', policy], ['ob_space', ob_space], ['ac_space', ac_space], ['max_grad_norm', max_grad_norm],
                 ['units_per_hlayer', units_per_hlayer], ['activ_fcn', activ_fcn], ['log_interval', log_interval],
                 ['logdir', logdir], ['nenvs', nenvs], ['ent_coef', ent_coeff], ['vf_coef', vf_coeff],
                 ['keep_model', keep_model], ['base_agent', base_agent], ['lr', lr]]:
        model_args[k] = v

    # Add base agent specific parameters to model_args
    if base_agent == 'ppo':
        base_agent_cls = PPO.ppo.Model
        model_args["nbatch_train"] = K * nenvs
        model_args["nsteps"] = K
        model_args["cliprange"] = kwargs["cliprange"]
    elif base_agent == 'a2c':
        base_agent_cls = A2C.A2C_OAI_NENVS.Model
        model_args["batch_size"] = K
        model_args["lr"] = lr
    else:
        raise Exception('Base Agent %s is not implemented yet' % base_agent)

    # Init meta model
    META_MODEL = get_meta_model_cls(base_agent_cls)
    model = META_MODEL(meta_batchsz=meta_batchsz*test_batchsz, meta_task_steps=K, **model_args)
    sum_write = model.get_summary_writer()
    result_path = os.path.join(logdir, 'meta_train_results.csv')

    # Init worker, which includes data processing, i.e. discounting
    runner = Runner(env=env, model=model, nsteps=K, gamma=gamma, lam=lam, horizon=100,
                    show_interval=show_interval, summary_writer=sum_write)

    i_kshot_training = 0
    i_sample = 0

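    # Meta-training: for each task, run <kshot> fast updates starting from the current meta parameters,
    # collect held-out test trajectories, reset to the meta parameters, and update them on the test error.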
    if not test_stage:
        steps_per_meta_update = (((train_batchsz * K) * kshot) + (test_batchsz * K) * 1) * meta_batchsz
        for meta_update in range(total_timesteps // steps_per_meta_update):
            print('meta update %s' % meta_update)
            test_obs, test_rewards, test_actions, test_values, test_dones, test_neglogpacs, test_states = \
                [], [], [], [], [], [], []  # init train batches

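            # Snapshot the current meta parameters so they can be restored after each task's fast updates.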
            init_meta_param = model.get_trainable_vars_vals()

            for t in range(meta_batchsz):
                i_kshot_training += 1
                # Run <Kshot> Fast Training Updates of model parameters
                runner.nsteps = train_batchsz * K
                for k in range(kshot):
                    i_sample += runner.nsteps
                    # Sample <train_batchsz> trajectories with length <K>
                    # and process samples, i.e. discounting and advantage estimation
                    tb_obs, tb_returns, tb_dones, tb_actions, tb_values, tb_neglogpacs, tb_states, reward_window, _ = \
                        runner.run()
                    model.fast_train(tb_obs, tb_states, tb_returns, tb_actions, tb_values,
                                     tb_dones=tb_dones, tb_neglogpacs=tb_neglogpacs)
                    print(model.sess.run([model.train_model.pi], {model.train_model.X:tb_obs})) #, model.train_model.rnn_state_in: tb_states}))

                if test_interval > 0 and (i_kshot_training % test_interval == 0):
                    ep_return = model.test_run(test_env, n_eps=10, n_pipes=2000)
                    with open(result_path, "a") as csvfile:
                        writer = csv.writer(csvfile)
                        # ep_return = [str(p) for p in ep_return]
                        # ep_return.insert(0, ('step_%s' % i_sample))
                        ep_return[0:0] = [i_sample, i_kshot_training]
                        writer.writerow(ep_return)

                # Test Performance of kshot model on <test_batchsz> K-length trajectories.
                runner.nsteps = K
                for i in range(test_batchsz):
                    i_sample += runner.nsteps
                    # Sample trajectory and process samples, i.e. discounting and advantage estimation
                    obs, returns, dones, actions, values, neglogpacs, states, reward_window, _ = runner.run()

                    # Add the held-out experience to the test batches used for the meta update
                    test_obs.append(obs)
                    test_rewards.append(returns)
                    test_actions.append(actions)
                    test_values.append(values)
                    test_dones.append(dones)
                    test_neglogpacs.append(neglogpacs)
                    test_states.append(states)

                # Reset model to initial param values before fast updates
                model.set_trainable_vars_vals(init_meta_param)

            # Train meta model on test error, based on test samples and estimates with initial parameter vector.
            # Reshape test_samples:
            test_obs = np.asarray(test_obs).swapaxes(0, 1).reshape(-1, nd)
            test_rewards = np.asarray(test_rewards).swapaxes(0, 1).flatten()  # Should be K*test_batchsz,1
            test_actions = np.asarray(test_actions).swapaxes(0, 1).flatten()
            test_values = np.asarray(test_values).swapaxes(0, 1).flatten()
            test_dones = np.asarray(test_dones).swapaxes(0, 1).flatten()
            test_neglogpacs = np.asarray(test_neglogpacs).swapaxes(0, 1).flatten()
            test_states = np.asarray(test_states).swapaxes(0, 1).squeeze()
            test_states = tuple(test_states)
            model.meta_train(test_obs, test_states, test_rewards, test_actions, test_values,
                             tb_dones=test_dones, tb_neglogpacs=test_neglogpacs)

    else:
        # k-shot learning to adapt to the environment:
        # - how often is the meta policy updated?
        # - are the parameters reset to the original meta policy after every k-shot sequence?
        pass
Example 5
def q_learning(
        env,
        test_env,
        seed,
        total_timesteps=int(1e8),
        gamma=0.95,
        epsilon=0.4,
        epsilon_decay=.95,
        tau=0.99,
        buffer_size=4000,
        nbatch=128,
        trace_length=32,
        lr=5e-4,
        lrschedule='linear',
        max_grad_norm=0.01,
        units_per_hlayer=(8, 8, 8),  # pre_train_steps=1000,
        scope='model',
        update_interval=5,
        log_interval=100,
        test_interval=0,
        show_interval=0,
        logdir=None,
        keep_model=7,
        activ_fcn='relu6'):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.
    Implements the options of online learning or using experience replay and also
    target calculation by target networks, depending on the flags. You can reuse
    your Q-learning implementation of the last exercise.

    Args:
        env: PLE game
        approx: Action-Value function estimator
        num_episodes: Number of episodes to run for.
        max_time_per_episode: maximum number of time steps before episode is terminated
        discount_factor: gamma, discount factor of future rewards.
        epsilon: Chance to sample a random action. Float betwen 0 and 1.
        epsilon_decay: decay rate of epsilon parameter
        use_experience_replay: Indicator if experience replay should be used.
        batch_size: Number of samples per batch.
        target: Slowly updated target network to calculate the targets. Ignored if None.

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    logger = logging.getLogger(__name__)
    logger.info(datetime.time())
    tf.reset_default_graph()
    set_global_seeds(seed)

    # Params
    ob_space = env.observation_space
    ac_space = env.action_space
    nd, = ob_space.shape
    n_ac = ac_space.n
    # use_exp_replay = False if nbatch == 1 else True

    # Create learning agent and the replay buffer
    agent = DRQNAgent(ob_space=ob_space,
                      ac_space=ac_space,
                      scope=scope,
                      lr=lr,
                      lrschedule=lrschedule,
                      max_grad_norm=max_grad_norm,
                      units_per_hlayer=units_per_hlayer,
                      nbatch=nbatch,
                      trace_length=trace_length,
                      tau=tau,
                      total_timesteps=total_timesteps,
                      update_interval=update_interval,
                      log_interval=log_interval,
                      logdir=logdir,
                      keep_model=keep_model,
                      activ_fcn=activ_fcn)
    summary_writer = agent.get_summary_writer()
    result_path = os.path.join(logdir, 'results.csv')

    sample_size = 5  # [s, a, r, s1, d]
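    # The replay buffer stores complete episodes; traces of length trace_length are sampled from it for training.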
    replay_buffer = ExperienceBuffer(buffer_size, sample_size=sample_size)

    # Keeps track of useful statistics; plain dict stand-in for EpisodeStats, matching the indexing below
    stats = {'episode_lengths': [], 'episode_rewards': []}

    # ------------------ TRAINING --------------------------------------------
    logger.info("Start Training")
    i_episode, i_sample, i_train = 0, 0, 0
    ep_len, ep_rew = 0, 0  # length and return of the current episode
    rolling_mean = deque(maxlen=30)
    return_threshold = 40  # models which achieve a higher total return get stored.

    # Reset env
    obs = env.reset()
    obs = normalize_obs(obs)
    done = False

    # The rnn state consists of the "cell state" c and the "input vector" x_t = h_{t-1}
    rnn_state0 = (np.zeros([1, units_per_hlayer[2]]),
                  np.zeros([1, units_per_hlayer[2]]))  # two 1 x units_per_hlayer[2] vectors (c and h)

    episode_buffer = []

    # Set the target network to be equal to the primary network
    agent.update_target(agent.target_ops)
    # for update in range(1, total_timesteps // nbatch + nbatch):  # as we start training only after nbatch experiences
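    # Main interaction loop: epsilon-greedy action selection with the recurrent Q network,
    # periodic target-network updates and gradient steps on traces sampled from the replay buffer.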
    while i_sample < total_timesteps:
        # # Update target network
        # if target:
        #     target.update()

        # Choose epsilon-greedy action (with epsilon being the chance of random action) using the network
        if np.random.rand(1) < epsilon:
            _, next_rnn_state = agent.step_model.step([obs], rnn_state0)
            action = np.random.randint(0, n_ac)
        else:
            # print(obs)  # TODO do I get the right action here?
            # print(agent.step_model.predict([obs], rnn_state0))
            AP, next_rnn_state = agent.step_model.step([obs], rnn_state0)
            action = AP[0]
            # print(action)
            # action = np.random.choice(np.arange(n_ac), p=AP)

        # AP = agent.step([obs], epsilon=epsilon)  # epsilon greedy action
        # action = np.random.choice(np.arange(n_ac), p=AP)
        next_obs, reward, done, _ = env.step(action)
        # print('rew %s' % reward)
        next_obs = normalize_obs(next_obs)
        reward -= 1e-5
        i_sample += 1

        # render only every i-th episode
        if show_interval != 0:
            if i_episode % show_interval == 0:
                env.render()

        # episode stats
        # stats['episode_lengths'][i_episode] += 1
        # stats['episode_rewards'][i_episode] += reward
        ep_len += 1  # TODO check whether this works
        ep_rew += reward
        rolling_mean.append(reward)

        # When episode is done, add episode information to tensorboard summary and stats
        if done:  # env.game_over():
            next_obs = list(np.zeros_like(next_obs, dtype=np.float64))
            episode_buffer.append(
                np.reshape(np.array([obs, action, reward, next_obs, done]),
                           newshape=[1, 5]))

            replay_buffer.add(list(zip(np.array(episode_buffer))))  # TODO does this lead to an object dtype?
            episode_buffer = []
            stats['episode_lengths'].append(ep_len)
            stats['episode_rewards'].append(ep_rew)

            if summary_writer is not None:
                summary = tf.Summary()
                summary.value.add(
                    tag='envs/ep_return',
                    simple_value=stats['episode_rewards'][i_episode])
                summary.value.add(
                    tag="envs/ep_length",
                    simple_value=stats['episode_lengths'][i_episode])
                summary_writer.add_summary(summary, i_episode)
                summary_samples = tf.Summary()
                summary_samples.value.add(
                    tag='envs/samples/ep_return',
                    simple_value=stats['episode_rewards'][i_episode])
                summary_samples.value.add(
                    tag="envs/samples/ep_length",
                    simple_value=stats['episode_lengths'][i_episode])
                summary_writer.add_summary(summary_samples, i_sample)
                summary_writer.flush()

            if ep_rew > return_threshold:
                return_threshold = ep_rew
                logger.info('Save model at max reward %s' % return_threshold)
                agent.save('inter_model')
            i_episode += 1
            # print(i_episode)
            ep_len, ep_rew = 0, 0
            # stats['episode_lengths'].append(0)
            # stats['episode_rewards'].append(0)
        else:
            episode_buffer.append(
                np.reshape(np.array([obs, action, reward, next_obs, done]),
                           newshape=[1, 5]))

        # Compute the TD target and update the model from traces sampled from the replay buffer,
        # once enough episodes have been collected.
        if i_episode >= nbatch:  # the number of finished episodes must be greater than or equal to
            # the number of episodes from which traces are sampled
            if i_sample % update_interval == 0:
                # TODO update epsilon

                agent.update_target(agent.target_ops)

                # reset rnn state (history knowledge) before every training step
                rnn_state_train = (np.zeros([nbatch, units_per_hlayer[2]]),
                                   np.zeros([nbatch, units_per_hlayer[2]]))

                # sample training batch from replay buffer
                training_batch = replay_buffer.sample(
                    nbatch=nbatch, trace_length=trace_length)

                mb_obs = training_batch[:, 0].tolist()
                mb_actions = training_batch[:, 1].astype(np.int32)
                mb_rewards = training_batch[:, 2].astype(np.float64)
                mb_next_obs = training_batch[:, 3].tolist()
                mb_dones = training_batch[:, 4].astype(bool)

                # Compute target Q values for the given batch
                mb_next_q_values, _ = agent.target_model.predict(
                    mb_next_obs, rnn_state=rnn_state_train)
                mb_best_next_action = np.argmax(mb_next_q_values, axis=1)
                mb_td_target = [
                    mb_rewards[j] +
                    gamma * mb_next_q_values[j][mb_best_next_action[j]]
                    for j in range(nbatch * trace_length)
                ]

                # train model
                # start_time = datetime.time()
                loss = agent.train(mb_obs, mb_actions, mb_td_target,
                                   rnn_state_train)
                # logger.info('Train duration: %s - %s' % (start_time, datetime.time()))
                i_train += 1

                # If test_interval > 0 the learned model is evaluated every "test_interval" gradient updates
                if test_interval > 0 and i_train > 0 and (i_train %
                                                          test_interval == 0):
                    # print('testing')
                    ep_return = agent.test_run(test_env,
                                               n_eps=30,
                                               n_pipes=2000)
                    with open(result_path, "a") as csvfile:
                        writer = csv.writer(csvfile)
                        ep_return = [str(p) for p in ep_return]
                        ep_return.insert(0, ('step_%s_eps_%s' %
                                             (i_sample, i_episode)))
                        writer.writerow(ep_return)
        if done:
            # Reset the model
            next_obs = env.reset()
            next_obs = normalize_obs(next_obs)

        epsilon *= epsilon_decay
        obs = next_obs
        rnn_state0 = next_rnn_state

    # Save final model when training is finished.
    agent.save('final_model')
    logger.info('Finished Training. Saving Final model.')

    logger.info('*******************************************************')
    logger.info('Total number of interactions with the environment: %s' %
                i_sample)
    logger.info('Total number of finished episodes during training: %s' %
                i_episode)
    logger.info('Total number of parameter updates during training: %s' %
                i_train)
    logger.info('*******************************************************\n')
Example 6
def eval_model(render, nepisodes, **params):
    logger = logging.getLogger(__name__)
    logger.info('Evaluating learning algorithm...\n')
    logger.info(params["eval_model"])

    logger.debug('\nMake Environment with seed %s' % params["seed"])
    # TODO make non-clipped env, even if agent is trained on clipped env
    ple_env = make_ple_env(params["env"],
                           seed=params["seed"])  # , allow_early_resets=True)

    tf.reset_default_graph()
    set_global_seeds(params["seed"])
    model_idx = []

    if params["eval_model"] == 'final':
        f = glob.glob(os.path.join(params["logdir"], 'final_model-*.meta'))
        idx = f.find('final_model')
        f_name = f[idx:-5]
        model_idx.append(f_name)
        with tf.Session() as sess:
            OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_drqn_model(
                sess, logdir=params["logdir"], f_name=f_name)
            model_performance = run_episodes(sess, ple_env, nepisodes, 1000,
                                             render, params["epsilon"], OBS,
                                             RNN_S_IN, RNN_S_OUT, PRED_Q)

            # Add model performance metrics
            avg_performances = np.mean(model_performance)
            var_performances = np.var(model_performance)
            maximal_returns = np.max(model_performance)

        tf.reset_default_graph()

    elif params["eval_model"] == 'all':
        # Use all stored maximum performance models and the final model.
        avg_performances = []
        var_performances = []
        maximal_returns = []
        iii = 0
        for f in glob.glob(os.path.join(params["logdir"], '*inter*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_drqn_model(
                    sess, logdir=params["logdir"], f_name=f_name)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes,
                                                 1000, render,
                                                 params["epsilon"], OBS,
                                                 RNN_S_IN, RNN_S_OUT, PRED_Q)

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                var_performances.append(np.var(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()
    elif params["eval_model"] == "config":
        # Use all stored maximum performance models and the final model.
        avg_performances = []
        var_performances = []
        maximal_returns = []

        # Setup log csv file
        fieldnames = ['model']
        for i in range(nepisodes):
            fieldnames.append(('eps' + str(i)))
        path = os.path.join(params["logdir"], 'results.csv')
        with open(path, "w") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(fieldnames)

        # Run evaluation episodes
        models = glob.glob(
            os.path.join(params["logdir"], '*config_model*.meta'))
        models.sort()
        for f in models:
            logger.info('Restore model: %s' % f)
            idx = f.find('config_model')
            f_name = f[idx:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_drqn_model(
                    sess, logdir=params["logdir"], f_name=f_name)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes,
                                                 2000, render,
                                                 params["epsilon"], OBS,
                                                 RNN_S_IN, RNN_S_OUT,
                                                 PRED_Q)  # TODO 1000

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                var_performances.append(np.var(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

            # Save episode information in csv file for further analysis.
            # Each row contains nepisodes episodes using the current model "f_name".
            with open(path, "a") as csvfile:  # TODO add real returns
                writer = csv.writer(csvfile)
                model_performance = [str(p) for p in model_performance]
                model_performance.insert(0, f_name)
                writer.writerow(model_performance)

    logger.info(params["logdir"])
    logger.info('Results of the evaluation of the learning algorithm:')
    logger.info('Restored models: %s' % model_idx)
    logger.info('Average performance per model: %s' % avg_performances)
    logger.info('Performance variance per model: %s' % var_performances)
    logger.info('Maximum episode return per model: %s' % maximal_returns)
    ple_env.close()

    if len(avg_performances) > 0:
        return np.mean(avg_performances), np.mean(var_performances), np.mean(
            maximal_returns)
    else:
        return -5, 0, -5
def eval_model(render,
               nepisodes,
               test_steps,
               save_traj=False,
               result_file='test_results.csv',
               **params):
    logger = logging.getLogger(__name__)
    logger.info('Evaluating learning algorithm...\n')
    logger.info(params["eval_model"])

    logger.debug('\nMake Environment with seed %s' % params["seed"])
    # TODO use different seed for every run!
    # TODO make non-clipped env, even if agent is trained on clipped env
    ple_env = make_ple_env(params["test_env"], seed=params["seed"])  # , allow_early_resets=True

    tf.reset_default_graph()
    set_global_seeds(params["seed"])
    model_idx = []

    if save_traj:
        result_path = os.path.join(params["logdir"], result_file)
    else:
        result_path = None

    recurrent = (params["architecture"] == 'lstm'
                 or params["architecture"] == 'gru')
    if params["eval_model"] == 'final':
        avg_performances = []
        var_performances = []
        maximal_returns = []
        for f in glob.glob(
                os.path.join(params["logdir"], '*final_model-*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('final_model')
            f_name = f[idx:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model(
                    sess,
                    logdir=params["logdir"],
                    f_name=f_name,
                    isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes,
                                                 test_steps, render, OBS,
                                                 RNN_S_IN, RNN_S_OUT, PRED_Q,
                                                 result_path, params["seed"])
                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                var_performances.append(np.var(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

    elif params["eval_model"] == 'inter':
        # Use all stored maximum performance models and the final model.
        # print('Eval now!')
        avg_performances = []
        var_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*inter*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model(
                    sess,
                    logdir=params["logdir"],
                    f_name=f_name,
                    isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes,
                                                 test_steps, render, OBS,
                                                 RNN_S_IN, RNN_S_OUT, PRED_Q,
                                                 result_path, params["seed"])

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                var_performances.append(np.var(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()
    elif params["eval_model"] == 'analysis':
        # Use all stored maximum performance models and the final model.
        avg_performances = []
        std_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            # print(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model(
                    sess,
                    logdir=params["logdir"],
                    f_name=f_name,
                    isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes,
                                                 test_steps, render, OBS,
                                                 RNN_S_IN, RNN_S_OUT, PRED_Q,
                                                 result_path, params["seed"])

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                std_performances.append(np.std(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()
        return model_idx, avg_performances, std_performances

    logger.info(params["logdir"])
    logger.info('Results of the evaluation of the learning algorithm:')
    logger.info('Restored models: %s' % model_idx)
    logger.info('Average performance per model: %s' % avg_performances)
    logger.info('Performance variance per model: %s' % var_performances)
    logger.info('Maximum episode return per model: %s' % maximal_returns)
    ple_env.close()

    if not avg_performances == []:
        return np.mean(avg_performances), np.mean(var_performances), np.mean(
            maximal_returns)
    else:
        return -3000, 3000, -3000