Example 1
from mpl_toolkits.mplot3d import Axes3D
from toolsm import tools
import numpy as np
import tensorflow as tf  # needed by batch_norm_relu below
import baselines.common.tf_util as U
# path_root = '/media/d/e/et/baselines'
from baselines.common.distributions import DiagGaussianPd

import os
import pandas as pd

TabularActionPrecision = 5

import baselines
from toolsm import logger
path_root = logger.get_logger_dir('baselines/KL2Clip', baselines, 'results/KL2Clip')

# print(path_root)
# exit()
_BATCH_NORM_DECAY = 0.997
_BATCH_NORM_EPSILON = 1e-5


def batch_norm_relu(inputs, is_training):
    """Performs a batch normalization followed by a ReLU."""
    # We set fused=True for a significant performance boost. See
    # https://www.tensorflow.org/performance/performance_guide#common_fused_ops
    inputs = tf.layers.batch_normalization(
        inputs=inputs, momentum=_BATCH_NORM_DECAY,
        epsilon=_BATCH_NORM_EPSILON, center=True,
        scale=True, training=is_training, fused=True)
    inputs = tf.nn.relu(inputs)
    return inputs
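
# A minimal usage sketch (an assumption, not part of the original snippet; the
# shapes and names are hypothetical). TF1 graph mode is implied by the fused
# tf.layers.batch_normalization call above.
x = tf.placeholder(tf.float32, shape=[None, 64])   # a batch of feature vectors
is_training = tf.placeholder(tf.bool, shape=[])    # toggles batch vs. moving statistics
y = batch_norm_relu(x, is_training)
# The BN moving-average update ops are collected in tf.GraphKeys.UPDATE_OPS and
# must be run alongside the train op during training.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)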
Example 2
def main():
    parser, args_default = arg_parser_common()
    args = parser.parse_args()


    import json
    from dotmap import DotMap
    from copy import copy, deepcopy
    keys_exclude = [
        'coef_predict_task', 'is_multiprocess', 'n_envs', 'eval_interval',
        'n_steps', 'n_minibatches', 'play', 'n_eval_epsiodes', 'force_write',
        'kl2clip_sharelogstd', 'policy_variance_state_dependent',
        'kl2clip_clip_clipratio', 'kl2clip_decay', 'lr', 'num_timesteps',
        'gradient_rectify', 'rectify_scale', 'kl2clip_clipcontroltype',
        'reward_scale', 'explore_additive_rate', 'explore_additive_threshold',
        'explore_timesteps', 'debug_halfcheetah', 'name_project',
        'n_opt_epochs', 'coef_entropy', 'log_interval', 'save_interval',
        'save_debug', 'isatari', 'env_full', 'envtype'
    ]
    keys_exclude.extend([
        'logstd', 'lam', 'hidden_sizes', 'num_layers', 'num_sharing_layers',
        'ac_fn', 'lam_decay', 'policy_type'
    ])
    # TODO: These args should appear in the dir name only if they are explicitly specified.
    # TODO: Split args into group_keys and run_keys.

    #  -------------------- prepare args

    args.env_full = args.env
    args.env = args.env_full.split('-v')[0]

    if not args.isatari:
        args.envtype = MUJOCO
        if '-v' not in args.env_full:
            args.env_full = f'{args.env}-v2'
    else:
        keys_exclude.append('logstd')
        args.envtype = ATARI
        # if 'NoFrameskip' not in args.env:
        #     args.env = f''
        if '-v' not in args.env_full:
            args.env_full = f'{args.env}-v4'
    tools.warn_(f'Run with setting for {args.envtype} task!!!!!')

    assert bool(args.alg) != bool(args.cliptype), 'Exactly one of alg and cliptype should be specified'
    if args.alg:  # For release
        args.cliptype = alg2cliptype[args.alg]
        keys_exclude.append('cliptype')
        if len(args.keys_group) == 0:
            args.keys_group = ['alg']
        if args.name_group is None:
            args.name_group = ''
    else:  # For debug
        keys_exclude.append('alg')
        if len(args.keys_group) == 0:
            args.keys_group = ['cliptype', 'clipargs']
        if args.name_group is None:
            args.name_group = 'tmp'


    # ------ Set the values of args
    def update_dict(dictmain, dictnew):
        for key_arg in dictnew:
            if key_arg.startswith('__'):
                # A '__'-prefixed key holds overrides customized for specific
                # values of the corresponding plain key.
                key_interest = key_arg[2:]  # e.g., '__cliptype' -> 'cliptype'
                value_interest = dictmain[key_interest]  # current value in dictmain, e.g., kl_klrollback_constant_withratio
                if value_interest in dictnew[key_arg].keys():
                    dictmain = update_dict(dictmain, dictnew[key_arg][value_interest])
            else:
                if isinstance(dictnew[key_arg], dict) and key_arg in dictmain.keys():
                    dictmain[key_arg].update(dictnew[key_arg])
                else:
                    dictmain[key_arg] = copy(dictnew[key_arg])
        return dictmain

    def reform_specific_dict(d):
        dictmain = dict((k, v) for k, v in d.items() if not k.startswith('__'))
        dictspecific = dict((k, v) for k, v in d.items() if k.startswith('__'))
        return update_dict(dictmain, dictspecific)
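
    # Worked example of the '__'-prefix convention above (values hypothetical):
    # a '__cliptype' entry holds per-cliptype overrides that are merged only
    # when dictmain['cliptype'] matches one of its keys.
    #   dictmain = {'cliptype': 'kl2clip', 'lr': 3e-4}
    #   dictnew  = {'lr': 1e-4, '__cliptype': {'kl2clip': {'delta': 0.025}}}
    #   update_dict(dictmain, dictnew)
    #   -> {'cliptype': 'kl2clip', 'lr': 1e-4, 'delta': 0.025}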


    # If an arg's value is None, it is deleted here so that the built-in defaults can fill it in below.
    keys_del = []
    args = vars(args)
    keys = list(args.keys())
    for key in keys:
        if args[key] is None:
            del args[key]  # drop the unset arg
            keys_del.append(key)
    if len(keys_del) > 0:
        print('The following args were not given values on the command line; built-in values will be used.\n', ', '.join(keys_del))

    # args__ = update_dict( copy(args_default), args ) # We need to update the basic args, e.g., env, cliptype
    # args__  = reform_specific_dict( args__)
    # The following operations may seem strange; a cleaner version may come in the future.
    args__ = update_dict(deepcopy(args), args_default)  # generate the default values from args_default
    args = update_dict(args__, args)  # values supplied explicitly take highest priority
    for key in keys_del:  # make sure keys_del are within args.keys()
        assert key in args.keys(), key
    # print( json.dumps(args, indent=True) )
    # exit()
    # TODO prepare_dir: rename .finish_indicator to a clearer name.
    # --- prepare dir
    import baselines
    root_dir = tools_logger.get_logger_dir('baselines', 'results', baselines)
    args = tools_logger.prepare_dirs(args, key_first='env', keys_exclude=keys_exclude, dirs_type=['log'], root_dir=root_dir)
    # --- prepare args for use
    args.cliptype = ClipType[args.cliptype]

    args.zip_dirs = ['model', 'monitor']
    for d in args.zip_dirs:
        args[f'{d}_dir'] = osp.join(args.log_dir, d)
        os.mkdir(args[f'{d}_dir'])

    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2_AdaClip import ppo2
    # from baselines.ppo2_AdaClip import ppo2_kl2clip_conservative as ppo2
    import baselines.ppo2_AdaClip.policies as plcs
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()
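    # Entering the session via __enter__ (without a `with` block) installs it
    # as the default session for all graph execution below.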


    set_global_seeds(args.seed)
    policy = getattr(plcs, args.policy_type)


    # ------ prepare env
    # args.eval_model = args.n_eval_epsiodes > 0
    if args.envtype == MUJOCO:
        def make_mujoco_env(rank=0):
            def _thunk():
                env = gym.make(args.env_full)
                env.seed(args.seed + rank)
                env = bench.Monitor(env, os.path.join(args.log_dir, 'monitor', str(rank)), allow_early_resets=True)
                return env

            return _thunk

        if args.n_envs == 1:
            env = DummyVecEnv([make_mujoco_env()])
        else:
            from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
            env = SubprocVecEnv([make_mujoco_env(i) for i in range(args.n_envs)])
        env = VecNormalize(env, reward_scale=args.reward_scale)

        env_test = None
        if args.n_eval_epsiodes > 0:
            if args.n_eval_epsiodes == 1:
                env_test = DummyVecEnv([make_mujoco_env()])
            else:
                from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
                env_test = SubprocVecEnv([make_mujoco_env(i) for i in range(args.n_eval_epsiodes)])
            env_test = VecNormalize(env_test, ret=False, update=False)  # no need to normalize returns during evaluation
    else:
        from baselines.common.vec_env.vec_frame_stack import VecFrameStack
        from baselines.common.cmd_util import make_atari_env
        env = VecFrameStack(make_atari_env(args.env_full, num_env=args.n_envs, seed=args.seed), 4)
        env_test = None
        #  TODO : debug VecFrame
        if args.n_eval_epsiodes > 0:
            env_test = VecFrameStack(make_atari_env(args.env_full, num_env=args.n_eval_epsiodes, seed=args.seed), 4)
            # env_test.reset()
            # env_test.render()
    # ----------- learn
    if args.envtype == MUJOCO:
        lr = args.lr
        # cliprange = args.clipargs.cliprange
    elif args.envtype == ATARI:
        lr = lambda f: f * args.lr
        # cliprange = lambda f: f*args.clipargs.cliprange if args.clipargs.cliprange is not None else None
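        # Assumption from the upstream baselines ppo2 convention: a callable lr
        # is a schedule invoked with the remaining-progress fraction f in (0, 1],
        # so the Atari learning rate anneals linearly toward zero.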
    # print('action_space',env.action_space)
    ppo2.learn(policy=policy, env=env, env_eval=env_test, n_steps=args.n_steps, nminibatches=args.n_minibatches,
               lam=args.lam, gamma=0.99, n_opt_epochs=args.n_opt_epochs, log_interval=args.log_interval,
               ent_coef=args.coef_entropy,
               lr=lr,
               total_timesteps=args.num_timesteps,
               cliptype=args.cliptype, save_interval=args.save_interval, args=args)

    tools_logger.finish_dir( args.log_dir )
Example 3
def main():
    parser, clipargs_default_all, args_default_all = arg_parser_common()
    args = parser.parse_args()

    import json
    from dotmap import DotMap
    keys_exclude = [
        'coef_predict_task', 'is_multiprocess', 'n_envs', 'eval_interval',
        'n_steps', 'n_minibatches', 'play', 'n_eval_epsiodes', 'force_write',
        'kl2clip_sharelogstd', 'policy_variance_state_dependent',
        'kl2clip_clip_clipratio', 'kl2clip_decay', 'lr', 'num_timesteps',
        'gradient_rectify', 'rectify_scale', 'kl2clip_clipcontroltype',
        'reward_scale', 'explore_additive_rate',
        'explore_additive_threshold', 'explore_timesteps', 'debug_halfcheetah',
        'name_project', 'env_pure', 'n_opt_epochs', 'coef_entropy',
        'log_interval', 'save_interval', 'save_debug', 'is_atari'
    ]

    #  -------------------- prepare args

    args.env_pure = args.env.split('-v')[0]

    # env_mujocos = 'InvertedPendulum,InvertedDoublePendulum,HalfCheetah,Hopper,Walker2d,Ant,Reacher,Swimmer,Humanoid'
    # env_mujocos = tools.str2list(env_mujocos)
    if not args.is_atari:
        env_type = MUJOCO
        if '-v' not in args.env:
            args.env = f'{args.env}-v2'
    else:
        env_type = ATARI
        if '-v' not in args.env:
            args.env = f'{args.env}-v4'
    tools.warn_(f'Run with setting for {env_type} task!!!!!')

    # --- set value of clipargs
    clipargs_default = clipargs_default_all[env_type]

    clipargs = clipargs_default[args.cliptype].copy()
    clipargs.update(args.clipargs)
    args.clipargs = clipargs
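    # Assumed layout of clipargs_default_all (hypothetical values):
    # clipargs_default_all[env_type][cliptype] holds the per-cliptype defaults,
    # which any clipargs passed on the command line then override, e.g.
    #   clipargs_default_all = {MUJOCO: {'ratio': {'cliprange': 0.2}}}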

    # --- prepare other args
    # If an arg's value is None, it is set from the built-in defaults below.
    args_default = args_default_all[env_type]
    args = DotMap(vars(args))
    print("The following args are None; they are set to built-in values:")

    for argname in args_default.keys():
        if args[argname] is None:
            if args.env_pure in args_default[argname].keys():
                args[argname] = args_default[argname][args.env_pure]
            else:
                args[argname] = args_default[argname]['_default']
            print(f"{argname}={args[argname]}")
    # print( json.dumps( args.toDict(), indent='\t') )
    # exit()
    # TODO prepare_dir: rename .finish_indicator to a clearer name.
    # --- prepare dir
    import baselines
    root_dir = tools_logger.get_logger_dir('baselines', baselines, 'results')
    args = tools_logger.prepare_dirs(args,
                                     key_first='env',
                                     keys_exclude=keys_exclude,
                                     dirs_type=['log'],
                                     root_dir=root_dir)
    # --- prepare args for use
    args.cliptype = ClipType[args.cliptype]

    args.zip_dirs = ['model', 'monitor']
    for d in args.zip_dirs:
        args[f'{d}_dir'] = osp.join(args.log_dir, d)
        os.mkdir(args[f'{d}_dir'])

    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2_AdaClip import ppo2
    # from baselines.ppo2_AdaClip import ppo2_kl2clip_conservative as ppo2
    import baselines.ppo2_AdaClip.policies as plcs
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()

    set_global_seeds(args.seed)
    policy = getattr(plcs, args.policy_type)

    # ------ prepare env
    # args.eval_model = args.n_eval_epsiodes > 0
    if env_type == MUJOCO:

        def make_mujoco_env(rank=0):
            def _thunk():
                env = gym.make(args.env)
                env.seed(args.seed + rank)
                env = bench.Monitor(env,
                                    os.path.join(args.log_dir, 'monitor',
                                                 str(rank)),
                                    allow_early_resets=True)
                return env

            return _thunk

        if args.n_envs == 1:
            env = DummyVecEnv([make_mujoco_env()])
        else:
            from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
            env = SubprocVecEnv(
                [make_mujoco_env(i) for i in range(args.n_envs)])
        env = VecNormalize(env, reward_scale=args.reward_scale)

        env_test = None
        if args.n_eval_epsiodes > 0:
            if args.n_eval_epsiodes == 1:
                env_test = DummyVecEnv([make_mujoco_env()])
            else:
                from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
                env_test = SubprocVecEnv(
                    [make_mujoco_env(i) for i in range(args.n_eval_epsiodes)])
            env_test = VecNormalize(
                env_test, ret=False,
                update=False)  # no need to normalize returns during evaluation
    else:
        from baselines.common.vec_env.vec_frame_stack import VecFrameStack
        from baselines.common.cmd_util import make_atari_env
        env = VecFrameStack(
            make_atari_env(args.env, num_env=args.n_envs, seed=args.seed), 4)
        env_test = None
        #  TODO : debug VecFrame
        if args.n_eval_epsiodes > 0:
            env_test = VecFrameStack(
                make_atari_env(args.env,
                               num_env=args.n_eval_epsiodes,
                               seed=args.seed), 4)
            # env_test.reset()
            # env_test.render()
    # ----------- learn
    if env_type == MUJOCO:
        lr = args.lr
        # cliprange = args.clipargs.cliprange
    elif env_type == ATARI:
        lr = lambda f: f * args.lr
        # cliprange = lambda f: f*args.clipargs.cliprange if args.clipargs.cliprange is not None else None
    args.env_type = env_type
    ppo2.learn(policy=policy,
               env=env,
               env_eval=env_test,
               n_steps=args.n_steps,
               nminibatches=args.n_minibatches,
               lam=args.lam,
               gamma=0.99,
               n_opt_epochs=args.n_opt_epochs,
               log_interval=args.log_interval,
               ent_coef=args.coef_entropy,
               lr=lr,
               total_timesteps=args.num_timesteps,
               cliptype=args.cliptype,
               save_interval=args.save_interval,
               args=args)

    tools_logger.finish_dir(args.log_dir)