Code example #1
def meil(WORKING_DIR, EXPERT_DIR, args):

    expert_distribution = Gaussian_Density()
    with tf.Session() as sess:
        env = gym.make(args.env)
        expert = load_policy(sess, EXPERT_DIR)
        expert_distribution.train(env, expert, args.trajects, args.distr_gamma, args.iter_length)
        env.close()
    expert_density = expert_distribution.density()

    env = gym.make(args.env)
    policy_distr = Gaussian_Density()
    policy = lambda s: np.random.uniform(-2.0, 2.0, size=env.action_space.shape) # random policy
    policy_distr.train(env, policy, args.trajects, args.distr_gamma, args.iter_length)
    density = policy_distr.density()
    for i in range(args.rounds):
        reward = lambda s: expert_density(s) / (density(s) + args.eps)
        logger_kwargs = setup_logger_kwargs(str(i), data_dir=WORKING_DIR)
        message = "\nRound {} out of {}\n".format(i + 1, args.rounds)
        ppo(message, lambda : gym.make(args.env), custom_reward=reward, actor_critic=core.mlp_actor_critic,
            ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma, 
            steps_per_epoch=args.steps, epochs=args.epochs, logger_kwargs=logger_kwargs)

        with tf.Session() as sess:
            policy = load_policy(sess, os.path.join(WORKING_DIR, str(i)))
            policy_distr.train(env, policy, args.trajects, args.distr_gamma, args.iter_length)
            density = policy_distr.density()

    env.close()
    opt_dir = reward_validation(WORKING_DIR, args)
    return opt_dir
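
Every example on this page builds its EpochLogger from the dictionary returned by setup_logger_kwargs. The sketch below shows roughly what that dictionary contains and how it is consumed; the output_dir key is confirmed by example #4 further down, but the exact path layout is an assumption about Spinning Up's default behavior, not something shown on this page.

# Minimal sketch (assumption, not copied from any project on this page):
from spinup.utils.run_utils import setup_logger_kwargs
from spinup.utils.logx import EpochLogger

logger_kwargs = setup_logger_kwargs('my_exp', seed=0, data_dir='/tmp/data')
print(logger_kwargs)
# typically something like:
# {'output_dir': '/tmp/data/my_exp/my_exp_s0', 'exp_name': 'my_exp'}

logger = EpochLogger(**logger_kwargs)  # writes its output (progress file, saved config) under output_dir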
Code example #2
def main():
    # Environment Fcn
    env_fn = lambda: \
        NormalizedBoxEnv(
            CentauroTrayEnv(**env_params),
            # normalize_obs=True,
            normalize_obs=False,
            online_normalization=False,
            obs_mean=None,
            obs_var=None,
            obs_alpha=0.001,
        )

    # Logger kwargs
    logger_kwargs = setup_logger_kwargs(EXP_NAME, SEED)

    with tf.Graph().as_default():
        sac(
            env_fn,
            actor_critic=mlp_actor_critic,
            ac_kwargs=dict(hidden_sizes=(128, 128, 128)),
            seed=SEED,
            steps_per_epoch=PATHS_PER_EPOCH * PATH_LENGTH,
            epochs=EPOCHS,
            replay_size=int(1e6),
            gamma=0.99,
            polyak=0.995,  # Polyak avg target pol (0-1)
            lr=1e-3,
            alpha=0.2,  # entropy regularization coefficient (inv rew scale)
            batch_size=BATCH_SIZE,
            start_steps=10000,
            max_ep_len=PATH_LENGTH,  # Max length for trajectory
            logger_kwargs=logger_kwargs,
            save_freq=1)
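
The polyak argument above is the coefficient of the exponential moving average applied to the target-network weights. Conceptually (a short illustrative sketch, not spinup's actual update op):

# Polyak (exponential moving average) target update, illustrative only:
# theta_target <- polyak * theta_target + (1 - polyak) * theta_main
def polyak_update(target_params, main_params, polyak=0.995):
    return [polyak * t + (1.0 - polyak) * m for t, m in zip(target_params, main_params)]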
Code example #3
File: test_ppo.py  Project: domingoesteban/robolearn
def main():
    # Environment Fcn
    env_fn = lambda: \
        Reacher2D3DofGoalCompoEnv(**env_params)
    # Logger kwargs
    logger_kwargs = setup_logger_kwargs(EXP_NAME, SEED)

    with tf.Graph().as_default():
        ppo(
            env_fn,
            actor_critic=mlp_actor_critic,
            ac_kwargs=dict(hidden_sizes=(128, 128, 128)),
            seed=SEED,
            steps_per_epoch=PATHS_PER_EPOCH * PATH_LENGTH,
            epochs=10000,
            gamma=0.99,  # Discount factor (0-1)
            clip_ratio=0.2,  # clip pol objective (0.1-0.3)
            pi_lr=3e-4,
            vf_lr=1e-3,
            train_pi_iters=80,  # Max grad steps in pol loss per epoch
            train_v_iters=80,  # Max grad steps in val loss per epoch
            lam=0.97,  # Lambda for GAE-Lambda (0-1)
            max_ep_len=PATH_LENGTH,  # Max length for trajectory
            target_kl=0.01,  # KLdiv between new and old policies
            logger_kwargs=logger_kwargs,
            save_freq=10,
        )
Code example #4
File: logger.py  Project: matrl-project/matrl
 def __init__(self, config):
     self.log_dir = config["output_dir"]
     logger_kwargs = setup_logger_kwargs(config["exp_name"], config["seed"])
     logger_kwargs["output_dir"] = config["output_dir"]
     self.csv_logger = EpochLogger(**logger_kwargs)
     self.csv_logger.save_config(config)
     self.tf_logger = SummaryWriter(os.path.join(self.log_dir))
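
Example #4 overrides output_dir in the returned kwargs so that the CSV/console logger and the TensorBoard SummaryWriter share one directory. A hypothetical way such a wrapper might be used during training (log_tabular, dump_tabular and add_scalar are standard EpochLogger/SummaryWriter calls, but the loop itself is illustrative only):

# Illustrative only; `metrics_logger` stands for an instance of the class above.
for epoch, ep_ret in enumerate([10.0, 12.5, 15.2]):
    metrics_logger.csv_logger.log_tabular('Epoch', epoch)
    metrics_logger.csv_logger.log_tabular('EpRet', ep_ret)
    metrics_logger.csv_logger.dump_tabular()
    metrics_logger.tf_logger.add_scalar('EpRet', ep_ret, epoch)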
Code example #5
File: ddpg.py  Project: shariqahn/UROPFall2020
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        logger_kwargs = setup_logger_kwargs('MultiTaskDDPG')
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(globals())

        self.start_steps = 10000
Code example #6
File: spinningup_if.py  Project: zackzhao1/Distiller
    def solve(self, env1, env2):
        msglogger.info("AMC: Using Spinningup")

        # training_noise_duration = amc_cfg.num_training_episodes * steps_per_episode
        heatup_duration = env1.amc_cfg.ddpg_cfg.num_heatup_episodes * env1.steps_per_episode

        exp_name = "Test"
        seed = 0
        # The number and size of the Actor-Critic MLP hidden layers
        layers, hid = 2, 300
        logger_kwargs = setup_logger_kwargs(exp_name)

        ddpg.ddpg(
            env=env1,
            test_env=env2,
            actor_critic=core.mlp_actor_critic,
            ac_kwargs=dict(hidden_sizes=[hid] * layers,
                           output_activation=tf.sigmoid),
            gamma=1,  # discount rate
            #seed=seed,
            epochs=400,
            replay_size=env1.amc_cfg.ddpg_cfg.replay_buffer_size,
            batch_size=64,
            start_steps=heatup_duration,
            steps_per_epoch=env1.steps_per_episode,
            act_noise=env1.amc_cfg.ddpg_cfg.initial_training_noise,
            pi_lr=env1.amc_cfg.ddpg_cfg.actor_lr,
            q_lr=env1.amc_cfg.ddpg_cfg.critic_lr,
            logger_kwargs=logger_kwargs,
            noise_decay=env1.amc_cfg.ddpg_cfg.training_noise_decay)
Code example #7
File: ADC.py  Project: zhepengfei/distiller
    def ddpg_spinup(env1, env2):
        from spinup.utils.run_utils import setup_logger_kwargs
        exp_name = "Test"
        seed = 0
        # The number and size of the Actor-Critic MLP hidden layers
        layers, hid = 2, 300
        logger_kwargs = setup_logger_kwargs(exp_name)  # ,  seed)

        ddpg.ddpg(
            env=env1,
            test_env=env2,
            actor_critic=core.mlp_actor_critic,
            ac_kwargs=dict(hidden_sizes=[hid] * layers,
                           output_activation=tf.sigmoid),
            gamma=1,  # discount rate
            seed=seed,
            epochs=400,
            replay_size=2000,
            batch_size=64,
            start_steps=env1.amc_cfg.num_heatup_epochs,
            steps_per_epoch=800 * env1.num_layers(),  # every 50 episodes perform 10 episodes of testing
            act_noise=0.5,
            pi_lr=1e-4,
            q_lr=1e-3,
            logger_kwargs=logger_kwargs)
Code example #8
File: ddpg.py  Project: shariqahn/UROPFall2020
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     logger_kwargs = setup_logger_kwargs('MultiTaskDDPGAutoQuery')
     self.logger = EpochLogger(**logger_kwargs)
     self.logger.save_config(globals())
     self.init_query = False
     self.init_reward = False
     self.query_reward = 0
Code example #9
File: ddpg.py  Project: shariqahn/UROPFall2020
    def __init__(self,
                 action_space,
                 observation_space,
                 rng,
                 eps=0.9,
                 discount_factor=0.99,
                 alpha=1e-3):
        self.rng = rng
        logger_kwargs = setup_logger_kwargs('SingleTaskDDPG', self.rng)
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        self.actor_critic = MLPActorCritic
        # ac_kwargs=dict() ****?????*****
        # seed=0
        self.replay_size = int(1e6)
        self.polyak = 0.995
        self.gamma = discount_factor
        self.pi_lr = alpha
        self.q_lr = alpha
        self.batch_size = 100
        self.start_steps = 10000
        self.update_after = 1000
        self.update_every = 50
        self.act_noise = 0.1

        self.step_count = 0
        self.action_space = action_space
        self.observation_space = observation_space
        # self.observation_space = spaces.Box(-np.inf, np.inf, shape=(17,), dtype=np.float32) #fix

        # torch.manual_seed(seed)
        # np.random.seed(seed)

        # self.obs_dim = self.observation_space.shape
        self.act_dim = self.action_space.shape[0]
        # act_dim = self.action_space.n

        # Action limit for clamping: critically, assumes all dimensions share the same bound!
        self.act_limit = self.action_space.high[0]

        self.net = False
Code example #10
def worker_test(ps, start_time):

    from spinup.utils.run_utils import setup_logger_kwargs

    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
    logger = EpochLogger(**logger_kwargs)
    config = locals()
    del config['ps']
    logger.save_config(config)

    agent = Model(args)
    keys = agent.get_weights()[0]

    weights = ray.get(ps.pull.remote(keys))
    agent.set_weights(keys, weights)
    test_env = gym.make(args.env)
    while True:
        ave_ret = agent.test_agent(test_env, args)
        # print("test Average Ret:", ave_ret, "time:", time.time()-start_time)
        logger.log_tabular('AverageTestEpRet', ave_ret)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)
Code example #11
File: train_cont_model.py  Project: ac-93/braille_rl
    'use_HER':True,
    'use_prev_a':True,
    'gamma':0.95,
    'polyak':0.995,
    'act_lr' :0.0005,
    'crit_lr':0.0005,
    'alph_lr':0.001,

    # ==== noise params ====
    'alpha': 'auto',         # fixed or auto balance
    'target_entropy':-6  # fixed or auto define with act_dim
}

saved_model_dir = os.path.join('../saved_models/', rl_params['platform'], rl_params['env_type'], rl_params['env_mode'])
logger_kwargs = setup_logger_kwargs(exp_name='cont_sac',
                                    seed=rl_params['seed'],
                                    data_dir=saved_model_dir,
                                    datestamp=False)

if 'sim' in rl_params['platform']:
    from braille_rl.envs.sim.cont_sim_braille_env.mockKBGymEnv import mockKBGymEnv
    env = mockKBGymEnv(mode=rl_params['env_mode'], max_steps=rl_params['max_ep_len'])
elif 'robot' in rl_params['platform']:
    from braille_rl.envs.robot.cont_ur5_braille_env.ur5GymEnv import UR5GymEnv
    env = UR5GymEnv(mode=rl_params['env_mode'], max_steps=rl_params['max_ep_len'])

sac(env, logger_kwargs=logger_kwargs,
         network_params=network_params,
         rl_params=rl_params,
         resume_training=False,
         resume_params=dict())
Code example #12
        # Perform VPG update!
        update()

        # Log info about epoch
        #logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', average_only=True)
        #logger.log_tabular('EpLen', average_only=True)
        #logger.log_tabular('VVals', with_min_and_max=True)
        #logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        #logger.log_tabular('LossPi', average_only=True)
        #logger.log_tabular('LossV', average_only=True)
        #logger.log_tabular('DeltaLossPi', average_only=True)
        #logger.log_tabular('DeltaLossV', average_only=True)
        #logger.log_tabular('Entropy', average_only=True)
        #logger.log_tabular('KL', average_only=True)
        #logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()


from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(exp_name, seed)

vpg(lambda: gym.make(env),
    actor_critic=core.mlp_actor_critic,
    ac_kwargs=dict(hidden_sizes=[layer_size] * layers),
    gamma=gamma,
    seed=seed,
    steps_per_epoch=steps,
    epochs=epochs,
    logger_kwargs=logger_kwargs)
Code example #13
    parser.add_argument('--pi_lr', type=float, default=3e-4)
    parser.add_argument('--clp_ratio', type=float, default=0.2)
    parser.add_argument('--jahThresh', type=int, default=100000)

    args = parser.parse_args()

    mpi_fork(args.cpu)  # run parallel code with mpi

    #    from spinup.utils.run_utils import setup_logger_kwargs
    #    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)

    from spinup.utils.run_utils import setup_logger_kwargs
    import time
    currTime = round(time.time())
    logger_kwargs = setup_logger_kwargs(
        args.exp_name,
        str(args.seed) + "_e" + str(args.epochs) + "_" + args.objective + "_" +
        str(currTime) + "_PPO")

    ppo(lambda: gym.make(args.env,
                         netname=args.exp_name,
                         objective=args.objective,
                         seed=args.seed,
                         jahThresh=args.jahThresh),
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(hidden_sizes=[args.hid] * args.l),
        gamma=args.gamma,
        seed=args.seed,
        steps_per_epoch=args.steps,
        epochs=args.epochs,
        logger_kwargs=logger_kwargs,
        clip_ratio=args.clp_ratio,
Code example #14
seed = 125259
num_epochs = 100
ep_per_epoch = 5000
random_steps = 10000
max_ep_len = 1000
hidden_sizes = [400, 300]
buffer_size = int(1e6)
batch_size = 100
gamma = 0.99
lr_a = 1e-3
lr_q = 1e-3
p = 0.995
save_freq = 1

# Set logger
logger_kwargs = setup_logger_kwargs('test_own', seed)
logger = EpochLogger(**logger_kwargs)
logger.save_config(locals())

# Setting seed
np.random.seed(seed)
tf.set_random_seed(seed)

# Create environment
env = gym.make('BipedalWalker-v2')
test_env = gym.make('BipedalWalker-v2')
#env.render()

# Get important variables
action_space = env.action_space
obs_space = env.observation_space
Code example #15
    pi_lr = params["actor_learning_rate"]
    vf_lr = params["critic_learning_rate"]
    minibatch_size = params["batch_size"]
    discountFactor = params["discount_factor"]
    explorationRate = params["exploration_rate"]
    memorySize = params["memory_size"]
    hidden_sizes = params['hidden_sizes']
    act_noise_amount = params["action_noise"]
    policy_net_training_steps = params["training_steps"]
    # print("hidden_sizes: ", hidden_sizes)

    DEFAULT_DATA_DIR = osp.join(
        osp.abspath(osp.dirname(osp.dirname(__file__))), 'data')
    # PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__))
    logger_kwargs = setup_logger_kwargs(args.exp_name,
                                        seed=args.seed,
                                        data_dir=DEFAULT_DATA_DIR)
    # PROJECT_ROOT = logger_kwargs['output_dir']
    # saver = tf.train.Saver(max_to_keep=10, keep_checkpoint_every_n_hours=0.5)
    sess = tf.Session()
    with tf.variable_scope('main'):
        policy_net = actor_critic_core.AC_Net(
            env,
            EP_MAX=epochs,
            EP_LEN=episode_steps,
            GAMMA=discountFactor,
            AR=pi_lr,
            CR=vf_lr,
            BATCH=minibatch_size,
            UPDATE_STEP=policy_net_training_steps,
            hidden_sizes=hidden_sizes,
Code example #16
    parser.add_argument("--noise1", default=0.1,
                        type=float)  # Probability of selecting random action
    parser.add_argument("--noise2", default=0.1,
                        type=float)  # Std of Gaussian exploration noise
    args = parser.parse_args()

    file_name = "DDPG_%s_%s" % (args.env_name, str(args.seed))
    buffer_name = "ExpertN%sN%s_%s_%s" % (str(args.noise1), str(
        args.noise2), args.env_name, str(args.seed))
    print("---------------------------------------")
    print("Settings: " + file_name)
    print("---------------------------------------")

    from spinup.utils.logx import EpochLogger
    from spinup.utils.run_utils import setup_logger_kwargs
    logger_kwargs = setup_logger_kwargs('BufferQuality_' + buffer_name,
                                        args.seed)
    """set up logger"""
    logger = EpochLogger(**logger_kwargs)
    logger_save_freq = 10

    if not os.path.exists("./buffers"):
        os.makedirs("./buffers")

    env = gym.make(args.env_name)

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
Code example #17
File: ddpg.py  Project: shariqahn/UROPFall2020
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     logger_kwargs = setup_logger_kwargs('MultiTaskDDPGAugmentedOracle')
     self.logger = EpochLogger(**logger_kwargs)
     self.logger.save_config(globals())
Code example #18
File: sac.py  Project: yining043/SAC-discrete
        'max_ep_len':18000,
        'max_noop':10,
        'save_freq':5,
        'render':True,

        # rl params
        'gamma':0.99,
        'polyak':0.995,
        'lr':0.00025,
        'grad_clip_val':5.0,

        # entropy params
        'alpha': 'auto',
        'target_entropy_start':0.5, # proportion of max_entropy
        'target_entropy_stop':0.5,
        'target_entropy_steps':1e6,
    }

    saved_model_dir = '../../saved_models'
    logger_kwargs = setup_logger_kwargs(exp_name='sac_discrete_gb_atari_' + rl_params['env_name'], seed=rl_params['seed'], data_dir=saved_model_dir, datestamp=False)

    env = gym.make(rl_params['env_name'])

    # avoids crash when later rendering the environment
    if rl_params['render']:
        test_env(lambda:env)

    sac(lambda:env, logger_kwargs=logger_kwargs,
                    network_params=network_params,
                    rl_params=rl_params)
Code example #19
File: meil-v3.py  Project: johnhallman/reil-algorithm
def ppo(BASE_DIR,
        expert_density,
        env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        steps_per_epoch=1000,
        epochs=10,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=50,
        train_v_iters=50,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        data_n=10):

    data = {}  # ALL THE DATA

    logger_kwargs = setup_logger_kwargs(args.dir_name, data_dir=BASE_DIR)
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    # update rule
    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    policy_distr = Gaussian_Density()
    policy = lambda s: np.random.uniform(
        -2.0, 2.0, size=env.action_space.shape)  # random policy
    policy_distr.train(env, policy, args.trajects, args.distr_gamma,
                       args.iter_length)
    density = policy_distr.density()

    data[0] = {
        'pol_s': policy_distr.num_samples,
        'pol_t': policy_distr.num_trajects
    }

    dist_rewards = []

    # repeat REIL for given number of rounds
    for i in range(args.rounds):

        message = "\nRound {} out of {}\n".format(i + 1, args.rounds)
        reward = lambda s: expert_density(s) / (density(s) + args.eps)

        dist_rewards.append(reward)

        start_time = time.time()
        o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        r = reward(o)  # custom reward

        # Main loop: collect experience in env and update/log each epoch
        for epoch in range(epochs):
            for t in range(local_steps_per_epoch):

                a, v_t, logp_t = sess.run(get_action_ops,
                                          feed_dict={x_ph: o.reshape(1, -1)})

                # save and log
                buf.store(o, a, r, v_t, logp_t)
                logger.store(VVals=v_t)

                o, old_r, d, _ = env.step(a[0])
                r = reward(o)
                ep_ret += r
                ep_len += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal or (t == local_steps_per_epoch - 1):
                    if not (terminal):
                        print(
                            'Warning: trajectory cut off by epoch at %d steps.'
                            % ep_len)
                    # if trajectory didn't reach terminal state, bootstrap value target
                    last_val = old_r if d else sess.run(
                        v, feed_dict={x_ph: o.reshape(1, -1)})
                    last_val = reward(o)
                    buf.finish_path(last_val)
                    if terminal:
                        # only save EpRet / EpLen if trajectory finished
                        logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    r = reward(o)

            # store model!
            if (epoch == epochs - 1): logger.save_state({'env': env}, None)

            # Perform PPO update!
            update()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('TotalEnvInteracts',
                               (epoch + 1) * steps_per_epoch)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('DeltaLossPi', average_only=True)
            logger.log_tabular('DeltaLossV', average_only=True)
            logger.log_tabular('Entropy', average_only=True)
            logger.log_tabular('KL', average_only=True)
            logger.log_tabular('ClipFrac', average_only=True)
            logger.log_tabular('StopIter', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
            print(message)

        policy = lambda state: sess.run(
            get_action_ops, feed_dict={x_ph: state.reshape(1, -1)})[0][0]
        data[i] = {
            'pol_s': policy_distr.num_samples,
            'pol_t': policy_distr.num_trajects
        }
        data[i]['rewards'] = evaluate_reward(env, policy, data_n)

        if i != args.rounds - 1:
            policy_distr.train(env, policy, args.trajects, args.distr_gamma,
                               args.iter_length)
            density = policy_distr.density()

    return data, dist_rewards
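
The core of the MEIL/REIL loop in examples #1 and #19 is the density-ratio reward, reward(s) = expert_density(s) / (policy_density(s) + eps). The Gaussian_Density class is not shown on this page; the sketch below reproduces just the reward construction with scipy.stats.gaussian_kde standing in for it (everything here is illustrative and not taken from the project):

# Minimal sketch of the density-ratio reward, assuming Gaussian KDE densities.
import numpy as np
from scipy.stats import gaussian_kde

expert_states = np.random.randn(3, 500)   # (state_dim, n_samples); placeholder for expert rollouts
policy_states = np.random.randn(3, 500)   # placeholder for states visited by the current policy

expert_density = gaussian_kde(expert_states)
policy_density = gaussian_kde(policy_states)

eps = 1e-3  # plays the role of args.eps above
reward = lambda s: expert_density(s.reshape(-1, 1))[0] / (policy_density(s.reshape(-1, 1))[0] + eps)

print(reward(np.zeros(3)))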
Code example #20
def trpo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 
         steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3,
         train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, 
         backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), 
         save_freq=10, algo='trpo'):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ============  ================  ========================================
            Symbol        Shape             Description
            ============  ================  ========================================
            ``pi``        (batch, act_dim)  | Samples actions from policy given 
                                            | states.
            ``logp``      (batch,)          | Gives log probability, according to
                                            | the policy, of taking actions ``a_ph``
                                            | in states ``x_ph``.
            ``logp_pi``   (batch,)          | Gives log probability, according to
                                            | the policy, of the action sampled by
                                            | ``pi``.
            ``info``      N/A               | A dict of any intermediate quantities
                                            | (from calculating the policy or log 
                                            | probabilities) which are needed for
                                            | analytically computing KL divergence.
                                            | (eg sufficient statistics of the
                                            | distributions)
            ``info_phs``  N/A               | A dict of placeholders for old values
                                            | of the entries in ``info``.
            ``d_kl``      ()                | A symbol for computing the mean KL
                                            | divergence between the current policy
                                            | (``pi``) and the old policy (as 
                                            | specified by the inputs to 
                                            | ``info_phs``) over the batch of 
                                            | states given in ``x_ph``.
            ``v``         (batch,)          | Gives the value estimate for states
                                            | in ``x_ph``. (Critical: make sure 
                                            | to flatten this!)
            ============  ================  ========================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to TRPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        delta (float): KL-divergence limit for TRPO / NPG update. 
            (Should be small for stability. Values like 0.01, 0.05.)

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        damping_coeff (float): Artifact for numerical stability, should be 
            smallish. Adjusts Hessian-vector product calculation:
            
            .. math:: Hv \\rightarrow (\\alpha I + H)v

            where :math:`\\alpha` is the damping coefficient. 
            Probably don't play with this hyperparameter.

        cg_iters (int): Number of iterations of conjugate gradient to perform. 
            Increasing this will lead to a more accurate approximation
            to :math:`H^{-1} g`, and possibly slightly-improved performance,
            but at the cost of slowing things down. 

            Also probably don't play with this hyperparameter.

        backtrack_iters (int): Maximum number of steps allowed in the 
            backtracking line search. Since the line search usually doesn't 
            backtrack, and usually only steps back once when it does, this
            hyperparameter doesn't often matter.

        backtrack_coeff (float): How far back to step during backtracking line
            search. (Always between 0 and 1, usually above 0.5.)

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        algo: Either 'trpo' or 'npg': this code supports both, since they are 
            almost the same.

    """

    # initialize logger and save it
    

    # initialize seed, and set tf and np
    
    # get the env function, observation dimensions, and action dimensions
    
    # Share information about action space with policy architecture
    
    # Inputs to computation graph
    
    # Main outputs from computation graph, plus placeholders for old pdist (for KL)
    
    # Need all placeholders in *this* order later (to zip with data from buffer)
    
    # Every step, get: action, value, logprob, & info for pdist (for computing kl div)
    
    # Experience buffer
    # calculate the number of steps per epoch per process
    

    # get the info shapes
    

    # initialize the buffer
    

    # Count variables
    
    # TRPO losses
    # ratio of pi / pi_old
    # pi loss
    # v loss
    
    # Optimizer for value function
    

    # Symbols needed for CG solver
    # pi params
    # gradient
    # v_ph and hvp
    
    # check if the damping coeff is needed
    # if so, update hvp (damping_coeff * v_ph)
    
    # Symbols for getting and setting params
    # get pi params
    # set pi params
    
    # create a tf session and initialize its variables
    
    # Sync params across processes
    

    # Setup model saving
    

    def cg(Ax, b):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
        """

        # initialize x as 0s of shape b
    

        # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start.
        # make a copy of b and r as r and p
    
        # calculate r dot old (r dot r)
    

        # for cg_iterations
    

            # calc z as Ax(p)
    

            # calculate alpha 
            

            # increment x
      

            # decrement r

            # calculate r dot new (r dot r)

            # calculate p

            # update r dot old with r dot new

    def update():
        # Prepare hessian func, gradient eval
        # get inputs as a dictionary, all phs and buffer

        # calculate Hx

        # get g, pi_l_old, v_l_old

        # get g and pi_l_old averages

        # Core calculations for TRPO or NPG
        # get x

        # get alpha

        # get old paramers

        def set_and_eval(step):
            # set pi params with v_ph
            # old_params - alpha * x * step

            # return average of d_kl and pi_loss operation

        # handle npg
        
            # npg has no backtracking or hard kl constraint enforcement

        # handle trpo
        
            # trpo augments npg with backtracking line search, hard kl
            # for backtrack iterations
        
        # Value function updates
        # for train_v_iterations
        
        # update v_l_new with v_loss operation
        

        # Log changes from update
        
    # Update start time

    # reset variables
    # o, r, d, ep_ret, ep_len

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            # get agent outputs

            # decontruct the above to a, v_t, logp_t, info_t

            # save and log


            # take an action

            # update ep rewards and length
          

            # check if the episode is done

            # check if terminal or at max t for local epoch
            
                # if trajectory didn't reach terminal state, bootstrap value target

                # add the finish path to buffer

                    # only save EpRet / EpLen if trajectory finished

                # reset environment variables
                # o, r, d, ep_ret, ep_len

        # Save model
      

        # Perform TRPO or NPG update!

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('KL', average_only=True)
        if algo=='trpo':
            logger.log_tabular('BacktrackIters', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='HalfCheetah-v2')
    parser.add_argument('--hid', type=int, default=64)
    parser.add_argument('--l', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--cpu', type=int, default=4)
    parser.add_argument('--steps', type=int, default=4000)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--exp_name', type=str, default='trpo')
    args = parser.parse_args()

    mpi_fork(args.cpu)  # run parallel code with mpi

    from spinup.utils.run_utils import setup_logger_kwargs
    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)

    trpo(lambda : gym.make(args.env), actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma, 
         seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs,
         logger_kwargs=logger_kwargs)
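
The conjugate-gradient helper outlined in the comments of example #20 can be filled in along the following lines. This is a minimal NumPy sketch written from the comment outline; the EPS constant and the cg_iters argument are assumptions (in the skeleton, cg_iters comes from the enclosing trpo arguments):

import numpy as np

EPS = 1e-8  # small constant to avoid division by zero (assumption)

def cg(Ax, b, cg_iters=10):
    """Conjugate gradient: approximately solve Ax(x) = b for x."""
    x = np.zeros_like(b)            # initialize x as zeros with the shape of b
    r = b.copy()                    # residual; should be b - Ax(x), but Ax(0) = 0
    p = r.copy()                    # search direction
    r_dot_old = np.dot(r, r)
    for _ in range(cg_iters):
        z = Ax(p)                   # Hessian-vector product at p
        alpha = r_dot_old / (np.dot(p, z) + EPS)
        x += alpha * p              # increment x
        r -= alpha * z              # decrement r
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x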
Code example #21
    parser.add_argument('--backfil', type=int, default=0)
    # this parameter is deprecated; the model has the skip function enabled by default.
    parser.add_argument('--skip', type=int, default=0)
    parser.add_argument('--score_type', type=int, default=0)
    parser.add_argument('--batch_job_slice', type=int, default=10000)
    parser.add_argument('--sched_algo', type=int, default=4)
    args = parser.parse_args()

    from spinup.utils.run_utils import setup_logger_kwargs

    # build absolute paths for use in hpc_env.
    current_dir = os.getcwd()
    workload_file = os.path.join(current_dir, args.workload)
    log_data_dir = os.path.join(current_dir, './data/logs/')
    logger_kwargs = setup_logger_kwargs(args.exp_name,
                                        seed=args.seed,
                                        data_dir=log_data_dir)
    if args.pre_trained:
        model_file = os.path.join(current_dir, args.trained_model)
        # get_probs, get_value = load_policy(model_file, 'last')

        ppo(workload_file,
            args.model,
            gamma=args.gamma,
            seed=args.seed,
            traj_per_epoch=args.trajs,
            epochs=args.epochs,
            logger_kwargs=logger_kwargs,
            pre_trained=1,
            trained_model=os.path.join(model_file, "simple_save"),
            attn=args.attn,
Code example #22
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='HalfCheetah-v2')
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--exp_name', type=str, default='ex13-td3')
    parser.add_argument('--use_soln', action='store_true')
    args = parser.parse_args()

    from spinup.utils.run_utils import setup_logger_kwargs
    logger_kwargs = setup_logger_kwargs(args.exp_name + '-' + args.env.lower(),
                                        args.seed)

    all_kwargs = dict(env_fn=lambda: gym.make(args.env),
                      actor_critic=core.mlp_actor_critic,
                      ac_kwargs=dict(hidden_sizes=[128, 128]),
                      max_ep_len=150,
                      seed=args.seed,
                      logger_kwargs=logger_kwargs,
                      epochs=10)

    if args.use_soln:
        true_td3(**all_kwargs)
    else:
        td3(**all_kwargs)
Code example #23
    args = parser.parse_args()

    #mpi_fork(args.cpu)  # run parallel code with mpi

    ##make env dir to arrange files to each env dir
    from spinup.user_config import DEFAULT_DATA_DIR
    import os

    datadir = os.path.join(DEFAULT_DATA_DIR, args.env)
    os.makedirs(datadir, exist_ok=True)
    ##

    from spinup.utils.run_utils import setup_logger_kwargs
    logger_kwargs = setup_logger_kwargs(args.exp_name,
                                        args.seed,
                                        data_dir=datadir)

    sigail(lambda: gym.make(args.env),
           actor_critic=core.mlp_actor_critic_add,
           ac_kwargs=dict(hidden_sizes=[args.hid] * args.l,
                          policy_logvar=args.logvar),
           gamma=args.gamma,
           d_hidden_size=args.d_hid,
           seed=args.seed,
           steps_per_epoch=args.steps,
           epochs=args.epochs,
           train_pi_iters=args.g_itr,
           train_v_iters=args.g_itr,
           beta=args.beta,
           logger_kwargs=logger_kwargs,
Code example #24
    rospy.init_node('pelican_attitude_controller_sac_training',
                    anonymous=True,
                    log_level=rospy.WARN)
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='PelicanAttControllerEnv-v0')
    parser.add_argument('--hid', type=int, default=64)
    parser.add_argument('--l', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--epochs', type=int, default=10000)
    parser.add_argument('--exp_name', type=str, default='sac')
    args = parser.parse_args()

    from spinup.utils.run_utils import setup_logger_kwargs
    DEFAULT_DATA_DIR = osp.join(
        osp.abspath(osp.dirname(osp.dirname(__file__))), 'data')
    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed,
                                        DEFAULT_DATA_DIR)
    outdir = '/tmp/openai_ros_experiments/'
    env = gym.make(args.env)
    # env = gym.wrappers.Monitor(env, outdir, force=True)
    sac(lambda: env,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(hidden_sizes=[args.hid] * args.l),
        gamma=args.gamma,
        seed=args.seed,
        epochs=args.epochs,
        logger_kwargs=logger_kwargs)
Code example #25
File: sac.py  Project: munyan/soft-actor-critic
        'batch_size': 64,
        'start_steps': 4000,
        'max_ep_len': 1000,
        'save_freq': 5,
        'render': True,

        # rl params
        'gamma': 0.99,
        'polyak': 0.995,
        'lr': 0.001,
        'grad_clip_val': None,

        # entropy params
        'alpha': 'auto',  # fixed or auto balance
        'target_entropy': 'auto',  # fixed or auto define with act_dim
    }

    saved_model_dir = '../../saved_models'
    logger_kwargs = setup_logger_kwargs(exp_name='sac_cont_image_' +
                                        rl_params['env_name'],
                                        seed=rl_params['seed'],
                                        data_dir=saved_model_dir,
                                        datestamp=False)

    env = gym.make(rl_params['env_name'])

    sac(lambda: env,
        logger_kwargs=logger_kwargs,
        network_params=network_params,
        rl_params=rl_params)
Code example #26
def run_exp(alg="sac",alpha=None,add_penalty=1,mult_penalty=None,cost_penalty=0,buckets=None,
         epochs=30,start_steps=10000,split_policy=False,ac_kwargs={"hidden_sizes":(256,256)},
            safe_policy=False,entropy_constraint=-1,collector_policy=None,filename="",steps_per_epoch=10001,
            num_test_episodes=10,act_noise=0.1,data_aug=False,env_name='Safexp-PointGoal1-v0',batch_size=100):

    # alg determines whether sac, ppo or td3 is used.
    # alpha is the exploration parameter in sac; add_penalty is beta from the proposal.
    # If mult_penalty is not None, all rewards get multiplied by it once the constraint is violated
    # (1 - alpha from the proposal).
    # cost_penalty is equal to zeta from the proposal.
    # buckets determines how the accumulated cost is discretized for the agent: if it is None, cost is a
    # continuous variable; otherwise there are `buckets` indicator variables for a partition of [0, constraint]
    # (with the last one only activating if the constraint is violated).
    # epochs indicates how many epochs to train for; start_steps indicates how many random exploratory
    # actions to perform before using the trained policy.
    # split_policy changes the network architecture so that a second network is used for the policy and
    # Q-values when the constraint is violated.
    # ac_kwargs is a dict of arguments for the actor-critic class; hidden_sizes is a tuple with the sizes
    # of all hidden layers.
    # safe_policy indicates the saving location for a trained safe policy. If provided, the safe policy
    # takes over whenever the constraint is violated.
    # filename determines where in the results folder the results and trained policy get saved.
    # steps_per_epoch determines the amount of environment interaction per epoch; num_test_episodes the
    # number of test episodes (evaluation only) performed after each epoch.
    # act_noise controls the exploration noise used in the td3 algorithm.
    # entropy_constraint is the entropy to aim for (if sac is used with trainable alpha).
    # collector_policy specifies
    if mult_penalty == -1:
        mult_penalty = None
    if buckets == -1:
        buckets = None
    if entropy_constraint == 0:
        entropy_constraint = None
    if alpha == 0:
        alpha = None

    env = gym.make(env_name) # Create an instance of the safety-gym environment.
    # Create an instance of the constrained environment.
    env = constraint_wrapper(env,add_penalty=add_penalty,mult_penalty=mult_penalty,
                             cost_penalty=cost_penalty,buckets=buckets,safe_policy=safe_policy)
    logger_kwargs = setup_logger_kwargs(filename+"policy",data_dir="results/")
    assert alg == "sac" or alg == "td3" or alg == "ppo"
    # Select learning method
    if alg == "sac":
        import spinup.algos.pytorch.sac.core as core
        if split_policy:
            actor_critic = core.MLPActorCriticSplit
        else:
            actor_critic = core.MLPActorCritic
        # Start training with SAC
        sac_pytorch(lambda: env,epochs=epochs,alpha=alpha,steps_per_epoch=steps_per_epoch,start_steps=start_steps,
                    logger_kwargs=logger_kwargs,num_test_episodes=num_test_episodes,actor_critic=actor_critic,ac_kwargs=ac_kwargs,entropy_constraint=entropy_constraint,collector_policy=collector_policy,data_aug=data_aug,batch_size=batch_size)
    elif alg == "td3":
        import spinup.algos.pytorch.td3.core as core
        if split_policy:
            actor_critic = core.MLPActorCriticSplit
        else:
            actor_critic = core.MLPActorCritic
        # Start training with TD3
        td3_pytorch(lambda: env,epochs=epochs,steps_per_epoch=steps_per_epoch,start_steps=start_steps,logger_kwargs=logger_kwargs,
                    actor_critic=actor_critic,act_noise=act_noise,ac_kwargs=ac_kwargs,collector_policy=collector_policy,data_aug=data_aug,num_test_episodes=num_test_episodes,batch_size=batch_size)
    elif alg == "ppo":
        import spinup.algos.pytorch.ppo.core as core
        assert collector_policy==None
        if split_policy:
            actor_critic = core.MLPActorCriticSplit
        else:
            actor_critic = core.MLPActorCritic
        # Start training with PPO
        ppo_pytorch(lambda: env, epochs=epochs, steps_per_epoch=steps_per_epoch,
                    logger_kwargs=logger_kwargs,
                    actor_critic=actor_critic, ac_kwargs=ac_kwargs)

    # Ideally, you would separate train and test runs more directly here rather than relying on the alg to work exactly as described...
    # Store training results in pickle file
    with open("results/"+filename+"rews.pkl", 'wb') as f:
        pickle.dump(env.total_rews, f)
    with open("results/"+filename+"costs.pkl", 'wb') as f:
        pickle.dump(env.total_costs, f)
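
A hypothetical call to run_exp matching the parameter description at the top of example #26 (the argument values below are illustrative only, not taken from the project):

run_exp(alg="sac",
        alpha=0,                 # 0 is converted to None inside run_exp -> trainable alpha
        add_penalty=1,           # beta from the proposal
        mult_penalty=-1,         # -1 is converted to None -> no multiplicative penalty
        cost_penalty=0.5,        # zeta from the proposal
        buckets=5,               # discretize accumulated cost into 5 indicator variables
        epochs=30,
        env_name='Safexp-PointGoal1-v0',
        filename="pointgoal1_sac_")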
Code example #27
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--vm_checkpoint', type=str)
    parser.add_argument('--pim_checkpoint', type=str, default=None)

    args = parser.parse_args()

    if args.cpu > 1:
        mpi_tools.mpi_fork(args.cpu, allow_run_as_root=args.allow_run_as_root)  # run parallel code with mpi

    # Setup experiment name
    env = make_env(args.seed)

    from spinup.utils.run_utils import setup_logger_kwargs

    experiment_name = args.exp_name or env.spec.id
    logger_kwargs = setup_logger_kwargs(experiment_name, args.seed)

    # Load or create model
    saved_model_file = None
    if args.saved_model_file:
        saved_model_file = pathlib.Path(args.saved_model_file)
    elif args.continue_training:
        save_dir = pathlib.Path(logger_kwargs['output_dir'], 'pyt_save')
        if save_dir.exists():
            saved_model_file = get_latest_saved_file(save_dir, prefix='model')

    if saved_model_file:
        assert saved_model_file.exists()
        model = torch.load(saved_model_file, map_location=DEVICE)
        for p in model.parameters():
            p.requires_grad_()
Code example #28
def collect_entropy_policies(env, epochs, T, MODEL_DIR=''):

    video_dir = 'videos/' + args.exp_name

    direct = os.getcwd() + '/data/'
    experiment_directory = direct + args.exp_name
    print(experiment_directory)

    print(sys.argv)
    if not os.path.exists(experiment_directory):
        os.makedirs(experiment_directory)
        f = open(experiment_directory + '/args', 'w')
        f.write(' '.join(sys.argv))
        f.flush()

    indexes = [1, 5, 10, 15]
    states_visited_indexes = [0, 5, 10, 15]

    states_visited_cumulative = []
    states_visited_cumulative_baseline = []

    running_avg_p = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent = 0
    running_avg_ent_xy = 0

    running_avg_p_baseline = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_baseline_xy = np.zeros(
        shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent_baseline = 0
    running_avg_ent_baseline_xy = 0

    pct_visited = []
    pct_visited_baseline = []
    pct_visited_xy = []
    pct_visited_xy_baseline = []

    running_avg_entropies = []
    running_avg_entropies_xy = []
    running_avg_ps_xy = []
    avg_ps_xy = []

    running_avg_entropies_baseline = []
    running_avg_entropies_baseline_xy = []
    running_avg_ps_baseline_xy = []
    avg_ps_baseline_xy = []

    policies = []
    distributions = []
    initial_state = init_state(env)

    prebuf = ExperienceBuffer()
    env.reset()
    for t in range(10000):
        action = env.action_space.sample()
        obs, reward, done, _ = env.step(action)
        prebuf.store(get_state(env, obs))
        if done:
            env.reset()
            done = False

    prebuf.normalize()
    normalization_factors = prebuf.normalization_factors
    utils.log_statement(normalization_factors)
    prebuf = None
    if not args.gaussian:
        normalization_factors = []

    reward_fn = np.zeros(shape=(tuple(ant_utils.num_states)))

    for i in range(epochs):
        utils.log_statement("*** ------- EPOCH %d ------- ***" % i)

        # clear initial state if applicable.
        if not args.initial_state:
            initial_state = []
        else:
            utils.log_statement(initial_state)
        utils.log_statement("max reward: " + str(np.max(reward_fn)))

        logger_kwargs = setup_logger_kwargs("model%02d" % i,
                                            data_dir=experiment_directory)

        # Learn policy that maximizes current reward function.
        print("Learning new oracle...")
        seed = random.randint(1, 100000)
        sac = AntSoftActorCritic(lambda: gym.make(args.env),
                                 reward_fn=reward_fn,
                                 xid=i + 1,
                                 seed=seed,
                                 gamma=args.gamma,
                                 ac_kwargs=dict(hidden_sizes=[args.hid] *
                                                args.l),
                                 logger_kwargs=logger_kwargs,
                                 normalization_factors=normalization_factors)

        # The first policy is random
        if i == 0:
            sac.soft_actor_critic(epochs=0)
        else:
            sac.soft_actor_critic(epochs=args.episodes,
                                  initial_state=initial_state,
                                  start_steps=args.start_steps)
        policies.append(sac)

        p, _ = sac.test_agent(T, normalization_factors=normalization_factors)
        distributions.append(p)
        weights = utils.get_weights(distributions)

        epoch = 'epoch_%02d' % (i)
        if args.render:
            if i < 10:
                sac.record(T=args.record_steps,
                           n=1,
                           video_dir=video_dir + '/baseline/' + epoch,
                           on_policy=False)
            sac.record(T=args.record_steps,
                       n=1,
                       video_dir=video_dir + '/entropy/' + epoch,
                       on_policy=True)

        # Execute the cumulative average policy thus far.
        # Estimate distribution and entropy.
        print("Executing mixed policy...")
        average_p, average_p_xy, initial_state, states_visited, states_visited_xy = \
            execute_average_policy(env, policies, T, weights,
                                   reward_fn=reward_fn, norm=normalization_factors,
                                   initial_state=initial_state, n=args.n,
                                   render=args.render, video_dir=video_dir+'/mixed/'+epoch, epoch=i,
                                   record_steps=args.record_steps)

        print("Calculating maxEnt entropy...")
        round_entropy = entropy(average_p.ravel())
        round_entropy_xy = entropy(average_p_xy.ravel())

        # Update running averages for maxEnt.
        print("Updating maxEnt running averages...")
        running_avg_ent = running_avg_ent * (
            i) / float(i + 1) + round_entropy / float(i + 1)
        running_avg_ent_xy = running_avg_ent_xy * (
            i) / float(i + 1) + round_entropy_xy / float(i + 1)
        running_avg_p *= (i) / float(i + 1)
        running_avg_p += average_p / float(i + 1)
        running_avg_p_xy *= (i) / float(i + 1)
        running_avg_p_xy += average_p_xy / float(i + 1)

        # update reward function
        print("Update reward function")
        eps = 1 / np.sqrt(ant_utils.total_state_space)
        if args.cumulative:
            reward_fn = grad_ent(running_avg_p)
        else:
            reward_fn = 1.
            average_p += eps
            reward_fn /= average_p
        average_p = None  # delete big array

        # (save for plotting)
        running_avg_entropies.append(running_avg_ent)
        running_avg_entropies_xy.append(running_avg_ent_xy)
        if i in indexes:
            running_avg_ps_xy.append(np.copy(running_avg_p_xy))
            avg_ps_xy.append(np.copy(average_p_xy))

        print("Collecting baseline experience....")
        p_baseline, p_baseline_xy, states_visited_baseline, states_visited_xy_baseline = sac.test_agent_random(
            T, normalization_factors=normalization_factors, n=args.n)

        plotting.states_visited_over_time(states_visited,
                                          states_visited_baseline, i)
        plotting.states_visited_over_time(states_visited_xy,
                                          states_visited_xy_baseline,
                                          i,
                                          ext='_xy')

        # save for cumulative plot.
        if i in states_visited_indexes:
            # average over a whole bunch of rollouts
            # slow: so only do this when needed.
            print("Averaging unique xy states visited....")
            states_visited_xy = compute_states_visited_xy(
                env,
                policies,
                norm=normalization_factors,
                T=T,
                n=args.n,
                N=args.avg_N)
            states_visited_xy_baseline = compute_states_visited_xy(
                env,
                policies,
                norm=normalization_factors,
                T=T,
                n=args.n,
                N=args.avg_N,
                initial_state=initial_state,
                baseline=True)
            states_visited_cumulative.append(states_visited_xy)
            states_visited_cumulative_baseline.append(
                states_visited_xy_baseline)

        print("Compute baseline entropy....")
        round_entropy_baseline = entropy(p_baseline.ravel())
        round_entropy_baseline_xy = entropy(p_baseline_xy.ravel())

        # Update baseline running averages.
        print("Updating baseline running averages...")
        running_avg_ent_baseline = running_avg_ent_baseline * (
            i) / float(i + 1) + round_entropy_baseline / float(i + 1)
        running_avg_ent_baseline_xy = running_avg_ent_baseline_xy * (
            i) / float(i + 1) + round_entropy_baseline_xy / float(i + 1)

        running_avg_p_baseline *= (i) / float(i + 1)
        running_avg_p_baseline += p_baseline / float(i + 1)
        running_avg_p_baseline_xy *= (i) / float(i + 1)
        running_avg_p_baseline_xy += p_baseline_xy / float(i + 1)

        p_baseline = None

        # (save for plotting)
        running_avg_entropies_baseline.append(running_avg_ent_baseline)
        running_avg_entropies_baseline_xy.append(running_avg_ent_baseline_xy)
        if i in indexes:
            running_avg_ps_baseline_xy.append(
                np.copy(running_avg_p_baseline_xy))
            avg_ps_baseline_xy.append(np.copy(p_baseline_xy))

        utils.log_statement(average_p_xy)
        utils.log_statement(p_baseline_xy)

        # Calculate percent of state space visited.
        pct = np.count_nonzero(running_avg_p) / float(running_avg_p.size)
        pct_visited.append(pct)
        pct_xy = np.count_nonzero(running_avg_p_xy) / float(
            running_avg_p_xy.size)
        pct_visited_xy.append(pct_xy)

        pct_baseline = np.count_nonzero(running_avg_p_baseline) / float(
            running_avg_p_baseline.size)
        pct_visited_baseline.append(pct_baseline)
        pct_xy_baseline = np.count_nonzero(running_avg_p_baseline_xy) / float(
            running_avg_p_baseline_xy.size)
        pct_visited_xy_baseline.append(pct_xy_baseline)

        # Print round summary.
        col_headers = ["", "baseline", "maxEnt"]
        col1 = [
            "round_entropy_xy", "running_avg_ent_xy", "round_entropy",
            "running_avg_ent", "% state space xy", "% total state space"
        ]
        col2 = [
            round_entropy_baseline_xy, running_avg_ent_baseline_xy,
            round_entropy_baseline, running_avg_ent_baseline, pct_xy_baseline,
            pct_baseline
        ]
        col3 = [
            round_entropy_xy, running_avg_ent_xy, round_entropy,
            running_avg_ent, pct_xy, pct
        ]
        table = tabulate(np.transpose([col1, col2, col3]),
                         col_headers,
                         tablefmt="fancy_grid",
                         floatfmt=".4f")
        utils.log_statement(table)

        # Plot from round.
        plotting.heatmap(running_avg_p_xy, average_p_xy, i)
        plotting.heatmap1(running_avg_p_baseline_xy, i)

        if i == states_visited_indexes[3]:
            plotting.states_visited_over_time_multi(
                states_visited_cumulative, states_visited_cumulative_baseline,
                states_visited_indexes)

    # save final expert weights to use with the trained oracles.
    weights_file = experiment_directory + '/policy_weights'
    np.save(weights_file, weights)

    # cumulative plots.
    plotting.running_average_entropy(running_avg_entropies,
                                     running_avg_entropies_baseline)
    plotting.running_average_entropy(running_avg_entropies_xy,
                                     running_avg_entropies_baseline_xy,
                                     ext='_xy')

    plotting.heatmap4(running_avg_ps_xy,
                      running_avg_ps_baseline_xy,
                      indexes,
                      ext="cumulative")
    plotting.heatmap4(avg_ps_xy, avg_ps_baseline_xy, indexes, ext="epoch")

    plotting.percent_state_space_reached(pct_visited,
                                         pct_visited_baseline,
                                         ext='_total')
    plotting.percent_state_space_reached(pct_visited_xy,
                                         pct_visited_xy_baseline,
                                         ext="_xy")

    return policies
Code example #29
        logger.dump_tabular()


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--hid', type=int, default=64)
    parser.add_argument('--l', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--cpu', type=int, default=4)
    parser.add_argument('--steps', type=int, default=4000)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--exp_name', type=str, default='ppo')
    args = parser.parse_args()

    mpi_fork(args.cpu)  # run parallel code with mpi

    from spinup.utils.run_utils import setup_logger_kwargs
    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)

    ppo(lambda: gym.make(args.env),
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(hidden_sizes=[args.hid] * args.l),
        gamma=args.gamma,
        seed=args.seed,
        steps_per_epoch=args.steps,
        epochs=args.epochs,
        logger_kwargs=logger_kwargs)
Code example #30
    parser.add_argument('--ensemble_size', type=int, default=20)
    parser.add_argument('--replay_buf_bootstrap_p', type=float, default=0.75)
    parser.add_argument('--hardcopy_target_nn', action="store_true", help='Target network update method: hard copy')
    parser.add_argument('--act_noise',type=float, default=0.1)
    parser.add_argument("--exploration-strategy", type=str, choices=["action_noise", "epsilon_greedy"],
                        default='epsilon_greedy', help='action_noise or epsilon_greedy')
    parser.add_argument("--epsilon-max", type=float, default=1.0, help='maximum of epsilon')
    parser.add_argument("--epsilon-min", type=float, default=.01, help='minimum of epsilon')
    parser.add_argument("--epsilon-decay", type=float, default=.001, help='epsilon decay')

    parser.add_argument("--data_dir", type=str, default=None)

    args = parser.parse_args()

    # Set log data saving directory
    from spinup.utils.run_utils import setup_logger_kwargs
    data_dir = osp.join(osp.dirname(osp.dirname(osp.dirname(osp.dirname(osp.dirname(osp.abspath(__file__)))))),
                        'spinup_data')
    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed, data_dir, datestamp=True)

    dbedpg(env_name=args.env, render_env=args.render_env,
           act_noise=args.act_noise,
           ac_kwargs=dict(hidden_sizes=[args.hid]*args.l),
           gamma=args.gamma, seed=args.seed,
           ensemble_size=args.ensemble_size,
           replay_buf_bootstrap_p=args.replay_buf_bootstrap_p,
           epochs=args.epochs,
           steps_per_epoch=args.steps_per_epoch, start_steps=args.start_steps,
           logger_kwargs=logger_kwargs)