Example #1
    def test_run(self):
        tf.reset_default_graph()
        env = gym.make('Pendulum-v0')
        env = hrl.envs.C2DEnvWrapper(env, [5])
        env = hrl.envs.ScaledRewards(env, 0.1)
        state_shape = list(env.observation_space.shape)
        global_step = tf.get_variable('global_step', [],
                                      dtype=tf.int32,
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        def f_q(inputs):
            q = hrl.network.Utils.layer_fcs(inputs[0], [200, 100],
                                            env.action_space.n,
                                            l2=1e-4)
            return {"q": q}

        agent = hrl.DQN(
            f_create_q=f_q,
            state_shape=state_shape,
            # OneStepTD arguments
            num_actions=env.action_space.n,
            discount_factor=0.99,
            ddqn=False,
            # target network sync arguments
            target_sync_interval=100,
            target_sync_rate=1.0,
            # sampler arguments
            update_interval=4,
            replay_size=1000,
            batch_size=32,
            # epsilon greedy arguments
            greedy_epsilon=hrl.utils.CappedLinear(1e5, 0.5, 0.1),
            global_step=global_step,
            network_optimizer=hrl.network.LocalOptimizer(
                tf.train.AdamOptimizer(1e-3), grad_clip=10.0))
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with agent.create_session(config=config) as sess:
            runner = hrl.envs.EnvRunner(env,
                                        agent,
                                        evaluate_interval=sys.maxsize,
                                        render_interval=sys.maxsize,
                                        logdir=None)
            runner.episode(50)
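
In this example the exploration rate is not a constant but hrl.utils.CappedLinear(1e5, 0.5, 0.1), whose arguments suggest a schedule that anneals epsilon linearly from 0.5 down to 0.1 over the first 100k steps and then holds it at 0.1. The stand-alone sketch below illustrates that assumed behaviour; the class name and implementation are illustrative only, not hobotrl's actual CappedLinear.

# Illustrative sketch only: assumes CappedLinear(n, start, end) interpolates
# linearly from start to end over n steps, then stays capped at end.
class CappedLinearSketch(object):
    def __init__(self, n, start, end):
        self.n = float(n)
        self.start = start
        self.end = end

    def __call__(self, step):
        # clamp progress to [0, 1] so the value never overshoots the end value
        progress = min(max(step / self.n, 0.0), 1.0)
        return self.start + (self.end - self.start) * progress

epsilon = CappedLinearSketch(1e5, 0.5, 0.1)
# epsilon(0) == 0.5, epsilon(5e4) == 0.3, epsilon(2e5) == 0.1
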
Example #2
def create_agent(n_optimizer, global_step):
    agent = hrl.DQN(
        f_create_q=f_q,
        state_shape=state_shape,
        # OneStepTD arguments
        num_actions=env.action_space.n,
        discount_factor=0.99,
        ddqn=False,
        # target network sync arguments
        target_sync_interval=100,
        target_sync_rate=1.0,
        # sampler arguments
        update_interval=4,
        replay_size=1000,
        batch_size=32,
        # epsilon greedy arguments
        greedy_epsilon=hrl.utils.CappedLinear(1e5, 0.5, 0.1),
        global_step=global_step,
        network_optimizer=n_optimizer)
    return agent
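
Example #2 only defines a factory, so a usage sketch helps. The call below is hypothetical and simply reuses the pieces that appear in Example #1 (a global_step variable, hrl.network.LocalOptimizer wrapping an Adam optimizer, agent.create_session and hrl.envs.EnvRunner); it assumes the same module-level definitions of env, f_q and state_shape as in that excerpt.

# Hypothetical usage of create_agent, mirroring Example #1.
global_step = tf.get_variable('global_step', [],
                              dtype=tf.int32,
                              initializer=tf.constant_initializer(0),
                              trainable=False)
optimizer = hrl.network.LocalOptimizer(tf.train.AdamOptimizer(1e-3),
                                       grad_clip=10.0)
agent = create_agent(optimizer, global_step)
with agent.create_session(config=tf.ConfigProto()) as sess:
    runner = hrl.envs.EnvRunner(env, agent,
                                evaluate_interval=sys.maxsize,
                                render_interval=sys.maxsize,
                                logdir=None)
    runner.episode(50)
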
Example #3

gamma = 0.9
_agent = hrl.DQN(
    f_create_q=f_net,
    state_shape=state_shape,
    # OneStepTD arguments
    num_actions=len(AGENT_ACTIONS),
    discount_factor=gamma,
    ddqn=True,
    # target network sync arguments
    target_sync_interval=1,
    target_sync_rate=target_sync_rate,
    # epsilon greedy arguments
    # greedy_epsilon=0.025,
    # greedy_epsilon=0.05,
    # greedy_epsilon=0.075,
    # greedy_epsilon=0.2,  # 0.2 -> 0.15 -> 0.1
    greedy_epsilon=CappedLinear(10000, 0.15, 0.05),
    # greedy_epsilon=CappedLinear(10000, 0.1, 0.025),
    # optimizer arguments
    network_optimizer=hrl.network.LocalOptimizer(optimizer_td, 1.0),
    # sampler arguments
    sampler=TransitionSampler(replay_buffer,
                              batch_size=8,
                              interval=1,
                              minimum_count=103),
    # checkpoint
    global_step=global_step)


class Logger(object):
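
Unlike the first two examples (ddqn=False, a hard target copy every 100 steps with target_sync_rate=1.0), this snippet enables double DQN and syncs the target network every step with a fractional target_sync_rate, i.e. a soft update. A common reading of such a rate is the Polyak-style update sketched below; this is an assumption about what the parameter means, not hobotrl's internal op.

# Soft target sync sketch (TF1): target <- (1 - rate) * target + rate * online.
# With sync_rate == 1.0 this degenerates to a hard copy of the online weights.
def make_target_sync_op(online_vars, target_vars, sync_rate):
    return tf.group(*[
        t.assign((1.0 - sync_rate) * t + sync_rate * o)
        for o, t in zip(online_vars, target_vars)
    ])
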
Example #4
global_step = tf.get_variable('global_step', [],
                              dtype=tf.int32,
                              initializer=tf.constant_initializer(0),
                              trainable=False)

agent = hrl.DQN(
    f_create_q=f_net,
    state_shape=state_shape,
    # OneStepTD arguments
    num_actions=len(ACTIONS),
    discount_factor=0.9,
    ddqn=False,
    # target network sync arguments
    target_sync_interval=1,
    target_sync_rate=target_sync_rate,
    # epsilon greedy arguments
    greedy_epsilon=0.2,
    # optimizer arguments
    network_optimizer=hrl.network.LocalOptimizer(optimizer_td, 10.0),
    # max_gradient=10.0,
    # sampler arguments
    sampler=TransitionSampler(BalancedMapPlayback(num_actions=len(ACTIONS),
                                                  capacity=15000),
                              batch_size=8,
                              interval=1),
    # checkpoint
    global_step=global_step)
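
The sampler here draws from a BalancedMapPlayback buffer; the name suggests a replay memory that keeps experience balanced across the discrete actions, so that rarely chosen actions are not crowded out of the 15000-transition capacity. The buffer below is a purely hypothetical illustration of that idea, not hobotrl's class.

import random
from collections import deque

class ActionBalancedReplaySketch(object):
    def __init__(self, num_actions, capacity):
        # one bounded queue per action, splitting the total capacity evenly
        per_action = max(1, capacity // num_actions)
        self.queues = [deque(maxlen=per_action) for _ in range(num_actions)]

    def push(self, transition, action):
        self.queues[action].append(transition)

    def sample(self, batch_size):
        # pick a random non-empty per-action queue, then a random transition,
        # so each represented action is sampled with roughly equal probability
        non_empty = [q for q in self.queues if q]
        return [random.choice(random.choice(non_empty))
                for _ in range(batch_size)]
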


def log_info(update_info):
    global action_fraction
    global action_td_loss