def test_run(self):
    tf.reset_default_graph()
    env = gym.make('Pendulum-v0')
    env = hrl.envs.C2DEnvWrapper(env, [5])
    env = hrl.envs.ScaledRewards(env, 0.1)
    state_shape = list(env.observation_space.shape)
    global_step = tf.get_variable(
        'global_step', [], dtype=tf.int32,
        initializer=tf.constant_initializer(0), trainable=False)

    def f_q(inputs):
        q = hrl.network.Utils.layer_fcs(inputs[0], [200, 100],
                                        env.action_space.n, l2=1e-4)
        return {"q": q}

    agent = hrl.DQN(
        f_create_q=f_q,
        state_shape=state_shape,
        # OneStepTD arguments
        num_actions=env.action_space.n,
        discount_factor=0.99,
        ddqn=False,
        # target network sync arguments
        target_sync_interval=100,
        target_sync_rate=1.0,
        # sampler arguments
        update_interval=4,
        replay_size=1000,
        batch_size=32,
        # epsilon greedy arguments
        greedy_epsilon=hrl.utils.CappedLinear(1e5, 0.5, 0.1),
        global_step=global_step,
        network_optimizer=hrl.network.LocalOptimizer(
            tf.train.AdamOptimizer(1e-3), grad_clip=10.0))
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with agent.create_session(config=config) as sess:
        runner = hrl.envs.EnvRunner(env, agent,
                                    evaluate_interval=sys.maxint,
                                    render_interval=sys.maxint,
                                    logdir=None)
        runner.episode(50)
def create_agent(n_optimizer, global_step):
    agent = hrl.DQN(
        f_create_q=f_q,
        state_shape=state_shape,
        # OneStepTD arguments
        num_actions=env.action_space.n,
        discount_factor=0.99,
        ddqn=False,
        # target network sync arguments
        target_sync_interval=100,
        target_sync_rate=1.0,
        # sampler arguments
        update_interval=4,
        replay_size=1000,
        batch_size=32,
        # epsilon greedy arguments
        greedy_epsilon=hrl.utils.CappedLinear(1e5, 0.5, 0.1),
        global_step=global_step,
        network_optimizer=n_optimizer)
    return agent
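# Usage sketch (an assumption, not part of the original source): create_agent
# takes only the network optimizer and the global_step variable, while env,
# state_shape and f_q are read from the enclosing scope as in test_run above.
# The function name run_with_local_optimizer is hypothetical; the
# LocalOptimizer / EnvRunner calls mirror the ones used in test_run.
def run_with_local_optimizer():
    optimizer = hrl.network.LocalOptimizer(tf.train.AdamOptimizer(1e-3),
                                           grad_clip=10.0)
    agent = create_agent(optimizer, global_step)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with agent.create_session(config=config) as sess:
        runner = hrl.envs.EnvRunner(env, agent,
                                    evaluate_interval=sys.maxint,
                                    render_interval=sys.maxint,
                                    logdir=None)
        runner.episode(10)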
    upsample_bias=(1, 1, 1, 0.1))
gamma = 0.9
_agent = hrl.DQN(
    f_create_q=f_net,
    state_shape=state_shape,
    # OneStepTD arguments
    num_actions=len(AGENT_ACTIONS),
    discount_factor=gamma,
    ddqn=True,
    # target network sync arguments
    target_sync_interval=1,
    target_sync_rate=target_sync_rate,
    # epsilon greedy arguments
    # greedy_epsilon=0.025,
    # greedy_epsilon=0.05,
    # greedy_epsilon=0.075,
    # greedy_epsilon=0.2,  # 0.2 -> 0.15 -> 0.1
    greedy_epsilon=CappedLinear(10000, 0.15, 0.05),
    # greedy_epsilon=CappedLinear(10000, 0.1, 0.025),
    # optimizer arguments
    network_optimizer=hrl.network.LocalOptimizer(optimizer_td, 1.0),
    # sampler arguments
    sampler=TransitionSampler(replay_buffer, batch_size=8, interval=1,
                              minimum_count=103),
    # checkpoint
    global_step=global_step)


class Logger(object):
global_step = tf.get_variable(
    'global_step', [], dtype=tf.int32,
    initializer=tf.constant_initializer(0), trainable=False)

agent = hrl.DQN(
    f_create_q=f_net,
    state_shape=state_shape,
    # OneStepTD arguments
    num_actions=len(ACTIONS),
    discount_factor=0.9,
    ddqn=False,
    # target network sync arguments
    target_sync_interval=1,
    target_sync_rate=target_sync_rate,
    # epsilon greedy arguments
    greedy_epsilon=0.2,
    # optimizer arguments
    network_optimizer=hrl.network.LocalOptimizer(optimizer_td, 10.0),
    # max_gradient=10.0,
    # sampler arguments
    sampler=TransitionSampler(
        BalancedMapPlayback(num_actions=len(ACTIONS), capacity=15000),
        batch_size=8, interval=1),
    # checkpoint
    global_step=global_step)


def log_info(update_info):
    global action_fraction
    global action_td_loss