def test_frozenlake_value_debug():
    for i in range(1):
        ll_runs = 1
        steps = 20000
        ep_s = ExponentialDecay(steps // 10, 0.5, 0.05, steps)
        device = 'cuda'
        actions = 2
        env = gym.make('SimpleGrid-v3', n=ll_runs, device=device, map_string=frozen_lake, max_steps=40)

        critic = DiscreteVTable((env.height, env.width)).to(device)
        critic.weights.data = v_table
        policy = VPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=0.0).to(device)

        batch_size = 16 * ll_runs
        join = OneObsToState()

        done = torch.tensor([0], dtype=torch.uint8)
        state = env.reset()
        env.render()
        print("")

        while not done.all():
            lookahead_state, lookahead_reward, lookahead_done, info = env.lookahead()
            action_dist = policy(join(lookahead_state), lookahead_reward, lookahead_done)
            action = action_dist.sample()
            n, reward, done, reset, info = env.step(action)
            env.render()
            print("")
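# Sketch only: the debug rollout above relies on env.lookahead() returning the
# successor state, reward and done flag for every action, and on VPolicy (with
# epsilon=0.0) picking the action whose one-step return is largest. The helper
# below illustrates that idea; it is an assumption about the mechanism, not the
# project's VPolicy implementation, and the names are hypothetical.
def _greedy_action_from_lookahead(critic, lookahead_state, lookahead_reward, lookahead_done, discount=1.0):
    with torch.no_grad():
        # value of each candidate successor, zeroed where the episode would end
        next_value = critic(lookahead_state) * (1.0 - lookahead_done.float())
        lookahead_return = lookahead_reward + discount * next_value
    # choose the action with the highest one-step lookahead return
    return lookahead_return.argmax(dim=-1)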
def test_frozenlake_window_sizes():
    for _ in range(3):
        ll_runs = 600
        steps = 30000
        ep_s = ExponentialDecay(steps // 10, 0.05, steps)
        replay_window = ll_runs * steps // 20

        # sweep batch size while holding the replay window fixed
        for multiplier in (8, 16, 32, 64):
            batch_size = multiplier * ll_runs
            run_deep_q_on(frozen_lake, ll_runs=ll_runs, eps_sched=ep_s, replay_window=replay_window,
                          batch_size=batch_size, workers=1, steps=steps, logging_freq=100,
                          run_id=f'frozenlake_bt_{batch_size}', warmup=1000)
def test_F3_oh_value():
    for i in range(3):
        ll_runs = 1
        steps = 10000
        ep_s = ExponentialDecay(steps / 16, 0.5, 0.05, steps)
        lr_s = ConstantSched(0.05)
        device = 'cuda'
        actions = 3
        obs_shape = (1, )
        batch_size = 32

        env = gym.make('F3-v0')
        #env = RewardPerStep(env, reward_per_step=-0.01)
        env = TimeLimit(env, max_episode_steps=20)
        env = NormalizeFunctional(env, obs_f=normalize_obs, reward_f=normalize_reward)
        env = LookAhead(env)
        env = Reset(env)
        #env = Monitor(env)
        env = BatchTensor(env, device=device)

        #critic = FixupV(obs_shape, 4).to(device)
        critic = OneHotV(obs_shape, 12).to(device)
        policy = VPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=1.0).to(device)

        exp_buffer = ExpBuffer(max_timesteps=steps // 10, ll_runs=ll_runs, batch_size=batch_size,
                               observation_shape=obs_shape)
        stepper = td_value.Stepper(env, OneObsToState(), exp_buffer)

        run_on(stepper=stepper, learner=td_value.train_one_value, env=env, critic=critic, policy=policy,
               ll_runs=ll_runs, eps_sched=ep_s, actions=actions, exp_buffer=exp_buffer, batch_size=batch_size,
               discount=0.8, lr_sched=lr_s, rendermode='episodic', steps=steps, logging_freq=1,
               run_id=f'f3_value_{i}', warmup_steps=0)
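# Sketch only: td_value.train_one_value is assumed to perform a one-step TD(0)
# update on batches drawn from the ExpBuffer, i.e. regress V(s) towards
# r + discount * V(s'), with terminal states bootstrapping to zero. The helper
# below is illustrative, not the project's learner.
def _td0_value_step(critic, optimizer, state, reward, next_state, done, discount):
    with torch.no_grad():
        target = reward + discount * critic(next_state) * (1.0 - done.float())
    loss = torch.nn.functional.mse_loss(critic(state), target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()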
def test_cliffwalk_q_baseline():
    for i in range(3):
        ll_runs = 600
        steps = 5000
        ep_s = ExponentialDecay(steps // 10, 0.4, 0.02, steps)
        device = 'cuda'
        actions = 4
        env = gym.make('SimpleGrid-v3', n=ll_runs, device=device, map_string=cliff_walk, max_steps=40)

        critic = DiscreteQTable((env.height, env.width), actions).to(device)
        policy = QPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=1.0).to(device)

        batch_size = 16 * ll_runs
        exp_buffer = ExpBuffer(max_timesteps=steps // 10, ll_runs=ll_runs, batch_size=batch_size,
                               observation_shape=env.observation_space_shape)

        run_on(stepper=one_step, learner=train_one, env=env, critic=critic, policy=policy, ll_runs=ll_runs,
               eps_sched=ep_s, exp_buffer=exp_buffer, batch_size=batch_size, discount=0.8, steps=steps,
               logging_freq=100, run_id=f'cliffwalk_q_{i}', warmup_steps=10)
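# Sketch only: train_one is assumed to use the standard one-step Q-learning
# target, target = r + discount * max_a' Q(s', a'), with the bootstrap term
# masked out on terminal transitions. Illustrative helper, not the project's code.
def _q_learning_target(critic, reward, next_state, done, discount):
    with torch.no_grad():
        next_q = critic(next_state).max(dim=-1).values
    return reward + discount * next_q * (1.0 - done.float())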
def test_anthill_importance_sampled():
    for i in range(10):
        ll_runs = 600
        steps = 40000
        ep_s = ExponentialDecay(steps // 10, 0.05, steps)
        replay_window = ll_runs * steps // 10
        device = 'cuda'
        actions = 4
        env = gym.make('SimpleGrid-v2', n=ll_runs, device=device, map_string=anthill)

        critic = DiscreteQTable((env.height, env.width), actions).to(device)
        policy = QPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=1.0).to(device)

        batch_size = 16 * ll_runs
        exp_buffer = PrioritizedExpBuffer(replay_window, batch_size, True, *env.observation_space_shape)

        run_deep_q_on(env=env, critic=critic, policy=policy, ll_runs=ll_runs, eps_sched=ep_s,
                      exp_buffer=exp_buffer, batch_size=batch_size, workers=1, discount=0.8, steps=steps,
                      logging_freq=100, run_id=f'anthill_imp_smp_{i}', warmup=1000)
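# Sketch only: the PrioritizedExpBuffer above is built with importance sampling
# enabled (the third positional argument). The usual prioritized-replay correction
# weights each sampled transition by w_i = (N * P(i)) ** -beta, normalised by the
# largest weight, where P(i) comes from TD-error priorities. This helper shows the
# formula; it is an assumption, not the buffer's actual implementation.
def _per_importance_weights(priorities, beta=0.4, eps=1e-6):
    p = priorities + eps
    probs = p / p.sum()
    weights = (len(p) * probs) ** -beta
    return weights / weights.max()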
def test_puddlejump_baseline():
    for i in range(5):
        ll_runs = 600
        steps = 40000
        ep_s = ExponentialDecay(steps // 10, 0.05, steps)
        replay_window = ll_runs * steps // 10
        device = 'cuda'
        actions = 4
        env = gym.make('SimpleGrid-v3', n=ll_runs, device=device, map_string=puddle_jumping, max_steps=100)

        critic = DiscreteQTable((env.height, env.width), actions).to(device)
        policy = QPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=1.0).to(device)

        exp_buffer = ExpBuffer(replay_window, *env.observation_space_shape)
        batch_size = 16 * ll_runs

        run_deep_q_on(env=env, critic=critic, policy=policy, ll_runs=ll_runs, eps_sched=ep_s,
                      exp_buffer=exp_buffer, batch_size=batch_size, workers=1, discount=0.8, steps=steps,
                      logging_freq=100, run_id=f'puddle_baseline_{i}', warmup=1000)
def test_frozenlake_value_importance_sampled():
    for i in range(3):
        ll_runs = 600
        steps = 5000
        ep_s = ExponentialDecay(steps // 10, 0.4, 0.02, steps)
        device = 'cuda'
        actions = 2
        env = gym.make('SimpleGrid-v3', n=ll_runs, device=device, map_string=frozen_lake, max_steps=40)

        critic = DiscreteVTable((env.height, env.width)).to(device)
        policy = VPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=0.5).to(device)

        batch_size = 16 * ll_runs
        exp_buffer = PrioritizedExpBuffer(max_timesteps=steps // 10, ll_runs=ll_runs, batch_size=batch_size,
                                          observation_shape=env.observation_space_shape, importance_sample=True)

        run_on(stepper=one_step_value, learner=train_one_value, env=env, critic=critic, policy=policy,
               ll_runs=ll_runs, eps_sched=ep_s, exp_buffer=exp_buffer, batch_size=batch_size, discount=0.99,
               steps=steps, logging_freq=100, run_id=f'frozenlake_value_imps_{i}', warmup_steps=10, lr=0.05)
def test_frozenlake_deepq_grid_search():
    for _ in range(3):
        for discount in np.arange(0.84, 1.0, 0.04):
            ll_runs = 600
            batch_size = 16 * ll_runs
            steps = 15000
            ep_s = ExponentialDecay(half_life=steps // 7.0, scale=0.4, bias=0.02, steps=steps)
            lr_s = ConstantSched(0.05)
            device = 'cuda'
            actions = 4
            env = gym.make('SimpleGrid-v3', n=ll_runs, device=device, map_string=frozen_lake, max_steps=40,
                           reward_per_timestep=-0.01)

            critic = FixupQ((env.height, env.width), actions, 4).to(device)
            policy = QPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=1.0).to(device)

            exp_buffer = ExpBuffer(max_timesteps=steps // 8, ll_runs=ll_runs, batch_size=batch_size,
                                   observation_shape=env.observation_space_shape)

            run_on(stepper=one_step, learner=train_one, env=env, critic=critic, policy=policy,
                   ll_runs=ll_runs, eps_sched=ep_s, lr_sched=lr_s, exp_buffer=exp_buffer,
                   batch_size=batch_size, discount=discount, steps=steps, logging_freq=100,
                   run_id=f'frozenlake_deepq_discount_{discount}', warmup_steps=10)
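# Sketch only: ExponentialDecay is called above with half_life, scale, bias and
# steps. A plausible reading of that schedule is an epsilon that starts near
# scale + bias and decays towards bias with the given half-life:
#   eps(t) = bias + scale * 0.5 ** (t / half_life)
# This is an assumption about the schedule's shape, not the library's code.
def _exponential_decay_eps(t, half_life, scale, bias):
    return bias + scale * 0.5 ** (t / half_life)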
def test_F5_deep_q_proj():
    for i in range(10):
        ll_runs = 1
        steps = 1000
        ep_s = ExponentialDecay(steps / 15, 0.3, 0.05, steps)
        lr_s = ConstantSched(0.05)
        device = 'cuda'
        actions = 3
        obs_shape = (1, )
        batch_size = 16 * ll_runs

        env = gym.make('F5-v0')
        env = TimeLimit(env, max_episode_steps=50)
        env = NormalizeFunctional(env, obs_f=norm_f5, reward_f=normalize_reward)
        env = Reset(env)
        env = Monitor(env)
        env = BatchTensor(env, device=device)

        #critic = ProjFixupQ(obs_shape, actions, 20, 4).to(device)
        critic = EnsembleQ(obs_shape, actions, hidden=20, blocks=4).to(device)
        behaviour_policy = QPolicy(critic, actions, EpsilonGreedyProperDiscreteDist).to(device)
        greedy_policy = QPolicy(critic, actions, GreedyDist).to(device)

        exp_buffer = ExpBuffer(max_timesteps=steps // 10, ll_runs=ll_runs, batch_size=batch_size,
                               observation_shape=obs_shape)

        algo = Q(env, critic, behaviour_policy, greedy_policy, exp_buffer, device=device,
                 plot=FastPlot(actions))
        algo.run(run_id='base_line', steps=steps, batch_size=batch_size, discount_factor=0.95,
                 lr_sched=lr_s, eps_sched=ep_s, logging_freq=10)
def test_frozenlake_value_grid():
    for i in range(3):
        for step_penalty in np.arange(0.002, 0.1, 0.002):
            eps = 0.4
            steps = 8000
            ll_runs = 600
            ep_s = ExponentialDecay(steps // 5, eps, 0.02, steps)
            lr_s = ConstantSched(0.05)
            device = 'cuda'
            actions = 2
            env = gym.make('SimpleGrid-v3', n=ll_runs, device=device, map_string=frozen_lake, max_steps=40,
                           reward_per_timestep=-step_penalty)

            critic = FixupV((env.height, env.width), 4).to(device)
            policy = VPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=0.5).to(device)

            batch_size = 16 * ll_runs
            exp_buffer = ExpBuffer(max_timesteps=steps // 8, ll_runs=ll_runs, batch_size=batch_size,
                                   observation_shape=env.observation_space_shape)

            run_on(stepper=one_step_value, learner=train_one_value, env=env, critic=critic, policy=policy,
                   ll_runs=ll_runs, eps_sched=ep_s, exp_buffer=exp_buffer, batch_size=batch_size,
                   discount=0.99, steps=steps, logging_freq=100, run_id=f'frozenlake_step_{step_penalty}',
                   warmup_steps=10, lr_sched=lr_s)
def test_fake_lunar_lander():
    for i in range(2):
        ll_runs = 600
        steps = 20000
        ep_s = ExponentialDecay(steps // 10, 0.05, steps)
        replay_window = ll_runs * steps // 10
        device = 'cuda'
        actions = 5
        env = gym.make('GridLunarLander-v0', n=ll_runs, device=device)

        # flatten the multi-part observation space into a single table dimension
        length = sum([np.prod(shape) for shape in env.observation_space_shape])
        critic = DiscreteQTable((length,), actions).to(device)
        policy = QPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=1.0).to(device)

        exp_buffer = ExpBuffer(replay_window, *env.observation_space_shape)
        batch_size = 16 * ll_runs

        run_deep_q_on(env=env, critic=critic, policy=policy, ll_runs=ll_runs, eps_sched=ep_s,
                      exp_buffer=exp_buffer, batch_size=batch_size, workers=1, discount=0.8, steps=steps,
                      logging_freq=100, run_id=f'lunarlander_baseline_{i}', warmup=1000)
def test_lawn_deepq_baseline():
    for i in range(3):
        ll_runs = 1
        steps = 10000
        ep_s = ExponentialDecay(steps // 10, 0.4, 0.02, steps)
        lr_s = ConstantSched(0.05)
        device = 'cuda'
        actions = 4
        env = gym.make('SimpleGrid-v3', n=ll_runs, device=device, map_string=lawn, max_steps=40)

        critic = FixupQ((env.height, env.width), actions, 4).to(device)
        policy = QPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=1.0).to(device)

        batch_size = 16 * ll_runs
        exp_buffer = ExpBuffer(max_timesteps=steps // 10, ll_runs=ll_runs, batch_size=batch_size,
                               observation_shape=env.observation_space_shape)

        run_on(stepper=one_step, learner=train_one, env=env, critic=critic, policy=policy, ll_runs=ll_runs,
               eps_sched=ep_s, exp_buffer=exp_buffer, batch_size=batch_size, discount=0.8, lr_sched=lr_s,
               rendermode='parallel', steps=steps, logging_freq=100, run_id=f'lawn_deepq_{i}',
               warmup_steps=10)