def test_F3_oh_value():
    for i in range(3):
        ll_runs = 1
        steps = 10000
        # Epsilon decays exponentially from 0.5 to 0.05; the learning rate is fixed.
        ep_s = ExponentialDecay(steps / 16, 0.5, 0.05, steps)
        lr_s = ConstantSched(0.05)
        device = 'cuda'
        actions = 3
        obs_shape = (1,)
        batch_size = 32

        # Environment: F3 with a 20-step episode cap, normalized observations and
        # rewards, one-step lookahead for the value-based policy, and batched
        # tensors on the GPU.
        env = gym.make('F3-v0')
        # env = RewardPerStep(env, reward_per_step=-0.01)
        env = TimeLimit(env, max_episode_steps=20)
        env = NormalizeFunctional(env, obs_f=normalize_obs, reward_f=normalize_reward)
        env = LookAhead(env)
        env = Reset(env)
        # env = Monitor(env)
        env = BatchTensor(env, device=device)

        # One-hot state-value critic with an epsilon-greedy policy over it.
        # critic = FixupV(obs_shape, 4).to(device)
        critic = OneHotV(obs_shape, 12).to(device)
        policy = VPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=1.0).to(device)

        exp_buffer = ExpBuffer(max_timesteps=steps // 10, ll_runs=ll_runs,
                               batch_size=batch_size, observation_shape=obs_shape)
        stepper = td_value.Stepper(env, OneObsToState(), exp_buffer)

        run_on(stepper=stepper, learner=td_value.train_one_value, env=env,
               critic=critic, policy=policy, ll_runs=ll_runs, eps_sched=ep_s,
               actions=actions, exp_buffer=exp_buffer, batch_size=batch_size,
               discount=0.8, lr_sched=lr_s, rendermode='episodic', steps=steps,
               logging_freq=1, run_id=f'f3_value_{i}', warmup_steps=0)
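
# Illustration only, not the repo's implementation: a minimal sketch of the
# tabular TD(0) backup that td_value.train_one_value is assumed to apply to
# sampled transitions. The function name and the dict-based value table are
# hypothetical stand-ins.
def td0_update(V, s, r, s_next, done, lr=0.05, discount=0.8):
    """One TD(0) backup: V(s) <- V(s) + lr * (r + discount * V(s') - V(s))."""
    target = r + (0.0 if done else discount * V.get(s_next, 0.0))
    V[s] = V.get(s, 0.0) + lr * (target - V.get(s, 0.0))
    return V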
def test_F5_deep_q_proj():
    for i in range(10):
        ll_runs = 1
        steps = 1000
        # Epsilon decays exponentially from 0.3 to 0.05; the learning rate is fixed.
        ep_s = ExponentialDecay(steps / 15, 0.3, 0.05, steps)
        lr_s = ConstantSched(0.05)
        device = 'cuda'
        actions = 3
        obs_shape = (1,)
        batch_size = 16 * ll_runs

        # Environment: F5 with a 50-step episode cap, normalized observations
        # and rewards, monitoring, and batched tensors on the GPU.
        env = gym.make('F5-v0')
        env = TimeLimit(env, max_episode_steps=50)
        env = NormalizeFunctional(env, obs_f=norm_f5, reward_f=normalize_reward)
        env = Reset(env)
        env = Monitor(env)
        env = BatchTensor(env, device=device)

        # Ensemble Q critic; an epsilon-greedy behaviour policy for exploration
        # and a greedy policy for evaluation.
        # critic = ProjFixupQ(obs_shape, actions, 20, 4).to(device)
        critic = EnsembleQ(obs_shape, actions, hidden=20, blocks=4).to(device)
        behaviour_policy = QPolicy(critic, actions, EpsilonGreedyProperDiscreteDist).to(device)
        greedy_policy = QPolicy(critic, actions, GreedyDist).to(device)

        exp_buffer = ExpBuffer(max_timesteps=steps // 10, ll_runs=ll_runs,
                               batch_size=batch_size, observation_shape=obs_shape)

        algo = Q(env, critic, behaviour_policy, greedy_policy, exp_buffer,
                 device=device, plot=FastPlot(actions))
        algo.run(run_id=f'base_line_{i}', steps=steps, batch_size=batch_size,
                 discount_factor=0.95, lr_sched=lr_s, eps_sched=ep_s, logging_freq=10)
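
# Illustration only, not the repo's Q implementation: a minimal sketch of the
# one-step Q-learning regression that algo.run is assumed to perform on each
# sampled batch. The q_net, optimizer, and batch tensors are hypothetical
# stand-ins; only torch is required.
import torch
import torch.nn.functional as F

def q_learning_step(q_net, optimizer, s, a, r, s_next, done, discount=0.95):
    """Fit Q(s, a) toward r + discount * (1 - done) * max_a' Q(s', a')."""
    with torch.no_grad():
        target = r + discount * (1.0 - done) * q_net(s_next).max(dim=1).values
    q_sa = q_net(s).gather(1, a.unsqueeze(1)).squeeze(1)
    loss = F.mse_loss(q_sa, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()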