Example 1
def test_F3_oh_value():
    """Fit a one-hot state-value critic on F3-v0 with TD value learning,
    acting through an epsilon-greedy policy over the learned values."""

    for i in range(3):
        ll_runs = 1
        steps = 10000
        ep_s = ExponentialDecay(steps / 16, 0.5, 0.05, steps)
        lr_s = ConstantSched(0.05)
        device = 'cuda'
        actions = 3
        obs_shape = (1, )
        batch_size = 32

        # F3 environment with an episode cap, normalised observations and
        # rewards, and batched GPU tensors.
        env = gym.make('F3-v0')
        #env = RewardPerStep(env, reward_per_step=-0.01)
        env = TimeLimit(env, max_episode_steps=20)
        env = NormalizeFunctional(env,
                                  obs_f=normalize_obs,
                                  reward_f=normalize_reward)
        env = LookAhead(env)
        env = Reset(env)
        #env = Monitor(env)
        env = BatchTensor(env, device=device)

        #critic = FixupV(obs_shape, 4).to(device)
        # One-hot tabular state-value critic and an epsilon-greedy policy over it.
        critic = OneHotV(obs_shape, 12).to(device)
        policy = VPolicy(critic,
                         actions,
                         EpsilonGreedyProperDiscreteDist,
                         epsilon=1.0).to(device)

        exp_buffer = ExpBuffer(max_timesteps=steps // 10,
                               ll_runs=ll_runs,
                               batch_size=batch_size,
                               observation_shape=obs_shape)

        # The stepper feeds transitions into the replay buffer; run_on drives
        # the TD value-learning loop.
        stepper = td_value.Stepper(env, OneObsToState(), exp_buffer)

        run_on(stepper=stepper,
               learner=td_value.train_one_value,
               env=env,
               critic=critic,
               policy=policy,
               ll_runs=ll_runs,
               eps_sched=ep_s,
               actions=actions,
               exp_buffer=exp_buffer,
               batch_size=batch_size,
               discount=0.8,
               lr_sched=lr_s,
               rendermode='episodic',
               steps=steps,
               logging_freq=1,
               run_id=f'f3_value_{i}',
               warmup_steps=0)
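
The core update behind a value-learning test like this is the one-step TD(0) backup: move V(s) toward r + gamma * V(s'). A minimal standalone sketch of that backup and of epsilon-greedy action selection (illustrative only, not the project's td_value implementation; all names are hypothetical):

import numpy as np

def td0_update(v, s, r, s_next, done, lr=0.05, discount=0.8):
    """One tabular TD(0) backup: move V(s) toward r + discount * V(s')."""
    target = r + (0.0 if done else discount * v[s_next])
    v[s] += lr * (target - v[s])
    return v

def epsilon_greedy(values_per_action, epsilon, rng=np.random):
    """Pick argmax with probability 1 - epsilon, otherwise a uniform random action."""
    if rng.random() < epsilon:
        return rng.randint(len(values_per_action))
    return int(np.argmax(values_per_action))

The ExponentialDecay schedule in the test presumably anneals epsilon over the run (the 0.5 and 0.05 arguments look like a start value and a floor), so exploration fades as the value estimates improve.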
Example 2
def test_F5_deep_q_proj():
    """Run deep Q-learning on F5-v0 with an ensemble Q critic, an
    epsilon-greedy behaviour policy, and a greedy evaluation policy."""
    for i in range(10):
        ll_runs = 1
        steps = 1000
        ep_s = ExponentialDecay(steps / 15, 0.3, 0.05, steps)
        lr_s = ConstantSched(0.05)
        device = 'cuda'
        actions = 3
        obs_shape = (1, )
        batch_size = 16 * ll_runs

        # F5 environment with an episode cap, normalised observations and
        # rewards, monitoring, and batched GPU tensors.
        env = gym.make('F5-v0')
        env = TimeLimit(env, max_episode_steps=50)
        env = NormalizeFunctional(env,
                                  obs_f=norm_f5,
                                  reward_f=normalize_reward)
        env = Reset(env)
        env = Monitor(env)
        env = BatchTensor(env, device=device)

        #critic = ProjFixupQ(obs_shape, actions, 20, 4).to(device)
        # Ensemble Q critic; epsilon-greedy behaviour policy for exploration,
        # greedy policy for evaluation.
        critic = EnsembleQ(obs_shape, actions, hidden=20, blocks=4).to(device)
        behaviour_policy = QPolicy(critic, actions,
                                   EpsilonGreedyProperDiscreteDist).to(device)
        greedy_policy = QPolicy(critic, actions, GreedyDist).to(device)
        exp_buffer = ExpBuffer(max_timesteps=steps // 10,
                               ll_runs=ll_runs,
                               batch_size=batch_size,
                               observation_shape=obs_shape)
        algo = Q(env,
                 critic,
                 behaviour_policy,
                 greedy_policy,
                 exp_buffer,
                 device=device,
                 plot=FastPlot(actions))
        algo.run(run_id='base_line',
                 steps=steps,
                 batch_size=batch_size,
                 discount_factor=0.95,
                 lr_sched=lr_s,
                 eps_sched=ep_s,
                 logging_freq=10)
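
Under the hood, a deep Q run like this regresses the critic toward the bootstrapped one-step target r + gamma * max_a' Q(s', a'), while the epsilon-greedy behaviour policy keeps exploring. A minimal sketch of that loss (illustrative only, not the project's Q class; names are hypothetical):

import torch
import torch.nn.functional as F

def q_learning_loss(critic, s, a, r, s_next, done, discount=0.95):
    """One-step Q-learning loss on a batch of transitions.

    critic(s) is assumed to return a (batch, actions) tensor of Q-values;
    a is a LongTensor of action indices, done a bool tensor of terminal flags.
    """
    q_sa = critic(s).gather(1, a.unsqueeze(1)).squeeze(1)      # Q(s, a) actually taken
    with torch.no_grad():                                       # no gradient through the target
        q_next = critic(s_next).max(dim=1).values               # max_a' Q(s', a')
        target = r + discount * q_next * (1.0 - done.float())   # zero bootstrap at terminal states
    return F.mse_loss(q_sa, target)

With an ensemble critic such as EnsembleQ, the same target can be computed per ensemble member or from a reduction over members; the sketch above uses a single head for brevity.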