Example #1
from stable_baselines import ACER


def test_action_mask_run_acer(vec_env, policy, env_class):
    # vec_env, policy and env_class are provided as test fixtures/parameters
    env = vec_env([env_class])

    model = ACER(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        # Predict using the action masks reported by the envs on the previous step
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        # Collect the updated action mask from each sub-environment's info dict
        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
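
The test above assumes each environment reports a valid-action mask through the info dict returned by step(). A minimal sketch of such an environment (the class name, spaces, and masking rule here are illustrative assumptions, not part of the test):

import numpy as np
import gym
from gym import spaces


class MaskedDiscreteEnv(gym.Env):
    """Toy env that reports which discrete actions are currently valid."""

    def __init__(self, n_actions=4, max_steps=10):
        self.action_space = spaces.Discrete(n_actions)
        self.observation_space = spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
        self.max_steps = max_steps
        self.steps = 0

    def reset(self):
        self.steps = 0
        return np.zeros(1, dtype=np.float32)

    def step(self, action):
        self.steps += 1
        done = self.steps >= self.max_steps
        # Illustrative rule: forbid the last action on every other step
        mask = np.ones(self.action_space.n, dtype=bool)
        if self.steps % 2 == 0:
            mask[-1] = False
        obs = np.zeros(1, dtype=np.float32)
        return obs, 0.0, done, {'action_mask': mask}
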
Example #2

from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines import ACER

# There already exists an environment generator
# that will make and wrap atari environments correctly.
# Here we are also multiprocessing training (num_env=4 => 4 processes)
env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0)
# Frame-stacking with 4 frames
env = VecFrameStack(env, n_stack=4)

model = ACER('CnnPolicy', env, verbose=1)
model.learn(total_timesteps=25000)

# save
model.save("cnn_pong")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
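
The snippet saves the model but never reloads it; a minimal sketch of loading the saved weights back for evaluation (the file name is taken from the save call above; recreating the wrapped env is assumed):

from stable_baselines import ACER
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack

# Rebuild the same preprocessing pipeline before loading the weights
env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0)
env = VecFrameStack(env, n_stack=4)

# ACER.load restores the policy saved via model.save("cnn_pong")
model = ACER.load("cnn_pong", env=env)

obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()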
Example #3
from stable_baselines.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model,
                                          env,
                                          n_eval_episodes=20,
                                          deterministic=False)
print('mean_reward = %s +/- %s' % (mean_reward, std_reward))

# Enjoy trained agent
env.rendermode = 'on'  # presumably a custom flag on this env that turns rendering on
obs = env.reset()
cumreward = 0
results = []
minable = []

while True:
    #for i in range(turns):
    action, _states = model.predict(obs, deterministic=False)
    obs, rewards, dones, info = env.step(action)
    cumreward += rewards
    print(action, rewards, dones, cumreward)
    results.append(info[0])
    a = abs(info[1] - 1)  # translate sequence errors to be positive, else zero
    minable.append(a)

    # if info[1]==1:
    #     results.append(info[0])
    #     a=abs(info[1]-1) #translating sequence errors to be positive, else zero
    #     minable.append(a)
    #env.renderif('on')
    if dones:
        break
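
Once the episode ends, the collected lists can be summarized; a short sketch (numpy and the meaning of these aggregates are assumptions layered on top of the original loop):

import numpy as np

# Hypothetical post-episode summary of the values collected above
print('episode return:', cumreward)
print('mean info[0] value:', np.mean(results))
print('mean translated sequence error:', np.mean(minable))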