Example #1
def evaluate(step,
             policy_net,
             device,
             env,
             n_actions,
             eps=0.05,
             num_episode=5):
    env = wrap_deepmind(env, clip_rewards=True)
    sa = m.ActionSelector(eps, eps, policy_net, EPS_DECAY, n_actions, device)
    e_rewards = []
    q = deque(maxlen=5)
    for i in range(num_episode):
        env.reset()
        e_reward = 0
        for _ in range(10):  # no-op
            n_frame, _, done, _ = env.step(0)
            n_frame = m.fp(n_frame)
            q.append(n_frame)

        while not done:
            state = torch.cat(list(q))[1:].unsqueeze(0)
            # print(state.shape)
            action, eps = sa.select_action(state, False)  # evaluation: keep the fixed eps, no training-time decay
            n_frame, reward, done, info = env.step(action)
            n_frame = m.fp(n_frame)
            q.append(n_frame)

            e_reward += reward
        e_rewards.append(e_reward)

    f = open("file.txt", 'a')
    f.write("%f, %d, %d\n" %
            (float(sum(e_rewards)) / float(num_episode), step, num_episode))
    f.close()
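The state fed to the network is built from a five-frame deque: concatenating the buffer and dropping the oldest entry with [1:] leaves the four most recent frames, and unsqueeze(0) adds the batch dimension. A standalone sketch with dummy 84x84 frames (the frame shape is an assumption based on the wrap_deepmind defaults):

import torch
from collections import deque

q = deque(maxlen=5)
for _ in range(5):
    q.append(torch.zeros(1, 84, 84))           # one preprocessed frame, as produced by m.fp

state = torch.cat(list(q))[1:].unsqueeze(0)    # drop the oldest frame, add a batch dimension
print(state.shape)                             # torch.Size([1, 4, 84, 84])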
def demo(num_episode=1):
    eps = 0.01
    env_raw = make_atari(args.env_name)
    env = wrap_deepmind(env_raw)
    c, h, w = m.fp(env.reset()).shape
    n_actions = env.action_space.n
    policy_net = m.DQN(h, w, n_actions, device).to(device)
    if device == "cuda":
        policy_net.load_state_dict(
            torch.load("models/" +
                       args.env_name.replace("NoFrameskip-v4", "") +
                       "_best.pth"))
    else:
        policy_net.load_state_dict(torch.load("models/"+args.env_name.replace("NoFrameskip-v4","")+\
            "_best.pth", map_location=torch.device('cpu')))
    policy_net.eval()
    sa = m.ActionSelector(eps, eps, policy_net, 100, n_actions, device)
    q = deque(maxlen=5)
    e_rewards = []
    for eee in range(num_episode):
        print("Demo episode %d/%d" % (eee + 1, num_episode) + "...")
        env.reset()
        e_reward = 0
        for _ in range(5):  # no-op
            n_frame, _, done, _ = env.step(0)
            n_frame = m.fp(n_frame)
            q.append(n_frame)

        while not done:
            if num_episode <= 1:
                env.render()
                time.sleep(0.02)
            state = torch.cat(list(q))[1:].unsqueeze(0)
            action, eps = sa.select_action(state, False)
            n_frame, reward, done, _ = env.step(action)
            n_frame = m.fp(n_frame)
            q.append(n_frame)
            e_reward += reward

        e_rewards.append(e_reward)
    avg_reward = float(sum(e_rewards)) / float(num_episode)
    env.close()
    print("Average reward of " + args.env_name + " is %.1f" % (avg_reward))
    print("Average std of " + args.env_name + " is %.1f" % (np.std(e_rewards)))
Example #3
def evaluate(step,
             policy_net,
             device,
             env,
             n_actions,
             eps=0.01,
             num_episode=5):
    global best_reward
    if not os.path.exists("models"):
        os.makedirs("models")
    env = wrap_deepmind(env)
    sa = m.ActionSelector(eps, eps, policy_net, EPS_DECAY, n_actions, device)
    e_rewards = []
    q = deque(maxlen=5)
    for _ in range(num_episode):
        env.reset()
        e_reward = 0
        for _ in range(5):  # no-op
            n_frame, _, done, _ = env.step(0)
            n_frame = m.fp(n_frame)
            q.append(n_frame)

        while not done:
            state = torch.cat(list(q))[1:].unsqueeze(0)  # why only the last 4 frames? the deque holds 5, and [1:] keeps the 4 most recent as the state
            action, eps = sa.select_action(state, False)  # evaluation: fixed eps, no training-time decay
            n_frame, reward, done, _ = env.step(action)
            n_frame = m.fp(n_frame)
            q.append(n_frame)

            e_reward += reward
        e_rewards.append(e_reward)

    avg_reward = float(sum(e_rewards)) / float(num_episode)
    std = np.array(e_rewards).std()
    print("The average reward is: %.5f" % (avg_reward, ))
    if avg_reward > best_reward:
        print("Best reward, save model to disk!!!")
        torch.save(policy_net.state_dict(),
                   "models/" + env_name + "_" + str(int(avg_reward)) + ".pth")
        best_reward = avg_reward
    with open(env_name + ".csv", 'a') as f:
        f.write("%f, %f, %d, %d\n" % (avg_reward, std, step, num_episode))
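A hedged usage sketch of how this evaluate might be driven from the training loop; the step budget, evaluation interval, and the raw environment handle are assumptions about the surrounding script, not part of the snippet:

NUM_STEPS = 20_000_000        # assumed training budget
EVAL_INTERVAL = 50_000        # assumed evaluation frequency

for step in range(NUM_STEPS):
    # ... one environment step and one optimization step would go here ...
    if step % EVAL_INTERVAL == 0:
        evaluate(step, policy_net, device, env_raw, n_actions,
                 eps=0.01, num_episode=5)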
Example #4
# 2. Seed and best value
torch.manual_seed(114514)
best_reward = 0.0

# 3. environment reset
env_name = (args.env_id.replace("-ramNoFrameskip-v4", "")
            if "-ramNoFrameskip-v4" in args.env_id else
            args.env_id.replace("NoFrameskip-v4", ""))  # strip the longer RAM suffix first
env_raw = make_atari(args.env_id)
env = wrap_deepmind(env_raw,
                    frame_stack=False,
                    episode_life=True,
                    clip_rewards=True)

c, h, w = m.fp(env.reset()).shape
n_actions = env.action_space.n

# 4. Network reset
policy_net = m.DQN(h, w, n_actions, device).to(device)
target_net = m.DQN(h, w, n_actions, device).to(device)
policy_net.apply(
    policy_net.init_weights
)  # apply() runs init_weights on every submodule, so the init code still works if the model structure changes
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# 5. DQN hyperparameters
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START = 1.
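With policy_net and target_net in place, one optimization step follows the standard DQN update: the online policy_net scores Q(s, a) for the sampled actions while the frozen target_net supplies max_a' Q(s', a') for the bootstrap target. A hedged sketch of such a step; the replay-buffer batch and the optimizer are assumptions, since they are not shown in this snippet:

import torch
import torch.nn.functional as F

def optimize_step(policy_net, target_net, optimizer, batch, gamma=GAMMA):
    # batch is assumed to hold tensors already on the right device:
    # states (B,4,84,84), actions (B,1) long, rewards (B,1), next_states (B,4,84,84), dones (B,1)
    states, actions, rewards, next_states, dones = batch

    q_values = policy_net(states).gather(1, actions)                 # Q(s, a) from the online net
    with torch.no_grad():
        next_q = target_net(next_states).max(1, keepdim=True)[0]     # max_a' Q(s', a') from the target net
        target = rewards + gamma * next_q * (1.0 - dones)            # no bootstrap past terminal states

    loss = F.smooth_l1_loss(q_values, target)                        # Huber loss, as in the DQN paper
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()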
Example #5
device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu")  # if gpu is to be used

# 3. environment reset
# env_name = 'Breakout'
env_name = 'SpaceInvaders'
# env_name = 'Riverraid'
# env_name = 'Seaquest'
# env_name = 'MontezumaRevenge'
env_raw = make_atari('{}NoFrameskip-v4'.format(env_name))
env = wrap_deepmind(env_raw,
                    frame_stack=False,
                    episode_life=True,
                    clip_rewards=True)

c, h, w = m.fp(env.reset()).shape
n_actions = env.action_space.n
print(n_actions)
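m.fp is used throughout as the frame preprocessor that turns one observation from wrap_deepmind into a single-channel torch tensor, so frames can later be stacked along dim 0. A hedged sketch of what such a helper might look like (the real function in module m may differ, e.g. in dtype or normalization):

import numpy as np
import torch

def fp(n_frame):
    n_frame = np.array(n_frame)                 # LazyFrames -> ndarray, shape (84, 84, 1)
    n_frame = torch.from_numpy(n_frame)
    return n_frame.permute(2, 0, 1)             # channel-first single frame: (1, 84, 84)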

# 4. Network reset
policy_net = m.DQN(h, w, n_actions, device).to(device)
target_net = m.DQN(h, w, n_actions, device).to(device)
policy_net.apply(policy_net.init_weights)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# 5. DQN hyperparameters
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START = 1.
EPS_END = 0.1
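EPS_START and EPS_END bound the exploration schedule that the ActionSelector anneals over EPS_DECAY steps. A hedged sketch of a linear schedule consistent with these constants; the actual rule lives in m.ActionSelector and may differ, and the EPS_DECAY value here is only an assumed placeholder:

EPS_DECAY = 1_000_000          # assumed number of annealing steps

def epsilon_at(step):
    # linear anneal from EPS_START to EPS_END, then hold at EPS_END
    frac = min(step / EPS_DECAY, 1.0)
    return EPS_START + frac * (EPS_END - EPS_START)

print(epsilon_at(0))           # 1.0
print(epsilon_at(500_000))     # 0.55
print(epsilon_at(2_000_000))   # 0.1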