Example #1
import time

from nes_py.wrappers import JoypadSpace  # assumed source of the JoypadSpace wrapper


def play_random_custom(env, steps):
    _NOP = 0

    actions = [['start'], ['NOOP'], ['right', 'A'], ['left', 'A'],
               ['left', 'B'], ['right', 'B'], ['up'], ['down'], ['A'], ['B']]

    env = JoypadSpace(env, actions)

    env.reset()

    action = 0
    start = time.time()
    # play_human
    for t in range(0, steps):
        # get the mapping of keyboard keys to actions in the environment
        if hasattr(env, 'get_keys_to_action'):
            keys_to_action = env.get_keys_to_action()
        elif hasattr(env.unwrapped, 'get_keys_to_action'):
            keys_to_action = env.unwrapped.get_keys_to_action()
        else:
            raise ValueError('env has no get_keys_to_action method')

        # change action every 6 frames
        if t % 6 == 0:
            action = env.action_space.sample()

            # after 500 timesteps, stop pressing start button
            if t > 500:
                while action == 0:
                    action = env.action_space.sample()

        observation, reward, done, info = env.step(action)
        # print("---------------------------t: ", t)
        # print("action space: ", action, env.action_space)
        # print("obs: ", observation)
        # print("reward: ", reward)
        # print("info: ", info)
        # runs game at about 60fps
        time.sleep(0.016667)
        env.render()

    end = time.time()
    env.close()
    print("time: ", (end - start), " seconds  for ", steps, "steps")
Example #2
import random
import time
from itertools import count

import numpy as np
import torch
import torch.optim as optim
from nes_py.wrappers import JoypadSpace  # assumed source of the JoypadSpace wrapper

# NOTE: this excerpt also relies on names defined elsewhere in the original module
# and not shown here: actions, SHOULD_TRAIN, SHOULD_LOAD_STATE, SHOULD_RENDER,
# DATA_PATH, GAMMA, device, reward_threshold, num_steps_per_episode, log_interval,
# steps_done, get_screen() and the Policy network.


def play_random_custom(env, steps):
    _NOP = 0

    env = JoypadSpace(env, actions)

    env.reset()

    action = 0
    start = time.time()

    if SHOULD_TRAIN:

        init_screen = get_screen()
        _, _, screen_height, screen_width = init_screen.shape

        # INIT Neural Network
        policy = Policy(screen_height, screen_width, len(actions))

        if SHOULD_LOAD_STATE:
            print("Loading model from: ", DATA_PATH)
            policy.load_state_dict(torch.load(DATA_PATH))

        optimizer = optim.Adam(policy.parameters(), lr=1e-2)
        eps = np.finfo(np.float32).eps.item()

        # Helper functions
        def select_action(state):
            global steps_done
            sample = random.random()
            eps_threshold = reward_threshold
            # eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            #     math.exp(-1. * steps_done / EPS_DECAY)
            steps_done += 1
            if sample > eps_threshold:
                with torch.no_grad():
                    # t.max(1) will return largest column value of each row.
                    # second column on max result is index of where max element was
                    # found, so we pick action with the larger expected reward.
                    return policy(state).max(1)[1].view(1, 1)
            else:
                return torch.tensor([[random.randrange(len(actions))]],
                                    device=device,
                                    dtype=torch.long)

        def finish_episode():
            R = 0
            policy_loss = []
            returns = []
            for r in policy.rewards[::-1]:
                R = r + GAMMA * R
                returns.insert(0, R)
            returns = torch.tensor(returns)
            returns = (returns - returns.mean()) / \
                (returns.std() + eps)
            for log_prob, R in zip(policy.saved_log_probs, returns):
                policy_loss.append(-log_prob * R)
            optimizer.zero_grad()
            print("POLICY LOSS: ", policy_loss)
            # policy_loss = torch.cat(policy_loss).sum()
            # policy_loss.backward()
            optimizer.step()
            torch.save(policy.state_dict(), DATA_PATH)
            del policy.rewards[:]
            del policy.saved_log_probs[:]

        running_reward = 10
        for i_episode in count(1):
            print("Episode: ", i_episode)
            state, ep_reward = env.reset(), 0
            # Don't infinite loop while learning
            for t in range(1, num_steps_per_episode):
                action = select_action(state).data.cpu().numpy()[0][0]
                # print("ACTION:", action)
                state, reward, done, info = env.step(action)
                if SHOULD_RENDER:
                    env.render()
                policy.rewards.append(reward)
                ep_reward += reward
                if done:
                    break

            running_reward = 0.05 * ep_reward + \
                (1 - 0.05) * running_reward
            finish_episode()
            if i_episode % log_interval == 0:
                print(
                    'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'
                    .format(i_episode, ep_reward, running_reward))
            print("Running reward: ", running_reward)
            if running_reward > reward_threshold:
                print("Solved! Running reward is now {} and "
                      "the last episode runs to {} time steps!".format(
                          running_reward, t))
                break
    else:
        # PLAY RANDOMLY
        for t in range(0, steps):
            # get the mapping of keyboard keys to actions in the environment
            if hasattr(env, 'get_keys_to_action'):
                keys_to_action = env.get_keys_to_action()
            elif hasattr(env.unwrapped, 'get_keys_to_action'):
                keys_to_action = env.unwrapped.get_keys_to_action()
            else:
                raise ValueError(
                    'env has no get_keys_to_action method')

            # change action every 6 frames
            if t % 6 == 0:
                action = env.action_space.sample()

                # after 500 timesteps, stop pressing start button
                if t > 500:
                    while action == 0:
                        action = env.action_space.sample()

            observation, reward, done, info = env.step(action)
            print("---------------------------t: ", t)
            print("action space: ", action, env.action_space)
            print("obs: ", observation.shape)
            print("reward: ", reward)
            print("info: ", info)
            # runs game at about 60fps
            time.sleep(0.016667)
            env.render()

    end = time.time()
    env.close()
    print("time: ", (end - start), " seconds  for ", steps, "steps")