Example no. 1

import gym
from collections import deque

# Assuming DQNAgent comes from the same local agents module used in Example no. 2.
from agents import DQNAgent

def main(num_episodes, render=False):
    # initialize gym environment and the agent
    # env = gym.make('SpaceInvaders-v0')
    env = gym.make('Breakout-v0')
    state = env.reset()
    state_shape = list(state.shape)
    state_shape[-1] = state_shape[-1] * 5
    agent = DQNAgent(state_shape, env.action_space.n)

    states = deque(maxlen=5)

    max_train_time = 800

    # Iterate the game
    for e in range(num_episodes):
        # reset state in the beginning of each game
        state = env.reset()
        for i in range(5):
            states.appendleft(state)
        # time_t represents each frame of the game
        num_random = 0
        total_reward = 0.
        for time_t in range(max_train_time):
            # turn this on if you want to render
            if render:
                env.render()
            # Decide action
            action = agent.act(states)
            if agent.acted_randomly:
                num_random += 1
            # Advance the game to the next frame based on the action.
            next_state, reward, done, info = env.step(action)
            total_reward += reward
            # Remember the previous state, action, reward, and done
            agent.remember(states.copy(), action, reward, next_state, done)
            # make next_state the new current state for the next frame.
            states.appendleft(next_state)
            # done becomes True when the game ends
            if done:
                # print the score and break out of the loop
                rand_perc = num_random / float(
                    time_t + 1) * 100.  # Percentage of random actions.
                print(
                    "episode: {}/{}, training_time: {}, summed_reward: {}, random_actions: {}%, eps: {}"
                    .format(e, num_episodes, time_t, total_reward, rand_perc,
                            agent.epsilon))
                # train the agent with the experience of the episode
                agent.replay(min(100, time_t))
                break
        # print("epsilon {}".format(agent.epsilon))
        if e % 1000 == 0:
            agent.save("./deep_q_model.h5")
            print("saved model")
Example no. 2

import gym
import numpy as np

from agents import QAgent, Agent, RandomAgent, DQNAgent

env = gym.make('LunarLander-v2')

num_episodes = 5000
print_every = 100  # logging interval; left undefined (as `print_evry`) in the original snippet

# LunarLander-v2 has a continuous (Box) observation space, so use its shape, not .n
agent = DQNAgent(env.observation_space.shape[0], env.action_space.n)

average_reward = []
for episode in range(num_episodes):
    rewards = []
    state = env.reset()

    while True:
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
        rewards.append(reward)
        agent.step(state, action, reward, next_state, done)
        state = next_state

        if done:
            average_reward.append(np.sum(rewards))
            break

    # monitor progress
    if episode % print_every == 0:
        reward_last_100 = int(np.mean(average_reward[-100:]))
        learning_rate = agent.scheduler.get_lr().squeeze()
        print(
            f"Episode {episode}/{num_episodes}, eps: {agent.epsilon:.3f}, lr: {learning_rate}, reward: {reward_last_100}"
        )
Example no. 3

import sys
from collections import deque

import numpy as np
import torch
from unityagents import UnityEnvironment  # Udacity DRLND build of the Banana environment

from agents import DQNAgent  # assuming the same local agents module as above

NUM_EPISODES = 100  # number of evaluation episodes (value assumed; undefined in the original snippet)
DEFAULT_EPS = 0.0   # epsilon used during evaluation (value assumed; undefined in the original snippet)

class Player():
    def __init__(self):
        """Player implementation of dqn and random agents"""
        self.env = UnityEnvironment(
            file_name="../env/Banana_Linux_NoVis/Banana.x86_64")
        self.brain_name = self.env.brain_names[0]
        brain = self.env.brains[self.brain_name]
        # reset the environment
        env_info = self.env.reset(train_mode=False)[self.brain_name]
        # number of actions
        self.action_size = brain.vector_action_space_size
        # examine the state space
        state = env_info.vector_observations[0]
        state_size = len(state)

        self.agent = DQNAgent(state_size, self.action_size, seed=0)
        self.agent.local_network.load_state_dict(
            torch.load('../saved_models/dqn_banana_best.pth'))

    def play(self):
        """Play using best dqn agent"""
        scores = []
        scores_window = deque(maxlen=10)
        best_score = -np.inf
        eps = DEFAULT_EPS
        for i in range(NUM_EPISODES):
            env_info = self.env.reset(train_mode=False)[self.brain_name]
            state = env_info.vector_observations[0]
            score = 0
            while True:
                action = self.agent.act(state, eps)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]
                score += reward
                state = next_state
                if done:
                    break

            scores_window.append(score)
            scores.append(score)
            best_score = max(best_score, score)

            if i % 10 == 0:
                print('\rProgress: {}/{}, avg score: {:.2f}'.format(
                    i, NUM_EPISODES, np.mean(scores_window)),
                      end="")
                sys.stdout.flush()

        return scores, best_score

    def play_random(self):
        """Play by choosing random actions"""
        scores = []
        scores_window = deque(maxlen=10)
        best_score = -np.inf
        eps = DEFAULT_EPS
        for i in range(NUM_EPISODES):
            env_info = self.env.reset(train_mode=False)[self.brain_name]
            state = env_info.vector_observations[0]
            score = 0
            while True:
                action = np.random.randint(self.action_size)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]
                score += reward
                state = next_state
                if done:
                    break

            scores_window.append(score)
            scores.append(score)
            best_score = max(best_score, score)

            if i % 10 == 0:
                print('\rProgress: {}/{}, avg score: {:.2f}'.format(
                    i, NUM_EPISODES, np.mean(scores_window)),
                      end="")
                sys.stdout.flush()

        return scores, best_score
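
A possible way to drive this class, comparing the trained agent against the random baseline (the matplotlib plotting is just one way to inspect the returned scores):

import matplotlib.pyplot as plt

player = Player()
dqn_scores, dqn_best = player.play()
random_scores, random_best = player.play_random()

plt.plot(dqn_scores, label="DQN agent")
plt.plot(random_scores, label="random agent")
plt.xlabel("episode")
plt.ylabel("score")
plt.legend()
plt.show()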