Example #1
def train(agent: DQNAgent, env: Env, episodes: int = 10_000):
    display = False

    progression = tqdm.trange(episodes,
                              desc=f"Training {agent.name}",
                              unit="episode")
    fps = 0

    for episode in progression:
        state = env.reset()

        mean_reward = 0
        return_ = 0
        x_pos = 0

        for step in count(1):
            t = time()
            action = agent.act(np.asarray(state), explore=True)
            next_state, reward, done, info = env.step(action)
            agent.memorize(
                Experience((state, next_state, action, done, reward)))
            state = next_state
            agent.learn()

            mean_reward += (reward - mean_reward) / step
            return_ += reward
            x_pos = max(x_pos, info["x_pos"])
            fps = fps * 0.9 + 0.1 / (time() - t)

            if step % 100 == 0:
                try:
                    # Re-read the display toggle so rendering can be switched
                    # on or off while training is running.
                    display = (yaml.safe_load(
                        (PROJECT_DIRECTORY / "display.yml").read_text()).get(
                            agent.name, {}).get("display", False))
                except Exception:
                    pass
            if display:
                env.render()

            if done or info["flag_get"]:
                break

        progression.set_description(
            f"Training {agent.name}; "
            f"Frames: {agent.step} ({fps:.0f} FPS); "
            f"last progression: {x_pos} ({x_pos/3260:.1%}); "
            f"eps: {agent.eps:.2f}")

        agent.register_episode(
            EpisodeMetrics(episode=episode,
                           x_pos=x_pos,
                           return_=return_,
                           steps=step))

    agent.save_model()
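
Example #1 relies on two small containers, Experience and EpisodeMetrics, that are not shown in the excerpt. A minimal sketch of what they could look like, inferred only from how they are constructed above (field names and layout are assumptions):

from dataclasses import dataclass


class Experience:
    """Wraps one (state, next_state, action, done, reward) transition,
    matching the call Experience((state, next_state, action, done, reward))."""

    def __init__(self, transition):
        (self.state, self.next_state,
         self.action, self.done, self.reward) = transition


@dataclass
class EpisodeMetrics:
    """Per-episode summary, matching the keyword arguments used above."""
    episode: int
    x_pos: int
    return_: float
    steps: int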
Example #2
def main():

    env = UnityEnvironment(
        file_name="/home/faten/projects/deep-reinforcement-learning/"
                  "p1_navigation/Banana_Linux/Banana.x86_64")

    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
    state_size = len(state)

    agent = DQNAgent(state_size, action_size, seed=0)

    scores = train(env, agent)

    # plot the scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()

    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

    for i in range(3):
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        for j in range(200):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            state = env_info.vector_observations[0]
            if env_info.local_done[0]:
                break

    env.close()
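
The main() function above calls a train(env, agent) that is not included in the excerpt. A minimal sketch of a compatible loop follows; only the call signature and the returned list of per-episode scores come from main(), while the hyperparameters, the agent.act(state, eps) / agent.step(...) interface, and the epsilon schedule are assumptions:

def train(env, agent, n_episodes=2000,
          eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    # Sketch only: a plain episodic loop over the Unity "Banana" brain.
    brain_name = env.brain_names[0]
    scores = []
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        while True:
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # decay exploration each episode
    return scores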
Example #3
    scores = []                        # per-episode scores, appended to below
    test_scores_i = []
    avg_scores = []
    scores_window = deque(maxlen=100)  # rolling window for the 100-episode average
    config = generate_configuration_qnet(action_size, state_size)
    agent = DQNAgent(config)
    agent.create_dirs()

    eps = config.eps_start

    for i_episode in range(1, config.n_episodes + 1):
        # Reset the environment and the score
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        while True:
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)
        scores.append(score)
        avg_scores.append(np.mean(scores_window))
        eps = max(config.eps_min, config.eps_decay * eps)
        print(
            '\rEpisode {}\tEps {:.2f}\tLast Score: {:.2f}\tAverage Score: {:.2f}'
            .format(i_episode, eps, score, np.mean(scores_window)),
            end='')

Example #4
reward_history = []
step_history = []
nb_episodes = 1000
episode_reward_average = -1
with tqdm.trange(nb_episodes) as t:
    for episode in t:
        # agent.reset()
        observation = env.reset()
        observation = deepcopy(observation)
        agent.observe(observation)
        done = False
        episode_reward = []
        step = 0
        # train
        while not done:
            action = agent.act()
            observation, reward, done, info = env.step(action)
            observation = deepcopy(observation)
            step += 1
            # reward = observation[0]
            if done:
                if step < 199:
                    reward = 100
                agent.observe(observation, reward, done)
                episode_reward.append(reward)

                episode_reward_average = 0.01*np.mean(episode_reward) + 0.99*episode_reward_average
                reward_history.append(np.mean(episode_reward))
                t.set_description('Episode {}, steps:{}, reward:{} '.format(episode, step, np.mean(episode_reward)))
                t.set_postfix(episode_reward=episode_reward_average)
                step_history.append(step)
                break
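
The episode_reward_average update at the end of Example #4 is an exponential moving average with smoothing factor 0.01. The same update in isolation (function and variable names here are illustrative, not taken from the excerpt):

def ema(average, new_value, alpha=0.01):
    # average <- alpha * new_value + (1 - alpha) * average
    return alpha * new_value + (1 - alpha) * average


running = -1.0
for value in [10.0, 12.0, 8.0]:
    running = ema(running, value)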
Example #5
import gym
import numpy as np

from agents.dqn_agent import DQNAgent

env = gym.make("LunarLander-v2")
env.seed(0)
agent = DQNAgent(env.action_space.n, env.observation_space.shape[0])
episodes = 400
steps = 3000
loss = []
for i_episode in range(episodes):
    obv = np.reshape(env.reset(), (1, 8))
    total_reward = 0
    done = False
    for t in range(steps):
        # env.render()
        # print(observation)
        action = agent.act(obv, total_reward, done)
        next_obv, reward, done, info = env.step(action)
        next_obv = np.reshape(next_obv, (1, 8))
        total_reward += reward
        agent.store_transition(obv, action, reward, next_obv, done)
        obv = next_obv
        agent.replay()
        if done:
            print("{}/{}, reward: {} in {} timesteps".format(
                i_episode, episodes, total_reward, t + 1))
            break
    loss.append(total_reward)

    # Average score of the last 100 episodes
    if len(loss) >= 100:
        is_solved = np.mean(loss[-100:])
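
Example #5 stops right after computing is_solved, the mean return over the last 100 episodes. A typical use of such a value, sketched here purely as an assumption (the 200-point threshold is the conventional LunarLander-v2 solve criterion and is not stated in the excerpt):

# Hypothetical check, not part of the original excerpt.
if len(loss) >= 100 and np.mean(loss[-100:]) >= 200:
    print("Environment solved in {} episodes".format(i_episode + 1))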
Example #6
def test():
    # set hyperparameters (not really important for running the agent)
    # higher eps. decay rate
    buffer_size = int(1e5)
    batch_size = 64
    gamma = 0.99
    tau = 1e-3
    learning_rate = 5e-4
    eps_start = 1.0
    eps_end = 0.01
    eps_decay = 0.999
    fc1_units = 64
    fc2_units = 64
    q_function_update_fraction = 4
    seed = 0

    ############ THE ENVIRONMENT ###############
    env = UnityEnvironment(file_name='Banana_Linux/Banana.x86_64', seed=seed)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # get the number of agents
    num_agents = len(env_info.agents)

    # get the size of the action space
    action_size = brain.vector_action_space_size

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]

    # initialize agent

    dqn_agent = DQNAgent(name=None,
                         state_size=state_size,
                         action_size=action_size,
                         learning_rate=learning_rate,
                         discount_rate=gamma,
                         eps_start=eps_start,
                         eps_end=eps_end,
                         eps_decay=eps_decay,
                         tau=tau,
                         network_architecture=[fc1_units, fc2_units],
                         experience_replay_buffer_size=buffer_size,
                         experience_replay_buffer_batch_size=batch_size,
                         experience_replay_start_size=3200,
                         q_function_update_fraction=q_function_update_fraction,
                         device='gpu',
                         seed=seed)

    dqn_agent.load_state_dict(torch.load('checkpoint.pth'))

    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)

    for i in range(200):
        actions = dqn_agent.act(states)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += rewards
        states = next_states
        if np.any(dones):
            break
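
Every excerpt above calls some form of DQNAgent.act(). For reference, a generic epsilon-greedy action selection in PyTorch, sketched as an illustration of what such a method typically does rather than the exact implementation behind any of these agents:

import random

import numpy as np
import torch


def epsilon_greedy_act(q_network, state, action_size, eps=0.0, device="cpu"):
    # With probability eps pick a random action, otherwise the greedy one
    # according to the current Q-network.
    if random.random() < eps:
        return random.randrange(action_size)
    state_t = torch.from_numpy(
        np.asarray(state, dtype=np.float32)).unsqueeze(0).to(device)
    with torch.no_grad():
        q_values = q_network(state_t)
    return int(q_values.argmax(dim=1).item())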