Example #1
0
def main():
    """Train a DQN agent on the tank-level environment.

    Runs EPISODES episodes of at most MAX_TIME steps each, storing
    transitions in the agent's replay memory and training on minibatches
    of BATCH_SIZE once enough samples are collected.  Relies on
    module-level globals: Environment, Agent, BATCH_SIZE, EPISODES,
    MAX_TIME, LIVE_REWARD_PLOT, SAVE_ANN_MODEL, keyboard, np.
    """
    # ============= Initialize variables ===========#
    environment = Environment()
    agent = Agent()
    # ================= Running episodes =================#
    all_rewards = []
    batch_size = BATCH_SIZE
    for e in range(EPISODES):
        state, action, next_state, episode_reward = environment.reset(
        )  # Reset level in tank
        # Running through states in the episode
        for t in range(MAX_TIME):
            action = agent.act(state)
            z = agent.action_choices[action]
            terminated, next_state = environment.get_next_state(z, state)
            reward = environment.get_reward(next_state, terminated, t)
            agent.remember(state, action, next_state, reward, terminated)
            episode_reward += reward
            # FIX: advance to the new state.  The original never updated
            # `state`, so the agent kept acting on the episode's initial
            # state (Example #2 correctly appends next_state and acts on
            # states[-1]).
            state = next_state
            if terminated:
                break
            if environment.show_rendering:
                environment.render(z, next_state[-1])
            # Train on a minibatch once the replay memory is large enough
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
            if keyboard.is_pressed('ctrl+c'):
                break
        # Live plot rewards
        # agent.decay_exploration()
        all_rewards.append(episode_reward)
        if keyboard.is_pressed('ctrl+c'):
            break
        if LIVE_REWARD_PLOT:
            environment.plot(all_rewards, agent.epsilon)
        if not environment.running:
            break

    print("##### {} EPISODES DONE #####".format(e))
    print("Max rewards for all episodes: {}".format(np.max(all_rewards)))
    print("Mean rewards for the last 10 episodes: {}".format(
        np.mean(all_rewards[-10:])))
    if SAVE_ANN_MODEL:
        print("ANN_Model was saved")
Example #2
0
def main():
    """Train a multi-tank Q-learning agent and plot mean rewards.

    Runs MAIN_PARAMS["EPISODES"] episodes of at most MAIN_PARAMS["MAX_TIME"]
    steps, periodically reports the mean reward over the last
    MEAN_EPISODE episodes, saves improved models, and finally plots the
    mean-reward curve.  Relies on module-level globals: Environment,
    Agent, TANK_PARAMS, TANK_DIST, MAIN_PARAMS, AGENT_PARAMS,
    sum_rewards, get_reward, keyboard, np, plt.
    """
    # ============= Initialize variables and objects ===========#
    max_mean_reward = 50 * len(TANK_PARAMS)
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    mean_episode = MAIN_PARAMS["MEAN_EPISODE"]
    episodes = MAIN_PARAMS["EPISODES"]
    all_rewards = []
    all_mean_rewards = []

    # ================= Running episodes =================#
    try:
        for e in range(episodes):
            states, episode_reward = environment.reset()  # Reset level in tank
            for t in range(MAIN_PARAMS["MAX_TIME"]):
                actions = agent.act(states[-1])  # get action choice from state
                z = agent.get_z(actions)

                terminated, next_state = environment.get_next_state(
                    z, states[-1], t)  # Calculate next state with action
                rewards = sum_rewards(
                    next_state, terminated,
                    get_reward)  # get reward from transition to next state

                # Store data
                episode_reward.append(np.sum(rewards))

                states.append(next_state)
                agent.remember(states, rewards, terminated, t)

                if environment.show_rendering:
                    environment.render(z)
                # Stop the episode as soon as any tank terminates
                if True in terminated:
                    break

            all_rewards.append(np.sum(np.array(episode_reward)))

            # Print mean reward and save better models
            if e % mean_episode == 0 and e != 0:
                mean_reward = np.mean(all_rewards[-mean_episode:])
                all_mean_rewards.append(mean_reward)
                # FIX: the original format string had a backslash line
                # continuation *inside* the literal, which embedded a run
                # of ~21 stray spaces into the printed message.
                print("{} of {}/{} episodes reward: {} exp_1: {} exp_2: {}"
                      .format(
                          mean_episode,
                          e,
                          episodes,
                          round(mean_reward, 2),
                          round(agent.epsilon[0], 2),
                          round(agent.epsilon[1], 2),
                      ))
                if agent.save_model_bool:
                    max_mean_reward = agent.save_model(mean_reward,
                                                       max_mean_reward)
                # Train model
            if agent.is_ready():
                agent.Qreplay(e)

            if keyboard.is_pressed("ctrl+x"):
                break

            if environment.live_plot:
                environment.plot(all_rewards, agent.epsilon)
            if not environment.running:
                break
            # if agent.epsilon <= agent.epsilon_min:
            #     break
    except KeyboardInterrupt:
        pass
    print("Memory length: {}".format(len(agent.memory)))
    print("##### {} EPISODES DONE #####".format(e + 1))
    print("Max rewards for all episodes: {}".format(np.max(all_rewards)))
    plt.ioff()
    plt.clf()
    # FIX: derive the x-axis from len(all_mean_rewards) so plt.plot never
    # receives mismatched lengths (the original arange(0, e - e % mean_episode,
    # mean_episode) could disagree with all_mean_rewards when the loop exited
    # early via ctrl+x / KeyboardInterrupt / environment stop).  Mean rewards
    # are recorded at episodes mean_episode, 2*mean_episode, ...
    x_range = np.arange(1, len(all_mean_rewards) + 1) * mean_episode
    plt.plot(x_range, all_mean_rewards)
    plt.ylabel("Mean rewards of last {} episodes".format(mean_episode))
    plt.show()