コード例 #1
0
def main():
    #============= Initialize variables ===========#

    environment = Environment()
    agent = Agent()
    # ================= Running episodes =================#
    all_rewards = []
    batch_size = BATCH_SIZE
    for e in range(EPISODES):
        state, action, next_state, episode_reward = environment.reset(
        )  # Reset level in tank
        # Running through states in the episode
        for t in range(MAX_TIME):
            action = agent.act(state)
            z = agent.action_choices[action]
            terminated, next_state = environment.get_next_state(z, state)
            reward = environment.get_reward(next_state, terminated, t)
            agent.remember(state, action, next_state, reward, terminated)
            episode_reward += reward
            if terminated:
                break
            if environment.show_rendering:
                environment.render(z, next_state[-1])
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
            if keyboard.is_pressed('ctrl+c'):
                break
        # Live plot rewards
        # agent.decay_exploration()
        all_rewards.append(episode_reward)
        if keyboard.is_pressed('ctrl+c'):
            break
        if LIVE_REWARD_PLOT:
            environment.plot(all_rewards, agent.epsilon)
        if not environment.running:
            break

    print("##### {} EPISODES DONE #####".format(e))
    print("Max rewards for all episodes: {}".format(np.max(all_rewards)))
    print("Mean rewards for the last 10 episodes: {}".format(
        np.mean(all_rewards[-10:])))
    if SAVE_ANN_MODEL:
        print("ANN_Model was saved")
コード例 #2
0
def main():
    # ============= Initialize variables and objects ===========#
    max_mean_reward = MAIN_PARAMS["MAX_MEAN_REWARD"]
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    mean_episode = MAIN_PARAMS["MEAN_EPISODE"]
    episodes = MAIN_PARAMS["EPISODES"]

    all_rewards = []
    all_mean_rewards = []
    t_mean = []

    # ================= Running episodes =================#
    try:
        for e in range(episodes):
            states, episode_reward = environment.reset()  # Reset level in tank
            for t in range(MAIN_PARAMS["MAX_TIME"]):
                actions = agent.act(states[-1])  # get action choice from state
                z = agent.get_z(actions)

                terminated, next_state = environment.get_next_state(
                    z, states[-1], t
                )  # Calculate next state with action
                rewards = sum_rewards(
                    next_state, terminated, get_reward
                )  # get reward from transition to next state

                # Store data
                rewards = sum_rewards(next_state, terminated, get_reward)
                rewards.append(np.sum(rewards))
                episode_reward.append(rewards)

                states.append(next_state)
                agent.remember(states, rewards, terminated, t)

                if environment.show_rendering:
                    environment.render(z)
                if True in terminated:
                    break

            episode_reward = np.array(episode_reward)
            episode_total_reward = []
            t_mean.append(t)
            for i in range(environment.n_tanks + 1):
                episode_total_reward.append(sum(episode_reward[:, i]))
            all_rewards.append(episode_total_reward)

            # Print mean reward and save better models
            if e % mean_episode == 0 and e != 0:
                mean_reward = np.array(all_rewards[-mean_episode:])
                mean_r = []
                t_mean = int(np.mean(t_mean))
                for i in range(environment.n_tanks + 1):
                    mean_r.append(np.mean(mean_reward[:, i]))
                all_mean_rewards.append(mean_r)
                print(
                    f"Mean {mean_episode} of {e}/{episodes} episodes ### timestep {t_mean+1} ### tot reward: {mean_r[-1]}  \
                    r1: {mean_r[0]} ex1: {round(agent.epsilon[0],2)} r2: {mean_r[1]} ex2: {round(agent.epsilon[1],2)}"
                )
                t_mean = []
                if mean_r[-1] >= max_mean_reward:
                    agent.save_trained_model()
                    max_mean_reward = mean_r[-1]
                # Train model
            if agent.is_ready():
                agent.Qreplay(e)

            if not environment.running:
                break
            # if agent.epsilon <= agent.epsilon_min:
            #     break
    except KeyboardInterrupt:
        pass
    print("Memory length: {}".format(len(agent.memory)))
    print("##### {} EPISODES DONE #####".format(e + 1))
    print("Max rewards for all episodes: {}".format(np.max(all_rewards)))

    all_mean_rewards = np.array(all_mean_rewards)
    labels = ["Tank 1", "Tank 2"]

    for i in range(environment.n_tanks):
        plt.plot(all_mean_rewards[:, i], label=labels[i])
    plt.legend()
    plt.show()

    plt.plot(all_mean_rewards[:, -1], label="Total rewards")
    plt.ylabel("Mean rewards of last {} episodes".format(mean_episode))
    plt.legend()
    plt.show()
コード例 #3
0
def main():
    # ============= Initialize variables and objects ===========#
    max_mean_reward = 50 * len(TANK_PARAMS)
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    mean_episode = MAIN_PARAMS["MEAN_EPISODE"]
    episodes = MAIN_PARAMS["EPISODES"]
    all_rewards = []
    all_mean_rewards = []

    # ================= Running episodes =================#
    try:
        for e in range(episodes):
            states, episode_reward = environment.reset()  # Reset level in tank
            for t in range(MAIN_PARAMS["MAX_TIME"]):
                actions = agent.act(states[-1])  # get action choice from state
                z = agent.get_z(actions)

                terminated, next_state = environment.get_next_state(
                    z, states[-1], t)  # Calculate next state with action
                rewards = sum_rewards(
                    next_state, terminated,
                    get_reward)  # get reward from transition to next state

                # Store data
                episode_reward.append(np.sum(rewards))

                states.append(next_state)
                agent.remember(states, rewards, terminated, t)

                if environment.show_rendering:
                    environment.render(z)
                if True in terminated:
                    break

            all_rewards.append(np.sum(np.array(episode_reward)))

            # Print mean reward and save better models
            if e % mean_episode == 0 and e != 0:
                mean_reward = np.mean(all_rewards[-mean_episode:])
                all_mean_rewards.append(mean_reward)
                print("{} of {}/{} episodes\
                     reward: {} exp_1: {} exp_2: {}".format(
                    mean_episode,
                    e,
                    episodes,
                    round(mean_reward, 2),
                    round(agent.epsilon[0], 2),
                    round(agent.epsilon[1], 2),
                ))
                if agent.save_model_bool:
                    max_mean_reward = agent.save_model(mean_reward,
                                                       max_mean_reward)
                # Train model
            if agent.is_ready():
                agent.Qreplay(e)

            if keyboard.is_pressed("ctrl+x"):
                break

            if environment.live_plot:
                environment.plot(all_rewards, agent.epsilon)
            if not environment.running:
                break
            # if agent.epsilon <= agent.epsilon_min:
            #     break
    except KeyboardInterrupt:
        pass
    print("Memory length: {}".format(len(agent.memory)))
    print("##### {} EPISODES DONE #####".format(e + 1))
    print("Max rewards for all episodes: {}".format(np.max(all_rewards)))
    plt.ioff()
    plt.clf()
    x_range = np.arange(0, e - e % mean_episode, mean_episode)
    plt.plot(x_range, all_mean_rewards)
    plt.ylabel("Mean rewards of last {} episodes".format(mean_episode))
    plt.show()
def main():
    # ============= Initialize variables and objects ===========#
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    z = []
    h = []
    d = []
    # ================= Running episodes =================#

    state, episode_reward = environment.reset()
    h_ = np.array([state[0][0][0], state[0][1][0]])
    h.append(h_)
    for t in range(MAIN_PARAMS["MAX_TIME"]):
        action = agent.act(state[-1])  # get action choice from state
        z_ = agent.action_choices[
            action]  # convert action choice into valve position
        z.append(np.array(z_))
        terminated, next_state = environment.get_next_state(
            z[-1], state[-1], t)  # Calculate next state with action
        reward = get_reward(
            next_state, terminated)  # get reward from transition to next state

        # Store data
        episode_reward.append(reward)

        state.append(next_state)
        h_ = []
        d_ = []
        for i in range(agent.n_tanks):
            d_.append(environment.tanks[i].dist.flow[t] + environment.q_inn[i])
            h_.append(np.array(next_state[i][0]))
        d.append(d_)
        h.append(h_)
        if environment.show_rendering:
            environment.render(z[-1])
        if True in terminated:
            break

        if keyboard.is_pressed("ctrl+x"):
            break

        if not environment.running:
            break
    print(np.sum(episode_reward))

    _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)
    d = np.array(d)
    h = np.array(h[:-1])
    z = np.array(z)
    h *= 10

    ax1.plot(h[:-1, 0], color="peru", label="Tank 1")
    ax1.plot(h[:-1, 1], color="firebrick", label="Tank 2")
    ax1.set_ylabel("Level")
    ax1.legend(loc="upper right")
    ax1.set_ylim(0, 10)

    ax2.plot(z[1:, 0], color="peru", label="Tank 1")
    ax2.plot(z[1:, 1], color="firebrick", label="Tank 2")
    ax2.legend(loc="upper right")
    ax2.set_ylabel("Valve")
    ax2.set_ylim(0, 1.01)

    ax3.plot(d[:, 0], color="peru", label="Tank 1")
    ax3.plot(d[:, 1], color="firebrick", label="Tank 2")
    ax3.set_ylabel("Disturbance")
    ax3.legend(loc="upper right")

    # plt.legend([l1, l2, l3], ["Tank height", "Valve position", "Disturbance"])
    plt.tight_layout()
    plt.xlabel("Time")
    plt.show()
コード例 #5
0
def main():
    # ============= Initialize variables and objects ===========#
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    z = []
    h = []
    d = []
    # ================= Running episodes =================#

    state, episode_reward = environment.reset()
    h_ = np.array([state[0][i][0] for i in range(6)])
    h.append(h_)
    for t in range(MAIN_PARAMS["MAX_TIME"]):
        action = agent.act(state[-1])  # get action choice from state
        z_ = agent.action_choices[
            action]  # convert action choice into valve position
        z.append(np.array(z_))
        terminated, next_state = environment.get_next_state(
            z[-1], state[-1], t)  # Calculate next state with action
        reward = sum_rewards(
            next_state, terminated,
            get_reward)  # get reward from transition to next state
        # Store data
        episode_reward.append(reward)

        state.append(next_state)
        h_ = []
        d_ = []
        for i in range(agent.n_tanks):
            d_.append(environment.tanks[i].dist.flow[t - 1] +
                      environment.q_inn[i])
            h_.append(np.array(next_state[i][0]))
        d.append(d_)
        h.append(h_)
        if environment.show_rendering:
            environment.render(z[-1])
        if True in terminated:
            break

        if not environment.running:
            break
    colors = [
        "peru",
        "firebrick",
        "darkslategray",
        "darkviolet",
        "mediumseagreen",
        "darkcyan",
    ]
    print(f"reward: {np.sum(episode_reward)}")
    h = np.array(h) * 10
    d = np.array(d)
    z = np.array(z)
    for i in range(2):
        _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)
        ax1.plot(
            h[1:-1, 0 + i * 3],
            color=colors[0 + i * 3],
            label="Tank {}".format(str(1 + i * 3)),
        )
        ax1.plot(
            h[1:-1, 1 + i * 3],
            color=colors[1 + i * 3],
            label="Tank {}".format(str(2 + i * 3)),
        )
        ax1.plot(
            h[1:-1, 2 + i * 3],
            color=colors[2 + i * 3],
            label="Tank {}".format(str(3 + i * 3)),
        )
        ax1.set_ylabel("Level")
        ax1.legend(loc="upper left")
        ax1.set_ylim(2.5, 7.5)

        ax2.plot(
            z[1:, 0 + i * 3],
            color=colors[0 + i * 3],
            label="Tank {}".format(str(1 + i * 3)),
        )
        ax2.plot(
            z[1:, 1 + i * 3],
            color=colors[1 + i * 3],
            label="Tank {}".format(str(2 + i * 3)),
        )
        ax2.plot(
            z[1:, 2 + i * 3],
            color=colors[2 + i * 3],
            label="Tank {}".format(str(3 + i * 3)),
        )
        ax2.set_ylabel("Valve")
        ax2.legend(loc="upper left")
        ax2.set_ylim(-0.01, 1.01)

        ax3.plot(
            d[1:-1, 0 + i * 3],
            color=colors[0 + i * 3],
            label="Tank {}".format(str(1 + i * 3)),
        )
        ax3.plot(
            d[1:-1, 1 + i * 3],
            color=colors[1 + i * 3],
            label="Tank {}".format(str(2 + i * 3)),
        )
        ax3.plot(
            d[1:-1, 2 + i * 3],
            color=colors[2 + i * 3],
            label="Tank {}".format(str(3 + i * 3)),
        )
        ax3.set_ylabel("Disturbance")
        ax3.legend(loc="upper left")
        ax3.set_ylim(-0.01, 4)

        plt.tight_layout()
        plt.xlabel("Time")
        plt.show()
コード例 #6
0
def main():
    # ============= Initialize variables and objects ===========#
    max_mean_reward = MAIN_PARAMS["MAX_MEAN_REWARD"]
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    mean_episode = MAIN_PARAMS["MEAN_EPISODE"]
    episodes = MAIN_PARAMS["EPISODES"]

    all_rewards = []
    all_mean_rewards = []
    t_mean = []

    # ================= Running episodes =================#
    try:
        for e in range(episodes):
            states, episode_reward = environment.reset()  # Reset level in tank

            for t in range(MAIN_PARAMS["MAX_TIME"]):
                z = agent.act(states[-1])  # get action choice from state

                terminated, next_state = environment.get_next_state(
                    z, states[-1], t)
                states.append(next_state)

                rewards = sum_rewards(next_state, terminated, get_reward)
                rewards.append(np.sum(rewards))
                episode_reward.append(rewards)

                agent.remember(states, rewards, terminated, t)

                if environment.show_rendering:
                    environment.render(z)
                if True in terminated:
                    break

            # Collect summary of episode
            episode_reward = np.array(episode_reward)
            episode_total_reward = []
            t_mean.append(t)
            for i in range(environment.n_tanks + 1):
                episode_total_reward.append(sum(episode_reward[:, i]))
            all_rewards.append(episode_total_reward)

            # Print mean reward and save better models
            if e % mean_episode == 0 and e != 0:
                mean_reward = np.array(all_rewards[-mean_episode:])
                mean_r = []
                t_mean = int(np.mean(t_mean))
                for i in range(environment.n_tanks + 1):
                    mean_r.append(np.mean(mean_reward[:, i]))
                all_mean_rewards.append(mean_r)
                print(
                    f"Mean {mean_episode} of {e}/{episodes} episodes ### timestep {t_mean+1} ### tot reward: {mean_r[-1]}"
                )
                t_mean = []
                if mean_r[-1] >= max_mean_reward:
                    agent.save_trained_model()
                    max_mean_reward = mean_r[-1]
            agent.PolicyGradientReplay(e)

            if not environment.running:
                break

    except KeyboardInterrupt:
        pass
    print("Memory length: {}".format(len(agent.memory)))
    print("##### {} EPISODES DONE #####".format(e + 1))
    print("Max rewards for all episodes: {}".format(np.max(all_rewards)))

    all_mean_rewards = np.array(all_mean_rewards)

    plt.plot(all_mean_rewards[:, -1], label="Total rewards")
    plt.ylabel("Mean rewards of last {} episodes".format(mean_episode))
    plt.legend()
    plt.show()