def main():
    """Train the agent on the tank environment and print a reward summary.

    Runs up to ``EPISODES`` episodes of at most ``MAX_TIME`` steps each.
    Every transition is handed to ``agent.remember``; once ``agent.memory``
    holds more than ``BATCH_SIZE`` entries, ``agent.replay`` is called after
    each step. Training stops early while ``ctrl+c`` is held or when
    ``environment.running`` becomes false.
    """
    # ============= Initialize variables ===========#
    environment = Environment()
    agent = Agent()

    # ================= Running episodes =================#
    all_rewards = []
    batch_size = BATCH_SIZE
    episodes_done = 0
    for e in range(EPISODES):
        # Reset level in tank
        state, action, next_state, episode_reward = environment.reset()

        # Running through states in the episode
        for t in range(MAX_TIME):
            action = agent.act(state)
            z = agent.action_choices[action]
            terminated, next_state = environment.get_next_state(z, state)
            reward = environment.get_reward(next_state, terminated, t)
            agent.remember(state, action, next_state, reward, terminated)
            episode_reward += reward
            if terminated:
                break
            if environment.show_rendering:
                environment.render(z, next_state[-1])
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
            if keyboard.is_pressed("ctrl+c"):
                break
            # Advance to the successor state so the next action is chosen
            # from where the tank actually is, not from the reset state.
            state = next_state

        all_rewards.append(episode_reward)
        episodes_done = e + 1  # e is zero-based; report a count, not an index
        if keyboard.is_pressed("ctrl+c"):
            break
        if LIVE_REWARD_PLOT:
            environment.plot(all_rewards, agent.epsilon)
        if not environment.running:
            break

    print("##### {} EPISODES DONE #####".format(episodes_done))
    if all_rewards:
        print("Max rewards for all episodes: {}".format(np.max(all_rewards)))
        print("Mean rewards for the last 10 episodes: {}".format(
            np.mean(all_rewards[-10:])))
    else:
        # np.max raises ValueError on an empty sequence.
        print("No episodes were run; no reward statistics available.")
    if SAVE_ANN_MODEL:
        # NOTE(review): nothing in this function saves the model; presumably
        # the agent persists it elsewhere -- confirm before trusting this.
        print("ANN_Model was saved")
def main():
    """Train the multi-tank Q-learning agent and plot mean-reward curves.

    Each step appends one row to ``episode_reward``: the per-tank rewards
    returned by ``sum_rewards`` followed by their sum. After every episode the
    column totals are stored in ``all_rewards``. Every ``MEAN_EPISODE``
    episodes the mean of the most recent totals is printed and recorded, and
    the model is saved when the mean total reward reaches the best value seen
    so far (initially ``MAIN_PARAMS["MAX_MEAN_REWARD"]``). A
    ``KeyboardInterrupt`` stops training and falls through to the summary.
    """
    # ============= Initialize variables and objects ===========#
    max_mean_reward = MAIN_PARAMS["MAX_MEAN_REWARD"]
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    mean_episode = MAIN_PARAMS["MEAN_EPISODE"]
    episodes = MAIN_PARAMS["EPISODES"]
    n_columns = environment.n_tanks + 1  # one column per tank plus the total
    all_rewards = []
    all_mean_rewards = []
    episode_lengths = []  # last time step of each episode since last summary
    episodes_started = 0

    # ================= Running episodes =================#
    try:
        for e in range(episodes):
            episodes_started = e + 1
            states, episode_reward = environment.reset()  # Reset level in tank
            for t in range(MAIN_PARAMS["MAX_TIME"]):
                actions = agent.act(states[-1])  # action choice from state
                z = agent.get_z(actions)
                terminated, next_state = environment.get_next_state(
                    z, states[-1], t)  # next state given the action
                # Reward for the transition; computed once per step.
                rewards = sum_rewards(next_state, terminated, get_reward)
                rewards.append(np.sum(rewards))

                # Store data
                episode_reward.append(rewards)
                states.append(next_state)
                agent.remember(states, rewards, terminated, t)
                if environment.show_rendering:
                    environment.render(z)
                if True in terminated:
                    break

            episode_reward = np.array(episode_reward)
            episode_lengths.append(t)
            all_rewards.append(
                [sum(episode_reward[:, i]) for i in range(n_columns)])

            # Print mean reward and save better models
            if e % mean_episode == 0 and e != 0:
                recent = np.array(all_rewards[-mean_episode:])
                mean_r = [np.mean(recent[:, i]) for i in range(n_columns)]
                mean_steps = int(np.mean(episode_lengths))
                all_mean_rewards.append(mean_r)
                # assumes agent.epsilon holds one exploration rate per tank
                # -- TODO confirm against Agent
                per_tank = " ".join(
                    f"r{i + 1}: {mean_r[i]} "
                    f"ex{i + 1}: {round(agent.epsilon[i], 2)}"
                    for i in range(environment.n_tanks))
                print(
                    f"Mean {mean_episode} of {e}/{episodes} episodes "
                    f"### timestep {mean_steps + 1} "
                    f"### tot reward: {mean_r[-1]} {per_tank}")
                episode_lengths = []
                if mean_r[-1] >= max_mean_reward:
                    agent.save_trained_model()
                    max_mean_reward = mean_r[-1]

            # Train model
            if agent.is_ready():
                agent.Qreplay(e)
            if not environment.running:
                break
    except KeyboardInterrupt:
        pass

    print("Memory length: {}".format(len(agent.memory)))
    print("##### {} EPISODES DONE #####".format(episodes_started))
    if all_rewards:
        print("Max rewards for all episodes: {}".format(np.max(all_rewards)))
    else:
        # np.max raises ValueError on an empty sequence.
        print("No episode finished; no reward statistics available.")

    if not all_mean_rewards:
        # An empty list becomes a 1-D array, which cannot be column-indexed.
        print("No mean-reward summary was recorded; skipping plots.")
        return

    mean_curves = np.array(all_mean_rewards)
    for i in range(environment.n_tanks):
        plt.plot(mean_curves[:, i], label="Tank {}".format(i + 1))
    plt.legend()
    plt.show()

    plt.plot(mean_curves[:, -1], label="Total rewards")
    plt.ylabel("Mean rewards of last {} episodes".format(mean_episode))
    plt.legend()
    plt.show()
def main():
    """Train the multi-tank agent on summed rewards and plot the mean curve.

    Each step adds ``np.sum(rewards)`` to the episode's reward list; the
    episode total is stored in ``all_rewards``. Every ``MEAN_EPISODE``
    episodes the mean of the most recent totals is printed and recorded, and
    ``agent.save_model`` is consulted when ``agent.save_model_bool`` is set.
    Training stops on ``ctrl+x``, on ``KeyboardInterrupt``, or when
    ``environment.running`` becomes false; the recorded means are then plotted
    against the episode index at which each was taken.
    """
    # ============= Initialize variables and objects ===========#
    max_mean_reward = 50 * len(TANK_PARAMS)
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    mean_episode = MAIN_PARAMS["MEAN_EPISODE"]
    episodes = MAIN_PARAMS["EPISODES"]
    all_rewards = []
    all_mean_rewards = []
    episodes_started = 0

    # ================= Running episodes =================#
    try:
        for e in range(episodes):
            episodes_started = e + 1
            states, episode_reward = environment.reset()  # Reset level in tank
            for t in range(MAIN_PARAMS["MAX_TIME"]):
                actions = agent.act(states[-1])  # action choice from state
                z = agent.get_z(actions)
                terminated, next_state = environment.get_next_state(
                    z, states[-1], t)  # next state given the action
                rewards = sum_rewards(next_state, terminated, get_reward)

                # Store data
                episode_reward.append(np.sum(rewards))
                states.append(next_state)
                agent.remember(states, rewards, terminated, t)
                if environment.show_rendering:
                    environment.render(z)
                if True in terminated:
                    break
            all_rewards.append(np.sum(np.array(episode_reward)))

            # Print mean reward and save better models
            if e % mean_episode == 0 and e != 0:
                mean_reward = np.mean(all_rewards[-mean_episode:])
                all_mean_rewards.append(mean_reward)
                # One entry per exploration rate, so any tank count works.
                exploration = " ".join(
                    "exp_{}: {}".format(i, round(eps, 2))
                    for i, eps in enumerate(agent.epsilon, start=1))
                print("{} of {}/{} episodes reward: {} {}".format(
                    mean_episode, e, episodes, round(mean_reward, 2),
                    exploration))
                if agent.save_model_bool:
                    max_mean_reward = agent.save_model(
                        mean_reward, max_mean_reward)

            # Train model
            if agent.is_ready():
                agent.Qreplay(e)
            if keyboard.is_pressed("ctrl+x"):
                break
            if environment.live_plot:
                environment.plot(all_rewards, agent.epsilon)
            if not environment.running:
                break
    except KeyboardInterrupt:
        pass

    print("Memory length: {}".format(len(agent.memory)))
    print("##### {} EPISODES DONE #####".format(episodes_started))
    if all_rewards:
        print("Max rewards for all episodes: {}".format(np.max(all_rewards)))
    else:
        # np.max raises ValueError on an empty sequence.
        print("No episode finished; no reward statistics available.")

    plt.ioff()
    plt.clf()
    # Means are recorded at episodes m, 2m, 3m, ... Deriving x from the number
    # of recorded means keeps x and y the same length even when training is
    # interrupted, and places each point at the episode it summarises.
    x_range = mean_episode * np.arange(1, len(all_mean_rewards) + 1)
    plt.plot(x_range, all_mean_rewards)
    plt.ylabel("Mean rewards of last {} episodes".format(mean_episode))
    plt.show()
def main():
    """Run a single episode on the two-tank setup and plot the trajectory.

    The agent picks an action index each step, which is mapped to a valve
    position through ``agent.action_choices``. Valve positions, tank levels
    and disturbances (``dist.flow[t] + q_inn``) are logged per step. The run
    ends at ``MAX_TIME``, when any tank terminates, on ``ctrl+x``, or when
    ``environment.running`` is false. Afterwards the summed reward is printed
    and three stacked axes (level, valve, disturbance) are shown.
    """
    # ============= Initialize variables and objects ===========#
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    valve_log, level_log, disturbance_log = [], [], []

    # ================= Running episodes =================#
    state, episode_reward = environment.reset()
    level_log.append(np.array([state[0][0][0], state[0][1][0]]))

    for t in range(MAIN_PARAMS["MAX_TIME"]):
        # Action choice from the latest state, converted to valve position.
        action = agent.act(state[-1])
        valve = np.array(agent.action_choices[action])
        valve_log.append(valve)

        # Next state and the reward for reaching it.
        terminated, next_state = environment.get_next_state(
            valve, state[-1], t)
        episode_reward.append(get_reward(next_state, terminated))
        state.append(next_state)

        # Per-tank disturbance and level for this step.
        tank_ids = range(agent.n_tanks)
        disturbance_log.append([
            environment.tanks[k].dist.flow[t] + environment.q_inn[k]
            for k in tank_ids
        ])
        level_log.append([np.array(next_state[k][0]) for k in tank_ids])

        if environment.show_rendering:
            environment.render(valve)

        # Checked in order: termination, manual abort, closed window.
        stop = ((True in terminated)
                or keyboard.is_pressed("ctrl+x")
                or not environment.running)
        if stop:
            break

    print(np.sum(episode_reward))

    _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)
    disturbances = np.array(disturbance_log)
    levels = np.array(level_log[:-1])
    valves = np.array(valve_log)
    levels *= 10

    tank_styles = (("peru", "Tank 1"), ("firebrick", "Tank 2"))

    for col, (color, label) in enumerate(tank_styles):
        ax1.plot(levels[:-1, col], color=color, label=label)
    ax1.set_ylabel("Level")
    ax1.legend(loc="upper right")
    ax1.set_ylim(0, 10)

    for col, (color, label) in enumerate(tank_styles):
        ax2.plot(valves[1:, col], color=color, label=label)
    ax2.legend(loc="upper right")
    ax2.set_ylabel("Valve")
    ax2.set_ylim(0, 1.01)

    for col, (color, label) in enumerate(tank_styles):
        ax3.plot(disturbances[:, col], color=color, label=label)
    ax3.set_ylabel("Disturbance")
    ax3.legend(loc="upper right")

    plt.tight_layout()
    plt.xlabel("Time")
    plt.show()
def main():
    """Run a single episode on the multi-tank setup and plot the trajectory.

    Valve positions, tank levels and disturbances are logged per step for all
    ``agent.n_tanks`` tanks. The run ends at ``MAX_TIME``, when any tank
    terminates, or when ``environment.running`` is false. The summed reward
    is printed and the tanks are plotted three per figure, each figure having
    stacked level, valve and disturbance axes.
    """
    # ============= Initialize variables and objects ===========#
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    n_tanks = agent.n_tanks
    z = []
    h = []
    d = []

    # ================= Running episodes =================#
    state, episode_reward = environment.reset()
    # Initial levels; sized by n_tanks so this row matches the per-step rows.
    h.append(np.array([state[0][i][0] for i in range(n_tanks)]))

    for t in range(MAIN_PARAMS["MAX_TIME"]):
        action = agent.act(state[-1])  # action choice from state
        z.append(np.array(agent.action_choices[action]))  # valve position
        terminated, next_state = environment.get_next_state(
            z[-1], state[-1], t)  # next state given the action
        reward = sum_rewards(next_state, terminated, get_reward)

        # Store data
        episode_reward.append(reward)
        state.append(next_state)
        # At t == 0 the index -1 wraps to the last flow sample; that first
        # row is excluded by the d[1:-1] slice used for plotting below.
        d.append([
            environment.tanks[i].dist.flow[t - 1] + environment.q_inn[i]
            for i in range(n_tanks)
        ])
        h.append([np.array(next_state[i][0]) for i in range(n_tanks)])

        if environment.show_rendering:
            environment.render(z[-1])
        if True in terminated:
            break
        if not environment.running:
            break

    colors = [
        "peru",
        "firebrick",
        "darkslategray",
        "darkviolet",
        "mediumseagreen",
        "darkcyan",
    ]
    print(f"reward: {np.sum(episode_reward)}")

    h = np.array(h) * 10
    d = np.array(d)
    z = np.array(z)

    tanks_per_figure = 3
    for first in range(0, n_tanks, tanks_per_figure):
        tank_ids = range(first, min(first + tanks_per_figure, n_tanks))
        _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)
        # (axis, data slice, y label, y limits) for the three stacked plots.
        panels = (
            (ax1, h[1:-1], "Level", (2.5, 7.5)),
            (ax2, z[1:], "Valve", (-0.01, 1.01)),
            (ax3, d[1:-1], "Disturbance", (-0.01, 4)),
        )
        for ax, series, ylabel, ylim in panels:
            for i in tank_ids:
                ax.plot(
                    series[:, i],
                    color=colors[i % len(colors)],  # cycle past six tanks
                    label="Tank {}".format(i + 1),
                )
            ax.set_ylabel(ylabel)
            ax.legend(loc="upper left")
            ax.set_ylim(*ylim)
        plt.tight_layout()
        plt.xlabel("Time")
        plt.show()
def main():
    """Train the policy-gradient agent and plot the mean total reward.

    Each step appends one row to ``episode_reward``: the per-tank rewards
    returned by ``sum_rewards`` followed by their sum. After every episode the
    column totals are stored in ``all_rewards`` and
    ``agent.PolicyGradientReplay`` is called. Every ``MEAN_EPISODE`` episodes
    the mean of the most recent totals is printed and recorded, and the model
    is saved when the mean total reward reaches the best value seen so far
    (initially ``MAIN_PARAMS["MAX_MEAN_REWARD"]``). A ``KeyboardInterrupt``
    stops training and falls through to the summary.
    """
    # ============= Initialize variables and objects ===========#
    max_mean_reward = MAIN_PARAMS["MAX_MEAN_REWARD"]
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    mean_episode = MAIN_PARAMS["MEAN_EPISODE"]
    episodes = MAIN_PARAMS["EPISODES"]
    n_columns = environment.n_tanks + 1  # one column per tank plus the total
    all_rewards = []
    all_mean_rewards = []
    episode_lengths = []  # last time step of each episode since last summary
    episodes_started = 0

    # ================= Running episodes =================#
    try:
        for e in range(episodes):
            episodes_started = e + 1
            states, episode_reward = environment.reset()  # Reset level in tank
            for t in range(MAIN_PARAMS["MAX_TIME"]):
                z = agent.act(states[-1])  # action choice from state
                terminated, next_state = environment.get_next_state(
                    z, states[-1], t)
                states.append(next_state)
                rewards = sum_rewards(next_state, terminated, get_reward)
                rewards.append(np.sum(rewards))
                episode_reward.append(rewards)
                agent.remember(states, rewards, terminated, t)
                if environment.show_rendering:
                    environment.render(z)
                if True in terminated:
                    break

            # Collect summary of episode
            episode_reward = np.array(episode_reward)
            episode_lengths.append(t)
            all_rewards.append(
                [sum(episode_reward[:, i]) for i in range(n_columns)])

            # Print mean reward and save better models
            if e % mean_episode == 0 and e != 0:
                recent = np.array(all_rewards[-mean_episode:])
                mean_r = [np.mean(recent[:, i]) for i in range(n_columns)]
                mean_steps = int(np.mean(episode_lengths))
                all_mean_rewards.append(mean_r)
                print(
                    f"Mean {mean_episode} of {e}/{episodes} episodes "
                    f"### timestep {mean_steps + 1} "
                    f"### tot reward: {mean_r[-1]}")
                episode_lengths = []
                if mean_r[-1] >= max_mean_reward:
                    agent.save_trained_model()
                    max_mean_reward = mean_r[-1]

            agent.PolicyGradientReplay(e)
            if not environment.running:
                break
    except KeyboardInterrupt:
        pass

    print("Memory length: {}".format(len(agent.memory)))
    print("##### {} EPISODES DONE #####".format(episodes_started))
    if all_rewards:
        print("Max rewards for all episodes: {}".format(np.max(all_rewards)))
    else:
        # np.max raises ValueError on an empty sequence.
        print("No episode finished; no reward statistics available.")

    if not all_mean_rewards:
        # An empty list becomes a 1-D array, which cannot be column-indexed.
        print("No mean-reward summary was recorded; skipping plot.")
        return

    mean_curves = np.array(all_mean_rewards)
    plt.plot(mean_curves[:, -1], label="Total rewards")
    plt.ylabel("Mean rewards of last {} episodes".format(mean_episode))
    plt.legend()
    plt.show()