import keyboard
import numpy as np

# Environment, Agent and the upper-case constants (BATCH_SIZE, EPISODES,
# MAX_TIME, LIVE_REWARD_PLOT, SAVE_ANN_MODEL) are assumed to come from the
# project's own environment, agent and parameter modules.


def main():
    # ============= Initialize variables ===========#
    environment = Environment()
    agent = Agent()

    # ================= Running episodes =================#
    all_rewards = []
    batch_size = BATCH_SIZE

    for e in range(EPISODES):
        # Reset level in tank
        state, action, next_state, episode_reward = environment.reset()

        # Running through states in the episode
        for t in range(MAX_TIME):
            action = agent.act(state)  # epsilon-greedy choice of action index
            z = agent.action_choices[action]  # corresponding valve opening
            terminated, next_state = environment.get_next_state(z, state)
            reward = environment.get_reward(next_state, terminated, t)
            agent.remember(state, action, next_state, reward, terminated)
            episode_reward += reward
            state = next_state  # advance to the new state before acting again
            if terminated:
                break
            if environment.show_rendering:
                environment.render(z, next_state[-1])
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)  # train on a minibatch of transitions
            if keyboard.is_pressed("ctrl+c"):
                break

        # agent.decay_exploration()
        all_rewards.append(episode_reward)
        if keyboard.is_pressed("ctrl+c"):
            break

        # Live plot rewards
        if LIVE_REWARD_PLOT:
            environment.plot(all_rewards, agent.epsilon)
        if not environment.running:
            break

    print("##### {} EPISODES DONE #####".format(e + 1))
    print("Max rewards for all episodes: {}".format(np.max(all_rewards)))
    print("Mean rewards for the last 10 episodes: {}".format(
        np.mean(all_rewards[-10:])))
    if SAVE_ANN_MODEL:
        print("ANN_Model was saved")
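# ========== Sketch: minimal replay-buffer agent (illustrative) ==========#
# A minimal sketch of the kind of agent the loop above assumes, not the
# project's actual Agent: act() is epsilon-greedy over a small Keras Q-network
# and replay() fits the network on a random minibatch of remembered
# transitions (standard DQN). The layer sizes, gamma, epsilon and the
# (1, state_size) state shape are illustrative assumptions.
import random
from collections import deque

import numpy as np
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam


class SketchAgent:
    def __init__(self, state_size=3, action_size=10):
        self.action_choices = np.linspace(0, 1, action_size)  # valve openings z
        self.memory = deque(maxlen=2000)  # replay buffer of transitions
        self.gamma = 0.95  # discount factor (assumed)
        self.epsilon = 1.0  # exploration rate (assumed)
        self.model = Sequential([
            Input(shape=(state_size,)),
            Dense(24, activation="relu"),
            Dense(24, activation="relu"),
            Dense(action_size, activation="linear"),
        ])
        self.model.compile(loss="mse", optimizer=Adam(learning_rate=0.001))

    def act(self, state):
        # With probability epsilon pick a random action, otherwise the greedy one
        if np.random.rand() <= self.epsilon:
            return random.randrange(len(self.action_choices))
        return int(np.argmax(self.model.predict(state, verbose=0)[0]))

    def remember(self, state, action, next_state, reward, terminated):
        self.memory.append((state, action, next_state, reward, terminated))

    def replay(self, batch_size):
        # Standard DQN update: move Q(s, a) towards r + gamma * max_a' Q(s', a')
        minibatch = random.sample(self.memory, batch_size)
        for state, action, next_state, reward, terminated in minibatch:
            target = reward
            if not terminated:
                target += self.gamma * np.amax(
                    self.model.predict(next_state, verbose=0)[0])
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
# ========================================================================#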
import keyboard
import matplotlib.pyplot as plt
import numpy as np

# Environment, Agent, sum_rewards, get_reward and the parameter dictionaries
# (TANK_PARAMS, TANK_DIST, MAIN_PARAMS, AGENT_PARAMS) are assumed to come
# from the project's own modules.


def main():
    # ============= Initialize variables and objects ===========#
    max_mean_reward = 50 * len(TANK_PARAMS)
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    mean_episode = MAIN_PARAMS["MEAN_EPISODE"]
    episodes = MAIN_PARAMS["EPISODES"]
    all_rewards = []
    all_mean_rewards = []

    # ================= Running episodes =================#
    try:
        for e in range(episodes):
            states, episode_reward = environment.reset()  # Reset level in tank
            for t in range(MAIN_PARAMS["MAX_TIME"]):
                actions = agent.act(states[-1])  # get action choice from state
                z = agent.get_z(actions)
                # Calculate next state with the chosen action
                terminated, next_state = environment.get_next_state(
                    z, states[-1], t)
                # Get reward from the transition to the next state
                rewards = sum_rewards(next_state, terminated, get_reward)

                # Store data
                episode_reward.append(np.sum(rewards))
                states.append(next_state)
                agent.remember(states, rewards, terminated, t)

                if environment.show_rendering:
                    environment.render(z)
                if True in terminated:
                    break

            all_rewards.append(np.sum(np.array(episode_reward)))

            # Print mean reward and save improved models
            if e % mean_episode == 0 and e != 0:
                mean_reward = np.mean(all_rewards[-mean_episode:])
                all_mean_rewards.append(mean_reward)
                print(
                    "{} of {}/{} episodes "
                    "reward: {} exp_1: {} exp_2: {}".format(
                        mean_episode,
                        e,
                        episodes,
                        round(mean_reward, 2),
                        round(agent.epsilon[0], 2),
                        round(agent.epsilon[1], 2),
                    )
                )
                if agent.save_model_bool:
                    max_mean_reward = agent.save_model(
                        mean_reward, max_mean_reward)

            # Train model
            if agent.is_ready():
                agent.Qreplay(e)

            if keyboard.is_pressed("ctrl+x"):
                break

            if environment.live_plot:
                environment.plot(all_rewards, agent.epsilon)
            if not environment.running:
                break
            # if agent.epsilon <= agent.epsilon_min:
            #     break
    except KeyboardInterrupt:
        pass

    print("Memory length: {}".format(len(agent.memory)))
    print("##### {} EPISODES DONE #####".format(e + 1))
    print("Max rewards for all episodes: {}".format(np.max(all_rewards)))

    # Plot the running mean reward after training
    plt.ioff()
    plt.clf()
    x_range = np.arange(0, e - e % mean_episode, mean_episode)
    plt.plot(x_range, all_mean_rewards)
    plt.ylabel("Mean rewards of last {} episodes".format(mean_episode))
    plt.show()
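# ========== Sketch: a possible sum_rewards helper (illustrative) ==========#
# sum_rewards and get_reward are used but not defined in the listing above;
# they presumably come from the project's reward module. A minimal sketch,
# assuming get_reward(level, terminated) scores one tank and that next_state
# and terminated hold one entry per tank:
def sum_rewards(next_state, terminated, get_reward_function):
    # Score each tank individually and return the list of per-tank rewards
    return [
        get_reward_function(next_state[i], terminated[i])
        for i in range(len(next_state))
    ]
# Returning a per-tank list (rather than a single scalar) matches the calls
# above: agent.remember(...) receives the individual rewards while the episode
# total is taken as np.sum(rewards).
# ===========================================================================#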