import os

import numpy as np

# Project classes (assumed to live alongside this script; exact module names may differ):
# Environment, Agent, Brain, Memory.


def main():
    # ==============================
    # Settings
    # ==============================
    N_episodes = 200
    load_model = False  # load model
    save_model = True  # save model on last episode
    save_model_filename = os.path.join("model", "model.h5")

    info = {
        "env": {"Ny": 20, "Nx": 20},
        "agent": {"policy_mode": "epsgreedy",  # "epsgreedy", "softmax"
                  "eps": 1.0,
                  "eps_decay": 2.0 * np.log(10.0) / N_episodes},
        "brain": {"discount": 0.99, "learning_rate": 0.9},
        "memory": {},
    }

    # ==============================
    # Set up environment and agent
    # ==============================
    env = Environment(info)
    agent = Agent(env, info)
    brain = Brain(env, info)
    memory = Memory(info)

    if load_model:
        brain.load_model(save_model_filename)

    # ==============================
    # Train agent
    # ==============================
    for episode in range(N_episodes):

        iter = 0
        state = env.starting_state()
        while not env.is_terminal_state(state):
            # Pick an action by sampling action probabilities
            action, model_output, prob = agent.get_action(state, brain, env)
            # Collect reward and observe next state
            reward = env.get_reward(state, action)
            state_next = env.perform_action(state, action)
            # Append quantities to memory
            memory.append_to_memory(state, state_next, action, model_output, prob, reward)
            # Transition to next state
            state = state_next
            iter += 1

        # Print episode summary
        policy_mode = agent.agent_info["policy_mode"]
        if policy_mode == "epsgreedy":
            print("[episode {}] mode = {}, iter = {}, eps = {:.4F}, reward = {:.2F}".format(
                episode, policy_mode, iter, agent.eps_effective, sum(memory.reward_memory)))
        elif policy_mode == "softmax":
            print("[episode {}] mode = {}, iter = {}, reward = {:.2F}".format(
                episode, policy_mode, iter, sum(memory.reward_memory)))

        # Update model when episode finishes
        brain.update(memory, env)
        agent.episode += 1

        # Save model
        if save_model and (episode == N_episodes - 1):
            brain.save_model(save_model_filename)

        # Clear memory for next episode
        memory.clear_memory()


# Driver
if __name__ == "__main__":
    main()
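# The decay constant eps_decay = 2.0*np.log(10.0)/N_episodes above points at an
# exponential schedule that shrinks epsilon by a factor of 100 over the run.
# A minimal sketch of how agent.eps_effective could be computed under that
# assumption (the actual Agent class is not shown here and may differ):

import numpy as np


def effective_epsilon(eps0, eps_decay, episode):
    """Exponentially decayed exploration rate: eps0 * exp(-eps_decay * episode)."""
    return eps0 * np.exp(-eps_decay * episode)


# With eps_decay = 2*ln(10)/N_episodes, epsilon falls from 1.0 to ~0.01 by the last episode:
# effective_epsilon(1.0, 2.0 * np.log(10.0) / 200, 200)  # ~0.01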
import operator

import numpy as np

# Project modules (assumed to live alongside this script; exact module names may differ):
# utils, and the Environment, Agent, and Memory classes.


def main():
    # =========================
    # Settings
    # =========================
    learning_mode = "QLearning"  # "RewardAveraging", "QLearning"

    if learning_mode == "RewardAveraging":
        from RewardAveraging_BrainClass import Brain
        N_episodes = 100000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {}
        agent_info = {"name": "epsilon-greedy",
                      "epsilon": 1.0,
                      "epsilon_decay": 2.0 * np.log(10.0) / N_episodes}
    elif learning_mode == "QLearning":
        from QLearning_BrainClass import Brain
        N_episodes = 10000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {"Q_learning_rate": 0.95, "Q_discount": 1.0}  # only relevant for Q-learning
        agent_info = {"name": "epsilon-greedy",
                      "epsilon": 1.0,
                      "epsilon_decay": 2.0 * np.log(10.0) / N_episodes}
    else:
        raise IOError("Error: Invalid learning mode!")

    # =========================
    # Set up environment, agent, memory and brain
    # =========================
    env = Environment(env_info)  # environment rewards and state-transition rules
    agent = Agent(agent_info)  # epsilon-greedy agent
    brain = Brain(env, brain_info)  # stores and updates Q(s,a) and policy(s)
    memory = Memory(env)  # keeps track of run and episode (s,a) histories

    # =========================
    # Train agent
    # =========================
    print("\nTraining '{}' agent on '{}' environment for {} episodes using '{}' learning mode (epsilon = {})...\n".format(
        agent.name, env.name, N_episodes, learning_mode, agent.epsilon))

    memory.reset_run_counters()  # reset run counters once only
    for episode in range(N_episodes):
        memory.reset_episode_counters()  # reset episodic counters

        state = env.starting_state()  # starting state
        while not env.is_terminal(state):
            # Get action from policy
            action = agent.get_action(state, brain, env)
            # Collect reward from environment
            reward = env.get_reward(state, action)
            # Update episode counters
            memory.update_episode_counters(state, action, reward)
            # Compute and observe next state
            state_next = env.perform_action(state, action)
            # Update Q during episode (if needed)
            if "update_Q_during_episode" in utils.method_list(Brain):
                brain.update_Q_during_episode(state, action, state_next, reward)
            # Transition to next state
            state = state_next

        # Update run counters first (before updating Q)
        memory.update_run_counters()  # use episode counters to update run counters
        agent.episode += 1

        # Update Q after episode (if needed)
        if "update_Q_after_episode" in utils.method_list(Brain):
            brain.update_Q_after_episode(memory)

        # Print progress every 5% of the run
        if (episode + 1) % (N_episodes // 20) == 0:
            print(" episode = {}/{}, epsilon = {:.3F}, reward = {:.1F}, n_actions = {}".format(
                episode + 1, N_episodes, agent.epsilon_effective,
                memory.R_total_episode, memory.N_actions_episode))

    # =======================
    # Print final policy
    # =======================
    print("\nFinal policy:\n")
    print(brain.compute_policy(env))
    print("")
    for (key, val) in sorted(env.action_dict.items(), key=operator.itemgetter(1)):
        print(" action['{}'] = {}".format(key, val))


# Driver
if __name__ == "__main__":
    main()
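# The QLearning_BrainClass Brain above is configured with Q_learning_rate and Q_discount.
# A minimal, hypothetical sketch of the tabular Q-learning update such a Brain is expected
# to apply each step (the project's actual update_Q_during_episode may differ), assuming a
# Q table of shape (Ny, Nx, N_actions):

import numpy as np


def q_learning_update(Q, state, action, state_next, reward, learning_rate=0.95, discount=1.0):
    """Standard tabular update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))."""
    sy, sx = state
    sy_next, sx_next = state_next
    td_target = reward + discount * np.max(Q[sy_next, sx_next, :])
    Q[sy, sx, action] += learning_rate * (td_target - Q[sy, sx, action])
    return Q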
import numpy as np

# Project modules (assumed to live alongside this script; exact module names may differ):
# utils, plot_hunter_prey, and the Environment, Agent, and Memory classes.


def main():
    # =========================
    # Settings
    # =========================
    learning_mode = "SampleAveraging"  # "SampleAveraging", "QLearning"

    if learning_mode == "SampleAveraging":
        from SampleAveraging_BrainClass import Brain
        N_episodes_train = 100000
        N_episodes_test = 30
        agent_info = {"name": "hunter", "epsilon": 0.5}
        env_info = {"N_global": 7}
        brain_info = {}
    elif learning_mode == "QLearning":
        from QLearning_BrainClass import Brain
        N_episodes_train = 10000
        N_episodes_test = 30
        agent_info = {"name": "hunter", "epsilon": 0.5}
        env_info = {"N_global": 7}
        brain_info = {"learning_rate": 0.8, "discount": 0.9}  # only relevant for Q-learning
    else:
        raise IOError("Error: Invalid learning mode!")

    save_video = True
    video_file = "results/hunterprey.mp4"
    convert_mp4_to_gif = True
    gif_file = "results/hunterprey.gif"

    # =========================
    # Set up environment, agent, memory and brain
    # =========================
    agent = Agent(agent_info)
    env = Environment(env_info)
    brain = Brain(env, brain_info)
    memory = Memory(env)

    # =========================
    # Train agent
    # =========================
    print("\nTraining '{}' agent on '{}' environment for {} episodes, testing for {} episodes (epsilon = {})...\n".format(
        agent.name, env.name, N_episodes_train, N_episodes_test, agent.epsilon))

    memory.reset_run_counters()  # reset run counters once only
    state_global_history_video = []
    state_target_global_history_video = []
    for episode in range(N_episodes_train + N_episodes_test):
        if episode >= N_episodes_train:
            agent.epsilon = 0  # no exploration during test episodes
        memory.reset_episode_counters()  # reset episodic counters

        # state = position of hunter relative to prey (want to get to [0,0])
        # state_global = global position of hunter
        # state_target_global = global position of prey
        if episode == 0:
            (state, state_global, state_target_global) = env.get_random_state()
        else:
            (state, state_global, state_target_global) = env.get_random_state(set_state_global=state_global)
        env.set_state_terminal_global(state_target_global)

        state_global_history = [state_global]

        n_iter_episode = 0
        while not env.is_terminal(state):  # NOTE: terminates when hunter hits local coordinates of (0,0)
            # Get action from policy
            action = agent.get_action(state, brain, env)
            # Collect reward from environment
            reward = env.get_reward(state, action)
            # Update episode counters
            memory.update_episode_counters(state, action, reward)
            # Compute and observe next state
            state_next = env.perform_action(state, action)
            state_global_next = env.perform_action_global(state_global, action)
            # Update Q during episode (if needed)
            if "update_Q_during_episode" in utils.method_list(Brain):
                brain.update_Q_during_episode(state, action, state_next, reward)
            # Transition to next state
            state = state_next
            state_global = state_global_next
            # Track states for video
            state_global_history.append(state_global)
            # Exit program if testing fails (bad policy)
            n_iter_episode += 1
            if (episode >= N_episodes_train) and (n_iter_episode > 2000):
                raise IOError("Bad policy found! Non-terminal episode!")

        # Append for video output
        if episode >= N_episodes_train:
            state_global_history_video.append(state_global_history)
            state_target_global_history_video.append([state_target_global] * len(state_global_history))

        # Update run counters first (before updating Q)
        memory.update_run_counters()  # use episode counters to update run counters

        # Update Q after episode (if needed)
        if "update_Q_after_episode" in utils.method_list(Brain):
            brain.update_Q_after_episode(memory)

        # Give output to user on occasion
        if (episode + 1) % (N_episodes_train // 20) == 0 or (episode >= N_episodes_train):
            n_optimal = np.abs(env.ygrid_global[state_global_history[0][0]] - env.ygrid_global[state_target_global[0]]) + \
                        np.abs(env.xgrid_global[state_global_history[0][1]] - env.xgrid_global[state_target_global[1]])

            # =====================
            # Print text
            # =====================
            mode = "train" if (episode < N_episodes_train) else "test"
            print(" [{} episode = {}/{}] epsilon = {}, total reward = {:.1F}, n_actions = {}, n_optimal = {}, grid goal: [{},{}] -> [{},{}]".format(
                mode, episode + 1, N_episodes_train + N_episodes_test,
                agent.epsilon, memory.R_total_episode, memory.N_actions_episode, n_optimal,
                env.ygrid_global[state_global_history[0][0]],
                env.xgrid_global[state_global_history[0][1]],
                env.ygrid_global[state_target_global[0]],
                env.xgrid_global[state_target_global[1]]))

    # =====================
    # Make video animation
    # =====================
    if save_video:
        print("\nSaving file to '{}'...".format(video_file))
        plot_hunter_prey(state_global_history_video, state_target_global_history_video, env, video_file=video_file)
        if convert_mp4_to_gif:
            print("\nConverting '{}' to '{}'...".format(video_file, gif_file))
            import moviepy.editor as mp
            clip = mp.VideoFileClip(video_file)
            clip.write_gif(gif_file)


# Driver
if __name__ == "__main__":
    main()
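# Both tabular training loops above branch on utils.method_list(Brain) to decide whether Q
# is updated during or after an episode. A hypothetical sketch of such a helper (the
# project's actual utils module is not shown and may differ):

import inspect


def method_list(cls):
    """Return the names of the methods defined on (or inherited by) a class."""
    return [name for name, _ in inspect.getmembers(cls, predicate=inspect.isfunction)]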