def __init__(self, actions, name="qlearner", alpha=0.1, gamma=0.99, epsilon=0.2, explore="uniform", anneal=False): ''' Args: actions (list): Contains strings denoting the actions. name (str): Denotes the name of the agent. alpha (float): Learning rate. gamma (float): Discount factor. epsilon (float): Exploration term. explore (str): One of {softmax, uniform}. Denotes explore policy. ''' Agent.__init__(self, name=name, actions=actions, gamma=gamma) # Set/initialize parameters and other relevant classwide data self.alpha, self.alpha_init = alpha, alpha self.epsilon, self.epsilon_init = epsilon, epsilon self.step_number = 0 self.anneal = anneal self.default_q = 0.0 self.q_func = defaultdict(lambda: self.default_q) # Choose explore type. Can also be "uniform" for \epsilon-greedy. self.explore = explore
def __init__(self, policy, name="fixed-policy"):
    '''
    Args:
        policy (func: S ---> A)
    '''
    Agent.__init__(self, name=name, actions=[])
    self.policy = policy
    self.name = name
def end_of_episode(self):
    '''
    Summary:
        Resets the agent's prior pointers.
    '''
    if self.anneal:
        self._anneal()
    Agent.reset(self)
def end_of_episode(self):
    '''
    Summary:
        Resets the agent's prior pointers.
    '''
    if self.anneal:
        self._anneal()
    self._action_history = []
    Agent.end_of_episode(self)
def reset(self):
    self.step_number = 0
    self.episode_number = 0
    if self.custom_q_init:
        self.q_func = self.custom_q_init
    else:
        self.q_func = defaultdict(lambda: defaultdict(lambda: self.default_q))
    Agent.reset(self)
def __init__(self):
    self.agent = Agent("PineApple")
    self.opponents = [Agent("") for i in range(7)]
    # self.campaigns = {}
    self.campaignOffer = None
    self.day = 0

    # Debug fields:
    self.ucs_level_requested_yesterday = -1
def __init__(self, actions, gamma=0.95, horizon=4, s_a_threshold=10):
    Agent.__init__(self, name="rmax-h" + str(horizon), actions=actions, gamma=gamma)
    self.rmax = 1.0
    self.horizon = horizon
    self.s_a_threshold = s_a_threshold
    self.reset()
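The s_a_threshold above controls when an R-Max style agent stops being optimistic about a state-action pair: until a pair has been visited that many times, its reward is assumed to be rmax. A minimal sketch of that check, where the counts dictionary and the is_known helper are hypothetical illustrations rather than attributes of the constructor above:

from collections import defaultdict

def is_known(counts, state, action, s_a_threshold):
    # Treat (s, a) as "known" only after s_a_threshold visits;
    # unknown pairs are optimistically assumed to yield rmax.
    return counts[(state, action)] >= s_a_threshold

counts = defaultdict(int)
counts[("s0", "up")] += 1
print(is_known(counts, "s0", "up", s_a_threshold=10))  # False: 1 of 10 required visits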
def __init__(self, actions, env_model, explore_param=m.sqrt(2), rollout_depth=100, num_rollouts_per_step=50, name="mcts", gamma=0.99):
    self.env_model = env_model
    self.rollout_depth = rollout_depth
    self.num_rollouts_per_step = num_rollouts_per_step
    self.value_total = defaultdict(float)
    self.explore_param = explore_param
    self.visitation_counts = defaultdict(lambda: 1)
    Agent.__init__(self, name=name, actions=actions, gamma=gamma)
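The explore_param=sqrt(2) default is the usual UCB1 exploration constant, which suggests value_total and visitation_counts feed an upper-confidence score during action selection. A sketch of that scoring rule under that assumption; the key layout ((state, action) pairs and a bare state key for the parent count) and the helper name are hypothetical, not taken from the constructor above:

import math
from collections import defaultdict

def ucb1_score(value_total, visitation_counts, state, action, explore_param=math.sqrt(2)):
    # Mean value of (state, action) plus an exploration bonus that shrinks
    # as the pair is visited more often relative to its parent state.
    mean_value = value_total[(state, action)] / visitation_counts[(state, action)]
    bonus = explore_param * math.sqrt(math.log(visitation_counts[state]) / visitation_counts[(state, action)])
    return mean_value + bonus

value_total = defaultdict(float)
visitation_counts = defaultdict(lambda: 1)
value_total[("s0", "up")] = 3.0
visitation_counts[("s0", "up")] = 5
visitation_counts["s0"] = 6
print(ucb1_score(value_total, visitation_counts, "s0", "up"))  # ~1.45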
def __init__(self, actions, name="Q-learning", alpha=0.1, gamma=0.9, epsilon=0.05, explore="uniform", anneal=False, custom_q_init=None, default_q=0): ''' Args: actions (list): Contains strings denoting the actions. name (str): Denotes the name of the agent. alpha (float): Learning rate. gamma (float): Discount factor. epsilon (float): Exploration term. explore (str): One of {softmax, uniform}. Denotes explore policy. custom_q_init (defaultdict{state, defaultdict{action, float}}): a dictionary of dictionaries storing the initial q-values. Can be used for potential shaping (Wiewiora, 2003) default_q (float): the default value to initialize every entry in the q-table with [by default, set to 0.0] ''' name_ext = "-" + explore if explore != "uniform" else "" Agent.__init__(self, name=name + name_ext, actions=actions, gamma=gamma) # Set/initialize parameters and other relevant classwide data self.alpha, self.alpha_init = alpha, alpha self.epsilon, self.epsilon_init = epsilon, epsilon self.step_number = 0 self.anneal = anneal self.default_q = default_q # 0 # 1 / (1 - self.gamma) self.explore = explore self.custom_q_init = custom_q_init self._action_history = [] # store actions taken # Q Function: if self.custom_q_init: self.q_func = self.custom_q_init else: self.q_func = defaultdict( lambda: defaultdict(lambda: self.default_q))
def __init__(self, actions, name="qlearner", alpha=0.05, gamma=0.95, epsilon=0.01, explore="softmax"): ''' Args: actions (list): Contains strings denoting the actions. name (str): Denotes the name of the agent. alpha (float): Learning rate. gamma (float): Discount factor. epsilon (float): Exploration term. explore (str): One of {softmax, uniform}. Denotes explore policy. ''' Agent.__init__(self, name=name, actions=actions, gamma=gamma) # Set/initialize parameters and other relevant classwide data self.alpha = alpha self.epsilon = epsilon # Choose explore type. Can also be "uniform" for \epsilon-greedy. self.explore = explore
def main():
    # ==============================
    # Settings
    # ==============================
    N_episodes = 200
    load_model = False  # load model
    save_model = True  # save model on last episode
    save_model_filename = os.path.join("model", "model.h5")

    info = {
        "env": {"Ny": 20, "Nx": 20},
        "agent": {"policy_mode": "epsgreedy",  # "epsgreedy", "softmax"
                  "eps": 1.0,
                  "eps_decay": 2.0 * np.log(10.0) / N_episodes},
        "brain": {"discount": 0.99, "learning_rate": 0.9},
        "memory": {},
    }

    # ==============================
    # Setup environment and agent
    # ==============================
    env = Environment(info)
    agent = Agent(env, info)
    brain = Brain(env, info)
    memory = Memory(info)

    if load_model:
        brain.load_model(save_model_filename)

    # ==============================
    # Train agent
    # ==============================
    for episode in range(N_episodes):
        iter = 0
        state = env.starting_state()
        while not env.is_terminal_state(state):
            # Pick an action by sampling action probabilities
            action, model_output, prob = agent.get_action(state, brain, env)
            # Collect reward and observe next state
            reward = env.get_reward(state, action)
            state_next = env.perform_action(state, action)
            # Append quantities to memory
            memory.append_to_memory(state, state_next, action, model_output, prob, reward)
            # Transition to next state
            state = state_next
            iter += 1

        # Print
        policy_mode = agent.agent_info["policy_mode"]
        if policy_mode == "epsgreedy":
            print("[episode {}] mode = {}, iter = {}, eps = {:.4F}, reward = {:.2F}".format(
                episode, policy_mode, iter, agent.eps_effective, sum(memory.reward_memory)))
        elif policy_mode == "softmax":
            print("[episode {}] mode = {}, iter = {}, reward = {:.2F}".format(
                episode, policy_mode, iter, sum(memory.reward_memory)))

        # Update model when episode finishes
        brain.update(memory, env)
        agent.episode += 1

        # Save model
        if save_model and (episode == N_episodes - 1):
            brain.save_model(save_model_filename)

        # Clear memory for next episode
        memory.clear_memory()
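The decay constant 2·ln(10)/N_episodes above means an exponential schedule of the form eps·exp(-eps_decay·episode) falls from 1.0 to roughly 0.01 over the run. The sketch below only illustrates that arithmetic; it assumes (this is not shown in the code above) that agent.eps_effective follows this schedule:

import numpy as np

N_episodes = 200
eps, eps_decay = 1.0, 2.0 * np.log(10.0) / N_episodes

for episode in [0, N_episodes // 2, N_episodes - 1]:
    eps_effective = eps * np.exp(-eps_decay * episode)
    print("episode {:3d}: eps_effective = {:.4f}".format(episode, eps_effective))
# episode   0: eps_effective = 1.0000
# episode 100: eps_effective = 0.1000
# episode 199: eps_effective = 0.0102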
def main():
    # =========================
    # Settings
    # =========================
    learning_mode = "SampleAveraging"

    if learning_mode == "SampleAveraging":
        from SampleAveraging_BrainClass import Brain
        N_episodes_train = 100000
        N_episodes_test = 30
        agent_info = {"name": "hunter", "epsilon": 0.5}
        env_info = {"N_global": 7}
        brain_info = {}
    elif learning_mode == "QLearning":
        from QLearning_BrainClass import Brain
        N_episodes_train = 10000
        N_episodes_test = 30
        agent_info = {"name": "hunter", "epsilon": 0.5}
        env_info = {"N_global": 7}
        brain_info = {"learning_rate": 0.8, "discount": 0.9}  # only relevant for Q-learning
    else:
        raise IOError("Error: Invalid learning mode!")

    save_video = True
    video_file = "results/hunterprey.mp4"
    convert_mp4_to_gif = True
    gif_file = "results/hunterprey.gif"

    # =========================
    # Set up environment, agent, memory and brain
    # =========================
    agent = Agent(agent_info)
    env = Environment(env_info)
    brain = Brain(env, brain_info)
    memory = Memory(env)

    # =========================
    # Train agent
    # =========================
    print("\nTraining '{}' agent on '{}' environment for {} episodes, testing for {} episodes (epsilon = {})...\n".format(
        agent.name, env.name, N_episodes_train, N_episodes_test, agent.epsilon))
    memory.reset_run_counters()  # reset run counters once only

    state_global_history_video = []
    state_target_global_history_video = []
    for episode in range(N_episodes_train + N_episodes_test):
        if episode >= N_episodes_train:
            agent.epsilon = 0  # set no exploration for test episodes
        memory.reset_episode_counters()  # reset episodic counters

        # state = position of hunter relative to prey (want to get to [0,0])
        # state_global = global position of hunter
        # state_target_global = global position of prey
        if episode == 0:
            (state, state_global, state_target_global) = env.get_random_state()
        else:
            (state, state_global, state_target_global) = env.get_random_state(set_state_global=state_global)
        env.set_state_terminal_global(state_target_global)

        state_global_history = [state_global]
        n_iter_episode = 0
        while not env.is_terminal(state):  # NOTE: terminates when hunter hits local coordinates of (0,0)
            # Get action from policy
            action = agent.get_action(state, brain, env)
            # Collect reward from environment
            reward = env.get_reward(state, action)
            # Update episode counters
            memory.update_episode_counters(state, action, reward)
            # Compute and observe next state
            state_next = env.perform_action(state, action)
            state_global_next = env.perform_action_global(state_global, action)
            # Update Q during episode (if needed)
            if "update_Q_during_episode" in utils.method_list(Brain):
                brain.update_Q_during_episode(state, action, state_next, reward)
            # Transition to next state
            state = state_next
            state_global = state_global_next
            # Track states for video
            state_global_history.append(state_global)
            # Exit program if testing fails (bad policy)
            n_iter_episode += 1
            if (episode >= N_episodes_train) and (n_iter_episode > 2000):
                raise IOError("Bad policy found! Non-terminal episode!")

        # Append for video output
        if episode >= N_episodes_train:
            state_global_history_video.append(state_global_history)
            state_target_global_history_video.append([state_target_global] * len(state_global_history))

        # Update run counters first (before updating Q)
        memory.update_run_counters()  # use episode counters to update run counters

        # Update Q after episode (if needed)
        if "update_Q_after_episode" in utils.method_list(Brain):
            brain.update_Q_after_episode(memory)

        # Give output to user on occasion
        if (episode + 1) % (N_episodes_train / 20) == 0 or (episode >= N_episodes_train):
            n_optimal = np.abs(env.ygrid_global[state_global_history[0][0]] -
                               env.ygrid_global[state_target_global[0]]) + \
                        np.abs(env.xgrid_global[state_global_history[0][1]] -
                               env.xgrid_global[state_target_global[1]])
            # =====================
            # Print text
            # =====================
            mode = "train" if (episode < N_episodes_train) else "test"
            print(" [{} episode = {}/{}] epsilon = {}, total reward = {:.1F}, n_actions = {}, n_optimal = {}, grid goal: [{},{}] -> [{},{}]".format(
                mode, episode + 1, N_episodes_train + N_episodes_test, agent.epsilon,
                memory.R_total_episode, memory.N_actions_episode, n_optimal,
                env.ygrid_global[state_global_history[0][0]],
                env.xgrid_global[state_global_history[0][1]],
                env.ygrid_global[state_target_global[0]],
                env.xgrid_global[state_target_global[1]]))

    # =====================
    # Make video animation
    # =====================
    if save_video:
        print("\nSaving file to '{}'...".format(video_file))
        plot_hunter_prey(state_global_history_video, state_target_global_history_video, env, video_file=video_file)
        if convert_mp4_to_gif:
            print("\nConverting '{}' to '{}'...".format(video_file, gif_file))
            import moviepy.editor as mp
            clip = mp.VideoFileClip(video_file)
            clip.write_gif(gif_file)
def main():
    # =========================
    # Settings
    # =========================
    learning_mode = "QLearning"  # "RewardAveraging", "QLearning"

    if learning_mode == "RewardAveraging":
        from RewardAveraging_BrainClass import Brain
        N_episodes = 100000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {}
        agent_info = {"name": "epsilon-greedy", "epsilon": 1.0, "epsilon_decay": 2.0 * np.log(10.0) / N_episodes}
    elif learning_mode == "QLearning":
        from QLearning_BrainClass import Brain
        N_episodes = 10000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {"Q_learning_rate": 0.95, "Q_discount": 1.0}  # only relevant for Q-learning
        agent_info = {"name": "epsilon-greedy", "epsilon": 1.0, "epsilon_decay": 2.0 * np.log(10.0) / N_episodes}
    else:
        raise IOError("Error: Invalid learning mode!")

    # =========================
    # Set up environment, agent, memory and brain
    # =========================
    env = Environment(env_info)  # set up environment rewards and state-transition rules
    agent = Agent(agent_info)  # set up epsilon-greedy agent
    brain = Brain(env, brain_info)  # stores and updates Q(s,a) and policy(s)
    memory = Memory(env)  # keeps track of run and episode (s,a) histories

    # =========================
    # Train agent
    # =========================
    print("\nTraining '{}' agent on '{}' environment for {} episodes using '{}' learning mode...\n".format(
        agent.name, env.name, N_episodes, learning_mode))
    memory.reset_run_counters()  # reset run counters once only
    for episode in range(N_episodes):
        memory.reset_episode_counters()  # reset episodic counters
        state = env.starting_state()  # starting state
        while not env.is_terminal(state):
            # Get action from policy
            action = agent.get_action(state, brain, env)
            # Collect reward from environment
            reward = env.get_reward(state, action)
            # Update episode counters
            memory.update_episode_counters(state, action, reward)
            # Compute and observe next state
            state_next = env.perform_action(state, action)
            # Update Q during episode (if needed)
            if "update_Q_during_episode" in utils.method_list(Brain):
                brain.update_Q_during_episode(state, action, state_next, reward)
            # Transition to next state
            state = state_next

        # Update run counters first (before updating Q)
        memory.update_run_counters()  # use episode counters to update run counters
        agent.episode += 1

        # Update Q after episode (if needed)
        if "update_Q_after_episode" in utils.method_list(Brain):
            brain.update_Q_after_episode(memory)

        # Print
        if (episode + 1) % (N_episodes / 20) == 0:
            print(" episode = {}/{}, epsilon = {:.3F}, reward = {:.1F}, n_actions = {}".format(
                episode + 1, N_episodes, agent.epsilon_effective, memory.R_total_episode, memory.N_actions_episode))

    # =======================
    # Print final policy
    # =======================
    print("\nFinal policy:\n")
    print(brain.compute_policy(env))
    print("")
    for (key, val) in sorted(env.action_dict.items(), key=operator.itemgetter(1)):
        print(" action['{}'] = {}".format(key, val))
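The loop above defers the actual learning rule to Brain.update_Q_during_episode, whose internals are not shown here. As a hedged illustration only, a standalone tabular update consistent with the Q_learning_rate and Q_discount settings could look like this (function and variable names are hypothetical, not the Brain API):

import numpy as np

def update_Q_during_episode(Q, state, action, state_next, reward, learning_rate=0.95, discount=1.0):
    # Standard tabular Q-learning target: r + gamma * max_a' Q(s', a').
    target = reward + discount * np.max(Q[state_next])
    Q[state][action] += learning_rate * (target - Q[state][action])

# Toy usage on a 2-state, 2-action table.
Q = np.zeros((2, 2))
update_Q_during_episode(Q, state=0, action=1, state_next=1, reward=1.0)
print(Q)  # Q[0, 1] becomes 0.95 after one update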
def __init__(self, actions, name=""): name = "random" if name is "" else name Agent.__init__(self, name=name, actions=actions)
def __init__(self, actions):
    Agent.__init__(self, name="random", actions=actions)
def reset(self):
    self.step_number = 0
    self.q_func = defaultdict(lambda: self.default_q)
    Agent.reset(self)