Example #1
    def __init__(self,
                 actions,
                 name="qlearner",
                 alpha=0.1,
                 gamma=0.99,
                 epsilon=0.2,
                 explore="uniform",
                 anneal=False):
        '''
        Args:
            actions (list): Contains strings denoting the actions.
            name (str): Denotes the name of the agent.
            alpha (float): Learning rate.
            gamma (float): Discount factor.
            epsilon (float): Exploration term.
            explore (str): One of {softmax, uniform}. Denotes explore policy.
            anneal (bool): If True, anneal (decay) alpha and epsilon over time.
        '''
        Agent.__init__(self, name=name, actions=actions, gamma=gamma)

        # Set/initialize parameters and other relevant classwide data
        self.alpha, self.alpha_init = alpha, alpha
        self.epsilon, self.epsilon_init = epsilon, epsilon
        self.step_number = 0
        self.anneal = anneal
        self.default_q = 0.0
        self.q_func = defaultdict(lambda: self.default_q)

        # Choose explore type: "uniform" gives \epsilon-greedy, "softmax" gives Boltzmann exploration.
        self.explore = explore
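A minimal sketch of how an agent configured this way might select actions under the "uniform" (epsilon-greedy) setting; the helper name, the assumption that the base Agent stores self.actions, and the (state, action) keying of the flat q_func are illustrative, not part of the original class:

import random

def epsilon_greedy_action(agent, state):
    # Hypothetical helper: explore uniformly with probability epsilon,
    # otherwise act greedily with respect to the current Q estimates.
    # Assumes q_func is keyed by (state, action) tuples, matching the
    # flat defaultdict built in __init__ above.
    if random.random() < agent.epsilon:
        return random.choice(agent.actions)
    return max(agent.actions, key=lambda a: agent.q_func[(state, a)])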
Example #2
 def __init__(self, policy, name="fixed-policy"):
     '''
     Args:
         policy (func: S ---> A)
     '''
     Agent.__init__(self, name=name, actions=[])
     self.policy = policy
     self.name = name
Example #3
    def end_of_episode(self):
        '''
        Summary:
            Resets the agent's prior pointers.
        '''
        if self.anneal:
            self._anneal()
        Agent.reset(self)
Example #4
    def end_of_episode(self):
        '''
        Summary:
            Resets the agent's prior pointers.
        '''
        if self.anneal:
            self._anneal()
        self._action_history = []
        Agent.end_of_episode(self)
Example #5
    def reset(self):
        self.step_number = 0
        self.episode_number = 0
        if self.custom_q_init:
            self.q_func = self.custom_q_init
        else:
            self.q_func = defaultdict(
                lambda: defaultdict(lambda: self.default_q))
        Agent.reset(self)
Example #6
    def __init__(self):
        self.agent = Agent("PineApple")
        self.opponents = [Agent("") for i in range(7)]
        #        self.campaigns = {}
        self.campaignOffer = None
        self.day = 0

        #debug fields:
        self.ucs_level_requested_yesterday = -1
Example #7
    def __init__(self, actions, gamma=0.95, horizon=4, s_a_threshold=10):
        Agent.__init__(self,
                       name="rmax-h" + str(horizon),
                       actions=actions,
                       gamma=gamma)
        self.rmax = 1.0
        self.horizon = horizon
        self.s_a_threshold = s_a_threshold
        self.reset()
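For context, a sketch of the role s_a_threshold typically plays in R-Max-style agents; the count-table name is hypothetical, since the reset() that would build it is not shown:

def is_known(agent, state, action):
    # A (state, action) pair counts as "known" once it has been tried at
    # least s_a_threshold times; unknown pairs are planned with the
    # optimistic reward agent.rmax instead of the empirical estimate.
    return agent.s_a_counts[(state, action)] >= agent.s_a_threshold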
Example #8
    def __init__(self,
                 actions,
                 env_model,
                 explore_param=m.sqrt(2),
                 rollout_depth=100,
                 num_rollouts_per_step=50,
                 name="mcts",
                 gamma=0.99):
        self.env_model = env_model
        self.rollout_depth = rollout_depth
        self.num_rollouts_per_step = num_rollouts_per_step
        self.value_total = defaultdict(float)
        self.explore_param = explore_param
        self.visitation_counts = defaultdict(lambda: 1)

        Agent.__init__(self, name=name, actions=actions, gamma=gamma)
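A sketch of how explore_param, value_total, and visitation_counts typically combine in a UCT/UCB1 node score; the (state, action) keying and the parent_visits argument are assumptions, not the original selection code:

import math as m

def ucb_score(agent, state, action, parent_visits):
    # Average rollout value plus an exploration bonus; explore_param = sqrt(2)
    # recovers the standard UCB1 constant. visitation_counts defaults to 1,
    # so there is no division by zero for unvisited pairs.
    n = agent.visitation_counts[(state, action)]
    avg_value = agent.value_total[(state, action)] / n
    return avg_value + agent.explore_param * m.sqrt(m.log(parent_visits) / n)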
Example #9
    def __init__(self,
                 actions,
                 name="Q-learning",
                 alpha=0.1,
                 gamma=0.9,
                 epsilon=0.05,
                 explore="uniform",
                 anneal=False,
                 custom_q_init=None,
                 default_q=0):
        '''
        Args:
            actions (list): Contains strings denoting the actions.
            name (str): Denotes the name of the agent.
            alpha (float): Learning rate.
            gamma (float): Discount factor.
            epsilon (float): Exploration term.
            explore (str): One of {softmax, uniform}. Denotes explore policy.
            anneal (bool): If True, anneal (decay) alpha and epsilon over time.
            custom_q_init (defaultdict{state, defaultdict{action, float}}): a dictionary of dictionaries storing the initial q-values. Can be used for potential shaping (Wiewiora, 2003)
            default_q (float): the default value to initialize every entry in the q-table with [by default, set to 0.0]
        '''
        name_ext = "-" + explore if explore != "uniform" else ""
        Agent.__init__(self,
                       name=name + name_ext,
                       actions=actions,
                       gamma=gamma)

        # Set/initialize parameters and other relevant classwide data
        self.alpha, self.alpha_init = alpha, alpha
        self.epsilon, self.epsilon_init = epsilon, epsilon
        self.step_number = 0
        self.anneal = anneal
        self.default_q = default_q  # 0 # 1 / (1 - self.gamma)
        self.explore = explore
        self.custom_q_init = custom_q_init
        self._action_history = []  # store actions taken

        # Q Function:
        if self.custom_q_init:
            self.q_func = self.custom_q_init
        else:
            self.q_func = defaultdict(
                lambda: defaultdict(lambda: self.default_q))
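A minimal sketch of the tabular backup this initializer supports; the function is illustrative (the real update method is not shown), but the q_func[state][action] indexing matches the nested defaultdict built above:

def q_update(agent, state, action, reward, next_state):
    # Standard Q-learning backup:
    #   Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (r + gamma * max_a' Q(s',a'))
    best_next_q = max(agent.q_func[next_state][a] for a in agent.actions)
    old_q = agent.q_func[state][action]
    agent.q_func[state][action] = (1 - agent.alpha) * old_q + \
        agent.alpha * (reward + agent.gamma * best_next_q)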
Example #10
    def __init__(self,
                 actions,
                 name="qlearner",
                 alpha=0.05,
                 gamma=0.95,
                 epsilon=0.01,
                 explore="softmax"):
        '''
        Args:
            actions (list): Contains strings denoting the actions.
            name (str): Denotes the name of the agent.
            alpha (float): Learning rate.
            gamma (float): Discount factor.
            epsilon (float): Exploration term.
            explore (str): One of {softmax, uniform}. Denotes explore policy.
        '''
        Agent.__init__(self, name=name, actions=actions, gamma=gamma)

        # Set/initialize parameters and other relevant classwide data
        self.alpha = alpha
        self.epsilon = epsilon

        # Choose explore type. Can also be "uniform" for \epsilon-greedy.
        self.explore = explore
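A sketch of Boltzmann ("softmax") exploration matching the explore="softmax" default above; the q_func attribute and its (state, action) keying are assumptions, since this __init__ does not show the Q-table:

import numpy as np

def softmax_action(agent, state):
    # Sample an action with probability proportional to exp(Q(s,a)),
    # subtracting the max Q-value for numerical stability.
    qs = np.array([agent.q_func[(state, a)] for a in agent.actions])
    probs = np.exp(qs - qs.max())
    probs /= probs.sum()
    return np.random.choice(agent.actions, p=probs)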
Example #11
def main():
    # ==============================
    # Settings
    # ==============================
    N_episodes = 200
    load_model = False  # load model
    save_model = True  # save model on last episode
    save_model_filename = os.path.join("model", "model.h5")

    info = {
        "env": {"Ny": 20,
                "Nx": 20},
        "agent": {"policy_mode": "epsgreedy", # "epsgreedy", "softmax"
                  "eps": 1.0,
                  "eps_decay": 2.0*np.log(10.0)/N_episodes},
        "brain": {"discount": 0.99,
                  "learning_rate": 0.9},
        "memory": {}
    }

    # ==============================
    # Setup environment and agent
    # ==============================
    env = Environment(info)
    agent = Agent(env, info)
    brain = Brain(env, info)
    memory = Memory(info)

    if load_model:
        brain.load_model(save_model_filename)

    # ==============================
    # Train agent
    # ==============================
    for episode in range(N_episodes):

        iter = 0
        state = env.starting_state()
        while not env.is_terminal_state(state):
            # Pick an action by sampling action probabilities
            action, model_output, prob = agent.get_action(state, brain, env)
            # Collect reward and observe next state
            reward = env.get_reward(state, action)
            state_next = env.perform_action(state, action)
            # Append quantities to memory
            memory.append_to_memory(state, state_next, action, model_output, prob, reward)
            # Transition to next state
            state = state_next
            iter += 1

        # Print
        policy_mode = agent.agent_info["policy_mode"]
        if (policy_mode == "epsgreedy"):

            print("[episode {}] mode = {}, iter = {}, eps = {:.4F}, reward = {:.2F}".format(episode, policy_mode, iter, agent.eps_effective, sum(memory.reward_memory)))

        elif (policy_mode == "softmax"):

            print("[episode {}] mode = {}, iter = {}, reward = {:.2F}".format(episode, policy_mode, iter, sum(memory.reward_memory)))

        # Update model when episode finishes
        brain.update(memory, env)
        agent.episode += 1

        # Save model
        if save_model and (episode == N_episodes-1):
            brain.save_model(save_model_filename)

        # Clear memory for next episode
        memory.clear_memory()
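The eps_decay value above is consistent with an exponential schedule; a sketch of what the printed eps_effective could correspond to (the decay law is an assumption, only the attribute name appears in the code):

import numpy as np

def effective_epsilon(eps0, eps_decay, episode):
    # With eps_decay = 2 * ln(10) / N_episodes, epsilon decays from eps0
    # to eps0 / 100 over the course of training.
    return eps0 * np.exp(-eps_decay * episode)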
Example #12
def main():
    # =========================
    # Settings
    # =========================
    learning_mode = "SampleAveraging"

    if learning_mode == "SampleAveraging":

        from SampleAveraging_BrainClass import Brain
        N_episodes_train = 100000
        N_episodes_test = 30
        agent_info = {"name": "hunter", "epsilon": 0.5}
        env_info = {"N_global": 7}
        brain_info = {}

    elif learning_mode == "QLearning":

        from QLearning_BrainClass import Brain
        N_episodes_train = 10000
        N_episodes_test = 30
        agent_info = {"name": "hunter", "epsilon": 0.5}
        env_info = {"N_global": 7}
        brain_info = {
            "learning_rate": 0.8,
            "discount": 0.9
        }  # only relevant for Q-learning

    else:
        raise IOError("Error: Invalid learning mode!")

    save_video = True
    video_file = "results/hunterprey.mp4"
    convert_mp4_to_gif = True
    gif_file = "results/hunterprey.gif"

    # =========================
    # Set up environment, agent, memory and brain
    # =========================
    agent = Agent(agent_info)
    env = Environment(env_info)
    brain = Brain(env, brain_info)
    memory = Memory(env)

    # =========================
    # Train agent
    # =========================
    print(
        "\nTraining '{}' agent on '{}' environment for {} episodes, testing for {} episodes (epsilon = {})...\n"
        .format(agent.name, env.name, N_episodes_train, N_episodes_test,
                agent.epsilon))

    memory.reset_run_counters()  # reset run counters once only
    state_global_history_video = []
    state_target_global_history_video = []
    for episode in range(N_episodes_train + N_episodes_test):
        if (episode >= N_episodes_train):
            agent.epsilon = 0  # set no exploration for test episodes
        memory.reset_episode_counters()  # reset episodic counters

        # state = position of hunter relative to prey (want to get to [0,0])
        # state_global = global position of hunter
        # state_target_global = global position of prey
        if episode == 0:
            (state, state_global, state_target_global) = env.get_random_state()
        else:
            (state, state_global, state_target_global) = env.get_random_state(
                set_state_global=state_global)
        env.set_state_terminal_global(state_target_global)

        state_global_history = [state_global]
        n_iter_episode = 0
        # NOTE: terminates when hunter hits local coordinates of (0,0)
        while not env.is_terminal(state):
            # Get action from policy
            action = agent.get_action(state, brain,
                                      env)  # get action from policy
            # Collect reward from environment
            reward = env.get_reward(state, action)  # get reward
            # Update episode counters
            memory.update_episode_counters(
                state, action, reward)  # update our episodic counters
            # Compute and observe next state
            state_next = env.perform_action(state, action)
            state_global_next = env.perform_action_global(state_global, action)
            # Update Q during episode (if needed)
            if "update_Q_during_episode" in utils.method_list(Brain):
                brain.update_Q_during_episode(state, action, state_next,
                                              reward)
            # Transition to next state
            state = state_next
            state_global = state_global_next
            # Track states for video
            state_global_history.append(state_global)
            # Exit program if testing fails (bad policy)
            n_iter_episode += 1
            if (episode >= N_episodes_train) and (n_iter_episode > 2000):
                raise IOError("Bad policy found! Non-terminal episode!")

        # Append for video output
        if episode >= N_episodes_train:
            state_global_history_video.append(state_global_history)
            state_target_global_history_video.append([state_target_global] *
                                                     len(state_global_history))

        # Update run counters first (before updating Q)
        memory.update_run_counters()  # use episode counters to update run counters

        # Update Q after episode (if needed)
        if "update_Q_after_episode" in utils.method_list(Brain):
            brain.update_Q_after_episode(memory)

        # Give output to user on occasion
        if (episode + 1) % (N_episodes_train / 20) == 0 or (episode >=
                                                            N_episodes_train):
            n_optimal = np.abs(
                env.ygrid_global[state_global_history[0][0]] -
                env.ygrid_global[state_target_global[0]]) + np.abs(
                    env.xgrid_global[state_global_history[0][1]] -
                    env.xgrid_global[state_target_global[1]])

            # =====================
            # Print text
            # =====================
            mode = "train" if (episode < N_episodes_train) else "test"
            print(
                " [{} episode = {}/{}] epsilon = {}, total reward = {:.1F}, n_actions = {}, n_optimal = {}, grid goal: [{},{}] -> [{},{}]"
                .format(mode, episode + 1, N_episodes_train + N_episodes_test,
                        agent.epsilon, memory.R_total_episode,
                        memory.N_actions_episode, n_optimal,
                        env.ygrid_global[state_global_history[0][0]],
                        env.xgrid_global[state_global_history[0][1]],
                        env.ygrid_global[state_target_global[0]],
                        env.xgrid_global[state_target_global[1]]))

    # =====================
    # Make video animation
    # =====================
    if save_video:
        print("\nSaving file to '{}'...".format(video_file))
        plot_hunter_prey(state_global_history_video,
                         state_target_global_history_video,
                         env,
                         video_file=video_file)

        if convert_mp4_to_gif:
            print("\nConverting '{}' to '{}'...".format(video_file, gif_file))
            import moviepy.editor as mp
            clip = mp.VideoFileClip(video_file)
            clip.write_gif(gif_file)
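The training loops above branch on utils.method_list(Brain); a plausible sketch of such a helper (an assumption about the project's utils module, not its actual code):

def method_list(cls):
    # Names of all callable attributes on a class, so a caller can check
    # whether a Brain defines hooks like update_Q_during_episode.
    return [name for name in dir(cls) if callable(getattr(cls, name))]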
Example #13
def main():
    # =========================
    # Settings
    # =========================
    learning_mode = "QLearning"  # "RewardAveraging", "QLearning"

    if learning_mode == "RewardAveraging":

        from RewardAveraging_BrainClass import Brain
        N_episodes = 100000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {}
        agent_info = {
            "name": "epsilon-greedy",
            "epsilon": 1.0,
            "epsilon_decay": 2.0 * np.log(10.0) / N_episodes
        }

    elif learning_mode == "QLearning":

        from QLearning_BrainClass import Brain
        N_episodes = 10000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {
            "Q_learning_rate": 0.95,
            "Q_discount": 1.0
        }  # only relevant for Q-learning
        agent_info = {
            "name": "epsilon-greedy",
            "epsilon": 1.0,
            "epsilon_decay": 2.0 * np.log(10.0) / N_episodes
        }

    else:
        raise IOError("Error: Invalid learning mode!")

    # =========================
    # Set up environment, agent, memory and brain
    # =========================
    env = Environment(
        env_info)  # set up environment rewards and state-transition rules
    agent = Agent(agent_info)  # set up epsilon-greedy agent
    brain = Brain(env, brain_info)  # stores and updates Q(s,a) and policy(s)
    memory = Memory(env)  # keeps track of run and episode (s,a) histories

    # =========================
    # Train agent
    # =========================
    print(
        "\nTraining '{}' agent on '{}' environment for {} episodes using '{}' learning mode (epsilon = {})...\n"
        .format(agent.name, env.name, N_episodes, learning_mode,
                agent.epsilon))

    memory.reset_run_counters()  # reset run counters once only
    for episode in range(N_episodes):
        memory.reset_episode_counters()  # reset episodic counters
        state = env.starting_state()  # starting state
        while not env.is_terminal(state):
            # Get action from policy
            action = agent.get_action(state, brain,
                                      env)  # get action from policy
            # Collect reward from environment
            reward = env.get_reward(state, action)  # get reward
            # Update episode counters
            memory.update_episode_counters(
                state, action, reward)  # update our episodic counters
            # Compute and observe next state
            state_next = env.perform_action(state, action)
            # Update Q during episode (if needed)
            if "update_Q_during_episode" in utils.method_list(Brain):
                brain.update_Q_during_episode(state, action, state_next,
                                              reward)
            # Transition to next state
            state = state_next

        # Update run counters first (before updating Q)
        memory.update_run_counters(
        )  # use episode counters to update run counters
        agent.episode += 1

        # Update Q after episode (if needed)
        if "update_Q_after_episode" in utils.method_list(Brain):
            brain.update_Q_after_episode(memory)

        # Print
        if (episode + 1) % (N_episodes / 20) == 0:
            print(
                " episode = {}/{}, epsilon = {:.3F}, reward = {:.1F}, n_actions = {}"
                .format(episode + 1, N_episodes, agent.epsilon_effective,
                        memory.R_total_episode, memory.N_actions_episode))

    # =======================
    # Print final policy
    # =======================
    print("\nFinal policy:\n")
    print(brain.compute_policy(env))
    print("")
    for (key, val) in sorted(env.action_dict.items(),
                             key=operator.itemgetter(1)):
        print(" action['{}'] = {}".format(key, val))
Example #14
def main():
    # =========================
    # Settings
    # =========================
    learning_mode = "QLearning"  # "RewardAveraging", "QLearning"

    if learning_mode == "RewardAveraging":

        from RewardAveraging_BrainClass import Brain
        N_episodes = 100000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {}
        agent_info = {"name": "epsilon-greedy", "epsilon": 1.0, "epsilon_decay": 2.0 * np.log(10.0) / N_episodes}

    elif learning_mode == "QLearning":

        from QLearning_BrainClass import Brain
        N_episodes = 10000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {"Q_learning_rate": 0.95, "Q_discount": 1.0}  # only relevant for Q-learning
        agent_info = {"name": "epsilon-greedy", "epsilon": 1.0, "epsilon_decay": 2.0 * np.log(10.0) / N_episodes}

    else:
        raise IOError("Error: Invalid learning mode!")

    # =========================
    # Set up environment, agent, memory and brain
    # =========================
    env = Environment(env_info)  # set up environment rewards and state-transition rules
    agent = Agent(agent_info)  # set up epsilon-greedy agent
    brain = Brain(env, brain_info)  # stores and updates Q(s,a) and policy(s)
    memory = Memory(env)  # keeps track of run and episode (s,a) histories

    # =========================
    # Train agent
    # =========================
    print("\nTraining '{}' agent on '{}' environment for {} episodes using '{}' learning mode...\n".format(agent.name, env.name, N_episodes, learning_mode, agent.epsilon))

    memory.reset_run_counters()  # reset run counters once only
    for episode in range(N_episodes):
        memory.reset_episode_counters()  # reset episodic counters
        state = env.starting_state()  # starting state
        while not env.is_terminal(state):
            # Get action from policy
            action = agent.get_action(state, brain, env)  # get action from policy
            # Collect reward from environment
            reward = env.get_reward(state, action)  # get reward
            # Update episode counters
            memory.update_episode_counters(state, action, reward)  # update our episodic counters
            # Compute and observe next state
            state_next = env.perform_action(state, action)
            # Update Q during episode (if needed)
            if "update_Q_during_episode" in utils.method_list(Brain):
                brain.update_Q_during_episode(state, action, state_next, reward)
            # Transition to next state
            state = state_next

        # Update run counters first (before updating Q)
        memory.update_run_counters()  # use episode counters to update run counters
        agent.episode += 1

        # Update Q after episode (if needed)
        if "update_Q_after_episode" in utils.method_list(Brain):
            brain.update_Q_after_episode(memory)

        # Print
        if (episode+1) % (N_episodes/20) == 0:
            print(" episode = {}/{}, epsilon = {:.3F}, reward = {:.1F}, n_actions = {}".format(episode + 1, N_episodes, agent.epsilon_effective, memory.R_total_episode, memory.N_actions_episode))

    # =======================
    # Print final policy
    # =======================
    print("\nFinal policy:\n")
    print(brain.compute_policy(env))
    print("")
    for (key, val) in sorted(env.action_dict.items(), key=operator.itemgetter(1)):
        print(" action['{}'] = {}".format(key, val))
Example #15
def main():
    # ==============================
    # Settings
    # ==============================
    N_episodes = 200
    load_model = False  # load model
    save_model = True  # save model on last episode
    save_model_filename = os.path.join("model", "model.h5")

    info = {
        "env": {
            "Ny": 20,
            "Nx": 20
        },
        "agent": {
            "policy_mode": "epsgreedy",  # "epsgreedy", "softmax"
            "eps": 1.0,
            "eps_decay": 2.0 * np.log(10.0) / N_episodes
        },
        "brain": {
            "discount": 0.99,
            "learning_rate": 0.9
        },
        "memory": {}
    }

    # ==============================
    # Setup environment and agent
    # ==============================
    env = Environment(info)
    agent = Agent(env, info)
    brain = Brain(env, info)
    memory = Memory(info)

    if load_model:
        brain.load_model(save_model_filename)

    # ==============================
    # Train agent
    # ==============================
    for episode in range(N_episodes):

        iter = 0
        state = env.starting_state()
        while not env.is_terminal_state(state):
            # Pick an action by sampling action probabilities
            action, model_output, prob = agent.get_action(state, brain, env)
            # Collect reward and observe next state
            reward = env.get_reward(state, action)
            state_next = env.perform_action(state, action)
            # Append quantities to memory
            memory.append_to_memory(state, state_next, action, model_output,
                                    prob, reward)
            # Transition to next state
            state = state_next
            iter += 1

        # Print
        policy_mode = agent.agent_info["policy_mode"]
        if (policy_mode == "epsgreedy"):

            print(
                "[episode {}] mode = {}, iter = {}, eps = {:.4F}, reward = {:.2F}"
                .format(episode, policy_mode, iter, agent.eps_effective,
                        sum(memory.reward_memory)))

        elif (policy_mode == "softmax"):

            print("[episode {}] mode = {}, iter = {}, reward = {:.2F}".format(
                episode, policy_mode, iter, sum(memory.reward_memory)))

        # Update model when episode finishes
        brain.update(memory, env)
        agent.episode += 1

        # Save model
        if save_model and (episode == N_episodes - 1):
            brain.save_model(save_model_filename)

        # Clear memory for next episode
        memory.clear_memory()
Example #16
 def __init__(self, actions, name=""):
 	name = "random" if name is "" else name
     Agent.__init__(self, name=name, actions=actions)
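A minimal sketch of how such a random agent might act; the function name, signature, and the assumption that the base Agent stores self.actions are illustrative:

import random

def random_act(agent, state, reward):
    # A random agent ignores state and reward and samples uniformly
    # from its action set.
    return random.choice(agent.actions)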
Example #17
    def __init__(self, actions):
        Agent.__init__(self, name="random", actions=actions)
Example #18
    def reset(self):
        self.step_number = 0
        self.q_func = defaultdict(lambda: self.default_q)
        Agent.reset(self)