def main():
    # ==============================
    # Settings
    # ==============================
    N_episodes = 200
    load_model = False  # load model
    save_model = True  # save model on last episode
    save_model_filename = os.path.join("model", "model.h5")

    info = {
        "env": {"Ny": 20,
                "Nx": 20},
        "agent": {"policy_mode": "epsgreedy", # "epsgreedy", "softmax"
                  "eps": 1.0,
                  "eps_decay": 2.0*np.log(10.0)/N_episodes},
        "brain": {"discount": 0.99,
                  "learning_rate": 0.9},
        "memory": {}
    }

    # ==============================
    # Setup environment and agent
    # ==============================
    env = Environment(info)
    agent = Agent(env, info)
    brain = Brain(env, info)
    memory = Memory(info)

    if load_model:
        brain.load_model(save_model_filename)

    # ==============================
    # Train agent
    # ==============================
    for episode in range(N_episodes):

        n_iter = 0
        state = env.starting_state()
        while not env.is_terminal_state(state):
            # Pick an action by sampling action probabilities
            action, model_output, prob = agent.get_action(state, brain, env)
            # Collect reward and observe next state
            reward = env.get_reward(state, action)
            state_next = env.perform_action(state, action)
            # Append quantities to memory
            memory.append_to_memory(state, state_next, action, model_output, prob, reward)
            # Transition to next state
            state = state_next
            n_iter += 1

        # Print
        policy_mode = agent.agent_info["policy_mode"]
        if (policy_mode == "epsgreedy"):

            print("[episode {}] mode = {}, iter = {}, eps = {:.4F}, reward = {:.2F}".format(episode, policy_mode, iter, agent.eps_effective, sum(memory.reward_memory)))

        elif (policy_mode == "softmax"):

            print("[episode {}] mode = {}, iter = {}, reward = {:.2F}".format(episode, policy_mode, iter, sum(memory.reward_memory)))

        # Update model when episode finishes
        brain.update(memory, env)
        agent.episode += 1

        # Save model
        if save_model and (episode == N_episodes-1):
            brain.save_model(save_model_filename)

        # Clear memory for next episode
        memory.clear_memory()
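
# A minimal sketch of the epsilon schedule implied by the settings above. The Agent
# class is not shown here, so the exponential-decay rule below is an assumption; with
# eps_decay = 2*ln(10)/N_episodes it would shrink epsilon by a factor of ~100 over the
# run, since exp(-2*ln(10)) = 0.01.
import numpy as np

N_episodes = 200
eps, eps_decay = 1.0, 2.0 * np.log(10.0) / N_episodes
for episode in (0, N_episodes // 2, N_episodes - 1):
    eps_effective = eps * np.exp(-eps_decay * episode)  # assumed decay rule
    print("episode {:3d}: eps_effective = {:.4f}".format(episode, eps_effective))
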
def main():
    # =========================
    # Settings
    # =========================
    learning_mode = "QLearning"  # "RewardAveraging", "QLearning"

    if learning_mode == "RewardAveraging":

        from RewardAveraging_BrainClass import Brain
        N_episodes = 100000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {}
        agent_info = {"name": "epsilon-greedy", "epsilon": 1.0, "epsilon_decay": 2.0 * np.log(10.0) / N_episodes}

    elif learning_mode == "QLearning":

        from QLearning_BrainClass import Brain
        N_episodes = 10000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {"Q_learning_rate": 0.95, "Q_discount": 1.0}  # only relevant for Q-learning
        agent_info = {"name": "epsilon-greedy", "epsilon": 1.0, "epsilon_decay": 2.0 * np.log(10.0) / N_episodes}

    else:
        raise IOError("Error: Invalid learning mode!")

    # =========================
    # Set up environment, agent, memory and brain
    # =========================
    env = Environment(env_info)  # set up environment rewards and state-transition rules
    agent = Agent(agent_info)  # set up epsilon-greedy agent
    brain = Brain(env, brain_info)  # stores and updates Q(s,a) and policy(s)
    memory = Memory(env)  # keeps track of run and episode (s,a) histories

    # =========================
    # Train agent
    # =========================
    print("\nTraining '{}' agent on '{}' environment for {} episodes using '{}' learning mode...\n".format(agent.name, env.name, N_episodes, learning_mode, agent.epsilon))

    memory.reset_run_counters()  # reset run counters once only
    for episode in range(N_episodes):
        memory.reset_episode_counters()  # reset episodic counters
        state = env.starting_state()  # starting state
        while not env.is_terminal(state):
            # Get action from policy
            action = agent.get_action(state, brain, env)  # get action from policy
            # Collect reward from environment
            reward = env.get_reward(state, action)  # get reward
            # Update episode counters
            memory.update_episode_counters(state, action, reward)  # update our episodic counters
            # Compute and observe next state
            state_next = env.perform_action(state, action)
            # Update Q during episode (if needed)
            if "update_Q_during_episode" in utils.method_list(Brain):
                brain.update_Q_during_episode(state, action, state_next, reward)
            # Transition to next state
            state = state_next

        # Update run counters first (before updating Q)
        memory.update_run_counters()  # use episode counters to update run counters
        agent.episode += 1

        # Update Q after episode (if needed)
        if "update_Q_after_episode" in utils.method_list(Brain):
            brain.update_Q_after_episode(memory)

        # Print
        if (episode + 1) % (N_episodes // 20) == 0:
            print(" episode = {}/{}, epsilon = {:.3F}, reward = {:.1F}, n_actions = {}".format(episode + 1, N_episodes, agent.epsilon_effective, memory.R_total_episode, memory.N_actions_episode))

    # =======================
    # Print final policy
    # =======================
    print("\nFinal policy:\n")
    print(brain.compute_policy(env))
    print("")
    for (key, val) in sorted(env.action_dict.items(), key=operator.itemgetter(1)):
        print(" action['{}'] = {}".format(key, val))
def main():
    # =========================
    # Settings
    # =========================
    learning_mode = "SampleAveraging"

    if learning_mode == "SampleAveraging":

        from SampleAveraging_BrainClass import Brain
        N_episodes_train = 100000
        N_episodes_test = 30
        agent_info = {"name": "hunter", "epsilon": 0.5}
        env_info = {"N_global": 7}
        brain_info = {}

    elif learning_mode == "QLearning":

        from QLearning_BrainClass import Brain
        N_episodes_train = 10000
        N_episodes_test = 30
        agent_info = {"name": "hunter", "epsilon": 0.5}
        env_info = {"N_global": 7}
        brain_info = {
            "learning_rate": 0.8,
            "discount": 0.9
        }  # only relevant for Q-learning

    else:
        raise IOError("Error: Invalid learning mode!")

    save_video = True
    video_file = "results/hunterprey.mp4"
    convert_mp4_to_gif = True
    gif_file = "results/hunterprey.gif"

    # =========================
    # Set up environment, agent, memory and brain
    # =========================
    agent = Agent(agent_info)
    env = Environment(env_info)
    brain = Brain(env, brain_info)
    memory = Memory(env)

    # =========================
    # Train agent
    # =========================
    print(
        "\nTraining '{}' agent on '{}' environment for {} episodes, testing for {} episodes (epsilon = {})...\n"
        .format(agent.name, env.name, N_episodes_train, N_episodes_test,
                agent.epsilon))

    memory.reset_run_counters()  # reset run counters once only
    state_global_history_video = []
    state_target_global_history_video = []
    for episode in range(N_episodes_train + N_episodes_test):
        if (episode >= N_episodes_train):
            agent.epsilon = 0  # set no exploration for test episodes
        memory.reset_episode_counters()  # reset episodic counters

        # state = position of hunter relative to prey (want to get to [0,0])
        # state_global = global position of hunter
        # state_target_global = global position of prey
        if episode == 0:
            (state, state_global, state_target_global) = env.get_random_state()
        else:
            (state, state_global, state_target_global) = env.get_random_state(
                set_state_global=state_global)
        env.set_state_terminal_global(state_target_global)

        state_global_history = [state_global]
        n_iter_episode = 0
        while not env.is_terminal(state):  # NOTE: terminates when hunter hits local coordinates (0, 0)
            # Get action from policy
            action = agent.get_action(state, brain, env)  # get action from policy
            # Collect reward from environment
            reward = env.get_reward(state, action)  # get reward
            # Update episode counters
            memory.update_episode_counters(state, action, reward)  # update our episodic counters
            # Compute and observe next state
            state_next = env.perform_action(state, action)
            state_global_next = env.perform_action_global(state_global, action)
            # Update Q during episode (if needed)
            if "update_Q_during_episode" in utils.method_list(Brain):
                brain.update_Q_during_episode(state, action, state_next, reward)
            # Transition to next state
            state = state_next
            state_global = state_global_next
            # Track states for video
            state_global_history.append(state_global)
            # Exit program if testing fails (bad policy)
            n_iter_episode += 1
            if (episode >= N_episodes_train) and (n_iter_episode > 2000):
                raise IOError("Bad policy found! Non-terminal episode!")

        # Append for video output
        if episode >= N_episodes_train:
            state_global_history_video.append(state_global_history)
            state_target_global_history_video.append([state_target_global] * len(state_global_history))

        # Update run counters first (before updating Q)
        memory.update_run_counters()  # use episode counters to update run counters

        # Update Q after episode (if needed)
        if "update_Q_after_episode" in utils.method_list(Brain):
            brain.update_Q_after_episode(memory)

        # Give output to user on occasion
        if (episode + 1) % (N_episodes_train // 20) == 0 or (episode >= N_episodes_train):
            n_optimal = (np.abs(env.ygrid_global[state_global_history[0][0]] - env.ygrid_global[state_target_global[0]]) +
                         np.abs(env.xgrid_global[state_global_history[0][1]] - env.xgrid_global[state_target_global[1]]))

            # =====================
            # Print text
            # =====================
            mode = "train" if (episode < N_episodes_train) else "test"
            print(
                " [{} episode = {}/{}] epsilon = {}, total reward = {:.1F}, n_actions = {}, n_optimal = {}, grid goal: [{},{}] -> [{},{}]"
                .format(mode, episode + 1, N_episodes_train + N_episodes_test,
                        agent.epsilon, memory.R_total_episode,
                        memory.N_actions_episode, n_optimal,
                        env.ygrid_global[state_global_history[0][0]],
                        env.xgrid_global[state_global_history[0][1]],
                        env.ygrid_global[state_target_global[0]],
                        env.xgrid_global[state_target_global[1]]))

    # =====================
    # Make video animation
    # =====================
    if save_video:
        print("\nSaving file to '{}'...".format(video_file))
        plot_hunter_prey(state_global_history_video,
                         state_target_global_history_video,
                         env,
                         video_file=video_file)

        if convert_mp4_to_gif:
            print("\nConverting '{}' to '{}'...".format(video_file, gif_file))
            import moviepy.editor as mp
            clip = mp.VideoFileClip(video_file)
            clip.write_gif(gif_file)
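
# Hypothetical illustration of the relative-coordinate trick used in the hunter-prey
# loop above: the learned "state" is the hunter's position relative to the prey, so the
# terminal state (0, 0) means the hunter has reached the prey. The Environment class is
# not shown here, so relative_state below is an assumed helper, not its actual API.
def relative_state(hunter_global, prey_global):
    return (hunter_global[0] - prey_global[0], hunter_global[1] - prey_global[1])

hunter, prey = (2, 5), (4, 1)
state = relative_state(hunter, prey)
print(state)                          # (-2, 4): prey not yet caught
print(abs(state[0]) + abs(state[1]))  # 6: Manhattan distance, i.e. n_optimal above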