Code example #1
import gym
import numpy as np

from MemoryClass import Memory  # assumed: Memory comes from the project's MemoryClass module, as in the other examples below

# memory_size and pretrain_length are assumed to be defined at module level
# before process() is called.


def process(render=False):

    print("CartPole main start..")
    env = gym.make('CartPole-v0')

    # Initialize the simulation
    env.reset()
    # Take one random step to get the pole and cart moving
    state, reward, done, _ = env.step(env.action_space.sample())

    memory = Memory(max_size=memory_size)

    # Make a bunch of random actions and store the experiences
    for ii in range(pretrain_length):
        # Render the simulation if requested
        if render:
            env.render()

        # Make a random action
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)

        if done:
            # The simulation fails so no next state
            next_state = np.zeros(state.shape)
            # Add experience to memory
            memory.add((state, action, reward, next_state))

            # Start new episode
            env.reset()
            # Take one random step to get the pole and cart moving
            state, reward, done, _ = env.step(env.action_space.sample())
        else:
            # Add experience to memory
            memory.add((state, action, reward, next_state))
            state = next_state

    #memory.checkBuffer()

    return memory, state, env
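Every example on this page leans on the same small replay-buffer interface: Memory(max_size=...), add(), sample(), checklength() and checkBuffer(). The project's actual MemoryClass is not shown here, so the following is only a minimal sketch of a deque-backed buffer that satisfies the calls used in these excerpts.

from collections import deque
import random


class Memory:
    """Minimal replay-buffer sketch (assumption: the real MemoryClass behaves roughly like this)."""

    def __init__(self, max_size=1000):
        self.buffer = deque(maxlen=max_size)  # oldest experiences are dropped first

    def add(self, experience):
        self.buffer.append(experience)  # store one transition tuple

    def sample(self, batch_size):
        # uniform random mini-batch, as consumed by an agent's train(mini_batch)
        return random.sample(list(self.buffer), batch_size)

    def checklength(self):
        return len(self.buffer)  # number of stored experiences

    def checkBuffer(self):
        return list(self.buffer)  # full contents, handy for debugging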
Code example #2
File: duelMain030.py  Project: scrambleegg7/ATARI
import gym
import numpy as np

import matplotlib.pyplot as plt

from MemoryClass import Memory
from StateClass import SteteClass
#from env import setEnv
from AgentClass_v4duel import AgentClass

# parameters ...
#
num_consecutive_iterations = 100
num_episodes = 500

initial_training = 20000  # step count at which training begins
memory_size = 30000
memory = Memory(max_size=memory_size)

MINIBATCH_SIZE = 32

ENV_NAME = 'SpaceInvaders-v0'
SAVE_NETWORK_PATH = 'saved_networks/' + ENV_NAME
SAVE_SUMMARY_PATH = 'summary/' + ENV_NAME

env = gym.make(ENV_NAME)

STATE_LENGTH = 4
myAgent = AgentClass(env.action_space.n, STATE_LENGTH)


def rgb2gray(rgb):
    return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])
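rgb2gray collapses an RGB frame to a single luminance channel with the usual 0.299/0.587/0.114 weights. A quick usage sketch (the 210x160x3 shape is an assumption about the raw SpaceInvaders-v0 observation):

# Hypothetical check: an RGB frame collapses to a 2-D grayscale map.
frame = np.random.randint(0, 256, size=(210, 160, 3), dtype=np.uint8)
gray = rgb2gray(frame)
print(gray.shape)  # (210, 160)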
Code example #3
import gym
import numpy as np

from skimage.transform import resize
import matplotlib.pyplot as plt

from MemoryClass import Memory
from StateClass import SteteClass
#from env import setEnv
from AgentClass import AgentClass

from PIL import Image

myMemory = Memory(max_size=10)

x = range(20)

for item in x:
    myMemory.add(item)

print(myMemory.checkBuffer())
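If Memory is a deque-backed buffer like the sketch under code example #1, then pushing 20 items into Memory(max_size=10) keeps only the most recent 10, so checkBuffer() would return [10, 11, ..., 19]. That expectation is based on the assumed implementation above, not on output captured from the project.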
Code example #4
import os

import numpy as np

# Environment, Agent, Brain and Memory are assumed to come from the
# project's own modules (not shown in this excerpt).


def main():
    # ==============================
    # Settings
    # ==============================
    N_episodes = 200
    load_model = False  # load model
    save_model = True  # save model on last episode
    save_model_filename = os.path.join("model", "model.h5")

    info = {
        "env": {"Ny": 20,
                "Nx": 20},
        "agent": {"policy_mode": "epsgreedy", # "epsgreedy", "softmax"
                  "eps": 1.0,
                  "eps_decay": 2.0*np.log(10.0)/N_episodes},
        "brain": {"discount": 0.99,
                  "learning_rate": 0.9},
        "memory": {}
    }

    # ==============================
    # Setup environment and agent
    # ==============================
    env = Environment(info)
    agent = Agent(env, info)
    brain = Brain(env, info)
    memory = Memory(info)

    if load_model:
        brain.load_model(save_model_filename)

    # ==============================
    # Train agent
    # ==============================
    for episode in range(N_episodes):

        iter = 0
        state = env.starting_state()
        while not env.is_terminal_state(state):
            # Pick an action by sampling action probabilities
            action, model_output, prob = agent.get_action(state, brain, env)
            # Collect reward and observe next state
            reward = env.get_reward(state, action)
            state_next = env.perform_action(state, action)
            # Append quantities to memory
            memory.append_to_memory(state, state_next, action, model_output, prob, reward)
            # Transition to next state
            state = state_next
            iter += 1

        # Print
        policy_mode = agent.agent_info["policy_mode"]
        if (policy_mode == "epsgreedy"):

            print("[episode {}] mode = {}, iter = {}, eps = {:.4F}, reward = {:.2F}".format(episode, policy_mode, iter, agent.eps_effective, sum(memory.reward_memory)))

        elif (policy_mode == "softmax"):

            print("[episode {}] mode = {}, iter = {}, reward = {:.2F}".format(episode, policy_mode, iter, sum(memory.reward_memory)))

        # Update model when episode finishes
        brain.update(memory, env)
        agent.episode += 1

        # Save model
        if save_model and (episode == N_episodes-1):
            brain.save_model(save_model_filename)

        # Clear memory for next episode
        memory.clear_memory()
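The eps_decay = 2.0*np.log(10.0)/N_episodes choice is not arbitrary: assuming the agent computes its effective exploration rate as eps * exp(-eps_decay * episode) (an assumption about agent.eps_effective, which is not shown here), the exploration rate ends the run at exactly one hundredth of its starting value. A small numeric check:

# Assumed decay schedule: eps_effective = eps * exp(-eps_decay * episode).
import numpy as np

N_episodes = 200
eps0 = 1.0
eps_decay = 2.0 * np.log(10.0) / N_episodes
print(eps0 * np.exp(-eps_decay * N_episodes))  # 0.01, i.e. eps0 / 10**2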
Code example #5
import operator

import numpy as np

# Environment, Agent, Memory and utils are assumed to come from the
# project's own modules (not shown in this excerpt).


def main():
    # =========================
    # Settings
    # =========================
    learning_mode = "QLearning"  # "RewardAveraging", "QLearning"

    if learning_mode == "RewardAveraging":

        from RewardAveraging_BrainClass import Brain
        N_episodes = 100000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {}
        agent_info = {
            "name": "epsilon-greedy",
            "epsilon": 1.0,
            "epsilon_decay": 2.0 * np.log(10.0) / N_episodes
        }

    elif learning_mode == "QLearning":

        from QLearning_BrainClass import Brain
        N_episodes = 10000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {
            "Q_learning_rate": 0.95,
            "Q_discount": 1.0
        }  # only relevant for Q-learning
        agent_info = {
            "name": "epsilon-greedy",
            "epsilon": 1.0,
            "epsilon_decay": 2.0 * np.log(10.0) / N_episodes
        }

    else:
        raise IOError("Error: Invalid learning mode!")

    # =========================
    # Set up environment, agent, memory and brain
    # =========================
    env = Environment(env_info)  # set up environment rewards and state-transition rules
    agent = Agent(agent_info)  # set up epsilon-greedy agent
    brain = Brain(env, brain_info)  # stores and updates Q(s,a) and policy(s)
    memory = Memory(env)  # keeps track of run and episode (s,a) histories

    # =========================
    # Train agent
    # =========================
    print(
        "\nTraining '{}' agent on '{}' environment for {} episodes using '{}' learning mode (epsilon = {})...\n"
        .format(agent.name, env.name, N_episodes, learning_mode,
                agent.epsilon))

    memory.reset_run_counters()  # reset run counters once only
    for episode in range(N_episodes):
        memory.reset_episode_counters()  # reset episodic counters
        state = env.starting_state()  # starting state
        while not env.is_terminal(state):
            # Get action from policy
            action = agent.get_action(state, brain,
                                      env)  # get action from policy
            # Collect reward from environment
            reward = env.get_reward(state, action)  # get reward
            # Update episode counters
            memory.update_episode_counters(
                state, action, reward)  # update our episodic counters
            # Compute and observe next state
            state_next = env.perform_action(state, action)
            # Update Q during episode (if needed)
            if "update_Q_during_episode" in utils.method_list(Brain):
                brain.update_Q_during_episode(state, action, state_next,
                                              reward)
            # Transition to next state
            state = state_next

        # Update run counters first (before updating Q)
        memory.update_run_counters()  # use episode counters to update run counters
        agent.episode += 1

        # Update Q after episode (if needed)
        if "update_Q_after_episode" in utils.method_list(Brain):
            brain.update_Q_after_episode(memory)

        # Print
        if (episode + 1) % (N_episodes / 20) == 0:
            print(
                " episode = {}/{}, epsilon = {:.3F}, reward = {:.1F}, n_actions = {}"
                .format(episode + 1, N_episodes, agent.epsilon_effective,
                        memory.R_total_episode, memory.N_actions_episode))

    # =======================
    # Print final policy
    # =======================
    print("\nFinal policy:\n")
    print(brain.compute_policy(env))
    print("")
    for (key, val) in sorted(env.action_dict.items(),
                             key=operator.itemgetter(1)):
        print(" action['{}'] = {}".format(key, val))
Code example #6
import os

import numpy as np

# Environment, Agent, Brain and Memory are assumed to come from the
# project's own modules (not shown in this excerpt).


def main():
    # ==============================
    # Settings
    # ==============================
    N_episodes = 200
    load_model = False  # load model
    save_model = True  # save model on last episode
    save_model_filename = os.path.join("model", "model.h5")

    info = {
        "env": {
            "Ny": 20,
            "Nx": 20
        },
        "agent": {
            "policy_mode": "epsgreedy",  # "epsgreedy", "softmax"
            "eps": 1.0,
            "eps_decay": 2.0 * np.log(10.0) / N_episodes
        },
        "brain": {
            "discount": 0.99,
            "learning_rate": 0.9
        },
        "memory": {}
    }

    # ==============================
    # Setup environment and agent
    # ==============================
    env = Environment(info)
    agent = Agent(env, info)
    brain = Brain(env, info)
    memory = Memory(info)

    if load_model:
        brain.load_model(save_model_filename)

    # ==============================
    # Train agent
    # ==============================
    for episode in range(N_episodes):

        iter = 0
        state = env.starting_state()
        while not env.is_terminal_state(state):
            # Pick an action by sampling action probabilities
            action, model_output, prob = agent.get_action(state, brain, env)
            # Collect reward and observe next state
            reward = env.get_reward(state, action)
            state_next = env.perform_action(state, action)
            # Append quantities to memory
            memory.append_to_memory(state, state_next, action, model_output,
                                    prob, reward)
            # Transition to next state
            state = state_next
            iter += 1

        # Print
        policy_mode = agent.agent_info["policy_mode"]
        if (policy_mode == "epsgreedy"):

            print(
                "[episode {}] mode = {}, iter = {}, eps = {:.4F}, reward = {:.2F}"
                .format(episode, policy_mode, iter, agent.eps_effective,
                        sum(memory.reward_memory)))

        elif (policy_mode == "softmax"):

            print("[episode {}] mode = {}, iter = {}, reward = {:.2F}".format(
                episode, policy_mode, iter, sum(memory.reward_memory)))

        # Update model when episode finishes
        brain.update(memory, env)
        agent.episode += 1

        # Save model
        if save_model and (episode == N_episodes - 1):
            brain.save_model(save_model_filename)

        # Clear memory for next episode
        memory.clear_memory()
Code example #7
import operator

import numpy as np

# Environment, Agent, Memory and utils are assumed to come from the
# project's own modules (not shown in this excerpt).


def main():
    # =========================
    # Settings
    # =========================
    learning_mode = "QLearning"  # "RewardAveraging", "QLearning"

    if learning_mode == "RewardAveraging":

        from RewardAveraging_BrainClass import Brain
        N_episodes = 100000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {}
        agent_info = {"name": "epsilon-greedy", "epsilon": 1.0, "epsilon_decay": 2.0 * np.log(10.0) / N_episodes}

    elif learning_mode == "QLearning":

        from QLearning_BrainClass import Brain
        N_episodes = 10000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {"Q_learning_rate": 0.95, "Q_discount": 1.0}  # only relevant for Q-learning
        agent_info = {"name": "epsilon-greedy", "epsilon": 1.0, "epsilon_decay": 2.0 * np.log(10.0) / N_episodes}

    else:
        raise IOError("Error: Invalid learning mode!")

    # =========================
    # Set up environment, agent, memory and brain
    # =========================
    env = Environment(env_info)  # set up environment rewards and state-transition rules
    agent = Agent(agent_info)  # set up epsilon-greedy agent
    brain = Brain(env, brain_info)  # stores and updates Q(s,a) and policy(s)
    memory = Memory(env)  # keeps track of run and episode (s,a) histories

    # =========================
    # Train agent
    # =========================
    print("\nTraining '{}' agent on '{}' environment for {} episodes using '{}' learning mode...\n".format(agent.name, env.name, N_episodes, learning_mode, agent.epsilon))

    memory.reset_run_counters()  # reset run counters once only
    for episode in range(N_episodes):
        memory.reset_episode_counters()  # reset episodic counters
        state = env.starting_state()  # starting state
        while not env.is_terminal(state):
            # Get action from policy
            action = agent.get_action(state, brain, env)  # get action from policy
            # Collect reward from environment
            reward = env.get_reward(state, action)  # get reward
            # Update episode counters
            memory.update_episode_counters(state, action, reward)  # update our episodic counters
            # Compute and observe next state
            state_next = env.perform_action(state, action)
            # Update Q during episode (if needed)
            if "update_Q_during_episode" in utils.method_list(Brain):
                brain.update_Q_during_episode(state, action, state_next, reward)
            # Transition to next state
            state = state_next

        # Update run counters first (before updating Q)
        memory.update_run_counters()  # use episode counters to update run counters
        agent.episode += 1

        # Update Q after episode (if needed)
        if "update_Q_after_episode" in utils.method_list(Brain):
            brain.update_Q_after_episode(memory)

        # Print
        if (episode+1) % (N_episodes/20) == 0:
            print(" episode = {}/{}, epsilon = {:.3F}, reward = {:.1F}, n_actions = {}".format(episode + 1, N_episodes, agent.epsilon_effective, memory.R_total_episode, memory.N_actions_episode))

    # =======================
    # Print final policy
    # =======================
    print("\nFinal policy:\n")
    print(brain.compute_policy(env))
    print("")
    for (key, val) in sorted(env.action_dict.items(), key=operator.itemgetter(1)):
        print(" action['{}'] = {}".format(key, val))
Code example #8
File: dueltest.py  Project: scrambleegg7/ATARI
startE = 1.0
endE = 0.1
annealing_steps = 10000
# Set the rate of random action decrease.
e = startE
stepDrop = (startE - endE) / annealing_steps

#create lists to contain total rewards and steps per episode
jList = []
rList = []
total_steps = 0
MINIBATCH_SIZE = 32

memory_size = 30000
memory = Memory(max_size=memory_size)



for i in range(num_episodes):
        myMemory = Memory()
        #Reset environment and get first new observation
        d = False
        rAll = 0
        j = 0
        episode_reward = 0

        observation = env.reset()
        #img = observation[1:176:2,::2]
        #print(img.shape)
        #plt.imshow(img)
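stepDrop defines a linear annealing schedule: the exploration rate e is reduced by a fixed amount each training step until it reaches endE after annealing_steps steps. The episode body is truncated in this excerpt, so the following is only a sketch of how e would typically be annealed inside the step loop (values as above):

# Linear epsilon annealing sketch; the action-selection line is indicated only.
startE, endE, annealing_steps = 1.0, 0.1, 10000
e = startE
stepDrop = (startE - endE) / annealing_steps

for step in range(annealing_steps):
    # ... pick a random action with probability e, otherwise the greedy action ...
    if e > endE:
        e -= stepDrop  # reaches endE after annealing_steps steps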
Code example #9
File: dueltest2.py  Project: scrambleegg7/ATARI
    error = tf.abs(y - q_value)
    clipped_error = tf.clip_by_value(error, 0.0, 1.0)
    linear_error = 2 * (error - clipped_error)
    loss = tf.reduce_mean(tf.square(clipped_error) + linear_error)

    global_step = tf.Variable(0, trainable=False, name='global_step')
    optimizer = tf.train.MomentumOptimizer(learning_rate,
                                           momentum,
                                           use_nesterov=True)
    training_op = optimizer.minimize(loss, global_step=global_step)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Let's implement a simple replay memory
memory = Memory(max_size=pre_train_steps * 2)

mspacman_color = np.array([210, 164, 74]).mean()


def preprocess_observation(obs):
    img = obs[1:176:2, ::2]  # crop and downsize
    img = img.mean(axis=2)  # to greyscale
    img[img == mspacman_color] = 0  # Improve contrast
    img = (img - 128) / 128  # normalize to roughly [-1, 1]
    return img.reshape(88, 80, 1)


def get_initial_state(observation, last_observation):

    init_image = rgb2gray(observation)
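The loss built at the top of this excerpt is the usual Huber-style construction for DQN targets: for |y - q_value| <= 1 the squared term applies (loss = error**2), and beyond 1 the clipped square saturates while the linear term takes over (loss = 2*error - 1), so large TD errors cannot blow up the gradients. A standalone NumPy check of that piecewise behaviour, independent of the TensorFlow graph:

# Piecewise check of the clipped-squared + linear loss used above.
import numpy as np

error = np.array([0.3, 0.9, 1.0, 2.5])
clipped = np.clip(error, 0.0, 1.0)
linear = 2 * (error - clipped)
print(clipped ** 2 + linear)  # [0.09 0.81 1.   4.  ] -> error**2 below 1, 2*error - 1 above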
Code example #10
import numpy as np

# Environment, Agent, Brain, Memory, utils and plot_hunter_prey are assumed
# to come from the project's own modules (not shown in this excerpt).


def main():
    # =========================
    # Settings
    # =========================
    learning_mode = "SampleAveraging"

    if learning_mode == "SampleAveraging":

        from SampleAveraging_BrainClass import Brain
        N_episodes_train = 100000
        N_episodes_test = 30
        agent_info = {"name": "hunter", "epsilon": 0.5}
        env_info = {"N_global": 7}
        brain_info = {}

    elif learning_mode == "QLearning":

        from QLearning_BrainClass import Brain
        N_episodes_train = 10000
        N_episodes_test = 30
        agent_info = {"name": "hunter", "epsilon": 0.5}
        env_info = {"N_global": 7}
        brain_info = {
            "learning_rate": 0.8,
            "discount": 0.9
        }  # only relevant for Q-learning

    else:
        raise IOError("Error: Invalid learning mode!")

    save_video = True
    video_file = "results/hunterprey.mp4"
    convert_mp4_to_gif = True
    gif_file = "results/hunterprey.gif"

    # =========================
    # Set up environment, agent, memory and brain
    # =========================
    agent = Agent(agent_info)
    env = Environment(env_info)
    brain = Brain(env, brain_info)
    memory = Memory(env)

    # =========================
    # Train agent
    # =========================
    print(
        "\nTraining '{}' agent on '{}' environment for {} episodes, testing for {} episodes (epsilon = {})...\n"
        .format(agent.name, env.name, N_episodes_train, N_episodes_test,
                agent.epsilon))

    memory.reset_run_counters()  # reset run counters once only
    state_global_history_video = []
    state_target_global_history_video = []
    for episode in range(N_episodes_train + N_episodes_test):
        if (episode >= N_episodes_train):
            agent.epsilon = 0  # set no exploration for test episodes
        memory.reset_episode_counters()  # reset episodic counters

        # state = position of hunter relative to prey (want to get to [0,0])
        # state_global = global position of hunter
        # state_target_global = global position of prey
        if episode == 0:
            (state, state_global, state_target_global) = env.get_random_state()
        else:
            (state, state_global, state_target_global) = env.get_random_state(
                set_state_global=state_global)
        env.set_state_terminal_global(state_target_global)

        state_global_history = [state_global]
        n_iter_episode = 0
        while not env.is_terminal(state):  # NOTE: terminates when the hunter reaches local coordinates (0,0)
            # Get action from policy
            action = agent.get_action(state, brain,
                                      env)  # get action from policy
            # Collect reward from environment
            reward = env.get_reward(state, action)  # get reward
            # Update episode counters
            memory.update_episode_counters(
                state, action, reward)  # update our episodic counters
            # Compute and observe next state
            state_next = env.perform_action(state, action)
            state_global_next = env.perform_action_global(state_global, action)
            # Update Q during episode (if needed)
            if "update_Q_during_episode" in utils.method_list(Brain):
                brain.update_Q_during_episode(state, action, state_next,
                                              reward)
            # Transition to next state
            state = state_next
            state_global = state_global_next
            # Track states for video
            state_global_history.append(state_global)
            # Exit program if testing fails (bad policy)
            n_iter_episode += 1
            if (episode >= N_episodes_train) and (n_iter_episode > 2000):
                raise IOError("Bad policy found! Non-terminal episode!")

        # Append for video output
        if episode >= N_episodes_train:
            state_global_history_video.append(state_global_history)
            state_target_global_history_video.append([state_target_global] *
                                                     len(state_global_history))

        # Update run counters first (before updating Q)
        memory.update_run_counters()  # use episode counters to update run counters

        # Update Q after episode (if needed)
        if "update_Q_after_episode" in utils.method_list(Brain):
            brain.update_Q_after_episode(memory)

        # Give output to user on occasion
        if (episode + 1) % (N_episodes_train / 20) == 0 or (episode >=
                                                            N_episodes_train):
            n_optimal = np.abs(
                env.ygrid_global[state_global_history[0][0]] -
                env.ygrid_global[state_target_global[0]]) + np.abs(
                    env.xgrid_global[state_global_history[0][1]] -
                    env.xgrid_global[state_target_global[1]])

            # =====================
            # Print text
            # =====================
            mode = "train" if (episode < N_episodes_train) else "test"
            print(
                " [{} episode = {}/{}] epsilon = {}, total reward = {:.1F}, n_actions = {}, n_optimal = {}, grid goal: [{},{}] -> [{},{}]"
                .format(mode, episode + 1, N_episodes_train + N_episodes_test,
                        agent.epsilon, memory.R_total_episode,
                        memory.N_actions_episode, n_optimal,
                        env.ygrid_global[state_global_history[0][0]],
                        env.xgrid_global[state_global_history[0][1]],
                        env.ygrid_global[state_target_global[0]],
                        env.xgrid_global[state_target_global[1]]))

    # =====================
    # Make video animation
    # =====================
    if save_video:
        print("\nSaving file to '{}'...".format(video_file))
        plot_hunter_prey(state_global_history_video,
                         state_target_global_history_video,
                         env,
                         video_file=video_file)

        if convert_mp4_to_gif:
            print("\nConverting '{}' to '{}'...".format(video_file, gif_file))
            import moviepy.editor as mp
            clip = mp.VideoFileClip(video_file)
            clip.write_gif(gif_file)
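n_optimal above is the Manhattan distance between the hunter's starting cell and the prey's cell on the global grid, i.e. the minimum number of up/down/left/right moves a perfect policy would need. A small standalone illustration with made-up grid cells:

# Manhattan distance = least number of 4-connected grid moves between two cells.
import numpy as np

start = np.array([5, 2])  # example starting cell (y, x)
goal = np.array([1, 6])   # example target cell (y, x)
print(np.abs(start - goal).sum())  # 8 moves: 4 vertical + 4 horizontal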
Code example #11
File: keepMemory.py  Project: scrambleegg7/ATARI
import numpy as np

# Imports below follow the other excerpts from this project; the exact
# AgentClass module may differ (AgentClass vs. AgentClass_v4duel).
from MemoryClass import Memory
from StateClass import SteteClass
from AgentClass import AgentClass
from env import setEnv


def keepMemory(memory_size=10000, pretrain_length=5000, render=False):

    #print("CartPole main start..")
    #env = gym.make('CartPole-v0')

    envs = setEnv()

    #env = envs["BreakGame"]
    env = envs["SpaceInvador"]

    # Initialize the simulation
    #observation = env.reset()
    stateCls = SteteClass(env)
    stateCls.initial_buffer()

    # current state == initial screen state --> no action has been taken yet (action 0)
    curr_state = stateCls.convertAndConcatenateBuffer()
    curr_state = curr_state[np.newaxis,:,:,:]

    #print("initial state size ...", state.shape)
    # Take one random step to get the pole and cart moving
    #state, reward, done, _ = env.step(env.action_space.sample())

    memory = Memory(max_size=memory_size)

    # AgentClass section
    myAgent = AgentClass(6)
    # initialize Q Network

    MINIBATCH_SIZE = 32
    MIN_OBSERVATION = 500

    epsilon = 1.0
    EPSILON_DECAY = 300
    FINAL_EPS = 0.1

    NUM_FRAMES = 3

    observation_num = 0
    alive_frame = 0
    total_reward = 0

    curr_state_actions = []

    MEMORY_FULL = False
    # Make a bunch of random actions and store the experiences
    for ii in range(pretrain_length):
        # Uncomment the line below to watch the simulation
        #if render:
        #    env.render()
        #stateCls.render()

        init_state = stateCls.convertAndConcatenateBuffer()
        action, q_values = myAgent.get_action(curr_state)
        #curr_state_actions.append(action)

        #print("** action and q_value ... ",action, q_values)
        #myAgent.copyTargetQNetwork()
        #return False,False,False
        #next_state, reward, done, _ = env.step(action)

        obs,rewards,done = stateCls.add_frame(action,NUM_FRAMES)

        #if observation_num % 500 == 0:
        #    print("observation_num / q_values ..",observation_num,q_values)

        if done:
            # Episode finished: report results and reset the environment
            if MEMORY_FULL:
                print("memory full.....")

            print("** total reward for this episode ...", total_reward)
            print("** maximum lived frames .. ", alive_frame)

            stateCls.envReset()
            # Start a new episode and reset the per-episode counters
            alive_frame = 0
            total_reward = 0

        new_state = stateCls.convertAndConcatenateBuffer()
        #memory add
        memory.add((init_state, action, rewards, done, new_state))
        total_reward += rewards

        if memory.checklength() > MIN_OBSERVATION:
            MEMORY_FULL = True
            # Sample mini-batch from memory
            # pick up m = 32
            mini_batch = memory.sample(MINIBATCH_SIZE)
            myAgent.train(mini_batch)

            #s_batch, a_batch, r_batch, d_batch, s2_batch = memory.sample(MINIBATCH_SIZE)
            #self.deep_q.train(s_batch, a_batch, r_batch, d_batch, s2_batch, observation_num)
            #self.deep_q.target_train()


        observation_num += 1
        alive_frame += 1


    print(memory.checklength())
    #print("curr action", curr_state_actions)

    #print("Total rewards from all episodes..", total_reward)

    # NOTE: curr_state_actions is never populated above (the append is commented out),
    # so this currently returns an empty list.
    return curr_state_actions
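keepMemory stores each transition as a (state, action, reward, done, next_state) tuple, and myAgent.train(mini_batch) is expected to take a list of such tuples sampled from the buffer. The AgentClass internals are not shown, so this is only a hypothetical sketch of the unpacking step a trainer would typically perform before computing Q-learning targets:

# Hypothetical unpacking of a sampled mini-batch of (s, a, r, done, s2) tuples.
import numpy as np


def unpack_minibatch(mini_batch):
    states = np.array([m[0] for m in mini_batch])
    actions = np.array([m[1] for m in mini_batch])
    rewards = np.array([m[2] for m in mini_batch], dtype=np.float32)
    dones = np.array([m[3] for m in mini_batch], dtype=np.bool_)
    next_states = np.array([m[4] for m in mini_batch])
    return states, actions, rewards, dones, next_states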