import gym
import numpy as np

from MemoryClass import Memory


# memory_size and pretrain_length were free variables in the original; the
# defaults here follow the same convention as keepMemory() below.
def process(memory_size=10000, pretrain_length=5000, render=False):
    print("CartPole main start..")
    env = gym.make('CartPole-v0')

    # Initialize the simulation
    env.reset()
    # Take one random step to get the pole and cart moving
    state, reward, done, _ = env.step(env.action_space.sample())

    memory = Memory(max_size=memory_size)

    # Make a bunch of random actions and store the experiences
    for ii in range(pretrain_length):
        # Watch the simulation if requested
        if render:
            env.render()

        # Make a random action
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)

        if done:
            # The simulation fails, so there is no next state
            next_state = np.zeros(state.shape)
            # Add experience to memory
            memory.add((state, action, reward, next_state))

            # Start a new episode
            env.reset()
            # Take one random step to get the pole and cart moving
            state, reward, done, _ = env.step(env.action_space.sample())
        else:
            # Add experience to memory
            memory.add((state, action, reward, next_state))
            state = next_state

    #memory.checkBuffer()
    return memory, state, env
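# A minimal sketch of how the pre-filled buffer from process() might be
# consumed for training. This usage is an assumption (the training loop is not
# part of this file); Memory.sample is assumed to return a list of
# (state, action, reward, next_state) tuples, matching what process() stores.
if __name__ == "__main__":
    memory, state, env = process()
    batch = memory.sample(32)
    states = np.array([each[0] for each in batch])
    actions = np.array([each[1] for each in batch])
    rewards = np.array([each[2] for each in batch])
    next_states = np.array([each[3] for each in batch])
    print("minibatch shapes:", states.shape, actions.shape,
          rewards.shape, next_states.shape)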
import gym
import numpy as np
import matplotlib.pyplot as plt

from MemoryClass import Memory
from StateClass import SteteClass
#from env import setEnv
from AgentClass_v4duel import AgentClass

# parameters
# num_consecutive_iterations = 100
num_episodes = 500
initial_training = 20000  # start point to train on the data
memory_size = 30000
memory = Memory(max_size=memory_size)
MINIBATCH_SIZE = 32

ENV_NAME = 'SpaceInvaders-v0'
SAVE_NETWORK_PATH = 'saved_networks/' + ENV_NAME
SAVE_SUMMARY_PATH = 'summary/' + ENV_NAME

env = gym.make(ENV_NAME)
STATE_LENGTH = 4
myAgent = AgentClass(env.action_space.n, STATE_LENGTH)


def rgb2gray(rgb):
    # Standard luminance weights for RGB -> grayscale conversion
    return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])
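# Quick sanity check of rgb2gray (illustrative only): a (210, 160, 3) Atari RGB
# frame collapses to a (210, 160) luminance image using the weights above.
if __name__ == "__main__":
    dummy_frame = np.random.randint(0, 256, size=(210, 160, 3))
    gray = rgb2gray(dummy_frame)
    print(gray.shape)  # (210, 160)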
import gym
import numpy as np
from skimage.transform import resize
import matplotlib.pyplot as plt

from MemoryClass import Memory
from StateClass import SteteClass
#from env import setEnv
from AgentClass import AgentClass
from PIL import Image

# Smoke test for the Memory ring buffer: with max_size=10, only the last 10
# of the 20 items added should remain in the buffer.
myMemory = Memory(max_size=10)
x = range(20)
for item in x:
    myMemory.add(item)
print(myMemory.checkBuffer())
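# MemoryClass itself is not shown in these files. The sketch below is an
# assumption about what it likely contains: a deque-backed ring buffer whose
# interface matches the calls used here (add, sample, checkBuffer, checklength).
from collections import deque
import random


class Memory:
    def __init__(self, max_size=1000):
        # deque(maxlen=...) silently evicts the oldest experience once full,
        # which is why only the last 10 items survive in the test above
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        # uniform random minibatch without replacement
        return random.sample(self.buffer, batch_size)

    def checklength(self):
        return len(self.buffer)

    def checkBuffer(self):
        return list(self.buffer)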
import operator

import numpy as np

import utils
# Environment, Agent, and Memory are assumed to come from the project's own
# modules; their import lines were not part of the original snippet.


def main():
    # =========================
    # Settings
    # =========================
    learning_mode = "QLearning"  # "RewardAveraging", "QLearning"

    if learning_mode == "RewardAveraging":
        from RewardAveraging_BrainClass import Brain
        N_episodes = 100000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {}
        agent_info = {
            "name": "epsilon-greedy",
            "epsilon": 1.0,
            "epsilon_decay": 2.0 * np.log(10.0) / N_episodes
        }
    elif learning_mode == "QLearning":
        from QLearning_BrainClass import Brain
        N_episodes = 10000
        env_info = {"Ny": 7, "Nx": 7}
        brain_info = {
            "Q_learning_rate": 0.95,
            "Q_discount": 1.0
        }  # only relevant for Q-learning
        agent_info = {
            "name": "epsilon-greedy",
            "epsilon": 1.0,
            "epsilon_decay": 2.0 * np.log(10.0) / N_episodes
        }
    else:
        raise ValueError("Error: Invalid learning mode!")

    # =========================
    # Set up environment, agent, memory and brain
    # =========================
    env = Environment(env_info)  # environment rewards and state-transition rules
    agent = Agent(agent_info)  # epsilon-greedy agent
    brain = Brain(env, brain_info)  # stores and updates Q(s,a) and policy(s)
    memory = Memory(env)  # keeps track of run and episode (s,a) histories

    # =========================
    # Train agent
    # =========================
    print("\nTraining '{}' agent on '{}' environment for {} episodes using '{}' learning mode...\n"
          .format(agent.name, env.name, N_episodes, learning_mode))

    memory.reset_run_counters()  # reset run counters once only
    for episode in range(N_episodes):
        memory.reset_episode_counters()  # reset episodic counters
        state = env.starting_state()  # starting state
        while not env.is_terminal(state):
            # Get action from policy
            action = agent.get_action(state, brain, env)
            # Collect reward from environment
            reward = env.get_reward(state, action)
            # Update episode counters
            memory.update_episode_counters(state, action, reward)
            # Compute and observe next state
            state_next = env.perform_action(state, action)
            # Update Q during episode (if needed)
            if "update_Q_during_episode" in utils.method_list(Brain):
                brain.update_Q_during_episode(state, action, state_next, reward)
            # Transition to next state
            state = state_next

        # Update run counters first (before updating Q)
        memory.update_run_counters()  # use episode counters to update run counters
        agent.episode += 1

        # Update Q after episode (if needed)
        if "update_Q_after_episode" in utils.method_list(Brain):
            brain.update_Q_after_episode(memory)

        # Print progress
        if (episode + 1) % (N_episodes // 20) == 0:
            print(" episode = {}/{}, epsilon = {:.3F}, reward = {:.1F}, n_actions = {}"
                  .format(episode + 1, N_episodes, agent.epsilon_effective,
                          memory.R_total_episode, memory.N_actions_episode))

    # =======================
    # Print final policy
    # =======================
    print("\nFinal policy:\n")
    print(brain.compute_policy(env))
    print("")
    for (key, val) in sorted(env.action_dict.items(), key=operator.itemgetter(1)):
        print(" action['{}'] = {}".format(key, val))
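# The epsilon_decay value above has a concrete interpretation, assuming the
# agent decays exploration exponentially per episode (an assumption about how
# Agent computes epsilon_effective): with decay = 2*ln(10)/N_episodes, epsilon
# falls by a factor of 10^2 over the run, i.e. from 1.0 down to ~0.01.
def epsilon_schedule(episode, epsilon0=1.0, N_episodes=10000):
    decay = 2.0 * np.log(10.0) / N_episodes
    return epsilon0 * np.exp(-decay * episode)

print(epsilon_schedule(0))      # 1.0
print(epsilon_schedule(5000))   # ~0.1
print(epsilon_schedule(10000))  # ~0.01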
import os

import numpy as np

# Environment, Agent, Brain, and Memory are assumed to come from the project's
# own modules; their import lines were not part of the original snippet.


def main():
    # ==============================
    # Settings
    # ==============================
    N_episodes = 200
    load_model = False  # load model
    save_model = True  # save model on last episode
    save_model_filename = os.path.join("model", "model.h5")

    info = {
        "env": {
            "Ny": 20,
            "Nx": 20
        },
        "agent": {
            "policy_mode": "epsgreedy",  # "epsgreedy", "softmax"
            "eps": 1.0,
            "eps_decay": 2.0 * np.log(10.0) / N_episodes
        },
        "brain": {
            "discount": 0.99,
            "learning_rate": 0.9
        },
        "memory": {}
    }

    # ==============================
    # Setup environment and agent
    # ==============================
    env = Environment(info)
    agent = Agent(env, info)
    brain = Brain(env, info)
    memory = Memory(info)

    if load_model:
        brain.load_model(save_model_filename)

    # ==============================
    # Train agent
    # ==============================
    for episode in range(N_episodes):
        iter = 0
        state = env.starting_state()
        while not env.is_terminal_state(state):
            # Pick an action by sampling action probabilities
            action, model_output, prob = agent.get_action(state, brain, env)
            # Collect reward and observe next state
            reward = env.get_reward(state, action)
            state_next = env.perform_action(state, action)
            # Append quantities to memory
            memory.append_to_memory(state, state_next, action, model_output,
                                    prob, reward)
            # Transition to next state
            state = state_next
            iter += 1

        # Print episode summary
        policy_mode = agent.agent_info["policy_mode"]
        if policy_mode == "epsgreedy":
            print("[episode {}] mode = {}, iter = {}, eps = {:.4F}, reward = {:.2F}"
                  .format(episode, policy_mode, iter, agent.eps_effective,
                          sum(memory.reward_memory)))
        elif policy_mode == "softmax":
            print("[episode {}] mode = {}, iter = {}, reward = {:.2F}".format(
                episode, policy_mode, iter, sum(memory.reward_memory)))

        # Update model when episode finishes
        brain.update(memory, env)
        agent.episode += 1

        # Save model
        if save_model and (episode == N_episodes - 1):
            brain.save_model(save_model_filename)

        # Clear memory for next episode
        memory.clear_memory()
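# A minimal sketch of the "softmax" policy_mode referenced above (illustrative;
# the actual Agent.get_action implementation is not shown in this snippet).
def softmax_sample(q_values, temperature=1.0):
    # subtract the max for numerical stability before exponentiating
    z = (np.asarray(q_values) - np.max(q_values)) / temperature
    probs = np.exp(z) / np.sum(np.exp(z))
    # sample an action index in proportion to its probability
    return np.random.choice(len(probs), p=probs), probs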
import numpy as np
import tensorflow as tf

# y, q_value, learning_rate, momentum, pre_train_steps, the Memory class, and
# rgb2gray are assumed to be defined earlier in the original file.

error = tf.abs(y - q_value)
clipped_error = tf.clip_by_value(error, 0.0, 1.0)
linear_error = 2 * (error - clipped_error)
# Huber-style loss: quadratic for |error| <= 1, linear beyond
loss = tf.reduce_mean(tf.square(clipped_error) + linear_error)

global_step = tf.Variable(0, trainable=False, name='global_step')
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum,
                                       use_nesterov=True)
training_op = optimizer.minimize(loss, global_step=global_step)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Let's implement a simple replay memory
memory = Memory(max_size=pre_train_steps * 2)

mspacman_color = np.array([210, 164, 74]).mean()


def preprocess_observation(obs):
    img = obs[1:176:2, ::2]  # crop and downsize
    img = img.mean(axis=2)  # to greyscale
    img[img == mspacman_color] = 0  # improve contrast
    img = (img - 128) / 128  # normalize from -1. to 1.
    return img.reshape(88, 80, 1)


def get_initial_state(observation, last_observation):
    init_image = rgb2gray(observation)
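# The loss above is a (doubled) Huber loss: quadratic for |error| <= 1 and
# linear beyond, which keeps large TD errors from producing huge gradients.
# A quick numpy check of the piecewise form (illustrative only):
def huber_like(error):
    e = np.abs(error)
    clipped = np.minimum(e, 1.0)
    return clipped ** 2 + 2.0 * (e - clipped)  # e^2 if e <= 1, else 2e - 1

print(huber_like(np.array([0.5, 1.0, 3.0])))  # [0.25, 1.0, 5.0]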
import numpy as np

import utils
# Agent, Environment, Memory, and plot_hunter_prey are assumed to come from the
# project's own modules; their import lines were not part of the original snippet.


def main():
    # =========================
    # Settings
    # =========================
    learning_mode = "SampleAveraging"

    if learning_mode == "SampleAveraging":
        from SampleAveraging_BrainClass import Brain
        N_episodes_train = 100000
        N_episodes_test = 30
        agent_info = {"name": "hunter", "epsilon": 0.5}
        env_info = {"N_global": 7}
        brain_info = {}
    elif learning_mode == "QLearning":
        from QLearning_BrainClass import Brain
        N_episodes_train = 10000
        N_episodes_test = 30
        agent_info = {"name": "hunter", "epsilon": 0.5}
        env_info = {"N_global": 7}
        brain_info = {
            "learning_rate": 0.8,
            "discount": 0.9
        }  # only relevant for Q-learning
    else:
        raise ValueError("Error: Invalid learning mode!")

    save_video = True
    video_file = "results/hunterprey.mp4"
    convert_mp4_to_gif = True
    gif_file = "results/hunterprey.gif"

    # =========================
    # Set up environment, agent, memory and brain
    # =========================
    agent = Agent(agent_info)
    env = Environment(env_info)
    brain = Brain(env, brain_info)
    memory = Memory(env)

    # =========================
    # Train agent
    # =========================
    print("\nTraining '{}' agent on '{}' environment for {} episodes, testing for {} episodes (epsilon = {})...\n"
          .format(agent.name, env.name, N_episodes_train, N_episodes_test,
                  agent.epsilon))

    memory.reset_run_counters()  # reset run counters once only
    state_global_history_video = []
    state_target_global_history_video = []
    for episode in range(N_episodes_train + N_episodes_test):
        if episode >= N_episodes_train:
            agent.epsilon = 0  # no exploration during test episodes
        memory.reset_episode_counters()  # reset episodic counters

        # state = position of hunter relative to prey (want to get to [0,0])
        # state_global = global position of hunter
        # state_target_global = global position of prey
        if episode == 0:
            (state, state_global, state_target_global) = env.get_random_state()
        else:
            (state, state_global, state_target_global) = env.get_random_state(
                set_state_global=state_global)
        env.set_state_terminal_global(state_target_global)

        state_global_history = [state_global]
        n_iter_episode = 0
        # NOTE: episode terminates when the hunter reaches local coordinates (0,0)
        while not env.is_terminal(state):
            # Get action from policy
            action = agent.get_action(state, brain, env)
            # Collect reward from environment
            reward = env.get_reward(state, action)
            # Update episode counters
            memory.update_episode_counters(state, action, reward)
            # Compute and observe next state
            state_next = env.perform_action(state, action)
            state_global_next = env.perform_action_global(state_global, action)
            # Update Q during episode (if needed)
            if "update_Q_during_episode" in utils.method_list(Brain):
                brain.update_Q_during_episode(state, action, state_next, reward)
            # Transition to next state
            state = state_next
            state_global = state_global_next
            # Track states for video
            state_global_history.append(state_global)
            # Exit program if testing fails (bad policy)
            n_iter_episode += 1
            if (episode >= N_episodes_train) and (n_iter_episode > 2000):
                raise RuntimeError("Bad policy found! Non-terminal episode!")

        # Append for video output
        if episode >= N_episodes_train:
            state_global_history_video.append(state_global_history)
            state_target_global_history_video.append(
                [state_target_global] * len(state_global_history))

        # Update run counters first (before updating Q)
        memory.update_run_counters()  # use episode counters to update run counters

        # Update Q after episode (if needed)
        if "update_Q_after_episode" in utils.method_list(Brain):
            brain.update_Q_after_episode(memory)

        # Give output to user on occasion
        if (episode + 1) % (N_episodes_train // 20) == 0 or (episode >= N_episodes_train):
            # The optimal number of actions is the Manhattan distance from the
            # hunter's starting cell to the prey's cell
            n_optimal = np.abs(
                env.ygrid_global[state_global_history[0][0]] -
                env.ygrid_global[state_target_global[0]]) + np.abs(
                    env.xgrid_global[state_global_history[0][1]] -
                    env.xgrid_global[state_target_global[1]])

            # =====================
            # Print text
            # =====================
            mode = "train" if (episode < N_episodes_train) else "test"
            print(" [{} episode = {}/{}] epsilon = {}, total reward = {:.1F}, n_actions = {}, n_optimal = {}, grid goal: [{},{}] -> [{},{}]"
                  .format(mode, episode + 1, N_episodes_train + N_episodes_test,
                          agent.epsilon, memory.R_total_episode,
                          memory.N_actions_episode, n_optimal,
                          env.ygrid_global[state_global_history[0][0]],
                          env.xgrid_global[state_global_history[0][1]],
                          env.ygrid_global[state_target_global[0]],
                          env.xgrid_global[state_target_global[1]]))

    # =====================
    # Make video animation
    # =====================
    if save_video:
        print("\nSaving file to '{}'...".format(video_file))
        plot_hunter_prey(state_global_history_video,
                         state_target_global_history_video,
                         env, video_file=video_file)
        if convert_mp4_to_gif:
            print("\nConverting '{}' to '{}'...".format(video_file, gif_file))
            import moviepy.editor as mp
            clip = mp.VideoFileClip(video_file)
            clip.write_gif(gif_file)
import numpy as np

from MemoryClass import Memory
from StateClass import SteteClass
from env import setEnv
from AgentClass import AgentClass


def keepMemory(memory_size=10000, pretrain_length=5000, render=False):
    #print("CartPole main start..")
    #env = gym.make('CartPole-v0')
    envs = setEnv()
    #env = envs["BreakGame"]
    env = envs["SpaceInvador"]

    # Initialize the simulation
    #observation = env.reset()
    stateCls = SteteClass(env)
    stateCls.initial_buffer()

    # Current state == initial screen state --> nothing active yet, 0 action
    curr_state = stateCls.convertAndConcatenateBuffer()
    curr_state = curr_state[np.newaxis, :, :, :]
    #print("initial state size ...", curr_state.shape)

    memory = Memory(max_size=memory_size)

    # AgentClass section: initialize the Q network
    myAgent = AgentClass(6)

    MINIBATCH_SIZE = 32
    MIN_OBSERVATION = 500
    epsilon = 1.0
    EPSILON_DECAY = 300
    FINAL_EPS = 0.1
    NUM_FRAMES = 3

    observation_num = 0
    alive_frame = 0
    total_reward = 0
    curr_state_actions = []
    MEMORY_FULL = False

    # Make a bunch of random actions and store the experiences
    for ii in range(pretrain_length):
        # Uncomment the lines below to watch the simulation
        #if render:
        #    env.render()
        #stateCls.render()

        init_state = stateCls.convertAndConcatenateBuffer()
        action, q_values = myAgent.get_action(curr_state)
        #curr_state_actions.append(action)
        #print("** action and q_value ... ", action, q_values)

        obs, rewards, done = stateCls.add_frame(action, NUM_FRAMES)
        #if observation_num % 500 == 0:
        #    print("observation_num / q_values ..", observation_num, q_values)

        if done:
            # The episode ended, so start a new one
            if MEMORY_FULL:
                print("memory full.....")
            print("** rewards from done ...", total_reward)
            print("** maximum lived frames .. ", alive_frame)
            stateCls.envReset()
            alive_frame = 0
            total_reward = 0

        new_state = stateCls.convertAndConcatenateBuffer()
        # Add experience to memory
        memory.add((init_state, action, rewards, done, new_state))
        total_reward += rewards

        if memory.checklength() > MIN_OBSERVATION:
            MEMORY_FULL = True
            # Sample a mini-batch (m = 32) from memory and train on it
            mini_batch = memory.sample(MINIBATCH_SIZE)
            myAgent.train(mini_batch)
            #s_batch, a_batch, r_batch, d_batch, s2_batch = memory.sample(MINIBATCH_SIZE)
            #self.deep_q.train(s_batch, a_batch, r_batch, d_batch, s2_batch, observation_num)
            #self.deep_q.target_train()

        observation_num += 1
        alive_frame += 1

    print(memory.checklength())
    #print("curr action", curr_state_actions)
    #print("Total rewards from all episodes..", total_reward)
    return curr_state_actions
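# A minimal sketch of how AgentClass.train might unpack the minibatch of
# (state, action, reward, done, next_state) tuples stored above (an assumption;
# the real AgentClass implementation is not shown in this file).
def unpack_minibatch(mini_batch):
    states = np.array([e[0] for e in mini_batch])
    actions = np.array([e[1] for e in mini_batch])
    rewards = np.array([e[2] for e in mini_batch])
    dones = np.array([e[3] for e in mini_batch])
    next_states = np.array([e[4] for e in mini_batch])
    return states, actions, rewards, dones, next_states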