def process(render=False):
    """Collect random-action transitions from CartPole-v0 into a replay buffer.

    Plays `pretrain_length` random steps (both `pretrain_length` and
    `memory_size` are module-level globals), storing
    (state, action, reward, next_state) tuples in a `Memory`.

    Args:
        render: when True, draw the environment each step.

    Returns:
        (memory, state, env): the filled replay buffer, the last observed
        state, and the live gym environment.
    """
    print("CartPole main start..")
    env = gym.make('CartPole-v0')

    # Initialize the simulation, then take one random step so the pole and
    # cart are already moving when recording begins.
    env.reset()
    state, reward, done, _ = env.step(env.action_space.sample())

    memory = Memory(max_size=memory_size)

    # Pre-fill the buffer with purely random experience.
    for _ in range(pretrain_length):
        if render:
            env.render()

        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)

        if done:
            # Episode over: no real successor state exists, store zeros.
            next_state = np.zeros(state.shape)
            memory.add((state, action, reward, next_state))
            # Start a fresh episode with the same random kick-off step.
            env.reset()
            state, reward, done, _ = env.step(env.action_space.sample())
        else:
            memory.add((state, action, reward, next_state))
            state = next_state

    return memory, state, env
import gym
import numpy as np
from skimage.transform import resize
import matplotlib.pyplot as plt

from MemoryClass import Memory
from StateClass import SteteClass
# Restored: keepMemory() below calls setEnv(); with this import commented
# out the call raised NameError. NOTE(review): confirm the local `env`
# module exists in this project — if it was removed, keepMemory is dead code.
from env import setEnv
from AgentClass import AgentClass
from PIL import Image

# Smoke test of the ring-buffer Memory: with max_size=10, only the last
# 10 of the 20 items added below should remain in the buffer.
myMemory = Memory(max_size=10)
for item in range(20):
    myMemory.add(item)
print(myMemory.checkBuffer())
#print("** selected action : ", a) #print(qout) #a = myAgent.sess.run(myAgent.predict,feed_dict={myAgent.x:state_image})[0] observation, reward, done, info = env.step(action) # total step + 1 total_steps += 1 # episode reward + episode_reward += reward processed_image = preprocess(observation) next_image = np.append(state_image[:, :, 1:], processed_image[:,:,np.newaxis], axis=2) state_image /= 255.0 next_image /= 255.0 memory.add((state_image, action, reward, done, next_image)) # pre_train_steps = 10000 if total_steps > pre_train_steps: # initial : e & endE # e = 1.0 # endE = 0.1 if e > endE: e -= stepDrop # # every total_steps by 4, batch is run # if total_steps % (update_freq) == 0:
def keepMemory(memory_size=10000, pretrain_length=5000, render=False):
    """Pre-fill a replay Memory by letting the Q-network agent play SpaceInvador.

    Runs `pretrain_length` environment steps, storing
    (state, action, reward, done, next_state) transitions. Once more than
    MIN_OBSERVATION transitions exist, trains the agent on a random minibatch
    every step.

    Args:
        memory_size: capacity of the replay buffer.
        pretrain_length: number of environment steps to run.
        render: unused in the current implementation (rendering is disabled).

    Returns:
        curr_state_actions: list of recorded actions — currently always empty
        because the append is disabled; kept for interface compatibility.
    """
    envs = setEnv()
    #env = envs["BreakGame"]
    env = envs["SpaceInvador"]

    # Frame buffer of the most recent screens; the concatenated buffer is
    # the network's state input.
    stateCls = SteteClass(env)
    stateCls.initial_buffer()

    memory = Memory(max_size=memory_size)

    # Q-network agent; 6 is presumably the SpaceInvaders action-space size
    # — TODO confirm against AgentClass / the env's action_space.
    myAgent = AgentClass(6)

    MINIBATCH_SIZE = 32
    MIN_OBSERVATION = 500   # start training once this many transitions exist
    NUM_FRAMES = 3          # frames advanced per action in add_frame

    observation_num = 0
    alive_frame = 0
    total_reward = 0
    curr_state_actions = []
    MEMORY_FULL = False

    for ii in range(pretrain_length):
        # Current stacked-frame state; add a batch axis for the network.
        # BUGFIX: previously curr_state was computed once before the loop,
        # so the agent chose every action from the *initial* screen.
        init_state = stateCls.convertAndConcatenateBuffer()
        curr_state = init_state[np.newaxis, :, :, :]

        action, q_values = myAgent.get_action(curr_state)

        obs, rewards, done = stateCls.add_frame(action, NUM_FRAMES)

        if done:
            if MEMORY_FULL:
                print("memory full.....")
            print("** rewards from done ...", total_reward)
            print("** maxium lived frame .. \n", alive_frame)
            # Start a new episode.
            stateCls.envReset()
            alive_frame = 0
            total_reward = 0

        new_state = stateCls.convertAndConcatenateBuffer()
        memory.add((init_state, action, rewards, done, new_state))
        total_reward += rewards

        if memory.checklength() > MIN_OBSERVATION:
            MEMORY_FULL = True
            # Sample a minibatch of stored transitions and train the network.
            mini_batch = memory.sample(MINIBATCH_SIZE)
            myAgent.train(mini_batch)

        observation_num += 1
        alive_frame += 1

    print(memory.checklength())
    return curr_state_actions