def playGame(train_indicator=1):    # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001       # Target network hyperparameter
    LRA = 0.00005     # Learning rate for Actor
    LRC = 0.0005      # Learning rate for Critic

    action_dim = 3    # Steering/Acceleration/Brake
    state_dim = 29    # Number of sensor inputs

    np.random.seed(1337)

    vision = False
    EXPLORE = 200000.
    if train_indicator:
        episode_count = 1000
    else:
        episode_count = 20
    max_steps = 4000
    step = 0
    if train_indicator:
        epsilon = 1
    else:
        epsilon = 0
    min_laptime = 10000000

    # TensorFlow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    # Create replay buffer

    # Generate a TORCS environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Load the network weights
    print("Now we load the weight")
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("saved_networks/")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))

        total_reward = 0.
        # totalLaptime = 0.

        for j in range(max_steps):
            loss = 0
            if train_indicator:
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, 0.10)
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code applies the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0], train_indicator)

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                              ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)    # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])   # placeholder of the right batch length, overwritten below

            target_q_values = critic.target_predict(new_states, actor.target_predict(new_states))

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.train_on_batch(states, actions, y_t)
                a_for_grad = actor.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if np.mod(step, 100) == 0:
                print("Episode", i, "Step", step, "Epsilon", epsilon, "Action", a_t,
                      "Reward", r_t, "Loss", loss)  # , "curLapTime", ob.curLapTime

            step += 1
            if i == 0:
                break
            if done:
                break

        # if np.mod(i, 3) == 0:
        if (train_indicator) and i > 0:
            if env.lapTime < min_laptime and env.num_lap == 10:
                min_laptime = env.lapTime
                print("Now we save model")
                saver.save(sess, 'saved_networks/' + 'network' + '-ddpg-{}'.format(i))

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
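# The listings above and below call OU.function(x, mu, theta, sigma) for exploration
# noise but never define it. The sketch below is a minimal Ornstein-Uhlenbeck noise
# term consistent with those calls; the class name and its placement are assumptions,
# not the original module.
import numpy as np

class OU(object):
    """Ornstein-Uhlenbeck exploration noise, matching OU.function(x, mu, theta, sigma)."""

    @staticmethod
    def function(x, mu, theta, sigma):
        # Mean-reverting drift pulls x toward mu; sigma scales the Gaussian perturbation.
        return theta * (mu - x) + sigma * np.random.randn()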
actions = np.asarray([e[1] for e in batch])
rewards = np.asarray([e[2] for e in batch])
new_states = np.asarray([e[3] for e in batch])
dones = np.asarray([e[4] for e in batch])

# Set target: y_i = r_i + gamma * target_critic_network(s_{i+1}, target_actor_network(s_{i+1}))
new_states = new_states.reshape([len(batch), new_states.shape[1]])
target_q_values = critic.target_predict(new_states, actor.target_predict(new_states))

y_t = []
for i in range(len(batch)):
    if dones[i]:
        y_t.append(rewards[i])
    else:
        y_t.append(rewards[i] + GAMMA * target_q_values[i])

# Update the critic network by minimizing the loss L = 1/N * sum((y_i - critic_network(s_i, a_i))**2)
y_t = np.array(y_t).reshape([len(y_t), 1])
critic.train(y_t, states, actions)
if done:
    dt = 1
else:
    dt = 0

# Store the transition in the replay buffer
buff.add(s_t, a_t[0], r_t, s_t1, dt)

# Sample a random minibatch of N transitions (s_i, a_i, r_i, s_{i+1}) from the replay buffer
batch = buff.getBatch(BATCH_SIZE)
states = np.asarray([e[0] for e in batch])
actions = np.asarray([e[1] for e in batch])
rewards = np.asarray([e[2] for e in batch])
new_states = np.asarray([e[3] for e in batch])
dones = np.asarray([e[4] for e in batch])

# Vectorized target: y_i = r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1})) * (1 - done_i)
target_q_values = critic.target_predict(new_states, actor.target_predict(new_states))
rt = rewards.reshape(rewards.size, 1)
dones = dones.reshape(dones.size, 1)
y_t = rt + GAMMA * target_q_values * (1 - dones)

# Update the critic network by minimizing the loss L = 1/N * sum((y_i - critic_network(s_i, a_i))**2)
critic.train(y_t, states, actions)

# Update the actor policy using the sampled policy gradient
a_for_grad = actor.predict(states)
grads = critic.gradients(states, a_for_grad)
actor.train(states, grads)

# Update the target networks
actor.target_train()
critic.target_train()
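# Every listing in this section assumes a replay buffer with add/getBatch/count
# methods (ReplayBuffer here, ReplayMemory with the same interface in the agent
# class below), but its implementation is not shown. The following is a minimal
# sketch consistent with how it is called; the internals are an assumption.
import random
from collections import deque

class ReplayBuffer(object):
    """Fixed-size experience replay matching the add/getBatch/count calls above."""

    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)   # oldest experiences are evicted first

    def add(self, state, action, reward, new_state, done):
        # Store the 5-tuple exactly as the training loops unpack it.
        self.buffer.append((state, action, reward, new_state, done))

    def getBatch(self, batch_size):
        # Uniform random sample; return everything while the buffer is still small.
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def count(self):
        return len(self.buffer)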
def playGame(train_indicator=1):    # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001      # Target network hyperparameter
    LRA = 0.0001     # Learning rate for Actor
    LRC = 0.001      # Learning rate for Critic

    action_dim = 3              # Steering/Acceleration/Brake
    state_dim = [64, 64, 3]     # Image observation: a single 64x64 RGB frame

    np.random.seed(1337)

    vision = True    # change vision to True
    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # TensorFlow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    # Create replay buffer

    # Generate a TORCS environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Load the network weights
    try:
        saver_actor = tf.train.Saver(var_list=actor.weights)
        saver_actor.restore(sess, 'actor_weights.ckpt')
        saver_critic = tf.train.Saver(var_list=critic.weights)
        saver_critic.restore(sess, 'critic_weights.ckpt')
        print("Weight load successfully")
    except Exception:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        # s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
        #                  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        s_t = ob.img

        total_reward = 0.

        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.predict(s_t.reshape(state_dim))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code applies the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            # s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
            #                   ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
            s_t1 = ob.img.reshape(state_dim)

            buff.add(s_t, a_t[0], r_t, s_t1, done)    # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])   # placeholder of the right batch length, overwritten below

            target_q_values = critic.target_predict(new_states, actor.target_predict(new_states))

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.train(states, actions, y_t)
                a_for_grad = actor.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                # actor.model.save_weights("actormodel.h5", overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)
                saver_actor = tf.train.Saver(var_list=actor.weights)
                saver_actor.save(sess, 'actor_weights.ckpt')
                # critic.model.save_weights("criticmodel.h5", overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)
                saver_critic = tf.train.Saver(var_list=critic.weights)
                saver_critic.save(sess, 'critic_weights.ckpt')

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
class DriverAgent:
    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DriverAgent'   # name for uploading results
        self.env_name = env_name
        # Randomly initialize the actor and critic networks
        # together with their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        # TensorFlow session
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        # Actor & critic networks
        self.actor = ActorNetwork(self.sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
        self.critic = CriticNetwork(self.sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)

        # Replay memory
        self.memory = ReplayMemory(MEMORY_SIZE)

        # Loss value
        self.loss = 0

        # Loading networks; modify as you want
        self.saver = tf.train.Saver()
        if not os.path.exists(ckp_dir):
            print("Could not find old network weights")
        else:
            self.saver.restore(self.sess, os.path.join(ckp_dir, ckp_name))
            print("Successfully loaded:", ckp_name)

    # Training step
    def train(self, state, action, reward, next_state, done):
        # Add the transition to the replay memory
        if not math.isnan(reward):
            self.memory.add(state, action, reward, next_state, done)
        if self.memory.count() <= START_REPLAY:
            return

        # Sample a batch from the replay memory
        batch = self.memory.getBatch(BATCH_SIZE)
        states = np.asarray([e[0] for e in batch])
        actions = np.asarray([e[1] for e in batch])
        rewards = np.asarray([e[2] for e in batch])
        new_states = np.asarray([e[3] for e in batch])
        dones = np.asarray([e[4] for e in batch])

        # Get the target Q values from the target critic network
        target_Q = self.critic.target_predict([new_states, self.actor.target_predict(new_states)])

        # Compute the TD targets y_i = r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1}))
        y_t = []
        for i in range(len(batch)):
            if dones[i]:
                y_t.append(rewards[i])
            else:
                y_t.append(rewards[i] + GAMMA * target_Q[i])
        y_t = np.resize(y_t, [BATCH_SIZE, 1])

        # Compute the loss and gradients for each network, and train both
        _, loss = self.critic.train([states, actions], y_t)
        a_for_grad = self.actor.predict(states)
        grads = self.critic.gradients(states, a_for_grad)
        self.actor.train(states, grads)
        self.actor.target_train()
        self.critic.target_train()

    # Save your own network
    def saveNetwork(self, episode):
        if not os.path.exists(ckp_dir):
            os.mkdir(ckp_dir)
        ckp_name_real = ckp_name + '_' + str(episode)
        self.saver.save(self.sess, os.path.join(ckp_dir, ckp_name_real))

    def action(self, state):
        # Return an action for the given state, without exploration noise
        action = np.zeros([self.action_dim])
        action_pre = self.actor.predict([state])

        action[0] = np.clip(action_pre[0][0], -1, 1)
        action[1] = np.clip(action_pre[0][1], 0, 1)
        action[2] = np.clip(action_pre[0][2], 0, 1)

        return action

    def noise_action(self, state, epsilon):
        # Return an action according to the current policy plus exploration noise
        action = np.zeros([self.action_dim])
        noise = np.zeros([self.action_dim])
        action_pre = self.actor.predict([state])

        noise[0] = epsilon * OU.function(action_pre[0][0], 0.0, 0.80, 0.60)
        noise[1] = epsilon * OU.function(action_pre[0][1], 0.7, 1.00, 0.10)
        noise[2] = epsilon * OU.function(action_pre[0][2], -0.1, 1.00, 0.05)

        action[0] = np.clip(action_pre[0][0] + noise[0], -1, 1)
        action[1] = np.clip(action_pre[0][1] + noise[1], 0, 1)
        action[2] = np.clip(action_pre[0][2] + noise[2], 0, 1)

        return action
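# A hypothetical episode loop showing how DriverAgent is meant to be driven. The
# environment construction and state vector follow the earlier playGame listings;
# the episode/step limits and the epsilon schedule here are illustrative, not the
# original training script.
env = TorcsEnv(vision=False, throttle=True, gear_change=False)
agent = DriverAgent(env_name='Torcs', state_dim=29, action_dim=3)

epsilon = 1.0
for episode in range(1000):
    ob = env.reset(relaunch=(episode % 3 == 0))   # relaunch periodically (TORCS memory leak)
    s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                     ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
    for step in range(4000):
        a_t = agent.noise_action(s_t, epsilon)    # exploration noise scaled by epsilon
        ob, r_t, done, info = env.step(a_t)
        s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                          ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        agent.train(s_t, a_t, r_t, s_t1, done)    # stores the transition, updates once warmed up
        s_t = s_t1
        epsilon = max(epsilon - 1.0 / 100000., 0.1)
        if done:
            break
    agent.saveNetwork(episode)

env.end()   # This is for shutting down TORCS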