import copy
import json
import logging
import math
import os
import pickle
import time

import cv2
import numpy as np
import tensorflow as tf
from keras import backend as K

# Project-local dependencies (AbstractAgent, ActorNetwork, CriticNetwork,
# Memory, OUProcess, Grapher, ReplayBuffer, TorcsEnv) and the hyperparameter
# constants in CAPS (BATCH_SIZE, TAU, LRA, LRC, GAMMA, EPISODE_LENGTH, ...)
# are assumed to be provided by the surrounding package.


class APLDDPGAgent(AbstractAgent):
    name = "apl_ddpg"

    def __init__(self, env, iter=200000, *args, **kwargs):
        # The actor and critic models themselves are created in fit().
        self.env = env
        self.action_dim = sum(
            sum(1 for i in row if i)
            for row in self.env.action_space.sample())
        self.observation = env.reset()
        self.state_dim = self.observation.shape
        print("state dim: " + str(self.state_dim))
        self.nn_action_dim = 6  # limit DDPG network output to 3 DOF
        self.noise = OUProcess(self.nn_action_dim,
                               mu=OU_MEAN,
                               theta=OU_THETA,
                               sigma=EPSILON_RANGE[0])

    def fit(self, *args, **kwargs):
        MEM_SZ = MEM_SIZE_FCL

        sess = K.get_session()
        K.set_learning_phase(1)

        self.actor = ActorNetwork(sess, self.state_dim, self.nn_action_dim,
                                  BATCH_SIZE, TAU, LRA,
                                  convolutional=CONVOLUTIONAL,
                                  output_activation=ACTION_ACTIVATION)
        self.critic = CriticNetwork(sess, self.state_dim, self.nn_action_dim,
                                    BATCH_SIZE, TAU, LRC,
                                    convolutional=CONVOLUTIONAL)

        self.memory = Memory(MEM_SZ)

        self.actor.target_model.summary()
        self.critic.target_model.summary()

        if LOAD_WEIGHTS:
            self.actor.model.load_weights(
                LOAD_WEIGHTS_PREFIX + "actor_model_" +
                LOAD_WEIGHTS_EPISODE + ".h5")
            self.critic.model.load_weights(
                LOAD_WEIGHTS_PREFIX + "critic_model_" +
                LOAD_WEIGHTS_EPISODE + ".h5")
            self.actor.target_model.load_weights(
                LOAD_WEIGHTS_PREFIX + "actor_target_model_" +
                LOAD_WEIGHTS_EPISODE + ".h5")
            self.critic.target_model.load_weights(
                LOAD_WEIGHTS_PREFIX + "critic_target_model_" +
                LOAD_WEIGHTS_EPISODE + ".h5")
            print("Weights Loaded!")

        # ====================================================
        # Per-action noise processes, kept for reference:
        # self.noise_procs = []
        # for i in range(NUM_NOISE_PROCS):
        #     self.noise_procs.append(OUProcess(OU_MEAN, OU_THETA, OU_STD_DEV))
        # ====================================================

        PRE_LEARNING_EPISODES = STARTING_EPISODE + PRE_LEARNING_EPS
        steps = STARTING_EPISODE * EPISODE_LENGTH
        start_time = time.time()
        last_ep_time = time.time()
        if MAKE_PLOT:
            reward_graph = Grapher()

        for ep in range(STARTING_EPISODE, EPISODES):
            # Reset the noise process at each episode boundary.
            self.noise.reset()

            # Start the time counter once pre-learning is over.
            if ep == PRE_LEARNING_EPISODES:
                start_time = time.time()

            print("Episode: " + str(ep) + " Frames: " +
                  str(ep * EPISODE_LENGTH) + " Uptime: " +
                  str((time.time() - start_time) / 3600.0) +
                  " hrs ===========")

            state = self.env.reset()

            play_only = (ep % 10 == 0)
            total_reward = 0

            if play_only or ALREADY_TRAINED:
                for step in range(TEST_EPISODE_LENGTH):
                    # Add a channel dimension for the network input.
                    state = np.reshape(state, state.shape + (1,))
                    action, control_action = self.selectAction(
                        state, can_be_random=False, use_target=True)
                    nstate, reward, done, info = self.env.step(control_action)
                    total_reward += reward
                    state = nstate
            else:
                for step in range(EPISODE_LENGTH):
                    # ACT ==============================
                    epsilon = (float(steps) / float(EPSILON_STEPS)) * (
                        EPSILON_RANGE[1] - EPSILON_RANGE[0]) + EPSILON_RANGE[0]

                    state = np.reshape(state, state.shape + (1,))
                    action, control_action = self.selectAction(
                        state, epsilon=epsilon)
                    new_state, reward, done, info = self.env.step(
                        control_action)
                    # step is 0-based, so the last step is EPISODE_LENGTH - 1
                    # (the original compared against EPISODE_LENGTH, which
                    # never triggered).
                    done = done or (step >= EPISODE_LENGTH - 1)
                    self.memory.addMemory(state, action, reward, new_state,
                                          done)
                    state = new_state

                    # LEARN ============================
                    if ep > PRE_LEARNING_EPISODES:
                        batch, idxs = self.memory.getMiniBatch(BATCH_SIZE)
                        self.learnFromBatch(batch)

                    if done:
                        break

                    # CLEANUP ==========================
                    steps += 1

            # We need the episodes without noise to actually tell how the
            # system is doing.
            if play_only and MAKE_PLOT:
                reward_graph.addSample(total_reward)
                reward_graph.displayPlot()

            # Calculate fph on total frames.
            total_frames = (ep - PRE_LEARNING_EPISODES) * EPISODE_LENGTH
            elapsed = time.time() - start_time
            fps = total_frames / elapsed
            fph = fps * 3600.0

            # Re-calculate fps on this episode alone, so it updates quickly.
            fps = EPISODE_LENGTH / (time.time() - last_ep_time)
            last_ep_time = time.time()
            print("fps: " + str(fps) + " fph: " + str(fph) + "\n")

            # Save plot and weights.
            if (ep > 0 and ep % EPISODE_SAVE_FREQUENCY == 0) \
                    and not ALREADY_TRAINED:
                # Plot
                if MAKE_PLOT:
                    reward_graph.savePlot(SAVE_WEIGHTS_PREFIX + "graph_" +
                                          str(ep) + ".jpg")

                # Weights
                self.actor.model.save_weights(
                    SAVE_WEIGHTS_PREFIX + "actor_model_" + str(ep) + ".h5",
                    overwrite=True)
                self.actor.target_model.save_weights(
                    SAVE_WEIGHTS_PREFIX + "actor_target_model_" + str(ep) +
                    ".h5", overwrite=True)
                self.critic.model.save_weights(
                    SAVE_WEIGHTS_PREFIX + "critic_model_" + str(ep) + ".h5",
                    overwrite=True)
                self.critic.target_model.save_weights(
                    SAVE_WEIGHTS_PREFIX + "critic_target_model_" + str(ep) +
                    ".h5", overwrite=True)

                # Network structures (not currently read back anywhere).
                with open(SAVE_WEIGHTS_PREFIX + "actor_model_" + str(ep) +
                          ".json", "w") as outfile:
                    json.dump(self.actor.model.to_json(), outfile)
                with open(SAVE_WEIGHTS_PREFIX + "actor_target_model_" +
                          str(ep) + ".json", "w") as outfile:
                    json.dump(self.actor.target_model.to_json(), outfile)
                with open(SAVE_WEIGHTS_PREFIX + "critic_model_" + str(ep) +
                          ".json", "w") as outfile:
                    json.dump(self.critic.model.to_json(), outfile)
                with open(SAVE_WEIGHTS_PREFIX + "critic_target_model_" +
                          str(ep) + ".json", "w") as outfile:
                    json.dump(self.critic.target_model.to_json(), outfile)

    def learnFromBatch(self, miniBatch):
        dones = np.asarray([sample['isFinal'] for sample in miniBatch])
        states = np.asarray([sample['state'] for sample in miniBatch])
        actions = np.asarray([sample['action'] for sample in miniBatch])
        new_states = np.asarray([sample['newState'] for sample in miniBatch])
        Y_batch = np.asarray([sample['reward'] for sample in miniBatch])

        # Stored states already carry the channel dimension; the successor
        # states are raw and need it added here.
        new_states = np.reshape(new_states, new_states.shape + (1,))

        target_q_values = self.critic.target_model.predict(
            [new_states, self.actor.target_model.predict(new_states)])

        for i in range(len(miniBatch)):
            if not dones[i]:
                Y_batch[i] = Y_batch[i] + GAMMA * target_q_values[i]

        self.critic.model.train_on_batch([states, actions], Y_batch)

        # Additional operations to train the actor.
        temp_actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, temp_actions)
        self.actor.train(states, grads)

        # Update the target networks.
        self.actor.target_train()
        self.critic.target_train()

    # This is wrong, I think (kept for reference):
    # def OU(x, mu, theta, sigma):
    #     return theta * (mu - x) + sigma * np.random.randn(1)

    def clip(self, x, minx, maxx):
        return max(minx, min(maxx, x))

    def selectAction(self, state, can_be_random=True, use_target=False,
                     epsilon=1.0, permutation_num=0):
        state = np.array([state])  # add a dimension to make a "batch" of 1

        if use_target:
            actions = self.actor.target_model.predict(state)
        else:
            actions = self.actor.model.predict(state)
        actions = np.squeeze(actions)

        if can_be_random:
            self.noise.sigma = epsilon
            noise = self.noise.noise()
            for i in range(len(actions)):
                actions[i] = actions[i] + noise[i]
                # Need to assign to actions[i], not just a loop variable,
                # so the array itself is updated.
                actions[i] = self.clip(actions[i], -3.14, 3.14)

            # Kept for reference: per-agent noise processes for the co-op
            # case (non-coop iterates over all processes, co-op over a
            # single agent's slice):
            # for n in range(permutation_num * ACTIONS_PER_AGENT,
            #                permutation_num * ACTIONS_PER_AGENT +
            #                self.action_dim):
            #     ou = self.noise_procs[n]
            #     noise.append(ou.step())

        # Fill in zeros for all non-learned outputs.
        control_actions = np.pad(actions,
                                 (0, self.action_dim - len(actions)),
                                 'constant')

        return actions, control_actions

    # Constructs an image from the state vector.
    def constructImageRepresentation(self, state):
        img = np.empty([IMAGE_SIDE_LENGTH, IMAGE_SIDE_LENGTH], dtype=np.uint8)
        img.fill(128)

        color = 255
        delta_color = int(math.floor(128 / NUM_TARGETS))
        for j in range(NUM_TARGETS):
            tar = [state[2 * j], state[2 * j + 1]]
            cv2.circle(img, (int(tar[0] * IMAGE_SIDE_LENGTH),
                             int(tar[1] * IMAGE_SIDE_LENGTH)), 5, 0, -1)
            cv2.circle(img, (int(tar[0] * IMAGE_SIDE_LENGTH),
                             int(tar[1] * IMAGE_SIDE_LENGTH)), 4, color, -1)
            color -= delta_color

        color = 0
        for j in range(NUM_AGENTS):
            offset = 2 * NUM_TARGETS
            agent = [state[offset + 2 * j], state[offset + 2 * j + 1]]
            # Draw a blank agent, no thrust display.
            cv2.rectangle(img,
                          (int(agent[0] * IMAGE_SIDE_LENGTH) - 4,
                           int(agent[1] * IMAGE_SIDE_LENGTH) - 1),
                          (int(agent[0] * IMAGE_SIDE_LENGTH) + 4,
                           int(agent[1] * IMAGE_SIDE_LENGTH) + 1),
                          color, -1)
            cv2.rectangle(img,
                          (int(agent[0] * IMAGE_SIDE_LENGTH) - 1,
                           int(agent[1] * IMAGE_SIDE_LENGTH) - 4),
                          (int(agent[0] * IMAGE_SIDE_LENGTH) + 1,
                           int(agent[1] * IMAGE_SIDE_LENGTH) + 4),
                          color, -1)
            # First agent is 0 since we control it; the others share a color.
            color = 64

        # Debug visualization (disabled):
        # cv2.namedWindow('perm_image', cv2.WINDOW_NORMAL)
        # cv2.resizeWindow('perm_image', 600, 600)
        # cv2.imshow('perm_image', img)
        # cv2.waitKey(1)

        img = np.array([np.subtract(img, 128)], dtype=np.float32)  # zero center
        img = np.multiply(img, 1.0 / 128.0)  # scale to [-1, 1]
        img = np.transpose(img, (1, 2, 0))
        return img

    # For the co-op case, get an arrangement of the state vector per agent.
    def getStatePermutations(self, state):
        perms = []
        for i in range(NUM_AGENTS):
            if CONVOLUTIONAL and not DRAW_STATE:
                perms.append(state)
            else:
                pstate = []

                # Copy over the target data.
                for j in range(NUM_TARGETS * 2):
                    pstate.append(state[j])

                # Copy the agent data, rotated per agent.
                for j in range(NUM_AGENTS * 2):
                    rot_j = (j + (i * 2)) % (NUM_AGENTS * 2) + \
                        (NUM_TARGETS * 2)
                    pstate.append(state[rot_j])

                if DRAW_STATE:
                    perms.append(self.constructImageRepresentation(pstate))
                else:
                    perms.append(np.asarray(pstate, dtype=np.float32))
        return perms
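# ---------------------------------------------------------------------------
# OUProcess and Memory, used by APLDDPGAgent above, are defined elsewhere in
# the project. The sketches below are NOT the project's implementations;
# they are minimal, hypothetical stand-ins inferred from the call sites
# (constructor arguments, .reset()/.noise(), and addMemory()/getMiniBatch()
# returning dict samples plus their indices), included only to make the
# assumed interfaces concrete.
# ---------------------------------------------------------------------------
import random


class OUProcessSketch(object):
    """Ornstein-Uhlenbeck noise: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.size = size
        self.mu = mu
        self.theta = theta
        self.sigma = sigma  # mutated externally to anneal exploration
        self.reset()

    def reset(self):
        # Restart the process at its mean.
        self.state = np.ones(self.size) * self.mu

    def noise(self):
        # Mean-reverting drift plus Gaussian diffusion.
        self.state = self.state + self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.size)
        return self.state


class MemorySketch(object):
    """FIFO replay memory returning dict samples and their indices."""

    def __init__(self, size):
        self.size = size
        self.buffer = []

    def addMemory(self, state, action, reward, newState, isFinal):
        # Drop the oldest transition once the buffer is full.
        if len(self.buffer) >= self.size:
            self.buffer.pop(0)
        self.buffer.append(dict(state=state, action=action, reward=reward,
                                newState=newState, isFinal=isFinal))

    def getMiniBatch(self, batch_size):
        # Sample without replacement, as learnFromBatch expects one dict
        # per transition.
        idxs = random.sample(range(len(self.buffer)),
                             min(batch_size, len(self.buffer)))
        return [self.buffer[i] for i in idxs], idxs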
def run_ddpg(amodel, cmodel, train_indicator=0, seeded=1337,
             track_name='practgt2.xml'):
    OU = FunctionOU()
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     # Target network hyperparameter
    LRA = 0.0001    # Learning rate for actor
    LRC = 0.001     # Learning rate for critic
    ALPHA = 0.9

    action_dim = 3  # Steering / Acceleration / Brake
    state_dim = 29  # Number of sensor inputs

    np.random.seed(seeded)

    vision = False

    EXPLORE = 100000.
    if train_indicator:
        episode_count = 600
    else:
        episode_count = 3
    max_steps = 20000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # TensorFlow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # create replay buffer

    # Generate a TORCS environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False,
                   track_name=track_name)

    if not train_indicator:
        print("Now we load the weights")
        try:
            actor.model.load_weights(amodel)
            critic.model.load_weights(cmodel)
            actor.target_model.load_weights(amodel)
            critic.target_model.load_weights(cmodel)
            print("Weights loaded successfully")
        except Exception:
            print("Cannot find the weights")
            exit()

    print("TORCS Experiment Start.")
    best_lap = 500

    for i_episode in range(episode_count):
        print("Episode : " + str(i_episode) + " Replay Buffer " +
              str(buff.count()))

        if np.mod(i_episode, 3) == 0:
            # Relaunch TORCS every third episode because of its memory leak.
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0,
                         ob.track))

        total_reward = 0.

        for j_iter in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                              ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0,
                              ob.track))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            # One scalar target per transition (the original initialized
            # this from the action column, giving it the wrong shape).
            y_t = np.zeros((states.shape[0], 1))

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i_episode, "Step", step, "Action", a_t,
                  "Reward", r_t, "Loss", loss)

            if np.mod(step, 1000) == 0:
                logging.info("Episode {}, Distance {}, Last Lap {}".format(
                    i_episode, ob.distRaced, ob.lastLapTime))
                if ob.lastLapTime > 0:
                    # Track the fastest completed lap (the original compared
                    # in the wrong direction and never updated best_lap).
                    if ob.lastLapTime < best_lap:
                        best_lap = ob.lastLapTime

            step += 1
            if done:
                break

        if train_indicator and i_episode > 20:
            if np.mod(i_episode, 3) == 0:
                logging.info("Now we save the model")
                actor.model.save_weights("ddpg_actor_weights_periodic.h5",
                                         overwrite=True)
                critic.model.save_weights("ddpg_critic_weights_periodic.h5",
                                          overwrite=True)

        print("TOTAL REWARD @ " + str(i_episode) + "-th Episode : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("Best Lap {}".format(best_lap))
        print("")
        logging.info("TOTAL REWARD @ " + str(i_episode) +
                     "-th Episode : Reward " + str(total_reward))
        logging.info("Best Lap {}".format(best_lap))

    env.end()  # this is for shutting down TORCS
    logging.info("Finish.")
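# ---------------------------------------------------------------------------
# FunctionOU is used by run_ddpg above and by NeuralAgent below but defined
# elsewhere in the project. A plausible minimal stand-in, matching the
# commented-out OU() inside APLDDPGAgent (a stateless Ornstein-Uhlenbeck
# drift term); the project's own definition may differ.
# ---------------------------------------------------------------------------
class FunctionOUSketch(object):
    def function(self, x, mu, theta, sigma):
        # theta pulls x back toward mu; sigma scales the Gaussian
        # exploration term, e.g.:
        #   noise = max(epsilon, 0) * ou.function(a, 0.0, 0.60, 0.30)
        return theta * (mu - x) + sigma * np.random.randn(1)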
class NeuralAgent():
    def __init__(self, track_name='practgt2.xml'):
        BUFFER_SIZE = 100000
        TAU = 0.001     # Target network hyperparameter
        LRA = 0.0001    # Learning rate for actor
        LRC = 0.001     # Learning rate for critic
        state_dim = 29  # Number of sensor inputs
        self.batch_size = 32
        self.lambda_mix = 10.0
        self.action_dim = 3  # Steering / Acceleration / Brake

        # TensorFlow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        K.set_session(sess)

        self.actor = ActorNetwork(sess, state_dim, self.action_dim,
                                  self.batch_size, TAU, LRA)
        self.critic = CriticNetwork(sess, state_dim, self.action_dim,
                                    self.batch_size, TAU, LRC)
        self.buff = ReplayBuffer(BUFFER_SIZE)  # create replay buffer
        self.track_name = track_name

        self.save = dict(total_reward=[], total_step=[], ave_reward=[],
                         distRaced=[], distFromStart=[], lastLapTime=[],
                         curLapTime=[], lapTimes=[], avelapTime=[],
                         ave_sp=[], max_sp=[], min_sp=[],
                         test_total_reward=[], test_total_step=[],
                         test_ave_reward=[], test_distRaced=[],
                         test_distFromStart=[], test_lastLapTime=[],
                         test_curLapTime=[], test_lapTimes=[],
                         test_avelapTime=[], test_ave_sp=[],
                         test_max_sp=[], test_min_sp=[])

    def rollout(self, env):
        max_steps = 10000
        vision = False

        # zhichen: it is not stable to have two TORCS envs and UDP
        # connections, so reuse the caller's env:
        # env = TorcsEnv(vision=vision, throttle=True, gear_change=False,
        #                track_name=self.track_name)

        ob = env.reset(relaunch=True)
        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0,
                         ob.track))

        total_reward = 0.
        sp = []
        lastLapTime = []

        for j_iter in range(max_steps):
            a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            a_t = a_t[0]
            a_t[0] = clip(a_t[0], -1, 1)
            a_t[1] = clip(a_t[1], 0, 1)
            a_t[2] = clip(a_t[2], 0, 1)

            ob, r_t, done, info = env.step(a_t)

            sp.append(info['speed'])

            # Record each newly completed lap exactly once.
            if not lastLapTime:
                if info['lastLapTime'] > 0:
                    lastLapTime.append(info['lastLapTime'])
            elif (info['lastLapTime'] > 0
                  and lastLapTime[-1] != info['lastLapTime']):
                lastLapTime.append(info['lastLapTime'])

            if np.mod(j_iter + 1, 20) == 0:
                logging.info('step: ' + str(j_iter + 1))
                print('\n ob: ', ob)

            s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                             ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0,
                             ob.track))

            total_reward += r_t
            if done:
                break

        logging.info("Test Episode Reward: " + str(total_reward) +
                     " Episode Length: " + str(j_iter + 1) +
                     " Ave Reward: " + str(total_reward / (j_iter + 1)) +
                     "\n Distance: " + str(info['distRaced']) + ' ' +
                     str(info['distFromStart']) +
                     "\n Last Lap Times: " + str(info['lastLapTime']) +
                     " Cur Lap Times: " + str(info['curLapTime']) +
                     " lastLapTime: " + str(lastLapTime) +
                     "\n ave sp: " + str(np.mean(sp)) +
                     " max sp: " + str(np.max(sp)))

        ave_sp = np.mean(sp)
        max_sp = np.max(sp)
        min_sp = np.min(sp)
        return (total_reward, j_iter + 1, info, ave_sp, max_sp, min_sp,
                lastLapTime)

    def update_neural(self, controllers, episode_count=200, tree=False,
                      seed=1337):
        OU = FunctionOU()
        vision = False
        GAMMA = 0.99
        EXPLORE = 100000.
        max_steps = 10000
        reward = 0
        done = False
        step = 0
        epsilon = 1

        if not tree:
            steer_prog, accel_prog, brake_prog = controllers

        # Generate a TORCS environment
        env = TorcsEnv(vision=vision, throttle=True, gear_change=False,
                       track_name=self.track_name)

        window = 5
        lambda_store = np.zeros((max_steps, 1))
        lambda_max = 40.
        factor = 0.8

        logging.info("TORCS Experiment Start with Lambda = " +
                     str(self.lambda_mix))

        for i_episode in range(episode_count):
            logging.info("Episode : " + str(i_episode) + " Replay Buffer " +
                         str(self.buff.count()))

            if np.mod(i_episode, 3) == 0:
                logging.info('relaunch TORCS')
                # Relaunch TORCS every third episode because of its memory
                # leak.
                ob = env.reset(relaunch=True)
            else:
                logging.info('reset TORCS')
                ob = env.reset()

            s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                             ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0,
                             ob.track))

            total_reward = 0.

            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm], list(ob.wheelSpinVel / 100.0),
                       list(ob.track), [0, 0, 0]]
            window_list = [tempObs[:] for _ in range(window)]

            sp = []
            lastLapTime = []

            for j_iter in range(max_steps):
                # Get the prior controller's action, either from a learned
                # tree or from the PID programs.
                if tree:
                    tree_obs = [sensor for obs in tempObs[:-1]
                                for sensor in obs]
                    act_tree = controllers.predict([tree_obs])
                    steer_action = clip_to_range(act_tree[0][0], -1, 1)
                    accel_action = clip_to_range(act_tree[0][1], 0, 1)
                    brake_action = clip_to_range(act_tree[0][2], 0, 1)
                else:
                    steer_action = clip_to_range(
                        steer_prog.pid_execute(window_list), -1, 1)
                    accel_action = clip_to_range(
                        accel_prog.pid_execute(window_list), 0, 1)
                    brake_action = clip_to_range(
                        brake_prog.pid_execute(window_list), 0, 1)
                action_prior = [steer_action, accel_action, brake_action]

                tempObs = [[ob.speedX], [ob.angle], [ob.trackPos],
                           [ob.speedY], [ob.speedZ], [ob.rpm],
                           list(ob.wheelSpinVel / 100.0), list(ob.track),
                           action_prior]
                window_list.pop(0)
                window_list.append(tempObs[:])

                loss = 0
                epsilon -= 1.0 / EXPLORE
                a_t = np.zeros([1, self.action_dim])
                noise_t = np.zeros([1, self.action_dim])

                a_t_original = self.actor.model.predict(
                    s_t.reshape(1, s_t.shape[0]))
                noise_t[0][0] = max(epsilon, 0) * OU.function(
                    a_t_original[0][0], 0.0, 0.60, 0.30)
                noise_t[0][1] = max(epsilon, 0) * OU.function(
                    a_t_original[0][1], 0.5, 1.00, 0.10)
                noise_t[0][2] = max(epsilon, 0) * OU.function(
                    a_t_original[0][2], 0, 1.00, 0.05)

                a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
                a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
                a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

                # Mix the network action with the control prior.
                mixed_act = [
                    a_t[0][k_iter] / (1 + self.lambda_mix) +
                    (self.lambda_mix / (1 + self.lambda_mix)) *
                    action_prior[k_iter] for k_iter in range(3)
                ]

                ob, r_t, done, info = env.step(mixed_act)

                sp.append(info['speed'])

                if not lastLapTime:
                    if info['lastLapTime'] > 0:
                        lastLapTime.append(info['lastLapTime'])
                elif (info['lastLapTime'] > 0
                      and lastLapTime[-1] != info['lastLapTime']):
                    lastLapTime.append(info['lastLapTime'])

                s_t1 = np.hstack((ob.speedX, ob.angle, ob.trackPos,
                                  ob.speedY, ob.speedZ, ob.rpm,
                                  ob.wheelSpinVel / 100.0, ob.track))

                self.buff.add(s_t, a_t[0], r_t, s_t1, done)  # add to buffer

                # Do the batch update
                batch = self.buff.getBatch(self.batch_size)
                states = np.asarray([e[0] for e in batch])
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                dones = np.asarray([e[4] for e in batch])
                y_t = np.zeros((states.shape[0], 1))

                target_q_values = self.critic.target_model.predict(
                    [new_states,
                     self.actor.target_model.predict(new_states)])

                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k]
                    else:
                        y_t[k] = rewards[k] + GAMMA * target_q_values[k]

                loss += self.critic.model.train_on_batch([states, actions],
                                                         y_t)
                a_for_grad = self.actor.model.predict(states)
                grads = self.critic.gradients(states, a_for_grad)
                self.actor.train(states, grads)
                self.actor.target_train()
                self.critic.target_train()

                total_reward += r_t
                s_t = s_t1

                # Control-prior mixing term; base_q holds the previous
                # step's target Q-values.
                if j_iter > 0 and i_episode > 50:
                    lambda_track = lambda_max * (1 - np.exp(-factor * np.abs(
                        r_t + GAMMA * np.mean(target_q_values[-1] -
                                              base_q[-1]))))
                    lambda_track = np.squeeze(lambda_track)
                else:
                    lambda_track = 10.
                lambda_store[j_iter] = lambda_track
                base_q = copy.deepcopy(target_q_values)

                if np.mod(step, 2000) == 0:
                    logging.info("Episode " + str(i_episode) + " Distance " +
                                 str(ob.distRaced) + " Lap Times " +
                                 str(ob.lastLapTime))

                step += 1
                if done:
                    break

            self.lambda_mix = np.mean(lambda_store)

            logging.info('Episode ends! \n' + "Total Steps: " + str(step) +
                         " " + str(i_episode) + "-th Episode Reward: " +
                         str(total_reward) + " Episode Length: " +
                         str(j_iter + 1) + " Ave Reward: " +
                         str(total_reward / (j_iter + 1)) +
                         "\n Distance: " + str(info['distRaced']) + ' ' +
                         str(info['distFromStart']) +
                         "\n Last Lap Times: " + str(info['lastLapTime']) +
                         " Cur Lap Times: " + str(info['curLapTime']) +
                         " lastLapTime: " + str(lastLapTime) +
                         "\n ave sp: " + str(np.mean(sp)) +
                         " max sp: " + str(np.max(sp)))

            self.save['total_reward'].append(total_reward)
            self.save['total_step'].append(j_iter + 1)
            self.save['ave_reward'].append(total_reward / (j_iter + 1))
            self.save['distRaced'].append(info['distRaced'])
            self.save['distFromStart'].append(info['distFromStart'])
            self.save['lastLapTime'].append(info['lastLapTime'])
            self.save['curLapTime'].append(info['curLapTime'])
            self.save['lapTimes'].append(lastLapTime)
            if not lastLapTime:
                self.save['avelapTime'].append(0)
            else:
                self.save['avelapTime'].append(np.mean(lastLapTime))
            self.save['ave_sp'].append(np.mean(sp))
            self.save['max_sp'].append(np.max(sp))
            self.save['min_sp'].append(np.min(sp))

            # Test
            if np.mod(i_episode + 1, 10) == 0:
                logging.info("Start Testing!")
                (test_total_reward, test_step, test_info, test_ave_sp,
                 test_max_sp, test_min_sp,
                 test_lastLapTime) = self.rollout(env)
                self.save['test_total_reward'].append(test_total_reward)
                self.save['test_total_step'].append(test_step)
                self.save['test_ave_reward'].append(test_total_reward /
                                                    test_step)
                self.save['test_distRaced'].append(test_info['distRaced'])
                self.save['test_distFromStart'].append(
                    test_info['distFromStart'])
                self.save['test_lastLapTime'].append(
                    test_info['lastLapTime'])
                self.save['test_curLapTime'].append(test_info['curLapTime'])
                self.save['test_lapTimes'].append(test_lastLapTime)
                if not test_lastLapTime:
                    self.save['test_avelapTime'].append(0)
                else:
                    self.save['test_avelapTime'].append(
                        np.mean(test_lastLapTime))
                self.save['test_ave_sp'].append(test_ave_sp)
                self.save['test_max_sp'].append(test_max_sp)
                self.save['test_min_sp'].append(test_min_sp)

            if np.mod(i_episode + 1, 5) == 0:
                print("Now we save the model")
                self.actor.model.save_weights(
                    "actormodel_" + str(seed) + ".h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(self.actor.model.to_json(), outfile)

                self.critic.model.save_weights(
                    "criticmodel_" + str(seed) + ".h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(self.critic.model.to_json(), outfile)

                filename = "./model/actormodel_" + str(seed) + '_' + str(
                    i_episode + 1) + ".h5"
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                self.actor.model.save_weights(filename, overwrite=True)

                filename = "./model/criticmodel_" + str(seed) + '_' + str(
                    i_episode + 1) + ".h5"
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                self.critic.model.save_weights(filename, overwrite=True)

            if np.mod(i_episode + 1, 10) == 0:
                filename = "./Fig/iprl_save_" + str(seed)
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                with open(filename, 'wb') as f:
                    pickle.dump(self.save, f)

            if i_episode > 1000 and all(
                    np.array(self.save['total_reward'][-20:]) < 20):
                print('Model degenerated. Stop at Episode ' + str(i_episode))
                break

        env.end()  # this is for shutting down TORCS
        logging.info("Neural Policy Update Finish.")
        return None

    def collect_data(self, controllers, tree=False):
        vision = False
        max_steps = 10000
        step = 0

        if not tree:
            steer_prog, accel_prog, brake_prog = controllers

        # Generate a TORCS environment
        env = TorcsEnv(vision=vision, throttle=True, gear_change=False,
                       track_name=self.track_name)
        ob = env.reset(relaunch=True)
        print("S0=", ob)

        window = 5
        lambda_store = np.zeros((max_steps, 1))
        lambda_max = 40.
        factor = 0.8

        logging.info("TORCS Collection started with Lambda = " +
                     str(self.lambda_mix))

        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0,
                         ob.track))

        total_reward = 0.
        tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                   [ob.speedZ], [ob.rpm], list(ob.wheelSpinVel / 100.0),
                   list(ob.track), [0, 0, 0]]
        window_list = [tempObs[:] for _ in range(window)]

        observation_list = []
        actions_list = []
        lastLapTime = []
        sp = []

        for j_iter in range(max_steps):
            if tree:
                tree_obs = [sensor for obs in tempObs[:-1]
                            for sensor in obs]
                act_tree = controllers.predict([tree_obs])
                steer_action = clip_to_range(act_tree[0][0], -1, 1)
                accel_action = clip_to_range(act_tree[0][1], 0, 1)
                brake_action = clip_to_range(act_tree[0][2], 0, 1)
            else:
                steer_action = clip_to_range(
                    steer_prog.pid_execute(window_list), -1, 1)
                accel_action = clip_to_range(
                    accel_prog.pid_execute(window_list), 0, 1)
                brake_action = clip_to_range(
                    brake_prog.pid_execute(window_list), 0, 1)
            action_prior = [steer_action, accel_action, brake_action]

            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm], list(ob.wheelSpinVel / 100.0),
                       list(ob.track), action_prior]
            window_list.pop(0)
            window_list.append(tempObs[:])

            a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            mixed_act = [
                a_t[0][k_iter] / (1 + self.lambda_mix) +
                (self.lambda_mix / (1 + self.lambda_mix)) *
                action_prior[k_iter] for k_iter in range(3)
            ]

            if tree:
                newobs = [item for sublist in tempObs[:-1]
                          for item in sublist]
                observation_list.append(newobs[:])
            else:
                observation_list.append(window_list[:])
            actions_list.append(mixed_act[:])

            ob, r_t, done, info = env.step(mixed_act)

            sp.append(info['speed'])

            if not lastLapTime:
                if info['lastLapTime'] > 0:
                    lastLapTime.append(info['lastLapTime'])
            elif (info['lastLapTime'] > 0
                  and lastLapTime[-1] != info['lastLapTime']):
                lastLapTime.append(info['lastLapTime'])

            s_t1 = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                              ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0,
                              ob.track))

            total_reward += r_t
            s_t = s_t1

            # if np.mod(step, 2000) == 0:
            #     logging.info(" Distance " + str(ob.distRaced) +
            #                  " Lap Times " + str(ob.lastLapTime))

            step += 1
            if done:
                break

        logging.info("Data Collection Finished!")
        logging.info('Episode ends! \n' + "Episode Reward: " +
                     str(total_reward) + " Episode Length: " +
                     str(j_iter + 1) + " Ave Reward: " +
                     str(total_reward / (j_iter + 1)) + "\n Distance: " +
                     str(info['distRaced']) + ' ' +
                     str(info['distFromStart']) + "\n Last Lap Times: " +
                     str(info['lastLapTime']) + " Cur Lap Times: " +
                     str(info['curLapTime']) + " lastLapTime: " +
                     str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) +
                     " max sp: " + str(np.max(sp)))
        env.end()

        return observation_list, actions_list

    def label_data(self, controllers, observation_list, tree=False):
        if not tree:
            steer_prog, accel_prog, brake_prog = controllers
        actions_list = []
        net_obs_list = []
        logging.info("Data labelling started with Lambda = " +
                     str(self.lambda_mix))

        for window_list in observation_list:
            if tree:
                act_tree = controllers.predict([window_list])
                steer_action = clip_to_range(act_tree[0][0], -1, 1)
                accel_action = clip_to_range(act_tree[0][1], 0, 1)
                brake_action = clip_to_range(act_tree[0][2], 0, 1)
                # For tree data the stored observation is already flat.
                # (The original referenced net_obs here without assigning
                # it first.)
                net_obs = window_list
                net_obs_list.append(window_list)
            else:
                steer_action = clip_to_range(
                    steer_prog.pid_execute(window_list), -1, 1)
                accel_action = clip_to_range(
                    accel_prog.pid_execute(window_list), 0, 1)
                brake_action = clip_to_range(
                    brake_prog.pid_execute(window_list), 0, 1)
                net_obs = [sensor for obs in window_list[-1]
                           for sensor in obs]
                net_obs_list.append(net_obs[:29])
            action_prior = [steer_action, accel_action, brake_action]

            s_t = np.hstack([[net_obs[:29]]])
            a_t = self.actor.model.predict(s_t.reshape(1, 29))
            mixed_act = [
                a_t[0][k_iter] / (1 + self.lambda_mix) +
                (self.lambda_mix / (1 + self.lambda_mix)) *
                action_prior[k_iter] for k_iter in range(3)
            ]
            actions_list.append(mixed_act[:])

        return net_obs_list, observation_list, actions_list
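# ---------------------------------------------------------------------------
# clip() (used in NeuralAgent.rollout) and clip_to_range() (used throughout
# NeuralAgent) are module-level helpers defined elsewhere in the project.
# Judging from the call sites and from APLDDPGAgent.clip, both clamp a
# scalar into [lo, hi]; a minimal, hypothetical version:
# ---------------------------------------------------------------------------
def clip_sketch(x, lo, hi):
    # Clamp x into the closed interval [lo, hi].
    return max(lo, min(hi, x))

# Example usage mirroring the calls above: steering is clamped to [-1, 1],
# throttle and brake to [0, 1]:
#   steer = clip_sketch(raw_steer, -1, 1)
#   accel = clip_sketch(raw_accel, 0, 1)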