def sample_one(self):
    """MODIFIED SAMPLING FOR TORCS!"""
    print()
    print('START PLOTTING MODULE'.center(80, '='))
    roll_distance = []
    print()
    print("TORCS Experiment Start".center(80, '='))
    env = TorcsEnv(vision=self.config.vision, throttle=self.config.throttle)
    try:
        ob = env.reset()
        sonar, grayscale = self.image_to_sonar(ob.img)
        sonar = np.reshape(sonar, [19])
        state = np.concatenate(
            [sonar, np.array([ob.speedX, ob.speedY, ob.speedZ])], axis=0)
        obs, states, actions, rewards, sonars, grayscales = [], [], [], [], [], []
        done = False  # has the episode ended?
        start_time = time.time()
        while not done and (time.time() - start_time < 300):
            states.append(state)
            obs.append(ob)
            sonars.append(sonar)
            grayscales.append(grayscale)
            state = np.concatenate(
                [sonar, np.array([ob.speedX, ob.speedY, ob.speedZ])], axis=0)
            action = self.sess.run(
                self.sampled_action,
                feed_dict={
                    self.observation_placeholder:
                        np.reshape(state, [1, self.observation_dim])
                })[0]
            ob, reward, done, info = env.step(action)
            sonar, grayscale = self.image_to_sonar(ob.img)
            sonar = np.reshape(sonar, [19])
            actions.append(action)
            rewards.append(reward)
            roll_distance.append(env.distance_travelled)
    except:
        raise
    finally:
        env.end()  # This is for shutting down TORCS
        print("Finished TORCS session".center(80, '='))
        if roll_distance:  # guard against an empty rollout
            print('Final distance: ', roll_distance[-1], ' [m]')
        print('END PLOTTING MODULE'.center(80, '='))

    # Keep the rollout around so some of the frames can be plotted later:
    self.grayscales = grayscales
    self.sonars = sonars
    self.obs = obs
    self.actions = actions
    self.roll_distance = roll_distance
    return
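# image_to_sonar is not defined in this file. Below is a minimal, hypothetical
# sketch of a compatible implementation, assuming the vision frame arrives as a
# flat RGB vector and that "sonar" means 19 pseudo-range readings sampled along
# evenly spaced image columns; the real module may do something smarter.
import numpy as np

def image_to_sonar(img, n_rays=19, shape=(64, 64, 3)):
    """Hypothetical stand-in: collapse a camera frame into 19 range-like
    readings plus the grayscale frame used for plotting."""
    img = np.asarray(img, dtype=np.float32).reshape(shape)  # assumed resolution
    gray = img.mean(axis=2) / 255.0  # crude luminance in [0, 1]
    h, w = gray.shape
    cols = np.linspace(0, w - 1, n_rays).astype(int)
    # distance proxy: fraction of each sampled column that looks like bright road
    sonar = (gray[:, cols] > 0.5).sum(axis=0) / float(h)
    return sonar, gray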
def main():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    generator = Generator(sess, feat_dim, aux_dim, encode_dim, action_dim)
    base_model = ResNet50(weights='imagenet', include_top=False)
    feat_extractor = Model(
        inputs=base_model.input,
        outputs=base_model.get_layer('activation_40').output)

    try:
        generator.model.load_weights(param_path)
        print("Weights loaded successfully")
    except Exception:
        print("Cannot find weight file")

    env = TorcsEnv(throttle=True, gear_change=False)

    print("Start driving ...")
    ob = env.reset(relaunch=True)
    feat, aux = get_state(ob, aux_dim, feat_extractor)

    encode = np.zeros((1, encode_dim), dtype=np.float32)
    encode[0, code] = 1
    print("Encode:", encode[0])

    pre_actions = np.load(pre_actions_path)["actions"]

    for i in range(MAX_STEP_LIMIT):
        if i < MIN_STEP_LIMIT:
            action = np.zeros(3, dtype=np.float32)
        elif i < MIN_STEP_LIMIT + PRE_STEP:
            action = pre_actions[i - MIN_STEP_LIMIT]
        else:
            action = generator.model.predict([feat, aux, encode])[0]

        ob, reward, done, _ = env.step(action)
        feat, aux = get_state(ob, aux_dim, feat_extractor)

        if i == MIN_STEP_LIMIT + PRE_STEP:
            print("Start deciding ...")
        print("Step:", i, "DistFromStart:", ob.distFromStart,
              "TrackPos:", ob.trackPos, "Damage:", ob.damage.item(),
              "Action: %.6f %.6f %.6f" % (action[0], action[1], action[2]),
              "Speed:", ob.speedX * 200)

        if done:
            break

    env.end()
    print("Finish.")
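# get_state is referenced above but not shown. The sketch below is a
# hypothetical reconstruction consistent with the calls: the ResNet50 feature
# map for the camera frame plus a small auxiliary sensor vector. The frame
# resolution, the aux layout, and the use of preprocess_input are assumptions.
import numpy as np
from keras.applications.resnet50 import preprocess_input

def get_state(ob, aux_dim, feat_extractor):
    img = np.asarray(ob.img, dtype=np.float32).reshape(1, 224, 224, 3)  # assumed size
    feat = feat_extractor.predict(preprocess_input(img))
    aux = np.zeros((1, aux_dim), dtype=np.float32)
    aux[0, :3] = [ob.speedX, ob.speedY, ob.speedZ]  # assumed aux layout
    return feat, aux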
def main():
    # Verify the necessary directories exist
    track_no = 5
    experiment_name = "tensorboard-4"
    experiment_dir = "experiment-%s/" % experiment_name
    datas_dir = experiment_dir + "datas-track-no-%d/" % track_no
    models_dir = datas_dir + "model/"
    if not os.path.exists(experiment_dir):
        print("%s doesn't exist" % experiment_dir)
        return
    if not os.path.exists(datas_dir):
        print("%s doesn't exist" % datas_dir)
        return
    if not os.path.exists(models_dir):
        print("%s doesn't exist" % models_dir)
        return

    state_dim = 4
    img_dim = [304, 412, 3]

    sess = tf.InteractiveSession()
    agent = Supervise(sess, state_dim, img_dim, models_dir)
    agent.load_network()

    MAX_STEP = 10000
    step = 0
    vision = True
    env = TorcsEnv(vision=vision, throttle=True, text_mode=False,
                   track_no=track_no, random_track=False, track_range=(5, 8))

    for i in range(1):
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()
        s_t = np.hstack((ob.speedX, ob.speedY, ob.speedZ, 0.0))
        i_t = ob.img
        while step < MAX_STEP:
            action = agent.action(s_t, i_t)
            ob, reward, done, info = env.step([action, 0.16, 0])
            s_t = np.hstack((ob.speedX, ob.speedY, ob.speedZ, action))
            i_t = ob.img
            print("Step", step, "Action", action, "Reward", reward)
            step += 1  # without this the loop would never reach MAX_STEP
            if done:
                break

    env.end()
def test():
    env = TorcsEnv(vision=True, throttle=False)
    ob = env.reset(relaunch=True)
    reward_sum = 0.0
    done = False
    count = 0
    while not done:
        act = model.predict(img_reshape(ob.img).astype('float32') / 255)
        count += 1
        ob, reward, done, _ = env.step(act)
        reward_sum += reward
    env.end()
    print("Steps before crash: ", count, reward_sum)
    return count, reward_sum
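# img_reshape is assumed to turn the flat TORCS pixel vector into the batch
# tensor the Keras model expects; a one-line sketch (resolution assumed):
import numpy as np

def img_reshape(img, shape=(64, 64, 3)):
    return np.asarray(img).reshape((1,) + shape)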
def programmatic_game(tree_program, track_name='practgt2.xml'):
    episode_count = 2
    max_steps = 100000
    window = 5

    # Generate a Torcs environment
    env = TorcsEnv(vision=False, throttle=True, gear_change=False,
                   track_name=track_name)

    logging.info("TORCS Experiment Start with Priors on " + track_name)
    for i_episode in range(episode_count):
        # relaunch TORCS on every reset to work around the memory leak error
        ob = env.reset(relaunch=True)
        tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                   [ob.speedZ], [ob.rpm],
                   list(ob.wheelSpinVel / 100.0),
                   list(ob.track), [0, 0, 0]]
        newobs = [item for sublist in tempObs[:-1] for item in sublist]

        for j in range(max_steps):
            act_tree = tree_program.predict([newobs])
            action_prior = [act_tree[0][0], act_tree[0][1], act_tree[0][2]]

            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0),
                       list(ob.track), action_prior]
            newobs = [item for sublist in tempObs[:-1] for item in sublist]

            ob, r_t, done, info = env.step(action_prior)
            if np.mod(j, 1000) == 0:
                logging.info("Episode " + str(i_episode) + " Distance " +
                             str(ob.distRaced) + " Lap Times " +
                             str(ob.lastLapTime))

            if done:
                print('Done. Steps: ', j)
                break

    env.end()  # This is for shutting down TORCS
    logging.info("Finish.")
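# Any regressor whose predict() returns one [steer, accel, brake] row per
# observation can serve as tree_program. A minimal sketch with scikit-learn,
# assuming a previously collected dataset of flattened 29-feature observations
# (X) and matching actions (y); the data below is a random placeholder.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

X = np.random.rand(1000, 29)  # placeholder observations
y = np.random.rand(1000, 3)   # placeholder [steer, accel, brake] targets
tree_program = DecisionTreeRegressor(max_depth=6).fit(X, y)
programmatic_game(tree_program, track_name='practgt2.xml')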
def playGame(train_indicator=0):  # 1 means Train, 0 means simply Run
    time.sleep(1)
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001    # Target Network HyperParameters
    LRA = 0.0001   # Learning rate for Actor
    LRC = 0.001    # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 24  # of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 300000.
    episode_count = 20000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    pre_model = load_model("weights_rescale_all-0000.hdf5")
    # Sanity-check snippet for pre_model, kept for reference:
    # x = np.array([ 4.82767379e-01, 5.92105016e-02, 3.61700505e-01, 2.74807483e-01,
    #                2.31401995e-01, 2.07236990e-01, 1.95800006e-01, 1.89892501e-01,
    #                1.84837490e-01, 1.81293502e-01, 1.77807003e-01, 1.74377009e-01,
    #                1.71005994e-01, 1.66384503e-01, 1.61247000e-01, 1.52030498e-01,
    #                1.35238498e-01, 1.11962005e-01, 8.79574940e-02, 4.76383008e-02,
    #                4.78339800e-01, 6.97819047e-01, 4.60800716e-01, 5.00754069e-01,
    #                -1.00000000e+00, 9.99979496e-01, 8.71338917e-13])
    # x_s = np.array([x, x])
    # pre_y = pre_model.predict(x_s)
    # print(x_s[0]); print(pre_y[0])

    # Now load the weights
    load_name = "sample_v0_40"
    print("Now we load the weights")
    try:
        actor.model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        actor.target_model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.target_model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        print("Weights loaded successfully")
    except Exception:
        print("Cannot find the weights")

    plt.figure()
    overall_scores = []
    model_name = "sample_v0"

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            # relaunch TORCS every 3 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos,
                         ob.speedX, ob.speedY, ob.speedZ))
        total_reward = 0.
        cur_sample = []
        attack_valid = 1
        gap = (i / 10) / 100.0
        attack_step = -1
        attack_target = 0

        for j in range(max_steps):
            # if j == 50:
            #     time.sleep(0.099)
            #     continue
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            if j < 20 and train_indicator:
                a_t[0][1] += 0.5

            # os.system("scrot saved_pic/{}.png".format(j))
            if j == 80:
                print("cp attack!")
                a_t[0][0] = -1.0
            if j == 83:
                os.system("scrot saved_pic/{}.png".format(j))

            # Model-predictive attack search, kept for reference:
            # if a_t[0][0] > 0:
            #     a_t[0][0] = -0.3
            # else:
            #     a_t[0][0] = 0.3
            # a_t[0][2] += 0.7
            # if ob.speedX > 0.6:
            #     a_t[0][1] = 0
            # if step == 60:
            #     a_t[0][0] = 1.0
            # s_t_scaled = rescale_state(s_t)
            # s_t_0 = restore_state(s_t_scaled)
            # new_a_t = actor.model.predict(s_t_0.reshape(1, s_t_0.shape[0]))
            # s_t_scaled_list = np.array([np.copy(s_t_scaled) for val in range(21)])
            # actions = np.array([np.copy(a_t[0]) for val in range(21)])
            # for val in range(21):
            #     actions[val][0] = -1.0 + val / 10.0
            # x_0 = np.hstack((s_t_scaled_list, actions))
            # pre_y = pre_model.predict(x_0)
            # steer_index = int(a_t[0][0] * 10.0 + 10.0)
            # for pre_step in range(2):
            #     restore_new_Y = restore_states(pre_y)
            #     actions = actor.model.predict(restore_new_Y)
            #     x_step1 = np.hstack((pre_y, actions))
            #     pre_y = pre_model.predict(x_step1)
            # for index in range(21):
            #     diff = calsulate_d(pre_y[index]) - calsulate_d(pre_y[steer_index])
            #     pro = np.random.random()
            #     if diff > gap and attack_valid == 1 and pro > 0.8 and j > 50:
            #         a_t[0][0] = -1.0 + index / 10.0
            #         print("adv!", diff, "pro:", pro)
            #         attack_step = j
            #         attack_target = a_t[0][0]
            #         attack_valid -= 1
            # dis_list = np.array([(calsulate_d(st) - calsulate_d(pre_y[steer_index])) for st in pre_y])
            # print("{:.2f}".format(max(dis_list) * 100000))
            # s_t_scaled = np.copy(s_t1)
            # s_t_scaled[0] = rescale_data(s_t_scaled[0], 0.5)
            # s_t_scaled[20] = rescale_data(s_t_scaled[20], 2.5)
            # s_t_scaled[21] = rescale_data(s_t_scaled[21], 0.7)
            # s_t_scaled[22] = rescale_data(s_t_scaled[22], 0.7)
            # s_t_scaled[23] = rescale_data(s_t_scaled[23], 0.7)
            # actions = actor.model.predict(s_t_scaled.reshape(1, s_t_scaled.shape[0]))
            # ob, r_t, done, info = env.step(new_a_t[0])

            ob, r_t, done, info = env.step(a_t[0])
            print("step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f}".format(
                j, r_t, a_t[0][0], a_t[0][1], a_t[0][2]))

            # if r_t < -50:
            #     r_t -= 10000
            #     done = True
            if j > 20 and ob.rpm <= 0.09426:
                r_t -= 1000
                done = True

            theta = 0.1
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos,
                              ob.speedX, ob.speedY, ob.speedZ))
            # s_t1_new = np.array([val + np.abs(val) * random.uniform(-1, 1) * theta for val in s_t1])
            # s_t1 = s_t1_new

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer
            # cur_step_sample = [s_t.tolist(), a_t[0].tolist(), r_t, s_t1.tolist(), done]
            # cur_sample.append(cur_step_sample)

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            step += 1
            if done:
                break
            if j > 500:
                break

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights(
                    "saved/actormodel_{}_{}.h5".format(model_name, int(step / 10000)),
                    overwrite=True)
                critic.model.save_weights(
                    "saved/criticmodel_{}_{}.h5".format(model_name, int(step / 10000)),
                    overwrite=True)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

        s = "{},{},{},{},{},{:.3f}\n".format(gap, attack_step, attack_target,
                                             i, j, total_reward)
        attack_valid = 1
        attack_step = -1
        attack_target = 0
        with open('logs/pm_adv_test.csv', 'a') as the_file:
            the_file.write(s)

        overall_scores.append(total_reward)
        plt.clf()
        plt.plot(overall_scores)
        plt.savefig("train_plots/{}_{}.jpg".format(model_name, int(step / 10000)))
        # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile:
        #     pickle.dump(cur_sample, outfile)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
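# OU.function supplies the exploration noise used throughout these scripts.
# A minimal sketch of the class, assuming the usual Ornstein-Uhlenbeck form
# theta * (mu - x) + sigma * N(0, 1); per-component (mu, theta, sigma) values
# are passed at the call sites above.
import numpy as np

class OU(object):
    def function(self, x, mu, theta, sigma):
        return theta * (mu - x) + sigma * np.random.randn(1)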
def playGame(train=0):  # 1 means Train, 0 means simply Run
    load_from = "."
    save_to = os.path.join("data", "saved")
    save_thresh = 100000  # Save if total reward for the episode is more

    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001    # Target Network HyperParameters
    LRA = 0.0001   # Learning rate for Actor
    LRC = 0.001    # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    ou = OU().function  # Ornstein-Uhlenbeck Process
    buff = ReplayBuffer(BUFFER_SIZE)
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)

    def state(ob):
        return np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                          ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                          ob.rpm))

    def load_weights(dir):
        print("Loading weights from ", dir)
        try:
            actor.model.load_weights(os.path.join(dir, "actormodel.h5"))
            critic.model.load_weights(os.path.join(dir, "criticmodel.h5"))
            actor.target_model.load_weights(os.path.join(dir, "actormodel.h5"))
            critic.target_model.load_weights(os.path.join(dir, "criticmodel.h5"))
            print("Weights loaded successfully")
        except Exception:
            print("Cannot find the weights")

    def save_weights(dir):
        if not os.path.exists(dir):
            os.makedirs(dir)
        print("Saving weights in ", dir)
        actor.model.save_weights(os.path.join(dir, "actormodel.h5"),
                                 overwrite=True)
        critic.model.save_weights(os.path.join(dir, "criticmodel.h5"),
                                  overwrite=True)
        with open(os.path.join(dir, "actormodel.json"), "w") as outfile:
            json.dump(actor.model.to_json(), outfile)
        with open(os.path.join(dir, "criticmodel.json"), "w") as outfile:
            json.dump(critic.model.to_json(), outfile)

    load_weights(load_from)

    # Generate a Torcs environment
    print("TORCS Experiment Start.")
    np.random.seed(1337)
    done = False
    step = 0
    epsilon = 1

    for episode in range(episode_count):
        print("Episode : " + str(episode) + " Replay Buffer " + str(buff.count()))
        ob = env.reset()
        s_t = state(ob)
        total_reward = 0.

        progress = tqdm.trange(max_steps, disable=not train)
        for _ in progress:
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train * max(epsilon, 0) * ou(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train * max(epsilon, 0) * ou(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train * max(epsilon, 0) * ou(a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])
            s_t1 = state(ob)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.update_target()
                critic.update_target()

            total_reward += r_t
            s_t = s_t1
            progress.set_description("Episode %4i, TR %6.0f, loss %7.0f" %
                                     (episode, total_reward, loss))
            step += 1
            if done:
                break

        if train and total_reward > save_thresh:
            save_weights(save_to + str(episode))
            save_thresh = min(1000000, 2 * save_thresh)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
agent = Agent(env)

print("TORCS Experiment Start.")
for i in range(episode_count):
    print("Episode : " + str(i))

    if np.mod(i, 3) == 0:
        # Sometimes you need to relaunch TORCS because of the memory leak error
        ob = env.reset(relaunch=True)
    else:
        ob = env.reset()

    total_reward = 0.
    for j in range(max_steps):
        action = agent.act(ob)
        ob, reward, done, _ = env.step(action)
        # print(ob)
        total_reward += reward
        step += 1
        if done:
            break

    print("TOTAL REWARD @ " + str(i) + " -th Episode : " + str(total_reward))
    print("Total Step: " + str(step))
    print("")

env.end()  # This is for shutting down TORCS
print("Finish.")
print("TORCS Experiment Start.") for i in range(episode_count): print("Episode : " + str(i)) if np.mod(i, 3) == 0: # Sometimes you need to relaunch TORCS because of the memory leak error ob = env.reset(relaunch=True) else: ob = env.reset() total_reward = 0. for j in range(max_steps): action = agent.act(ob, reward, done, vision) ob, reward, done, _ = env.step(action) #print(ob) total_reward += reward step += 1 if done: break print("TOTAL REWARD @ " + str(i) +" -th Episode : " + str(total_reward)) print("Total Step: " + str(step)) print("") env.end() # This is for shutting down TORCS print("Finish.")
def playGame(train_indicator=0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001    # Target Network HyperParameters
    LRA = 0.0001   # Learning rate for Actor
    LRC = 0.001    # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 24  # of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 300000.
    episode_count = 20000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights
    load_name = "sample_v0_40"
    print("Now we load the weights")
    try:
        actor.model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        actor.target_model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.target_model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        print("Weights loaded successfully")
    except Exception:
        print("Cannot find the weights")

    plt.figure()
    overall_scores = []
    model_name = "sample_v0"

    print("TORCS Experiment Start.")
    # One scripted steering attack per episode: at step 77, force the steering
    # command to a value in [-1.0, -0.1].
    attacks = []
    for i in range(-10, 0):
        val = i / 10.0
        attacks.append([77, val])
    # for i in range(45, 55):
    #     attacks.append([i, -1.5])
    #     attacks.append([i, 1.5])

    # stop when the attack list is exhausted; attacks[i] would otherwise
    # raise an IndexError on later episodes
    for i in range(min(episode_count, len(attacks))):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        # if np.mod(i, 3) == 0:
        #     ob = env.reset(relaunch=True)  # relaunch TORCS every 3 episodes because of the memory leak error
        # else:
        #     ob = env.reset()
        ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos,
                         ob.speedX, ob.speedY, ob.speedZ))
        total_reward = 0.
        cur_sample = []
        for j in range(max_steps):
            # if j == 50:
            #     time.sleep(0.099)
            #     continue
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            if j < 20 and train_indicator:
                a_t[0][1] += 0.5

            if j == attacks[i][0]:
                print('cp attack on {} with {}'.format(attacks[i][0], attacks[i][1]))
                a_t[0][0] = attacks[i][1]

            ob, r_t, done, info = env.step(a_t[0])
            print("step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f}".format(
                j, r_t, a_t[0][0], a_t[0][1], a_t[0][2]))

            if j > 20 and ob.rpm <= 0.09426:
                r_t -= 1000
                done = True

            theta = 0.1
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos,
                              ob.speedX, ob.speedY, ob.speedZ))
            # s_t1_new = np.array([val + np.abs(val) * random.uniform(-1, 1) * theta for val in s_t1])
            # s_t1 = s_t1_new

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            cur_step_sample = [s_t.tolist(), a_t[0].tolist(), r_t,
                               s_t1.tolist(), done]
            cur_sample.append(cur_step_sample)

            # (the DDPG batch update is disabled in this attack-evaluation
            # script; see the training version of playGame for the full code)

            total_reward += r_t
            s_t = s_t1

            step += 1
            if done:
                break
            if j > 200:
                break

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights(
                    "saved/actormodel_{}_{}.h5".format(model_name, int(step / 10000)),
                    overwrite=True)
                critic.model.save_weights(
                    "saved/criticmodel_{}_{}.h5".format(model_name, int(step / 10000)),
                    overwrite=True)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

        s = "{},{},{:.3f},{},{}\n".format(i, j, total_reward,
                                          attacks[i][0], attacks[i][1])
        with open('logs/attack_{}.csv'.format(model_name), 'a') as the_file:
            the_file.write(s)
        # overall_scores.append(total_reward)
        # plt.clf()
        # plt.plot(overall_scores)
        # plt.savefig("train_plots/{}_{}.jpg".format(model_name, int(step / 10000)))
        # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile:
        #     pickle.dump(cur_sample, outfile)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    TAU = 0.001    # Target Network HyperParameters
    LRA = 0.0001   # Learning rate for Actor
    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input
    vision = False

    episode_count = 1
    max_steps = 1000  # 100000
    done = False
    step = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, 1, TAU, LRA)

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights
    print("Now we load Actor model's weights")
    try:
        actor.model.load_weights("actormodel.h5")
        print("Weights loaded successfully")
    except Exception:
        print("Cannot find the weights")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        if np.mod(i, 3) == 0:
            # relaunch TORCS every 3 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                         ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                         ob.rpm))
        total_reward = 0.
        for j in range(max_steps):
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            ob, r_t, done, info = env.step(a_t_original[0])
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                              ob.rpm))
            total_reward += r_t
            s_t = s_t1
            if np.mod(j, 100) == 0:
                print("Episode", i, "Step", step, "Action", a_t_original[0],
                      "Reward", r_t)
            step += 1
            if done:
                break

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def playGame(f_diagnostics, train_indicator, agent, port=3101):
    # 1 means Train, 0 means simply Run
    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 65  # of sensors input
    env_name = 'Torcs_Env'
    save_location = "./weights/"

    # Generate a Torcs environment
    print("I have been asked to use port: ", port)
    env = TorcsEnv(vision=False, throttle=True, gear_change=False, main=1)

    ob = None
    while ob is None:
        try:
            client = snakeoil3.Client(p=port, vision=False)  # Open new UDP connection to vtorcs
            client.MAX_STEPS = np.inf
            client.get_servers_input(0)  # Get the initial input from torcs
            obs = client.S.d  # Get the current full observation from torcs
            ob = env.make_observation(obs)
            s_t = np.hstack((ob.angle, ob.track, ob.trackPos,
                             ob.speedX, ob.speedY, ob.speedZ,
                             ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))
        except Exception:
            pass

    EXPLORE = total_explore
    episode_count = max_eps
    max_steps = max_steps_eps
    epsilon = epsilon_start
    done = False
    epsilon_steady_state = 0.01  # This is used for early stopping.

    totalSteps = 0
    best_reward = -100000
    running_avg_reward = 0.

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        save_indicator = 0
        early_stop = 1
        total_reward = 0.
        info = {'termination_cause': 0}
        distance_traversed = 0.
        speed_array = []
        trackPos_array = []

        print('\n\nStarting new episode...\n')
        print("Initial memory consumption: ")

        for step in range(max_steps):
            # Take noisy actions during training
            if train_indicator == 1:
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, epsilon_steady_state)
                a_t = agent.noise_action(s_t, epsilon)
            else:
                a_t = agent.action(s_t)

            try:
                ob, r_t, done, info = env.step(step, client, a_t, early_stop)
                if done:
                    break
                analyse_info(info, printing=False)

                s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos,
                                  ob.speedX, ob.speedY, ob.speedZ,
                                  ob.wheelSpinVel / 100.0, ob.rpm,
                                  ob.opponents))
                distance_traversed += ob.speedX * np.cos(ob.angle)  # Assuming 1 step = 1 second

                if math.isnan(r_t):
                    r_t = 0.0
                    print('Bad Reward Found')  # Introduced by Anirban

                # Add to replay buffer only if training
                if train_indicator:
                    agent.perceive(s_t, a_t, r_t, s_t1, done)  # Add experience to replay buffer

            except Exception as e:
                print("Exception caught at port " + str(i) + str(e))
                ob = None
                while ob is None:
                    try:
                        client = snakeoil3.Client(p=port, vision=False)  # Open new UDP connection to vtorcs
                        client.MAX_STEPS = np.inf
                        client.get_servers_input(0)  # Get the initial input from torcs
                        obs = client.S.d  # Get the current full observation from torcs
                        ob = env.make_observation(obs)
                    except Exception:
                        pass
                continue

            total_reward += r_t
            s_t = s_t1

            # Display progress every 15 steps.
            if np.mod(step, 15) == 0:
                print("Episode", i, "Step", step, "Epsilon", epsilon,
                      "Action", a_t, "Reward", r_t)

            totalSteps += 1
            if done:
                break

        # Saving the best model.
        running_avg_reward = running_average(running_avg_reward, i + 1,
                                             total_reward)

        if train_indicator == 1:
            # Save network after every 20 episodes and store the data
            if np.mod(i, 20) == 0:
                agent.saveNetwork(i)

        # Saving training data for client for analysis
        if train_indicator == 1 and np.mod(i, 5) == 0:
            f1 = open(str(port) + ".csv", "a+")
            client.printAnalysis(f1, i)
            f1.close()

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Num_Steps= " +
              str(step) + "; Max_steps= " + str(max_steps) + "; Reward= " +
              str(total_reward) + "; Running average reward= " +
              str(running_avg_reward))
        print("Total Step: " + str(totalSteps))
        print("")
        print(info)

        try:
            if info.get('termination_cause') == 'hardReset':
                print('Hard reset by some agent')
            ob, client = env.reset(client=client, relaunch=True)
        except Exception as e:
            print("Exception caught at point B at port " + str(i) + str(e))
            ob = None
            while ob is None:
                try:
                    client = snakeoil3.Client(p=port, vision=False)  # Open new UDP connection to vtorcs
                    client.MAX_STEPS = np.inf
                    client.get_servers_input(0)  # Get the initial input from torcs
                    obs = client.S.d  # Get the current full observation from torcs
                    ob = env.make_observation(obs)
                except Exception:
                    print("Exception caught at point C at port " + str(i) + str(e))

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                         ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                         ob.rpm, ob.opponents))

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def playGame(trainFlag=0):
    bufferLength = 50000  # values from Google paper
    # http://pemami4911.github.io/blog/2016/08/21/ddpg-rl.html
    gamma = 0.97
    tau = 0.001
    epsilon = 1
    actorLearnRate = 0.0001
    criticLearnRate = 0.001
    episodeMax = 2500
    maxIter = 100000
    epsilonDelta = 1 / (maxIter * 2)
    batchLength = 32
    step = 0
    flag = 0
    avgSpeed = 0
    damage = 0
    totalLoss = 0
    complete = False
    actionDimensions = 3
    stateDimensions = 29

    # set TensorFlow to use GPU, add speedups
    configProto = tensor.ConfigProto()
    configProto.gpu_options.allow_growth = True
    session = tensor.Session(config=configProto)
    K.set_session(session)

    # Initialize actor and critic
    actor = SteveTheActor(actionDimensions, stateDimensions, batchLength,
                          session, actorLearnRate, tau)
    critic = SteveTheCritic(actionDimensions, stateDimensions, batchLength,
                            session, criticLearnRate, tau)

    # initialize frame buffer for replays
    frameBuffer = FrameBuffer(bufferLength)

    # launch game
    torcsEnv = TorcsEnv(vision=False, throttle=True, gear_change=False)

    # load weights from file
    try:
        if os.path.isfile("steveActor.h5"):
            actor.model.load_weights("steveActor.h5")
            actor.targetModel.load_weights("steveActor.h5")
        if os.path.isfile("steveCritic.h5"):
            critic.model.load_weights("steveCritic.h5")
            critic.targetModel.load_weights("steveCritic.h5")
    except Exception:
        print("Error loading weight files")

    for x in range(episodeMax):
        print("Start of Ep:" + str(x) + " Buffer:" + str(frameBuffer.getCount()))

        # to get rid of memory leaks every few dozen launches
        if np.mod(x, 25) == 0:
            observ = torcsEnv.reset(relaunch=True)
        else:
            observ = torcsEnv.reset()

        stack = stackSensors(observ)
        avgSpeed = observ.speedX
        rewardSum = 0.0

        for y in range(maxIter):
            epsilon = epsilon - epsilonDelta
            act = np.zeros([1, actionDimensions])
            loss = 0

            actPredict = actor.model.predict(stack.reshape(1, stack.shape[0]))

            # During training, apply Ornstein-Uhlenbeck noise to generate variance
            if trainFlag:
                noise = calcNoise(actionDimensions, actPredict, epsilon)
                act[0][0] = actPredict[0][0] + noise[0][0]
                act[0][1] = actPredict[0][1] + noise[0][1]
                if observ.track[9] < 100 and random.random() <= 0.1:
                    # Add opposite of noise (~0.1 centered rather than 0.1)
                    # to simulate pressing the brake slightly: "feeling the brake"
                    act[0][2] = actPredict[0][2] - noise[0][2]
                else:
                    act[0][2] = actPredict[0][2] + noise[0][2]
            else:
                act[0][0] = actPredict[0][0]
                act[0][1] = actPredict[0][1]
                if observ.track[9] < 100 and random.random() <= 0.1:
                    noise = calcNoise(actionDimensions, actPredict, epsilon)
                    act[0][2] = actPredict[0][2] - noise[0][2]

            # perform action based on predicted input,
            # then get updated state information
            observ, newReward, complete, info = torcsEnv.step(act[0])

            # stack new sensor information
            newStack = stackSensors(observ)

            # rolling average over buffer length
            avgSpeed -= avgSpeed / batchLength
            avgSpeed += observ.speedX / batchLength
            damage = observ.damage

            # add new frame to frameBuffer
            frameBuffer.addFrame(stack, act[0], newReward, newStack, complete)

            # if frameBuffer.getSize() > batchLength:
            batch = frameBuffer.getBatch(batchLength)
            state = np.asarray([i[0] for i in batch])
            actions = np.asarray([i[1] for i in batch])
            reward = np.asarray([i[2] for i in batch])
            newState = np.asarray([i[3] for i in batch])
            completeVector = np.asarray([i[4] for i in batch])
            yTrain = np.asarray([i[1] for i in batch])

            targetQVal = critic.getRewards(newState,
                                           actor.targetModel.predict(newState))

            for z in range(len(batch)):
                if not completeVector[z]:
                    yTrain[z] = reward[z] + gamma * targetQVal[z]
                else:
                    yTrain[z] = reward[z]

            if trainFlag:
                # update loss based on critic analyzing last action/state result
                loss += critic.model.train_on_batch([state, actions], yTrain)
                # actor predicts new input based on new state
                actorGradient = actor.model.predict(state)
                # critic is updated based on actor gradient result
                gradient = critic.gradients(state, actorGradient)
                # actor trained based on critic gradient
                actor.train(state, gradient)
                actor.trainTarget()
                critic.trainTarget()

            rewardSum = rewardSum + newReward
            stack = newStack
            step += 1
            totalLoss += loss
            if complete:
                break

        if trainFlag and np.mod(x, 5) == 0:
            print("Saving Actor and Critic Models")
            try:
                actor.model.save_weights("steveActor.h5", overwrite=True)
                with open("steveActor.json", "w") as actorFile:
                    dump(actor.model.to_json(), actorFile)
                critic.model.save_weights("steveCritic.h5", overwrite=True)
                with open("steveCritic.json", "w") as criticFile:
                    dump(critic.model.to_json(), criticFile)
            except Exception:
                print("Error saving Actor and Critic Models")

        print("***Episode:" + str(x) + " Reward Sum:" + str(rewardSum) +
              " Loss:" + str(totalLoss) + " avgSpeed:" + str(avgSpeed) +
              " damage:" + str(damage))
        print("***Steps:" + str(step))

        with open('speedwaynewresults.csv', 'a') as csvfile:
            wr = csv.writer(csvfile, delimiter=' ', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
            wr.writerow([x, step, rewardSum, totalLoss, avgSpeed, damage, act])

        totalLoss = 0
        step = 0

    torcsEnv.end()
    print("Race Ended!")
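# FrameBuffer is assumed to be a uniform-sampling replay buffer with the
# addFrame/getBatch/getCount interface used above; a minimal deque-backed
# sketch, not the original implementation:
import random
from collections import deque

class FrameBuffer(object):
    def __init__(self, maxlen):
        self.frames = deque(maxlen=maxlen)

    def addFrame(self, state, action, reward, newState, complete):
        self.frames.append((state, action, reward, newState, complete))

    def getBatch(self, batchLength):
        # early in training the buffer holds fewer than batchLength frames
        return random.sample(self.frames, min(batchLength, len(self.frames)))

    def getCount(self):
        return len(self.frames)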
def run_ddpg(amodel, cmodel, train_indicator=0, seeded=1337,
             track_name='practgt2.xml'):
    OU = FunctionOU()
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001    # Target Network HyperParameters
    LRA = 0.0001   # Learning rate for Actor
    LRC = 0.001    # Learning rate for Critic
    ALPHA = 0.9

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input

    np.random.seed(seeded)

    vision = False

    EXPLORE = 100000.
    if train_indicator:
        episode_count = 600
    else:
        episode_count = 3
    max_steps = 20000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False,
                   track_name=track_name)

    if not train_indicator:
        # Now load the weights
        print("Now we load the weights")
        try:
            actor.model.load_weights(amodel)
            critic.model.load_weights(cmodel)
            actor.target_model.load_weights(amodel)
            critic.target_model.load_weights(cmodel)
            print("Weights loaded successfully")
        except Exception:
            print("Cannot find the weights")
            exit()

    print("TORCS Experiment Start.")
    best_lap = 500

    for i_episode in range(episode_count):
        print("Episode : " + str(i_episode) + " Replay Buffer " + str(buff.count()))

        if np.mod(i_episode, 3) == 0:
            # relaunch TORCS every 3 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0,
                         ob.track))
        total_reward = 0.

        for j_iter in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                              ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0,
                              ob.track))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i_episode, "Step", step, "Action", a_t,
                  "Reward", r_t, "Loss", loss)

            if np.mod(step, 1000) == 0:
                logging.info("Episode {}, Distance {}, Last Lap {}".format(
                    i_episode, ob.distRaced, ob.lastLapTime))
            if ob.lastLapTime > 0:
                # a faster lap is a smaller lap time
                if ob.lastLapTime < best_lap:
                    best_lap = ob.lastLapTime

            step += 1
            if done:
                break

        if train_indicator and i_episode > 20:
            if np.mod(i_episode, 3) == 0:
                logging.info("Now we save model")
                actor.model.save_weights("ddpg_actor_weights_periodic.h5",
                                         overwrite=True)
                critic.model.save_weights("ddpg_critic_weights_periodic.h5",
                                          overwrite=True)

        print("TOTAL REWARD @ " + str(i_episode) + "-th Episode : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("Best Lap {}".format(best_lap))
        print("")
        logging.info("TOTAL REWARD @ " + str(i_episode) +
                     "-th Episode : Reward " + str(total_reward))
        logging.info("Best Lap {}".format(best_lap))

    env.end()  # This is for shutting down TORCS
    logging.info("Finish.")
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001    # Target Network HyperParameters
    LRA = 0.0001   # Learning rate for Actor
    LRC = 0.001    # Learning rate for Critic

    action_dim = 3   # Steering/Acceleration/Brake
    state_dim = 512  # of sensors input

    np.random.seed(61502)

    vision = True

    EXPLORE = 100000.
    episode_count = 600000
    max_steps = 1800
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0
    esar2 = []
    esar4 = []

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # We insert the Deep Q Image Processing Module
    args = {
        'save_model_freq': 10000,
        'target_model_update_freq': 10000,
        'normalize_weights': True,
        'learning_rate': .00025,
        'model': None
    }
    C = DeepQNetwork(512, sess, '/home/lou/DDPG-Keras-Torcs', args=args)
    x, h_fc1 = C.buildNetwork('test', trainable=True, numActions=1)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights
    print("Now we load the weights")
    try:
        actor.model.load_weights("actormodelIMG.h5")
        critic.model.load_weights("criticmodelIMG.h5")
        actor.target_model.load_weights("actormodel2IMG.h5")
        critic.target_model.load_weights("criticmodel2IMG.h5")
        print("Weights loaded successfully")
    except Exception:
        print("Cannot find the weights")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 500) == 0:
            # relaunch TORCS every 500 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        imgfinal = np.zeros((1, 128, 128, 4), dtype=np.int32)
        s_t = C.getFC7(imgfinal)
        total_reward = 0.
        imglst = []
        speed = 0
        stepreset = 0

        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            # a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            a_t_original = actor.model.predict(C.getFC7(imgfinal))

            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            if random.random() <= 0.05:
                print("********Now we apply the brake***********")
                noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                    a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            imglst.append(ob.img)
            if len(imglst) == 4:
                # stack the last four frames into a (1, 128, 128, 4) tensor
                imgcopy = imglst[:]
                imgfinal = np.stack(imgcopy)
                imgfinal = np.reshape(imgfinal, (4, 128, 128))
                imgfinal = np.transpose(imgfinal, (1, 2, 0))
                imgfinal = np.reshape(imgfinal, (1, 128, 128, 4))

            s_t1 = C.getFC7(imgfinal)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            new_states = np.reshape(new_states, (-1, 512))
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                states = np.reshape(states, (-1, 512))
                print('STATESSHAPE', np.shape(states))
                print('ACTIONSSHAPE', np.shape(actions))
                print('YT', np.shape(y_t))
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            speed += ob.speedX * 300
            step += 1
            stepreset += 1
            speedavg = speed / stepreset  # stepreset is incremented first to avoid division by zero

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss, "Average Speed", speedavg)
            esar = (i, step, a_t, r_t, loss, speedavg)
            esar2.append(esar)

            if len(imglst) >= 4:
                del imglst[0]  # keep only the last four frames

            if done:
                break

        if np.mod(i, 50) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights("actormodelIMG.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodelIMG.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")
        esar3 = (i, step, total_reward, speedavg)
        esar4.append(esar3)
        if np.mod(i, 50) == 0:
            save_object(esar2, 'IntraEpisode.pkl')
            save_object(esar4, 'InterEpisode.pkl')

    env.end()  # This is for shutting down TORCS
    print("Finish.")
    print("Saving esars.")
def programmatic_game(steer, accel, brake, track_name='practice.xml'):
    episode_count = 1
    max_steps = 10000
    window = 5

    # Generate a Torcs environment
    env = TorcsEnv(vision=False, throttle=True, gear_change=False,
                   track_name=track_name)

    logging.info("TORCS Experiment Start with Priors on " + track_name)
    observation_list = []
    actions_list = []
    for i_episode in range(episode_count):
        ob = env.reset(relaunch=True)
        tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                   [ob.speedZ], [ob.rpm],
                   list(ob.wheelSpinVel / 100.0),
                   list(ob.track), [0, 0, 0]]
        window_list = [tempObs[:] for _ in range(window)]

        total_reward = 0
        sp = []
        lastLapTime = []

        for j in range(max_steps):
            steer_action = clip_to_range(steer.pid_execute(window_list), -1, 1)
            accel_action = clip_to_range(accel.pid_execute(window_list), 0, 1)
            brake_action = clip_to_range(brake.pid_execute(window_list), 0, 1)
            action_prior = [steer_action, accel_action, brake_action]

            observation_list.append(window_list[:])
            actions_list.append(action_prior)

            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0),
                       list(ob.track), action_prior]
            window_list.pop(0)
            window_list.append(tempObs[:])

            ob, r_t, done, info = env.step(action_prior)

            total_reward += r_t
            sp.append(info['speed'])

            # record each completed lap time exactly once
            if not lastLapTime:
                if info['lastLapTime'] > 0:
                    lastLapTime.append(info['lastLapTime'])
            elif info['lastLapTime'] > 0 and lastLapTime[-1] != info['lastLapTime']:
                lastLapTime.append(info['lastLapTime'])

            if done:
                print('Done. Steps: ', j)
                break

        logging.info(" step: " + str(j + 1) + " " + str(i_episode) +
                     "-th Episode Reward: " + str(total_reward) +
                     " Ave Reward: " + str(total_reward / (j + 1)) +
                     "\n Distance: " + str(info['distRaced']) + ' ' +
                     str(info['distFromStart']) +
                     "\n Last Lap Times: " + str(info['lastLapTime']) +
                     " Cur Lap Times: " + str(info['curLapTime']) +
                     " lastLaptime: " + str(lastLapTime) +
                     "\n ave sp: " + str(np.mean(sp)) +
                     " max sp: " + str(np.max(sp)))

    env.end()  # This is for shutting down TORCS
    logging.info("Finish.")

    return observation_list, actions_list
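# clip_to_range is assumed to clamp a PID controller output into the legal
# actuator range; a one-line sketch:
def clip_to_range(value, low, high):
    return max(low, min(high, value))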
def work(self, max_episode_length, gamma, sess, coord, saver):
    """Does the actual work, as the name says ;) Runs the episodes."""
    vision = False
    self.local_AC.is_training = False

    # env = TorcsDockerEnv(self.docker_client, self.name, self.docker_port, training=True)
    env = TorcsEnv(vision=False, throttle=True, gear_change=False,
                   port=self.docker_port)

    episode_count = sess.run(self.global_episodes)
    total_steps = 0
    # initialize once, outside the loop, so the periodic relaunch below
    # actually fires every ninth episode rather than every episode
    local_episodes = 0

    print("Starting {}".format(self.name))
    with sess.as_default(), sess.graph.as_default():
        while not coord.should_stop():
            # Update with global weights: the asynchronous part of A3C
            sess.run(self.update_local_ops)
            episode_buffer = []
            episode_values = []
            episode_frames = []
            episode_reward = 0
            episode_step_count = 0

            # relaunch TORCS periodically to avoid the memory leak
            if np.mod(local_episodes, 9) == 0:
                observation = env.reset(relaunch=True)
            else:
                observation = env.reset()
            state_t = obs_to_state(observation)

            done = False
            while not done:
                # Get the action and apply it to the environment
                action_t, value_t = sess.run(
                    [self.local_AC.action, self.local_AC.value],
                    feed_dict={self.local_AC.inputs: [state_t]})
                observation, reward_t, done, _ = env.step(action_t[0][0])

                if not done:
                    state_t1 = obs_to_state(observation)
                    episode_frames.append(state_t1)
                else:
                    state_t1 = state_t

                # Store the episode
                episode_buffer.append([state_t, action_t, reward_t,
                                       state_t1, done, value_t[0, 0]])
                episode_values.append(value_t[0, 0])

                episode_reward += reward_t
                state_t = state_t1
                total_steps += 1
                episode_step_count += 1

                if (total_steps % 30) == 0:
                    print(self.name, "Episode", episode_count, "Step",
                          episode_step_count, "Total_Steps", total_steps,
                          "Action", action_t[0][0], "Reward", reward_t)
                    summary = tf.Summary()
                    summary.value.add(tag='summary/reward_1',
                                      simple_value=float(reward_t))
                    self.summary_writer.add_summary(summary, total_steps)
                    self.summary_writer.flush()

                # If the episode buffer is full, flush it and update
                # the network weights
                if (len(episode_buffer) == 15 and not done and
                        episode_step_count != max_episode_length - 1):
                    value_t1 = sess.run(
                        self.local_AC.value,
                        feed_dict={self.local_AC.inputs: [state_t]})[0, 0]
                    (value_loss, policy_loss, gradient_norm,
                     variable_norm) = self.train(episode_buffer, sess,
                                                 gamma, value_t1)
                    episode_buffer = []
                    sess.run(self.update_local_ops)

                if done or episode_step_count >= max_episode_length:
                    break

            local_episodes += 1
            self.episode_rewards.append(episode_reward)
            self.episode_lengths.append(episode_step_count)
            self.episode_mean_values.append(np.mean(episode_values))

            if len(episode_buffer) != 0:
                # Train the network using the most recent episodes
                (value_loss, policy_loss, gradient_norm,
                 variable_norm) = self.train(episode_buffer, sess, gamma, 0.0)

            if episode_count != 0:
                if self.name == 'worker_0':
                    saver.save(sess,
                               os.path.join(self.modeldir,
                                            'model-{:d}.cptk'.format(episode_count)))

                mean_reward = np.mean(self.episode_rewards[-5:])
                mean_length = np.mean(self.episode_lengths[-5:])
                mean_value = np.mean(self.episode_mean_values[-5:])
                print("Worker", self.name, "Episode", episode_count,
                      "Reward", mean_reward, "value_loss", value_loss,
                      "policy_loss", policy_loss)

                summary = tf.Summary()
                summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                summary.value.add(tag='Losses/Value Loss', simple_value=float(value_loss))
                summary.value.add(tag='Losses/Policy Loss', simple_value=float(policy_loss))
                summary.value.add(tag='Losses/Grad Norm', simple_value=float(gradient_norm))
                summary.value.add(tag='Losses/Var Norm', simple_value=float(variable_norm))
                self.summary_writer.add_summary(summary, episode_count)
                self.summary_writer.flush()

            if self.name == 'worker_0':
                sess.run(self.increment)
            episode_count += 1

    env.end()
def main():
    # Verify the necessary directories exist
    collect_track_no = 5
    experiment_name = "tensorboard-4"
    experiment_dir = "experiment-%s/" % experiment_name
    models_dir = experiment_dir + "model/"
    datas_dir = experiment_dir + "datas-track-no-%d/" % collect_track_no
    if not os.path.exists(experiment_dir):
        print("%s doesn't exist" % experiment_dir)
        return
    if not os.path.exists(models_dir):
        print("%s doesn't exist" % models_dir)
        return
    if not os.path.exists(datas_dir):
        os.mkdir(datas_dir)

    action_dim = 1
    state_dim = 30
    env_name = 'torcs'

    sess = tf.InteractiveSession()
    agent = ddpg(env_name, sess, state_dim, action_dim, models_dir)
    agent.load_network()

    vision = True
    env = TorcsEnv(vision=vision, throttle=True, text_mode=False,
                   track_no=collect_track_no, random_track=False,
                   track_range=(0, 3))

    print("Collecting Start.")
    max_data_entry_count = 2000
    data_entry_count = 0
    start_time = time.time()
    i = 0
    step = 0
    try:
        file = open(datas_dir + 'state-action-scalar', 'w')
        while data_entry_count < max_data_entry_count:
            if np.mod(i, 3) == 0:
                ob = env.reset(relaunch=True)
            else:
                ob = env.reset()
            s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                             ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                             ob.rpm, 0.0))
            pre_a_t = 0.0
            while data_entry_count < max_data_entry_count:
                a_t = agent.action(s_t)
                ob, r_t, done, info = env.step([a_t[0], 0.16, 0])
                print("Step", step, "Action", a_t, "Reward", r_t)
                s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                                  ob.speedY, ob.speedZ,
                                  ob.wheelSpinVel / 100.0, ob.rpm, a_t[0]))
                image = ob.img
                if step > 20:
                    # skip the first frames while the car is still standing
                    plt.imsave(datas_dir + ("%d-%d.jpg" %
                                            (collect_track_no, data_entry_count)),
                               image)
                    ret = file.write("%f %f %f %f %f\n" %
                                     (ob.speedX, ob.speedY, ob.speedZ,
                                      pre_a_t, a_t[0]))
                    if ret == 0:
                        print("File write error")
                    data_entry_count += 1
                s_t = s_t1
                step += 1
                pre_a_t = a_t[0]
                if done:
                    break
            print("TOTAL REWARD @ " + str(i) + " Collect", data_entry_count)
            print("Total Step: " + str(step))
            print("")
            i += 1  # without this the episode counter never advances
    except Exception:
        traceback.print_exc()
        with open((datas_dir + "exception"), 'w') as exc_file:
            exc_file.write(str(traceback.format_exc()))
    finally:
        file.close()
        env.end()

    end_time = time.time()
    with open(datas_dir + "log", 'w') as log_file:
        log_file.write("total_step = %d\n" % step)
        log_file.write("total_time = %s (s)\n" % str(end_time - start_time))
    print("Finish.")
def main(): """main method log runtime and print it at the end """ s_time = timeit.default_timer() global iteration env = TorcsEnv(vision=False, throttle=True, gear_change=False) memory = ReplayBuffer() epsilon = 1 train_indicator = True modelPATH = os.path.join('.',"models",'E0011.pt') q,q_target = QNet(state_dim,action_dim),QNet(state_dim,action_dim) q_target.load_state_dict(q.state_dict()) mu, mu_target = MuNet(state_dim), MuNet(state_dim) mu_target.load_state_dict(mu.state_dict()) steer_noise = OUN(np.zeros(1),theta = 0.6) accel_noise = OUN(np.zeros(1),theta = 0.6) mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu) q_optimizer = optim.Adam(q.parameters(), lr=lr_q) #tensorboard writer current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") log_dir = os.path.join("logs", "ddpg_torch", current_time+'E0011t') writer = SummaryWriter(log_dir) samplestate = torch.rand(1,29) sampleaction = torch.rand(1,2) #writer.add_graph(mu,samplestate) writer.add_graph(q,(samplestate,sampleaction)) writer.close if train_indicator ==False: mu = torch.load(modelPATH) mu.eval() ob = env.reset() score = 0 for n_step in range(100000): s_t = np.hstack((ob.angle, ob.track,ob.trackPos,ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm)) a_t = mu(torch.from_numpy(s_t.reshape(1,-1)).float()).detach().numpy() ob,r_t,done,_ = env.step(a_t[0]) score += r_t if done: print("score:",score) break env.end() return 0 for n_epi in range(max_episode): print("Episode : " + str(n_epi) + " Replay Buffer " + str(memory.size())) if np.mod(n_epi, 3) == 0: ob = env.reset(relaunch=True) #relaunch TORCS every 3 episode because of the memory leak error else: ob = env.reset() a_t = np.zeros([1,action_dim]) s_t = np.hstack((ob.angle, ob.track,ob.trackPos,ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm)) score = 0 q_value_writer(q, mu, s_t, writer, 'Episode Start Q value') q_value_writer(q_target, mu_target, s_t, writer, 'Episode Start target Q value') #t_start = timeit.default_timer() for n_step in range(max_step): #epsilon -= 1.0/EXPLORE a_origin = mu(torch.from_numpy(s_t.reshape(1,-1)).float()) if train_indicator == True:#add noise for train # sn = max(epsilon,0)*steer_noise() sn = steer_noise() # an = max(epsilon,0)*accel_noise() an = accel_noise() a_s = a_origin.detach().numpy()[0][0] + sn a_t[0][0] = np.clip(a_s,-1,1) # fit in steer arange a_a = a_origin.detach().numpy()[0][1] + an a_t[0][1] = np.clip(a_a,0,1) # fit in accel arange #record noise movement if iteration%10==0: writer.add_scalar('Steer noise', sn, iteration) writer.add_scalar('Accel_noise', an, iteration) else: a_t = a_origin.detatch().numpy() ob,r_t,done,_ = env.step(a_t[0]) score += r_t s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm)) memory.put((s_t,a_t[0],r_t,s_t1,done)) s_temp = copy.deepcopy(s_t) # for end q value log s_t = s_t1 if train_indicator and memory.size()>train_start_size: train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer,writer) soft_update(mu, mu_target) soft_update(q, q_target) iteration+=1 if done: q_value_writer(q,mu,s_temp,writer,'Episode End Q value') q_value_writer(q_target,mu_target,s_temp,writer,'Episode End target Q value') break #t_end = timeit.default_timer() print("TOTAL REWARD @ " + str(n_epi) +"-th Episode : Reward " + str(score)) print("Total Step: " + str(n_step)) print("") #print('{}steps, {} time spent'.format(i,t_end-t_start)) torch.save(mu,modelPATH) env.end() e_time = timeit.default_timer() print("Total step {} 
and time spent {}".format(iteration, e_time-s_time))
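# The script above calls OUN and soft_update without defining them.
# A minimal sketch of what they typically look like in PyTorch DDPG code;
# the sigma/dt defaults and the tau value are assumptions, not taken from
# the original source.
import numpy as np

class OUN:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""
    def __init__(self, mu, theta=0.6, sigma=0.2, dt=1.0):  # sigma, dt: assumed defaults
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.x = np.zeros_like(mu)
    def __call__(self):
        dx = self.theta * (self.mu - self.x) * self.dt \
             + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape)
        self.x = self.x + dx
        return self.x

def soft_update(net, net_target, tau=0.001):  # tau: assumed value
    """Polyak averaging: target <- tau * net + (1 - tau) * target."""
    for p, p_t in zip(net.parameters(), net_target.parameters()):
        p_t.data.copy_(tau * p.data + (1.0 - tau) * p_t.data)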
def sample_path(self, num_episodes=None):
    """
    MODIFIED FOR TORCS!
    Sample paths for the environment.

    Args:
        num_episodes: the number of episodes to be sampled;
            if None, sample one batch (size indicated by config file)
    Returns:
        paths: a list of paths. Each path in paths is a dictionary with
            path["observation"]: a numpy array of ordered observations in the path
            path["actions"]: a numpy array of the corresponding actions in the path
            path["reward"]: a numpy array of the corresponding rewards in the path
        total_rewards: the sum of all rewards encountered during this "path"
    """
    episode = 0
    episode_rewards = []
    episode_roll_distances = []
    paths = []
    t = 0
    i = 0
    print("TORCS Experiment Start".center(80, '='))
    env = TorcsEnv(vision=self.config.vision, throttle=self.config.throttle)
    #print('Num episodes', num_episodes)
    print('Using a batch size of: ', self.config.batch_size)
    try:
        while (num_episodes or t < self.config.batch_size):
            i += 1
            print('t', t, 'i', i)
            # Avoid the TORCS memory leak by relaunching every 10th episode
            # (the original had the branches inverted and relaunched on all the others)
            if np.mod(i, 10) == 0:
                state = env.reset(relaunch=True)
            else:
                state = env.reset()
            state = np.concatenate([state.track,
                                    np.array([state.speedX, state.speedY, state.speedZ])], axis=0)
            states, actions, rewards = [], [], []
            episode_reward = 0
            for step in range(self.config.max_ep_len):
                states.append(state)
                action = self.sess.run(
                    self.sampled_action,
                    feed_dict={self.observation_placeholder:
                               np.reshape(states[-1], [1, self.observation_dim])})[0]
                state, reward, done, info = env.step(action)
                state = np.concatenate([state.track,
                                        np.array([state.speedX, state.speedY, state.speedZ])], axis=0)
                actions.append(action)
                rewards.append(reward)
                episode_reward += reward
                t += 1
                if (done or step == self.config.max_ep_len - 1):
                    episode_rewards.append(episode_reward)
                    episode_roll_distances.append(env.distance_travelled)
                    break
                if (not num_episodes) and t == self.config.batch_size:
                    break
            path = {"observation": np.array(states),
                    "reward": np.array(rewards),
                    "action": np.array(actions)}
            paths.append(path)
            episode += 1
            if num_episodes and episode >= num_episodes:
                break
    finally:
        env.end()  # This is for shutting down TORCS
        print("Finished TORCS session".center(80, '='))
    return paths, episode_rewards, episode_roll_distances
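# How sampled paths are typically consumed downstream: each path's reward
# sequence is turned into discounted returns before the policy-gradient
# update. A minimal sketch; the helper name and the gamma default are
# assumptions (the config's actual discount is not shown in the source).
import numpy as np

def get_returns(paths, gamma=0.99):
    all_returns = []
    for path in paths:
        rewards = path["reward"]
        returns = np.zeros_like(rewards, dtype=np.float64)
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running  # G_t = r_t + gamma * G_{t+1}
            returns[t] = running
        all_returns.append(returns)
    return np.concatenate(all_returns)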
def playGame(train_indicator=1, safety_constrain_flag=False):  # 1 means Train, 0 means simply Run
    #initialization = 0
    episode_trained = 0
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.9999
    TAU = 0.001     # Target Network HyperParameters
    LRA = 0.0001    # Learning rate for Actor
    LRC = 0.001     # Learning rate for Critic

    action_dim = 2       # Steering/Acceleration
    state_dim = 29 + 36  # of sensors input
    np.random.seed(1337)
    vision = False
    EXPLORE = 100000.
    episode_count = 1000
    max_steps = 300
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    indicator = 0
    plt.ion()

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel_following.h5")
        critic.model.load_weights("criticmodel_following.h5")
        actor.target_model.load_weights("actormodel_following.h5")
        critic.target_model.load_weights("criticmodel_following.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    cumreward_list = []
    average_step_reward_list = []
    damage_rate_list = []
    epsilon_list = []
    results_list = []
    trackPos_list = []
    speed_list = []
    epreward_list = []
    damage_time = []
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        print("Epsilon is: ", epsilon)
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)  # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()
        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))
        epsilon = epsilon * 0.998
        total_reward = 0.
        damage_steps = 0
        for j in range(max_steps):
            loss = 0
            damage = 0
            #epsilon -= 1 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            # the same target network produces the action in both train and test mode
            a_t_original = actor.target_model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0.1) * OU.function2(a_t_original[0][0], 0.5, 0.90, 0.2)
            #noise_t[0][1] = train_indicator * max(epsilon, 0.0) * OU.function(a_t_original[0][1], 1.0, 1.00, 0.10)
            noise_t[0][1] = train_indicator * max(epsilon, 0.1) * OU.function1(a_t_original[0][1], 0.9, 1.0, 0.60)
            # The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)
            '''
            if np.random.randn() < max(epsilon, 0.05):
                a_t[0][0] = np.random.randn() * 2 - 1
            else:
                a_t[0][0] = a_t_original[0][0]
            '''
            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t_primitive = Get_actions(a_t[0][0], a_t[0][1], ob,
                                        safety_constrain=safety_constrain_flag)
            ob, r_t, done, info = env.step(a_t_primitive)
            if r_t == -5.0 or r_t == -1.0:
                damage_steps += 1
                damage = 1
            trackPos_list.append(ob.trackPos)
            speed_list.append(ob.speedX)
            epreward_list.append(r_t)
            damage_time.append(damage)
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                              ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))
            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer
            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]
            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()
            total_reward += r_t
            s_t = s_t1
            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                break
        damage_rate = float(damage_steps) / j * 100
        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel_following.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel_following.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
        if train_indicator:
            # Save the results
            cumreward_list.append(total_reward)
            average_step_reward_list.append(total_reward / j)
            damage_rate_list.append(damage_rate)
            epsilon_list.append(epsilon)
            sio.savemat('results_overtaking.mat',
                        {'total_reward': cumreward_list,
                         'average_reward': average_step_reward_list,
                         'epsilon': epsilon_list,
                         'damage': damage_rate_list})
        else:
            sio.savemat('info.mat',
                        {'ep_reward': epreward_list,
                         'trackPos': trackPos_list,
                         'speed': speed_list,
                         'damage_rate': damage_rate,
                         'damage_time': damage_time})
        print('damage rate is:', damage_rate)
        plt.figure(1)
        plt.hold(True)
        plt.subplot(511)
        plt.plot(i, total_reward, 'ro')
        plt.xlabel("Episode")
        plt.ylabel("Episodic total reward")
        plt.subplot(512)
        plt.plot(i, total_reward / j, 'bo')
        plt.xlabel("Episode")
        plt.ylabel("Expected reward each step")
        plt.subplot(513)
        plt.plot(i, damage_rate, 'go')
        plt.xlabel("Episode")
        plt.ylabel("Damage rate per episode [%]")
        plt.subplot(514)
        plt.plot(i, max(epsilon, 0.1), 'yo')
        plt.xlabel("Episode")
        plt.ylabel("epsilon")
        plt.subplot(515)
        plt.plot(i, loss / j, 'yo')
        plt.xlabel("Episode")
        plt.ylabel("Average loss")
        plt.draw()
        plt.show()
        plt.pause(0.001)
        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")
    env.end()  # This is for shutting down TORCS
    plt.savefig('test.png')
    print("Finish.")
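# Several scripts in this collection share a tiny Ornstein-Uhlenbeck helper
# class OU. The function1/function2 variants used above are not shown in the
# source; the standard OU.function(x, mu, theta, sigma) they derive from
# conventionally looks like this (a sketch, not the authors' exact code):
import numpy as np

class OU(object):
    """One-step Ornstein-Uhlenbeck exploration noise."""
    def function(self, x, mu, theta, sigma):
        # drift toward mu plus Gaussian diffusion
        return theta * (mu - x) + sigma * np.random.randn(1)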
def main():
    global play, replay_buffer, q_act_net, q_target_net, step
    #sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    tstr = time.strftime('_%H_%M_%S_')
    first_run = False    # don't use saved nets/buffer
    reset_buffer = True  # don't use saved buffer

    # dqn params
    GAMMA = 0.99
    TAU = .0001  # .0001

    # exploration noise params
    act_noise_init = 0.5   # .75 .25
    act_noise_final = .01  # .25
    act_noise_interval = 100000
    rnd_range = 1

    # lag augmentation
    packet_lost = 0  # 0.01

    # action smoothing augmentation
    lambda_spatial_q = 0
    action_smoother = .33
    action_limiter = .33

    episode_count = 10000
    max_steps = 5000
    save_in_iters = 15000  # 100000

    # start training after accumulating train_start_num samples
    train_start_num = 1 * BATCH_SIZE

    caffe.set_mode_gpu()
    caffe.set_device(0)

    # balance track samples in replay buffer
    track_balance = .9
    # n-steps dqn, steps = max_n_batches * batch size
    max_n_batches = 16
    # use n-steps dqn for this ratio, rest 1-step
    n_step_ratio = .75
    # priority buffer
    k_priority_try = 2
    if n_step_ratio > 0 and max_n_batches > 0:
        n_steps_dqn = True
    else:
        n_steps_dqn = False

    # average speed
    c_speed = 35  # 50
    # speed variance
    delta_speed = 5  # 7.5
    # switch speed interval
    switch_count_min = 50
    # target frame time
    frame_rate = .4
    # fail if lag is more than
    t_delta_fail = frame_rate * 1.75
    # for rebound handling
    rebound_count_max = 5
    start_run = 50
    # for error handling
    after_start_check = 100
    max_errors = 50

    # solver for current net
    if not play:
        critic_solver = caffe.get_solver(current_dir + 'resnet_torcs/dqn_critic_solver.prototxt')
    if first_run:
        # target net:
        q_target_net = caffe.Net(current_dir + 'resnet_torcs/critic_batch_dqn.prototxt',
                                 current_dir + 'r18nb.caffemodel', caffe.TEST)
        # current net:
        q_act_net = caffe.Net(current_dir + 'resnet_torcs/critic_deploy_dqn.prototxt', caffe.TEST)
        if not play:
            ParamCopy(critic_solver.net.params, q_target_net.params)
        ParamCopy(q_act_net.params, q_target_net.params)
    else:
        # target net:
        q_target_net = caffe.Net(current_dir + 'resnet_torcs/critic_batch_dqn.prototxt',
                                 'qq_target.caffemodel', caffe.TEST)
        # current net:
        q_act_net = caffe.Net(current_dir + 'resnet_torcs/critic_deploy_dqn.prototxt',
                              'q_solver.caffemodel', caffe.TEST)
        if not play:
            ParamCopy(critic_solver.net.params, q_act_net.params)
    if not play and not reset_buffer:
        print 'loading replay_buffer buffer'
        replay_buffer = load_replay()
        replay_buffer.size_reduce(BUFFER_SIZE)
        print 'replay_buffer buffer loaded'
    print 'models loaded ***************************'

    if not play:
        assert q_target_net.blobs['state'].data.shape[0] == BATCH_SIZE
        assert q_act_net.blobs['state'].data.shape[0] == 1
        assert critic_solver.net.blobs['state'].data.shape[0] == BATCH_SIZE
        assert q_target_net.blobs['state'].data.shape[1] == CHANNELS
        assert q_act_net.blobs['state'].data.shape[1] == CHANNELS
        assert critic_solver.net.blobs['state'].data.shape[1] == CHANNELS
        assert q_target_net.blobs['q_action'].data.shape[1] == DISCR_A
        assert q_act_net.blobs['q_action'].data.shape[1] == DISCR_A
        assert critic_solver.net.blobs['q_action'].data.shape[1] == DISCR_A

    max_reached_step = 150  # used for track balance
    images_history = []     # used for input image
    step = 0                # total number of simulation steps
    save_count = 0          # used for saving nets/buffer
    n_batch = 0             # used for n-steps
    q_loss = 0              # main loss

    # Generate a Torcs environment
    env = TorcsEnv(vision=True, throttle=False, observer=False)
    time_start = time.time()
    track_id = 0  # track

    # n-step temp vars
    n_steps_cont_from_prev = False
    prev_start_pos = -1
    prev_track_id = -1
    Qlast = -1
    episod_steps = 0
    n_steps_used = 0
    batches_used = 0

    # for error-failure handling
    rest_fail = 0
    rebound_events = 0
    for i in range(episode_count):
        # balance tracks
        if episod_steps >= max_reached_step * track_balance:
            track = t_list[track_id]
            change_track("/usr/local/share/games/torcs/config/raceman/quickrace.xml", track)
            print "Track: ", track, "track_id", track_id
            episod_steps = 0
        print("Episode : " + str(i))
        ob = env.reset(relaunch=True)
        s_t = None  # input image
        total_reward = 0.
        # for randomizing velocity
        switch_count = switch_count_min + random.randint(0, switch_count_min)
        # for handling out-of-lane
        rebound = False
        rebound_count = 0
        track_pos = 0
        error_count = 0
        act_prev = np.array([0.])
        t_delta = 0
        for j in range(max_steps):
            max_reached_step = max(max_reached_step, j)
            a_t = np.array([0.])  # action
            skip_state = False
            error_present = False
            # exploration noise params
            act_noise = act_noise_init + (act_noise_final - act_noise_init) * \
                min(step * 1. / act_noise_interval, 1.)
            rnd_noise = 1
            if rnd_range > 1:
                rnd_noise = int((rnd_range + 1) *
                                max(1., float(act_noise_interval - step) / act_noise_interval))
            # get action =======================================================
            if s_t is None:
                action_index = random.randrange(DISCR_A)
                print '----------Random Action---------- action_index', action_index
                a_t[0] = ind2a(action_index, DISCR_A, DELTA_A)
            else:
                a_t[0] = qchoice(q_act_net, s_t, CHANNELS, DISCR_A, DELTA_A)
            # apply exploration noise
            if not play and random.random() <= act_noise:
                ind = a2ind(a_t[0], DISCR_A, DELTA_A)
                r = 1
                if rnd_noise > 1:
                    r = randint(1, rnd_noise)
                ind += randint(-r, r)
                ind = min(max(ind, 0), DISCR_A - 1)
                a_t[0] = ind2a(ind, DISCR_A, DELTA_A)
            # if still no action use random
            if a_t is None:
                action_index = random.randrange(DISCR_A)
                print 'rnd action_index', action_index
                a_t[0] = ind2a(action_index, DISCR_A, DELTA_A)
            # starting area
            if j < start_run:
                a_t[0] = 0
            # action limiter
            if not play and abs(a_t[0]) > DELTA_A / 2 and random.random() < action_limiter:
                ind = a2ind(a_t[0], DISCR_A, DELTA_A)
                dind = ind - DISCR_A / 2
                if dind > (DISCR_A - 1) / 4:
                    dind = (DISCR_A - 1) / 4
                if dind < -(DISCR_A - 1) / 4:
                    dind = -(DISCR_A - 1) / 4
                a_t[0] = ind2a(dind + DISCR_A / 2, DISCR_A, DELTA_A)
            # save action
            a_0_list.append(a_t)
            # fail on render delay
            if not play and t_delta > t_delta_fail and i > rest_fail + 10 and j >= after_start_check:
                error_present = True
                if error_count >= max_errors / 2:
                    print 'delta fail **************************'
                    rest_fail = i
                    break
                else:
                    error_count += 1
            # randomize speed (the original's precedence was broken:
            # "(j % switch_count and not play) == 0" never tested the modulus)
            if j % switch_count == 0 and not play:
                tag_speed_rnd = c_speed - delta_speed + random.uniform(0, delta_speed * 2)
            else:
                tag_speed_rnd = c_speed
            # render delay compensation
            if t_delta > frame_rate:
                tag_speed = frame_rate / t_delta * tag_speed_rnd
            else:
                tag_speed = tag_speed_rnd
            # handle out-of-lane event
            if rebound:
                rebound_count = rebound_count_max
            else:
                rebound_count = max(0, rebound_count - 1)
            if (rebound_count > rebound_count_max / 2 and abs(track_pos) > .7) or rebound:
                angle = -observation.angle
                if angle * track_pos > 0 and abs(angle) > .2:
                    a_t[0] = -sign(track_pos) * 4 * DELTA_A / 5
                if angle * track_pos > 0 and abs(angle) <= .2:
                    a_t[0] = -sign(track_pos) * 2 * DELTA_A / 5
                if angle * track_pos < 0 and abs(angle) <= .15:
                    a_t[0] = -sign(track_pos) * DELTA_A / 5
                if angle * track_pos < 0 and abs(angle) > .15:
                    a_t[0] = 0
                if angle * track_pos < 0 and abs(angle) >= .35:
                    a_t[0] = sign(track_pos) * DELTA_A / 5
                tag_speed = min(tag_speed, 20)
                print "############ rebound, action", a_t[0], "V angle", angle, "###############"
            # smooth action
            if not play and action_smoother > 0 and random.random() < action_smoother:
                ind_prev = a2ind(act_prev[0], DISCR_A, DELTA_A)
                ind = a2ind(a_t[0], DISCR_A, DELTA_A)
                if abs(ind - ind_prev) > 1:
                    print "smooth ind", ind, "->", np.rint(.5 * (ind_prev + ind))
                    ind = int(.5 * (ind_prev + ind))
                    a_t[0] = ind2a(ind, DISCR_A, DELTA_A)
            a_act = a_t
            # lag augmentation (the original compared against random.random
            # without calling it)
            if not play and random.random() < packet_lost and t_delta < frame_rate:
                a_act = act_prev
            # ===================== main environment step =========================================
            obs0 = time.time()
            prev_rebound = rebound
            observation, r_t, done, rebound, _ = env.step(a_act, tag_speed)
            curr_time = time.time()
            t_delta = curr_time - time_start
            time_start = curr_time
            # ====================================================================================
            if rebound and not prev_rebound:
                rebound_events += 1
            print 't_delta', t_delta, "step", j, "step time", curr_time - obs0, \
                "tag_speed_rnd", tag_speed_rnd, "rebound_events", rebound_events
            if rebound:
                r_t = 0
            if prev_rebound and r_t == 0:
                skip_state = True
            # speed failure, could be moved to gym_torcs
            if observation.speedX < .01 and j >= after_start_check and t_delta < t_delta_fail:
                skip_state = True
                error_present = True
                r_t = 0
                if error_count >= max_errors:
                    print 'speed too slow fail, speed', 300 * observation.speedX, '**************************'
                    break
                else:
                    error_count += 1
            # make state ========================================================
            image = observation.img
            images_history.append(image)
            while len(images_history) > CHANNELS + 1:
                images_history.pop(0)
            s_t1 = make_state(images_history, CHANNELS)
            track_pos = observation.trackPos
            # save stat
            reward_list.append(r_t)
            track_list.append(track_pos)
            yspeed_list.append(observation.speedY)
            # store data into replay buffer ======================================
            do_store = not play and s_t is not None and s_t1 is not None and not skip_state
            if do_store:
                print 'add data, action', a_t[0], 'reward ', r_t
                w_p = j
                replay_buffer.add(s_t, a_t, r_t, s_t1, done, w_p, track_id, -1, -1)
                print '***** stored: track_pos', track_pos, 'angle', observation.angle,\
                    'max_step', max_reached_step, 'Episode', i
            elif not play:
                print 'skipped state track_pos', track_pos, 'angle', observation.angle,\
                    'max_step', max_reached_step, 'Episode', i
            # training ======================================
            if not play and replay_buffer.num_experiences > train_start_num:
                # get batch using n-steps if previous batch was using n-step
                use_n_steps_now = n_steps_dqn
                if n_batch >= max_n_batches:
                    use_n_steps_now = False
                    n_batch = 0
                if n_steps_cont_from_prev and use_n_steps_now and max_n_batches > 1:
                    assert prev_start_pos >= 0
                    batch, n_steps_collected, prev_start_pos, prev_track_id =\
                        replay_buffer.getBatch4Pos(BATCH_SIZE, prev_start_pos, prev_track_id)
                    n_step_continued = n_steps_collected
                else:
                    n_step_continued = False
                if n_steps_used >= n_step_ratio * batches_used and not n_step_continued:
                    use_n_steps_now = False
                # get batch if previous batch was *not* using n-step
                if not n_step_continued:
                    batch, n_steps_collected, prev_start_pos, prev_track_id =\
                        replay_buffer.getBatch(BATCH_SIZE, max_n_batches, k_priority_try,
                                               n_steps=use_n_steps_now)
                # net training =============
                q_loss, Qlast = train_on_batch(batch, q_target_net, critic_solver, DISCR_A,
                                               DELTA_A, BATCH_SIZE, GAMMA, n_steps_collected,
                                               n_step_continued, Qlast, lambda_spatial_q)
                # update n-step vars
                if n_steps_collected:
                    n_batch += 1
                    n_steps_used += 1
                else:
                    n_batch = 0
                batches_used += 1
                n_steps_cont_from_prev = n_steps_collected and prev_start_pos >= 0
                # target update ==============
                SoftUpdate(q_target_net.params, critic_solver.net.params, TAU)
                ParamCopy(q_act_net.params, critic_solver.net.params)
                save_count += 1
            # save loss
            if not play:
                q_loss_list.append(q_loss)
            # update local vars
            s_t = s_t1
            act_prev = a_t
            if done:
                s_t = None
            if not error_present:
                error_count = max(0, error_count - 1)
            total_reward += r_t
            episod_steps += 1
            step += 1
            if done:
                break
        # save nets and buffer
        if not play and save_count >= save_in_iters:
            print "start save", save_count, step
            save_count = 0
            save_nets(q_act_net, q_target_net, step, replay_buffer)
            save_state(a_0_list, a_1_list, q_loss_list, reward_list, track_list,
                       yspeed_list, tstr + str(step))
        track_id = (track_id + 1) % len(t_list)
        print("TOTAL REWARD @ " + str(i) + " -th Episode : " + str(total_reward))
        print("Total Step: " + str(step))
        print("")
    print("Finishing torcs.")
    env.end()  # This is for shutting down TORCS
    # save nets and buffer
    if not play:
        save_state(a_0_list, a_1_list, q_loss_list, reward_list, track_list,
                   yspeed_list, tstr + str(step))
        save_nets(q_act_net, q_target_net, step, replay_buffer, "_finished")
    print 'Finish'
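# ParamCopy and SoftUpdate above operate on Caffe net.params (an ordered
# dict of layer name -> list of blobs). The source does not show them; a
# sketch of the conventional implementations (names match the calls above,
# everything else is an assumption):
def ParamCopy(dst_params, src_params):
    # hard copy: dst <- src, blob by blob
    for name in src_params:
        for b in range(len(src_params[name])):
            dst_params[name][b].data[...] = src_params[name][b].data

def SoftUpdate(target_params, net_params, tau):
    # Polyak update: target <- tau * net + (1 - tau) * target
    for name in net_params:
        for b in range(len(net_params[name])):
            target_params[name][b].data[...] = (
                tau * net_params[name][b].data
                + (1.0 - tau) * target_params[name][b].data)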
def playGame(train_indicator=is_training):  # 1 means Train, 0 means simply Run
    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input
    env_name = 'Torcs_Env'
    agent = DDPG(env_name, state_dim, action_dim)

    # Generate a Torcs environment
    vision = False
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    EXPLORE = total_explore
    episode_count = max_eps
    max_steps = max_steps_eps
    epsilon = epsilon_start
    done = False
    step = 0
    best_reward = -100000

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        # Occasional testing
        if ((np.mod(i, 10) == 0) and (i > 20)):
            train_indicator = 0
        else:
            train_indicator = is_training

        # relaunch TORCS every 3 episodes because of the memory leak error
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        # Early-episode annealing for out-of-track driving and small progress:
        # during early training, out-of-track and slow driving is allowed, as for
        # humans (a margin of error); as the agent learns to drive, the
        # constraints become stricter.
        random_number = random.random()
        eps_early = max(epsilon, 0.10)
        if (random_number < (1.0 - eps_early)) and (train_indicator == 1):
            early_stop = 1
        else:
            early_stop = 0
        print("Episode : " + str(i) + " Replay Buffer " + str(agent.replay_buffer.count()) +
              ' Early Stopping: ' + str(early_stop) + ' Epsilon: ' + str(eps_early) +
              ' RN: ' + str(random_number))

        # Initializing the first state
        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        # Counting the total reward and total steps in the current episode
        total_reward = 0.
        step_eps = 0.
        for j in range(max_steps):
            # Take noisy actions during training
            if (train_indicator):
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, 0.1)
                a_t = agent.noise_action(s_t, epsilon)  # feed state s to get a noisy action
            else:
                a_t = agent.action(s_t)

            # ob, r_t, done, info = env.step(a_t[0], early_stop)
            ob, r_t, done, info = env.step(a_t, early_stop)  # get the environment feedback
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                              ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            # Add to replay buffer only if training (is it necessary? - don't think so)
            if (train_indicator):
                agent.perceive(s_t, a_t, r_t, s_t1, done)

            # Checking for nan rewards
            if (math.isnan(r_t)):
                r_t = 0.0
                for bad_r in range(50):
                    print('Bad Reward Found')

            total_reward += r_t
            s_t = s_t1

            # Displaying progress every 15 steps.
            if ((np.mod(step, 15) == 0)):
                print("Episode", i, "Step", step_eps, "Epsilon", epsilon,
                      "Action", a_t, "Reward", r_t)

            step += 1
            step_eps += 1
            if done:
                break

        # Saving the best model.
        if total_reward >= best_reward:
            if (train_indicator == 1):
                print("Now we save model with reward " + str(total_reward) +
                      " previous best reward was " + str(best_reward))
                best_reward = total_reward
                agent.saveNetwork()

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")
    env.end()  # This is for shutting down TORCS
    print("Finish.")
def playGame(checkpoints=None, train_indicator=1, eps=1.0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 40000
    BATCH_SIZE = 16
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.01   # Learning rate for Actor
    LRC = 0.05   # Learning rate for Critic
    vision = True

    action_dim = 3  # Steering/Acceleration/Brake
    if vision:
        state_dim = (64, 64, 3)  # of sensors input
    else:
        state_dim = 29
    np.random.seed(1337)

    EXPLORE = 1000000.
    episode_count = 2000
    max_steps = 8000000
    reward = 0
    done = False
    step = 0
    epsilon = eps
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    summary_writer = tf.train.SummaryWriter('logs', graph_def=sess.graph_def)
    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA, vision, summary_writer)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC, vision)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer
    history = History()

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)
    log_file = open('train_log.log', 'w')

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel_{}.h5".format(checkpoints))
        critic.model.load_weights("criticmodel_{}.h5".format(checkpoints))
        actor.target_model.load_weights("actormodel_{}.h5".format(checkpoints))
        critic.target_model.load_weights("criticmodel_{}.h5".format(checkpoints))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    max_reward = 0
    min_reward = 0
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)  # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()
        if vision:
            history.fill(ob.img)
            s_t = history.get()
        else:
            s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                             ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        total_reward = 0.
        total_damage = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            if vision:
                a_t_original = actor.model.predict(s_t.reshape((-1, ) + state_dim))
            else:
                a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.30, 0.30)
            noise_t[0][1] = 0.1 + train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)
            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            ob, r_t, done, info = env.step(a_t[0])
            damage = ob.damage
            if vision:
                last_s_t = history.get().copy()
                history.add(ob.img)
                next_s_t = history.get().copy()
                if np.mod(step, 4) == 0:
                    buff.add(last_s_t, a_t[0], r_t, next_s_t, done)  # Add replay buffer
                s_t1 = history.get()
            else:
                s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                                  ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
                buff.add(s_t, a_t[0], r_t, s_t1, done)
            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
            if vision:
                target_q_values = critic.target_model.predict([
                    new_states.reshape((-1, ) + state_dim),
                    actor.target_model.predict(new_states).reshape((-1, ) + (action_dim, ))
                ])
            else:
                target_q_values = critic.target_model.predict(
                    [new_states, actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]
            if train_indicator and buff.count() >= 1000:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()
            total_reward += r_t
            total_damage += damage
            s_t = s_t1
            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                break
        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel_{}.h5".format(i), overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel_{}.h5".format(i), overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
        max_reward = max(max_reward, total_reward)
        min_reward = min(min_reward, total_reward)
        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward) +
              " EPS " + str(epsilon))
        print("Total Step: " + str(step) + ' Max: ' + str(max_reward) + ' Min: ' + str(min_reward))
        print("")
    env.end()  # This is for shutting down TORCS
    print("Finish.")
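# The History class used above (fill / add / get) is not shown in the
# source. A minimal sketch consistent with state_dim = (64, 64, 3): gym_torcs
# already delivers 64x64x3 frames, so get() can return the latest frame;
# a larger depth would channel-stack several frames (an extension, assumed,
# not shown in the original).
from collections import deque
import numpy as np

class History(object):
    def __init__(self, depth=1):  # depth: assumed default
        self.frames = deque(maxlen=depth)
    def fill(self, frame):
        # seed the stack by repeating the first frame
        for _ in range(self.frames.maxlen):
            self.frames.append(np.asarray(frame))
    def add(self, frame):
        self.frames.append(np.asarray(frame))
    def get(self):
        # concatenate along the channel axis; with depth=1 this is the raw frame
        return np.concatenate(list(self.frames), axis=-1)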
def signal_handler(signal, frame):
    print('You pressed Ctrl+C!')
    # Generate a Torcs environment only to shut any running instance down
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)
    env.end()
    sys.exit(0)
def playGame(train_indicator=0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001   # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001   # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input
    np.random.seed(1337)
    vision = False
    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weight
    # print("Now we load the weight")
    # try:
    #     actor.model.load_weights("actormodel.h5")
    #     critic.model.load_weights("criticmodel.h5")
    #     actor.target_model.load_weights("actormodel.h5")
    #     critic.target_model.load_weights("criticmodel.h5")
    #     print("Weight load successfully")
    # except:
    #     print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)  # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()
        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        print(ob.track)
        total_reward = 0.
        stucked = 0
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)
            # The following code does the stochastic brake
            if random.random() <= 0.1:
                print("********Now we apply the brake***********")
                noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)
            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            ob, r_t, done, info = env.step(a_t[0])
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                              ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer
            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]
            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()
            total_reward += r_t
            s_t = s_t1
            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                break
        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")
    env.end()  # This is for shutting down TORCS
    print("Finish.")
print('Collecting data...')
for i in range(steps):
    if i == 0:
        act = np.array([0.0])
    else:
        act = get_teacher_action(ob)
    if i % 100 == 0:
        print(i)
    ob, reward, done, _ = env.step(act)
    img_list.append(ob.img)
    action_list.append(act)
    reward_list.append(np.array([reward]))
env.end()

print('Packing data into arrays...')
for img, act, rew in zip(img_list, action_list, reward_list):
    images_all = np.concatenate([images_all, img_reshape(img)], axis=0)
    actions_all = np.concatenate([actions_all, np.reshape(act, [1, action_dim])], axis=0)
    rewards_all = np.concatenate([rewards_all, rew], axis=0)

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.optimizers import Adam

# model from https://github.com/fchollet/keras/blob/master/examples/cifar10_cnn.py
model = Sequential()
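# The snippet ends right after model = Sequential(). Based on the cited
# cifar10_cnn.py example, the layer stack presumably continues along these
# lines; the input shape (img_dim), output dimension (action_dim), and the
# mse/Adam compile choice are assumptions for this behavioral-cloning setup,
# not taken from the original source.
model.add(Convolution2D(32, 3, 3, border_mode='same',
                        input_shape=img_dim))  # img_dim: assumed (H, W, 3)
model.add(Activation('relu'))
model.add(Convolution2D(32, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(action_dim))  # regress the teacher action
model.compile(loss='mse', optimizer=Adam(lr=1e-4))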
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001   # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001   # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input
    np.random.seed(1337)
    vision = False
    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)  # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()
        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)
            # The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)
            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            ob, r_t, done, info = env.step(a_t[0])
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                              ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer
            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]
            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()
            total_reward += r_t
            s_t = s_t1
            if np.mod(step, 30) == 0:
                print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                break
        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")
    env.end()  # This is for shutting down TORCS
    print("Finish.")
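# Every DDPG script here uses the same ReplayBuffer interface: add(),
# getBatch(), count(). A minimal sketch of that interface (uniform sampling,
# FIFO eviction); the deque-based internals are an assumption, only the
# method names come from the calls above.
from collections import deque
import random

class ReplayBuffer(object):
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)  # oldest experiences are evicted first
    def add(self, state, action, reward, new_state, done):
        self.buffer.append((state, action, reward, new_state, done))
    def getBatch(self, batch_size):
        # sample min(count, batch_size) experiences uniformly at random
        return random.sample(self.buffer, min(len(self.buffer), batch_size))
    def count(self):
        return len(self.buffer)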
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000  # replay buffer capacity
    BATCH_SIZE = 32       # batch size: number of samples processed at once
    GAMMA = 0.99          # discount factor
    TAU = 0.001           # Target Network HyperParameters
    LRA = 0.0001          # Learning rate for Actor
    LRC = 0.001           # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # 29 sensor inputs
    np.random.seed(1337)  # random seed: the same seed reproduces the same random sequence every run
    vision = False
    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # TensorFlow GPU management policy: grow GPU memory allocation on demand
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # hard-cap GPU memory usage at 40%:
    # config.gpu_options.per_process_gpu_memory_fraction = 0.4
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    theTime = datetime.datetime.now()  # get the current system time
    theTime = theTime.strftime('%y-%m-%d_%H:%M:%S')  # format it as a string used in the CSV file names
    folder_path = "practise_progress/" + theTime + "/"  # Linux-only path
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print("folder created")
    else:
        print("folder existed")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)  # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()
        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        total_reward = 0.
        csvfileHeader = "practise_progress/" + theTime + "/" + " Episode " + str(i) + ".csv"
        fileHeader = ["Step", "TrackPos", "SpeedX", "SpeedY", "SpeedZ",
                      "Action_Steering", "Action_Acceleration", "Action_Brake",
                      "Reward", "Loss"]
        csvFile = open(csvfileHeader, "w")
        writer = csv.writer(csvFile)
        writer.writerow(fileHeader)
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)
            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)
            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            ob, r_t, done, info = env.step(a_t[0])
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                              ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer
            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]
            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()
            total_reward += r_t
            s_t = s_t1
            # Logged fields: step count, track position, X/Y/Z speed (rescaled),
            # steering/acceleration/brake outputs, reward, loss
            csvData = [step, ob.trackPos, ob.speedX * 300, ob.speedY * 300, ob.speedZ * 300,
                       a_t[0, 0], a_t[0, 1], a_t[0, 2], r_t, loss]
            writer.writerow(csvData)
            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                csvFile.close()
                break
        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")
    env.end()  # This is for shutting down TORCS
    print("Finish.")
def preTrain():
    # train the NNs of actor and critic using existing rules
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001   # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001   # Learning rate for Critic
    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input
    np.random.seed(1337)
    vision = False
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)
    # Generate a driver
    driver = DriverExample()

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("pre_actormodel.h5")
        critic.model.load_weights("pre_criticmodel.h5")
        actor.target_model.load_weights("pre_actormodel.h5")
        critic.target_model.load_weights("pre_criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)  # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()
        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        total_reward = 0.
        for j in range(max_steps):
            loss_actor = 0
            loss_critic = 0
            a_t = np.zeros([1, action_dim])
            # the driver produces the actions
            a_t = driver.action(s_t.reshape(state_dim, ))
            ob, r_t, done, info = env.step(a_t)
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                              ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            buff.add(s_t, a_t, r_t, s_t1, done)  # Add replay buffer
            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]
            """
            if (train_indicator == 1):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()
            """
            loss_actor += actor.model.train_on_batch(states, actions)           # train actor
            loss_critic += critic.model.train_on_batch([states, actions], y_t)  # train critic
            actor.target_train()
            critic.target_train()
            total_reward += r_t
            s_t = s_t1
            print("Episode", i, "Step", step, ": ")
            print("Action", a_t, "Reward", r_t)
            print("loss_actor", loss_actor, "loss_critic", loss_critic)
            step += 1
            if np.mod(step, 100) == 0:
                print("Now we save model")
                actor.model.save_weights("pre_actormodel.h5", overwrite=True)
                with open("pre_actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("pre_criticmodel.h5", overwrite=True)
                with open("pre_criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
            if done:
                break
        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")
    env.end()  # This is for shutting down TORCS
    print("Finish.")
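# The actor/critic classes in these Keras scripts call target_train()
# without showing it; in this family of DDPG code it is the standard Polyak
# update over model weights. A sketch (the tau default and the free-function
# form are assumptions; in the original it is a method of the network class):
def target_train(model, target_model, tau=0.001):
    weights = model.get_weights()
    target_weights = target_model.get_weights()
    for i in range(len(weights)):
        # target <- tau * online + (1 - tau) * target
        target_weights[i] = tau * weights[i] + (1.0 - tau) * target_weights[i]
    target_model.set_weights(target_weights)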
for episode in range(4000):
    print('Episode: ', episode)
    if episode % 1 == 0:  # always true: relaunch TORCS every episode (avoid the memory leak bug in torcs)
        ob = env.reset(relaunch=True)
    else:
        ob = env.reset()
    for move in range(10000):
        if TARGET_MODEL:
            action = act(target_actor_model, observation_formatter(ob))
        else:
            action = act(actor_model, observation_formatter(ob))
        action = action.flatten()
        new_ob, reward, done, _ = env.step(action)
        reward = reward / 400
        print('\nq-value: ', target_critic_model.predict(observation_formatter(ob, action)))
        print('reward: ', reward, '\n')
        if np.isnan(reward):
            break
        buffer.loc[len(buffer), :] = [ob, action, reward, new_ob, done]
        update_actor_critic_model(sess,
                                  [actor_model, critic_model, target_actor_model, target_critic_model],
                                  buffer,
                                  [action_gradient_holder, update_op, gradient_op],
                                  ITERATIONS, BATCH_SIZE)
        ob = new_ob
        EPSILON = max(EPSILON * EPSILON_DECAY, MINIMUM_EPSILON)
        #print('\nepsilon: ', EPSILON, '\n')
        if done:
            break
# shut down torcs
env.end()
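# The loop above relies on an observation_formatter helper that is not
# shown. A sketch of the usual formatting in these scripts: flatten the
# sensor fields into a (1, 29) array, and optionally append the action as a
# second input for the critic. The field list and scaling are assumptions
# carried over from the other scripts in this collection.
import numpy as np

def observation_formatter(ob, action=None):
    state = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                       ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
    if action is None:
        return state.reshape(1, -1)                # actor input
    return [state.reshape(1, -1),
            np.asarray(action).reshape(1, -1)]     # critic input: [state, action]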
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001    # Target Network HyperParameters
    LRA = 0.00005  # Learning rate for Actor
    LRC = 0.0005   # Learning rate for Critic
    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input
    np.random.seed(1337)
    vision = False
    EXPLORE = 200000.
    if train_indicator:
        episode_count = 1000
    else:
        episode_count = 20
    max_steps = 4000
    step = 0
    if train_indicator:
        epsilon = 1
    else:
        epsilon = 0
    min_laptime = 10000000

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weight (loading networks)
    print("Now we load the weight")
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("saved_networks/")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)  # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()
        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        total_reward = 0.
        # totalLaptime = 0.
        for j in range(max_steps):
            loss = 0
            if train_indicator:
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, 0.10)
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            a_t_original = actor.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)
            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)
            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            ob, r_t, done, info = env.step(a_t[0], train_indicator)
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                              ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer
            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
            target_q_values = critic.target_predict(new_states, actor.target_predict(new_states))
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]
            if (train_indicator):
                loss += critic.train_on_batch(states, actions, y_t)
                a_for_grad = actor.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()
            total_reward += r_t
            s_t = s_t1
            if np.mod(step, 100) == 0:
                print("Episode", i, "Step", step, "Epsilon", epsilon, "Action", a_t,
                      "Reward", r_t, "Loss", loss)  # , "curLapTime", ob.curLapTime)
            step += 1
            if i == 0:  # the very first episode is cut after a single step
                break
            if done:
                break
        # if np.mod(i, 3) == 0:
        if (train_indicator) and i > 0:
            if env.lapTime < min_laptime and env.num_lap == 10:
                min_laptime = env.lapTime
                print("Now we save model")
                saver.save(sess, 'saved_networks/' + 'network' + '-ddpg-{}'.format(i))
        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")
    env.end()  # This is for shutting down TORCS
    print("Finish.")
def playGame(train_indicator=1, safety_constrain_flag=True):  # 1 means Train, 0 means simply Run
    plt.ion()
    args = parser.parse_args()
    np.random.seed(1337)

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Define two intra-policies
    overtaking_policy = ActorNetwork(sess, args.state_size, args.action_size)
    following_policy = ActorNetwork(sess, args.state_size, args.action_size)
    try:
        overtaking_policy.model.load_weights("actormodel_overtaking.h5")
        overtaking_policy.target_model.load_weights("actormodel_overtaking.h5")
        following_policy.model.load_weights("actormodel_following.h5")
        following_policy.target_model.load_weights("actormodel_following.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    # with fixed following policy
    #option_policies = [overtaking_policy, overtaking_policy, overtaking_policy,
    #                   following_policy(0.5), following_policy(0.5), following_policy(0.5)]
    # with learned following policy
    option_policies = [overtaking_policy, overtaking_policy, overtaking_policy,
                       following_policy, following_policy, following_policy]
    termination_steps = [10, 20, 30, 10, 20, 30]

    # Define the option-value function Q_Omega(s, omega): estimate values upon arrival
    critic = OptionValueCritic(args.state_size, args.option_size, args.discount,
                               args.learning_rate_critic, args.epsilon, args.epsilon_min,
                               args.epsilon_decay, args.tau)
    try:
        critic.load("option_value_model.h5")
        print("Critic Weight load successfully")
    except:
        print("Cannot find the critic weight")

    history = np.zeros((args.nepisodes, 2))
    # Define a buffer space to store samples
    buff = ReplayBuffer(args.buffer_size)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=args.vision, throttle=True, gear_change=False)
    print("TORCS Experiment Start.")

    cumreward_list = []
    average_step_reward_list = []
    damage_rate_list = []
    epsilon_list = []
    results_list = []
    option_list = []
    trackPos_list = []
    speed_list = []
    epreward_list = []
    for episode in range(args.nepisodes):
        # Define variables to store values
        cumreward = 0.
        duration = 1
        option_switches = 0
        avgduration = 0.
        reward_option = 0
        total_options = 0
        damage_times = 0
        danger_time = 0
        collision_time = 0
        primitive_action_step = 0
        if np.mod(episode, 3) == 0:
            ob = env.reset(relaunch=True)  # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()
        state = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                           ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))
        state = state.reshape(1, state.shape[0])
        for step in range(args.nsteps):
            total_options += 1
            option = critic.get_option(state, train_indicator)
            reward_option = 0
            for i in range(termination_steps[option]):
                primitive_action_step += 1
                action = option_policies[option].target_model.predict(state)
                '''
                if option == 0 or option == 1 or option == 2:
                    action = option_policies[option].target_model.predict(state)
                else:
                    action = option_policies[option].act(ob)
                '''
                print(action)
                action = Low_level_controller(action[0][0], action[0][1], ob, safety_constrain_flag)
                print("Option: {} Action: {}".format(option, action))
                ob, r_t_primitive, done, _ = env.step(action)
                if r_t_primitive == -30.0:
                    collision_time += 1
                elif r_t_primitive == -5.0:
                    danger_time += 1
                damage_times = collision_time + danger_time
                option_list.append(option)
                trackPos_list.append(ob.trackPos)
                speed_list.append(ob.speedX)
                epreward_list.append(r_t_primitive)
                reward_option = reward_option + args.discount**(i) * r_t_primitive
                state_ = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                                    ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))
                state_ = state_.reshape(1, state_.shape[0])
                state = state_
                if done:
                    break
            buff.add(state, option, reward_option, state_, done)
            cumreward += reward_option
            reward_ep_per_step = cumreward / primitive_action_step
            damage_rate = damage_times / primitive_action_step
            if done:
                break
            if train_indicator:
                batch = buff.getBatch(args.batch_size)
                critic.replay(batch)
        if episode % 10 == 0:
            critic.save("option_value_model.h5")
        if train_indicator:
            # Save the results
            cumreward_list.append(cumreward)
            average_step_reward_list.append(reward_ep_per_step)
            damage_rate_list.append(damage_rate)
            epsilon_list.append(critic.epsilon)
            results_list = [cumreward_list, average_step_reward_list,
                            damage_rate_list, epsilon_list]
            sio.savemat('results_both_learned.mat',
                        {'total_reward': cumreward_list,
                         'average_reward': average_step_reward_list,
                         'epsilon': epsilon_list,
                         'damage_rate': damage_rate_list})
        else:
            sio.savemat('test1lf1r.mat',
                        {'ep_reward': epreward_list,
                         'option': option_list,
                         'trackPos': trackPos_list,
                         'speed': speed_list})
        print('damage rate is:', damage_rate)
        history[episode, 0] = step
        history[episode, 1] = avgduration
        plt.figure(1)
        plt.hold(True)
        plt.subplot(311)
        plt.plot(episode, cumreward, 'ro')
        plt.xlabel('episode')
        plt.ylabel('Total reward per episode')
        plt.subplot(312)
        plt.hold(True)
        plt.plot(episode, cumreward / total_options, 'bo')
        plt.xlabel('episode')
        plt.ylabel('Average reward per option')
        plt.subplot(313)
        plt.hold(True)
        plt.plot(episode, critic.epsilon, 'go')
        plt.xlabel('episode')
        plt.ylabel('epsilon')
        plt.draw()
        plt.show()
        plt.pause(0.001)
    env.end()  # This is for shutting down TORCS
    plt.savefig('test.png')
    print("Finish.")