def __init__(self, env_name, state_dim, action_dim):
    self.name = 'DDPG'  # name for uploading results
    self.env_name = env_name
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = state_dim
    self.action_dim = action_dim
    # Ensure action bound is symmetric
    self.time_step = 0
    self.sess = tf.InteractiveSession()
    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
    # Initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
    # Initialize the Ornstein-Uhlenbeck random process for action exploration
    self.OU = OU()
    # Load previously saved networks, if any
    self.saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(save_location)
    if checkpoint and checkpoint.model_checkpoint_path:
        self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")
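# The snippets in this collection call OU.function(x, mu, theta, sigma) but never
# define it. A minimal sketch of the class they appear to assume follows; the
# widely circulated Keras-TORCS DDPG version returns a single Ornstein-Uhlenbeck
# drift-plus-diffusion increment, but the exact class shipped with each repo may
# differ.
import numpy as np

class OU(object):
    """Ornstein-Uhlenbeck exploration noise (sketch)."""

    def function(self, x, mu, theta, sigma):
        # Mean-reverting drift toward mu plus Gaussian diffusion.
        return theta * (mu - x) + sigma * np.random.randn(1)

OU = OU()  # module-level instance, matching how the snippets below use it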
def actionAddNoise(a_t_original, train_indicator, epsilon, numCars=2):
    noise_t = np.zeros_like(a_t_original)
    a_t = np.zeros_like(a_t_original)
    for i in range(numCars):  # range() for Python 3 (was xrange)
        # Per-component OU noise; arguments are (x, mu, theta, sigma).
        noise_t[i][0] = train_indicator * max(epsilon, 0) * OU.function(
            a_t_original[i][0], 0.0, 0.6, 0.2)
        noise_t[i][1] = train_indicator * max(epsilon, 0) * OU.function(
            a_t_original[i][1], 0.5, 1.0, 0.10)
        noise_t[i][2] = train_indicator * max(epsilon, 0) * OU.function(
            a_t_original[i][2], 0.3, 1.0, 0.05)
        a_t[i][0] = a_t_original[i][0] + noise_t[i][0]
        a_t[i][1] = a_t_original[i][1] + noise_t[i][1]
        a_t[i][2] = a_t_original[i][2] + noise_t[i][2]
    return a_t
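# Hypothetical usage of actionAddNoise, assuming the OU sketch above with its
# module-level OU instance: two cars, three action components each (e.g.
# steering / throttle / brake). The zero policy output is only for illustration.
a_t_original = np.zeros((2, 3))  # deterministic policy output per car
a_t = actionAddNoise(a_t_original, train_indicator=1, epsilon=0.5)
print(a_t.shape)  # (2, 3); each entry is perturbed by one OU increment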
def noise_action(self, state, epsilon):
    # Return an action according to the current policy and exploration noise.
    action = np.zeros([self.action_dim])
    noise = np.zeros([self.action_dim])
    action_pre = self.actor.predict([state])
    noise[0] = epsilon * OU.function(action_pre[0][0], 0.0, 0.80, 0.60)
    noise[1] = epsilon * OU.function(action_pre[0][1], 0.7, 1.00, 0.10)
    noise[2] = epsilon * OU.function(action_pre[0][2], -0.1, 1.00, 0.05)
    # ACTION: with noise, clipped to each component's valid range
    action[0] = np.clip(action_pre[0][0] + noise[0], -1, 1)
    action[1] = np.clip(action_pre[0][1] + noise[1], 0, 1)
    action[2] = np.clip(action_pre[0][2] + noise[2], 0, 1)
    return action
def get_exploration_noiseV1(self, current_value, wheel_side):
    # print("Get noise")
    # Select the OU mean according to which wheel is being driven.
    if wheel_side == 1:
        self.mu = self.working_point_left
    if wheel_side == 2:
        self.mu = self.working_point_right
    return OU.function(current_value, self.mu, self.theta, self.sigma)
def __init__(self, nodes_num, type, capacity):
    self.nodes_num = nodes_num
    self.prev_traffic = None
    self.type = type
    self.capacity = capacity * nodes_num / (nodes_num - 1)
    # Map traffic-model names to their generator methods.
    self.dictionary = {
        'NORM': self.normal_traffic,
        'UNI': self.uniform_traffic,
        'CONTROLLED': self.controlled_uniform_traffic,
        'EXP': self.exp_traffic,
        'OU': self.ou_traffic,
        'STAT': self.stat_traffic,
        'STATEQ': self.stat_eq_traffic,
        'FILE': self.file_traffic,
        'DIR': self.dir_traffic,
        'STATIC': self.static_traffic,
    }
    if self.type.startswith('DIR:'):
        self.dir = sorted(listdir(self.type.split('DIR:')[-1]),
                          key=lambda x: natural_key(x))
    self.static = None
    # Two OU processes: one for total traffic volume, one per node pair.
    self.total_ou = OU(1, self.capacity / 2, 0.1, self.capacity / 2)
    self.nodes_ou = OU(self.nodes_num ** 2, 1, 0.1, 1)
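# Note that the traffic generator above constructs OU with a different
# signature, OU(dim, mu, theta, sigma): a stateful vector-valued process rather
# than the stateless OU.function used elsewhere. A minimal sketch of such a
# process, under the assumption that it exposes a sample() method advancing the
# state by one Euler-Maruyama step; the class name VectorOU and the method name
# are illustrative only.
import numpy as np

class VectorOU(object):
    def __init__(self, dim, mu, theta, sigma):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(dim) * mu

    def sample(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, I)
        self.state += self.theta * (self.mu - self.state) \
                      + self.sigma * np.random.randn(len(self.state))
        return self.state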
def __init__(self, env):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Linear epsilon schedules for expert and random exploration.
    self.epsilon_expert_range = (1.0, 0.1)
    self.epsilon_expert = self.epsilon_expert_range[0]
    self.epsilon_random_range = (0.1, 0.01)
    self.epsilon_random = self.epsilon_random_range[0]
    # Randomly initialize actor network and critic network
    # with both their target networks
    # self.state_dim = env.observation_space.shape[0]
    self.state_dim = 16
    # self.action_dim = env.action_space.shape[0]
    self.action_dim = 3
    self.time_step = 0
    self.sess = tf.InteractiveSession()
    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
    # Initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
    # Initialize the Ornstein-Uhlenbeck random process for action exploration
    # self.exploration_noise = OUNoise(self.action_dim)
    # self.exploration_noise = OUNoise()
    self.OU = OU()
    # Load previously saved networks; resume the epsilon schedules from the
    # step count encoded in the checkpoint filename.
    self.saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
    if checkpoint and checkpoint.model_checkpoint_path:
        path = checkpoint.model_checkpoint_path
        self.saver.restore(self.sess, path)
        self.time_step = int(path[path.rindex('-') + 1:])
        self.epsilon_expert -= (self.epsilon_expert_range[0]
                                - self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT
        self.epsilon_expert = max(self.epsilon_expert, self.epsilon_expert_range[1])
        self.epsilon_random -= (self.epsilon_random_range[0]
                                - self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT
        self.epsilon_random = max(self.epsilon_random, self.epsilon_random_range[1])
        logger.warn("Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s"
                    % (path, self.time_step, self.epsilon_expert, self.epsilon_random))
    else:
        logger.warn("Could not find old network weights")
    self.critic_cost = 0
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 30
    GAMMA = 0.99
    TAU = 0.0001     # Target Network HyperParameters
    LRA = 0.00001    # Learning rate for Actor
    LRC = 0.0001     # Learning rate for Critic
    action_dim = 1   # Steering only
    state_dim = 15   # number of sensor inputs
    np.random.seed(1337)
    vision = False
    EXPLORE = 1000000.
    episode_count = 3000
    max_steps = 1000000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0
    t_dt = 0.0005

    # TCP/IP communication for MATLAB - Python
    HOST = '0.0.0.0'
    PORT = 40000
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, 4096)
    s.bind((HOST, PORT))
    # Wait for the MATLAB client
    s.listen(1)
    print("waiting for response from client at port ", PORT)
    conn, addr = s.accept()

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Now load the weights
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            Lateral = 0

            # CarSim export (input factors): receive s_t
            try:
                ob_exports = conn.recv(4096)
            except KeyboardInterrupt:
                conn.close()
                break
            ob_exports1 = json.loads(ob_exports.decode('utf-8'))
            print('export=', ob_exports1)
            if not ob_exports:
                conn.close()
                break
            # Normalize each exported channel to a roughly unit scale.
            t_current = ob_exports1[0]
            T_bar_Tq = ob_exports1[1] / 10
            LatG = ob_exports1[2]
            YawRate = ob_exports1[3] / 50
            Yaw = ob_exports1[4] / 3.14
            Lateral = ob_exports1[5] / 20
            Steer_SW = ob_exports1[6] / 6000
            StrAV_SW = ob_exports1[7] / 5000
            Steer_L1 = ob_exports1[8] / 180
            Steer_R1 = ob_exports1[9] / 180
            Steer_L2 = ob_exports1[10] / 4
            Steer_R2 = ob_exports1[11] / 4
            Xcg_TM = ob_exports1[12] / 1000
            Ycg_TM = ob_exports1[13] / 300
            Zcg_TM = ob_exports1[14] / 45
            curv = ob_exports1[15]
            s_t = np.hstack((T_bar_Tq, LatG, YawRate, Yaw, Lateral, Steer_SW,
                             StrAV_SW, Steer_L1, Steer_R1, Steer_L2, Steer_R2,
                             Xcg_TM, Ycg_TM, Zcg_TM, curv))
            print('s_t=', s_t)

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            print('a_t_original=', a_t_original)
            a_t_inv = a_t_original[0][0]
            print(a_t_inv.shape)
            critic_gradient = critic.gradients(s_t.reshape(1, s_t.shape[0]), a_t_original)
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.00, 0.00)
            # noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            # noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)
            # The following code does the stochastic brake:
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            # a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            # a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            a_t[0][0] = a_t[0][0] * 3500  # scale to CarSim steering torque
            t_current = t_current + t_dt
            print('t_next=', t_current)
            print(a_t[0])
            at = np.array(a_t[0])
            at1 = np.insert(at, 0, t_current)  # prepend the next timestamp
            at2 = list(at1)
            print('at2=,', at2)

            # Send the action back to MATLAB as a JSON line.
            try:
                at_json = json.dumps(at2) + '\r\n'
                conn.sendall(at_json.encode('utf-8'))
            except KeyboardInterrupt:
                conn.close()
                break

            # CarSim export: receive s_t1
            try:
                ob_exports = conn.recv(4096)
            except KeyboardInterrupt:
                conn.close()
                break
            ob_exports1 = json.loads(ob_exports.decode('utf-8'))
            print('s_t1=', ob_exports1)
            if not ob_exports:
                conn.close()
                break
            T_bar_Tq1 = ob_exports1[0] / 10
            LatG1 = ob_exports1[1]
            YawRate1 = ob_exports1[2] / 50
            Yaw1 = ob_exports1[3] / 3.14
            Lateral1 = ob_exports1[4] / 20
            Steer_SW1 = ob_exports1[5] / 6000
            StrAV_SW1 = ob_exports1[6] / 5000
            Steer_L11 = ob_exports1[7] / 180
            Steer_R11 = ob_exports1[8] / 180
            Steer_L21 = ob_exports1[9] / 4
            Steer_R21 = ob_exports1[10] / 4
            Xcg_TM1 = ob_exports1[11] / 1000
            Ycg_TM1 = ob_exports1[12] / 300
            Zcg_TM1 = ob_exports1[13] / 45
            curv = ob_exports1[14]
            r_t = ob_exports1[15]
            done = ob_exports1[16]
            print('r_t=', r_t)
            # if abs(Lateral1) > 1 or abs(Yaw1) > 1:
            if t_current > 20 or abs(Yaw1) > 1:
                break
            s_t1 = np.hstack((T_bar_Tq1, LatG1, YawRate1, Yaw1, Lateral1,
                              Steer_SW1, StrAV_SW1, Steer_L11, Steer_R11,
                              Steer_L21, Steer_R21, Xcg_TM1, Ycg_TM1,
                              Zcg_TM1, curv))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                # Rescale the policy gradient with the inverting-gradients
                # factor once enough steps have been collected.
                if step > 30:
                    grads_factor = gradient_inverter(critic_gradient, a_t_inv,
                                                     p_min=-1, p_max=1,
                                                     BATCH_SIZE=30)
                else:
                    grads_factor = 1
                grads_factor1 = np.asarray(grads_factor)
                grads3 = grads * grads_factor1
                actor.train(states, grads3)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1
            print("Episode", i, "t_current", t_current, "Action", a_t,
                  "Reward", r_t, "Loss", loss, "step", step)
            step += 1
            if done:
                break

        if (train_indicator):
            print("Now we save model")
            actor.model.save_weights("actormodel.h5", overwrite=True)
            with open("actormodel.json", "w") as outfile:
                json.dump(actor.model.to_json(), outfile)
            critic.model.save_weights("criticmodel.h5", overwrite=True)
            with open("criticmodel.json", "w") as outfile:
                json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    s.close()  # TCP/IP socket close
    print("Finish.")
def train(train_indicator=1):
    env = Env()
    BUFFER_SIZE = 200000
    BATCH_SIZE = 128
    GAMMA = 0.99
    TAU = 0.001    # Target Network HyperParameters
    LRA = 0.0001   # Learning rate for Actor
    LRC = 0.001    # Learning rate for Critic
    action_dim = env.action_dim
    state_dim = env.observation_space()
    np.random.seed(1337)
    EXPLORE = 100000.
    episode_count = 100
    max_steps = 10000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)

    print("load model weight")
    try:
        actor.model.load_weights("model/actormodel.h5")
        critic.model.load_weights("model/criticmodel.h5")
        actor.target_model.load_weights("model/actormodel.h5")
        critic.target_model.load_weights("model/criticmodel.h5")
        print("load successfully")
    except:
        print("Cannot find the model weight")

    s_t = env.reset()
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 10.0, 1, 7)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0, 1, 3)
            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]

            s_t1, r_t, _ = env.step(a_t[0])
            buff.add(s_t, a_t[0], r_t, s_t1, done)
            # env.get_memory(buff)

            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1
            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                break

        if np.mod(i, 3) == 0 and train_indicator:
            print("save model")
            actor.model.save_weights("model/actormodel.h5", overwrite=True)
            critic.model.save_weights("model/criticmodel.h5", overwrite=True)

        print("TOTAL REWARD " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    print("Finish.")
    return actor
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    cur_path = os.path.abspath(os.path.curdir)
    model_path = "/Models/"
    result_path = "/Results/"
    curr_test = "Large_Noise_Result/"
    actor_name = "actormodel{}.h5"
    critic_name = "criticmodel{}.h5"
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001   # Target Network HyperParameters
    LRA = 1e-4    # Learning rate for Actor
    LRC = 1e-3    # Learning rate for Critic
    action_dim = 4
    state_dim = 131  # number of sensor inputs
    np.random.seed(2333)
    EXPLORE = 10000
    episode_count = 10000
    max_steps = 100000
    reward = 0
    done = 0
    step = 0
    epsilon = 1

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    buff = Buffer(BUFFER_SIZE)  # Create replay buffer
    env = Simulator()

    # Now load the weights
    print("Now we load the weight")
    try:
        actor.model.load_weights(cur_path + "/Models/actormodel.h5")
        critic.model.load_weights(cur_path + "/Models/criticmodel.h5")
        actor.target_model.load_weights(cur_path + "/Models/actormodel.h5")
        critic.target_model.load_weights(cur_path + "/Models/criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    for i in range(episode_count):
        start_time = time.time()
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        if i % 1000 == 0:
            losses = np.zeros((1000,))
            total_rewards = np.zeros((1000,))
        s_t = env.reset()
        total_reward = 0
        loss = 0
        for j in range(max_steps):
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            a_t_original = actor.model.predict(s_t)
            # Identical OU parameters (mu=0.5, theta=1.0, sigma=0.15) for all
            # four action components.
            for d in range(action_dim):
                noise_t[0][d] = train_indicator * max(epsilon, 0) * OU.function(
                    a_t_original[0][d], 0.5, 1.00, 0.15)
                a_t[0][d] = a_t_original[0][d] + noise_t[0][d]
            a_t = np.around(a_t, decimals=1)

            s_t1, r_t, done = env.step(a_t)
            buff.add(s_t, a_t, r_t, np.array([[done]]), s_t1)  # Add to replay buffer

            # Do the batch update. This buffer returns a flat array, so slice
            # states / actions / rewards / dones / new states out by column.
            batch = buff.getBatch(BATCH_SIZE)
            states = batch[:, :state_dim]
            actions = batch[:, state_dim:state_dim + action_dim]
            rewards = batch[:, state_dim + action_dim]
            new_states = batch[:, state_dim + action_dim + 2:]
            dones = batch[:, state_dim + action_dim + 1]
            y_t = actions.copy()
            target_q_values = critic.target_model.predict([
                new_states,
                np.around(actor.target_model.predict(new_states), decimals=1)
            ])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = np.around(actor.model.predict(states), decimals=1)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1
            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                break

        losses[i % 1000] = loss
        total_rewards[i % 1000] = total_reward

        if np.mod((i + 1), 100) == 0 and train_indicator:
            print("Now we save model")
            actor.model.save_weights(cur_path + "/Models/actormodel.h5", overwrite=True)
            critic.model.save_weights(cur_path + "/Models/criticmodel.h5", overwrite=True)

        if np.mod((i + 1), 1000) == 0 and train_indicator:
            losses_path = (cur_path + result_path + curr_test + 'losses{}.txt').format(i)
            rewards_path = (cur_path + result_path + curr_test + 'rewards{}.txt').format(i)
            np.savetxt(losses_path, losses)
            np.savetxt(rewards_path, total_rewards)
            print("Now we save model")
            actor.model.save_weights(
                (cur_path + model_path + curr_test + "actormodel{}.h5").format(i),
                overwrite=True)
            critic.model.save_weights(
                (cur_path + model_path + curr_test + "criticmodel{}.h5").format(i),
                overwrite=True)
            actor.target_model.save_weights(
                (cur_path + model_path + curr_test + "actortarmodel{}.h5").format(i),
                overwrite=True)
            critic.target_model.save_weights(
                (cur_path + model_path + curr_test + "crititarcmodel{}.h5").format(i),
                overwrite=True)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("Took {} S".format(time.time() - start_time))

    print("Finish.")
def playGame(train_indicator=0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 1000000
    # BUFFER_SIZE1 = 50000
    # BUFFER_SIZE2 = 5000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001      # Target Network HyperParameters
    LRA = 0.0001     # Learning rate for Actor
    LRC = 0.001      # Learning rate for Critic
    action_dim = 2   # Acceleration / LaneChanging
    state_dim = 26   # number of sensor inputs
    np.random.seed(1337)
    EXPLORE = 1000000
    episode_count = 2018
    max_steps = 5299
    done = 0
    step = 0
    epsilon = 1

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)

    # Now load the weights
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        actor.target_model.load_weights("actor_target_model.h5")
        print("actor Weight load successfully")
    except:
        print("Cannot find the actor weight")
    try:
        critic.model.load_weights("criticmodel.h5")
        critic.target_model.load_weights("critic_target_model.h5")
        print("critic Weight load successfully")
    except:
        print("Cannot find the critic weight")

    HOST = '127.0.0.1'
    PORT = 5099
    BUFSIZ = 1024
    ADDR = (HOST, PORT)
    socketserver.TCPServer.allow_reuse_address = True
    tcpSerSock = socket(AF_INET, SOCK_STREAM)
    tcpSerSock.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
    tcpSerSock.bind(ADDR)
    tcpSerSock.listen(5)
    print('waiting for connection...')
    tcpCliSock, addr = tcpSerSock.accept()
    print('...connected from:', addr)

    # The Vissim client sends 30 doubles per step: 28 observation channels
    # (ego longitudinal/lateral speed, left/right lane space, and velocity and
    # distance differences to the six surrounding vehicles), then the done and
    # aux flags.
    def unpack_obs(data):
        vals = struct.unpack("30d", data)
        raw_obs = list(vals[:28])
        done, aux = vals[28], vals[29]
        return raw_obs, done, aux

    # Save reward/loss/Q-value per episode to file
    with open("r_l_q_everyeposide.txt", "w") as f:
        print("Vissim Experiment Start.")
        for i in range(episode_count):
            display = []
            print("Episode : " + str(i) + " Replay Buffer " + str(buff.num_experiences))
            data0 = tcpCliSock.recv(BUFSIZ)
            raw_obs0, done0, aux0 = unpack_obs(data0)
            print('raw_obs0=', raw_obs0)
            # Ego speed and leader velocity/distance gaps, used by the
            # safety check below.
            Vx, Vx2_diff, Dx2_diff = raw_obs0[0], raw_obs0[4], raw_obs0[5]

            # Generate a Vissim environment
            env = VissimEnv(raw_obs0)
            s_t = env.make_observaton(raw_obs0)
            total_loss = 0
            total_reward_cf = 0
            total_reward_lc = 0
            total_q_value = 0

            for j in range(max_steps):
                loss = 0
                epsilon -= 1.0 / EXPLORE
                a_t = np.zeros([1, action_dim])
                noise_t = np.zeros([1, action_dim])
                a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
                noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                    a_t_original[0][0], 0.0, 0.60, 0.30)
                noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                    a_t_original[0][1], 0.02, 1.00, 0.10)
                a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
                a_t[0][1] = a_t_original[0][1] + noise_t[0][1]

                # Map the second component to an acceleration command with
                # asymmetric limits (3.5 m/s^2 throttle, 8 m/s^2 braking).
                if a_t[0][1] > 0:
                    acceleration = a_t[0][1] * 3.5
                else:
                    acceleration = a_t[0][1] * 8

                r_t_first = 0
                # Safety override: if the gap to the leader is below a
                # 2-second headway plus margin, force braking and penalize.
                if Dx2_diff < 2 * Vx + 4.25:
                    if acceleration < -abs(Vx2_diff) / 2:
                        pass
                    else:
                        acceleration = -abs(Vx2_diff) / 2
                        r_t_first = -1

                # Discretize the first component into a lane-change decision:
                # 1 = left lane change, 0 = keep lane, 2 = right lane change.
                if 0 <= a_t[0][0] <= 0.1739523314093953:
                    LaneChanging = 1
                elif 0.1739523314093953 < a_t[0][0] <= 1 - 0.1739523314093953:
                    LaneChanging = 0
                else:
                    LaneChanging = 2

                ACTION = [LaneChanging, acceleration]
                print("acceleration=", acceleration)
                print("LaneChanging=", LaneChanging)
                tcpCliSock.send(str(ACTION).encode())
                data = tcpCliSock.recv(BUFSIZ)
                raw_obs, done, aux = unpack_obs(data)
                Vx, Vx2_diff, Dx2_diff = raw_obs[0], raw_obs[4], raw_obs[5]
                print('vel=', Vx)
                print('vel_diff=', Vx2_diff)
                print('d=', Dx2_diff)
                print('done=', done)
                if raw_obs == []:
                    print('No data')
                    break

                # Split the reward into car-following and lane-change parts.
                if LaneChanging == 1 or LaneChanging == 2:
                    r_t_lanechange = aux
                    if aux == -0.8:
                        r_t_follow = env.step(acceleration, raw_obs) if r_t_first == 0 else r_t_first
                    else:
                        r_t_follow = 0
                elif LaneChanging == 0:
                    r_t_follow = env.step(acceleration, raw_obs) if r_t_first == 0 else r_t_first
                    r_t_lanechange = 0
                if i == 0 and j == 0:
                    r_t_lanechange, r_t_follow = 0, 0
                print('r_t_follow=', r_t_follow, 'r_t_lanechange=', r_t_lanechange)

                # Save some variables for display
                display.append([i, j, Vx, Vx2_diff, r_t_follow + r_t_lanechange])
                r_t = [r_t_follow, r_t_lanechange]
                s_t1 = env.make_observaton(raw_obs)

                q_value = critic.model.predict_on_batch(
                    [np.array(s_t).reshape(1, 26), np.array(a_t_original).reshape(1, 2)])
                target_q_value = critic.target_model.predict_on_batch(
                    [np.array(s_t).reshape(1, 26), np.array(a_t_original).reshape(1, 2)])
                error = abs(r_t + GAMMA * target_q_value - q_value)
                error = np.mean(error)

                # Add to replay buffer
                buff.add(s_t, a_t[0], r_t, s_t1, done)
                batch = buff.getBatch(BATCH_SIZE)
                states = np.asarray([e[0] for e in batch])
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                dones = np.asarray([e[4] for e in batch])
                y_t = np.asarray([e[2] for e in batch])
                target_q_values = critic.target_model.predict(
                    [new_states, actor.target_model.predict([new_states])])
                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k]
                    else:
                        y_t[k] = rewards[k] + GAMMA * target_q_values[k]

                if (train_indicator):
                    loss += critic.model.train_on_batch([states, actions], y_t)
                    a_for_grad = actor.model.predict(states)
                    grads = critic.gradients(states, a_for_grad)
                    actor.train(states, grads)
                    actor.target_train()
                    critic.target_train()

                total_reward_cf += r_t_follow
                total_reward_lc += r_t_lanechange
                total_loss += loss
                total_q_value += q_value
                s_t = s_t1
                print("Episode", i, "Step", j, "Total Step", step,
                      "acceleration=", acceleration,
                      "LaneChanging=", LaneChanging,
                      "Reward", r_t, "Loss", loss)
                step += 1
                if done == 1:
                    break

            display = np.array(display)
            np.savetxt('epi' + str(i) + '.txt', display)

            if np.mod(i, 5) == 0 and train_indicator:
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
                critic.target_model.save_weights("critic_target_model.h5", overwrite=True)
                with open("critic_target_model.json", "w") as outfile:
                    json.dump(critic.target_model.to_json(), outfile)
                # Save the target actor itself (the original saved
                # actor.model under the target filename, which looks like a bug).
                actor.target_model.save_weights("actor_target_model.h5", overwrite=True)
                with open("actor_target_model.json", "w") as outfile:
                    json.dump(actor.target_model.to_json(), outfile)

            ave_loss = total_loss / (j + 1)
            ave_q = total_q_value / (j + 1)
            f.write("Episode" + str(i)
                    + " TotalReward_follow=" + str(total_reward_cf)
                    + " TotalReward_lanechange=" + str(total_reward_lc)
                    + " AverageLoss=" + str(ave_loss)
                    + " AverageValue=" + str(ave_q) + "\n")
            print("TOTAL REWARD @ " + str(j) + "/" + str(i)
                  + "-th Episode : Reward_follow " + str(total_reward_cf)
                  + " Reward_lanechange: " + str(total_reward_lc))
            print("Total Step: " + str(step))
            print("")

    tcpCliSock.close()
    tcpSerSock.close()
    print("Finish.")
#!/usr/bin/env python
# coding=utf-8
import numpy as np
import tensorflow as tf

from OU import OU
from ReplayBuffer import ReplayBuffer
from ActorNetwork import ActorNetwork
from CriticNetwork import CriticNetwork
from env_step import Env

OU = OU()

def train(train_indicator=1):
    env = Env()
    BUFFER_SIZE = 200000
    BATCH_SIZE = 128
    GAMMA = 0.99
    TAU = 0.001    # Target Network HyperParameters
    LRA = 0.0001   # Learning rate for Actor
    LRC = 0.001    # Learning rate for Critic
    action_dim = env.action_dim
    state_dim = env.observation_space()
    np.random.seed(1337)
    EXPLORE = 100000.
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001      # Target Network HyperParameters
    LRA = 0.0001     # Learning rate for Actor
    LRC = 0.001      # Learning rate for Critic
    action_dim = 3   # Steering/Acceleration/Brake
    state_dim = 29   # number of sensor inputs
    np.random.seed(61502)
    base_dir = "/home/sergio/Projects/apclypsr/DDPG-Keras-Torcs/"
    vision = True
    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 10000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0
    esar2 = []
    esar4 = []

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    # tf.set_random_seed(61502)

    actor = ActorNetwork(sess, state_dim, action_dim, LRA, TAU, BATCH_SIZE)
    critic = CriticNetwork(sess, state_dim, action_dim, LRC, TAU, GAMMA,
                           actor.get_num_trainable_vars())
    # actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    # critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights
    restore = False
    if restore:
        print("Now we load the weight")
        saver = tf.train.Saver()
        try:
            saver.restore(sess, base_dir + "ddpg.ckpt")
            print("model restored")
        except:
            print("Cannot find the weight")
    else:
        print("No weight loaded")
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = tf.train.Saver()

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        if np.mod(i, 500) == 0:
            # relaunch TORCS every 500 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        # 0. Build the initial stack of four images.
        s_t = np.hstack((ob.img))
        s_t_four_images_list = []
        for j in range(4):
            s_t_four_images_list.append(np.zeros((128, 128), dtype=np.float64))
        s_t_phi = get_phi_from_four_images(s_t_four_images_list)

        ep_ave_max_q = 0
        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            a_t_original = actor.predict(s_t_phi)
            print(a_t_original)
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)
            # The following code does the stochastic brake:
            # if random.random() <= 0.05:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)
            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            # 0. Update the last four images.
            s_t1 = np.hstack((ob.img))
            if len(s_t_four_images_list) >= 4:
                s_t_four_images_list.pop(0)
            image = np.reshape(ob.img, (128, 128))
            s_t_four_images_list.append(image)
            # Show greyscale image:
            # plt.imshow(image, origin='lower'); plt.draw(); plt.pause(0.001)

            # Get phi for the newly observed state
            s_t1_phi = get_phi_from_four_images(s_t_four_images_list)
            buff.add(s_t_phi, a_t[0], r_t, s_t1_phi, done)  # Add to replay buffer

            # Do the batch update
            if buff.size() > BATCH_SIZE:
                batch = buff.getBatch(BATCH_SIZE)
                states = np.asarray([e[0] for e in batch])
                states = np.concatenate(states, axis=0)
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                new_states = np.concatenate(new_states, axis=0)
                dones = np.asarray([e[4] for e in batch])
                y_t = np.asarray([e[1] for e in batch])
                actor_predicted_actions = actor.predict_target(new_states)
                target_q_values = critic.predict_target(new_states, actor_predicted_actions)
                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k]
                    else:
                        y_t[k] = rewards[k] + GAMMA * target_q_values[k]

                if (train_indicator):
                    # Update the critic given the targets
                    print("y_t")
                    print(y_t.shape)
                    predicted_q_value, _, loss, loss2 = critic.train(states, actions, y_t)
                    print("LOSS:", loss)
                    print("LOSS2:", loss2)
                    ep_ave_max_q += np.amax(predicted_q_value)
                    # Update the actor policy using the sampled gradient
                    a_outs = actor.predict(states)
                    grads = critic.action_gradients(states, a_outs)
                    actor.train(states, grads[0])
                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

            total_reward += r_t
            s_t = s_t1
            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            esar = (i, step, a_t, r_t, loss)
            esar2.append(esar)
            step += 1
            if done:
                break

        if np.mod(i, 3) == 0 and train_indicator:
            save_path = saver.save(sess, base_dir + "ddpg.ckpt")
            print("Model saved in file: %s" % save_path)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")
        esar3 = (i, step, total_reward)
        esar4.append(esar3)

    def save_object(obj, filename):
        with open(filename, 'wb') as output:
            pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

    print("Saving esars.")
    save_object(esar2, 'IntraEpisode.pkl')
    save_object(esar4, 'InterEpisode.pkl')
    env.end()  # This is for shutting down TORCS
    print("Finish.")
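# get_phi_from_four_images above is used but never defined in this snippet. A
# plausible sketch, assuming it stacks the four most recent 128x128 greyscale
# frames into a single channels-last network input with a batch axis, as in
# DQN-style frame stacking; the exact shape convention is an assumption.
import numpy as np

def get_phi_from_four_images(four_images_list):
    # (128, 128) x 4 -> (1, 128, 128, 4)
    phi = np.stack(four_images_list, axis=-1)
    return phi[np.newaxis, ...]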
import argparse

import gym  # needed for gym.make below; missing from the original imports
from gym import wrappers
from keras.models import model_from_json, Model
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.optimizers import Adam
import tensorflow as tf
# from keras.engine.training import collect_trainable_weights
from keras import backend as K

from mmstore import MMStore
from mujoco_actor_nn import ActorNetwork
from mujoco_critic_nn import CriticNetwork
from OU import OU
import time

noise_func = OU()

MAX_INTERACTION = 10000000
MAX_EPI_INT = 500
MEM_SIZE = 500
BATCH_SIZE = 64
gamma = 0.99
SOFT_UPDATE = 1e-3
ALR = 1e-4
CLR = 1e-3
REPEAT = 10

env = gym.make('HalfCheetah-v1')
eval_state = env.reset()
output_shape = env.action_space.shape
input_shape = env.observation_space.shape
from gym_torcs import TorcsEnv
import numpy as np
import random
import argparse
import tensorflow as tf
import json

from ReplayBuffer import ReplayBuffer
from ActorNetwork import ActorNetwork
from CriticNetwork import CriticNetwork
from OU import OU
import timeit

OU = OU()  # Ornstein-Uhlenbeck Process

def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001      # Target Network HyperParameters
    LRA = 0.00005    # Learning rate for Actor
    LRC = 0.0005     # Learning rate for Critic
    action_dim = 3   # Steering/Acceleration/Brake
    state_dim = 29   # number of sensor inputs
    np.random.seed(1337)
    vision = False
    EXPLORE = 200000.
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001      # Target Network HyperParameters
    LRA = 0.0001     # Learning rate for Actor
    LRC = 0.001      # Learning rate for Critic
    action_dim = 3   # Steering/Acceleration/Brake
    state_dim = 29   # number of sensor inputs
    np.random.seed(1337)
    vision = False
    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        if np.mod(i, 3) == 0:
            # relaunch TORCS every 3 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)
            # The following code does the stochastic brake:
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)
            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                              ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1
            if np.mod(step, 30) == 0:
                print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
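# Several snippets call actor.target_train() / critic.target_train() without
# defining them. A minimal sketch of the soft target update they perform in
# the Keras DDPG implementations, written as the method body one would expect
# on ActorNetwork / CriticNetwork (assuming self.model, self.target_model and
# self.tau exist): theta_target <- tau * theta + (1 - tau) * theta_target.
def target_train(self):
    weights = self.model.get_weights()
    target_weights = self.target_model.get_weights()
    for i in range(len(weights)):
        target_weights[i] = self.tau * weights[i] + (1 - self.tau) * target_weights[i]
    self.target_model.set_weights(target_weights)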
    # Tail of an environment-configuration call; its opening is outside this snippet.
    vnc_kwargs={
        'encoding': 'tight',
        'fine_quality_level': 0,
        'subsample_level': 3,
        'quality_level': 0,
    },
)

# show VNC window?
render = False

### SET UP AGENT
train_indicator = 1  # 1 if training
OU = OU()  # Ornstein-Uhlenbeck Process
BUFFER_SIZE = 100000
BATCH_SIZE = 32
GAMMA = 0.99
TAU = 0.1        # Target Network HyperParameter
LRA = 0.0001     # Learning rate for Actor
LRC = 0.001      # Learning rate for Critic

av_pos = -10     # todo
av_xpos = 2      # todo
av_angle = np.pi / 2
dist_t0, area_t0 = current_state(av_pos)
l1 = 8
l2 = 8
l3 = 8
class MainTrain(object):
    buffer_size = 100000
    batch_size = 100
    gamma = 0.99
    tau = 0.0001   # Target Network HyperParameters
    LRA = 0.001    # Learning rate for Actor
    LRC = 0.001    # Learning rate for Critic
    explore_iter = 100000.
    episode_count = 20000
    max_steps = 2000
    action_dim = 4  # one logit per discrete maneuver (approach/observe/wait/traverse)
    parameter_acc_dim = 2
    parameter_time_dim = action_dim
    action_size = action_dim + parameter_acc_dim + parameter_time_dim

    def __init__(self):
        self.OU = OU()
        self.total_correct = 0
        self.total_wrong = 0
        self.accuracy_all = []
        self.if_done = False
        self.epsilon = 1
        self.total_reward = None
        self.loss = None
        self.sim_inter = UpdateInter()
        self.state_t = []
        self.state_dim = self.sim_inter.state_dim
        self.action_t = []
        self.action_acc = None
        self.action_time = None
        self.Tau = self.sim_inter.Tau
        self.actor = None
        self.critic = None
        self.buff = None
        self.batch = None
        self.states = None
        self.actions = None
        self.rewards = None
        self.new_states = None
        self.if_dones = None
        self.y_t = None
        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        K.set_session(self.sess)

    def load_weights(self):
        print("Now we load the weight")
        try:
            self.actor.model.load_weights("actormodel.h5")
            self.critic.model.load_weights("criticmodel.h5")
            self.actor.target_model.load_weights("actormodel.h5")
            self.critic.target_model.load_weights("criticmodel.h5")
            print("Weight load successfully")
        except:
            print("Cannot find the weight")

    def update_weights(self):
        self.actor.model.save_weights("actormodel.h5", overwrite=True)
        with open("actormodel.json", "w") as outfile:
            json.dump(self.actor.model.to_json(), outfile)
        self.critic.model.save_weights("criticmodel.h5", overwrite=True)
        with open("criticmodel.json", "w") as outfile:
            json.dump(self.critic.model.to_json(), outfile)

    def update_batch(self):
        self.batch = self.buff.getBatch(self.batch_size)
        self.states = np.squeeze(np.asarray([e[0] for e in self.batch]), axis=1)
        self.actions = np.asarray([e[1] for e in self.batch])
        self.rewards = np.asarray([e[2] for e in self.batch])
        self.new_states = np.squeeze(np.asarray([e[3] for e in self.batch]), axis=1)
        self.if_dones = np.asarray([e[4] for e in self.batch])
        self.y_t = np.asarray([e[2] for e in self.batch])
        target_q_values = self.critic.target_model.predict(
            [self.new_states, self.actor.target_model.predict(self.new_states)])
        for k, done in enumerate(self.if_dones):
            self.y_t[k] = self.rewards[k] if done else self.rewards[k] + self.gamma * target_q_values[k]

    def update_loss(self):
        self.loss += self.critic.model.train_on_batch([self.states, self.actions], self.y_t)
        a_for_grad = self.actor.model.predict(self.states)
        grads = self.critic.gradients(self.states, a_for_grad)
        self.actor.train(self.states, grads)
        self.actor.target_train()
        self.critic.target_train()

    def action_noise(self, train_indicator):
        self.epsilon -= 1.0 / self.explore_iter
        noise_t = np.zeros([1, self.action_size])
        action_t_original = self.actor.model.predict(self.state_t)
        print("Action ", action_t_original)
        # Maneuver logits, acceleration parameters, and time parameters get
        # separate OU settings.
        for i in range(self.action_dim):
            noise_t[0][i] = train_indicator * max(self.epsilon, 0) * \
                self.OU.function(action_t_original[0][i], 0.00, 0.10, 0.20)
        noise_t[0][4] = train_indicator * max(self.epsilon, 0) * \
            self.OU.function(action_t_original[0][4], -0.05, self.sim_inter.Max_Acc, 1.00)
        noise_t[0][5] = train_indicator * max(self.epsilon, 0) * \
            self.OU.function(action_t_original[0][5], 0.05, -self.sim_inter.Max_Acc, 1.00)
        for i in range(self.parameter_time_dim):
            noise_t[0][i + self.action_dim + self.parameter_acc_dim] = \
                train_indicator * max(self.epsilon, 0) * \
                self.OU.function(action_t_original[0][i + self.action_dim + self.parameter_acc_dim],
                                 0.01, 0.50, 0.10)
        action = np.zeros([1, self.action_size])
        for i in range(self.action_size):
            action[0][i] = action_t_original[0][i] + noise_t[0][i]
        return action

    def update_action(self, action, train_indicator, e):
        if action == 0:
            process = 'Approach Process'
            self.action_acc = self.action_t[0][4]
            self.action_time = self.action_t[0][6]
        elif action == 1:
            process = 'Observe Process'
            self.action_time = self.action_t[0][7]
        elif action == 2:
            process = 'Wait Process'
            self.action_time = self.action_t[0][8]
        else:
            process = 'Traverse Process'
            self.action_acc = self.action_t[0][5]
            self.action_time = self.action_t[0][9]
        time_step = int(np.ceil(max(self.action_time / self.Tau, 1.0)))
        collision = False
        if_pass = False
        for ts in range(time_step):
            old_av_y = self.sim_inter.av_y
            old_av_velocity = self.sim_inter.av_velocity
            if action == 1:
                self.action_acc = (self.sim_inter.observe_vel - self.sim_inter.av_velocity) / self.sim_inter.Tau
            elif action == 2:
                self.action_acc = (-self.sim_inter.av_velocity) / self.sim_inter.Tau
            reward, collision = self.sim_inter.reward_function(self.action_acc)
            state_t1 = self.sim_inter.update_vehicle(self.action_acc)
            self.buff.add(self.state_t, self.action_t[0], reward, state_t1, self.if_done)
            self.update_batch()
            if train_indicator:
                self.update_loss()
            self.total_reward += reward
            print(process, " (", self.action_acc, ", ", self.action_time, ") ",
                  "AV = ", old_av_y, "Velocity = ", old_av_velocity,
                  "Episode", e, "Reward", reward, "Loss", self.loss)
            if action == 1 and self.state_t[0][0] <= 0:
                self.state_t = state_t1
                break
            if old_av_y > self.sim_inter.Pass_Point or collision > 0:
                if_pass = old_av_y > self.sim_inter.Pass_Point
                self.if_done = True
                break
            self.state_t = state_t1
        return collision, if_pass

    def launch_train(self, train_indicator=1):  # 1 means Train, 0 means simply Run
        print('Launch Training Process')
        np.random.seed(1337)
        self.state_t = self.sim_inter.get_state()
        self.state_dim = self.sim_inter.state_dim
        self.actor = ActorNetwork(self.sess, self.state_dim, self.action_size,
                                  self.batch_size, self.tau, self.LRA)
        self.critic = CriticNetwork(self.sess, self.state_dim, self.action_size,
                                    self.batch_size, self.tau, self.LRC)
        self.buff = ReplayBuffer(self.buffer_size)
        self.load_weights()
        for e in range(self.episode_count):
            print("Episode : " + str(e) + " Replay Buffer " + str(self.buff.count()))
            for j in range(self.max_steps):
                self.loss = 0
                self.total_reward = 0
                self.action_t = self.action_noise(train_indicator)
                choose_action = np.argmax(self.action_t[0][0:4])
                collision, if_pass = self.update_action(choose_action, train_indicator, e)
                if self.if_done:
                    self.sim_inter = UpdateInter()
                    self.state_t = self.sim_inter.get_state()
                    self.if_done = False
                    break
            if train_indicator:
                self.update_weights()
            self.total_correct += int(collision <= 0 and if_pass)
            self.total_wrong += int(collision > 0)
            accuracy = 0
            if self.total_correct + self.total_wrong:
                accuracy = self.total_correct / (self.total_correct + self.total_wrong)
            if np.mod(e, 100) == 0:
                self.accuracy_all.append(accuracy)
                self.total_correct = 0
                self.total_wrong = 0
            print("TOTAL REWARD @ " + str(e) + "-th Episode : Reward " + str(self.total_reward)
                  + " Collision " + str(collision > 0)
                  + " Accuracy " + str(accuracy)
                  + " All Accuracy " + str(self.accuracy_all))
            print("")
        print("Finish.")
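# Hypothetical entry point for the class above, assuming the module is run
# directly; launch_train(1) trains, launch_train(0) only runs the policy.
if __name__ == '__main__':
    trainer = MainTrain()
    trainer.launch_train(train_indicator=1)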
def playGame(train_indicator=1, safety_constrain_flag=False): #1 means Train, 0 means simply Run #initialization = 0 episode_trained = 0 BUFFER_SIZE = 100000 BATCH_SIZE = 32 GAMMA = 0.9999 TAU = 0.001 #Target Network HyperParameters LRA = 0.0001 #Learning rate for Actor LRC = 0.001 #Lerning rate for Critic action_dim = 2 #Steering/Acceleration/Brake state_dim = 29 + 36 #of sensors input np.random.seed(1337) vision = False EXPLORE = 100000. episode_count = 1000 max_steps = 300 reward = 0 done = False step = 0 epsilon = 1.0 indicator = 0 plt.ion() #Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC) buff = ReplayBuffer(BUFFER_SIZE) #Create replay buffer # Generate a Torcs environment env = TorcsEnv(vision=vision, throttle=True, gear_change=False) #Now load the weight print("Now we load the weight") try: actor.model.load_weights("actormodel_following.h5") critic.model.load_weights("criticmodel_following.h5") actor.target_model.load_weights("actormodel_following.h5") critic.target_model.load_weights("criticmodel_following.h5") print("Weight load successfully") except: print("Cannot find the weight") print("TORCS Experiment Start.") cumreward_list = [] average_step_reward_list = [] damage_rate_list = [] epsilon_list = [] results_list = [] trackPos_list = [] speed_list = [] epreward_list = [] damage_time = [] for i in range(episode_count): print("Episode : " + str(i) + " Replay Buffer " + str(buff.count())) print("Epsilon is: ", epsilon) if np.mod(i, 3) == 0: ob = env.reset( relaunch=True ) #relaunch TORCS every 3 episode because of the memory leak error else: ob = env.reset() s_t = np.hstack( (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents)) epsilon = epsilon * 0.998 total_reward = 0. 
        damage_steps = 0
        for j in range(max_steps):
            loss = 0
            damage = 0
            # epsilon -= 1 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            # train and run both use the target policy for the base action
            a_t_original = actor.target_model.predict(
                s_t.reshape(1, s_t.shape[0]))

            noise_t[0][0] = train_indicator * max(epsilon, 0.1) * OU.function2(
                a_t_original[0][0], 0.5, 0.90, 0.2)
            # noise_t[0][1] = train_indicator * max(epsilon, 0.0) * OU.function(a_t_original[0][1], 1.0, 1.00, 0.10)
            noise_t[0][1] = train_indicator * max(epsilon, 0.1) * OU.function1(
                a_t_original[0][1], 0.9, 1.0, 0.60)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)
            '''
            if np.random.randn() < max(epsilon, 0.05):
                a_t[0][0] = np.random.randn() * 2 - 1
            else:
                a_t[0][0] = a_t_original[0][0]
            '''

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]

            a_t_primitive = Get_actions(a_t[0][0], a_t[0][1], ob,
                                        safety_constrain=safety_constrain_flag)
            ob, r_t, done, info = env.step(a_t_primitive)

            if r_t == -5.0 or r_t == -1.0:
                damage_steps += 1
                damage = 1
            trackPos_list.append(ob.trackPos)
            speed_list.append(ob.speedX)
            epreward_list.append(r_t)
            damage_time.append(damage)

            s_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)
            step += 1
            if done:
                break

        # Float division, guarded against episodes that end on the first step
        damage_rate = 100.0 * damage_steps / max(j, 1)

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights("actormodel_following.h5",
                                         overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel_following.h5",
                                          overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        if train_indicator:
            # Save the results
            cumreward_list.append(total_reward)
            average_step_reward_list.append(total_reward / max(j, 1))
            damage_rate_list.append(damage_rate)
            epsilon_list.append(epsilon)
            sio.savemat(
                'results_overtaking.mat', {
                    'total_reward': cumreward_list,
                    'average_reward': average_step_reward_list,
                    'epsilon': epsilon_list,
                    'damage': damage_rate_list
                })
        else:
            sio.savemat(
                'info.mat', {
                    'ep_reward': epreward_list,
                    'trackPos': trackPos_list,
                    'speed': speed_list,
                    'damage_rate': damage_rate,
                    'damage_time': damage_time
                })
            print('damage rate is:', damage_rate)

            plt.figure(1)
            plt.subplot(511)
            plt.plot(i, total_reward, 'ro')
            plt.xlabel("Episode")
            plt.ylabel("Episodic total reward")
            plt.subplot(512)
            plt.plot(i, total_reward / max(j, 1), 'bo')
            plt.xlabel("Episode")
            plt.ylabel("Expected reward each step")
            plt.subplot(513)
            plt.plot(i, damage_rate, 'go')
            plt.xlabel("Episode")
            plt.ylabel("Damage rate per episode [%]")
            plt.subplot(514)
            plt.plot(i, max(epsilon, 0.1), 'yo')
            plt.xlabel("Episode")
            plt.ylabel("epsilon")
            plt.subplot(515)
            plt.plot(i, loss / max(j, 1), 'yo')
            plt.xlabel("Episode")
            plt.ylabel("Average loss")
            plt.draw()
            plt.show()
            plt.pause(0.001)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    plt.savefig('test.png')
    print("Finish.")
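# --- Illustrative helper ----------------------------------------------------
# The snippets above and below call OU.function (plus function1/function2
# variants) without defining it. A minimal sketch of the Ornstein-Uhlenbeck
# noise term they appear to assume, in the common theta*(mu - x) + sigma*randn
# form; the numbered variants are assumed to share this signature with
# different tunings. This is a sketch, not the repository's actual class.
import numpy as np


class OU(object):
    """Ornstein-Uhlenbeck exploration noise (sketch)."""

    def function(self, x, mu, theta, sigma):
        # Mean-reverting pull toward mu plus Gaussian diffusion.
        return theta * (mu - x) + sigma * np.random.randn(1)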
def playGame(train=0):  # 1 means Train, 0 means simply Run
    load_from = "."
    save_to = os.path.join("data", "saved")
    save_thresh = 100000  # Save when an episode's total reward exceeds this threshold
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic
    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    ou = OU().function  # Ornstein-Uhlenbeck Process
    buff = ReplayBuffer(BUFFER_SIZE)

    # Generate a Torcs environment
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)

    def state(ob):
        return np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ,
             ob.wheelSpinVel / 100.0, ob.rpm))

    def load_weights(dir):
        print("Loading weights from ", dir)
        try:
            actor.model.load_weights(os.path.join(dir, "actormodel.h5"))
            critic.model.load_weights(os.path.join(dir, "criticmodel.h5"))
            actor.target_model.load_weights(os.path.join(dir, "actormodel.h5"))
            critic.target_model.load_weights(
                os.path.join(dir, "criticmodel.h5"))
            print("Weights loaded successfully")
        except:
            print("Cannot find the weights")

    def save_weights(dir):
        if not os.path.exists(dir):
            os.makedirs(dir)
        print("Saving weights in ", dir)
        actor.model.save_weights(os.path.join(dir, "actormodel.h5"),
                                 overwrite=True)
        critic.model.save_weights(os.path.join(dir, "criticmodel.h5"),
                                  overwrite=True)
        with open(os.path.join(dir, "actormodel.json"), "w") as outfile:
            json.dump(actor.model.to_json(), outfile)
        with open(os.path.join(dir, "criticmodel.json"), "w") as outfile:
            json.dump(critic.model.to_json(), outfile)

    load_weights(load_from)

    print("TORCS Experiment Start.")
    np.random.seed(1337)
    done = False
    step = 0
    epsilon = 1

    for episode in range(episode_count):
        print("Episode : " + str(episode) + " Replay Buffer " +
              str(buff.count()))
        ob = env.reset()
        s_t = state(ob)
        total_reward = 0.
        progress = tqdm.trange(max_steps, disable=not train)
        for _ in progress:
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train * max(epsilon, 0) * ou(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train * max(epsilon, 0) * ou(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train * max(epsilon, 0) * ou(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])
            s_t1 = state(ob)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.update_target()
                critic.update_target()

            total_reward += r_t
            s_t = s_t1
            progress.set_description("Episode %4i, TR %6.0f, loss %7.0f" %
                                     (episode, total_reward, loss))
            # print("Episode", i, "Step", step, "Action", ["%.3f" % x for x in a_t[0]], "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                break

        # print("Episode %i, TOTAL REWARD %.0f" % (episode, total_reward))
        if train and total_reward > save_thresh:
            save_weights(save_to + str(episode))
            save_thresh = min(1000000, 2 * save_thresh)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
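# --- Illustrative helper ----------------------------------------------------
# actor.update_target() / critic.target_train() in these snippets perform
# DDPG's soft target update. A minimal sketch of that update for Keras models,
# using the TAU hyperparameter defined above; soft_update is an illustrative
# name, not the project's actual method.
def soft_update(model, target_model, tau):
    weights = model.get_weights()
    target_weights = target_model.get_weights()
    for k in range(len(weights)):
        # Move each target weight a small step toward the online weight.
        target_weights[k] = tau * weights[k] + (1.0 - tau) * target_weights[k]
    target_model.set_weights(target_weights)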
class DDPG:
    """docstring for DDPG"""

    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.env_name = env_name
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim  # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.OU = OU()

        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(save_location)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

    def train(self):
        # print("train step", self.time_step)
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.getBatch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def saveNetwork(self):
        self.saver.save(self.sess,
                        save_location + self.env_name + 'network' + '-ddpg',
                        global_step=self.time_step)

    def action(self, state):
        action = self.actor_network.action(state)
        action[0] = np.clip(action[0], -1, 1)
        action[1] = np.clip(action[1], 0, 1)
        action[2] = np.clip(action[2], 0, 1)
        # print("Action:", action)
        return action

    def noise_action(self, state, epsilon):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        # print(action.shape)
        # print("Action_No_Noise:", action)
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * self.OU.function(action[0], 0.0, 0.60, 0.80)
        noise_t[1] = epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10)
        noise_t[2] = epsilon * self.OU.function(action[2], -0.1, 1.00, 0.05)
        if random.random() <= 0.01:  # 0.1
            print("********Stochastic brake***********")
            noise_t[2] = epsilon * self.OU.function(action[2], 0.2, 1.00, 0.10)

        action = action + noise_t
        action[0] = np.clip(action[0], -1, 1)
        action[1] = np.clip(action[1], 0, 1)
        action[2] = np.clip(action[2], 0, 1)
        # print("Action_Noise:", action)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in replay buffer
        if not math.isnan(reward):
            self.replay_buffer.add(state, action, reward, next_state, done)
        self.time_step = self.time_step + 1

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
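# --- Illustrative helper ----------------------------------------------------
# perceive() stores transitions and only calls train() once the buffer holds
# more than REPLAY_START_SIZE samples. A minimal sketch of the ReplayBuffer
# interface used throughout (add / getBatch / count), assuming uniform
# sampling; the repository's actual implementation may differ.
import random
from collections import deque


class ReplayBuffer(object):
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def getBatch(self, batch_size):
        # Uniform sample; return everything while the buffer is still small.
        return random.sample(list(self.buffer),
                             min(batch_size, len(self.buffer)))

    def count(self):
        return len(self.buffer)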
def playGame(checkpoints=None, train_indicator=1, eps=1.0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 40000
    BATCH_SIZE = 16
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.01  # Learning rate for Actor
    LRC = 0.05  # Learning rate for Critic
    vision = True

    action_dim = 3  # Steering/Acceleration/Brake
    if vision:
        state_dim = (64, 64, 3)  # image input
    else:
        state_dim = 29

    np.random.seed(1337)

    EXPLORE = 1000000.
    episode_count = 2000
    max_steps = 8000000
    reward = 0
    done = False
    step = 0
    epsilon = eps
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    summary_writer = tf.train.SummaryWriter('logs', graph_def=sess.graph_def)
    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA,
                         vision, summary_writer)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC,
                           vision)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer
    history = History()

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)
    log_file = open('train_log.log', 'w')

    # Now load the weights
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel_{}.h5".format(checkpoints))
        critic.model.load_weights("criticmodel_{}.h5".format(checkpoints))
        actor.target_model.load_weights("actormodel_{}.h5".format(checkpoints))
        critic.target_model.load_weights(
            "criticmodel_{}.h5".format(checkpoints))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    max_reward = 0
    min_reward = 0
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            # relaunch TORCS every 3 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        if vision:
            history.fill(ob.img)
            s_t = history.get()
        else:
            s_t = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        total_reward = 0.
        total_damage = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            if vision:
                a_t_original = actor.model.predict(
                    s_t.reshape((-1, ) + state_dim))
            else:
                a_t_original = actor.model.predict(
                    s_t.reshape(1, s_t.shape[0]))

            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.30, 0.30)
            noise_t[0][1] = 0.1 + train_indicator * max(
                epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])
            damage = ob.damage

            if vision:
                last_s_t = history.get().copy()
                history.add(ob.img)
                next_s_t = history.get().copy()
                if np.mod(step, 4) == 0:
                    buff.add(last_s_t, a_t[0], r_t, next_s_t, done)  # Add to replay buffer
                s_t1 = history.get()
            else:
                s_t1 = np.hstack(
                    (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                     ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
                buff.add(s_t, a_t[0], r_t, s_t1, done)

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            if vision:
                target_q_values = critic.target_model.predict([
                    new_states.reshape((-1, ) + state_dim),
                    actor.target_model.predict(new_states).reshape(
                        (-1, ) + (action_dim, ))
                ])
            else:
                target_q_values = critic.target_model.predict(
                    [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator and buff.count() >= 1000:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            total_damage += damage
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)
            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights("actormodel_{}.h5".format(i),
                                         overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel_{}.h5".format(i),
                                          overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        max_reward = max(max_reward, total_reward)
        min_reward = min(min_reward, total_reward)
        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " +
              str(total_reward) + " EPS " + str(epsilon))
        print("Total Step: " + str(step) + ' Max: ' + str(max_reward) +
              ' Min: ' + str(min_reward))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
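# --- Illustrative helper ----------------------------------------------------
# The vision branch above relies on a History object that keeps the most
# recent frames and exposes fill/add/get. A minimal sketch under that
# assumption; the frame count and stacking axis are guesses inferred from the
# calls above, not the project's actual class.
import numpy as np


class History(object):
    def __init__(self, length=4):
        self.length = length
        self.frames = []

    def fill(self, img):
        # Seed the history with copies of the first frame of an episode.
        self.frames = [np.asarray(img)] * self.length

    def add(self, img):
        # Drop the oldest frame, append the newest.
        self.frames.pop(0)
        self.frames.append(np.asarray(img))

    def get(self):
        # Stack along the channel axis; reshape to the network input as needed.
        return np.concatenate(self.frames, axis=-1)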
import argparse
import time

import tensorflow as tf
from keras import backend as K
from keras.layers.core import Activation, Dense, Dropout, Flatten
from keras.models import Model, Sequential
from keras.optimizers import Adam
# from keras.engine.training import collect_trainable_weights

from mmstore import MMStore
from mujoco_actor_nn import ActorNetwork
from mujoco_critic_nn import CriticNetwork
from OU import OU

noise_func = OU()

MAX_INTERACTION = 10000000
MAX_EPI_INT = 100
MEM_SIZE = 500000
BATCH_SIZE = 64
REPETITION_NUM = 3
gamma = 0.99
SOFT_UPDATE = 1e-3
ALR = 1e-4
CLR = 1e-3

train_flag = False
train_int_cnt = 0
epi_flag = False
epi_int_cnt = 0
epi_cnt = 0
def playGame(train_indicator=0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 24  # of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 300000.
    episode_count = 20000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    # epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights
    load_name = "sample_v0_40"
    print("Now we load the weight")
    try:
        actor.model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        actor.target_model.load_weights(
            "saved/actormodel_{}.h5".format(load_name))
        critic.target_model.load_weights(
            "saved/criticmodel_{}.h5".format(load_name))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    plt.figure()
    overall_scores = []
    model_name = "sample_v0"

    print("TORCS Experiment Start.")
    attacks = []
    for i in range(-10, 0):
        val = i / 10.0
        attacks.append([77, val])
    # for i in range(45, 55):
    #     attacks.append([i, -1.5])
    #     attacks.append([i, 1.5])

    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        # if np.mod(i, 3) == 0:
        #     ob = env.reset(relaunch=True)  # relaunch TORCS every 3 episodes because of the memory leak error
        # else:
        #     ob = env.reset()
        ob = env.reset()

        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ))

        total_reward = 0.
        cur_sample = []
        for j in range(max_steps):
            # if j == 50:
            #     time.sleep(0.099)
            #     continue
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            # if j > 120:
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            if j < 20 and train_indicator:
                a_t[0][1] += 0.5

            # if j == 71:
            #     print("cp attack!")
            #     if a_t[0][0] > 0:
            #         a_t[0][0] = -0.3
            #     else:
            #         a_t[0][0] = 0.3
            #     print("%.2f" % a_t[0][0])
            #     a_t[0][2] += 0.7
            # if ob.speedX > 0.6:
            #     a_t[0][1] = 0

            if j == attacks[i][0]:
                print('cp attack on {} with {}'.format(attacks[i][0],
                                                       attacks[i][1]))
                a_t[0][0] = attacks[i][1]

            ob, r_t, done, info = env.step(a_t[0])
            print("step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f} ".format(
                j, r_t, a_t[0][0], a_t[0][1], a_t[0][2]))
            # print("{:.5f} {:.5f} {:.5f} {:.5f} {:.5f}".format(r_t, ob.speedX, ob.speedY, ob.speedZ, ob.rpm))

            # if r_t < -50:
            #     r_t -= 10000
            #     done = True
            if j > 20 and ob.rpm <= 0.09426:
                r_t -= 1000
                done = True

            theta = 0.1
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ))
            # s_t1_new = np.array([val + np.abs(val) * random.uniform(-1, 1) * theta for val in s_t1])
            # print(np.linalg.norm(s_t1_new - s_t1))
            # s_t1 = s_t1_new

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            cur_step_sample = [
                s_t.tolist(), a_t[0].tolist(), r_t, s_t1.tolist(), done
            ]
            cur_sample.append(cur_step_sample)

            # # Do the batch update
            # batch = buff.getBatch(BATCH_SIZE)
            # states = np.asarray([e[0] for e in batch])
            # actions = np.asarray([e[1] for e in batch])
            # rewards = np.asarray([e[2] for e in batch])
            # new_states = np.asarray([e[3] for e in batch])
            # dones = np.asarray([e[4] for e in batch])
            # y_t = np.asarray([e[1] for e in batch])
            # target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])
            # for k in range(len(batch)):
            #     if dones[k]:
            #         y_t[k] = rewards[k]
            #     else:
            #         y_t[k] = rewards[k] + GAMMA * target_q_values[k]
            # if (train_indicator):
            #     loss += critic.model.train_on_batch([states, actions], y_t)
            #     a_for_grad = actor.model.predict(states)
            #     grads = critic.gradients(states, a_for_grad)
            #     actor.train(states, grads)
            #     actor.target_train()
            #     critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                break
            if j > 200:
                break

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights("saved/actormodel_{}_{}.h5".format(
                    model_name, int(step / 10000)), overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("saved/criticmodel_{}_{}.h5".format(
                    model_name, int(step / 10000)), overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

        s = "{},{},{:.3f},{},{}\n".format(i, j, total_reward, attacks[i][0],
                                          attacks[i][1])
        with open('logs/attack_{}.csv'.format(model_name), 'a') as the_file:
            the_file.write(s)
        # overall_scores.append(total_reward)
        # plt.clf()
        # plt.plot(overall_scores)
        # plt.savefig("train_plots/{}_{}.jpg".format(model_name, int(step / 10000)))
        # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile:
        #     pickle.dump(cur_sample, outfile)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000  # Replay buffer capacity
    BATCH_SIZE = 32  # Number of samples processed per update
    GAMMA = 0.99  # Discount factor
    TAU = 0.001  # Target network hyperparameter (soft-update rate)
    LRA = 0.0001  # Learning rate for the Actor network
    LRC = 0.001  # Learning rate for the Critic network

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # 29 sensor inputs

    np.random.seed(1337)  # Fixed seed so every run draws the same random numbers

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # TensorFlow GPU management policy: grow memory allocation on demand
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Hard-cap GPU usage at 40%:
    # config.gpu_options.per_process_gpu_memory_fraction = 0.4
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    theTime = datetime.datetime.now()  # Current system time...
    theTime = theTime.strftime('%y-%m-%d_%H:%M:%S')  # ...as a string used to name the CSV output
    folder_path = "practise_progress/" + theTime + "/"  # Linux-style path
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print("folder created")
    else:
        print("folder existed")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            # relaunch TORCS every 3 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                         ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                         ob.rpm))

        total_reward = 0.

        csvfileHeader = "practise_progress/" + theTime + "/" + " Episode " + str(i) + ".csv"
        fileHeader = ["Step", "TrackPos", "SpeedX", "SpeedY", "SpeedZ",
                      "Action_Steering", "Action_Acceleration", "Action_Brake",
                      "Reward", "Loss"]
        csvFile = open(csvfileHeader, "w")
        writer = csv.writer(csvFile)
        writer.writerow(fileHeader)

        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                              ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # Logged columns: step count, track position, X/Y/Z speed,
            # steering/acceleration/brake outputs, reward, loss
            csvData = [step, ob.trackPos, ob.speedX * 300, ob.speedY * 300,
                       ob.speedZ * 300, a_t[0, 0], a_t[0, 1], a_t[0, 2], r_t,
                       loss]
            writer.writerow(csvData)

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)
            step += 1
            if done:
                csvFile.close()
                break

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def playGame(DDPG_config, train_indicator=1):  # 1 means Train, 0 means simply Run
    # SETUP STARTS HERE
    if train_indicator > 0:
        folder = setup_run(DDPG_config)
    elif train_indicator == 0:
        folder = DDPG_config['EXPERIMENT']

    if DDPG_config['RSEED'] == 0:
        DDPG_config['RSEED'] = None
    np.random.seed(DDPG_config['RSEED'])

    ACTIVE_NODES = DDPG_config['ACTIVE_NODES']

    # Generate an environment
    if DDPG_config['ENV'] == 'balancing':
        env = OmnetBalancerEnv(DDPG_config, folder)
    elif DDPG_config['ENV'] == 'label':
        env = OmnetLinkweightEnv(DDPG_config, folder)

    action_dim, state_dim = env.a_dim, env.s_dim

    MU = DDPG_config['MU']
    THETA = DDPG_config['THETA']
    SIGMA = DDPG_config['SIGMA']

    ou = OU(action_dim, MU, THETA, SIGMA)  # Ornstein-Uhlenbeck Process

    BUFFER_SIZE = DDPG_config['BUFFER_SIZE']
    BATCH_SIZE = DDPG_config['BATCH_SIZE']
    GAMMA = DDPG_config['GAMMA']
    EXPLORE = DDPG_config['EXPLORE']
    EPISODE_COUNT = DDPG_config['EPISODE_COUNT']
    MAX_STEPS = DDPG_config['MAX_STEPS']

    if EXPLORE <= 1:
        EXPLORE = EPISODE_COUNT * MAX_STEPS * EXPLORE
    # SETUP ENDS HERE

    reward = 0
    done = False
    wise = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, DDPG_config)
    critic = CriticNetwork(sess, state_dim, action_dim, DDPG_config)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    ltm = ['a_h0', 'a_h1', 'a_V', 'c_w1', 'c_a1', 'c_h1', 'c_h3', 'c_V']
    layers_to_mind = {}
    L2 = {}
    for k in ltm:
        layers_to_mind[k] = 0
        L2[k] = 0

    vector_to_file(ltm, folder + 'weightsL2' + 'Log.csv', 'w')

    # Now load the weights
    try:
        actor.model.load_weights(folder + "actormodel.h5")
        critic.model.load_weights(folder + "criticmodel.h5")
        actor.target_model.load_weights(folder + "actormodel.h5")
        critic.target_model.load_weights(folder + "criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("OMNeT++ Experiment Start.")

    # initial state of simulator
    s_t = env.reset()
    loss = 0

    for i in range(EPISODE_COUNT):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        total_reward = 0

        for j in range(MAX_STEPS):
            print('step ', j)
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            if train_indicator and epsilon > 0 and (step % 1000) // 100 != 9:
                noise_t[0] = epsilon * ou.evolve()

            a = a_t_original[0]
            n = noise_t[0]
            # Reflect the noise when a + n leaves (0, 1), then clip.
            a_t[0] = np.where((a + n > 0) & (a + n < 1), a + n,
                              a - n).clip(min=0, max=1)

            # execute action
            s_t1, r_t, done = env.step(a_t[0], j)
            # print(s_t1)
            print('reward ', r_t)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            scale = lambda x: x

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = scale(np.asarray([e[0] for e in batch]))
            actions = scale(np.asarray([e[1] for e in batch]))
            rewards = scale(np.asarray([e[2] for e in batch]))
            new_states = scale(np.asarray([e[3] for e in batch]))
            dones = np.asarray([e[4] for e in batch])
            y_t = np.zeros([len(batch), action_dim])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator and len(batch) >= BATCH_SIZE:
                loss = critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                # does this give an output like train_on_batch above? NO
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            with open(folder + 'lossLog.csv', 'a') as file:
                file.write(pretty(loss) + '\n')

            total_reward += r_t
            s_t = s_t1

            for layer in actor.model.layers + critic.model.layers:
                if layer.name in layers_to_mind.keys():
                    L2[layer.name] = np.linalg.norm(
                        np.ravel(layer.get_weights()[0]) -
                        layers_to_mind[layer.name])
                    # vector_to_file(np.ravel(layer.get_weights()[0]), folder + 'weights_' + layer.name + 'Log.csv', 'a')
                    layers_to_mind[layer.name] = np.ravel(
                        layer.get_weights()[0])

            # if max(L2.values()) <= 0.02:
            #     wise = True

            if train_indicator and len(batch) >= BATCH_SIZE:
                vector_to_file([L2[x] for x in ltm],
                               folder + 'weightsL2' + 'Log.csv', 'a')
                vector_to_file(a_t_original[0], folder + 'actionLog.csv', 'a')
                vector_to_file(noise_t[0], folder + 'noiseLog.csv', 'a')

            if 'PRINT' in DDPG_config.keys() and DDPG_config['PRINT']:
                print("Episode", "%5d" % i, "Step", "%5d" % step, "Reward",
                      "%.6f" % r_t)
                print("Epsilon", "%.6f" % max(epsilon, 0))
                att_ = np.split(a_t[0], ACTIVE_NODES)
                for _ in range(ACTIVE_NODES):
                    att_[_] = np.insert(att_[_], _, -1)
                att_ = np.concatenate(att_)
                print("Action\n", att_.reshape(ACTIVE_NODES, ACTIVE_NODES))
                print(max(L2, key=L2.get), pretty(max(L2.values())))

            step += 1
            if done or wise:
                break

            if step % 1000 == 0:  # writes at every 1000th step
                if train_indicator:
                    actor.model.save_weights(folder + "actormodel.h5",
                                             overwrite=True)
                    actor.model.save_weights(folder + "actormodel" +
                                             str(step) + ".h5")
                    with open(folder + "actormodel.json", "w") as outfile:
                        outfile.write(actor.model.to_json(indent=4) + '\n')

                    critic.model.save_weights(folder + "criticmodel.h5",
                                              overwrite=True)
                    critic.model.save_weights(folder + "criticmodel" +
                                              str(step) + ".h5")
                    with open(folder + "criticmodel.json", "w") as outfile:
                        outfile.write(critic.model.to_json(indent=4) + '\n')

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down
    print("Finish.")
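# --- Illustrative example ----------------------------------------------------
# The exploration step above reflects the OU noise instead of only clipping it:
# if a + n would leave (0, 1), the noise is subtracted rather than added, and
# the result is clipped as a last resort. A small self-contained demonstration:
import numpy as np

a = np.array([0.05, 0.50, 0.95])   # actions from the policy
n = np.array([-0.10, 0.20, 0.10])  # OU noise sample
a_explored = np.where((a + n > 0) & (a + n < 1), a + n, a - n).clip(min=0, max=1)
print(a_explored)  # [0.15 0.7  0.85] -- out-of-range moves are mirrored back inside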
def playGame(train_indicator=0):  # 1 means Train, 0 means simply Run
    time.sleep(1)
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 24  # of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 300000.
    episode_count = 20000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    # epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    pre_model = load_model("weights_rescale_all-0000.hdf5")
    # x = np.array([ 4.82767379e-01, 5.92105016e-02, 3.61700505e-01, 2.74807483e-01,
    #                2.31401995e-01, 2.07236990e-01, 1.95800006e-01, 1.89892501e-01,
    #                1.84837490e-01, 1.81293502e-01, 1.77807003e-01, 1.74377009e-01,
    #                1.71005994e-01, 1.66384503e-01, 1.61247000e-01, 1.52030498e-01,
    #                1.35238498e-01, 1.11962005e-01, 8.79574940e-02, 4.76383008e-02,
    #                4.78339800e-01, 6.97819047e-01, 4.60800716e-01, 5.00754069e-01,
    #                -1.00000000e+00, 9.99979496e-01, 8.71338917e-13])
    # x_s = np.array([x, x])
    # pre_y = pre_model.predict(x_s)
    # print(x_s[0])
    # print(pre_y[0])

    # Now load the weights
    load_name = "sample_v0_40"
    print("Now we load the weight")
    try:
        actor.model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        actor.target_model.load_weights(
            "saved/actormodel_{}.h5".format(load_name))
        critic.target_model.load_weights(
            "saved/criticmodel_{}.h5".format(load_name))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    plt.figure()
    overall_scores = []
    model_name = "sample_v0"

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            # relaunch TORCS every 3 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ))

        total_reward = 0.
        cur_sample = []
        attack_valid = 1
        gap = (i // 10) / 100.0  # floor division: gap grows by 0.01 every 10 episodes
        attack_step = -1
        attack_target = 0
        for j in range(max_steps):
            # if j == 50:
            #     time.sleep(0.099)
            #     continue
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            # if j > 120:
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            if j < 20 and train_indicator:
                a_t[0][1] += 0.5

            # os.system("scrot saved_pic/{}.png".format(j))
            if j == 80:
                print("cp attack!")
                a_t[0][0] = -1.0
            if j == 83:
                os.system("scrot saved_pic/{}.png".format(j))
            #     if a_t[0][0] > 0:
            #         a_t[0][0] = -0.3
            #     else:
            #         a_t[0][0] = 0.3
            #     print("%.2f" % a_t[0][0])
            #     a_t[0][2] += 0.7
            # if ob.speedX > 0.6:
            #     a_t[0][1] = 0
            # if step == 60:
            #     a_t[0][0] = 1.0

            # s_t_scaled = rescale_state(s_t)
            # # print(s_t[0])
            # s_t_0 = restore_state(s_t_scaled)
            # # print(s_t_0[0])
            # new_a_t = actor.model.predict(s_t_0.reshape(1, s_t_0.shape[0]))
            # s_t_scaled_list = np.array([np.copy(s_t_scaled) for val in range(21)])
            # actions = np.array([np.copy(a_t[0]) for val in range(21)])
            # for val in range(21):
            #     actions[val][0] = -1.0 + val / 10.0
            # # print(actions)
            # x_0 = np.hstack((s_t_scaled_list, actions))
            # # print(x_0.shape, s_t_scaled_list.shape, actions.shape)
            # pre_y = pre_model.predict(x_0)
            # # print(x_0[0])
            # # print(pre_y[0])
            # steer_index = int(a_t[0][0] * 10.0 + 10.0)
            # for pre_step in range(2):
            #     restore_new_Y = restore_states(pre_y)
            #     actions = actor.model.predict(restore_new_Y)
            #     x_step1 = np.hstack((pre_y, actions))
            #     pre_y = pre_model.predict(x_step1)
            # for index in range(21):
            #     diff = calsulate_d(pre_y[index]) - calsulate_d(pre_y[steer_index])
            #     pro = np.random.random()
            #     if diff > gap and attack_valid == 1 and pro > 0.8 and j > 50:
            #         a_t[0][0] = -1.0 + index / 10.0
            #         print("adv!", diff, "pro:", pro)
            #         attack_step = j
            #         attack_target = a_t[0][0]
            #         attack_valid -= 1
            # dis_list = np.array([(calsulate_d(st) - calsulate_d(pre_y[steer_index])) for st in pre_y])
            # print("{:.2f}".format(max(dis_list) * 100000))
            # print("{}".format(max(dis_list) * 100000))
            # s_t_scaled = np.copy(s_t1)
            # s_t_scaled[0] = rescale_data(s_t_scaled[0], 0.5)
            # s_t_scaled[20] = rescale_data(s_t_scaled[20], 2.5)
            # s_t_scaled[21] = rescale_data(s_t_scaled[21], 0.7)
            # s_t_scaled[22] = rescale_data(s_t_scaled[22], 0.7)
            # s_t_scaled[23] = rescale_data(s_t_scaled[23], 0.7)
            # actions = actor.model.predict(s_t_scaled.reshape(1, s_t_scaled.shape[0]))
            # print(actions[0][0])
            # ob, r_t, done, info = env.step(new_a_t[0])

            ob, r_t, done, info = env.step(a_t[0])
            print("step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f} ".format(
                j, r_t, a_t[0][0], a_t[0][1], a_t[0][2]))
            # print(a_t[0][0])
            # print("{:.5f} {:.5f} {:.5f} {:.5f} {:.5f}".format(r_t, ob.speedX, ob.speedY, ob.speedZ, ob.rpm))

            # if r_t < -50:
            #     r_t -= 10000
            #     done = True
            if j > 20 and ob.rpm <= 0.09426:
                r_t -= 1000
                done = True

            theta = 0.1
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ))
            # action_states = []
            # for i in range(-5, 6):
            # s_t1_new = np.array([val + np.abs(val) * random.uniform(-1, 1) * theta for val in s_t1])
            # print(np.linalg.norm(s_t1_new - s_t1))
            # s_t1 = s_t1_new

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # cur_step_sample = [s_t.tolist(), a_t[0].tolist(), r_t, s_t1.tolist(), done]
            # cur_sample.append(cur_step_sample)

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                break
            if j > 500:
                break

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights("saved/actormodel_{}_{}.h5".format(
                    model_name, int(step / 10000)), overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("saved/criticmodel_{}_{}.h5".format(
                    model_name, int(step / 10000)), overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

        s = "{},{},{},{},{},{:.3f}\n".format(gap, attack_step, attack_target,
                                             i, j, total_reward)
        attack_valid = 1
        attack_step = -1
        attack_target = 0
        with open('logs/pm_adv_test.csv', 'a') as the_file:
            the_file.write(s)

        overall_scores.append(total_reward)
        plt.clf()
        plt.plot(overall_scores)
        plt.savefig("train_plots/{}_{}.jpg".format(model_name,
                                                   int(step / 10000)))
        # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile:
        #     pickle.dump(cur_sample, outfile)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def playGame(train_indicator=0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights
    # print("Now we load the weight")
    # try:
    #     actor.model.load_weights("actormodel.h5")
    #     critic.model.load_weights("criticmodel.h5")
    #     actor.target_model.load_weights("actormodel.h5")
    #     critic.target_model.load_weights("criticmodel.h5")
    #     print("Weight load successfully")
    # except:
    #     print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            # relaunch TORCS every 3 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                         ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                         ob.rpm))
        print(ob.track)

        total_reward = 0.
        stucked = 0
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            if random.random() <= 0.1:
                print("********Now we apply the brake***********")
                noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                    a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)
            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 512  # FC-feature dimension produced by the image network below

    np.random.seed(61502)

    vision = True

    EXPLORE = 100000.
    episode_count = 600000
    max_steps = 1800
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    esar2 = []
    esar4 = []

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # We insert the Deep Q Image Processing Module
    args = {
        'save_model_freq': 10000,
        'target_model_update_freq': 10000,
        'normalize_weights': True,
        'learning_rate': .00025,
        'model': None
    }
    # print(args["save_model_freq"])
    C = DeepQNetwork(512, sess, '/home/lou/DDPG-Keras-Torcs', args=args)
    # print(C)
    x, h_fc1 = C.buildNetwork('test', trainable=True, numActions=1)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodelIMG.h5")
        critic.model.load_weights("criticmodelIMG.h5")
        actor.target_model.load_weights("actormodel2IMG.h5")
        critic.target_model.load_weights("criticmodel2IMG.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 500) == 0:
            # relaunch TORCS every 500 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        imgfinal = np.zeros((1, 128, 128, 4), dtype=np.int32)
        s_t = C.getFC7(imgfinal)

        total_reward = 0.
        imglst = []
        speed = 0
        stepreset = 0
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            # a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            a_t_original = actor.model.predict(C.getFC7(imgfinal))
            # print('ATORIGINAL', a_t_original)
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            if random.random() <= 0.05:
                print("********Now we apply the brake***********")
                noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                    a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            imglst.append(ob.img)
            if len(imglst) == 4:
                imgcopy = imglst[:]
                imgfinal = np.stack(imgcopy)
                # print("Original stacked matrix", imgfinal)
                imgfinal = np.reshape(imgfinal, (4, 128, 128))
                # print("Reshaped stacked matrix", imgfinal)
                imgfinal = np.transpose(imgfinal, (1, 2, 0))
                # print("Transposed stacked matrix", imgfinal)
                imgfinal = np.reshape(imgfinal, (1, 128, 128, 4))
                # print("Shape of imgfinal", imgfinal.shape)

            s_t1 = C.getFC7(imgfinal)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            # print('NEW STATES', new_states)
            # target_q_values = critic.target_model.predict([C.getFC7(imgfinal), actor.target_model.predict(C.getFC7(imgfinal))])
            # print('ACTOR TARGET MODEL PREDICT', C.getFC7(imgfinal))
            new_states = np.reshape(new_states, (-1, 512))
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            # print('TARGET Q VALUES', target_q_values)
            # print('NEW STATES', new_states)
            # print('ACTOR MODEL PREDICT NEW STATES', actor.target_model.predict(new_states))
            # print('REWARDS', rewards)

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                states = np.reshape(states, (-1, 512))
                print('STATESSHAPE', np.shape(states))
                print('ACTIONSSHAPE', np.shape(actions))
                print('YT', np.shape(y_t))
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            speed += ob.speedX * 300
            # stepreset is incremented below, so divide by the number of steps
            # taken so far (this also avoids a division by zero on step 0)
            speedavg = speed / (stepreset + 1)
            # print("SPEED X", ob.speedX)
            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss, "Average Speed", speedavg)

            esar = (i, step, a_t, r_t, loss, speedavg)
            esar2.append(esar)

            step += 1
            stepreset += 1

            if len(imglst) >= 4:
                del imglst[0]
            # print("Length of imglist", len(imglst))
            # print("List itself", imgfinal)

            if done:
                break

        if np.mod(i, 50) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights("actormodelIMG.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodelIMG.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

        esar3 = (i, step, total_reward, speedavg)
        esar4.append(esar3)

        if np.mod(i, 50) == 0:
            save_object(esar2, 'IntraEpisode.pkl')
            save_object(esar4, 'InterEpisode.pkl')

    env.end()  # This is for shutting down TORCS
    print("Finish.")
    print("Saving esars.")
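# --- Illustrative helper ----------------------------------------------------
# save_object() is called above to checkpoint the esar tuples but is not
# defined in this snippet. A minimal sketch assuming plain pickle
# serialization; the project's actual helper may differ.
import pickle


def save_object(obj, filename):
    with open(filename, 'wb') as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)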
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.00005  # Learning rate for Actor
    LRC = 0.0005  # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 200000.
    if train_indicator:
        episode_count = 1000
    else:
        episode_count = 20
    max_steps = 4000
    step = 0
    if train_indicator:
        epsilon = 1
    else:
        epsilon = 0
    min_laptime = 10000000

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weights (loading networks)
    print("Now we load the weight")
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("saved_networks/")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            # relaunch TORCS every 3 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                         ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                         ob.rpm))

        total_reward = 0.
        # totalLaptime = 0.
        for j in range(max_steps):
            loss = 0
            if train_indicator:
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, 0.10)
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0], train_indicator)

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0,
                              ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_predict(
                new_states, actor.target_predict(new_states))

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.train_on_batch(states, actions, y_t)
                a_for_grad = actor.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if np.mod(step, 100) == 0:
                print("Episode", i, "Step", step, "Epsilon", epsilon,
                      "Action", a_t, "Reward", r_t, "Loss", loss)
                # , "curLapTime", ob.curLapTime)

            step += 1
            if i == 0:
                break
            if done:
                break

        # if np.mod(i, 3) == 0:
        if train_indicator and i > 0:
            if env.lapTime < min_laptime and env.num_lap == 10:
                min_laptime = env.lapTime
                print("Now we save model")
                saver.save(sess,
                           'saved_networks/' + 'network' + '-ddpg-{}'.format(i))

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def playGame(train_indicator=0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.00001  # Learning rate for Actor
    LRC = 0.0001  # Learning rate for Critic

    server_number = 5
    # node_number = 18
    hot_node_number = 150
    action_dim = hot_node_number  # one action per hot node
    state_dim = hot_node_number * (server_number + 1 + 10)  # hot nodes x (server_number + 1 + 10) features
    # baseline = 4e-05  # load & locality of baselines

    np.random.seed(500)

    # vision = False

    EXPLORE = 100000.
    episode_count = 100
    max_steps = 100000
    line_number = 1000
    step_number = 35
    # reward = 0
    done = False
    step = 0
    epsilon = 1
    # indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate an MDS environment
    env = MetaEnvironment(server_number)

    # Now load the weights
    print("Now we load the weight")
    try:
        actor.model.load_weights("model/actormodel-" + str(server_number) +
                                 ".h5")
        critic.model.load_weights("model/criticmodel-" + str(server_number) +
                                  ".h5")
        actor.target_model.load_weights("model/actormodel-" +
                                        str(server_number) + ".h5")
        critic.target_model.load_weights("model/criticmodel-" +
                                         str(server_number) + ".h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("Experiment Start.")

    f = open("query.txt")
    queryList = []
    for line in f.readlines():
        line = line.strip()
        queryList.append(line)
    f.close()

    sumLoc = 0
    sumLod = 0
    lossList = []
    mdsLoadList = [[] for x in range(server_number)]

    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        # if np.mod(i, 3) == 0:
        #     ob = env.reset(relaunch=True)  # relaunch every 3 episodes because of the memory leak error
        # else:
        #     ob = env.reset()

        traceList = queryList[0:line_number]  # Reset
        s_t = env.state(traceList)  # Get state from env

        localityList = []
        loadList = []

        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            # add noise
            a_t_original = actor.model.predict(s_t)
            for k in range(action_dim):
                noise_t[0][k] = train_indicator * max(epsilon, 0) * OU.function(
                    a_t_original[0][k], 0.0, 0.60, 0.30)
            for m in range(action_dim):
                a_t[0][m] = a_t_original[0][m]  # + noise_t[0][m]

            migration = env.take_actions(a_t[0])
            print("migration", migration)

            tracelist = queryList[(j + 1) * line_number:(j + 2) * line_number]
            s_t1 = env.state(tracelist)  # Update state from env

            # r_t = 0.5 * env.locality() + 50 * env.load() - baseline
            # print("gagaga", 1e5 * env.locality() + 1e7 * env.load())
            # 1.5, 3, 2
            x = 1e5 * env.locality() + 1e7 * env.load() - 1.5 * migration
            # x = 1e5 * env.locality() + 1.5 * 1e7 * env.load()
            # r_t = 1.0 / (1.0 + np.exp(-(x / 50)))
            r_t = x

            if j == step_number:
                done = True
            else:
                done = False

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            states = states.reshape(len(batch), -1)
            new_states = new_states.reshape(len(batch), -1)
            actions = actions.reshape(len(batch), -1)

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss, "Locality", env.locality(), "Load", env.load())
            print("Episode", i, "Step", step, "Reward", r_t, "Loss", loss,
                  "Locality", env.locality(), "Load", env.load())

            lossList.append(loss)
            localityList.append(env.locality())
            loadList.append(env.load())
            for index in range(server_number):
                mdsLoadList[index].append(env.loadList[index])

            step += 1
            if done:
                break

        curLocalitySum = sum(localityList)
        curLoadSum = sum(loadList)
        # f = open('' + str(server_number) + '.txt', 'w')
        # f.write(','.join(map(str, lossList)))
        # f.close()
        # f = open('anglecut-mdsload-' + str(server_number) + '.txt', 'w')
        # for i in range(server_number):
        #     f.write(','.join(map(str, mdsLoadList[i])))
        #     f.write('\n')
        # f.close()
        # print("Write finished")

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("Now we save model")
                actor.model.save_weights("model/actormodel-" +
                                         str(server_number) + ".h5",
                                         overwrite=True)
                with open("model/actormodel-" + str(server_number) + ".json",
                          "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("model/criticmodel-" +
                                          str(server_number) + ".h5",
                                          overwrite=True)
                with open("model/criticmodel-" + str(server_number) + ".json",
                          "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        # print("Final Locality:", env.final_locality(), "Final Load Balancing:", env.final_load())
        # env.clear()
        print("")

    # env.end()
    print("Finish.")
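# --- Illustrative helper ----------------------------------------------------
# Every batch update in these snippets forms the same DDPG target:
# y = r for terminal transitions, y = r + gamma * Q'(s', mu'(s')) otherwise.
# A compact helper capturing that rule (the snippets inline this logic):
def bellman_targets(rewards, dones, target_q_values, gamma):
    return [r if d else r + gamma * q
            for r, d, q in zip(rewards, dones, target_q_values)]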