class Simulation(): def __init__(self, args,shared_value): super(Simulation, self).__init__() seed = args.seed np.random.seed(seed) torch.manual_seed(seed) simu_params = { 'number_of_vehicles': 0, 'number_of_walkers': 0, 'obs_size': (160, 100), # screen size of cv2 window 'dt': 0.025, # time interval between two frames 'ego_vehicle_filter': 'vehicle.lincoln*', # filter for defining ego vehicle 'port': 2000, # connection port 'task_mode': 'Straight', # mode of the task, [random, roundabout (only for Town03)] 'code_mode': 'test', 'max_time_episode': 100, # maximum timesteps per episode 'desired_speed': 15, # desired speed (m/s) 'max_ego_spawn_times': 100, # maximum times to spawn ego vehicle } self.stop_sign = shared_value[1] self.args = args self.env = gym.make(args.env_name, params=simu_params) self.device = torch.device("cpu") self.load_index = self.args.max_train # self.load_index = 40000 self.actor = PolicyNet(args).to(self.device) self.actor.load_state_dict(torch.load('./'+self.args.env_name+'/method_' + str(self.args.method) + '/model/policy1_' + str(self.load_index) + '.pkl',map_location='cpu')) self.Q_net1 = QNet(args).to(self.device) self.Q_net1.load_state_dict(torch.load('./'+self.args.env_name+'/method_' + str(self.args.method) + '/model/Q1_' + str(self.load_index) + '.pkl',map_location='cpu')) if self.args.double_Q: self.Q_net2 = QNet(args).to(self.device) self.Q_net2.load_state_dict(torch.load('./'+self.args.env_name+'/method_' + str(self.args.method) + '/model/Q2_' + str(self.load_index) + '.pkl',map_location='cpu')) self.test_step = 0 self.save_interval = 10000 self.iteration = 0 self.reward_history = [] self.entropy_history = [] self.epoch_history =[] self.done_history = [] self.Q_real_history = [] self.Q_history =[] self.Q_std_history = [] def run(self): alpha = 0.004 step = 0 summaryFlag = True while True: self.state, self.info = self.env.reset() self.episode_step = 0 state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device) info_tensor = torch.FloatTensor(self.info.copy()).float().to(self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, log_prob, std = self.actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), True) for i in range(500): q = self.Q_net1(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0] if self.args.double_Q: q = torch.min( q, self.Q_net2(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0]) self.Q_history.append(q.detach().item()) self.u = self.u.squeeze(0) # TODO if summaryFlag: with SummaryWriter(log_dir='./logs') as writer: # writer.add_scalar('random', np.random.randint(0, 10), i) v = self.env.ego.get_velocity() v = np.array([v.x, v.y, v.z]) writer.add_scalar('v_x', self.env.state_info['velocity_t'][0], i) writer.add_scalar('v_y', self.env.state_info['velocity_t'][1], i) writer.add_scalar('accelaration_x', self.env.state_info['acceleration_t'][0], i) writer.add_scalar('accelaration_y', self.env.state_info['acceleration_t'][1], i) # writer.add_scalar('distance2terminal', self.env.state_info['dist_to_dest'], i) # writer.add_scalar('delta_yaw', self.state[5]*2, i) writer.add_scalar('angular_speed_z', self.env.state_info['dyaw_dt_t'], i) # writer.add_scalar('lateral_dist', self.state[7]/10, i) writer.add_scalar('action_throttle', self.env.state_info['action_t_1'][0], i) writer.add_scalar('action_steer', self.env.state_info['action_t_1'][1], i) writer.add_scalar('delta_yaw', 
self.env.state_info['delta_yaw_t'], i) writer.add_scalar('dist2center', self.env.state_info['lateral_dist_t'], i) self.state, self.reward, self.done, self.info = self.env.step(self.u) self.reward_history.append(self.reward) self.done_history.append(self.done) self.entropy_history.append(log_prob) # render the image cv2.imshow("camera img", self.state.squeeze()) cv2.waitKey(1) # if step%10000 >=0 and step%10000 <=9999: # self.env.render(mode='human') state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device) info_tensor = torch.FloatTensor(self.info.copy()).float().to(self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, log_prob, std = self.actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), True) if self.done == True or self.env.isTimeOut: time.sleep(1) print("Episode Done!") summaryFlag = False # return break step += 1 self.episode_step += 1 if self.done == True: pass #break print(self.reward_history) for i in range(len(self.Q_history)): a = 0 for j in range(i, len(self.Q_history), 1): a += pow(self.args.gamma, j-i)*self.reward_history[j] for z in range(i+1, len(self.Q_history), 1): a -= alpha * pow(self.args.gamma, z-i) * self.entropy_history[z] self.Q_real_history.append(a) plt.figure() x = np.arange(0,len(self.Q_history),1) plt.plot(x, np.array(self.Q_history), 'r', linewidth=2.0) plt.plot(x, np.array(self.Q_real_history), 'k', linewidth=2.0) plt.show()
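# --- Editor's note (sketch, not part of the original file) --------------------------
# The nested loops at the end of Simulation.run() rebuild the entropy-augmented
# Monte-Carlo return
#     G_i = sum_{j>=i} gamma^(j-i) * r_j - alpha * sum_{z>i} gamma^(z-i) * log_prob_z
# in O(n^2). The same values can be obtained with one backward pass; `rewards` and
# `log_probs` correspond to self.reward_history and self.entropy_history above.
def soft_discounted_returns(rewards, log_probs, gamma, alpha):
    returns = [0.0] * len(rewards)
    running = 0.0
    for i in reversed(range(len(rewards))):
        returns[i] = rewards[i] + gamma * running
        # the log-prob of step i only discounts into the returns of earlier steps,
        # matching the z = i+1, ..., T inner loop above
        running = returns[i] - alpha * log_probs[i]
    return returns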
class Evaluator(object): def __init__(self, args, shared_value, share_net): seed = args.seed np.random.seed(seed) torch.manual_seed(seed) eval_params = { 'obs_size': (160, 100), # screen size of cv2 window 'dt': 0.025, # time interval between two frames 'ego_vehicle_filter': 'vehicle.lincoln*', # filter for defining ego vehicle 'port': int(2000 + 3 * args.num_actors), # connection port 'task_mode': 'Straight', # mode of the task, [random, roundabout (only for Town03)] 'code_mode': 'test', 'max_time_episode': 100, # maximum timesteps per episode 'desired_speed': 15, # desired speed (m/s) 'max_ego_spawn_times': 100, # maximum times to spawn ego vehicle } self.stop_sign = shared_value[1] self.iteration_counter = shared_value[2] self.iteration = self.iteration_counter.value self.share_net = share_net self.args = args self.env = gym.make(args.env_name, params=eval_params) self.device = torch.device("cpu") self.actor = PolicyNet(args).to(self.device) self.Q_net1 = QNet(args).to(self.device) self.Q_net2 = QNet(args).to(self.device) self.actor_share = share_net[4] self.Q_net1_share = share_net[0] self.Q_net2_share = share_net[2] self.log_alpha_share = share_net[-1] self.alpha = np.exp(self.log_alpha_share.detach().item() ) if args.alpha == 'auto' else 0 self.evaluation_interval = 20000 self.max_state_num_evaluated_in_an_episode = 50 # 500 self.episode_num_evaluation = 5 self.episode_num_test = 5 self.time = time.time() self.list_of_n_episode_rewards_history = [] self.time_history = [] self.alpha_history = [] self.average_return_with_diff_base_history = [] self.average_reward_history = [] self.iteration_history = [] self.evaluated_Q_mean_history = [] self.evaluated_Q_std_history = [] self.true_gamma_return_mean_history = [] self.policy_entropy_history = [] self.a_std_history = [] self.a_abs_history = [] def average_max_n(self, list_for_average, n): sorted_list = sorted(list_for_average, reverse=True) return sum(sorted_list[:n]) / n def run_an_episode(self, deterministic): #state_list = [] action_list = [] log_prob_list = [] reward_list = [] evaluated_Q_list = [] Q_std_list = [] a_std_list = [] done = 0 state, info = self.env.reset() while not done and len(reward_list) < (self.args.max_step - 1): state_tensor = torch.FloatTensor(state.copy()).float().to( self.device) info_tensor = torch.FloatTensor(info.copy()).float().to( self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) # 3, 256, 256 u, log_prob, a_std = self.actor.get_action( state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), deterministic) log_prob_list.append(log_prob) a_std_list.append(a_std) if self.args.double_Q and not self.args.double_actor: q = torch.min( self.Q_net1.evaluate( state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0], self.Q_net2.evaluate( state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0]) else: q, q_std, _ = self.Q_net1.evaluate( state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device)) evaluated_Q_list.append(q.detach().item()) if self.args.distributional_Q: Q_std_list.append(q_std.detach().item()) else: Q_std_list.append(0) u = u.squeeze(0) state, reward, done, info = self.env.step(u) # self.env.render(mode='human') action_list.append(u) reward_list.append(reward * self.args.reward_scale) if not deterministic: entropy_list = list(-self.alpha * np.array(log_prob_list)) true_gamma_return_list = cal_gamma_return_of_an_episode( reward_list, 
entropy_list, self.args.gamma) policy_entropy = -sum(log_prob_list) / len(log_prob_list) a_std_mean = np.mean(np.array(a_std_list), axis=0) a_abs_mean = np.mean(np.abs(np.array(action_list)), axis=0) return dict( #state_list=np.array(state_list), #action_list=np.array(action_list), log_prob_list=np.array(log_prob_list), policy_entropy=policy_entropy, #reward_list=np.array(reward_list), a_std_mean=a_std_mean, a_abs_mean=a_abs_mean, evaluated_Q_list=np.array(evaluated_Q_list), Q_std_list=np.array(Q_std_list), true_gamma_return_list=true_gamma_return_list, ) else: episode_return = sum(reward_list) / self.args.reward_scale episode_len = len(reward_list) return dict(episode_return=episode_return, episode_len=episode_len) def run_n_episodes(self, n, max_state, deterministic): n_episode_state_list = [] n_episode_action_list = [] n_episode_log_prob_list = [] n_episode_reward_list = [] n_episode_evaluated_Q_list = [] n_episode_Q_std_list = [] n_episode_true_gamma_return_list = [] n_episode_return_list = [] n_episode_len_list = [] n_episode_policyentropy_list = [] n_episode_a_std_list = [] for _ in range(n): episode_info = self.run_an_episode(deterministic) # n_episode_state_list.append(episode_info['state_list']) # n_episode_action_list.append(episode_info['action_list']) # n_episode_log_prob_list.append(episode_info['log_prob_list']) # n_episode_reward_list.append(episode_info['reward_list']) if not deterministic: n_episode_evaluated_Q_list.append( episode_info['evaluated_Q_list']) n_episode_Q_std_list.append(episode_info['Q_std_list']) n_episode_true_gamma_return_list.append( episode_info['true_gamma_return_list']) n_episode_policyentropy_list.append( episode_info['policy_entropy']) n_episode_a_std_list.append(episode_info['a_std_mean']) n_episode_action_list.append(episode_info['a_abs_mean']) else: n_episode_return_list.append(episode_info['episode_return']) n_episode_len_list.append(episode_info['episode_len']) if not deterministic: average_policy_entropy = sum(n_episode_policyentropy_list) / len( n_episode_policyentropy_list) average_a_std = np.mean(np.array(n_episode_a_std_list), axis=0) average_a_abs = np.mean(np.array(n_episode_action_list), axis=0) # n_episode_evaluated_Q_list_history = list(map(lambda x: x['n_episode_evaluated_Q_list'], n_episodes_info_history)) # n_episode_true_gamma_return_list_history = list(map(lambda x: x['n_episode_true_gamma_return_list'], n_episodes_info_history)) def concat_interest_epi_part_of_one_ite_and_mean(list_of_n_epi): tmp = list(copy.deepcopy(list_of_n_epi)) tmp[0] = tmp[0] if len( tmp[0]) <= max_state else tmp[0][:max_state] def reduce_fuc(a, b): return np.concatenate( [a, b]) if len(b) < max_state else np.concatenate( [a, b[:max_state]]) interest_epi_part_of_one_ite = reduce(reduce_fuc, tmp) return sum(interest_epi_part_of_one_ite) / len( interest_epi_part_of_one_ite) evaluated_Q_mean = concat_interest_epi_part_of_one_ite_and_mean( np.array(n_episode_evaluated_Q_list)) evaluated_Q_std = concat_interest_epi_part_of_one_ite_and_mean( np.array(n_episode_Q_std_list)) true_gamma_return_mean = concat_interest_epi_part_of_one_ite_and_mean( np.array(n_episode_true_gamma_return_list)) return dict(evaluated_Q_mean=evaluated_Q_mean, true_gamma_return_mean=true_gamma_return_mean, evaluated_Q_std=evaluated_Q_std, n_episode_reward_list=np.array(n_episode_reward_list), policy_entropy=average_policy_entropy, a_std=average_a_std, a_abs=average_a_abs) else: average_return_with_diff_base = np.array([ self.average_max_n(n_episode_return_list, x) for x in [1, 
self.episode_num_test - 2, self.episode_num_test] ]) average_reward = sum(n_episode_return_list) / sum( n_episode_len_list) return dict( n_episode_reward_list=np.array(n_episode_reward_list), average_return_with_diff_base=average_return_with_diff_base, average_reward=average_reward, ) def run(self): while not self.stop_sign.value: if self.iteration_counter.value % self.evaluation_interval == 0: self.alpha = np.exp(self.log_alpha_share.detach().item() ) if self.args.alpha == 'auto' else 0 self.iteration = self.iteration_counter.value self.actor.load_state_dict(self.actor_share.state_dict()) self.Q_net1.load_state_dict(self.Q_net1_share.state_dict()) self.Q_net2.load_state_dict(self.Q_net2_share.state_dict()) delta_time = time.time() - self.time self.time = time.time() n_episode_info = self.run_n_episodes( self.episode_num_evaluation, self.max_state_num_evaluated_in_an_episode, False) self.iteration_history.append(self.iteration) self.evaluated_Q_mean_history.append( n_episode_info['evaluated_Q_mean']) self.evaluated_Q_std_history.append( n_episode_info['evaluated_Q_std']) self.true_gamma_return_mean_history.append( n_episode_info['true_gamma_return_mean']) self.time_history.append(delta_time) # self.list_of_n_episode_rewards_history.append(list_of_n_episode_rewards) self.alpha_history.append(self.alpha.item()) self.policy_entropy_history.append( n_episode_info['policy_entropy']) self.a_std_history.append(n_episode_info['a_std']) self.a_abs_history.append(n_episode_info['a_abs']) n_episode_info_test = self.run_n_episodes( self.episode_num_test, self.max_state_num_evaluated_in_an_episode, True) self.average_return_with_diff_base_history.append( n_episode_info_test['average_return_with_diff_base']) self.average_reward_history.append( n_episode_info_test['average_reward']) print('Saving evaluation results of the {} iteration.'.format( self.iteration)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/iteration', np.array(self.iteration_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/evaluated_Q_mean', np.array(self.evaluated_Q_mean_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/evaluated_Q_std', np.array(self.evaluated_Q_std_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/true_gamma_return_mean', np.array(self.true_gamma_return_mean_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/time', np.array(self.time_history)) # np.save('./' + self.args.env_name + '/method_' + str(self.args.method) + '/result/list_of_n_episode_rewards', # np.array(self.list_of_n_episode_rewards_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/average_return_with_diff_base', np.array(self.average_return_with_diff_base_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/average_reward', np.array(self.average_reward_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/alpha', np.array(self.alpha_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/policy_entropy', np.array(self.policy_entropy_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/a_std', np.array(self.a_std_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/a_abs', np.array(self.a_abs_history)) # 
plot_online(self.args.env_name, self.args.method, self.args.method_name,
                #             self.max_state_num_evaluated_in_an_episode)
                if self.iteration >= self.args.max_train:
                    self.stop_sign.value = 1
                    break
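# --- Editor's note (assumed helper, source not in this file) ------------------------
# Evaluator.run_an_episode() calls cal_gamma_return_of_an_episode(reward_list,
# entropy_list, gamma), which is imported from elsewhere. Given that entropy_list
# already holds -alpha * log_prob per step and that its output is compared against
# the per-state Q estimates, a plausible implementation is the backward recursion
# below; treat the exact form as an assumption.
import numpy as np

def cal_gamma_return_of_an_episode(reward_list, entropy_list, gamma):
    rewards = np.asarray(reward_list, dtype=np.float64)
    entropies = np.asarray(entropy_list, dtype=np.float64)
    returns = np.zeros_like(rewards)
    running = 0.0
    for i in reversed(range(len(rewards))):
        returns[i] = rewards[i] + gamma * running
        running = returns[i] + entropies[i]   # entropy bonus of step i enters earlier returns
    return returns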
class Test(): def __init__(self, args, shared_value, share_net): super(Test, self).__init__() seed = args.seed np.random.seed(seed) torch.manual_seed(seed) test_params = { 'obs_size': (160, 100), # screen size of cv2 window 'dt': 0.025, # time interval between two frames 'ego_vehicle_filter': 'vehicle.lincoln*', # filter for defining ego vehicle 'port': int(2000 + 3 * args.num_actors), # connection port 'task_mode': 'Straight', # mode of the task, [random, roundabout (only for Town03)] 'code_mode': 'test', 'max_time_episode': 500, # maximum timesteps per episode 'desired_speed': 8, # desired speed (m/s) 'max_ego_spawn_times': 100, # maximum times to spawn ego vehicle } self.stop_sign = shared_value[1] self.iteration_counter = shared_value[2] self.iteration = self.iteration_counter.value self.args = args self.env = gym.make(args.env_name, params=test_params) self.device = torch.device("cpu") self.actor = PolicyNet(args).to(self.device) self.actor_share = share_net[0] self.log_alpha = share_net[1] self.test_step = 0 self.episode_num = 10 self.test_interval = 20000 self.start_time = time.time() self.list_of_n_episode_rewards_history = [] self.time_history = [] self.alpha_history = [] self.average_return_with_diff_base_history = [] self.average_reward_history = [] self.iteration_history = [] self.accel_history = [] self.steer_history = [] def run_an_episode(self): reward_list = [] accel_list = [] steer_list = [] done = 0 state, info = self.env.reset() while not done and len(reward_list) < self.args.max_step: state_tensor = torch.FloatTensor(state.copy()).float().to( self.device) info_tensor = torch.FloatTensor(info.copy()).float().to( self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) # 3, 64, 160 u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), True) u = u.squeeze(0) state, reward, done, info = self.env.step(u) #self.env.render(mode='human') reward_list.append(reward) accel_list.append(u[0]) steer_list.append(u[1]) episode_return = sum(reward_list) episode_len = len(reward_list) episode_accel = np.mean(accel_list) episode_steer = np.mean(steer_list) return np.array( reward_list ), episode_return, episode_len, episode_accel, episode_steer def average_max_n(self, list_for_average, n): sorted_list = sorted(list_for_average, reverse=True) return sum(sorted_list[:n]) / n def run_n_episodes(self, n): assert n >= 5, "n must be at least 5" list_of_n_episode_rewards = [] list_of_return = [] list_of_len = [] list_of_accel = [] list_of_steer = [] for _ in range(n): reward_list, episode_return, episode_len, episode_accel, episode_steer = self.run_an_episode( ) list_of_n_episode_rewards.append(reward_list) list_of_return.append(episode_return) list_of_len.append(episode_len) list_of_accel.append(episode_accel) list_of_steer.append(episode_steer) average_return_with_diff_base = np.array( [self.average_max_n(list_of_return, x) for x in [1, 3, 5]]) average_reward = sum(list_of_return) / sum(list_of_len) avg_accel = sum(list_of_accel) / sum(list_of_len) avg_steer = sum(list_of_steer) / sum(list_of_len) return np.array( list_of_n_episode_rewards ), average_return_with_diff_base, average_reward, avg_accel, avg_steer def run(self): while not self.stop_sign.value: if self.iteration_counter.value % self.test_interval == 0: self.iteration = self.iteration_counter.value self.actor.load_state_dict(self.actor_share.state_dict()) delta_time = time.time() - self.start_time list_of_n_episode_rewards, average_return_with_diff_base, 
average_reward, avg_accel, avg_steer = self.run_n_episodes( self.episode_num) self.iteration_history.append(self.iteration) self.time_history.append(delta_time) self.list_of_n_episode_rewards_history.append( list_of_n_episode_rewards) self.average_return_with_diff_base_history.append( average_return_with_diff_base) self.average_reward_history.append(average_reward) self.alpha_history.append(self.log_alpha.detach().exp().item()) self.accel_history.append(avg_accel) self.steer_history.append(avg_steer) print('Saving test data of the {} iteration.'.format( self.iteration)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/iteration', np.array(self.iteration_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/time', np.array(self.time_history)) # np.save('./' + self.args.env_name + '/method_' + str(self.args.method) + '/result/list_of_n_episode_rewards', # np.array(self.list_of_n_episode_rewards_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/average_return_with_diff_base', np.array(self.average_return_with_diff_base_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/average_reward', np.array(self.average_reward_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/alpha', np.array(self.alpha_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/accel', np.array(self.accel_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/steer', np.array(self.steer_history)) # plot_online(self.args.env_name, self.args.method, self.args.method_name) if self.iteration >= self.args.max_train: self.stop_sign.value = 1 break
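# --- Editor's note (sketch) ----------------------------------------------------------
# The arrays written by Test.run() via np.save can be inspected offline; the paths
# below mirror those save calls (np.save appends '.npy'). The plotting itself is an
# illustrative sketch, not the project's plot_online utility.
import numpy as np
import matplotlib.pyplot as plt

def plot_test_reward(env_name, method):
    prefix = './' + env_name + '/method_' + str(method) + '/result/'
    iteration = np.load(prefix + 'iteration.npy')
    average_reward = np.load(prefix + 'average_reward.npy')
    plt.plot(iteration, average_reward)
    plt.xlabel('training iteration')
    plt.ylabel('average reward per step')
    plt.show()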
class Actor(): def __init__(self, args, shared_queue, shared_value, share_net, lock, i): super(Actor, self).__init__() self.agent_id = i seed = args.seed + np.int64(self.agent_id) np.random.seed(seed) torch.manual_seed(seed) actor_params = { 'obs_size': (160, 100), # screen size of cv2 window 'dt': 0.025, # time interval between two frames 'ego_vehicle_filter': 'vehicle.lincoln*', # filter for defining ego vehicle 'port': int(2000 + 3 * self.agent_id), # connection port 'task_mode': 'Straight', # mode of the task, [random, roundabout (only for Town03)] 'code_mode': 'train', 'max_time_episode': 100, # maximum timesteps per episode 'desired_speed': 15, # desired speed (m/s) 'max_ego_spawn_times': 100, # maximum times to spawn ego vehicle } self.counter = shared_value[0] self.stop_sign = shared_value[1] self.lock = lock self.env = gym.make(args.env_name, params=actor_params) self.args = args self.experience_in_queue = [] for i in range(args.num_buffers): self.experience_in_queue.append(shared_queue[0][i]) self.device = torch.device("cpu") self.actor = PolicyNet(args).to(self.device) # self.Q_net1 = QNet(args).to(self.device) #share_net = [Q_net1,Q_net1_target,Q_net2,Q_net2_target,actor,actor_target,log_alpha] #share_optimizer=[Q_net1_optimizer,Q_net2_optimizer,actor_optimizer,alpha_optimizer] self.Q_net1_share = share_net[1] self.actor_share = share_net[0] def put_data(self): if not self.stop_sign.value: index = np.random.randint(0, self.args.num_buffers) if self.experience_in_queue[index].full(): #print("agent", self.agent_id, "is waiting queue space") time.sleep(0.5) self.put_data() else: self.experience_in_queue[index].put((self.state, self.info, self.u, \ [self.reward*self.args.reward_scale], self.state_next, self.info_next, [self.done], self.TD.detach().cpu().numpy().squeeze())) else: pass def run(self): time_init = time.time() step = 0 while not self.stop_sign.value: self.state, self.info = self.env.reset() self.episode_step = 0 for i in range(self.args.max_step - 1): state_tensor = torch.FloatTensor(self.state.copy()).float().to( self.device) info_tensor = torch.FloatTensor(self.info.copy()).float().to( self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, _, _ = self.actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), False) self.u = self.u.squeeze(0) self.state_next, self.reward, self.done, self.info_next = self.env.step( self.u) self.TD = torch.zeros(1) self.put_data() self.state = self.state_next.copy() self.info = self.info_next.copy() with self.lock: self.counter.value += 1 if self.done == True: break if step % self.args.load_param_period == 0: #self.Q_net1.load_state_dict(self.Q_net1_share.state_dict()) self.actor.load_state_dict(self.actor_share.state_dict()) step += 1 self.episode_step += 1
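# --- Editor's note (assumed wiring, launcher not shown in this file) -----------------
# Actor expects shared_queue[0] to be a list of args.num_buffers experience queues,
# shared_value to hold (counter, stop_sign, ...) and share_net to hold the shared
# policy/Q networks. A minimal sketch of how a parent process might build these with
# torch.multiprocessing; the queue size and worker wrapper are illustrative assumptions.
import torch.multiprocessing as mp

def actor_worker(args, shared_queue, shared_value, share_net, lock, agent_id):
    # build the Actor inside the child process so the CARLA/gym handles stay local
    Actor(args, shared_queue, shared_value, share_net, lock, agent_id).run()

def launch_actors(args, actor_share, Q_net1_share):
    actor_share.share_memory()
    Q_net1_share.share_memory()
    share_net = [actor_share, Q_net1_share]          # indices match Actor.__init__ above
    experience_queues = [mp.Queue(maxsize=50) for _ in range(args.num_buffers)]
    shared_queue = [experience_queues]
    counter = mp.Value('i', 0)                       # total collected steps
    stop_sign = mp.Value('i', 0)                     # workers exit when set to 1
    shared_value = [counter, stop_sign]
    lock = mp.Lock()
    workers = [mp.Process(target=actor_worker,
                          args=(args, shared_queue, shared_value, share_net, lock, i))
               for i in range(args.num_actors)]
    for w in workers:
        w.start()
    return workers, shared_value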
class Actor(): def __init__(self, args, shared_queue, shared_value, share_net, lock, i): super(Actor, self).__init__() self.agent_id = i seed = args.seed + np.int64(self.agent_id) np.random.seed(seed) torch.manual_seed(seed) self.counter = shared_value[0] self.stop_sign = shared_value[1] self.lock = lock self.env = gym.make(args.env_name) self.args = args self.experience_in_queue = [] for i in range(args.num_buffers): self.experience_in_queue.append(shared_queue[0][i]) self.device = torch.device("cpu") self.actor = PolicyNet(args).to(self.device) self.Q_net1 = QNet(args).to(self.device) #share_net = [Q_net1,Q_net1_target,Q_net2,Q_net2_target,actor,actor_target,log_alpha] #share_optimizer=[Q_net1_optimizer,Q_net2_optimizer,actor_optimizer,alpha_optimizer] self.Q_net1_share = share_net[1] self.actor_share = share_net[0] def put_data(self): if not self.stop_sign.value: index = np.random.randint(0, self.args.num_buffers) if self.experience_in_queue[index].full(): #print("agent", self.agent_id, "is waiting queue space") time.sleep(0.5) self.put_data() else: self.experience_in_queue[index].put( (self.last_state, self.last_u, [self.reward * self.args.reward_scale], self.state, [self.done], self.TD.detach().cpu().numpy().squeeze())) else: pass def run(self): time_init = time.time() step = 0 while not self.stop_sign.value: self.state = self.env.reset() self.episode_step = 0 state_tensor = torch.FloatTensor(self.state.copy()).float().to( self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, _ = self.actor.get_action(state_tensor.unsqueeze(0), False) #q_1 = self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0] self.u = self.u.squeeze(0) self.last_state = self.state.copy() self.last_u = self.u.copy() #last_q_1 = q_1 for i in range(self.args.max_step - 1): self.state, self.reward, self.done, _ = self.env.step(self.u) state_tensor = torch.FloatTensor(self.state.copy()).float().to( self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, _ = self.actor.get_action(state_tensor.unsqueeze(0), False) #q_1 = self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0] self.u = self.u.squeeze(0) self.TD = torch.zeros( 1 ) #self.reward + (1 - self.done) * self.args.gamma * q_1 - last_q_1 self.put_data() self.last_state = self.state.copy() self.last_u = self.u.copy() #last_q_1 = q_1 with self.lock: self.counter.value += 1 if self.done == True: break if step % self.args.load_param_period == 0: #self.Q_net1.load_state_dict(self.Q_net1_share.state_dict()) self.actor.load_state_dict(self.actor_share.state_dict()) step += 1 self.episode_step += 1
class Simulation(): def __init__(self, args, shared_value): super(Simulation, self).__init__() seed = args.seed np.random.seed(seed) torch.manual_seed(seed) self.stop_sign = shared_value[1] self.args = args self.env = gym.make(args.env_name) self.device = torch.device("cpu") self.load_index = self.args.max_train self.actor = PolicyNet(args).to(self.device) self.actor.load_state_dict( torch.load('./' + self.args.env_name + '/method_' + str(self.args.method) + '/model/policy1_' + str(self.load_index) + '.pkl', map_location='cpu')) self.Q_net1 = QNet(args).to(self.device) self.Q_net1.load_state_dict( torch.load('./' + self.args.env_name + '/method_' + str(self.args.method) + '/model/Q1_' + str(self.load_index) + '.pkl', map_location='cpu')) if self.args.double_Q: self.Q_net2 = QNet(args).to(self.device) self.Q_net2.load_state_dict( torch.load('./' + self.args.env_name + '/method_' + str(self.args.method) + '/model/Q2_' + str(self.load_index) + '.pkl', map_location='cpu')) self.test_step = 0 self.save_interval = 10000 self.iteration = 0 self.reward_history = [] self.entropy_history = [] self.epoch_history = [] self.done_history = [] self.Q_real_history = [] self.Q_history = [] self.Q_std_history = [] def run(self): alpha = 0.004 step = 0 while True: self.state = self.env.reset() self.episode_step = 0 for i in range(300): state_tensor = torch.FloatTensor(self.state.copy()).float().to( self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, log_prob, _ = self.actor.get_action( state_tensor.unsqueeze(0), True) q = self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0] if self.args.double_Q: q = torch.min( self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to( self.device))[0], self.Q_net2(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to( self.device))[0]) self.u = self.u.squeeze(0) self.state, self.reward, self.done, _ = self.env.step(self.u) self.Q_history.append(q.detach().item()) self.reward_history.append(self.reward) self.done_history.append(self.done) self.entropy_history.append(log_prob) if step % 10000 >= 0 and step % 10000 <= 9999: self.env.render(mode='human') if self.done == True: time.sleep(1) print("!!!!!!!!!!!!!!!") break step += 1 self.episode_step += 1 if self.done == True: pass #break print(self.reward_history) for i in range(len(self.Q_history)): a = 0 for j in range(i, len(self.Q_history), 1): a += pow(self.args.gamma, j - i) * self.reward_history[j] for z in range(i + 1, len(self.Q_history), 1): a -= alpha * pow(self.args.gamma, z - i) * self.entropy_history[z] self.Q_real_history.append(a) plt.figure() x = np.arange(0, len(self.Q_history), 1) plt.plot(x, np.array(self.Q_history), 'r', linewidth=2.0) plt.plot(x, np.array(self.Q_real_history), 'k', linewidth=2.0) plt.show()
class Actor(): def __init__(self, args, shared_queue, shared_value, lock, i): super(Actor, self).__init__() self.agent_id = i seed = args.seed + np.int64(self.agent_id) np.random.seed(seed) torch.manual_seed(seed) self.experience_queue = shared_queue[0] self.policy_param_queue = shared_queue[1] self.q_param_queue = shared_queue[2] self.counter = shared_value[0] self.stop_sign = shared_value[1] self.lock = lock self.env = gym.make(args.env_name) self.args = args self.device = torch.device("cpu") self.actor = PolicyNet(args.state_dim, args.num_hidden_cell, args.action_high, args.action_low, args.NN_type).to(self.device) self.Q_net1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device) def update_actor_net(self, current_dict, actor_net): params_target = get_flat_params_from(actor_net) params = get_flat_params_from_dict(current_dict) set_flat_params_to(actor_net, (1 - self.args.syn_tau) * params_target + self.args.syn_tau * params) def load_param(self): if self.policy_param_queue.empty(): #pass #print("agent", self.agent_id, "is waiting param") time.sleep(0.5) #self.load_param() else: param = self.policy_param_queue.get() if self.args.syn_method == "copy": self.actor.load_state_dict(param) elif self.args.syn_method == "slow": self.update_actor_net(param, self.actor) if self.q_param_queue.empty(): time.sleep(0.5) #self.load_param() else: param = self.q_param_queue.get() self.Q_net1.load_state_dict(param) def put_data(self): if not self.stop_sign.value: if self.experience_queue.full(): #print("agent", self.agent_id, "is waiting queue space") time.sleep(0.5) self.put_data() else: self.experience_queue.put( (self.last_state, self.last_u, [self.reward], self.state, [self.micro_step], [self.done], self.TD.detach().cpu().numpy().squeeze())) else: pass def run(self): time_init = time.time() step = 0 self.micro_step = 0 while not self.stop_sign.value: self.state = self.env.reset() self.episode_step = 0 state_tensor = torch.FloatTensor(self.state.copy()).float().to( self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), False) q_1 = self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0] self.u = self.u.squeeze(0) self.last_state = self.state.copy() self.last_u = self.u.copy() last_q_1 = q_1 for i in range(self.args.max_step): self.state, self.reward, self.done, _ = self.env.step(self.u) state_tensor = torch.FloatTensor(self.state.copy()).float().to( self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, log_prob = self.actor.get_action( state_tensor.unsqueeze(0), False) q_1 = self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0] self.u = self.u.squeeze(0) if self.episode_step > 0: self.TD = self.reward + ( 1 - self.done) * self.args.gamma * q_1 - last_q_1 self.put_data() self.last_state = self.state.copy() self.last_u = self.u.copy() last_q_1 = q_1 with self.lock: self.counter.value += 1 if self.done == True: break if step % self.args.load_param_period == 0: self.load_param() step += 1 self.episode_step += 1
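# --- Editor's note (assumed helpers, imported from elsewhere) ------------------------
# update_actor_net() relies on get_flat_params_from, get_flat_params_from_dict and
# set_flat_params_to, which are not defined in this file. A common implementation
# consistent with that usage (TRPO-style flat-parameter utilities) is sketched below;
# treat it as an assumption rather than the authors' exact code.
import torch

def get_flat_params_from(model):
    return torch.cat([p.data.view(-1) for p in model.parameters()])

def get_flat_params_from_dict(state_dict):
    return torch.cat([v.view(-1) for v in state_dict.values()])

def set_flat_params_to(model, flat_params):
    idx = 0
    for p in model.parameters():
        n = p.numel()
        p.data.copy_(flat_params[idx:idx + n].view_as(p))
        idx += n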
class Simulation(): def __init__(self, args, shared_queue,shared_value): super(Simulation, self).__init__() seed = args.seed np.random.seed(seed) torch.manual_seed(seed) self.policy_test_queue = shared_queue[3] self.stop_sign = shared_value[1] self.args = args self.env = gym.make(args.env_name) self.device = torch.device("cpu") self.load_index = 20000 self.actor = PolicyNet(args.state_dim, args.num_hidden_cell, args.action_high, args.action_low,args.NN_type).to(self.device) self.actor.load_state_dict(torch.load('./data/method_' + str(1) + '/model/policy_' + str(self.load_index) + '.pkl')) self.Q_net1_m0 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device) self.Q_net1_m0.load_state_dict(torch.load('./data/method_' + str(0) + '/model/Q1_' + str(self.load_index) + '.pkl')) self.Q_net1_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device) self.Q_net1_m1.load_state_dict(torch.load('./data/method_' + str(1) + '/model/Q1_' + str(self.load_index) + '.pkl')) self.Q_net2_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device) self.Q_net2_m1.load_state_dict(torch.load('./data/method_' + str(1) + '/model/Q2_' + str(self.load_index) + '.pkl')) self.Q_net1_m2 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device) self.Q_net1_m2.load_state_dict(torch.load('./data/method_' + str(2) + '/model/Q1_' + str(self.load_index) + '.pkl')) self.test_step = 0 self.save_interval = 10000 self.iteration = 0 self.reward_history = [] self.entropy_history = [] self.epoch_history =[] self.done_history = [] self.Q_real_history = [] self.Q_m0_history =[] self.Q_m1_history = [] self.Q_m2_history = [] self.Q_std_m2_history = [] def load_param(self): if self.policy_test_queue.empty(): pass else: self.iteration, param = self.policy_test_queue.get() self.actor.load_state_dict(param) def run(self): step = 0 while True: self.state = self.env.reset() self.episode_step = 0 state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), False) for i in range(self.args.max_step): q_m0 = self.Q_net1_m0(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0] q_m1 = torch.min( self.Q_net1_m1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0], self.Q_net2_m1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0]) q_m2, q_std, _ = self.Q_net1_m2.evaluate(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device)) self.Q_m0_history.append(q_m0.detach().item()) self.Q_m1_history.append(q_m1.detach().item()) self.Q_m2_history.append(q_m2.detach().item()) self.Q_std_m2_history.append(q_std.detach().item()) self.u = self.u.squeeze(0) self.state, self.reward, self.done, _ = self.env.step(self.u) self.reward_history.append(self.reward) self.done_history.append(self.done) self.entropy_history.append(log_prob) if step%10000 >=0 and step%10000 <=9999: self.env.render(mode='human') state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), False) if self.done == True: time.sleep(1) print("!!!!!!!!!!!!!!!") break step += 1 self.episode_step += 1 if self.done == True: break for i in range(len(self.Q_m0_history)): a = 0 for 
j in range(i, len(self.Q_m0_history), 1):
                a += pow(self.args.gamma, j - i) * self.reward_history[j]
            for z in range(i + 1, len(self.Q_m0_history), 1):
                a -= self.args.alpha * pow(self.args.gamma, z - i) * self.entropy_history[z]
            self.Q_real_history.append(a)
        print(self.reward_history)
        print(self.entropy_history)
        print(self.Q_m2_history)
        print(self.Q_std_m2_history)
        plt.figure()
        x = np.arange(0, len(self.Q_m0_history), 1)
        plt.plot(x, np.array(self.Q_m0_history), 'r', linewidth=2.0)
        plt.plot(x, np.array(self.Q_m1_history), 'g', linewidth=2.0)
        plt.plot(x, np.array(self.Q_m2_history), 'b', linewidth=2.0)
        plt.plot(x, np.array(self.Q_real_history), 'k', linewidth=2.0)
        plt.show()
class Evaluator(object): def __init__(self, args, shared_value, share_net): seed = args.seed np.random.seed(seed) torch.manual_seed(seed) self.stop_sign = shared_value[1] self.iteration_counter = shared_value[2] self.iteration = self.iteration_counter.value self.share_net = share_net self.args = args self.env = gym.make(args.env_name) self.device = torch.device("cpu") self.actor = PolicyNet(args).to(self.device) self.Q_net1 = QNet(args).to(self.device) self.Q_net2 = QNet(args).to(self.device) self.actor_share = share_net[4] self.Q_net1_share = share_net[0] self.Q_net2_share = share_net[2] self.log_alpha_share = share_net[-1] self.alpha = np.exp(self.log_alpha_share.detach().item()) if args.alpha == 'auto' else 0 self.evaluation_interval = 50000 self.max_state_num_evaluated_in_an_episode = 500 self.episode_num_to_run = 10 self.iteration_history = [] self.evaluated_Q_mean_history=[] self.true_gamma_return_mean_history=[] # self.n_episodes_info_history = [] self.evaluated_Q_history = [] self.true_gamma_return_history = [] def run_an_episode(self): state_list = [] action_list = [] log_prob_list = [] reward_list = [] evaluated_Q_list = [] done = 0 state = self.env.reset() while not done and len(reward_list) < self.args.max_step: state_tensor = torch.FloatTensor(state.copy()).float().to(self.device) u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), self.args.stochastic_actor) state_list.append(state.copy()) action_list.append(u.copy()) log_prob_list.append(log_prob) if self.args.double_Q and not self.args.double_actor: q = torch.min( self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0], self.Q_net2(state_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0]) else: q = self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0] evaluated_Q_list.append(q.detach().item()) u = u.squeeze(0) state, reward, done, load_action = self.env.step(u) # self.env.render(mode='human') reward_list.append(reward * self.args.reward_scale) entropy_list = list(-self.alpha * np.array(log_prob_list)) true_gamma_return_list = cal_gamma_return_of_an_episode(reward_list, entropy_list, self.args.gamma) episode_return = sum(reward_list) episode_len = len(reward_list) return dict(state_list=np.array(state_list), action_list=np.array(action_list), log_prob_list=np.array(log_prob_list), reward_list=np.array(reward_list), evaluated_Q_list=np.array(evaluated_Q_list), true_gamma_return_list=true_gamma_return_list, episode_return=episode_return, episode_len=episode_len) def run_n_episodes(self, n, max_state): n_episode_state_list = [] n_episode_action_list = [] n_episode_log_prob_list = [] n_episode_reward_list = [] n_episode_evaluated_Q_list = [] n_episode_true_gamma_return_list = [] n_episode_return_list = [] n_episode_len_list = [] for _ in range(n): episode_info = self.run_an_episode() n_episode_state_list.append(episode_info['state_list']) n_episode_action_list.append(episode_info['action_list']) n_episode_log_prob_list.append(episode_info['log_prob_list']) n_episode_reward_list.append(episode_info['reward_list']) n_episode_evaluated_Q_list.append(episode_info['evaluated_Q_list']) n_episode_true_gamma_return_list.append(episode_info['true_gamma_return_list']) n_episode_return_list.append(episode_info['episode_return']) n_episode_len_list.append(episode_info['episode_len']) #n_episode_evaluated_Q_list_history = list(map(lambda x: x['n_episode_evaluated_Q_list'], n_episodes_info_history)) #n_episode_true_gamma_return_list_history = 
list(map(lambda x: x['n_episode_true_gamma_return_list'], n_episodes_info_history)) def concat_interest_epi_part_of_one_ite_and_mean(list_of_n_epi): tmp = list(copy.deepcopy(list_of_n_epi)) tmp[0] = tmp[0] if len(tmp[0]) <= max_state else tmp[0][:max_state] def reduce_fuc(a, b): return np.concatenate([a, b]) if len(b) < max_state else np.concatenate([a, b[:max_state]]) interest_epi_part_of_one_ite = reduce(reduce_fuc, tmp) return sum(interest_epi_part_of_one_ite) / len(interest_epi_part_of_one_ite) evaluated_Q_mean = concat_interest_epi_part_of_one_ite_and_mean(np.array(n_episode_evaluated_Q_list)) true_gamma_return_mean = concat_interest_epi_part_of_one_ite_and_mean( np.array(n_episode_true_gamma_return_list)) return evaluated_Q_mean, true_gamma_return_mean # return dict(n_episode_state_list=np.array(n_episode_state_list), # n_episode_action_list=np.array(n_episode_action_list), # n_episode_log_prob_list=np.array(n_episode_log_prob_list), # n_episode_reward_list=np.array(n_episode_reward_list), # n_episode_evaluated_Q_list=np.array(n_episode_evaluated_Q_list), # n_episode_true_gamma_return_list=np.array(n_episode_true_gamma_return_list), # n_episode_return_list=np.array(n_episode_return_list), # n_episode_len_list=np.array(n_episode_len_list)) def run(self): while not self.stop_sign.value: if self.iteration_counter.value % self.evaluation_interval == 0: self.alpha = np.exp(self.log_alpha_share.detach().item()) if self.args.alpha == 'auto' else 0 self.iteration = self.iteration_counter.value self.actor.load_state_dict(self.actor_share.state_dict()) self.Q_net1.load_state_dict(self.Q_net1_share.state_dict()) self.Q_net2.load_state_dict(self.Q_net2_share.state_dict()) evaluated_Q_mean, true_gamma_return_mean = self.run_n_episodes(self.episode_num_to_run,self.max_state_num_evaluated_in_an_episode) self.iteration_history.append(self.iteration) self.evaluated_Q_mean_history.append(evaluated_Q_mean) self.true_gamma_return_mean_history.append(true_gamma_return_mean) print('Saving evaluation results of the {} iteration.'.format(self.iteration)) np.save('./' + self.args.env_name + '/method_' + str(self.args.method) + '/result/iteration_evaluation', np.array(self.iteration_history)) np.save('./' + self.args.env_name + '/method_' + str(self.args.method) + '/result/evaluated_Q_mean', np.array(self.evaluated_Q_mean_history)) np.save('./' + self.args.env_name + '/method_' + str( self.args.method) + '/result/true_gamma_return_mean', np.array(self.true_gamma_return_mean_history)) plot_online(self.args.env_name, self.args.method, self.args.method_name, self.max_state_num_evaluated_in_an_episode)
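# --- Editor's note (usage sketch) ----------------------------------------------------
# concat_interest_epi_part_of_one_ite_and_mean keeps at most the first `max_state`
# entries of every episode, concatenates them, and returns their mean. A toy
# self-contained illustration of the same truncate-concatenate-average logic:
import copy
import numpy as np
from functools import reduce

def truncated_mean(list_of_episode_arrays, max_state):
    tmp = [np.asarray(ep)[:max_state] for ep in copy.deepcopy(list_of_episode_arrays)]
    concatenated = reduce(lambda a, b: np.concatenate([a, b]), tmp)
    return concatenated.sum() / len(concatenated)

episodes = [np.arange(4.0), np.arange(10.0)]    # two episodes of different length
print(truncated_mean(episodes, max_state=5))    # averages over the 4 + 5 kept values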
class Test(): def __init__(self, args, shared_queue, shared_value): super(Test, self).__init__() seed = args.seed np.random.seed(seed) torch.manual_seed(seed) self.policy_test_queue = shared_queue[3] self.stop_sign = shared_value[1] self.args = args self.env = gym.make(args.env_name) self.device = torch.device("cpu") self.actor = PolicyNet(args.state_dim, args.num_hidden_cell, args.action_high, args.action_low, args.NN_type).to(self.device) self.test_step = 0 self.epoch_length = 1000 self.save_interval = 10000 self.iteration = 0 self.reward_history = [] self.iteration_history = [] def load_param(self): if self.policy_test_queue.empty(): pass else: self.iteration, param = self.policy_test_queue.get() self.actor.load_state_dict(param) def run(self): epoch = 0 step = 0 epoch_reward = 0 """ write_stop = 0 writer = SummaryWriter(comment="test", log_dir='compare'+str(self.args.method)) """ while not self.stop_sign.value: self.state = self.env.reset() self.episode_step = 0 self.micro_step = 0 state_tensor = torch.FloatTensor(self.state.copy()).float().to( self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), False) self.u = self.u.squeeze(0) accumulate_reward = 0 for i in range(self.args.max_step): self.state, self.reward, self.done, self.load_action = self.env.step( self.u) if step % 10000 >= 0 and step % 10000 <= 9999: epoch_reward += self.reward / self.epoch_length self.env.render(mode='human') state_tensor = torch.FloatTensor(self.state.copy()).float().to( self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, log_prob = self.actor.get_action( state_tensor.unsqueeze(0), False) self.u = self.u.squeeze(0) if self.done == True: time.sleep(1) break step += 1 self.episode_step += 1 if step % self.epoch_length == 0: self.iteration_history.append(self.iteration) self.reward_history.append(epoch_reward) self.load_param() epoch_reward = 0 epoch += 1 print(epoch) if step % self.save_interval == 0: np.save( './data/method_' + str(self.args.method) + '/result/iteration', np.array(self.iteration_history)) np.save( './data/method_' + str(self.args.method) + '/result/reward', np.array(self.reward_history)) if self.iteration >= self.args.max_train: self.stop_sign.value = 1 break """
class Application(): def __init__(self, args): super(Application, self).__init__() seed = args.seed np.random.seed(seed) torch.manual_seed(seed) self.args = args self.device = torch.device("cpu") self.load_index = self.args.max_train self.PI_net = PINet(args).to(self.device) self.PI_net.load_state_dict( torch.load('./Net1-0_0816/PI_' + str(self.load_index) + '.pkl', map_location='cpu')) self.actor = PolicyNet(args).to(self.device) self.actor.load_state_dict( torch.load('./Net1-0_0816/policy1_' + str(self.load_index) + '.pkl', map_location='cpu')) # self.Q_net0 = QNet(args).to(self.device) # self.Q_net0.load_state_dict(torch.load('./Net1-0_0816/Q1_' + str(self.load_index) + '.pkl',map_location='cpu')) self.speed_limit_max = 20 / 3.6 self.steer_factor = 20 self.action_multiplier = np.array( [math.pi / (9 * self.steer_factor), 2], dtype=np.float32) self.safe_gap = 0. self.noise_factor = 0. self.step = 0 self.frequency = 10 self.lanekeeping_max = 5 self.lane_number = 2 self.lane_width = 3.5 self.car_length = 4.5 self.car_width = 1.855 self.road_width = self.lane_number * self.lane_width self.detect_rear_dist = 80 self.detect_front_dist = 100 self.other_length = 5.169 self.other_width = 2.392 self.position_index = 0 self.position_other = np.zeros(20, dtype=np.int) self.max_state_other = np.array([ 100, self.road_width, self.speed_limit_max, math.pi / 6, 6.5, 2.5 ], dtype=np.float32) self.max_state_ego = np.array( [self.speed_limit_max, 1, math.pi / 6, math.pi / 6, math.pi / 6, 4, self.lane_width / 2, self.road_width, self.road_width, self.speed_limit_max, self.speed_limit_max, self.lane_number - 1, 5, 2, \ math.pi / 6, math.pi / 6, math.pi / 6, math.pi / 6, math.pi / 6], dtype=np.float32) self.wheel_steer_bound = [-1.5 * math.pi, 1.5 * math.pi] self.lane_list, self.lane_center_list, self.road_angle = load_map() def _get_dist_to_roadside(self, lane_index, dist_to_center): dist_left = (self.lane_number - 1 - lane_index ) * self.lane_width + self.lane_width / 2 - dist_to_center dist_right = lane_index * self.lane_width + self.lane_width / 2 + dist_to_center return dist_left, dist_right def _get_road_related_info(self, ego_x, ego_y, ego_angle): self.ego_x = ego_x self.ego_y = ego_y if self.step >= 1: self.lane_index_old = self.lane_index for i in range(max(self.position_index - 100, 0), len(self.road_angle) - 1, 1): if self.ego_x <= self.lane_list[1][ i, 0] and self.ego_x >= self.lane_list[1][i + 1, 0]: self.position_index = i if ego_y <= self.lane_list[1][i, 1]: lane_index = 1 else: lane_index = 0 break if i == len(self.road_angle) - 2: lane_index = 0 self.position_index = i # print("lane_index",lane_index,"road_angle",len(self.road_angle)) if ego_y > self.lane_center_list[lane_index][self.position_index, 1]: dist_flag = -1.0 else: dist_flag = 1.0 dist2center = dist_flag * np.sqrt( (ego_x - self.lane_center_list[lane_index][self.position_index, 0])**2 + (ego_y - self.lane_center_list[lane_index][self.position_index, 1])**2) dist_left, dist_right = self._get_dist_to_roadside( lane_index, dist2center) self.lane_index = lane_index self.dist2center = np.float32(dist2center) self.dist_left = np.float32(dist_left) self.dist_right = np.float32(dist_right) self.delta_angle = -(ego_angle - self.road_angle[self.position_index]) self.current_road_angle = self.road_angle[self.position_index] # print(self.lane_index,self.dist2center,self.dist_left ,self.dist_right,self.delta_angle ) def _get_next_vehicle(self, t_interval): lane_list = [] for i in range(len(self.x_other)): for j in 
range(max(self.position_other[i] - 100, 0), len(self.road_angle) - 1, 1): if self.x_other[i] <= self.lane_center_list[0][ j, 0] and self.x_other[i] >= self.lane_center_list[0][ j + 1, 0]: index = j break dist_1_0 = np.sqrt( (self.x_other[i] - self.lane_center_list[0][index, 0])**2 + (self.y_other[i] - self.lane_center_list[0][index, 1])**2) dist_1_1 = np.sqrt( (self.x_other[i] - self.lane_center_list[1][index, 0])**2 + (self.y_other[i] - self.lane_center_list[1][index, 1])**2) self.position_other[i] = index if dist_1_0 < dist_1_1: lane_list.append(0) else: lane_list.append(1) x_next = [] y_next = [] heading_next = [] for i in range(len(self.x_other)): x_next.append(self.x_other[i] - self.v_other[i] * t_interval) if len(self.road_angle) - self.position_other[i] < 1000: x_next[i] = self.lane_center_list[lane_list[i]][0, 0] y_next.append(self.lane_center_list[lane_list[i]][0, 1]) heading_next.append(self.road_angle[0]) self.position_other[i] = 0 else: y_next.append( self.lane_center_list[lane_list[i]][self.position_other[i], 1]) heading_next.append(self.road_angle[self.position_other[i]]) # for j in range(len(self.road_angle) - 1): # if x_next[i]<=self.lane_center_list[lane_list[i]][j, 0] and x_next[i]>=self.lane_center_list[lane_list[i]][j+1, 0]: # y_next.append(self.lane_center_list[lane_list[i]][j, 1]) return x_next, y_next, self.v_other, heading_next def _get_ego_state(self, v=1, v_lat=0, yaw_rate=0, wheel_steer=0, acc=0): if self.step == 0: self.lanekeeping_time = 4 else: if self.lane_index == self.lane_index_old: self.lanekeeping_time += 1 / self.frequency else: self.lanekeeping_time = 0 self.lanekeeping_time = min(self.lanekeeping_time, self.lanekeeping_max) self.state_ego_dict_real = dict( v=v, v_lat=v_lat, yaw_rate=yaw_rate * math.pi / 180, heading=self.delta_angle * math.pi / 180, steer=wheel_steer / self.steer_factor * math.pi / 180, acc=acc, #v_long=self.ego_dynamics['Longitudinal_speed'], dist2center=self.dist2center, dist2road_bound_1=self.dist_left - self.car_width / 2 - self.safe_gap, dist2road_bound_2=self.dist_right - self.car_width / 2 - self.safe_gap, dist2speed_limit=self.speed_limit_max - v, dist2speed_limit_low=v - 0, lane=self.lane_index, other_veh_num=self.veh_num, lanekeeping_time=self.lanekeeping_time, future_heading_10=0, future_heading_20=0, future_heading_30=0, future_heading_40=0, future_heading_50=0, ) position_noise = self.noise_factor * np.clip( np.random.normal(0, 0.033), -0.1, 0.1) heading_noise = self.noise_factor * np.clip(np.random.normal(0, 0.33), -1, 1) self.state_ego_dict = dict( v=v, v_lat=v_lat, yaw_rate=yaw_rate * math.pi / 180, heading=(self.delta_angle + heading_noise) * math.pi / 180, steer=wheel_steer / self.steer_factor * math.pi / 180, acc=acc, #v_long=self.ego_dynamics['Longitudinal_speed'], dist2center=self.dist2center + position_noise, dist2road_bound_1=self.dist_left - self.car_width / 2 - self.safe_gap - position_noise, dist2road_bound_2=self.dist_right - self.car_width / 2 - self.safe_gap + position_noise, dist2speed_limit=self.speed_limit_max - v, dist2speed_limit_low=v - 0, lane=self.lane_index, other_veh_num=self.veh_num, lanekeeping_time=self.lanekeeping_time, future_heading_10=0, future_heading_20=0, future_heading_30=0, future_heading_40=0, future_heading_50=0, ) self.state_ego = np.array(list(self.state_ego_dict.values()), dtype=np.float32) / self.max_state_ego def _get_other_info(self, v_ego, x, y, v, heading): self.x_other = x self.y_other = y self.v_other = v heading_other = heading self.veh_num = 0 veh_index = [] for i in 
range(len(x)): if self.ego_x - x[i] < self.detect_front_dist and x[ i] - self.ego_x < self.detect_rear_dist: self.veh_num += 1 veh_index.append(i) if self.veh_num != 0: self.element_ori = np.zeros([len(veh_index), 6], dtype=np.float32) self.element_ori_real = self.element_ori.copy() for i in range(len(veh_index)): other_x = x[veh_index[i]] other_y = y[veh_index[i]] other_v = v[veh_index[i]] other_heading = heading_other[veh_index[i]] delta_x = self.ego_x - other_x delta_y = self.ego_y - other_y dist_ego2other = np.sqrt(delta_x**2 + delta_y**2) if delta_x >= 0: heading_ego2other = np.arctan(delta_y / (delta_x + 1e-6)) else: heading_ego2other = np.arctan(delta_y / (delta_x - 1e-6)) + math.pi if heading_ego2other >= math.pi: heading_ego2other = heading_ego2other - 2 * math.pi elif heading_ego2other < -math.pi: heading_ego2other = heading_ego2other + 2 * math.pi delta_heading = heading_ego2other - ( 270 * math.pi / 180 - self.current_road_angle * math.pi / 180) relate_x = dist_ego2other * np.cos(delta_heading) relate_y = dist_ego2other * np.sin(delta_heading) self.element_ori_real[i] = np.array([ relate_x, relate_y, other_v - v_ego, other_heading, self.other_length, self.other_width ], dtype=np.float32) self.element_ori[i] = np.array([ relate_x + self.noise_factor * np.clip(np.random.normal(0, 0.1), -0.3, 0.3), relate_y + self.noise_factor * np.clip(np.random.normal(0, 0.1), -0.3, 0.3), other_v - v_ego + self.noise_factor * np.clip(np.random.normal(0, 0.1), -0.3, 0.3), other_heading + self.noise_factor * np.clip(np.random.normal(0, 0.05), -0.15, 0.15), self.other_length + self.noise_factor * np.clip(np.random.normal(0, 0.02), -0.06, 0.06), self.other_width + self.noise_factor * np.clip(np.random.normal(0, 0.02), -0.06, 0.06) ], dtype=np.float32) else: self.veh_num = 1 self.element_ori = np.array([[ self.detect_front_dist, 0, 0, 0, self.other_length, self.other_width ]], dtype=np.float32) self.element_ori_real = np.array([[ self.detect_front_dist, 0, 0, 0, self.other_length, self.other_width ]], dtype=np.float32) # f2=plt.figure(0,figsize=(20, 5)) # plt.ion() # for i in range(len(self.lane_list)): # plt.plot(self.lane_list[i][:,0], self.lane_list[i][:,1], color='green', linewidth='2') # for i in range(len(self.lane_center_list)): # plt.plot(self.lane_center_list[i][:,0], self.lane_center_list[i][:,1], color='red', linewidth='2') # plt.scatter(self.ego_x,self.ego_y, color='red') # for i in range(len(x)): # plt.scatter(x[i], y[i], color='blue') # ax = plt.gca() # ax.set_aspect('equal') # ax.invert_xaxis() # ax.invert_yaxis() # plt.title(['relate_x:' + str(self.element_ori[:,0]) + ' relate_y:' + str(self.element_ori[:,1])+ " relate_angle:"+str(round(self.delta_angle,2))+\ # 'lane:' + str(round(self.lane_index,2)) + ' dist2center:' + str(round(self.dist2center,2))+ " distleft:"+str(round(self.dist_left,2))+\ # ' dist_right:' + str(round(self.dist_right,2)) + ' road angle:' + str(round(self.current_road_angle,2))]) # plt.pause(0.01) # f2.clf() # plt.figure(figsize=(20, 5)) # plt.plot(road_info[:,0], road_info[:,2], color='green', linewidth='2') # ax = plt.gca() # ax.set_aspect('equal') # ax.invert_xaxis() # plt.show() self.element = self.element_ori / self.max_state_other def simu(self): self.env = gym.make("Experiment-v3") time_init = time.time() step = 0 observation = self.env.reset(random=False, sensor_used=self.args.if_sensor_used, veh_num=self.args.veh_num, simu=True) self.episode_step = 0 simu_state_list = [] while True: state_ego = observation[0] element = observation[1] veh_number = 
observation[1].shape[0] state_other_vector = observation[2] simu_state_list.append(observation[3]) element_tensor = torch.FloatTensor(element.copy()).to(self.device) ego_tensor = torch.FloatTensor(state_ego.copy()).to(self.device) state_other = torch.sum(self.PI_net.evaluate(element_tensor), dim=0) state_tensor = torch.cat([ego_tensor, state_other.detach()]) self.u, log_prob, _ = self.actor.get_action( state_tensor.unsqueeze(0), True) self.u = self.u.squeeze(0) observation, self.reward, self.done, _ = self.env.step(self.u) self.env.render(mode='human') step += 1 if self.done or step == 1000: time.sleep(1) print('method', self.args.method, 'step', step, 'time', time.time() - time_init) print("!!!!!!!!!!!!!!!") break def control_step(self): time_init = time.time() element_tensor = torch.FloatTensor(self.element.copy()).to(self.device) ego_tensor = torch.FloatTensor(self.state_ego.copy()).to(self.device) state_other = torch.sum(self.PI_net.evaluate(element_tensor), dim=0) state_tensor = torch.cat([ego_tensor, state_other.detach()]) self.u, log_prob, _ = self.actor.get_action(state_tensor.unsqueeze(0), True) self.u = self.u.squeeze(0) self.step += 1 #print(time.time()-time_init) return self.u
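# --- Editor's note (sketch) ----------------------------------------------------------
# control_step() builds the policy input with a permutation-invariant encoding: each
# surrounding vehicle's 6-dim feature row is embedded by PI_net and the embeddings are
# sum-pooled, so the result does not depend on vehicle ordering. A toy illustration
# with a stand-in encoder (PI_net itself is defined elsewhere):
import torch
import torch.nn as nn

toy_encoder = nn.Sequential(nn.Linear(6, 16), nn.ReLU(), nn.Linear(16, 32))
elements = torch.randn(3, 6)                          # 3 surrounding vehicles, 6 features each
pooled = toy_encoder(elements).sum(dim=0)             # permutation-invariant summary, shape [32]
shuffled = toy_encoder(elements[[2, 0, 1]]).sum(dim=0)
assert torch.allclose(pooled, shuffled, atol=1e-6)    # vehicle order does not matter
ego = torch.randn(19)                                 # ego feature vector (19 entries, as in max_state_ego)
policy_input = torch.cat([ego, pooled])               # mirrors torch.cat([ego_tensor, state_other])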
def main():
    # parameters for the gym_carla environment
    params = {
        'display_size': 256,  # screen size of bird-eye render
        'obs_size': 128,  # screen size of cv2 window
        'dt': 0.1,  # time interval between two frames
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        # 'town': 'Town01',  # which town to simulate
        'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
        'code_mode': 'test',
        'max_time_episode': 5000,  # maximum timesteps per episode
        'desired_speed': 8,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
    }

    # Set gym-carla environment
    env = gym.make('carla-v0', params=params)

    # load net
    device = torch.device('cpu')
    args = Args()
    actor = PolicyNet(args).to(device)
    actor.load_state_dict(torch.load('./policy1_500000.pkl', map_location='cpu'))
    Q_net1 = QNet(args).to(device)
    Q_net1.load_state_dict(torch.load('./Q1_500000.pkl', map_location='cpu'))

    obs, info_dict = env.reset()
    info = info_dict_to_array(info_dict)
    state_tensor = torch.FloatTensor(obs.copy()).float().to(device)
    info_tensor = torch.FloatTensor(info.copy()).float().to(device)
    # print(env.ego.get_location())
    tic = time.time()
    done = False
    ret = 0
    start = carla.Location(x=env.start[0], y=env.start[1], z=0.22)
    end = carla.Location(x=env.dest[0], y=env.dest[1], z=0.22)
    if args.NN_type == "CNN":
        state_tensor = state_tensor.permute(2, 0, 1)

    while not done:
        tac = time.time()
        u, log_prob = actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), True)
        u = u.squeeze(0)
        obs, r, done, info = env.step(u)
        # convert the info dict returned by env.step (not the stale dict from reset)
        info = info_dict_to_array(info)
        state_tensor = torch.FloatTensor(obs.copy()).float().to(device)
        if args.NN_type == "CNN":
            state_tensor = state_tensor.permute(2, 0, 1)
        info_tensor = torch.FloatTensor(info.copy()).float().to(device)
        ret += r
        cv2.imshow("camera img", obs)
        cv2.waitKey(1)
        # print(info['acceleration_t'].shape)
        env.world.debug.draw_point(start)
        env.world.debug.draw_point(end)

        if done:
            toc = time.time()
            print("An episode took %f s" % (toc - tic))
            print("total reward is", ret)
            print("time steps", env.time_step)
            env.close()
            env.reset()
            ret = 0
            # print(env.ego.get_location())
            done = False
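# --- Editor's note (assumed entry point) ---------------------------------------------
# This script presumably relies on the gym-carla package registering 'carla-v0' on
# import and on an Args class defined elsewhere; with those assumptions a minimal
# entry point is just the usual guard.
if __name__ == '__main__':
    # import gym_carla  # registers 'carla-v0' with gym (assumed)
    main()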