from __future__ import print_function

# Standard-library and third-party imports used throughout this file.
import copy
import time
from functools import reduce

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torch.optim import lr_scheduler

# Project-local helpers used below (PolicyNet, QNet, ensure_shared_grads,
# slow_sync_param, cal_gamma_return_of_an_episode, plot_online) are defined in
# the repository's own modules; their import paths are not shown in this file.
class Evaluator(object):
    def __init__(self, args, shared_value, share_net):
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        eval_params = {
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': int(2000 + 3 * args.num_actors),  # connection port
            'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 100,  # maximum timesteps per episode
            'desired_speed': 15,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args
        self.env = gym.make(args.env_name, params=eval_params)
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        self.alpha = np.exp(self.log_alpha_share.detach().item()) \
            if args.alpha == 'auto' else 0

        self.evaluation_interval = 20000
        self.max_state_num_evaluated_in_an_episode = 50  # 500
        self.episode_num_evaluation = 5
        self.episode_num_test = 5
        self.time = time.time()

        self.list_of_n_episode_rewards_history = []
        self.time_history = []
        self.alpha_history = []
        self.average_return_with_diff_base_history = []
        self.average_reward_history = []
        self.iteration_history = []
        self.evaluated_Q_mean_history = []
        self.evaluated_Q_std_history = []
        self.true_gamma_return_mean_history = []
        self.policy_entropy_history = []
        self.a_std_history = []
        self.a_abs_history = []

    def average_max_n(self, list_for_average, n):
        # Mean of the n largest entries of the list.
        sorted_list = sorted(list_for_average, reverse=True)
        return sum(sorted_list[:n]) / n

    def run_an_episode(self, deterministic):
        # state_list = []
        action_list = []
        log_prob_list = []
        reward_list = []
        evaluated_Q_list = []
        Q_std_list = []
        a_std_list = []
        done = 0
        state, info = self.env.reset()
        while not done and len(reward_list) < (self.args.max_step - 1):
            state_tensor = torch.FloatTensor(state.copy()).float().to(self.device)
            info_tensor = torch.FloatTensor(info.copy()).float().to(self.device)
            if self.args.NN_type == "CNN":
                state_tensor = state_tensor.permute(2, 0, 1)  # 3, 256, 256
            u, log_prob, a_std = self.actor.get_action(
                state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), deterministic)
            log_prob_list.append(log_prob)
            a_std_list.append(a_std)
            if self.args.double_Q and not self.args.double_actor:
                q = torch.min(
                    self.Q_net1.evaluate(
                        state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                        torch.FloatTensor(u.copy()).to(self.device))[0],
                    self.Q_net2.evaluate(
                        state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                        torch.FloatTensor(u.copy()).to(self.device))[0])
            else:
                q, q_std, _ = self.Q_net1.evaluate(
                    state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                    torch.FloatTensor(u.copy()).to(self.device))
            evaluated_Q_list.append(q.detach().item())
            if self.args.distributional_Q:
                Q_std_list.append(q_std.detach().item())
            else:
                Q_std_list.append(0)
            u = u.squeeze(0)
            state, reward, done, info = self.env.step(u)
            # self.env.render(mode='human')
            action_list.append(u)
            reward_list.append(reward * self.args.reward_scale)

        if not deterministic:
            entropy_list = list(-self.alpha * np.array(log_prob_list))
            true_gamma_return_list = cal_gamma_return_of_an_episode(
                reward_list, entropy_list, self.args.gamma)
            policy_entropy = -sum(log_prob_list) / len(log_prob_list)
            a_std_mean = np.mean(np.array(a_std_list), axis=0)
            a_abs_mean = np.mean(np.abs(np.array(action_list)), axis=0)
            return dict(
                # state_list=np.array(state_list),
                # action_list=np.array(action_list),
                log_prob_list=np.array(log_prob_list),
                policy_entropy=policy_entropy,
                # reward_list=np.array(reward_list),
                a_std_mean=a_std_mean,
                a_abs_mean=a_abs_mean,
                evaluated_Q_list=np.array(evaluated_Q_list),
                Q_std_list=np.array(Q_std_list),
                true_gamma_return_list=true_gamma_return_list,
            )
        else:
            episode_return = sum(reward_list) / self.args.reward_scale
            episode_len = len(reward_list)
            return dict(episode_return=episode_return, episode_len=episode_len)

    def run_n_episodes(self, n, max_state, deterministic):
        n_episode_state_list = []
        n_episode_action_list = []
        n_episode_log_prob_list = []
        n_episode_reward_list = []
        n_episode_evaluated_Q_list = []
        n_episode_Q_std_list = []
        n_episode_true_gamma_return_list = []
        n_episode_return_list = []
        n_episode_len_list = []
        n_episode_policyentropy_list = []
        n_episode_a_std_list = []
        for _ in range(n):
            episode_info = self.run_an_episode(deterministic)
            # n_episode_state_list.append(episode_info['state_list'])
            # n_episode_action_list.append(episode_info['action_list'])
            # n_episode_log_prob_list.append(episode_info['log_prob_list'])
            # n_episode_reward_list.append(episode_info['reward_list'])
            if not deterministic:
                n_episode_evaluated_Q_list.append(episode_info['evaluated_Q_list'])
                n_episode_Q_std_list.append(episode_info['Q_std_list'])
                n_episode_true_gamma_return_list.append(episode_info['true_gamma_return_list'])
                n_episode_policyentropy_list.append(episode_info['policy_entropy'])
                n_episode_a_std_list.append(episode_info['a_std_mean'])
                n_episode_action_list.append(episode_info['a_abs_mean'])
            else:
                n_episode_return_list.append(episode_info['episode_return'])
                n_episode_len_list.append(episode_info['episode_len'])

        if not deterministic:
            average_policy_entropy = sum(n_episode_policyentropy_list) / len(n_episode_policyentropy_list)
            average_a_std = np.mean(np.array(n_episode_a_std_list), axis=0)
            average_a_abs = np.mean(np.array(n_episode_action_list), axis=0)
            # n_episode_evaluated_Q_list_history = list(map(lambda x: x['n_episode_evaluated_Q_list'], n_episodes_info_history))
            # n_episode_true_gamma_return_list_history = list(map(lambda x: x['n_episode_true_gamma_return_list'], n_episodes_info_history))

            def concat_interest_epi_part_of_one_ite_and_mean(list_of_n_epi):
                # Concatenate the first `max_state` steps of every episode and
                # return the mean over the concatenated values.
                tmp = list(copy.deepcopy(list_of_n_epi))
                tmp[0] = tmp[0] if len(tmp[0]) <= max_state else tmp[0][:max_state]

                def reduce_fuc(a, b):
                    return np.concatenate([a, b]) if len(b) < max_state \
                        else np.concatenate([a, b[:max_state]])

                interest_epi_part_of_one_ite = reduce(reduce_fuc, tmp)
                return sum(interest_epi_part_of_one_ite) / len(interest_epi_part_of_one_ite)

            evaluated_Q_mean = concat_interest_epi_part_of_one_ite_and_mean(
                np.array(n_episode_evaluated_Q_list))
            evaluated_Q_std = concat_interest_epi_part_of_one_ite_and_mean(
                np.array(n_episode_Q_std_list))
            true_gamma_return_mean = concat_interest_epi_part_of_one_ite_and_mean(
                np.array(n_episode_true_gamma_return_list))
            return dict(evaluated_Q_mean=evaluated_Q_mean,
                        true_gamma_return_mean=true_gamma_return_mean,
                        evaluated_Q_std=evaluated_Q_std,
                        n_episode_reward_list=np.array(n_episode_reward_list),
                        policy_entropy=average_policy_entropy,
                        a_std=average_a_std,
                        a_abs=average_a_abs)
        else:
            average_return_with_diff_base = np.array([
                self.average_max_n(n_episode_return_list, x)
                for x in [1, self.episode_num_test - 2, self.episode_num_test]
            ])
            average_reward = sum(n_episode_return_list) / sum(n_episode_len_list)
            return dict(
                n_episode_reward_list=np.array(n_episode_reward_list),
                average_return_with_diff_base=average_return_with_diff_base,
                average_reward=average_reward,
            )

    def run(self):
        while not self.stop_sign.value:
            if self.iteration_counter.value % self.evaluation_interval == 0:
                self.alpha = np.exp(self.log_alpha_share.detach().item()) \
                    if self.args.alpha == 'auto' else 0
                self.iteration = self.iteration_counter.value
                self.actor.load_state_dict(self.actor_share.state_dict())
                self.Q_net1.load_state_dict(self.Q_net1_share.state_dict())
                self.Q_net2.load_state_dict(self.Q_net2_share.state_dict())

                delta_time = time.time() - self.time
                self.time = time.time()

                n_episode_info = self.run_n_episodes(
                    self.episode_num_evaluation,
                    self.max_state_num_evaluated_in_an_episode, False)
                self.iteration_history.append(self.iteration)
                self.evaluated_Q_mean_history.append(n_episode_info['evaluated_Q_mean'])
                self.evaluated_Q_std_history.append(n_episode_info['evaluated_Q_std'])
                self.true_gamma_return_mean_history.append(n_episode_info['true_gamma_return_mean'])
                self.time_history.append(delta_time)
                # self.list_of_n_episode_rewards_history.append(list_of_n_episode_rewards)
                # alpha is a plain scalar here (np.float64 or 0), not a tensor,
                # so cast with float() instead of calling .item().
                self.alpha_history.append(float(self.alpha))
                self.policy_entropy_history.append(n_episode_info['policy_entropy'])
                self.a_std_history.append(n_episode_info['a_std'])
                self.a_abs_history.append(n_episode_info['a_abs'])

                n_episode_info_test = self.run_n_episodes(
                    self.episode_num_test,
                    self.max_state_num_evaluated_in_an_episode, True)
                self.average_return_with_diff_base_history.append(
                    n_episode_info_test['average_return_with_diff_base'])
                self.average_reward_history.append(n_episode_info_test['average_reward'])

                print('Saving evaluation results of the {} iteration.'.format(self.iteration))
                result_dir = './' + self.args.env_name + '/method_' + str(self.args.method) + '/result'
                np.save(result_dir + '/iteration', np.array(self.iteration_history))
                np.save(result_dir + '/evaluated_Q_mean', np.array(self.evaluated_Q_mean_history))
                np.save(result_dir + '/evaluated_Q_std', np.array(self.evaluated_Q_std_history))
                np.save(result_dir + '/true_gamma_return_mean',
                        np.array(self.true_gamma_return_mean_history))
                np.save(result_dir + '/time', np.array(self.time_history))
                # np.save(result_dir + '/list_of_n_episode_rewards',
                #         np.array(self.list_of_n_episode_rewards_history))
                np.save(result_dir + '/average_return_with_diff_base',
                        np.array(self.average_return_with_diff_base_history))
                np.save(result_dir + '/average_reward', np.array(self.average_reward_history))
                np.save(result_dir + '/alpha', np.array(self.alpha_history))
                np.save(result_dir + '/policy_entropy', np.array(self.policy_entropy_history))
                np.save(result_dir + '/a_std', np.array(self.a_std_history))
                np.save(result_dir + '/a_abs', np.array(self.a_abs_history))
                # plot_online(self.args.env_name, self.args.method, self.args.method_name,
                #             self.max_state_num_evaluated_in_an_episode)

                if self.iteration >= self.args.max_train:
                    self.stop_sign.value = 1
                    break
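
# --- Hedged sketch (not part of the original file) ----------------------------
# `cal_gamma_return_of_an_episode` is imported from a project module that is
# not shown here.  Based on how the Evaluator calls it and on the explicit
# Q_real_history check in Simulation.run below, it is assumed to compute, for
# every step i of an episode, the discounted "soft" return
#     G_i = sum_{j>=i} gamma^(j-i) * r_j + sum_{z>i} gamma^(z-i) * entropy_z,
# where entropy_z = -alpha * log_prob_z.  A minimal reference implementation
# under that assumption (the real helper may differ in detail):
def _sketch_cal_gamma_return_of_an_episode(reward_list, entropy_list, gamma):
    returns = []
    for i in range(len(reward_list)):
        g = 0.0
        for j in range(i, len(reward_list)):
            g += gamma ** (j - i) * reward_list[j]
            if j > i:  # entropy of the current step is excluded, as in soft Q
                g += gamma ** (j - i) * entropy_list[j]
        returns.append(g)
    return np.array(returns)
# -------------------------------------------------------------------------------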
class Learner():
    def __init__(self, args, shared_queue, shared_value, share_net, share_optimizer,
                 device, lock, i):
        super(Learner, self).__init__()
        self.args = args
        seed = self.args.seed
        self.init_time = self.args.init_time
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.agent_id = i
        self.experience_out_queue = []
        for i in range(args.num_buffers):
            self.experience_out_queue.append(shared_queue[1][i])
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.device = device
        self.gpu = self.device != torch.device("cpu")
        self.lock = lock

        # Shared (cross-process) networks and optimizers.
        self.Q_net1_share, self.Q_net1_target_share, self.Q_net2_share, self.Q_net2_target_share, \
            self.actor1_share, self.actor1_target_share, self.actor2_share, \
            self.actor2_target_share, self.log_alpha_share = share_net
        self.Q_net1_optimizer, self.Q_net2_optimizer, self.actor1_optimizer, \
            self.actor2_optimizer, self.alpha_optimizer = share_optimizer

        # Local copies used for gradient computation on this learner.
        self.Q_net1 = QNet(args).to(self.device)
        self.scheduler_Q_net1 = lr_scheduler.CosineAnnealingLR(
            self.Q_net1_optimizer, T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr, last_epoch=-1)
        self.Q_net1.train()
        self.Q_net1_target = QNet(args).to(self.device)
        self.Q_net1_target.train()

        self.Q_net2 = QNet(args).to(self.device)
        self.scheduler_Q_net2 = lr_scheduler.CosineAnnealingLR(
            self.Q_net2_optimizer, T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr, last_epoch=-1)
        self.Q_net2.train()
        self.Q_net2_target = QNet(args).to(self.device)
        self.Q_net2_target.train()

        self.actor1 = PolicyNet(args).to(self.device)
        self.scheduler_actor1 = lr_scheduler.CosineAnnealingLR(
            self.actor1_optimizer, T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr, last_epoch=-1)
        self.actor1.train()
        self.actor1_target = PolicyNet(args).to(self.device)
        self.actor1_target.train()

        self.actor2 = PolicyNet(args).to(self.device)
        self.scheduler_actor2 = lr_scheduler.CosineAnnealingLR(
            self.actor2_optimizer, T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr, last_epoch=-1)
        self.actor2.train()
        self.actor2_target = PolicyNet(args).to(self.device)
        self.actor2_target.train()

        self.scheduler_alpha = lr_scheduler.CosineAnnealingLR(
            self.alpha_optimizer, T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr, last_epoch=-1)

        if self.args.alpha == 'auto':
            self.target_entropy = args.target_entropy
        else:
            self.alpha = torch.tensor(self.args.alpha)

    def get_qloss(self, q, q_std, target_q, target_q_bound):
        if self.args.distributional_Q:
            # loss = -Normal(q, q_std).log_prob(target_q).mean()
            # loss = torch.mean(-Normal(q, q_std).log_prob(target_q_bound) * self.weight
            #                   + self.weight.logical_not() * torch.pow(q - target_q, 2))
            loss = torch.mean(
                torch.pow(q - target_q, 2) / (2 * torch.pow(q_std.detach(), 2))
                + torch.pow(q.detach() - target_q_bound, 2) / (2 * torch.pow(q_std, 2))
                + torch.log(q_std))
        else:
            criterion = nn.MSELoss()
            loss = criterion(q, target_q)
        return loss

    def get_policyloss(self, q, log_prob_a_new):
        loss = (self.alpha.detach() * log_prob_a_new - q).mean()
        return loss

    def update_net(self, loss, optimizer, net, net_share, scheduler):
        optimizer.zero_grad()
        if self.gpu:
            if self.args.alpha == 'auto':
                if net is not self.log_alpha:
                    net.zero_grad()
            else:
                net.zero_grad()
        loss.backward()
        if self.args.alpha == 'auto':
            if net is self.log_alpha:
                if self.log_alpha_share.grad is None or self.log_alpha_share.grad == 0:
                    self.log_alpha_share._grad = self.log_alpha.grad
            else:
                ensure_shared_grads(model=net, shared_model=net_share, gpu=self.gpu)
        else:
            ensure_shared_grads(model=net, shared_model=net_share, gpu=self.gpu)
        optimizer.step()
        scheduler.step(self.iteration)

    def target_q(self, r, done, q, q_std, q_next, log_prob_a_next):
        # Soft TD target; optionally clipped for the distributional critic.
        target_q = r + (1 - done) * self.args.gamma * (
            q_next - self.alpha.detach() * log_prob_a_next)
        if self.args.distributional_Q:
            if self.args.adaptive_bound:
                target_max = q + 3 * q_std
                target_min = q - 3 * q_std
                target_q = torch.min(target_q, target_max)
                target_q = torch.max(target_q, target_min)
            difference = torch.clamp(target_q - q, -self.args.TD_bound, self.args.TD_bound)
            target_q_bound = q + difference
            self.weight = torch.le(torch.abs(target_q - q), self.args.TD_bound).detach()
        else:
            target_q_bound = target_q
        return target_q.detach(), target_q_bound.detach()

    def send_to_device(self, s, info, a, r, s_next, info_next, done, device):
        s = s.to(device)
        info = info.to(device)
        a = a.to(device)
        r = r.to(device)
        s_next = s_next.to(device)
        info_next = info_next.to(device)
        done = done.to(device)
        return s, info, a, r, s_next, info_next, done

    def run(self):
        local_iteration = 0
        index = np.random.randint(0, self.args.num_buffers)
        while self.experience_out_queue[index].empty() and not self.stop_sign.value:
            index = np.random.randint(0, self.args.num_buffers)
            time.sleep(0.1)

        while not self.stop_sign.value:
            self.iteration = self.iteration_counter.value
            # Pull the latest shared parameters before each training step.
            self.Q_net1.load_state_dict(self.Q_net1_share.state_dict())
            self.Q_net1_target.load_state_dict(self.Q_net1_target_share.state_dict())
            self.Q_net2.load_state_dict(self.Q_net2_share.state_dict())
            self.Q_net2_target.load_state_dict(self.Q_net2_target_share.state_dict())
            self.actor1.load_state_dict(self.actor1_share.state_dict())
            self.actor1_target.load_state_dict(self.actor1_target_share.state_dict())
            self.actor2.load_state_dict(self.actor2_share.state_dict())
            self.actor2_target.load_state_dict(self.actor2_target_share.state_dict())
            if self.args.alpha == 'auto':
                self.log_alpha = self.log_alpha_share.detach().clone().requires_grad_(True)
                self.alpha = self.log_alpha.exp().to(self.device)

            # Pick a non-empty replay buffer.
            index = np.random.randint(0, self.args.num_buffers)
            while self.experience_out_queue[index].empty() and not self.stop_sign.value:
                index = np.random.randint(0, self.args.num_buffers)
                time.sleep(0.1)

            if not self.experience_out_queue[index].empty():
                s, info, a, r, s_next, info_next, done = self.experience_out_queue[index].get()
                s, info, a, r, s_next, info_next, done = self.send_to_device(
                    s, info, a, r, s_next, info_next, done, self.device)

                q_1, q_std_1, _ = self.Q_net1.evaluate(s, info, a, device=self.device, min=False)
                if self.args.double_Q:
                    q_2, q_std_2, _ = self.Q_net2.evaluate(s, info, a, device=self.device, min=False)

                smoothing_trick = False
                if not self.args.stochastic_actor:
                    if self.args.policy_smooth:
                        smoothing_trick = True

                a_new_1, log_prob_a_new_1, a_new_std_1 = self.actor1.evaluate(
                    s, info, smooth_policy=False, device=self.device)
                a_next_1, log_prob_a_next_1, _ = self.actor1_target.evaluate(
                    s_next, info_next, smooth_policy=smoothing_trick, device=self.device)
                if self.args.double_actor:
                    a_new_2, log_prob_a_new_2, _ = self.actor2.evaluate(
                        s, info, smooth_policy=False, device=self.device)
                    a_next_2, log_prob_a_next_2, _ = self.actor2_target.evaluate(
                        s_next, info_next, smooth_policy=smoothing_trick, device=self.device)

                # TD targets.
                if self.args.double_Q and self.args.double_actor:
                    q_next_target_1, _, q_next_sample_1 = self.Q_net2_target.evaluate(
                        s_next, info_next, a_next_1, device=self.device, min=False)
                    q_next_target_2, _, _ = self.Q_net1_target.evaluate(
                        s_next, info_next, a_next_2, device=self.device, min=False)
                    target_q_1, target_q_1_bound = self.target_q(
                        r, done, q_1.detach(), q_std_1.detach(),
                        q_next_target_1.detach(), log_prob_a_next_1.detach())
                    target_q_2, target_q_2_bound = self.target_q(
                        r, done, q_2.detach(), q_std_2.detach(),
                        q_next_target_2.detach(), log_prob_a_next_2.detach())
                else:
                    q_next_1, _, q_next_sample_1 = self.Q_net1_target.evaluate(
                        s_next, info_next, a_next_1, device=self.device, min=False)
                    if self.args.double_Q:
                        q_next_2, _, _ = self.Q_net2_target.evaluate(
                            s_next, info_next, a_next_1, device=self.device, min=False)
                        q_next_target_1 = torch.min(q_next_1, q_next_2)
                    elif self.args.distributional_Q:
                        q_next_target_1 = q_next_sample_1
                    else:
                        q_next_target_1 = q_next_1
                    target_q_1, target_q_1_bound = self.target_q(
                        r, done, q_1.detach(), q_std_1.detach(),
                        q_next_target_1.detach(), log_prob_a_next_1.detach())

                # Q values of the freshly sampled actions, used for the policy loss.
                if self.args.double_Q and self.args.double_actor:
                    q_object_1, _, _ = self.Q_net1.evaluate(s, info, a_new_1,
                                                            device=self.device, min=False)
                    q_object_2, _, _ = self.Q_net2.evaluate(s, info, a_new_2,
                                                            device=self.device, min=False)
                else:
                    q_new_1, _, _ = self.Q_net1.evaluate(s, info, a_new_1,
                                                         device=self.device, min=False)
                    if self.args.double_Q:
                        q_new_2, _, _ = self.Q_net2.evaluate(s, info, a_new_1,
                                                             device=self.device, min=False)
                        q_object_1 = torch.min(q_new_1, q_new_2)
                    elif self.args.distributional_Q:
                        q_object_1 = q_new_1
                    else:
                        q_object_1 = q_new_1

                # Temperature update (delayed).
                if local_iteration % self.args.delay_update == 0:
                    if self.args.alpha == 'auto':
                        alpha_loss = -(self.log_alpha *
                                       (log_prob_a_new_1.detach().cpu() + self.target_entropy)).mean()
                        self.update_net(alpha_loss, self.alpha_optimizer, self.log_alpha,
                                        self.log_alpha_share, self.scheduler_alpha)

                # Critic updates.
                q_loss_1 = self.get_qloss(q_1, q_std_1, target_q_1, target_q_1_bound)
                self.update_net(q_loss_1, self.Q_net1_optimizer, self.Q_net1,
                                self.Q_net1_share, self.scheduler_Q_net1)
                if self.args.double_Q:
                    if self.args.double_actor:
                        q_loss_2 = self.get_qloss(q_2, q_std_2, target_q_2, target_q_2_bound)
                    else:
                        q_loss_2 = self.get_qloss(q_2, q_std_2, target_q_1, target_q_1_bound)
                    self.update_net(q_loss_2, self.Q_net2_optimizer, self.Q_net2,
                                    self.Q_net2_share, self.scheduler_Q_net2)

                # Actor updates (delayed).
                if self.args.code_model == "train":
                    if local_iteration % self.args.delay_update == 0:
                        policy_loss_1 = self.get_policyloss(q_object_1, log_prob_a_new_1)
                        self.update_net(policy_loss_1, self.actor1_optimizer, self.actor1,
                                        self.actor1_share, self.scheduler_actor1)
                        slow_sync_param(self.actor1_share, self.actor1_target_share,
                                        self.args.tau, self.gpu)
                        if self.args.double_actor:
                            policy_loss_2 = self.get_policyloss(q_object_2, log_prob_a_new_2)
                            self.update_net(policy_loss_2, self.actor2_optimizer, self.actor2,
                                            self.actor2_share, self.scheduler_actor2)
                            slow_sync_param(self.actor2_share, self.actor2_target_share,
                                            self.args.tau, self.gpu)

                # Target network updates (delayed).
                if local_iteration % self.args.delay_update == 0:
                    slow_sync_param(self.Q_net1_share, self.Q_net1_target_share,
                                    self.args.tau, self.gpu)
                    if self.args.double_Q:
                        slow_sync_param(self.Q_net2_share, self.Q_net2_target_share,
                                        self.args.tau, self.gpu)

                with self.lock:
                    self.iteration_counter.value += 1
                local_iteration += 1

                if self.iteration % self.args.save_model_period == 0 or (
                        self.iteration == 0 and self.agent_id == 0):
                    model_dir = './' + self.args.env_name + '/method_' + str(self.args.method) + '/model'
                    torch.save(self.actor1.state_dict(),
                               model_dir + '/policy1_' + str(self.iteration) + '.pkl')
                    torch.save(self.Q_net1.state_dict(),
                               model_dir + '/Q1_' + str(self.iteration) + '.pkl')
                    if self.args.alpha == 'auto':
                        np.save(model_dir + '/log_alpha' + str(self.iteration),
                                self.log_alpha.detach().cpu().numpy())
                    if self.args.double_Q:
                        torch.save(self.Q_net2.state_dict(),
                                   model_dir + '/Q2_' + str(self.iteration) + '.pkl')
                    if self.args.double_actor:
                        torch.save(self.actor2.state_dict(),
                                   model_dir + '/policy2_' + str(self.iteration) + '.pkl')

                if self.iteration % 500 == 0 or self.iteration == 0 and self.agent_id == 0:
                    print("agent", self.agent_id, "method", self.args.method,
                          "iteration", self.iteration, "time", time.time() - self.init_time)
                    print("loss_1", q_loss_1, "alpha", self.alpha, "lr",
                          self.scheduler_Q_net1.get_lr(), self.scheduler_Q_net2.get_lr(),
                          self.scheduler_actor1.get_lr(), self.scheduler_actor2.get_lr(),
                          self.scheduler_alpha.get_lr())
                    print("q_std", q_std_1.t()[0][0:8])
                    print("a_std", a_new_std_1.t()[0][0:8])
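
# --- Hedged illustration (not part of the original file) ----------------------
# A toy example of the clipped distributional target built in Learner.target_q
# above: the soft TD target is first (optionally) limited to q +/- 3*q_std,
# then the TD error is clamped to +/- TD_bound before the bounded target is
# rebuilt.  The numbers are arbitrary and only show the tensor mechanics.
def _sketch_clipped_target(q, q_std, raw_target, td_bound, adaptive_bound=True):
    target = raw_target
    if adaptive_bound:
        target = torch.min(target, q + 3 * q_std)
        target = torch.max(target, q - 3 * q_std)
    difference = torch.clamp(target - q, -td_bound, td_bound)
    return q + difference

# Example:
#   _sketch_clipped_target(torch.tensor([1.0]), torch.tensor([0.5]),
#                          torch.tensor([4.0]), td_bound=2.0)
#   -> tensor([2.5]) with the adaptive bound, tensor([3.0]) with it disabled.
# -------------------------------------------------------------------------------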
class Simulation():
    def __init__(self, args, shared_queue, shared_value):
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.policy_test_queue = shared_queue[3]
        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.load_index = 20000

        # Load saved checkpoints of the different methods for comparison.
        self.actor = PolicyNet(args.state_dim, args.num_hidden_cell, args.action_high,
                               args.action_low, args.NN_type).to(self.device)
        self.actor.load_state_dict(torch.load(
            './data/method_' + str(1) + '/model/policy_' + str(self.load_index) + '.pkl'))
        self.Q_net1_m0 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell,
                              args.NN_type).to(self.device)
        self.Q_net1_m0.load_state_dict(torch.load(
            './data/method_' + str(0) + '/model/Q1_' + str(self.load_index) + '.pkl'))
        self.Q_net1_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell,
                              args.NN_type).to(self.device)
        self.Q_net1_m1.load_state_dict(torch.load(
            './data/method_' + str(1) + '/model/Q1_' + str(self.load_index) + '.pkl'))
        self.Q_net2_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell,
                              args.NN_type).to(self.device)
        self.Q_net2_m1.load_state_dict(torch.load(
            './data/method_' + str(1) + '/model/Q2_' + str(self.load_index) + '.pkl'))
        self.Q_net1_m2 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell,
                              args.NN_type).to(self.device)
        self.Q_net1_m2.load_state_dict(torch.load(
            './data/method_' + str(2) + '/model/Q1_' + str(self.load_index) + '.pkl'))

        self.test_step = 0
        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history = []
        self.done_history = []
        self.Q_real_history = []
        self.Q_m0_history = []
        self.Q_m1_history = []
        self.Q_m2_history = []
        self.Q_std_m2_history = []

    def load_param(self):
        if self.policy_test_queue.empty():
            pass
        else:
            self.iteration, param = self.policy_test_queue.get()
            self.actor.load_state_dict(param)

    def run(self):
        step = 0
        while True:
            self.state = self.env.reset()
            self.episode_step = 0
            state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device)
            if self.args.NN_type == "CNN":
                state_tensor = state_tensor.permute(2, 0, 1)
            self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), False)
            for i in range(self.args.max_step):
                # Record the Q estimates of each method for the visited state-action pair.
                q_m0 = self.Q_net1_m0(state_tensor.unsqueeze(0),
                                      torch.FloatTensor(self.u).to(self.device))[0]
                q_m1 = torch.min(
                    self.Q_net1_m1(state_tensor.unsqueeze(0),
                                   torch.FloatTensor(self.u).to(self.device))[0],
                    self.Q_net2_m1(state_tensor.unsqueeze(0),
                                   torch.FloatTensor(self.u).to(self.device))[0])
                q_m2, q_std, _ = self.Q_net1_m2.evaluate(
                    state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))
                self.Q_m0_history.append(q_m0.detach().item())
                self.Q_m1_history.append(q_m1.detach().item())
                self.Q_m2_history.append(q_m2.detach().item())
                self.Q_std_m2_history.append(q_std.detach().item())

                self.u = self.u.squeeze(0)
                self.state, self.reward, self.done, _ = self.env.step(self.u)
                self.reward_history.append(self.reward)
                self.done_history.append(self.done)
                self.entropy_history.append(log_prob)
                if step % 10000 >= 0 and step % 10000 <= 9999:  # always true: render every step
                    self.env.render(mode='human')
                state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device)
                if self.args.NN_type == "CNN":
                    state_tensor = state_tensor.permute(2, 0, 1)
                self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), False)
                if self.done == True:
                    time.sleep(1)
                    print("!!!!!!!!!!!!!!!")
                    break
                step += 1
                self.episode_step += 1
            if self.done == True:
                break

        # Reconstruct the "true" discounted soft return of every visited state
        # from the recorded rewards and log-probabilities.
        for i in range(len(self.Q_m0_history)):
            a = 0
            for j in range(i, len(self.Q_m0_history), 1):
                a += pow(self.args.gamma, j - i) * self.reward_history[j]
            for z in range(i + 1, len(self.Q_m0_history), 1):
                a -= self.args.alpha * pow(self.args.gamma, z - i) * self.entropy_history[z]
            self.Q_real_history.append(a)

        print(self.reward_history)
        print(self.entropy_history)
        print(self.Q_m2_history)
        print(self.Q_std_m2_history)

        # Compare the Q estimates of the three methods against the true return.
        plt.figure()
        x = np.arange(0, len(self.Q_m0_history), 1)
        plt.plot(x, np.array(self.Q_m0_history), 'r', linewidth=2.0)
        plt.plot(x, np.array(self.Q_m1_history), 'g', linewidth=2.0)
        plt.plot(x, np.array(self.Q_m2_history), 'b', linewidth=2.0)
        plt.plot(x, np.array(self.Q_real_history), 'k', linewidth=2.0)
        plt.show()
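
# --- Hedged usage sketch (not part of the original file) ----------------------
# The entry script that wires these classes together is not shown in this file.
# Below is a minimal, assumption-laden example of how `Simulation` might be
# constructed for offline inspection of saved checkpoints.  `make_args()` is a
# hypothetical argument builder, and the queue/value layout only mirrors the
# indices used above (shared_queue[3] = policy test queue,
# shared_value[1] = stop sign, shared_value[2] = iteration counter).
if __name__ == '__main__':
    import torch.multiprocessing as mp

    args = make_args()  # hypothetical: build the argparse namespace used above
    policy_test_queue = mp.Queue()
    stop_sign = mp.Value('i', 0)
    iteration_counter = mp.Value('i', 0)
    shared_queue = [None, None, None, policy_test_queue]
    shared_value = [None, stop_sign, iteration_counter]
    Simulation(args, shared_queue, shared_value).run()
# -------------------------------------------------------------------------------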