def __init__(self, state_size, action_size, seed, double_agent=False, dueling_agent=False):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        double_agent (bool): True if we want to use DDQN
        dueling_agent (bool): True if we want to use Dueling
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.double_agent = double_agent
    self.dueling_agent = dueling_agent

    # local and target Q-networks
    self.qnetwork_local = QNet(state_size, action_size, seed, dueling_agent=dueling_agent).to(device)
    self.qnetwork_target = QNet(state_size, action_size, seed, dueling_agent=dueling_agent).to(device)
    self.optimizer = optim.RMSprop(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
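# --- Hedged sketch (not part of the original source): the agent above only shows its
# __init__. A typical companion act()/step() pair with epsilon-greedy selection and
# learning every UPDATE_EVERY steps could look like the following. GAMMA and the
# learn() method are assumed to exist elsewhere; numpy is assumed imported as np.
def act(self, state, eps=0.0):
    """Return an epsilon-greedy action for the given state."""
    state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    self.qnetwork_local.eval()
    with torch.no_grad():
        action_values = self.qnetwork_local(state)
    self.qnetwork_local.train()
    if random.random() > eps:
        return int(np.argmax(action_values.cpu().data.numpy()))
    return random.choice(np.arange(self.action_size))

def step(self, state, action, reward, next_state, done):
    """Store a transition and learn every UPDATE_EVERY steps once enough samples exist."""
    self.memory.add(state, action, reward, next_state, done)
    self.t_step = (self.t_step + 1) % UPDATE_EVERY
    if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
        experiences = self.memory.sample()
        self.learn(experiences, GAMMA)  # learn() assumed: computes the (double/dueling) DQN update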
def __init__(self, args, shared_value, share_net):
    seed = args.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    self.stop_sign = shared_value[1]
    self.iteration_counter = shared_value[2]
    self.iteration = self.iteration_counter.value
    self.share_net = share_net
    self.args = args
    self.env = gym.make(args.env_name)
    self.device = torch.device("cpu")
    self.actor = PolicyNet(args).to(self.device)
    self.Q_net1 = QNet(args).to(self.device)
    self.Q_net2 = QNet(args).to(self.device)
    self.actor_share = share_net[4]
    self.Q_net1_share = share_net[0]
    self.Q_net2_share = share_net[2]
    self.log_alpha_share = share_net[-1]
    self.alpha = np.exp(self.log_alpha_share.detach().item()) if args.alpha == 'auto' else 0
    self.evaluation_interval = 50000
    self.max_state_num_evaluated_in_an_episode = 500
    self.episode_num_to_run = 10
    self.iteration_history = []
    self.evaluated_Q_mean_history = []
    self.true_gamma_return_mean_history = []
    # self.n_episodes_info_history = []
    self.evaluated_Q_history = []
    self.true_gamma_return_history = []
def __init__(self, args, shared_value, share_net):
    seed = args.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    self.stop_sign = shared_value[1]
    self.iteration_counter = shared_value[2]
    self.iteration = self.iteration_counter.value
    self.share_net = share_net
    self.args = args
    self.env = gym.make(args.env_name)
    self.device = torch.device("cpu")
    self.actor = PolicyNet(args).to(self.device)
    self.Q_net1 = QNet(args).to(self.device)
    self.Q_net2 = QNet(args).to(self.device)
    self.actor_share = share_net[4]
    self.Q_net1_share = share_net[0]
    self.Q_net2_share = share_net[2]
    self.log_alpha_share = share_net[-1]
    self.alpha = np.exp(self.log_alpha_share.detach().item()) if args.alpha == 'auto' else 0
    self.evaluation_interval = 20000
    self.max_state_num_evaluated_in_an_episode = 500
    self.episode_num_evaluation = 5
    self.episode_num_test = 5
    self.time = time.time()
    self.list_of_n_episode_rewards_history = []
    self.time_history = []
    self.alpha_history = []
    self.average_return_with_diff_base_history = []
    self.average_reward_history = []
    self.iteration_history = []
    self.evaluated_Q_mean_history = []
    self.evaluated_Q_std_history = []
    self.true_gamma_return_mean_history = []
    self.policy_entropy_history = []
    self.a_std_history = []
    self.a_abs_history = []
def __init__(self, args, shared_queue, shared_value, share_net, lock, i):
    super(Actor, self).__init__()
    self.agent_id = i
    seed = args.seed + np.int64(self.agent_id)
    np.random.seed(seed)
    torch.manual_seed(seed)
    self.counter = shared_value[0]
    self.stop_sign = shared_value[1]
    self.lock = lock
    self.env = gym.make(args.env_name)
    self.args = args
    self.experience_in_queue = []
    for i in range(args.num_buffers):
        self.experience_in_queue.append(shared_queue[0][i])
    self.device = torch.device("cpu")
    self.actor = PolicyNet(args).to(self.device)
    self.Q_net1 = QNet(args).to(self.device)
    self.Q_net1_share = share_net[1]
    self.actor_share = share_net[0]
from __future__ import print_function
class Learner(): def __init__(self, args, shared_queue, shared_value, share_net, share_optimizer, device, lock, i): super(Learner, self).__init__() self.args = args seed = self.args.seed self.init_time = self.args.init_time np.random.seed(seed) torch.manual_seed(seed) self.agent_id = i self.experience_out_queue = [] for i in range(args.num_buffers): self.experience_out_queue.append(shared_queue[1][i]) self.stop_sign = shared_value[1] self.iteration_counter = shared_value[2] self.iteration = self.iteration_counter.value self.device = device if self.device == torch.device("cpu"): self.gpu = False else: self.gpu = True self.lock = lock self.Q_net1_share, self.Q_net1_target_share, self.Q_net2_share, self.Q_net2_target_share, self.actor1_share, \ self.actor1_target_share, self.actor2_share, self.actor2_target_share, self.log_alpha_share = share_net self.Q_net1_optimizer, self.Q_net2_optimizer, self.actor1_optimizer, self.actor2_optimizer, self.alpha_optimizer = share_optimizer self.Q_net1 = QNet(args).to(self.device) self.scheduler_Q_net1 = lr_scheduler.CosineAnnealingLR( self.Q_net1_optimizer, T_max=self.args.decay_T_max, eta_min=self.args.end_lr, last_epoch=-1) self.Q_net1.train() self.Q_net1_target = QNet(args).to(self.device) self.Q_net1_target.train() self.Q_net2 = QNet(args).to(self.device) self.scheduler_Q_net2 = lr_scheduler.CosineAnnealingLR( self.Q_net2_optimizer, T_max=self.args.decay_T_max, eta_min=self.args.end_lr, last_epoch=-1) self.Q_net2.train() self.Q_net2_target = QNet(args).to(self.device) self.Q_net2_target.train() self.actor1 = PolicyNet(args).to(self.device) self.scheduler_actor1 = lr_scheduler.CosineAnnealingLR( self.actor1_optimizer, T_max=self.args.decay_T_max, eta_min=self.args.end_lr, last_epoch=-1) self.actor1.train() self.actor1_target = PolicyNet(args).to(self.device) self.actor1_target.train() self.actor2 = PolicyNet(args).to(self.device) self.scheduler_actor2 = lr_scheduler.CosineAnnealingLR( self.actor2_optimizer, T_max=self.args.decay_T_max, eta_min=self.args.end_lr, last_epoch=-1) self.actor2.train() self.actor2_target = PolicyNet(args).to(self.device) self.actor2_target.train() self.scheduler_alpha = lr_scheduler.CosineAnnealingLR( self.alpha_optimizer, T_max=self.args.decay_T_max, eta_min=self.args.end_lr, last_epoch=-1) if self.args.alpha == 'auto': self.target_entropy = args.target_entropy else: self.alpha = torch.tensor(self.args.alpha) def get_qloss(self, q, q_std, target_q, target_q_bound): if self.args.distributional_Q: # loss = -Normal(q, q_std).log_prob(target_q).mean() # loss = torch.mean(-Normal(q, q_std).log_prob(target_q_bound)*self.weight \ # + self.weight.logical_not()*torch.pow(q-target_q,2)) loss = torch.mean(torch.pow(q-target_q,2)/(2*torch.pow(q_std.detach(),2)) \ + torch.pow(q.detach()-target_q_bound,2)/(2*torch.pow(q_std,2))\ + torch.log(q_std)) else: criterion = nn.MSELoss() loss = criterion(q, target_q) return loss def get_policyloss(self, q, log_prob_a_new): loss = (self.alpha.detach() * log_prob_a_new - q).mean() return loss def update_net(self, loss, optimizer, net, net_share, scheduler): optimizer.zero_grad() if self.gpu: if self.args.alpha == 'auto': if net is not self.log_alpha: net.zero_grad() else: net.zero_grad() loss.backward() if self.args.alpha == 'auto': if net is self.log_alpha: if self.log_alpha_share.grad is None or self.log_alpha_share.grad == 0: self.log_alpha_share._grad = self.log_alpha.grad else: ensure_shared_grads(model=net, shared_model=net_share, gpu=self.gpu) else: ensure_shared_grads(model=net, 
shared_model=net_share, gpu=self.gpu) optimizer.step() scheduler.step(self.iteration) def target_q(self, r, done, q, q_std, q_next, log_prob_a_next): target_q = r + (1 - done) * self.args.gamma * ( q_next - self.alpha.detach() * log_prob_a_next) if self.args.distributional_Q: if self.args.adaptive_bound: target_max = q + 3 * q_std target_min = q - 3 * q_std target_q = torch.min(target_q, target_max) target_q = torch.max(target_q, target_min) difference = torch.clamp(target_q - q, -self.args.TD_bound, self.args.TD_bound) target_q_bound = q + difference self.weight = torch.le(torch.abs(target_q - q), self.args.TD_bound).detach() else: target_q_bound = target_q return target_q.detach(), target_q_bound.detach() def send_to_device(self, s, info, a, r, s_next, info_next, done, device): s = s.to(device) info = info.to(device) a = a.to(device) r = r.to(device) s_next = s_next.to(device) info_next = info_next.to(device) done = done.to(device) return s, info, a, r, s_next, info_next, done def run(self): local_iteration = 0 index = np.random.randint(0, self.args.num_buffers) while self.experience_out_queue[index].empty( ) and not self.stop_sign.value: index = np.random.randint(0, self.args.num_buffers) time.sleep(0.1) while not self.stop_sign.value: self.iteration = self.iteration_counter.value self.Q_net1.load_state_dict(self.Q_net1_share.state_dict()) self.Q_net1_target.load_state_dict( self.Q_net1_target_share.state_dict()) self.Q_net2.load_state_dict(self.Q_net2_share.state_dict()) self.Q_net2_target.load_state_dict( self.Q_net2_target_share.state_dict()) self.actor1.load_state_dict(self.actor1_share.state_dict()) self.actor1_target.load_state_dict( self.actor1_target_share.state_dict()) self.actor2.load_state_dict(self.actor2_share.state_dict()) self.actor2_target.load_state_dict( self.actor2_target_share.state_dict()) if self.args.alpha == 'auto': self.log_alpha = self.log_alpha_share.detach().clone( ).requires_grad_(True) self.alpha = self.log_alpha.exp().to(self.device) index = np.random.randint(0, self.args.num_buffers) while self.experience_out_queue[index].empty( ) and not self.stop_sign.value: index = np.random.randint(0, self.args.num_buffers) time.sleep(0.1) if not self.experience_out_queue[index].empty(): s, info, a, r, s_next, info_next, done = self.experience_out_queue[ index].get() s, info, a, r, s_next, info_next, done = self.send_to_device( s, info, a, r, s_next, info_next, done, self.device) q_1, q_std_1, _ = self.Q_net1.evaluate(s, info, a, device=self.device, min=False) if self.args.double_Q: q_2, q_std_2, _ = self.Q_net2.evaluate(s, info, a, device=self.device, min=False) smoothing_trick = False if not self.args.stochastic_actor: if self.args.policy_smooth: smoothing_trick = True a_new_1, log_prob_a_new_1, a_new_std_1 = self.actor1.evaluate( s, info, smooth_policy=False, device=self.device) a_next_1, log_prob_a_next_1, _ = self.actor1_target.evaluate( s_next, info_next, smooth_policy=smoothing_trick, device=self.device) if self.args.double_actor: a_new_2, log_prob_a_new_2, _ = self.actor2.evaluate( s, info, smooth_policy=False, device=self.device) a_next_2, log_prob_a_next_2, _ = self.actor2_target.evaluate( s_next, info_next, smooth_policy=smoothing_trick, device=self.device) if self.args.double_Q and self.args.double_actor: q_next_target_1, _, q_next_sample_1 = self.Q_net2_target.evaluate( s_next, info_next, a_next_1, device=self.device, min=False) q_next_target_2, _, _ = self.Q_net1_target.evaluate( s_next, info_next, a_next_2, device=self.device, min=False) target_q_1, 
target_q_1_bound = self.target_q( r, done, q_1.detach(), q_std_1.detach(), q_next_target_1.detach(), log_prob_a_next_1.detach()) target_q_2, target_q_2_bound = self.target_q( r, done, q_2.detach(), q_std_2.detach(), q_next_target_2.detach(), log_prob_a_next_2.detach()) else: q_next_1, _, q_next_sample_1 = self.Q_net1_target.evaluate( s_next, info_next, a_next_1, device=self.device, min=False) if self.args.double_Q: q_next_2, _, _ = self.Q_net2_target.evaluate( s_next, info_next, a_next_1, device=self.device, min=False) q_next_target_1 = torch.min(q_next_1, q_next_2) elif self.args.distributional_Q: q_next_target_1 = q_next_sample_1 else: q_next_target_1 = q_next_1 target_q_1, target_q_1_bound = self.target_q( r, done, q_1.detach(), q_std_1.detach(), q_next_target_1.detach(), log_prob_a_next_1.detach()) if self.args.double_Q and self.args.double_actor: q_object_1, _, _ = self.Q_net1.evaluate(s, info, a_new_1, device=self.device, min=False) q_object_2, _, _ = self.Q_net2.evaluate(s, info, a_new_2, device=self.device, min=False) else: q_new_1, _, _ = self.Q_net1.evaluate(s, info, a_new_1, device=self.device, min=False) if self.args.double_Q: q_new_2, _, _ = self.Q_net2.evaluate(s, info, a_new_1, device=self.device, min=False) q_object_1 = torch.min(q_new_1, q_new_2) elif self.args.distributional_Q: q_object_1 = q_new_1 else: q_object_1 = q_new_1 if local_iteration % self.args.delay_update == 0: if self.args.alpha == 'auto': alpha_loss = -(self.log_alpha * (log_prob_a_new_1.detach().cpu() + self.target_entropy)).mean() self.update_net(alpha_loss, self.alpha_optimizer, self.log_alpha, self.log_alpha_share, self.scheduler_alpha) q_loss_1 = self.get_qloss(q_1, q_std_1, target_q_1, target_q_1_bound) self.update_net(q_loss_1, self.Q_net1_optimizer, self.Q_net1, self.Q_net1_share, self.scheduler_Q_net1) if self.args.double_Q: if self.args.double_actor: q_loss_2 = self.get_qloss(q_2, q_std_2, target_q_2, target_q_2_bound) self.update_net(q_loss_2, self.Q_net2_optimizer, self.Q_net2, self.Q_net2_share, self.scheduler_Q_net2) else: q_loss_2 = self.get_qloss(q_2, q_std_2, target_q_1, target_q_1_bound) self.update_net(q_loss_2, self.Q_net2_optimizer, self.Q_net2, self.Q_net2_share, self.scheduler_Q_net2) if self.args.code_model == "train": if local_iteration % self.args.delay_update == 0: policy_loss_1 = self.get_policyloss( q_object_1, log_prob_a_new_1) self.update_net(policy_loss_1, self.actor1_optimizer, self.actor1, self.actor1_share, self.scheduler_actor1) slow_sync_param(self.actor1_share, self.actor1_target_share, self.args.tau, self.gpu) if self.args.double_actor: policy_loss_2 = self.get_policyloss( q_object_2, log_prob_a_new_2) self.update_net(policy_loss_2, self.actor2_optimizer, self.actor2, self.actor2_share, self.scheduler_actor2) slow_sync_param(self.actor2_share, self.actor2_target_share, self.args.tau, self.gpu) if local_iteration % self.args.delay_update == 0: slow_sync_param(self.Q_net1_share, self.Q_net1_target_share, self.args.tau, self.gpu) if self.args.double_Q: slow_sync_param(self.Q_net2_share, self.Q_net2_target_share, self.args.tau, self.gpu) with self.lock: self.iteration_counter.value += 1 local_iteration += 1 if self.iteration % self.args.save_model_period == 0 or ( self.iteration == 0 and self.agent_id == 0): torch.save( self.actor1.state_dict(), './' + self.args.env_name + '/method_' + str(self.args.method) + '/model/policy1_' + str(self.iteration) + '.pkl') torch.save( self.Q_net1.state_dict(), './' + self.args.env_name + '/method_' + str(self.args.method) + '/model/Q1_' 
+ str(self.iteration) + '.pkl') if self.args.alpha == 'auto': np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/model/log_alpha' + str(self.iteration), self.log_alpha.detach().cpu().numpy()) if self.args.double_Q: torch.save( self.Q_net2.state_dict(), './' + self.args.env_name + '/method_' + str(self.args.method) + '/model/Q2_' + str(self.iteration) + '.pkl') if self.args.double_actor: torch.save( self.actor2.state_dict(), './' + self.args.env_name + '/method_' + str(self.args.method) + '/model/policy2_' + str(self.iteration) + '.pkl') if self.iteration % 500 == 0 or self.iteration == 0 and self.agent_id == 0: print("agent", self.agent_id, "method", self.args.method, "iteration", self.iteration, "time", time.time() - self.init_time) print("loss_1", q_loss_1, "alpha", self.alpha, "lr", self.scheduler_Q_net1.get_lr(), self.scheduler_Q_net2.get_lr(), self.scheduler_actor1.get_lr(), self.scheduler_actor2.get_lr(), self.scheduler_alpha.get_lr()) print("q_std", q_std_1.t()[0][0:8]) print("a_std", a_new_std_1.t()[0][0:8])
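# --- Hedged sketch (assumption, not shown in this file): the Learner above relies on
# helpers named ensure_shared_grads() and slow_sync_param(). Minimal versions that are
# consistent with how they are called could be:
def ensure_shared_grads(model, shared_model, gpu=False):
    # Copy the locally computed gradients into the shared model so the shared optimizer can step.
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            return  # another process already filled the shared grads (CPU hogwild style)
        shared_param._grad = param.grad.cpu() if gpu else param.grad

def slow_sync_param(source_net, target_net, tau, gpu=False):
    # Polyak (soft) update of the shared target network: target <- (1 - tau) * target + tau * source.
    # The gpu flag is kept for signature compatibility; it is unused in this minimal version.
    with torch.no_grad():
        for source_param, target_param in zip(source_net.parameters(), target_net.parameters()):
            target_param.data.copy_((1.0 - tau) * target_param.data + tau * source_param.data)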
    # 1 Cart Velocity          -Inf     Inf
    int(check_bound(np.degrees(observation[2]), np.arange(-11, 11, 1))),
    # 2 Pole Angle             -24 deg  24 deg
    int(check_bound(observation[3], np.arange(-0.88, 0.88, 0.08)))
    # 3 Pole Velocity At Tip   -Inf     Inf
]

# Create Agent
actions = range(env.action_space.n)
agent = Agent(None, (25, 25, 25), actions)
temp_agent = agent.__copy__()

# Create Network
net_size = 128
net = QNet(env.observation_space.shape[0], env.action_space.n, net_size, device).to(device)
optimizer = optim.Adam(net.parameters(), lr=1e-3)
net.train()

ok = False
guts = 0
i_episode = 0
total = 0
loss = 0
guts_required = 100
guts_print_div = 10
big_data = [[], []]

print("Learning...")
while not ok:
    # Agent learning
    while guts < guts_required:
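# --- Hedged sketch (assumption): check_bound() is referenced in the discretization
# above but not defined in this fragment. Given how it is called (a scalar plus an
# array of bin edges, cast to int), it is presumably a clipped digitizer along these lines:
def check_bound(value, bins):
    """Index of the bin that `value` falls into, clipped to [0, len(bins)]."""
    return int(np.clip(np.digitize(value, bins), 0, len(bins)))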
class Evaluator(object): def __init__(self, args, shared_value, share_net): seed = args.seed np.random.seed(seed) torch.manual_seed(seed) eval_params = { 'obs_size': (160, 100), # screen size of cv2 window 'dt': 0.025, # time interval between two frames 'ego_vehicle_filter': 'vehicle.lincoln*', # filter for defining ego vehicle 'port': int(2000 + 3 * args.num_actors), # connection port 'task_mode': 'Straight', # mode of the task, [random, roundabout (only for Town03)] 'code_mode': 'test', 'max_time_episode': 100, # maximum timesteps per episode 'desired_speed': 15, # desired speed (m/s) 'max_ego_spawn_times': 100, # maximum times to spawn ego vehicle } self.stop_sign = shared_value[1] self.iteration_counter = shared_value[2] self.iteration = self.iteration_counter.value self.share_net = share_net self.args = args self.env = gym.make(args.env_name, params=eval_params) self.device = torch.device("cpu") self.actor = PolicyNet(args).to(self.device) self.Q_net1 = QNet(args).to(self.device) self.Q_net2 = QNet(args).to(self.device) self.actor_share = share_net[4] self.Q_net1_share = share_net[0] self.Q_net2_share = share_net[2] self.log_alpha_share = share_net[-1] self.alpha = np.exp(self.log_alpha_share.detach().item() ) if args.alpha == 'auto' else 0 self.evaluation_interval = 20000 self.max_state_num_evaluated_in_an_episode = 50 # 500 self.episode_num_evaluation = 5 self.episode_num_test = 5 self.time = time.time() self.list_of_n_episode_rewards_history = [] self.time_history = [] self.alpha_history = [] self.average_return_with_diff_base_history = [] self.average_reward_history = [] self.iteration_history = [] self.evaluated_Q_mean_history = [] self.evaluated_Q_std_history = [] self.true_gamma_return_mean_history = [] self.policy_entropy_history = [] self.a_std_history = [] self.a_abs_history = [] def average_max_n(self, list_for_average, n): sorted_list = sorted(list_for_average, reverse=True) return sum(sorted_list[:n]) / n def run_an_episode(self, deterministic): #state_list = [] action_list = [] log_prob_list = [] reward_list = [] evaluated_Q_list = [] Q_std_list = [] a_std_list = [] done = 0 state, info = self.env.reset() while not done and len(reward_list) < (self.args.max_step - 1): state_tensor = torch.FloatTensor(state.copy()).float().to( self.device) info_tensor = torch.FloatTensor(info.copy()).float().to( self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) # 3, 256, 256 u, log_prob, a_std = self.actor.get_action( state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), deterministic) log_prob_list.append(log_prob) a_std_list.append(a_std) if self.args.double_Q and not self.args.double_actor: q = torch.min( self.Q_net1.evaluate( state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0], self.Q_net2.evaluate( state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0]) else: q, q_std, _ = self.Q_net1.evaluate( state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device)) evaluated_Q_list.append(q.detach().item()) if self.args.distributional_Q: Q_std_list.append(q_std.detach().item()) else: Q_std_list.append(0) u = u.squeeze(0) state, reward, done, info = self.env.step(u) # self.env.render(mode='human') action_list.append(u) reward_list.append(reward * self.args.reward_scale) if not deterministic: entropy_list = list(-self.alpha * np.array(log_prob_list)) true_gamma_return_list = cal_gamma_return_of_an_episode( reward_list, 
entropy_list, self.args.gamma) policy_entropy = -sum(log_prob_list) / len(log_prob_list) a_std_mean = np.mean(np.array(a_std_list), axis=0) a_abs_mean = np.mean(np.abs(np.array(action_list)), axis=0) return dict( #state_list=np.array(state_list), #action_list=np.array(action_list), log_prob_list=np.array(log_prob_list), policy_entropy=policy_entropy, #reward_list=np.array(reward_list), a_std_mean=a_std_mean, a_abs_mean=a_abs_mean, evaluated_Q_list=np.array(evaluated_Q_list), Q_std_list=np.array(Q_std_list), true_gamma_return_list=true_gamma_return_list, ) else: episode_return = sum(reward_list) / self.args.reward_scale episode_len = len(reward_list) return dict(episode_return=episode_return, episode_len=episode_len) def run_n_episodes(self, n, max_state, deterministic): n_episode_state_list = [] n_episode_action_list = [] n_episode_log_prob_list = [] n_episode_reward_list = [] n_episode_evaluated_Q_list = [] n_episode_Q_std_list = [] n_episode_true_gamma_return_list = [] n_episode_return_list = [] n_episode_len_list = [] n_episode_policyentropy_list = [] n_episode_a_std_list = [] for _ in range(n): episode_info = self.run_an_episode(deterministic) # n_episode_state_list.append(episode_info['state_list']) # n_episode_action_list.append(episode_info['action_list']) # n_episode_log_prob_list.append(episode_info['log_prob_list']) # n_episode_reward_list.append(episode_info['reward_list']) if not deterministic: n_episode_evaluated_Q_list.append( episode_info['evaluated_Q_list']) n_episode_Q_std_list.append(episode_info['Q_std_list']) n_episode_true_gamma_return_list.append( episode_info['true_gamma_return_list']) n_episode_policyentropy_list.append( episode_info['policy_entropy']) n_episode_a_std_list.append(episode_info['a_std_mean']) n_episode_action_list.append(episode_info['a_abs_mean']) else: n_episode_return_list.append(episode_info['episode_return']) n_episode_len_list.append(episode_info['episode_len']) if not deterministic: average_policy_entropy = sum(n_episode_policyentropy_list) / len( n_episode_policyentropy_list) average_a_std = np.mean(np.array(n_episode_a_std_list), axis=0) average_a_abs = np.mean(np.array(n_episode_action_list), axis=0) # n_episode_evaluated_Q_list_history = list(map(lambda x: x['n_episode_evaluated_Q_list'], n_episodes_info_history)) # n_episode_true_gamma_return_list_history = list(map(lambda x: x['n_episode_true_gamma_return_list'], n_episodes_info_history)) def concat_interest_epi_part_of_one_ite_and_mean(list_of_n_epi): tmp = list(copy.deepcopy(list_of_n_epi)) tmp[0] = tmp[0] if len( tmp[0]) <= max_state else tmp[0][:max_state] def reduce_fuc(a, b): return np.concatenate( [a, b]) if len(b) < max_state else np.concatenate( [a, b[:max_state]]) interest_epi_part_of_one_ite = reduce(reduce_fuc, tmp) return sum(interest_epi_part_of_one_ite) / len( interest_epi_part_of_one_ite) evaluated_Q_mean = concat_interest_epi_part_of_one_ite_and_mean( np.array(n_episode_evaluated_Q_list)) evaluated_Q_std = concat_interest_epi_part_of_one_ite_and_mean( np.array(n_episode_Q_std_list)) true_gamma_return_mean = concat_interest_epi_part_of_one_ite_and_mean( np.array(n_episode_true_gamma_return_list)) return dict(evaluated_Q_mean=evaluated_Q_mean, true_gamma_return_mean=true_gamma_return_mean, evaluated_Q_std=evaluated_Q_std, n_episode_reward_list=np.array(n_episode_reward_list), policy_entropy=average_policy_entropy, a_std=average_a_std, a_abs=average_a_abs) else: average_return_with_diff_base = np.array([ self.average_max_n(n_episode_return_list, x) for x in [1, 
self.episode_num_test - 2, self.episode_num_test] ]) average_reward = sum(n_episode_return_list) / sum( n_episode_len_list) return dict( n_episode_reward_list=np.array(n_episode_reward_list), average_return_with_diff_base=average_return_with_diff_base, average_reward=average_reward, ) def run(self): while not self.stop_sign.value: if self.iteration_counter.value % self.evaluation_interval == 0: self.alpha = np.exp(self.log_alpha_share.detach().item() ) if self.args.alpha == 'auto' else 0 self.iteration = self.iteration_counter.value self.actor.load_state_dict(self.actor_share.state_dict()) self.Q_net1.load_state_dict(self.Q_net1_share.state_dict()) self.Q_net2.load_state_dict(self.Q_net2_share.state_dict()) delta_time = time.time() - self.time self.time = time.time() n_episode_info = self.run_n_episodes( self.episode_num_evaluation, self.max_state_num_evaluated_in_an_episode, False) self.iteration_history.append(self.iteration) self.evaluated_Q_mean_history.append( n_episode_info['evaluated_Q_mean']) self.evaluated_Q_std_history.append( n_episode_info['evaluated_Q_std']) self.true_gamma_return_mean_history.append( n_episode_info['true_gamma_return_mean']) self.time_history.append(delta_time) # self.list_of_n_episode_rewards_history.append(list_of_n_episode_rewards) self.alpha_history.append(self.alpha.item()) self.policy_entropy_history.append( n_episode_info['policy_entropy']) self.a_std_history.append(n_episode_info['a_std']) self.a_abs_history.append(n_episode_info['a_abs']) n_episode_info_test = self.run_n_episodes( self.episode_num_test, self.max_state_num_evaluated_in_an_episode, True) self.average_return_with_diff_base_history.append( n_episode_info_test['average_return_with_diff_base']) self.average_reward_history.append( n_episode_info_test['average_reward']) print('Saving evaluation results of the {} iteration.'.format( self.iteration)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/iteration', np.array(self.iteration_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/evaluated_Q_mean', np.array(self.evaluated_Q_mean_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/evaluated_Q_std', np.array(self.evaluated_Q_std_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/true_gamma_return_mean', np.array(self.true_gamma_return_mean_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/time', np.array(self.time_history)) # np.save('./' + self.args.env_name + '/method_' + str(self.args.method) + '/result/list_of_n_episode_rewards', # np.array(self.list_of_n_episode_rewards_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/average_return_with_diff_base', np.array(self.average_return_with_diff_base_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/average_reward', np.array(self.average_reward_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/alpha', np.array(self.alpha_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/policy_entropy', np.array(self.policy_entropy_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/a_std', np.array(self.a_std_history)) np.save( './' + self.args.env_name + '/method_' + str(self.args.method) + '/result/a_abs', np.array(self.a_abs_history)) # 
plot_online(self.args.env_name, self.args.method, self.args.method_name, # self.max_state_num_evaluated_in_an_episode) if self.iteration >= self.args.max_train: self.stop_sign.value = 1 break
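# --- Hedged sketch (assumption): cal_gamma_return_of_an_episode() is imported from a
# utility module that is not shown here. Matching the Monte-Carlo "Q_real" computation
# used by the Simulation classes in this file (reward at step t plus the discounted
# entropy bonuses of the following steps), a single backward pass could be:
def cal_gamma_return_of_an_episode(reward_list, entropy_list, gamma):
    n = len(reward_list)
    returns = np.zeros(n)
    g = 0.0
    for t in reversed(range(n)):
        bonus = entropy_list[t + 1] if t + 1 < n else 0.0  # entropy bonus of the next step
        g = reward_list[t] + gamma * (bonus + g)
        returns[t] = g
    return returns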
def main(method): args = built_parser(method=method) env = gym.make(args.env_name) state_dim = env.observation_space.shape action_dim = env.action_space.shape[0] args.state_dim = state_dim args.action_dim = action_dim action_high = env.action_space.high action_low = env.action_space.low args.action_high = action_high.tolist() args.action_low = action_low.tolist() args.seed = np.random.randint(0, 30) args.init_time = time.time() if args.alpha == 'auto' and args.target_entropy == 'auto': delta_a = np.array(args.action_high, dtype=np.float32) - np.array( args.action_low, dtype=np.float32) args.target_entropy = -1 * args.action_dim #+ sum(np.log(delta_a/2)) Q_net1 = QNet(args) Q_net1.train() Q_net1.share_memory() Q_net1_target = QNet(args) Q_net1_target.train() Q_net1_target.share_memory() Q_net2 = QNet(args) Q_net2.train() Q_net2.share_memory() Q_net2_target = QNet(args) Q_net2_target.train() Q_net2_target.share_memory() actor1 = PolicyNet(args) actor1.train() actor1.share_memory() actor1_target = PolicyNet(args) actor1_target.train() actor1_target.share_memory() actor2 = PolicyNet(args) actor2.train() actor2.share_memory() actor2_target = PolicyNet(args) actor2_target.train() actor2_target.share_memory() Q_net1_target.load_state_dict(Q_net1.state_dict()) Q_net2_target.load_state_dict(Q_net2.state_dict()) actor1_target.load_state_dict(actor1.state_dict()) actor2_target.load_state_dict(actor2.state_dict()) Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(), lr=args.critic_lr) Q_net1_optimizer.share_memory() Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(), lr=args.critic_lr) Q_net2_optimizer.share_memory() actor1_optimizer = my_optim.SharedAdam(actor1.parameters(), lr=args.actor_lr) actor1_optimizer.share_memory() actor2_optimizer = my_optim.SharedAdam(actor2.parameters(), lr=args.actor_lr) actor2_optimizer.share_memory() log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True) log_alpha.share_memory_() alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr) alpha_optimizer.share_memory() share_net = [ Q_net1, Q_net1_target, Q_net2, Q_net2_target, actor1, actor1_target, actor2, actor2_target, log_alpha ] share_optimizer = [ Q_net1_optimizer, Q_net2_optimizer, actor1_optimizer, actor2_optimizer, alpha_optimizer ] experience_in_queue = [] experience_out_queue = [] for i in range(args.num_buffers): experience_in_queue.append(Queue(maxsize=10)) experience_out_queue.append(Queue(maxsize=10)) shared_queue = [experience_in_queue, experience_out_queue] step_counter = mp.Value('i', 0) stop_sign = mp.Value('i', 0) iteration_counter = mp.Value('i', 0) shared_value = [step_counter, stop_sign, iteration_counter] lock = mp.Lock() procs = [] if args.code_model == "train": for i in range(args.num_actors): procs.append( Process(target=actor_agent, args=(args, shared_queue, shared_value, [actor1, Q_net1], lock, i))) for i in range(args.num_buffers): procs.append( Process(target=buffer, args=(args, shared_queue, shared_value, i))) procs.append( Process(target=evaluate_agent, args=(args, shared_value, share_net))) for i in range(args.num_learners): #device = torch.device("cuda") device = torch.device("cpu") procs.append( Process(target=leaner_agent, args=(args, shared_queue, shared_value, share_net, share_optimizer, device, lock, i))) elif args.code_model == "simu": procs.append(Process(target=simu_agent, args=(args, shared_value))) for p in procs: p.start() for p in procs: p.join()
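# --- Hedged sketch (assumption): the `buffer` process target spawned above is defined
# elsewhere. Based on how the Learner consumes experience_out_queue (batches unpacked as
# 7-tuples of tensors), a minimal version could drain single transitions from
# experience_in_queue[i] into a bounded replay memory and push sampled batches.
# The argument names max_buffer_size and batch_size are assumptions.
def buffer(args, shared_queue, shared_value, i):
    from collections import deque
    in_queue, out_queue = shared_queue[0][i], shared_queue[1][i]
    stop_sign = shared_value[1]
    memory = deque(maxlen=getattr(args, 'max_buffer_size', 100000))
    batch_size = getattr(args, 'batch_size', 256)
    while not stop_sign.value:
        while not in_queue.empty():
            memory.append(in_queue.get())  # one transition per queue item
        if len(memory) >= batch_size and not out_queue.full():
            batch = random.sample(list(memory), batch_size)
            # collate: each transition is assumed to be a tuple of per-field tensors
            out_queue.put(tuple(torch.stack(field) for field in zip(*batch)))
        else:
            time.sleep(0.01)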
class Actor(): def __init__(self, args, shared_queue, shared_value, lock, i): super(Actor, self).__init__() self.agent_id = i seed = args.seed + np.int64(self.agent_id) np.random.seed(seed) torch.manual_seed(seed) self.experience_queue = shared_queue[0] self.policy_param_queue = shared_queue[1] self.q_param_queue = shared_queue[2] self.counter = shared_value[0] self.stop_sign = shared_value[1] self.lock = lock self.env = gym.make(args.env_name) self.args = args self.device = torch.device("cpu") self.actor = PolicyNet(args.state_dim, args.num_hidden_cell, args.action_high, args.action_low, args.NN_type).to(self.device) self.Q_net1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device) def update_actor_net(self, current_dict, actor_net): params_target = get_flat_params_from(actor_net) params = get_flat_params_from_dict(current_dict) set_flat_params_to(actor_net, (1 - self.args.syn_tau) * params_target + self.args.syn_tau * params) def load_param(self): if self.policy_param_queue.empty(): #pass #print("agent", self.agent_id, "is waiting param") time.sleep(0.5) #self.load_param() else: param = self.policy_param_queue.get() if self.args.syn_method == "copy": self.actor.load_state_dict(param) elif self.args.syn_method == "slow": self.update_actor_net(param, self.actor) if self.q_param_queue.empty(): time.sleep(0.5) #self.load_param() else: param = self.q_param_queue.get() self.Q_net1.load_state_dict(param) def put_data(self): if not self.stop_sign.value: if self.experience_queue.full(): #print("agent", self.agent_id, "is waiting queue space") time.sleep(0.5) self.put_data() else: self.experience_queue.put( (self.last_state, self.last_u, [self.reward], self.state, [self.micro_step], [self.done], self.TD.detach().cpu().numpy().squeeze())) else: pass def run(self): time_init = time.time() step = 0 self.micro_step = 0 while not self.stop_sign.value: self.state = self.env.reset() self.episode_step = 0 state_tensor = torch.FloatTensor(self.state.copy()).float().to( self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), False) q_1 = self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0] self.u = self.u.squeeze(0) self.last_state = self.state.copy() self.last_u = self.u.copy() last_q_1 = q_1 for i in range(self.args.max_step): self.state, self.reward, self.done, _ = self.env.step(self.u) state_tensor = torch.FloatTensor(self.state.copy()).float().to( self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, log_prob = self.actor.get_action( state_tensor.unsqueeze(0), False) q_1 = self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0] self.u = self.u.squeeze(0) if self.episode_step > 0: self.TD = self.reward + ( 1 - self.done) * self.args.gamma * q_1 - last_q_1 self.put_data() self.last_state = self.state.copy() self.last_u = self.u.copy() last_q_1 = q_1 with self.lock: self.counter.value += 1 if self.done == True: break if step % self.args.load_param_period == 0: self.load_param() step += 1 self.episode_step += 1
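# --- Hedged sketch (assumption): get_flat_params_from(), get_flat_params_from_dict()
# and set_flat_params_to() are used by update_actor_net() above but defined elsewhere.
# Minimal versions consistent with that usage (assuming the policy network has no
# registered buffers, so state_dict order matches parameters() order):
def get_flat_params_from(model):
    return torch.cat([param.data.view(-1) for param in model.parameters()])

def get_flat_params_from_dict(state_dict):
    return torch.cat([tensor.view(-1) for tensor in state_dict.values()])

def set_flat_params_to(model, flat_params):
    prev_ind = 0
    for param in model.parameters():
        flat_size = int(np.prod(list(param.size())))
        param.data.copy_(flat_params[prev_ind:prev_ind + flat_size].view(param.size()))
        prev_ind += flat_size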
def main():
    # parameters for the gym_carla environment
    params = {
        'display_size': 256,  # screen size of bird-eye render
        'obs_size': 128,  # screen size of cv2 window
        'dt': 0.1,  # time interval between two frames
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        # 'town': 'Town01',  # which town to simulate
        'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
        'code_mode': 'test',
        'max_time_episode': 5000,  # maximum timesteps per episode
        'desired_speed': 8,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
    }

    # Set gym-carla environment
    env = gym.make('carla-v0', params=params)

    # load nets
    device = torch.device('cpu')
    args = Args()
    actor = PolicyNet(args).to(device)
    actor.load_state_dict(torch.load('./policy1_500000.pkl', map_location='cpu'))
    Q_net1 = QNet(args).to(device)
    Q_net1.load_state_dict(torch.load('./Q1_500000.pkl', map_location='cpu'))

    obs, info_dict = env.reset()
    info = info_dict_to_array(info_dict)
    state_tensor = torch.FloatTensor(obs.copy()).float().to(device)
    info_tensor = torch.FloatTensor(info.copy()).float().to(device)
    # print(env.ego.get_location())
    tic = time.time()
    done = False
    ret = 0
    start = carla.Location(x=env.start[0], y=env.start[1], z=0.22)
    end = carla.Location(x=env.dest[0], y=env.dest[1], z=0.22)
    if args.NN_type == "CNN":
        state_tensor = state_tensor.permute(2, 0, 1)

    while not done:
        tac = time.time()
        u, log_prob = actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), True)
        u = u.squeeze(0)
        obs, r, done, info_dict = env.step(u)
        # convert the info returned by this step (not the stale dict from reset)
        info = info_dict_to_array(info_dict)
        state_tensor = torch.FloatTensor(obs.copy()).float().to(device)
        if args.NN_type == "CNN":
            state_tensor = state_tensor.permute(2, 0, 1)
        info_tensor = torch.FloatTensor(info.copy()).float().to(device)
        ret += r
        cv2.imshow("camera img", obs)
        cv2.waitKey(1)
        # print(info_dict['acceleration_t'].shape)
        env.world.debug.draw_point(start)
        env.world.debug.draw_point(end)
        if done:
            toc = time.time()
            print("An episode took %f s" % (toc - tic))
            print("total reward is", ret)
            print("time steps", env.time_step)
            env.close()
            # restart the episode from a fresh observation instead of the stale tensors
            obs, info_dict = env.reset()
            info = info_dict_to_array(info_dict)
            state_tensor = torch.FloatTensor(obs.copy()).float().to(device)
            if args.NN_type == "CNN":
                state_tensor = state_tensor.permute(2, 0, 1)
            info_tensor = torch.FloatTensor(info.copy()).float().to(device)
            ret = 0
            tic = time.time()
            # print(env.ego.get_location())
            done = False
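# --- Hedged sketch (assumption): info_dict_to_array() is imported from elsewhere in this
# repo. Given how `info` is consumed above (converted straight to a FloatTensor), it
# presumably flattens the relevant state_info entries into one 1-D float vector. The key
# names are taken from state_info usages elsewhere in this file; their ordering here is
# purely illustrative:
def info_dict_to_array(info):
    keys = ['velocity_t', 'acceleration_t', 'delta_yaw_t', 'dyaw_dt_t',
            'lateral_dist_t', 'action_t_1']  # hypothetical ordering
    return np.concatenate([np.ravel(np.asarray(info[k], dtype=np.float32)) for k in keys])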
class Simulation(): def __init__(self, args, shared_value): super(Simulation, self).__init__() seed = args.seed np.random.seed(seed) torch.manual_seed(seed) self.stop_sign = shared_value[1] self.args = args self.env = gym.make(args.env_name) self.device = torch.device("cpu") self.load_index = self.args.max_train self.actor = PolicyNet(args).to(self.device) self.actor.load_state_dict( torch.load('./' + self.args.env_name + '/method_' + str(self.args.method) + '/model/policy1_' + str(self.load_index) + '.pkl', map_location='cpu')) self.Q_net1 = QNet(args).to(self.device) self.Q_net1.load_state_dict( torch.load('./' + self.args.env_name + '/method_' + str(self.args.method) + '/model/Q1_' + str(self.load_index) + '.pkl', map_location='cpu')) if self.args.double_Q: self.Q_net2 = QNet(args).to(self.device) self.Q_net2.load_state_dict( torch.load('./' + self.args.env_name + '/method_' + str(self.args.method) + '/model/Q2_' + str(self.load_index) + '.pkl', map_location='cpu')) self.test_step = 0 self.save_interval = 10000 self.iteration = 0 self.reward_history = [] self.entropy_history = [] self.epoch_history = [] self.done_history = [] self.Q_real_history = [] self.Q_history = [] self.Q_std_history = [] def run(self): alpha = 0.004 step = 0 while True: self.state = self.env.reset() self.episode_step = 0 for i in range(300): state_tensor = torch.FloatTensor(self.state.copy()).float().to( self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, log_prob, _ = self.actor.get_action( state_tensor.unsqueeze(0), True) q = self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0] if self.args.double_Q: q = torch.min( self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to( self.device))[0], self.Q_net2(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to( self.device))[0]) self.u = self.u.squeeze(0) self.state, self.reward, self.done, _ = self.env.step(self.u) self.Q_history.append(q.detach().item()) self.reward_history.append(self.reward) self.done_history.append(self.done) self.entropy_history.append(log_prob) if step % 10000 >= 0 and step % 10000 <= 9999: self.env.render(mode='human') if self.done == True: time.sleep(1) print("!!!!!!!!!!!!!!!") break step += 1 self.episode_step += 1 if self.done == True: pass #break print(self.reward_history) for i in range(len(self.Q_history)): a = 0 for j in range(i, len(self.Q_history), 1): a += pow(self.args.gamma, j - i) * self.reward_history[j] for z in range(i + 1, len(self.Q_history), 1): a -= alpha * pow(self.args.gamma, z - i) * self.entropy_history[z] self.Q_real_history.append(a) plt.figure() x = np.arange(0, len(self.Q_history), 1) plt.plot(x, np.array(self.Q_history), 'r', linewidth=2.0) plt.plot(x, np.array(self.Q_real_history), 'k', linewidth=2.0) plt.show()
def main(method): params = { 'obs_size': (160, 100), # screen size of cv2 window 'dt': 0.025, # time interval between two frames 'ego_vehicle_filter': 'vehicle.lincoln*', # filter for defining ego vehicle 'port': 2000, # connection port 'task_mode': 'Straight', # mode of the task, [random, roundabout (only for Town03)] 'code_mode': 'train', 'max_time_episode': 100, # maximum timesteps per episode 'desired_speed': 15, # desired speed (m/s) 'max_ego_spawn_times': 100, # maximum times to spawn ego vehicle } args = built_parser(method=method) env = gym.make(args.env_name, params=params) state_dim = env.state_space.shape action_dim = env.action_space.shape[0] args.state_dim = state_dim args.action_dim = action_dim action_high = env.action_space.high action_low = env.action_space.low args.action_high = action_high.tolist() args.action_low = action_low.tolist() args.seed = np.random.randint(0, 30) args.init_time = time.time() num_cpu = mp.cpu_count() print(state_dim, action_dim, action_high, num_cpu) if args.alpha == 'auto' and args.target_entropy == 'auto': delta_a = np.array(args.action_high, dtype=np.float32) - np.array( args.action_low, dtype=np.float32) args.target_entropy = -1 * args.action_dim # + sum(np.log(delta_a/2)) Q_net1 = QNet(args) Q_net1.train() Q_net1.share_memory() Q_net1_target = QNet(args) Q_net1_target.train() Q_net1_target.share_memory() Q_net2 = QNet(args) Q_net2.train() Q_net2.share_memory() Q_net2_target = QNet(args) Q_net2_target.train() Q_net2_target.share_memory() actor1 = PolicyNet(args) print("Network inited") if args.code_model == "eval": actor1.load_state_dict( torch.load('./' + args.env_name + '/method_' + str(args.method) + '/model/policy_' + str(args.max_train) + '.pkl')) actor1.train() actor1.share_memory() actor1_target = PolicyNet(args) actor1_target.train() actor1_target.share_memory() actor2 = PolicyNet(args) actor2.train() actor2.share_memory() actor2_target = PolicyNet(args) actor2_target.train() actor2_target.share_memory() print("Network set") Q_net1_target.load_state_dict(Q_net1.state_dict()) Q_net2_target.load_state_dict(Q_net2.state_dict()) actor1_target.load_state_dict(actor1.state_dict()) actor2_target.load_state_dict(actor2.state_dict()) print("Network loaded!") Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(), lr=args.critic_lr) Q_net1_optimizer.share_memory() Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(), lr=args.critic_lr) Q_net2_optimizer.share_memory() actor1_optimizer = my_optim.SharedAdam(actor1.parameters(), lr=args.actor_lr) actor1_optimizer.share_memory() actor2_optimizer = my_optim.SharedAdam(actor2.parameters(), lr=args.actor_lr) actor2_optimizer.share_memory() log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True) log_alpha.share_memory_() alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr) alpha_optimizer.share_memory() print("Optimizer done") share_net = [ Q_net1, Q_net1_target, Q_net2, Q_net2_target, actor1, actor1_target, actor2, actor2_target, log_alpha ] share_optimizer = [ Q_net1_optimizer, Q_net2_optimizer, actor1_optimizer, actor2_optimizer, alpha_optimizer ] experience_in_queue = [] experience_out_queue = [] for i in range(args.num_buffers): experience_in_queue.append(Queue(maxsize=10)) experience_out_queue.append(Queue(maxsize=10)) shared_queue = [experience_in_queue, experience_out_queue] step_counter = mp.Value('i', 0) stop_sign = mp.Value('i', 0) iteration_counter = mp.Value('i', 0) shared_value = [step_counter, stop_sign, iteration_counter] lock = mp.Lock() procs = [] 
if args.code_model == "train": for i in range(args.num_learners): if i % 2 == 0: device = torch.device("cuda:1") else: device = torch.device("cuda:0") # device = torch.device("cpu") procs.append( Process(target=leaner_agent, args=(args, shared_queue, shared_value, share_net, share_optimizer, device, lock, i))) for i in range(args.num_actors): procs.append( Process(target=actor_agent, args=(args, shared_queue, shared_value, [actor1, Q_net1], lock, i))) for i in range(args.num_buffers): procs.append( Process(target=buffer, args=(args, shared_queue, shared_value, i))) procs.append( Process(target=evaluate_agent, args=(args, shared_value, share_net))) elif args.code_model == "simu": procs.append(Process(target=simu_agent, args=(args, shared_value))) for p in procs: p.start() for p in procs: p.join()
class Simulation(): def __init__(self, args, shared_queue,shared_value): super(Simulation, self).__init__() seed = args.seed np.random.seed(seed) torch.manual_seed(seed) self.policy_test_queue = shared_queue[3] self.stop_sign = shared_value[1] self.args = args self.env = gym.make(args.env_name) self.device = torch.device("cpu") self.load_index = 20000 self.actor = PolicyNet(args.state_dim, args.num_hidden_cell, args.action_high, args.action_low,args.NN_type).to(self.device) self.actor.load_state_dict(torch.load('./data/method_' + str(1) + '/model/policy_' + str(self.load_index) + '.pkl')) self.Q_net1_m0 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device) self.Q_net1_m0.load_state_dict(torch.load('./data/method_' + str(0) + '/model/Q1_' + str(self.load_index) + '.pkl')) self.Q_net1_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device) self.Q_net1_m1.load_state_dict(torch.load('./data/method_' + str(1) + '/model/Q1_' + str(self.load_index) + '.pkl')) self.Q_net2_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device) self.Q_net2_m1.load_state_dict(torch.load('./data/method_' + str(1) + '/model/Q2_' + str(self.load_index) + '.pkl')) self.Q_net1_m2 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device) self.Q_net1_m2.load_state_dict(torch.load('./data/method_' + str(2) + '/model/Q1_' + str(self.load_index) + '.pkl')) self.test_step = 0 self.save_interval = 10000 self.iteration = 0 self.reward_history = [] self.entropy_history = [] self.epoch_history =[] self.done_history = [] self.Q_real_history = [] self.Q_m0_history =[] self.Q_m1_history = [] self.Q_m2_history = [] self.Q_std_m2_history = [] def load_param(self): if self.policy_test_queue.empty(): pass else: self.iteration, param = self.policy_test_queue.get() self.actor.load_state_dict(param) def run(self): step = 0 while True: self.state = self.env.reset() self.episode_step = 0 state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), False) for i in range(self.args.max_step): q_m0 = self.Q_net1_m0(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0] q_m1 = torch.min( self.Q_net1_m1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0], self.Q_net2_m1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0]) q_m2, q_std, _ = self.Q_net1_m2.evaluate(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device)) self.Q_m0_history.append(q_m0.detach().item()) self.Q_m1_history.append(q_m1.detach().item()) self.Q_m2_history.append(q_m2.detach().item()) self.Q_std_m2_history.append(q_std.detach().item()) self.u = self.u.squeeze(0) self.state, self.reward, self.done, _ = self.env.step(self.u) self.reward_history.append(self.reward) self.done_history.append(self.done) self.entropy_history.append(log_prob) if step%10000 >=0 and step%10000 <=9999: self.env.render(mode='human') state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device) if self.args.NN_type == "CNN": state_tensor = state_tensor.permute(2, 0, 1) self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), False) if self.done == True: time.sleep(1) print("!!!!!!!!!!!!!!!") break step += 1 self.episode_step += 1 if self.done == True: break for i in range(len(self.Q_m0_history)): a = 0 for 
j in range(i, len(self.Q_m0_history), 1): a += pow(self.args.gamma, j-i)*self.reward_history[j] for z in range(i+1, len(self.Q_m0_history), 1): a -= self.args.alpha * pow(self.args.gamma, z-i) * self.entropy_history[z] self.Q_real_history.append(a) print(self.reward_history) print(self.entropy_history) print(self.Q_m2_history) print(self.Q_std_m2_history) plt.figure() x = np.arange(0,len(self.Q_m0_history),1) plt.plot(x, np.array(self.Q_m0_history), 'r', linewidth=2.0) plt.plot(x, np.array(self.Q_m1_history), 'g', linewidth=2.0) plt.plot(x, np.array(self.Q_m2_history), 'b', linewidth=2.0) plt.plot(x, np.array(self.Q_real_history), 'k', linewidth=2.0) plt.show()
class Simulation():
    def __init__(self, args, shared_value):
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        simu_params = {
            'number_of_vehicles': 0,
            'number_of_walkers': 0,
            'obs_size': (160, 100),                    # screen size of cv2 window
            'dt': 0.025,                               # time interval between two frames
            'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': 2000,                              # connection port
            'task_mode': 'Straight',                   # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 100,                   # maximum timesteps per episode
            'desired_speed': 15,                       # desired speed (m/s)
            'max_ego_spawn_times': 100,                # maximum times to spawn ego vehicle
        }
        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name, params=simu_params)
        self.device = torch.device("cpu")
        self.load_index = self.args.max_train
        # self.load_index = 40000

        self.actor = PolicyNet(args).to(self.device)
        self.actor.load_state_dict(torch.load('./' + self.args.env_name + '/method_' + str(self.args.method)
                                              + '/model/policy1_' + str(self.load_index) + '.pkl', map_location='cpu'))
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net1.load_state_dict(torch.load('./' + self.args.env_name + '/method_' + str(self.args.method)
                                               + '/model/Q1_' + str(self.load_index) + '.pkl', map_location='cpu'))
        if self.args.double_Q:
            self.Q_net2 = QNet(args).to(self.device)
            self.Q_net2.load_state_dict(torch.load('./' + self.args.env_name + '/method_' + str(self.args.method)
                                                   + '/model/Q2_' + str(self.load_index) + '.pkl', map_location='cpu'))

        self.test_step = 0
        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history = []
        self.done_history = []
        self.Q_real_history = []
        self.Q_history = []
        self.Q_std_history = []

    def run(self):
        alpha = 0.004
        step = 0
        summaryFlag = True
        while True:
            self.state, self.info = self.env.reset()
            self.episode_step = 0
            state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device)
            info_tensor = torch.FloatTensor(self.info.copy()).float().to(self.device)
            if self.args.NN_type == "CNN":
                state_tensor = state_tensor.permute(2, 0, 1)
            self.u, log_prob, std = self.actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), True)
            for i in range(500):
                q = self.Q_net1(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                                torch.FloatTensor(self.u).to(self.device))[0]
                if self.args.double_Q:
                    q = torch.min(q, self.Q_net2(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                                                 torch.FloatTensor(self.u).to(self.device))[0])
                self.Q_history.append(q.detach().item())
                self.u = self.u.squeeze(0)
                # TODO
                if summaryFlag:
                    with SummaryWriter(log_dir='./logs') as writer:
                        # writer.add_scalar('random', np.random.randint(0, 10), i)
                        v = self.env.ego.get_velocity()
                        v = np.array([v.x, v.y, v.z])  # currently unused; state_info values are logged instead
                        writer.add_scalar('v_x', self.env.state_info['velocity_t'][0], i)
                        writer.add_scalar('v_y', self.env.state_info['velocity_t'][1], i)
                        writer.add_scalar('acceleration_x', self.env.state_info['acceleration_t'][0], i)
                        writer.add_scalar('acceleration_y', self.env.state_info['acceleration_t'][1], i)
                        # writer.add_scalar('distance2terminal', self.env.state_info['dist_to_dest'], i)
                        # writer.add_scalar('delta_yaw', self.state[5]*2, i)
                        writer.add_scalar('angular_speed_z', self.env.state_info['dyaw_dt_t'], i)
                        # writer.add_scalar('lateral_dist', self.state[7]/10, i)
                        writer.add_scalar('action_throttle', self.env.state_info['action_t_1'][0], i)
                        writer.add_scalar('action_steer', self.env.state_info['action_t_1'][1], i)
                        writer.add_scalar('delta_yaw', self.env.state_info['delta_yaw_t'], i)
                        writer.add_scalar('dist2center', self.env.state_info['lateral_dist_t'], i)

                self.state, self.reward, self.done, self.info = self.env.step(self.u)
                self.reward_history.append(self.reward)
                self.done_history.append(self.done)
                self.entropy_history.append(log_prob)

                # render the camera image
                cv2.imshow("camera img", self.state.squeeze())
                cv2.waitKey(1)
                # if step % 10000 >= 0 and step % 10000 <= 9999:
                #     self.env.render(mode='human')

                state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device)
                info_tensor = torch.FloatTensor(self.info.copy()).float().to(self.device)
                if self.args.NN_type == "CNN":
                    state_tensor = state_tensor.permute(2, 0, 1)
                self.u, log_prob, std = self.actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), True)
                if self.done or self.env.isTimeOut:
                    time.sleep(1)
                    print("Episode Done!")
                    summaryFlag = False
                    # return
                    break
                step += 1
                self.episode_step += 1
            if self.done:
                pass  # break

            print(self.reward_history)
            # "True" entropy-augmented discounted return for every visited state
            for i in range(len(self.Q_history)):
                a = 0
                for j in range(i, len(self.Q_history), 1):
                    a += pow(self.args.gamma, j - i) * self.reward_history[j]
                for z in range(i + 1, len(self.Q_history), 1):
                    a -= alpha * pow(self.args.gamma, z - i) * self.entropy_history[z]
                self.Q_real_history.append(a)

            plt.figure()
            x = np.arange(0, len(self.Q_history), 1)
            plt.plot(x, np.array(self.Q_history), 'r', linewidth=2.0)
            plt.plot(x, np.array(self.Q_real_history), 'k', linewidth=2.0)
            plt.show()
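# Hedged design note (not part of the original file): Simulation.run above opens a fresh
# SummaryWriter inside the step loop, which re-creates the event-file handle on every
# iteration. A common alternative is a single writer per run, closed when the episode ends.
# The helper below is only a sketch: `log_step` is a hypothetical name, `state_info` stands
# in for env.state_info, and the tags mirror the ones used above.
from torch.utils.tensorboard import SummaryWriter

def log_step(writer, state_info, step):
    """Write a subset of the per-step driving diagnostics used by Simulation.run."""
    writer.add_scalar('v_x', state_info['velocity_t'][0], step)
    writer.add_scalar('v_y', state_info['velocity_t'][1], step)
    writer.add_scalar('delta_yaw', state_info['delta_yaw_t'], step)
    writer.add_scalar('dist2center', state_info['lateral_dist_t'], step)

# usage (illustrative):
#     writer = SummaryWriter(log_dir='./logs')
#     ... inside the step loop: log_step(writer, env.state_info, i)
#     writer.close()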
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, double_agent=False, dueling_agent=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_agent (bool): True if we want to use DDQN
            dueling_agent (bool): True if we want to use Dueling
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_agent = double_agent
        self.dueling_agent = dueling_agent

        self.qnetwork_local = QNet(state_size, action_size, seed, dueling_agent=dueling_agent).to(device)
        self.qnetwork_target = QNet(state_size, action_size, seed, dueling_agent=dueling_agent).to(device)
        self.optimizer = optim.RMSprop(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for the given state as per the current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def weighted_mse_loss(self, Q_expected, Q_targets, deltas):
        """Returns the weighted mean squared error between Q_expected and Q_targets.

        Params
        ======
            Q_expected, Q_targets: current estimates and targets
            deltas: per-sample weights
        """
        weight = (deltas / torch.sum(deltas) * BATCH_SIZE) ** (-1)
        return torch.mean(weight * (Q_expected - Q_targets) ** 2)

    def get_q_target(self, next_states, rewards, gamma, dones):
        """Returns the target Q values.

        Params
        ======
            next_states: batch of next states
            rewards: batch of rewards received
            gamma: discount factor
            dones: batch of flags telling whether the episode is done
        """
        # Get max predicted Q values (for next states) from the target model
        if not self.double_agent:
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        else:
            # Double DQN: select actions with the local network, evaluate them with the target network
            indices = torch.argmax(self.qnetwork_local(next_states).detach(), 1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, indices.unsqueeze(1))
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        return Q_targets

    def learn(self, experiences, gamma):
        """Update value parameters using the given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, deltas = experiences
        Q_targets = self.get_q_target(next_states, rewards, gamma, dones)
        # Get expected Q values from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = self.weighted_mse_loss(Q_expected, Q_targets, deltas)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
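# Hedged usage sketch (not part of the original file): a minimal epsilon-greedy training loop
# for the Agent above. `env`, `n_episodes`, `max_t`, and the epsilon schedule are illustrative
# assumptions; only Agent.act and Agent.step come from the class itself.
def train_dqn(env, agent, n_episodes=2000, max_t=1000,
              eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores = []
    eps = eps_start
    for _ in range(n_episodes):
        state = env.reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state, eps)                        # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)   # store transition, learn every UPDATE_EVERY steps
            state = next_state
            score += reward
            if done:
                break
        eps = max(eps_end, eps_decay * eps)                       # decay exploration rate
        scores.append(score)
    return scores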
class Evaluator(object):
    def __init__(self, args, shared_value, share_net):
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        self.alpha = np.exp(self.log_alpha_share.detach().item()) if args.alpha == 'auto' else 0

        self.evaluation_interval = 50000
        self.max_state_num_evaluated_in_an_episode = 500
        self.episode_num_to_run = 10

        self.iteration_history = []
        self.evaluated_Q_mean_history = []
        self.true_gamma_return_mean_history = []
        # self.n_episodes_info_history = []
        self.evaluated_Q_history = []
        self.true_gamma_return_history = []

    def run_an_episode(self):
        state_list = []
        action_list = []
        log_prob_list = []
        reward_list = []
        evaluated_Q_list = []
        done = 0
        state = self.env.reset()
        while not done and len(reward_list) < self.args.max_step:
            state_tensor = torch.FloatTensor(state.copy()).float().to(self.device)
            u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), self.args.stochastic_actor)
            state_list.append(state.copy())
            action_list.append(u.copy())
            log_prob_list.append(log_prob)
            if self.args.double_Q and not self.args.double_actor:
                q = torch.min(
                    self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0],
                    self.Q_net2(state_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0])
            else:
                q = self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0]
            evaluated_Q_list.append(q.detach().item())
            u = u.squeeze(0)
            state, reward, done, load_action = self.env.step(u)
            # self.env.render(mode='human')
            reward_list.append(reward * self.args.reward_scale)

        entropy_list = list(-self.alpha * np.array(log_prob_list))
        true_gamma_return_list = cal_gamma_return_of_an_episode(reward_list, entropy_list, self.args.gamma)
        episode_return = sum(reward_list)
        episode_len = len(reward_list)
        return dict(state_list=np.array(state_list),
                    action_list=np.array(action_list),
                    log_prob_list=np.array(log_prob_list),
                    reward_list=np.array(reward_list),
                    evaluated_Q_list=np.array(evaluated_Q_list),
                    true_gamma_return_list=true_gamma_return_list,
                    episode_return=episode_return,
                    episode_len=episode_len)

    def run_n_episodes(self, n, max_state):
        n_episode_state_list = []
        n_episode_action_list = []
        n_episode_log_prob_list = []
        n_episode_reward_list = []
        n_episode_evaluated_Q_list = []
        n_episode_true_gamma_return_list = []
        n_episode_return_list = []
        n_episode_len_list = []
        for _ in range(n):
            episode_info = self.run_an_episode()
            n_episode_state_list.append(episode_info['state_list'])
            n_episode_action_list.append(episode_info['action_list'])
            n_episode_log_prob_list.append(episode_info['log_prob_list'])
            n_episode_reward_list.append(episode_info['reward_list'])
            n_episode_evaluated_Q_list.append(episode_info['evaluated_Q_list'])
            n_episode_true_gamma_return_list.append(episode_info['true_gamma_return_list'])
            n_episode_return_list.append(episode_info['episode_return'])
            n_episode_len_list.append(episode_info['episode_len'])
        # n_episode_evaluated_Q_list_history = list(map(lambda x: x['n_episode_evaluated_Q_list'], n_episodes_info_history))
        # n_episode_true_gamma_return_list_history = list(map(lambda x: x['n_episode_true_gamma_return_list'], n_episodes_info_history))

        def concat_interest_epi_part_of_one_ite_and_mean(list_of_n_epi):
            tmp = list(copy.deepcopy(list_of_n_epi))
            tmp[0] = tmp[0] if len(tmp[0]) <= max_state else tmp[0][:max_state]

            def reduce_fuc(a, b):
                return np.concatenate([a, b]) if len(b) < max_state else np.concatenate([a, b[:max_state]])

            interest_epi_part_of_one_ite = reduce(reduce_fuc, tmp)
            return sum(interest_epi_part_of_one_ite) / len(interest_epi_part_of_one_ite)

        evaluated_Q_mean = concat_interest_epi_part_of_one_ite_and_mean(np.array(n_episode_evaluated_Q_list))
        true_gamma_return_mean = concat_interest_epi_part_of_one_ite_and_mean(
            np.array(n_episode_true_gamma_return_list))
        return evaluated_Q_mean, true_gamma_return_mean
        # return dict(n_episode_state_list=np.array(n_episode_state_list),
        #             n_episode_action_list=np.array(n_episode_action_list),
        #             n_episode_log_prob_list=np.array(n_episode_log_prob_list),
        #             n_episode_reward_list=np.array(n_episode_reward_list),
        #             n_episode_evaluated_Q_list=np.array(n_episode_evaluated_Q_list),
        #             n_episode_true_gamma_return_list=np.array(n_episode_true_gamma_return_list),
        #             n_episode_return_list=np.array(n_episode_return_list),
        #             n_episode_len_list=np.array(n_episode_len_list))

    def run(self):
        while not self.stop_sign.value:
            if self.iteration_counter.value % self.evaluation_interval == 0:
                self.alpha = np.exp(self.log_alpha_share.detach().item()) if self.args.alpha == 'auto' else 0
                self.iteration = self.iteration_counter.value
                self.actor.load_state_dict(self.actor_share.state_dict())
                self.Q_net1.load_state_dict(self.Q_net1_share.state_dict())
                self.Q_net2.load_state_dict(self.Q_net2_share.state_dict())
                evaluated_Q_mean, true_gamma_return_mean = self.run_n_episodes(
                    self.episode_num_to_run, self.max_state_num_evaluated_in_an_episode)
                self.iteration_history.append(self.iteration)
                self.evaluated_Q_mean_history.append(evaluated_Q_mean)
                self.true_gamma_return_mean_history.append(true_gamma_return_mean)
                print('Saving evaluation results of the {} iteration.'.format(self.iteration))
                np.save('./' + self.args.env_name + '/method_' + str(self.args.method)
                        + '/result/iteration_evaluation', np.array(self.iteration_history))
                np.save('./' + self.args.env_name + '/method_' + str(self.args.method)
                        + '/result/evaluated_Q_mean', np.array(self.evaluated_Q_mean_history))
                np.save('./' + self.args.env_name + '/method_' + str(self.args.method)
                        + '/result/true_gamma_return_mean', np.array(self.true_gamma_return_mean_history))
                plot_online(self.args.env_name, self.args.method, self.args.method_name,
                            self.max_state_num_evaluated_in_an_episode)
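# Hedged sketch (not part of the original file): Evaluator.run polls shared multiprocessing
# state (stop_sign, iteration_counter), so it is intended to run in its own process alongside
# the learner and actor processes. The launcher below is an assumption about how that wiring
# might look; the layout of `shared_value` and `share_net` simply mirrors the indexing used
# in Evaluator.__init__.
import torch.multiprocessing as mp

def evaluator_worker(args, shared_value, share_net):
    # Construct the Evaluator inside the child process so the gym env is created there.
    Evaluator(args, shared_value, share_net).run()

def launch_evaluator(args, shared_value, share_net):
    proc = mp.Process(target=evaluator_worker, args=(args, shared_value, share_net))
    proc.start()
    return proc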