def __init__(self, args, shared_value, share_net):
        """Evaluator init: seed RNGs, build env and local nets, bind shared state."""
        rng_seed = args.seed
        np.random.seed(rng_seed)
        torch.manual_seed(rng_seed)

        # Cross-process handles.
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args

        # Local CPU copies of the networks; weights come from the shared nets.
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        if args.alpha == 'auto':
            self.alpha = np.exp(self.log_alpha_share.detach().item())
        else:
            self.alpha = 0

        # Evaluation bookkeeping / history buffers.
        self.evaluation_interval = 50000
        self.max_state_num_evaluated_in_an_episode = 500
        self.episode_num_to_run = 10
        self.iteration_history = []
        self.evaluated_Q_mean_history = []
        self.true_gamma_return_mean_history = []
        # self.n_episodes_info_history = []
        self.evaluated_Q_history = []
        self.true_gamma_return_history = []
Example #2
0
    def __init__(self, args, shared_value, share_net):
        """Evaluator init: seed RNGs, build the CARLA-style env and local nets,
        and bind the shared (cross-process) networks and counters.
        """
        rng_seed = args.seed
        np.random.seed(rng_seed)
        torch.manual_seed(rng_seed)

        # Environment construction parameters.
        eval_params = {
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': int(2000 + 3 * args.num_actors),  # connection port
            'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 100,  # maximum timesteps per episode
            'desired_speed': 15,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        # Cross-process handles.
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args
        self.env = gym.make(args.env_name, params=eval_params)

        # Local CPU copies of the networks.
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        if args.alpha == 'auto':
            self.alpha = np.exp(self.log_alpha_share.detach().item())
        else:
            self.alpha = 0

        # Evaluation bookkeeping / history buffers.
        self.evaluation_interval = 20000
        self.max_state_num_evaluated_in_an_episode = 50  # 500
        self.episode_num_evaluation = 5
        self.episode_num_test = 5
        self.time = time.time()
        self.list_of_n_episode_rewards_history = []
        self.time_history = []
        self.alpha_history = []
        self.average_return_with_diff_base_history = []
        self.average_reward_history = []
        self.iteration_history = []
        self.evaluated_Q_mean_history = []
        self.evaluated_Q_std_history = []
        self.true_gamma_return_mean_history = []
        self.policy_entropy_history = []
        self.a_std_history = []
        self.a_abs_history = []
Example #3
0
    def __init__(self, args, shared_value):
        """Simulation worker: load trained networks from checkpoints and set up
        the env for offline evaluation.

        Args:
            args: experiment configuration; ``args.max_train`` selects the
                checkpoint iteration to load.
            shared_value: shared_value[1] is the cross-process stop flag.
        """
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)

        simu_params = {
            'number_of_vehicles': 0,
            'number_of_walkers': 0,
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': 2000,  # connection port
            'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 100,  # maximum timesteps per episode
            'desired_speed': 15,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name, params=simu_params)
        self.device = torch.device("cpu")
        self.load_index = self.args.max_train

        def _ckpt_path(net_name):
            # Checkpoint layout: ./<env>/method_<m>/model/<net>_<iteration>.pkl
            return ('./' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/model/' + net_name + '_' +
                    str(self.load_index) + '.pkl')

        # The checkpoint path was previously built inline three times; one
        # helper removes the duplication and keeps the layout in a single place.
        self.actor = PolicyNet(args).to(self.device)
        self.actor.load_state_dict(
            torch.load(_ckpt_path('policy1'), map_location='cpu'))

        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net1.load_state_dict(
            torch.load(_ckpt_path('Q1'), map_location='cpu'))

        # Second Q net only exists when double-Q learning is enabled.
        if self.args.double_Q:
            self.Q_net2 = QNet(args).to(self.device)
            self.Q_net2.load_state_dict(
                torch.load(_ckpt_path('Q2'), map_location='cpu'))

        # Simulation bookkeeping / history buffers.
        self.test_step = 0
        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history = []
        self.done_history = []
        self.Q_real_history = []
        self.Q_history = []
        self.Q_std_history = []
    def __init__(self, args, shared_value):
        """Simulation worker: load trained networks from checkpoints and set up
        the env for offline evaluation.

        Args:
            args: experiment configuration; ``args.max_train`` selects the
                checkpoint iteration to load.
            shared_value: shared_value[1] is the cross-process stop flag.
        """
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.load_index = self.args.max_train

        def _ckpt_path(net_name):
            # Checkpoint layout: ./<env>/method_<m>/model/<net>_<iteration>.pkl
            return ('./' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/model/' + net_name + '_' +
                    str(self.load_index) + '.pkl')

        # The checkpoint path was previously built inline three times; one
        # helper removes the duplication and keeps the layout in a single place.
        self.actor = PolicyNet(args).to(self.device)
        self.actor.load_state_dict(
            torch.load(_ckpt_path('policy1'), map_location='cpu'))

        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net1.load_state_dict(
            torch.load(_ckpt_path('Q1'), map_location='cpu'))

        # Second Q net only exists when double-Q learning is enabled.
        if self.args.double_Q:
            self.Q_net2 = QNet(args).to(self.device)
            self.Q_net2.load_state_dict(
                torch.load(_ckpt_path('Q2'), map_location='cpu'))

        # Simulation bookkeeping / history buffers.
        self.test_step = 0
        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history = []
        self.done_history = []
        self.Q_real_history = []
        self.Q_history = []
        self.Q_std_history = []
Example #5
0
 def __init__(self, args, shared_queue, shared_value):
     """Test worker init: seed RNGs, build the env and the local policy net."""
     super(Test, self).__init__()
     rng_seed = args.seed
     np.random.seed(rng_seed)
     torch.manual_seed(rng_seed)

     # Cross-process handles.
     self.policy_test_queue = shared_queue[3]
     self.stop_sign = shared_value[1]

     self.args = args
     self.env = gym.make(args.env_name)
     self.device = torch.device("cpu")
     self.actor = PolicyNet(
         args.state_dim, args.num_hidden_cell, args.action_high,
         args.action_low, args.NN_type).to(self.device)

     # Test bookkeeping / history buffers.
     self.test_step = 0
     self.epoch_length = 1000
     self.save_interval = 10000
     self.iteration = 0
     self.reward_history = []
     self.iteration_history = []
Example #6
0
    def __init__(self, args, shared_queue, shared_value):
        """Comparison worker: load one trained policy plus Q nets trained by
        several methods (m0/m1/m2) for side-by-side evaluation.

        Args:
            args: experiment configuration namespace.
            shared_queue: shared_queue[3] is the policy test queue.
            shared_value: shared_value[1] is the cross-process stop flag.
        """
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.policy_test_queue = shared_queue[3]
        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.load_index = 20000

        def _ckpt_path(method_id, net_name):
            # Checkpoint layout: ./data/method_<m>/model/<net>_<iteration>.pkl
            return ('./data/method_' + str(method_id) + '/model/' + net_name +
                    '_' + str(self.load_index) + '.pkl')

        def _make_qnet():
            # All compared Q nets share the same architecture.
            return QNet(args.state_dim, args.action_dim, args.num_hidden_cell,
                        args.NN_type).to(self.device)

        # Checkpoint paths and QNet construction were previously duplicated
        # five times inline; the helpers keep layout and shape in one place.
        self.actor = PolicyNet(args.state_dim, args.num_hidden_cell,
                               args.action_high, args.action_low,
                               args.NN_type).to(self.device)
        self.actor.load_state_dict(torch.load(_ckpt_path(1, 'policy')))

        # Q net trained by method 0.
        self.Q_net1_m0 = _make_qnet()
        self.Q_net1_m0.load_state_dict(torch.load(_ckpt_path(0, 'Q1')))

        # Double Q nets trained by method 1.
        self.Q_net1_m1 = _make_qnet()
        self.Q_net1_m1.load_state_dict(torch.load(_ckpt_path(1, 'Q1')))
        self.Q_net2_m1 = _make_qnet()
        self.Q_net2_m1.load_state_dict(torch.load(_ckpt_path(1, 'Q2')))

        # Q net trained by method 2.
        self.Q_net1_m2 = _make_qnet()
        self.Q_net1_m2.load_state_dict(torch.load(_ckpt_path(2, 'Q1')))

        # Simulation bookkeeping / history buffers.
        self.test_step = 0
        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history = []
        self.done_history = []
        self.Q_real_history = []
        self.Q_m0_history = []
        self.Q_m1_history = []
        self.Q_m2_history = []
        self.Q_std_m2_history = []
Example #7
0
    def __init__(self, args, shared_value, share_net):
        """Test worker init: seed RNGs, build the test env and the local policy
        net, and bind the shared actor/alpha and counters.
        """
        super(Test, self).__init__()
        rng_seed = args.seed
        np.random.seed(rng_seed)
        torch.manual_seed(rng_seed)

        # Environment construction parameters for the test runs.
        test_params = {
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': int(2000 + 3 * args.num_actors),  # connection port
            'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 500,  # maximum timesteps per episode
            'desired_speed': 8,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        # Cross-process handles.
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.args = args
        self.env = gym.make(args.env_name, params=test_params)
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.actor_share = share_net[0]
        self.log_alpha = share_net[1]

        # Test bookkeeping / history buffers.
        self.test_step = 0
        self.episode_num = 10
        self.test_interval = 20000
        self.start_time = time.time()
        self.list_of_n_episode_rewards_history = []
        self.time_history = []
        self.alpha_history = []
        self.average_return_with_diff_base_history = []
        self.average_reward_history = []
        self.iteration_history = []
        self.accel_history = []
        self.steer_history = []
Example #8
0
 def __init__(self, args, shared_queue, shared_value, lock, i):
     """Actor worker init: per-agent seed, env, and local policy/Q nets."""
     super(Actor, self).__init__()
     self.agent_id = i
     # Offset the seed by agent id so parallel actors are decorrelated.
     rng_seed = args.seed + np.int64(self.agent_id)
     np.random.seed(rng_seed)
     torch.manual_seed(rng_seed)

     # Cross-process queues / flags.
     self.experience_queue = shared_queue[0]
     self.policy_param_queue = shared_queue[1]
     self.q_param_queue = shared_queue[2]
     self.counter = shared_value[0]
     self.stop_sign = shared_value[1]
     self.lock = lock

     self.env = gym.make(args.env_name)
     self.args = args
     self.device = torch.device("cpu")
     self.actor = PolicyNet(
         args.state_dim, args.num_hidden_cell, args.action_high,
         args.action_low, args.NN_type).to(self.device)
     self.Q_net1 = QNet(
         args.state_dim, args.action_dim, args.num_hidden_cell,
         args.NN_type).to(self.device)
Example #9
0
    def __init__(self, args, shared_queue, shared_value, share_net, lock, i):
        """Actor worker init: per-agent seed, env, experience queues, nets.

        Args:
            args: experiment configuration namespace.
            shared_queue: shared_queue[0] holds the per-buffer experience queues.
            shared_value: shared_value[0] step counter, shared_value[1] stop flag.
            share_net: share_net[0] shared actor, share_net[1] shared Q net.
            lock: multiprocessing lock guarding shared state.
            i: actor index, used to decorrelate RNG seeds.
        """
        super(Actor, self).__init__()
        self.agent_id = i
        # Offset the seed by agent id so parallel actors are decorrelated.
        seed = args.seed + np.int64(self.agent_id)
        np.random.seed(seed)
        torch.manual_seed(seed)

        self.counter = shared_value[0]
        self.stop_sign = shared_value[1]
        self.lock = lock
        self.env = gym.make(args.env_name)
        self.args = args
        # Fix: the original append-loop reused `i` as loop variable, shadowing
        # the `i` parameter; a comprehension with its own index avoids that.
        self.experience_in_queue = [
            shared_queue[0][k] for k in range(args.num_buffers)
        ]

        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)

        self.Q_net1_share = share_net[1]
        self.actor_share = share_net[0]
Example #10
0
    def __init__(self, args, shared_value, share_net):
        """Evaluator init: seed RNGs, build env and local nets, and bind the
        shared (cross-process) networks and counters.
        """
        rng_seed = args.seed
        np.random.seed(rng_seed)
        torch.manual_seed(rng_seed)

        # Cross-process handles.
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args
        self.env = gym.make(args.env_name)

        # Local CPU copies of the networks.
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        if args.alpha == 'auto':
            self.alpha = np.exp(self.log_alpha_share.detach().item())
        else:
            self.alpha = 0

        # Evaluation bookkeeping / history buffers.
        self.evaluation_interval = 20000
        self.max_state_num_evaluated_in_an_episode = 500
        self.episode_num_evaluation = 5
        self.episode_num_test = 5
        self.time = time.time()
        self.list_of_n_episode_rewards_history = []
        self.time_history = []
        self.alpha_history = []
        self.average_return_with_diff_base_history = []
        self.average_reward_history = []
        self.iteration_history = []
        self.evaluated_Q_mean_history = []
        self.evaluated_Q_std_history = []
        self.true_gamma_return_mean_history = []
        self.policy_entropy_history = []
        self.a_std_history = []
        self.a_abs_history = []
Example #11
0
    def __init__(self, args, shared_queue, shared_value, share_net, lock, i):
        """Actor worker init: per-agent seed, CARLA-style env, queues, nets.

        Args:
            args: experiment configuration namespace.
            shared_queue: shared_queue[0] holds the per-buffer experience queues.
            shared_value: shared_value[0] step counter, shared_value[1] stop flag.
            share_net: share_net[0] shared actor, share_net[1] shared Q net.
            lock: multiprocessing lock guarding shared state.
            i: actor index; offsets both the RNG seed and the env port.
        """
        super(Actor, self).__init__()
        self.agent_id = i
        # Offset the seed by agent id so parallel actors are decorrelated.
        seed = args.seed + np.int64(self.agent_id)
        np.random.seed(seed)
        torch.manual_seed(seed)

        actor_params = {
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter':
            'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': int(2000 + 3 * self.agent_id),  # connection port
            'task_mode':
            'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'train',
            'max_time_episode': 100,  # maximum timesteps per episode
            'desired_speed': 15,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        self.counter = shared_value[0]
        self.stop_sign = shared_value[1]
        self.lock = lock
        self.env = gym.make(args.env_name, params=actor_params)
        self.args = args
        # Fix: the original append-loop reused `i` as loop variable, shadowing
        # the `i` parameter; a comprehension with its own index avoids that.
        self.experience_in_queue = [
            shared_queue[0][k] for k in range(args.num_buffers)
        ]

        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        # self.Q_net1 = QNet(args).to(self.device)

        #share_net = [Q_net1,Q_net1_target,Q_net2,Q_net2_target,actor,actor_target,log_alpha]
        #share_optimizer=[Q_net1_optimizer,Q_net2_optimizer,actor_optimizer,alpha_optimizer]
        self.Q_net1_share = share_net[1]
        self.actor_share = share_net[0]
Example #12
0
class Test():
    """Evaluation worker: periodically copies the shared policy and runs test
    episodes in the simulator, appending and saving aggregate metrics to disk.

    NOTE(review): uses the 4-tuple step API (state, reward, done, info) and a
    reset that returns (state, info) — presumably a custom env wrapper; confirm
    against the env implementation.
    """
    def __init__(self, args, shared_value, share_net):
        """Seed RNGs, build the test env and local policy net, and bind the
        shared (cross-process) actor, log-alpha, and counters.
        """
        super(Test, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)

        # Environment construction parameters for the test runs.
        test_params = {
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter':
            'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': int(2000 + 3 * args.num_actors),  # connection port
            'task_mode':
            'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 500,  # maximum timesteps per episode
            'desired_speed': 8,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        # Cross-process handles.
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.args = args
        self.env = gym.make(args.env_name, params=test_params)
        self.device = torch.device("cpu")
        # Local policy copy; weights are pulled from actor_share in run().
        self.actor = PolicyNet(args).to(self.device)
        self.actor_share = share_net[0]
        self.log_alpha = share_net[1]

        # Test bookkeeping / history buffers that run() saves to disk.
        self.test_step = 0
        self.episode_num = 10
        self.test_interval = 20000
        self.start_time = time.time()
        self.list_of_n_episode_rewards_history = []
        self.time_history = []
        self.alpha_history = []
        self.average_return_with_diff_base_history = []
        self.average_reward_history = []
        self.iteration_history = []
        self.accel_history = []
        self.steer_history = []

    def run_an_episode(self):
        """Roll out one episode with the local policy.

        Returns:
            (reward_array, episode_return, episode_len,
             mean_accel, mean_steer) for the episode.
        """
        reward_list = []
        accel_list = []
        steer_list = []
        done = 0
        state, info = self.env.reset()

        # Episode ends on env `done` or after args.max_step steps.
        while not done and len(reward_list) < self.args.max_step:
            state_tensor = torch.FloatTensor(state.copy()).float().to(
                self.device)
            info_tensor = torch.FloatTensor(info.copy()).float().to(
                self.device)
            if self.args.NN_type == "CNN":
                # Channels-first layout for conv nets.
                state_tensor = state_tensor.permute(2, 0, 1)  # 3, 64, 160
            # Third argument True presumably selects deterministic actions —
            # confirm in PolicyNet.get_action.
            u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0),
                                                info_tensor.unsqueeze(0), True)
            u = u.squeeze(0)
            state, reward, done, info = self.env.step(u)
            #self.env.render(mode='human')
            reward_list.append(reward)
            accel_list.append(u[0])  # u[0]: acceleration command
            steer_list.append(u[1])  # u[1]: steering command

        episode_return = sum(reward_list)
        episode_len = len(reward_list)
        episode_accel = np.mean(accel_list)
        episode_steer = np.mean(steer_list)

        return np.array(
            reward_list
        ), episode_return, episode_len, episode_accel, episode_steer

    def average_max_n(self, list_for_average, n):
        """Return the mean of the n largest values in list_for_average."""
        sorted_list = sorted(list_for_average, reverse=True)
        return sum(sorted_list[:n]) / n

    def run_n_episodes(self, n):
        """Run n test episodes and aggregate their statistics.

        Returns:
            (per-episode reward arrays, top-1/3/5 mean returns,
             per-step average reward, avg accel, avg steer).
        """
        assert n >= 5, "n must be at least 5"
        list_of_n_episode_rewards = []
        list_of_return = []
        list_of_len = []
        list_of_accel = []
        list_of_steer = []
        for _ in range(n):
            reward_list, episode_return, episode_len, episode_accel, episode_steer = self.run_an_episode(
            )
            list_of_n_episode_rewards.append(reward_list)
            list_of_return.append(episode_return)
            list_of_len.append(episode_len)
            list_of_accel.append(episode_accel)
            list_of_steer.append(episode_steer)

        # Mean of the best 1, 3 and 5 episode returns.
        average_return_with_diff_base = np.array(
            [self.average_max_n(list_of_return, x) for x in [1, 3, 5]])
        average_reward = sum(list_of_return) / sum(list_of_len)
        # NOTE(review): these divide a sum of per-episode MEANS by total step
        # count, not a per-step average — confirm this weighting is intended.
        avg_accel = sum(list_of_accel) / sum(list_of_len)
        avg_steer = sum(list_of_steer) / sum(list_of_len)
        return np.array(
            list_of_n_episode_rewards
        ), average_return_with_diff_base, average_reward, avg_accel, avg_steer

    def run(self):
        """Poll the shared iteration counter; every test_interval iterations,
        sync weights from the shared actor, run test episodes, and save all
        history arrays under ./<env>/method_<m>/result/. Sets the shared stop
        flag once args.max_train is reached.

        NOTE(review): busy-polls the counter; a single counter value may
        trigger more than one evaluation pass — confirm this is acceptable.
        """
        while not self.stop_sign.value:
            if self.iteration_counter.value % self.test_interval == 0:
                self.iteration = self.iteration_counter.value
                # Pull the latest shared policy weights.
                self.actor.load_state_dict(self.actor_share.state_dict())
                delta_time = time.time() - self.start_time
                list_of_n_episode_rewards, average_return_with_diff_base, average_reward, avg_accel, avg_steer = self.run_n_episodes(
                    self.episode_num)
                self.iteration_history.append(self.iteration)
                self.time_history.append(delta_time)
                self.list_of_n_episode_rewards_history.append(
                    list_of_n_episode_rewards)
                self.average_return_with_diff_base_history.append(
                    average_return_with_diff_base)
                self.average_reward_history.append(average_reward)
                self.alpha_history.append(self.log_alpha.detach().exp().item())
                self.accel_history.append(avg_accel)
                self.steer_history.append(avg_steer)
                print('Saving test data of the {} iteration.'.format(
                    self.iteration))
                # Persist the full histories (overwritten each save).
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/iteration',
                    np.array(self.iteration_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/time',
                    np.array(self.time_history))
                # np.save('./' + self.args.env_name + '/method_' + str(self.args.method) + '/result/list_of_n_episode_rewards',
                #         np.array(self.list_of_n_episode_rewards_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) +
                    '/result/average_return_with_diff_base',
                    np.array(self.average_return_with_diff_base_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/average_reward',
                    np.array(self.average_reward_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/alpha',
                    np.array(self.alpha_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/accel',
                    np.array(self.accel_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/steer',
                    np.array(self.steer_history))

                # plot_online(self.args.env_name, self.args.method, self.args.method_name)

                # Signal all workers to stop once training is complete.
                if self.iteration >= self.args.max_train:
                    self.stop_sign.value = 1
                    break
Example #13
0
    def __init__(self, args, shared_queue, shared_value, share_net,
                 share_optimizer, device, lock, i):
        """Learner-process init: seed RNGs, bind shared nets/optimizers, and
        build local working copies with cosine-annealing LR schedulers.

        Args:
            args: experiment configuration namespace.
            shared_queue: shared_queue[1] holds the per-buffer experience-out queues.
            shared_value: shared_value[1] stop flag, shared_value[2] iteration counter.
            share_net: 9-tuple of shared nets (Q1/Q1_t/Q2/Q2_t/actor1/actor1_t/
                actor2/actor2_t/log_alpha).
            share_optimizer: 5-tuple of shared optimizers.
            device: torch.device to train on (CPU or GPU).
            lock: multiprocessing lock guarding shared state.
            i: learner index.
        """
        super(Learner, self).__init__()
        self.args = args
        seed = self.args.seed
        self.init_time = self.args.init_time
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.agent_id = i

        # Fix: the original append-loop reused `i` as loop variable, shadowing
        # the `i` parameter; a comprehension with its own index avoids that.
        self.experience_out_queue = [
            shared_queue[1][k] for k in range(args.num_buffers)
        ]

        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value

        self.device = device
        # True when training on an accelerator (anything but CPU).
        self.gpu = self.device != torch.device("cpu")
        self.lock = lock

        self.Q_net1_share, self.Q_net1_target_share, self.Q_net2_share, self.Q_net2_target_share, self.actor1_share, \
                                        self.actor1_target_share, self.actor2_share, self.actor2_target_share, self.log_alpha_share = share_net
        self.Q_net1_optimizer, self.Q_net2_optimizer, self.actor1_optimizer, self.actor2_optimizer, self.alpha_optimizer = share_optimizer

        def _cosine_scheduler(optimizer):
            # All optimizers share one cosine-annealing schedule; previously
            # this construction was duplicated five times inline.
            return lr_scheduler.CosineAnnealingLR(
                optimizer,
                T_max=self.args.decay_T_max,
                eta_min=self.args.end_lr,
                last_epoch=-1)

        # Local working copies of the networks, all in train mode.
        self.Q_net1 = QNet(args).to(self.device)
        self.scheduler_Q_net1 = _cosine_scheduler(self.Q_net1_optimizer)
        self.Q_net1.train()

        self.Q_net1_target = QNet(args).to(self.device)
        self.Q_net1_target.train()

        self.Q_net2 = QNet(args).to(self.device)
        self.scheduler_Q_net2 = _cosine_scheduler(self.Q_net2_optimizer)
        self.Q_net2.train()

        self.Q_net2_target = QNet(args).to(self.device)
        self.Q_net2_target.train()

        self.actor1 = PolicyNet(args).to(self.device)
        self.scheduler_actor1 = _cosine_scheduler(self.actor1_optimizer)
        self.actor1.train()

        self.actor1_target = PolicyNet(args).to(self.device)
        self.actor1_target.train()

        self.actor2 = PolicyNet(args).to(self.device)
        self.scheduler_actor2 = _cosine_scheduler(self.actor2_optimizer)
        self.actor2.train()

        self.actor2_target = PolicyNet(args).to(self.device)
        self.actor2_target.train()

        self.scheduler_alpha = _cosine_scheduler(self.alpha_optimizer)

        # With automatic temperature tuning, alpha is derived from
        # log_alpha_share; otherwise it is a fixed tensor.
        if self.args.alpha == 'auto':
            self.target_entropy = args.target_entropy
        else:
            self.alpha = torch.tensor(self.args.alpha)
Example #14
0
class Learner():
    """Asynchronous learner process for a distributed soft actor-critic /
    TD3-flavoured setup.

    Keeps local copies of two Q networks, two policy networks and their
    targets. Each iteration it syncs the local copies from the shared
    (multiprocessing) networks, samples a batch from a randomly chosen
    replay queue, computes critic / actor / temperature losses locally,
    and pushes the resulting gradients back into the shared networks via
    ``ensure_shared_grads`` before stepping the shared optimizers.
    """
    def __init__(self, args, shared_queue, shared_value, share_net,
                 share_optimizer, device, lock, i):
        super(Learner, self).__init__()
        self.args = args
        seed = self.args.seed
        self.init_time = self.args.init_time
        np.random.seed(seed)
        torch.manual_seed(seed)
        # Index of this learner process (used to gate logging/saving).
        self.agent_id = i

        # One "out" queue per replay buffer; run() samples a random buffer.
        # NOTE(review): the loop variable shadows the parameter `i`; harmless
        # here because self.agent_id was captured first, but worth renaming.
        self.experience_out_queue = []
        for i in range(args.num_buffers):
            self.experience_out_queue.append(shared_queue[1][i])

        self.stop_sign = shared_value[1]          # shared shutdown flag
        self.iteration_counter = shared_value[2]  # global iteration counter
        self.iteration = self.iteration_counter.value

        self.device = device
        if self.device == torch.device("cpu"):
            self.gpu = False
        else:
            self.gpu = True
        self.lock = lock

        # Unpack the shared networks / optimizers living in shared memory.
        self.Q_net1_share, self.Q_net1_target_share, self.Q_net2_share, self.Q_net2_target_share, self.actor1_share, \
                                        self.actor1_target_share, self.actor2_share, self.actor2_target_share, self.log_alpha_share = share_net
        self.Q_net1_optimizer, self.Q_net2_optimizer, self.actor1_optimizer, self.actor2_optimizer, self.alpha_optimizer = share_optimizer

        # Local working copies; cosine LR decay is attached to the SHARED
        # optimizers (one scheduler per optimizer).
        self.Q_net1 = QNet(args).to(self.device)
        self.scheduler_Q_net1 = lr_scheduler.CosineAnnealingLR(
            self.Q_net1_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.Q_net1.train()

        self.Q_net1_target = QNet(args).to(self.device)
        self.Q_net1_target.train()

        self.Q_net2 = QNet(args).to(self.device)
        self.scheduler_Q_net2 = lr_scheduler.CosineAnnealingLR(
            self.Q_net2_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.Q_net2.train()

        self.Q_net2_target = QNet(args).to(self.device)
        self.Q_net2_target.train()

        self.actor1 = PolicyNet(args).to(self.device)
        self.scheduler_actor1 = lr_scheduler.CosineAnnealingLR(
            self.actor1_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.actor1.train()

        self.actor1_target = PolicyNet(args).to(self.device)
        self.actor1_target.train()

        self.actor2 = PolicyNet(args).to(self.device)
        self.scheduler_actor2 = lr_scheduler.CosineAnnealingLR(
            self.actor2_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.actor2.train()

        self.actor2_target = PolicyNet(args).to(self.device)
        self.actor2_target.train()

        self.scheduler_alpha = lr_scheduler.CosineAnnealingLR(
            self.alpha_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)

        # 'auto' => SAC-style automatic temperature tuning toward a target
        # entropy; otherwise alpha is a fixed constant.
        if self.args.alpha == 'auto':
            self.target_entropy = args.target_entropy
        else:
            self.alpha = torch.tensor(self.args.alpha)

    def get_qloss(self, q, q_std, target_q, target_q_bound):
        """Critic loss.

        With distributional Q the loss is a Gaussian negative-log-likelihood
        split into a mean term (driven by the unclipped target) and a
        std term (driven by the TD-bounded target), each with the other
        head detached; otherwise a plain MSE toward ``target_q``.
        """
        if self.args.distributional_Q:
            # loss = -Normal(q, q_std).log_prob(target_q).mean()
            # loss = torch.mean(-Normal(q, q_std).log_prob(target_q_bound)*self.weight \
            #                   + self.weight.logical_not()*torch.pow(q-target_q,2))
            loss = torch.mean(torch.pow(q-target_q,2)/(2*torch.pow(q_std.detach(),2)) \
                   + torch.pow(q.detach()-target_q_bound,2)/(2*torch.pow(q_std,2))\
                   + torch.log(q_std))
        else:
            criterion = nn.MSELoss()
            loss = criterion(q, target_q)
        return loss

    def get_policyloss(self, q, log_prob_a_new):
        """SAC policy loss: E[alpha * log pi(a|s) - Q(s, a)] (alpha frozen)."""
        loss = (self.alpha.detach() * log_prob_a_new - q).mean()
        return loss

    def update_net(self, loss, optimizer, net, net_share, scheduler):
        """Backprop ``loss`` through the local ``net`` and step the SHARED
        optimizer, copying local gradients onto ``net_share`` first.

        The log-alpha scalar is special-cased: its gradient is written
        directly onto the shared tensor instead of via ensure_shared_grads.
        """
        optimizer.zero_grad()
        if self.gpu:
            if self.args.alpha == 'auto':
                if net is not self.log_alpha:
                    net.zero_grad()
            else:
                net.zero_grad()
        loss.backward()
        if self.args.alpha == 'auto':
            if net is self.log_alpha:
                # NOTE(review): `grad == 0` compares a tensor element-wise;
                # relies on the grad being a 1-element tensor — confirm.
                if self.log_alpha_share.grad is None or self.log_alpha_share.grad == 0:
                    self.log_alpha_share._grad = self.log_alpha.grad
            else:
                ensure_shared_grads(model=net,
                                    shared_model=net_share,
                                    gpu=self.gpu)
        else:
            ensure_shared_grads(model=net,
                                shared_model=net_share,
                                gpu=self.gpu)
        optimizer.step()
        scheduler.step(self.iteration)

    def target_q(self, r, done, q, q_std, q_next, log_prob_a_next):
        """Soft Bellman target r + gamma * (Q' - alpha * log pi'), with
        optional 3-sigma adaptive clipping and a TD-bound-clipped variant
        (used by the std head of the distributional loss).

        Returns (target_q, target_q_bound), both detached.
        """
        target_q = r + (1 - done) * self.args.gamma * (
            q_next - self.alpha.detach() * log_prob_a_next)
        if self.args.distributional_Q:
            if self.args.adaptive_bound:
                target_max = q + 3 * q_std
                target_min = q - 3 * q_std
                target_q = torch.min(target_q, target_max)
                target_q = torch.max(target_q, target_min)
            difference = torch.clamp(target_q - q, -self.args.TD_bound,
                                     self.args.TD_bound)
            target_q_bound = q + difference
            # Mask of samples whose TD error fell within the bound.
            self.weight = torch.le(torch.abs(target_q - q),
                                   self.args.TD_bound).detach()
        else:
            target_q_bound = target_q
        return target_q.detach(), target_q_bound.detach()

    def send_to_device(self, s, info, a, r, s_next, info_next, done, device):
        """Move one transition batch onto ``device`` and return it."""
        s = s.to(device)
        info = info.to(device)
        a = a.to(device)
        r = r.to(device)
        s_next = s_next.to(device)
        info_next = info_next.to(device)
        done = done.to(device)
        return s, info, a, r, s_next, info_next, done

    def run(self):
        """Main training loop: runs until the shared stop flag is set."""
        local_iteration = 0
        # Block until at least one replay buffer has data (or we are told
        # to stop).
        index = np.random.randint(0, self.args.num_buffers)
        while self.experience_out_queue[index].empty(
        ) and not self.stop_sign.value:
            index = np.random.randint(0, self.args.num_buffers)
            time.sleep(0.1)

        while not self.stop_sign.value:
            # Sync local copies from the shared networks.
            self.iteration = self.iteration_counter.value
            self.Q_net1.load_state_dict(self.Q_net1_share.state_dict())
            self.Q_net1_target.load_state_dict(
                self.Q_net1_target_share.state_dict())
            self.Q_net2.load_state_dict(self.Q_net2_share.state_dict())
            self.Q_net2_target.load_state_dict(
                self.Q_net2_target_share.state_dict())
            self.actor1.load_state_dict(self.actor1_share.state_dict())
            self.actor1_target.load_state_dict(
                self.actor1_target_share.state_dict())
            self.actor2.load_state_dict(self.actor2_share.state_dict())
            self.actor2_target.load_state_dict(
                self.actor2_target_share.state_dict())
            if self.args.alpha == 'auto':
                # Fresh leaf tensor so alpha gradients stay local.
                self.log_alpha = self.log_alpha_share.detach().clone(
                ).requires_grad_(True)
                self.alpha = self.log_alpha.exp().to(self.device)

            # Sample a batch from a random non-empty buffer.
            index = np.random.randint(0, self.args.num_buffers)
            while self.experience_out_queue[index].empty(
            ) and not self.stop_sign.value:
                index = np.random.randint(0, self.args.num_buffers)
                time.sleep(0.1)
            # NOTE(review): if the stop flag fired while waiting, this `if`
            # is skipped and s/info/a/... may be unbound (or stale) below —
            # consider `break`-ing here instead.
            if not self.experience_out_queue[index].empty():
                s, info, a, r, s_next, info_next, done = self.experience_out_queue[
                    index].get()
                s, info, a, r, s_next, info_next, done = self.send_to_device(
                    s, info, a, r, s_next, info_next, done, self.device)

            # Current Q estimates for the sampled (s, a).
            q_1, q_std_1, _ = self.Q_net1.evaluate(s,
                                                   info,
                                                   a,
                                                   device=self.device,
                                                   min=False)
            if self.args.double_Q:
                q_2, q_std_2, _ = self.Q_net2.evaluate(s,
                                                       info,
                                                       a,
                                                       device=self.device,
                                                       min=False)

            # Target-policy smoothing (TD3) only applies to deterministic
            # actors.
            smoothing_trick = False
            if not self.args.stochastic_actor:
                if self.args.policy_smooth:
                    smoothing_trick = True

            # New actions for the policy loss and next actions for targets.
            a_new_1, log_prob_a_new_1, a_new_std_1 = self.actor1.evaluate(
                s, info, smooth_policy=False, device=self.device)
            a_next_1, log_prob_a_next_1, _ = self.actor1_target.evaluate(
                s_next,
                info_next,
                smooth_policy=smoothing_trick,
                device=self.device)
            if self.args.double_actor:
                a_new_2, log_prob_a_new_2, _ = self.actor2.evaluate(
                    s, info, smooth_policy=False, device=self.device)
                a_next_2, log_prob_a_next_2, _ = self.actor2_target.evaluate(
                    s_next,
                    info_next,
                    smooth_policy=smoothing_trick,
                    device=self.device)

            # Build Bellman targets. With double Q AND double actor the two
            # critics are cross-evaluated (Q2 target scores actor1's action
            # and vice versa); otherwise clipped double-Q / distributional
            # sampling / plain target as configured.
            if self.args.double_Q and self.args.double_actor:
                q_next_target_1, _, q_next_sample_1 = self.Q_net2_target.evaluate(
                    s_next, info_next, a_next_1, device=self.device, min=False)
                q_next_target_2, _, _ = self.Q_net1_target.evaluate(
                    s_next, info_next, a_next_2, device=self.device, min=False)
                target_q_1, target_q_1_bound = self.target_q(
                    r, done, q_1.detach(), q_std_1.detach(),
                    q_next_target_1.detach(), log_prob_a_next_1.detach())
                target_q_2, target_q_2_bound = self.target_q(
                    r, done, q_2.detach(), q_std_2.detach(),
                    q_next_target_2.detach(), log_prob_a_next_2.detach())
            else:
                q_next_1, _, q_next_sample_1 = self.Q_net1_target.evaluate(
                    s_next, info_next, a_next_1, device=self.device, min=False)
                if self.args.double_Q:
                    q_next_2, _, _ = self.Q_net2_target.evaluate(
                        s_next,
                        info_next,
                        a_next_1,
                        device=self.device,
                        min=False)
                    q_next_target_1 = torch.min(q_next_1, q_next_2)
                elif self.args.distributional_Q:
                    q_next_target_1 = q_next_sample_1
                else:
                    q_next_target_1 = q_next_1
                target_q_1, target_q_1_bound = self.target_q(
                    r, done, q_1.detach(), q_std_1.detach(),
                    q_next_target_1.detach(), log_prob_a_next_1.detach())

            # Q values of the freshly sampled actions, used as the policy
            # objective.
            if self.args.double_Q and self.args.double_actor:
                q_object_1, _, _ = self.Q_net1.evaluate(s,
                                                        info,
                                                        a_new_1,
                                                        device=self.device,
                                                        min=False)
                q_object_2, _, _ = self.Q_net2.evaluate(s,
                                                        info,
                                                        a_new_2,
                                                        device=self.device,
                                                        min=False)
            else:
                q_new_1, _, _ = self.Q_net1.evaluate(s,
                                                     info,
                                                     a_new_1,
                                                     device=self.device,
                                                     min=False)
                if self.args.double_Q:
                    q_new_2, _, _ = self.Q_net2.evaluate(s,
                                                         info,
                                                         a_new_1,
                                                         device=self.device,
                                                         min=False)
                    q_object_1 = torch.min(q_new_1, q_new_2)
                elif self.args.distributional_Q:
                    q_object_1 = q_new_1
                else:
                    q_object_1 = q_new_1

            # Temperature (alpha) update, delayed like the policy update.
            if local_iteration % self.args.delay_update == 0:
                if self.args.alpha == 'auto':
                    alpha_loss = -(self.log_alpha *
                                   (log_prob_a_new_1.detach().cpu() +
                                    self.target_entropy)).mean()
                    self.update_net(alpha_loss, self.alpha_optimizer,
                                    self.log_alpha, self.log_alpha_share,
                                    self.scheduler_alpha)

            # Critic updates (every iteration).
            q_loss_1 = self.get_qloss(q_1, q_std_1, target_q_1,
                                      target_q_1_bound)
            self.update_net(q_loss_1, self.Q_net1_optimizer, self.Q_net1,
                            self.Q_net1_share, self.scheduler_Q_net1)
            if self.args.double_Q:
                if self.args.double_actor:
                    q_loss_2 = self.get_qloss(q_2, q_std_2, target_q_2,
                                              target_q_2_bound)
                    self.update_net(q_loss_2, self.Q_net2_optimizer,
                                    self.Q_net2, self.Q_net2_share,
                                    self.scheduler_Q_net2)
                else:
                    q_loss_2 = self.get_qloss(q_2, q_std_2, target_q_1,
                                              target_q_1_bound)
                    self.update_net(q_loss_2, self.Q_net2_optimizer,
                                    self.Q_net2, self.Q_net2_share,
                                    self.scheduler_Q_net2)

            # Delayed policy updates + Polyak averaging of the actor targets.
            if self.args.code_model == "train":
                if local_iteration % self.args.delay_update == 0:
                    policy_loss_1 = self.get_policyloss(
                        q_object_1, log_prob_a_new_1)
                    self.update_net(policy_loss_1, self.actor1_optimizer,
                                    self.actor1, self.actor1_share,
                                    self.scheduler_actor1)
                    slow_sync_param(self.actor1_share,
                                    self.actor1_target_share, self.args.tau,
                                    self.gpu)
                    if self.args.double_actor:
                        policy_loss_2 = self.get_policyloss(
                            q_object_2, log_prob_a_new_2)
                        self.update_net(policy_loss_2, self.actor2_optimizer,
                                        self.actor2, self.actor2_share,
                                        self.scheduler_actor2)
                        slow_sync_param(self.actor2_share,
                                        self.actor2_target_share,
                                        self.args.tau, self.gpu)

            # Polyak averaging of the critic targets.
            if local_iteration % self.args.delay_update == 0:
                slow_sync_param(self.Q_net1_share, self.Q_net1_target_share,
                                self.args.tau, self.gpu)
                if self.args.double_Q:
                    slow_sync_param(self.Q_net2_share,
                                    self.Q_net2_target_share, self.args.tau,
                                    self.gpu)

            with self.lock:
                self.iteration_counter.value += 1
            local_iteration += 1

            # Periodic checkpointing (every learner saves; iteration 0 is
            # only saved by agent 0).
            if self.iteration % self.args.save_model_period == 0 or (
                    self.iteration == 0 and self.agent_id == 0):
                torch.save(
                    self.actor1.state_dict(), './' + self.args.env_name +
                    '/method_' + str(self.args.method) + '/model/policy1_' +
                    str(self.iteration) + '.pkl')
                torch.save(
                    self.Q_net1.state_dict(), './' + self.args.env_name +
                    '/method_' + str(self.args.method) + '/model/Q1_' +
                    str(self.iteration) + '.pkl')
                if self.args.alpha == 'auto':
                    np.save(
                        './' + self.args.env_name + '/method_' +
                        str(self.args.method) + '/model/log_alpha' +
                        str(self.iteration),
                        self.log_alpha.detach().cpu().numpy())
                if self.args.double_Q:
                    torch.save(
                        self.Q_net2.state_dict(), './' + self.args.env_name +
                        '/method_' + str(self.args.method) + '/model/Q2_' +
                        str(self.iteration) + '.pkl')
                if self.args.double_actor:
                    torch.save(
                        self.actor2.state_dict(), './' + self.args.env_name +
                        '/method_' + str(self.args.method) +
                        '/model/policy2_' + str(self.iteration) + '.pkl')

            # NOTE(review): precedence is `A or (B and C)` — every agent
            # prints at multiples of 500; confirm that matches the intent.
            if self.iteration % 500 == 0 or self.iteration == 0 and self.agent_id == 0:
                print("agent", self.agent_id, "method", self.args.method,
                      "iteration", self.iteration, "time",
                      time.time() - self.init_time)
                print("loss_1", q_loss_1, "alpha", self.alpha, "lr",
                      self.scheduler_Q_net1.get_lr(),
                      self.scheduler_Q_net2.get_lr(),
                      self.scheduler_actor1.get_lr(),
                      self.scheduler_actor2.get_lr(),
                      self.scheduler_alpha.get_lr())
                print("q_std", q_std_1.t()[0][0:8])
                print("a_std", a_new_std_1.t()[0][0:8])
Example #15
0
class Test():
    """Rollout process that periodically reloads the latest policy from a
    shared queue, runs it in a Gym environment with rendering, and logs
    epoch rewards to .npy files until the shared stop flag is set.
    """
    def __init__(self, args, shared_queue, shared_value):
        super(Test, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.policy_test_queue = shared_queue[3]  # (iteration, state_dict) pairs
        self.stop_sign = shared_value[1]          # shared shutdown flag
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args.state_dim, args.num_hidden_cell,
                               args.action_high, args.action_low,
                               args.NN_type).to(self.device)
        self.test_step = 0
        self.epoch_length = 1000     # steps per logging epoch
        self.save_interval = 10000   # steps between .npy dumps
        self.iteration = 0
        self.reward_history = []
        self.iteration_history = []

    def load_param(self):
        """Non-blocking poll: load the newest policy weights if available."""
        if self.policy_test_queue.empty():
            pass
        else:
            self.iteration, param = self.policy_test_queue.get()
            self.actor.load_state_dict(param)

    def run(self):
        """Roll out the current policy until the stop flag is set."""
        epoch = 0
        step = 0
        epoch_reward = 0
        """
        write_stop = 0
        writer = SummaryWriter(comment="test", log_dir='compare'+str(self.args.method))
        """
        while not self.stop_sign.value:
            self.state = self.env.reset()
            self.episode_step = 0
            self.micro_step = 0
            state_tensor = torch.FloatTensor(self.state.copy()).float().to(
                self.device)
            if self.args.NN_type == "CNN":
                # HWC -> CHW for convolutional policies.
                state_tensor = state_tensor.permute(2, 0, 1)
            self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0),
                                                     False)
            self.u = self.u.squeeze(0)
            accumulate_reward = 0
            for i in range(self.args.max_step):
                self.state, self.reward, self.done, self.load_action = self.env.step(
                    self.u)
                # NOTE(review): `step % 10000` is always in [0, 9999], so
                # this condition is always true — likely a leftover gate.
                if step % 10000 >= 0 and step % 10000 <= 9999:
                    epoch_reward += self.reward / self.epoch_length
                    self.env.render(mode='human')
                state_tensor = torch.FloatTensor(self.state.copy()).float().to(
                    self.device)
                if self.args.NN_type == "CNN":
                    state_tensor = state_tensor.permute(2, 0, 1)
                self.u, log_prob = self.actor.get_action(
                    state_tensor.unsqueeze(0), False)
                self.u = self.u.squeeze(0)
                if self.done == True:
                    time.sleep(1)
                    break

                step += 1
                self.episode_step += 1

                # Epoch boundary: record reward and refresh the policy.
                if step % self.epoch_length == 0:
                    self.iteration_history.append(self.iteration)
                    self.reward_history.append(epoch_reward)
                    self.load_param()
                    epoch_reward = 0
                    epoch += 1
                    print(epoch)

                # Periodically dump histories; stop once training finished.
                if step % self.save_interval == 0:
                    np.save(
                        './data/method_' + str(self.args.method) +
                        '/result/iteration', np.array(self.iteration_history))
                    np.save(
                        './data/method_' + str(self.args.method) +
                        '/result/reward', np.array(self.reward_history))
                    if self.iteration >= self.args.max_train:
                        self.stop_sign.value = 1
                        break
                # NOTE(review): the `"""` below opens a string that is not
                # closed within this chunk — likely a truncated commented-out
                # block; confirm against the full file.
                """
Example #16
0
class Evaluator(object):
    def __init__(self, args, shared_value, share_net):
        """Set up a CPU-side evaluator: seeded RNGs, an evaluation
        environment, local copies of the shared networks, and the history
        buffers that run() appends to."""
        # Deterministic seeding for reproducible evaluations.
        rng_seed = args.seed
        np.random.seed(rng_seed)
        torch.manual_seed(rng_seed)

        # Environment configuration used only for evaluation rollouts.
        eval_params = {
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': int(2000 + 3 * args.num_actors),  # connection port
            'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 100,  # maximum timesteps per episode
            'desired_speed': 15,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        self.args = args
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.env = gym.make(args.env_name, params=eval_params)
        self.device = torch.device("cpu")

        # Local CPU copies of the shared networks, plus handles to the
        # shared ones so run() can re-sync weights.
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.actor_share = share_net[4]
        self.log_alpha_share = share_net[-1]
        if args.alpha == 'auto':
            self.alpha = np.exp(self.log_alpha_share.detach().item())
        else:
            self.alpha = 0

        # Evaluation schedule and result histories.
        self.evaluation_interval = 20000
        self.max_state_num_evaluated_in_an_episode = 50  # 500
        self.episode_num_evaluation = 5
        self.episode_num_test = 5
        self.time = time.time()
        self.list_of_n_episode_rewards_history = []
        self.time_history = []
        self.alpha_history = []
        self.average_return_with_diff_base_history = []
        self.average_reward_history = []
        self.iteration_history = []
        self.evaluated_Q_mean_history = []
        self.evaluated_Q_std_history = []
        self.true_gamma_return_mean_history = []
        self.policy_entropy_history = []
        self.a_std_history = []
        self.a_abs_history = []

    def average_max_n(self, list_for_average, n):
        """Return the mean of the n largest values in *list_for_average*."""
        top_n = sorted(list_for_average)[-n:]
        return sum(top_n) / n

    def run_an_episode(self, deterministic):
        """Roll out one episode with the local actor.

        When ``deterministic`` is False, returns per-step Q estimates,
        Q stds, discounted returns with entropy bonus, policy entropy and
        action statistics; when True, returns only the undiscounted episode
        return and length.
        """
        #state_list = []
        action_list = []
        log_prob_list = []
        reward_list = []
        evaluated_Q_list = []
        Q_std_list = []
        a_std_list = []
        done = 0
        state, info = self.env.reset()
        while not done and len(reward_list) < (self.args.max_step - 1):
            state_tensor = torch.FloatTensor(state.copy()).float().to(
                self.device)
            info_tensor = torch.FloatTensor(info.copy()).float().to(
                self.device)
            if self.args.NN_type == "CNN":
                state_tensor = state_tensor.permute(2, 0, 1)  # 3, 256, 256

            u, log_prob, a_std = self.actor.get_action(
                state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                deterministic)
            log_prob_list.append(log_prob)
            a_std_list.append(a_std)
            # Clipped double-Q estimate unless a single (possibly
            # distributional) critic is configured.
            if self.args.double_Q and not self.args.double_actor:
                q = torch.min(
                    self.Q_net1.evaluate(
                        state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                        torch.FloatTensor(u.copy()).to(self.device))[0],
                    self.Q_net2.evaluate(
                        state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                        torch.FloatTensor(u.copy()).to(self.device))[0])
            else:
                q, q_std, _ = self.Q_net1.evaluate(
                    state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                    torch.FloatTensor(u.copy()).to(self.device))
            evaluated_Q_list.append(q.detach().item())
            # NOTE(review): in the double_Q branch above `q_std` is never
            # assigned, so distributional_Q + double_Q would raise
            # NameError here — confirm those flags are mutually exclusive.
            if self.args.distributional_Q:
                Q_std_list.append(q_std.detach().item())
            else:
                Q_std_list.append(0)
            u = u.squeeze(0)
            state, reward, done, info = self.env.step(u)
            # self.env.render(mode='human')
            action_list.append(u)
            reward_list.append(reward * self.args.reward_scale)

        if not deterministic:
            # Entropy-augmented discounted returns (soft value targets).
            entropy_list = list(-self.alpha * np.array(log_prob_list))
            true_gamma_return_list = cal_gamma_return_of_an_episode(
                reward_list, entropy_list, self.args.gamma)
            policy_entropy = -sum(log_prob_list) / len(log_prob_list)
            a_std_mean = np.mean(np.array(a_std_list), axis=0)
            a_abs_mean = np.mean(np.abs(np.array(action_list)), axis=0)
            return dict(  #state_list=np.array(state_list),
                #action_list=np.array(action_list),
                log_prob_list=np.array(log_prob_list),
                policy_entropy=policy_entropy,
                #reward_list=np.array(reward_list),
                a_std_mean=a_std_mean,
                a_abs_mean=a_abs_mean,
                evaluated_Q_list=np.array(evaluated_Q_list),
                Q_std_list=np.array(Q_std_list),
                true_gamma_return_list=true_gamma_return_list,
            )
        else:
            # Undo reward scaling so the reported return is in env units.
            episode_return = sum(reward_list) / self.args.reward_scale
            episode_len = len(reward_list)
            return dict(episode_return=episode_return, episode_len=episode_len)

    def run_n_episodes(self, n, max_state, deterministic):
        """Run ``n`` episodes and aggregate their statistics.

        Stochastic mode (deterministic=False): averages per-step Q
        estimates, Q stds and true discounted returns over the first
        ``max_state`` steps of each episode, plus entropy / action stats.
        Deterministic mode: returns best-of-n style average returns and
        the per-step average reward.
        """
        # NOTE(review): several of these accumulators are never filled
        # (state/log_prob/reward lists — appends are commented out below);
        # n_episode_reward_list is returned but always empty.
        n_episode_state_list = []
        n_episode_action_list = []
        n_episode_log_prob_list = []
        n_episode_reward_list = []
        n_episode_evaluated_Q_list = []
        n_episode_Q_std_list = []
        n_episode_true_gamma_return_list = []
        n_episode_return_list = []
        n_episode_len_list = []
        n_episode_policyentropy_list = []
        n_episode_a_std_list = []
        for _ in range(n):
            episode_info = self.run_an_episode(deterministic)
            # n_episode_state_list.append(episode_info['state_list'])
            # n_episode_action_list.append(episode_info['action_list'])
            # n_episode_log_prob_list.append(episode_info['log_prob_list'])
            # n_episode_reward_list.append(episode_info['reward_list'])
            if not deterministic:
                n_episode_evaluated_Q_list.append(
                    episode_info['evaluated_Q_list'])
                n_episode_Q_std_list.append(episode_info['Q_std_list'])
                n_episode_true_gamma_return_list.append(
                    episode_info['true_gamma_return_list'])
                n_episode_policyentropy_list.append(
                    episode_info['policy_entropy'])
                n_episode_a_std_list.append(episode_info['a_std_mean'])
                n_episode_action_list.append(episode_info['a_abs_mean'])
            else:
                n_episode_return_list.append(episode_info['episode_return'])
                n_episode_len_list.append(episode_info['episode_len'])

        if not deterministic:
            average_policy_entropy = sum(n_episode_policyentropy_list) / len(
                n_episode_policyentropy_list)
            average_a_std = np.mean(np.array(n_episode_a_std_list), axis=0)
            average_a_abs = np.mean(np.array(n_episode_action_list), axis=0)

            # n_episode_evaluated_Q_list_history = list(map(lambda x: x['n_episode_evaluated_Q_list'], n_episodes_info_history))
            # n_episode_true_gamma_return_list_history = list(map(lambda x: x['n_episode_true_gamma_return_list'], n_episodes_info_history))

            def concat_interest_epi_part_of_one_ite_and_mean(list_of_n_epi):
                # Truncate each episode's sequence to max_state steps,
                # concatenate all episodes, and return the scalar mean.
                tmp = list(copy.deepcopy(list_of_n_epi))
                tmp[0] = tmp[0] if len(
                    tmp[0]) <= max_state else tmp[0][:max_state]

                def reduce_fuc(a, b):
                    # NOTE(review): truncation tests len(b) < max_state,
                    # so an episode of exactly max_state steps is kept
                    # whole either way; the asymmetry with tmp[0]'s <= is
                    # harmless but inconsistent.
                    return np.concatenate(
                        [a, b]) if len(b) < max_state else np.concatenate(
                            [a, b[:max_state]])

                interest_epi_part_of_one_ite = reduce(reduce_fuc, tmp)
                return sum(interest_epi_part_of_one_ite) / len(
                    interest_epi_part_of_one_ite)

            evaluated_Q_mean = concat_interest_epi_part_of_one_ite_and_mean(
                np.array(n_episode_evaluated_Q_list))
            evaluated_Q_std = concat_interest_epi_part_of_one_ite_and_mean(
                np.array(n_episode_Q_std_list))
            true_gamma_return_mean = concat_interest_epi_part_of_one_ite_and_mean(
                np.array(n_episode_true_gamma_return_list))

            return dict(evaluated_Q_mean=evaluated_Q_mean,
                        true_gamma_return_mean=true_gamma_return_mean,
                        evaluated_Q_std=evaluated_Q_std,
                        n_episode_reward_list=np.array(n_episode_reward_list),
                        policy_entropy=average_policy_entropy,
                        a_std=average_a_std,
                        a_abs=average_a_abs)
        else:
            # Best-of-1, best-of-(n-2) and best-of-n average returns.
            average_return_with_diff_base = np.array([
                self.average_max_n(n_episode_return_list, x)
                for x in [1, self.episode_num_test - 2, self.episode_num_test]
            ])
            average_reward = sum(n_episode_return_list) / sum(
                n_episode_len_list)
            return dict(
                n_episode_reward_list=np.array(n_episode_reward_list),
                average_return_with_diff_base=average_return_with_diff_base,
                average_reward=average_reward,
            )

    def run(self):
        """Evaluation loop: poll the shared iteration counter and, on every
        multiple of ``self.evaluation_interval``, sync the shared networks
        into the local copies, run an evaluation pass and a test pass, append
        the statistics to the history lists, and persist every history as a
        ``.npy`` file under ``./<env_name>/method_<method>/result/``.

        Raises the shared ``stop_sign`` (and exits) once the training
        iteration reaches ``args.max_train``.
        """
        # All result files share one directory; build the prefix once.
        result_dir = ('./' + self.args.env_name + '/method_' +
                      str(self.args.method) + '/result/')
        while not self.stop_sign.value:
            # NOTE(review): the counter is polled in a tight loop, so a single
            # counter value that is a multiple of the interval may trigger
            # several evaluations -- behaviour kept from the original.
            if self.iteration_counter.value % self.evaluation_interval == 0:
                self.alpha = np.exp(self.log_alpha_share.detach().item()
                                    ) if self.args.alpha == 'auto' else 0
                self.iteration = self.iteration_counter.value
                # Pull the freshest shared parameters into the local nets.
                self.actor.load_state_dict(self.actor_share.state_dict())
                self.Q_net1.load_state_dict(self.Q_net1_share.state_dict())
                self.Q_net2.load_state_dict(self.Q_net2_share.state_dict())

                # Wall-clock time since the previous evaluation round
                # (assumes self.time was initialised elsewhere -- TODO confirm).
                delta_time = time.time() - self.time
                self.time = time.time()

                # Evaluation pass: Q estimates vs. true discounted returns.
                n_episode_info = self.run_n_episodes(
                    self.episode_num_evaluation,
                    self.max_state_num_evaluated_in_an_episode, False)
                self.iteration_history.append(self.iteration)
                self.evaluated_Q_mean_history.append(
                    n_episode_info['evaluated_Q_mean'])
                self.evaluated_Q_std_history.append(
                    n_episode_info['evaluated_Q_std'])
                self.true_gamma_return_mean_history.append(
                    n_episode_info['true_gamma_return_mean'])
                self.time_history.append(delta_time)
                # self.list_of_n_episode_rewards_history.append(list_of_n_episode_rewards)
                # BUG FIX: when args.alpha != 'auto', self.alpha is the plain
                # int 0, which has no .item(); float() accepts both that and
                # the numpy scalar produced by np.exp().
                self.alpha_history.append(float(self.alpha))
                self.policy_entropy_history.append(
                    n_episode_info['policy_entropy'])
                self.a_std_history.append(n_episode_info['a_std'])
                self.a_abs_history.append(n_episode_info['a_abs'])

                # Test pass: plain episode-return statistics.
                n_episode_info_test = self.run_n_episodes(
                    self.episode_num_test,
                    self.max_state_num_evaluated_in_an_episode, True)
                self.average_return_with_diff_base_history.append(
                    n_episode_info_test['average_return_with_diff_base'])
                self.average_reward_history.append(
                    n_episode_info_test['average_reward'])

                print('Saving evaluation results of the {} iteration.'.format(
                    self.iteration))
                # Same files, same order as the original hand-unrolled saves.
                for fname, history in (
                        ('iteration', self.iteration_history),
                        ('evaluated_Q_mean', self.evaluated_Q_mean_history),
                        ('evaluated_Q_std', self.evaluated_Q_std_history),
                        ('true_gamma_return_mean',
                         self.true_gamma_return_mean_history),
                        ('time', self.time_history),
                        ('average_return_with_diff_base',
                         self.average_return_with_diff_base_history),
                        ('average_reward', self.average_reward_history),
                        ('alpha', self.alpha_history),
                        ('policy_entropy', self.policy_entropy_history),
                        ('a_std', self.a_std_history),
                        ('a_abs', self.a_abs_history)):
                    np.save(result_dir + fname, np.array(history))

                # plot_online(self.args.env_name, self.args.method, self.args.method_name,
                #             self.max_state_num_evaluated_in_an_episode)

                if self.iteration >= self.args.max_train:
                    self.stop_sign.value = 1
                    break
class Actor():
    """Worker process: rolls out the shared policy in a gym environment and
    streams transitions into the shared experience queues."""

    def __init__(self, args, shared_queue, shared_value, share_net, lock, i):
        super(Actor, self).__init__()
        self.agent_id = i
        # Per-worker seed so parallel actors explore differently.
        seed = args.seed + np.int64(self.agent_id)
        np.random.seed(seed)
        torch.manual_seed(seed)

        self.counter = shared_value[0]    # shared global environment-step counter
        self.stop_sign = shared_value[1]  # shared flag: training should stop
        self.lock = lock
        self.env = gym.make(args.env_name)
        self.args = args
        self.experience_in_queue = []
        for i in range(args.num_buffers):
            self.experience_in_queue.append(shared_queue[0][i])

        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)

        #share_net = [Q_net1,Q_net1_target,Q_net2,Q_net2_target,actor,actor_target,log_alpha]
        #share_optimizer=[Q_net1_optimizer,Q_net2_optimizer,actor_optimizer,alpha_optimizer]
        self.Q_net1_share = share_net[1]
        self.actor_share = share_net[0]

    def put_data(self):
        """Push the latest transition into a randomly chosen buffer queue.

        Retries (with a short sleep) while the chosen queue is full, and does
        nothing once the stop sign is raised.  BUG FIX: the original recursed
        on every retry, which can exhaust the recursion stack if the consumers
        stall for a long time; this iterative loop has identical behaviour.
        """
        while not self.stop_sign.value:
            index = np.random.randint(0, self.args.num_buffers)
            if self.experience_in_queue[index].full():
                #print("agent", self.agent_id, "is waiting queue space")
                time.sleep(0.5)
                continue
            self.experience_in_queue[index].put(
                (self.last_state, self.last_u,
                 [self.reward * self.args.reward_scale], self.state,
                 [self.done], self.TD.detach().cpu().numpy().squeeze()))
            return

    def run(self):
        """Interact with the environment until the stop sign is raised,
        emitting one (s, a, r, s', done, TD) tuple per step."""
        step = 0
        while not self.stop_sign.value:
            self.state = self.env.reset()
            self.episode_step = 0
            state_tensor = torch.FloatTensor(self.state.copy()).float().to(
                self.device)
            if self.args.NN_type == "CNN":
                # CNN policies expect channel-first input (C, H, W).
                state_tensor = state_tensor.permute(2, 0, 1)
            self.u, _ = self.actor.get_action(state_tensor.unsqueeze(0), False)
            self.u = self.u.squeeze(0)
            self.last_state = self.state.copy()
            self.last_u = self.u.copy()
            for i in range(self.args.max_step - 1):
                self.state, self.reward, self.done, _ = self.env.step(self.u)
                state_tensor = torch.FloatTensor(self.state.copy()).float().to(
                    self.device)
                if self.args.NN_type == "CNN":
                    state_tensor = state_tensor.permute(2, 0, 1)
                self.u, _ = self.actor.get_action(state_tensor.unsqueeze(0),
                                                  False)
                self.u = self.u.squeeze(0)

                # TD priority placeholder; the real TD-error computation is
                # disabled in this variant.
                self.TD = torch.zeros(1)
                self.put_data()
                self.last_state = self.state.copy()
                self.last_u = self.u.copy()

                with self.lock:
                    self.counter.value += 1

                if self.done:
                    break

                if step % self.args.load_param_period == 0:
                    # Refresh the local policy from the shared network.
                    self.actor.load_state_dict(self.actor_share.state_dict())
                step += 1
                self.episode_step += 1
class Simulation():
    """Loads a trained policy and Q network(s) from disk and replays episodes
    with rendering; the (currently unreachable, see NOTE in run) tail code
    compares the critic's Q estimates against realised soft returns."""

    def __init__(self, args, shared_value):
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        # Load the final checkpoint produced by training.
        self.load_index = self.args.max_train

        self.actor = PolicyNet(args).to(self.device)
        self.actor.load_state_dict(
            torch.load('./' + self.args.env_name + '/method_' +
                       str(self.args.method) + '/model/policy1_' +
                       str(self.load_index) + '.pkl',
                       map_location='cpu'))

        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net1.load_state_dict(
            torch.load('./' + self.args.env_name + '/method_' +
                       str(self.args.method) + '/model/Q1_' +
                       str(self.load_index) + '.pkl',
                       map_location='cpu'))

        if self.args.double_Q:
            self.Q_net2 = QNet(args).to(self.device)
            self.Q_net2.load_state_dict(
                torch.load('./' + self.args.env_name + '/method_' +
                           str(self.args.method) + '/model/Q2_' +
                           str(self.load_index) + '.pkl',
                           map_location='cpu'))

        self.test_step = 0
        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history = []
        self.done_history = []
        self.Q_real_history = []
        self.Q_history = []
        self.Q_std_history = []

    def run(self):
        """Replay episodes forever, recording rewards, done flags, entropies
        and Q estimates per step."""
        # Fixed entropy temperature used when reconstructing the soft return.
        alpha = 0.004

        step = 0
        # NOTE(review): this loop never breaks, so everything after it is
        # unreachable -- kept as-is to preserve behaviour.
        while True:
            self.state = self.env.reset()
            self.episode_step = 0

            for i in range(300):
                state_tensor = torch.FloatTensor(self.state.copy()).float().to(
                    self.device)
                if self.args.NN_type == "CNN":
                    state_tensor = state_tensor.permute(2, 0, 1)
                self.u, log_prob, _ = self.actor.get_action(
                    state_tensor.unsqueeze(0), True)

                q = self.Q_net1(state_tensor.unsqueeze(0),
                                torch.FloatTensor(self.u).to(self.device))[0]
                if self.args.double_Q:
                    # Clipped double-Q: take the minimum of both critics.
                    # (The original re-ran the identical Q_net1 forward here;
                    # the result is reused instead.)
                    q = torch.min(
                        q,
                        self.Q_net2(state_tensor.unsqueeze(0),
                                    torch.FloatTensor(self.u).to(
                                        self.device))[0])

                self.u = self.u.squeeze(0)
                self.state, self.reward, self.done, _ = self.env.step(self.u)

                self.Q_history.append(q.detach().item())
                self.reward_history.append(self.reward)
                self.done_history.append(self.done)
                self.entropy_history.append(log_prob)

                # BUG FIX: the original guard
                # `step % 10000 >= 0 and step % 10000 <= 9999` was always
                # true, so rendering happens unconditionally.
                self.env.render(mode='human')

                if self.done:
                    time.sleep(1)
                    print("!!!!!!!!!!!!!!!")
                    break
                step += 1
                self.episode_step += 1

            if self.done:
                pass
                #break

        # NOTE(review): unreachable while the loop above never breaks.
        print(self.reward_history)
        for i in range(len(self.Q_history)):
            ret = 0
            # Discounted reward-to-go from step i ...
            for j in range(i, len(self.Q_history), 1):
                ret += pow(self.args.gamma, j - i) * self.reward_history[j]
            # ... minus the discounted future entropy penalty (soft return).
            for z in range(i + 1, len(self.Q_history), 1):
                ret -= alpha * pow(self.args.gamma,
                                   z - i) * self.entropy_history[z]
            self.Q_real_history.append(ret)

        plt.figure()
        x = np.arange(0, len(self.Q_history), 1)
        plt.plot(x, np.array(self.Q_history), 'r', linewidth=2.0)
        plt.plot(x, np.array(self.Q_real_history), 'k', linewidth=2.0)

        plt.show()
# Example #19
# 0
class Actor():
    """Worker process for the CARLA variant: rolls out the shared policy in a
    gym-wrapped CARLA environment (state image + info vector) and streams
    transitions into the shared experience queues."""

    def __init__(self, args, shared_queue, shared_value, share_net, lock, i):
        super(Actor, self).__init__()
        self.agent_id = i
        # Per-worker seed so parallel actors explore differently.
        seed = args.seed + np.int64(self.agent_id)
        np.random.seed(seed)
        torch.manual_seed(seed)

        actor_params = {
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter':
            'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': int(2000 + 3 * self.agent_id),  # connection port
            'task_mode':
            'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'train',
            'max_time_episode': 100,  # maximum timesteps per episode
            'desired_speed': 15,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        self.counter = shared_value[0]    # shared global environment-step counter
        self.stop_sign = shared_value[1]  # shared flag: training should stop
        self.lock = lock
        self.env = gym.make(args.env_name, params=actor_params)
        self.args = args
        self.experience_in_queue = []
        for i in range(args.num_buffers):
            self.experience_in_queue.append(shared_queue[0][i])

        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        # self.Q_net1 = QNet(args).to(self.device)

        #share_net = [Q_net1,Q_net1_target,Q_net2,Q_net2_target,actor,actor_target,log_alpha]
        #share_optimizer=[Q_net1_optimizer,Q_net2_optimizer,actor_optimizer,alpha_optimizer]
        self.Q_net1_share = share_net[1]
        self.actor_share = share_net[0]

    def put_data(self):
        """Push the latest transition into a randomly chosen buffer queue.

        Retries (with a short sleep) while the chosen queue is full, and does
        nothing once the stop sign is raised.  BUG FIX: the original recursed
        on every retry, which can exhaust the recursion stack if the consumers
        stall for a long time; this iterative loop has identical behaviour.
        """
        while not self.stop_sign.value:
            index = np.random.randint(0, self.args.num_buffers)
            if self.experience_in_queue[index].full():
                #print("agent", self.agent_id, "is waiting queue space")
                time.sleep(0.5)
                continue
            self.experience_in_queue[index].put(
                (self.state, self.info, self.u,
                 [self.reward * self.args.reward_scale], self.state_next,
                 self.info_next, [self.done],
                 self.TD.detach().cpu().numpy().squeeze()))
            return

    def run(self):
        """Interact with the environment until the stop sign is raised,
        emitting one (s, info, a, r, s', info', done, TD) tuple per step."""
        step = 0
        while not self.stop_sign.value:
            self.state, self.info = self.env.reset()
            self.episode_step = 0

            for i in range(self.args.max_step - 1):
                state_tensor = torch.FloatTensor(self.state.copy()).float().to(
                    self.device)
                info_tensor = torch.FloatTensor(self.info.copy()).float().to(
                    self.device)

                if self.args.NN_type == "CNN":
                    # CNN policies expect channel-first input (C, H, W).
                    state_tensor = state_tensor.permute(2, 0, 1)
                self.u, _, _ = self.actor.get_action(state_tensor.unsqueeze(0),
                                                     info_tensor.unsqueeze(0),
                                                     False)
                self.u = self.u.squeeze(0)
                self.state_next, self.reward, self.done, self.info_next = self.env.step(
                    self.u)
                # TD priority placeholder; the real TD-error computation is
                # disabled in this variant.
                self.TD = torch.zeros(1)
                self.put_data()
                self.state = self.state_next.copy()
                self.info = self.info_next.copy()

                with self.lock:
                    self.counter.value += 1

                if self.done:
                    break

                if step % self.args.load_param_period == 0:
                    #self.Q_net1.load_state_dict(self.Q_net1_share.state_dict())
                    self.actor.load_state_dict(self.actor_share.state_dict())
                step += 1
                self.episode_step += 1
def main(method):
    """Build the shared networks/optimizers and launch the worker processes
    for training method ``method``.

    In 'train' mode it spawns ``num_actors`` actor processes, ``num_buffers``
    buffer processes, one evaluator and ``num_learners`` learner processes;
    in 'simu' mode it spawns a single simulation process.  Blocks until every
    child process has joined.
    """
    args = built_parser(method=method)

    def make_shared_net(net_cls):
        # Network in train mode with parameters in shared memory so every
        # process sees the learner's updates.
        net = net_cls(args)
        net.train()
        net.share_memory()
        return net

    def make_shared_optimizer(params, lr):
        # SharedAdam keeps its optimizer state in shared memory too.
        optimizer = my_optim.SharedAdam(params, lr=lr)
        optimizer.share_memory()
        return optimizer

    env = gym.make(args.env_name)
    args.state_dim = env.observation_space.shape
    args.action_dim = env.action_space.shape[0]
    args.action_high = env.action_space.high.tolist()
    args.action_low = env.action_space.low.tolist()
    # NOTE(review): the global NumPy RNG is not seeded before this call, so
    # the run seed itself is non-reproducible -- confirm this is intended.
    args.seed = np.random.randint(0, 30)
    args.init_time = time.time()

    if args.alpha == 'auto' and args.target_entropy == 'auto':
        # Standard SAC heuristic: target entropy = -dim(A).  (The original
        # also computed delta_a = action_high - action_low solely for a
        # commented-out "+ sum(np.log(delta_a/2))" term; dropped as unused.)
        args.target_entropy = -1 * args.action_dim

    Q_net1 = make_shared_net(QNet)
    Q_net1_target = make_shared_net(QNet)
    Q_net2 = make_shared_net(QNet)
    Q_net2_target = make_shared_net(QNet)
    actor1 = make_shared_net(PolicyNet)
    actor1_target = make_shared_net(PolicyNet)
    actor2 = make_shared_net(PolicyNet)
    actor2_target = make_shared_net(PolicyNet)

    # Target networks start as exact copies of their online counterparts.
    Q_net1_target.load_state_dict(Q_net1.state_dict())
    Q_net2_target.load_state_dict(Q_net2.state_dict())
    actor1_target.load_state_dict(actor1.state_dict())
    actor2_target.load_state_dict(actor2.state_dict())

    Q_net1_optimizer = make_shared_optimizer(Q_net1.parameters(),
                                             args.critic_lr)
    Q_net2_optimizer = make_shared_optimizer(Q_net2.parameters(),
                                             args.critic_lr)
    actor1_optimizer = make_shared_optimizer(actor1.parameters(),
                                             args.actor_lr)
    actor2_optimizer = make_shared_optimizer(actor2.parameters(),
                                             args.actor_lr)

    # Shared (log) entropy temperature, optimized when args.alpha == 'auto'.
    log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    log_alpha.share_memory_()
    alpha_optimizer = make_shared_optimizer([log_alpha], args.alpha_lr)

    share_net = [
        Q_net1, Q_net1_target, Q_net2, Q_net2_target, actor1, actor1_target,
        actor2, actor2_target, log_alpha
    ]
    share_optimizer = [
        Q_net1_optimizer, Q_net2_optimizer, actor1_optimizer, actor2_optimizer,
        alpha_optimizer
    ]

    experience_in_queue = []
    experience_out_queue = []
    for i in range(args.num_buffers):
        experience_in_queue.append(Queue(maxsize=10))
        experience_out_queue.append(Queue(maxsize=10))
    shared_queue = [experience_in_queue, experience_out_queue]

    step_counter = mp.Value('i', 0)       # total environment steps taken
    stop_sign = mp.Value('i', 0)          # raised by the evaluator at max_train
    iteration_counter = mp.Value('i', 0)  # learner gradient iterations
    shared_value = [step_counter, stop_sign, iteration_counter]
    lock = mp.Lock()

    procs = []
    if args.code_model == "train":
        for i in range(args.num_actors):
            procs.append(
                Process(target=actor_agent,
                        args=(args, shared_queue, shared_value,
                              [actor1, Q_net1], lock, i)))
        for i in range(args.num_buffers):
            procs.append(
                Process(target=buffer,
                        args=(args, shared_queue, shared_value, i)))
        procs.append(
            Process(target=evaluate_agent,
                    args=(args, shared_value, share_net)))
        for i in range(args.num_learners):
            #device = torch.device("cuda")
            device = torch.device("cpu")
            procs.append(
                Process(target=leaner_agent,
                        args=(args, shared_queue, shared_value, share_net,
                              share_optimizer, device, lock, i)))
    elif args.code_model == "simu":
        procs.append(Process(target=simu_agent, args=(args, shared_value)))

    for p in procs:
        p.start()
    for p in procs:
        p.join()
class Evaluator(object):
    """Evaluation process.

    Periodically copies the shared actor/critic parameters into local CPU
    networks, rolls out evaluation episodes, and saves the mean evaluated Q
    values together with the true discounted ("gamma") returns so the
    critic's estimation bias can be tracked over training.
    """

    def __init__(self, args, shared_value, share_net):
        # Seed numpy/torch so evaluation rollouts are reproducible.
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.stop_sign = shared_value[1]          # shared flag: training should stop
        self.iteration_counter = shared_value[2]  # shared learner iteration count
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        # Local copies of the shared networks; refreshed in run().
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        # share_net layout (see main()):
        # [Q_net1, Q_net1_target, Q_net2, Q_net2_target, actor1, ..., log_alpha]
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        # Entropy temperature; plain 0 when alpha is fixed (not 'auto').
        self.alpha = np.exp(self.log_alpha_share.detach().item()) if args.alpha == 'auto' else 0

        self.evaluation_interval = 50000
        self.max_state_num_evaluated_in_an_episode = 500
        self.episode_num_to_run = 10
        self.iteration_history = []
        self.evaluated_Q_mean_history=[]
        self.true_gamma_return_mean_history=[]
        # self.n_episodes_info_history = []
        self.evaluated_Q_history = []
        self.true_gamma_return_history = []

    def run_an_episode(self):
        """Roll out one episode with the current local networks.

        Returns a dict with per-step states/actions/log-probs/rewards, the
        per-step evaluated Q values, the true discounted soft return per
        step, and the episode return and length.
        """
        state_list = []
        action_list = []
        log_prob_list = []
        reward_list = []
        evaluated_Q_list = []
        done = 0
        state = self.env.reset()
        while not done and len(reward_list) < self.args.max_step:
            state_tensor = torch.FloatTensor(state.copy()).float().to(self.device)
            u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), self.args.stochastic_actor)
            state_list.append(state.copy())
            action_list.append(u.copy())
            log_prob_list.append(log_prob)
            # Clipped double-Q evaluation unless the twin-actor variant is used.
            if self.args.double_Q and not self.args.double_actor:
                q = torch.min(
                    self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0],
                    self.Q_net2(state_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0])
            else:
                q = self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0]
            evaluated_Q_list.append(q.detach().item())
            u = u.squeeze(0)
            state, reward, done, load_action = self.env.step(u)
            # self.env.render(mode='human')
            reward_list.append(reward * self.args.reward_scale)
        # Per-step entropy bonus -alpha * log_prob, folded into the soft return.
        entropy_list = list(-self.alpha * np.array(log_prob_list))
        true_gamma_return_list = cal_gamma_return_of_an_episode(reward_list, entropy_list, self.args.gamma)
        episode_return = sum(reward_list)
        episode_len = len(reward_list)

        return dict(state_list=np.array(state_list),
                    action_list=np.array(action_list),
                    log_prob_list=np.array(log_prob_list),
                    reward_list=np.array(reward_list),
                    evaluated_Q_list=np.array(evaluated_Q_list),
                    true_gamma_return_list=true_gamma_return_list,
                    episode_return=episode_return,
                    episode_len=episode_len)

    def run_n_episodes(self, n, max_state):
        """Run ``n`` episodes and average the evaluated Q and true gamma
        return over the first ``max_state`` steps of every episode.

        Returns the tuple (evaluated_Q_mean, true_gamma_return_mean).
        """
        n_episode_state_list = []
        n_episode_action_list = []
        n_episode_log_prob_list = []
        n_episode_reward_list = []
        n_episode_evaluated_Q_list = []
        n_episode_true_gamma_return_list = []
        n_episode_return_list = []
        n_episode_len_list = []
        for _ in range(n):
            episode_info = self.run_an_episode()
            n_episode_state_list.append(episode_info['state_list'])
            n_episode_action_list.append(episode_info['action_list'])
            n_episode_log_prob_list.append(episode_info['log_prob_list'])
            n_episode_reward_list.append(episode_info['reward_list'])
            n_episode_evaluated_Q_list.append(episode_info['evaluated_Q_list'])
            n_episode_true_gamma_return_list.append(episode_info['true_gamma_return_list'])
            n_episode_return_list.append(episode_info['episode_return'])
            n_episode_len_list.append(episode_info['episode_len'])

        #n_episode_evaluated_Q_list_history = list(map(lambda x: x['n_episode_evaluated_Q_list'], n_episodes_info_history))
        #n_episode_true_gamma_return_list_history = list(map(lambda x: x['n_episode_true_gamma_return_list'], n_episodes_info_history))

        def concat_interest_epi_part_of_one_ite_and_mean(list_of_n_epi):
            # Truncate every episode's trace to max_state steps, concatenate
            # all episodes, and return the mean over the concatenation.
            tmp = list(copy.deepcopy(list_of_n_epi))
            tmp[0] = tmp[0] if len(tmp[0]) <= max_state else tmp[0][:max_state]

            def reduce_fuc(a, b):
                return np.concatenate([a, b]) if len(b) < max_state else np.concatenate([a, b[:max_state]])

            interest_epi_part_of_one_ite = reduce(reduce_fuc, tmp)
            return sum(interest_epi_part_of_one_ite) / len(interest_epi_part_of_one_ite)

        evaluated_Q_mean = concat_interest_epi_part_of_one_ite_and_mean(np.array(n_episode_evaluated_Q_list))

        true_gamma_return_mean = concat_interest_epi_part_of_one_ite_and_mean(
            np.array(n_episode_true_gamma_return_list))
        return evaluated_Q_mean, true_gamma_return_mean
        # return dict(n_episode_state_list=np.array(n_episode_state_list),
        #             n_episode_action_list=np.array(n_episode_action_list),
        #             n_episode_log_prob_list=np.array(n_episode_log_prob_list),
        #             n_episode_reward_list=np.array(n_episode_reward_list),
        #             n_episode_evaluated_Q_list=np.array(n_episode_evaluated_Q_list),
        #             n_episode_true_gamma_return_list=np.array(n_episode_true_gamma_return_list),
        #             n_episode_return_list=np.array(n_episode_return_list),
        #             n_episode_len_list=np.array(n_episode_len_list))

    def run(self):
        """Poll the shared iteration counter; on every multiple of
        ``evaluation_interval`` refresh the local nets, evaluate, save the
        histories as .npy files and refresh the online plot.
        """
        while not self.stop_sign.value:
            # NOTE(review): while the counter sits on a multiple of the
            # interval this re-evaluates on every pass -- presumably tolerated.
            if self.iteration_counter.value % self.evaluation_interval == 0:
                self.alpha = np.exp(self.log_alpha_share.detach().item()) if self.args.alpha == 'auto' else 0
                self.iteration = self.iteration_counter.value
                # Pull the freshest shared parameters into the local nets.
                self.actor.load_state_dict(self.actor_share.state_dict())
                self.Q_net1.load_state_dict(self.Q_net1_share.state_dict())
                self.Q_net2.load_state_dict(self.Q_net2_share.state_dict())
                evaluated_Q_mean, true_gamma_return_mean = self.run_n_episodes(self.episode_num_to_run,self.max_state_num_evaluated_in_an_episode)
                self.iteration_history.append(self.iteration)
                self.evaluated_Q_mean_history.append(evaluated_Q_mean)
                self.true_gamma_return_mean_history.append(true_gamma_return_mean)
                print('Saving evaluation results of the {} iteration.'.format(self.iteration))
                np.save('./' + self.args.env_name + '/method_' + str(self.args.method) + '/result/iteration_evaluation',
                        np.array(self.iteration_history))
                np.save('./' + self.args.env_name + '/method_' + str(self.args.method) + '/result/evaluated_Q_mean',
                        np.array(self.evaluated_Q_mean_history))
                np.save('./' + self.args.env_name + '/method_' + str(
                    self.args.method) + '/result/true_gamma_return_mean',
                        np.array(self.true_gamma_return_mean_history))

                plot_online(self.args.env_name, self.args.method, self.args.method_name,
                            self.max_state_num_evaluated_in_an_episode)
class Application():
    def __init__(self, args):
        super(Application, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.args = args
        self.device = torch.device("cpu")
        self.load_index = self.args.max_train

        self.PI_net = PINet(args).to(self.device)

        self.PI_net.load_state_dict(
            torch.load('./Net1-0_0816/PI_' + str(self.load_index) + '.pkl',
                       map_location='cpu'))

        self.actor = PolicyNet(args).to(self.device)
        self.actor.load_state_dict(
            torch.load('./Net1-0_0816/policy1_' + str(self.load_index) +
                       '.pkl',
                       map_location='cpu'))

        # self.Q_net0 = QNet(args).to(self.device)
        # self.Q_net0.load_state_dict(torch.load('./Net1-0_0816/Q1_' + str(self.load_index) + '.pkl',map_location='cpu'))
        self.speed_limit_max = 20 / 3.6
        self.steer_factor = 20
        self.action_multiplier = np.array(
            [math.pi / (9 * self.steer_factor), 2], dtype=np.float32)
        self.safe_gap = 0.
        self.noise_factor = 0.

        self.step = 0
        self.frequency = 10
        self.lanekeeping_max = 5
        self.lane_number = 2
        self.lane_width = 3.5

        self.car_length = 4.5
        self.car_width = 1.855

        self.road_width = self.lane_number * self.lane_width
        self.detect_rear_dist = 80
        self.detect_front_dist = 100

        self.other_length = 5.169
        self.other_width = 2.392
        self.position_index = 0
        self.position_other = np.zeros(20, dtype=np.int)

        self.max_state_other = np.array([
            100, self.road_width, self.speed_limit_max, math.pi / 6, 6.5, 2.5
        ],
                                        dtype=np.float32)

        self.max_state_ego = np.array(
            [self.speed_limit_max, 1, math.pi / 6, math.pi / 6, math.pi / 6, 4, self.lane_width / 2, self.road_width,
             self.road_width, self.speed_limit_max, self.speed_limit_max, self.lane_number - 1, 5, 2, \
             math.pi / 6, math.pi / 6, math.pi / 6, math.pi / 6, math.pi / 6], dtype=np.float32)

        self.wheel_steer_bound = [-1.5 * math.pi, 1.5 * math.pi]

        self.lane_list, self.lane_center_list, self.road_angle = load_map()

    def _get_dist_to_roadside(self, lane_index, dist_to_center):
        dist_left = (self.lane_number - 1 - lane_index
                     ) * self.lane_width + self.lane_width / 2 - dist_to_center
        dist_right = lane_index * self.lane_width + self.lane_width / 2 + dist_to_center
        return dist_left, dist_right

    def _get_road_related_info(self, ego_x, ego_y, ego_angle):
        self.ego_x = ego_x
        self.ego_y = ego_y
        if self.step >= 1:
            self.lane_index_old = self.lane_index
        for i in range(max(self.position_index - 100, 0),
                       len(self.road_angle) - 1, 1):
            if self.ego_x <= self.lane_list[1][
                    i, 0] and self.ego_x >= self.lane_list[1][i + 1, 0]:
                self.position_index = i
                if ego_y <= self.lane_list[1][i, 1]:
                    lane_index = 1
                else:
                    lane_index = 0
                break
            if i == len(self.road_angle) - 2:
                lane_index = 0
                self.position_index = i
        # print("lane_index",lane_index,"road_angle",len(self.road_angle))
        if ego_y > self.lane_center_list[lane_index][self.position_index, 1]:
            dist_flag = -1.0
        else:
            dist_flag = 1.0
        dist2center = dist_flag * np.sqrt(
            (ego_x -
             self.lane_center_list[lane_index][self.position_index, 0])**2 +
            (ego_y -
             self.lane_center_list[lane_index][self.position_index, 1])**2)
        dist_left, dist_right = self._get_dist_to_roadside(
            lane_index, dist2center)
        self.lane_index = lane_index
        self.dist2center = np.float32(dist2center)
        self.dist_left = np.float32(dist_left)
        self.dist_right = np.float32(dist_right)
        self.delta_angle = -(ego_angle - self.road_angle[self.position_index])
        self.current_road_angle = self.road_angle[self.position_index]

        # print(self.lane_index,self.dist2center,self.dist_left ,self.dist_right,self.delta_angle )

    def _get_next_vehicle(self, t_interval):
        lane_list = []
        for i in range(len(self.x_other)):
            for j in range(max(self.position_other[i] - 100, 0),
                           len(self.road_angle) - 1, 1):
                if self.x_other[i] <= self.lane_center_list[0][
                        j, 0] and self.x_other[i] >= self.lane_center_list[0][
                            j + 1, 0]:
                    index = j
                    break
            dist_1_0 = np.sqrt(
                (self.x_other[i] - self.lane_center_list[0][index, 0])**2 +
                (self.y_other[i] - self.lane_center_list[0][index, 1])**2)
            dist_1_1 = np.sqrt(
                (self.x_other[i] - self.lane_center_list[1][index, 0])**2 +
                (self.y_other[i] - self.lane_center_list[1][index, 1])**2)
            self.position_other[i] = index
            if dist_1_0 < dist_1_1:
                lane_list.append(0)
            else:
                lane_list.append(1)

        x_next = []
        y_next = []
        heading_next = []

        for i in range(len(self.x_other)):
            x_next.append(self.x_other[i] - self.v_other[i] * t_interval)
            if len(self.road_angle) - self.position_other[i] < 1000:
                x_next[i] = self.lane_center_list[lane_list[i]][0, 0]
                y_next.append(self.lane_center_list[lane_list[i]][0, 1])
                heading_next.append(self.road_angle[0])
                self.position_other[i] = 0
            else:
                y_next.append(
                    self.lane_center_list[lane_list[i]][self.position_other[i],
                                                        1])
                heading_next.append(self.road_angle[self.position_other[i]])
                # for j in range(len(self.road_angle) - 1):
                #     if x_next[i]<=self.lane_center_list[lane_list[i]][j, 0] and x_next[i]>=self.lane_center_list[lane_list[i]][j+1, 0]:
                #         y_next.append(self.lane_center_list[lane_list[i]][j, 1])

        return x_next, y_next, self.v_other, heading_next

    def _get_ego_state(self, v=1, v_lat=0, yaw_rate=0, wheel_steer=0, acc=0):
        if self.step == 0:
            self.lanekeeping_time = 4
        else:
            if self.lane_index == self.lane_index_old:
                self.lanekeeping_time += 1 / self.frequency
            else:
                self.lanekeeping_time = 0
            self.lanekeeping_time = min(self.lanekeeping_time,
                                        self.lanekeeping_max)

        self.state_ego_dict_real = dict(
            v=v,
            v_lat=v_lat,
            yaw_rate=yaw_rate * math.pi / 180,
            heading=self.delta_angle * math.pi / 180,
            steer=wheel_steer / self.steer_factor * math.pi / 180,
            acc=acc,
            #v_long=self.ego_dynamics['Longitudinal_speed'],
            dist2center=self.dist2center,
            dist2road_bound_1=self.dist_left - self.car_width / 2 -
            self.safe_gap,
            dist2road_bound_2=self.dist_right - self.car_width / 2 -
            self.safe_gap,
            dist2speed_limit=self.speed_limit_max - v,
            dist2speed_limit_low=v - 0,
            lane=self.lane_index,
            other_veh_num=self.veh_num,
            lanekeeping_time=self.lanekeeping_time,
            future_heading_10=0,
            future_heading_20=0,
            future_heading_30=0,
            future_heading_40=0,
            future_heading_50=0,
        )
        position_noise = self.noise_factor * np.clip(
            np.random.normal(0, 0.033), -0.1, 0.1)
        heading_noise = self.noise_factor * np.clip(np.random.normal(0, 0.33),
                                                    -1, 1)

        self.state_ego_dict = dict(
            v=v,
            v_lat=v_lat,
            yaw_rate=yaw_rate * math.pi / 180,
            heading=(self.delta_angle + heading_noise) * math.pi / 180,
            steer=wheel_steer / self.steer_factor * math.pi / 180,
            acc=acc,
            #v_long=self.ego_dynamics['Longitudinal_speed'],
            dist2center=self.dist2center + position_noise,
            dist2road_bound_1=self.dist_left - self.car_width / 2 -
            self.safe_gap - position_noise,
            dist2road_bound_2=self.dist_right - self.car_width / 2 -
            self.safe_gap + position_noise,
            dist2speed_limit=self.speed_limit_max - v,
            dist2speed_limit_low=v - 0,
            lane=self.lane_index,
            other_veh_num=self.veh_num,
            lanekeeping_time=self.lanekeeping_time,
            future_heading_10=0,
            future_heading_20=0,
            future_heading_30=0,
            future_heading_40=0,
            future_heading_50=0,
        )

        self.state_ego = np.array(list(self.state_ego_dict.values()),
                                  dtype=np.float32) / self.max_state_ego

    def _get_other_info(self, v_ego, x, y, v, heading):
        self.x_other = x
        self.y_other = y
        self.v_other = v
        heading_other = heading
        self.veh_num = 0
        veh_index = []
        for i in range(len(x)):
            if self.ego_x - x[i] < self.detect_front_dist and x[
                    i] - self.ego_x < self.detect_rear_dist:
                self.veh_num += 1
                veh_index.append(i)
        if self.veh_num != 0:
            self.element_ori = np.zeros([len(veh_index), 6], dtype=np.float32)
            self.element_ori_real = self.element_ori.copy()

            for i in range(len(veh_index)):
                other_x = x[veh_index[i]]
                other_y = y[veh_index[i]]
                other_v = v[veh_index[i]]
                other_heading = heading_other[veh_index[i]]
                delta_x = self.ego_x - other_x
                delta_y = self.ego_y - other_y
                dist_ego2other = np.sqrt(delta_x**2 + delta_y**2)
                if delta_x >= 0:
                    heading_ego2other = np.arctan(delta_y / (delta_x + 1e-6))
                else:
                    heading_ego2other = np.arctan(delta_y /
                                                  (delta_x - 1e-6)) + math.pi
                if heading_ego2other >= math.pi:
                    heading_ego2other = heading_ego2other - 2 * math.pi
                elif heading_ego2other < -math.pi:
                    heading_ego2other = heading_ego2other + 2 * math.pi
                delta_heading = heading_ego2other - (
                    270 * math.pi / 180 -
                    self.current_road_angle * math.pi / 180)
                relate_x = dist_ego2other * np.cos(delta_heading)
                relate_y = dist_ego2other * np.sin(delta_heading)
                self.element_ori_real[i] = np.array([
                    relate_x, relate_y, other_v - v_ego, other_heading,
                    self.other_length, self.other_width
                ],
                                                    dtype=np.float32)
                self.element_ori[i] = np.array([
                    relate_x + self.noise_factor *
                    np.clip(np.random.normal(0, 0.1), -0.3, 0.3),
                    relate_y + self.noise_factor *
                    np.clip(np.random.normal(0, 0.1), -0.3, 0.3),
                    other_v - v_ego + self.noise_factor *
                    np.clip(np.random.normal(0, 0.1), -0.3, 0.3),
                    other_heading + self.noise_factor *
                    np.clip(np.random.normal(0, 0.05), -0.15, 0.15),
                    self.other_length + self.noise_factor *
                    np.clip(np.random.normal(0, 0.02), -0.06, 0.06),
                    self.other_width + self.noise_factor *
                    np.clip(np.random.normal(0, 0.02), -0.06, 0.06)
                ],
                                               dtype=np.float32)
        else:
            self.veh_num = 1
            self.element_ori = np.array([[
                self.detect_front_dist, 0, 0, 0, self.other_length,
                self.other_width
            ]],
                                        dtype=np.float32)
            self.element_ori_real = np.array([[
                self.detect_front_dist, 0, 0, 0, self.other_length,
                self.other_width
            ]],
                                             dtype=np.float32)

        # f2=plt.figure(0,figsize=(20, 5))
        # plt.ion()
        # for i in range(len(self.lane_list)):
        #     plt.plot(self.lane_list[i][:,0], self.lane_list[i][:,1], color='green', linewidth='2')
        # for i in range(len(self.lane_center_list)):
        #     plt.plot(self.lane_center_list[i][:,0], self.lane_center_list[i][:,1], color='red', linewidth='2')
        # plt.scatter(self.ego_x,self.ego_y, color='red')
        # for i in range(len(x)):
        #     plt.scatter(x[i], y[i], color='blue')
        # ax = plt.gca()
        # ax.set_aspect('equal')
        # ax.invert_xaxis()
        # ax.invert_yaxis()
        # plt.title(['relate_x:' + str(self.element_ori[:,0]) + ' relate_y:' + str(self.element_ori[:,1])+ "  relate_angle:"+str(round(self.delta_angle,2))+\
        #            'lane:' + str(round(self.lane_index,2)) + ' dist2center:' + str(round(self.dist2center,2))+ "  distleft:"+str(round(self.dist_left,2))+\
        #            ' dist_right:' + str(round(self.dist_right,2)) + ' road angle:' + str(round(self.current_road_angle,2))])
        # plt.pause(0.01)
        # f2.clf()
        # plt.figure(figsize=(20, 5))
        # plt.plot(road_info[:,0], road_info[:,2], color='green', linewidth='2')
        # ax = plt.gca()
        # ax.set_aspect('equal')
        # ax.invert_xaxis()
        # plt.show()
        self.element = self.element_ori / self.max_state_other

    def simu(self):
        self.env = gym.make("Experiment-v3")
        time_init = time.time()
        step = 0
        observation = self.env.reset(random=False,
                                     sensor_used=self.args.if_sensor_used,
                                     veh_num=self.args.veh_num,
                                     simu=True)
        self.episode_step = 0
        simu_state_list = []
        while True:
            state_ego = observation[0]
            element = observation[1]
            veh_number = observation[1].shape[0]
            state_other_vector = observation[2]
            simu_state_list.append(observation[3])
            element_tensor = torch.FloatTensor(element.copy()).to(self.device)
            ego_tensor = torch.FloatTensor(state_ego.copy()).to(self.device)
            state_other = torch.sum(self.PI_net.evaluate(element_tensor),
                                    dim=0)
            state_tensor = torch.cat([ego_tensor, state_other.detach()])
            self.u, log_prob, _ = self.actor.get_action(
                state_tensor.unsqueeze(0), True)
            self.u = self.u.squeeze(0)

            observation, self.reward, self.done, _ = self.env.step(self.u)
            self.env.render(mode='human')
            step += 1
            if self.done or step == 1000:
                time.sleep(1)
                print('method', self.args.method, 'step', step, 'time',
                      time.time() - time_init)
                print("!!!!!!!!!!!!!!!")
                break

    def control_step(self):
        time_init = time.time()
        element_tensor = torch.FloatTensor(self.element.copy()).to(self.device)
        ego_tensor = torch.FloatTensor(self.state_ego.copy()).to(self.device)
        state_other = torch.sum(self.PI_net.evaluate(element_tensor), dim=0)
        state_tensor = torch.cat([ego_tensor, state_other.detach()])
        self.u, log_prob, _ = self.actor.get_action(state_tensor.unsqueeze(0),
                                                    True)
        self.u = self.u.squeeze(0)
        self.step += 1
        #print(time.time()-time_init)

        return self.u
Example #23
0
class Simulation():
    def __init__(self, args,shared_value):
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)

        simu_params = {
            'number_of_vehicles': 0,
            'number_of_walkers': 0,
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': 2000,  # connection port
            'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 100,  # maximum timesteps per episode
            'desired_speed': 15,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name, params=simu_params)
        self.device = torch.device("cpu")
        self.load_index = self.args.max_train
        # self.load_index = 40000

        self.actor = PolicyNet(args).to(self.device)
        self.actor.load_state_dict(torch.load('./'+self.args.env_name+'/method_' + str(self.args.method) + '/model/policy1_' + str(self.load_index) + '.pkl',map_location='cpu'))

        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net1.load_state_dict(torch.load('./'+self.args.env_name+'/method_' + str(self.args.method) + '/model/Q1_' + str(self.load_index) + '.pkl',map_location='cpu'))

        if self.args.double_Q:
            self.Q_net2 = QNet(args).to(self.device)
            self.Q_net2.load_state_dict(torch.load('./'+self.args.env_name+'/method_' + str(self.args.method) + '/model/Q2_' + str(self.load_index) + '.pkl',map_location='cpu'))


        self.test_step = 0
        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history =[]
        self.done_history = []
        self.Q_real_history = []
        self.Q_history =[]
        self.Q_std_history = []


    def run(self):
        alpha = 0.004
        step = 0

        summaryFlag = True
        while True:
            self.state, self.info = self.env.reset()
            self.episode_step = 0
            state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device)
            info_tensor = torch.FloatTensor(self.info.copy()).float().to(self.device)

            if self.args.NN_type == "CNN":
                state_tensor = state_tensor.permute(2, 0, 1)
            self.u, log_prob, std = self.actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), True)


            for i in range(500):
                q = self.Q_net1(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0]
                if self.args.double_Q:
                    q = torch.min(
                        q,
                        self.Q_net2(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0])


                self.Q_history.append(q.detach().item())

                self.u = self.u.squeeze(0)

                # TODO
                if summaryFlag:
                    with SummaryWriter(log_dir='./logs') as writer:
                        # writer.add_scalar('random', np.random.randint(0, 10), i)
                        v = self.env.ego.get_velocity()
                        v = np.array([v.x, v.y, v.z])
                        writer.add_scalar('v_x', self.env.state_info['velocity_t'][0], i)
                        writer.add_scalar('v_y', self.env.state_info['velocity_t'][1], i)
                        writer.add_scalar('accelaration_x', self.env.state_info['acceleration_t'][0], i)
                        writer.add_scalar('accelaration_y', self.env.state_info['acceleration_t'][1], i)
                        # writer.add_scalar('distance2terminal', self.env.state_info['dist_to_dest'], i)
                        # writer.add_scalar('delta_yaw', self.state[5]*2, i)
                        writer.add_scalar('angular_speed_z', self.env.state_info['dyaw_dt_t'], i)
                        # writer.add_scalar('lateral_dist', self.state[7]/10, i)
                        writer.add_scalar('action_throttle', self.env.state_info['action_t_1'][0], i)
                        writer.add_scalar('action_steer', self.env.state_info['action_t_1'][1], i)
                        writer.add_scalar('delta_yaw', self.env.state_info['delta_yaw_t'], i)
                        writer.add_scalar('dist2center', self.env.state_info['lateral_dist_t'], i)

                self.state, self.reward, self.done, self.info = self.env.step(self.u)

                self.reward_history.append(self.reward)
                self.done_history.append(self.done)
                self.entropy_history.append(log_prob)

                # render the image
                cv2.imshow("camera img", self.state.squeeze())
                cv2.waitKey(1)
                # if step%10000 >=0 and step%10000 <=9999:
                #     self.env.render(mode='human')
                state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device)
                info_tensor = torch.FloatTensor(self.info.copy()).float().to(self.device)

                if self.args.NN_type == "CNN":
                    state_tensor = state_tensor.permute(2, 0, 1)
                self.u, log_prob, std = self.actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), True)



                if self.done == True or self.env.isTimeOut:
                    time.sleep(1)
                    print("Episode Done!")
                    summaryFlag = False
                    # return
                    break
                step += 1
                self.episode_step += 1

            if self.done == True:
                pass
                #break


        print(self.reward_history)
        for i in range(len(self.Q_history)):
            a = 0
            for j in range(i, len(self.Q_history), 1):
                a += pow(self.args.gamma, j-i)*self.reward_history[j]
            for z in range(i+1, len(self.Q_history), 1):
                a -= alpha * pow(self.args.gamma, z-i) * self.entropy_history[z]
            self.Q_real_history.append(a)


        plt.figure()
        x = np.arange(0,len(self.Q_history),1)
        plt.plot(x, np.array(self.Q_history), 'r', linewidth=2.0)
        plt.plot(x, np.array(self.Q_real_history), 'k', linewidth=2.0)

        plt.show()
Example #24
0
class Simulation():
    def __init__(self, args, shared_queue,shared_value):
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.policy_test_queue = shared_queue[3]
        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.load_index = 20000



        self.actor = PolicyNet(args.state_dim, args.num_hidden_cell, args.action_high, args.action_low,args.NN_type).to(self.device)
        self.actor.load_state_dict(torch.load('./data/method_' + str(1) + '/model/policy_' + str(self.load_index) + '.pkl'))

        self.Q_net1_m0 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net1_m0.load_state_dict(torch.load('./data/method_' + str(0) + '/model/Q1_' + str(self.load_index) + '.pkl'))

        self.Q_net1_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net1_m1.load_state_dict(torch.load('./data/method_' + str(1) + '/model/Q1_' + str(self.load_index) + '.pkl'))
        self.Q_net2_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net2_m1.load_state_dict(torch.load('./data/method_' + str(1) + '/model/Q2_' + str(self.load_index) + '.pkl'))

        self.Q_net1_m2 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net1_m2.load_state_dict(torch.load('./data/method_' + str(2) + '/model/Q1_' + str(self.load_index) + '.pkl'))



        self.test_step = 0

        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history =[]
        self.done_history = []
        self.Q_real_history = []
        self.Q_m0_history =[]
        self.Q_m1_history = []
        self.Q_m2_history = []
        self.Q_std_m2_history = []



    def load_param(self):
        if self.policy_test_queue.empty():
            pass
        else:
            self.iteration, param = self.policy_test_queue.get()
            self.actor.load_state_dict(param)

    def run(self):

        step = 0
        while True:
            self.state = self.env.reset()
            self.episode_step = 0
            state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device)
            if self.args.NN_type == "CNN":
                state_tensor = state_tensor.permute(2, 0, 1)
            self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), False)


            for i in range(self.args.max_step):
                q_m0 = self.Q_net1_m0(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0]
                q_m1 = torch.min(
                    self.Q_net1_m1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0],
                    self.Q_net2_m1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0])
                q_m2, q_std, _ = self.Q_net1_m2.evaluate(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))



                self.Q_m0_history.append(q_m0.detach().item())
                self.Q_m1_history.append(q_m1.detach().item())
                self.Q_m2_history.append(q_m2.detach().item())
                self.Q_std_m2_history.append(q_std.detach().item())

                self.u = self.u.squeeze(0)
                self.state, self.reward, self.done, _ = self.env.step(self.u)

                self.reward_history.append(self.reward)
                self.done_history.append(self.done)
                self.entropy_history.append(log_prob)


                if step%10000 >=0 and step%10000 <=9999:
                    self.env.render(mode='human')
                state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device)
                if self.args.NN_type == "CNN":
                    state_tensor = state_tensor.permute(2, 0, 1)
                self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), False)



                if self.done == True:
                    time.sleep(1)
                    print("!!!!!!!!!!!!!!!")
                    break
                step += 1
                self.episode_step += 1

            if self.done == True:
                break



        for i in range(len(self.Q_m0_history)):
            a = 0
            for j in range(i, len(self.Q_m0_history), 1):
                a += pow(self.args.gamma, j-i)*self.reward_history[j]
            for z in range(i+1, len(self.Q_m0_history), 1):
                a -= self.args.alpha * pow(self.args.gamma, z-i) * self.entropy_history[z]
            self.Q_real_history.append(a)

        print(self.reward_history)
        print(self.entropy_history)
        print(self.Q_m2_history)
        print(self.Q_std_m2_history)

        plt.figure()
        x = np.arange(0,len(self.Q_m0_history),1)
        plt.plot(x, np.array(self.Q_m0_history), 'r', linewidth=2.0)
        plt.plot(x, np.array(self.Q_m1_history), 'g', linewidth=2.0)
        plt.plot(x, np.array(self.Q_m2_history), 'b', linewidth=2.0)


        plt.plot(x, np.array(self.Q_real_history), 'k', linewidth=2.0)

        plt.show()
    def __init__(self, args):
        super(Application, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.args = args
        self.device = torch.device("cpu")
        self.load_index = self.args.max_train

        self.PI_net = PINet(args).to(self.device)

        self.PI_net.load_state_dict(
            torch.load('./Net1-0_0816/PI_' + str(self.load_index) + '.pkl',
                       map_location='cpu'))

        self.actor = PolicyNet(args).to(self.device)
        self.actor.load_state_dict(
            torch.load('./Net1-0_0816/policy1_' + str(self.load_index) +
                       '.pkl',
                       map_location='cpu'))

        # self.Q_net0 = QNet(args).to(self.device)
        # self.Q_net0.load_state_dict(torch.load('./Net1-0_0816/Q1_' + str(self.load_index) + '.pkl',map_location='cpu'))
        self.speed_limit_max = 20 / 3.6
        self.steer_factor = 20
        self.action_multiplier = np.array(
            [math.pi / (9 * self.steer_factor), 2], dtype=np.float32)
        self.safe_gap = 0.
        self.noise_factor = 0.

        self.step = 0
        self.frequency = 10
        self.lanekeeping_max = 5
        self.lane_number = 2
        self.lane_width = 3.5

        self.car_length = 4.5
        self.car_width = 1.855

        self.road_width = self.lane_number * self.lane_width
        self.detect_rear_dist = 80
        self.detect_front_dist = 100

        self.other_length = 5.169
        self.other_width = 2.392
        self.position_index = 0
        self.position_other = np.zeros(20, dtype=np.int)

        self.max_state_other = np.array([
            100, self.road_width, self.speed_limit_max, math.pi / 6, 6.5, 2.5
        ],
                                        dtype=np.float32)

        self.max_state_ego = np.array(
            [self.speed_limit_max, 1, math.pi / 6, math.pi / 6, math.pi / 6, 4, self.lane_width / 2, self.road_width,
             self.road_width, self.speed_limit_max, self.speed_limit_max, self.lane_number - 1, 5, 2, \
             math.pi / 6, math.pi / 6, math.pi / 6, math.pi / 6, math.pi / 6], dtype=np.float32)

        self.wheel_steer_bound = [-1.5 * math.pi, 1.5 * math.pi]

        self.lane_list, self.lane_center_list, self.road_angle = load_map()
from __future__ import print_function
Example #27
0
def main(method):
    """Build shared networks/optimizers and launch the learner/actor/buffer
    processes for distributed SAC-style training.

    Args:
        method: algorithm/method identifier forwarded to ``built_parser``.
    """

    def _shared_net(net_cls):
        """Create a network in train mode with parameters in shared memory."""
        net = net_cls(args)
        net.train()
        net.share_memory()
        return net

    def _shared_adam(parameters, lr):
        """Create a SharedAdam optimizer whose state lives in shared memory."""
        opt = my_optim.SharedAdam(parameters, lr=lr)
        opt.share_memory()
        return opt

    # Environment configuration forwarded to gym.make (carla-style env).
    params = {
        'obs_size': (160, 100),  # screen size of cv2 window
        'dt': 0.025,  # time interval between two frames
        'ego_vehicle_filter':
        'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'task_mode':
        'Straight',  # mode of the task, [random, roundabout (only for Town03)]
        'code_mode': 'train',
        'max_time_episode': 100,  # maximum timesteps per episode
        'desired_speed': 15,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
    }

    args = built_parser(method=method)
    env = gym.make(args.env_name, params=params)
    state_dim = env.state_space.shape
    action_dim = env.action_space.shape[0]

    args.state_dim = state_dim
    args.action_dim = action_dim
    action_high = env.action_space.high
    action_low = env.action_space.low
    args.action_high = action_high.tolist()
    args.action_low = action_low.tolist()
    # NOTE(review): seed is drawn from an *unseeded* RNG, so runs are not
    # reproducible across invocations — confirm this is intentional.
    args.seed = np.random.randint(0, 30)
    args.init_time = time.time()
    num_cpu = mp.cpu_count()
    print(state_dim, action_dim, action_high, num_cpu)

    if args.alpha == 'auto' and args.target_entropy == 'auto':
        # Heuristic target entropy: -|A|. The action-range term is kept
        # (commented) for reference alongside delta_a.
        delta_a = np.array(args.action_high, dtype=np.float32) - np.array(
            args.action_low, dtype=np.float32)
        args.target_entropy = -1 * args.action_dim  # + sum(np.log(delta_a/2))

    # Twin Q networks with their target copies, all in shared memory.
    Q_net1 = _shared_net(QNet)
    Q_net1_target = _shared_net(QNet)
    Q_net2 = _shared_net(QNet)
    Q_net2_target = _shared_net(QNet)
    actor1 = PolicyNet(args)

    print("Network inited")

    if args.code_model == "eval":
        # Evaluation mode: restore the final trained policy before sharing.
        actor1.load_state_dict(
            torch.load('./' + args.env_name + '/method_' + str(args.method) +
                       '/model/policy_' + str(args.max_train) + '.pkl'))
    actor1.train()
    actor1.share_memory()
    actor1_target = _shared_net(PolicyNet)
    actor2 = _shared_net(PolicyNet)
    actor2_target = _shared_net(PolicyNet)

    print("Network set")

    # Target networks start as exact copies of their online counterparts.
    Q_net1_target.load_state_dict(Q_net1.state_dict())
    Q_net2_target.load_state_dict(Q_net2.state_dict())
    actor1_target.load_state_dict(actor1.state_dict())
    actor2_target.load_state_dict(actor2.state_dict())

    print("Network loaded!")

    Q_net1_optimizer = _shared_adam(Q_net1.parameters(), args.critic_lr)
    Q_net2_optimizer = _shared_adam(Q_net2.parameters(), args.critic_lr)
    actor1_optimizer = _shared_adam(actor1.parameters(), args.actor_lr)
    actor2_optimizer = _shared_adam(actor2.parameters(), args.actor_lr)
    # Entropy temperature is optimized in log space and shared across processes.
    log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    log_alpha.share_memory_()
    alpha_optimizer = _shared_adam([log_alpha], args.alpha_lr)

    print("Optimizer done")

    share_net = [
        Q_net1, Q_net1_target, Q_net2, Q_net2_target, actor1, actor1_target,
        actor2, actor2_target, log_alpha
    ]
    share_optimizer = [
        Q_net1_optimizer, Q_net2_optimizer, actor1_optimizer, actor2_optimizer,
        alpha_optimizer
    ]

    # Bounded queues connecting actors -> buffers -> learners.
    experience_in_queue = []
    experience_out_queue = []
    for i in range(args.num_buffers):
        experience_in_queue.append(Queue(maxsize=10))
        experience_out_queue.append(Queue(maxsize=10))
    shared_queue = [experience_in_queue, experience_out_queue]
    step_counter = mp.Value('i', 0)
    stop_sign = mp.Value('i', 0)
    iteration_counter = mp.Value('i', 0)
    shared_value = [step_counter, stop_sign, iteration_counter]
    lock = mp.Lock()
    procs = []
    if args.code_model == "train":
        for i in range(args.num_learners):
            # Alternate learners across the two GPUs to balance load.
            if i % 2 == 0:
                device = torch.device("cuda:1")
            else:
                device = torch.device("cuda:0")
            # device = torch.device("cpu")
            procs.append(
                Process(target=leaner_agent,
                        args=(args, shared_queue, shared_value, share_net,
                              share_optimizer, device, lock, i)))
        for i in range(args.num_actors):
            procs.append(
                Process(target=actor_agent,
                        args=(args, shared_queue, shared_value,
                              [actor1, Q_net1], lock, i)))
        for i in range(args.num_buffers):
            procs.append(
                Process(target=buffer,
                        args=(args, shared_queue, shared_value, i)))
        procs.append(
            Process(target=evaluate_agent,
                    args=(args, shared_value, share_net)))
    elif args.code_model == "simu":
        procs.append(Process(target=simu_agent, args=(args, shared_value)))

    for p in procs:
        p.start()
    for p in procs:
        p.join()
# Example #28
# 0
class Actor():
    """Environment-interaction worker for distributed SAC-style training.

    Rolls out episodes with a local CPU copy of the policy, computes a
    one-step TD error for each transition, and pushes transitions into a
    shared experience queue; periodically refreshes its local network
    parameters from parameter queues filled by the learners.
    """

    def __init__(self, args, shared_queue, shared_value, lock, i):
        super(Actor, self).__init__()
        self.agent_id = i
        # Per-worker seed so parallel actors explore differently.
        seed = args.seed + np.int64(self.agent_id)
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.experience_queue = shared_queue[0]
        self.policy_param_queue = shared_queue[1]
        self.q_param_queue = shared_queue[2]
        self.counter = shared_value[0]    # shared global environment-step counter
        self.stop_sign = shared_value[1]  # shared flag: non-zero => shut down
        self.lock = lock
        self.env = gym.make(args.env_name)
        self.args = args
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args.state_dim, args.num_hidden_cell,
                               args.action_high, args.action_low,
                               args.NN_type).to(self.device)
        self.Q_net1 = QNet(args.state_dim, args.action_dim,
                           args.num_hidden_cell, args.NN_type).to(self.device)

    def update_actor_net(self, current_dict, actor_net):
        """Soft-update ``actor_net`` towards the parameters in ``current_dict``
        with mixing factor ``args.syn_tau``."""
        params_target = get_flat_params_from(actor_net)
        params = get_flat_params_from_dict(current_dict)
        set_flat_params_to(actor_net, (1 - self.args.syn_tau) * params_target +
                           self.args.syn_tau * params)

    def load_param(self):
        """Refresh local policy/Q parameters from the parameter queues.

        If a queue is currently empty, the corresponding update is skipped
        after a short sleep rather than blocking the rollout loop.
        """
        if self.policy_param_queue.empty():
            time.sleep(0.5)
        else:
            param = self.policy_param_queue.get()
            if self.args.syn_method == "copy":
                self.actor.load_state_dict(param)
            elif self.args.syn_method == "slow":
                self.update_actor_net(param, self.actor)
        if self.q_param_queue.empty():
            time.sleep(0.5)
        else:
            param = self.q_param_queue.get()
            self.Q_net1.load_state_dict(param)

    def put_data(self):
        """Push the latest transition (with its TD error as a priority hint)
        into the shared experience queue, waiting while the queue is full.

        Becomes a no-op once the global stop sign is raised.
        """
        # Iterative wait: the original retried via recursion, which could
        # exhaust the stack during long waits. The stop sign is re-checked on
        # every retry, exactly like the recursive version did.
        while not self.stop_sign.value:
            if self.experience_queue.full():
                time.sleep(0.5)
            else:
                self.experience_queue.put(
                    (self.last_state, self.last_u, [self.reward],
                     self.state, [self.micro_step], [self.done],
                     self.TD.detach().cpu().numpy().squeeze()))
                return

    def run(self):
        """Main rollout loop: interact with the environment until the global
        stop sign is raised, streaming (s, a, r, s', TD) tuples to the buffer.
        """
        step = 0
        self.micro_step = 0
        while not self.stop_sign.value:
            self.state = self.env.reset()
            self.episode_step = 0
            state_tensor = torch.FloatTensor(self.state.copy()).float().to(
                self.device)
            if self.args.NN_type == "CNN":
                # CNN policy expects channel-first input (HWC -> CHW).
                state_tensor = state_tensor.permute(2, 0, 1)
            self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0),
                                                     False)
            q_1 = self.Q_net1(state_tensor.unsqueeze(0),
                              torch.FloatTensor(self.u).to(self.device))[0]
            self.u = self.u.squeeze(0)
            self.last_state = self.state.copy()
            self.last_u = self.u.copy()
            last_q_1 = q_1

            for i in range(self.args.max_step):
                self.state, self.reward, self.done, _ = self.env.step(self.u)
                state_tensor = torch.FloatTensor(self.state.copy()).float().to(
                    self.device)
                if self.args.NN_type == "CNN":
                    state_tensor = state_tensor.permute(2, 0, 1)
                self.u, log_prob = self.actor.get_action(
                    state_tensor.unsqueeze(0), False)
                q_1 = self.Q_net1(state_tensor.unsqueeze(0),
                                  torch.FloatTensor(self.u).to(self.device))[0]
                self.u = self.u.squeeze(0)
                if self.episode_step > 0:
                    # One-step TD error of the *previous* transition; shipped
                    # with the transition as a priority hint for the buffer.
                    self.TD = self.reward + (
                        1 - self.done) * self.args.gamma * q_1 - last_q_1
                    self.put_data()
                self.last_state = self.state.copy()
                self.last_u = self.u.copy()
                last_q_1 = q_1

                with self.lock:
                    self.counter.value += 1

                if self.done:
                    break

                if step % self.args.load_param_period == 0:
                    self.load_param()
                step += 1
                self.episode_step += 1
# Example #29
# 0
def main():
    # parameters for the gym_carla environment
    params = {
        'display_size': 256,  # screen size of bird-eye render
        'obs_size': 128,  # screen size of cv2 window
        'dt': 0.1,  # time interval between two frames
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        # 'town': 'Town01',  # which town to simulate
        'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
        'code_mode': 'test',
        'max_time_episode': 5000,  # maximum timesteps per episode
        'desired_speed': 8,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
    }

    # Set gym-carla environment
    env = gym.make('carla-v0', params=params)

    # load net
    device = torch.device('cpu')
    args = Args()
    args.NN_type
    actor = PolicyNet(args).to(device)
    actor.load_state_dict(torch.load('./policy1_500000.pkl',map_location='cpu'))

    Q_net1 = QNet(args).to(device)
    Q_net1.load_state_dict(torch.load('./Q1_500000.pkl',map_location='cpu'))

    obs, info_dict = env.reset()
    info = info_dict_to_array(info_dict)

    state_tensor = torch.FloatTensor(obs.copy()).float().to(device)
    info_tensor = torch.FloatTensor(info.copy()).float().to(device)

    # print(env.ego.get_location())
    tic = time.time()
    done = False
    ret = 0
    start = carla.Location(x=env.start[0], y=env.start[1], z=0.22)
    end = carla.Location(x=env.dest[0], y=env.dest[1], z=0.22)

    if args.NN_type == "CNN":
        state_tensor = state_tensor.permute(2, 0, 1)

    while not done:
        tac = time.time()

        u, log_prob = actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), True)
        u = u.squeeze(0)

        obs, r, done, info = env.step(u)

        info = info_dict_to_array(info_dict)
        state_tensor = torch.FloatTensor(obs.copy()).float().to(device)
        if args.NN_type == "CNN":
            state_tensor = state_tensor.permute(2, 0, 1)
        info_tensor = torch.FloatTensor(info.copy()).float().to(device)

        ret += r
        cv2.imshow("camera img", obs)
        cv2.waitKey(1)
        # print(info['acceleration_t'].shape)
        env.world.debug.draw_point(start)
        env.world.debug.draw_point(end)

        if done:
            toc = time.time()
            print("An episode took %f s" %(toc - tic))
            print("total reward is", ret)
            print("time steps", env.time_step)
            env.close()
            env.reset()
            ret = 0
            # print(env.ego.get_location())
            done = False