from __future__ import print_function
# Example #2
class Evaluator(object):
    """Evaluation worker for an asynchronous SAC-style training setup.

    Runs alongside the learners: it periodically copies the shared
    actor/critic networks into local CPU replicas, rolls out evaluation
    episodes in a gym driving environment, and appends/saves statistics
    (Q estimates, entropy-augmented discounted returns, policy entropy,
    action statistics, ...) as ``.npy`` files under
    ``./<env_name>/method_<method>/result/``.

    NOTE(review): relies on module-level names not visible in this chunk
    (``np``, ``torch``, ``gym``, ``time``, ``copy``, ``reduce``,
    ``PolicyNet``, ``QNet``, ``cal_gamma_return_of_an_episode``).
    """

    def __init__(self, args, shared_value, share_net):
        # Seed numpy and torch so evaluation rollouts are reproducible.
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)

        # Parameters forwarded to gym.make for the driving environment
        # (CARLA-style, judging by the ego-vehicle filter and port fields).
        eval_params = {
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter':
            'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': int(2000 + 3 * args.num_actors),  # connection port
            'task_mode':
            'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 100,  # maximum timesteps per episode
            'desired_speed': 15,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        # Shared multiprocessing state: stop flag and global iteration count.
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args
        self.env = gym.make(args.env_name, params=eval_params)
        # Evaluation always runs on CPU with local copies of the nets.
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        # Indices into share_net follow the trainer's layout:
        # [Q1, Q1_target, Q2, Q2_target, actor1, ..., log_alpha] -- matches
        # the 9-element unpacking in Learner.__init__ below.
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        # NOTE(review): when args.alpha != 'auto' this is the plain int 0,
        # which has no .item(); run() later calls self.alpha.item() and would
        # raise AttributeError -- confirm the non-auto configuration is ever
        # used with this evaluator.
        self.alpha = np.exp(self.log_alpha_share.detach().item()
                            ) if args.alpha == 'auto' else 0

        self.evaluation_interval = 20000  # evaluate every N global iterations
        self.max_state_num_evaluated_in_an_episode = 50  # 500
        self.episode_num_evaluation = 5  # stochastic episodes per round
        self.episode_num_test = 5  # deterministic episodes per round
        self.time = time.time()
        # History buffers: one entry per evaluation round, persisted in run().
        self.list_of_n_episode_rewards_history = []
        self.time_history = []
        self.alpha_history = []
        self.average_return_with_diff_base_history = []
        self.average_reward_history = []
        self.iteration_history = []
        self.evaluated_Q_mean_history = []
        self.evaluated_Q_std_history = []
        self.true_gamma_return_mean_history = []
        self.policy_entropy_history = []
        self.a_std_history = []
        self.a_abs_history = []

    def average_max_n(self, list_for_average, n):
        """Return the mean of the ``n`` largest elements of the list."""
        sorted_list = sorted(list_for_average, reverse=True)
        return sum(sorted_list[:n]) / n

    def run_an_episode(self, deterministic):
        """Roll out one episode with the local actor.

        When ``deterministic`` is False, returns per-step statistics
        (log-probs, Q estimates/stds, entropy-augmented discounted
        returns, action stats); when True, returns only the episode
        return and length.

        NOTE(review): env.reset() is unpacked as (state, info) while
        env.step() returns 4 values -- this matches a custom env API
        rather than either classic or modern gym; confirm against the
        registered environment.
        """
        #state_list = []
        action_list = []
        log_prob_list = []
        reward_list = []
        evaluated_Q_list = []
        Q_std_list = []
        a_std_list = []
        done = 0
        state, info = self.env.reset()
        # Stop one step short of max_step so the episode length is bounded.
        while not done and len(reward_list) < (self.args.max_step - 1):
            state_tensor = torch.FloatTensor(state.copy()).float().to(
                self.device)
            info_tensor = torch.FloatTensor(info.copy()).float().to(
                self.device)
            if self.args.NN_type == "CNN":
                # Convert HWC image layout to the CHW layout CNNs expect.
                state_tensor = state_tensor.permute(2, 0, 1)  # 3, 256, 256

            u, log_prob, a_std = self.actor.get_action(
                state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                deterministic)
            log_prob_list.append(log_prob)
            a_std_list.append(a_std)
            if self.args.double_Q and not self.args.double_actor:
                # Clipped double-Q: pessimistic minimum of the two critics.
                q = torch.min(
                    self.Q_net1.evaluate(
                        state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                        torch.FloatTensor(u.copy()).to(self.device))[0],
                    self.Q_net2.evaluate(
                        state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                        torch.FloatTensor(u.copy()).to(self.device))[0])
            else:
                q, q_std, _ = self.Q_net1.evaluate(
                    state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                    torch.FloatTensor(u.copy()).to(self.device))
            evaluated_Q_list.append(q.detach().item())
            # NOTE(review): q_std is only bound in the else-branch above; if
            # double_Q (without double_actor) and distributional_Q are both
            # enabled this raises NameError -- confirm those flags are
            # mutually exclusive in the arg parser.
            if self.args.distributional_Q:
                Q_std_list.append(q_std.detach().item())
            else:
                Q_std_list.append(0)
            u = u.squeeze(0)
            state, reward, done, info = self.env.step(u)
            # self.env.render(mode='human')
            action_list.append(u)
            reward_list.append(reward * self.args.reward_scale)

        if not deterministic:
            # Entropy bonus per step, folded into the true discounted return.
            entropy_list = list(-self.alpha * np.array(log_prob_list))
            true_gamma_return_list = cal_gamma_return_of_an_episode(
                reward_list, entropy_list, self.args.gamma)
            # NOTE(review): divides by len(log_prob_list) -- a zero-length
            # episode (done on reset) would raise ZeroDivisionError.
            policy_entropy = -sum(log_prob_list) / len(log_prob_list)
            a_std_mean = np.mean(np.array(a_std_list), axis=0)
            a_abs_mean = np.mean(np.abs(np.array(action_list)), axis=0)
            return dict(  #state_list=np.array(state_list),
                #action_list=np.array(action_list),
                log_prob_list=np.array(log_prob_list),
                policy_entropy=policy_entropy,
                #reward_list=np.array(reward_list),
                a_std_mean=a_std_mean,
                a_abs_mean=a_abs_mean,
                evaluated_Q_list=np.array(evaluated_Q_list),
                Q_std_list=np.array(Q_std_list),
                true_gamma_return_list=true_gamma_return_list,
            )
        else:
            # Undo the reward scaling so the reported return is in raw units.
            episode_return = sum(reward_list) / self.args.reward_scale
            episode_len = len(reward_list)
            return dict(episode_return=episode_return, episode_len=episode_len)

    def run_n_episodes(self, n, max_state, deterministic):
        """Run ``n`` episodes and aggregate their statistics.

        ``max_state`` truncates each episode's per-step Q/return lists
        before averaging, so only the first ``max_state`` states of every
        episode contribute.
        """
        n_episode_state_list = []
        n_episode_action_list = []
        n_episode_log_prob_list = []
        n_episode_reward_list = []
        n_episode_evaluated_Q_list = []
        n_episode_Q_std_list = []
        n_episode_true_gamma_return_list = []
        n_episode_return_list = []
        n_episode_len_list = []
        n_episode_policyentropy_list = []
        n_episode_a_std_list = []
        for _ in range(n):
            episode_info = self.run_an_episode(deterministic)
            # n_episode_state_list.append(episode_info['state_list'])
            # n_episode_action_list.append(episode_info['action_list'])
            # n_episode_log_prob_list.append(episode_info['log_prob_list'])
            # n_episode_reward_list.append(episode_info['reward_list'])
            if not deterministic:
                n_episode_evaluated_Q_list.append(
                    episode_info['evaluated_Q_list'])
                n_episode_Q_std_list.append(episode_info['Q_std_list'])
                n_episode_true_gamma_return_list.append(
                    episode_info['true_gamma_return_list'])
                n_episode_policyentropy_list.append(
                    episode_info['policy_entropy'])
                n_episode_a_std_list.append(episode_info['a_std_mean'])
                n_episode_action_list.append(episode_info['a_abs_mean'])
            else:
                n_episode_return_list.append(episode_info['episode_return'])
                n_episode_len_list.append(episode_info['episode_len'])

        if not deterministic:
            average_policy_entropy = sum(n_episode_policyentropy_list) / len(
                n_episode_policyentropy_list)
            average_a_std = np.mean(np.array(n_episode_a_std_list), axis=0)
            average_a_abs = np.mean(np.array(n_episode_action_list), axis=0)

            # n_episode_evaluated_Q_list_history = list(map(lambda x: x['n_episode_evaluated_Q_list'], n_episodes_info_history))
            # n_episode_true_gamma_return_list_history = list(map(lambda x: x['n_episode_true_gamma_return_list'], n_episodes_info_history))

            def concat_interest_epi_part_of_one_ite_and_mean(list_of_n_epi):
                # Truncate every episode to max_state steps, concatenate the
                # surviving values across episodes, and return their mean.
                tmp = list(copy.deepcopy(list_of_n_epi))
                tmp[0] = tmp[0] if len(
                    tmp[0]) <= max_state else tmp[0][:max_state]

                def reduce_fuc(a, b):
                    # NOTE(review): `len(b) < max_state` leaves an episode of
                    # exactly max_state steps untruncated -- harmless, but
                    # inconsistent with the `<=` used for tmp[0] above.
                    return np.concatenate(
                        [a, b]) if len(b) < max_state else np.concatenate(
                            [a, b[:max_state]])

                interest_epi_part_of_one_ite = reduce(reduce_fuc, tmp)
                return sum(interest_epi_part_of_one_ite) / len(
                    interest_epi_part_of_one_ite)

            evaluated_Q_mean = concat_interest_epi_part_of_one_ite_and_mean(
                np.array(n_episode_evaluated_Q_list))
            evaluated_Q_std = concat_interest_epi_part_of_one_ite_and_mean(
                np.array(n_episode_Q_std_list))
            true_gamma_return_mean = concat_interest_epi_part_of_one_ite_and_mean(
                np.array(n_episode_true_gamma_return_list))

            # n_episode_reward_list is never appended to (the append above is
            # commented out), so this entry is always an empty array.
            return dict(evaluated_Q_mean=evaluated_Q_mean,
                        true_gamma_return_mean=true_gamma_return_mean,
                        evaluated_Q_std=evaluated_Q_std,
                        n_episode_reward_list=np.array(n_episode_reward_list),
                        policy_entropy=average_policy_entropy,
                        a_std=average_a_std,
                        a_abs=average_a_abs)
        else:
            # Mean return over the best 1, best (k-2) and all k episodes.
            average_return_with_diff_base = np.array([
                self.average_max_n(n_episode_return_list, x)
                for x in [1, self.episode_num_test - 2, self.episode_num_test]
            ])
            average_reward = sum(n_episode_return_list) / sum(
                n_episode_len_list)
            return dict(
                n_episode_reward_list=np.array(n_episode_reward_list),
                average_return_with_diff_base=average_return_with_diff_base,
                average_reward=average_reward,
            )

    def run(self):
        """Main loop: evaluate and persist results until stop_sign is set.

        NOTE(review): the loop spins without sleeping while the shared
        counter is not a multiple of evaluation_interval, and may evaluate
        repeatedly while the counter sits on the same multiple.
        """
        while not self.stop_sign.value:
            if self.iteration_counter.value % self.evaluation_interval == 0:
                # Refresh alpha and pull the latest shared weights.
                self.alpha = np.exp(self.log_alpha_share.detach().item()
                                    ) if self.args.alpha == 'auto' else 0
                self.iteration = self.iteration_counter.value
                self.actor.load_state_dict(self.actor_share.state_dict())
                self.Q_net1.load_state_dict(self.Q_net1_share.state_dict())
                self.Q_net2.load_state_dict(self.Q_net2_share.state_dict())

                delta_time = time.time() - self.time
                self.time = time.time()
                # Stochastic evaluation: Q-estimate vs. true-return stats.
                n_episode_info = self.run_n_episodes(
                    self.episode_num_evaluation,
                    self.max_state_num_evaluated_in_an_episode, False)
                self.iteration_history.append(self.iteration)
                self.evaluated_Q_mean_history.append(
                    n_episode_info['evaluated_Q_mean'])
                self.evaluated_Q_std_history.append(
                    n_episode_info['evaluated_Q_std'])
                self.true_gamma_return_mean_history.append(
                    n_episode_info['true_gamma_return_mean'])
                self.time_history.append(delta_time)
                # self.list_of_n_episode_rewards_history.append(list_of_n_episode_rewards)
                # NOTE(review): crashes with AttributeError when alpha is the
                # int 0 (args.alpha != 'auto') -- see __init__.
                self.alpha_history.append(self.alpha.item())
                self.policy_entropy_history.append(
                    n_episode_info['policy_entropy'])
                self.a_std_history.append(n_episode_info['a_std'])
                self.a_abs_history.append(n_episode_info['a_abs'])
                # Deterministic test: return-based metrics.
                n_episode_info_test = self.run_n_episodes(
                    self.episode_num_test,
                    self.max_state_num_evaluated_in_an_episode, True)
                self.average_return_with_diff_base_history.append(
                    n_episode_info_test['average_return_with_diff_base'])
                self.average_reward_history.append(
                    n_episode_info_test['average_reward'])

                # Persist every history buffer as a .npy file.
                print('Saving evaluation results of the {} iteration.'.format(
                    self.iteration))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/iteration',
                    np.array(self.iteration_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/evaluated_Q_mean',
                    np.array(self.evaluated_Q_mean_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/evaluated_Q_std',
                    np.array(self.evaluated_Q_std_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/true_gamma_return_mean',
                    np.array(self.true_gamma_return_mean_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/time',
                    np.array(self.time_history))
                # np.save('./' + self.args.env_name + '/method_' + str(self.args.method) + '/result/list_of_n_episode_rewards',
                #         np.array(self.list_of_n_episode_rewards_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) +
                    '/result/average_return_with_diff_base',
                    np.array(self.average_return_with_diff_base_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/average_reward',
                    np.array(self.average_reward_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/alpha',
                    np.array(self.alpha_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/policy_entropy',
                    np.array(self.policy_entropy_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/a_std',
                    np.array(self.a_std_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/a_abs',
                    np.array(self.a_abs_history))

                # plot_online(self.args.env_name, self.args.method, self.args.method_name,
                #             self.max_state_num_evaluated_in_an_episode)

                # Signal all workers to stop once training is complete.
                if self.iteration >= self.args.max_train:
                    self.stop_sign.value = 1
                    break
# Example #3
class Learner():
    """Gradient-computation worker for asynchronous SAC-style training.

    Each Learner keeps local copies of the shared actor/critic networks,
    pulls transition batches from shared replay queues, computes critic,
    actor and temperature losses, and pushes gradients into the shared
    models through the shared optimizers. Supports optional double-Q,
    double-actor, and distributional-critic variants.

    NOTE(review): relies on module-level names not visible in this chunk
    (``np``, ``torch``, ``nn``, ``time``, ``lr_scheduler``, ``PolicyNet``,
    ``QNet``, ``ensure_shared_grads``, ``slow_sync_param``).
    """

    def __init__(self, args, shared_queue, shared_value, share_net,
                 share_optimizer, device, lock, i):
        super(Learner, self).__init__()
        self.args = args
        seed = self.args.seed
        self.init_time = self.args.init_time
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.agent_id = i

        # One output queue per replay buffer; a random buffer is sampled
        # each training step. (The loop variable shadows the agent-id
        # argument `i`, which was already saved above.)
        self.experience_out_queue = []
        for i in range(args.num_buffers):
            self.experience_out_queue.append(shared_queue[1][i])

        # Shared multiprocessing state: stop flag and global iteration count.
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value

        self.device = device
        if self.device == torch.device("cpu"):
            self.gpu = False
        else:
            self.gpu = True
        self.lock = lock

        # Shared (cross-process) networks and optimizers; the 9-element
        # layout here matches the indices Evaluator picks out of share_net.
        self.Q_net1_share, self.Q_net1_target_share, self.Q_net2_share, self.Q_net2_target_share, self.actor1_share, \
                                        self.actor1_target_share, self.actor2_share, self.actor2_target_share, self.log_alpha_share = share_net
        self.Q_net1_optimizer, self.Q_net2_optimizer, self.actor1_optimizer, self.actor2_optimizer, self.alpha_optimizer = share_optimizer

        # Local working copies, each with a cosine-annealed LR schedule
        # driven by the shared iteration counter.
        self.Q_net1 = QNet(args).to(self.device)
        self.scheduler_Q_net1 = lr_scheduler.CosineAnnealingLR(
            self.Q_net1_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.Q_net1.train()

        self.Q_net1_target = QNet(args).to(self.device)
        self.Q_net1_target.train()

        self.Q_net2 = QNet(args).to(self.device)
        self.scheduler_Q_net2 = lr_scheduler.CosineAnnealingLR(
            self.Q_net2_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.Q_net2.train()

        self.Q_net2_target = QNet(args).to(self.device)
        self.Q_net2_target.train()

        self.actor1 = PolicyNet(args).to(self.device)
        self.scheduler_actor1 = lr_scheduler.CosineAnnealingLR(
            self.actor1_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.actor1.train()

        self.actor1_target = PolicyNet(args).to(self.device)
        self.actor1_target.train()

        self.actor2 = PolicyNet(args).to(self.device)
        self.scheduler_actor2 = lr_scheduler.CosineAnnealingLR(
            self.actor2_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.actor2.train()

        self.actor2_target = PolicyNet(args).to(self.device)
        self.actor2_target.train()

        self.scheduler_alpha = lr_scheduler.CosineAnnealingLR(
            self.alpha_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)

        # Automatic temperature tuning targets a fixed entropy; otherwise
        # alpha is a constant tensor.
        if self.args.alpha == 'auto':
            self.target_entropy = args.target_entropy
        else:
            self.alpha = torch.tensor(self.args.alpha)

    def get_qloss(self, q, q_std, target_q, target_q_bound):
        """Critic loss.

        Distributional variant: Gaussian negative-log-likelihood-style loss
        split so that the mean is fit against target_q and the std against
        the TD-bounded target. Otherwise plain MSE against target_q.
        """
        if self.args.distributional_Q:
            # loss = -Normal(q, q_std).log_prob(target_q).mean()
            # loss = torch.mean(-Normal(q, q_std).log_prob(target_q_bound)*self.weight \
            #                   + self.weight.logical_not()*torch.pow(q-target_q,2))
            loss = torch.mean(torch.pow(q-target_q,2)/(2*torch.pow(q_std.detach(),2)) \
                   + torch.pow(q.detach()-target_q_bound,2)/(2*torch.pow(q_std,2))\
                   + torch.log(q_std))
        else:
            criterion = nn.MSELoss()
            loss = criterion(q, target_q)
        return loss

    def get_policyloss(self, q, log_prob_a_new):
        """SAC policy objective: E[alpha * log pi(a|s) - Q(s, a)]."""
        loss = (self.alpha.detach() * log_prob_a_new - q).mean()
        return loss

    def update_net(self, loss, optimizer, net, net_share, scheduler):
        """Backprop ``loss`` locally and step the *shared* optimizer.

        Local gradients are copied onto the shared model so the shared
        optimizer (whose parameters are the shared model's) applies them.
        """
        optimizer.zero_grad()
        if self.gpu:
            # NOTE(review): self.log_alpha only exists when alpha == 'auto'
            # (set in run()); the guard order here keeps the non-auto GPU
            # path from touching it.
            if self.args.alpha == 'auto':
                if net is not self.log_alpha:
                    net.zero_grad()
            else:
                net.zero_grad()
        loss.backward()
        if self.args.alpha == 'auto':
            if net is self.log_alpha:
                # NOTE(review): `grad == 0` on a tensor yields a tensor, not
                # a bool; for the scalar log_alpha this still works via
                # 0-dim truthiness, but verify the intent (first-writer-wins
                # gradient hand-off to the shared log_alpha).
                if self.log_alpha_share.grad is None or self.log_alpha_share.grad == 0:
                    self.log_alpha_share._grad = self.log_alpha.grad
            else:
                ensure_shared_grads(model=net,
                                    shared_model=net_share,
                                    gpu=self.gpu)
        else:
            ensure_shared_grads(model=net,
                                shared_model=net_share,
                                gpu=self.gpu)
        optimizer.step()
        # NOTE(review): passing an epoch to scheduler.step() is deprecated in
        # newer torch versions -- confirm the pinned torch release.
        scheduler.step(self.iteration)

    def target_q(self, r, done, q, q_std, q_next, log_prob_a_next):
        """Entropy-augmented TD target, optionally bounded for the
        distributional critic.

        Returns ``(target_q, target_q_bound)``, both detached. Also stores
        ``self.weight`` (inlier mask) as a side effect in the
        distributional case.
        """
        target_q = r + (1 - done) * self.args.gamma * (
            q_next - self.alpha.detach() * log_prob_a_next)
        if self.args.distributional_Q:
            if self.args.adaptive_bound:
                # Clamp the target into a 3-sigma band around the current Q.
                target_max = q + 3 * q_std
                target_min = q - 3 * q_std
                target_q = torch.min(target_q, target_max)
                target_q = torch.max(target_q, target_min)
            # Bound the TD error magnitude for the std-fitting term.
            difference = torch.clamp(target_q - q, -self.args.TD_bound,
                                     self.args.TD_bound)
            target_q_bound = q + difference
            self.weight = torch.le(torch.abs(target_q - q),
                                   self.args.TD_bound).detach()
        else:
            target_q_bound = target_q
        return target_q.detach(), target_q_bound.detach()

    def send_to_device(self, s, info, a, r, s_next, info_next, done, device):
        """Move one transition batch to ``device`` and return it."""
        s = s.to(device)
        info = info.to(device)
        a = a.to(device)
        r = r.to(device)
        s_next = s_next.to(device)
        info_next = info_next.to(device)
        done = done.to(device)
        return s, info, a, r, s_next, info_next, done

    def run(self):
        """Main training loop: sync nets, sample a batch, update critics,
        temperature and actors until stop_sign is set."""
        local_iteration = 0
        # Wait (polling) until at least one replay buffer has data.
        index = np.random.randint(0, self.args.num_buffers)
        while self.experience_out_queue[index].empty(
        ) and not self.stop_sign.value:
            index = np.random.randint(0, self.args.num_buffers)
            time.sleep(0.1)

        while not self.stop_sign.value:
            # Refresh local copies from the shared models.
            self.iteration = self.iteration_counter.value
            self.Q_net1.load_state_dict(self.Q_net1_share.state_dict())
            self.Q_net1_target.load_state_dict(
                self.Q_net1_target_share.state_dict())
            self.Q_net2.load_state_dict(self.Q_net2_share.state_dict())
            self.Q_net2_target.load_state_dict(
                self.Q_net2_target_share.state_dict())
            self.actor1.load_state_dict(self.actor1_share.state_dict())
            self.actor1_target.load_state_dict(
                self.actor1_target_share.state_dict())
            self.actor2.load_state_dict(self.actor2_share.state_dict())
            self.actor2_target.load_state_dict(
                self.actor2_target_share.state_dict())
            if self.args.alpha == 'auto':
                # Detached local copy of log_alpha so its gradient can be
                # handed back to the shared tensor in update_net().
                self.log_alpha = self.log_alpha_share.detach().clone(
                ).requires_grad_(True)
                self.alpha = self.log_alpha.exp().to(self.device)

            # Sample a non-empty buffer and fetch one transition batch.
            index = np.random.randint(0, self.args.num_buffers)
            while self.experience_out_queue[index].empty(
            ) and not self.stop_sign.value:
                index = np.random.randint(0, self.args.num_buffers)
                time.sleep(0.1)
            # NOTE(review): if the wait-loop exits because stop_sign was set
            # while the queue is empty, s/info/a/... below are unbound on the
            # first pass (NameError) or stale on later passes -- confirm
            # shutdown ordering makes this unreachable.
            if not self.experience_out_queue[index].empty():
                s, info, a, r, s_next, info_next, done = self.experience_out_queue[
                    index].get()
                s, info, a, r, s_next, info_next, done = self.send_to_device(
                    s, info, a, r, s_next, info_next, done, self.device)

            # Current-Q estimates for the sampled (s, a).
            q_1, q_std_1, _ = self.Q_net1.evaluate(s,
                                                   info,
                                                   a,
                                                   device=self.device,
                                                   min=False)
            if self.args.double_Q:
                q_2, q_std_2, _ = self.Q_net2.evaluate(s,
                                                       info,
                                                       a,
                                                       device=self.device,
                                                       min=False)

            # Target-policy smoothing (TD3-style) only applies to the
            # deterministic-actor configuration.
            smoothing_trick = False
            if not self.args.stochastic_actor:
                if self.args.policy_smooth:
                    smoothing_trick = True

            a_new_1, log_prob_a_new_1, a_new_std_1 = self.actor1.evaluate(
                s, info, smooth_policy=False, device=self.device)
            a_next_1, log_prob_a_next_1, _ = self.actor1_target.evaluate(
                s_next,
                info_next,
                smooth_policy=smoothing_trick,
                device=self.device)
            if self.args.double_actor:
                a_new_2, log_prob_a_new_2, _ = self.actor2.evaluate(
                    s, info, smooth_policy=False, device=self.device)
                a_next_2, log_prob_a_next_2, _ = self.actor2_target.evaluate(
                    s_next,
                    info_next,
                    smooth_policy=smoothing_trick,
                    device=self.device)

            # Build TD targets. With double actors the critics are crossed
            # (Q2_target evaluates actor1's action and vice versa).
            if self.args.double_Q and self.args.double_actor:
                q_next_target_1, _, q_next_sample_1 = self.Q_net2_target.evaluate(
                    s_next, info_next, a_next_1, device=self.device, min=False)
                q_next_target_2, _, _ = self.Q_net1_target.evaluate(
                    s_next, info_next, a_next_2, device=self.device, min=False)
                target_q_1, target_q_1_bound = self.target_q(
                    r, done, q_1.detach(), q_std_1.detach(),
                    q_next_target_1.detach(), log_prob_a_next_1.detach())
                target_q_2, target_q_2_bound = self.target_q(
                    r, done, q_2.detach(), q_std_2.detach(),
                    q_next_target_2.detach(), log_prob_a_next_2.detach())
            else:
                q_next_1, _, q_next_sample_1 = self.Q_net1_target.evaluate(
                    s_next, info_next, a_next_1, device=self.device, min=False)
                if self.args.double_Q:
                    # Clipped double-Q target.
                    q_next_2, _, _ = self.Q_net2_target.evaluate(
                        s_next,
                        info_next,
                        a_next_1,
                        device=self.device,
                        min=False)
                    q_next_target_1 = torch.min(q_next_1, q_next_2)
                elif self.args.distributional_Q:
                    # Use the sampled value from the distributional critic.
                    q_next_target_1 = q_next_sample_1
                else:
                    q_next_target_1 = q_next_1
                target_q_1, target_q_1_bound = self.target_q(
                    r, done, q_1.detach(), q_std_1.detach(),
                    q_next_target_1.detach(), log_prob_a_next_1.detach())

            # Q-values of the freshly sampled on-policy actions, used as the
            # policy objective.
            if self.args.double_Q and self.args.double_actor:
                q_object_1, _, _ = self.Q_net1.evaluate(s,
                                                        info,
                                                        a_new_1,
                                                        device=self.device,
                                                        min=False)
                q_object_2, _, _ = self.Q_net2.evaluate(s,
                                                        info,
                                                        a_new_2,
                                                        device=self.device,
                                                        min=False)
            else:
                q_new_1, _, _ = self.Q_net1.evaluate(s,
                                                     info,
                                                     a_new_1,
                                                     device=self.device,
                                                     min=False)
                if self.args.double_Q:
                    q_new_2, _, _ = self.Q_net2.evaluate(s,
                                                         info,
                                                         a_new_1,
                                                         device=self.device,
                                                         min=False)
                    q_object_1 = torch.min(q_new_1, q_new_2)
                elif self.args.distributional_Q:
                    q_object_1 = q_new_1
                else:
                    q_object_1 = q_new_1

            # Temperature (alpha) update, delayed like the policy update.
            if local_iteration % self.args.delay_update == 0:
                if self.args.alpha == 'auto':
                    alpha_loss = -(self.log_alpha *
                                   (log_prob_a_new_1.detach().cpu() +
                                    self.target_entropy)).mean()
                    self.update_net(alpha_loss, self.alpha_optimizer,
                                    self.log_alpha, self.log_alpha_share,
                                    self.scheduler_alpha)

            # Critic update(s).
            q_loss_1 = self.get_qloss(q_1, q_std_1, target_q_1,
                                      target_q_1_bound)
            self.update_net(q_loss_1, self.Q_net1_optimizer, self.Q_net1,
                            self.Q_net1_share, self.scheduler_Q_net1)
            if self.args.double_Q:
                if self.args.double_actor:
                    q_loss_2 = self.get_qloss(q_2, q_std_2, target_q_2,
                                              target_q_2_bound)
                    self.update_net(q_loss_2, self.Q_net2_optimizer,
                                    self.Q_net2, self.Q_net2_share,
                                    self.scheduler_Q_net2)
                else:
                    # Both critics regress to the same (clipped) target.
                    q_loss_2 = self.get_qloss(q_2, q_std_2, target_q_1,
                                              target_q_1_bound)
                    self.update_net(q_loss_2, self.Q_net2_optimizer,
                                    self.Q_net2, self.Q_net2_share,
                                    self.scheduler_Q_net2)

            # Delayed policy update(s) plus Polyak averaging of the shared
            # actor targets.
            if self.args.code_model == "train":
                if local_iteration % self.args.delay_update == 0:
                    policy_loss_1 = self.get_policyloss(
                        q_object_1, log_prob_a_new_1)
                    self.update_net(policy_loss_1, self.actor1_optimizer,
                                    self.actor1, self.actor1_share,
                                    self.scheduler_actor1)
                    slow_sync_param(self.actor1_share,
                                    self.actor1_target_share, self.args.tau,
                                    self.gpu)
                    if self.args.double_actor:
                        policy_loss_2 = self.get_policyloss(
                            q_object_2, log_prob_a_new_2)
                        self.update_net(policy_loss_2, self.actor2_optimizer,
                                        self.actor2, self.actor2_share,
                                        self.scheduler_actor2)
                        slow_sync_param(self.actor2_share,
                                        self.actor2_target_share,
                                        self.args.tau, self.gpu)

            # Polyak averaging of the shared critic targets.
            if local_iteration % self.args.delay_update == 0:
                slow_sync_param(self.Q_net1_share, self.Q_net1_target_share,
                                self.args.tau, self.gpu)
                if self.args.double_Q:
                    slow_sync_param(self.Q_net2_share,
                                    self.Q_net2_target_share, self.args.tau,
                                    self.gpu)

            with self.lock:
                self.iteration_counter.value += 1
            local_iteration += 1

            # Periodic checkpointing of local networks.
            if self.iteration % self.args.save_model_period == 0 or (
                    self.iteration == 0 and self.agent_id == 0):
                torch.save(
                    self.actor1.state_dict(), './' + self.args.env_name +
                    '/method_' + str(self.args.method) + '/model/policy1_' +
                    str(self.iteration) + '.pkl')
                torch.save(
                    self.Q_net1.state_dict(), './' + self.args.env_name +
                    '/method_' + str(self.args.method) + '/model/Q1_' +
                    str(self.iteration) + '.pkl')
                if self.args.alpha == 'auto':
                    np.save(
                        './' + self.args.env_name + '/method_' +
                        str(self.args.method) + '/model/log_alpha' +
                        str(self.iteration),
                        self.log_alpha.detach().cpu().numpy())
                if self.args.double_Q:
                    torch.save(
                        self.Q_net2.state_dict(), './' + self.args.env_name +
                        '/method_' + str(self.args.method) + '/model/Q2_' +
                        str(self.iteration) + '.pkl')
                if self.args.double_actor:
                    torch.save(
                        self.actor2.state_dict(), './' + self.args.env_name +
                        '/method_' + str(self.args.method) +
                        '/model/policy2_' + str(self.iteration) + '.pkl')

            # NOTE(review): `or` binds looser than `and`, so every agent
            # prints at multiples of 500; only the iteration-0 case is
            # restricted to agent 0 -- confirm that is the intent.
            if self.iteration % 500 == 0 or self.iteration == 0 and self.agent_id == 0:
                print("agent", self.agent_id, "method", self.args.method,
                      "iteration", self.iteration, "time",
                      time.time() - self.init_time)
                print("loss_1", q_loss_1, "alpha", self.alpha, "lr",
                      self.scheduler_Q_net1.get_lr(),
                      self.scheduler_Q_net2.get_lr(),
                      self.scheduler_actor1.get_lr(),
                      self.scheduler_actor2.get_lr(),
                      self.scheduler_alpha.get_lr())
                print("q_std", q_std_1.t()[0][0:8])
                print("a_std", a_new_std_1.t()[0][0:8])
# Example #4
class Simulation():
    """Roll out a trained policy and compare the Q-value estimates of three
    separately trained critics (checkpoint "methods" 0, 1, 2) against the
    realized discounted soft return, then plot all four curves.

    Public interface (unchanged): ``__init__(args, shared_queue, shared_value)``,
    ``load_param()``, ``run()``.
    """

    def __init__(self, args, shared_queue, shared_value):
        super(Simulation, self).__init__()
        # Seed numpy/torch so rollouts are reproducible.
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.policy_test_queue = shared_queue[3]
        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        # Training iteration whose saved checkpoints are loaded below.
        self.load_index = 20000

        # Actor (method-1 checkpoint) that generates the rollout actions.
        self.actor = PolicyNet(args.state_dim, args.num_hidden_cell,
                               args.action_high, args.action_low,
                               args.NN_type).to(self.device)
        self.actor.load_state_dict(
            torch.load(self._model_path(1, 'policy_')))

        # Critics from the three training methods, all evaluated on the
        # same rollout so their estimates are directly comparable.
        self.Q_net1_m0 = self._load_qnet(args, 0, 'Q1_')
        self.Q_net1_m1 = self._load_qnet(args, 1, 'Q1_')
        self.Q_net2_m1 = self._load_qnet(args, 1, 'Q2_')
        self.Q_net1_m2 = self._load_qnet(args, 2, 'Q1_')

        self.test_step = 0
        self.save_interval = 10000
        self.iteration = 0

        # Per-step logs filled in by run().
        self.reward_history = []
        self.entropy_history = []      # log-probs of the taken actions
        self.epoch_history = []
        self.done_history = []
        self.Q_real_history = []       # realized soft return, computed post-hoc
        self.Q_m0_history = []
        self.Q_m1_history = []
        self.Q_m2_history = []
        self.Q_std_m2_history = []     # method 2 also predicts a Q std

    def _model_path(self, method, prefix):
        """Checkpoint file path for the given method index and file prefix."""
        return ('./data/method_' + str(method) + '/model/' + prefix +
                str(self.load_index) + '.pkl')

    def _load_qnet(self, args, method, prefix):
        """Build a QNet and load the requested method's checkpoint into it."""
        net = QNet(args.state_dim, args.action_dim, args.num_hidden_cell,
                   args.NN_type).to(self.device)
        net.load_state_dict(torch.load(self._model_path(method, prefix)))
        return net

    def _obs_to_tensor(self, obs):
        """Copy an observation into a float tensor; CNN inputs go HWC -> CHW."""
        t = torch.FloatTensor(obs.copy()).float().to(self.device)
        if self.args.NN_type == "CNN":
            t = t.permute(2, 0, 1)
        return t

    def load_param(self):
        """Non-blocking refresh of the actor from the test queue, if any
        newer parameters are available."""
        if not self.policy_test_queue.empty():
            self.iteration, param = self.policy_test_queue.get()
            self.actor.load_state_dict(param)

    def run(self):
        """Roll out episodes until one terminates via ``done``, logging each
        critic's Q estimate at every step, then compute the realized soft
        return and plot the comparison."""
        step = 0
        while True:
            self.state = self.env.reset()
            self.episode_step = 0
            state_tensor = self._obs_to_tensor(self.state)
            self.u, log_prob = self.actor.get_action(
                state_tensor.unsqueeze(0), False)

            for _ in range(self.args.max_step):
                obs_batch = state_tensor.unsqueeze(0)
                # Hoisted: the original rebuilt this tensor four times per step.
                u_tensor = torch.FloatTensor(self.u).to(self.device)

                q_m0 = self.Q_net1_m0(obs_batch, u_tensor)[0]
                # Method 1 uses clipped double-Q: min of its two critics.
                q_m1 = torch.min(self.Q_net1_m1(obs_batch, u_tensor)[0],
                                 self.Q_net2_m1(obs_batch, u_tensor)[0])
                # Method 2's critic returns (mean, std, _) via evaluate().
                q_m2, q_std, _ = self.Q_net1_m2.evaluate(obs_batch, u_tensor)

                self.Q_m0_history.append(q_m0.detach().item())
                self.Q_m1_history.append(q_m1.detach().item())
                self.Q_m2_history.append(q_m2.detach().item())
                self.Q_std_m2_history.append(q_std.detach().item())

                self.u = self.u.squeeze(0)
                self.state, self.reward, self.done, _ = self.env.step(self.u)

                self.reward_history.append(self.reward)
                self.done_history.append(self.done)
                self.entropy_history.append(log_prob)

                # Bug fix: the original guard
                # `step % 10000 >= 0 and step % 10000 <= 9999`
                # was a tautology, so rendering happens every step.
                self.env.render(mode='human')

                state_tensor = self._obs_to_tensor(self.state)
                self.u, log_prob = self.actor.get_action(
                    state_tensor.unsqueeze(0), False)

                if self.done:
                    time.sleep(1)
                    print("!!!!!!!!!!!!!!!")
                    break
                step += 1
                self.episode_step += 1

            if self.done:
                break

        # Realized discounted soft return from each step i:
        #   Q_real[i] = sum_{j>=i} gamma^(j-i) * r_j
        #               - alpha * sum_{z>i} gamma^(z-i) * log_pi_z
        # Kept as the original direct O(n^2) sums so the float results are
        # bit-identical to the previous implementation.
        n = len(self.Q_m0_history)
        for i in range(n):
            ret = 0
            for j in range(i, n):
                ret += pow(self.args.gamma, j - i) * self.reward_history[j]
            for z in range(i + 1, n):
                ret -= (self.args.alpha * pow(self.args.gamma, z - i) *
                        self.entropy_history[z])
            self.Q_real_history.append(ret)

        print(self.reward_history)
        print(self.entropy_history)
        print(self.Q_m2_history)
        print(self.Q_std_m2_history)

        # Three critics (red/green/blue) vs. realized return (black).
        plt.figure()
        x = np.arange(0, n, 1)
        plt.plot(x, np.array(self.Q_m0_history), 'r', linewidth=2.0)
        plt.plot(x, np.array(self.Q_m1_history), 'g', linewidth=2.0)
        plt.plot(x, np.array(self.Q_m2_history), 'b', linewidth=2.0)
        plt.plot(x, np.array(self.Q_real_history), 'k', linewidth=2.0)
        plt.show()