Example #1
    def __init__(self, state_size, action_size, seed, double_agent=False,dueling_agent=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_agent (bool): True if we want to use double DQN (DDQN)
            dueling_agent (bool): True if we want to use a dueling network architecture
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_agent = double_agent
        self.dueling_agent = dueling_agent

        self.qnetwork_local = QNet(state_size, action_size, seed, dueling_agent=dueling_agent).to(device)
        self.qnetwork_target = QNet(state_size, action_size, seed, dueling_agent=dueling_agent).to(device)
        self.optimizer = optim.RMSprop(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
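The QNet wired to the dueling_agent flag is not shown in this example. As a rough sketch of what such a flag usually toggles (an assumption about intent, not this repository's implementation), the standard dueling aggregation Q(s, a) = V(s) + A(s, a) - mean_a A(s, a) can be written as:

import torch
import torch.nn as nn

class DuelingHead(nn.Module):
    """Minimal dueling-DQN head: Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)."""
    def __init__(self, state_size, action_size, hidden=64):
        super().__init__()
        self.feature = nn.Sequential(nn.Linear(state_size, hidden), nn.ReLU())
        self.value = nn.Linear(hidden, 1)                # state value V(s)
        self.advantage = nn.Linear(hidden, action_size)  # advantages A(s, a)

    def forward(self, state):
        x = self.feature(state)
        advantage = self.advantage(x)
        return self.value(x) + advantage - advantage.mean(dim=1, keepdim=True)

q_values = DuelingHead(state_size=8, action_size=4)(torch.randn(2, 8))  # shape (2, 4)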
Example #2
    def __init__(self, args, shared_value, share_net):
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        self.alpha = np.exp(self.log_alpha_share.detach().item()) if args.alpha == 'auto' else 0

        self.evaluation_interval = 50000
        self.max_state_num_evaluated_in_an_episode = 500
        self.episode_num_to_run = 10
        self.iteration_history = []
        self.evaluated_Q_mean_history = []
        self.true_gamma_return_mean_history = []
        # self.n_episodes_info_history = []
        self.evaluated_Q_history = []
        self.true_gamma_return_history = []
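The share_net indices used above (0, 2, 4 and -1) match the list assembled in the main() function shown further below; for reference, the assumed layout is:

# share_net, as built in main() below:
#   [Q_net1, Q_net1_target, Q_net2, Q_net2_target,
#    actor1, actor1_target, actor2, actor2_target, log_alpha]
# so share_net[0] is Q_net1, share_net[2] is Q_net2,
# share_net[4] is actor1 and share_net[-1] is log_alpha.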
Example #3
    def __init__(self, args, shared_value, share_net):
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)

        eval_params = {
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter':
            'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': int(2000 + 3 * args.num_actors),  # connection port
            'task_mode':
            'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 100,  # maximum timesteps per episode
            'desired_speed': 15,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args
        self.env = gym.make(args.env_name, params=eval_params)
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        self.alpha = np.exp(self.log_alpha_share.detach().item()
                            ) if args.alpha == 'auto' else 0

        self.evaluation_interval = 20000
        self.max_state_num_evaluated_in_an_episode = 50  # 500
        self.episode_num_evaluation = 5
        self.episode_num_test = 5
        self.time = time.time()
        self.list_of_n_episode_rewards_history = []
        self.time_history = []
        self.alpha_history = []
        self.average_return_with_diff_base_history = []
        self.average_reward_history = []
        self.iteration_history = []
        self.evaluated_Q_mean_history = []
        self.evaluated_Q_std_history = []
        self.true_gamma_return_mean_history = []
        self.policy_entropy_history = []
        self.a_std_history = []
        self.a_abs_history = []
Example #4
    def __init__(self, args, shared_value):
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)

        simu_params = {
            'number_of_vehicles': 0,
            'number_of_walkers': 0,
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': 2000,  # connection port
            'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 100,  # maximum timesteps per episode
            'desired_speed': 15,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name, params=simu_params)
        self.device = torch.device("cpu")
        self.load_index = self.args.max_train
        # self.load_index = 40000

        self.actor = PolicyNet(args).to(self.device)
        self.actor.load_state_dict(torch.load('./' + self.args.env_name + '/method_' + str(self.args.method) + '/model/policy1_' + str(self.load_index) + '.pkl', map_location='cpu'))

        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net1.load_state_dict(torch.load('./' + self.args.env_name + '/method_' + str(self.args.method) + '/model/Q1_' + str(self.load_index) + '.pkl', map_location='cpu'))

        if self.args.double_Q:
            self.Q_net2 = QNet(args).to(self.device)
            self.Q_net2.load_state_dict(torch.load('./' + self.args.env_name + '/method_' + str(self.args.method) + '/model/Q2_' + str(self.load_index) + '.pkl', map_location='cpu'))

        self.test_step = 0
        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history = []
        self.done_history = []
        self.Q_real_history = []
        self.Q_history = []
        self.Q_std_history = []
Example #5
    def __init__(self, args, shared_value):
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.load_index = self.args.max_train

        self.actor = PolicyNet(args).to(self.device)
        self.actor.load_state_dict(
            torch.load('./' + self.args.env_name + '/method_' +
                       str(self.args.method) + '/model/policy1_' +
                       str(self.load_index) + '.pkl',
                       map_location='cpu'))

        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net1.load_state_dict(
            torch.load('./' + self.args.env_name + '/method_' +
                       str(self.args.method) + '/model/Q1_' +
                       str(self.load_index) + '.pkl',
                       map_location='cpu'))

        if self.args.double_Q:
            self.Q_net2 = QNet(args).to(self.device)
            self.Q_net2.load_state_dict(
                torch.load('./' + self.args.env_name + '/method_' +
                           str(self.args.method) + '/model/Q2_' +
                           str(self.load_index) + '.pkl',
                           map_location='cpu'))

        self.test_step = 0
        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history = []
        self.done_history = []
        self.Q_real_history = []
        self.Q_history = []
        self.Q_std_history = []
Example #6
    def __init__(self, args, shared_queue, shared_value, lock, i):
        super(Actor, self).__init__()
        self.agent_id = i
        seed = args.seed + np.int64(self.agent_id)
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.experience_queue = shared_queue[0]
        self.policy_param_queue = shared_queue[1]
        self.q_param_queue = shared_queue[2]
        self.counter = shared_value[0]
        self.stop_sign = shared_value[1]
        self.lock = lock
        self.env = gym.make(args.env_name)
        self.args = args
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args.state_dim, args.num_hidden_cell,
                               args.action_high, args.action_low,
                               args.NN_type).to(self.device)
        self.Q_net1 = QNet(args.state_dim, args.action_dim,
                           args.num_hidden_cell, args.NN_type).to(self.device)
Example #7
    def __init__(self, args, shared_value, share_net):
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        self.alpha = np.exp(self.log_alpha_share.detach().item()
                            ) if args.alpha == 'auto' else 0

        self.evaluation_interval = 20000
        self.max_state_num_evaluated_in_an_episode = 500
        self.episode_num_evaluation = 5
        self.episode_num_test = 5
        self.time = time.time()
        self.list_of_n_episode_rewards_history = []
        self.time_history = []
        self.alpha_history = []
        self.average_return_with_diff_base_history = []
        self.average_reward_history = []
        self.iteration_history = []
        self.evaluated_Q_mean_history = []
        self.evaluated_Q_std_history = []
        self.true_gamma_return_mean_history = []
        self.policy_entropy_history = []
        self.a_std_history = []
        self.a_abs_history = []
Example #8
    def __init__(self, args, shared_queue, shared_value):
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.policy_test_queue = shared_queue[3]
        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.load_index = 20000

        self.actor = PolicyNet(args.state_dim, args.num_hidden_cell, args.action_high, args.action_low, args.NN_type).to(self.device)
        self.actor.load_state_dict(torch.load('./data/method_' + str(1) + '/model/policy_' + str(self.load_index) + '.pkl'))

        self.Q_net1_m0 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net1_m0.load_state_dict(torch.load('./data/method_' + str(0) + '/model/Q1_' + str(self.load_index) + '.pkl'))

        self.Q_net1_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net1_m1.load_state_dict(torch.load('./data/method_' + str(1) + '/model/Q1_' + str(self.load_index) + '.pkl'))
        self.Q_net2_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net2_m1.load_state_dict(torch.load('./data/method_' + str(1) + '/model/Q2_' + str(self.load_index) + '.pkl'))

        self.Q_net1_m2 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net1_m2.load_state_dict(torch.load('./data/method_' + str(2) + '/model/Q1_' + str(self.load_index) + '.pkl'))

        self.test_step = 0

        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history = []
        self.done_history = []
        self.Q_real_history = []
        self.Q_m0_history = []
        self.Q_m1_history = []
        self.Q_m2_history = []
        self.Q_std_m2_history = []
Example #9
    def __init__(self, args, shared_queue, shared_value, share_net, lock, i):
        super(Actor, self).__init__()
        self.agent_id = i
        seed = args.seed + np.int64(self.agent_id)
        np.random.seed(seed)
        torch.manual_seed(seed)

        self.counter = shared_value[0]
        self.stop_sign = shared_value[1]
        self.lock = lock
        self.env = gym.make(args.env_name)
        self.args = args
        self.experience_in_queue = []
        for i in range(args.num_buffers):
            self.experience_in_queue.append(shared_queue[0][i])

        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)

        self.Q_net1_share = share_net[1]
        self.actor_share = share_net[0]
Example #10
from __future__ import print_function
Example #11
    def __init__(self, args, shared_queue, shared_value, share_net,
                 share_optimizer, device, lock, i):
        super(Learner, self).__init__()
        self.args = args
        seed = self.args.seed
        self.init_time = self.args.init_time
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.agent_id = i

        self.experience_out_queue = []
        for i in range(args.num_buffers):
            self.experience_out_queue.append(shared_queue[1][i])

        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value

        self.device = device
        if self.device == torch.device("cpu"):
            self.gpu = False
        else:
            self.gpu = True
        self.lock = lock

        (self.Q_net1_share, self.Q_net1_target_share, self.Q_net2_share,
         self.Q_net2_target_share, self.actor1_share, self.actor1_target_share,
         self.actor2_share, self.actor2_target_share, self.log_alpha_share) = share_net
        self.Q_net1_optimizer, self.Q_net2_optimizer, self.actor1_optimizer, self.actor2_optimizer, self.alpha_optimizer = share_optimizer

        self.Q_net1 = QNet(args).to(self.device)
        self.scheduler_Q_net1 = lr_scheduler.CosineAnnealingLR(
            self.Q_net1_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.Q_net1.train()

        self.Q_net1_target = QNet(args).to(self.device)
        self.Q_net1_target.train()

        self.Q_net2 = QNet(args).to(self.device)
        self.scheduler_Q_net2 = lr_scheduler.CosineAnnealingLR(
            self.Q_net2_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.Q_net2.train()

        self.Q_net2_target = QNet(args).to(self.device)
        self.Q_net2_target.train()

        self.actor1 = PolicyNet(args).to(self.device)
        self.scheduler_actor1 = lr_scheduler.CosineAnnealingLR(
            self.actor1_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.actor1.train()

        self.actor1_target = PolicyNet(args).to(self.device)
        self.actor1_target.train()

        self.actor2 = PolicyNet(args).to(self.device)
        self.scheduler_actor2 = lr_scheduler.CosineAnnealingLR(
            self.actor2_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.actor2.train()

        self.actor2_target = PolicyNet(args).to(self.device)
        self.actor2_target.train()

        self.scheduler_alpha = lr_scheduler.CosineAnnealingLR(
            self.alpha_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)

        if self.args.alpha == 'auto':
            self.target_entropy = args.target_entropy
        else:
            self.alpha = torch.tensor(self.args.alpha)
Example #12
class Learner():
    def __init__(self, args, shared_queue, shared_value, share_net,
                 share_optimizer, device, lock, i):
        super(Learner, self).__init__()
        self.args = args
        seed = self.args.seed
        self.init_time = self.args.init_time
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.agent_id = i

        self.experience_out_queue = []
        for i in range(args.num_buffers):
            self.experience_out_queue.append(shared_queue[1][i])

        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value

        self.device = device
        if self.device == torch.device("cpu"):
            self.gpu = False
        else:
            self.gpu = True
        self.lock = lock

        (self.Q_net1_share, self.Q_net1_target_share, self.Q_net2_share,
         self.Q_net2_target_share, self.actor1_share, self.actor1_target_share,
         self.actor2_share, self.actor2_target_share, self.log_alpha_share) = share_net
        self.Q_net1_optimizer, self.Q_net2_optimizer, self.actor1_optimizer, self.actor2_optimizer, self.alpha_optimizer = share_optimizer

        self.Q_net1 = QNet(args).to(self.device)
        self.scheduler_Q_net1 = lr_scheduler.CosineAnnealingLR(
            self.Q_net1_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.Q_net1.train()

        self.Q_net1_target = QNet(args).to(self.device)
        self.Q_net1_target.train()

        self.Q_net2 = QNet(args).to(self.device)
        self.scheduler_Q_net2 = lr_scheduler.CosineAnnealingLR(
            self.Q_net2_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.Q_net2.train()

        self.Q_net2_target = QNet(args).to(self.device)
        self.Q_net2_target.train()

        self.actor1 = PolicyNet(args).to(self.device)
        self.scheduler_actor1 = lr_scheduler.CosineAnnealingLR(
            self.actor1_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.actor1.train()

        self.actor1_target = PolicyNet(args).to(self.device)
        self.actor1_target.train()

        self.actor2 = PolicyNet(args).to(self.device)
        self.scheduler_actor2 = lr_scheduler.CosineAnnealingLR(
            self.actor2_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.actor2.train()

        self.actor2_target = PolicyNet(args).to(self.device)
        self.actor2_target.train()

        self.scheduler_alpha = lr_scheduler.CosineAnnealingLR(
            self.alpha_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)

        if self.args.alpha == 'auto':
            self.target_entropy = args.target_entropy
        else:
            self.alpha = torch.tensor(self.args.alpha)

    def get_qloss(self, q, q_std, target_q, target_q_bound):
        if self.args.distributional_Q:
            # loss = -Normal(q, q_std).log_prob(target_q).mean()
            # loss = torch.mean(-Normal(q, q_std).log_prob(target_q_bound)*self.weight \
            #                   + self.weight.logical_not()*torch.pow(q-target_q,2))
            loss = torch.mean(torch.pow(q-target_q,2)/(2*torch.pow(q_std.detach(),2)) \
                   + torch.pow(q.detach()-target_q_bound,2)/(2*torch.pow(q_std,2))\
                   + torch.log(q_std))
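            # In the distributional case: the first term detaches q_std, so it only trains
            # the Q mean toward target_q (inverse-variance weighted); the second term
            # detaches q, so it only trains q_std toward the bounded TD error
            # |q - target_q_bound|; torch.log(q_std) is the Gaussian normalizer that keeps
            # the predicted std from growing without bound.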
        else:
            criterion = nn.MSELoss()
            loss = criterion(q, target_q)
        return loss

    def get_policyloss(self, q, log_prob_a_new):
        loss = (self.alpha.detach() * log_prob_a_new - q).mean()
        return loss

    def update_net(self, loss, optimizer, net, net_share, scheduler):
        optimizer.zero_grad()
        if self.gpu:
            if self.args.alpha == 'auto':
                if net is not self.log_alpha:
                    net.zero_grad()
            else:
                net.zero_grad()
        loss.backward()
        if self.args.alpha == 'auto':
            if net is self.log_alpha:
                if self.log_alpha_share.grad is None or self.log_alpha_share.grad == 0:
                    self.log_alpha_share._grad = self.log_alpha.grad
            else:
                ensure_shared_grads(model=net,
                                    shared_model=net_share,
                                    gpu=self.gpu)
        else:
            ensure_shared_grads(model=net,
                                shared_model=net_share,
                                gpu=self.gpu)
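        # Gradients are computed on the learner's local copy; ensure_shared_grads (defined
        # elsewhere) transfers them to the shared model, and since the shared optimizers
        # are built on the shared parameters (see main() below), the step updates the
        # shared weights. log_alpha is itself a shared tensor, so its gradient is written
        # directly into log_alpha_share above.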
        optimizer.step()
        scheduler.step(self.iteration)

    def target_q(self, r, done, q, q_std, q_next, log_prob_a_next):
        target_q = r + (1 - done) * self.args.gamma * (
            q_next - self.alpha.detach() * log_prob_a_next)
        if self.args.distributional_Q:
            if self.args.adaptive_bound:
                target_max = q + 3 * q_std
                target_min = q - 3 * q_std
                target_q = torch.min(target_q, target_max)
                target_q = torch.max(target_q, target_min)
            difference = torch.clamp(target_q - q, -self.args.TD_bound,
                                     self.args.TD_bound)
            target_q_bound = q + difference
            self.weight = torch.le(torch.abs(target_q - q),
                                   self.args.TD_bound).detach()
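            # target_q_bound re-centres the target on the current estimate with the TD
            # error clamped to +/- TD_bound; self.weight flags samples whose raw TD error
            # already lies inside that bound (only used by the commented-out weighted loss
            # in get_qloss).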
        else:
            target_q_bound = target_q
        return target_q.detach(), target_q_bound.detach()

    def send_to_device(self, s, info, a, r, s_next, info_next, done, device):
        s = s.to(device)
        info = info.to(device)
        a = a.to(device)
        r = r.to(device)
        s_next = s_next.to(device)
        info_next = info_next.to(device)
        done = done.to(device)
        return s, info, a, r, s_next, info_next, done

    def run(self):
        local_iteration = 0
        index = np.random.randint(0, self.args.num_buffers)
        while self.experience_out_queue[index].empty(
        ) and not self.stop_sign.value:
            index = np.random.randint(0, self.args.num_buffers)
            time.sleep(0.1)

        while not self.stop_sign.value:
            self.iteration = self.iteration_counter.value
            self.Q_net1.load_state_dict(self.Q_net1_share.state_dict())
            self.Q_net1_target.load_state_dict(
                self.Q_net1_target_share.state_dict())
            self.Q_net2.load_state_dict(self.Q_net2_share.state_dict())
            self.Q_net2_target.load_state_dict(
                self.Q_net2_target_share.state_dict())
            self.actor1.load_state_dict(self.actor1_share.state_dict())
            self.actor1_target.load_state_dict(
                self.actor1_target_share.state_dict())
            self.actor2.load_state_dict(self.actor2_share.state_dict())
            self.actor2_target.load_state_dict(
                self.actor2_target_share.state_dict())
            if self.args.alpha == 'auto':
                self.log_alpha = self.log_alpha_share.detach().clone(
                ).requires_grad_(True)
                self.alpha = self.log_alpha.exp().to(self.device)

            index = np.random.randint(0, self.args.num_buffers)
            while self.experience_out_queue[index].empty(
            ) and not self.stop_sign.value:
                index = np.random.randint(0, self.args.num_buffers)
                time.sleep(0.1)
            if not self.experience_out_queue[index].empty():
                s, info, a, r, s_next, info_next, done = self.experience_out_queue[
                    index].get()
                s, info, a, r, s_next, info_next, done = self.send_to_device(
                    s, info, a, r, s_next, info_next, done, self.device)

            q_1, q_std_1, _ = self.Q_net1.evaluate(s,
                                                   info,
                                                   a,
                                                   device=self.device,
                                                   min=False)
            if self.args.double_Q:
                q_2, q_std_2, _ = self.Q_net2.evaluate(s,
                                                       info,
                                                       a,
                                                       device=self.device,
                                                       min=False)

            smoothing_trick = False
            if not self.args.stochastic_actor:
                if self.args.policy_smooth:
                    smoothing_trick = True

            a_new_1, log_prob_a_new_1, a_new_std_1 = self.actor1.evaluate(
                s, info, smooth_policy=False, device=self.device)
            a_next_1, log_prob_a_next_1, _ = self.actor1_target.evaluate(
                s_next,
                info_next,
                smooth_policy=smoothing_trick,
                device=self.device)
            if self.args.double_actor:
                a_new_2, log_prob_a_new_2, _ = self.actor2.evaluate(
                    s, info, smooth_policy=False, device=self.device)
                a_next_2, log_prob_a_next_2, _ = self.actor2_target.evaluate(
                    s_next,
                    info_next,
                    smooth_policy=smoothing_trick,
                    device=self.device)

            if self.args.double_Q and self.args.double_actor:
                q_next_target_1, _, q_next_sample_1 = self.Q_net2_target.evaluate(
                    s_next, info_next, a_next_1, device=self.device, min=False)
                q_next_target_2, _, _ = self.Q_net1_target.evaluate(
                    s_next, info_next, a_next_2, device=self.device, min=False)
                target_q_1, target_q_1_bound = self.target_q(
                    r, done, q_1.detach(), q_std_1.detach(),
                    q_next_target_1.detach(), log_prob_a_next_1.detach())
                target_q_2, target_q_2_bound = self.target_q(
                    r, done, q_2.detach(), q_std_2.detach(),
                    q_next_target_2.detach(), log_prob_a_next_2.detach())
            else:
                q_next_1, _, q_next_sample_1 = self.Q_net1_target.evaluate(
                    s_next, info_next, a_next_1, device=self.device, min=False)
                if self.args.double_Q:
                    q_next_2, _, _ = self.Q_net2_target.evaluate(
                        s_next,
                        info_next,
                        a_next_1,
                        device=self.device,
                        min=False)
                    q_next_target_1 = torch.min(q_next_1, q_next_2)
                elif self.args.distributional_Q:
                    q_next_target_1 = q_next_sample_1
                else:
                    q_next_target_1 = q_next_1
                target_q_1, target_q_1_bound = self.target_q(
                    r, done, q_1.detach(), q_std_1.detach(),
                    q_next_target_1.detach(), log_prob_a_next_1.detach())

            if self.args.double_Q and self.args.double_actor:
                q_object_1, _, _ = self.Q_net1.evaluate(s,
                                                        info,
                                                        a_new_1,
                                                        device=self.device,
                                                        min=False)
                q_object_2, _, _ = self.Q_net2.evaluate(s,
                                                        info,
                                                        a_new_2,
                                                        device=self.device,
                                                        min=False)
            else:
                q_new_1, _, _ = self.Q_net1.evaluate(s,
                                                     info,
                                                     a_new_1,
                                                     device=self.device,
                                                     min=False)
                if self.args.double_Q:
                    q_new_2, _, _ = self.Q_net2.evaluate(s,
                                                         info,
                                                         a_new_1,
                                                         device=self.device,
                                                         min=False)
                    q_object_1 = torch.min(q_new_1, q_new_2)
                elif self.args.distributional_Q:
                    q_object_1 = q_new_1
                else:
                    q_object_1 = q_new_1

            if local_iteration % self.args.delay_update == 0:
                if self.args.alpha == 'auto':
                    alpha_loss = -(self.log_alpha *
                                   (log_prob_a_new_1.detach().cpu() +
                                    self.target_entropy)).mean()
                    self.update_net(alpha_loss, self.alpha_optimizer,
                                    self.log_alpha, self.log_alpha_share,
                                    self.scheduler_alpha)

            q_loss_1 = self.get_qloss(q_1, q_std_1, target_q_1,
                                      target_q_1_bound)
            self.update_net(q_loss_1, self.Q_net1_optimizer, self.Q_net1,
                            self.Q_net1_share, self.scheduler_Q_net1)
            if self.args.double_Q:
                if self.args.double_actor:
                    q_loss_2 = self.get_qloss(q_2, q_std_2, target_q_2,
                                              target_q_2_bound)
                    self.update_net(q_loss_2, self.Q_net2_optimizer,
                                    self.Q_net2, self.Q_net2_share,
                                    self.scheduler_Q_net2)
                else:
                    q_loss_2 = self.get_qloss(q_2, q_std_2, target_q_1,
                                              target_q_1_bound)
                    self.update_net(q_loss_2, self.Q_net2_optimizer,
                                    self.Q_net2, self.Q_net2_share,
                                    self.scheduler_Q_net2)

            if self.args.code_model == "train":
                if local_iteration % self.args.delay_update == 0:
                    policy_loss_1 = self.get_policyloss(
                        q_object_1, log_prob_a_new_1)
                    self.update_net(policy_loss_1, self.actor1_optimizer,
                                    self.actor1, self.actor1_share,
                                    self.scheduler_actor1)
                    slow_sync_param(self.actor1_share,
                                    self.actor1_target_share, self.args.tau,
                                    self.gpu)
                    if self.args.double_actor:
                        policy_loss_2 = self.get_policyloss(
                            q_object_2, log_prob_a_new_2)
                        self.update_net(policy_loss_2, self.actor2_optimizer,
                                        self.actor2, self.actor2_share,
                                        self.scheduler_actor2)
                        slow_sync_param(self.actor2_share,
                                        self.actor2_target_share,
                                        self.args.tau, self.gpu)

            if local_iteration % self.args.delay_update == 0:
                slow_sync_param(self.Q_net1_share, self.Q_net1_target_share,
                                self.args.tau, self.gpu)
                if self.args.double_Q:
                    slow_sync_param(self.Q_net2_share,
                                    self.Q_net2_target_share, self.args.tau,
                                    self.gpu)

            with self.lock:
                self.iteration_counter.value += 1
            local_iteration += 1

            if self.iteration % self.args.save_model_period == 0 or (
                    self.iteration == 0 and self.agent_id == 0):
                torch.save(
                    self.actor1.state_dict(), './' + self.args.env_name +
                    '/method_' + str(self.args.method) + '/model/policy1_' +
                    str(self.iteration) + '.pkl')
                torch.save(
                    self.Q_net1.state_dict(), './' + self.args.env_name +
                    '/method_' + str(self.args.method) + '/model/Q1_' +
                    str(self.iteration) + '.pkl')
                if self.args.alpha == 'auto':
                    np.save(
                        './' + self.args.env_name + '/method_' +
                        str(self.args.method) + '/model/log_alpha' +
                        str(self.iteration),
                        self.log_alpha.detach().cpu().numpy())
                if self.args.double_Q:
                    torch.save(
                        self.Q_net2.state_dict(), './' + self.args.env_name +
                        '/method_' + str(self.args.method) + '/model/Q2_' +
                        str(self.iteration) + '.pkl')
                if self.args.double_actor:
                    torch.save(
                        self.actor2.state_dict(), './' + self.args.env_name +
                        '/method_' + str(self.args.method) +
                        '/model/policy2_' + str(self.iteration) + '.pkl')

            if self.iteration % 500 == 0 or self.iteration == 0 and self.agent_id == 0:
                print("agent", self.agent_id, "method", self.args.method,
                      "iteration", self.iteration, "time",
                      time.time() - self.init_time)
                print("loss_1", q_loss_1, "alpha", self.alpha, "lr",
                      self.scheduler_Q_net1.get_lr(),
                      self.scheduler_Q_net2.get_lr(),
                      self.scheduler_actor1.get_lr(),
                      self.scheduler_actor2.get_lr(),
                      self.scheduler_alpha.get_lr())
                print("q_std", q_std_1.t()[0][0:8])
                print("a_std", a_new_std_1.t()[0][0:8])
Example #13
        # 1	Cart Velocity             -Inf            Inf
        int(check_bound(np.degrees(observation[2]), np.arange(-11, 11, 1))),
        # 2	Pole Angle                 -24 deg        24 deg
        int(check_bound(observation[3], np.arange(-0.88, 0.88, 0.08)))
        # 3	Pole Velocity At Tip      -Inf            Inf
    ]


# Create Agent
actions = range(env.action_space.n)
agent = Agent(None, (25, 25, 25), actions)
temp_agent = agent.__copy__()

# Create Network
net_size = 128
net = QNet(env.observation_space.shape[0], env.action_space.n, net_size,
           device).to(device)
optimizer = optim.Adam(net.parameters(), lr=1e-3)
net.train()

ok = False
guts = 0
i_episode = 0
total = 0
loss = 0
guts_required = 100
guts_print_div = 10
big_data = [[], []]
print("Learning...")
while not ok:
    # Agent learning
    while guts < guts_required:
Example #14
class Evaluator(object):
    def __init__(self, args, shared_value, share_net):
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)

        eval_params = {
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter':
            'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': int(2000 + 3 * args.num_actors),  # connection port
            'task_mode':
            'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 100,  # maximum timesteps per episode
            'desired_speed': 15,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args
        self.env = gym.make(args.env_name, params=eval_params)
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        self.alpha = np.exp(self.log_alpha_share.detach().item()
                            ) if args.alpha == 'auto' else 0

        self.evaluation_interval = 20000
        self.max_state_num_evaluated_in_an_episode = 50  # 500
        self.episode_num_evaluation = 5
        self.episode_num_test = 5
        self.time = time.time()
        self.list_of_n_episode_rewards_history = []
        self.time_history = []
        self.alpha_history = []
        self.average_return_with_diff_base_history = []
        self.average_reward_history = []
        self.iteration_history = []
        self.evaluated_Q_mean_history = []
        self.evaluated_Q_std_history = []
        self.true_gamma_return_mean_history = []
        self.policy_entropy_history = []
        self.a_std_history = []
        self.a_abs_history = []

    def average_max_n(self, list_for_average, n):
        sorted_list = sorted(list_for_average, reverse=True)
        return sum(sorted_list[:n]) / n

    def run_an_episode(self, deterministic):
        #state_list = []
        action_list = []
        log_prob_list = []
        reward_list = []
        evaluated_Q_list = []
        Q_std_list = []
        a_std_list = []
        done = 0
        state, info = self.env.reset()
        while not done and len(reward_list) < (self.args.max_step - 1):
            state_tensor = torch.FloatTensor(state.copy()).float().to(
                self.device)
            info_tensor = torch.FloatTensor(info.copy()).float().to(
                self.device)
            if self.args.NN_type == "CNN":
                state_tensor = state_tensor.permute(2, 0, 1)  # 3, 256, 256

            u, log_prob, a_std = self.actor.get_action(
                state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                deterministic)
            log_prob_list.append(log_prob)
            a_std_list.append(a_std)
            if self.args.double_Q and not self.args.double_actor:
                q = torch.min(
                    self.Q_net1.evaluate(
                        state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                        torch.FloatTensor(u.copy()).to(self.device))[0],
                    self.Q_net2.evaluate(
                        state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                        torch.FloatTensor(u.copy()).to(self.device))[0])
            else:
                q, q_std, _ = self.Q_net1.evaluate(
                    state_tensor.unsqueeze(0), info_tensor.unsqueeze(0),
                    torch.FloatTensor(u.copy()).to(self.device))
            evaluated_Q_list.append(q.detach().item())
            if self.args.distributional_Q:
                Q_std_list.append(q_std.detach().item())
            else:
                Q_std_list.append(0)
            u = u.squeeze(0)
            state, reward, done, info = self.env.step(u)
            # self.env.render(mode='human')
            action_list.append(u)
            reward_list.append(reward * self.args.reward_scale)

        if not deterministic:
            entropy_list = list(-self.alpha * np.array(log_prob_list))
            true_gamma_return_list = cal_gamma_return_of_an_episode(
                reward_list, entropy_list, self.args.gamma)
            policy_entropy = -sum(log_prob_list) / len(log_prob_list)
            a_std_mean = np.mean(np.array(a_std_list), axis=0)
            a_abs_mean = np.mean(np.abs(np.array(action_list)), axis=0)
            return dict(  #state_list=np.array(state_list),
                #action_list=np.array(action_list),
                log_prob_list=np.array(log_prob_list),
                policy_entropy=policy_entropy,
                #reward_list=np.array(reward_list),
                a_std_mean=a_std_mean,
                a_abs_mean=a_abs_mean,
                evaluated_Q_list=np.array(evaluated_Q_list),
                Q_std_list=np.array(Q_std_list),
                true_gamma_return_list=true_gamma_return_list,
            )
        else:
            episode_return = sum(reward_list) / self.args.reward_scale
            episode_len = len(reward_list)
            return dict(episode_return=episode_return, episode_len=episode_len)

    def run_n_episodes(self, n, max_state, deterministic):
        n_episode_state_list = []
        n_episode_action_list = []
        n_episode_log_prob_list = []
        n_episode_reward_list = []
        n_episode_evaluated_Q_list = []
        n_episode_Q_std_list = []
        n_episode_true_gamma_return_list = []
        n_episode_return_list = []
        n_episode_len_list = []
        n_episode_policyentropy_list = []
        n_episode_a_std_list = []
        for _ in range(n):
            episode_info = self.run_an_episode(deterministic)
            # n_episode_state_list.append(episode_info['state_list'])
            # n_episode_action_list.append(episode_info['action_list'])
            # n_episode_log_prob_list.append(episode_info['log_prob_list'])
            # n_episode_reward_list.append(episode_info['reward_list'])
            if not deterministic:
                n_episode_evaluated_Q_list.append(
                    episode_info['evaluated_Q_list'])
                n_episode_Q_std_list.append(episode_info['Q_std_list'])
                n_episode_true_gamma_return_list.append(
                    episode_info['true_gamma_return_list'])
                n_episode_policyentropy_list.append(
                    episode_info['policy_entropy'])
                n_episode_a_std_list.append(episode_info['a_std_mean'])
                n_episode_action_list.append(episode_info['a_abs_mean'])
            else:
                n_episode_return_list.append(episode_info['episode_return'])
                n_episode_len_list.append(episode_info['episode_len'])

        if not deterministic:
            average_policy_entropy = sum(n_episode_policyentropy_list) / len(
                n_episode_policyentropy_list)
            average_a_std = np.mean(np.array(n_episode_a_std_list), axis=0)
            average_a_abs = np.mean(np.array(n_episode_action_list), axis=0)

            # n_episode_evaluated_Q_list_history = list(map(lambda x: x['n_episode_evaluated_Q_list'], n_episodes_info_history))
            # n_episode_true_gamma_return_list_history = list(map(lambda x: x['n_episode_true_gamma_return_list'], n_episodes_info_history))

            def concat_interest_epi_part_of_one_ite_and_mean(list_of_n_epi):
                tmp = list(copy.deepcopy(list_of_n_epi))
                tmp[0] = tmp[0] if len(
                    tmp[0]) <= max_state else tmp[0][:max_state]

                def reduce_fuc(a, b):
                    return np.concatenate(
                        [a, b]) if len(b) < max_state else np.concatenate(
                            [a, b[:max_state]])

                interest_epi_part_of_one_ite = reduce(reduce_fuc, tmp)
                return sum(interest_epi_part_of_one_ite) / len(
                    interest_epi_part_of_one_ite)
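            # The helper above truncates each episode's per-step list to at most max_state
            # entries, concatenates the n episodes, and returns the scalar mean over all
            # remaining entries.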

            evaluated_Q_mean = concat_interest_epi_part_of_one_ite_and_mean(
                np.array(n_episode_evaluated_Q_list))
            evaluated_Q_std = concat_interest_epi_part_of_one_ite_and_mean(
                np.array(n_episode_Q_std_list))
            true_gamma_return_mean = concat_interest_epi_part_of_one_ite_and_mean(
                np.array(n_episode_true_gamma_return_list))

            return dict(evaluated_Q_mean=evaluated_Q_mean,
                        true_gamma_return_mean=true_gamma_return_mean,
                        evaluated_Q_std=evaluated_Q_std,
                        n_episode_reward_list=np.array(n_episode_reward_list),
                        policy_entropy=average_policy_entropy,
                        a_std=average_a_std,
                        a_abs=average_a_abs)
        else:
            average_return_with_diff_base = np.array([
                self.average_max_n(n_episode_return_list, x)
                for x in [1, self.episode_num_test - 2, self.episode_num_test]
            ])
            average_reward = sum(n_episode_return_list) / sum(
                n_episode_len_list)
            return dict(
                n_episode_reward_list=np.array(n_episode_reward_list),
                average_return_with_diff_base=average_return_with_diff_base,
                average_reward=average_reward,
            )

    def run(self):
        while not self.stop_sign.value:
            if self.iteration_counter.value % self.evaluation_interval == 0:
                self.alpha = np.exp(self.log_alpha_share.detach().item()
                                    ) if self.args.alpha == 'auto' else 0
                self.iteration = self.iteration_counter.value
                self.actor.load_state_dict(self.actor_share.state_dict())
                self.Q_net1.load_state_dict(self.Q_net1_share.state_dict())
                self.Q_net2.load_state_dict(self.Q_net2_share.state_dict())

                delta_time = time.time() - self.time
                self.time = time.time()
                n_episode_info = self.run_n_episodes(
                    self.episode_num_evaluation,
                    self.max_state_num_evaluated_in_an_episode, False)
                self.iteration_history.append(self.iteration)
                self.evaluated_Q_mean_history.append(
                    n_episode_info['evaluated_Q_mean'])
                self.evaluated_Q_std_history.append(
                    n_episode_info['evaluated_Q_std'])
                self.true_gamma_return_mean_history.append(
                    n_episode_info['true_gamma_return_mean'])
                self.time_history.append(delta_time)
                # self.list_of_n_episode_rewards_history.append(list_of_n_episode_rewards)
                self.alpha_history.append(self.alpha.item())
                self.policy_entropy_history.append(
                    n_episode_info['policy_entropy'])
                self.a_std_history.append(n_episode_info['a_std'])
                self.a_abs_history.append(n_episode_info['a_abs'])
                n_episode_info_test = self.run_n_episodes(
                    self.episode_num_test,
                    self.max_state_num_evaluated_in_an_episode, True)
                self.average_return_with_diff_base_history.append(
                    n_episode_info_test['average_return_with_diff_base'])
                self.average_reward_history.append(
                    n_episode_info_test['average_reward'])

                print('Saving evaluation results of the {} iteration.'.format(
                    self.iteration))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/iteration',
                    np.array(self.iteration_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/evaluated_Q_mean',
                    np.array(self.evaluated_Q_mean_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/evaluated_Q_std',
                    np.array(self.evaluated_Q_std_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/true_gamma_return_mean',
                    np.array(self.true_gamma_return_mean_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/time',
                    np.array(self.time_history))
                # np.save('./' + self.args.env_name + '/method_' + str(self.args.method) + '/result/list_of_n_episode_rewards',
                #         np.array(self.list_of_n_episode_rewards_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) +
                    '/result/average_return_with_diff_base',
                    np.array(self.average_return_with_diff_base_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/average_reward',
                    np.array(self.average_reward_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/alpha',
                    np.array(self.alpha_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/policy_entropy',
                    np.array(self.policy_entropy_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/a_std',
                    np.array(self.a_std_history))
                np.save(
                    './' + self.args.env_name + '/method_' +
                    str(self.args.method) + '/result/a_abs',
                    np.array(self.a_abs_history))

                # plot_online(self.args.env_name, self.args.method, self.args.method_name,
                #             self.max_state_num_evaluated_in_an_episode)

                if self.iteration >= self.args.max_train:
                    self.stop_sign.value = 1
                    break
Example #15
def main(method):
    args = built_parser(method=method)
    env = gym.make(args.env_name)
    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]

    args.state_dim = state_dim
    args.action_dim = action_dim
    action_high = env.action_space.high
    action_low = env.action_space.low
    args.action_high = action_high.tolist()
    args.action_low = action_low.tolist()
    args.seed = np.random.randint(0, 30)
    args.init_time = time.time()

    if args.alpha == 'auto' and args.target_entropy == 'auto':
        delta_a = np.array(args.action_high, dtype=np.float32) - np.array(
            args.action_low, dtype=np.float32)
        args.target_entropy = -1 * args.action_dim  #+ sum(np.log(delta_a/2))
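        # The line above applies the common SAC heuristic of setting the target entropy
        # to -dim(action space).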

    Q_net1 = QNet(args)
    Q_net1.train()
    Q_net1.share_memory()
    Q_net1_target = QNet(args)
    Q_net1_target.train()
    Q_net1_target.share_memory()
    Q_net2 = QNet(args)
    Q_net2.train()
    Q_net2.share_memory()
    Q_net2_target = QNet(args)
    Q_net2_target.train()
    Q_net2_target.share_memory()
    actor1 = PolicyNet(args)

    actor1.train()
    actor1.share_memory()
    actor1_target = PolicyNet(args)
    actor1_target.train()
    actor1_target.share_memory()
    actor2 = PolicyNet(args)
    actor2.train()
    actor2.share_memory()
    actor2_target = PolicyNet(args)
    actor2_target.train()
    actor2_target.share_memory()

    Q_net1_target.load_state_dict(Q_net1.state_dict())
    Q_net2_target.load_state_dict(Q_net2.state_dict())
    actor1_target.load_state_dict(actor1.state_dict())
    actor2_target.load_state_dict(actor2.state_dict())

    Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(),
                                           lr=args.critic_lr)
    Q_net1_optimizer.share_memory()
    Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(),
                                           lr=args.critic_lr)
    Q_net2_optimizer.share_memory()
    actor1_optimizer = my_optim.SharedAdam(actor1.parameters(),
                                           lr=args.actor_lr)
    actor1_optimizer.share_memory()
    actor2_optimizer = my_optim.SharedAdam(actor2.parameters(),
                                           lr=args.actor_lr)
    actor2_optimizer.share_memory()
    log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    log_alpha.share_memory_()
    alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr)
    alpha_optimizer.share_memory()

    share_net = [
        Q_net1, Q_net1_target, Q_net2, Q_net2_target, actor1, actor1_target,
        actor2, actor2_target, log_alpha
    ]
    share_optimizer = [
        Q_net1_optimizer, Q_net2_optimizer, actor1_optimizer, actor2_optimizer,
        alpha_optimizer
    ]

    experience_in_queue = []
    experience_out_queue = []
    for i in range(args.num_buffers):
        experience_in_queue.append(Queue(maxsize=10))
        experience_out_queue.append(Queue(maxsize=10))
    shared_queue = [experience_in_queue, experience_out_queue]
    step_counter = mp.Value('i', 0)
    stop_sign = mp.Value('i', 0)
    iteration_counter = mp.Value('i', 0)
    shared_value = [step_counter, stop_sign, iteration_counter]
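    # shared_value[0] is the environment-step counter, [1] the stop flag and [2] the
    # learner-iteration counter; the Actor, Learner, Evaluator and Simulation classes
    # above index into it in exactly this order.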
    lock = mp.Lock()
    procs = []
    if args.code_model == "train":
        for i in range(args.num_actors):
            procs.append(
                Process(target=actor_agent,
                        args=(args, shared_queue, shared_value,
                              [actor1, Q_net1], lock, i)))
        for i in range(args.num_buffers):
            procs.append(
                Process(target=buffer,
                        args=(args, shared_queue, shared_value, i)))
        procs.append(
            Process(target=evaluate_agent,
                    args=(args, shared_value, share_net)))
        for i in range(args.num_learners):
            #device = torch.device("cuda")
            device = torch.device("cpu")
            procs.append(
                Process(target=leaner_agent,
                        args=(args, shared_queue, shared_value, share_net,
                              share_optimizer, device, lock, i)))
    elif args.code_model == "simu":
        procs.append(Process(target=simu_agent, args=(args, shared_value)))

    for p in procs:
        p.start()
    for p in procs:
        p.join()
Example #16
class Actor():
    def __init__(self, args, shared_queue, shared_value, lock, i):
        super(Actor, self).__init__()
        self.agent_id = i
        seed = args.seed + np.int64(self.agent_id)
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.experience_queue = shared_queue[0]
        self.policy_param_queue = shared_queue[1]
        self.q_param_queue = shared_queue[2]
        self.counter = shared_value[0]
        self.stop_sign = shared_value[1]
        self.lock = lock
        self.env = gym.make(args.env_name)
        self.args = args
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args.state_dim, args.num_hidden_cell,
                               args.action_high, args.action_low,
                               args.NN_type).to(self.device)
        self.Q_net1 = QNet(args.state_dim, args.action_dim,
                           args.num_hidden_cell, args.NN_type).to(self.device)

    def update_actor_net(self, current_dict, actor_net):
        params_target = get_flat_params_from(actor_net)
        params = get_flat_params_from_dict(current_dict)
        set_flat_params_to(actor_net, (1 - self.args.syn_tau) * params_target +
                           self.args.syn_tau * params)

    def load_param(self):
        if self.policy_param_queue.empty():
            #pass
            #print("agent", self.agent_id, "is waiting param")
            time.sleep(0.5)
            #self.load_param()
        else:
            param = self.policy_param_queue.get()
            if self.args.syn_method == "copy":
                self.actor.load_state_dict(param)
            elif self.args.syn_method == "slow":
                self.update_actor_net(param, self.actor)
        if self.q_param_queue.empty():
            time.sleep(0.5)
            #self.load_param()
        else:
            param = self.q_param_queue.get()
            self.Q_net1.load_state_dict(param)

    def put_data(self):
        if not self.stop_sign.value:
            if self.experience_queue.full():
                #print("agent", self.agent_id, "is waiting queue space")
                time.sleep(0.5)
                self.put_data()
            else:
                self.experience_queue.put(
                    (self.last_state, self.last_u, [self.reward],
                     self.state, [self.micro_step], [self.done],
                     self.TD.detach().cpu().numpy().squeeze()))
        else:
            pass

    def run(self):
        time_init = time.time()
        step = 0
        self.micro_step = 0
        while not self.stop_sign.value:
            self.state = self.env.reset()
            self.episode_step = 0
            state_tensor = torch.FloatTensor(self.state.copy()).float().to(
                self.device)
            if self.args.NN_type == "CNN":
                state_tensor = state_tensor.permute(2, 0, 1)
            self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0),
                                                     False)
            q_1 = self.Q_net1(state_tensor.unsqueeze(0),
                              torch.FloatTensor(self.u).to(self.device))[0]
            self.u = self.u.squeeze(0)
            self.last_state = self.state.copy()
            self.last_u = self.u.copy()
            last_q_1 = q_1

            for i in range(self.args.max_step):
                self.state, self.reward, self.done, _ = self.env.step(self.u)
                state_tensor = torch.FloatTensor(self.state.copy()).float().to(
                    self.device)
                if self.args.NN_type == "CNN":
                    state_tensor = state_tensor.permute(2, 0, 1)
                self.u, log_prob = self.actor.get_action(
                    state_tensor.unsqueeze(0), False)
                q_1 = self.Q_net1(state_tensor.unsqueeze(0),
                                  torch.FloatTensor(self.u).to(self.device))[0]
                self.u = self.u.squeeze(0)
                if self.episode_step > 0:
                    self.TD = self.reward + (
                        1 - self.done) * self.args.gamma * q_1 - last_q_1
                    self.put_data()
                self.last_state = self.state.copy()
                self.last_u = self.u.copy()
                last_q_1 = q_1

                with self.lock:
                    self.counter.value += 1

                if self.done == True:
                    break

                if step % self.args.load_param_period == 0:
                    self.load_param()
                step += 1
                self.episode_step += 1
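# The Actor above attaches a one-step TD estimate to every transition it ships,
# presumably so the buffer can use its magnitude as a sampling priority. A
# standalone version of that computation, with plain floats (the function name
# is hypothetical):
def one_step_td(reward, done, gamma, q_next, q_current):
    """TD residual r + (1 - done) * gamma * Q(s', a') - Q(s, a)."""
    return reward + (1.0 - float(done)) * gamma * q_next - q_current


# Example: a non-terminal transition with reward 1.0.
# 1.0 + 0.99 * 5.0 - 5.5 = 0.45
td = one_step_td(reward=1.0, done=False, gamma=0.99, q_next=5.0, q_current=5.5)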
Beispiel #17
0
def main():
    # parameters for the gym_carla environment
    params = {
        'display_size': 256,  # screen size of bird-eye render
        'obs_size': 128,  # screen size of cv2 window
        'dt': 0.1,  # time interval between two frames
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        # 'town': 'Town01',  # which town to simulate
        'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
        'code_mode': 'test',
        'max_time_episode': 5000,  # maximum timesteps per episode
        'desired_speed': 8,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
    }

    # Set gym-carla environment
    env = gym.make('carla-v0', params=params)

    # load net
    device = torch.device('cpu')
    args = Args()
    actor = PolicyNet(args).to(device)
    actor.load_state_dict(torch.load('./policy1_500000.pkl',map_location='cpu'))

    Q_net1 = QNet(args).to(device)
    Q_net1.load_state_dict(torch.load('./Q1_500000.pkl',map_location='cpu'))

    obs, info_dict = env.reset()
    info = info_dict_to_array(info_dict)

    state_tensor = torch.FloatTensor(obs.copy()).float().to(device)
    info_tensor = torch.FloatTensor(info.copy()).float().to(device)

    # print(env.ego.get_location())
    tic = time.time()
    done = False
    ret = 0
    start = carla.Location(x=env.start[0], y=env.start[1], z=0.22)
    end = carla.Location(x=env.dest[0], y=env.dest[1], z=0.22)

    if args.NN_type == "CNN":
        state_tensor = state_tensor.permute(2, 0, 1)

    while not done:
        tac = time.time()

        u, log_prob = actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), True)
        u = u.squeeze(0)

        obs, r, done, info_dict = env.step(u)

        info = info_dict_to_array(info_dict)
        state_tensor = torch.FloatTensor(obs.copy()).float().to(device)
        if args.NN_type == "CNN":
            state_tensor = state_tensor.permute(2, 0, 1)
        info_tensor = torch.FloatTensor(info.copy()).float().to(device)

        ret += r
        cv2.imshow("camera img", obs)
        cv2.waitKey(1)
        # print(info['acceleration_t'].shape)
        env.world.debug.draw_point(start)
        env.world.debug.draw_point(end)

        if done:
            toc = time.time()
            print("An episode took %f s" %(toc - tic))
            print("total reward is", ret)
            print("time steps", env.time_step)
            env.close()
            env.reset()
            ret = 0
            # print(env.ego.get_location())
            done = False
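# info_dict_to_array() is assumed by the script above but not shown here. A
# plausible minimal version flattens a CARLA-style state_info dict into a flat
# float vector; the helper name and key handling below are guesses, and the
# real implementation in the repository may differ.
import numpy as np

def info_dict_to_array_sketch(info):
    parts = []
    for key in sorted(info):            # fixed key order keeps the layout stable
        parts.append(np.asarray(info[key], dtype=np.float32).ravel())
    return np.concatenate(parts)

# info_dict_to_array_sketch({'velocity_t': [3.0, 0.1], 'delta_yaw_t': 0.02})
# -> array([0.02, 3. , 0.1], dtype=float32)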
class Simulation():
    def __init__(self, args, shared_value):
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.load_index = self.args.max_train

        self.actor = PolicyNet(args).to(self.device)
        self.actor.load_state_dict(
            torch.load('./' + self.args.env_name + '/method_' +
                       str(self.args.method) + '/model/policy1_' +
                       str(self.load_index) + '.pkl',
                       map_location='cpu'))

        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net1.load_state_dict(
            torch.load('./' + self.args.env_name + '/method_' +
                       str(self.args.method) + '/model/Q1_' +
                       str(self.load_index) + '.pkl',
                       map_location='cpu'))

        if self.args.double_Q:
            self.Q_net2 = QNet(args).to(self.device)
            self.Q_net2.load_state_dict(
                torch.load('./' + self.args.env_name + '/method_' +
                           str(self.args.method) + '/model/Q2_' +
                           str(self.load_index) + '.pkl',
                           map_location='cpu'))

        self.test_step = 0
        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history = []
        self.done_history = []
        self.Q_real_history = []
        self.Q_history = []
        self.Q_std_history = []

    def run(self):
        alpha = 0.004

        step = 0
        while True:
            self.state = self.env.reset()
            self.episode_step = 0

            for i in range(300):
                state_tensor = torch.FloatTensor(self.state.copy()).float().to(
                    self.device)
                if self.args.NN_type == "CNN":
                    state_tensor = state_tensor.permute(2, 0, 1)
                self.u, log_prob, _ = self.actor.get_action(
                    state_tensor.unsqueeze(0), True)

                q = self.Q_net1(state_tensor.unsqueeze(0),
                                torch.FloatTensor(self.u).to(self.device))[0]
                if self.args.double_Q:
                    q = torch.min(
                        self.Q_net1(state_tensor.unsqueeze(0),
                                    torch.FloatTensor(self.u).to(
                                        self.device))[0],
                        self.Q_net2(state_tensor.unsqueeze(0),
                                    torch.FloatTensor(self.u).to(
                                        self.device))[0])

                self.u = self.u.squeeze(0)
                self.state, self.reward, self.done, _ = self.env.step(self.u)

                self.Q_history.append(q.detach().item())
                self.reward_history.append(self.reward)
                self.done_history.append(self.done)
                self.entropy_history.append(log_prob)

                # the modulo check in the original (0 <= step % 10000 <= 9999)
                # is always true, so just render every step
                self.env.render(mode='human')

                if self.done == True:
                    time.sleep(1)
                    print("!!!!!!!!!!!!!!!")
                    break
                step += 1
                self.episode_step += 1

            if self.done:
                # break after the first finished episode so the Q-vs-return
                # analysis and plot below are actually reached
                break

        print(self.reward_history)
        for i in range(len(self.Q_history)):
            a = 0
            for j in range(i, len(self.Q_history), 1):
                a += pow(self.args.gamma, j - i) * self.reward_history[j]
            for z in range(i + 1, len(self.Q_history), 1):
                a -= alpha * pow(self.args.gamma,
                                 z - i) * self.entropy_history[z]
            self.Q_real_history.append(a)

        plt.figure()
        x = np.arange(0, len(self.Q_history), 1)
        plt.plot(x, np.array(self.Q_history), 'r', linewidth=2.0)
        plt.plot(x, np.array(self.Q_real_history), 'k', linewidth=2.0)

        plt.show()
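# The nested loops above compute, for each visited state i, the empirical soft
# discounted return sum_j gamma^(j-i) * r_j - alpha * sum_{z>i} gamma^(z-i) * log_prob_z,
# which is then compared with the critic's Q estimate. The same quantity can be
# obtained in one backward pass; this helper is only an illustration, not code
# from the repository:
def soft_discounted_returns(rewards, log_probs, gamma, alpha):
    n = len(rewards)
    returns = [0.0] * n
    running = 0.0
    for i in range(n - 1, -1, -1):
        if i == n - 1:
            running = rewards[i]
        else:
            # the entropy term starts at step i + 1, matching `for z in range(i + 1, ...)`
            running = rewards[i] + gamma * (running - alpha * log_probs[i + 1])
        returns[i] = running
    return returns

# soft_discounted_returns([1.0, 1.0], [0.0, 0.5], gamma=0.9, alpha=0.004)
# -> [1.0 + 0.9 * (1.0 - 0.004 * 0.5), 1.0] = [1.8982, 1.0]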
Beispiel #19
0
def main(method):

    params = {
        'obs_size': (160, 100),  # screen size of cv2 window
        'dt': 0.025,  # time interval between two frames
        'ego_vehicle_filter':
        'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'task_mode':
        'Straight',  # mode of the task, [random, roundabout (only for Town03)]
        'code_mode': 'train',
        'max_time_episode': 100,  # maximum timesteps per episode
        'desired_speed': 15,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
    }

    args = built_parser(method=method)
    env = gym.make(args.env_name, params=params)
    state_dim = env.state_space.shape
    action_dim = env.action_space.shape[0]

    args.state_dim = state_dim
    args.action_dim = action_dim
    action_high = env.action_space.high
    action_low = env.action_space.low
    args.action_high = action_high.tolist()
    args.action_low = action_low.tolist()
    args.seed = np.random.randint(0, 30)
    args.init_time = time.time()
    num_cpu = mp.cpu_count()
    print(state_dim, action_dim, action_high, num_cpu)

    if args.alpha == 'auto' and args.target_entropy == 'auto':
        delta_a = np.array(args.action_high, dtype=np.float32) - np.array(
            args.action_low, dtype=np.float32)
        args.target_entropy = -1 * args.action_dim  # + sum(np.log(delta_a/2))

    Q_net1 = QNet(args)
    Q_net1.train()
    Q_net1.share_memory()
    Q_net1_target = QNet(args)
    Q_net1_target.train()
    Q_net1_target.share_memory()
    Q_net2 = QNet(args)
    Q_net2.train()
    Q_net2.share_memory()
    Q_net2_target = QNet(args)
    Q_net2_target.train()
    Q_net2_target.share_memory()
    actor1 = PolicyNet(args)

    print("Network inited")

    if args.code_model == "eval":
        actor1.load_state_dict(
            torch.load('./' + args.env_name + '/method_' + str(args.method) +
                       '/model/policy_' + str(args.max_train) + '.pkl'))
    actor1.train()
    actor1.share_memory()
    actor1_target = PolicyNet(args)
    actor1_target.train()
    actor1_target.share_memory()
    actor2 = PolicyNet(args)
    actor2.train()
    actor2.share_memory()
    actor2_target = PolicyNet(args)
    actor2_target.train()
    actor2_target.share_memory()

    print("Network set")

    Q_net1_target.load_state_dict(Q_net1.state_dict())
    Q_net2_target.load_state_dict(Q_net2.state_dict())
    actor1_target.load_state_dict(actor1.state_dict())
    actor2_target.load_state_dict(actor2.state_dict())

    print("Network loaded!")

    Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(),
                                           lr=args.critic_lr)
    Q_net1_optimizer.share_memory()
    Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(),
                                           lr=args.critic_lr)
    Q_net2_optimizer.share_memory()
    actor1_optimizer = my_optim.SharedAdam(actor1.parameters(),
                                           lr=args.actor_lr)
    actor1_optimizer.share_memory()
    actor2_optimizer = my_optim.SharedAdam(actor2.parameters(),
                                           lr=args.actor_lr)
    actor2_optimizer.share_memory()
    log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    log_alpha.share_memory_()
    alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr)
    alpha_optimizer.share_memory()

    print("Optimizer done")

    share_net = [
        Q_net1, Q_net1_target, Q_net2, Q_net2_target, actor1, actor1_target,
        actor2, actor2_target, log_alpha
    ]
    share_optimizer = [
        Q_net1_optimizer, Q_net2_optimizer, actor1_optimizer, actor2_optimizer,
        alpha_optimizer
    ]

    experience_in_queue = []
    experience_out_queue = []
    for i in range(args.num_buffers):
        experience_in_queue.append(Queue(maxsize=10))
        experience_out_queue.append(Queue(maxsize=10))
    shared_queue = [experience_in_queue, experience_out_queue]
    step_counter = mp.Value('i', 0)
    stop_sign = mp.Value('i', 0)
    iteration_counter = mp.Value('i', 0)
    shared_value = [step_counter, stop_sign, iteration_counter]
    lock = mp.Lock()
    procs = []
    if args.code_model == "train":
        for i in range(args.num_learners):
            if i % 2 == 0:
                device = torch.device("cuda:1")
            else:
                device = torch.device("cuda:0")
            # device = torch.device("cpu")
            procs.append(
                Process(target=leaner_agent,
                        args=(args, shared_queue, shared_value, share_net,
                              share_optimizer, device, lock, i)))
        for i in range(args.num_actors):
            procs.append(
                Process(target=actor_agent,
                        args=(args, shared_queue, shared_value,
                              [actor1, Q_net1], lock, i)))
        for i in range(args.num_buffers):
            procs.append(
                Process(target=buffer,
                        args=(args, shared_queue, shared_value, i)))
        procs.append(
            Process(target=evaluate_agent,
                    args=(args, shared_value, share_net)))
    elif args.code_model == "simu":
        procs.append(Process(target=simu_agent, args=(args, shared_value)))

    for p in procs:
        p.start()
    for p in procs:
        p.join()
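# Why log_alpha is a shared leaf tensor with its own optimizer: with
# args.alpha == 'auto', SAC tunes the temperature so the policy entropy stays
# near target_entropy (set above to -action_dim). One isolated update step of
# that rule, with made-up numbers, could look like the following; the learner
# in this repository may implement the details differently.
import torch

log_alpha = torch.zeros(1, requires_grad=True)
alpha_optimizer = torch.optim.Adam([log_alpha], lr=3e-4)
target_entropy = -2.0                    # e.g. a 2-dimensional action space

log_prob = torch.tensor([-1.3])          # log pi(a|s) of a sampled action
alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()

alpha_optimizer.zero_grad()
alpha_loss.backward()
alpha_optimizer.step()
alpha = log_alpha.exp().item()           # temperature used in the actor/critic losses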
Beispiel #20
0
class Simulation():
    def __init__(self, args, shared_queue,shared_value):
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.policy_test_queue = shared_queue[3]
        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.load_index = 20000



        self.actor = PolicyNet(args.state_dim, args.num_hidden_cell, args.action_high, args.action_low,args.NN_type).to(self.device)
        self.actor.load_state_dict(torch.load('./data/method_' + str(1) + '/model/policy_' + str(self.load_index) + '.pkl'))

        self.Q_net1_m0 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net1_m0.load_state_dict(torch.load('./data/method_' + str(0) + '/model/Q1_' + str(self.load_index) + '.pkl'))

        self.Q_net1_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net1_m1.load_state_dict(torch.load('./data/method_' + str(1) + '/model/Q1_' + str(self.load_index) + '.pkl'))
        self.Q_net2_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net2_m1.load_state_dict(torch.load('./data/method_' + str(1) + '/model/Q2_' + str(self.load_index) + '.pkl'))

        self.Q_net1_m2 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net1_m2.load_state_dict(torch.load('./data/method_' + str(2) + '/model/Q1_' + str(self.load_index) + '.pkl'))



        self.test_step = 0

        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history =[]
        self.done_history = []
        self.Q_real_history = []
        self.Q_m0_history =[]
        self.Q_m1_history = []
        self.Q_m2_history = []
        self.Q_std_m2_history = []



    def load_param(self):
        if self.policy_test_queue.empty():
            pass
        else:
            self.iteration, param = self.policy_test_queue.get()
            self.actor.load_state_dict(param)

    def run(self):

        step = 0
        while True:
            self.state = self.env.reset()
            self.episode_step = 0
            state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device)
            if self.args.NN_type == "CNN":
                state_tensor = state_tensor.permute(2, 0, 1)
            self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), False)


            for i in range(self.args.max_step):
                q_m0 = self.Q_net1_m0(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0]
                q_m1 = torch.min(
                    self.Q_net1_m1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0],
                    self.Q_net2_m1(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0])
                q_m2, q_std, _ = self.Q_net1_m2.evaluate(state_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))



                self.Q_m0_history.append(q_m0.detach().item())
                self.Q_m1_history.append(q_m1.detach().item())
                self.Q_m2_history.append(q_m2.detach().item())
                self.Q_std_m2_history.append(q_std.detach().item())

                self.u = self.u.squeeze(0)
                self.state, self.reward, self.done, _ = self.env.step(self.u)

                self.reward_history.append(self.reward)
                self.done_history.append(self.done)
                self.entropy_history.append(log_prob)


                # the modulo check in the original (0 <= step % 10000 <= 9999)
                # is always true, so just render every step
                self.env.render(mode='human')
                state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device)
                if self.args.NN_type == "CNN":
                    state_tensor = state_tensor.permute(2, 0, 1)
                self.u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), False)



                if self.done == True:
                    time.sleep(1)
                    print("!!!!!!!!!!!!!!!")
                    break
                step += 1
                self.episode_step += 1

            if self.done == True:
                break



        for i in range(len(self.Q_m0_history)):
            a = 0
            for j in range(i, len(self.Q_m0_history), 1):
                a += pow(self.args.gamma, j-i)*self.reward_history[j]
            for z in range(i+1, len(self.Q_m0_history), 1):
                a -= self.args.alpha * pow(self.args.gamma, z-i) * self.entropy_history[z]
            self.Q_real_history.append(a)

        print(self.reward_history)
        print(self.entropy_history)
        print(self.Q_m2_history)
        print(self.Q_std_m2_history)

        plt.figure()
        x = np.arange(0,len(self.Q_m0_history),1)
        plt.plot(x, np.array(self.Q_m0_history), 'r', linewidth=2.0)
        plt.plot(x, np.array(self.Q_m1_history), 'g', linewidth=2.0)
        plt.plot(x, np.array(self.Q_m2_history), 'b', linewidth=2.0)


        plt.plot(x, np.array(self.Q_real_history), 'k', linewidth=2.0)

        plt.show()
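# The torch.min(...) calls above implement clipped double-Q evaluation: two
# independently trained critics are queried and the smaller estimate is kept,
# which counteracts the overestimation bias of a single critic. In isolation,
# on plain tensors:
import torch

q1 = torch.tensor([10.2, 3.1, -0.5])     # estimates from critic 1
q2 = torch.tensor([ 9.8, 3.4, -0.7])     # estimates from critic 2
q_clipped = torch.min(q1, q2)            # tensor([ 9.8000,  3.1000, -0.7000])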
Beispiel #21
0
class Simulation():
    def __init__(self, args,shared_value):
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)

        simu_params = {
            'number_of_vehicles': 0,
            'number_of_walkers': 0,
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': 2000,  # connection port
            'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 100,  # maximum timesteps per episode
            'desired_speed': 15,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name, params=simu_params)
        self.device = torch.device("cpu")
        self.load_index = self.args.max_train
        # self.load_index = 40000

        self.actor = PolicyNet(args).to(self.device)
        self.actor.load_state_dict(torch.load('./'+self.args.env_name+'/method_' + str(self.args.method) + '/model/policy1_' + str(self.load_index) + '.pkl',map_location='cpu'))

        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net1.load_state_dict(torch.load('./'+self.args.env_name+'/method_' + str(self.args.method) + '/model/Q1_' + str(self.load_index) + '.pkl',map_location='cpu'))

        if self.args.double_Q:
            self.Q_net2 = QNet(args).to(self.device)
            self.Q_net2.load_state_dict(torch.load('./'+self.args.env_name+'/method_' + str(self.args.method) + '/model/Q2_' + str(self.load_index) + '.pkl',map_location='cpu'))


        self.test_step = 0
        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history =[]
        self.done_history = []
        self.Q_real_history = []
        self.Q_history =[]
        self.Q_std_history = []


    def run(self):
        alpha = 0.004
        step = 0

        summaryFlag = True
        while True:
            self.state, self.info = self.env.reset()
            self.episode_step = 0
            state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device)
            info_tensor = torch.FloatTensor(self.info.copy()).float().to(self.device)

            if self.args.NN_type == "CNN":
                state_tensor = state_tensor.permute(2, 0, 1)
            self.u, log_prob, std = self.actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), True)


            for i in range(500):
                q = self.Q_net1(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0]
                if self.args.double_Q:
                    q = torch.min(
                        q,
                        self.Q_net2(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), torch.FloatTensor(self.u).to(self.device))[0])


                self.Q_history.append(q.detach().item())

                self.u = self.u.squeeze(0)

                # TODO
                if summaryFlag:
                    with SummaryWriter(log_dir='./logs') as writer:
                        # writer.add_scalar('random', np.random.randint(0, 10), i)
                        v = self.env.ego.get_velocity()
                        v = np.array([v.x, v.y, v.z])
                        writer.add_scalar('v_x', self.env.state_info['velocity_t'][0], i)
                        writer.add_scalar('v_y', self.env.state_info['velocity_t'][1], i)
                        writer.add_scalar('acceleration_x', self.env.state_info['acceleration_t'][0], i)
                        writer.add_scalar('acceleration_y', self.env.state_info['acceleration_t'][1], i)
                        # writer.add_scalar('distance2terminal', self.env.state_info['dist_to_dest'], i)
                        # writer.add_scalar('delta_yaw', self.state[5]*2, i)
                        writer.add_scalar('angular_speed_z', self.env.state_info['dyaw_dt_t'], i)
                        # writer.add_scalar('lateral_dist', self.state[7]/10, i)
                        writer.add_scalar('action_throttle', self.env.state_info['action_t_1'][0], i)
                        writer.add_scalar('action_steer', self.env.state_info['action_t_1'][1], i)
                        writer.add_scalar('delta_yaw', self.env.state_info['delta_yaw_t'], i)
                        writer.add_scalar('dist2center', self.env.state_info['lateral_dist_t'], i)

                self.state, self.reward, self.done, self.info = self.env.step(self.u)

                self.reward_history.append(self.reward)
                self.done_history.append(self.done)
                self.entropy_history.append(log_prob)

                # render the image
                cv2.imshow("camera img", self.state.squeeze())
                cv2.waitKey(1)
                # if step%10000 >=0 and step%10000 <=9999:
                #     self.env.render(mode='human')
                state_tensor = torch.FloatTensor(self.state.copy()).float().to(self.device)
                info_tensor = torch.FloatTensor(self.info.copy()).float().to(self.device)

                if self.args.NN_type == "CNN":
                    state_tensor = state_tensor.permute(2, 0, 1)
                self.u, log_prob, std = self.actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), True)



                if self.done == True or self.env.isTimeOut:
                    time.sleep(1)
                    print("Episode Done!")
                    summaryFlag = False
                    # return
                    break
                step += 1
                self.episode_step += 1

            if self.done:
                # break after the first finished episode so the Q-vs-return
                # analysis and plot below are actually reached
                break


        print(self.reward_history)
        for i in range(len(self.Q_history)):
            a = 0
            for j in range(i, len(self.Q_history), 1):
                a += pow(self.args.gamma, j-i)*self.reward_history[j]
            for z in range(i+1, len(self.Q_history), 1):
                a -= alpha * pow(self.args.gamma, z-i) * self.entropy_history[z]
            self.Q_real_history.append(a)


        plt.figure()
        x = np.arange(0,len(self.Q_history),1)
        plt.plot(x, np.array(self.Q_history), 'r', linewidth=2.0)
        plt.plot(x, np.array(self.Q_real_history), 'k', linewidth=2.0)

        plt.show()
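# The logging loop above opens a new SummaryWriter on every step. If that ever
# becomes a bottleneck, one writer can be created per episode (or per process)
# and reused; a minimal variant, assuming torch.utils.tensorboard and a dummy
# scalar:
from torch.utils.tensorboard import SummaryWriter

with SummaryWriter(log_dir='./logs') as writer:   # opened once, flushed on exit
    for i in range(5):
        writer.add_scalar('dummy/step_value', float(i), i)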
Beispiel #22
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, double_agent=False,dueling_agent=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
	    double_agent(bool) : True if we want to use DDQN
            dueling_agent (bool): True if we want to use Dueling
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_agent=double_agent
        self.dueling_agent=dueling_agent

        self.qnetwork_local = QNet(state_size, action_size, seed,dueling_agent=dueling_agent).to(device)
        self.qnetwork_target = QNet(state_size, action_size, seed,dueling_agent=dueling_agent).to(device)
        self.optimizer = optim.RMSprop(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory 
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def weighted_mse_loss(self, Q_expected, Q_targets, deltas):
        """Return the weighted mean squared error between Q_expected and Q_targets.

        Params
        ======
            Q_expected, Q_targets : current estimates and targets
            deltas : weights used to scale each sample's squared error
        """
        weight = (deltas / torch.sum(deltas) * BATCH_SIZE) ** (-1)
        return torch.mean(weight * (Q_expected - Q_targets) ** 2)

    def get_q_target(self,next_states,rewards,gamma,dones):
        """ Returns the target expected Q value  
  
        Params
        ======
            next_states : list of states we arrived in
            rewards : rewards we got
            gamma : discounting factor
            dones : list of bool telling if the episode is done
        """
        # Get max predicted Q values (for next states) from target model
        if not self.double_agent:
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        else:
            # Double DQN: pick the greedy action with the local network,
            # then evaluate it with the target network
            indices = torch.argmax(self.qnetwork_local(next_states).detach(), 1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, indices.unsqueeze(1))
        # Compute Q targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        return Q_targets

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones,deltas = experiences

        Q_targets = self.get_q_target(next_states,rewards,gamma,dones)

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss

        loss = self.weighted_mse_loss(Q_expected, Q_targets,deltas)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
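# A quick numeric check of the soft update rule above,
# theta_target <- tau * theta_local + (1 - tau) * theta_target, on two
# throwaway linear layers: with tau = 1e-3 the target moves only a thousandth
# of the way toward the local weights per call.
import torch
import torch.nn as nn

tau = 1e-3
local, target = nn.Linear(2, 2, bias=False), nn.Linear(2, 2, bias=False)
with torch.no_grad():
    local.weight.fill_(1.0)
    target.weight.fill_(0.0)

for t_param, l_param in zip(target.parameters(), local.parameters()):
    t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

print(target.weight[0, 0].item())        # ~0.001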
class Evaluator(object):
    def __init__(self, args, shared_value, share_net):
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        self.alpha = np.exp(self.log_alpha_share.detach().item()) if args.alpha == 'auto' else 0

        self.evaluation_interval = 50000
        self.max_state_num_evaluated_in_an_episode = 500
        self.episode_num_to_run = 10
        self.iteration_history = []
        self.evaluated_Q_mean_history=[]
        self.true_gamma_return_mean_history=[]
        # self.n_episodes_info_history = []
        self.evaluated_Q_history = []
        self.true_gamma_return_history = []

    def run_an_episode(self):
        state_list = []
        action_list = []
        log_prob_list = []
        reward_list = []
        evaluated_Q_list = []
        done = 0
        state = self.env.reset()
        while not done and len(reward_list) < self.args.max_step:
            state_tensor = torch.FloatTensor(state.copy()).float().to(self.device)
            u, log_prob = self.actor.get_action(state_tensor.unsqueeze(0), self.args.stochastic_actor)
            state_list.append(state.copy())
            action_list.append(u.copy())
            log_prob_list.append(log_prob)
            if self.args.double_Q and not self.args.double_actor:
                q = torch.min(
                    self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0],
                    self.Q_net2(state_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0])
            else:
                q = self.Q_net1(state_tensor.unsqueeze(0), torch.FloatTensor(u.copy()).to(self.device))[0]
            evaluated_Q_list.append(q.detach().item())
            u = u.squeeze(0)
            state, reward, done, load_action = self.env.step(u)
            # self.env.render(mode='human')
            reward_list.append(reward * self.args.reward_scale)
        entropy_list = list(-self.alpha * np.array(log_prob_list))
        true_gamma_return_list = cal_gamma_return_of_an_episode(reward_list, entropy_list, self.args.gamma)
        episode_return = sum(reward_list)
        episode_len = len(reward_list)

        return dict(state_list=np.array(state_list),
                    action_list=np.array(action_list),
                    log_prob_list=np.array(log_prob_list),
                    reward_list=np.array(reward_list),
                    evaluated_Q_list=np.array(evaluated_Q_list),
                    true_gamma_return_list=true_gamma_return_list,
                    episode_return=episode_return,
                    episode_len=episode_len)

    def run_n_episodes(self, n, max_state):
        n_episode_state_list = []
        n_episode_action_list = []
        n_episode_log_prob_list = []
        n_episode_reward_list = []
        n_episode_evaluated_Q_list = []
        n_episode_true_gamma_return_list = []
        n_episode_return_list = []
        n_episode_len_list = []
        for _ in range(n):
            episode_info = self.run_an_episode()
            n_episode_state_list.append(episode_info['state_list'])
            n_episode_action_list.append(episode_info['action_list'])
            n_episode_log_prob_list.append(episode_info['log_prob_list'])
            n_episode_reward_list.append(episode_info['reward_list'])
            n_episode_evaluated_Q_list.append(episode_info['evaluated_Q_list'])
            n_episode_true_gamma_return_list.append(episode_info['true_gamma_return_list'])
            n_episode_return_list.append(episode_info['episode_return'])
            n_episode_len_list.append(episode_info['episode_len'])

        #n_episode_evaluated_Q_list_history = list(map(lambda x: x['n_episode_evaluated_Q_list'], n_episodes_info_history))
        #n_episode_true_gamma_return_list_history = list(map(lambda x: x['n_episode_true_gamma_return_list'], n_episodes_info_history))

        def concat_interest_epi_part_of_one_ite_and_mean(list_of_n_epi):
            tmp = list(copy.deepcopy(list_of_n_epi))
            tmp[0] = tmp[0] if len(tmp[0]) <= max_state else tmp[0][:max_state]

            def reduce_func(a, b):
                return np.concatenate([a, b]) if len(b) < max_state else np.concatenate([a, b[:max_state]])

            interest_epi_part_of_one_ite = reduce(reduce_func, tmp)
            return sum(interest_epi_part_of_one_ite) / len(interest_epi_part_of_one_ite)

        evaluated_Q_mean = concat_interest_epi_part_of_one_ite_and_mean(np.array(n_episode_evaluated_Q_list))

        true_gamma_return_mean = concat_interest_epi_part_of_one_ite_and_mean(
            np.array(n_episode_true_gamma_return_list))
        return evaluated_Q_mean, true_gamma_return_mean
        # return dict(n_episode_state_list=np.array(n_episode_state_list),
        #             n_episode_action_list=np.array(n_episode_action_list),
        #             n_episode_log_prob_list=np.array(n_episode_log_prob_list),
        #             n_episode_reward_list=np.array(n_episode_reward_list),
        #             n_episode_evaluated_Q_list=np.array(n_episode_evaluated_Q_list),
        #             n_episode_true_gamma_return_list=np.array(n_episode_true_gamma_return_list),
        #             n_episode_return_list=np.array(n_episode_return_list),
        #             n_episode_len_list=np.array(n_episode_len_list))

    def run(self):
        while not self.stop_sign.value:
            if self.iteration_counter.value % self.evaluation_interval == 0:
                self.alpha = np.exp(self.log_alpha_share.detach().item()) if self.args.alpha == 'auto' else 0
                self.iteration = self.iteration_counter.value
                self.actor.load_state_dict(self.actor_share.state_dict())
                self.Q_net1.load_state_dict(self.Q_net1_share.state_dict())
                self.Q_net2.load_state_dict(self.Q_net2_share.state_dict())
                evaluated_Q_mean, true_gamma_return_mean = self.run_n_episodes(self.episode_num_to_run,self.max_state_num_evaluated_in_an_episode)
                self.iteration_history.append(self.iteration)
                self.evaluated_Q_mean_history.append(evaluated_Q_mean)
                self.true_gamma_return_mean_history.append(true_gamma_return_mean)
                print('Saving evaluation results of the {} iteration.'.format(self.iteration))
                np.save('./' + self.args.env_name + '/method_' + str(self.args.method) + '/result/iteration_evaluation',
                        np.array(self.iteration_history))
                np.save('./' + self.args.env_name + '/method_' + str(self.args.method) + '/result/evaluated_Q_mean',
                        np.array(self.evaluated_Q_mean_history))
                np.save('./' + self.args.env_name + '/method_' + str(
                    self.args.method) + '/result/true_gamma_return_mean',
                        np.array(self.true_gamma_return_mean_history))

                plot_online(self.args.env_name, self.args.method, self.args.method_name,
                            self.max_state_num_evaluated_in_an_episode)