Example 1
 def __init__(self):
     AgentBase.__init__(self)
     self.ClassAct = ShareBiConv
     self.ClassCri = self.ClassAct  # the critic class is the actor class: one shared ShareBiConv network
     self.if_use_cri_target = True
     self.if_use_act_target = True
     self.obj_critic = (-np.log(0.5))**0.5  # for reliable_lambda
Example 2
 def init(self,
          net_dim=256,
          state_dim=8,
          action_dim=2,
          reward_scale=1.0,
          gamma=0.99,
          learning_rate=1e-4,
          if_per_or_gae=False,
          env_num=1,
          gpu_id=0):
     AgentBase.init(
         self,
         net_dim=net_dim,
         state_dim=state_dim,
         action_dim=action_dim,
         reward_scale=reward_scale,
         gamma=gamma,
         learning_rate=learning_rate,
         if_per_or_gae=if_per_or_gae,
         env_num=env_num,
         gpu_id=gpu_id,
     )
     if if_per_or_gae:  # if_use_per
         self.criterion = torch.nn.MSELoss(reduction='none')
         self.get_obj_critic = self.get_obj_critic_per
     else:
         self.criterion = torch.nn.MSELoss(reduction='mean')
         self.get_obj_critic = self.get_obj_critic_raw
Example 3
 def init(self,
          net_dim=256,
          state_dim=8,
          action_dim=2,
          reward_scale=1.0,
          gamma=0.99,
          learning_rate=1e-4,
          if_per_or_gae=False,
          env_num=1,
          gpu_id=0):
     """
     Call ``self.init()`` explicitly to overwrite the attributes set in ``__init__()`` when using multiprocessing.
     """
     AgentBase.init(
         self,
         net_dim=net_dim,
         state_dim=state_dim,
         action_dim=action_dim,
         reward_scale=reward_scale,
         gamma=gamma,
         learning_rate=learning_rate,
         if_per_or_gae=if_per_or_gae,
         env_num=env_num,
         gpu_id=gpu_id,
     )
     if if_per_or_gae:  # if_use_per
         self.criterion = torch.nn.SmoothL1Loss(reduction='none')
         self.get_obj_critic = self.get_obj_critic_per
     else:
         self.criterion = torch.nn.SmoothL1Loss(reduction='mean')
         self.get_obj_critic = self.get_obj_critic_raw
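
A minimal sketch (tensor names such as is_weights are illustrative, not taken from the examples above) of why the PER branch uses reduction='none': the element-wise TD errors are weighted by importance-sampling weights and can be reused as new priorities, whereas the non-PER branch only needs the mean loss.

    import torch

    criterion = torch.nn.SmoothL1Loss(reduction='none')
    q = torch.tensor([1.0, 2.0, 3.0])           # predicted Q-values for a mini-batch
    q_label = torch.tensor([1.5, 1.0, 3.5])     # TD targets
    is_weights = torch.tensor([0.7, 1.0, 0.9])  # importance-sampling weights from the buffer

    td_error = criterion(q, q_label)             # per-sample loss, shape (3,)
    obj_critic = (td_error * is_weights).mean()  # weighted critic objective
    new_priorities = td_error.detach()           # typically written back to the replay buffer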
Example 4
 def __init__(self):
     AgentBase.__init__(self)
     self.ClassAct = ActorBiConv
     self.ClassCri = CriticBiConv
     self.if_use_cri_target = False
     self.if_use_act_target = False
     self.explore_noise = 2**-8
     self.obj_critic = (-np.log(0.5))**0.5  # for reliable_lambda
Example 5
    def init(self,
             net_dim=256,
             state_dim=8,
             action_dim=2,
             reward_scale=1.0,
             gamma=0.99,
             learning_rate=1e-4,
             if_per_or_gae=False,
             env_num=1,
             gpu_id=0):
        AgentBase.init(
            self,
            net_dim=net_dim,
            state_dim=state_dim,
            action_dim=action_dim,
            reward_scale=reward_scale,
            gamma=gamma,
            learning_rate=learning_rate,
            if_per_or_gae=if_per_or_gae,
            env_num=env_num,
            gpu_id=gpu_id,
        )
        self.act = self.cri = self.ClassAct(net_dim, state_dim,
                                            action_dim).to(self.device)
        if self.if_use_act_target:
            self.act_target = self.cri_target = deepcopy(self.act)
        else:
            self.act_target = self.cri_target = self.act

        self.cri_optim = torch.optim.Adam(
            [
                {'params': self.act.enc_s.parameters(), 'lr': learning_rate * 1.25},
                {'params': self.act.enc_a.parameters()},
                {'params': self.act.mid_n.parameters(), 'lr': learning_rate * 1.25},
                {'params': self.act.dec_a.parameters()},
                {'params': self.act.dec_q.parameters()},
            ],
            lr=learning_rate,
        )
        self.act_optim = self.cri_optim

        if if_per_or_gae:  # if_use_per
            self.criterion = torch.nn.MSELoss(reduction='none')
            self.get_obj_critic = self.get_obj_critic_per
        else:
            self.criterion = torch.nn.MSELoss(reduction='mean')
            self.get_obj_critic = self.get_obj_critic_raw
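
A small sketch, using throwaway nn.Linear modules instead of the shared network above, of the per-parameter-group behavior this optimizer relies on: groups that omit 'lr' inherit the default lr passed to Adam, while the encoder and middle groups get a 1.25x boosted rate.

    import torch
    import torch.nn as nn

    enc_s = nn.Linear(8, 32)   # stand-in for the state encoder
    dec_q = nn.Linear(32, 1)   # stand-in for the Q-value decoder
    learning_rate = 1e-4

    optim = torch.optim.Adam(
        [
            {'params': enc_s.parameters(), 'lr': learning_rate * 1.25},  # boosted group
            {'params': dec_q.parameters()},                              # inherits the default lr
        ],
        lr=learning_rate,
    )
    assert optim.param_groups[0]['lr'] == learning_rate * 1.25
    assert optim.param_groups[1]['lr'] == learning_rate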
Example 6
    def __init__(self):
        AgentBase.__init__(self)
        self.ClassAct = Actor
        self.ClassCri = Critic
        self.if_use_cri_target = True
        self.if_use_act_target = True

        self.explore_noise = 0.3  # explore noise of action (OrnsteinUhlenbeckNoise)
        self.ou_noise = None
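
Since ou_noise is only declared here, a hedged sketch of the Ornstein-Uhlenbeck process it usually refers to; the theta default, the sigma of 0.3 matching explore_noise, and the simplified unit-timestep update are assumptions, not this library's exact class.

    import numpy as np

    class OrnsteinUhlenbeckNoise:
        """Temporally correlated exploration noise for deterministic-policy agents (DDPG)."""

        def __init__(self, size, theta=0.15, sigma=0.3):
            self.theta = theta    # mean-reversion strength
            self.sigma = sigma    # scale of the Gaussian perturbation
            self.noise = np.zeros(size, dtype=np.float32)

        def __call__(self):
            # simplified discrete-time OU update with dt = 1, reverting toward zero
            gaussian = self.sigma * np.random.randn(*self.noise.shape).astype(np.float32)
            self.noise += -self.theta * self.noise + gaussian
            return self.noise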
Example 7
    def __init__(self):
        AgentBase.__init__(self)
        self.ClassAct = Actor
        self.ClassCri = CriticTwin
        self.if_use_cri_target = True
        self.if_use_act_target = True

        self.explore_noise = 0.1  # standard deviation of exploration noise
        self.policy_noise = 0.2  # standard deviation of policy noise
        self.update_freq = 2  # delay update frequency
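
The policy_noise and update_freq fields point at TD3's tricks: target-policy smoothing and delayed actor updates (the actor is updated only every update_freq critic updates). A hedged sketch of the smoothing step; act_target, the 0.5 noise clip, and the [-1, 1] action range are assumptions.

    import torch

    def smoothed_target_action(act_target, next_state, policy_noise=0.2, noise_clip=0.5):
        # deterministic action from the target actor, perturbed by clipped Gaussian noise
        action = act_target(next_state)
        noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
        return (action + noise).clamp(-1.0, 1.0)   # keep the smoothed action in the valid range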
Example 8
    def __init__(self):
        AgentBase.__init__(self)
        self.ClassAct = ActorPPO
        self.ClassCri = CriticPPO

        self.if_off_policy = False
        self.ratio_clip = 0.2  # could be 0.00 ~ 0.50 ratio.clamp(1 - clip, 1 + clip)
        self.lambda_entropy = 0.02  # could be 0.00~0.10
        self.lambda_a_value = 1.00  # could be 0.25~8.00, the lambda of advantage value
        self.lambda_gae_adv = 0.98  # could be 0.95~0.99, GAE (Generalized Advantage Estimation. ICLR.2016.)
        self.get_reward_sum = None  # self.get_reward_sum_gae if if_use_gae else self.get_reward_sum_raw
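
For reference, a sketch of the clipped surrogate objective that ratio_clip parameterizes; this is the standard PPO form and the tensor names are illustrative, not necessarily this class's exact code.

    import torch

    def ppo_clip_objective(logprob_new, logprob_old, advantage, ratio_clip=0.2):
        ratio = (logprob_new - logprob_old).exp()                       # pi_new(a|s) / pi_old(a|s)
        surr1 = ratio * advantage
        surr2 = ratio.clamp(1.0 - ratio_clip, 1.0 + ratio_clip) * advantage
        return torch.min(surr1, surr2).mean()                           # maximize (negate to use as a loss)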
Example 9
    def __init__(self):
        AgentBase.__init__(self)
        self.ClassCri = CriticTwin
        self.ClassAct = ActorSAC
        self.if_use_cri_target = True
        self.if_use_act_target = False

        self.alpha_log = None
        self.alpha_optim = None
        self.target_entropy = None
        self.obj_critic = (-np.log(0.5))**0.5  # for reliable_lambda
Example 10
 def __init__(self):
     AgentBase.__init__(self)
     self.ClassCri = DiscreteCriSAC
     self.ClassAct = DiscreteActSAC
     self.train_reward = []
     self.if_use_cri_target = True
     self.if_use_act_target = False
     self.trajectory_list = []
     self.alpha_log = None
     self.alpha_optim = None
     self.target_entropy = None
     self.obj_critic = (-np.log(0.5))**0.5  # for reliable_lambda
     self.train_iteration = 0
Example 11
    def init(self, net_dim=256, state_dim=8, action_dim=2, reward_scale=1.0, gamma=0.99,
             learning_rate=1e-4, if_per_or_gae=False, env_num=1, gpu_id=0):
        """
        Call ``self.init()`` explicitly to overwrite the attributes set in ``__init__()`` when using multiprocessing.
        """
        AgentBase.init(self, net_dim=net_dim, state_dim=state_dim, action_dim=action_dim,
                       reward_scale=reward_scale, gamma=gamma,
                       learning_rate=learning_rate, if_per_or_gae=if_per_or_gae,
                       env_num=env_num, gpu_id=gpu_id, )
        self.traj_list = [list() for _ in range(env_num)]
        self.env_num = env_num

        if if_per_or_gae:  # if_use_gae
            self.get_reward_sum = self.get_reward_sum_gae
        else:
            self.get_reward_sum = self.get_reward_sum_raw
        if env_num == 1:
            self.explore_env = self.explore_one_env
        else:
            self.explore_env = self.explore_vec_env
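
A sketch of the Generalized Advantage Estimation recursion that get_reward_sum_gae presumably implements; terminal masking is omitted for brevity and the bootstrap next_value is an assumption.

    import torch

    def gae_advantages(rewards, values, next_value, gamma=0.99, lambda_gae=0.98):
        # backward recursion: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t);
        # A_t = delta_t + gamma * lambda * A_{t+1}
        advantages = torch.zeros_like(rewards)
        gae = 0.0
        for t in reversed(range(rewards.shape[0])):
            delta = rewards[t] + gamma * next_value - values[t]
            gae = delta + gamma * lambda_gae * gae
            advantages[t] = gae
            next_value = values[t]
        return advantages, advantages + values   # advantages and critic targets (returns)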
Example 12
    def init(self,
             net_dim=256,
             state_dim=8,
             action_dim=2,
             reward_scale=1.0,
             gamma=0.99,
             learning_rate=1e-4,
             if_per_or_gae=False,
             env_num=1,
             gpu_id=0):
        """
        Call ``self.init()`` explicitly to overwrite the attributes set in ``__init__()`` when using multiprocessing.
        """
        AgentBase.init(
            self,
            net_dim=net_dim,
            state_dim=state_dim,
            action_dim=action_dim,
            reward_scale=reward_scale,
            gamma=gamma,
            learning_rate=learning_rate,
            if_per_or_gae=if_per_or_gae,
            env_num=env_num,
            gpu_id=gpu_id,
        )

        self.alpha_log = torch.tensor(
            (-np.log(action_dim) * np.e, ),
            dtype=torch.float32,
            requires_grad=True,
            device=self.device)  # trainable parameter
        self.alpha_optim = torch.optim.Adam((self.alpha_log, ),
                                            lr=learning_rate)
        self.target_entropy = np.log(action_dim)

        if if_per_or_gae:  # if_use_per
            self.criterion = torch.nn.SmoothL1Loss(reduction='none')
            self.get_obj_critic = self.get_obj_critic_per
        else:
            self.criterion = torch.nn.SmoothL1Loss(reduction='mean')
            self.get_obj_critic = self.get_obj_critic_raw
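
alpha_log, alpha_optim and target_entropy are only created here; a hedged sketch of the standard SAC temperature update they usually feed into. The sign convention and the sample log-probabilities are assumptions, and implementations differ in the details.

    import numpy as np
    import torch

    action_dim = 2
    alpha_log = torch.zeros(1, requires_grad=True)          # trainable log-temperature
    alpha_optim = torch.optim.Adam((alpha_log,), lr=1e-4)
    target_entropy = np.log(action_dim)                     # desired policy entropy

    log_prob = torch.tensor([-0.4, -0.9])                   # hypothetical log pi(a|s) of sampled actions
    obj_alpha = -(alpha_log * (log_prob + target_entropy).detach()).mean()
    alpha_optim.zero_grad()
    obj_alpha.backward()
    alpha_optim.step()

    alpha = alpha_log.exp().item()                          # entropy weight used in the actor/critic objectives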
Example 13
    def init(self,
             net_dim=256,
             state_dim=8,
             action_dim=2,
             reward_scale=1.0,
             gamma=0.99,
             learning_rate=1e-4,
             if_per_or_gae=False,
             env_num=1,
             gpu_id=0):
        AgentBase.init(
            self,
            net_dim=net_dim,
            state_dim=state_dim,
            action_dim=action_dim,
            reward_scale=reward_scale,
            gamma=gamma,
            learning_rate=learning_rate,
            if_per_or_gae=if_per_or_gae,
            env_num=env_num,
            gpu_id=gpu_id,
        )

        #self.alpha_log = torch.tensor((-np.log(action_dim) * np.e,), dtype=torch.float32,
        #                              requires_grad=True, device=self.device)  # trainable parameter
        self.alpha_log = torch.zeros(1,
                                     dtype=torch.float32,
                                     requires_grad=True,
                                     device=self.device)
        self.alpha_optim = torch.optim.Adam((self.alpha_log, ),
                                            lr=learning_rate)
        self.target_entropy = np.log(action_dim)
        self.alpha = self.alpha_log.cpu().exp().item()  # current entropy temperature: alpha = exp(alpha_log)
        self.trajectory_list = list()
        if if_per_or_gae:  # if_use_per
            self.criterion = torch.nn.SmoothL1Loss(reduction='none')
            self.get_obj_critic = self.get_obj_critic_per
        else:
            self.criterion = torch.nn.MSELoss(reduction='mean')
            self.get_obj_critic = self.get_obj_critic_raw
Example 14
 def __init__(self):
     AgentBase.__init__(self)
     self.ClassCri = None  # set later: QNetDuel if self.if_use_dueling else QNet
     self.if_use_dueling = True  # use the dueling-DQN critic (QNetDuel) instead of the plain QNet
     self.explore_rate = 0.25  # the probability of choosing action randomly in epsilon-greedy
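
explore_rate is described as the probability of choosing an action randomly; a hedged sketch of the epsilon-greedy rule this implies. The helper name and the single-state Q-vector shape are assumptions.

    import torch

    def select_action_epsilon_greedy(q_values: torch.Tensor, explore_rate: float = 0.25) -> int:
        # q_values: shape (action_dim,), the Q-network's estimates for one state
        if torch.rand(1).item() < explore_rate:
            return int(torch.randint(q_values.shape[0], (1,)).item())  # random exploratory action
        return int(q_values.argmax().item())                           # greedy action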