Example #1
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu") 

        self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target entropy = -dim(A) (e.g., -6 for HalfCheetah-v2), as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
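
All three examples copy the freshly initialized critic into the target critic via a hard_update helper that is not shown on this page. A minimal sketch of what that helper presumably looks like (an assumption, following the common PyTorch SAC reference implementations):

import torch

def hard_update(target: torch.nn.Module, source: torch.nn.Module) -> None:
    # Copy every parameter of `source` into `target` so that both
    # networks start training from identical weights.
    with torch.no_grad():
        for target_param, source_param in zip(target.parameters(),
                                              source.parameters()):
            target_param.data.copy_(source_param.data)
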
Example #2
    def __init__(
            self,
            env,
            train_tasks,
            eval_tasks,
            latent_dim,
            nets,
            policy_lr=3e-4,
            qf_lr=1e-3,
            vf_lr=1e-3,
            alpha=1,
            automatic_entropy_tuning=True,
            lr=3e-4,
            context_lr=3e-4,
            kl_lambda=1.,
            policy_mean_reg_weight=1e-3,
            policy_std_reg_weight=1e-3,
            policy_pre_activation_weight=0.,
            optimizer_class=optim.Adam,
            recurrent=False,
            use_information_bottleneck=True,
            sparse_rewards=False,

            soft_target_tau=0.005,  # earlier value: 1e-2
            plotter=None,
            render_eval_paths=False,
            **kwargs):
        super().__init__(env=env,
                         agent=nets[0],
                         train_tasks=train_tasks,
                         eval_tasks=eval_tasks,
                         **kwargs)

        self.soft_target_tau = soft_target_tau
        self.policy_mean_reg_weight = policy_mean_reg_weight
        self.policy_std_reg_weight = policy_std_reg_weight
        self.policy_pre_activation_weight = policy_pre_activation_weight
        self.plotter = plotter
        self.render_eval_paths = render_eval_paths
        self.alpha = alpha
        self.automatic_entropy_tuning = automatic_entropy_tuning

        self.recurrent = recurrent
        self.latent_dim = latent_dim
        # self.qf_criterion = nn.MSELoss()
        # self.vf_criterion = nn.MSELoss()
        self.vib_criterion = nn.MSELoss()
        self.l2_reg_criterion = nn.MSELoss()
        self.kl_lambda = kl_lambda

        self.use_information_bottleneck = use_information_bottleneck
        self.sparse_rewards = sparse_rewards

        # self.qf1, self.qf2, self.vf = nets[1:]
        self.critic, self.critic_target = nets[1:]  # (q1, q2) and (target q1, target q2)
        # self.target_vf = self.vf.copy()
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr)
        hard_update(self.critic_target, self.critic)
        self.policy_optimizer = Adam(self.agent.policy.parameters(),
                                     lr=policy_lr)

        # torch.prod(input) returns the product of all elements, so this is
        # -dim(A), the target entropy from the SAC paper.
        self.target_entropy = -torch.prod(
            torch.Tensor(env.action_space.shape).to("cuda")).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device="cuda")
        self.alpha_optim = Adam([self.log_alpha], lr=lr)

        # self.policy_optimizer = optimizer_class(
        #     self.agent.policy.parameters(),
        #     lr=policy_lr,
        # )
        # self.qf1_optimizer = optimizer_class(
        #     self.qf1.parameters(),
        #     lr=qf_lr,
        # )
        # self.qf2_optimizer = optimizer_class(
        #     self.qf2.parameters(),
        #     lr=qf_lr,
        # )
        # self.vf_optimizer = optimizer_class(
        #     self.vf.parameters(),
        #     lr=vf_lr,
        # )
        self.context_optimizer = optimizer_class(
            self.agent.context_encoder.parameters(),
            lr=context_lr,
        )
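
Example #2 only sets up log_alpha, target_entropy, and alpha_optim; the temperature update itself happens later in the training step, which is not part of this snippet. A hedged sketch of the standard SAC temperature objective these attributes feed into (log_pi is assumed to be the log-probability of the action sampled from the current policy):

import torch

def temperature_loss(log_alpha: torch.Tensor,
                     log_pi: torch.Tensor,
                     target_entropy: float) -> torch.Tensor:
    # Standard SAC temperature objective: alpha grows when the average
    # policy entropy (-log_pi) drops below target_entropy and shrinks
    # otherwise. log_pi is detached so only log_alpha receives gradients.
    return -(log_alpha * (log_pi + target_entropy).detach()).mean()

In the training loop this loss would be minimized with alpha_optim, and the current temperature recovered as log_alpha.exp().
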
Example #3
    def __init__(self,
                 action_space,
                 policy: str = "Gaussian",
                 gamma: float = 0.99,
                 tau: float = 0.005,
                 lr: float = 0.0003,
                 alpha: float = 0.2,
                 automatic_temperature_tuning: bool = False,
                 batch_size: int = 256,
                 hidden_size: int = 256,
                 target_update_interval: int = 1,
                 input_dim: int = 32):

        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha
        self.lr = lr

        self.policy_type = policy
        self.target_update_interval = target_update_interval
        self.automatic_temperature_tuning = automatic_temperature_tuning

        self.input_dim = input_dim
        self.hidden_size = hidden_size
        self.bs = batch_size

        self.critic = QNetwork(input_dim, action_space.shape[0],
                               hidden_size).to(DEVICE)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(input_dim, action_space.shape[0],
                                      hidden_size).to(DEVICE)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            if self.automatic_temperature_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(DEVICE)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=DEVICE)
                self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

            self.policy = GaussianPolicy(input_dim, action_space.shape[0],
                                         hidden_size).to(DEVICE)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)

        else:
            self.alpha = 0
            self.automatic_temperature_tuning = False
            self.policy = DeterministicPolicy(input_dim, action_space.shape[0],
                                              hidden_size).to(DEVICE)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)

        settings = (
            f"INITIALIZING SAC ALGORITHM WITH {self.policy_type} POLICY"
            f"\nRunning on: {DEVICE}"
            f"\nSettings: Automatic Temperature tuning = {self.automatic_temperature_tuning}, Update Interval = {self.target_update_interval}"
            f"\nParameters: Learning rate = {self.lr}, Batch Size = {self.bs} Gamma = {self.gamma}, Tau = {self.tau}, Alpha = {self.alpha}"
            f"\nArchitecture: Input dimension = {self.input_dim}, Hidden layer dimension = {self.hidden_size}"
            "\n--------------------------")

        print(settings)
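
Besides hard_update, all three constructors store a tau (or soft_target_tau) and a target_update_interval that are only used later, when the target critic is updated softly. For completeness, a sketch of the conventional soft_update counterpart (again an assumption; the helper is not shown in these snippets):

import torch

def soft_update(target: torch.nn.Module, source: torch.nn.Module,
                tau: float) -> None:
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    with torch.no_grad():
        for target_param, source_param in zip(target.parameters(),
                                              source.parameters()):
            target_param.data.mul_(1.0 - tau)
            target_param.data.add_(tau * source_param.data)
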