Example #1
File: agent.py Project: Kavka1/RL
    def __init__(self, s_dim, a_dim, action_space, args):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.action_space = action_space
        self.lr_pi = args.lr_pi
        self.lr_q = args.lr_q
        self.gamma = args.gamma
        self.tau = args.tau
        self.noise_std = args.noise_std
        self.noise_clip = args.noise_clip
        self.batch_size = args.batch_size
        self.policy_update_interval = args.policy_update_interval
        self.device = torch.device(args.device)
        self.policy_loss_log = torch.tensor(0.).to(self.device)

        self.policy = DeterministicPolicy(self.s_dim,
                                          self.a_dim,
                                          self.device,
                                          action_space=self.action_space).to(
                                              self.device)
        self.policy_target = DeterministicPolicy(
            self.s_dim,
            self.a_dim,
            self.device,
            action_space=self.action_space).to(self.device)
        self.Q1 = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.Q1_target = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.Q2 = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.Q2_target = QFunction(self.s_dim, self.a_dim).to(self.device)
        self.hard_update_target()

        self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr_pi)
        self.optimizer_q1 = optim.Adam(self.Q1.parameters(), lr=self.lr_q)
        self.optimizer_q2 = optim.Adam(self.Q2.parameters(), lr=self.lr_q)
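
Example #1 copies the online networks into their targets via hard_update_target() and keeps tau for the later soft target updates. Those helper bodies are not shown on this page; a minimal sketch of the usual copy and Polyak-averaging updates (the names and exact signatures used in this project are assumptions):

def hard_update(target, source):
    # plain copy: the target starts from the same weights as the online network
    target.load_state_dict(source.state_dict())

def soft_update(target, source, tau):
    # Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)
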
Example #2
File: agent.py Project: Kavka1/RL
    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.action_boundary = env_params['action_boundary']

        self.lr_a = args.lr_a
        self.lr_c = args.lr_c
        self.gamma = args.gamma
        self.tau = args.tau
        self.noise_eps = args.noise_eps
        self.batch_size = args.batch_size

        self.device = torch.device(args.device)

        self.actor = DeterministicPolicy(self.o_dim,
                                         self.a_dim).to(self.device)
        self.actor_tar = DeterministicPolicy(self.o_dim,
                                             self.a_dim).to(self.device)
        self.critic = QFunction(self.o_dim, self.a_dim).to(self.device)
        self.critic_tar = QFunction(self.o_dim, self.a_dim).to(self.device)

        self.optimizer_a = optim.Adam(self.actor.parameters(), lr=self.lr_a)
        self.optimizer_c = optim.Adam(self.critic.parameters(), lr=self.lr_c)

        self.hard_update()
Example #3
    def __init__(self):

        self.rate = rospy.Rate(100)
        rospy.Subscriber('robot_0/pose', Float32MultiArray,
                         self.loc_callback_0)
        rospy.Subscriber('robot_1/pose', Float32MultiArray,
                         self.loc_callback_1)
        self.states = np.zeros(6)

        self.pub0 = rospy.Publisher('/robot_0/cmd_vel', Twist, queue_size=10)
        self.pub1 = rospy.Publisher('/robot_1/cmd_vel', Twist, queue_size=10)

        # self.listener = TransformListener()
        # self.__timer_current = rospy.Timer(rospy.Duration(0.01), self.loc)

        self.control_cmd0 = Twist()
        self.control_cmd1 = Twist()

        self.u_range = np.array([0.5, 2.0])
        self.action_space = spaces.Box(low=-self.u_range,
                                       high=+self.u_range,
                                       shape=(2, ),
                                       dtype=np.float32)
        self.attacker = DeterministicPolicy(6, 2, 256,
                                            self.action_space).to('cuda')
        self.attacker.load_state_dict(torch.load("pretrain"))
        self.defender = DeterministicPolicy(6, 2, 256,
                                            self.action_space).to('cuda')
        self.defender.load_state_dict(torch.load("d_actor"))

        if (control_mode == 1):
            self.auto()
        else:
            self.manual()
Example #4
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.action_range = [action_space.low, action_space.high]

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu") 

        self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)


            self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
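
With automatic_entropy_tuning enabled, log_alpha is trained so that the policy entropy tracks target_entropy. A minimal sketch of that temperature update as it is commonly written in PyTorch SAC implementations; log_pi stands for the log-probabilities of actions sampled during the policy update and is assumed, not shown in the snippet:

        # inside the parameter-update step, after sampling actions for the current batch
        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()
            self.alpha = self.log_alpha.exp().item()
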
Example #5
    def __init__(self, state_shape, n_actions, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.action_range = [0.0, 1.0]

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.critic = QNetwork(state_shape, n_actions, args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(state_shape, n_actions, args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(torch.Tensor([n_actions]).to(self.device)).item()  # wrap n_actions in a list; torch.Tensor(int) would allocate an uninitialized tensor
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)


            self.policy = GaussianPolicy(state_shape, n_actions, args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(state_shape, n_actions, args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
Example #6
    def __init__(self, num_inputs, action_space, agent_args):

        self.gamma = agent_args["gamma"]
        self.tau = agent_args["tau"]
        self.alpha = agent_args["alpha"]

        self.policy_type = agent_args["policy"]
        self.target_update_interval = agent_args["target_update_interval"]
        self.automatic_entropy_tuning = agent_args["automatic_entropy_tuning"]

        self.device = torch.device("cuda" if agent_args["cuda"] else "cpu")

        # print("num_inputs::",num_inputs)
        # print("type(action_space)::",type(action_space))
        # print("type(action_space)::",isinstance(action_space,gym.spaces.discrete.Discrete))
        # print(" agent_args['hidden_size']::", agent_args["hidden_size"])

        if isinstance(action_space, gym.spaces.discrete.Discrete):
            action_shape = action_space.n
        else:
            action_shape = action_space.shape[0]
        self.critic = QNetwork(
            num_inputs, action_shape,
            agent_args["hidden_size"]).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=agent_args["lr"])

        self.critic_target = QNetwork(num_inputs, action_shape,
                                      agent_args["hidden_size"]).to(
                                          self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=agent_args["lr"])

            self.policy = GaussianPolicy(num_inputs, action_shape,
                                         agent_args["hidden_size"],
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(),
                                     lr=agent_args["lr"])

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs, action_shape,
                                              agent_args["hidden_size"],
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(),
                                     lr=agent_args["lr"])
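
Example #6 reads its hyperparameters from a plain dict instead of an argparse namespace. A hypothetical agent_args covering every key the constructor accesses; the values are common SAC defaults rather than the project's own settings:

agent_args = {
    "gamma": 0.99,                     # discount factor
    "tau": 0.005,                      # target smoothing coefficient
    "alpha": 0.2,                      # initial entropy temperature
    "policy": "Gaussian",              # anything else selects DeterministicPolicy
    "target_update_interval": 1,
    "automatic_entropy_tuning": True,
    "cuda": True,
    "hidden_size": 256,
    "lr": 3e-4,
}
# agent = <AgentClass>(env.observation_space.shape[0], env.action_space, agent_args)
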
Example #7
    def __init__(self):
        self.u_range = np.array([0.5, 2.0])
        self.action_space = spaces.Box(low=-self.u_range, high=+self.u_range, shape=(2,), dtype=np.float32)
        self.observation_space = spaces.Box(low=-np.inf, high=+np.inf, shape=(6,), dtype=np.float32)

        a = np.array([-1 + np.random.rand() * 0.4 - 0.2, np.random.rand() * 0.8 - 0.4, 0])
        d = np.array([0.5 + np.random.rand() * 0.4 - 0.2, np.random.rand() * 0.4 - 0.2, np.random.rand() * 0.4 - 0.2])
        self.initial_conditions = np.array([a, d]).T
        self.agents = robotarium.Robotarium(number_of_robots=2, show_figure=False, initial_conditions=self.initial_conditions, sim_in_real_time=False)
        self.states = self.agents.get_poses().T
        self.agents.step()
        self._max_episode_steps = 400
        self.times = 0
        self.net = DeterministicPolicy(6, 2, 256, self.action_space).to('cuda')
Example #8
    def __init__(self):

        self.rate = rospy.Rate(100)
        rospy.Subscriber('gazebo/model_states', ModelStates, self.loc_callback)

        self.states = np.zeros(6)

        self.pub0 = rospy.Publisher('/tb3_0/cmd_vel', Twist, queue_size=10)
        self.pub1 = rospy.Publisher('/tb3_1/cmd_vel', Twist, queue_size=10)

        self.control_cmd0 = Twist()
        self.control_cmd1 = Twist()

        self.u_range = np.array([0.5, 2.0])
        self.action_space = spaces.Box(low=-self.u_range, high=+self.u_range, shape=(2,), dtype=np.float32)
        self.attacker = DeterministicPolicy(6, 2, 256, self.action_space).to('cuda')
        self.attacker.load_state_dict(torch.load("pretrain"))
        self.defender = DeterministicPolicy(6, 2, 256, self.action_space).to('cuda')
        self.defender.load_state_dict(torch.load("d_actor"))
        if control_mode == 1:
            self.auto()
        else:
            self.manual()
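
The auto() branch is not shown in this excerpt. A rough sketch of what a control loop driven by the two loaded policies could look like; the sample() signature, the shared 6-D state for both robots, and the mapping of the 2-D action onto (linear, angular) velocity are all assumptions, not taken from the project:

    def auto(self):
        while not rospy.is_shutdown():
            obs = torch.FloatTensor(self.states).unsqueeze(0).to('cuda')
            with torch.no_grad():
                a_act, _, _ = self.attacker.sample(obs)  # assumed to return (action, log_prob, mean)
                d_act, _, _ = self.defender.sample(obs)
            a_act = a_act.cpu().numpy().flatten()
            d_act = d_act.cpu().numpy().flatten()
            self.control_cmd0.linear.x, self.control_cmd0.angular.z = a_act
            self.control_cmd1.linear.x, self.control_cmd1.angular.z = d_act
            self.pub0.publish(self.control_cmd0)
            self.pub1.publish(self.control_cmd1)
            self.rate.sleep()
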
Example #9
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        # Q network: yields a value estimate for the (s_t, a_t) pair
        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        # a kind of replica: because of the recursive Bellman definition, the Q network learns from itself, which is unstable
        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        # both networks start with the same weights.
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # todo: crunch on this automatic alpha update
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            # instantiate the policy: given a state it produces a distribution over actions
            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            # todo: what's the difference between deterministic and Gaussian policies
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
Example #10
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma  # discount factor γ
        self.tau = args.tau  # target smoothing coefficient τ
        self.alpha = args.alpha  # entropy temperature α

        self.policy_type = args.policy  # policy type: Gaussian (stochastic) or deterministic
        self.target_update_interval = args.target_update_interval  # target network update interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning  # automatic entropy (temperature) tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs,
                               action_space.shape[0], args.hidden_size).to(
                                   device=self.device)  # critic (Q) network
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(
                                          self.device)  # target Q network
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(
                        self.device)).item()  # torch.prod(input): product of all elements
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              args.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
Example #11
    def __init__(self, num_inputs, action_space, args):

        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            self.alpha = args.alpha
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)
            else:
                pass

            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs,
                                              self.action_space,
                                              args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size)
            hard_update(self.critic_target, self.critic)
Example #12
    def __init__(self, num_inputs, action_space, variant):

        self.gamma = variant['gamma']
        self.tau = variant['tau']
        self.alpha = variant['alpha']
        self.policy_type = variant['policy_type']
        self.target_update_interval = variant['target_update_interval']
        self.automatic_entropy_tuning = variant['automatic_entropy_tuning']
        self.lr = variant.get("lr", 1e-3)

        self.device = torch.device("cuda" if variant['cuda'] else "cpu")
        self.hidden_size = variant.get('hidden_size', [128, 128])

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               self.hidden_size).to(self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      self.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == 'Gaussian':
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         self.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              self.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)
Example #13
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.Qapproximation = args.Qapproximation
        try:
            self.filter = args.filter
        except AttributeError:
            self.filter = 'none'
        try:
            self.TDfilter = args.TDfilter
        except AttributeError:
            self.TDfilter = 'none'

        if args.Qapproximation == 'fourier':
            self.critic = Qfourier(num_inputs,
                                   action_space.shape[0],
                                   256,
                                   action_space,
                                   gridsize=20).to(device=self.device)
            # target doesn't need to filter Q in high frequencies
            self.critic_target = Qfourier(num_inputs,
                                          action_space.shape[0],
                                          256,
                                          action_space,
                                          gridsize=20).to(device=self.device)

        if args.Qapproximation == 'byactiondim':
            self.critic = Qbyactiondim(num_inputs, action_space.shape[0], 256,
                                       8, 5,
                                       action_space).to(device=self.device)
            self.critic_target = Qbyactiondim(
                num_inputs, action_space.shape[0], 256, 8, 5,
                action_space).to(device=self.device)

#        self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        #        self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              args.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
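
The try/except blocks near the top of Example #13 only guard against missing attributes on args; getattr with a default expresses the same intent in one line (an alternative formulation, not what the project uses):

        self.filter = getattr(args, 'filter', 'none')
        self.TDfilter = getattr(args, 'TDfilter', 'none')
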
Example #14
# # time = 120
# print(x.shape)
torch_dataset = Data.TensorDataset(x, u)

BATCH_SIZE = 200
loader = Data.DataLoader(
    dataset=torch_dataset,  # torch TensorDataset format
    batch_size=BATCH_SIZE,  # mini batch size
    shuffle=True,  # shuffle the data each epoch (recommended)
)

action_space = spaces.Box(low=-np.array([0.5, 2.0]),
                          high=+np.array([0.5, 2.0]),
                          shape=(2, ),
                          dtype=np.float32)
net = DeterministicPolicy(6, 2, 256, action_space).to('cuda')

optimizer = torch.optim.SGD(net.parameters(), lr=0.05)
loss_func = torch.nn.MSELoss()  # this is for regression mean squared loss

for epoch in range(1000):  # loop over the full dataset once per epoch
    for step, (batch_x,
               batch_u) in enumerate(loader):  # the loader yields one mini-batch per step
        # print(batch_x.shape)
        prediction = net(batch_x.cuda())  # input x and predict based on x

        loss = loss_func(prediction,
                         batch_u.cuda())  # must be (1. nn output, 2. target)

        optimizer.zero_grad()  # clear gradients for next train
        loss.backward()  # backpropagation, compute gradients
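
The loop above assumes tensors x (states) and u (target controls) already exist, and the excerpt stops right after loss.backward(), so an optimizer.step() call presumably follows to apply the gradients. A purely illustrative way to build x and u from logged NumPy arrays; the file names and shapes are assumptions:

import numpy as np
import torch

states = np.load('states.npy')      # assumed file, shape (N, 6) observations
controls = np.load('controls.npy')  # assumed file, shape (N, 2) expert actions
x = torch.as_tensor(states, dtype=torch.float32)
u = torch.as_tensor(controls, dtype=torch.float32)
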
Example #15
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy_type
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        self.sysmodel = SysModel(num_inputs, action_space.shape[0],
                                 args.sys_hidden_size,
                                 args.sys_hidden_size).to(self.device)
        self.sysmodel_optimizer = Adam(self.sysmodel.parameters(), lr=args.lr)

        self.obs_upper_bound = 0  #state space upper bound
        self.obs_lower_bound = 0  #state space lower bound

        self.sysr = Sys_R(num_inputs, action_space.shape[0],
                          args.sysr_hidden_size,
                          args.sysr_hidden_size).to(self.device)
        self.sysr_optimizer = torch.optim.Adam(self.sysr.parameters(),
                                               lr=args.lr)

        self.sys_threshold = args.sys_threshold
        self.sys_weight = args.sys_weight
        self.sysmodel_loss = 0
        self.sysr_loss = 0

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              args.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
Example #16
    def __init__(self,
                 num_inputs,
                 action_space,
                 args,
                 process_obs=None,
                 opt_level='O1'):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.device = torch.device("cuda" if args.cuda else "cpu")
        self.dtype = torch.float

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.process_obs = process_obs.to(self.device).to(self.dtype)
        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device).to(
                                   self.dtype)
        self.critic_optim = Adam(list(self.critic.parameters()) +
                                 list(process_obs.parameters()),
                                 lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device).to(
                                          self.dtype)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device,
                                             dtype=self.dtype)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size, action_space).to(
                                             self.device).to(self.dtype)
            self.policy_optim = Adam(list(self.policy.parameters()) +
                                     list(process_obs.parameters()),
                                     lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(
                num_inputs, action_space.shape[0], args.hidden_size,
                action_space).to(self.device).to(self.dtype)
            self.policy_optim = Adam(list(self.policy.parameters()) +
                                     list(process_obs.parameters()),
                                     lr=args.lr)

        if opt_level is not None:
            model, optimizer = amp.initialize([
                self.policy, self.process_obs, self.critic, self.critic_target
            ], [self.policy_optim, self.critic_optim],
                                              opt_level=opt_level)
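
amp.initialize returns the prepared modules and optimizers, but here the results are bound to model and optimizer and never used again; the apex documentation recommends training with the returned objects. A sketch of assigning them back, assuming that is the intent:

        if opt_level is not None:
            models, optimizers = amp.initialize(
                [self.policy, self.process_obs, self.critic, self.critic_target],
                [self.policy_optim, self.critic_optim],
                opt_level=opt_level)
            # keep the amp-prepared objects so later forward/backward passes go through them
            (self.policy, self.process_obs,
             self.critic, self.critic_target) = models
            self.policy_optim, self.critic_optim = optimizers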