Example #1
    def load_models(self, episode):
        """
        loads the saved actor and critic models and copies their weights onto the target actor and critic models
		:param episode: the count of episodes iterated (used to find the file name)
		:return:
		## NOTE: SOURCE MODELS ARE SAVED NOW, NOT TARGET
		"""
        self.actor.load_state_dict(
            torch.load('./Models/' + str(episode) + '_actor_' + str(version) +
                       '.pt'))
        self.critic.load_state_dict(
            torch.load('./Models/' + str(episode) + '_critic_' + str(version) +
                       '.pt'))
        utils.hard_update(self.target_actor, self.actor)
        utils.hard_update(self.target_critic, self.critic)
        print('Models loaded successfully')
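
Every example on this page calls a `hard_update` helper (and the training loops usually call a matching `soft_update`) from a local utils module. As a point of reference, here is a minimal sketch of what such helpers conventionally look like in DDPG/SAC codebases, assuming the `(target, source)` argument order used in the snippets below:

import torch

def hard_update(target, source):
    # target <- source: copy every parameter verbatim (used at initialization or after loading)
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.copy_(s_param)

def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target (used every update step)
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)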
Example #2
    def __init__(self, num_inputs, action_space, args):

        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            self.alpha = args.alpha
            # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs,
                                      args.hidden_size).to(self.device)
            self.value_target = ValueNetwork(self.num_inputs,
                                             args.hidden_size).to(self.device)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs,
                                              self.action_space,
                                              args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size).to(self.device)
            hard_update(self.critic_target, self.critic)
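
The snippet above follows the original SAC formulation with a separate state-value network and its Polyak-averaged target. As a hedged, repository-agnostic sketch, `value_target` typically enters the critic update through the Bellman backup, with `mask` equal to 0 at terminal transitions:

import torch

def q_backup(reward, mask, next_state_batch, value_target, gamma):
    # Original SAC: the Q networks regress toward r + gamma * V_target(s');
    # mask is 1 for non-terminal transitions and 0 otherwise.
    with torch.no_grad():
        return reward + mask * gamma * value_target(next_state_batch)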
Example #3
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        # Q network: estimates a value for each (state, action) pair
        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        # a replica of the critic: because the Bellman target is recursive, the Q network would learn from itself, which is unstable without a target network
        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        # start both networks with the same weights.
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # todo: crunch on this automatic alpha update
            # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            # instantiate the policy: given a state, it produces a distribution over actions
            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            # todo: what's the difference between the deterministic and Gaussian policies?
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
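
The `log_alpha` / `target_entropy` / `alpha_optim` trio created above is what the "automatic alpha update" todo refers to. A hedged, repository-agnostic sketch of that update step, where `log_pi` stands for the log-probabilities returned by sampling the policy:

def tune_temperature(log_alpha, alpha_optim, log_pi, target_entropy):
    # One gradient step on the SAC temperature: minimize
    # J(alpha) = E[-log_alpha * (log_pi + target_entropy)], which raises alpha
    # whenever the policy entropy (-log_pi) falls below target_entropy.
    alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean()
    alpha_optim.zero_grad()
    alpha_loss.backward()
    alpha_optim.step()
    return log_alpha.exp().item()  # updated alpha for the actor and critic losses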
Example #4
 def load_models(self, name, test=False):
     """
     loads the target actor and critic models, and copies them onto actor and critic models
     """
     saved_state = torch.load(name,
                              map_location=lambda storage, loc: storage)
     if test:
         saved_state = {
             name: param
             for name, param in saved_state.items() if 'actor' in name
         }
         strict = False
     else:
         strict = True
     self.AC_T.load_state_dict(saved_state, strict=strict)
     utils.hard_update(self.AC, self.AC_T)
     print('Models loaded successfully')
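
For completeness, a plausible save-side counterpart to the loader above; the method name `save_models` and the single-file layout are assumptions for illustration. Since `load_models` restores `self.AC_T` from one file and then hard-updates `self.AC`, persisting only the target actor-critic's state_dict is enough:

import torch

def save_models(self, name):
    # Hypothetical counterpart to load_models: persist the target actor-critic so
    # that torch.load(name) in load_models can restore it later.
    torch.save(self.AC_T.state_dict(), name)
    print('Models saved successfully')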
Example #5
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma  #γ
        self.tau = args.tau  #τ
        self.alpha = args.alpha  #α

        self.policy_type = args.policy  # policy type: Gaussian (stochastic) or deterministic
        self.target_update_interval = args.target_update_interval  # target network update interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning  # automatic entropy tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs,
                               action_space.shape[0], args.hidden_size).to(
                                   device=self.device)  # Critic network (Q network)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(
                                          self.device)  #Target Q Network
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(
                        self.device)).item()  # torch.prod(input): product of all elements
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              args.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
Example #6
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        #Similar to Double-QNetwork
        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)
        # The two networks start with the same initialization

        # Two policy options: stochastic (Gaussian) or deterministic
        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              args.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
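
The `tau` and `target_update_interval` stored by these constructors are consumed later in the agent's update step. A self-contained, hedged sketch of that target refresh, following the usual SAC convention (`updates` is the gradient-step counter):

import torch

def refresh_target(critic_target, critic, tau, updates, target_update_interval):
    # Every target_update_interval gradient steps, Polyak-average the online
    # critic into its target: target <- tau * online + (1 - tau) * target.
    if updates % target_update_interval != 0:
        return
    with torch.no_grad():
        for t_param, s_param in zip(critic_target.parameters(), critic.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)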
Example #7
    def __init__(self, num_frame_obs, num_goal_obs, num_vel_obs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        # self.device = torch.device("cuda" if args.cuda else "cpu")
        self.device = torch.device("cuda")

        self.action_space_array = np.array(action_space)
        self.action_space = action_space
        self.critic_1 = QNetwork_1(num_frame_obs, num_goal_obs, num_vel_obs, self.action_space.shape[0], args.hidden_size).to(device=self.device)
        self.critic_1_optim = Adam(self.critic_1.parameters(), lr=args.lr)

        self.critic_1_target = QNetwork_1(num_frame_obs, num_goal_obs, num_vel_obs, self.action_space.shape[0], args.hidden_size).to(self.device)
        hard_update(self.critic_1_target, self.critic_1)

        self.critic_2 = QNetwork_2(num_frame_obs, num_goal_obs, num_vel_obs, self.action_space.shape[0], args.hidden_size).to(device=self.device)
        self.critic_2_optim = Adam(self.critic_2.parameters(), lr=args.lr)

        self.critic_2_target = QNetwork_2(num_frame_obs, num_goal_obs, num_vel_obs, self.action_space.shape[0], args.hidden_size).to(self.device)
        hard_update(self.critic_2_target, self.critic_2)


        if self.policy_type == "Gaussian":
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(torch.Tensor(self.action_space_array.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            #self.policy = GaussianPolicy(num_frame_obs, num_goal_obs, num_vel_obs, self.action_space_array.shape[0], args.hidden_size, self.action_space_array).to(self.device)
            self.policy = GaussianPolicy(num_frame_obs, num_goal_obs, num_vel_obs, self.action_space.shape[0], args.hidden_size, self.action_space).to(self.device)

            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_frame_obs, num_goal_obs, num_vel_obs, self.action_space.shape[0], args.hidden_size, self.action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
Example #8
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        tmp_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(tmp_device)

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              args.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
Example #9
    def __init__(self, act_sp, ob_sp, all_obs, all_acts, hidden_dim=64, 
                 start_steps=10000, update_after=1000, update_every=50):
        self.lr = 1e-2
        self.target_noise = 0.2
        self.target_noise_clip = 0.3
        self.act_noise = 0.1
        self.act_sp = act_sp
        self.ob_sp = ob_sp
        self.start_steps = start_steps
        self.update_after = update_after
        self.update_every = update_every
        # self.start_steps = 1
        # self.update_after = 1
        # self.update_every = 2
        print(f"act_sp: {act_sp}, ob_sp: {ob_sp}, all_obs: {all_obs}, all_acts: {all_acts}")
        self.policy = MLPNetwork(ob_sp, act_sp,
                                constrain_out=True, discrete_action=False,
                                td3_policy=True, hidden_dim=hidden_dim).to(device)
        self.policy_targ = MLPNetwork(ob_sp, act_sp,
                                constrain_out=True, discrete_action=False,
                                td3_policy=True, hidden_dim=hidden_dim).to(device)
        self.q_nets_n = 2
        self.qnets = []
        self.qnet_targs = []
        self.q_optimizers = []
        for i in range(self.q_nets_n):
            qnet = MLPNetwork(all_obs + all_acts, 1, constrain_out=False, hidden_dim=hidden_dim).to(device)
            qnet_targ = MLPNetwork(all_obs + all_acts, 1, constrain_out=False, hidden_dim=hidden_dim).to(device)
            qnet.to(device)
            qnet_targ.to(device) 
            hard_update(qnet_targ, qnet)
            self.qnets.append(qnet)
            self.qnet_targs.append(qnet_targ)
            self.q_optimizers.append(optim.Adam(qnet.parameters(), lr=self.lr))

        self.policy.to(device)
        self.p_optimizer = optim.Adam(self.policy.parameters(), lr=self.lr)

        self.policy_targ.to(device)
        self.p_targ_optimizer = optim.Adam(self.policy_targ.parameters(), lr=self.lr)

        self.action_count = 0
        self.use_warmup = True
Example #10
 def __init__(self,
             state_size,
             action_size,                
             tau,
             lr_actor,
             lr_critic,
             num_agents,
             agent_idx,
             seed,
             device,
             gamma,
             tensorboard_writer=None):
     
     self.state_size = state_size
     self.action_size = action_size
     self.tau = tau
     self.lr_actor = lr_actor
     self.lr_critic = lr_critic
     self.num_agents = num_agents
     self.agent_idx = agent_idx
     self.seed = seed       
     self.device = device
     self.gamma = gamma
     random.seed(seed)
     self.tensorboard_writer = tensorboard_writer        
     
     self.actor_local = Actor(state_size, action_size, seed)
     self.actor_target = Actor(state_size, action_size, seed)
     
     critic_state_size = (state_size + action_size) * num_agents
     
     self.critic_local = Critic(critic_state_size, seed)
     self.critic_target = Critic(critic_state_size, seed)
     
     hard_update(self.actor_local, self.actor_target)
     hard_update(self.critic_local, self.critic_target) 
     
     self.actor_optim = torch.optim.Adam(self.actor_local.parameters(), lr=lr_actor)
     self.critic_optim = torch.optim.Adam(self.critic_local.parameters(), lr=lr_critic)
     
     self.noise = OUNoise(action_size, seed)
     
     self.iteration = 0
Example #11
    def __init__(self, num_inputs, action_space, variant):

        self.gamma = variant['gamma']
        self.tau = variant['tau']
        self.alpha = variant['alpha']
        self.policy_type = variant['policy_type']
        self.target_update_interval = variant['target_update_interval']
        self.automatic_entropy_tuning = variant['automatic_entropy_tuning']
        self.lr = variant.get("lr", 1e-3)

        self.device = torch.device("cuda" if variant['cuda'] else "cpu")
        self.hidden_size = variant.get('hidden_size', [128, 128])

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               self.hidden_size).to(self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      self.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == 'Gaussian':
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         self.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              self.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)
Example #12
    def __init__(self, act_sp, ob_sp, all_obs, all_acts, hidden_dim=64):
        self.act_sp = act_sp
        self.ob_sp = ob_sp
        # print(ob_sp)
        print(f"ob_sp: {ob_sp} act_sp: {act_sp}")
        self.policy = MLPNetwork(ob_sp, act_sp, constrain_out=True, hidden_dim=hidden_dim).to(device)
        self.policy_targ = MLPNetwork(ob_sp, act_sp, constrain_out=True, hidden_dim=hidden_dim).to(device)
        self.qnet = MLPNetwork(all_obs + all_acts, 1, constrain_out=False, hidden_dim=hidden_dim).to(device)
        self.qnet_targ = MLPNetwork(all_obs + all_acts, 1, constrain_out=False, hidden_dim=hidden_dim).to(device)

        self.policy.to(device)
        self.qnet.to(device)
        self.policy_targ.to(device)
        self.qnet_targ.to(device)

        hard_update(self.policy_targ, self.policy)
        hard_update(self.qnet_targ, self.qnet)

        self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR)
        self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=LR)
Example #13
    def __init__(self, obs_space, action_space, ram, writer, device, args):
        """
        :param obs_space: observation space (provides the state dimensions)
        :param action_space: action space (provides the action dimensions and bounds)
        :param ram: replay memory buffer object
        :param writer: TensorBoard summary writer
        :param device: torch device to run on
        :param args: hyperparameter namespace
        :return:
        """
        self.state_dim = obs_space.shape[0]
        self.action_dim = action_space.shape[0]
        self.action_high = action_space.high
        self.action_low = action_space.low
        self.ram = ram
        self.iter = 1
        self.steps = 0
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.decay_rate = args.decay_rate
        self.eps_start = args.eps_start
        self.eps_end = args.eps_end
        self.eps_decay = args.eps_decay
        self.start_step = args.start_learning
        self.device = device
        self.noise = utils.OrnsteinUhlenbeckActionNoise(self.action_dim)
        self.writer = writer
        self.args = args

        # init network
        target_net = DDPG(obs_space.shape, self.action_dim, args).to(device)
        learn_net = DDPG(obs_space.shape, self.action_dim, args).to(device)
        utils.hard_update(target_net, learn_net)
        self.AC = learn_net
        self.AC_T = target_net
        self.actor_optimizer = torch.optim.Adam(
            self.AC.actor.policyNet.parameters(), args.lr_a)
        self.critic_optimizer = torch.optim.Adam(self.AC.critic.parameters(),
                                                 args.lr_c)
        self.actor = self.AC.actor
        self.target_actor = self.AC_T.actor
        self.critic = self.AC.critic
        self.target_critic = self.AC_T.critic
Example #14
    def __init__(self, config):
        self.config = config
        self.online_actor = config.actor_fn().to(self.config.device)
        self.target_actor = config.actor_fn().to(self.config.device)
        self.online_actor_opt = config.actor_opt_fn(
            self.online_actor.parameters())

        self.online_critic = config.critic_fn().to(self.config.device)
        self.target_critic = config.critic_fn().to(self.config.device)
        self.online_critic_opt = config.critic_opt_fn(
            self.online_critic.parameters())

        self.noises = [
            config.noise_fn() for _ in range(self.config.num_agents)
        ]
        self.replay = config.replay_fn()

        hard_update(self.target_actor,
                    self.online_actor)  # initialize to be equal
        hard_update(self.target_critic,
                    self.online_critic)  # initialize to be equal
Example #15
    def __init__(self, input_space, action_space, args):

        self.use_expert = args.use_expert
        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.action_range = [action_space.low, action_space.high]
        self.policy_type = args.policy

        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        # self.device = torch.device("cuda" if args.cuda else "cpu")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # print(torch.cuda.is_available())
        # print(torch.cuda.current_device())
        # print(torch.cuda.device(0))
        # print(torch.cuda.device_count())
        # print(torch.cuda.get_device_name())
        # print(torch.backends.cudnn.version())
        # print(torch.backends.cudnn.is_available())

        self.critic = QNetwork(input_space, action_space.shape[0], args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(input_space, action_space.shape[0], args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(input_space, action_space.shape[0], args.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            raise ValueError("Policy types other than Gaussian are not supported yet.")
Example #16
 def __init__(self, obs_space, action_space, ram):
     self.obs_dim = obs_space.shape[0]
     self.act_dim = action_space.n  # only for discrete space
     
     self.ram = ram
     self.iter = 1
     self.steps = 0
     self.gamma = 0.90
     self.batch_size = 64
     self.initial_e = 0.5
     self.end_e = 0.01
     self.e = self.initial_e
     self.target_update_freq = 100
     self.tau = 0.01
     self.lr = 0.001
     
     self.learning_net = DQN_Model.DQN(self.obs_dim, self.act_dim).cuda()
     self.target_net = DQN_Model.DQN(self.obs_dim, self.act_dim).cuda()
     utils.hard_update(self.target_net, self.learning_net)
     
     self.optimizer = torch.optim.Adam(self.learning_net.parameters(), self.lr)
     self.loss_f = nn.MSELoss()
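
The `initial_e` / `end_e` / `e` fields above drive epsilon-greedy exploration during data collection. A hedged, self-contained sketch of such an action selector; the linear decay horizon of 10,000 steps is an assumption for illustration:

import random
import torch

def epsilon_greedy_action(learning_net, obs, act_dim, e, end_e, initial_e, decay_steps=10000):
    # Decay epsilon linearly from initial_e toward end_e, then act greedily
    # with probability 1 - e using the learning network's Q-values.
    e = max(end_e, e - (initial_e - end_e) / decay_steps)
    if random.random() < e:
        return random.randrange(act_dim), e
    with torch.no_grad():
        obs_t = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0)
        q_values = learning_net(obs_t.to(next(learning_net.parameters()).device))
    return int(q_values.argmax().item()), e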
Example #17
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 lr_actor=1.0e-4,
                 lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()

        self.actor = Actor(state_size, action_size).to(DEVICE)
        self.critic = Critic(state_size, action_size, num_agents).to(DEVICE)
        self.target_actor = Actor(state_size, action_size).to(DEVICE)
        self.target_critic = Critic(state_size, action_size,
                                    num_agents).to(DEVICE)

        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic)
Example #18
    def __init__(self, num_inputs, action_space, \
                 device, hidden_size, lr, gamma, tau, alpha):

        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha

        self.device = device 

        self.critic = QNetwork(num_inputs, action_space.shape[0], hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0], hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)
        
        # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
        self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = Adam([self.log_alpha], lr=lr)
        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], \
                                         hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=lr)
Example #19
	def __init__(self, obs_space, action_space, ram):
		self.obs_dim = obs_space.shape[0]
		self.act_dim = action_space.shape[0]  # only for continuous action spaces

		# just for one action 
		self.action_low = action_space.low[0]
		self.action_high = action_space.high[0]

		self.ram = ram
		self.iter = 1
		self.steps = 0
		self.gamma = 0.9
		self.batch_size = 64
		self.initial_e = 0.5
		self.end_e = 0.01
		self.e = self.initial_e

		self.start_training = 100
		self.tau = 0.01
		self.critic_lr = 0.001
		self.actor_lr = 0.001
		self.noise = utils.RandomActionNoise(self.act_dim)

		target_net = DDPG_Model.DDPG(self.obs_dim, self.act_dim).cuda()
		learning_net = DDPG_Model.DDPG(self.obs_dim, self.act_dim).cuda()
		utils.hard_update(target_net, learning_net)

		self.AC = learning_net
		self.AC_T = target_net
		self.actor = self.AC.actor
		self.critic = self.AC.critic
		self.actor_T = self.AC_T.actor
		self.critic_T = self.AC_T.critic

		self.actor_optimizer = torch.optim.Adam(self.AC.actor.parameters(), self.actor_lr)
		self.critic_optimizer = torch.optim.Adam(self.AC.critic.parameters(), self.critic_lr)

		self.loss_f = nn.MSELoss()
Example #20
    def __init__(self, s_dim, a_dim, n_agents, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.n_agents = n_agents
        self.device = 'cuda' if self.config.use_cuda else 'cpu'
        # Networks
        self.policy = Actor(s_dim, a_dim, n_agents)
        self.policy_target = Actor(s_dim, a_dim, n_agents)
        self.critic = Critic(s_dim, a_dim, n_agents)
        self.critic_target = Critic(s_dim, a_dim, n_agents)

        if self.config.use_cuda:
            self.policy.cuda()
            self.policy_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(),
                                                 lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)

        hard_update(self.policy, self.policy_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim,
            theta=self.config.ou_theta,
            mu=self.config.ou_mu,
            sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay

        self.c_loss = None
        self.a_loss = None
        self.action_log = list()
Example #21
    def __init__(self, nb_states, nb_actions, args):
        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.discrete = args.discrete

        net_config = {
            'hidden1' : args.hidden1,
            'hidden2' : args.hidden2
        }

        # Actor and Critic initialization
        self.actor = Actor(self.nb_states, self.nb_actions, **net_config)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_config)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.actor_lr)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_config)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_config)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.critic_lr)

        hard_update(self.critic_target, self.critic)
        hard_update(self.actor_target, self.actor)

        # Replay Buffer and noise
        self.memory = ReplayBuffer(args.memory_size)
        self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(nb_actions), sigma=float(0.2) * np.ones(nb_actions))

        self.last_state = None
        self.last_action = None

        # Hyper parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount

        # CUDA
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()
Example #22
    def __init__(self, state_dim, action_dim, ram):
        """
		Initialize actor and critic networks
		"""
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.ram = ram
        self.iter = 0
        self.noise = utils.OrnsteinUhlenbeckActionNoise(self.action_dim)

        self.actor = model.Actor(self.state_dim, self.action_dim)
        self.target_actor = model.Actor(self.state_dim, self.action_dim)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                LEARNING_RATE)

        self.critic = model.Critic(self.state_dim, self.action_dim)
        self.target_critic = model.Critic(self.state_dim, self.action_dim)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 LEARNING_RATE)

        # copy parameters to target networks
        utils.hard_update(self.target_actor, self.actor)
        utils.hard_update(self.target_critic, self.critic)
Example #23
    def __init__(self, gamma, tau, hidden_size, num_inputs, action_space):

        self.num_inputs = num_inputs
        self.action_space = action_space

        self.actor = Actor(hidden_size, self.num_inputs, self.action_space)
        self.actor_target = Actor(hidden_size, self.num_inputs,
                                  self.action_space)
        self.actor_perturbed = Actor(hidden_size, self.num_inputs,
                                     self.action_space)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(hidden_size, self.num_inputs, self.action_space)
        self.critic_target = Critic(hidden_size, self.num_inputs,
                                    self.action_space)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = gamma
        self.tau = tau

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
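
The extra `actor_perturbed` network above is characteristic of parameter-space noise exploration. A hedged sketch of how it is typically refreshed before a rollout; the adaptive scaling of `stddev` is omitted here:

import torch

def perturb_actor_parameters(actor, actor_perturbed, stddev):
    # Copy the current actor into actor_perturbed, then add Gaussian noise to
    # every parameter so that rollouts use a randomly perturbed policy.
    actor_perturbed.load_state_dict(actor.state_dict())
    with torch.no_grad():
        for param in actor_perturbed.parameters():
            param.add_(torch.randn_like(param) * stddev)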
Example #24
    def __init__(self, state_dim, action_dim, action_lim, ram):
        """
		:param state_dim: Dimensions of state (int)
		:param action_dim: Dimension of action (int)
		:param action_lim: Used to limit action in [-action_lim,action_lim]
		:param ram: replay memory buffer object
		:return:
		"""
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_lim = action_lim
        self.ram = ram
        self.iter = 0
        self.noise = utils.OrnsteinUhlenbeckActionNoise(self.action_dim)

        self.actor = model.Actor(self.state_dim, self.action_dim,
                                 self.action_lim)
        self.target_actor = model.Actor(self.state_dim, self.action_dim,
                                        self.action_lim)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                LEARNING_RATE,
                                                weight_decay=1e-5)

        self.critic = model.Critic(self.state_dim, self.action_dim)
        self.target_critic = model.Critic(self.state_dim, self.action_dim)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 LEARNING_RATE * 10,
                                                 weight_decay=1e-5)

        if (USEGPU):
            self.target_actor = self.target_actor.cuda()
            self.actor = self.actor.cuda()
            self.target_critic = self.target_critic.cuda()
            self.critic = self.critic.cuda()

        utils.hard_update(self.target_actor, self.actor)
        utils.hard_update(self.target_critic, self.critic)
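
A hedged sketch of the exploration step these DDPG agents typically perform: query the actor, add Ornstein-Uhlenbeck noise, and clip to the action limit. The `noise.sample()` call and CPU tensors are assumptions for illustration:

import numpy as np
import torch

def get_exploration_action(actor, noise, state, action_lim=1.0):
    # Deterministic actor output plus OU noise, clipped to [-action_lim, action_lim].
    state_t = torch.as_tensor(state, dtype=torch.float32)
    with torch.no_grad():
        action = actor(state_t).cpu().numpy()
    return np.clip(action + noise.sample(), -action_lim, action_lim)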
Example #25
    def __init__(self, state_dim, action_dim, ram, LR_actor, LR_critic, gamma,
                 tau, batchsize, expl_rate, version):
        """
		:param state_dim: Dimensions of state (int)
		:param action_dim: Dimension of action (int)
		:param ram: replay memory buffer object
		:param LR_actor / LR_critic: learning rates for the actor and critic
		:param gamma / tau / batchsize / expl_rate / version: discount factor, soft-update rate, batch size, exploration noise scale, and model version tag
		:return:
		"""
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.LR_actor = LR_actor
        self.LR_critic = LR_critic
        self.gamma = gamma
        self.tau = tau
        self.ram = ram
        self.batchsize = batchsize
        self.iter = 0
        self.noise = utils.OrnsteinUhlenbeckActionNoise(
            self.action_dim, 0, 0.15, expl_rate)
        self.action_lim = 1.0

        self.actor = model.Actor(self.state_dim, self.action_dim,
                                 self.action_lim)
        self.target_actor = model.Actor(self.state_dim, self.action_dim,
                                        self.action_lim)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                self.LR_actor)

        self.critic = model.Critic(self.state_dim, self.action_dim)
        self.target_critic = model.Critic(self.state_dim, self.action_dim)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 self.LR_critic)

        utils.hard_update(self.target_actor, self.actor)
        utils.hard_update(self.target_critic, self.critic)
Example #26
    def __init__(self,
                 env: Env = None,
                 capacity=2e6,
                 batch_size=128,
                 action_lim=1,
                 learning_rate=0.001,
                 gamma=0.999,
                 epochs=2):
        if env is None:
            raise ValueError("DDPGAgent requires an environment (env)")
        super(DDPGAgent, self).__init__(env, capacity)
        self.state_dim = env.observation_space.shape[0]  # continuous state space: state dimension
        self.action_dim = env.action_space.shape[0]  # continuous action space: action dimension
        self.action_lim = action_lim  # action value limit
        self.batch_size = batch_size  # number of transitions learned from per batch
        self.learning_rate = learning_rate  # learning rate
        self.gamma = gamma  # discount factor
        self.epochs = epochs  # number of learning passes over one batch of transitions
        self.tau = 0.01  # soft-update (Polyak) coefficient
        self.noise = OrnsteinUhlenbeckActionNoise(self.action_dim)

        self.actor = Actor(self.state_dim, self.action_dim, self.action_lim)
        self.target_actor = Actor(self.state_dim, self.action_dim,
                                  self.action_lim)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                self.learning_rate)

        self.critic = Critic(self.state_dim, self.action_dim)
        self.target_critic = Critic(self.state_dim, self.action_dim)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 self.learning_rate)
        # Initialization: hard-copy the critic and actor parameters
        # into the target networks target_critic and target_actor
        hard_update(self.target_actor, self.actor)  # hard copy
        hard_update(self.target_critic, self.critic)  # hard copy
        return
Example #27
    def __init__(self, act_sp, ob_sp, all_obs, all_acts, hidden_dim=64):
        self.act_sp = act_sp
        self.ob_sp = ob_sp

        self.policy = MLPNetwork(ob_sp, act_sp, sac_policy=True,
                                constrain_out=True, hidden_dim=hidden_dim).to(device)
        self.q_nets_n = 2
        self.qnets = []
        self.qnet_targs = []
        self.q_optimizers = []
        for i in range(self.q_nets_n):
            qnet = MLPNetwork(all_obs + all_acts, 1, constrain_out=False, hidden_dim=hidden_dim).to(device)
            qnet_targ = MLPNetwork(all_obs + all_acts, 1, constrain_out=False, hidden_dim=hidden_dim).to(device)
            qnet.to(device)
            qnet_targ.to(device) 
            hard_update(qnet_targ, qnet)
            self.qnets.append(qnet)
            self.qnet_targs.append(qnet_targ)
            self.q_optimizers.append(optim.Adam(qnet.parameters(), lr=LR))

        self.policy.to(device)
        self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR)
        self.action_count = 0
        self.use_warmup = True
Example #28
    def __init__(self, num_inputs, action_space, config):

        self.gamma = config['gamma']
        self.tau = config['tau']
        self.alpha = config['alpha']

        self.policy_type = config['policy']
        self.target_update_interval = config['target_update_interval']
        self.automatic_entropy_tuning = config['automatic_entropy_tuning']

        self.device = torch.device(
            'cuda:' + str(config['cuda'])) if torch.cuda.is_available(
            ) and config['cuda'] >= 0 else torch.device('cpu')

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               config['hidden_size']).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=config['lr'])

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      config['hidden_size']).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=config['lr'])

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         config['hidden_size'],
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=config['lr'])
Example #29
    def load_model_params(self, dir, i_eps):
        params = torch.load('%s/policy_state_dict_%d.pkl' % (dir, i_eps))
        self.policy_net.load_state_dict(params)

        utils.hard_update(self.target_net, self.policy_net)
Example #30
# Agent
if args.Qapproximation == 'baseline':
    agent = SAC_baseline(env.observation_space.shape[0], env.action_space,
                         args)
else:
    agent = SAC_fourier(env.observation_space.shape[0], env.action_space, args)

agent.load_model(actor_path="./models/sac_actor_{}_{}_{}_{}_{}_{}_{}".format(
    'miguelca_test01', args.Qapproximation, args.filter, args.TDfilter,
    str(args.noise),
    str(args.rnoise).replace('.', '_'), str(args.num_steps)),
                 critic_path="./models/sac_critic_{}_{}_{}_{}_{}_{}_{}".format(
                     'miguelca_test01', args.Qapproximation, args.filter,
                     args.TDfilter, str(args.noise),
                     str(args.rnoise).replace('.', '_'), str(args.num_steps)))
hard_update(agent.critic_target, agent.critic)

# TensorboardX
writer = SummaryWriter(logdir='./runs/{}_SAC_eval_{}_{}_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
    args.policy, "autotune" if args.automatic_entropy_tuning else ""))

# normalization constants
action_scale = ((env.action_space.high - env.action_space.low) / 2.)
action_bias = ((env.action_space.high + env.action_space.low) / 2.)

# Training Loop
total_numsteps = 0
updates = 0
action_history = []
action_history_w_noise = []
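
The `action_scale` / `action_bias` constants above undo the usual [-1, 1] normalization of the policy output. A minimal sketch of how the evaluation loop would map a raw policy action back to the environment's native range:

import numpy as np

def denormalize_action(raw_action, action_scale, action_bias):
    # Map a policy output in [-1, 1] back to [low, high]:
    # low = bias - scale, high = bias + scale.
    return np.asarray(raw_action) * action_scale + action_bias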