Python Critic.parameters Examples

Programming Language: Python

Namespace/Package Name: core.models

Class/Type: Critic

Method/Function: parameters

Examples at hotexamples.com: 5

Python Critic.parameters - 5 examples found. These are the top rated real world Python examples of core.models.Critic.parameters extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

forward(5)

parameters(5)

cuda(4)

Critic(3)

apply(2)

cpu(2)

share_memory(1)

Example #1

Show file

File: A2C.py Project: ShawK91/Hireacrhical-Actor-Critic

class A2C(object):
    def __init__(self, args):

        self.args = args

        self.actor = Actor(args)
        self.actor_target = Actor(args)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(args)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

    def update_parameters(self, batch):
        state_batch = torch.cat(batch.state)
        next_state_batch = torch.cat(batch.next_state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        done_batch = torch.cat(batch.done)
        state_batch.volatile = False
        next_state_batch.volatile = True
        action_batch.volatile = False

        # Critic Update
        vals = self.critic.forward(state_batch)
        new_vals = self.critic.forward(next_state_batch) * (1 - done_batch)
        targets = reward_batch + self.gamma * new_vals
        self.critic_optim.zero_grad()
        dt = self.loss(vals, targets)
        dt.backward()
        self.critic_optim.step()

        # Actor Update
        self.actor_optim.zero_grad()
        state_batch = utils.to_tensor(utils.to_numpy(state_batch))
        targets = utils.to_tensor(utils.to_numpy(targets))
        vals = utils.to_tensor(utils.to_numpy(vals))
        action_logs = self.actor.forward(state_batch)
        entropy_loss = torch.mean(entropy(torch.exp(action_logs)))
        action_logs = F.log_softmax(action_logs)
        dt = targets - vals
        alogs = []
        for i, action in enumerate(action_batch):
            action_i = int(action.cpu().data.numpy())
            alogs.append(action_logs[i, action_i])
        alogs = torch.cat(alogs).unsqueeze(0)

        policy_loss = -torch.mean(dt * alogs.t())
        actor_loss = policy_loss - entropy_loss
        actor_loss.backward()
        self.actor_optim.step()

Example #2

Show file

File: hac.py Project: ShawK91/Hireacrhical-Actor-Critic

class Actor_Critic(object):
    def __init__(self, state_dim, action_dim, gamma, tau, buffer_size,
                 is_mem_cuda, out_act):

        self.actor = Actor(state_dim,
                           action_dim,
                           is_evo=False,
                           out_act=out_act)
        self.actor_target = Actor(state_dim,
                                  action_dim,
                                  is_evo=False,
                                  out_act=out_act)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = gamma
        self.tau = tau
        self.loss = nn.MSELoss()
        self.replay_buffer = ReplayMemory(buffer_size, is_mem_cuda)
        self.exploration_noise = OUNoise(action_dim)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

    def act(self, state, is_noise):
        state = utils.to_tensor(state).unsqueeze(0)
        action = self.actor.forward(state)
        action = action.detach().numpy().flatten()
        if is_noise: action += self.exploration_noise.noise()
        return action

    def train_from_batch(self, batch):
        env_state_batch = torch.cat(batch.state)
        goal_batch = torch.cat(batch.goal)
        uvfa_states = torch.cat((env_state_batch, goal_batch), dim=1).detach()
        next_env_state_batch = torch.cat(batch.next_state)
        next_uvfa_states = torch.cat((next_env_state_batch, goal_batch),
                                     dim=1).detach()
        action_batch = torch.cat(batch.action).detach()
        reward_batch = torch.cat(batch.reward).detach()

        #if self.args.use_done_mask: done_batch = torch.cat(batch.done)

        #Load everything to GPU if not already
        # if self.args.is_memory_cuda and not self.args.is_cuda:
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic_target.cuda()
        self.critic.cuda()
        uvfa_states = uvfa_states.cuda()
        next_uvfa_states = next_uvfa_states.cuda()
        action_batch = action_batch.cuda()
        reward_batch = reward_batch.cuda()
        #     if self.args.use_done_mask: done_batch = done_batch.cuda()

        #Critic Update
        with torch.no_grad():
            next_action_batch = self.actor_target.forward(next_uvfa_states)
            next_q = self.critic_target.forward(next_uvfa_states,
                                                next_action_batch)
            #if self.args.use_done_mask: next_q = next_q * ( 1 - done_batch.float()) #Done mask
            target_q = reward_batch + (self.gamma * next_q)

        self.critic_optim.zero_grad()
        current_q = self.critic.forward((uvfa_states.detach()),
                                        (action_batch.detach()))
        dt = self.loss(current_q, target_q)
        dt.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), 10)
        self.critic_optim.step()

        #Actor Update
        self.actor_optim.zero_grad()
        policy_loss = -self.critic.forward(
            (uvfa_states), self.actor.forward((uvfa_states)))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), 10)
        self.actor_optim.step()

        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        #Nets back to CPU if using memory_cuda
        self.actor.cpu()
        self.actor_target.cpu()
        self.critic_target.cpu()
        self.critic.cpu()

Example #3

Show file

class DDPG(object):
    def __init__(self, args):

        self.args = args

        self.actor = Actor(args, init=True)
        self.actor_target = Actor(args, init=True)
        self.actor_optim = Adam(self.actor.parameters(), lr=0.5e-4)

        self.critic = Critic(args)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=0.5e-3)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

    def update_parameters(self, batch):
        state_batch = torch.cat(batch.state)
        next_state_batch = torch.cat(batch.next_state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        if self.args.use_done_mask: done_batch = torch.cat(batch.done)

        #Load everything to GPU if not already
        if self.args.is_memory_cuda and not self.args.is_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()
            self.critic.cuda()
            state_batch = state_batch.cuda()
            next_state_batch = next_state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            if self.args.use_done_mask: done_batch = done_batch.cuda()

        #Critic Update
        next_action_batch = self.actor_target.forward(next_state_batch)
        with torch.no_grad():
            next_q = self.critic_target.forward(next_state_batch,
                                                next_action_batch)
            if self.args.use_done_mask:
                next_q = next_q * (1 - done_batch.float())  #Done mask
            target_q = reward_batch + (self.gamma * next_q)

        self.critic_optim.zero_grad()
        current_q = self.critic.forward((state_batch), (action_batch))
        dt = self.loss(current_q, target_q)
        dt.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), 10)
        self.critic_optim.step()

        #Actor Update
        self.actor_optim.zero_grad()
        policy_loss = -self.critic.forward(
            (state_batch), self.actor.forward((state_batch)))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), 10)
        self.actor_optim.step()

        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        #Nets back to CPU if using memory_cuda
        if self.args.is_memory_cuda and not self.args.is_cuda:
            self.actor.cpu()
            self.actor_target.cpu()
            self.critic_target.cpu()
            self.critic.cpu()

Example #4

Show file

class Off_Policy_Algo(object):
    """Classes implementing TD3 and DDPG off-policy learners

         Parameters:
               args (object): Parameter class


     """
    def __init__(self,
                 wwid,
                 algo_name,
                 state_dim,
                 action_dim,
                 actor_lr,
                 critic_lr,
                 gamma,
                 tau,
                 init_w=True):

        self.algo_name = algo_name
        self.gamma = gamma
        self.tau = tau

        self.HLoss = HLoss()
        #Initialize actors
        self.actor = Actor(state_dim, action_dim, wwid, self.algo_name)
        if init_w: self.actor.apply(utils.init_weights)
        self.actor_target = Actor(state_dim, action_dim, wwid, self.algo_name)
        utils.hard_update(self.actor_target, self.actor)
        self.actor_optim = Adam(self.actor.parameters(), actor_lr)

        self.critic = Critic(state_dim, action_dim)
        if init_w: self.critic.apply(utils.init_weights)
        self.critic_target = Critic(state_dim, action_dim)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        self.loss = nn.MSELoss()

        if torch.cuda.is_available():
            self.actor_target.cuda()
            self.critic_target.cuda()
            self.actor.cuda()
            self.critic.cuda()
        self.num_critic_updates = 0

        #Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def save_net(self, path):
        torch.save(self.actor.state_dict(), path)

    def act(self, state):
        return self.actor(state)

    def share_memory(self):
        self.actor.share_memory()
        self.actor_target.share_memory()
        self.critic.share_memory()
        self.critic_target.share_memory()

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

             Parameters:
                   tensor (tensor): tensor
                   tracker (object): logger

             Returns:
                   None


         """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())

    def update_parameters(self,
                          state_batch,
                          next_state_batch,
                          action_batch,
                          reward_batch,
                          done_batch,
                          num_epoch=1,
                          **kwargs):
        """Runs a step of Bellman upodate and policy gradient using a batch of experiences

             Parameters:
                  state_batch (tensor): Current States
                  next_state_batch (tensor): Next States
                  action_batch (tensor): Actions
                  reward_batch (tensor): Rewards
                  done_batch (tensor): Done batch
                  num_epoch (int): Number of learning iteration to run with the same data

             Returns:
                   None

         """

        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch).done_batch = torch.cat(
                done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            #Compute next q-val, next_v and target
            with torch.no_grad():
                #Policy Noise
                policy_noise = np.random.normal(
                    0, kwargs['policy_noise'],
                    (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -kwargs['policy_noise_clip'],
                                           kwargs['policy_noise_clip'])

                #Compute next action_bacth
                #next_action_batch = self.actor_target.turn_max_into_onehot(self.actor_target.Gumbel_softmax_sample_distribution(next_state_batch, use_cuda=True))\
                #        if self.algo_name == 'dis' else self.actor_target.forward(next_state_batch) + policy_noise.cuda()  #this should use one-hot from logits
                next_action_batch = self.actor_target.turn_max_into_onehot(self.actor_target.forward(next_state_batch)) \
                    if self.algo_name == 'dis' else self.actor_target.forward(next_state_batch) + policy_noise.cuda()  # this should use one-hot from logits
                if random.random() < 0.0001:
                    print('off_policy line 114, changed next action batch')
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                #Compute Q-val and value of next state masking by done
                q1, q2, _ = self.critic_target.forward(next_state_batch,
                                                       next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2

                #Select which q to use as next-q (depends on algo)
                if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min' or self.algo_name == 'dis':
                    next_q = torch.min(q1, q2)
                elif self.algo_name == 'DDPG':
                    next_q = q1
                elif self.algo_name == 'TD3_max':
                    next_q = torch.max(q1, q2)

                #Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward(
                (state_batch),
                (action_batch
                 ))  #here the action batch should be the soft version
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)

            if self.algo_name == 'TD3' or self.algo_name == 'TD3_max' or self.algo_name == 'dis':
                dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())
            #print(dt.item(), "off_policy_algo line 136")

            dt.backward()

            self.critic_optim.step()
            self.num_critic_updates += 1

            #Delayed Actor Update
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:

                actor_actions = self.actor.Gumbel_softmax_sample_distribution(state_batch, use_cuda=True)\
                    if self.algo_name == 'dis' else self.actor.forward(state_batch)
                #actor_actions = self.actor.forward(state_batch)
                #if random.random() < 0.001: print('actor action changed')
                Q1, Q2, val = self.critic.forward(state_batch, actor_actions)

                # if self.args.use_advantage: policy_loss = -(Q1 - val)
                policy_loss = -Q1 + 0.1 * self.HLoss(
                    actor_actions
                )  # HLoss is a single scalar, directly regularized logits?

                if random.random() < 0.0005:
                    print('added entropy regularization, off_policy_algo 161')

                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()

                #print(policy_loss, 'off_policy line 157')
                self.actor_optim.zero_grad()

                policy_loss.backward(retain_graph=True)
                self.actor_optim.step()

                #if random.random() <= 0.001:
                #    self.test_actor_gradient_descent(state_batch)

            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:
                utils.soft_update(self.actor_target, self.actor, self.tau)
            utils.soft_update(self.critic_target, self.critic, self.tau)

    def test_actor_gradient_descent(self, state_batch):
        #this method test if running gradient descent on the actor actually decrease the loss
        print("test_actor_gradient_descent, off_policy_algo line 179")
        for i in range(10):
            actor_actions = self.actor.forward(state_batch)
            print("logits_",
                  self.actor.w_out(self.actor.logits(state_batch))[0])
            print("action_batch", actor_actions[0])
            Q1, Q2, val = self.critic.forward(state_batch, actor_actions)
            policy_loss = -Q1
            policy_loss = policy_loss.mean()
            print("policy_loss at i = ", i, " is ", policy_loss)
            self.actor_optim.zero_grad()
            policy_loss.backward(retain_graph=True)
            print("gradient_", self.actor.f1.bias.grad[0])
            self.actor_optim.step()
            print("bias_", self.actor.f1.bias[0])

Example #5

Show file

File: off_policy_algo.py Project: mattiasmar/cerl-1

class Off_Policy_Algo(object):
    """Classes implementing TD3 and DDPG off-policy learners

         Parameters:
               args (object): Parameter class


     """
    def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w = True):

        self.algo_name = algo_name; self.gamma = gamma; self.tau = tau

        #Initialize actors
        self.actor = Actor(state_dim, action_dim, wwid)
        if init_w: self.actor.apply(utils.init_weights)
        self.actor_target = Actor(state_dim, action_dim, wwid)
        utils.hard_update(self.actor_target, self.actor)
        self.actor_optim = Adam(self.actor.parameters(), actor_lr)


        self.critic = Critic(state_dim, action_dim)
        if init_w: self.critic.apply(utils.init_weights)
        self.critic_target = Critic(state_dim, action_dim)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        self.loss = nn.MSELoss()

        self.actor_target.cuda(); self.critic_target.cuda(); self.actor.cuda(); self.critic.cuda()
        self.num_critic_updates = 0

        #Statistics Tracker
        self.action_loss = {'min':[], 'max': [], 'mean':[], 'std':[]}
        self.policy_loss = {'min':[], 'max': [], 'mean':[], 'std':[]}
        self.critic_loss = {'mean':[]}
        self.q = {'min':[], 'max': [], 'mean':[], 'std':[]}
        self.val = {'min':[], 'max': [], 'mean':[], 'std':[]}

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

             Parameters:
                   tensor (tensor): tensor
                   tracker (object): logger

             Returns:
                   None


         """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())

    def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, num_epoch=1, **kwargs):
        """Runs a step of Bellman upodate and policy gradient using a batch of experiences

             Parameters:
                  state_batch (tensor): Current States
                  next_state_batch (tensor): Next States
                  action_batch (tensor): Actions
                  reward_batch (tensor): Rewards
                  done_batch (tensor): Done batch
                  num_epoch (int): Number of learning iteration to run with the same data

             Returns:
                   None

         """

        if isinstance(state_batch, list): state_batch = torch.cat(state_batch); next_state_batch = torch.cat(next_state_batch); action_batch = torch.cat(action_batch); reward_batch = torch.cat(reward_batch). done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            #Compute next q-val, next_v and target
            with torch.no_grad():
                #Policy Noise
                policy_noise = np.random.normal(0, kwargs['policy_noise'], (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise), -kwargs['policy_noise_clip'], kwargs['policy_noise_clip'])

                #Compute next action_bacth
                next_action_batch = self.actor_target.forward(next_state_batch) + policy_noise.cuda()
                next_action_batch = torch.clamp(next_action_batch, 0,1)

                #Compute Q-val and value of next state masking by done
                q1, q2, _ = self.critic_target.forward(next_state_batch, next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2

                #Select which q to use as next-q (depends on algo)
                if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min': next_q = torch.min(q1, q2)
                elif self.algo_name == 'DDPG': next_q = q1
                elif self.algo_name == 'TD3_max': next_q = torch.max(q1, q2)

                #Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)


            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward((state_batch), (action_batch))
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)

            if self.algo_name == 'TD3' or self.algo_name == 'TD3_max': dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())

            dt.backward()

            self.critic_optim.step()
            self.num_critic_updates += 1


            #Delayed Actor Update
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:

                actor_actions = self.actor.forward(state_batch)
                Q1, Q2, val = self.critic.forward(state_batch, actor_actions)

                # if self.args.use_advantage: policy_loss = -(Q1 - val)
                policy_loss = -Q1

                self.compute_stats(policy_loss,self.policy_loss)
                policy_loss = policy_loss.mean()


                self.actor_optim.zero_grad()



                policy_loss.backward(retain_graph=True)
                self.actor_optim.step()


            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0: utils.soft_update(self.actor_target, self.actor, self.tau)
            utils.soft_update(self.critic_target, self.critic, self.tau)