Example #1
import torch
import torch.nn as nn


class PPO:
    def __init__(self, state_dim, action_dim, lr, betas, gamma, K_epochs,
                 eps_clip, device):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.device = device

        self.policy = ActorCritic(state_dim, action_dim).to(self.device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=lr,
                                          betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim).to(self.device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def update(self, memory):
        # Monte Carlo estimate of discounted returns:
        rewards = []
        discount_reward = 0
        for reward in reversed(memory.rewards):
            discount_reward = reward + (self.gamma * discount_reward)
            rewards.insert(0, discount_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards, dtype=torch.float32).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert list to tensor
        old_states = torch.stack(memory.states).to(self.device).detach()
        old_actions = torch.stack(memory.actions).to(self.device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(self.device).detach()

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):
            # Evaluating old actions and values:
            logprobs, state_values, dist_entropy = self.policy.evaluate(
                old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta_old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                1 + self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(
                state_values, rewards) - 0.01 * dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
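
Neither example defines the Memory buffer or the ActorCritic network it drives. Below is a minimal sketch of what Example #1 appears to assume for a discrete action space; the hidden dimension, layer choices, and method bodies are illustrative guesses inferred from the act/evaluate calls above, not the original implementation.

import torch
import torch.nn as nn
from torch.distributions import Categorical


class Memory:
    # Rollout buffer assumed by PPO.update() above (field names inferred from usage).
    def __init__(self):
        self.states = []
        self.actions = []
        self.logprobs = []
        self.rewards = []

    def clear(self):
        del self.states[:]
        del self.actions[:]
        del self.logprobs[:]
        del self.rewards[:]


class ActorCritic(nn.Module):
    # Hypothetical actor-critic with the act/evaluate interface used above.
    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        self.actor = nn.Sequential(
            nn.Linear(state_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, action_dim), nn.Softmax(dim=-1))
        self.critic = nn.Sequential(
            nn.Linear(state_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, 1))

    def act(self, state, memory):
        # Sample an action from the current policy and record the transition.
        # `state` is expected to be a 1-D float tensor of shape (state_dim,).
        dist = Categorical(self.actor(state))
        action = dist.sample()
        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(dist.log_prob(action))
        return action.item()

    def evaluate(self, states, actions):
        # Log-probs, state values and entropy for a batch of stored samples.
        dist = Categorical(self.actor(states))
        logprobs = dist.log_prob(actions)
        state_values = torch.squeeze(self.critic(states))
        return logprobs, state_values, dist.entropy()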
Example #2
import torch
import torch.nn as nn

# Module-level device used throughout this example (assumed; the original
# excerpt references a global `device` defined elsewhere in the file).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class PPO:
    def __init__(self, state_dim, action_dim, action_std, lr, betas, gamma,
                 K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=lr,
                                          betas=betas)

        self.policy_old = ActorCritic(state_dim, action_dim,
                                      action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        try:
            self.policy.load_state_dict(
                torch.load('./PPO_continuous_drone.pth', map_location=device))
            self.policy_old.load_state_dict(
                torch.load('./PPO_continuous_old_drone.pth',
                           map_location=device))
            print('Saved models loaded')
        except Exception:
            # No saved weights found (or they could not be loaded); start fresh.
            print('New models generated')

        self.MseLoss = nn.MSELoss()

    def select_action(self, state, memory):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        return self.policy_old.act(state, memory).cpu().data.numpy().flatten()

    def update(self, memory):
        # Monte Carlo estimate of rewards:
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards),
                                       reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert list to tensor
        old_states = torch.squeeze(torch.stack(memory.states).to(device),
                                   1).detach()
        old_actions = torch.squeeze(torch.stack(memory.actions).to(device),
                                    1).detach()
        old_logprobs = torch.squeeze(torch.stack(memory.logprobs),
                                     1).to(device).detach()

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values:
            logprobs, state_values, dist_entropy = self.policy.evaluate(
                old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta_old):
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                1 + self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(
                state_values, rewards) - 0.01 * dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
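
Example #2 likewise leaves the Memory buffer, the environment loop, and the hyperparameters to the caller. A rough usage sketch follows, assuming the PPO class above is in the same module, the older Gym reset/step API, an illustrative Pendulum-v1 environment, and placeholder hyperparameter values; none of these specifics come from the example itself.

import gym


class Memory:
    # Rollout buffer assumed by select_action() and update() above.
    def __init__(self):
        self.states = []
        self.actions = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear(self):
        del self.states[:]
        del self.actions[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]


if __name__ == '__main__':
    env = gym.make('Pendulum-v1')  # placeholder continuous-control task
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    memory = Memory()
    # Hyperparameter values below are placeholders, not taken from the example.
    ppo = PPO(state_dim, action_dim, action_std=0.5, lr=3e-4,
              betas=(0.9, 0.999), gamma=0.99, K_epochs=80, eps_clip=0.2)

    update_timestep = 4000  # run a PPO update every n environment steps
    timestep = 0
    for episode in range(1, 501):
        state = env.reset()
        for t in range(200):
            timestep += 1
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)
            memory.rewards.append(reward)
            memory.is_terminals.append(done)
            if timestep % update_timestep == 0:
                ppo.update(memory)
                memory.clear()
                timestep = 0
            if done:
                break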