class PPO(object):
    """Main PPO class"""

    def __init__(self, args):
        """Constructor initializing the attributes of the class from the parsed arguments."""
        self.args = args
        self.random_seed()

        # Check if a GPU is available via the CUDA driver
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        # Initialize the actor-critic network
        self.actor_critic = ActorCritic(self.args.nb_states,
                                        self.args.nb_actions,
                                        self.args.hidden_layer_size).to(self.device)

        # Define the optimizer used for the optimization of the surrogate loss
        self.optimizer = self.args.optimizer(self.actor_critic.parameters(), self.args.lr)

        # For training, multiple instances of the env are needed (shoulder model)
        self.envs = [self.make_env() for _ in range(self.args.num_envs)]
        self.envs = SubprocVecEnv(self.envs)

        # To validate the intermediate learning process, one test env is needed
        self.env_test = self.args.env
        self.env_test.seed(self.args.seed)
        self.env_test.set_scaling(self.args.output_scaling)

        # Lists for TensorBoard to visualize the learning process during training
        self.test_rewards = []
        self.loss = []
        self.lr = []
        self.actor_grad_weight = []
        self.action_bang_bang = []
        self.lr.append(self.args.lr)

        # Output directory for dumped networks and TensorBoard logs (training mode only)
        if self.args.play is False:
            self.output_path = "trained_models" + '/PPO_{}'.format(
                datetime.now().strftime('%Y%b%d_%H%M%S')) + "/"
            os.mkdir(self.output_path)
            self.writer = SummaryWriter(self.output_path)
        # self.delta = (self.args.lr - self.args.lr_end) / 1e6

    def train(self):
        """Main training function"""
        frame_idx = 0
        state = self.envs.reset()
        mean_100_reward = -np.inf
        self.info()

        while frame_idx < self.args.max_frames:
            log_probs = []
            values = []
            states = []
            actions = []
            rewards = []
            masks = []
            entropy = self.args.entropy

            # Collect nb_steps transitions from the parallel environments
            for _ in range(self.args.nb_steps):
                state = torch.FloatTensor(state).to(self.device)
                dist, value = self.actor_critic(state)
                action = dist.sample()
                # Make sure the action is moved to the CPU (not GPU) before stepping the envs
                next_state, reward, done, _ = self.envs.step(action.cpu().numpy())
                log_prob = dist.log_prob(action)
                entropy += dist.entropy().mean()

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(self.device))
                masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(self.device))
                states.append(state)
                actions.append(action)

                state = next_state
                frame_idx += 1
                # self.scheduler()

                # Evaluate the training process and write data to TensorBoard
                if frame_idx % 1000 == 0:
                    test_reward = np.mean(
                        [self.test_env(self.args.vis) for _ in range(10)])
                    self.test_rewards.append(test_reward)
                    if self.args.play is False:
                        print("Mean reward: ",
                              np.round(np.mean(self.test_rewards[-101:-1]), 0))
                        if mean_100_reward < np.round(np.mean(self.test_rewards[-101:-1]), 0):
                            mean_100_reward = np.round(np.mean(self.test_rewards[-101:-1]), 0)
                            self.save_network(mean_100_reward)
                        if len(self.test_rewards) >= 10:
                            self.writer.add_scalar(
                                'data/reward',
                                np.mean(self.test_rewards[-11:-1]),
                                frame_idx * self.args.num_envs)
                            self.writer.add_scalar(
                                'data/ppo_loss',
                                np.mean(self.loss[-11:-1]),
                                frame_idx * self.args.num_envs)
                            self.writer.add_scalar(
                                'data/nb_actions_outside_range',
                                np.mean(self.action_bang_bang[-11:-1]),
                                frame_idx * self.args.num_envs)
                    # if test_reward > threshold_reward: early_stop = True

            next_state = torch.FloatTensor(next_state).to(self.device)
            _, next_value = self.actor_critic(next_state)
            returns = self.calc_gae(next_value, rewards, masks, values,
                                    self.args.gamma, self.args.tau)

            # detach() removes these tensors from the graph, i.e. they are
            # ignored during gradient calculation
            returns = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values = torch.cat(values).detach()
            states = torch.cat(states)
            actions = torch.cat(actions)
            advantage = returns - values

            self.ppo_update(self.args.ppo_epochs, self.args.mini_batch_size,
                            states, actions, log_probs, returns, advantage,
                            self.args.clip)

    def make_env(self):
        # Private trunk function for calling the SubprocVecEnv class
        def _trunk():
            # In this simple case the TestEnv() class is used (see OpenAI Gym for more envs)
            env = self.args.env
            env.seed(self.args.seed)
            env.set_scaling(self.args.output_scaling)
            return env

        return _trunk

    def test_env(self, vis=False):
        state = self.env_test.reset()
        if vis:
            self.env_test.render()
        done = False
        total_reward = 0
        action_bang_bang = 0
        step = 0
        while not done:
            step += 1
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            dist, _ = self.actor_critic(state)
            action = dist.sample().cpu().numpy()[0]
            force = action * self.args.output_scaling
            next_state, reward, done, _ = self.env_test.step(action)
            # Count actions whose scaled force leaves the [-0.5, 0.5] range
            if force > 0.5 or force < -0.5:
                action_bang_bang += 1
            state = next_state
            if vis:
                self.env_test.render()
            total_reward += reward
        self.action_bang_bang.append(action_bang_bang / step)
        return total_reward

    # Plain functions, except that one can call them from an instance or from the class
    @staticmethod
    def calc_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns

    @staticmethod
    def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield (states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :],
                   returns[rand_ids, :], advantage[rand_ids, :])

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs,
                   returns, advantages, clip_param=0.2):
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(
                    mini_batch_size, states, actions, log_probs, returns, advantages):
                dist, value = self.actor_critic(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()
                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy
                self.loss.append(loss.item())

                # Important step:
                self.optimizer.zero_grad()
                # pdb.set_trace()
                loss.backward()
                if self.args.grad_norm is not None:
                    nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                             self.args.grad_norm)
                self.optimizer.step()

    def save_network(self, reward):
        network_path = self.output_path + "/network" + str(reward)
        with open(network_path, "wb") as f:
            pickle.dump(self.actor_critic.state_dict(), f)

    def load_network(self, path):
        with open(path, "rb") as f:
            network_new = pickle.load(f)
        self.actor_critic.load_state_dict(network_new)

    def random_seed(self):
        torch.manual_seed(self.args.seed)
        random.seed(self.args.seed)
        np.random.seed(self.args.seed)

    def scheduler(self):
        # Linear learning-rate decay; relies on self.delta, which is currently
        # commented out in __init__ (this method is not called during training).
        for g in self.optimizer.param_groups:
            lr = g["lr"]
            if self.args.lr_end > lr:
                lr = self.args.lr_end
            else:
                lr -= self.delta
            self.lr.append(lr)
            g["lr"] = lr

    def info(self):
        fhandler = logging.FileHandler(filename=self.output_path + '/mylog.log', mode='a')
        logger.addHandler(fhandler)
        logger.info("--- INFO ---")
        logger.info("args: {}".format(self.args))
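# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original file). It assumes
# an argparse-style namespace carrying the attributes that __init__ and
# train() read; the attribute values below (TestEnv, Adam, the numeric
# hyper-parameters) are assumptions chosen for this sketch, not values
# prescribed by the repository.
# ---------------------------------------------------------------------------
def example_training_run():
    import torch.optim as optim
    from argparse import Namespace

    args = Namespace(
        nb_states=4, nb_actions=1, hidden_layer_size=64,   # network sizes (assumed)
        optimizer=optim.Adam, lr=3e-4, lr_end=1e-5,        # optimizer class + learning rates
        num_envs=4, env=TestEnv(), seed=0,                 # TestEnv() is the repo's toy env (assumed ctor)
        output_scaling=1.0, play=False, vis=False,
        max_frames=100000, nb_steps=128, entropy=0.0,
        gamma=0.99, tau=0.95, ppo_epochs=4,
        mini_batch_size=64, clip=0.2, grad_norm=0.5,
    )
    agent = PPO(args)   # builds the actor-critic, the vectorized envs and the TensorBoard writer
    agent.train()       # rollout collection + GAE + clipped-surrogate updates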
                    help='rewards discount factor')
parser.add_argument('--entropy_weight', default=0.0001, type=float)
parser.add_argument('--alpha', default=0.95, type=float)
parser.add_argument('--type', default='notrpo', type=str, help='iftrpo')
parser.add_argument('--render', action='store_true', help='render')
args = parser.parse_args()
# print(args)

torch.manual_seed(args.seed)

env = gym.make("CartPole-v0")
replay_buffer = ReplayBuffer(args.capacity, args.max_episode_length)
model = ActorCritic(env.observation_space.shape[0], env.action_space.n).cuda()
average_model = ActorCritic(env.observation_space.shape[0], env.action_space.n).cuda()
optimizer = optim.Adam(model.parameters())

frame_idx = 0
test_rewards = []
episode_count = 0
step_count = 0
state = env.reset()
running_rew = 0
plotcount = 0

while frame_idx < args.max_frames:
    policies = []
    average_policies = []
    actions = []
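# ---------------------------------------------------------------------------
# Sketch (not in the original file): the script above keeps an `average_model`
# next to the online `model`. In ACER the average policy network is usually
# refreshed after every optimizer step as a Polyak average of the online
# parameters, with --alpha as the smoothing factor. The helper name below is
# hypothetical; it only illustrates that update rule.
# ---------------------------------------------------------------------------
def update_average_model(model, average_model, alpha=0.95):
    """theta_avg <- alpha * theta_avg + (1 - alpha) * theta (no gradients needed)."""
    with torch.no_grad():
        for avg_p, p in zip(average_model.parameters(), model.parameters()):
            avg_p.mul_(alpha).add_(p, alpha=1.0 - alpha)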
class Learner(object):
    def __init__(self, opt, q_batch):
        self.opt = opt
        self.q_batch = q_batch
        self.network = ActorCritic(opt).to(device)
        self.optimizer = Adam(self.network.parameters(), lr=opt.lr)
        self.network.share_memory()

    def learning(self):
        torch.manual_seed(self.opt.seed)
        coef_hat = torch.Tensor([[self.opt.coef_hat]]).to(device)
        rho_hat = torch.Tensor([[self.opt.rho_hat]]).to(device)
        while True:
            # Batch of traces:
            # s[batch, n_step+1, 3, width, height]
            # a[batch, n_step, a_space]
            # rew[batch, n_step]
            # a_prob[batch, n_step, a_space]
            s, a, rew, prob = self.q_batch.get(block=True)

            ###########################
            # variables we need later #
            ###########################
            v, coef, rho, entropies, log_prob = [], [], [], [], []
            cx = torch.zeros(self.opt.batch_size, 256).to(device)
            hx = torch.zeros(self.opt.batch_size, 256).to(device)
            for step in range(s.size(1)):
                # value[batch]
                # logit[batch, 12]
                value, logit, (hx, cx) = self.network((s[:, step, ...], (hx, cx)))
                v.append(value)
                if step >= a.size(1):
                    # Note that s has n_step+1 entries but a only n_step;
                    # the loop runs once more because v at n_step+1 is needed.
                    break

                # π/μ[batch]
                # TODO: cumprod might produce runtime problems
                logit_a = a[:, step, :] * logit.detach() + \
                    (1 - a[:, step, :]) * (1 - logit.detach())
                prob_a = a[:, step, :] * prob[:, step, :] + \
                    (1 - a[:, step, :]) * (1 - prob[:, step, :])
                is_rate = torch.cumprod(logit_a / (prob_a + 1e-6), dim=1)[:, -1]
                coef.append(torch.min(coef_hat, is_rate))
                rho.append(torch.min(rho_hat, is_rate))

                # enpy_aspace[batch, 12]
                # Calculating the entropy[batch, 1]: there are [a_space] entropy
                # terms per batch element, summed over here.
                # Note: do *not* use detach here.
                enpy_aspace = -torch.log(logit) * logit - torch.log(1 - logit) * (1 - logit)
                enpy = enpy_aspace.sum(dim=1, keepdim=True)
                entropies.append(enpy)

                # Calculating the probability that the action is taken by the
                # target policy: prob_pi_a[batch, 12] and log_prob[batch, 1].
                # Note: do *not* use detach here.
                prob_pi_a = (a[:, step, :] * logit) + (1 - a[:, step, :]) * (1 - logit)
                log_prob_pi_a = torch.log(prob_pi_a).sum(dim=1, keepdim=True)
                log_prob.append(log_prob_pi_a)
                # prob_pi_a = torch.cumprod(prob_pi_a, dim=1)[:, -1:]
                # log_prob_pi_a = torch.log(prob_pi_a)

            ####################
            # calculating loss #
            ####################
            policy_loss = 0
            value_loss = 0
            # gae = torch.zeros(self.opt.batch_size, 1)
            for rev_step in reversed(range(s.size(1) - 1)):
                # Compute v_(s+1)[batch] for the policy gradient
                fix_vp = rew[:, rev_step] + self.opt.gamma * (v[rev_step + 1] + value_loss) - v[rev_step]

                # value_loss[batch]
                td = rew[:, rev_step] + self.opt.gamma * v[rev_step + 1] - v[rev_step]
                value_loss = self.opt.gamma * coef[rev_step] * value_loss + rho[rev_step] * td

                # policy_loss = policy_loss - log_probs[i] * Variable(gae)
                # The td must be detached from the value network:
                # delta_t[batch]
                # delta_t = rew[:, rev_step] + self.opt.gamma * v[rev_step + 1] - v[rev_step]
                # gae = gae * self.opt.gamma + delta_t.detach()
                policy_loss = policy_loss \
                    - rho[rev_step] * log_prob[rev_step] * fix_vp.detach() \
                    - self.opt.entropy_coef * entropies[rev_step]

            self.optimizer.zero_grad()
            policy_loss = policy_loss.sum()
            value_loss = value_loss.sum()
            loss = policy_loss + self.opt.value_loss_coef * value_loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.network.parameters(), self.opt.max_grad_norm)
            print("v_loss {:.3f} p_loss {:.3f}".format(value_loss.item(), policy_loss.item()))
            self.optimizer.step()
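# ---------------------------------------------------------------------------
# Standalone sketch (not part of the original file): the clipped importance
# weights computed above follow the V-trace idea from IMPALA, i.e.
#   rho_t = min(rho_hat, pi(a_t|s_t) / mu(a_t|s_t))   (weights the TD error)
#   c_t   = min(c_hat,   pi(a_t|s_t) / mu(a_t|s_t))   (discounts the trace)
# The function name and the dummy inputs below are hypothetical; they only
# isolate that clipping step.
# ---------------------------------------------------------------------------
def clipped_importance_weights(pi_a, mu_a, rho_hat=1.0, c_hat=1.0, eps=1e-6):
    """Per-sample truncated importance ratios rho_t and c_t."""
    is_rate = pi_a / (mu_a + eps)             # pi / mu, shape [batch]
    rho = torch.clamp(is_rate, max=rho_hat)   # truncated weight for the TD / value target
    coef = torch.clamp(is_rate, max=c_hat)    # truncated trace coefficient
    return rho, coef

# Example with dummy per-step action probabilities under the target (pi) and
# behaviour (mu) policies:
# pi_a = torch.tensor([0.9, 0.2, 0.6])
# mu_a = torch.tensor([0.5, 0.4, 0.6])
# rho, coef = clipped_importance_weights(pi_a, mu_a, rho_hat=1.0, c_hat=1.0)
# -> rho == coef ≈ tensor([1.0, 0.5, 1.0])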