def main():
    # try: parse_cmd_args()
    sess = tf.Session()
    K.set_session(sess)
    db = Database()
    env = Environment(db, argus)
    actor_critic = ActorCritic(env, sess,
                               learning_rate=argus['learning_rate'],
                               train_min_size=argus['train_min_size'],
                               size_mem=argus['maxlen_mem'],
                               size_predict_mem=argus['maxlen_predict_mem'])

    num_trials = argus['num_trial']  # ?
    # trial_len = 500  # ?
    # ntp
    env.preheat()

    # First iteration
    cur_state = env._get_obs()  # np.array (inner_metric + sql)
    cur_state = cur_state.reshape((1, env.state.shape[0]))
    # action = env.action_space.sample()
    action = env.fetch_action()  # np.array
    action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
    # apply the action -> to steady state -> return the reward
    new_state, reward, done, _ = env.step(action, 0, 1)
    new_state = new_state.reshape((1, env.state.shape[0]))
    reward_np = np.array([reward])
    print("0-shape-")
    print(new_state.shape)
    actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
    actor_critic.train()  # len < 32, useless
    cur_state = new_state

    for i in range(num_trials):
        # env.render()
        cur_state = cur_state.reshape((1, env.state.shape[0]))
        action, isPredicted = actor_critic.act(cur_state)
        print(action)
        action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
        # action.tolist()  # to execute
        new_state, reward, done, _ = env.step(action, isPredicted, i + 1)
        new_state = new_state.reshape((1, env.state.shape[0]))
        reward_np = np.array([reward])
        print("%d-shape-" % i)
        print(new_state.shape)
        actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
        actor_critic.train()
        cur_state = new_state


# Commented-out legacy PPO rollout-collection loop, kept for reference:
'''
early_stop = False
# init = tf.global_variables_initializer()
with tf.Session() as sess:
    writer = tf.summary.FileWriter('./log/train', sess.graph)
    sess.run(tf.global_variables_initializer())
    while not early_stop:
        log_probs, values, states, actions, rewards, masks = [], [], [], [], [], []
        # each PPO step generates actions, states and rewards
        for q in range(PPO_STEPS):
            print("PPO_steps:{}".format(q))
            action, value, norm_dist = model.act(state)
            # each state, reward, done is a list of results from each parallel environment
            next_state, reward, done, _ = env.step(action)
            if render:
                env.render()
            log_prob_ = norm_dist.log_prob(action)
            # store the transition
            log_probs.append(log_prob_)
            values.append(value)
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            masks.append(1 - done)
            state = next_state
            frame_idx += 1
'''
# Variant of the warm-up step and training loop from another entry point:
# here env.step() also returns a score, and rewards obtained from predicted
# actions are collected in predicted_rewardList.
action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
# apply the action -> to steady state -> return the reward
new_state, reward, done, score, _ = env.step(action, 0, 1)
new_state = new_state.reshape((1, env.state.shape[0]))
reward_np = np.array([reward])
print("0-shape")
print(new_state.shape)
actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
actor_critic.train()  # len < 32, useless
cur_state = new_state

predicted_rewardList = []
for epoch in range(num_trials):
    # env.render()
    cur_state = cur_state.reshape((1, env.state.shape[0]))
    action, isPredicted = actor_critic.act(cur_state)
    print(action)
    action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
    # action.tolist()  # to execute
    new_state, reward, done, score, _ = env.step(action, isPredicted, epoch + 1)
    new_state = new_state.reshape((1, env.state.shape[0]))
    if isPredicted == 1:
        predicted_rewardList.append([epoch, reward])
    reward_np = np.array([reward])
    print("%d-shape" % epoch)
    print(new_state.shape)
    actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
    actor_critic.train()
def train(self):
    self.NUM_AGENTS = 1
    # self.NUM_AGENTS = len(dict_model)
    # print("train", dict_model)
    # actor_critics = []
    # local_brains = []
    # rollouts = []
    if DEBUG:
        print(self.config)

    actor_critic = ActorCritic(self.n_in, self.n_out)
    global_brain = Brain(actor_critic, self.config)
    rollout = RolloutStorage(self.NUM_ADVANCED_STEP, self.NUM_PARALLEL,
                             self.obs_shape, self.device)

    current_obs = torch.zeros(self.NUM_PARALLEL, self.obs_shape).to(self.device)
    episode_rewards = torch.zeros([self.NUM_PARALLEL, 1])
    final_rewards = torch.zeros([self.NUM_PARALLEL, 1])
    episode = np.zeros(self.NUM_PARALLEL)

    obs = self.envs.reset()
    obs = np.array(obs)
    obs = torch.from_numpy(obs).float()
    current_obs = obs
    rollout.observations[0].copy_(current_obs)

    while True:
        # for step in range(self.NUM_ADVANCED_STEP):
        for step in range(self.max_step):
            print("step", step)
            with torch.no_grad():
                # action = actor_critic.act(rollouts.observations[step])  # decide the action here
                # one action per agent for each parallel observation
                action = torch.zeros(self.NUM_PARALLEL, self.NUM_AGENTS).long().to(self.device)
                if DEBUG:
                    print("action size", self.NUM_PARALLEL, self.NUM_AGENTS)
                # for i, (k, v) in enumerate(dict_model.items()):
                #     if k == training_target:
                #         tmp_action = v.act(current_obs)
                #         target_action = copy.deepcopy(tmp_action)
                #     else:
                #         tmp_action = v.act_greedy(current_obs)
                #     action[:, i] = tmp_action.squeeze()
                action = actor_critic.act(obs)
            if DEBUG:
                print("action", action)
            if DEBUG:
                print("action shape before step", action.shape)

            obs, reward, done, infos = self.envs.step(action)  # advance the environments by one step
            print("reward(train)", reward)
            episode_rewards += reward

            # if done, clear the history of observations
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            if DEBUG:
                print("done.shape", done.shape)
            if DEBUG:
                print("masks.shape", masks.shape)
            if DEBUG:
                print("obs.shape", obs.shape)

            with open(self.resdir + "/episode_reward.txt", "a") as f:
                for i, info in enumerate(infos):
                    if 'episode' in info:
                        f.write("{:}\t{:}\t{:}\n".format(
                            episode[i], info['env_id'], info['episode']['r']))
                        print(episode[i], info['env_id'], info['episode']['r'])
                        episode[i] += 1

            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            current_obs *= masks
            current_obs = obs  # update the observation here

            rollout.insert(current_obs, action.data, reward, masks, self.NUM_ADVANCED_STEP)

            with open(self.resdir + "/reward_log.txt", "a") as f:
                # this log is only needed when an episode ends -> to be fixed
                f.write("{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\n".format(
                    episode.mean(), step,
                    reward.max().numpy(), reward.min().numpy(), reward.mean().numpy(),
                    episode_rewards.max().numpy(), episode_rewards.min().numpy(),
                    episode_rewards.mean().numpy()))
                print(episode.mean(), step, reward.mean().numpy(),
                      episode_rewards.mean().numpy())

        with torch.no_grad():
            next_value = actor_critic.get_value(rollout.observations[-1]).detach()

        rollout.compute_returns(next_value, self.gamma)
        value_loss, action_loss, total_loss, entropy = global_brain.update(rollout)

        with open(self.resdir + "/loss_log.txt", "a") as f:
            f.write("{:}\t{:}\t{:}\t{:}\t{:}\n".format(
                episode.mean(), value_loss, action_loss, entropy, total_loss))
        print("value_loss {:.4f}\taction_loss {:.4f}\tentropy {:.4f}\ttotal_loss {:.4f}"
              .format(value_loss, action_loss, entropy, total_loss))

        rollout.after_update()

        if int(episode.mean()) + 1 > self.NUM_EPISODES:
            # print("leaving the training loop")
            break

        obs = self.envs.reset()

    if self.args.save:
        save_model(actor_critic, self.resdir + "/model")  # the best model used to be saved here (note to self)

    # print("finished training agent %s" % training_target)
    # dict_model[training_target] = actor_critic  # {}
    return actor_critic
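# --- Illustrative sketch (not from the original source) ---
# RolloutStorage is used in train() above but not defined in this excerpt. The class
# below is a minimal, assumed version of it, showing how compute_returns() presumably
# works: n-step discounted returns bootstrapped from next_value, with masks zeroing
# the bootstrap across episode boundaries. All attribute names here are assumptions.
import torch


class RolloutStorageSketch:
    def __init__(self, num_steps, num_parallel, obs_shape, device):
        self.observations = torch.zeros(num_steps + 1, num_parallel, obs_shape).to(device)
        self.rewards = torch.zeros(num_steps, num_parallel, 1).to(device)
        self.masks = torch.ones(num_steps + 1, num_parallel, 1).to(device)
        self.returns = torch.zeros(num_steps + 1, num_parallel, 1).to(device)

    def compute_returns(self, next_value, gamma):
        # Bootstrap from the value estimate of the last observation, then walk backwards,
        # discounting by gamma and cutting the bootstrap where masks mark an episode end.
        self.returns[-1] = next_value
        for step in reversed(range(self.rewards.size(0))):
            self.returns[step] = (self.rewards[step]
                                  + gamma * self.returns[step + 1] * self.masks[step + 1])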
class PPO:
    def __init__(self, state_dim, action_dim, action_std, lr, betas, gamma,
                 K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        try:
            self.policy.load_state_dict(
                torch.load('./PPO_continuous_drone.pth', map_location=device))
            self.policy_old.load_state_dict(
                torch.load('./PPO_continuous_old_drone.pth', map_location=device))
            print('Saved models loaded')
        except:
            print('New models generated')

        self.MseLoss = nn.MSELoss()

    def select_action(self, state, memory):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        return self.policy_old.act(state, memory).cpu().data.numpy().flatten()

    def update(self, memory):
        # Monte Carlo estimate of rewards:
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards),
                                       reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # Convert lists to tensors:
        old_states = torch.squeeze(torch.stack(memory.states).to(device), 1).detach()
        old_actions = torch.squeeze(torch.stack(memory.actions).to(device), 1).detach()
        old_logprobs = torch.squeeze(torch.stack(memory.logprobs), 1).to(device).detach()

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluate old actions and values:
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # Find the ratio (pi_theta / pi_theta_old):
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Find the clipped surrogate loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            loss = (-torch.min(surr1, surr2)
                    + 0.5 * self.MseLoss(state_values, rewards)
                    - 0.01 * dist_entropy)

            # Take a gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into the old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
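# --- Illustrative usage sketch (not from the original source) ---
# The PPO class above expects a memory object exposing states, actions, logprobs,
# rewards and is_terminals, and select_action() presumably appends the sampled
# state/action/logprob inside policy_old.act(). The Memory class, the "Pendulum-v1"
# environment, the classic 4-tuple gym step API and the hyperparameter values below
# are assumptions for illustration only.
import gym


class Memory:
    def __init__(self):
        self.actions, self.states, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear(self):
        self.__init__()


env = gym.make("Pendulum-v1")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

memory = Memory()
agent = PPO(state_dim, action_dim, action_std=0.5, lr=3e-4, betas=(0.9, 0.999),
            gamma=0.99, K_epochs=80, eps_clip=0.2)

update_timestep = 4000  # run a PPO update every N environment steps
timestep = 0
for episode_idx in range(10):
    state = env.reset()
    for t in range(1500):
        timestep += 1
        action = agent.select_action(state, memory)
        state, reward, done, _ = env.step(action)
        memory.rewards.append(reward)
        memory.is_terminals.append(done)
        if timestep % update_timestep == 0:
            agent.update(memory)
            memory.clear()
            timestep = 0
        if done:
            break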