Example 1
        # store the transition in the rollout buffers
        actions.append(a)
        dones.append(d)
        rewards.append(r)
        logp_ts.append(logp_t)

        o = n_o  # advance to the next observation

        if d:
            ep += 1
            # every record_score_size episodes, log the windowed mean reward
            # (the original stops this logging after 600 points)
            if ep % record_score_size == 0 and ep // record_score_size < 600:
                writer.add_scalar('data/reward', record_score / record_score_size, ep // record_score_size)
                record_score = 0
            writer.add_scalar('data/reward_per_episode', score, ep)
            print(score, ep)
            score = 0
            # reset the environment and per-episode trackers for the next episode
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # bootstrap: fetch one extra value estimate for the final observation so
    # that values holds T + 1 entries, splitting into V(s_t) and V(s_t+1)
    a, v_t, logp_t = agent.get_action([o])
    values.append(v_t[0])
    next_value = values[1:]
    value = values[:-1]
    adv, target = get_gaes(rewards, dones, value, next_value, agent.gamma, agent.lamda, False)
    value_loss, kl, ent = agent.update(states, actions, target, adv, logp_ts)

    writer.add_scalar('data/value_loss_per_rollout', value_loss, rollout)
    writer.add_scalar('data/kl_per_rollout', kl, rollout)
    writer.add_scalar('data/ent_per_rollout', ent, rollout)
    writer.add_scalar('data/reward_per_rollout', score_rollout, rollout)

    # clear the rollout buffers for the next iteration
    values, states, actions, dones, logp_ts, rewards = [], [], [], [], [], []
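All of these examples lean on a get_gaes helper that is not shown (called directly here and in Example 2, and as utils.get_gaes in Example 3). Below is a minimal sketch of what it plausibly computes, assuming the standard Generalized Advantage Estimation recurrence; the body is a reconstruction from the call sites, not the original implementation.

import numpy as np

# Hypothetical reconstruction of get_gaes from its call sites; assumes
# standard GAE: delta_t = r_t + gamma*(1-done_t)*V(s_{t+1}) - V(s_t),
# A_t = delta_t + gamma*lamda*(1-done_t)*A_{t+1}, target_t = A_t + V(s_t).
def get_gaes(rewards, dones, values, next_values, gamma, lamda, normalize):
    deltas = [r + gamma * (1 - d) * nv - v
              for r, d, nv, v in zip(rewards, dones, next_values, values)]
    gaes = list(deltas)
    # sweep backwards, cutting the accumulation at episode boundaries
    for t in reversed(range(len(deltas) - 1)):
        gaes[t] += gamma * lamda * (1 - dones[t]) * gaes[t + 1]
    adv = np.asarray(gaes, dtype=np.float32)
    target = adv + np.asarray(values, dtype=np.float32)  # critic target
    if normalize:
        adv = (adv - adv.mean()) / (adv.std() + 1e-8)
    return adv, target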
Example 2
                print(episode, score)
            score = 0

        states = next_states

    # [num_step, num_worker, state_size] -> [num_worker * num_step, state_size]:
    # after the transpose, each worker's num_step transitions are contiguous
    total_state = np.stack(total_state).transpose([1, 0, 2]).reshape([-1, state_size])
    total_next_state = np.stack(total_next_state).transpose([1, 0, 2]).reshape([-1, state_size])
    total_reward = np.stack(total_reward).transpose().reshape([-1])
    total_done = np.stack(total_done).transpose().reshape([-1])
    total_action = np.stack(total_action).transpose().reshape([-1])

    # compute GAE separately over each worker's contiguous slice of the batch
    total_target, total_adv = [], []
    for idx in range(num_worker):
        value, next_value = agent.get_value(
            total_state[idx * num_step:(idx + 1) * num_step],
            total_next_state[idx * num_step:(idx + 1) * num_step])
        adv, target = get_gaes(
            total_reward[idx * num_step:(idx + 1) * num_step],
            total_done[idx * num_step:(idx + 1) * num_step], value, next_value,
            agent.gamma, agent.lamda, normalize)
        total_target.append(target)
        total_adv.append(adv)

    agent.train_model(total_state, total_action, np.hstack(total_target),
                      np.hstack(total_adv))

    writer.add_scalar('data/reward_per_rollout',
                      sum(total_reward) / num_worker, global_update)
    saver.save(sess, 'lunarlander_a2c/model')
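In Example 2, np.stack produces buffers of shape [num_step, num_worker, state_size]; the transpose/reshape pair flattens them so that rows idx * num_step through (idx + 1) * num_step - 1 all belong to worker idx, which is exactly what the per-worker get_gaes slicing above depends on. A small self-contained check of that layout (the sizes here are illustrative, not from the original):

import numpy as np

num_step, num_worker, state_size = 4, 3, 2  # toy sizes for illustration
rollout = np.arange(num_step * num_worker * state_size).reshape(
    [num_step, num_worker, state_size])
flat = rollout.transpose([1, 0, 2]).reshape([-1, state_size])
# worker 1's num_step transitions now sit in one contiguous block of rows
assert np.array_equal(flat[1 * num_step:2 * num_step], rollout[:, 1, :])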
Example 3
        score += reward

        # store the transition in the batch buffers
        total_state.append(state)
        total_next_state.append(next_state)
        total_done.append(done)
        total_reward.append(reward)
        total_action.append(action)

        state = next_state

    # every train_size episodes, turn the buffered transitions into one batch
    if ep % train_size == 0:
        update_step += 1
        total_state = np.stack(total_state)
        total_next_state = np.stack(total_next_state)
        total_reward = np.stack(total_reward)
        total_done = np.stack(total_done)
        total_action = np.stack(total_action)

        value, next_value = agent.get_value(total_state, total_next_state)
        adv, target = utils.get_gaes(total_reward, total_done, value,
                                     next_value, agent.gamma, agent.lamda,
                                     False)

        agent.train_model(total_state, total_action, target, adv)
        print(update_step, score / train_size)
        # the original logs and checkpoints only for the first 300 updates
        if update_step < 300:
            writer.add_scalar('data/reward', score / train_size, update_step)
            saver.save(sess, 'pendulum_ppo/model')
        # clear the buffers and score accumulator for the next batch
        total_state, total_reward, total_done, total_next_state, total_action = [], [], [], [], []
        score = 0
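Examples 2 and 3 also call agent.get_value, which is likewise not shown. Given the sess/saver usage, the agent appears to be built on TensorFlow 1.x; here is a hypothetical sketch of that method, where self.sess, self.critic, and self.state are assumed attribute names rather than anything confirmed by the source.

# Hypothetical agent method; self.sess, self.critic, and self.state are
# assumed names inferred from the TF1-style sess/saver usage above.
def get_value(self, state, next_state):
    value = self.sess.run(self.critic, feed_dict={self.state: state})
    next_value = self.sess.run(self.critic, feed_dict={self.state: next_state})
    # squeeze [batch, 1] critic outputs down to 1-D arrays for get_gaes
    return value.squeeze(), next_value.squeeze()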