Example #1
import gym

# QNetwork and the MAX_* constants are assumed to be defined elsewhere in the
# project (a sketch of the assumed QNetwork interface follows this example).
# The loop is written against the classic gym step API: (state, reward, done, info).
def main():
    env = gym.make('CartPole-v1')
    agent = QNetwork(env.observation_space, env.action_space)

    frame = 0
    while frame < MAX_NUM_FRAMES:
        state = env.reset()
        for t in range(MAX_EPISODE_DURATION):
            env.render()

            action = agent.act(state)

            next_state, reward, done, info = env.step(action)

            if done:
                # CartPole hands out +1 on every step, including the terminal
                # one, so turn the final reward into a penalty manually
                reward *= -1

            frame += 1

            if done or frame == MAX_NUM_FRAMES:
                break

            state = next_state

    env.close()
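
Example #1 treats QNetwork and the two MAX_* constants as given. Below is a minimal sketch of the interface the loop expects, here assuming a small Keras MLP; the layer sizes and constant values are illustrative assumptions, and the training_step/save_model methods used in Example #2 are omitted for brevity.

import numpy as np
import tensorflow as tf

MAX_NUM_FRAMES = 10_000      # assumed value
MAX_EPISODE_DURATION = 500   # CartPole-v1 caps episodes at 500 steps anyway

class QNetwork:
    """Minimal stand-in: an MLP mapping a state to one Q-value per action."""

    def __init__(self, observation_space, action_space):
        self.model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=observation_space.shape),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(action_space.n),
        ])

    def predict(self, states):
        # batched Q-values, one row per state (used by Example #2's helpers)
        return self.model(np.asarray(states), training=False).numpy()

    def act(self, state):
        # greedy policy: argmax over the predicted Q-values
        return int(np.argmax(self.predict([state])[0]))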
Example #2
import gym
import numpy as np

# QNetwork, ReplayMemory, make_training_transformer, take_random_action, and
# the BATCH_SIZE/MAX_*/FRAMES_* constants are assumed to be defined elsewhere;
# sketches of the helpers follow this example.
def main():
    env = gym.make('CartPole-v1')
    replay_memory = ReplayMemory()
    agent = QNetwork(env.observation_space, env.action_space)
    get_training_batch = make_training_transformer(env.observation_space.shape,
                                                   agent)

    frame = 0
    acc_loss = 0
    acc_state_value = 0
    while frame < MAX_NUM_FRAMES:
        state = env.reset()
        for t in range(MAX_EPISODE_DURATION):
            if take_random_action(frame):
                action = env.action_space.sample()  # pick random action
            else:
                action = agent.act(state)

            next_state, reward, done, info = env.step(action)

            if done:
                # as in Example #1: the env doesn't penalize termination on
                # its own, so flip the final reward's sign
                reward *= -1

            experience = (state, action, reward, next_state, done)
            replay_memory.append(experience)
            frame += 1

            # sample a minibatch and turn it into (states, Q-targets);
            # ReplayMemory.sample is assumed to cope with a not-yet-full buffer
            experience_samples = replay_memory.sample(BATCH_SIZE)
            state_batch, qs_batch = get_training_batch(experience_samples)
            acc_state_value += np.mean(qs_batch)

            loss = agent.training_step(state_batch, qs_batch)
            acc_loss += loss

            if frame % FRAMES_PER_SAVE == 0:
                model_filename = f"ckpt-loss={loss:.4f}"
                agent.save_model(model_filename)

            if frame % FRAMES_PER_PRINT == 0:
                print(f"Frame: {frame}")
                avg_loss = acc_loss / FRAMES_PER_PRINT
                avg_state_value = acc_state_value / FRAMES_PER_PRINT
                print(
                    f"avg loss: {avg_loss:.4f}; avg value: {avg_state_value:.2f}"
                )
                acc_loss = 0
                acc_state_value = 0

            if done or frame == MAX_NUM_FRAMES:
                break

            state = next_state

    env.close()
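
Example #2 additionally leans on a replay buffer, an exploration schedule, and a training-batch transformer. Here is a rough sketch of what those helpers could look like, assuming the QNetwork.predict method sketched after Example #1; GAMMA, the epsilon schedule, and the buffer capacity are all illustrative assumptions, not values from the original.

import random
from collections import deque

import numpy as np

GAMMA = 0.99                   # assumed discount factor
EPSILON_START = 1.0            # assumed exploration schedule:
EPSILON_END = 0.05             # linear decay from 1.0 to 0.05
EPSILON_DECAY_FRAMES = 10_000  # over the first 10k frames

class ReplayMemory:
    """Fixed-capacity FIFO buffer of (state, action, reward, next_state, done)."""

    def __init__(self, capacity=100_000):
        self.buffer = deque(maxlen=capacity)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        # sample with replacement so early, not-yet-full buffers still work
        return random.choices(self.buffer, k=batch_size)

def take_random_action(frame):
    """Epsilon-greedy coin flip with a linearly annealed epsilon."""
    fraction = min(frame / EPSILON_DECAY_FRAMES, 1.0)
    epsilon = EPSILON_START + fraction * (EPSILON_END - EPSILON_START)
    return random.random() < epsilon

def make_training_transformer(state_shape, agent):
    """Return a closure that maps experience tuples to (states, Q-targets)."""
    # state_shape mirrors the call site in the example; unused in this sketch

    def get_training_batch(experience_samples):
        states, actions, rewards, next_states, dones = map(
            np.asarray, zip(*experience_samples))
        qs = agent.predict(states)
        next_qs = agent.predict(next_states)
        # one-step TD target: r + gamma * max_a' Q(s', a'), zeroed at episode end
        targets = rewards + GAMMA * next_qs.max(axis=1) * (1.0 - dones)
        # overwrite only the taken action's Q-value; the other actions keep
        # the network's own predictions, so their training error is zero
        qs[np.arange(len(actions)), actions] = targets
        return states, qs

    return get_training_batch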