Example #1
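
An off-policy training loop, SAC-style judging by the alpha and reward_scale
arguments: the agent acts, stores each transition, and updates its networks on
every environment step.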
import time  # assumed import; the excerpt's preamble is truncated

# The class name SAC and any leading constructor arguments (e.g. the state
# shape) are cut off in the source excerpt; SAC is assumed from the keyword
# arguments below.
agent = SAC(n_actions=n_actions,
            memory_size=memory_size,
            batch_size=batch_size,
            gamma=gamma,
            alpha=alpha,
            lr=lr,
            action_bounds=action_bounds,
            reward_scale=reward_scale)

if TRAIN:
    for episode in range(1, MAX_EPISODES + 1):
        state = env.reset()
        episode_reward = 0
        done = False
        start_time = time.time()
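        # Roll out one episode, updating the networks on every environment step.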
        while not done:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.store(state, reward, done, action, next_state)
            value_loss, q_loss, policy_loss = agent.train()
            episode_reward += reward
            state = next_state
        # Checkpoint once per episode instead of once per environment step.
        if episode % 250 == 0:
            agent.save_weights()
        log(episode, start_time, episode_reward, value_loss, q_loss,
            policy_loss, len(agent.memory))

else:
    player = Play(env, agent)
    player.evaluate()
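
Example #2

A second, separate excerpt: a PPO-style training loop, recognizable from the
clip-range schedule and brain.save_params. The loop header and the update step
that produces total_loss, entropy and ev are cut off in the source, so the
scaffolding below is a minimal sketch; the __main__ guard, MAX_ITERATIONS, and
the placement of the update are assumptions.

if __name__ == "__main__":
    # ... environment, hyperparameter and brain setup truncated in the source ...
    if TRAIN:
        for iteration in range(1, MAX_ITERATIONS + 1):
            start_time = time.time()
            # ... rollout collection and the PPO update that yields
            # total_loss, entropy and ev are truncated in the source ...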
            brain.schedule_lr()
            brain.schedule_clip_range(iteration)
            episode_reward = evaluate_policy(env_name, brain, state_shape)

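            # Exponential moving average (decay 0.99) of the evaluation return.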
            if iteration == 1:
                running_reward = episode_reward
            else:
                running_reward = 0.99 * running_reward + 0.01 * episode_reward

            if iteration % log_period == 0:
                print(f"Iter: {iteration}| "
                      f"Ep_reward: {episode_reward:.3f}| "
                      f"Running_reward: {running_reward:.3f}| "
                      f"Total_loss: {total_loss:.3f}| "
                      f"Explained variance:{ev:.3f}| "
                      f"Entropy: {entropy:.3f}| "
                      f"Iter_duration: {time.time() - start_time:.3f}| "
                      f"Lr: {brain.scheduler.get_last_lr()}| "
                      f"Clip_range:{brain.epsilon:.3f}")
                brain.save_params(iteration, running_reward)

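            # A fresh SummaryWriter per iteration works (TensorBoard merges the
            # event files in a directory), but creating it once, above the
            # loop, would avoid the file churn.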
            with SummaryWriter(env_name + "/logs") as writer:
                writer.add_scalar("running reward", running_reward, iteration)
                writer.add_scalar("episode reward", episode_reward, iteration)
                writer.add_scalar("explained variance", ev, iteration)
                writer.add_scalar("loss", total_loss, iteration)
                writer.add_scalar("entropy", entropy, iteration)
    else:
        play = Play(env_name, brain)
        play.evaluate()
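
In both excerpts a TRAIN flag switches between the learning loop and a Play
helper that runs the trained policy for evaluation.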