Example 1
                # store the transition in the replay buffer (this statement is
                # reconstructed; the method name store_experience is assumed)
                agent.store_experience(state_t, action_t, reward_t,
                                       state_t_1, terminal)
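                # debug: print the raw Q-values, the greedy action index, and
                # the index of the action actually taken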
                print(agent.tmp_q_values, np.argmax(agent.tmp_q_values),
                      agent.enable_actions.index(action_t))

                # logging counters
                frame += 1
                steps += 1

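                # accumulate loss and max Q for the per-epoch summary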
                if steps > warmup:
                    loss += agent.current_loss
                    Q_max += np.max(agent.Q_values([state_t]))

            # experience replay
            # no training while still in the warmup phase
            if steps > warmup:
                agent.backword()

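                # periodically sync the target network with the online network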
                if steps % n_update_target_network == 0:
                    agent.update_target()

            print(
                "epoch: {:03d}/{:03d} |  loss: {:.4f} | Q_max: {:.4f} | total reward: {} | steps: {}"
                .format(e, n_epochs - 1, loss / frame, Q_max / frame,
                        total_reward, steps))

    except KeyboardInterrupt:
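        # allow training to be interrupted cleanly with Ctrl-C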
        pass
    finally:
        # quit the browser
        env.driver.quit()
        # save model