Esempio n. 1
0
        # Inner DQN-style training loop for one epoch: interact with the
        # environment until a terminal state, storing and replaying every
        # transition.  Relies on state_t_1 / terminal / frame / loss / Q_max /
        # win / e / n_epochs being initialised by enclosing code that is not
        # visible in this fragment.
        while not terminal:
            state_t = state_t_1

            # execute action in environment
            action_t = agent.select_action(state_t, agent.exploration)
            env.execute_action(action_t)

            # observe environment
            state_t_1, reward_t, terminal = env.observe()

            # store experience
            agent.store_experience(state_t, action_t, reward_t, state_t_1,
                                   terminal)

            # experience replay
            agent.experience_replay()

            # for log: frame counts steps; loss and Q_max accumulate so the
            # print below can report per-step averages
            frame += 1
            loss += agent.current_loss
            Q_max += np.max(agent.Q_values(state_t))
        # NOTE(review): this win check sits OUTSIDE the while loop, so it only
        # inspects the episode's final reward — confirm this is intended and
        # not an indentation artifact of the extracted fragment.
        if reward_t >= 1:
            win += 1

        # Per-epoch summary.  NOTE(review): divides by frame — assumes at
        # least one step has been taken (frame == 0 would raise
        # ZeroDivisionError).
        print(
            "EPOCH: {:03d}/{:03d} | WIN: {:03d} | LOSS: {:.4f} | Q_MAX: {:.4f}"
            .format(e, n_epochs - 1, win, loss / frame, Q_max / frame))

    # save model once the enclosing epoch loop finishes
    agent.save_model()
Esempio n. 2
0
        # Initial observation before entering the episode loop.
        state_t_1, reward_t, terminal = env.observe()

        # Play one episode: act, observe, store, replay, and accumulate
        # logging statistics on every step.
        while not terminal:
            state_t = state_t_1

            # execute action in environment
            action_t = agent.select_action(state_t, agent.exploration)
            env.execute_action(action_t)

            # observe environment
            state_t_1, reward_t, terminal = env.observe()

            # store experience
            agent.store_experience(state_t, action_t, reward_t, state_t_1, terminal)

            # experience replay
            agent.experience_replay()

            # for log
            frame += 1
            loss += agent.current_loss
            Q_max += np.max(agent.Q_values(state_t))
            # a win is counted on any step whose reward equals 1
            if reward_t == 1:
                win += 1

        # Per-epoch summary; assumes frame > 0 (at least one step taken),
        # otherwise the divisions raise ZeroDivisionError.
        print("EPOCH: {:03d}/{:03d} | WIN: {:03d} | LOSS: {:.4f} | Q_MAX: {:.4f}".format(
            e, n_epochs - 1, win, loss / frame, Q_max / frame))

    # save model once the enclosing epoch loop finishes
    agent.save_model()
Esempio n. 3
0
            # Choose and execute an action; states are wrapped in a list
            # (a batch of one) for this agent's API.
            action_t = agent.select_action([state_t], agent.exploration)
            env.execute_action(action_t)

            # observe environment
            state_t_1, reward_t, terminal = env.observe()

            # store experience; store_experience returns a flag telling the
            # caller whether enough samples are buffered to start replaying
            start_replay = False
            start_replay = agent.store_experience([state_t], action_t, reward_t, [state_t_1], terminal)

            # experience replay — throttled: exploration is decayed on every
            # trigger, but the network is only trained on every third trigger
            if start_replay:
                do_replay_count += 1
                agent.update_exploration(e)
                if do_replay_count > 2:
                    agent.experience_replay(e)
                    do_replay_count = 0

            # refresh the target network every 500 total frames, but only
            # once replay has started
            if total_frame % 500 == 0 and start_replay:
                agent.update_target_model()

            # for log: frame counts steps in this episode, total_frame counts
            # steps across the whole run
            frame += 1
            total_frame += 1
            loss += agent.current_loss
            Q_max += np.max(agent.Q_values([state_t]))
            if reward_t == 1:
                win += 1

        # (fragment is cut off here — the body of this conditional is not
        # visible in this excerpt)
        if start_replay:
Esempio n. 4
0
            # Keep asking the agent for a move until the environment accepts
            # one ("ok") or the agent must pass.  An illegal move ("ng") is
            # punished with a large negative reward and learned from
            # immediately, then the agent retries.
            # Fixed: `while True is True:` replaced with the idiomatic
            # `while True:` (identity-testing the True singleton adds nothing);
            # Python-2-only print statements rewritten as single-argument
            # print(...) calls, which emit identical output on Python 2 and 3.
            while True:
                env.is_available()

                # Let the agent pick a move, given the board state and the
                # exploration (move-randomness) rate.
                # hand_result = env.random_play()
                action_t = agent.select_action(state_before, agent.exploration)
                hand_result = env.learning_play(action_t)

                if hand_result == "ok":
                    # Move accepted — leave the retry loop.
                    break
                elif hand_result == "ng":
                    # Illegal move: store the transition with a strongly
                    # negative reward and train on it right away so the agent
                    # learns move legality.
                    state_after = env.observe_ng(action_t)
                    reward_t = -9999
                    agent.store_experience(state_before, action_t, reward_t,
                                           state_after, env.is_playable())
                    agent.experience_replay(n_epochs)
                    n_epochs += 1
                    frame += 1
                    loss += agent.current_loss
                    Q_max += np.max(agent.Q_values(state_before))
                    print("EPOCH: {:03d} | WIN: {:03d} | LOSS: {:.4f} | Q_MAX: {:.4f}".format(
                        n_epochs, win, loss / frame, Q_max / frame))
                elif hand_result == "pass":
                    # No legal move available — leave the retry loop.
                    break
                else:
                    # Unexpected result from the environment.
                    print("Hung up")

            # Advance the opponent's move (the opponent basically plays
            # second).
            env.learning_next()

            # Process the result after each move