import random

import numpy as np

# Hyperparameters and the action table referenced below. ALPHA and GAMMA
# (0.9) are taken from this file; GREEDY_RATIO and ACTION are used but not
# defined anywhere in it, so the values below are assumptions.
ALPHA = 0.9            # Q-learning step size
GAMMA = 0.9            # discount factor
GREEDY_RATIO = 0.1     # exploration probability (assumed value)
ACTION = [0, 1, 2, 3]  # one label per movement direction (assumed)


class Agent(object):
    """ ゲームルールによらない汎用性を持たす
    action: パターンの数だけ保持
    学習アルゴリズム: Q学習
    a = getNextAction(s)
    lean(S,a,r,S_next)
    """

    def __init__(self, numAction=4):
        self.action_pattern = list(range(numAction))
        # MultiLayerPerceptron is defined elsewhere in this project; a sketch
        # of the assumed interface appears after this class.
        self.learningObj = MultiLayerPerceptron(numInput=2, numHidden=5,
                                                numOutput=numAction,
                                                activate1="tanh",
                                                activate2="sigmoid")
        self.X = []           # buffered observations (training inputs)
        self.Y = []           # buffered Q-value targets (training outputs)
        self.learnFlg = True  # False switches the agent to evaluation mode

    def displayQ(self):
        self.learningObj.displayQ()

    def setLearnFlg(self, b):
        self.learnFlg = b

    def learn(self, o, a, r, o_next):
        """Update Q-values with the Q-learning rule, using the NN as the Q-function."""
        dQs = self.learningObj.predict(o)
        qk = dQs[a]
        # The Q-learning target uses the best Q-value of the *next* observation.
        maxQ = np.max(self.learningObj.predict(o_next))
        dQs[a] = qk + ALPHA * (r + GAMMA * maxQ - qk)

        # Buffer the (observation, target Q-values) pair for the NN.
        self.X.append(np.asarray(o))
        self.Y.append(np.asarray(dQs))

        # Keep only the most recent 500 pairs (sliding window).
        if len(self.X) > 500:
            self.X.pop(0)
            self.Y.pop(0)

        err = self.learningObj.fit(np.asarray(self.X), np.asarray(self.Y),
                                   learning_rate=0.2, epochs=500)
        return err

    def getNextAction(self, o):
        # Pick the action with the largest Q-value: the NN maps the
        # observation to an array of Q-values, one per action.
        Q_t = self.learningObj.predict(o)

        best_actions = []
        max_Q = -np.inf
        for i in range(len(Q_t)):
            q = Q_t[i]
            if q > max_Q:
                max_Q = q
                best_actions = [ACTION[i]]
            elif q == max_Q:
                best_actions.append(ACTION[i])
        # Break ties at random when several actions share the maximum.
        a = np.random.choice(best_actions)

        # Evaluation mode: always act greedily.
        if not self.learnFlg:
            return a

        # Training mode: epsilon-greedy action selection.
        if GREEDY_RATIO < random.random():
            return a
        else:
            return np.random.choice(self.action_pattern)

    def getMaxQvalue(self, o):
        return np.max(self.learningObj.predict(o))

    def get_Q_values(self, o):
        return self.learningObj.predict(o)
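

# The real MultiLayerPerceptron lives elsewhere in this repository and is
# only known here through the calls the Agent makes: predict(o) returns an
# array with one Q-value per action, fit(X, Y, learning_rate, epochs) trains
# on the buffered pairs and returns an error, and displayQ() dumps internal
# state. The class below is a minimal numpy sketch of that assumed interface
# (tanh hidden layer, sigmoid outputs, squared-error backprop), not the
# project's actual implementation.
class MultiLayerPerceptron(object):
    def __init__(self, numInput, numHidden, numOutput,
                 activate1="tanh", activate2="sigmoid"):
        # Small random weights; the last column of each matrix is the bias.
        self.W1 = np.random.uniform(-0.5, 0.5, (numHidden, numInput + 1))
        self.W2 = np.random.uniform(-0.5, 0.5, (numOutput, numHidden + 1))

    def _forward(self, x):
        x = np.append(np.asarray(x, dtype=float), 1.0)  # input plus bias
        h = np.append(np.tanh(self.W1.dot(x)), 1.0)     # hidden plus bias
        y = 1.0 / (1.0 + np.exp(-self.W2.dot(h)))       # sigmoid outputs
        return x, h, y

    def predict(self, o):
        return self._forward(o)[2]

    def fit(self, X, Y, learning_rate=0.2, epochs=500):
        """Plain online backprop; returns the final epoch's summed error."""
        err = 0.0
        for _ in range(epochs):
            err = 0.0
            for x, t in zip(X, Y):
                xb, h, y = self._forward(x)
                delta2 = (y - t) * y * (1.0 - y)               # output delta
                delta1 = self.W2[:, :-1].T.dot(delta2) * (1.0 - h[:-1] ** 2)
                self.W2 -= learning_rate * np.outer(delta2, h)
                self.W1 -= learning_rate * np.outer(delta1, xb)
                err += 0.5 * np.sum((y - t) ** 2)
        return err

    def displayQ(self):
        print('W1:', self.W1)
        print('W2:', self.W2)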


# ---------------------------------------------------------------------------
# Excerpt from the enclosing training script. `Q`, `a`, `r`, `o`, `X`, `Y`,
# `mpl` (a MultiLayerPerceptron), `option`, `GOAL`, `state`, `S`, `S_next`
# and `goaled_number` come from the surrounding loop, which is not shown, so
# this fragment is not runnable on its own.
# ---------------------------------------------------------------------------

# Q-learning update of the predicted Q-values
qk = Q[a]
maxQ = np.max(Q)
Q[a] = qk + ALPHA * (r + GAMMA * maxQ - qk)
print('Qs:', Q)

# 3. Train the neural network on a sliding window of training pairs
X.append(np.array(o))
Y.append(Q)
if len(X) > 1000:
    X.pop(0)
    Y.pop(0)

mpl.fit(np.asarray(X), np.asarray(Y))

# 4. Check whether the state should be reinitialized
if option == GOAL:
    goaled_number += 1
    S = state.getInitState()
else:
    S = S_next
print('>> GOAL NUMBER :', goaled_number)  # per-iteration progress


# End-of-run summary
print('>> GOAL NUMBER :', goaled_number)
# agent.displayQ()
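

# Hypothetical usage sketch, runnable together with the MLP sketch above
# (provided the non-runnable training-script excerpt is removed or skipped).
# The start cell, the drift-right transition and the reward scheme are
# invented here purely to illustrate the Agent API.
if __name__ == '__main__':
    agent = Agent(numAction=4)
    o = (0, 0)                                # assumed start cell (row, col)
    for step in range(5):
        a = agent.getNextAction(o)
        o_next = (o[0], min(o[1] + 1, 3))     # toy transition: drift right
        r = 1.0 if o_next == (0, 3) else 0.0  # reward only at the goal cell
        agent.learn(o, a, r, o_next)
        o = o_next
    print('Q at start state:', agent.get_Q_values((0, 0)))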