コード例 #1
0
 def __init__(self, numAction=4):
     """Initialize the agent.

     numAction: number of discrete actions available (default 4).
     """
     # Candidate action indices: 0 .. numAction-1.
     self.action_paturn = range(numAction)
     # NOTE(review): numOutput is hard-coded to 4 and does not follow
     # numAction — confirm before constructing with numAction != 4.
     self.learningObj = MultiLayerPerceptron(numInput=2, numHidden=5, numOutput=4, activate1="tanh",
                                             activate2="sigmoid")
     # Replay buffers: observations (X) and target Q-vectors (Y).
     self.X = []
     self.Y = []
     # Learning-mode flag; presumably gates exploration/training elsewhere — TODO confirm.
     self.learnFlg = True
コード例 #2
0
class Agent(object):
    """Game-rule-agnostic Q-learning agent.

    Keeps one Q-value per action pattern and approximates the Q-function
    with a multi-layer perceptron.

    Typical usage:
        a = getNextAction(s)
        learn(s, a, r, s_next)
    """

    def __init__(self, numAction=4):
        # Candidate action indices: 0 .. numAction-1.
        self.action_paturn = range(numAction)
        # NOTE(review): the network output size is hard-coded to 4 and does
        # not follow numAction — confirm before using numAction != 4.
        self.learningObj = MultiLayerPerceptron(numInput=2, numHidden=5, numOutput=4, activate1="tanh",
                                                activate2="sigmoid")
        # Replay buffers: observations (X) and target Q-vectors (Y).
        self.X = []
        self.Y = []
        # When False, getNextAction always exploits (no epsilon exploration).
        self.learnFlg = True

    def displayQ(self):
        """Delegate Q-value display to the underlying learner."""
        self.learningObj.displayQ()

    def setLearnFlg(self, b):
        """Enable (True) / disable (False) exploration in getNextAction."""
        self.learnFlg = b

    def learn(self, o, a, r, o_next):
        """Update the Q-estimate for (o, a) given reward r and next observation o_next.

        Implements the Q-learning target r + GAMMA * max_a' Q(o_next, a'),
        stores the corrected Q-vector in the replay buffer, refits the
        network on the buffer, and returns the fit error.
        """
        dQs = self.learningObj.predict(o)
        qk = dQs[a]
        # BUG FIX: the Q-learning bootstrap term must be the best Q-value of
        # the *next* state. The original code took np.max over the current
        # state's Q-values and never used o_next at all.
        maxQ = np.max(self.learningObj.predict(o_next))
        dQs[a] = qk + ALPHA * (r + GAMMA * maxQ - qk)

        self.X.append(np.asarray(o))
        self.Y.append(np.asarray(dQs))

        # Keep the replay buffer bounded to the 500 most recent samples.
        if len(self.X) > 500:
            self.X.pop(0)
            self.Y.pop(0)

        err = self.learningObj.fit(np.copy(self.X), np.copy(self.Y), learning_rate=0.2, epochs=500)
        return err

    def getNextAction(self, o):
        """Epsilon-greedy action selection for observation o.

        When learnFlg is False the greedy action is always returned;
        otherwise a random action is taken with probability GREEDY_RATIO.
        """
        # Predict a Q-value per action for this observation via the NN.
        Q_t = self.learningObj.predict(o)

        # Collect every action tied for the maximum Q-value.
        best_actions = []
        max_Q = -1000000
        for i in range(len(Q_t)):
            q = Q_t[i]
            if q > max_Q:
                max_Q = q
                best_actions = [ACTION[i]]
            elif q == max_Q:
                best_actions.append(ACTION[i])
        # Break ties uniformly at random.
        a = np.random.choice(best_actions)

        # Not learning: always exploit.
        if not self.learnFlg:
            return a

        # Learning: explore with probability GREEDY_RATIO.
        # NOTE(review): the action set here is hard-coded to 4 actions,
        # matching the hard-coded numOutput=4 above.
        if GREEDY_RATIO < random.random():
            return a
        else:
            return np.random.choice([0, 1, 2, 3])

    def getMaxQvalue(self, o):
        """Return max_a Q(o, a)."""
        return np.max(self.learningObj.predict(o))

    def get_Q_values(self, o):
        """Return the full Q-value vector for observation o."""
        return self.learningObj.predict(o)
コード例 #3
0
# Constants for logging / loop control.
goaled_number = 0
MAX_ITERATE = 10
GOAL = QLearn.GOAL

agent = QLearn.Agent()
state = QLearn.State()


# Initial setup: fetch the environment's starting state.
S = state.getInitState()
print '>> State : Init State'
print S

# Standalone perceptron instance; the XOR-style experiments below are kept
# commented out and this mpl is not used in the visible part of the loop.
mpl = MultiLayerPerceptron(numInput=2, numHidden=5, numOutput=4, activate1="tanh", activate2="sigmoid")
# X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
# y = np.array([0, 1, 1, 0])
# mpl.fit(X, y)

# for x, y in zip(X, y):
#     print 'X:%s, y:%0.2f, pred:%0.2f' % (ndprint(x), y, mpl.predict(x))
# X = np.array([0,0])
# Y = np.array([0,0,0,0])
X = []
Y = []
# Main training loop.
# NOTE(review): the loop body appears to continue past the end of this
# excerpt — the remaining statements are not visible here.
for i in range(MAX_ITERATE):
    # if i % (MAX_ITERATE / 20) == 0:
    print '--------------------------'
    print 'Loop : ',i
        # agent.displayQ()