# -*- coding: utf-8 -*-
import random

import numpy as np

# NOTE: MultiLayerPerceptron and the constants ALPHA, GAMMA, GREEDY_RATIO and
# ACTION are defined elsewhere in this project; the exact import path is
# project-specific.


class Agent(object):
    """Agent kept generic, independent of the game rules.

    action: holds one entry per action pattern
    Learning algorithm: Q-learning
        a = getNextAction(s)
        learn(S, a, r, S_next)
    """

    def __init__(self, numAction=4):
        self.action_paturn = range(numAction)
        self.learningObj = MultiLayerPerceptron(numInput=2, numHidden=5,
                                                numOutput=4,
                                                activate1="tanh",
                                                activate2="sigmoid")
        self.X = []
        self.Y = []
        self.learnFlg = True

    def displayQ(self):
        self.learningObj.displayQ()

    def setLearnFlg(self, b):
        self.learnFlg = b

    def learn(self, o, a, r, o_next):
        """Learn the Q-values, using the neural network as approximator.

        Q-learning update:
            Q(o, a) <- Q(o, a) + ALPHA * (r + GAMMA * max_a' Q(o_next, a') - Q(o, a))
        """
        dQs = self.learningObj.predict(o)
        qk = dQs[a]
        # The TD target must use the best Q-value of the *next* observation;
        # the original code took the max over the current one and never used
        # o_next, which breaks the Q-learning update.
        maxQ = np.max(self.learningObj.predict(o_next))
        dQs[a] = qk + ALPHA * (r + GAMMA * maxQ - qk)
        # Keep a sliding window of the most recent 500 training pairs.
        self.X.append(np.asarray(o))
        self.Y.append(np.asarray(dQs))
        if len(self.X) > 500:
            self.X.pop(0)
            self.Y.pop(0)
        err = self.learningObj.fit(np.copy(self.X), np.copy(self.Y),
                                   learning_rate=0.2, epochs=500)
        return err

    def getNextAction(self, o):
        # Greedy selection: get the Q-values (an array) for the observation
        # from the NN and collect the actions with the maximum Q-value.
        Q_t = self.learningObj.predict(o)
        best_actions = []
        max_Q = -np.inf
        for i in range(len(Q_t)):
            q = Q_t[i]
            if q > max_Q:
                max_Q = q
                best_actions = [ACTION[i]]
            elif q == max_Q:
                best_actions.append(ACTION[i])
        # Break ties randomly when several actions share the maximum Q-value.
        a = np.random.choice(best_actions)
        # Not learning: always act greedily.
        if not self.learnFlg:
            return a
        # Learning: epsilon-greedy selection. Explore (random action) with
        # probability GREEDY_RATIO, otherwise exploit the greedy action.
        if GREEDY_RATIO < random.random():
            return a
        else:
            return np.random.choice([0, 1, 2, 3])

    def getMaxQvalue(self, o):
        return np.max(self.learningObj.predict(o))

    def get_Q_values(self, o):
        return self.learningObj.predict(o)
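# Minimal usage sketch, kept as comments: it shows how getNextAction() and
# learn() fit together in one Q-learning step. The state/reward API below
# (state.step(), the returned reward r) is assumed for illustration and is
# not defined in this file.
#
#     o = state.getInitState()            # observe the initial state
#     a = agent.getNextAction(o)          # epsilon-greedy action choice
#     o_next, r = state.step(a)           # hypothetical transition + reward
#     err = agent.learn(o, a, r, o_next)  # TD target + one round of NN fitting
#     o = o_next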
# --- Driver script: trains the Agent defined in the QLearn module above ---
import QLearn
# NOTE: MultiLayerPerceptron (and numpy as np) must also be importable here;
# the exact module path is project-specific.

# Constants for logging
goaled_number = 0
MAX_ITERATE = 10
GOAL = QLearn.GOAL

agent = QLearn.Agent()
state = QLearn.State()

# Initial setup
S = state.getInitState()
print '>> State : Init State'
print S

mpl = MultiLayerPerceptron(numInput=2, numHidden=5, numOutput=4,
                           activate1="tanh", activate2="sigmoid")

# XOR sanity check for the perceptron, kept for reference:
# X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
# y = np.array([0, 1, 1, 0])
# mpl.fit(X, y)
# for x, y in zip(X, y):
#     print 'X:%s, y:%0.2f, pred:%0.2f' % (ndprint(x), y, mpl.predict(x))
# X = np.array([0, 0])
# Y = np.array([0, 0, 0, 0])

X = []
Y = []
for i in range(MAX_ITERATE):
    # if i % (MAX_ITERATE / 20) == 0:
    print '--------------------------'
    print 'Loop : ', i
    # agent.displayQ()
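    # A full training iteration would continue roughly like the hedged
    # sketch below; nextState() and the goal-based reward of 1 are
    # hypothetical names chosen for illustration, not this project's
    # confirmed API:
    #
    #     a = agent.getNextAction(S)
    #     S_next = state.nextState(S, a)
    #     r = 1 if S_next == GOAL else 0
    #     err = agent.learn(S, a, r, S_next)
    #     S = S_next
    #     if S_next == GOAL:
    #         goaled_number += 1
    #         S = state.getInitState()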