def start(self, verbose=False):
    '''
    Play one game to completion, letting the players learn as it proceeds.

    The player stored under key ``1`` in ``self.players`` moves first and
    the turn then passes to the opponent of the mark that just moved.
    After every move the mover's ``learn`` is called with reward 0. When
    the game ends the terminal rewards are handed out with ``True`` as the
    second ``learn`` argument (presumably an end-of-episode flag -- confirm
    against the player implementation): +1 to the winner and -1 to the
    loser, or 0 to every player on a draw.

    Args:
        verbose: When true, print the board before each move and the
            final position together with the outcome.
    '''
    state = State()
    current_player_mark = 1
    # Outcome of the game: the winner's mark, or Mark(Empty()) on a draw.
    # It is only recorded here; nothing in this block returns it.
    result = None
    while True:
        current_player = self.players[current_player_mark]
        if verbose:
            print("%s" % (state.to_array()))
            # Call form instead of the Python 2 print statement: identical
            # output on Python 2 and valid syntax on Python 3.
            # NOTE(review): the end-of-game branches call state.output()
            # without printing its return value; if output() prints the
            # board itself, this line prints that return value as well.
            print(state.output())
            print("-" * 5)

        # Let the current player choose a cell and apply the move.
        index = current_player.select_index(state)
        state = state.set(index, current_player.mark)

        # A reward produced by this intermediate state could be evaluated
        # and learned from here; in tic-tac-toe the reward stays 0 until
        # the game is decided.
        current_player.learn(0)

        if state.is_win(current_player.mark):
            result = current_player.mark
            # Reward for the winner.
            current_player.learn(1, True)
            # Reward for the loser.
            self.players[result.opponent().to_int()].learn(-1, True)
            if verbose:
                print("%s" % (state.to_array()))
                print("-" * 5)
                state.output()
                print("-" * 5)
                print("%s win!!!" % (current_player.mark.to_string()))
            break
        elif state.is_draw():
            result = Mark(Empty())
            # values() iterates the same players as itervalues() on
            # Python 2 and also exists on Python 3.
            for player in self.players.values():
                player.learn(0, True)
            if verbose:
                state.output()
                print("draw.")
            break

        # Hand the turn to the other player.
        current_player_mark = current_player.mark.opponent().to_int()
arguments: 状態 報酬 事後状態から行動を行った後の状態 ''' if next_state is None: next_state_value = 0.0 else: #状態価値テーブルから価値を取得 next_state_value = self.value[next_state] # 行動価値の更新式 self.value[state] += self.step_size * (reward + next_state_value - self.value[state]) #print("value[state] %f", self.value[state]) if __name__ == '__main__': value = Value() state = State() #print value.get_value(state) #print value.get_max_action(state, Mark(Maru())) new_state = state.set(3, Mark(Maru())) new_state1 = state.set(4, Mark(Maru())) new_state2 = new_state.set(2, Mark(Batsu())) value.update(state, 10, new_state) value.update(new_state, 10, new_state2) value.update(new_state, 10, new_state2) value.update(new_state1, 10, new_state2) value.update(new_state1, 100, new_state2) #value.update(state, -3, new_state2) print value.get_max_action(state, Mark(Maru()))
状態 報酬 事後状態から行動を行った後の状態 ''' if next_state is None: next_state_value = 0.0 else: #状態価値テーブルから価値を取得 next_state_value = self.value[next_state] # 行動価値の更新式 self.value[state] += self.step_size * (reward + next_state_value - self.value[state]) #print("value[state] %f", self.value[state]) if __name__ == '__main__': value = Value() state = State() #print value.get_value(state) #print value.get_max_action(state, Mark(Maru())) new_state = state.set(3, Mark(Maru())) new_state1 = state.set(4, Mark(Maru())) new_state2 = new_state.set(2, Mark(Batsu())) value.update(state, 10, new_state) value.update(new_state, 10, new_state2) value.update(new_state, 10, new_state2) value.update(new_state1, 10, new_state2) value.update(new_state1, 100, new_state2) #value.update(state, -3, new_state2) print value.get_max_action(state, Mark(Maru()))
def start(self, verbose=False):
    '''
    Play one game to completion, letting the players learn as it proceeds.

    The player stored under key ``1`` in ``self.players`` moves first and
    the turn then passes to the opponent of the mark that just moved.
    After every move the mover's ``learn`` is called with reward 0. When
    the game ends the terminal rewards are handed out with ``True`` as the
    second ``learn`` argument (presumably an end-of-episode flag -- confirm
    against the player implementation): +1 to the winner and -1 to the
    loser, or 0 to every player on a draw.

    Args:
        verbose: When true, print the board before each move and the
            final position together with the outcome.
    '''
    state = State()
    current_player_mark = 1
    # Outcome of the game: the winner's mark, or Mark(Empty()) on a draw.
    # It is only recorded here; nothing in this block returns it.
    result = None
    while True:
        current_player = self.players[current_player_mark]
        if verbose:
            print("%s" % (state.to_array()))
            # Call form instead of the Python 2 print statement: identical
            # output on Python 2 and valid syntax on Python 3.
            # NOTE(review): the end-of-game branches call state.output()
            # without printing its return value; if output() prints the
            # board itself, this line prints that return value as well.
            print(state.output())
            print("-" * 5)

        # Let the current player choose a cell and apply the move.
        index = current_player.select_index(state)
        state = state.set(index, current_player.mark)

        # A reward produced by this intermediate state could be evaluated
        # and learned from here; in tic-tac-toe the reward stays 0 until
        # the game is decided.
        current_player.learn(0)

        if state.is_win(current_player.mark):
            result = current_player.mark
            # Reward for the winner.
            current_player.learn(1, True)
            # Reward for the loser.
            self.players[result.opponent().to_int()].learn(-1, True)
            if verbose:
                print("%s" % (state.to_array()))
                print("-" * 5)
                state.output()
                print("-" * 5)
                print("%s win!!!" % (current_player.mark.to_string()))
            break
        elif state.is_draw():
            result = Mark(Empty())
            # values() iterates the same players as itervalues() on
            # Python 2 and also exists on Python 3.
            for player in self.players.values():
                player.learn(0, True)
            if verbose:
                state.output()
                print("draw.")
            break

        # Hand the turn to the other player.
        current_player_mark = current_player.mark.opponent().to_int()