def play(model):
    # Training data
    history = []

    # Create the state
    state = State()

    while True:
        # On game end
        if state.is_done():
            break

        # Get the probability distribution over legal moves
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        # Add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy
        history.append([[state.pieces, state.enemy_pieces], policies, None])

        # Get the action
        action = np.random.choice(state.legal_actions(), p=scores)

        # Get the next state
        state = state.next(action)

    # Add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
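# For context, the history that play(model) returns is typically gathered over
# many games and pickled for training. A minimal driver sketch follows; the
# default game count, data directory, and file-naming scheme are assumptions
# and not shown in the snippets here.
import pickle
from datetime import datetime
from pathlib import Path

def self_play(model, game_count=500):
    # Accumulate [state, policy, value] triples from repeated self-play games.
    history = []
    for i in range(game_count):
        history.extend(play(model))
        print('\rSelfPlay {}/{}'.format(i + 1, game_count), end='')
    print('')
    # Save the training data under a timestamped file name (assumed layout).
    Path('data').mkdir(exist_ok=True)
    path = 'data/{}.history'.format(datetime.now().strftime('%Y%m%d%H%M%S'))
    with open(path, 'wb') as f:
        pickle.dump(history, f)
    return history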
def play(model):
    history = []
    state = State()

    while True:
        if state.is_done():
            break

        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy
        history.append([state.pieces_array(), policies, None])

        action = np.random.choice(state.legal_actions(), p=scores)
        state = state.next(action)

    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
# Assumes: from itertools import cycle; take() as in more_itertools.take.
def play(model):
    states = []
    ys = [[], None]
    state = State()

    while True:
        if state.end:
            break

        scores = pv_mcts_scores(model, MCTS_EVALUATE_COUNT, state)

        policies = [0] * 9
        for action, policy in zip(state.legal_actions, boltzman(scores, 1.0)):
            policies[action] = policy

        states.append(state)
        ys[0].append(policies)

        state = state.next(
            np.random.choice(state.legal_actions, p=boltzman(scores, TEMPERATURE)))

    value = first_player_value(state)
    ys[1] = tuple(take(len(ys[0]), cycle((value, -value))))
    return states, ys
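# The boltzman() helper used above is not shown in these snippets. A common
# definition raises each score to 1/temperature and renormalizes; this is a
# sketch under that assumption:
def boltzman(xs, temperature):
    # Sharpen (temperature < 1) or flatten (temperature > 1) the distribution.
    xs = [x ** (1 / temperature) for x in xs]
    return [x / sum(xs) for x in xs]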
def play(model):
    # Training data
    history = []

    # Create the state
    state = State()

    while True:
        # On game end
        if state.is_done():
            break

        # Get the probability distribution over legal moves
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        # Add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy
        history.append([state.pieces_array(), policies, None])

        # Get the action
        action = np.random.choice(state.legal_actions(), p=scores)

        # Get the next state
        state = state.next(action)

    # Add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
def play(model):
    # Training data
    history = []

    # Create the state
    state = State()

    while True:
        # On game end
        if state.is_done():
            break

        # Get the probability distribution over legal moves
        scores, values = pv_mcts_scores(model, state, SP_TEMPERATURE)

        # Add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy

        # Get the action
        action = np.random.choice(state.legal_actions(), p=scores)

        # state, policy, value, search result (values), chosen move,
        # and the positions that follow
        history.append([[state.pieces, state.enemy_pieces], policies, None,
                        values, action, None])

        # Get the next state
        state = state.next(action)

    # Add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value

    # Keep the last position's information
    last_state = history[-1][0]
    last_policy = [0] * DN_OUTPUT_SIZE
    v0 = history[0][2]
    v1 = history[1][2]
    # For each position, record the next three plies, padding past the end
    # of the game with the last position and a random action.
    for i in range(len(history)):
        rp = []
        for inc in range(3):
            index = i + inc
            if index < len(history):
                rp.append(history[i + inc])
            else:
                v = v0 if ((i + inc) % 2) == 0 else v1
                a = randint(9)
                rp.append([last_state, last_policy, v, v, a, None])
        history[i][5] = rp
    return history
def play(model, using_saved_state=False, saving_ontheway_state=False):
    '''Run a single game.'''
    # Training data
    history = []

    # Create the state
    if using_saved_state:
        state = load_state()
        if not state:
            state = State()
    else:
        state = State()

    starttime = time.time()
    print('')
    while True:
        # On game end
        if state.is_done():
            endtime = time.time()
            print("first player is ", "lose" if state.is_lose() else "win")
            print("first player num:", state.piece_count(state.pieces))
            print('elapsed time', endtime - starttime)
            print(state)
            break

        # Get the probability distribution over legal moves
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        # Add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy
        history.append([[state.pieces, state.enemy_pieces], policies, None])

        # Get the action
        if len(history) % 10 == 0:
            print("state len: ", len(history))
            print(state)
        if saving_ontheway_state and len(history) == 25:
            save_state(state)
        action = np.random.choice(state.legal_actions(), p=scores)

        # Get the next state
        state = state.next(action)

    # Add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
def play(models):
    state = State()

    for model in cycle(models):
        if state.end:
            break
        state = state.next(
            np.random.choice(state.legal_actions, p=boltzman(
                pv_mcts_scores(model, MCTS_EVALUATE_COUNT, state), TEMPERATURE)))
    return first_player_point(state)
def play(model):
    history = []
    state = State()

    while True:
        if state.is_done():
            break

        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        with open('action_list.txt', 'rb') as f:
            action_list = pickle.load(f)

        # Map the MCTS scores for the legal moves onto the full action space.
        policies = np.zeros(len(action_list))
        legal_actions = state.legal_actions()
        for i in range(len(legal_actions)):
            policies[legal_actions[i]] = scores[i]

        history.append([[state.pieces, state.enemy_pieces], policies, None])

        action_num = np.random.choice(legal_actions, p=scores)
        # state.next() returns a new state, so the result must be reassigned;
        # without this the loop never advances.
        state = state.next(action_num)

    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
def play(model):
    # Training data
    history = []

    # Create the state
    state = State()

    while True:
        # On game end
        if state.is_done():
            break

        # Get the probability distribution over legal moves
        # (model, game state, temperature parameter: adds variability);
        # computes a score for each node
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        # Add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE  # number of actions: 7
        # Fill in the score for each candidate column a piece can be placed in
        for action, policy in zip(state.legal_actions(), scores):
            # Set the policy for the given column
            policies[action] = policy

        # Record the entry: ([my pieces, enemy pieces], policy, None (value))
        history.append([[state.pieces, state.enemy_pieces], policies, None])

        # Get the action
        action = np.random.choice(state.legal_actions(), p=scores)

        # Get the next state
        state = state.next(action)

    # Add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
def play(model, p_fail_count):
    history = []
    state = go.Position()

    while True:
        if state.is_game_over():
            break

        scores = pv_mcts_scores(model, state, SP_TEMPERATURE, p_fail_count)

        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(get_legal_actions(state.all_legal_moves()), scores):
            policies[action] = policy

        x = features.extract_features(state, features.AGZ_FEATURES)
        history.append([x, policies, None])

        pv_mcts_coord = None
        action = np.random.choice(get_legal_actions(state.all_legal_moves()), p=scores)
        if action == (go.N * go.N):
            pv_mcts_coord = None  # pass move
        else:
            coord_row = action // go.N
            coord_column = action % go.N
            pv_mcts_coord = (coord_row, coord_column)
        state = state.play_move(pv_mcts_coord)

    value = state.result()
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
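# Across these variants, each history entry is a [state, policy, value]
# triple, so it unpacks directly into training arrays. A sketch, assuming the
# tic-tac-toe layout of the first variants, where the state is
# [own pieces, enemy pieces] as two flat length-9 lists:
import numpy as np

def unpack_history(history):
    xs, y_policies, y_values = zip(*history)
    # Reshape to (N, 3, 3, 2): two 3x3 planes, channels last (assumed shape).
    xs = np.array(xs).reshape(len(history), 2, 3, 3).transpose(0, 2, 3, 1)
    return xs, np.array(y_policies), np.array(y_values)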