Example #1
def play(model):
    # training data
    history = []

    # create the initial state
    state = State()

    while True:
        # stop when the game is over
        if state.is_done():
            break

        # get the probability distribution over legal moves
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        # add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy
        history.append([[state.pieces, state.enemy_pieces], policies, None])

        # get the action
        action = np.random.choice(state.legal_actions(), p=scores)

        # get the next state
        state = state.next(action)

    # add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
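All of these examples backfill the game outcome with first_player_value and then flip the sign once per move, so each recorded position stores the value from the perspective of the player to move at that position. first_player_value itself is not shown; a minimal sketch, assuming the State class also exposes is_first_player() (an assumption, not visible above):

def first_player_value(ended_state):
    # +1 if the first player won, -1 if the first player lost, 0 for a draw
    if ended_state.is_lose():
        return -1 if ended_state.is_first_player() else 1
    return 0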
Example #2
def play(model):
  history = []

  state = State()

  while True:
    if state.is_done():
      break
    
    scores = pv_mcts_scores(model, state, SP_TEMPERATURE)
    policies = [0] * DN_OUTPUT_SIZE
    for action, policy in zip(state.legal_actions(), scores):
      policies[action] = policy
    history.append([state.pieces_array(), policies, None])

    action = np.random.choice(state.legal_actions(), p=scores)

    state = state.next(action)

  value = first_player_value(state)
  for i in range(len(history)):
    history[i][2] = value
    value = -value
  
  return history
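Example #2 is the same loop with the state encoded via state.pieces_array(). A driver that turns it into training data would call play(model) repeatedly and persist the accumulated history; a minimal sketch, where SP_GAME_COUNT, write_data, and the data/ path are hypothetical names rather than anything shown above:

import os
import pickle
from datetime import datetime

SP_GAME_COUNT = 500  # hypothetical: self-play games per training cycle

def write_data(history):
    # hypothetical helper: dump one cycle's (state, policy, value) records to disk
    os.makedirs('data', exist_ok=True)
    path = 'data/{}.history'.format(datetime.now().strftime('%Y%m%d%H%M%S'))
    with open(path, 'wb') as f:
        pickle.dump(history, f)

def self_play(model):
    history = []
    for _ in range(SP_GAME_COUNT):
        history.extend(play(model))  # one complete self-played game per call
    write_data(history)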
Example #3
def play(model):
    states = []
    ys = [[], None]

    state = State()

    while True:
        if state.end:
            break

        scores = pv_mcts_scores(model, MCTS_EVALUATE_COUNT, state)

        policies = [0] * 9
        for action, policy in zip(state.legal_actions, boltzman(scores, 1.0)):
            policies[action] = policy

        states.append(state)
        ys[0].append(policies)

        state = state.next(
            np.random.choice(state.legal_actions,
                             p=boltzman(scores, TEMPERATURE)))

    value = first_player_value(state)
    ys[1] = tuple(take(len(ys[0]), cycle((value, -value))))

    return states, ys
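Examples #3 and #7 push the raw MCTS scores through boltzman before sampling a move. The helper is not shown; a common Boltzmann-distribution sketch, consistent with how it is called above (a list of scores plus a temperature), would be:

def boltzman(xs, temperature):
    # sharpen (low temperature) or flatten (high temperature) the scores,
    # then renormalize them into a probability distribution
    xs = [x ** (1 / temperature) for x in xs]
    total = sum(xs)
    return [x / total for x in xs]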
Example #4
def play(model):
    # training data
    history = []

    # create the initial state
    state = State()

    while True:
        # stop when the game is over
        if state.is_done():
            break

        # get the probability distribution over legal moves
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        # add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy
        history.append([state.pieces_array(), policies, None])

        # get the action
        action = np.random.choice(state.legal_actions(), p=scores)

        # get the next state
        state = state.next(action)

    # add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
Example #5
def play(model):
    # training data
    history = []

    # create the initial state
    state = State()

    while True:
        # stop when the game is over
        if state.is_done():
            break

        # get the probability distribution over legal moves
        scores, values = pv_mcts_scores(model, state, SP_TEMPERATURE)

        # add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy

        # get the action
        action = np.random.choice(state.legal_actions(), p=scores)

        # state, policy, value, search result, selected action, and the positions that follow
        history.append([[state.pieces, state.enemy_pieces], policies, None,
                        values, action, None])

        # get the next state
        state = state.next(action)

    # add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value

    # keep information about the final position
    last_state = history[-1][0]
    last_policy = [0] * DN_OUTPUT_SIZE
    v0 = history[0][2]
    v1 = history[1][2]

    for i in range(len(history)):
        rp = []
        for inc in range(3):
            index = i + inc
            if index < len(history):
                rp.append(history[i + inc])
            else:
                v = v0 if ((i + inc) % 2) == 0 else v1
                a = randint(9)
                rp.append([last_state, last_policy, v, v, a, None])
        history[i][5] = rp

    return history
Example #6
def play(model, using_saved_state=False, saving_ontheway_state=False):
    '''
    Run a single game.
    '''

    # training data
    history = []

    # create the initial state
    if using_saved_state:
        state = load_state()
        if not state:
            state = State()
    else:
        state = State()

    starttime = time.time()
    print('')
    while True:
        # stop when the game is over
        if state.is_done():
            endtime = time.time()
            print("first player is ", "lose" if state.is_lose() else "win")
            print("first player num:", state.piece_count(state.pieces))
            print('elapsed time', endtime - starttime)
            print(state)
            break

        # get the probability distribution over legal moves
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        # add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy
        history.append([[state.pieces, state.enemy_pieces], policies, None])

        # get the action
        if len(history) % 10 == 0:
            print("state len: ", len(history))
            print(state)

        if saving_ontheway_state and len(history) == 25:
            save_state(state)
        action = np.random.choice(state.legal_actions(), p=scores)

        # get the next state
        state = state.next(action)

    # add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
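Example #6 can resume an interrupted game through load_state and checkpoint one in progress through save_state, neither of which appears above. A minimal pickle-based sketch (the file name is hypothetical):

import os
import pickle

STATE_PATH = 'ontheway_state.pkl'  # hypothetical checkpoint file

def save_state(state):
    # checkpoint the in-progress position so a later run can resume from it
    with open(STATE_PATH, 'wb') as f:
        pickle.dump(state, f)

def load_state():
    # return the saved position, or None if no checkpoint exists
    if not os.path.exists(STATE_PATH):
        return None
    with open(STATE_PATH, 'rb') as f:
        return pickle.load(f)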
Example #7
def play(models):
    state = State()

    for model in cycle(models):
        if state.end:
            break

        state = state.next(
            np.random.choice(state.legal_actions,
                             p=boltzman(
                                 pv_mcts_scores(model, MCTS_EVALUATE_COUNT,
                                                state), TEMPERATURE)))

    return first_player_point(state)
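Example #7 alternates two models move by move via cycle(models) and returns the first player's final score, so a caller typically plays many games and swaps the order. A sketch, assuming first_player_point returns 1 for a first-player win, 0.5 for a draw, and 0 for a loss (an assumption not confirmed by the code above):

def evaluate(model0, model1, game_count=10):
    # hypothetical driver: average model0's score over game_count games,
    # letting each model move first in half of them
    total = 0.0
    for i in range(game_count):
        if i % 2 == 0:
            total += play((model0, model1))
        else:
            total += 1 - play((model1, model0))
    return total / game_count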
Example #8
def play(model):
    history = []
    state = State()

    while True:
        if state.is_done():
            break

        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        with open('action_list.txt', 'rb') as f:
            action_list = pickle.load(f)

        # print('action_list:', len(action_list))

        policies = np.zeros(len(action_list))
        # for action_num, policy in zip(state.legal_actions(), scores):
        # 	policies[action_num] = policy

        # print('size check', len(policies), len(scores))

        legal_actions = state.legal_actions()

        for i in range(len(legal_actions)):
            policies[legal_actions[i]] = scores[i]
            # print(policies)
        # print('policies:', policies)
        history.append([[state.pieces, state.enemy_pieces], policies, None])

        # action_list_num = np.arange(len(action_list))
        # action_num = np.random.choice(action_list_num, p=scores)
        action_num = np.random.choice(legal_actions, p=scores)
        # print(action_num)
        state = state.next(action_num)

    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
Example #9
def play(model):
    # training data
    history = []
    # create the initial state
    state = State()
    while True:
        # stop when the game is over
        if state.is_done():
            break

        # get the probability distribution over legal moves
        # (model, game state, temperature parameter: adds randomness to move selection)
        # computes a score for each node
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        # add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE  # number of actions: 7
        # fill in the score for each candidate column where a piece can be placed
        for action, policy in zip(state.legal_actions(), scores):
            # set the policy for this action,
            # i.e. write it into the corresponding column's slot
            policies[action] = policy
        # record the entry ([my pieces, opponent's pieces], policy, None (value))
        history.append([[state.pieces, state.enemy_pieces], policies, None])

        # get the action
        action = np.random.choice(state.legal_actions(), p=scores)

        # get the next state
        state = state.next(action)

    # add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
Example #10
def play(model, p_fail_count):
    history = []

    state = go.Position()

    while True:
        if state.is_game_over():
            break
        
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE, p_fail_count)

        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(get_legal_actions(state.all_legal_moves()), scores):
            policies[action] = policy

        x = features.extract_features(state, features.AGZ_FEATURES)

        history.append([x, policies, None])

        pv_mcts_coord = None
        action = np.random.choice(get_legal_actions(state.all_legal_moves()), p=scores)
        if action == (go.N * go.N):
            pv_mcts_coord = None
        else:
            coord_row = action // go.N
            coord_column = action % go.N
            pv_mcts_coord = (coord_row, coord_column)

        state = state.play_move(pv_mcts_coord)

    value = state.result()
    
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    
    return history
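Example #10 adapts the same loop to Minigo-style Go positions. get_legal_actions is not shown; a plausible sketch that turns the 0/1 mask from Position.all_legal_moves() into a list of action indices (with index go.N * go.N standing for a pass) would be:

import numpy as np

def get_legal_actions(all_legal_moves):
    # hypothetical helper: indices of the entries whose legality mask is 1
    return np.where(np.asarray(all_legal_moves) == 1)[0]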