Code Example #1
def policy_play(agent):
    game = Game(show=False)
    while not game.termination():
        board = game.gameboard.board
        choice = agent.best_move(board)
        game.input_pos(choice[0], choice[1])
    return game.gameboard.score
Code Example #2
def td_learning(mode):
    global epsilon
    max_score = 0
    if mode != "new":
        start_episode = int(mode)
        net.load_state_dict(torch.load(network_path + "net" + mode + ".pth"))
        target_net.load_state_dict(torch.load(network_path + "net" + mode + ".pth"))
        load_train_data(mode)
        ## epsilon is already declared global above; resume its decayed value
        epsilon = epsilon - epsilon_decay * start_episode
        print("loaded the network and training data " + mode)

    else:
        start_episode = 0
    for episode in range(start_episode, nEpisode):
        ## init state
        game = Game(show=False)
        round = 0
        while not game.termination():
            ## pick an action
            possible_actions = game.gameboard.get_available_choices()
            ## choice is a flattened action index
            current_state = game.gameboard.board
            choice = greedy_policy(current_state, episode, possible_actions, net)
            choice2d = deflatten_action(choice)
            next_state, reward = game.input_pos(choice2d[0], choice2d[1])
            ## simulation
            last_state, total_reward = game_simulation(next_state)
            total_reward += reward

            if epsilon > final_epsilon and episode > observations_steps:
                epsilon -= epsilon_decay

            replay_memory.append((current_state, choice, total_reward, last_state))

            if episode > observations_steps:
                if round == 0:
                    batch_learning()
                if episode % target_model_period == 0 and game.gameboard.round_index == 1:
                    target_net.load_state_dict(net.state_dict())
                    print("Update the target net")


            if game.gameboard.score > max_score and episode > observations_steps:
                if game.gameboard.round_index == 1 and episode == observations_steps + 1:
                    print("Finish observations")
                max_score = game.gameboard.score
                print("max score is %d in episode %d" % (max_score, episode))

            ## NOTE: batch_size // batch_size is always 1, so round stays 0 and
            ## batch_learning() runs on every step; a larger period was likely intended
            round = (round + 1) % (batch_size // batch_size)

        if episode % save_model_period == 0:
            print("save model in episode %d" % (episode))
            save_net(net, episode)
            save_train_data(episode)

    print("save model in episode %d" % (nEpisode))
    save_net(net, nEpisode)
    save_train_data(nEpisode)
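
The training loop above relies on a greedy_policy helper that is not shown in these examples. Below is a minimal epsilon-greedy sketch of what it might look like, assuming possible_actions holds flat cell indices and net is a PyTorch module mapping the flattened board to one value per cell (the real helper also receives episode); all of these signatures are assumptions.

import random
import torch

def epsilon_greedy(board, possible_actions, net, eps):
    ## hypothetical sketch, not the original greedy_policy
    if random.random() < eps:
        ## explore: pick a random legal action
        return random.choice(possible_actions)
    ## exploit: pick the legal action with the highest predicted value
    with torch.no_grad():
        values = net(torch.tensor(board, dtype=torch.float32).flatten())
    return max(possible_actions, key=lambda a: values[a].item())
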
Code Example #3
def mcts_play(agent):
    game = Game(show=False)
    mcts = MCTS(game.gameboard, agent)
    while not game.termination():
        current_node = mcts.get_next_node()
        choice = current_node.state.choice
        game.input_pos(choice[0], choice[1])
    return game.gameboard.score
Code Example #4
def play(agent, show=False):
    game = Game(show=show)
    state = game.gameboard.board
    #game.gameboard.print_board()
    while not game.termination():
        choice = agent.get_action(game.gameboard.board)
        game.input_pos(choice[0], choice[1])
    return game.gameboard.score
Code Example #5
def play(mode):

    game = Game(filepath)
    load_net(mode)
    state = game.gameboard.board
    while not game.termination():
        choice = get_action(state)
        choice2d = deflatten_action(choice)
        state, reward = game.input_pos(choice2d[0], choice2d[1])
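
The deflatten_action helper used here and in Code Example #2 (and its counterpart flatten_action in Code Example #13) is not shown either. A minimal sketch, assuming the flat index is row-major over a square board of side BOARD_SIZE (a hypothetical constant):

BOARD_SIZE = 4  ## assumption: side length of the square board

def flatten_action(choice2d):
    ## (row, col) -> flat row-major index
    return choice2d[0] * BOARD_SIZE + choice2d[1]

def deflatten_action(flat_choice):
    ## flat row-major index -> (row, col)
    return flat_choice // BOARD_SIZE, flat_choice % BOARD_SIZE
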
Code Example #6
def play(agent, show=False):
    game = Game(show=show)
    state = game.gameboard.board
    #game.gameboard.print_board()
    while not game.termination():
        choice = agent.best_move(game.gameboard.board,
                                 game.gameboard.get_available_choices())
        game.input_pos(choice[0], choice[1])
    return game.gameboard.score
Code Example #7
def play(filename, number):
    ## play `number` random-player games and log each final score to a file
    with open("./output/" + filename, 'w') as fd:
        for i in range(number):
            game = Game(show=False)
            state = game.gameboard.board
            game.gameboard.print_board()
            while not game.termination():
                choice = game.random_player()
                game.input_pos(choice[0], choice[1])
            fd.write(str(game.gameboard.score) + "\n")
Code Example #8
def play():

    game = Game()
    state = game.gameboard.board
    #game.gameboard.print_board()
    while not game.termination():
        choice = get_action(state)
        choice2d = deflatten_action(choice)
        ### TODO: check whether the chosen action is actually available
        state, reward = game.input_pos(choice2d[0], choice2d[1])
    return game.gameboard.score
Code Example #9
def get_memory(replay_memory, agent, n=10):
    for _ in range(n):
        game = Game(show=False)
        state = game.gameboard.board
        #game.gameboard.print_board()
        while not game.termination():
            board = copy.deepcopy(game.gameboard.board)
            choice = agent.best_move(game.gameboard.board,
                                     game.gameboard.get_available_choices())
            next_board, reward = game.input_pos(choice[0], choice[1])
            next_board = copy.deepcopy(next_board)
            replay_memory.append((board, choice, reward, next_board))
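
The 4-tuples appended to replay_memory here, like those appended in td_learning in Code Example #2, are presumably consumed by the batch_learning() call that is not shown. A rough sketch of a minibatch TD update over such a memory, assuming flat action indices, a PyTorch network net that maps a flattened board to one value per action, a frozen target_net, and an optimizer; every name and shape below is an assumption rather than the original implementation.

import random
import torch
import torch.nn.functional as F

def batch_update(replay_memory, net, target_net, optimizer,
                 batch_size=32, gamma=0.99):
    ## hypothetical sketch, not the original batch_learning()
    if len(replay_memory) < batch_size:
        return
    batch = random.sample(replay_memory, batch_size)
    boards = torch.tensor([b for b, _, _, _ in batch], dtype=torch.float32)
    actions = torch.tensor([a for _, a, _, _ in batch], dtype=torch.int64)
    rewards = torch.tensor([r for _, _, r, _ in batch], dtype=torch.float32)
    next_boards = torch.tensor([s for _, _, _, s in batch], dtype=torch.float32)

    ## Q(s, a) for the actions that were actually taken
    q_values = net(boards.flatten(1)).gather(1, actions.unsqueeze(1)).squeeze(1)
    ## TD target: r + gamma * max_a' Q_target(s', a'); terminal masking omitted for brevity
    with torch.no_grad():
        next_q = target_net(next_boards.flatten(1)).max(1).values
    targets = rewards + gamma * next_q

    loss = F.mse_loss(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
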
Code Example #10
def play(mode, filename, number):
    ## load a trained network, then play `number` games and log each final score
    load_net(mode)
    with open("./output/" + filename, 'w') as fd:
        for i in range(number):
            game = Game(show=False)
            state = game.gameboard.board
            game.gameboard.print_board()
            while not game.termination():
                choice = get_action(state)
                choice2d = deflatten_action(choice)
                state, reward = game.input_pos(choice2d[0], choice2d[1])
            fd.write(str(game.gameboard.score) + "\n")
Code Example #11
def run_episode(agent):
    train_data = []
    game = Game(show=False)
    while not game.termination():
        board = copy.deepcopy(game.gameboard.board)
        choice = agent.greedy_policy(board,
                                     game.gameboard.get_available_choices())
        _, reward = game.input_pos(choice[0], choice[1])
        train_data.append([board, reward, choice])

    ## fold discounted future rewards back into each step's reward
    for i in reversed(range(len(train_data) - 1)):
        train_data[i][1] += GAMMA_RATE * train_data[i + 1][1]

    return train_data[0:-CUT]
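
The backward loop above turns per-move rewards into discounted returns: each reward[i] becomes reward[i] + GAMMA_RATE * reward[i+1], where reward[i+1] has already been updated. For example, with GAMMA_RATE = 0.9 and raw rewards [1, 2, 4], the stored values become [6.04, 5.6, 4] (4 stays as is, 2 + 0.9 * 4 = 5.6, and 1 + 0.9 * 5.6 = 6.04). The final train_data[0:-CUT] slice then drops the last CUT positions of the episode.
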
Code Example #12
def run_episode(agent):
    train_data = []
    game = Game(show=False)

    mcts = MCTS(game.gameboard, agent)

    while not game.termination():
        board = copy.deepcopy(game.gameboard.board)
        current_node = mcts.get_next_node()
        if current_node is None:
            break
        choice = current_node.state.get_choice()
        _, reward = game.input_pos(choice[0], choice[1])
        train_data.append([board, reward, choice])

    return train_data
Code Example #13
def run_episode():
    train_data = []
    game = Game(show=False)

    current_node = init_first_node(game.gameboard)

    while not game.termination():

        current_node, pi = monte_carlo_tree_search(current_node)

        choice = current_node.state.get_choice()
        flat_choice = flatten_action(choice)
        net_index = action_index2net_index(flat_choice)
        one_data = [deepcopy(game.gameboard.board), net_index, pi, 0]

        state, reward = game.input_pos(choice[0], choice[1])
        one_data[3] = reward
        train_data.append(one_data)

    ## fold discounted future rewards back into each step's reward
    for i in reversed(range(len(train_data) - 1)):
        train_data[i][3] += GAMMA_RATE * train_data[i + 1][3]
    return train_data
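
Each tuple stored above holds a board, the chosen action's network index, the MCTS visit distribution pi, and (after the backward pass) a discounted return, which suggests AlphaZero-style policy/value targets. A hedged sketch of how such tuples might be consumed in a training step, assuming a PyTorch network net that returns (policy_logits, value) for a flattened board, plus an optimizer; these names and output shapes are assumptions, not the code's actual training routine.

import torch
import torch.nn.functional as F

def train_step(net, optimizer, train_data):
    ## hypothetical sketch: train_data items are [board, net_index, pi, return]
    boards = torch.tensor([d[0] for d in train_data], dtype=torch.float32)
    pis = torch.tensor([d[2] for d in train_data], dtype=torch.float32)
    returns = torch.tensor([d[3] for d in train_data], dtype=torch.float32)

    policy_logits, value = net(boards.flatten(1))
    ## policy head: match the MCTS visit distribution pi
    ## (net_index is not needed when training against the full distribution)
    policy_loss = -(pis * F.log_softmax(policy_logits, dim=1)).sum(dim=1).mean()
    ## value head: regress the discounted return
    value_loss = F.mse_loss(value.squeeze(-1), returns)

    loss = policy_loss + value_loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
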
Code Example #14
def run_episode(agent):
    train_data = []
    game = Game(show=False)

    mcts = MCTS(game.gameboard, agent)

    while not game.termination():
        board = copy.deepcopy(game.gameboard.board)
        current_node = mcts.get_next_node()
        if current_node is None:
            break
        choice = current_node.state.get_choice()
        _, reward = game.input_pos(choice[0], choice[1])
        train_data.append([board, reward, choice])

    ## fold discounted future rewards back into each step's reward
    for i in reversed(range(len(train_data) - 1)):
        train_data[i][1] += GAMMA_RATE * train_data[i + 1][1]


#     for i in range(len(train_data) - 2):
#         train_data[i][1] += train_data[i+1][1] + train_data[i+2][1]

    return train_data