Example #1
class TrainPipeline:
    save_ParaFreq = 200
    MAX_EPISODES = 2000

    def __init__(self, flag_is_shown=False, flag_is_train=True):
        # training params
        self.flag_is_shown = flag_is_shown
        self.flag_is_train = flag_is_train
        self.game = Game(self.flag_is_shown, self.flag_is_train)
        self.NN = PolicyValueNet(
            (4, self.game.board_width, self.game.board_height))
        if not self.flag_is_train:
            self.NN.load_model("./paras/policy.model")
        self.mcts_player = MCTSPlayer(self.NN.propagation)

    def train(self):
        """run the training pipeline"""
        for episode in range(self.MAX_EPISODES):
            if self.flag_is_train:
                winner, play_data = self.game.start_self_play(self.mcts_player)
                self.NN.memory(play_data)
                if len(self.NN.data_buffer) > self.NN.batch_size:
                    loss = self.NN.policy_update()
                else:
                    print(
                        "Collecting data: %d%%, " %
                        (len(self.NN.data_buffer) / self.NN.batch_size * 100),
                        end="")
                # and save the model params
                if (episode + 1) % self.save_ParaFreq == 0:
                    self.NN.save_model('./paras/policy.model')
                print("episode = %d" % episode)
            else:
                self.game.start_play(self.mcts_player)
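As a usage note (not part of the original file): the pipeline above could be driven by a minimal entry point like the sketch below, assuming `Game`, `PolicyValueNet` and `MCTSPlayer` are importable from the surrounding project.

# Hypothetical entry point for the TrainPipeline defined above.
if __name__ == '__main__':
    # flag_is_train=True runs self-play training and periodically saves
    # ./paras/policy.model; flag_is_train=False loads that model and plays instead.
    pipeline = TrainPipeline(flag_is_shown=False, flag_is_train=True)
    pipeline.train()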
Example #3
def main():
    global Net
    #global epsilon
    global n_cpu
    global test_text

    port = 4001
    test_mode = False
    Net = Net_dnc

    if len(sys.argv) > 1:
        cmd = sys.argv[1]
        for c in list(cmd):
            if c == 't':
                test_mode = True
            elif c == 'T':
                test_mode = True
                test_text = True
            elif c == 'p':
                port = int(sys.argv[2])
                print("port:", port)
            #elif c == 'e':
            #    epsilon = float(sys.argv[2])
            #    print("epsilon:", epsilon)
            elif c == 'd':
                Net = Net_dnc
            elif c == 'l':
                Net = Net_lstm
            elif c == 'a':
                Net = Net_avg
            elif c == 'n':
                Net = Net_none
            elif c == '1':
                n_cpu = 1

    print("Using:", Net)

    # net = Net(len(env.symbols), len(env.actions), len(env.objects))
    net = Net(1254, 6, 36)  # hard-coded stand-ins for the sizes in the commented-out call above

    # Try to load from file
    if os.path.isfile(name):
        print("Loading from file..")
        net.load_state_dict(torch.load(name))

    if not test_mode:
        net.share_memory()
        processes = []
        for rank in range(n_cpu):
            p = mp.Process(target=train, args=(net, rank))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
    else:
        env = Game(True, port, max_steps=250)
        test(net, env, True)
Example #4
def main():
    is_test = False
    is_tutorial_world = False
    port = 4001
    if len(sys.argv) > 1:
        assert sys.argv[1] in ('master', 'fantasy', 'mastert', 'fantasyt')
        is_tutorial_world = sys.argv[1] in ('fantasy', 'fantasyt')
        is_test = sys.argv[1] in ('mastert', 'fantasyt')
    if len(sys.argv) > 2:
        port = int(sys.argv[2])

    env = Game(is_tutorial_world, port)

    net = Net(len(env.symbols), len(env.actions), len(env.objects))
    # Try to load from file
    if os.path.isfile(name):
        print("Loading from file..")
        net.load_state_dict(torch.load(name))

    if is_test:
        test(net, env, is_tutorial_world)
    else:
        train(net, env)
Example #5
def play_game(config: MuZeroConfig, network: Network) -> Game:
    game = Game.from_config(config)

    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        last_observation = game.make_image(-1)
        root.expand(game.to_play(), game.legal_actions(),
                    network.initial_inference(last_observation).numpy())
        root.add_exploration_noise(config)

        # logging.debug('Running MCTS on step {}.'.format(len(game.history)))
        # We then run a Monte Carlo Tree Search using only action sequences and the
        # model learned by the network.
        run_mcts(config, root, game.action_history(), network)
        action = root.select_action(config, len(game.history), network)
        game.apply(action)
        game.store_search_statistics(root)

    logging.info('Finished episode at step {} | cumulative reward: {}' \
        .format(len(game.obs_history), sum(game.rewards)))

    return game
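The loop above follows the self-play routine from the MuZero pseudocode. As a hedged sketch of how it is typically driven, a self-play worker repeatedly calls play_game with the latest network and hands each finished Game to a replay buffer; the SharedStorage/ReplayBuffer names below are assumptions borrowed from that pseudocode, not from this file.

def run_selfplay(config: MuZeroConfig, storage, replay_buffer):
    # Keep generating games with the freshest network and store them for training.
    while True:
        network = storage.latest_network()
        game = play_game(config, network)
        replay_buffer.save_game(game)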
Example #6
        self.canvas.delete(self.explosionAV)
        self.canvas.delete(self.explosionAH)
        self.canvas.delete(self.explosionBV)
        self.canvas.delete(self.explosionBH)
        self.canvas.delete(self.bombA)
        self.canvas.delete(self.bombB)
        self.explosionAV = None
        self.explosionAH = None
        self.explosionBV = None
        self.explosionBH = None
        self.bombA = None
        self.bombB = None


if __name__ == '__main__':
    env = Game()
    vis = Visualiser(env, 80)
    for i in range(100):
        env.step(random.randint(0, 4), random.randint(0, 4))
        vis.update_canvas(env)

    # env.step(1,1)
    # vis.update_canvas(env)
    # env.step(1,1)
    # vis.update_canvas(env)
    # env.step(2,1)
    # vis.update_canvas(env)
    # env.step(2,1)
    # vis.update_canvas(env)
    # env.step(4,1)
    # vis.update_canvas(env)
Example #7
        plt.xlabel('Episodes')
        plt.legend(('Q-Learning', 'MinMax Q-Learning'))
        plt.show()

def assign_bins(obs, bins):
    state = np.zeros(4)
    for i in range(4):
        state[i] = np.digitize(obs[i], bins[i])
    return state

def plotPolicy(player, game):
    for state in player.Q:
        print("\n=================")
        game.draw(game.P)
        # print("State value: %s" % player.V[state])
        player.policyForState(state)


if __name__ == '__main__':
    env = Game()
    # ql_p = test(True, 'saved_players/QR')
    # min_p = test_minmax(True, 'MR')
    # ql_wins, minmax_wins = ql_vs_minmax(False)
    # print(ql_wins)
    # print(minmax_wins)
    # test_rps()
    # test_DQL()
    # testB(cont=True, filenames=("actions", "actionsB"))
    run_optimal()
    # run_optimalB()
    # test_cartpole()
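A side note on assign_bins above: it discretizes a 4-dimensional continuous observation into bin indices for tabular methods. A small illustrative sketch follows; the bin ranges and the example observation are assumptions for a CartPole-style state, not values from this file.

import numpy as np

# Illustrative bin edges for a 4-dimensional CartPole-style observation.
bins = [
    np.linspace(-2.4, 2.4, 9),    # cart position
    np.linspace(-3.0, 3.0, 9),    # cart velocity
    np.linspace(-0.21, 0.21, 9),  # pole angle (radians)
    np.linspace(-3.5, 3.5, 9),    # pole angular velocity
]
obs = [0.1, -0.5, 0.02, 0.3]      # example observation
state = assign_bins(obs, bins)    # array of 4 bin indices for a Q-table lookup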
Example #8
import sys, pygame
import numpy as np
from env import Game
pygame.init()
field_size = 4
game = Game(field_size)
game.rand(2, 2)
game.field[0, 0] = 2
game.field[0, 1] = 4
game.field[0, 2] = 8
game.field[0, 3] = 16

size = width, height = 400, 400
screen = pygame.display.set_mode(size)
pygame.display.set_caption("2048")
bg = pygame.Surface(size)
bg.fill(pygame.Color('#bbada0'))
# 776e65 - font color of the numbers
indent = 5
box_size = 90



boxes = []
coords = []

for i in range(field_size ** 2):
    bx = pygame.Surface((box_size, box_size))
    bx.fill(pygame.Color("#eee4da"))
    boxes.append(bx)
Example #9
        if delta < theta:
            return delta, Value_table  # exit the loop once the evaluation has converged


def policy_improvement(env, dealer, player, Value_table):
    hit = update_value_for_one_state(env, 0, dealer, player, Value_table)
    stick = update_value_for_one_state(env, 1, dealer, player, Value_table)
    # pick whichever action has the higher estimated value
    if hit > stick:
        Policy[dealer - 1, player - 1] = 0
    else:
        Policy[dealer - 1, player - 1] = 1


if __name__ == "__main__":
    env = Game()
    initial_policy()
    result = []
    for epi in range(episode_num):
        # policy evaluation
        pe, Value_table = policy_evaluation(env, Value_table)
        print(pe, epi)
        # policy improvement
        policy_stable = True  # ends early if no state's policy gets updated
        for dealer in range(1, 11):
            for player in range(1, 22):
                old_action = Policy[dealer - 1, player - 1]
                policy_improvement(env, dealer, player, Value_table)
                if old_action != Policy[dealer - 1, player - 1]:
                    policy_stable = False
        # print(Policy)
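The globals Policy and Value_table are created elsewhere (presumably by initial_policy). Judging only from the dealer (1-10) and player (1-21) index ranges used above, a plausible initialization would be the sketch below; the real code may differ.

import numpy as np

# Hypothetical shapes inferred from the loops above: dealer 1-10, player 1-21.
Policy = np.zeros((10, 21), dtype=int)  # 0 = hit, 1 = stick
Value_table = np.zeros((10, 21))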
Example #10
def train(net, rank):
    torch.set_num_threads(1)  # also do: export MKL_NUM_THREADS=1

    net.reset()
    env = Game(True, 4000 + rank + 1, max_steps=250)

    target_net = Net(1254, 6, 36)
    target_net.load_state_dict(net.state_dict())
    target_net.reset()

    epsilon = epsilon1

    optimizer = optim.RMSprop(net.parameters(), lr=learning_rate)
    last_save = time.time()
    last_notify = time.time()
    last_sync = time.time()
    episode_number = 0
    terminal = True
    prev_value = None
    available_objects = None
    num_objects = len(env.objects)
    recent_rewards_of_episodes = []
    recent_steps_of_episodes = []

    quest1_reward_cnt = 0
    quest2_reward_cnt = 0
    quest3_reward_cnt = 0
    quest4_reward_cnt = 0
    quest1_rewards = np.zeros(100)
    quest2_rewards = np.zeros(100)
    quest3_rewards = np.zeros(100)
    quest4_rewards = np.zeros(100)

    if rank == 0:
        stats = []

    while True:
        if terminal:
            student_saw_obelisk = False
            quest1_rewards[episode_number % len(quest1_rewards)] = 0
            quest2_rewards[episode_number % len(quest2_rewards)] = 0
            quest3_rewards[episode_number % len(quest3_rewards)] = 0
            quest4_rewards[episode_number % len(quest4_rewards)] = 0
            prev_value = None
            num_steps = 0
            net.reset()
            target_net.reset()
            state, reward, terminal, available_objects = env.reset()
            sum_rewards = reward

        state = torch.LongTensor(state)
        objects_probs = net(Variable(state.unsqueeze(0)))

        _objects_probs = objects_probs.data.numpy()

        # Choose an action (epsilon-greedy over the available objects)
        if random.random() < epsilon:
            if available_objects is None:
                objects = list(enumerate(env.objects))
            else:
                objects = [
                    _ for _ in list(enumerate(env.objects))
                    if _[0] in available_objects
                ]

            _object = random.choice(objects)[0]
        else:
            if available_objects is not None:
                mask = np.zeros(num_objects)
                for e in available_objects:
                    mask[e] = 1
                _objects_probs = objects_probs.data.numpy() * mask
                _objects_probs = _objects_probs + (_objects_probs == 0) * -1e30
            _object = int(np.argmax(_objects_probs))

        prev_value = objects_probs[0, _object]

        # step the environment and get new measurements
        state, reward, terminal, available_objects = env.step(5, _object)
        sum_rewards += reward
        num_steps += 1

        if reward > 10 - 0.0001:
            quest4_reward_cnt = quest4_reward_cnt + 1
            quest4_rewards[episode_number % len(quest4_rewards)] = 1
        elif reward > 8 - 0.0001:
            quest3_reward_cnt = quest3_reward_cnt + 1
            quest3_rewards[episode_number % len(quest3_rewards)] = 1
            if not disable_curriculum:
                if not student_saw_obelisk:
                    reward = -8
                    terminal = True
        elif reward > 7 - 0.0001:
            student_saw_obelisk = True
            quest2_reward_cnt = quest2_reward_cnt + 1
            quest2_rewards[episode_number % len(quest2_rewards)] = 1
            if not disable_curriculum:
                if np.mean(quest2_rewards) < 0.75 and random.random() < 0.9:
                    terminal = True
        elif reward > 5 - 0.0001:
            quest1_reward_cnt = quest1_reward_cnt + 1
            quest1_rewards[episode_number % len(quest1_rewards)] = 1
            if not disable_curriculum:
                if np.mean(quest1_rewards) < 0.9 and random.random() < 0.85:
                    terminal = True

        if 2 * epsilon > (epsilon1 + epsilon2):
            if np.mean(quest3_rewards) > .98:
                if np.mean(quest2_rewards) > .98:
                    if np.mean(quest1_rewards) > .98:
                        epsilon = epsilon2
                        if rank == 0:
                            notify("Epsilon is now:" + str(epsilon))

        if terminal:
            next_value = 0
        else:
            if target_q_ts is None:
                next_value = float(np.max(_objects_probs))
            else:
                state = torch.LongTensor(state)
                objects_probs = target_net(Variable(state.unsqueeze(0)))
                _objects_probs = objects_probs.data.numpy()
                if available_objects is not None:
                    mask = np.zeros(num_objects)
                    for e in available_objects:
                        mask[e] = 1
                    _objects_probs = _objects_probs * mask
                    _objects_probs = _objects_probs + (_objects_probs
                                                       == 0) * -1e30
                next_value = float(np.max(_objects_probs))

        loss = (reward + gamma * next_value - prev_value)**2

        # Update on every important step (|reward| > 4) but only on ~5% of the other steps
        if abs(reward) > 4 or random.random() < 0.05:
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            nn.utils.clip_grad_norm(net.parameters(), 1)
            optimizer.step()

        if terminal:
            recent_rewards_of_episodes.append(sum_rewards)
            recent_steps_of_episodes.append(num_steps)
            if len(recent_rewards_of_episodes) > 100:
                recent_rewards_of_episodes.pop(0)
            if len(recent_steps_of_episodes) > 100:
                recent_steps_of_episodes.pop(0)

            episode_number += 1
            if target_q_ts is not None and time.time() - last_sync > target_q_ts:
                if rank == 0:
                    print("Update target")
                target_net.load_state_dict(net.state_dict())
                last_sync = time.time()

            if rank == 0:
                stats.append({})
                stats[-1]["episode_number"] = episode_number
                stats[-1]["sum_rewards"] = sum_rewards
                stats[-1]["num_steps"] = num_steps
                stats[-1]["mean_recent_rewards_of_episodes"] = np.mean(
                    recent_rewards_of_episodes)
                stats[-1]["mean_recent_steps_of_episodes"] = np.mean(
                    recent_steps_of_episodes)
                stats[-1]["quest1_reward_cnt"] = quest1_reward_cnt
                stats[-1]["quest2_reward_cnt"] = quest2_reward_cnt
                stats[-1]["quest3_reward_cnt"] = quest3_reward_cnt
                stats[-1]["quest4_reward_cnt"] = quest4_reward_cnt
                stats[-1]["mean_quest1_rewards"] = np.mean(quest1_rewards)
                stats[-1]["mean_quest2_rewards"] = np.mean(quest2_rewards)
                stats[-1]["mean_quest3_rewards"] = np.mean(quest3_rewards)
                stats[-1]["mean_quest4_rewards"] = np.mean(quest4_rewards)

                summary = "{} {:.4} {} {:.4} {:.4} Qc: {} {} {} {} Q: {} {} {} {}".format(
                    episode_number, sum_rewards, num_steps,
                    np.mean(recent_rewards_of_episodes),
                    np.mean(recent_steps_of_episodes), quest1_reward_cnt,
                    quest2_reward_cnt, quest3_reward_cnt, quest4_reward_cnt,
                    np.mean(quest1_rewards), np.mean(quest2_rewards),
                    np.mean(quest3_rewards), np.mean(quest4_rewards))
                print(summary)

                if save_every is not None:
                    if time.time() - last_save > save_every:
                        print("Saving..")
                        torch.save(net.state_dict(), name)
                        with open(name_stats, "wb") as _fh:
                            pickle.dump(stats, _fh)
                        last_save = time.time()

                if notify_every is not None:
                    if time.time() - last_notify > notify_every:
                        print("Notify..")
                        notify(summary)
                        last_notify = time.time()

                if max_episodes is not None and episode_number == max_episodes:
                    torch.save(net.state_dict(), name)
                    with open(name_stats, "wb") as _fh:
                        pickle.dump(stats, _fh)
                    notify(summary)
                    notify("Done.")
                    print("Done.")
                    sys.exit()
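For reference, the expression built at loss = (reward + gamma * next_value - prev_value)**2 is a plain one-step TD / Q-learning error, isolated below with the same variable meanings as in train(); nothing new is introduced. Only prev_value carries gradients, so backpropagation updates the online network towards the detached target value.

def td_error(reward, gamma, next_value, prev_value):
    # One-step temporal-difference error:
    # r + gamma * max_a' Q_target(s', a') - Q(s, a)
    return reward + gamma * next_value - prev_value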