Example #1
    def __init__(self,
                 actions,
                 calculate_reward,
                 get_legal_actions,
                 transition,
                 version=0,
                 load_model=True,
                 load_memories=False,
                 best=False,
                 trainer=True,
                 memories=None):
        create_folders()

        # Use a fresh list when no memories are passed in; a mutable
        # default argument would be shared across all instances.
        self.memories = memories if memories is not None else []

        self.load_model = load_model
        self.load_memories = load_memories

        self.actions = actions
        self.get_legal_actions = get_legal_actions
        self.calculate_reward = calculate_reward
        self.transition = transition

        self.best = best

        self.io = IOStream("checkpoints/run.log")

        self.cuda = False
        self.models = setup_models(self.io, load_model, self.cuda, trainer)
        self.optims = setup_optims(self.models, self.cuda)
        self.version = version

        if not best:
            if load_memories and version != "best" and not memories:
                print("Loading Memories...")
                try:
                    with open("checkpoints/memories.p", "rb") as f:
                        self.memories = pickle.load(f)
                except FileNotFoundError:
                    print("Memories not found, making new memories.")

            print("Loading History...")
            try:
                self.history = pickle.load(open("checkpoints/history.p", "rb"))
            except FileNotFoundError:
                print("Loss history not found, starting new history.")
                self.history = {
                    "readout": [],
                    "policy": [],
                    "value": [],
                    "total": []
                }

            self.best_net = MCTSnet(self.actions,
                                    self.calculate_reward,
                                    self.get_legal_actions,
                                    self.transition,
                                    self.version,
                                    self.load_model,
                                    self.load_memories,
                                    best=True,
                                    trainer=False)
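
A minimal usage sketch for this constructor, assuming a 3x3 board game; the callback bodies below are hypothetical stand-ins for the game logic the surrounding repo would supply, not confirmed implementations.

import numpy as np

def calculate_reward(state):
    # Stub: assumed contract is +1 / -1 / 0 once the game has ended.
    ...

def get_legal_actions(state):
    # Empty cells are legal moves.
    return [i for i in range(9) if state.flat[i] == 0]

def transition(state, action, player):
    # Return the successor state without mutating the input.
    nxt = np.copy(state)
    nxt.flat[action] = 1 if player == 0 else -1
    return nxt

net = MCTSnet(actions=list(range(9)),
              calculate_reward=calculate_reward,
              get_legal_actions=get_legal_actions,
              transition=transition,
              load_model=True,
              load_memories=True)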
Example #2
    def load_training_model(self):
        self.models = setup_models(self.io,
                                   self.load_model,
                                   self.cuda,
                                   trainer=True)
        self.optims = setup_optims(self.models, self.cuda)
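
Because setup_optims is re-run here, reloading also recreates the optimizers, discarding any accumulated state such as momentum. A usage sketch, assuming net is an MCTSnet instance with a freshly saved checkpoint:

# Rebuild self.models / self.optims from the current checkpoint.
net.load_training_model()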
Example #3
    def self_play(self,
                  root_state,
                  curr_player=0,
                  save_model=True,
                  T=config.TAU,
                  record_memories=True):
        # Consider separating network evaluation from the self-play games:
        # evaluation should use deterministic play, while self-play wants a
        # stochastic policy, since it visits more states and is more robust.
        # The best model, however, should be selected by deterministic
        # strength, because that is ultimately what we want.
        eval_mode(self.models)

        root_state = np.array(root_state, dtype="float32")
        joint_state = [np.copy(root_state), np.copy(root_state)]
        results = dict()
        results["player_one"] = 0
        results["player_two"] = 0
        results["draw"] = 0
        np.set_printoptions(precision=3)

        for _ in tqdm(range(config.EPISODES)):
            self.do_round(results,
                          joint_state,
                          curr_player,
                          T=T,
                          record_memories=record_memories)

        # results["player_one"] = 0
        # results["player_two"] = 0
        # results["draw"] = 0
        # for _ in tqdm(range(config.EVALUATION_EPISODES)):
        #     self.do_round(results, joint_state, curr_player,
        #                   T=0, record_memories=False)
        # print("Deterministic Results: ", results)
        if T == 0:
            name = "Deterministic"
        else:
            name = "Stochastic"
        print("{} Results: ".format(name), results)
        if save_model:
            if results["player_one"] > results[
                    "player_two"] * config.SCORING_THRESHOLD:
                self.save_best_model()
                self.best_net.models = setup_models(self.best_net.io,
                                                    self.best_net.load_model,
                                                    self.best_net.cuda,
                                                    trainer=False)
                self.best_net.optims = setup_optims(self.best_net.models,
                                                    self.best_net.cuda)

            elif results["player_two"] > (results["player_one"] *
                                          config.SCORING_THRESHOLD):
                # Best model won decisively: load it back into the
                # training model.
                self.models = setup_models(self.io,
                                           self.load_model,
                                           self.cuda,
                                           trainer=False)
                self.optims = setup_optims(self.models, self.cuda)

            # self.save_training_model()
        # self.memories = self.memories[-config.MAX_MEMORIES:]
        print("Num memories: {}".format(len(self.memories)))
        # Note: old memories from a weaker version may still be referenced
        # here; they are eventually overwritten, but keeping them around is
        # slightly inefficient.
        return self.memories
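
A sketch of a driver loop using only the calls shown above; empty_board is a hypothetical starting state, and any training step is omitted since it is not part of these examples.

import numpy as np

empty_board = np.zeros((3, 3), dtype="float32")

# Stochastic self-play (T=config.TAU) gathers experience and may
# promote or demote the model based on the results.
memories = net.self_play(empty_board, T=config.TAU, record_memories=True)

# Deterministic pass (T=0) to report greedy-play strength without
# touching the saved models or the replay memories.
net.self_play(empty_board, save_model=False, T=0, record_memories=False)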