def __init__(self, actions, calculate_reward, get_legal_actions, transition,
             version=0, load_model=True, load_memories=False, best=False,
             trainer=True, memories=None):
    create_folders()
    # Avoid a mutable default argument: fall back to a fresh empty list.
    self.memories = memories if memories is not None else []
    self.load_model = load_model
    self.load_memories = load_memories
    self.actions = actions
    self.get_legal_actions = get_legal_actions
    self.calculate_reward = calculate_reward
    self.transition = transition
    self.best = best
    self.io = IOStream("checkpoints/run.log")
    self.cuda = False
    self.models = setup_models(self.io, load_model, self.cuda, trainer)
    self.optims = setup_optims(self.models, self.cuda)
    self.version = version

    if not best:
        # Only load pickled memories when none were passed in explicitly.
        if load_memories and version != "best" and not self.memories:
            print("Loading Memories...")
            try:
                self.memories = pickle.load(
                    open("checkpoints/memories.p", "rb"))
            except FileNotFoundError:
                print("Memories not found, making new memories.")

        print("Loading History...")
        try:
            self.history = pickle.load(open("checkpoints/history.p", "rb"))
        except FileNotFoundError:
            print("Loss history not found, starting new history.")
            self.history = {
                "readout": [],
                "policy": [],
                "value": [],
                "total": []
            }

        # Reference "best" network that the training network is evaluated
        # against; best=True here prevents infinite recursion.
        self.best_net = MCTSnet(self.actions, self.calculate_reward,
                                self.get_legal_actions, self.transition,
                                self.version, self.load_model,
                                self.load_memories, best=True, trainer=False)
def load_training_model(self):
    self.models = setup_models(self.io, self.load_model, self.cuda, trainer=True)
    self.optims = setup_optims(self.models, self.cuda)
def self_play(self, root_state, curr_player=0, save_model=True, T=config.TAU,
              record_memories=True):
    # Consider separating network evaluation from the self-play games, since
    # evaluation should be played deterministically. Self-play wants a
    # stochastic policy so it sees more states and becomes more robust, but
    # the best model should be chosen by its deterministic play, since that
    # is ultimately what we care about.
    eval_mode(self.models)
    root_state = np.array(root_state, dtype="float32")
    joint_state = [np.copy(root_state), np.copy(root_state)]
    results = {"player_one": 0, "player_two": 0, "draw": 0}
    np.set_printoptions(precision=3)

    for _ in tqdm(range(config.EPISODES)):
        self.do_round(results, joint_state, curr_player, T=T,
                      record_memories=record_memories)

    # results["player_one"] = 0
    # results["player_two"] = 0
    # results["draw"] = 0
    # for _ in tqdm(range(config.EVALUATION_EPISODES)):
    #     self.do_round(results, joint_state, curr_player,
    #                   T=0, record_memories=False)
    # print("Deterministic Results: ", results)

    if T == 0:
        name = "Deterministic"
    else:
        name = "Stochastic"
    print("{} Results: ".format(name), results)

    if save_model:
        if results["player_one"] > results["player_two"] * config.SCORING_THRESHOLD:
            # The training network won decisively: promote it to best.
            self.save_best_model()
            self.best_net.models = setup_models(self.best_net.io,
                                                self.best_net.load_model,
                                                self.best_net.cuda,
                                                trainer=False)
            self.best_net.optims = setup_optims(self.best_net.models,
                                                self.best_net.cuda)
        elif results["player_two"] > results["player_one"] * config.SCORING_THRESHOLD:
            # The best network won decisively: reload it into the training network.
            self.models = setup_models(self.io, self.load_model, self.cuda,
                                       trainer=False)
            self.optims = setup_optims(self.models, self.cuda)
            # self.save_training_model()

    # self.memories = self.memories[-config.MAX_MEMORIES:]
    print("Num memories: {}".format(len(self.memories)))
    # Note: some of these memories may come from an older, weaker version.
    # They are eventually overwritten, but referencing them is a little inefficient.
    return self.memories
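# --- Usage sketch (illustrative only, not part of the class) ---
# A minimal sketch of how MCTSnet might be driven for a small game. The game
# callbacks (dummy_reward, dummy_legal_actions, dummy_transition), the action
# set list(range(9)), and the 3x3 root board are hypothetical placeholders,
# not part of this project; substitute the real game functions.
#
#     import numpy as np
#
#     def dummy_reward(state, player):
#         return 0                      # placeholder reward function
#
#     def dummy_legal_actions(state):
#         return list(range(9))         # placeholder: every cell playable
#
#     def dummy_transition(state, action, player):
#         return state                  # placeholder transition function
#
#     net = MCTSnet(actions=list(range(9)),
#                   calculate_reward=dummy_reward,
#                   get_legal_actions=dummy_legal_actions,
#                   transition=dummy_transition)
#     memories = net.self_play(root_state=np.zeros((3, 3), dtype="float32"))
#     net.load_training_model()         # restore trainer-mode models/optims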