# Imports needed by the training/evaluation helpers below. The project-specific
# classes (environments, Q-networks, policies, SelfPlay) and the checkpoint
# directory `save_dir` are assumed to be defined or imported elsewhere in the repo.
import datetime
import os
from os import listdir
from os.path import isfile, join

import torch


def resume_self_play():
    """Load the most recent TicTacToe checkpoint and evaluate it."""
    env = TicTacToeEnv()
    saves = [f for f in listdir(save_dir) if isfile(join(save_dir, f))]
    recent_file = max(saves)  # ISO-timestamp filenames sort lexicographically, so max() is the newest
    policy = EpsilonGreedy(QConvTicTacToe(env), 0)  # epsilon=0: evaluate the learned policy greedily
    opposing_policy = EpsilonGreedy(QConvTicTacToe(env), 1)  # epsilon=1: random opponent
    self_play = SelfPlay(policy, opposing_policy)
    policy.q.policy_net.load_state_dict(torch.load(join(save_dir, recent_file)))
    self_play.evaluate_policy(100)
def resume_self_play():
    """Connect 4 variant: load the most recent checkpoint and evaluate it."""
    env = Connect4Env()
    saves = [f for f in listdir(save_dir) if isfile(join(save_dir, f))]
    recent_file = max(saves)  # ISO-timestamp filenames sort lexicographically, so max() is the newest
    policy = EpsilonGreedy(QLinear(env), 0)
    opposing_policy = EpsilonGreedy(QLinear(env), 0)  # epsilon=0: opponent acts greedily
    self_play = SelfPlay(policy, opposing_policy)
    policy.q.policy_net.load_state_dict(torch.load(join(save_dir, recent_file)))
    self_play.evaluate_policy(100)
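# For reference, a minimal sketch (not the project's EpsilonGreedy class) of the
# epsilon-greedy rule the constructor arguments above rely on: epsilon=0 always
# exploits the highest-valued action, epsilon=1 always explores with a random
# legal move. `q_values` and `legal_actions` are hypothetical names.
import random


def epsilon_greedy_action(q_values, legal_actions, epsilon):
    if random.random() < epsilon:
        return random.choice(legal_actions)  # explore: uniform random legal move
    return max(legal_actions, key=lambda a: q_values[a])  # exploit: greedy move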
def run_training():
    """Train a TicTacToe policy via self-play, then save the network weights."""
    env = TicTacToeEnv()
    policy = EpsilonGreedy(QConvTicTacToe(env, buffer_size=5000, batch_size=64), 0.1)
    # epsilon=1: the opponent plays uniformly at random for now (pure exploration)
    # rather than greedily.
    opposing_policy = EpsilonGreedy(QConvTicTacToe(env), 1)
    self_play = SelfPlay(policy, opposing_policy, env=env)
    self_play.train_model(20000, resume=False)
    print("Training Done")
    saved_name = os.path.join(save_dir, datetime.datetime.now().isoformat())
    torch.save(self_play.policy.q.policy_net.state_dict(), saved_name)
def run_training():
    """Connect 4 variant: train an MCTS policy via self-play, then save the weights."""
    env = Connect4Env()
    # policy = EpsilonGreedy(QConvTicTacToe(env, buffer_size=5000, batch_size=64), 0.1)
    policy = MCTreeSearch(
        ConvNetConnect4(),
        Connect4Env,
        temperature_cutoff=3,
        iterations=200,
        min_memory=64,
    )
    # epsilon=1: the opponent plays uniformly at random for now (pure exploration)
    # rather than greedily.
    opposing_policy = EpsilonGreedy(QConvConnect4(env), 1)
    self_play = SelfPlay(policy, opposing_policy, env=env, swap_sides=True)
    self_play.train_model(20000, resume=False)
    print("Training Done")
    saved_name = os.path.join(save_dir, datetime.datetime.now().isoformat())
    torch.save(self_play.policy.q.policy_net.state_dict(), saved_name)
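# Hypothetical entry point (not part of the original scripts): shows how the
# helpers above might be invoked, assuming `save_dir` is defined and the
# project classes are importable.
if __name__ == "__main__":
    os.makedirs(save_dir, exist_ok=True)  # make sure the checkpoint directory exists
    run_training()  # train and write a timestamped checkpoint into save_dir
    resume_self_play()  # reload the newest checkpoint and evaluate it over 100 games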