Example #1
    def __init__(self, lmbda=0.5, c_puct=1, n_thr=15, time_limit=10):
        self.root = Node(None, 1.0)
        # Load the pretrained SL policy and value networks
        self.policy_net = network.SLPolicy()
        serializers.load_npz('./models/sl_model.npz', self.policy_net)
        self.value_net = network.Value()
        serializers.load_npz('./models/value_model.npz', self.value_net)
        # Run the networks in inference mode (no training-time behavior, no graph construction)
        chainer.config.train = False
        chainer.config.enable_backprop = False
        # Search hyperparameters
        self.lmbda = lmbda
        self.c_puct = c_puct
        self.n_thr = n_thr
        self.time_limit = time_limit
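
The lmbda argument suggests an AlphaGo-style PV-MCTS leaf evaluation that mixes the value network's prediction with a fast rollout result. A minimal sketch of that blend, assuming lmbda weights the rollout term (the exact convention inside IaGo's search is an assumption, and evaluate_leaf is a hypothetical helper name):

def evaluate_leaf(value_net_output, rollout_result, lmbda=0.5):
    # value_net_output: scalar v from the value network
    # rollout_result:   scalar z in {-1, +1} from a fast rollout to the end of the game
    # Hypothetical mixing, following the AlphaGo formulation (1 - lambda) * v + lambda * z
    return (1 - lmbda) * value_net_output + lmbda * rollout_result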
Example #2
    def __init__(self, auto):
        # Set up players: in auto mode the SL policy plays as player 1
        if auto:
            self.p1 = "IaGo(SLPolicy)"
            self.model = network.SLPolicy()
            serializers.load_npz('./models/sl_model.npz', self.model)
        else:
            self.p1 = "You"
            self.model = None
        self.p2 = "IaGo(PV-MCTS)"

        # Initialize board state to the standard Othello starting position
        self.state = np.zeros([8, 8], dtype=np.float32)
        self.state[4, 3] = 1
        self.state[3, 4] = 1
        self.state[3, 3] = 2
        self.state[4, 4] = 2
        # Initialize game variables
        self.stone_num = 4
        self.play_num = 1
        self.pass_flg = False
        self.date = datetime.now().strftime("%Y-%m-%d-%H-%M")
        self.gamelog = "IaGo \n" + self.date + "\n"
        self.mcts = MCTS.MCTS()
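
The 8x8 state array stores empty squares as 0 and the two players' stones as 1 and 2; the assignments above give the standard Othello starting position. Before being fed to the networks, such a board is split into two binary feature planes, as seen in Example #4. A minimal sketch of that encoding for a single position (the helper name encode_state is hypothetical):

import numpy as np

def encode_state(state):
    # Split the 8x8 board (0 = empty, 1 = player 1, 2 = player 2) into two
    # binary planes, one per player, matching the stacking used in Example #4.
    planes = np.stack([state == 1, state == 2], axis=0).astype(np.float32)
    return planes[np.newaxis]  # shape (1, 2, 8, 8): a batch of one position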
Example #3
def main():
    # Set the number of epochs and policy to train
    parser = argparse.ArgumentParser(description='IaGo:')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=10,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--policy',
                        '-p',
                        type=str,
                        default="sl",
                        help='Policy to train: sl or rollout')
    parser.add_argument('--gpu', '-g', type=int, default=0, help='GPU ID')
    args = parser.parse_args()

    # Model definition
    if args.policy == "rollout":
        model = network.RolloutPolicy()
    else:
        if args.policy != "sl":
            print(
                'Argument "--policy" is invalid. SLPolicy has been set by default.'
            )
        model = network.SLPolicy()
    optimizer = optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))
    cuda.get_device(args.gpu).use()

    # Load test dataset
    X_test = np.load('../policy_data/npy/states_test.npy')
    y_test = np.load('../policy_data/npy/actions_test.npy')
    X_test, y_test = transform(X_test, y_test)
    # Load train dataset
    X_train = np.load('../policy_data/npy/states.npy')
    y_train = np.load('../policy_data/npy/actions.npy')
    train_size = y_train.shape[0]
    minibatch_size = 4096  # 2**12

    # Learning loop
    for epoch in tqdm(range(args.epoch)):
        model.to_gpu(args.gpu)
        # Shuffle train dataset
        rands = np.random.choice(train_size, train_size, replace=False)
        X_train = X_train[rands, :, :]
        y_train = y_train[rands]

        # Minibatch learning
        for idx in tqdm(range(0, train_size, minibatch_size)):
            x = X_train[idx:min(idx + minibatch_size, train_size), :, :]
            y = y_train[idx:min(idx + minibatch_size, train_size)]
            x, y = transform(x, y)
            pred_train = model(x)
            loss_train = F.softmax_cross_entropy(pred_train, y)
            model.cleargrads()
            loss_train.backward()
            optimizer.update()
        # Calculate loss
        with chainer.using_config('train', False):
            with chainer.using_config('enable_backprop', False):
                pred_test = model(X_test)
        loss_test = F.softmax_cross_entropy(pred_test, y_test).data
        test_acc = F.accuracy(pred_test, y_test).data
        print('\nepoch :', epoch, '  loss :', loss_test, ' accuracy:',
              test_acc)
        # Log
        if args.policy == "rollout":
            with open("../log/rollout.txt", "a") as f:
                f.write(str(loss_test) + ", " + str(test_acc) + "\n")
        else:
            with open("../log/sl.txt", "a") as f:
                f.write(str(loss_test) + ", " + str(test_acc) + "\n")
        # Save models
        model.to_cpu()
        if args.policy == "rollout":
            serializers.save_npz('../models/rollout_model.npz', model)
            serializers.save_npz('../models/rollout_optimizer.npz', optimizer)
        else:
            serializers.save_npz('../models/sl_model.npz', model)
            serializers.save_npz('../models/sl_optimizer.npz', optimizer)
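
The transform helper used above is not defined in these excerpts. A plausible sketch, assuming it performs the same two-plane board encoding as Example #4 and casts the labels for softmax_cross_entropy (whether it also moves data to the GPU or wraps it in Variables is not visible here):

def transform(states, actions):
    # Assumed behavior: encode each 8x8 board as two binary planes (one per
    # player) and cast the move labels to int32.
    x = np.stack([states == 1, states == 2], axis=1).astype(np.float32)  # (B, 2, 8, 8)
    y = actions.astype(np.int32)
    return x, y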
Example #4
def main():
    # Set the number of sets
    parser = argparse.ArgumentParser(description='IaGo:')
    parser.add_argument('--models',
                        '-m',
                        type=int,
                        default=1,
                        help='Number of trained models')
    parser.add_argument('--set',
                        '-s',
                        type=int,
                        default=1000,
                        help='Number of game sets played to train')
    args = parser.parse_args()
    N = 32

    # Model definition
    model1 = network.SLPolicy()
    serializers.load_npz("../models/RL/model2.npz", model1)
    optimizer = optimizers.Adam()
    optimizer.setup(model1)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))
    serializers.load_npz("../models/RL/optimizers/2.npz", optimizer)
    # REINFORCE algorithm
    models = args.models
    cnt = 0
    #for set in tqdm(range(0, args.set)):
    while models <= 20:
        # Randomly choose competitor model from reinforced models
        model2 = network.SLPolicy()
        model2_path = np.random.choice(glob.glob("../models/RL/*.npz"))
        print(model2_path)
        serializers.load_npz(model2_path, model2)

        result = 0
        state_seq, action_seq, reward_seq = [], [], []
        for i in tqdm(range(2 * N)):
            game = rl_self_play.Game(model1, model2)
            if i % 2 == 1:
                # Switch head and tail
                pos = random.choice([[2, 4], [3, 5], [4, 2], [5, 3]])
                game.state[pos[0], pos[1]] = 2
            states, actions, judge = game()
            rewards = [judge] * len(states)
            state_seq += states
            action_seq += actions
            reward_seq += rewards
            if judge == 1:
                result += 1

        # Update model
        x = np.array(state_seq)
        # Encode boards as two binary planes (one per player): shape (B, 2, 8, 8)
        x = np.stack([x == 1, x == 2], axis=0).astype(np.float32)
        x = Variable(x.transpose(1, 0, 2, 3))
        y = Variable(np.array(action_seq).astype(np.int32))
        r = Variable(np.array(reward_seq).astype(np.float32))
        pred = model1(x)
        c = F.softmax_cross_entropy(pred, y, reduce="no")
        model1.cleargrads()
        loss = F.mean(c * r)
        loss.backward()
        optimizer.update()
        rate = result / (2 * N)
        print("Models:" + str(models) + ", Result:" + str(rate) + ", Loss:" +
              str(loss.data))
        with open("../log/rl.txt", "a") as f:
            f.write(str(rate) + ", \n")
        if rate > 0.5:
            cnt += 1
        if cnt > 4 * np.sqrt(models) and rate > 0.6:
            model = copy.deepcopy(model1)
            #model.to_cpu()
            serializers.save_npz("../models/RL/model" + str(models) + ".npz",
                                 model)
            serializers.save_npz(
                "../models/RL/optimizers/" + str(models) + ".npz", optimizer)
            models += 1
            cnt = 0
        if rate < 0.2:
            break
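
In the REINFORCE update above, softmax_cross_entropy(..., reduce="no") returns the per-sample value -log p(action | state); multiplying it by the reward (+1 for a win, -1 for a loss) before averaging makes gradient descent raise the probability of moves played in won games and lower it for lost ones. A minimal, self-contained sketch of that weighted loss with dummy data (the shapes and reward values are illustrative only):

import numpy as np
import chainer.functions as F
from chainer import Variable

logits = Variable(np.random.randn(4, 64).astype(np.float32))   # dummy policy outputs over 64 moves
actions = Variable(np.array([5, 12, 33, 60], dtype=np.int32))  # dummy chosen moves
rewards = Variable(np.array([1, 1, -1, 1], dtype=np.float32))  # game outcomes per sample

per_sample = F.softmax_cross_entropy(logits, actions, reduce="no")  # -log p(a|s), shape (4,)
loss = F.mean(per_sample * rewards)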