Example #1
import numpy as np
# `datasets` and `balanced_shuffle` are project-local helpers assumed to be
# in scope.

def train(snapshotroot, ensembleType, numTrees, depth, seed=0):
    xtrain, ytrain, xtest, ytest = datasets.load_madelon()

    # Labels
    ytrain = ytrain.astype(np.int32)
    ytest = ytest.astype(np.int32)

    # Split the training data into class-balanced train/validation subsets.
    xtrain, ytrain, xval, yval = balanced_shuffle(xtrain, ytrain, 1500)

    metric = "logloss"

    earlyStop = max(1, int(0.1 * numTrees))

    # Note: use_label_encoder and the eval_metric/early_stopping_rounds
    # fit() keywords below target the pre-2.0 XGBoost sklearn API; in
    # XGBoost >= 2.0 they move to the constructor (or are removed).
    clf = ensembleType(max_depth=depth,
                       use_label_encoder=False,
                       tree_method="exact",
                       n_estimators=numTrees,
                       random_state=seed)
    clf.fit(xtrain,
            ytrain,
            eval_set=[(xtrain, ytrain), (xval, yval)],
            eval_metric=metric,
            verbose=False,
            early_stopping_rounds=earlyStop)

    print(
        f"best iteration = {clf.best_iteration}, best_score = {clf.best_score}, best_ntree_limit = {clf.best_ntree_limit}"
    )

    results = clf.evals_result()
    ypred = clf.predict(xtest)

    acc = (ypred == ytest).mean()

    return acc, np.array(results["validation_1"][metric])
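
A minimal invocation sketch for this example. ensembleType is assumed to be xgboost.XGBClassifier (the use_label_encoder/tree_method keywords match that class); the snapshot path and hyperparameters are illustrative.

# Usage sketch (assumption: ensembleType is xgboost.XGBClassifier).
import xgboost as xgb

acc, val_curve = train("/tmp/snapshots", xgb.XGBClassifier,
                       numTrees=100, depth=6, seed=0)
print(f"test accuracy = {acc:.4f}, {val_curve.size} validation rounds logged")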
Example #2
import numpy as np
# `datasets` is a project-local helper assumed to be in scope.

def train(snapshotroot, ensembleType, numTrees, depth, seed=0):
    xtrain, ytrain, xtest, ytest = datasets.load_madelon()
    
    # Labels
    ytrain = ytrain.astype(np.int32)
    ytest = ytest.astype(np.int32)
    
    # ensembleType is expected to follow the scikit-learn ensemble API
    # (e.g. RandomForestClassifier or ExtraTreesClassifier).
    clf = ensembleType(random_state=seed,
                       n_estimators=numTrees,
                       max_features="sqrt",
                       max_depth=depth)
    clf.fit(xtrain, ytrain)

    acc = clf.score(xtest, ytest)

    return acc
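
This variant works with any scikit-learn-style ensemble. A minimal sketch, assuming RandomForestClassifier (class choice and hyperparameters are illustrative; snapshotroot is unused here but kept for interface parity):

# Usage sketch (assumption: ensembleType follows the sklearn ensemble API).
from sklearn.ensemble import RandomForestClassifier

acc = train("/tmp/snapshots", RandomForestClassifier, numTrees=100, depth=None)
print(f"test accuracy = {acc:.4f}")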
Example #3
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
# `datasets`, `balanced_shuffle`, `batches`, and `Net` are project-local
# helpers assumed to be in scope (see the sketch of `batches` after this
# example).

def train(snapshotroot, device, forestType, numTrees, depth):
    xtrain, ytrain, xtest, ytest = datasets.load_madelon()

    # Labels
    ytrain = ytrain.astype(np.int32)
    ytest = ytest.astype(np.int32)

    xtrain, ytrain, xval, yval = balanced_shuffle(xtrain, ytrain, 1500)

    net = Net(forestType, numTrees, depth).to(device)
    criterion = nn.CrossEntropyLoss().to(device)

    # Transfer this data to the device
    xtrain = torch.from_numpy(xtrain).type(torch.float32).to(device)
    ytrain = torch.from_numpy(ytrain).type(torch.long).to(device)
    xval = torch.from_numpy(xval).type(torch.float32).to(device)
    yval = torch.from_numpy(yval).type(torch.long).to(device)
    xtest = torch.from_numpy(xtest).type(torch.float32).to(device)
    ytest = torch.from_numpy(ytest).type(torch.long).to(device)

    # Small initial learning rate; raised to 1e-3 below once the feature
    # selection has annealed down to targetKeep.
    optimizer = optim.Adam(net.parameters(), lr=1e-4)

    numEpochs = 200
    batchSize = 25

    indices = list(range(xtrain.shape[0]))

    bestEpoch = numEpochs - 1
    bestLoss = float("inf")

    # Feature-selection annealing: mu controls how fast the number of kept
    # features decays from 500 down to targetKeep (see the schedule below).
    mu = 10.0
    targetKeep = 10

    valLosses = np.zeros([numEpochs])
    ratesUpdated = False

    for epoch in range(numEpochs):
        random.shuffle(indices)

        xtrain = xtrain[indices, :]
        ytrain = ytrain[indices]

        # Hyperbolic decay: numKeep starts at 500 (epoch 0) and reaches
        # targetKeep by epoch numEpochs/2; max(0, ...) holds it there after.
        numKeep = targetKeep + (500 - targetKeep) * max(
            0.0, (numEpochs - 2.0 * epoch) / (2.0 * epoch * mu + numEpochs))
        numKeep = int(numKeep)

        print(f"Info: Epoch = {epoch}, numKeep = {numKeep}", flush=True)
        net.features.group_select(numKeep)

        if numKeep <= targetKeep and not ratesUpdated:
            ratesUpdated = True
            print("Info: Updating learning rates...", flush=True)
            for g in optimizer.param_groups:
                g['lr'] = 1e-3

        runningLoss = 0.0
        count = 0
        for xbatch, ybatch in batches(xtrain, ytrain, batchSize):
            optimizer.zero_grad()

            outputs = net(xbatch)
            loss = criterion(outputs, ybatch)

            loss.backward()

            optimizer.step()

            # .item() detaches the scalar loss so the autograd graph is not
            # retained across iterations.
            runningLoss += loss.item()
            count += 1

        meanLoss = runningLoss / count

        snapshotFile = os.path.join(snapshotroot, f"epoch_{epoch}")
        torch.save(net.state_dict(), snapshotFile)

        runningLoss = 0.0
        count = 0

        with torch.no_grad():
            net.train(False)
            # The validation set is small enough to evaluate in one batch.
            for xbatch, ybatch in zip([xval], [yval]):
                outputs = net(xbatch)
                loss = criterion(outputs, ybatch)

                runningLoss += loss.item()
                count += 1

            net.train(True)

        valLoss = runningLoss / count

        if valLoss < bestLoss:
            bestLoss = valLoss
            bestEpoch = epoch

        valLosses[epoch] = valLoss

        print(
            f"Info: epoch = {epoch}, loss = {meanLoss}, validation loss = {valLoss}",
            flush=True)

    snapshotFile = os.path.join(snapshotroot, f"epoch_{bestEpoch}")

    net = Net(forestType, numTrees, depth)
    net.load_state_dict(torch.load(snapshotFile, map_location="cpu"))
    net = net.to(device)

    totalCorrect = 0
    count = 0

    with torch.no_grad():
        net.train(False)
        # The test set is likewise evaluated in a single batch.
        for xbatch, ybatch in zip([xtest], [ytest]):
            outputs = net(xbatch)
            outputs = torch.argmax(outputs, dim=1)

            tmpCorrect = torch.sum(outputs == ybatch)

            totalCorrect += tmpCorrect.item()
            count += xbatch.shape[0]
            count += xbatch.shape[0]

    accuracy = float(totalCorrect) / float(count)
    print(
        f"Info: Best epoch = {bestEpoch}, test accuracy = {accuracy}, misclassification rate = {1.0 - accuracy}",
        flush=True)

    return accuracy, valLosses
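
The loop above depends on a project-local batches helper that is not shown. A minimal sketch of what the call sites imply, assuming it simply yields aligned minibatches (the body is an assumption, not the project's actual implementation):

# Sketch of the assumed `batches` helper: yields successive minibatches of
# at most batchSize rows from a pair of aligned tensors.
def batches(x, y, batchSize):
    for start in range(0, x.shape[0], batchSize):
        yield x[start:start + batchSize], y[start:start + batchSize]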