コード例 #1
0
def xgBoost(df,
            test_split=0.2,
            sampler=None,
            param_dist=None,
            nreps=1,
            pca=None,
            plotImportance=False):
    """Train and score an XGBoost classifier over repeated train/test splits.

    Every repetition draws a new split of ``df``, resamples the training
    partition, optionally projects both partitions with ``pca`` and calls
    ``models.trainAndTestXGBoost``.

    Args:
        df: Data frame handed to ``models.testTrainSplitDataframe``.
        test_split: Forwarded as ``test_size`` to the split helper.
        sampler: Forwarded as ``resampler`` to ``models.resampleData``.
        param_dist: XGBoost parameters.  When None, a default set is built
            for each repetition, with ``base_score`` set to the positive
            rate of that repetition's resampled labels.
        nreps: Number of repetitions to average over; must be >= 1.
        pca: Optional fitted transformer exposing ``transform``; applied to
            the resampled training features and to the test features.
        plotImportance: When true, plot the feature importance of the
            classifier trained in the last repetition.

    Returns:
        Tuple ``(avg_auc, avg_acc)`` averaged over all repetitions.

    Raises:
        ValueError: If ``nreps`` is smaller than 1.
    """
    if nreps < 1:
        # Previously this surfaced as a ZeroDivisionError (or as
        # plot_importance(None)) only after the loop had been skipped.
        raise ValueError(
            'nreps must be a positive integer, got {!r}'.format(nreps))

    total_auc = 0
    total_acc = 0
    clf = None
    for _ in range(nreps):
        testDf, trainDf = models.testTrainSplitDataframe(df,
                                                         test_size=test_split)
        trainX, trainY = models.dataFrameCleanGetLabels(trainDf)
        X_smt, y_smt = models.resampleData(trainX, trainY, resampler=sampler)

        if param_dist is None:
            # Build the defaults per repetition instead of rebinding
            # ``param_dist``: base_score depends on this repetition's labels
            # and must not be frozen at the first repetition's value.
            rep_params = dict(
                max_depth=7,
                learning_rate=0.1,
                n_estimators=40,
                gamma=10,
                scale_pos_weight=1,
                base_score=sum(y_smt) / len(y_smt),
                subsample=1,
                objective='binary:logistic')
        else:
            rep_params = param_dist

        print('XGBoost training class distribution:', Counter(y_smt))

        testX, testY = models.dataFrameCleanGetLabels(testDf)

        if pca:
            X_smt = pca.transform(X_smt)
            testX = pca.transform(testX)

        auc, acc, clf = models.trainAndTestXGBoost(X_smt,
                                                   y_smt,
                                                   testX,
                                                   testY,
                                                   rep_params,
                                                   verbose=True)
        total_auc += auc
        total_acc += acc

    avg_auc = total_auc / nreps
    avg_acc = total_acc / nreps

    if plotImportance:
        # Only the classifier from the final repetition is plotted.
        plot_importance(clf)
        plt.show()

    return avg_auc, avg_acc
コード例 #2
0
def runTSNE():
    """Show a 2-D t-SNE scatter plot of the PCA-reduced feature matrix.

    Reads ``osu18_cerenkov_feat_mat.tsv``, fits a 40-component PCA on the
    standardised features and prints the explained-variance ratios above
    1e-3.  The file is then read again, resampled with
    ``models.resampleData``, standardised, projected with the fitted PCA,
    embedded with t-SNE and displayed, coloured by label.

    Takes no arguments and returns None; the result is the printed variance
    list and the matplotlib window.
    """
    df = pandas.read_csv('osu18_cerenkov_feat_mat.tsv', sep='\t')
    trainX, trainY = models.dataFrameCleanGetLabels(df, labelType=float)

    scaler = StandardScaler()
    trainX = scaler.fit_transform(trainX)

    # Only the fitted components are needed here, so fit() replaces the
    # original fit_transform() whose return value was discarded.
    pca = PCA(n_components=40)
    pca.fit(trainX)

    pca_variance = [v for v in pca.explained_variance_ratio_ if v > 1e-3]
    print(pca_variance)

    # Optional diagnostic plot, disabled in the original:
    #   plt.figure(figsize=(8, 6))
    #   plt.bar(range(len(pca_variance)), pca_variance, alpha=0.5,
    #           align='center', label='individual variance')
    #   plt.legend()
    #   plt.ylabel('Variance ratio')
    #   plt.xlabel('Principal components')
    #   plt.show()

    df = pandas.read_csv('osu18_cerenkov_feat_mat.tsv', sep='\t')
    trainX, trainY = models.dataFrameCleanGetLabels(df, labelType=float)
    trainX, trainY = models.resampleData(trainX, trainY)

    # NOTE(review): a new scaler is fitted on the resampled data while the
    # PCA above was fitted on data scaled by the first scaler -- confirm this
    # mismatch is intended before reusing the projection elsewhere.
    scaler = StandardScaler()
    trainX = scaler.fit_transform(trainX)
    x_transformed = pca.transform(trainX)

    tsne = TSNE(n_components=2)
    x_embedded = tsne.fit_transform(x_transformed)

    x_plt, y_plt = zip(*x_embedded)

    # scatter() needs a scalar size or one size per point; the previous
    # two-element list matched neither.  Assumes labels are 0/1 so that the
    # larger marker goes with the second (red) colormap entry -- TODO confirm.
    marker_sizes = [0.2 if label > 0.5 else 0.1 for label in trainY]

    plt.figure(figsize=(8, 6))
    plt.scatter(x_plt,
                y_plt,
                s=marker_sizes,
                c=trainY,
                cmap=mpl.colors.ListedColormap([[0.1, 0.1, 0.1, 0.2],
                                                [1.0, 0, 0, 0.9]]))
    plt.show()
コード例 #3
0
def feedforward(df,
                test_split=0.2,
                sampler=None,
                opt=None,
                batchSize=200,
                num_models=1):
    """Train an ensemble of feed-forward networks and score it on a hold-out.

    ``df`` is split once into train and test partitions.  Each ensemble
    member draws its own 80/20 train/validation split from the full training
    partition, resamples the training part, and trains a new ``Feedforward``
    model with Adam and binary cross-entropy.  The validation AUC of every
    member is printed.

    Args:
        df: Data frame handed to ``models.testTrainSplitDataframe``.
        test_split: Forwarded as ``test_size`` to the split helper.
        sampler: Forwarded as ``resampler`` to ``models.resampleData``.
        opt: Mapping providing ``cuda``, ``batchSize``, ``testBatchSize``,
            ``lr``, ``nEpochs`` and ``threads``.  Defaults are built when
            None.
        batchSize: Training batch size; used only when ``opt`` is None.
        num_models: Number of ensemble members to train.

    Returns:
        Tuple ``(auc, acc)`` from ``performance.getAUC`` and
        ``performance.getAccuracy`` on the ensemble's test predictions.

    Raises:
        Exception: If ``opt['cuda']`` is truthy but CUDA is unavailable.
    """
    if opt is None:
        opt = dict(cuda=True,
                   batchSize=batchSize,
                   testBatchSize=80,
                   lr=0.00005,
                   nEpochs=60,
                   threads=4,
                   seed=123,
                   checkpoint_dir='.')

    if opt['cuda'] and not torch.cuda.is_available():
        raise Exception("No GPU found, please run without cuda enabled.")

    # Seeding is intentionally left disabled, as in the original:
    # torch.manual_seed(opt['seed'])
    device = torch.device("cuda" if opt['cuda'] else "cpu")

    # Honour the caller's test_split (it used to be hard-coded to 0.2).
    testDf, trainDf = models.testTrainSplitDataframe(df, test_size=test_split)

    trainX, trainY = models.dataFrameCleanGetLabels(trainDf, labelType=float)
    testX, testY = models.dataFrameCleanGetLabels(testDf, labelType=float)

    trainX = np.array(trainX)
    testX = np.array(testX)
    trainY = np.array(trainY)
    testY = np.array(testY)

    num_features = trainX.shape[1]

    # The test set does not change between ensemble members, so its loader
    # is built once.
    test_data = [[features, label] for features, label in zip(testX, testY)]
    testing_data_loader = DataLoader(dataset=test_data,
                                     num_workers=opt['threads'],
                                     batch_size=opt['testBatchSize'],
                                     shuffle=False)

    modelsList = []
    for idx in range(num_models):
        print('Ensemble Model', idx)
        print('\tResampling model training data')

        # Split into separate names: reassigning trainX/trainY here made
        # every later member train on 80% of the previous member's data.
        fitX, validateX, fitY, validateY = train_test_split(trainX,
                                                            trainY,
                                                            test_size=0.2)

        X_smt, y_smt = models.resampleData(fitX, fitY, resampler=sampler)

        train_data = [[features, label]
                      for features, label in zip(X_smt, y_smt)]

        training_data_loader = DataLoader(dataset=train_data,
                                          num_workers=opt['threads'],
                                          batch_size=opt['batchSize'],
                                          shuffle=True)

        model = Feedforward(num_features).to(device)

        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=opt['lr'])

        print('\tTraining Model...')
        print('\tEpoch ... ', end='')
        for epoch in range(1, opt['nEpochs'] + 1):
            train(model, training_data_loader, optimizer, criterion, device,
                  epoch)
            test(model, testing_data_loader, criterion, device)
            print(' {}'.format(epoch), end='')

        # Validation runs on the CPU, so the model is moved there first.
        model.eval()
        model.to(torch.device("cpu"))
        preds = model(torch.from_numpy(validateX))
        preds = preds.detach().numpy()
        preds = [1 if val >= 0.5 else 0 for val in preds]

        print('\n\tModel AUC:', performance.getAUC(validateY, preds))

        modelsList.append(model)

    ensembledPredictions = models.getEnsemblePredictionsNN(modelsList, testX)

    auc = performance.getAUC(testY, ensembledPredictions)
    acc = performance.getAccuracy(testY, ensembledPredictions)

    return auc, acc
コード例 #4
0
def feedforwardKFold(df,
                     test_size=0.2,
                     sampler=None,
                     opt=None,
                     batchSize=100,
                     k=5,
                     nreps=1):
    """Cross-validate feed-forward networks, optionally scoring a hold-out.

    Unless ``test_size`` is 0.0, ``df`` is first split into a hold-out test
    partition and a training partition.  The training partition is
    resampled and cut into folds by ``crossValidate.getKfolds``.  One
    ``Feedforward`` model is trained per fold on the remaining folds and
    evaluated on the left-out fold.

    Args:
        df: Data frame to cross-validate on.
        test_size: Forwarded as ``test_size`` to
            ``models.testTrainSplitDataframe``; 0.0 disables the hold-out.
        sampler: Forwarded as ``resampler`` to ``models.resampleData``.
        opt: Mapping providing ``cuda``, ``batchSize``, ``testBatchSize``,
            ``lr``, ``nEpochs`` and ``threads``.  Defaults are built when
            None.
        batchSize: Training batch size; used only when ``opt`` is None.
        k: Number of folds requested from ``crossValidate.getKfolds``; also
            the divisor for the per-repetition averages.
        nreps: Forwarded to ``crossValidate.getKfolds``; also the divisor
            for the overall averages.

    Returns:
        With a hold-out partition, ``(ens_auc, ens_acc)`` for the ensemble
        of all fold models on that partition.  Otherwise the cross-validated
        ``(avg_auc, avg_accuracy)``.

    Raises:
        Exception: If ``opt['cuda']`` is truthy but CUDA is unavailable.
    """
    if opt is None:
        opt = dict(cuda=True,
                   batchSize=batchSize,
                   testBatchSize=80,
                   lr=0.00005,
                   nEpochs=60,
                   threads=4,
                   seed=123,
                   checkpoint_dir='.')

    if opt['cuda'] and not torch.cuda.is_available():
        raise Exception("No GPU found, please run without cuda enabled.")

    # Seeding is intentionally left disabled, as in the original:
    # torch.manual_seed(opt['seed'])
    device = torch.device("cuda" if opt['cuda'] else "cpu")

    # Split exactly once; the redundant unconditional split that preceded
    # this guard also ran when test_size was 0.0.
    if test_size == 0.0:
        testDf = None
        trainDf = df
    else:
        testDf, trainDf = models.testTrainSplitDataframe(df,
                                                         test_size=test_size)

    # Take labels from the training partition only.  Using the full ``df``
    # here put the hold-out rows into the folds and leaked them into
    # training.
    trainDf, labels = models.dataFrameGetLabels(trainDf, labelType=float)

    trainDf, labels = models.resampleData(trainDf, labels, resampler=sampler)

    foldsCollection = crossValidate.getKfolds(trainDf,
                                              'osu18_groups.tsv',
                                              k,
                                              nreps=nreps)

    modelList = []

    avg_auc = 0
    avg_accuracy = 0
    for rep, foldGroup in enumerate(foldsCollection):
        foldList = foldGroup[0]
        labelList = foldGroup[1]

        print('Rep {}'.format(rep))

        fold_auc = 0
        fold_acc = 0
        for idx in range(len(foldList)):
            testX = foldList[idx]
            testY = labelList[idx]
            print('\tFold {}'.format(idx))
            print('\t\tTest Size: {}'.format(testX.shape[0]))

            num_features = testX.shape[1]

            # Stack every fold except the held-out one in a single call.
            # The empty seed arrays keep the result shape and dtype the same
            # as the original incremental concatenation.
            trainX = np.concatenate(
                [np.empty(shape=[0, num_features])] +
                [fold for j, fold in enumerate(foldList) if j != idx],
                axis=0)
            trainY = np.concatenate(
                [np.empty(shape=[0])] +
                [fold for j, fold in enumerate(labelList) if j != idx],
                axis=0)

            train_data = [[features, label]
                          for features, label in zip(trainX, trainY)]
            test_data = [[features, label]
                         for features, label in zip(testX, testY)]

            training_data_loader = DataLoader(dataset=train_data,
                                              num_workers=opt['threads'],
                                              batch_size=opt['batchSize'],
                                              shuffle=True)
            testing_data_loader = DataLoader(dataset=test_data,
                                             num_workers=opt['threads'],
                                             batch_size=opt['testBatchSize'],
                                             shuffle=False)

            model = Feedforward(num_features).to(device)

            criterion = nn.BCELoss()
            optimizer = optim.Adam(model.parameters(), lr=opt['lr'])

            print('\tTraining Model...')
            print('\tEpoch ... ', end='')
            for epoch in range(1, opt['nEpochs'] + 1):
                train(model, training_data_loader, optimizer, criterion,
                      device, epoch)
                test(model, testing_data_loader, criterion, device)
                print(' {}'.format(epoch), end='')

            # Fold evaluation runs on the CPU, so the model is moved first.
            model.eval()
            model.to(torch.device("cpu"))
            preds = model(torch.from_numpy(testX))
            preds = preds.detach().numpy()
            preds = [1 if val >= 0.5 else 0 for val in preds]

            # Score once and reuse the value for logging and averaging.
            curr_auc = performance.getAUC(testY, preds)
            curr_accuracy = performance.getAccuracy(testY, preds)
            print('\tModel AUC:', curr_auc)

            fold_auc += curr_auc
            fold_acc += curr_accuracy

            modelList.append(model)

        fold_auc /= k
        fold_acc /= k
        print('Average K-Fold AUC for all folds: {}'.format(fold_auc))

        avg_auc += fold_auc
        avg_accuracy += fold_acc

    avg_auc /= nreps
    avg_accuracy /= nreps

    if testDf is None:
        return avg_auc, avg_accuracy

    testX, testY = models.dataFrameCleanGetLabels(testDf)

    testX = np.array(testX)
    testY = np.array(testY)

    ensembledPredictions = models.getEnsemblePredictionsNN(modelList, testX)

    print('Final Ensemble Predictions')
    ens_auc = performance.getAUC(testY, ensembledPredictions)
    ens_acc = performance.getAccuracy(testY, ensembledPredictions)
    return ens_auc, ens_acc
コード例 #5
0
def xgBoostKFold(df,
                 test_split=0.2,
                 sampler=None,
                 param_dist=None,
                 k=10,
                 nreps=1):
    """Cross-validate XGBoost classifiers, optionally scoring a hold-out.

    Unless ``test_split`` is 0.0, ``df`` is first split into a hold-out test
    partition and a training partition.  The training partition is
    resampled and cut into folds by ``crossValidate.getKfolds``.  One
    classifier is trained per fold on the remaining folds and evaluated on
    the left-out fold.

    Args:
        df: Data frame to cross-validate on.
        test_split: Forwarded as ``test_size`` to
            ``models.testTrainSplitDataframe``; 0.0 disables the hold-out.
        sampler: Forwarded as ``resampler`` to ``models.resampleData``.
        param_dist: XGBoost parameters.  When None, a default set is built
            for each fold, with ``base_score`` set to the positive rate of
            that fold's training labels.
        k: Number of folds requested from ``crossValidate.getKfolds``; also
            the divisor for the per-repetition averages.  This value is now
            honoured; it used to be overwritten with 5.
        nreps: Forwarded to ``crossValidate.getKfolds``; also the divisor
            for the overall averages.

    Returns:
        With a hold-out partition, ``(ens_auc, ens_acc)`` for the ensemble
        of all fold models on that partition.  Otherwise the cross-validated
        ``(avg_auc, avg_accuracy)``.
    """
    if test_split == 0.0:
        testDf = None
        trainDf = df
    else:
        testDf, trainDf = models.testTrainSplitDataframe(df,
                                                         test_size=test_split)

    trainDf, labels = models.dataFrameGetLabels(trainDf, labelType=int)
    trainDf, labels = models.resampleData(trainDf, labels, resampler=sampler)

    print('XGBoost k-fold training class distribution:', Counter(labels))

    foldsCollection = crossValidate.getKfolds(trainDf,
                                              'osu18_groups.tsv',
                                              k,
                                              nreps=nreps)

    modelList = []

    avg_auc = 0
    avg_accuracy = 0

    for rep, foldGroup in enumerate(foldsCollection):
        foldList = foldGroup[0]
        labelList = foldGroup[1]

        print('Rep {}'.format(rep))

        fold_auc = 0
        fold_accuracy = 0
        for idx in range(len(foldList)):
            testX = foldList[idx]
            testY = labelList[idx]
            print('\tFold {}'.format(idx))
            print('\t\tTest Size: {}'.format(testX.shape[0]))

            # Stack every fold except the held-out one in a single call.
            # The empty seed arrays keep the result shape and dtype the same
            # as the original incremental concatenation.
            trainX = np.concatenate(
                [np.empty(shape=[0, testX.shape[1]])] +
                [fold for j, fold in enumerate(foldList) if j != idx],
                axis=0)
            trainY = np.concatenate(
                [np.empty(shape=[0])] +
                [fold for j, fold in enumerate(labelList) if j != idx],
                axis=0)

            if param_dist is None:
                # Build the defaults per fold instead of rebinding
                # ``param_dist``: base_score depends on this fold's training
                # labels and must not be frozen at the first fold's value.
                fold_params = dict(
                    max_depth=7,
                    learning_rate=0.1,
                    n_estimators=40,
                    gamma=10,
                    scale_pos_weight=1,
                    base_score=sum(trainY) / len(trainY),
                    subsample=1,
                    objective='binary:logistic')
            else:
                fold_params = param_dist

            curr_auc, curr_accuracy, clf = models.trainAndTestXGBoost(
                trainX, trainY, testX, testY, fold_params)

            print('Current fold AUC: {}'.format(curr_auc))
            print('Current fold accuracy: {}'.format(curr_accuracy))
            fold_auc += curr_auc
            fold_accuracy += curr_accuracy

            modelList.append(clf)

        fold_auc /= k
        fold_accuracy /= k
        print('Average K-Fold AUC for all folds: {}'.format(fold_auc))

        avg_auc += fold_auc
        avg_accuracy += fold_accuracy

    avg_auc /= nreps
    avg_accuracy /= nreps

    if testDf is None:
        return avg_auc, avg_accuracy

    testX, testY = models.dataFrameCleanGetLabels(testDf)

    testX = np.array(testX)
    testY = np.array(testY)

    ensembledPredictions = models.getEnsemblePredictionsXGBoost(
        modelList, testX)

    print('Final Ensemble Predictions')
    ens_auc = performance.getAUC(testY, ensembledPredictions)
    ens_acc = performance.getAccuracy(testY, ensembledPredictions)
    return ens_auc, ens_acc
コード例 #6
0
print(opt)

if opt.cuda and not torch.cuda.is_available():
    raise Exception("No GPU found, please run without --cuda")

torch.manual_seed(opt.seed)
device = torch.device("cuda" if opt.cuda else "cpu")

print('==> Loading datasets')
df = pandas.read_csv('osu18_cerenkov_feat_mat.tsv', sep='\t')

testDf, trainDf = models.testTrainSplitDataframe(df, test_size=0.2)

print('==> Splitting Data Set into Train/Test sets')

trainX, trainY = models.dataFrameCleanGetLabels(trainDf, labelType=float)
testX, testY = models.dataFrameCleanGetLabels(testDf, labelType=float)

trainX = np.array(trainX)
testX = np.array(testX)
trainY = np.array(trainY)
testY = np.array(testY)

num_features = trainX.shape[1]

modelsList = []
for idx in range(opt.num_models):
    print('Ensemble Model', idx)
    print('\tResampling model training data')

    trainX, validateX, trainY, validateY = train_test_split(trainX,