def xgBoost(df, test_split=0.2, sampler=None, param_dist=None, nreps=1,
            pca=None, plotImportance=False):
    """Train and evaluate an XGBoost classifier over nreps random splits,
    returning the average AUC and accuracy."""
    avg_auc = 0
    avg_acc = 0
    clf = None
    for rep in range(nreps):
        testDf, trainDf = models.testTrainSplitDataframe(df,
                                                         test_size=test_split)
        trainX, trainY = models.dataFrameCleanGetLabels(trainDf)
        X_smt, y_smt = models.resampleData(trainX, trainY, resampler=sampler)
        if param_dist is None:
            param_dist = dict(
                max_depth=7,
                learning_rate=0.1,
                n_estimators=40,
                gamma=10,
                scale_pos_weight=1,
                base_score=sum(y_smt) / len(y_smt),  # prior = positive rate
                subsample=1,
                #colsample_bytree=0.3,
                objective='binary:logistic')
        print('XGBoost training class distribution:', Counter(y_smt))
        testX, testY = models.dataFrameCleanGetLabels(testDf)
        if pca:
            # Project both splits into the caller-supplied PCA space.
            X_smt = pca.transform(X_smt)
            testX = pca.transform(testX)
        auc, acc, clf = models.trainAndTestXGBoost(X_smt, y_smt, testX, testY,
                                                   param_dist, verbose=True)
        avg_auc += auc
        avg_acc += acc
    avg_auc /= nreps
    avg_acc /= nreps
    if plotImportance:
        plot_importance(clf)  # feature importances of the last trained model
        plt.show()
    return avg_auc, avg_acc
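# Illustrative usage sketch (not part of the pipeline). It assumes imblearn's
# SMOTE is available as the resampler and that the feature matrix sits next to
# this file; any resampler models.resampleData accepts would work the same way:
#
#   from imblearn.over_sampling import SMOTE
#   df = pandas.read_csv('osu18_cerenkov_feat_mat.tsv', sep='\t')
#   auc, acc = xgBoost(df, test_split=0.2, sampler=SMOTE(), nreps=5,
#                      plotImportance=True)
#   print('Mean AUC: {:.4f}, mean accuracy: {:.4f}'.format(auc, acc))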
def runTSNE():
    # Fit PCA on the full (scaled) feature matrix so the variance estimates
    # are not distorted by resampling.
    df = pandas.read_csv('osu18_cerenkov_feat_mat.tsv', sep='\t')
    trainX, trainY = models.dataFrameCleanGetLabels(df, labelType=float)
    scaler = StandardScaler()
    trainX = scaler.fit_transform(trainX)
    pca = PCA(n_components=40)
    pca.fit(trainX)
    pca_variance = [v for v in pca.explained_variance_ratio_ if v > 1e-3]
    print(pca_variance)
    """
    plt.figure(figsize=(8, 6))
    plt.bar(range(len(pca_variance)), pca_variance, alpha=0.5, align='center',
            label='individual variance')
    plt.legend()
    plt.ylabel('Variance ratio')
    plt.xlabel('Principal components')
    plt.show()
    """
    # Reload and resample, then scale with the already-fitted scaler so the
    # points live in the same space the PCA was fit in before embedding.
    df = pandas.read_csv('osu18_cerenkov_feat_mat.tsv', sep='\t')
    trainX, trainY = models.dataFrameCleanGetLabels(df, labelType=float)
    trainX, trainY = models.resampleData(trainX, trainY)
    trainX = scaler.transform(trainX)
    x_transformed = pca.transform(trainX)
    tsne = TSNE(n_components=2)
    x_embedded = tsne.fit_transform(x_transformed)
    x_plt, y_plt = zip(*x_embedded)
    plt.figure(figsize=(8, 6))
    # One marker size per point (a two-element size list raises an error in
    # matplotlib); negatives draw small and grey, positives larger and red.
    sizes = [0.1 if label == 0 else 0.2 for label in trainY]
    plt.scatter(x_plt, y_plt, s=sizes, c=trainY,
                cmap=mpl.colors.ListedColormap([[0.1, 0.1, 0.1, 0.2],
                                                [1.0, 0, 0, 0.9]]))
    plt.show()
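# NOTE: the Feedforward class instantiated in the functions below is defined
# elsewhere in this project. A minimal sketch consistent with how it is used
# here (BCELoss plus a 0.5 threshold on the raw output imply a single sigmoid
# unit); the hidden-layer width is an illustrative assumption, not the
# project's actual architecture:
#
#   class Feedforward(nn.Module):
#       def __init__(self, num_features):
#           super().__init__()
#           self.net = nn.Sequential(
#               nn.Linear(num_features, 64),
#               nn.ReLU(),
#               nn.Linear(64, 1),
#               nn.Sigmoid())
#
#       def forward(self, x):
#           # Squeeze to shape (batch,) so outputs align with the label vector.
#           return self.net(x).squeeze(1)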
def feedforward(df, test_split=0.2, sampler=None, opt=None, batchSize=200,
                num_models=1):
    if opt is None:
        opt = dict(cuda=True,
                   batchSize=batchSize,
                   testBatchSize=80,
                   lr=0.00005,
                   nEpochs=60,
                   threads=4,
                   seed=123,
                   checkpoint_dir='.')
    if opt['cuda'] and not torch.cuda.is_available():
        raise Exception("No GPU found, please run without cuda enabled.")
    #torch.manual_seed(opt['seed'])
    device = torch.device("cuda" if opt['cuda'] else "cpu")

    # Honor the caller's split fraction instead of a hard-coded 0.2.
    testDf, trainDf = models.testTrainSplitDataframe(df, test_size=test_split)
    trainX, trainY = models.dataFrameCleanGetLabels(trainDf, labelType=float)
    testX, testY = models.dataFrameCleanGetLabels(testDf, labelType=float)
    trainX = np.array(trainX)
    testX = np.array(testX)
    trainY = np.array(trainY)
    testY = np.array(testY)
    num_features = trainX.shape[1]

    modelsList = []
    for idx in range(num_models):
        print('Ensemble Model', idx)
        print('\tResampling model training data')
        # Split into fresh names so the full training pool is not shrunk on
        # every ensemble iteration.
        memberTrainX, validateX, memberTrainY, validateY = train_test_split(
            trainX, trainY, test_size=0.2)
        X_smt, y_smt = models.resampleData(memberTrainX, memberTrainY,
                                           resampler=sampler)
        train_data = [[X_smt[i], y_smt[i]] for i in range(len(X_smt))]
        test_data = [[testX[i], testY[i]] for i in range(len(testX))]
        training_data_loader = DataLoader(dataset=train_data,
                                          num_workers=opt['threads'],
                                          batch_size=opt['batchSize'],
                                          shuffle=True)
        testing_data_loader = DataLoader(dataset=test_data,
                                         num_workers=opt['threads'],
                                         batch_size=opt['testBatchSize'],
                                         shuffle=False)
        model = Feedforward(num_features).to(device)
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=opt['lr'])

        print('\tTraining Model...')
        print('\tEpoch ... ', end='')
        for epoch in range(1, opt['nEpochs'] + 1):
            train(model, training_data_loader, optimizer, criterion, device,
                  epoch)
            test(model, testing_data_loader, criterion, device)
            print(' {}'.format(epoch), end='')

        # Score this ensemble member on its held-out validation split.
        model.eval()
        model.to(torch.device("cpu"))
        preds = model(torch.from_numpy(validateX).float())  # weights are float32
        preds = preds.detach().numpy()
        preds = [1 if val >= 0.5 else 0 for val in preds]
        print('\n\tModel AUC:', performance.getAUC(validateY, preds))
        modelsList.append(model)

    ensembledPredictions = models.getEnsemblePredictionsNN(modelsList, testX)
    auc = performance.getAUC(testY, ensembledPredictions)
    acc = performance.getAccuracy(testY, ensembledPredictions)
    return auc, acc
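# The train() and test() helpers called above are this project's per-epoch
# routines, defined elsewhere. A minimal sketch of what they are assumed to
# do; the .float() casts are assumptions needed because the numpy arrays fed
# to the DataLoader are float64 while the model weights are float32:
#
#   def train(model, loader, optimizer, criterion, device, epoch):
#       model.train()
#       for batch_x, batch_y in loader:
#           batch_x = batch_x.float().to(device)
#           batch_y = batch_y.float().to(device)
#           optimizer.zero_grad()
#           loss = criterion(model(batch_x), batch_y)
#           loss.backward()
#           optimizer.step()
#
#   def test(model, loader, criterion, device):
#       model.eval()
#       total_loss = 0.0
#       with torch.no_grad():
#           for batch_x, batch_y in loader:
#               batch_x = batch_x.float().to(device)
#               batch_y = batch_y.float().to(device)
#               total_loss += criterion(model(batch_x), batch_y).item()
#       return total_loss / max(len(loader), 1)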
def feedforwardKFold(df, test_size=0.2, sampler=None, opt=None, batchSize=100,
                     k=5, nreps=1):
    if opt is None:
        opt = dict(cuda=True,
                   batchSize=batchSize,
                   testBatchSize=80,
                   lr=0.00005,
                   nEpochs=60,
                   threads=4,
                   seed=123,
                   checkpoint_dir='.')
    if opt['cuda'] and not torch.cuda.is_available():
        raise Exception("No GPU found, please run without cuda enabled.")
    #torch.manual_seed(opt['seed'])
    device = torch.device("cuda" if opt['cuda'] else "cpu")

    if test_size == 0.0:
        testDf = None
        trainDf = df
    else:
        testDf, trainDf = models.testTrainSplitDataframe(df,
                                                         test_size=test_size)
    # Resample only the training portion; passing the full df here would leak
    # test rows into the folds.
    trainDf, labels = models.dataFrameGetLabels(trainDf, labelType=float)
    trainDf, labels = models.resampleData(trainDf, labels, resampler=sampler)
    foldsCollection = crossValidate.getKfolds(trainDf, 'osu18_groups.tsv', k,
                                              nreps=nreps)

    modelList = []
    avg_auc = 0
    avg_accuracy = 0
    for rep, foldGroup in enumerate(foldsCollection):
        foldList = foldGroup[0]
        labelList = foldGroup[1]
        print('Rep {}'.format(rep))
        fold_auc = 0
        fold_acc = 0
        for idx in range(len(foldList)):
            testX = foldList[idx]
            testY = labelList[idx]
            print('\tFold {}'.format(idx))
            print('\t\tTest Size: {}'.format(testX.shape[0]))
            # Concatenate the remaining folds into the training set.
            trainX = np.empty(shape=[0, testX.shape[1]])
            trainY = np.empty(shape=[0])
            num_features = trainX.shape[1]
            for j in range(len(foldList)):
                if j != idx:
                    trainX = np.concatenate((trainX, foldList[j]), axis=0)
                    trainY = np.concatenate((trainY, labelList[j]), axis=0)
            # Data were already resampled above, so the folds are used as-is.
            X_smt = trainX
            y_smt = trainY
            train_data = [[X_smt[i], y_smt[i]] for i in range(len(X_smt))]
            test_data = [[testX[i], testY[i]] for i in range(len(testX))]
            training_data_loader = DataLoader(dataset=train_data,
                                              num_workers=opt['threads'],
                                              batch_size=opt['batchSize'],
                                              shuffle=True)
            testing_data_loader = DataLoader(dataset=test_data,
                                             num_workers=opt['threads'],
                                             batch_size=opt['testBatchSize'],
                                             shuffle=False)
            model = Feedforward(num_features).to(device)
            criterion = nn.BCELoss()
            optimizer = optim.Adam(model.parameters(), lr=opt['lr'])

            print('\tTraining Model...')
            print('\tEpoch ... ', end='')
            for epoch in range(1, opt['nEpochs'] + 1):
                train(model, training_data_loader, optimizer, criterion,
                      device, epoch)
                test(model, testing_data_loader, criterion, device)
                print(' {}'.format(epoch), end='')

            # Score this fold's model on its held-out fold.
            model.eval()
            model.to(torch.device("cpu"))
            preds = model(torch.from_numpy(testX).float())  # weights are float32
            preds = preds.detach().numpy()
            preds = [1 if val >= 0.5 else 0 for val in preds]
            curr_auc = performance.getAUC(testY, preds)
            curr_accuracy = performance.getAccuracy(testY, preds)
            print('\n\tModel AUC:', curr_auc)
            fold_auc += curr_auc
            fold_acc += curr_accuracy
            modelList.append(model)
        fold_auc /= k
        fold_acc /= k
        print('Average K-Fold AUC for all folds: {}'.format(fold_auc))
        avg_auc += fold_auc
        avg_accuracy += fold_acc
    avg_auc /= nreps
    avg_accuracy /= nreps

    if testDf is not None:
        # Ensemble every fold model and score on the held-out test set.
        testX, testY = models.dataFrameCleanGetLabels(testDf)
        testX = np.array(testX)
        testY = np.array(testY)
        ensembledPredictions = models.getEnsemblePredictionsNN(modelList,
                                                               testX)
        print('Final Ensemble Predictions')
        ens_auc = performance.getAUC(testY, ensembledPredictions)
        ens_acc = performance.getAccuracy(testY, ensembledPredictions)
        return ens_auc, ens_acc
    return avg_auc, avg_accuracy
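# Illustrative usage sketch (not part of the pipeline); test_size=0.0 skips
# the held-out ensemble evaluation and returns the cross-validated averages:
#
#   df = pandas.read_csv('osu18_cerenkov_feat_mat.tsv', sep='\t')
#   auc, acc = feedforwardKFold(df, test_size=0.0, k=5, nreps=3)
#   print('CV AUC: {:.4f}, CV accuracy: {:.4f}'.format(auc, acc))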
def xgBoostKFold(df, test_split=0.2, sampler=None, param_dist=None, k=10,
                 nreps=1):
    if test_split == 0.0:
        testDf = None
        trainDf = df
    else:
        testDf, trainDf = models.testTrainSplitDataframe(df,
                                                         test_size=test_split)
    trainDf, labels = models.dataFrameGetLabels(trainDf, labelType=int)
    trainDf, labels = models.resampleData(trainDf, labels, resampler=sampler)
    print('XGBoost k-fold training class distribution:', Counter(labels))
    # Honor the caller's k rather than hard-coding the fold count.
    foldsCollection = crossValidate.getKfolds(trainDf, 'osu18_groups.tsv', k,
                                              nreps=nreps)

    modelList = []
    avg_auc = 0
    avg_accuracy = 0
    for rep, foldGroup in enumerate(foldsCollection):
        foldList = foldGroup[0]
        labelList = foldGroup[1]
        print('Rep {}'.format(rep))
        fold_auc = 0
        fold_accuracy = 0
        for idx in range(len(foldList)):
            testX = foldList[idx]
            testY = labelList[idx]
            print('\tFold {}'.format(idx))
            print('\t\tTest Size: {}'.format(testX.shape[0]))
            # Concatenate the remaining folds into the training set.
            trainX = np.empty(shape=[0, testX.shape[1]])
            trainY = np.empty(shape=[0])
            for j in range(len(foldList)):
                if j != idx:
                    trainX = np.concatenate((trainX, foldList[j]), axis=0)
                    trainY = np.concatenate((trainY, labelList[j]), axis=0)
            X_smt = trainX
            y_smt = trainY
            rare_event_rate = sum(y_smt) / len(y_smt)  # positive-class rate
            if param_dist is None:
                param_dist = dict(
                    max_depth=7,
                    learning_rate=0.1,
                    n_estimators=40,
                    gamma=10,
                    scale_pos_weight=1,
                    base_score=rare_event_rate,
                    subsample=1,
                    #colsample_bytree=0.3,
                    objective='binary:logistic')
            curr_auc, curr_accuracy, clf = models.trainAndTestXGBoost(
                X_smt, y_smt, testX, testY, param_dist)
            print('Current fold AUC: {}'.format(curr_auc))
            print('Current fold accuracy: {}'.format(curr_accuracy))
            fold_auc += curr_auc
            fold_accuracy += curr_accuracy
            modelList.append(clf)
        fold_auc /= k
        fold_accuracy /= k
        print('Average K-Fold AUC for all folds: {}'.format(fold_auc))
        avg_auc += fold_auc
        avg_accuracy += fold_accuracy
    avg_auc /= nreps
    avg_accuracy /= nreps

    if testDf is not None:
        # Ensemble every fold model and score on the held-out test set.
        testX, testY = models.dataFrameCleanGetLabels(testDf)
        testX = np.array(testX)
        testY = np.array(testY)
        ensembledPredictions = models.getEnsemblePredictionsXGBoost(modelList,
                                                                    testX)
        print('Final Ensemble Predictions')
        ens_auc = performance.getAUC(testY, ensembledPredictions)
        ens_acc = performance.getAccuracy(testY, ensembledPredictions)
        return ens_auc, ens_acc
    return avg_auc, avg_accuracy
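# models.getEnsemblePredictionsXGBoost is a project helper defined elsewhere.
# A plausible minimal sketch (an assumption -- the real helper may vote or
# weight members differently) that averages each booster's positive-class
# probability and thresholds at 0.5:
#
#   def getEnsemblePredictionsXGBoost(clfList, X):
#       probs = np.mean([clf.predict_proba(X)[:, 1] for clf in clfList],
#                       axis=0)
#       return [1 if p >= 0.5 else 0 for p in probs]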
print(opt)
if opt.cuda and not torch.cuda.is_available():
    raise Exception("No GPU found, please run without --cuda")
torch.manual_seed(opt.seed)
device = torch.device("cuda" if opt.cuda else "cpu")

print('==> Loading datasets')
df = pandas.read_csv('osu18_cerenkov_feat_mat.tsv', sep='\t')
print('==> Splitting Data Set into Train/Test sets')
testDf, trainDf = models.testTrainSplitDataframe(df, test_size=0.2)
trainX, trainY = models.dataFrameCleanGetLabels(trainDf, labelType=float)
testX, testY = models.dataFrameCleanGetLabels(testDf, labelType=float)
trainX = np.array(trainX)
testX = np.array(testX)
trainY = np.array(trainY)
testY = np.array(testY)
num_features = trainX.shape[1]

modelsList = []
for idx in range(opt.num_models):
    print('Ensemble Model', idx)
    print('\tResampling model training data')
    # Statement completed to match the identical split in feedforward() above.
    trainX, validateX, trainY, validateY = train_test_split(trainX,
                                                            trainY,
                                                            test_size=0.2)