def __get_sample_transformed_examples(sample_type, train_x, train_y, ratio):
    """Resample a training set with the technique selected by ``sample_type``.

    Args:
        sample_type: One of the module-level sampler selector constants
            (``SMOTE_REG``, ``SMOTE_SVM``, ``UNDERSAMPLER``, ...).
        train_x: Training features, passed straight to the sampler.
        train_y: Training labels, passed straight to the sampler.
        ratio: Resampling ratio forwarded to every sampler that accepts one
            (``TomekLinks`` is built without it).

    Returns:
        Whatever the chosen sampler's ``fit_transform(train_x, train_y)``
        returns. When ``sample_type`` matches no known selector the inputs
        are returned unchanged as ``(train_x, train_y)``.
    """
    verbose = True

    if sample_type == SMOTE_REG:
        sampler = SMOTE(kind='regular', verbose=verbose, ratio=ratio, k=15)
    elif sample_type == SMOTE_SVM:
        # TODO: Make this configurable?
        svm_args = {'class_weight': 'balanced'}
        sampler = SMOTE(kind='svm', ratio=ratio, verbose=verbose, k=15,
                        **svm_args)
    elif sample_type == SMOTE_BORDERLINE_1:
        sampler = SMOTE(kind='borderline1', ratio=ratio, verbose=verbose)
    elif sample_type == SMOTE_BORDERLINE_2:
        sampler = SMOTE(kind='borderline2', ratio=ratio, verbose=verbose)
    elif sample_type == SMOTE_ENN:
        sampler = SMOTEENN(ratio=ratio, verbose=verbose, k=15)
    elif sample_type == SMOTE_TOMEK:
        sampler = SMOTETomek(ratio=ratio, verbose=verbose, k=15)
    elif sample_type == UNDERSAMPLER:
        sampler = UnderSampler(ratio=ratio, verbose=verbose,
                               replacement=False, random_state=17)
    elif sample_type == ADASYN_SAMPLER:
        sampler = ADASYN(k=15, imb_threshold=0.6, ratio=ratio)
    elif sample_type == TOMEK_LINKS:
        sampler = TomekLinks()
    elif sample_type == CLUSTER_CENTROIDS:
        sampler = ClusterCentroids(ratio=ratio)
    elif sample_type == NEARMISS:
        sampler = NearMiss(ratio=ratio)
    else:
        # format() instead of "+" so a non-string selector (e.g. an int
        # constant) is reported rather than raising TypeError here.
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Unrecognized sample technique: {0}".format(sample_type))
        print("Returning original data")
        return train_x, train_y

    return sampler.fit_transform(train_x, train_y)
# NOTE(review): the four print calls below finish a reporting loop whose
# header (and the definitions of `i` and `score`) lie before this chunk; the
# indentation used here is assumed -- confirm against the full file.
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")


# Load the feature matrix and the label vector from disk.
X = np.load('data/X51.npy')
Y = np.load('data/y51.npy')
# fixes errors with Nan data
X = preprocessing.Imputer().fit_transform(X)
print(X.shape,Y.shape)

# Oversample with ADASYN, then post-process with deleteClass (defined
# elsewhere; the meaning of its 100 / 2 arguments is not visible here).
adsn = ADASYN(imb_threshold=0.5,ratio=0.7)
X,Y = adsn.fit_transform(X,Y)
X,Y = deleteClass(X,Y,100,2)
# Integer square root of the feature count.
print(int(np.sqrt(X.shape[1])))

# Base estimators. (The original comment mentioned an "RFE object", but no
# RFE object is created here -- only these two classifiers.)
rf = RandomForestClassifier(n_jobs=-1)
gbm =xgb.XGBClassifier(n_estimators=300)

# Candidate hyper-parameter values; the keys are RandomForestClassifier
# constructor arguments. Presumably consumed by a parameter search further
# down the file -- not visible in this chunk.
param_dist = {"n_estimators": [10,50,100,150,300],
              "criterion": ['gini', 'entropy'],
              "bootstrap": [True,False],
              "max_features": [10,20,30,40,45,48],
              "class_weight": ['auto']}
def main():
    """Measure how often two feature-subspace random forests agree.

    Loads ``X.npy`` / ``Y.npy``, imputes NaNs, rebalances the data with
    ADASYN and ``deleteClass``, and then, for 3-, 4-, 5- and 10-fold
    stratified splits, fits one forest on each of two column subsets plus one
    on all columns. For every fold the agreement rate returned by
    ``activeLabeling`` for the two subspace forests is recorded.

    Prints the class counts after each preprocessing step and, per fold
    count, the list of agreement rates followed by their mean. Returns
    ``None``.
    """
    print('-----------------------------')
    print('| Active Learning Activated |')
    print('-----------------------------')

    X = np.load('X.npy')
    Y = np.load('Y.npy')
    print(Counter(Y))

    # fixes errors with Nan data
    X = preprocessing.Imputer().fit_transform(X)
    print(X.shape, Y.shape)

    adsn = ADASYN(ratio=0.7)
    X, Y = adsn.fit_transform(X, Y)
    print(Counter(Y))
    X, Y = deleteClass(X, Y, 100, 2)
    print(Counter(Y))

    # The feature division is not clear by their column number,
    # It was attempted intuitively while cross-checking with the
    # feature_importance attribute to make two equally good subspaces

    # Features regarding the first classifier
    clasOneCols = [0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 14, 15, 16, 32]
    clasOneData = X[:, clasOneCols]
    # Features regarding the second classifier
    clasTwoCols = [
        6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
    ]
    clasTwoData = X[:, clasTwoCols]

    # Three identically configured forests: subspace one, subspace two, and
    # all features.
    rfr1, rfr2, rfr3 = [
        RandomForestClassifier(n_estimators=300, class_weight='auto',
                               n_jobs=-1)
        for _ in range(3)
    ]

    # Tolerance accuracies are collected per fold but not reported.
    tolac1 = []
    tolac2 = []
    tolac3 = []

    # One list of agreement rates per fold count. Grouping them here keeps
    # the printed values and the averaged values in step (the previous flat
    # list was sliced with mismatched, hand-written index ranges and left
    # out the final fold of the 10-fold run).
    rates_by_fold_count = []
    for n_folds in (3, 4, 5, 10):
        fold_rates = []
        skf = StratifiedKFold(Y, n_folds=n_folds, shuffle=True)
        # NOTE(review): the pair is unpacked as (test, train), so the names
        # are swapped relative to the usual (train, test) order and each
        # model is fitted on the second element. Kept as in the original --
        # confirm this is the intended small-training-set setup.
        for test, train in skf:
            rfr1.fit(clasOneData[train], Y[train])
            rfr2.fit(clasTwoData[train], Y[train])
            rfr3.fit(X[train], Y[train])

            pred1 = rfr1.predict(clasOneData[test])
            tolac1.append(tolAcc(Y[test], pred1))

            pred2 = rfr2.predict(clasTwoData[test])
            tolac2.append(tolAcc(Y[test], pred2))

            pred3 = rfr3.predict(X[test])
            tolac3.append(tolAcc(Y[test], pred3))

            pred1 = pred1.astype(np.int64)
            pred2 = pred2.astype(np.int64)
            fold_rates.append(activeLabeling(pred1, pred2))
        rates_by_fold_count.append(fold_rates)

    for fold_rates in rates_by_fold_count:
        print(fold_rates)
        print('Mean is : %s' % np.mean(fold_rates))
# NOTE(review): the two statements below finish a function whose header lies
# before this chunk; the indentation is assumed -- confirm against the file.
    plt.show()
    return classification_report(labels_test, pred)


def data_prepration(x_features, x_labels):
    """Split features and labels into train and test parts.

    Uses ``train_test_split`` with ``test_size=0.2`` and ``random_state=0``,
    prints the length of both feature splits, and returns
    ``(features_train, features_test, labels_train, labels_test)``.
    """
    # preparing data for training and testing as we are going to use different data
    x_features_train,x_features_test,x_labels_train,x_labels_test = train_test_split(x_features,x_labels,test_size=0.2, random_state=0)
    print("length of training data")
    print(len(x_features_train))
    print("length of test data")
    print(len(x_features_test))
    return(x_features_train, x_features_test, x_labels_train, x_labels_test)


adsn = ADASYN(k=7,imb_threshold=0.7, ratio=1, random_state=0)
# Drop incomplete rows, then separate the "Class" column (labels) from all
# other columns (features). `data` is defined outside this chunk.
x = data
x = x.dropna(axis=0,how='any')
x_features = x.ix[:, x.columns != "Class"]
x_labels = x.ix[:, x.columns == "Class"]
# x_labels is a one-column frame; i[0] flattens each row into a plain label.
os_data_X, os_data_y = adsn.fit_transform(x_features.values, [i[0] for i in x_labels.values])
# NOTE(review): the split runs on the already oversampled data, so samples
# produced by ADASYN can end up in the test part -- confirm this is intended.
data_train_X, data_test_X, data_train_y, data_test_y = data_prepration(os_data_X, os_data_y)
columns = x_features.columns
print(columns)
def _run_ensemble(X, Y, n_folds=3):
    """Score a random forest and an XGBoost model over shuffled K-fold splits.

    Prints, per fold, the first value returned by ``tolAcc`` for each model,
    and finally the per-fold averages of the accumulated totals.
    """
    totalacc = 0
    totalRF = 0
    totalXGB = 0

    print(X.shape, Y.shape)
    kf = KFold(X.shape[0], n_folds=n_folds, shuffle=True)
    for traini, testi in kf:
        print(len(traini), len(testi))
        # Although data is oversampled, still a small imbalance is present
        rfr = RandomForestClassifier(
            n_estimators=300,
            class_weight="auto",
            n_jobs=-1,
            criterion="entropy",
            max_features=X.shape[1],
            min_samples_split=1,
        )
        gbm = xgb.XGBClassifier(
            n_estimators=50, learning_rate=0.5, colsample_bytree=0.3
        ).fit(X[traini], Y[traini])
        rfr.fit(X[traini], Y[traini])

        pred = rfr.predict(X[testi])
        pred1 = gbm.predict(X[testi])

        # tolAcc returns a pair; the first value is printed per fold, the
        # second is accumulated for the "True ..." averages below.
        tempacc, trueRF = tolAcc(Y[testi], pred)
        print("Random Forest: %s" % tempacc)
        tempacc1, trueXGB = tolAcc(Y[testi], pred1)
        print("XGBoost: %s" % tempacc1)

        totalXGB += trueXGB
        totalRF += trueRF
        totalacc += tempacc

    print("True RF: {0}".format(totalRF / n_folds))
    print("True XGB: {0}".format(totalXGB / n_folds))
    print("LOSO TP accuracy: {0}".format(totalacc / n_folds))


def _run_calibration(X, Y, train, test):
    """Compare plain and isotonic-calibrated models on a fixed split.

    Prints the first value returned by ``tolAcc`` for, in order: the plain
    random forest, the calibrated random forest, the calibrated XGBoost model
    and the plain XGBoost model.
    """
    # These parameters have been computed with RandomizedSearchCV
    forest_params = dict(
        n_estimators=300,
        bootstrap=False,
        class_weight="auto",
        n_jobs=-1,
        criterion="entropy",
        max_features=15,
        min_samples_split=1,
    )
    gbm = xgb.XGBClassifier(
        n_estimators=300,
        learning_rate=0.2,
        colsample_bytree=0.5,
        objective="multi:softmax",
        max_depth=15,
        gamma=0.001,
    )

    # Non-calibrated random forest
    rf_c = RandomForestClassifier(**forest_params)
    rf_c.fit(X[train], Y[train])
    tolac, trueacc = tolAcc(Y[test], rf_c.predict(X[test]))
    print(tolac)

    # Using isotonic calibration with 3-fold cv to improve results
    # Both on RF and XGBoost
    for base_model in (RandomForestClassifier(**forest_params), gbm):
        cc = CalibratedClassifierCV(base_model, method="isotonic", cv=3)
        cc.fit(X[train], Y[train])
        tolac, trueacc = tolAcc(Y[test], cc.predict(X[test]))
        print(tolac)

    # Comparing to not-calibrated xgboost
    gbm.fit(X[train], Y[train])
    tolac, trueacc = tolAcc(Y[test], gbm.predict(X[test]))
    print(tolac)


def main(argv):
    """Resample the data set and run the experiment selected on the command line.

    The mode is read from ``sys.argv[1]``: ``-ensemble`` runs the K-fold
    random forest / XGBoost comparison, ``-cali`` runs the calibration
    comparison on a fixed 70/30 split. Any other value performs only the
    loading and resampling steps.

    Args:
        argv: Accepted for interface compatibility; not consulted.
            NOTE(review): the mode comes from ``sys.argv`` rather than this
            parameter -- confirm how callers invoke ``main`` before changing.

    Raises:
        IndexError: If no mode argument was supplied. Checked first so the
            failure happens before the expensive resampling.
    """
    if len(sys.argv) < 2:
        raise IndexError("missing mode argument: expected -ensemble or -cali")
    mode = sys.argv[1]

    # Change to parent directory to load data
    # os.chdir(os.path.pardir)
    X = np.load("data/X51.npy")
    Y = np.load("data/y51.npy")
    print(X.shape)

    # Disabled preprocessing experiments, kept for reference:
    #   X = preprocessing.Imputer().fit_transform(X)   # fixes errors with NaN data
    #   adsn = ADASYN(imb_threshold=0.5, ratio=0.7)     # recursive over/under-sampling
    #   X, Y = adsn.fit_transform(X, Y)
    #   X, Y = adsn.fit_transform(X, Y)
    #   X, Y = deleteClass(X, Y, 100, 2)
    # A disabled "group 5 classes into 3" loop also lived here as a bare
    # string; note that it used `Y[i]==0` (a comparison) where an assignment
    # was evidently meant.

    print(Counter(Y))

    # Synthetic data is only to be used during training to
    # enhance recall of minority classes. New data are appended
    # as first rows of X,y
    size_b = X.shape[0]
    adsn = ADASYN(imb_threshold=0.5, ratio=0.7)
    X, Y = adsn.fit_transform(X, Y)
    size_a = X.shape[0]
    generated_samp = size_a - size_b

    # NOTE(review): these slices skip row 0 (`1:`) and, below, the final row
    # (`:-1`). Kept exactly as in the original -- confirm this is intended.
    newX = X[1:generated_samp]
    newY = Y[1:generated_samp]

    # Shuffling original data to ensure no time dependence
    realX, realY = shuffle(
        X[generated_samp:-1], Y[generated_samp:-1], random_state=0
    )
    realX, realY = shuffle(realX, realY, random_state=15)
    print("--------------")

    # appending real data after generated so that test set will not contain synthetic data
    allX = np.concatenate((newX, realX), axis=0)
    allY = np.concatenate((newY, realY), axis=0)
    X, Y = deleteClass(allX, allY, 200, 2)
    print(X.shape, Y.shape)

    # creating training set with synthetic data, test set only real data
    split_at = int(0.7 * X.shape[0])
    train = list(range(0, split_at))
    test = list(range(split_at, X.shape[0]))
    print(Counter(Y))

    if mode == "-ensemble":
        _run_ensemble(X, Y, n_folds=3)
    elif mode == "-cali":
        _run_calibration(X, Y, train, test)
def _run_ensemble(X, Y, n_folds=3):
    """Score a random forest and an XGBoost model over shuffled K-fold splits.

    Prints, per fold, the first value returned by ``tolAcc`` for each model,
    and finally the per-fold averages of the accumulated totals.
    """
    totalacc = 0
    totalRF = 0
    totalXGB = 0

    print(X.shape, Y.shape)
    kf = KFold(X.shape[0], n_folds=n_folds, shuffle=True)
    for traini, testi in kf:
        print(len(traini), len(testi))
        # Although data is oversampled, still a small imbalance is present
        rfr = RandomForestClassifier(n_estimators=300,
                                     class_weight='auto',
                                     n_jobs=-1,
                                     criterion='entropy',
                                     max_features=X.shape[1],
                                     min_samples_split=1)
        gbm = xgb.XGBClassifier(n_estimators=50,
                                learning_rate=0.5,
                                colsample_bytree=0.3).fit(
                                    X[traini], Y[traini])
        rfr.fit(X[traini], Y[traini])

        pred = rfr.predict(X[testi])
        pred1 = gbm.predict(X[testi])

        # tolAcc returns a pair; the first value is printed per fold, the
        # second is accumulated for the 'True ...' averages below.
        tempacc, trueRF = tolAcc(Y[testi], pred)
        print('Random Forest: %s' % tempacc)
        tempacc1, trueXGB = tolAcc(Y[testi], pred1)
        print('XGBoost: %s' % tempacc1)

        totalXGB += trueXGB
        totalRF += trueRF
        totalacc += tempacc

    print('True RF: {0}'.format(totalRF / n_folds))
    print('True XGB: {0}'.format(totalXGB / n_folds))
    print('LOSO TP accuracy: {0}'.format(totalacc / n_folds))


def _run_calibration(X, Y, train, test):
    """Compare plain and isotonic-calibrated models on a fixed split.

    Prints the first value returned by ``tolAcc`` for, in order: the plain
    random forest, the calibrated random forest, the calibrated XGBoost model
    and the plain XGBoost model.
    """
    # These parameters have been computed with RandomizedSearchCV
    forest_params = dict(n_estimators=300,
                         bootstrap=False,
                         class_weight='auto',
                         n_jobs=-1,
                         criterion='entropy',
                         max_features=15,
                         min_samples_split=1)
    gbm = xgb.XGBClassifier(n_estimators=300,
                            learning_rate=0.2,
                            colsample_bytree=0.5,
                            objective='multi:softmax',
                            max_depth=15,
                            gamma=0.001)

    #Non-calibrated random forest
    rf_c = RandomForestClassifier(**forest_params)
    rf_c.fit(X[train], Y[train])
    tolac, trueacc = tolAcc(Y[test], rf_c.predict(X[test]))
    print(tolac)

    # Using isotonic calibration with 3-fold cv to improve results
    # Both on RF and XGBoost
    for base_model in (RandomForestClassifier(**forest_params), gbm):
        cc = CalibratedClassifierCV(base_model, method='isotonic', cv=3)
        cc.fit(X[train], Y[train])
        tolac, trueacc = tolAcc(Y[test], cc.predict(X[test]))
        print(tolac)

    #Comparing to not-calibrated xgboost
    gbm.fit(X[train], Y[train])
    tolac, trueacc = tolAcc(Y[test], gbm.predict(X[test]))
    print(tolac)


def main(argv):
    """Resample the data set and run the experiment selected on the command line.

    The mode is read from ``sys.argv[1]``: ``-ensemble`` runs the K-fold
    random forest / XGBoost comparison, ``-cali`` runs the calibration
    comparison on a fixed 70/30 split. Any other value performs only the
    loading and resampling steps.

    Args:
        argv: Accepted for interface compatibility; not consulted.
            NOTE(review): the mode comes from ``sys.argv`` rather than this
            parameter -- confirm how callers invoke ``main`` before changing.

    Raises:
        IndexError: If no mode argument was supplied. Checked first so the
            failure happens before the expensive resampling.
    """
    if len(sys.argv) < 2:
        raise IndexError('missing mode argument: expected -ensemble or -cali')
    mode = sys.argv[1]

    #Change to parent directory to load data
    #os.chdir(os.path.pardir)
    X = np.load('data/X51.npy')
    Y = np.load('data/y51.npy')
    print(X.shape)

    # Disabled preprocessing experiments, kept for reference:
    #   X = preprocessing.Imputer().fit_transform(X)   # fixes errors with NaN data
    #   adsn = ADASYN(imb_threshold=0.5, ratio=0.7)     # recursive over/under-sampling
    #   X, Y = adsn.fit_transform(X, Y)
    #   X, Y = adsn.fit_transform(X, Y)
    #   X, Y = deleteClass(X, Y, 100, 2)
    # A disabled "group 5 classes into 3" loop also lived here as a bare
    # string; note that it used `Y[i]==0` (a comparison) where an assignment
    # was evidently meant.

    print(Counter(Y))

    # Synthetic data is only to be used during training to
    # enhance recall of minority classes. New data are appended
    # as first rows of X,y
    size_b = X.shape[0]
    adsn = ADASYN(imb_threshold=0.5, ratio=0.7)
    X, Y = adsn.fit_transform(X, Y)
    size_a = X.shape[0]
    generated_samp = size_a - size_b

    # NOTE(review): these slices skip row 0 (`1:`) and, below, the final row
    # (`:-1`). Kept exactly as in the original -- confirm this is intended.
    newX = X[1:generated_samp]
    newY = Y[1:generated_samp]

    #Shuffling original data to ensure no time dependence
    realX, realY = shuffle(X[generated_samp:-1],
                           Y[generated_samp:-1],
                           random_state=0)
    realX, realY = shuffle(realX, realY, random_state=15)
    print('--------------')

    # appending real data after generated so that test set will not contain synthetic data
    allX = np.concatenate((newX, realX), axis=0)
    allY = np.concatenate((newY, realY), axis=0)
    X, Y = deleteClass(allX, allY, 200, 2)
    print(X.shape, Y.shape)

    # creating training set with synthetic data, test set only real data
    split_at = int(0.7 * X.shape[0])
    train = list(range(0, split_at))
    test = list(range(split_at, X.shape[0]))
    print(Counter(Y))

    if mode == '-ensemble':
        _run_ensemble(X, Y, n_folds=3)
    elif mode == '-cali':
        _run_calibration(X, Y, train, test)
def main():
    """Measure how often two feature-subspace random forests agree.

    Loads ``X.npy`` / ``Y.npy``, imputes NaNs, rebalances the data with
    ADASYN and ``deleteClass``, and then, for 3-, 4-, 5- and 10-fold
    stratified splits, fits one forest on each of two column subsets plus one
    on all columns. For every fold the agreement rate returned by
    ``activeLabeling`` for the two subspace forests is recorded.

    Prints the class counts after each preprocessing step and, per fold
    count, the list of agreement rates followed by their mean. Returns
    ``None``.
    """
    print('-----------------------------')
    print('| Active Learning Activated |')
    print('-----------------------------')

    X = np.load('X.npy')
    Y = np.load('Y.npy')
    print(Counter(Y))

    # fixes errors with Nan data
    X = preprocessing.Imputer().fit_transform(X)
    print(X.shape, Y.shape)

    adsn = ADASYN(ratio=0.7)
    X, Y = adsn.fit_transform(X, Y)
    print(Counter(Y))
    X, Y = deleteClass(X, Y, 100, 2)
    print(Counter(Y))

    # The feature division is not clear by their column number,
    # It was attempted intuitively while cross-checking with the
    # feature_importance attribute to make two equally good subspaces

    # Features regarding the first classifier
    clasOneCols = [0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 14, 15, 16, 32]
    clasOneData = X[:, clasOneCols]
    # Features regarding the second classifier
    clasTwoCols = [6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                   29, 30, 31]
    clasTwoData = X[:, clasTwoCols]

    # Three identically configured forests: subspace one, subspace two, and
    # all features.
    rfr1, rfr2, rfr3 = [
        RandomForestClassifier(n_estimators=300, class_weight='auto',
                               n_jobs=-1)
        for _ in range(3)
    ]

    # Tolerance accuracies are collected per fold but not reported.
    tolac1 = []
    tolac2 = []
    tolac3 = []

    # One list of agreement rates per fold count. Grouping them here keeps
    # the printed values and the averaged values in step (the previous flat
    # list was sliced with mismatched, hand-written index ranges and left
    # out the final fold of the 10-fold run).
    rates_by_fold_count = []
    for n_folds in (3, 4, 5, 10):
        fold_rates = []
        skf = StratifiedKFold(Y, n_folds=n_folds, shuffle=True)
        # NOTE(review): the pair is unpacked as (test, train), so the names
        # are swapped relative to the usual (train, test) order and each
        # model is fitted on the second element. Kept as in the original --
        # confirm this is the intended small-training-set setup.
        for test, train in skf:
            rfr1.fit(clasOneData[train], Y[train])
            rfr2.fit(clasTwoData[train], Y[train])
            rfr3.fit(X[train], Y[train])

            pred1 = rfr1.predict(clasOneData[test])
            tolac1.append(tolAcc(Y[test], pred1))

            pred2 = rfr2.predict(clasTwoData[test])
            tolac2.append(tolAcc(Y[test], pred2))

            pred3 = rfr3.predict(X[test])
            tolac3.append(tolAcc(Y[test], pred3))

            pred1 = pred1.astype(np.int64)
            pred2 = pred2.astype(np.int64)
            fold_rates.append(activeLabeling(pred1, pred2))
        rates_by_fold_count.append(fold_rates)

    for fold_rates in rates_by_fold_count:
        print(fold_rates)
        print('Mean is : %s' % np.mean(fold_rates))
# NOTE(review): this print belongs to code that starts before this chunk
# (`labels_test` and `pred` are defined there); its indentation is assumed --
# confirm against the full file.
    print(classification_report(labels_test, pred))


def data_prepration(
        x_features, x_labels
):
    """Split features and labels into train and test parts.

    Uses ``train_test_split`` with ``test_size=0.3`` and ``random_state=0``,
    prints the length of both feature splits, and returns
    ``(features_train, features_test, labels_train, labels_test)``.
    """
    # preparing data for training and testing as we are going to use different data
    x_features_train, x_features_test, x_labels_train, x_labels_test = train_test_split(
        x_features, x_labels, test_size=0.3, random_state=0)
    print("length of training data")
    print(len(x_features_train))
    print("length of test data")
    print(len(x_features_test))
    return (x_features_train, x_features_test, x_labels_train, x_labels_test)


adsn = ADASYN(k=7, imb_threshold=0.6, ratio=6.5)
# Separate the "Class" column (labels) from all other columns (features).
# `data` is defined outside this chunk.
x = data
x_features = x.ix[:, x.columns != "Class"]
x_labels = x.ix[:, x.columns == "Class"]
# x_labels is a one-column frame; i[0] flattens each row into a plain label.
os_data_X, os_data_y = adsn.fit_transform(
    x_features.values, [i[0] for i in x_labels.values])
# first oversampling
# then splitting
# NOTE(review): because the split runs on the oversampled data, samples
# produced by ADASYN can end up in the test part -- confirm this is intended.
data_train_X, data_test_X, data_train_y, data_test_y = data_prepration(
    os_data_X, os_data_y)
columns = x_features.columns
print(columns)
# Wrap the resampled arrays back into DataFrames with the original column
# names (features) and a single "Class" column (labels).
data_train_X = pd.DataFrame(data=data_train_X, columns=columns)
data_train_y = pd.DataFrame(data=data_train_y, columns=["Class"])
data_test_X = pd.DataFrame(data=data_test_X, columns=columns)
data_test_y = pd.DataFrame(data=data_test_y, columns=["Class"])
print("Model with rank: {0}".format(i + 1)) print("Mean validation score: {0:.3f} (std: {1:.3f})".format( score.mean_validation_score, np.std(score.cv_validation_scores))) print("Parameters: {0}".format(score.parameters)) print("") X = np.load('data/X51.npy') Y = np.load('data/y51.npy') # fixes errors with Nan data X = preprocessing.Imputer().fit_transform(X) print(X.shape, Y.shape) # Recursive oversampling and undersampling adsn = ADASYN(imb_threshold=0.5, ratio=0.7) X, Y = adsn.fit_transform(X, Y) X, Y = deleteClass(X, Y, 100, 2) print(int(np.sqrt(X.shape[1]))) # Create the RFE object and compute a cross-validated score. rf = RandomForestClassifier(n_jobs=-1) gbm = xgb.XGBClassifier(n_estimators=300) # The "accuracy" scoring is proportional to the number of correct # classifications param_dist = { "n_estimators": [10, 50, 100, 150, 300], "criterion": ['gini', 'entropy'], "bootstrap": [True, False], "max_features": [10, 20, 30, 40, 45, 48],
# Experiment script: works on copies of the train/test frames (defined
# outside this chunk) so the originals stay untouched.
# NOTE(review): the source arrived with its line breaks collapsed; the
# nesting below was reconstructed from syntax -- confirm against the file.
X_trai=X_train.copy().as_matrix()
y_trai=y_train.copy().as_matrix()
X_tes=X_test.copy()
columns=X_train.columns.values
# Switch between the ADASYN branch (True) and the test_rest/test_smote
# branch (False) below.
oversample=False
from sklearn.grid_search import ParameterGrid
# Grid over "k" and "ratio". It is only referenced by commented-out code in
# the visible part and is rebound in the else branch below.
grid = ParameterGrid({"k": [3,4,5,6,7],
                      "ratio": [0.1,0.2,0.3,0.4,0.5] })


def plot_scores(scores):
    """Plot a score table: one row per classifier, one column per metric.

    ``scores`` is wrapped in a DataFrame with seven fixed row labels and six
    fixed column labels, so it must have a matching 7 x 6 layout.
    """
    scores=pd.DataFrame(scores,index=['RT','gbc','ets','lgr','adboost','dt','voting'],columns=['auc','accuracy','f1','precision','recall','kappa'])
    scores.plot()


if(oversample):
    # scores=[]
    # for params in grid:
    # print params
    adsn = ADASYN(imb_threshold=0.8,ratio=1)
    X_trai, y_trai = adsn.fit_transform(X_trai,y_trai) # your imbalanced dataset is in X,y
    # X_trai,y_trai=u.test_rest(X_trai,y_trai)
    # `u` is a helper module imported outside this chunk.
    u.all_lassifer(X_trai,y_trai,X_test,y_test)
    # u.all_lassifer(X_trai,y_trai,X_tes,y_tes)
    # scores.append(u.boostingClassifier(X_trai,y_trai,X_tes,y_tes))
    # scroes=pd.DataFrame(scores,columns=['auc','f1','accuracy','precision','recall','kappa'])
    # print Counter(y_trai)
else:
    # scores=[]
    predcit=[]
    # Single-point grid: the loop below runs once with params == {'c': 0}.
    grid = ParameterGrid({'c':[0] })
    for params in grid:
        print params
        X_trai,y_trai=u.test_rest(X_trai,y_trai,ratio=3,**params)
        # NOTE(review): assumed to sit inside the loop; the visible source
        # ends here, so the rest of this branch is unknown.
        X_trai,y_trai=u.test_smote(X_trai,y_trai,c=0)
def adasyn_this_test(all_x, all_y, tolerable_imbalance=0.9, k=7, ratio=0.75):
    """Oversample ``(all_x, all_y)`` with ADASYN and return the new data set.

    Args:
        all_x: Feature matrix handed to ``ADASYN.fit_transform``.
        all_y: Labels handed to ``ADASYN.fit_transform``.
        tolerable_imbalance: Forwarded as ADASYN's ``imb_threshold``.
        k: Forwarded as ADASYN's ``k``. Defaults to the previously
            hard-coded value of 7.
        ratio: Forwarded as ADASYN's ``ratio``. Defaults to the previously
            hard-coded value of 0.75.

    Returns:
        The ``(new_X, new_y)`` pair produced by ``fit_transform``.
    """
    # Imported lazily so the module can be loaded without adasyn installed.
    from adasyn import ADASYN

    adsn = ADASYN(k=k, imb_threshold=tolerable_imbalance, ratio=ratio)
    new_X, new_y = adsn.fit_transform(all_x, all_y)
    return new_X, new_y