print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

X = np.load('data/X51.npy')
Y = np.load('data/y51.npy')

# fixes errors with NaN data
X = preprocessing.Imputer().fit_transform(X)
print(X.shape, Y.shape)
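# NOTE (not part of the original example): preprocessing.Imputer was
# deprecated and later removed from scikit-learn. On modern versions the
# equivalent NaN handling would be, assuming the default mean strategy:
#
#     from sklearn.impute import SimpleImputer
#     X = SimpleImputer(strategy='mean').fit_transform(X)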

# Recursive oversampling and undersampling
adsn = ADASYN(imb_threshold=0.5, ratio=0.7)
X, Y = adsn.fit_transform(X, Y)
X, Y = deleteClass(X, Y, 100, 2)
print(int(np.sqrt(X.shape[1])))

# Create the classifiers and set up a cross-validated hyperparameter search.
rf = RandomForestClassifier(n_jobs=-1)
gbm = xgb.XGBClassifier(n_estimators=300)
# The "accuracy" scoring is proportional to the number of correct
# classifications

param_dist = {"n_estimators": [10,50,100,150,300],
                "criterion": ['gini', 'entropy'],
                "bootstrap": [True,False],
                "max_features": [10,20,30,40,45,48],
                "class_weight": ['auto']}
param_dist_xgb = {"max_depth": [5,10,15,25,30],
Example #2
    print("length of test data")
    print(len(x_features_test))
    return(x_features_train, x_features_test, x_labels_train, x_labels_test)


adsn = ADASYN(k=7, imb_threshold=0.7, ratio=1, random_state=0)


# now we can divide our data into training and test data
# Call our data_prepration method on our dataset
x = data
x = x.dropna(axis=0,how='any')
x_features = x.ix[:, x.columns != "Class"]

x_labels = x.ix[:, x.columns == "Class"]
os_data_X, os_data_y = adsn.fit_transform(x_features.values, [i[0] for i in x_labels.values])
data_train_X, data_test_X, data_train_y, data_test_y = data_prepration(os_data_X, os_data_y)
columns = x_features.columns
print(columns)


data_train_X = pd.DataFrame(data=data_train_X, columns=columns)
data_train_y = pd.DataFrame(data=data_train_y, columns=["Class"])



print("Length of oversampled data is ", len(data_train_X))
print("Number of normal transcation in oversampled data", len(data_train_y[data_train_y["Class"] == 0]))
print("No.of fraud transcation", len(data_train_y[data_train_y["Class"]==1]))
print("Number of normal people in oversampled data is ",len(data_train_y[data_train_y["Class"] == 0])/len(data_train_X))
print("Number of people having diabetes in oversampled data is ", len(data_train_y[data_train_y["Class"]==1])/len(data_train_X))
Example #3
def main():
    print('-----------------------------')
    print('| Active Learning Activated |')
    print('-----------------------------')

    X = np.load('X.npy')
    Y = np.load('Y.npy')
    print(Counter(Y))
    # fixes errors with NaN data
    X = preprocessing.Imputer().fit_transform(X)
    print(X.shape, Y.shape)

    adsn = ADASYN(ratio=0.7)
    X, Y = adsn.fit_transform(X, Y)
    print(Counter(Y))

    X, Y = deleteClass(X, Y, 100, 2)
    print(Counter(Y))

    # The feature split is not obvious from the column numbers alone;
    # it was chosen intuitively, cross-checking against the
    # feature_importances_ attribute to obtain two equally strong subspaces

    # Features regarding the first classifier
    clasOneCols = [0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 14, 15, 16, 32]
    clasOneData = X[:, clasOneCols]

    # Features regarding the second classifier
    clasTwoCols = [
        6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
    ]
    clasTwoData = X[:, clasTwoCols]

    #print(clasOneData.shape, clasTwoData.shape)

    # assigning weights to penalize the majority class relative to the minority classes
    #class_weights={0 : 1, 1 : 0.2 , 2 : 0.1 , 3 : 0.2, 4 :1}
    rfr1 = RandomForestClassifier(n_estimators=300,
                                  class_weight='auto',
                                  n_jobs=-1)
    rfr2 = RandomForestClassifier(n_estimators=300,
                                  class_weight='auto',
                                  n_jobs=-1)
    rfr3 = RandomForestClassifier(n_estimators=300,
                                  class_weight='auto',
                                  n_jobs=-1)

    n_samples = 700
    tolac1 = []
    tolac2 = []
    tolac3 = []
    rate = []

    ranges = ['33', '25', '20']
    df = []
    for i in [3, 4, 5, 10]:
        skf = StratifiedKFold(Y, n_folds=i, shuffle=True)
        # NOTE: unpacking is reversed relative to the (train, test) order that
        # StratifiedKFold yields, so the models below are fit on the smaller fold
        for test, train in skf:
            #print(len(train),len(test), float(len(train))/len(test))
            rfr1.fit(clasOneData[train], Y[train])
            rfr2.fit(clasTwoData[train], Y[train])
            rfr3.fit(X[train], Y[train])

            pred1 = rfr1.predict(clasOneData[test])
            tolac1.append(tolAcc(Y[test], pred1))
            #print('Tolerance accuracy 1: %s' % tolAcc(Y[test],pred1))

            pred2 = rfr2.predict(clasTwoData[test])
            tolac2.append(tolAcc(Y[test], pred2))
            #	print('Tolerance accuracy 2: %s' % tolAcc(Y[test],pred2))

            pred3 = rfr3.predict(X[test])
            tolac3.append(tolAcc(Y[test], pred3))
            #print('Combined: %s' % tolAcc(Y[test],pred3))

            pred1 = pred1.astype(np.int64)
            pred2 = pred2.astype(np.int64)
            agreement_rate = activeLabeling(pred1, pred2)
            rate.append(agreement_rate)
        #print(rfr3.feature_importances_)
    print(rate[0:3])
    print('Mean is : %s' % np.mean(rate[0:3]))
    print(rate[3:7])
    print('Mean is : %s' % np.mean(rate[3:7]))
    print(rate[7:12])
    print('Mean is : %s' % np.mean(rate[7:12]))
    print(rate[12:])
    print('Mean is : %s' % np.mean(rate[12:]))
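# tolAcc and activeLabeling are project-specific helpers not shown on this
# page. Judging from how activeLabeling is used above (it receives two integer
# prediction vectors and returns an "agreement rate"), a minimal hypothetical
# sketch could look like this; the body below is an illustration, not the
# project's actual implementation:
import numpy as np

def activeLabeling(pred_a, pred_b):
    # Fraction of samples on which the two feature-subspace classifiers agree;
    # samples where they disagree are natural candidates for human labeling
    # in an active-learning loop.
    pred_a = np.asarray(pred_a)
    pred_b = np.asarray(pred_b)
    return float(np.mean(pred_a == pred_b))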
Example #4
def main(argv):
    # Change to parent directory to load data
    # os.chdir(os.path.pardir)
    X = np.load("data/X51.npy")
    Y = np.load("data/y51.npy")
    labels = np.load("data/LOO.npy")
    print(X.shape)
    # fixes errors with NaN data
    # X = preprocessing.Imputer().fit_transform(X)

    # Recursive oversampling and undersampling
    # adsn = ADASYN(imb_threshold=0.5,ratio=0.7)
    # X,Y = adsn.fit_transform(X,Y)
    # X,Y = adsn.fit_transform(X,Y)
    # X,Y = deleteClass(X,Y,100,2)

    # Grouping 5 classes into 3
    """for i in range(0, Y.shape[0]):
        if Y[i] == 0 or Y[i] == 1:
            Y[i] = 0
        elif Y[i] == 2:
            Y[i] = 1
        else:
            Y[i] = 2
    """
    print(Counter(Y))

    # Synthetic data is only to be used during training to
    # enhance recall of minority classes. New data are appended
    # as first rows of X,y

    size_b = X.shape[0]
    adsn = ADASYN(imb_threshold=0.5, ratio=0.7)
    X, Y = adsn.fit_transform(X, Y)
    size_a = X.shape[0]
    generated_samp = size_a - size_b

    newX = X[1:generated_samp]
    newY = Y[1:generated_samp]

    # Shuffling original data to ensure no time dependence
    realX, realY = shuffle(X[generated_samp:-1], Y[generated_samp:-1], random_state=0)
    realX, realY = shuffle(realX, realY, random_state=15)

    print("--------------")
    # append real data after the generated samples so that the test set will not contain synthetic data
    allX = np.concatenate((newX, realX), axis=0)
    allY = np.concatenate((newY, realY), axis=0)

    X, Y = deleteClass(allX, allY, 200, 2)
    print(X.shape, Y.shape)

    # create a training set that includes synthetic data and a test set of only real data
    train = [i for i in range(0, int(0.7 * X.shape[0]))]
    test = [i for i in range(int(0.7 * X.shape[0]), X.shape[0])]
    print(Counter(Y))

    if sys.argv[1] == "-ensemble":
        RF = []
        outputRF = []
        outRFtest = []
        totalacc = 0
        totalRF = 0
        totalXGB = 0

        # Tests with all features / most important
        # feats =[0,1,2,3,4,5,6,7,13,16,22,23,24,25,26,27,29,30,31,32,33,35,38,39,40,41,44,46,47,50]
        # X = X[:,feats]
        print(X.shape, Y.shape)

        n_folds = 3
        skf = StratifiedKFold(Y, n_folds=n_folds)
        kf = KFold(X.shape[0], n_folds=n_folds, shuffle=True)
        for traini, testi in kf:
            print(len(traini), len(testi))

            # Although the data has been oversampled, a small imbalance is still present
            rfr = RandomForestClassifier(
                n_estimators=300,
                class_weight="auto",
                n_jobs=-1,
                criterion="entropy",
                max_features=X.shape[1],
                min_samples_split=1,
            )
            gbm = xgb.XGBClassifier(n_estimators=50, learning_rate=0.5, colsample_bytree=0.3).fit(X[traini], Y[traini])

            rfr.fit(X[traini], Y[traini])
            pred = rfr.predict(X[testi])
            pred1 = gbm.predict(X[testi])
            # Print to screen mean error and Tolerance Score
            tempacc, trueRF = tolAcc(Y[testi], pred)
            print("Random Forest: %s" % tempacc)

            tempacc1, trueXGB = tolAcc(Y[testi], pred1)
            print("XGBoost: %s" % tempacc1)
            totalXGB += trueXGB
            totalRF += trueRF
            totalacc += tempacc

        print("True RF: {0}".format(totalRF / n_folds))
        print("True XGB: {0}".format(totalXGB / n_folds))
        print("LOSO TP accuracy: {0}".format(totalacc / n_folds))

    elif sys.argv[1] == "-cali":
        # These parameters have been computed with RandomizedSearchCV
        rf_c = RandomForestClassifier(
            n_estimators=300,
            bootstrap=False,
            class_weight="auto",
            n_jobs=-1,
            criterion="entropy",
            max_features=15,
            min_samples_split=1,
        )
        gbm = xgb.XGBClassifier(
            n_estimators=300,
            learning_rate=0.2,
            colsample_bytree=0.5,
            objective="multi:softmax",
            max_depth=15,
            gamma=0.001,
        )

        # Non-calibrated random forest
        rf_c.fit(X[train], Y[train])
        pred = rf_c.predict(X[test])
        tolac, trueacc = tolAcc(Y[test], pred)
        print(tolac)

        # Using isotonic calibration with 3-fold cv to improve results
        # Both on RF and XGBoost
        rf_c1 = RandomForestClassifier(
            n_estimators=300,
            bootstrap=False,
            class_weight="auto",
            n_jobs=-1,
            criterion="entropy",
            max_features=15,
            min_samples_split=1,
        )

        cc = CalibratedClassifierCV(rf_c1, method="isotonic", cv=3)
        cc.fit(X[train], Y[train])
        pred = cc.predict(X[test])
        tolac, trueacc = tolAcc(Y[test], pred)
        print(tolac)

        cc = CalibratedClassifierCV(gbm, method="isotonic", cv=3)
        cc.fit(X[train], Y[train])
        pred = cc.predict(X[test])
        tolac, trueacc = tolAcc(Y[test], pred)
        print(tolac)

        # Compare with the non-calibrated XGBoost
        gbm.fit(X[train], Y[train])
        pred = gbm.predict(X[test])
        tolac, trueacc = tolAcc(Y[test], pred)
        print(tolac)
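# The example above keeps the synthetic ADASYN samples out of the test set by
# placing them at the front of the array and then splitting by position. A
# small stand-alone sketch of the same idea (function and variable names here
# are illustrative, not taken from the original project):
import numpy as np

def split_keep_synthetic_in_train(X_real, y_real, X_syn, y_syn, train_frac=0.7):
    # Synthetic samples first, followed by the (already shuffled) real ones;
    # the first train_frac of rows form the training set, so the tail, and
    # therefore the test set, contains only real samples, provided the
    # synthetic block is smaller than the training split.
    X_all = np.concatenate((X_syn, X_real), axis=0)
    y_all = np.concatenate((y_syn, y_real), axis=0)
    cut = int(train_frac * X_all.shape[0])
    train_idx = np.arange(0, cut)
    test_idx = np.arange(cut, X_all.shape[0])
    return X_all, y_all, train_idx, test_idx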
def data_prepration(
    x_features, x_labels
):  # preparing data for training and testing as we are going to use different data
    x_features_train, x_features_test, x_labels_train, x_labels_test = train_test_split(
        x_features, x_labels, test_size=0.3, random_state=0)
    print("length of training data")
    print(len(x_features_train))
    print("length of test data")
    print(len(x_features_test))
    return (x_features_train, x_features_test, x_labels_train, x_labels_test)


adsn = ADASYN(k=7, imb_threshold=0.6, ratio=6.5)
x = data
x_features = x.ix[:, x.columns != "Class"]
x_labels = x.ix[:, x.columns == "Class"]
os_data_X, os_data_y = adsn.fit_transform(
    x_features.values, [i[0] for i in x_labels.values])  # first oversampling
#  then splitting
data_train_X, data_test_X, data_train_y, data_test_y = data_prepration(
    os_data_X, os_data_y)
columns = x_features.columns
print(columns)

data_train_X = pd.DataFrame(data=data_train_X, columns=columns)
data_train_y = pd.DataFrame(data=data_train_y, columns=["Class"])
data_test_X = pd.DataFrame(data=data_test_X, columns=columns)
data_test_y = pd.DataFrame(data=data_test_y, columns=["Class"])
os_data_X = data_train_X
os_data_y = data_train_y
data_test_X_pandas = pd.DataFrame(data=data_test_X, columns=columns)
# now we divide our data into training and test data
# Call our data_prepration method on our dataset
Example #8
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score, np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")


X = np.load('data/X51.npy')
Y = np.load('data/y51.npy')

# fixes errors with NaN data
X = preprocessing.Imputer().fit_transform(X)
print(X.shape, Y.shape)

# Recursive oversampling and undersampling
adsn = ADASYN(imb_threshold=0.5, ratio=0.7)
X, Y = adsn.fit_transform(X, Y)
X, Y = deleteClass(X, Y, 100, 2)
print(int(np.sqrt(X.shape[1])))

# Create the classifiers and set up a cross-validated hyperparameter search.
rf = RandomForestClassifier(n_jobs=-1)
gbm = xgb.XGBClassifier(n_estimators=300)
# The "accuracy" scoring is proportional to the number of correct
# classifications

param_dist = {
    "n_estimators": [10, 50, 100, 150, 300],
    "criterion": ['gini', 'entropy'],
    "bootstrap": [True, False],
    "max_features": [10, 20, 30, 40, 45, 48],
    "class_weight": ['auto']
Example #9
y_trai = y_train.copy().as_matrix()
X_tes = X_test.copy()
columns = X_train.columns.values
oversample = False
from sklearn.grid_search import ParameterGrid
grid = ParameterGrid({"k": [3,4,5,6,7],
                          "ratio": [0.1,0.2,0.3,0.4,0.5] })
def plot_scores(scores):
    scores = pd.DataFrame(scores,
                          index=['RT', 'gbc', 'ets', 'lgr', 'adboost', 'dt', 'voting'],
                          columns=['auc', 'accuracy', 'f1', 'precision', 'recall', 'kappa'])
    scores.plot()
if oversample:
    # scores = []
    # for params in grid:
    #     print(params)
    adsn = ADASYN(imb_threshold=0.8, ratio=1)
    X_trai, y_trai = adsn.fit_transform(X_trai, y_trai)  # your imbalanced dataset is in X, y
    # X_trai, y_trai = u.test_rest(X_trai, y_trai)
    u.all_lassifer(X_trai, y_trai, X_test, y_test)
    # u.all_lassifer(X_trai, y_trai, X_tes, y_tes)
    # scores.append(u.boostingClassifier(X_trai, y_trai, X_tes, y_tes))
    # scores = pd.DataFrame(scores, columns=['auc', 'f1', 'accuracy', 'precision', 'recall', 'kappa'])
    # print(Counter(y_trai))
else:
    # scores = []
    predict = []
    grid = ParameterGrid({'c': [0]})
    for params in grid:
        print(params)
        X_trai, y_trai = u.test_rest(X_trai, y_trai, ratio=3, **params)
        X_trai, y_trai = u.test_smote(X_trai, y_trai, c=0)
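# The ParameterGrid over ADASYN's k and ratio defined above is only exercised
# in commented-out code. A small self-contained sketch of such a sweep is
# given below; tune_adasyn and score_fn are illustrative names, not part of
# the original project (ADASYN and ParameterGrid are already used in this
# snippet):
def tune_adasyn(X, y, score_fn):
    # Try every (k, ratio) combination and keep the resampling that scores best
    best_params, best_score = None, None
    for params in ParameterGrid({"k": [3, 4, 5, 6, 7],
                                 "ratio": [0.1, 0.2, 0.3, 0.4, 0.5]}):
        X_os, y_os = ADASYN(imb_threshold=0.8, **params).fit_transform(X, y)
        s = score_fn(X_os, y_os)
        if best_score is None or s > best_score:
            best_params, best_score = params, s
    return best_params, best_score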
        
Example #10
def adasyn_this_test(all_x, all_y, tolerable_imbalance=0.9):
    from adasyn import ADASYN
    adsn = ADASYN(k=7, imb_threshold=tolerable_imbalance, ratio=0.75)
    new_X, new_y = adsn.fit_transform(all_x, all_y)
    return new_X, new_y
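# Example usage of the helper above; the arrays here are synthetic stand-ins,
# purely for illustration:
if __name__ == '__main__':
    import numpy as np
    rng = np.random.RandomState(0)
    all_x = rng.randn(200, 5)
    all_y = np.array([0] * 180 + [1] * 20)  # a 90/10 imbalanced toy problem
    new_X, new_y = adasyn_this_test(all_x, all_y, tolerable_imbalance=0.9)
    print(new_X.shape, new_y.shape)  # the minority class should be oversampled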