def test_rest(x, y):
    """Run each of the remaining resamplers once on ``(x, y)``.

    For every technique a label is printed, the resampler is constructed
    with the ``verbose`` name taken from the enclosing scope (it is not a
    parameter of this function), and ``fit_transform(x, y)`` is called.
    The resampled outputs are discarded and nothing is returned.

    Args:
        x: Passed unchanged as the first argument of every ``fit_transform``.
        y: Passed unchanged as the second argument of every ``fit_transform``.
    """
    # (label, factory) pairs in execution order. Factories are lambdas so that
    # each resampler is only built after its label has been printed.
    runs = [
        ('Random under-sampling',
         lambda: UnderSampler(verbose=verbose)),
        ('Tomek links',
         lambda: TomekLinks(verbose=verbose)),
        ('Clustering centroids',
         lambda: ClusterCentroids(verbose=verbose)),
        ('NearMiss-1',
         lambda: NearMiss(version=1, verbose=verbose)),
        ('NearMiss-2',
         lambda: NearMiss(version=2, verbose=verbose)),
        ('NearMiss-3',
         lambda: NearMiss(version=3, verbose=verbose)),
        ('Neighboorhood Cleaning Rule',
         lambda: NeighbourhoodCleaningRule(verbose=verbose)),
        ('Random over-sampling',
         lambda: OverSampler(verbose=verbose)),
        ('SMOTE Tomek links',
         lambda: SMOTETomek(verbose=verbose)),
        ('SMOTE ENN',
         lambda: SMOTEENN(verbose=verbose)),
        ('EasyEnsemble',
         lambda: EasyEnsemble(verbose=verbose)),
    ]

    for label, make_sampler in runs:
        print(label)
        sampler = make_sampler()
        # Unpack into two names so a result that is not a pair still fails
        # here; the values themselves are not used.
        _resampled_x, _resampled_y = sampler.fit_transform(x, y)
def clustering_centroids(self):
    """Resample ``self.x`` / ``self.y`` with ``ClusterCentroids``.

    Builds a ``ClusterCentroids`` instance configured with ``self.verbose``,
    calls ``fit_transform`` on the stored data, prints a confirmation
    message and returns the two values that ``fit_transform`` produced.

    Returns:
        tuple: ``(ccx, ccy)`` as unpacked from ``fit_transform``.
    """
    sampler = ClusterCentroids(verbose=self.verbose)
    ccx, ccy = sampler.fit_transform(self.x, self.y)
    # Single-argument parenthesised form: emits the same text under the
    # Python 2 print statement and the Python 3 print function, whereas the
    # bare statement form is a SyntaxError on Python 3.
    print("Clustering Centroids Transformed")
    return ccx, ccy
# Scatter the rows of x_vis whose label in y equals 1, then add the legend
# and display the figure.
plt.scatter(x_vis[y == 1, 0], x_vis[y == 1, 1],
            facecolor='blue', edgecolor=almost_black, linewidth=0.15,
            alpha=0.5, label="Class #1")
plt.legend()
plt.show()

# Build resampled datasets with the under-sampling methods below. Every
# sampler receives the same verbose flag and is applied to the original
# (x, y); each result is stored under its own pair of names.
verbose = False

# Random under-sampling
US = UnderSampler(verbose=verbose)
(usx, usy) = US.fit_transform(x, y)

# Tomek links
TL = TomekLinks(verbose=verbose)
(tlx, tly) = TL.fit_transform(x, y)

# Clustering centroids
CC = ClusterCentroids(verbose=verbose)
(ccx, ccy) = CC.fit_transform(x, y)

# NearMiss, versions 1 to 3
NM1 = NearMiss(version=1, verbose=verbose)
(nm1x, nm1y) = NM1.fit_transform(x, y)

NM2 = NearMiss(version=2, verbose=verbose)
(nm2x, nm2y) = NM2.fit_transform(x, y)

NM3 = NearMiss(version=3, verbose=verbose)
(nm3x, nm3y) = NM3.fit_transform(x, y)

# Condensed Nearest Neighbour
CNN = CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51,
                                verbose=verbose)
(cnnx, cnny) = CNN.fit_transform(x, y)

# One-Sided Selection (only constructed here)
OSS = OneSidedSelection(size_ngh=51, n_seeds_S=51,
                        verbose=verbose)
def main(argv):
    """Load the epoch dataset and run the experiment chosen on the command line.

    The feature matrix, targets and group labels are read from the ``.npy``
    files under ``numdata/``; ``deleteClass`` is then applied twice to ``X``
    and ``Y``. The experiment is selected by ``sys.argv[1]``:

    * ``-first``: fit a ``RandomForestRegressor`` on every split yielded by
      ``LeaveOneLabelOut(labels)`` and predict the held-out rows.
    * ``-ensemble``: fit one ``RandomForestRegressor`` per feature group and
      stack their predictions with an ``ExtraTreesClassifier``, then print
      the result of ``tolAcc`` on the held-out rows.

    Args:
        argv: Accepted for the conventional ``main(argv)`` signature but not
            read; the mode is taken from ``sys.argv[1]`` directly.

    Raises:
        IndexError: If the script is started without a mode argument.
    """
    X = np.load('numdata/epochFeats.npy')
    Y = np.load('numdata/epochLabels.npy')
    labels = np.load('numdata/LOO.npy')
    print(X.shape, Y.shape)

    X, Y = deleteClass(X, Y, 330, 2)
    X, Y = deleteClass(X, Y, 70, 1)
    # NOTE(review): ``labels`` is not passed through deleteClass. If that
    # helper removes rows, the label-based splits below may no longer line up
    # with X and Y -- confirm against deleteClass.

    if sys.argv[1] == '-first':
        print(X.shape, Y.shape, labels.shape)
        folds = 10

        forest = RandomForestRegressor(n_estimators=100, n_jobs=-1)
        lolo = LeaveOneLabelOut(labels)
        print(lolo, len(lolo))

        acc = 0
        for train_index, test_index in lolo:
            print(len(train_index), len(test_index))
            forest.fit(X[train_index], Y[train_index])
            scores = forest.predict(X[test_index])
            # NOTE(review): ``scores`` is never scored against the held-out
            # targets, so ``acc`` is not updated inside this loop.
        # Because nothing is accumulated above, this always prints zero.
        print(acc / folds)

    # Per-feature-group Random Forest regressors stacked with an
    # ExtraTreesClassifier.
    elif sys.argv[1] == '-ensemble':
        outputRF = []
        outRFtest = []
        print(X.shape, Y.shape)

        # Column ranges of X, one per feature category.
        activityData = X[:, 0:3]
        screenData = X[:, 3:14]
        conversationData = X[:, 14:20]
        colocationData = X[:, 20:26]
        audioData = X[:, 26:]

        # Shuffle the row indices once and cut them into three disjoint
        # parts at 50% and 80% of the rows:
        #   1) first 50%  -> fit the per-group regressors
        #   2) next 30%   -> regressor outputs used to fit the stacking
        #                    classifier
        #   3) last 20%   -> evaluate the stacked model
        n_samples = X.shape[0]
        indexes = np.arange(n_samples)
        np.random.shuffle(indexes)

        first_cut = int(0.5 * n_samples)
        second_cut = int(0.8 * n_samples)
        train_index = indexes[:first_cut]
        train_index2 = indexes[first_cut:second_cut]
        test_index = indexes[second_cut:]
        print(len(train_index), len(train_index2), len(test_index))

        # One regressor per feature group; each may use all of its group's
        # columns at every split (max_features = number of columns).
        feature_groups = [activityData, screenData, conversationData,
                          colocationData, audioData]
        for data in feature_groups:
            regressor = RandomForestRegressor(n_estimators=300,
                                              max_features=data.shape[1],
                                              n_jobs=-1)
            regressor.fit(data[train_index], Y[train_index])
            outputRF.append(regressor.predict(data[train_index2]))
            outRFtest.append(regressor.predict(data[test_index]))

        # Transpose so that rows are samples and columns are the regressors'
        # predictions.
        middleTrainMat = np.transpose(np.array(outputRF))
        testMat = np.transpose(np.array(outRFtest))

        # Classifier that combines the regressor outputs.
        class_weights = {0: 1, 1: 0.5, 2: 0.1, 3: 0.6, 4: 1}
        print(class_weights)
        rfr = ExtraTreesClassifier(n_estimators=300,
                                   class_weight=class_weights,
                                   n_jobs=-1)
        rfr.fit(middleTrainMat, Y[train_index2])
        print(middleTrainMat.shape)
        pred = rfr.predict(testMat)

        # Print whatever tolAcc reports for the held-out part.
        print(tolAcc(Y[test_index], pred, testMat))