Example #1
def test_rest(x, y, verbose=False):
    # The samplers below come from the legacy UnbalancedDataset package
    # (the predecessor of imbalanced-learn); each fit_transform returns a
    # resampled copy of (x, y).

    print('Random under-sampling')
    US = UnderSampler(verbose=verbose)
    usx, usy = US.fit_transform(x, y)

    print('Tomek links')
    TL = TomekLinks(verbose=verbose)
    tlx, tly = TL.fit_transform(x, y)

    print('Clustering centroids')
    CC = ClusterCentroids(verbose=verbose)
    ccx, ccy = CC.fit_transform(x, y)

    print('NearMiss-1')
    NM1 = NearMiss(version=1, verbose=verbose)
    nm1x, nm1y = NM1.fit_transform(x, y)

    print('NearMiss-2')
    NM2 = NearMiss(version=2, verbose=verbose)
    nm2x, nm2y = NM2.fit_transform(x, y)

    print('NearMiss-3')
    NM3 = NearMiss(version=3, verbose=verbose)
    nm3x, nm3y = NM3.fit_transform(x, y)

    print('Neighbourhood Cleaning Rule')
    NCR = NeighbourhoodCleaningRule(verbose=verbose)
    ncrx, ncry = NCR.fit_transform(x, y)

    print('Random over-sampling')
    OS = OverSampler(verbose=verbose)
    ox, oy = OS.fit_transform(x, y)

    print('SMOTE Tomek links')
    STK = SMOTETomek(verbose=verbose)
    stkx, stky = STK.fit_transform(x, y)

    print('SMOTE ENN')
    SENN = SMOTEENN(verbose=verbose)
    sennx, senny = SENN.fit_transform(x, y)

    print('EasyEnsemble')
    EE = EasyEnsemble(verbose=verbose)
    eex, eey = EE.fit_transform(x, y)
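A minimal invocation sketch (assumed setup, not part of the original example):

from sklearn.datasets import make_classification

# Build a small imbalanced two-class dataset and run every sampler once.
x, y = make_classification(n_samples=2000, weights=[0.9, 0.1], random_state=0)
test_rest(x, y, verbose=False)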
def clustering_centroids(self):
    # 'Clustering centroids'
    CC = ClusterCentroids(verbose=self.verbose)
    ccx, ccy = CC.fit_transform(self.x, self.y)
    print("Clustering Centroids Transformed")
    return ccx, ccy
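This method reads self.x, self.y, and self.verbose, so it belongs to a class
whose definition is not shown here. A minimal, purely hypothetical container
could look like:

class ResamplerDemo:
    # Hypothetical wrapper; the original class is not part of this example.
    def __init__(self, x, y, verbose=False):
        self.x = x
        self.y = y
        self.verbose = verbose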
plt.scatter(x_vis[y==1, 0], x_vis[y==1, 1], label="Class #1", alpha=0.5, 
            edgecolor=almost_black, facecolor='blue', linewidth=0.15)

plt.legend()
plt.show()
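The scatter plot above assumes that x, y, a 2-D projection x_vis, and the
color almost_black were defined earlier. A minimal sketch of that assumed
setup:

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

almost_black = '#262626'
# Imbalanced two-class dataset plus a 2-D PCA projection for plotting.
x, y = make_classification(n_classes=2, weights=[0.1, 0.9], n_features=20,
                           n_samples=5000, random_state=0)
x_vis = PCA(n_components=2).fit_transform(x)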

# Generate the new dataset using under-sampling method
verbose = False
# 'Random under-sampling'
US = UnderSampler(verbose=verbose)
usx, usy = US.fit_transform(x, y)
# 'Tomek links'
TL = TomekLinks(verbose=verbose)
tlx, tly = TL.fit_transform(x, y)
# 'Clustering centroids'
CC = ClusterCentroids(verbose=verbose)
ccx, ccy = CC.fit_transform(x, y)
# 'NearMiss-1'
NM1 = NearMiss(version=1, verbose=verbose)
nm1x, nm1y = NM1.fit_transform(x, y)
# 'NearMiss-2'
NM2 = NearMiss(version=2, verbose=verbose)
nm2x, nm2y = NM2.fit_transform(x, y)
# 'NearMiss-3'
NM3 = NearMiss(version=3, verbose=verbose)
nm3x, nm3y = NM3.fit_transform(x, y)
# 'Condensed Nearest Neighbour'
CNN = CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51, verbose=verbose)
cnnx, cnny = CNN.fit_transform(x, y)
# 'One-Sided Selection'
OSS = OneSidedSelection(size_ngh=51, n_seeds_S=51, verbose=verbose)
ossx, ossy = OSS.fit_transform(x, y)
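A quick, illustrative way to check the effect of any of the resamplings above
is to compare class counts before and after:

from collections import Counter

print(Counter(y), Counter(usy))  # e.g. original vs. random under-sampled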
Example #4
import sys
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor, ExtraTreesClassifier
# Legacy scikit-learn (<0.18) cross-validation API, matching the calls below.
from sklearn.cross_validation import KFold, LeaveOneLabelOut
# UnderSampler and ClusterCentroids come from the legacy UnbalancedDataset
# package; deleteClass and tolAcc are project-local helpers (not shown here).

def main(argv):
	X = np.load('numdata/epochFeats.npy')
	Y = np.load('numdata/epochLabels.npy')
	labels = np.load('numdata/LOO.npy')
	print(X.shape, Y.shape)
	# Trim over-represented classes (see the hypothetical sketch after this example).
	X, Y = deleteClass(X, Y, 330, 2)
	X, Y = deleteClass(X, Y, 70, 1)



	if argv[1] == '-first':
		print(X.shape, Y.shape, labels.shape)
		folds = 10
		#Pipeline stuff 
		forest = RandomForestRegressor(n_estimators=100, n_jobs = -1)
		scaler = preprocessing.StandardScaler()

		lolo = LeaveOneLabelOut(labels)
		print(lolo, len(lolo))
		acc = 0

		us = UnderSampler(verbose=True)

		#X,Y = us.fit_transform(X,Y)
		kf = KFold(Y.shape[0], n_folds=folds)  # defined but unused; the loop below uses lolo
		for train_index, test_index in lolo:
			print(len(train_index), len(test_index))
			Xtrain, Xtest = X[train_index], X[test_index]
			ytrain, ytest = Y[train_index], Y[test_index]
			
			forest.fit(Xtrain, ytrain)
			scores = forest.predict(Xtest)
			#acc += tolAcc(ytest,scores)

		# NOTE: with the accumulation above commented out, this prints 0;
		# the loop also runs len(lolo) folds, not `folds`.
		print(acc / folds)



	# Ensemble Random Forest Regressor stacked with Random Forest Classifier
	elif argv[1] == '-ensemble':
		RF  = []
		outputRF = []
		outRFtest=[]
	
		us = UnderSampler(verbose=True)
		cc = ClusterCentroids(verbose=True)
		#X, Y = cc.fit_transform(X, Y)
		print(X.shape, Y.shape)

		# Separating features into categories for ensemble training
		activityData = X[:, 0:3]
		screenData = X[:, 3:14]
		conversationData = X[:, 14:20]
		colocationData = X[:, 20:26]
		audioData = X[:, 26:X.shape[1]]

		# Custom nested cross-validation:
		# `indexes` splits the dataset in a 50/30/20 manner (see the slices below).
		# NOTE: a 30/30/40 split seemed to produce very similar results.
		indexes = np.arange(X.shape[0])
		np.random.shuffle(indexes)

		lolo = LeaveOneLabelOut(labels)  # computed but unused in this branch
	#	print(lolo,len(lolo))
		# separating data to 3 subsets: 
		# 1) Train RF
		# 2) Get RF outputs with which train NN
		# 3) Test NN output on the rest
		train_index = indexes[0:int(0.5 * X.shape[0])]
		train_index2 = indexes[int(0.5 * X.shape[0]):int(0.8 * X.shape[0])]
		test_index = indexes[int(0.8 * X.shape[0]):X.shape[0]]
		print(len(train_index), len(train_index2), len(test_index))
		# Training 5 regressors on 5 types of features
		feature_sets = [activityData, screenData, conversationData, colocationData, audioData]
		for i, data in enumerate(feature_sets):
			RF.append(RandomForestRegressor(n_estimators=300, max_features=data.shape[1], n_jobs=-1))
			RF[i].fit(data[train_index], Y[train_index])
			outputRF.append(RF[i].predict(data[train_index2]))
			outRFtest.append(RF[i].predict(data[test_index]))

		middleTrainMat = np.transpose(np.array(outputRF))
		testMat = np.transpose(np.array(outRFtest))
	

		# Tree-ensemble classifier stacked on the regressors' outputs
		class_weights = {0: 1, 1: 0.5, 2: 0.1, 3: 0.6, 4: 1}
		print(class_weights)
		rfr = ExtraTreesClassifier(n_estimators=300, class_weight=class_weights, n_jobs=-1)
		rfr.fit(middleTrainMat, Y[train_index2])
		print(middleTrainMat.shape)

		
		pred = rfr.predict(testMat)
		# Print mean error and tolerance score to screen
		print(tolAcc(Y[test_index], pred, testMat))
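deleteClass is a project-local helper whose implementation is not shown.
Judging only by its call sites, one plausible reading is that it drops a
fixed number of samples of one class to reduce imbalance; a purely
hypothetical sketch:

import numpy as np

def delete_class_sketch(X, Y, n_remove, label):
    # Hypothetical stand-in for the unshown deleteClass helper:
    # drop the first n_remove samples whose label equals `label`.
    drop = np.where(Y == label)[0][:n_remove]
    keep = np.setdiff1d(np.arange(len(Y)), drop)
    return X[keep], Y[keep]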