Example #1
0
 def __init__(self, X, categorical_features):
     """Configure the encoder for dense output and fit it to ``X`` right away.

     The given ``categorical_features`` and ``sparse=False`` are forwarded
     to ``OneHotEncoder.__init__``; afterwards ``self.fit(X)`` is called,
     so fitting happens as part of construction.

     NOTE(review): presumably the enclosing class derives from
     ``OneHotEncoder`` -- the class header is not visible here; confirm.

     :param X: data handed unchanged to ``self.fit``.
     :param categorical_features: forwarded as the keyword argument of the
         same name to ``OneHotEncoder.__init__``.
     """
     encoder_options = {
         'categorical_features': categorical_features,
         'sparse': False,
     }
     OneHotEncoder.__init__(self, **encoder_options)
     self.fit(X)
Example #2
0
def preProcessData(trainFeatureMatrix, testFeatureMatrix):
    """One-hot encode the categorical columns, then drop constant features.

    Steps:

    1. A ``OneHotEncoder`` is fitted on the row-wise concatenation of the
       train and test matrices, so the encoder sees the values of both
       sets, and is then applied to each matrix separately (dense output).
    2. A ``VarianceThreshold`` with default settings is fitted on the
       encoded *train* matrix only and applied to both matrices.

    The feature count before and after step 2 is printed for each matrix.

    NOTE(review): the ``categorical_features`` argument of
    ``OneHotEncoder`` was deprecated in scikit-learn 0.20 and removed in
    0.22; this function therefore needs an older scikit-learn (or a port
    to ``ColumnTransformer``) -- confirm the pinned version.

    :param trainFeatureMatrix: 2-D array of training features; must contain
        every column index listed below (assumes 52 columns -- TODO confirm).
    :param testFeatureMatrix: 2-D array of test features with the same
        column layout as ``trainFeatureMatrix``.
    :return: tuple ``(train, test)`` of the transformed dense matrices.
    """
    # Kept as a function-local import, exactly like the original code did.
    # ``np`` and ``VarianceThreshold`` come from the module-level imports.
    from sklearn.preprocessing import OneHotEncoder

    # Column indices that are one-hot encoded. Columns not listed in either
    # list are handed to the encoder as non-categorical features.
    categoricalAttriIndexList = [
        0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 44, 45, 46,
    ]
    cateNumericIndexList = [
        1, 6, 15, 16, 18, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 49, 50, 51,
    ]
    encodedColumnIndexList = categoricalAttriIndexList + cateNumericIndexList

    # Fit on train + test together so that transform() knows the values
    # occurring in either set.
    combinedMatrix = np.concatenate(
        (trainFeatureMatrix, testFeatureMatrix), axis=0)

    # Pass the option to the constructor instead of re-invoking __init__ on
    # an already constructed instance.
    enc = OneHotEncoder(categorical_features=encodedColumnIndexList)
    enc.fit(combinedMatrix)
    encodedTrain = enc.transform(trainFeatureMatrix).toarray()
    encodedTest = enc.transform(testFeatureMatrix).toarray()

    # Single-argument print(...) behaves identically on Python 2 and 3; the
    # double space reproduces the output of the former print statement.
    print('old feature num is  %d %d'
          % (encodedTrain.shape[1], encodedTest.shape[1]))

    # The selector is fitted on the training data only and then applied to
    # both sets, so both keep the same columns.
    sel = VarianceThreshold()
    sel.fit(encodedTrain)
    selectedTrain = sel.transform(encodedTrain)
    selectedTest = sel.transform(encodedTest)

    print('new feature num is  %d %d'
          % (selectedTrain.shape[1], selectedTest.shape[1]))
    return selectedTrain, selectedTest