Python preprocess Examples

Programming Language: Python

Namespace/Package Name: PreprocessData

Method/Function: preprocess

Examples at hotexamples.com: 2

Python preprocess - 2 examples found. These are the top-rated real-world Python examples of PreprocessData.preprocess extracted from open-source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: main.py Project: CUBigDataClass/Kiwi

def predict_testdata(cat_info, dataroot, n=5):
	"""Predict the most probable SKUs for every row of a test CSV file.

	Args:
		cat_info: mapping from category to a dict holding 'sku_info' (passed
			on to get_test_featuresets) and 'cls' (a fitted classifier that
			exposes classes_ and predict_proba).
		dataroot: path of the CSV file, read with pandas.read_csv; it must
			have a 'category' column.
		n: maximum number of SKUs to keep per row (defaults to 5).

	Returns:
		A list with one entry per CSV row; each entry is a flat list of class
		labels ordered from most to least probable.

	Raises:
		KeyError: if a row's category (or its 'sku_info'/'cls' entry) is
			missing from cat_info.
	"""
	testdat = pd.read_csv(dataroot)
	num_samples = len(testdat)
	skulist = []

	# range() rather than xrange() so the loop runs on Python 2 and 3 alike.
	for i in range(num_samples):
		# Slice instead of scalar-indexing so the row stays a data frame.
		dat = testdat.iloc[i:i+1]
		cat = dat['category'].iloc[0]
		catdat = preprocess(dat)

		try:
			catdic = cat_info[cat]['sku_info']
		except KeyError:
			print("Category %s is unseen!" % str(cat))
			# Bare raise keeps the original key and traceback.
			raise

		testfsets = get_test_featuresets(catdat, catdic)
		cls = cat_info[cat]['cls']
		yclasses = cls.classes_
		yall = cls.predict_proba(testfsets)
		# Negate so that argsort orders the columns by descending probability.
		ysort = np.argsort(-yall)

		# A slice clips silently when fewer than n classes exist, so no
		# IndexError handling is needed for short rows. Only a 1-D result
		# cannot take the two-axis slice; it is used whole, as before.
		# NOTE(review): predict_proba is expected to return a 2-D array, so
		# the 1-D branch is presumably never taken -- confirm.
		if ysort.ndim > 1:
			ybest = ysort[:, :n]
		else:
			ybest = ysort

		yout = yclasses[ybest]
		skulist.append(yout.flatten().tolist())

	return skulist

Example #2

Show file

File: main.py Project: CUBigDataClass/Kiwi

def main():
	"""Train one naive Bayes classifier per category and predict test SKUs.

	Reads "../data/train.csv" through groupByCat, fits a MultinomialNB
	(alpha=0.1) on the feature set of every category, runs predict_testdata
	on "../data/test_part.csv", and prints the predictions followed by the
	elapsed time measured with timeit.default_timer.
	"""
	start = timeit.default_timer()
	# Single-argument print(...) emits the same text on Python 2 and 3.
	print("read train data")
	dataroot = "../data/train.csv"

	gcat_dic = groupByCat(dataroot)

	# Per category: preprocess, select features, then train the classifier.
	cat_info = {}
	for cat in gcat_dic.keys():
		catdat = preprocess(gcat_dic[cat])
		sku_info, fset, skus = getFeatureSet(catdat)

		cls = naive_bayes.MultinomialNB(alpha=0.1)
		cls.fit(fset, skus)

		# predict_testdata reads both keys of this entry.
		cat_info[cat] = {'sku_info': sku_info, 'cls': cls}

	print("read test data")
	dataroot = "../data/test_part.csv"

	skulist = predict_testdata(cat_info, dataroot)
	print(skulist)

	stop = timeit.default_timer()

	# Formatted as one string so Python 2 does not print a tuple here.
	print("time is %s" % (stop - start))