Code Example #1
File: explore.py  Project: daanelson/ml_exercise
	#values to remove based on initial inspection of the data (e.g. DataFrame.describe() summary)
	handRemoved = ['STATE','ZIP','CONTROLN']
	rawdata = rawdata.drop(handRemoved,axis=1)

	#binarize resulting data (all columns with "object" datatype)
	bindata = pd.get_dummies(rawdata,prefix = 'BIN')

	#normalize all remaining non-binary features; any N/A values left after normalization are filled with 0
	bindata = utils.normalize(bindata)
	bindata = bindata.fillna(0)

	#based on ANOVA test with TARGET_B, keep only the val best features; sweep val over a range of candidate values
	vals = np.arange(200,525,25)
	for val in vals:
		_, featdata = utils.bestfeat(bindata,val)

		#split into training/test 
		feat_train, feat_test, b_train, d_train, b_test, d_test = utils.trainTest(featdata,.30)

		#logistic regression
		classWeights = {0:1,1:20}
		clf = LogisticRegression(class_weight = classWeights, penalty = 'l1', solver = 'liblinear')	#liblinear supports the l1 penalty
		clf.fit(feat_train, b_train)

		#scoring - compare predicted donations ($) vs. actual donations ($) from the test sample
		predictions = clf.predict(feat_test)

		#sum of all non-zero prediction actual donations minus cost * number of nonzero predictions
		predCash = np.sum(np.multiply(predictions,d_test)) - 0.68*(np.sum(predictions))
		normCash = np.sum(d_test) - 0.68*len(d_test)
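
Both examples call into a small utils module that is not shown on this page. The sketch below is one plausible, minimal implementation of the three helpers, assuming bestfeat wraps scikit-learn's SelectKBest with the ANOVA F-test against TARGET_B, trainTest splits off TARGET_B (donor yes/no) and TARGET_D (donation amount) before calling train_test_split, and normalize z-scores the non-binary numeric columns. The function signatures are inferred from the call sites; TARGET_D is an assumed column name, not confirmed by the snippets.

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

def normalize(df):
    #z-score the non-binary numeric columns; constant or all-NaN columns
    #would need extra care on real data
    numeric = df.select_dtypes(include=[np.number])
    nonbinary = [c for c in numeric.columns if df[c].nunique() > 2]
    out = df.copy()
    out[nonbinary] = (df[nonbinary] - df[nonbinary].mean()) / df[nonbinary].std()
    return out

def bestfeat(df, k):
    #keep the k features with the highest ANOVA F-score against TARGET_B
    X = df.drop(['TARGET_B', 'TARGET_D'], axis=1)
    y = df['TARGET_B']
    kbest = SelectKBest(f_classif, k=k).fit(X, y)
    cols = X.columns[kbest.get_support()]
    featdata = pd.concat([df[cols], df[['TARGET_B', 'TARGET_D']]], axis=1)
    return kbest, featdata

def trainTest(df, test_size):
    #split features plus both targets, returned in the order the call sites unpack
    X = df.drop(['TARGET_B', 'TARGET_D'], axis=1)
    b, d = df['TARGET_B'], df['TARGET_D']
    X_tr, X_te, b_tr, b_te, d_tr, d_te = train_test_split(
        X, b, d, test_size=test_size, random_state=0)
    return X_tr, X_te, b_tr, d_tr, b_te, d_te

With these definitions, utils.bestfeat(bindata, val) returns both the fitted selector and a reduced DataFrame that still carries the two target columns, which is what the unpacking in utils.trainTest(featdata, .30) above expects.
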
Code Example #2
File: predict.py  Project: daanelson/ml_exercise
	#file to read as input argument
	rawdata = pd.read_csv(sys.argv[1],error_bad_lines = False)

	#values to remove based on initial inspection of the data
	handRemoved = ['STATE','ZIP','CONTROLN']
	rawdata = rawdata.drop(handRemoved,axis=1)

	#binarize resulting data (all columns with "object" datatype)
	bindata = pd.get_dummies(rawdata,prefix_sep = 'BIN')

	#normalize all remaining non-binary features; any N/A values left after normalization are filled with 0
	bindata = utils.normalize(bindata)
	bindata = bindata.fillna(0)

	#based on ANOVA test with TARGET_B, keep only the 400 best features; return kbest as well for use with the held-out data
	kbest, featdata = utils.bestfeat(bindata,400)

	#split into training/test
	feat_train, feat_test, b_train, d_train, b_test, d_test = utils.trainTest(featdata,.30)

	#logistic regression
	classWeights = {0:1,1:20}
	clf = LogisticRegression(class_weight = classWeights, penalty = 'l1', solver = 'liblinear')	#liblinear supports the l1 penalty
	feat_train = feat_train.drop(['OSOURCEBINCLL','OSOURCEBINPTP','RFA_3BINA2C','RFA_6BINU1C','RFA_10BINA2B'],axis=1)	#overfitting, perhaps? These were not present in test data
	clf.fit(feat_train, b_train)

	#prepping prediction data using same steps as training data
	predata = pd.read_csv(sys.argv[2],error_bad_lines = False)
	controln = predata['CONTROLN']
	predata = predata.drop(handRemoved,axis=1)
	binpred = pd.get_dummies(predata,prefix_sep = 'BIN')
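
The second snippet is cut off before the actual prediction step. A hedged sketch of one way it could continue is below: normalize and fill the prediction features the same way as the training features, reindex the one-hot columns to exactly those the classifier was fit on (dummy values that appear in only one of the two files otherwise cause a column mismatch, which is what the manual drop of the OSOURCEBIN*/RFA_*BIN* columns above works around), then write out the IDs the model flags as likely donors. binpred, feat_train, clf, controln, and utils come from the snippet; the output column name and file name are made up for illustration.

import pandas as pd

#continuation sketch (not part of the original predict.py)
binpred = utils.normalize(binpred).fillna(0)

#align prediction features with the exact training columns:
#dummies missing from the prediction data become 0, extras are dropped
binpred = binpred.reindex(columns=feat_train.columns, fill_value=0)

#predict and save the records the model flags as likely donors
preds = clf.predict(binpred)
pd.DataFrame({'CONTROLN': controln, 'PREDICTION': preds}).to_csv('predictions.csv', index=False)
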