# Feature-count sweep: for each candidate number of ANOVA-selected features,
# fit a class-weighted L1 logistic regression on a train split and compute the
# net dollars obtained on the held-out split.
# NOTE(review): `rawdata` must already be loaded before this point; the load
# is not part of this section.

# Columns removed by hand after initial inspection of the data
# (R describe, pandas summary).
handRemoved = ['STATE', 'ZIP', 'CONTROLN']
rawdata = rawdata.drop(handRemoved, axis=1)

# Binarize the data (all columns with "object" datatype). prefix_sep keeps the
# source column name in every dummy column (e.g. OSOURCEBINCLL); passing
# prefix='BIN' instead would name them all BIN_<value>, so identical values
# from different source columns would collide.
bindata = pd.get_dummies(rawdata, prefix_sep='BIN')

# Normalize the remaining non-binary features, then replace N/A with 0.
# NOTE(review): the fill value is 0, not a computed mean; it equals the mean
# only if utils.normalize centres each feature -- confirm in utils.
bindata = utils.normalize(bindata)
bindata = bindata.fillna(0)

# Loop-invariant settings, hoisted out of the sweep.
classWeights = {0: 1, 1: 20}
unitCost = 0.68  # cost charged per positive prediction / per record

# Based on the ANOVA test with TARGET_B, keep only the `val` best features and
# sweep `val` over 200, 225, ..., 500.
vals = np.arange(200, 525, 25)
for val in vals:
    _, featdata = utils.bestfeat(bindata, val)

    # Split into training/test (second argument .30 is passed to
    # utils.trainTest unchanged).
    feat_train, feat_test, b_train, d_train, b_test, d_test = utils.trainTest(featdata, .30)

    # Logistic regression. solver='liblinear' is named explicitly because the
    # L1 penalty is not supported by scikit-learn's current default solver
    # (lbfgs); liblinear was the default when penalty='l1' alone sufficed.
    clf = LogisticRegression(class_weight=classWeights, penalty='l1',
                             solver='liblinear')
    clf.fit(feat_train, b_train)

    # Scoring: predicted $ vs. actual $ on the test sample.
    predictions = clf.predict(feat_test)
    # Sum of actual donations for all non-zero predictions, minus the cost
    # times the number of non-zero predictions.
    predCash = np.sum(np.multiply(predictions, d_test)) - unitCost * np.sum(predictions)
    # Baseline: all test donations minus the cost for every test record.
    normCash = np.sum(d_test) - unitCost * len(d_test)
# Train the donor classifier on the file given as the first command-line
# argument, then start preparing the prediction file given as the second.

# File to read is the first input argument.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in
# 2.0 (replacement: on_bad_lines='skip'); left unchanged here so the script
# keeps working on the pandas version it was written for.
rawdata = pd.read_csv(sys.argv[1], error_bad_lines=False)

# Columns removed by hand after initial inspection of the data.
handRemoved = ['STATE', 'ZIP', 'CONTROLN']
rawdata = rawdata.drop(handRemoved, axis=1)

# Binarize the data (all columns with "object" datatype); dummy columns are
# named <column>BIN<value>.
bindata = pd.get_dummies(rawdata, prefix_sep='BIN')

# Normalize the remaining non-binary features, then replace N/A with 0.
# The normalized frame was previously stored in `featdata`, which is
# overwritten by utils.bestfeat below before ever being read, so the model was
# trained on un-normalized data. Assigning back to `bindata` applies it.
# NOTE(review): make sure the prediction data goes through the same
# normalization before it is scored.
bindata = utils.normalize(bindata)
bindata = bindata.fillna(0)

# Based on the ANOVA test with TARGET_B, keep only the 400 best features.
# kbest is kept as well for use with the prediction data.
kbest, featdata = utils.bestfeat(bindata, 400)

# Split into training/test (second argument .30 is passed to utils.trainTest
# unchanged).
feat_train, feat_test, b_train, d_train, b_test, d_test = utils.trainTest(featdata, .30)

# Logistic regression. solver='liblinear' is named explicitly because the L1
# penalty is not supported by scikit-learn's current default solver (lbfgs);
# liblinear was the default when penalty='l1' alone sufficed.
classWeights = {0: 1, 1: 20}
clf = LogisticRegression(class_weight=classWeights, penalty='l1',
                         solver='liblinear')

# Overfitting, perhaps? These columns were not present in the test data, so
# they are removed before fitting. errors='ignore' keeps the drop from raising
# KeyError when one of them was not among the selected features.
absentInTest = ['OSOURCEBINCLL', 'OSOURCEBINPTP', 'RFA_3BINA2C',
                'RFA_6BINU1C', 'RFA_10BINA2B']
feat_train = feat_train.drop(absentInTest, axis=1, errors='ignore')
clf.fit(feat_train, b_train)

# Prepare the prediction data using the same steps as the training data.
# CONTROLN is saved before it is dropped along with the other hand-removed
# columns.
predata = pd.read_csv(sys.argv[2], error_bad_lines=False)
controln = predata['CONTROLN']
predata = predata.drop(handRemoved, axis=1)
binpred = pd.get_dummies(predata, prefix_sep='BIN')