def main():
    """Benchmark submission: rank each user's candidate events in random order."""
    train_df, test_df = u.get_train_test_df()
    events_by_user = u.get_user_events_dict(test_df)
    for uid in sorted(events_by_user):
        random.shuffle(events_by_user[uid])
    u.write_submission("random_benchmark.csv", events_by_user)
def main():
    """Benchmark submission: rank each user's events by yes-RSVP count, descending."""
    train_df, test_df = u.get_train_test_df()
    events_by_user = u.get_user_events_dict(test_df)
    attendees = u.get_event_attendees()
    yes_by_event = u.get_event_responses_dict(attendees["event"], attendees["yes"])
    for uid in events_by_user:
        ranked = sorted(events_by_user[uid],
                        key=lambda ev: len(yes_by_event[ev]),
                        reverse=True)
        events_by_user[uid] = ranked
    u.write_submission("event_popularity_benchmark.csv", events_by_user)
def clean_df():
    """Build train/test feature frames from date parts plus cleaned columns.

    Object (string) columns are label-encoded by their sorted-unique rank in
    the TRAIN frame; values unseen in train map to -1.  Numeric columns have
    missing values replaced with 0.  SalePrice is appended to the train frame.
    Returns (train_fea, test_fea).
    """
    train, test = util.get_train_test_df()
    columns = set(train.columns)
    for dropped in ("SalesID", "SalePrice", "saledate"):
        columns.remove(dropped)
    train_fea = get_date_dataframe(train["saledate"])
    test_fea = get_date_dataframe(test["saledate"])
    for col in columns:
        if train[col].dtype != np.dtype('object'):
            # numeric column: zero-fill missing values
            train_fea = train_fea.join(train[col].fillna(0))
            test_fea = test_fea.join(test[col].fillna(0))
        else:
            # categorical column: code = position in the sorted unique values
            levels = np.unique(train[col].fillna(-1).values)
            codes = pd.Series(range(len(levels)), index=levels)
            train_fea = train_fea.join(train[col].map(codes).fillna(-1))
            test_fea = test_fea.join(test[col].map(codes).fillna(-1))
    train_fea = train_fea.join(train['SalePrice'])
    return train_fea, test_fea
def preprocess1(): train, test = util.get_train_test_df() columns = set(train.columns) columns.remove("SalesID") columns.remove("SalePrice") columns.remove("saledate") train_fea = get_date_dataframe(train["saledate"]) test_fea = get_date_dataframe(test["saledate"]) for col in columns: print col types = set(type(x) for x in train[col]) if str in types: print "in:", col s = set(x for x in train[col]) str_to_categorical = defaultdict(lambda: -1, [(x[1], x[0]) for x in enumerate(s)]) #print str_to_categorical #return train_fea = train_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in train[col]]}, index=train.index)) test_fea = test_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in test[col]]}, index=test.index)) else: train_fea = train_fea.join(train[col]) test_fea = test_fea.join(test[col]) train_fea.to_csv("train_fea.csv", index=False) test_fea.to_csv("test_fea.csv", index=False) train["SalePrice"].to_csv("target.csv", index=False) train["SalesID"].to_csv("train_salesID.csv", index=False) test["SalesID"].to_csv("test_salesID.csv", index=False)
def preprocess(): train, test = util.get_train_test_df() columns = set(train.columns) #columns.remove("SalesID") #columns.remove("SalePrice") #columns.remove("saledate") #train_fea = get_date_dataframe(train["saledate"]) #test_fea = get_date_dataframe(test["saledate"]) #parseColumns = ["UsageBand"] parseColumns = [ "UsageBand","fiBaseModel","fiModelSeries","fiModelDescriptor","ProductSize","ProductGroup","Drive_System","Enclosure","Forks","Pad_Type","Ride_Control","Stick","Transmission","Turbocharged","Blade_Extension","Blade_Width","Enclosure_Type","Engine_Horsepower","Hydraulics","Pushblock","Ripper","Scarifier","Tip_ControlCoupler","Coupler_System","Grouser_Tracks","Hydraulics_Flow","Track_Type","Thumb","Pattern_Changer","Grouser_Type","Backhoe_Mounting","Blade_Type","Travel_Controls","Differential_Type","Steering_Controls"] #"auctioneerID","state","ProductGroupDesc",,"fiSecondaryDesc" # this is redundant "fiModelDesc", and has too many options... # Q, AC, AL AR AS colDict = {} for col in parseColumns: colDict[col] = [] colMap = {} notInTest = [] for index, col in enumerate(train.columns): print "MAP:", col, index colMap[col] = index if col in parseColumns: #print "start" s = set(x for x in train[col].fillna(0)) # 0 if x == "" or not isinstance(x, float) else x s.update(x for x in test[col].fillna(0)) # math.isnan(x) colDict[col] = s print s if col == "fiBaseModel": a = set(x for x in train[col].fillna(0)) b = set(x for x in test[col].fillna(0)) print "fiBaseModel" print print # found 11 type in test not in train print [x for x in b if x not in a] print print # found several hundred in train that are not in test, try dropping these... 
print [x for x in a if x not in b] notInTest = [x for x in a if x not in b] SaleIDArr = [] trainSalePriceArr = [] count = 0 csv_io.delete_file("train1.csv") for row in train.iterrows(): trainSalePrice = [] rowVals = row[1].fillna(0) newSet = [] newRow = [] if rowVals["fiBaseModel"] not in notInTest: continue trainSalePrice.append(rowVals["SalePrice"]) trainSalePriceArr.append(trainSalePrice) SaleID = [] SaleID.append(rowVals["SalesID"]) SaleIDArr.append(SaleID) for col in colDict.keys(): for val in colDict[col]: if val == rowVals[col] : newRow.append(1) else: newRow.append(0) #newRow.append(rowVals["YearMade"]) # need to calculate age, sale date minus year newRow.append(rowVals["MachineHoursCurrentMeter"]) count += 1 if count % 10000 == 0: print "Count", count newSet.append(newRow) csv_io.write_delimited_file("train1.csv", newSet ,header=None, delimiter=",", filemode="a") csv_io.write_delimited_file("target.csv", trainSalePriceArr ,header=None, delimiter=",") csv_io.write_delimited_file("train_salesID.csv", SaleIDArr ,header=None, delimiter=",") # ------------------------------------------- SaleIDArr = [] count = 0 csv_io.delete_file("test1.csv") for row in test.iterrows(): rowVals = row[1].fillna(0) newSet = [] newRow = [] SaleID = [] SaleID.append(rowVals["SalesID"]) SaleIDArr.append(SaleID) for col in colDict.keys(): for val in colDict[col]: if val == rowVals[col] : newRow.append(1) else: newRow.append(0) #newRow.append(rowVals["YearMade"]) # need to calculate age, sale date minus year newRow.append(rowVals["MachineHoursCurrentMeter"]) count += 1 if count % 10000 == 0: print "Count", count newSet.append(newRow) csv_io.write_delimited_file("test1.csv", newSet ,header=None, delimiter=",", filemode="a") csv_io.write_delimited_file("test_salesID.csv", SaleIDArr ,header=None, delimiter=",") if __name__=="__main__": preprocess()
def preprocess(): train, test = util.get_train_test_df() columns = set(train.columns) #columns.remove("SalesID") #columns.remove("SalePrice") #columns.remove("saledate") #train_fea = get_date_dataframe(train["saledate"]) #test_fea = get_date_dataframe(test["saledate"]) #parseColumns = ["UsageBand"] parseColumns = [ "UsageBand","fiBaseModel","fiModelSeries","fiModelDescriptor","ProductSize","ProductGroup","Drive_System","Enclosure","Forks","Pad_Type","Ride_Control","Stick","Transmission","Turbocharged","Blade_Extension","Blade_Width","Enclosure_Type","Engine_Horsepower","Hydraulics","Pushblock","Ripper","Scarifier","Tip_ControlCoupler","Coupler_System","Grouser_Tracks","Hydraulics_Flow","Track_Type","Thumb","Pattern_Changer","Grouser_Type","Backhoe_Mounting","Blade_Type","Travel_Controls","Differential_Type","Steering_Controls"] #"auctioneerID","state","ProductGroupDesc",,"fiSecondaryDesc" # this is redundant "fiModelDesc", and has too many options... # Q, AC, AL AR AS colDict = {} for col in parseColumns: colDict[col] = [] colMap = {} notInTest = [] for index, col in enumerate(train.columns): print "MAP:", col, index colMap[col] = index if col in parseColumns: #print "start" s = set(x for x in train[col].fillna(0)) # 0 if x == "" or not isinstance(x, float) else x s.update(x for x in test[col].fillna(0)) # math.isnan(x) colDict[col] = s print s if col == "fiBaseModel": a = set(x for x in train[col].fillna(0)) b = set(x for x in test[col].fillna(0)) print "fiBaseModel" print print # found 11 type in test not in train print [x for x in b if x not in a] print print # found several hundred in train that are not in test, try dropping these... 
print [x for x in a if x not in b] notInTest = [x for x in a if x not in b] SaleIDArr = [] trainSalePriceArr = [] count = 0 csv_io.delete_file("train1.csv") for row in train.iterrows(): trainSalePrice = [] rowVals = row[1].fillna(0) newSet = [] newRow = [] if rowVals["fiBaseModel"] not in notInTest: continue trainSalePrice.append(rowVals["SalePrice"]) trainSalePriceArr.append(trainSalePrice) SaleID = [] SaleID.append(rowVals["SalesID"]) SaleIDArr.append(SaleID) for col in colDict.keys(): for val in colDict[col]: if val == rowVals[col] : newRow.append(1) else: newRow.append(0) #newRow.append(rowVals["YearMade"]) # need to calculate age, sale date minus year newRow.append(rowVals["MachineHoursCurrentMeter"]) count += 1 if count % 10000 == 0: print "Count", count newSet.append(newRow) csv_io.write_delimited_file("train1.csv", newSet ,header=None, delimiter=",", filemode="a") csv_io.write_delimited_file("target.csv", trainSalePriceArr ,header=None, delimiter=",") csv_io.write_delimited_file("train_salesID.csv", SaleIDArr ,header=None, delimiter=",") # ------------------------------------------- SaleIDArr = [] count = 0 csv_io.delete_file("test1.csv") for row in test.iterrows(): rowVals = row[1].fillna(0) newSet = [] newRow = [] SaleID = [] SaleID.append(rowVals["SalesID"]) SaleIDArr.append(SaleID) for col in colDict.keys(): for val in colDict[col]: if val == rowVals[col] : newRow.append(1) else: newRow.append(0) #newRow.append(rowVals["YearMade"]) # need to calculate age, sale date minus year newRow.append(rowVals["MachineHoursCurrentMeter"]) count += 1 if count % 10000 == 0: print "Count", count newSet.append(newRow) csv_io.write_delimited_file("test1.csv", newSet ,header=None, delimiter=",", filemode="a") csv_io.write_delimited_file("test_salesID.csv", SaleIDArr ,header=None, delimiter=",")
from collections import defaultdict import numpy as np import pandas as pd from sklearn.ensemble import RandomForestRegressor import util def get_date_dataframe(date_column): return pd.DataFrame({ "SaleYear": [d.year for d in date_column], "SaleMonth": [d.month for d in date_column], "SaleDay": [d.day for d in date_column] }, index=date_column.index) train, test = util.get_train_test_df() columns = set(train.columns) columns.remove("SalesID") columns.remove("SalePrice") columns.remove("saledate") train_fea = get_date_dataframe(train["saledate"]) test_fea = get_date_dataframe(test["saledate"]) for col in columns: types = set(type(x) for x in train[col]) if str in types: s = set(x for x in train[col]) str_to_categorical = defaultdict(lambda: -1, [(x[1], x[0]) for x in enumerate(s)]) train_fea = train_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in train[col]]}, index=train.index)) test_fea = test_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in test[col]]}, index=test.index)) else:
def data_to_fea():
    """Main preprocessing: build the train/test feature frames.

    Depends on module globals not visible in this chunk: `testing`,
    `col_list`, `map_external_data`, `clean_columns`, `binarize_cols`,
    `get_date_dataframe`, `util`.
    Returns (train_fea, test_fea, sale_prices) when testing == 0, or
    (train_fea, test_fea, sale_prices, test_Y) otherwise.
    """
    train, test = util.get_train_test_df()
    if testing == 1:
        # Local-validation mode: carve Jan-Apr 2011 sales out of the train
        # frame as a pseudo test set, and train on everything before 2011.
        test = train[(train["saledate"] >= datetime.datetime(2011,1,1)) & (train["saledate"] < datetime.datetime(2011,5,1))]
        test = test.reset_index(drop = True)
        test_Y = test["SalePrice"]
        train = train[(train["saledate"] < datetime.datetime(2011,1,1))]
        train = train.reset_index(drop = True)  # drops the original index
        del test["SalePrice"]
        print 'test shape', test.shape

    train_fea = get_date_dataframe(train["saledate"])
    test_fea = get_date_dataframe(test["saledate"])
    train_fea = map_external_data( train_fea)
    test_fea = map_external_data(test_fea)

    print "Cleaning Columns"
    train, train_fea = clean_columns(train, train_fea)
    test, test_fea = clean_columns(test, test_fea)
    # train[["power_min", "power_max", "SalePrice", "ProductGroup", "YearMade"]].to_csv('power_min_out.csv', index=True)

    test_columns = set(test.columns)
    columns = set(train.columns)
    columns.remove("SalesID")
    columns.remove("SalePrice")
    columns.remove("saledate")
    for col in columns:
        # ["ProductGroupDesc", "fiProductClassDesc"] were deleted already
        # (presumably by clean_columns -- confirm).
        if col not in col_list :  # col_list: columns to skip entirely
            # find newest; populates fiSecondaryDesc
            # Need to fix so that train_fea returns SaleYear
            #if col == 'fiBaseModel':
            #    train_fea = newest_model(train)
            #    test_fea = newest_model(test)

            # Binarize these columns, even the numerical ones.
            if col in [ 'ProductGroup', 'state','fiModelSeries','fiModelDescriptor', 'auctioneerID', 'power_u', 'MfgID', 'Enclosure', 'SaleDay', "SaleWkDay", "fiProductClassDesc"] :
                print "binarize", col
                if col in ['auctioneerID', 'MfgID']:
                    # numeric IDs: fill missing with 0
                    train[col]=train[col].fillna(value =0)
                    test[col] = test[col].fillna(value = 0)
                else:
                    # string columns: fill missing with ""
                    train[col]=train[col].fillna(value ="")
                    test[col] = test[col].fillna(value = "")
                # NOTE(review): np.unique is given a *generator*, not an
                # array -- likely does not enumerate values as intended;
                # probably should be np.unique(np.append(...)).  `s` is
                # unused in this branch, so it is harmless here.
                s = np.unique(x for x in np.append(train[col].values,test[col].values))
                # would binarize BaseModel too, but it takes too much memory
                train_fea, test_fea = binarize_cols(col, train, test, train_fea, test_fea)
            elif train[col].dtype == np.dtype('object') and col not in ['power_min', 'power_max']:
                train[col]=train[col].fillna(value ="")
                test[col] = test[col].fillna(value = "")
                # NOTE(review): same np.unique-over-generator issue as above,
                # and here `s` IS used below -- verify len(s) behaves as
                # intended on the numpy version this ran against.
                s = np.unique(x for x in np.append(train[col].values,test[col].values))
                # Binarize mid-cardinality columns (binarizing datasource,
                # 6 values, lowered performance).
                # NOTE(review): `col not in "Thumb"` is a substring test on
                # the string "Thumb", not a membership test in a list --
                # probably meant `col != "Thumb"`.
                if len(s) >2 and len(s) < 100 and col not in "Thumb":
                    train_fea, test_fea = binarize_cols(col, train, test, train_fea, test_fea)
                else:
                    # Just enumerate (label-encode) the remaining columns.
                    # Not needed for the full data set usually:
                    if test[col].dtype != np.dtype(object):
                        test[col] = test[col].astype(object)  # in case test had a different dtype
                    if len(s) == 2:
                        # assume one of the two values is ""; encode presence
                        # as 0/1 (a single replace() could have done this)
                        new_arr_train = []
                        new_arr_test = []
                        for x in train[col]:
                            repl = 0 if x == "" else 1
                            new_arr_train.append(repl)
                        for x in test[col]:
                            repl = 0 if x == "" else 1
                            new_arr_test.append(repl)
                        train_fea[col] = new_arr_train
                        test_fea[col] = new_arr_test
                    else:
                        # Regular dumb indexing: value -> position in s
                        mapping = pd.Series([x[0] for x in enumerate(s)], index = s)
                        train_fea[col] = train[col].map(mapping)
                        test_fea[col] = test[col].map(mapping)
                        #train_fea[col] = np.log(train_fea[col]+1)
                        #test_fea[col] = np.log(test_fea[col]+1)
                    if pd.isnull(train_fea[col]).any() or pd.isnull(test_fea[col]).any():
                        print "HAS NAN", col
            else:
                # Numeric field: impute missing values, then log-transform.
                if col =="MachineID" or col == "ModelID" or col == "datasource":
                    m = 0  # value to fill (m itself is unused below)
                    train_m = 0
                    test_m = 0
                elif col in [ "MachineHoursCurrentMeter", 'Stick_Length']:
                    # mean of the positive values; difference vs. 0 is small
                    train_m = round(np.mean([x for x in train[col] if x > 0]),1)
                    test_m = round(np.mean([x for x in test[col] if x > 0]),1)
                else:
                    # median: better out-of-bag performance than mean
                    train_m= np.median(train[col])
                    test_m= np.median(test[col])
                if col in ["Stick_Length"]:
                    # Stick_Length uses 0 as its missing marker
                    train[col] = train[col].replace(0,train_m)
                    test[col] = test[col].replace(0,test_m)
                else:
                    train[col] = train[col].fillna(value =train_m)
                    test[col] = test[col].fillna(value=test_m)
                # fall back to 0 when the computed fill value itself is NaN
                if np.isnan(train_m):
                    train[col] = train[col].fillna(value =0)
                if np.isnan(test_m):
                    test[col] = test[col].fillna(value=0)
                train[col] = np.log(train[col]+1)
                test[col] = np.log(test[col]+1)
                train_fea[col] = train[col]
                test_fea[col] = test[col]
                if pd.isnull(train_fea[col]).any() or pd.isnull(test_fea[col]).any():
                    print "HAS NAN", col

    # SaleYear was only needed for the date-based split above
    del train_fea["SaleYear"]
    del test_fea["SaleYear"]
    if testing == 0:
        return train_fea, test_fea, [x for x in train["SalePrice"]]
    else:
        return train_fea, test_fea, [x for x in train["SalePrice"]], test_Y
def main():
    """Benchmark submission: emit each user's events in their given order."""
    train_df, test_df = u.get_train_test_df()
    events_by_user = u.get_user_events_dict(test_df)
    u.write_submission("given_order.csv", events_by_user)
def run_stack(SEED):
    """Cross-validated stacking over the models in `clfs`.

    Builds label-encoded feature frames, then for each model: K-fold fit,
    out-of-fold predictions into dataset_blend_train, fold-averaged test
    predictions into dataset_blend_test, and prediction/score files written
    under ../predictions/.

    Relies on module globals not visible in this chunk: util, csv_io,
    get_date_dataframe, cross_validation, GradientBoostingRegressor,
    defaultdict, np, pd, math, gc, datetime.  SEED is currently unused.
    Returns (dataset_blend_train, dataset_blend_test).
    """
    train, test = util.get_train_test_df()
    columns = set(train.columns)
    columns.remove("SalesID")
    columns.remove("SalePrice")
    columns.remove("saledate")
    train_fea = get_date_dataframe(train["saledate"])
    test_fea = get_date_dataframe(test["saledate"])
    for col in columns:
        types = set(type(x) for x in train[col])
        if str in types:
            # label-encode string columns; values unseen in train map to -1
            s = set(x for x in train[col])
            str_to_categorical = defaultdict(lambda: -1, [(x[1], x[0]) for x in enumerate(s)])
            train_fea = train_fea.join( pd.DataFrame( {col: [str_to_categorical[x] for x in train[col]]}, index=train.index))
            test_fea = test_fea.join( pd.DataFrame({col: [str_to_categorical[x] for x in test[col]]}, index=test.index))
        else:
            train_fea = train_fea.join(train[col])
            test_fea = test_fea.join(test[col])

    model = ""
    print "Running Stack."
    avg = 0
    NumFolds = 5  # 5 is good, but 10 yeilds a better mean since outliers are less significant.

    predicted_list = []
    bootstrapLists = []

    # NOTE: `learn_rate` is the pre-0.14 scikit-learn spelling of learning_rate.
    clfs = [
        GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1)
    ]

    print "Data size: ", len(train_fea), len(test_fea)
    dataset_blend_test = np.zeros( (len(test_fea), len(clfs)))
    dataset_blend_train = np.zeros((len(train_fea), len(clfs)))

    print "Begin Training"
    # NOTE(review): dataset sizes are hard-coded -- should presumably be
    # len(train_fea) / len(test_fea); confirm against the actual data files.
    lenTrainBase = 401125  # len(train_fea)
    lenTest = 11573  # len(test_fea)
    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0
        predicted_list = []
        dataset_blend_test_set = np.zeros((lenTest, NumFolds))
        foldCount = 0
        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)
        for train_index, test_index in Folds:
            targetX = [train["SalePrice"][i] for i in train_index]
            trainX = [train_fea.ix[i] for i in train_index]
            targetTest = [train["SalePrice"][i] for i in test_index]
            trainTest = [train_fea.ix[i] for i in test_index]
            gc.collect()
            print
            print "Iteration: ", foldCount
            print "LEN: ", len(trainX), len(targetX)
            print "Start", datetime.datetime.now()
            clf.fit(trainX, targetX)
            prob = clf.predict(trainTest)
            print "End ", datetime.datetime.now()
            # out-of-fold predictions fill this model's train column
            dataset_blend_train[test_index, ExecutionIndex] = prob
            gc.collect()
            probSum = 0
            weightSum = 0
            for i in range(0, len(prob)):
                probX = prob[i]
                if probX < 0:  # some predictions are comming out negative
                    probX = -probX
                # squared log10 error, i.e. an RMSLE-style fold score
                probSum += math.pow( math.log10(targetTest[i]) - math.log10(probX), 2)
            print "Score: ", math.sqrt(probSum / len(prob))
            avg += math.sqrt(probSum / len(prob)) / NumFolds
            gc.collect()
            # predict the real test set row by row for this fold
            predicted_probs = []
            for i in range(0, lenTest):
                predicted_probs.append(clf.predict(test_fea.ix[i]))
            dataset_blend_test_set[:, foldCount] = predicted_probs
            gc.collect()
            foldCount = foldCount + 1
        # average the per-fold test predictions into this model's column
        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        csv_io.write_delimited_file_single( "../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
        csv_io.write_delimited_file_single( "../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:, ExecutionIndex])
        csv_io.write_delimited_file("../predictions/RunLog.csv", [ now.strftime("%Y %m %d %H %M %S"), "AVG.", str(avg), str(clf), "Folds:", str(NumFolds), "Model", model, "", "" ], filemode="a", delimiter=",")
        print "------------------------Average: ", avg

    return dataset_blend_train, dataset_blend_test
import numpy as np
import pandas as pd

import util

# Baseline submission: predict the training-set median price for every test row.
train, test = util.get_train_test_df()
median_price = np.median(train["SalePrice"])
print("The median price is %0.2f" % median_price)
predictions = [median_price] * len(test)
util.write_submission("median_benchmark.csv", predictions)
def run_stack(SEED):
    """Cross-validated stacking over the models in `clfs`.

    Builds label-encoded feature frames, then for each model: K-fold fit,
    out-of-fold predictions into dataset_blend_train, fold-averaged test
    predictions into dataset_blend_test, and prediction/score files written
    under ../predictions/.

    Relies on module globals not visible in this chunk: util, csv_io,
    get_date_dataframe, cross_validation, GradientBoostingRegressor,
    defaultdict, np, pd, math, gc, datetime.  SEED is currently unused.
    Returns (dataset_blend_train, dataset_blend_test).
    """
    train, test = util.get_train_test_df()
    columns = set(train.columns)
    columns.remove("SalesID")
    columns.remove("SalePrice")
    columns.remove("saledate")
    train_fea = get_date_dataframe(train["saledate"])
    test_fea = get_date_dataframe(test["saledate"])
    for col in columns:
        types = set(type(x) for x in train[col])
        if str in types:
            # label-encode string columns; values unseen in train map to -1
            s = set(x for x in train[col])
            str_to_categorical = defaultdict(lambda: -1, [(x[1], x[0]) for x in enumerate(s)])
            train_fea = train_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in train[col]]}, index=train.index))
            test_fea = test_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in test[col]]}, index=test.index))
        else:
            train_fea = train_fea.join(train[col])
            test_fea = test_fea.join(test[col])

    model = ""
    print "Running Stack."
    avg = 0
    NumFolds = 5  # 5 is good, but 10 yeilds a better mean since outliers are less significant.

    predicted_list = []
    bootstrapLists = []

    # NOTE: `learn_rate` is the pre-0.14 scikit-learn spelling of learning_rate.
    clfs = [
        GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1)
    ]

    print "Data size: ", len(train_fea) , len(test_fea)
    dataset_blend_test = np.zeros((len(test_fea), len(clfs)))
    dataset_blend_train = np.zeros((len(train_fea), len(clfs)))

    print "Begin Training"
    # NOTE(review): dataset sizes are hard-coded -- should presumably be
    # len(train_fea) / len(test_fea); confirm against the actual data files.
    lenTrainBase = 401125  # len(train_fea)
    lenTest = 11573  # len(test_fea)
    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0
        predicted_list = []
        dataset_blend_test_set = np.zeros((lenTest, NumFolds))
        foldCount = 0
        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)
        for train_index, test_index in Folds:
            targetX = [train["SalePrice"][i] for i in train_index]
            trainX = [train_fea.ix[i] for i in train_index]
            targetTest = [train["SalePrice"][i] for i in test_index]
            trainTest = [train_fea.ix[i] for i in test_index]
            gc.collect()
            print
            print "Iteration: ", foldCount
            print "LEN: ", len(trainX), len(targetX)
            print "Start", datetime.datetime.now()
            clf.fit(trainX, targetX)
            prob = clf.predict(trainTest)
            print "End ", datetime.datetime.now()
            # out-of-fold predictions fill this model's train column
            dataset_blend_train[test_index, ExecutionIndex] = prob
            gc.collect()
            probSum = 0
            weightSum = 0
            for i in range(0, len(prob)):
                probX = prob[i]
                if probX < 0:  # some predictions are comming out negative
                    probX = -probX
                # squared log10 error, i.e. an RMSLE-style fold score
                probSum += math.pow(math.log10(targetTest[i]) - math.log10(probX), 2)
            print "Score: ", math.sqrt(probSum/len(prob))
            avg += math.sqrt(probSum/len(prob))/NumFolds
            gc.collect()
            # predict the real test set row by row for this fold
            predicted_probs = []
            for i in range(0,lenTest):
                predicted_probs.append(clf.predict(test_fea.ix[i]))
            dataset_blend_test_set[:, foldCount] = predicted_probs
            gc.collect()
            foldCount = foldCount + 1
        # average the per-fold test predictions into this model's column
        dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
        csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )
        csv_io.write_delimited_file("../predictions/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), "AVG." , str(avg), str(clf), "Folds:", str(NumFolds), "Model", model, "", ""], filemode="a",delimiter=",")
        print "------------------------Average: ", avg

    return dataset_blend_train, dataset_blend_test
import util as u import pandas as pd import operator import numpy as np from sklearn import ensemble, cross_validation, metrics import re ## todo: gradient boosted model? if __name__ == '__main__': train, test = u.get_train_test_df(test_set=False) ## define columns to use for features ## not used: SalesID, MachineID, ModelID, Saledate, fiModelDesc ## ProductClassDesc, ProductGroupDesk ## fiBaseModel, fiSecondaryDesc, fiModelSeries, fiModelDescriptor categorical = {'datasource', 'auctioneerID', 'UsageBand', 'ProductSize', 'ProductGroup', 'Drive_System', 'Enclosure', 'Forks', 'Pad_Type', 'Ride_Control', 'Stick', 'Transmission', 'Turbocharged', 'Blade_Extension', 'Blade_Width', 'Enclosure_Type', 'Engine_Horsepower', 'Hydraulics', 'Pushblock', 'Ripper', 'Scarifier', 'Tip_Control', 'Tire_Size', 'Coupler', 'Coupler_System', 'Grouser_Tracks', 'Hydraulics_Flow', 'Track_Type', 'Thumb', 'Pattern_Changer', 'Grouser_Type', 'Backhoe_Mounting', 'Blade_Type', 'Travel_Controls', 'Differential_Type', 'Steering_Controls'} numerical = {'YearMade', 'MachineHoursCurrentMeter', 'Undercarriage_Pad_Width', 'Stick_Length'} def get_date_dataframe(date_column): '''Create a data frame with columns for each date constituent''' return pd.DataFrame({ 'SaleYear' : [d.year for d in date_column], 'SaleMonth' : [d.month for d in date_column],