def main():
    train, test = u.get_train_test_df()
    user_events_dict = u.get_user_events_dict(test)

    for user in sorted(user_events_dict):
        random.shuffle(user_events_dict[user])

    u.write_submission("random_benchmark.csv", user_events_dict)
def main():
    train, test = u.get_train_test_df()
    user_events_dict = u.get_user_events_dict(test)
    event_attendees = u.get_event_attendees()
    event_yes = u.get_event_responses_dict(event_attendees["event"], event_attendees["yes"])

    for user in user_events_dict:
        user_events_dict[user] = sorted(user_events_dict[user],
            key=lambda e: len(event_yes[e]), reverse=True)

    u.write_submission("event_popularity_benchmark.csv", user_events_dict)
Example #3
def clean_df():
    train, test = util.get_train_test_df()
    columns = set(train.columns)
    columns.remove("SalesID")
    columns.remove("SalePrice")
    columns.remove("saledate")
    train_fea = get_date_dataframe(train["saledate"])
    test_fea = get_date_dataframe(test["saledate"])
    for col in columns:
        if train[col].dtype == np.dtype('object'):
            s = np.unique(train[col].fillna(-1).values)
            mapping = pd.Series([x[0] for x in enumerate(s)], index = s)
            train_fea = train_fea.join(train[col].map(mapping).fillna(-1))
            test_fea = test_fea.join(test[col].map(mapping).fillna(-1))
        else:
            train_fea = train_fea.join(train[col].fillna(0))
            test_fea = test_fea.join(test[col].fillna(0))
    train_fea = train_fea.join(train['SalePrice']) 
    return train_fea, test_fea
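The key step in clean_df is using a pandas Series as a lookup table to turn strings into integer codes; a minimal sketch of that idea on a made-up column:

import numpy as np
import pandas as pd

col = pd.Series(["Low", "High", None, "Low"])  # hypothetical categorical column
s = np.unique(col.dropna().values)             # sorted unique categories
mapping = pd.Series(range(len(s)), index=s)    # category -> integer code
print(col.map(mapping).fillna(-1))             # NaN and unmapped values become -1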
Example #4
def preprocess1():

	train, test = util.get_train_test_df()

	columns = set(train.columns)
	columns.remove("SalesID")
	columns.remove("SalePrice")
	columns.remove("saledate")

	train_fea = get_date_dataframe(train["saledate"])
	test_fea = get_date_dataframe(test["saledate"])

	for col in columns:
		print col
		types = set(type(x) for x in train[col])
		if str in types:
			print "in:", col
			s = set(x for x in train[col])
			str_to_categorical = defaultdict(lambda: -1, [(x[1], x[0]) for x in enumerate(s)])
			
			#print str_to_categorical
			#return
			
			train_fea = train_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in train[col]]}, index=train.index))
			test_fea = test_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in test[col]]}, index=test.index))
		else:
			train_fea = train_fea.join(train[col])
			test_fea = test_fea.join(test[col])

			
	train_fea.to_csv("train_fea.csv", index=False)				
	test_fea.to_csv("test_fea.csv", index=False)		
			
			
	train["SalePrice"].to_csv("target.csv", index=False)
	
	train["SalesID"].to_csv("train_salesID.csv", index=False)			
	test["SalesID"].to_csv("test_salesID.csv", index=False)
Example #5
def preprocess():

	train, test = util.get_train_test_df()

	
	columns = set(train.columns)
	#columns.remove("SalesID")
	#columns.remove("SalePrice")
	#columns.remove("saledate")

	#train_fea = get_date_dataframe(train["saledate"])
	#test_fea = get_date_dataframe(test["saledate"])

	#parseColumns = ["UsageBand"]
	parseColumns = [ "UsageBand","fiBaseModel","fiModelSeries","fiModelDescriptor","ProductSize","ProductGroup","Drive_System","Enclosure","Forks","Pad_Type","Ride_Control","Stick","Transmission","Turbocharged","Blade_Extension","Blade_Width","Enclosure_Type","Engine_Horsepower","Hydraulics","Pushblock","Ripper","Scarifier","Tip_ControlCoupler","Coupler_System","Grouser_Tracks","Hydraulics_Flow","Track_Type","Thumb","Pattern_Changer","Grouser_Type","Backhoe_Mounting","Blade_Type","Travel_Controls","Differential_Type","Steering_Controls"]
	
	#"auctioneerID","state","ProductGroupDesc",,"fiSecondaryDesc"
	# this is redundant "fiModelDesc", and has too many options...
	
	# Q, AC, AL AR AS
	
	colDict = {}
	for col in parseColumns:
		colDict[col] = []
		
	colMap = {}	
	notInTest = []
	for index, col in enumerate(train.columns):
		print "MAP:", col, index
		colMap[col] = index
		if col in parseColumns:
			#print "start"			
			s = set(x for x in train[col].fillna(0)) # 0 if x == "" or not isinstance(x, float) else x
			s.update(x for x in test[col].fillna(0)) # math.isnan(x)
			
			colDict[col] = s
			print s
			
			if col == "fiBaseModel":
				a = set(x for x in train[col].fillna(0))
				b = set(x for x in test[col].fillna(0))		
				print "fiBaseModel"
				print
				print
				# found 11 type in test not in train
				print [x for x in b if x not in a]
				print
				print
				# found several hundred in train that are not in test, try dropping these...
				print [x for x in a if x not in b]
				notInTest = [x for x in a if x not in b]

				
	SaleIDArr = []		
	trainSalePriceArr = []

	count = 0
	csv_io.delete_file("train1.csv")
	for row in train.iterrows():
		trainSalePrice = []
	
		rowVals = row[1].fillna(0)
		newSet = []
		newRow = []
		
		if rowVals["fiBaseModel"] not in notInTest:
			continue
		
		trainSalePrice.append(rowVals["SalePrice"])
		trainSalePriceArr.append(trainSalePrice)
		
		SaleID = []
		SaleID.append(rowVals["SalesID"])
		SaleIDArr.append(SaleID)
		
		for col in colDict.keys():
			for val in colDict[col]:
				if val == rowVals[col] :
					newRow.append(1)
				else:
					newRow.append(0)

		#newRow.append(rowVals["YearMade"]) # need to calculate age, sale date minus year
		newRow.append(rowVals["MachineHoursCurrentMeter"])
		
		count += 1
		if count % 10000 == 0:
			print "Count", count
			
		newSet.append(newRow)
		csv_io.write_delimited_file("train1.csv", newSet ,header=None, delimiter=",", filemode="a")

		
	csv_io.write_delimited_file("target.csv", trainSalePriceArr ,header=None, delimiter=",")
	csv_io.write_delimited_file("train_salesID.csv", SaleIDArr ,header=None, delimiter=",")		
	# -------------------------------------------	
	
	SaleIDArr = []
	
	count = 0
	csv_io.delete_file("test1.csv")
	for row in test.iterrows():

		rowVals = row[1].fillna(0)
		newSet = []
		newRow = []
		
		SaleID = []
		SaleID.append(rowVals["SalesID"])
		SaleIDArr.append(SaleID)
		
		for col in colDict.keys():
			for val in colDict[col]:
				if val == rowVals[col] :
					newRow.append(1)
				else:
					newRow.append(0)

		#newRow.append(rowVals["YearMade"]) # need to calculate age, sale date minus year
		newRow.append(rowVals["MachineHoursCurrentMeter"])
		
		count += 1
		if count % 10000 == 0:
			print "Count", count
			
		newSet.append(newRow)
		csv_io.write_delimited_file("test1.csv", newSet ,header=None, delimiter=",", filemode="a")
	
	csv_io.write_delimited_file("test_salesID.csv", SaleIDArr ,header=None, delimiter=",")		
	


if __name__=="__main__":
	preprocess()
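The nested loop over colDict above is a hand-rolled one-hot encoding; a toy version with hypothetical column values:

col_dict = {"UsageBand": ["High", "Low", "Medium"]}  # values seen in train+test
row_vals = {"UsageBand": "Low"}                      # one row of the data

new_row = []
for col in col_dict:
    for val in col_dict[col]:
        new_row.append(1 if val == row_vals[col] else 0)
print(new_row)  # [0, 1, 0]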
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import util

def get_date_dataframe(date_column):
    return pd.DataFrame({
        "SaleYear": [d.year for d in date_column],
        "SaleMonth": [d.month for d in date_column],
        "SaleDay": [d.day for d in date_column]
        }, index=date_column.index)

train, test = util.get_train_test_df()

columns = set(train.columns)
columns.remove("SalesID")
columns.remove("SalePrice")
columns.remove("saledate")

train_fea = get_date_dataframe(train["saledate"])
test_fea = get_date_dataframe(test["saledate"])

for col in columns:
    types = set(type(x) for x in train[col])
    if str in types:
        s = set(x for x in train[col])
        str_to_categorical = defaultdict(lambda: -1, [(x[1], x[0]) for x in enumerate(s)])
        train_fea = train_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in train[col]]}, index=train.index))
        test_fea = test_fea.join(pd.DataFrame({col: [str_to_categorical[x] for x in test[col]]}, index=test.index))
    else:
        train_fea = train_fea.join(train[col])
        test_fea = test_fea.join(test[col])
Example #8
def data_to_fea():
    """Main preprocessing"""
    train, test = util.get_train_test_df()
    
    if testing == 1:
        # This is the to test predict
        test = train[(train["saledate"] >= datetime.datetime(2011,1,1)) & (train["saledate"] < datetime.datetime(2011,5,1))]
        test = test.reset_index(drop = True)
        test_Y = test["SalePrice"]
        train = train[(train["saledate"] < datetime.datetime(2011,1,1))]
        train = train.reset_index(drop = True) # drops the original index
        del test["SalePrice"]

    print 'test shape',  test.shape
    
    train_fea = get_date_dataframe(train["saledate"])
    test_fea = get_date_dataframe(test["saledate"])
    
    train_fea = map_external_data( train_fea)
    test_fea = map_external_data(test_fea)


    print "Cleaning Columns"
    train, train_fea = clean_columns(train, train_fea)
    test, test_fea = clean_columns(test, test_fea)
    # train[["power_min", "power_max", "SalePrice", "ProductGroup", "YearMade"]].to_csv('power_min_out.csv', index=True)
    test_columns = set(test.columns)
    columns = set(train.columns)
    
    columns.remove("SalesID")
    columns.remove("SalePrice")
    columns.remove("saledate")
    for col in columns:
        # these deleted already["ProductGroupDesc", "fiProductClassDesc"]
      if col not in col_list : # REAL ONE
        #if col == "fiBaseModel":
        # col_list = ["ProductSize","UsageBand",'fiProductClassDesc'] # "ProductGroup"
        #if col == "Backhoe_Mounting": # Testing - error in the fillna "convert string to float" but why?
            #print "starting", col
            # find newest. populates fiSecondaryDesc
        # Need to fix so that train_fea returns SaleYear
        #if col == 'fiBaseModel':
        #   train_fea = newest_model(train)
        #   test_fea = newest_model(test)
        
        # Binarize these, even if numerical
        if col  in [  'ProductGroup', 'state','fiModelSeries','fiModelDescriptor',
                        'auctioneerID', 'power_u', 'MfgID', 'Enclosure', 'SaleDay', "SaleWkDay", "fiProductClassDesc"] :
            print "binarize", col
        
            if col in ['auctioneerID', 'MfgID']:
                train[col]=train[col].fillna(value =0)
                test[col] = test[col].fillna(value = 0)
            else:
                train[col]=train[col].fillna(value ="")
                test[col] = test[col].fillna(value = "")

            s = np.unique(np.append(train[col].values, test[col].values))
            #print col, ":", len(s), ":",s[:10]
                # would binarize BaseModel but too much memory  
            train_fea, test_fea = binarize_cols(col, train, test, train_fea, test_fea)
                
        elif train[col].dtype == np.dtype('object') and col not in ['power_min', 'power_max']:
            #print "filling na"
            train[col]=train[col].fillna(value ="")
            test[col] = test[col].fillna(value = "")
            #
            s = np.unique(np.append(train[col].values, test[col].values))
            #print  col, ":", len(s), ":", s[:10]
            
            # Binarize these ones:
            if len(s) > 2 and len(s) < 100 and col != "Thumb": # in [ 'fiBaseModel']
            # try on len(s) > 3
            # don't binarize datasource (6 values), lower performance
                #print "binarize", col
                # would binarize BaseModel but too much memory
                train_fea, test_fea = binarize_cols(col, train, test, train_fea, test_fea)
            
            # Just enumerate these
            else:
            #if 1:
                #print "enumerate",col
                #Don't need below line for full data set usually
                if test[col].dtype != np.dtype(object):
                    test[col] = test[col].astype(object) # in case test had diff type
                
                if len(s) == 2:
                    # assume one is ""
                    # one replace function would have been able to do this
                    new_arr_train = []
                    new_arr_test = []
                    for x in train[col]:
                        repl = 0 if x == "" else 1
                        new_arr_train.append(repl)
                    for x in test[col]:
                        repl = 0 if x == "" else 1
                        new_arr_test.append(repl)
                    train_fea[col] = new_arr_train
                    test_fea[col] = new_arr_test
                else:
                    # Regular dumb indexing
                    mapping = pd.Series([x[0] for x in enumerate(s)], index = s) # Original code
                    train_fea[col] = train[col].map(mapping)
                    test_fea[col] = test[col].map(mapping)
                    #train_fea[col] = np.log(train_fea[col]+1)
                    #test_fea[col] = np.log(test_fea[col]+1)
                if pd.isnull(train_fea[col]).any() or pd.isnull(test_fea[col]).any():
                    print "HAS NAN", col
                
        else:
        # Numeric field
        
            #print col, " as number"
        #if train[col].dtype != np.dtype('object'):
            #m = np.mean([x for x in train[col] if not np.isnan(x)])
            
            # can use pd.isnull(frame).any() TRY 
            if col =="MachineID" or col == "ModelID" or col == "datasource":
                m = 0 # value to fill
                train_m = 0
                test_m = 0
            elif col in [ "MachineHoursCurrentMeter", 'Stick_Length']:
                # Diff w/ 0 is not large
                train_m = round(np.mean([x for x in train[col] if x > 0]),1)
                test_m = round(np.mean([x for x in test[col] if x > 0]),1)
            #elif col == "Stick_Length":
                #train_m = round(np.median([x for x in train[col] if x > 0]),1)
                #test_m = round(np.median([x for x in test[col] if x > 0]),1)
            else:
                # Calculate median, better oob performance than mean
                train_m= np.median(train[col]) # if x > 0?
                test_m= np.median(test[col])
            #print m
            #if col == 'power_min':
            #    train[col] = train[col].fillna(value =0)
            #    test[col] = test[col].fillna(value=0)

            if col in ["Stick_Length"]:
                train[col] = train[col].replace(0,train_m)
                test[col] = test[col].replace(0,test_m)
            else:
                train[col] = train[col].fillna(value =train_m)
                test[col] = test[col].fillna(value=test_m)
            #print col, train_m
            
            # "mean is nan" 
            if np.isnan(train_m): train[col] = train[col].fillna(value =0)
            if np.isnan(test_m):test[col] = test[col].fillna(value=0)
            
            #train[col] = train[col].astype('float64')
            #test[col] = test[col].astype('float64')
            train[col] = np.log(train[col]+1)
            test[col] = np.log(test[col]+1)
            
            train_fea[col]  = train[col]
            test_fea[col] = test[col]
            
            #if col != "power_min":
            #    del train[col]
            #    del test[col]
            #train_fea = train_fea.join(train[col])
            #test_fea = test_fea.join(test[col])
            if pd.isnull(train_fea[col]).any() or pd.isnull(test_fea[col]).any():
                print "HAS NAN", col
        
      #except:
            #print "Error with col", col

    #train_fea = train_fea[train_fea["SaleYear"] <  2011] # try this out # now at the top
    #train = train[train["saledate"] <= datetime.datetime(2011,1,1)] # try this out # now at the top

    del train_fea["SaleYear"]
    del test_fea["SaleYear"]
    if testing == 0:
        return train_fea, test_fea, [x for x in train["SalePrice"]]
    else:
        return train_fea, test_fea, [x for x in train["SalePrice"]], test_Y
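For plain numeric columns, data_to_fea imputes missing values (median by default) and then compresses the scale with a log transform; the same two steps on a toy series:

import numpy as np
import pandas as pd

hours = pd.Series([10.0, 250.0, np.nan, 12000.0])  # made-up meter readings
hours = hours.fillna(hours.median())               # median imputation, as above
print(np.log(hours + 1))                           # same as the np.log(train[col]+1) step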
Example #9
def main():
    train, test = u.get_train_test_df()
    user_events_dict = u.get_user_events_dict(test)
    u.write_submission("given_order.csv", user_events_dict)
Example #10
def run_stack(SEED):

    train, test = util.get_train_test_df()

    columns = set(train.columns)
    columns.remove("SalesID")
    columns.remove("SalePrice")
    columns.remove("saledate")

    train_fea = get_date_dataframe(train["saledate"])
    test_fea = get_date_dataframe(test["saledate"])

    for col in columns:
        types = set(type(x) for x in train[col])
        if str in types:
            s = set(x for x in train[col])
            str_to_categorical = defaultdict(lambda: -1,
                                             [(x[1], x[0])
                                              for x in enumerate(s)])
            train_fea = train_fea.join(
                pd.DataFrame(
                    {col: [str_to_categorical[x] for x in train[col]]},
                    index=train.index))
            test_fea = test_fea.join(
                pd.DataFrame({col: [str_to_categorical[x] for x in test[col]]},
                             index=test.index))
        else:
            train_fea = train_fea.join(train[col])
            test_fea = test_fea.join(test[col])

    model = ""
    print "Running Stack."

    avg = 0
    NumFolds = 5  # 5 is good, but 10 yields a better mean since outliers are less significant.

    #targetX = csv_io.read_data("target.csv", skipFirstLine = False, split = ",")
    #trainBase = csv_io.read_data("train1.csv", skipFirstLine = False, split = ",")
    #test = csv_io_np.read_data("test1.csv", skipFirstLine = False, split = ",")

    #trainBase = trainBase[0:5000]
    #targetX = targetX[0:5000]

    #train_saleID = csv_io.read_data("train_salesID.csv", skipFirstLine = False, split = ",")
    #test_salesID = csv_io.read_data("test_salesID.csv", skipFirstLine = False, split = ",")

    predicted_list = []
    bootstrapLists = []

    clfs = [
        GradientBoostingRegressor(loss='lad',
                                  learn_rate=0.05,
                                  subsample=0.5,
                                  max_depth=6,
                                  n_estimators=3000,
                                  random_state=166,
                                  min_samples_leaf=1)
    ]
    #GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=300, random_state=166, min_samples_leaf=1)
    #GradientBoostingRegressor(loss='lad', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),
    #GradientBoostingRegressor(loss='huber', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1),

    #train_fea, train["SalePrice"]
    print "Data size: ", len(train_fea), len(test_fea)
    #dataset_blend_train = np.zeros((len(train_fea), len(clfs)))
    #dataset_blend_test = np.zeros((len(test), len(clfs)))
    dataset_blend_test = np.zeros(
        (len(test_fea), len(clfs)))  # np.zeros(len(train_fea), len(clfs))
    dataset_blend_train = np.zeros((len(train_fea), len(clfs)))

    print "Begin Training"

    lenTrainBase = 401125  # len(train_fea)

    lenTest = 11573  # len(test_fea)
    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0

        predicted_list = []

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        #Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
        Folds = cross_validation.KFold(lenTrainBase, k=NumFolds, indices=True)

        for train_index, test_index in Folds:

            targetX = [train["SalePrice"][i] for i in train_index]
            trainX = [train_fea.ix[i] for i in train_index]

            targetTest = [train["SalePrice"][i] for i in test_index]
            trainTest = [train_fea.ix[i] for i in test_index]

            gc.collect()
            print
            print "Iteration: ", foldCount
            print "LEN: ", len(trainX), len(targetX)

            #print trainX[0]
            #print target[0]
            #return

            print "Start", datetime.datetime.now()
            clf.fit(trainX, targetX)
            prob = clf.predict(trainTest)
            print "End  ", datetime.datetime.now()

            dataset_blend_train[test_index, ExecutionIndex] = prob

            gc.collect()

            probSum = 0
            weightSum = 0
            # totalOffByHalf = 0
            # totalPositive = 0
            # totalPositiveOffByHalf = 0
            # totalPositivePredictions = 0

            for i in range(0, len(prob)):
                probX = prob[i]
                #print targetTest[i], probX

                if probX < 0:  # some are coming out negative.
                    probX = -probX

                probSum += math.pow(
                    math.log10(targetTest[i]) - math.log10(probX), 2)

                #probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
                #weightSum += weights[test_index[i]][0]

                #print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX

                # log loss cal
                #probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
                # if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
                # totalOffByHalf = totalOffByHalf + 1

                # if ( int(targetTest[i]) == 1 ):
                # totalPositive = totalPositive + 1
                # if ( int(targetTest[i]) == 1 and probX < 0.5):
                # totalPositiveOffByHalf = totalPositiveOffByHalf + 1
                # if (probX > 0.5):
                # totalPositivePredictions = totalPositivePredictions + 1

            # print
            # print "Stats:"
            # print "Total Off By > 0.5 ", totalOffByHalf
            # print "Total Positive ", totalPositive
            # print "Total Positive Off By Half ", totalPositiveOffByHalf
            # print "Total Positive Predictions ", totalPositivePredictions
            #print -probSum/len(prob)
            print "Score: ", math.sqrt(probSum / len(prob))

            avg += math.sqrt(probSum / len(prob)) / NumFolds

            gc.collect()

            predicted_probs = []

            for i in range(0, lenTest):
                predicted_probs.append(clf.predict(test_fea.ix[i]))

            #predicted_probs = clf.predict(testScaled)
            #predicted_list.append([x[1] for x in predicted_probs])
            dataset_blend_test_set[:, foldCount] = predicted_probs  #[0]
            gc.collect()

            foldCount = foldCount + 1

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        #print "Saving NP"
        #np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
        #np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) )
        #np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
        #print "Done Saving NP"

        now = datetime.datetime.now()
        #print dataset_blend_test_set.mean(1)
        csv_io.write_delimited_file_single(
            "../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" +
            str(avg) + "_" + str(clf)[:12] + ".csv",
            dataset_blend_test_set.mean(1))

        csv_io.write_delimited_file_single(
            "../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") +
            "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
            dataset_blend_train[:, ExecutionIndex])

        csv_io.write_delimited_file("../predictions/RunLog.csv", [
            now.strftime("%Y %m %d %H %M %S"), "AVG.",
            str(avg),
            str(clf), "Folds:",
            str(NumFolds), "Model", model, "", ""
        ],
                                    filemode="a",
                                    delimiter=",")

        print "------------------------Average: ", avg

        #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

    return dataset_blend_train, dataset_blend_test
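The per-fold "Score" printed above is a root-mean-squared error in log10 space, with negative predictions flipped positive first; the same computation as a small function on toy numbers:

import math

def fold_score(actual, predicted):
    total = 0.0
    for a, p in zip(actual, predicted):
        p = abs(p)  # mirrors the sign flip applied to negative predictions
        total += (math.log10(a) - math.log10(p)) ** 2
    return math.sqrt(total / len(actual))

print(fold_score([10000.0, 50000.0], [12000.0, 45000.0]))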
Example #11
import numpy as np
import pandas as pd
import util

train, test = util.get_train_test_df()
median_price = np.median(train["SalePrice"])
print("The median price is %0.2f" % median_price)

util.write_submission("median_benchmark.csv",
                      [median_price for i in range(len(test))])
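The point of this benchmark is that on a heavy-tailed target like sale price the median is a far more robust constant than the mean; a toy comparison:

import numpy as np

prices = np.array([9500, 12000, 14000, 21000, 142000])  # made-up prices
print(np.median(prices), np.mean(prices))               # 14000.0 39700.0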
Example #14
import util as u
import pandas as pd
import operator
import numpy as np
from sklearn import ensemble, cross_validation, metrics
import re

## todo: gradient boosted model?

if __name__ == '__main__':
    train, test = u.get_train_test_df(test_set=False)

    ## define columns to use for features
    ## not used: SalesID, MachineID, ModelID, Saledate, fiModelDesc
    ## ProductClassDesc, ProductGroupDesc
    ## fiBaseModel, fiSecondaryDesc, fiModelSeries, fiModelDescriptor
    categorical = {'datasource', 'auctioneerID', 'UsageBand', 'ProductSize', 'ProductGroup', 'Drive_System',
                   'Enclosure', 'Forks', 'Pad_Type', 'Ride_Control', 'Stick', 'Transmission', 'Turbocharged', 'Blade_Extension',
                   'Blade_Width', 'Enclosure_Type', 'Engine_Horsepower', 'Hydraulics', 'Pushblock', 'Ripper', 'Scarifier',
                   'Tip_Control', 'Tire_Size', 'Coupler', 'Coupler_System', 'Grouser_Tracks', 'Hydraulics_Flow', 'Track_Type',
                   'Thumb', 'Pattern_Changer', 'Grouser_Type', 'Backhoe_Mounting', 'Blade_Type', 'Travel_Controls',
                   'Differential_Type', 'Steering_Controls'}
    numerical = {'YearMade', 'MachineHoursCurrentMeter', 'Undercarriage_Pad_Width', 'Stick_Length'}
    

    
    def get_date_dataframe(date_column):
        '''Create a data frame with columns for each date constituent'''
        return pd.DataFrame({
            'SaleYear' : [d.year for d in date_column],
            'SaleMonth' : [d.month for d in date_column],